#include <cuda_runtime_api.h>

// dynamically allocated shared-memory cache holding the currently processed image
extern __shared__ float2 IMG[];
__device__ __constant__ float cIw0 = 0.f;
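// The next helper evaluates a rational approximation num(x2)/den(x2) of the
// modified Bessel function of the first kind, I0(x), in Horner form; fmaf()
// keeps the polynomial evaluation fast and rounding-friendly on the GPU.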
__device__
float bessi0Fast(float x) { // valid only for |x| <= 15
    float x2 = x * x;
    float num = -0.8436825781374849e-19f;
    num = fmaf(num, x2, -0.93466495199548700e-17f);
    num = fmaf(num, x2, -0.15716375332511895e-13f);
    num = fmaf(num, x2, -0.42520971595532318e-11f);
    num = fmaf(num, x2, -0.13704363824102120e-8f);
    num = fmaf(num, x2, -0.28508770483148419e-6f);
    num = fmaf(num, x2, -0.44322160233346062e-4f);
    num = fmaf(num, x2, -0.46703811755736946e-2f);
    num = fmaf(num, x2, -0.31112484643702141e-0f);
    num = fmaf(num, x2, -0.11512633616429962e+2f);
    num = fmaf(num, x2, -0.18720283332732112e+3f);
    num = fmaf(num, x2, -0.75281108169006924e+3f);

    float den = 1.f;
    den = fmaf(den, x2, -0.75281109410939403e+3f);

    return num / den;
}
__device__
float bessi0(float x)
{
    float y, ax, ans;
    if ((ax = fabsf(x)) < 3.75f) {
        y = x / 3.75f;
        y *= y;
        ans = 1.f + y * (3.5156229f + y * (3.0899424f + y * (1.2067492f
                + y * (0.2659732f + y * (0.360768e-1f + y * 0.45813e-2f)))));
    } else {
        y = 3.75f / ax;
        ans = (expf(ax) * rsqrtf(ax)) * (0.39894228f + y * (0.1328592e-1f
                + y * (0.225319e-2f + y * (-0.157565e-2f + y * (0.916281e-2f
                + y * (-0.2057706e-1f + y * (0.2635537e-1f + y * (-0.1647633e-1f
                + y * 0.392377e-2f))))))));
    }
    return ans;
}
__device__
float bessi1(float x)
{
    float y, ax, ans;
    if ((ax = fabsf(x)) < 3.75f) {
        y = x / 3.75f;
        y *= y;
        ans = ax * (0.5f + y * (0.87890594f + y * (0.51498869f + y * (0.15084934f
                + y * (0.2658733e-1f + y * (0.301532e-2f + y * 0.32411e-3f))))));
    } else {
        y = 3.75f / ax;
        ans = 0.2282967e-1f + y * (-0.2895312e-1f + y * (0.1787654e-1f
                - y * 0.420059e-2f));
        ans = 0.39894228f + y * (-0.3988024e-1f + y * (-0.362018e-2f
                + y * (0.163801e-2f + y * (-0.1031555e-1f + y * ans))));
        ans *= (expf(ax) * rsqrtf(ax));
    }
    return x < 0.f ? -ans : ans;
}
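// Higher orders follow from the standard recurrence for modified Bessel
// functions, I_{n+1}(x) = I_{n-1}(x) - (2n/x) * I_n(x); the (x == 0) guard
// avoids the division at the origin, where the higher orders vanish.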
__device__
float bessi2(float x)
{
    return (x == 0) ? 0 : bessi0(x) - ((2*1) / x) * bessi1(x);
}

__device__
float bessi3(float x)
{
    return (x == 0) ? 0 : bessi1(x) - ((2*2) / x) * bessi2(x);
}

__device__
float bessi4(float x)
{
    return (x == 0) ? 0 : bessi2(x) - ((2*3) / x) * bessi3(x);
}
template<int order>
__device__
float kaiserValue(float r, float a)
{
    float rda, rdas, arg, w;
    // ... (rda = r / a; inside the unit radius: rdas = rda * rda and
    //      arg = cBlobAlpha * sqrtf(1.f - rdas))
    if (order == 0) {
        w = bessi0(arg) * cOneOverBessiOrderAlpha;
    } else if (order == 1) {
        w = sqrtf(1.f - rdas);
        // ...
    } else if (order == 2) {
        w = sqrtf(1.f - rdas);
        // ...
    } else if (order == 3) {
        w = sqrtf(1.f - rdas);
        // ...
    } else if (order == 4) {
        w = sqrtf(1.f - rdas);
        // ...
    } else {
        printf("order (%d) out of range in kaiser_value(): %s, %d\n",
                order, __FILE__, __LINE__);
    }
    // ...
}
copy(orig->FFTs, FFTs, orig, stream);
copy(orig->CTFs, CTFs, orig, stream);
if (NULL != srcArray) {
    cudaMemcpyAsync(dstArray, srcArray, bytes, cudaMemcpyHostToDevice,
            streams[stream]);
}
// ...
cudaMalloc((void **) &dstArray, bytes);
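// Note: cudaMemcpyAsync() only overlaps with kernel execution when the host
// buffer is page-locked, which is why the surrounding code pins the
// RecFourierBufferData memory (pinMemory/unpinMemory) before streaming it.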
cpuCopy->create(orig);
// ...
cudaFreeHost(cpuCopy);
// ...
cpuCopy->copyDataFrom(orig, stream);
// ...
if (NULL == gpuCopy) {
float* allocateTempVolumeGPU(float*& ptr, int size, int typeSize) {
    size_t bytes = (size_t)size * size * size * typeSize;
    cudaMalloc((void**)&ptr, bytes);
    cudaMemset(ptr, 0, bytes); // cudaMemset takes an int byte value, not a float
    return ptr;
}
void copyTempVolumes(std::complex<float>*** tempVol, float*** tempWeights,
        float* tempVolGPU, float* tempWeightsGPU,
        int size) {
    for (int z = 0; z < size; z++) {
        for (int y = 0; y < size; y++) {
            int index = (z * size * size) + (y * size);
            cudaMemcpy(tempVol[z][y], &tempVolGPU[2 * index],
                    2 * size * sizeof(float), cudaMemcpyDeviceToHost);
            cudaMemcpy(tempWeights[z][y], &tempWeightsGPU[index],
                    size * sizeof(float), cudaMemcpyDeviceToHost);
        }
    }
}
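// The volume is stored on the GPU as interleaved re/im float pairs, so each
// row of 'size' complex voxels occupies 2 * size floats starting at
// 2 * index; the weights are plain floats and use the index directly.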
__device__
float FFT_IDX2DIGFREQ(int idx, int size) {
    if (size <= 1) return 0;
    return ((idx <= (size / 2)) ? idx : (-size + idx)) / (float)size;
}
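// Maps an FFT array index to a normalized ("digital") frequency in
// [-0.5, 0.5]. For example, with size == 8: indices 0..4 map to
// 0, 0.125, 0.25, 0.375, 0.5 and indices 5..7 map to -0.375, -0.25, -0.125.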
/** Returns the Z coordinate of point [x, y] on the plane through p0 with normal n */
__device__
float getZ(float x, float y, const Point3D<float>& n, const Point3D<float>& p0) {
    return (-n.x * (x - p0.x) - n.y * (y - p0.y)) / n.z + p0.z;
}

/** Returns the Y coordinate of point [x, z] on the plane through p0 with normal n */
__device__
float getY(float x, float z, const Point3D<float>& n, const Point3D<float>& p0) {
    return (-n.x * (x - p0.x) - n.z * (z - p0.z)) / n.y + p0.y;
}

/** Returns the X coordinate of point [y, z] on the plane through p0 with normal n */
__device__
float getX(float y, float z, const Point3D<float>& n, const Point3D<float>& p0) {
    return (-n.y * (y - p0.y) - n.z * (z - p0.z)) / n.x + p0.x;
}
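// All three helpers solve the same plane equation n . (P - p0) = 0 for the
// one missing coordinate of P, given the other two.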
/** Applies a 3x3 transform to the given point, in place */
__device__
void multiply(const float transform[3][3], Point3D<float>& inOut) {
    float tmp0 = transform[0][0] * inOut.x + transform[0][1] * inOut.y + transform[0][2] * inOut.z;
    float tmp1 = transform[1][0] * inOut.x + transform[1][1] * inOut.y + transform[1][2] * inOut.z;
    float tmp2 = transform[2][0] * inOut.x + transform[2][1] * inOut.y + transform[2][2] * inOut.z;
    inOut.x = tmp0;
    inOut.y = tmp1;
    inOut.z = tmp2;
}
__device__
void computeAABB(Point3D<float>* AABB, Point3D<float>* cuboid) {
    AABB[0].x = AABB[0].y = AABB[0].z = INFINITY;
    AABB[1].x = AABB[1].y = AABB[1].z = -INFINITY;
    for (int i = 0; i < 8; i++) {
        Point3D<float> tmp = cuboid[i];
        if (AABB[0].x > tmp.x) AABB[0].x = tmp.x;
        if (AABB[0].y > tmp.y) AABB[0].y = tmp.y;
        if (AABB[0].z > tmp.z) AABB[0].z = tmp.z;
        if (AABB[1].x < tmp.x) AABB[1].x = tmp.x;
        if (AABB[1].y < tmp.y) AABB[1].y = tmp.y;
        if (AABB[1].z < tmp.z) AABB[1].z = tmp.z;
    }
    AABB[0].x = ceilf(AABB[0].x);
    AABB[0].y = ceilf(AABB[0].y);
    AABB[0].z = ceilf(AABB[0].z);

    AABB[1].x = floorf(AABB[1].x);
    AABB[1].y = floorf(AABB[1].y);
    AABB[1].z = floorf(AABB[1].z);
}
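// AABB[0] holds the per-axis minimum and AABB[1] the maximum over the eight
// cuboid corners; rounding the minimum up and the maximum down snaps the box
// to the integer (pixel) grid covered by the cuboid.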
template<bool hasCTF>
__device__
void processVoxel(
        float2* tempVolumeGPU, float* tempWeightsGPU,
        int x, int y, int z,
        int xSize, int ySize,
        const float* __restrict__ CTF,
        const float* __restrict__ modulator,
        const float2* __restrict__ FFT,
        const RecFourierProjectionTraverseSpace* const space)
{
    Point3D<float> imgPos;
    float wBlob = 1.f;
    float wCTF = 1.f;
    float wModulator = 1.f;

    float dataWeight = space->weight;

    // ... (map voxel [x, y, z] to position imgPos in the projection image)
    if (imgPos.x < 0.f) return; // would read outside of the image boundary
    int imgX = clamp((int)(imgPos.x + 0.5f), 0, xSize - 1);
    // ... (imgY is derived and clamped analogously)
    int index3D = z * (cMaxVolumeIndexYZ + 1) * (cMaxVolumeIndexX + 1)
            + y * (cMaxVolumeIndexX + 1) + x;
    int index2D = imgY * xSize + imgX;

    if (hasCTF) {
        wCTF = CTF[index2D];
        wModulator = modulator[index2D];
    }

    float weight = wBlob * wModulator * dataWeight;

    atomicAdd(&tempVolumeGPU[index3D].x, FFT[index2D].x * weight * wCTF);
    atomicAdd(&tempVolumeGPU[index3D].y, FFT[index2D].y * weight * wCTF);
    atomicAdd(&tempWeightsGPU[index3D], weight);
}
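// atomicAdd() is required because several thread blocks (processing different
// projection planes) may contribute to the same voxel concurrently.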
template<bool hasCTF, int blobOrder, bool useFastKaiser>
__device__
void processVoxelBlob(
        float2* tempVolumeGPU, float* tempWeightsGPU,
        int x, int y, int z,
        int xSize, int ySize,
        const float* __restrict__ CTF,
        const float* __restrict__ modulator,
        const float2* __restrict__ FFT,
        const RecFourierProjectionTraverseSpace* const space,
        const float* blobTableSqrt,
        int imgCacheDim)
{
    Point3D<float> imgPos;
    // ... (map voxel [x, y, z] to position imgPos in the projection image)
    float radiusSqr = cBlobRadius * cBlobRadius;
    float zSqr = imgPos.z * imgPos.z;
    if (zSqr > radiusSqr) return;

    // create a bounding box around the blob, clamped to the image
    int minX = ceilf(imgPos.x - cBlobRadius);
    int maxX = floorf(imgPos.x + cBlobRadius);
    int minY = ceilf(imgPos.y - cBlobRadius);
    int maxY = floorf(imgPos.y + cBlobRadius);
    minX = fmaxf(minX, 0);
    minY = fmaxf(minY, 0);
    maxX = fminf(maxX, xSize - 1);
    maxY = fminf(maxY, ySize - 1);

    float2 vol;
    float w;
    vol.x = vol.y = w = 0.f;

    float dataWeight = space->weight;

    if (hasCTF) {
        // check which pixels in the vicinity should contribute
        for (int i = minY; i <= maxY; i++) {
            float ySqr = (imgPos.y - i) * (imgPos.y - i);
            float yzSqr = ySqr + zSqr;
            if (yzSqr > radiusSqr) continue;
            for (int j = minX; j <= maxX; j++) {
                float xD = imgPos.x - j;
                float distanceSqr = xD * xD + yzSqr;
                if (distanceSqr > radiusSqr) continue;
#if SHARED_IMG
                int index2D = (i - SHARED_AABB[0].y) * imgCacheDim
                        + (j - SHARED_AABB[0].x); // position in the cache
#else
                int index2D = i * xSize + j;
#endif
                float wCTF = CTF[index2D];
                float wModulator = modulator[index2D];
#if PRECOMPUTE_BLOB_VAL
                int aux = (int)(distanceSqr * cIDeltaSqrt + 0.5f);
#if SHARED_BLOB_TABLE
                float wBlob = BLOB_TABLE[aux];
#else
                float wBlob = blobTableSqrt[aux];
#endif
#else
                float wBlob = useFastKaiser
                        ? kaiserValueFast(distanceSqr)
                        : kaiserValue<blobOrder>(sqrtf(distanceSqr), cBlobRadius) * cIw0;
#endif
                float weight = wBlob * wModulator * dataWeight;
                w += weight;
#if SHARED_IMG
                vol += IMG[index2D] * weight * wCTF;
#else
                vol += FFT[index2D] * weight * wCTF;
#endif
            }
        }
    } else {
        // the same loops, without the CTF corrections
        for (int i = minY; i <= maxY; i++) {
            float ySqr = (imgPos.y - i) * (imgPos.y - i);
            float yzSqr = ySqr + zSqr;
            if (yzSqr > radiusSqr) continue;
            for (int j = minX; j <= maxX; j++) {
                float xD = imgPos.x - j;
                float distanceSqr = xD * xD + yzSqr;
                if (distanceSqr > radiusSqr) continue;
#if SHARED_IMG
                int index2D = (i - SHARED_AABB[0].y) * imgCacheDim
                        + (j - SHARED_AABB[0].x);
#else
                int index2D = i * xSize + j;
#endif
#if PRECOMPUTE_BLOB_VAL
                int aux = (int)(distanceSqr * cIDeltaSqrt + 0.5f);
#if SHARED_BLOB_TABLE
                float wBlob = BLOB_TABLE[aux];
#else
                float wBlob = blobTableSqrt[aux];
#endif
#else
                float wBlob = useFastKaiser
                        ? kaiserValueFast(distanceSqr)
                        : kaiserValue<blobOrder>(sqrtf(distanceSqr), cBlobRadius) * cIw0;
#endif
                float weight = wBlob * dataWeight;
                w += weight;
#if SHARED_IMG
                vol += IMG[index2D] * weight;
#else
                vol += FFT[index2D] * weight;
#endif
            }
        }
    }

    int index3D = z * (cMaxVolumeIndexYZ + 1) * (cMaxVolumeIndexX + 1)
            + y * (cMaxVolumeIndexX + 1) + x;
    atomicAdd(&tempVolumeGPU[index3D].x, vol.x);
    atomicAdd(&tempVolumeGPU[index3D].y, vol.y);
    atomicAdd(&tempWeightsGPU[index3D], w);
}
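// Unlike processVoxel(), the blob version accumulates all contributions for
// the voxel in registers (vol, w) and issues a single atomicAdd per
// component, which greatly reduces contention on global memory.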
template<bool useFast, bool hasCTF, int blobOrder, bool useFastKaiser>
__device__
void processProjection(
        float2* tempVolumeGPU, float* tempWeightsGPU,
        int xSize, int ySize,
        const float* __restrict__ CTF,
        const float* __restrict__ modulator,
        const float2* __restrict__ FFT,
        const RecFourierProjectionTraverseSpace* const tSpace,
        const float* devBlobTableSqrt,
        int imgCacheDim)
{
#if TILE > 1
    // remap threads to pixels in a tiled fashion to improve data locality
    int id = threadIdx.y * blockDim.x + threadIdx.x;
    int tidX = threadIdx.x % TILE + (id / (blockDim.y * TILE)) * TILE;
    int tidY = (id / TILE) % blockDim.y;
    int idx = blockIdx.x * blockDim.x + tidX;
    int idy = blockIdx.y * blockDim.y + tidY;
#else
    volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
    volatile int idy = blockIdx.y * blockDim.y + threadIdx.y;
#endif

    if (tSpace->XY == tSpace->dir) { // iterate the XY plane
        if (idy >= tSpace->minY && idy <= tSpace->maxY) {
            if (idx >= tSpace->minX && idx <= tSpace->maxX) {
                if (useFast) {
                    float hitZ = getZ(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin);
                    int z = (int)(hitZ + 0.5f); // rounding
                    processVoxel<hasCTF>(tempVolumeGPU, tempWeightsGPU, idx, idy, z,
                            xSize, ySize, CTF, modulator, FFT, tSpace);
                } else {
                    float z1 = getZ(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin);
                    float z2 = getZ(idx, idy, tSpace->unitNormal, tSpace->topOrigin);
                    int lower = floorf(fminf(z1, z2));
                    int upper = ceilf(fmaxf(z1, z2));
                    for (int z = lower; z <= upper; z++) {
                        processVoxelBlob<hasCTF, blobOrder, useFastKaiser>(
                                tempVolumeGPU, tempWeightsGPU, idx, idy, z,
                                xSize, ySize, CTF, modulator, FFT, tSpace,
                                devBlobTableSqrt, imgCacheDim);
                    }
                }
            }
        }
    } else if (tSpace->XZ == tSpace->dir) { // iterate the XZ plane
        if (idy >= tSpace->minZ && idy <= tSpace->maxZ) {
            if (idx >= tSpace->minX && idx <= tSpace->maxX) {
                if (useFast) {
                    float hitY = getY(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin);
                    int y = (int)(hitY + 0.5f);
                    processVoxel<hasCTF>(tempVolumeGPU, tempWeightsGPU, idx, y, idy,
                            xSize, ySize, CTF, modulator, FFT, tSpace);
                } else {
                    float y1 = getY(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin);
                    float y2 = getY(idx, idy, tSpace->unitNormal, tSpace->topOrigin);
                    int lower = floorf(fminf(y1, y2));
                    int upper = ceilf(fmaxf(y1, y2));
                    for (int y = lower; y <= upper; y++) {
                        processVoxelBlob<hasCTF, blobOrder, useFastKaiser>(
                                tempVolumeGPU, tempWeightsGPU, idx, y, idy,
                                xSize, ySize, CTF, modulator, FFT, tSpace,
                                devBlobTableSqrt, imgCacheDim);
                    }
                }
            }
        }
    } else { // iterate the YZ plane
        if (idy >= tSpace->minZ && idy <= tSpace->maxZ) {
            if (idx >= tSpace->minY && idx <= tSpace->maxY) {
                if (useFast) {
                    float hitX = getX(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin);
                    int x = (int)(hitX + 0.5f);
                    processVoxel<hasCTF>(tempVolumeGPU, tempWeightsGPU, x, idx, idy,
                            xSize, ySize, CTF, modulator, FFT, tSpace);
                } else {
                    float x1 = getX(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin);
                    float x2 = getX(idx, idy, tSpace->unitNormal, tSpace->topOrigin);
                    int lower = floorf(fminf(x1, x2));
                    int upper = ceilf(fmaxf(x1, x2));
                    for (int x = lower; x <= upper; x++) {
                        processVoxelBlob<hasCTF, blobOrder, useFastKaiser>(
                                tempVolumeGPU, tempWeightsGPU, x, idx, idy,
                                xSize, ySize, CTF, modulator, FFT, tSpace,
                                devBlobTableSqrt, imgCacheDim);
                    }
                }
            }
        }
    }
}
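// Each traverse space is rasterised along its dominant axis (XY, XZ or YZ),
// so neighbouring threads process neighbouring pixels of the projection
// plane; the fast path touches one voxel per pixel, the blob path a short
// span of voxels between the bottom and top bounding planes.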
__device__
void rotate(Point3D<float>* box, const float transform[3][3]) {
    for (int i = 0; i < 8; i++) {
        multiply(transform, box[i]);
    }
}
if (tSpace->XY == tSpace->dir) { // iterate the XY plane
    box[0].x = box[3].x = box[4].x = box[7].x = blockIdx.x * blockDim.x - cBlobRadius;
    box[1].x = box[2].x = box[5].x = box[6].x = (blockIdx.x + 1) * blockDim.x + cBlobRadius - 1.f;

    box[2].y = box[3].y = box[6].y = box[7].y = (blockIdx.y + 1) * blockDim.y + cBlobRadius - 1.f;
    box[0].y = box[1].y = box[4].y = box[5].y = blockIdx.y * blockDim.y - cBlobRadius;
    // ... (Z spans the whole traverse space)
} else if (tSpace->XZ == tSpace->dir) { // iterate the XZ plane
    box[0].x = box[3].x = box[4].x = box[7].x = blockIdx.x * blockDim.x - cBlobRadius;
    box[1].x = box[2].x = box[5].x = box[6].x = (blockIdx.x + 1) * blockDim.x + cBlobRadius - 1.f;

    box[2].z = box[3].z = box[6].z = box[7].z = (blockIdx.y + 1) * blockDim.y + cBlobRadius - 1.f;
    box[0].z = box[1].z = box[4].z = box[5].z = blockIdx.y * blockDim.y - cBlobRadius;
    // ... (Y spans the whole traverse space)
} else { // iterate the YZ plane
    box[0].y = box[3].y = box[4].y = box[7].y = blockIdx.x * blockDim.x - cBlobRadius;
    box[1].y = box[2].y = box[5].y = box[6].y = (blockIdx.x + 1) * blockDim.x + cBlobRadius - 1.f;

    box[2].z = box[3].z = box[6].z = box[7].z = (blockIdx.y + 1) * blockDim.y + cBlobRadius - 1.f;
    box[0].z = box[1].z = box[4].z = box[5].z = blockIdx.y * blockDim.y - cBlobRadius;
}
__device__
bool isWithin(Point3D<float>* AABB, int imgXSize, int imgYSize) {
    return (AABB[0].x < imgXSize) && (AABB[1].x >= 0)
        && (AABB[0].y < imgYSize) && (AABB[1].y >= 0);
}
__device__
void getImgData(Point3D<float>* AABB,
        int tXindex, int tYindex,
        RecFourierBufferDataGPU* const buffer, int imgIndex,
        float& vReal, float& vImag) {
    int imgXindex = tXindex + AABB[0].x;
    int imgYindex = tYindex + AABB[0].y;
    if ((imgXindex >= 0)
            && (imgXindex < buffer->fftSizeX)
            && (imgYindex >= 0)
            && (imgYindex < buffer->fftSizeY)) {
        // ... (load the pixel from the buffered FFT)
    } else {
        vReal = vImag = 0.f; // outside of the image, no data
    }
}
__device__
void copyImgToCache(float2* dest, Point3D<float>* AABB,
        RecFourierBufferDataGPU* const buffer, int imgIndex, int imgCacheDim) {
    for (int y = threadIdx.y; y < imgCacheDim; y += blockDim.y) {
        for (int x = threadIdx.x; x < imgCacheDim; x += blockDim.x) {
            int memIndex = y * imgCacheDim + x;
            getImgData(AABB, x, y, buffer, imgIndex,
                    dest[memIndex].x, dest[memIndex].y);
        }
    }
}
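// All threads of the block cooperate on filling the shared-memory cache:
// each thread strides over the cache in steps of blockDim, so the work is
// spread evenly and no explicit thread-to-pixel assignment is needed.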
template<bool useFast, bool hasCTF, int blobOrder, bool useFastKaiser>
__global__
void processBufferKernel(
        float* tempVolumeGPU, float* tempWeightsGPU,
        RecFourierBufferDataGPU* buffer,
        float* devBlobTableSqrt,
        int imgCacheDim) {
#if SHARED_BLOB_TABLE
    if (!useFast) {
        // copy the blob table to shared memory, all threads of the block cooperate
        volatile int id = threadIdx.y * blockDim.x + threadIdx.x;
        volatile int blockSize = blockDim.x * blockDim.y;
        for (int i = id; i < BLOB_TABLE_SIZE_SQRT; i += blockSize)
            BLOB_TABLE[i] = devBlobTableSqrt[i];
        __syncthreads();
    }
#endif

    for (int i = blockIdx.z; i < buffer->getNoOfSpaces(); i += gridDim.z) {
        RecFourierProjectionTraverseSpace* space = &buffer->spaces[i];
#if SHARED_IMG
        if (!useFast) {
            if ((threadIdx.x == 0) && (threadIdx.y == 0)) {
                // the first thread decides which part of the image to cache
                calculateAABB(space, buffer, SHARED_AABB);
            }
            __syncthreads();
            // ... (copy the image region into the IMG cache, or skip this space)
        }
#endif
        processProjection<useFast, hasCTF, blobOrder, useFastKaiser>(
                (float2*)tempVolumeGPU, tempWeightsGPU,
                /* ... remaining arguments elided ... */);
        __syncthreads(); // avoid overwriting the cache while it is still being read
    }
}
__global__
void convertImagesKernel(std::complex<float>* iFouriers,
        int iSizeX, int iSizeY, int iLength,
        RecFourierBufferDataGPU* oBuffer, float maxResolutionSqr) {
    // assign one pixel to each thread
    volatile int idx = blockIdx.x * blockDim.x + threadIdx.x;
    volatile int idy = blockIdx.y * blockDim.y + threadIdx.y;

    int halfY = iSizeY / 2;
    float normFactor = iSizeY * iSizeY;
    int oSizeX = oBuffer->fftSizeX;

    for (int n = 0; n < iLength; n++) {
        // process a line only if it can hold a sufficiently low frequency,
        // i.e. only the first and last oSizeX lines
        if (idy < oSizeX || idy >= (iSizeY - oSizeX)) {
            // ... (freq.x, freq.y computed via FFT_IDX2DIGFREQ)
            if ((freq.x * freq.x + freq.y * freq.y) > maxResolutionSqr) {
                continue;
            }
            // do the shift (the lower line moves up, the upper down)
            int newY = (idy < halfY) ? (idy + oSizeX) : (idy - iSizeY + oSizeX);
            int oIndex = newY * oSizeX + idx;

            int iIndex = n * iSizeY * iSizeX + idy * iSizeX + idx;
            float* iValue = (float*)&(iFouriers[iIndex]);

            // copy the data and normalize
            oBuffer->getNthItem(oBuffer->FFTs, n)[2*oIndex] = iValue[0] / normFactor;
            oBuffer->getNthItem(oBuffer->FFTs, n)[2*oIndex + 1] = iValue[1] / normFactor;
        }
    }
}
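// The kernel crops each FFT to the requested resolution, re-centres it via
// the newY shift so the lowest frequencies end up contiguous in the output,
// and applies the 1/(N*N) normalisation missing from the raw (unnormalised)
// cuFFT result.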
void convertImages(
        FRecBufferDataGPUWrapper* wrapper,
        float maxResolutionSqr,
        int streamIndex) {
    cudaStream_t stream = streams[streamIndex];
    // ... (wrap the raw image data and run the FFT)
    imagesGPU.fft(resultingFFT, myhandle);
    // ... (post-process the resulting FFTs on the GPU)
    dim3 dimGrid(ceil(resultingFFT.Xdim / (float)dimBlock.x),
            ceil(resultingFFT.Ydim / (float)dimBlock.y));
    convertImagesKernel<<<dimGrid, dimBlock, 0, stream>>>(
            /* ... */
            wrapper->gpuCopy, maxResolutionSqr);
    // ...
}
void createStreams(int count) {
    streams = new cudaStream_t[count];
    for (int i = 0; i < count; i++) {
        cudaStreamCreate(&streams[i]);
    }
}

void deleteStreams(int count) {
    for (int i = 0; i < count; i++) {
        cudaStreamDestroy(streams[i]);
    }
    delete[] streams;
}
cudaMemcpy(devBlobTableSqrt, blobTableSqrt, blobTableSize * sizeof(float), cudaMemcpyHostToDevice);
delete wrappers[streamIndex];
void copyConstants(
        int maxVolIndexX, int maxVolIndexYZ,
        float blobRadius, float blobAlpha,
        float iDeltaSqrt, float iw0, float oneOverBessiOrderAlpha) {
    cudaMemcpyToSymbol(cMaxVolumeIndexX, &maxVolIndexX, sizeof(maxVolIndexX));
    cudaMemcpyToSymbol(cMaxVolumeIndexYZ, &maxVolIndexYZ, sizeof(maxVolIndexYZ));
    cudaMemcpyToSymbol(cBlobRadius, &blobRadius, sizeof(blobRadius));
    cudaMemcpyToSymbol(cBlobAlpha, &blobAlpha, sizeof(blobAlpha));
    cudaMemcpyToSymbol(cIw0, &iw0, sizeof(iw0));
    cudaMemcpyToSymbol(cIDeltaSqrt, &iDeltaSqrt, sizeof(iDeltaSqrt));
    cudaMemcpyToSymbol(cOneOverBessiOrderAlpha, &oneOverBessiOrderAlpha, sizeof(oneOverBessiOrderAlpha));

    float oneOverBlobRadiusSqr = 1.f / (blobRadius * blobRadius);
    cudaMemcpyToSymbol(cOneOverBlobRadiusSqr, &oneOverBlobRadiusSqr, sizeof(oneOverBlobRadiusSqr));
}
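// Illustrative host-side call (the values are hypothetical, precomputed by
// the caller from the chosen blob parameters, not taken from this file):
//
//   copyConstants(maxVolIdxX, maxVolIdxYZ, blobRadius, blobAlpha,
//           iDeltaSqrt, iw0, oneOverBessiOrderAlpha);
//
// After this, all kernels read the parameters from fast constant memory
// instead of receiving them as arguments.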
template<int blobOrder, bool useFastKaiser>
void processBufferGPU_(float* tempVolumeGPU, float* tempWeightsGPU,
        RecFourierBufferData* buffer,
        float blobRadius, int maxVolIndexYZ, bool useFast,
        float maxResolutionSqr, int streamIndex) {
    cudaStream_t stream = streams[streamIndex];

    // copy the buffer to the GPU
    FRecBufferDataGPUWrapper* wrapper = wrappers[streamIndex];
    wrapper->copyFrom(buffer, streamIndex);
    // ...
    cudaStreamSynchronize(stream);

    // enqueue the kernel and return control
    int size2D = maxVolIndexYZ + 1;
    // ...
    dim3 dimGrid(ceil(size2D / (float)dimBlock.x),
            ceil(size2D / (float)dimBlock.y),
            GRID_DIM_Z);
    if (useFast && buffer->hasCTFs) {
        processBufferKernel<true, true, blobOrder, useFastKaiser><<<dimGrid, dimBlock, 0, stream>>>(
                tempVolumeGPU, tempWeightsGPU,
                /* ... */);
    }
    if (useFast && !buffer->hasCTFs) {
        processBufferKernel<true, false, blobOrder, useFastKaiser><<<dimGrid, dimBlock, 0, stream>>>(
                tempVolumeGPU, tempWeightsGPU,
                /* ... */);
    }
    // the slow (blob) path may cache the image in dynamic shared memory
    int sharedMemSize = SHARED_IMG ? (imgCacheDim * imgCacheDim * sizeof(float2)) : 0;
    if (!useFast && buffer->hasCTFs) {
        processBufferKernel<false, true, blobOrder, useFastKaiser><<<dimGrid, dimBlock, sharedMemSize, stream>>>(
                tempVolumeGPU, tempWeightsGPU,
                /* ... */);
    }
    if (!useFast && !buffer->hasCTFs) {
        processBufferKernel<false, false, blobOrder, useFastKaiser><<<dimGrid, dimBlock, sharedMemSize, stream>>>(
                tempVolumeGPU, tempWeightsGPU,
                /* ... */);
    }
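// useFast and hasCTF are template parameters, so each of the four launches
// above selects a fully specialised kernel with the unused branches compiled
// out; only the slow path requests dynamic shared memory for the image cache.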
void processBufferGPU(float* tempVolumeGPU, float* tempWeightsGPU,
        RecFourierBufferData* buffer,
        float blobRadius, int maxVolIndexYZ, bool useFast,
        float maxResolutionSqr, int streamIndex,
        int blobOrder, float blobAlpha) {
    switch (blobOrder) {
    case 0:
        if (blobAlpha <= 15.0) {
            processBufferGPU_<0, true>(tempVolumeGPU, tempWeightsGPU,
                    buffer, blobRadius, maxVolIndexYZ, useFast,
                    maxResolutionSqr, streamIndex);
        } else {
            processBufferGPU_<0, false>(tempVolumeGPU, tempWeightsGPU,
                    buffer, blobRadius, maxVolIndexYZ, useFast,
                    maxResolutionSqr, streamIndex);
        }
        break;
    case 1:
        processBufferGPU_<1, false>(tempVolumeGPU, tempWeightsGPU,
                buffer, blobRadius, maxVolIndexYZ, useFast,
                maxResolutionSqr, streamIndex);
        break;
    case 2:
        processBufferGPU_<2, false>(tempVolumeGPU, tempWeightsGPU,
                buffer, blobRadius, maxVolIndexYZ, useFast,
                maxResolutionSqr, streamIndex);
        break;
    case 3:
        processBufferGPU_<3, false>(tempVolumeGPU, tempWeightsGPU,
                buffer, blobRadius, maxVolIndexYZ, useFast,
                maxResolutionSqr, streamIndex);
        break;
    case 4:
        processBufferGPU_<4, false>(tempVolumeGPU, tempWeightsGPU,
                buffer, blobRadius, maxVolIndexYZ, useFast,
                maxResolutionSqr, streamIndex);
        break;
    }
}
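// The fast Kaiser evaluation (useFastKaiser == true) is only selected for
// order-0 blobs with alpha <= 15, matching the validity range of
// bessi0Fast(); every other configuration falls back to the exact kaiserValue().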