// NOTE(review): this whole chunk is extraction-garbled — original source line
// numbers (37, 51, 70, ...) are fused into the text and statements are broken
// mid-token. Recover the exact text from version control before editing.
// Visible here: the CUDA runtime include, the dynamic shared-memory image
// cache declaration (IMG), fragments of cuda_set_constants (copies constants
// to each CUDA worker's device) and of reconstruct_cuda_initialize_constants
// (parameter list), plus the StarPU call that runs the setter on every CUDA
// worker. The bodies between the fused line numbers are missing from view.
37 #include <cuda_runtime_api.h> 51 extern __shared__ float2 IMG[];
70 static void cuda_set_constants(
void *gpuConstants) {
75 int maxVolIndexX,
int maxVolIndexYZ,
76 float blobRadius,
float blobAlpha,
77 float iDeltaSqrt,
float iw0,
float oneOverBessiOrderAlpha) {
90 starpu_execute_on_each_worker(&cuda_set_constants, &constants, STARPU_CUDA);
// NOTE(review): garbled fragment of bessi0Fast (declaration in the trailing
// signature dump). Float literals are split from their 'f' suffixes and the
// original line numbers (103-117) are fused into the code. Visible: the
// numerator polynomial in x2 evaluated with fmaf (Horner scheme) and the last
// fmaf step of the denominator. The signature, the 'den' initialization and
// the final return are missing from this view — do not edit without the
// original text from version control.
103 float num = -0.8436825781374849e-19
f;
104 num = fmaf(num, x2, -0.93466495199548700e-17
f);
105 num = fmaf(num, x2, -0.15716375332511895e-13
f);
106 num = fmaf(num, x2, -0.42520971595532318e-11
f);
107 num = fmaf(num, x2, -0.13704363824102120e-8
f);
108 num = fmaf(num, x2, -0.28508770483148419e-6
f);
109 num = fmaf(num, x2, -0.44322160233346062e-4
f);
110 num = fmaf(num, x2, -0.46703811755736946e-2
f);
111 num = fmaf(num, x2, -0.31112484643702141e-0
f);
112 num = fmaf(num, x2, -0.11512633616429962e+2
f);
113 num = fmaf(num, x2, -0.18720283332732112e+3
f);
114 num = fmaf(num, x2, -0.75281108169006924e+3
f);
117 den = fmaf(den, x2, -0.75281109410939403e+3
f);
// NOTE(review): garbled fragment of bessi0 — modified Bessel function I0,
// Numerical-Recipes-style polynomial approximation. Visible: the |x| < 3.75
// branch test, the small-|x| polynomial in y, and the large-|x| asymptotic
// branch using expf(ax) * rsqrtf(ax). The computation of y in each branch
// (presumably y = (x/3.75)^2 resp. y = 3.75/ax — TODO confirm against VCS)
// and the final return are missing from this view.
125 if ((ax = fabsf(x)) < 3.75
f)
129 ans = 1.f + y * (3.5156229f + y * (3.0899424f + y * (1.2067492f
130 + y * (0.2659732f + y * (0.360768e-1
f + y * 0.45813e-2
f)))));
135 ans = (expf(ax) * rsqrtf(ax)) * (0.39894228
f + y * (0.1328592e-1
f 136 + y * (0.225319e-2
f + y * (-0.157565e-2
f + y * (0.916281e-2
f 137 + y * (-0.2057706e-1
f + y * (0.2635537e-1
f + y * (-0.1647633e-1
f 138 + y * 0.392377e-2
f))))))));
// NOTE(review): garbled fragment of bessi1 — modified Bessel function I1,
// polynomial approximation with small-|x| and asymptotic branches. The
// sign flip for negative x (I1 is odd) is visible in the return. The y
// computations and intermediate lines (148-150, 153-156, 162 of the
// original) are missing from this view — recover from version control.
147 if ((ax = fabsf(x)) < 3.75
f)
151 ans = ax * (0.5f + y * (0.87890594f + y * (0.51498869f + y * (0.15084934f
152 + y * (0.2658733e-1
f + y * (0.301532e-2
f + y * 0.32411e-3
f))))));
157 ans = 0.2282967e-1
f + y * (-0.2895312e-1
f + y * (0.1787654e-1
f 158 - y * 0.420059e-2
f));
159 ans = 0.39894228f + y * (-0.3988024e-1
f + y * (-0.362018e-2
f 160 + y * (0.163801e-2
f + y * (-0.1031555e-1
f + y * ans))));
161 ans *= (expf(ax) * rsqrtf(ax));
163 return x < 0.0 ? -ans : ans;
/**
 * Modified Bessel function of the first kind, order 2.
 * Built from the lower-order approximations via the standard recurrence
 * I_{n+1}(x) = I_{n-1}(x) - (2n/x) * I_n(x), with n = 1.
 * Returns 0 for x == 0, where the recurrence would divide by zero.
 *
 * (Reconstructed from extraction-garbled text; signature taken from the
 * declaration dump in this chunk.)
 */
__host__ __device__ float bessi2(float x) {
    return (x == 0) ? 0 : bessi0(x) - ((2 * 1) / x) * bessi1(x);
}
/**
 * Modified Bessel function of the first kind, order 3.
 * Uses the recurrence I_{n+1}(x) = I_{n-1}(x) - (2n/x) * I_n(x), with n = 2.
 * Returns 0 for x == 0, where the recurrence would divide by zero.
 *
 * (Reconstructed from extraction-garbled text; signature taken from the
 * declaration dump in this chunk.)
 */
__host__ __device__ float bessi3(float x) {
    return (x == 0) ? 0 : bessi1(x) - ((2 * 2) / x) * bessi2(x);
}
/**
 * Modified Bessel function of the first kind, order 4.
 * Uses the recurrence I_{n+1}(x) = I_{n-1}(x) - (2n/x) * I_n(x), with n = 3.
 * Returns 0 for x == 0, where the recurrence would divide by zero.
 *
 * (Reconstructed from extraction-garbled text; signature taken from the
 * declaration dump in this chunk.)
 */
__host__ __device__ float bessi4(float x) {
    return (x == 0) ? 0 : bessi2(x) - ((2 * 3) / x) * bessi3(x);
}
// NOTE(review): garbled fragment of kaiserValue (Kaiser-Bessel blob window).
// Visible: rdas = (r/a)^2, the w = sqrtf(1 - rdas) computation repeated once
// per blob-order case (presumably a switch on 'order' with different bessiN
// calls per case — TODO confirm against VCS), and the out-of-range-order
// diagnostic. The switch structure, the per-order return expressions and the
// r > a handling are missing from this view.
195 float rdas = rda * rda;
203 w = sqrtf (1.
f - rdas);
208 w = sqrtf (1.
f - rdas);
214 w = sqrtf (1.
f - rdas);
220 w = sqrtf (1.
f - rdas);
225 printf(
"order (%d) out of range in kaiser_value(): %s, %d\n", order, __FILE__, __LINE__);
/**
 * Solve the plane equation n . (p - p0) = 0 for the z coordinate of the
 * point p = (x, y, z) lying on the plane with unit normal n through p0.
 * Assumes n.z != 0, i.e. the plane is not parallel to the z axis.
 *
 * (Reconstructed from extraction-garbled text; signature taken from the
 * declaration dump in this chunk.)
 */
__host__ __device__ float getZ(float x, float y, const Point3D<float> &n, const Point3D<float> &p0) {
    return (-n.x * (x - p0.x) - n.y * (y - p0.y)) / n.z + p0.z;
}
/**
 * Solve the plane equation n . (p - p0) = 0 for the y coordinate of the
 * point p = (x, y, z) lying on the plane with unit normal n through p0.
 * Assumes n.y != 0, i.e. the plane is not parallel to the y axis.
 *
 * (Reconstructed from extraction-garbled text; signature taken from the
 * declaration dump in this chunk.)
 */
__host__ __device__ float getY(float x, float z, const Point3D<float> &n, const Point3D<float> &p0) {
    return (-n.x * (x - p0.x) - n.z * (z - p0.z)) / n.y + p0.y;
}
/**
 * Solve the plane equation n . (p - p0) = 0 for the x coordinate of the
 * point p = (x, y, z) lying on the plane with unit normal n through p0.
 * Assumes n.x != 0, i.e. the plane is not parallel to the x axis.
 *
 * (Reconstructed from extraction-garbled text; signature taken from the
 * declaration dump in this chunk.)
 */
__host__ __device__ float getX(float y, float z, const Point3D<float> &n, const Point3D<float> &p0) {
    return (-n.y * (y - p0.y) - n.z * (z - p0.z)) / n.x + p0.x;
}
// NOTE(review): garbled fragments of multiply (apply a 3x3 transform to a
// Point3D in place) and the start of rotate's loop over 8 box corners.
// multiply computes tmp0..tmp2 into temporaries to avoid reading partially
// updated components; the write-back lines (original 280-282, presumably
// inOut.x = tmp0; etc. — TODO confirm against VCS) are missing from view.
277 float tmp0 = transform[0][0] * inOut.
x + transform[0][1] * inOut.
y + transform[0][2] * inOut.
z;
278 float tmp1 = transform[1][0] * inOut.
x + transform[1][1] * inOut.
y + transform[1][2] * inOut.
z;
279 float tmp2 = transform[2][0] * inOut.
x + transform[2][1] * inOut.
y + transform[2][2] * inOut.
z;
298 for (
int i = 0;
i < 8;
i++) {
// NOTE(review): garbled fragment of computeAABB (signature in the trailing
// dump: AABB[2] out, cuboid[8] in). Visible logic: initialize the min corner
// to +INFINITY and the max corner to -INFINITY, fold min/max over the 8
// cuboid corners, then ceil the min corner and floor the max corner so the
// AABB covers whole integer (voxel/pixel) coordinates. The load of 'tmp'
// inside the loop (original line 321) is missing from this view — tmp is
// presumably cuboid[i]; TODO confirm against VCS.
318 AABB[0].
x = AABB[0].
y = AABB[0].
z = INFINITY;
319 AABB[1].
x = AABB[1].
y = AABB[1].
z = -INFINITY;
320 for (
int i = 0;
i < 8;
i++) {
322 if (AABB[0].
x > tmp.
x) AABB[0].
x = tmp.
x;
323 if (AABB[0].
y > tmp.
y) AABB[0].
y = tmp.
y;
324 if (AABB[0].
z > tmp.
z) AABB[0].
z = tmp.
z;
325 if (AABB[1].
x < tmp.
x) AABB[1].
x = tmp.
x;
326 if (AABB[1].
y < tmp.
y) AABB[1].
y = tmp.
y;
327 if (AABB[1].
z < tmp.
z) AABB[1].
z = tmp.
z;
329 AABB[0].
x = ceilf(AABB[0].
x);
330 AABB[0].
y = ceilf(AABB[0].
y);
331 AABB[0].
z = ceilf(AABB[0].
z);
333 AABB[1].
x = floorf(AABB[1].x);
334 AABB[1].
y = floorf(AABB[1].y);
335 AABB[1].
z = floorf(AABB[1].z);
// NOTE(review): garbled fragments of calculateAABB and isWithin. calculateAABB
// builds the 8 corners of this thread block's footprint, expanded by the blob
// radius (c.cBlobRadius), choosing which two axes map to blockIdx.x/y based
// on the traversal direction (XY, XZ, else the YZ case at original line 393).
// The third-coordinate computations (via the plane-intersection helpers),
// the rotate/computeAABB calls and the function boundaries are missing from
// this view. The tail (original 420/422) is part of isWithin — it checks the
// AABB's min corner against the image size; the remaining conditions
// (original 421/423) are missing.
356 if (tSpace->
XY == tSpace->
dir) {
357 box[0].
x = box[3].
x = box[4].
x = box[7].
x = blockIdx.x*blockDim.x - c.
cBlobRadius;
358 box[1].
x = box[2].
x = box[5].
x = box[6].
x = (blockIdx.x+1)*blockDim.x + c.
cBlobRadius - 1.f;
360 box[2].
y = box[3].
y = box[6].
y = box[7].
y = (blockIdx.y+1)*blockDim.y + c.
cBlobRadius - 1.f;
361 box[0].
y = box[1].
y = box[4].
y = box[5].
y = blockIdx.y*blockDim.y- c.
cBlobRadius;
374 }
else if (tSpace->
XZ == tSpace->
dir) {
375 box[0].
x = box[3].
x = box[4].
x = box[7].
x = blockIdx.x*blockDim.x - c.
cBlobRadius;
376 box[1].
x = box[2].
x = box[5].
x = box[6].
x = (blockIdx.x+1)*blockDim.x + c.
cBlobRadius - 1.f;
378 box[2].
z = box[3].
z = box[6].
z = box[7].
z = (blockIdx.y+1)*blockDim.y + c.
cBlobRadius - 1.f;
379 box[0].
z = box[1].
z = box[4].
z = box[5].
z = blockIdx.y*blockDim.y - c.
cBlobRadius;
393 box[0].
y = box[3].
y = box[4].
y = box[7].
y = blockIdx.x*blockDim.x - c.
cBlobRadius;
394 box[1].
y = box[2].
y = box[5].
y = box[6].
y = (blockIdx.x+1)*blockDim.x + c.
cBlobRadius - 1.f;
396 box[2].
z = box[3].
z = box[6].
z = box[7].
z = (blockIdx.y+1)*blockDim.y + c.
cBlobRadius - 1.f;
397 box[0].
z = box[1].
z = box[4].
z = box[5].
z = blockIdx.y*blockDim.y- c.
cBlobRadius;
420 return (AABB[0].
x < imgXSize)
422 && (AABB[0].
y < imgYSize)
// NOTE(review): garbled fragment of the GPU processVoxel — fast (nearest
// neighbor) path. Visible: parameter fragments, per-space data weight,
// rejection of negative-x image positions, clamp of the rounded x index,
// the 2D FFT sample index, and the three atomicAdd accumulations into the
// volume and weight buffers. The imgPos computation, imgY/index3D and the
// blob weight (wBlob) source are missing from this view.
436 float2* tempVolumeGPU,
float* tempWeightsGPU,
438 int xSize,
int ySize,
439 const float2* __restrict__ FFT,
445 float dataWeight = space->
weight;
456 if (imgPos.
x < 0.f)
return;
460 int imgX = clamp((
int)(imgPos.
x + 0.5f), 0, xSize - 1);
464 int index2D = imgY * xSize + imgX;
466 float weight = wBlob * dataWeight;
469 atomicAdd(&tempVolumeGPU[index3D].x, FFT[index2D].x * weight);
470 atomicAdd(&tempVolumeGPU[index3D].y, FFT[index2D].y * weight);
471 atomicAdd(&tempWeightsGPU[index3D], weight);
// NOTE(review): garbled fragment of the GPU processVoxelBlob. Visible logic:
// early-out when the z distance alone exceeds the blob radius squared; clamp
// the 2D window to the image; accumulate weighted FFT samples over the
// window, skipping samples outside the blob radius; index into the shared
// image cache (IMG / SHARED_AABB path) or directly into FFT depending on
// compile-time config (#if fragments at original 541/543 are fused onto one
// line here); finally atomicAdd the accumulated complex value. The imgPos
// computation, min/max window derivation, the blob-weight computation paths
// and the weights-buffer accumulation are missing from this view.
479 template<
int blobOrder,
bool useFastKaiser>
482 float2* tempVolumeGPU,
float *tempWeightsGPU,
483 const int x,
const int y,
const int z,
484 const int xSize,
const int ySize,
485 const float2* __restrict__ FFT,
487 const float* blobTableSqrt,
488 const int imgCacheDim)
506 float zSqr = imgPos.
z * imgPos.
z;
507 if (zSqr > radiusSqr)
return;
514 minX = fmaxf(minX, 0);
515 minY = fmaxf(minY, 0);
516 maxX = fminf(maxX, xSize-1);
517 maxY = fminf(maxY, ySize-1);
522 vol.x = vol.y = w = 0.f;
523 float dataWeight = space->
weight;
526 for (
int i = minY;
i <= maxY;
i++) {
527 float ySqr = (imgPos.
y -
i) * (imgPos.
y -
i);
528 float yzSqr = ySqr + zSqr;
529 if (yzSqr > radiusSqr)
continue;
530 for (
int j = minX;
j <= maxX;
j++) {
531 float xD = imgPos.
x -
j;
532 float distanceSqr = xD*xD + yzSqr;
533 if (distanceSqr > radiusSqr)
continue;
536 int index2D = (
i - SHARED_AABB[0].y) * imgCacheDim + (
j-SHARED_AABB[0].x);
538 int index2D =
i * xSize +
j;
541 #if PRECOMPUTE_BLOB_VAL 543 #if SHARED_BLOB_TABLE 546 float wBlob = blobTableSqrt[aux];
557 float weight = wBlob * dataWeight;
560 vol += IMG[index2D] * weight;
562 vol += FFT[index2D] * weight;
568 atomicAdd(&tempVolumeGPU[index3D].x, vol.x);
569 atomicAdd(&tempVolumeGPU[index3D].y, vol.y);
// NOTE(review): garbled fragment of the GPU processProjection. Visible logic:
// two alternative thread-index schemes (a TILE-remapped tidX/tidY scheme and
// a plain volatile idx/idy scheme — presumably selected by a preprocessor
// switch not visible here); then per traversal direction (XY / XZ / else YZ),
// bounds checks against the traverse space, and either the fast path
// (round the plane-intersection coordinate and call processVoxel) or the
// blob path (loop the third coordinate over [floor(min), ceil(max)] of the
// two plane hits and call processVoxelBlob). The hitZ/hitY/hitX and z1/z2
// etc. computations are missing from this view.
577 template<
bool useFast,
int blobOrder,
bool useFastKaiser>
580 float2* tempVolumeGPU,
float *tempWeightsGPU,
581 const int xSize,
const int ySize,
582 const float2* __restrict__ FFT,
584 const float* blobTableSqrt,
585 const int imgCacheDim)
589 int id = threadIdx.y * blockDim.x + threadIdx.x;
590 int tidX = threadIdx.x %
TILE + (
id / (blockDim.y *
TILE)) *
TILE;
591 int tidY = (
id /
TILE) % blockDim.y;
592 int idx = blockIdx.x * blockDim.x + tidX;
593 int idy = blockIdx.y * blockDim.y + tidY;
596 volatile int idx = blockIdx.x*blockDim.x + threadIdx.x;
597 volatile int idy = blockIdx.y*blockDim.y + threadIdx.y;
600 if (tSpace->
XY == tSpace->
dir) {
601 if (idy >= tSpace->
minY && idy <= tSpace->maxY) {
602 if (idx >= tSpace->
minX && idx <= tSpace->maxX) {
605 int z = (int)(hitZ + 0.5
f);
606 processVoxel(tempVolumeGPU, tempWeightsGPU, idx, idy, z, xSize, ySize, FFT, tSpace);
612 int lower =
static_cast<int>(floorf(fminf(z1, z2)));
613 int upper =
static_cast<int>(ceilf(fmaxf(z1, z2)));
614 for (
int z = lower;
z <= upper;
z++) {
615 processVoxelBlob<blobOrder, useFastKaiser>(tempVolumeGPU, tempWeightsGPU, idx, idy,
z, xSize, ySize, FFT, tSpace, blobTableSqrt, imgCacheDim);
620 }
else if (tSpace->
XZ == tSpace->
dir) {
621 if (idy >= tSpace->
minZ && idy <= tSpace->maxZ) {
622 if (idx >= tSpace->
minX && idx <= tSpace->maxX) {
625 int y = (int)(hitY + 0.5
f);
626 processVoxel(tempVolumeGPU, tempWeightsGPU, idx, y, idy, xSize, ySize, FFT, tSpace);
632 int lower =
static_cast<int>(floorf(fminf(y1, y2)));
633 int upper =
static_cast<int>(ceilf(fmaxf(y1, y2)));
634 for (
int y = lower;
y <= upper;
y++) {
635 processVoxelBlob<blobOrder, useFastKaiser>(tempVolumeGPU, tempWeightsGPU, idx,
y, idy, xSize, ySize, FFT, tSpace, blobTableSqrt, imgCacheDim);
641 if (idy >= tSpace->
minZ && idy <= tSpace->maxZ) {
642 if (idx >= tSpace->
minY && idx <= tSpace->maxY) {
645 int x = (int)(hitX + 0.5
f);
646 processVoxel(tempVolumeGPU, tempWeightsGPU, x, idx, idy, xSize, ySize, FFT, tSpace);
652 int lower =
static_cast<int>(floorf(fminf(x1, x2)));
653 int upper =
static_cast<int>(ceilf(fmaxf(x1, x2)));
654 for (
int x = lower;
x <= upper;
x++) {
655 processVoxelBlob<blobOrder, useFastKaiser>(tempVolumeGPU, tempWeightsGPU,
x, idx, idy, xSize, ySize, FFT, tSpace, blobTableSqrt, imgCacheDim);
// NOTE(review): garbled fragments of getImgData and copyImgToCache.
// getImgData: translate a cache-local (tXindex, tYindex) into image
// coordinates by offsetting with the AABB min corner, then either fetch from
// the imgIndex-th FFT (bounds-checked against fftSizeX/fftSizeY) or zero-fill
// vComplex. The lower-bound conditions (original 676/678) are missing from
// this view. copyImgToCache: block-cooperative 2D strided copy of the AABB
// window into the shared cache via getImgData; whether a __syncthreads()
// follows is not visible here — TODO confirm against VCS.
671 const int tXindex,
const int tYindex,
672 const float2* FFTs,
const int fftSizeX,
const int fftSizeY,
const int imgIndex,
674 int imgXindex = tXindex +
static_cast<int>(AABB[0].
x);
675 int imgYindex = tYindex +
static_cast<int>(AABB[0].
y);
677 && (imgXindex < fftSizeX)
679 && (imgYindex < fftSizeY)) {
680 int index = imgYindex * fftSizeX + imgXindex;
681 vComplex = (FFTs + fftSizeX * fftSizeY * imgIndex)[index];
683 vComplex = {0.f, 0.f};
696 const float2* FFTs,
const int fftSizeX,
const int fftSizeY,
const int imgIndex,
697 const int imgCacheDim) {
698 for (
int y = threadIdx.y;
y < imgCacheDim;
y += blockDim.y) {
699 for (
int x = threadIdx.x;
x < imgCacheDim;
x += blockDim.x) {
700 int memIndex =
y * imgCacheDim +
x;
701 getImgData(AABB, x,
y, FFTs, fftSizeX, fftSizeY, imgIndex, dest[memIndex]);
// NOTE(review): garbled fragment of the processBufferKernel __global__ entry.
// Visible logic: when SHARED_BLOB_TABLE is enabled and not fastLateBlobbing,
// the block cooperatively loads the blob table into shared memory (id /
// blockSize fragments); a grid-stride loop over traverse spaces using
// blockIdx.z / gridDim.z; per-space setup done by thread (0,0); an isWithin
// guard on the shared AABB; and dispatch to processProjection. The shared
// table copy loop, the __syncthreads() calls and the per-space AABB/image
// cache setup are missing from this view.
710 template<
bool fastLateBlobbing,
int blobOrder,
bool useFastKaiser>
713 float2* outVolumeBuffer,
float *outWeightsBuffer,
714 const int fftSizeX,
const int fftSizeY,
717 const float* blobTableSqrt,
720 #if SHARED_BLOB_TABLE 721 if ( ! fastLateBlobbing) {
723 volatile int id = threadIdx.y*blockDim.x + threadIdx.x;
724 volatile int blockSize = blockDim.x * blockDim.y;
731 for (
int i = blockIdx.z;
i < traverseSpaceCount;
i += gridDim.z) {
735 if ( ! fastLateBlobbing) {
739 if ((threadIdx.x == 0) && (threadIdx.y == 0)) {
745 if (
isWithin(SHARED_AABB, fftSizeX, fftSizeY)) {
755 processProjection<fastLateBlobbing, blobOrder, useFastKaiser>(
756 outVolumeBuffer, outWeightsBuffer,
// NOTE(review): garbled fragment of the host-side processBufferGPU launcher.
// Visible logic: imgCacheDim = ceil(sqrt(2) * sqrt(3) * (BLOCK_DIM +
// 2 * blobRadius)) — worst-case footprint of a rotated block plus blob
// margin; grid sized by ceil-div of size2D = maxVolIndexYZ + 1; the kernel is
// launched on the StarPU-local CUDA stream, with dynamic shared memory for
// the image cache only on the non-fast (blob) path when SHARED_IMG is set.
// TODO(review): no cudaGetLastError() after the launches is visible in this
// chunk — confirm error checking exists in the full file.
// Note: 'ceil'/'sqrt' here are the double overloads split by garbling from
// what were presumably float literals — recover exact text from VCS.
773 template<
int blobOrder,
bool useFastKaiser>
775 float2 *outVolumeBuffer,
float *outWeightsBuffer,
776 const int fftSizeX,
const int fftSizeY,
778 const float2 *inFFTs,
779 const float *blobTableSqrt,
780 const bool fastLateBlobbing,
781 const float blobRadius,
const int maxVolIndexYZ) {
784 const int imgCacheDim =
static_cast<int>(ceil(
sqrt(2.
f) *
sqrt(3.
f) * (
BLOCK_DIM + 2 * blobRadius)));
787 const int size2D = maxVolIndexYZ + 1;
788 dim3 dimGrid(static_cast<unsigned int>(ceil(size2D / (
float)dimBlock.x)),
789 static_cast<unsigned int>(ceil(size2D / (
float)dimBlock.y)),
793 if (fastLateBlobbing) {
794 processBufferKernel<true, blobOrder,useFastKaiser><<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>>(
795 outVolumeBuffer, outWeightsBuffer,
797 traverseSpaceCount, traverseSpaces,
803 int sharedMemSize =
SHARED_IMG ? (imgCacheDim*imgCacheDim*
sizeof(float2)) : 0;
804 processBufferKernel<false, blobOrder,useFastKaiser><<<dimGrid, dimBlock, sharedMemSize, starpu_cuda_get_local_stream()>>>(
805 outVolumeBuffer, outWeightsBuffer,
807 traverseSpaceCount, traverseSpaces,
// NOTE(review): garbled fragment of the StarPU CUDA codelet entry
// func_reconstruct_cuda. Visible: unpacking of the StarPU buffer handles
// (FFTs, blob table, output volume/weights, image count) and the dispatch to
// processBufferGPU template instantiations per (blobOrder, useFastKaiser).
// The surrounding switch/if dispatch conditions and the cl_arg unpacking
// are missing from this view.
817 const float2* inFFTs = (float2*)STARPU_VECTOR_GET_PTR(buffers[0]);
819 const float* inBlobTableSqrt = (
float*)(STARPU_VECTOR_GET_PTR(buffers[2]));
820 float2* outVolumeBuffer = (float2*)(STARPU_VECTOR_GET_PTR(buffers[3]));
821 float* outWeightsBuffer = (
float*)(STARPU_VECTOR_GET_PTR(buffers[4]));
822 const uint32_t noOfImages = ((
LoadedImagesBuffer*) STARPU_VARIABLE_GET_PTR(buffers[5]))->noOfImages;
827 processBufferGPU<0, true>(outVolumeBuffer, outWeightsBuffer,
835 processBufferGPU<0, false>(outVolumeBuffer, outWeightsBuffer,
845 processBufferGPU<1, false>(outVolumeBuffer, outWeightsBuffer,
854 processBufferGPU<2, false>(outVolumeBuffer, outWeightsBuffer,
863 processBufferGPU<3, false>(outVolumeBuffer, outWeightsBuffer,
872 processBufferGPU<4, false>(outVolumeBuffer, outWeightsBuffer,
// NOTE(review): garbled fragment of the CPU-side atomicAddFloat. Visible:
// a static_assert that float is 32-bit, a reinterpret_cast of the raw
// pointer to a volatile std::atomic<float>, a relaxed load, the new-value
// computation and the compare_exchange_weak step. The retry-loop construct
// wrapping the CAS (and the success break/return) is missing from this view.
893 static_assert(
sizeof(
float) ==
sizeof(uint32_t),
"atomicAddFloat requires floats to be 32bit");
898 volatile std::atomic<float>& atomicPtr = *
reinterpret_cast<volatile std::atomic<float>*
>(ptr);
899 float current = atomicPtr.load(std::memory_order::memory_order_relaxed);
901 const float newValue = current + addedValue;
903 if (atomicPtr.compare_exchange_weak(current, newValue, std::memory_order::memory_order_relaxed)) {
// NOTE(review): garbled fragment of processVoxelCPU — the CPU mirror of the
// GPU fast path. Same structure: per-space weight, reject negative-x image
// positions, clamp the rounded x index, 2D sample index, then accumulate via
// atomicAddFloat (the weights-buffer accumulation and imgPos/index3D
// computations are missing from this view).
912 float2*
const tempVolumeGPU,
float*
const tempWeightsGPU,
913 const int x,
const int y,
const int z,
914 const int xSize,
const int ySize,
915 const float2*
const __restrict__ FFT,
921 float dataWeight = space->
weight;
932 if (imgPos.
x < 0.f)
return;
936 int imgX = clamp((
int)(imgPos.
x + 0.5f), 0, xSize - 1);
940 int index2D = imgY * xSize + imgX;
942 float weight = wBlob * dataWeight;
945 atomicAddFloat(&tempVolumeGPU[index3D].x, FFT[index2D].x * weight);
946 atomicAddFloat(&tempVolumeGPU[index3D].y, FFT[index2D].y * weight);
// NOTE(review): garbled fragment of processVoxelBlobCPU — CPU mirror of the
// GPU blob path, with the blob-weight source chosen at compile time:
// precomputed table lookup (cpuC.cIDeltaSqrt scaling), fast Kaiser, or the
// generic kaiserValue<blobOrder> scaled by cpuC.cIw0. Same window-clamping
// and radius-rejection structure as the GPU version. The imgPos computation,
// the kaiserValueFast branch body, the weights accumulation and the final
// write-back are missing from this view.
953 template<
int blobOrder,
bool useFastKaiser,
bool usePrecomputedInterpolation>
955 float2*
const tempVolumeGPU,
float*
const tempWeightsGPU,
956 const int x,
const int y,
const int z,
957 const int xSize,
const int ySize,
958 const float2*
const __restrict__ FFT,
960 const float* blobTableSqrt)
978 float zSqr = imgPos.
z * imgPos.
z;
979 if (zSqr > radiusSqr)
return;
986 minX = fmaxf(minX, 0);
987 minY = fmaxf(minY, 0);
988 maxX = fminf(maxX, xSize-1);
989 maxY = fminf(maxY, ySize-1);
994 vol.x = vol.y = w = 0.f;
995 float dataWeight = space->
weight;
998 for (
int i = minY;
i <= maxY;
i++) {
999 float ySqr = (imgPos.
y -
i) * (imgPos.
y -
i);
1000 float yzSqr = ySqr + zSqr;
1001 if (yzSqr > radiusSqr)
continue;
1002 for (
int j = minX;
j <= maxX;
j++) {
1003 float xD = imgPos.
x -
j;
1004 float distanceSqr = xD*xD + yzSqr;
1005 if (distanceSqr > radiusSqr)
continue;
1007 int index2D =
i * xSize +
j;
1010 if (usePrecomputedInterpolation) {
1011 int aux = (int) ((distanceSqr * cpuC.
cIDeltaSqrt + 0.5f));
1012 wBlob = blobTableSqrt[aux];
1013 }
else if (useFastKaiser) {
1016 wBlob = kaiserValue<blobOrder>(sqrtf(distanceSqr), cpuC.
cBlobRadius) * cpuC.
cIw0;
1019 float weight = wBlob * dataWeight;
1021 vol += FFT[index2D] * weight;
// NOTE(review): garbled fragment of processProjectionCPU — the CPU mirror of
// the GPU processProjection, with sequential for-loops over the traverse
// space's index ranges instead of thread indices. Per traversal direction
// (XY / XZ / else YZ): fast path rounds the plane hit and calls
// processVoxelCPU; blob path loops the third coordinate over
// [floor(min), ceil(max)] of the two plane hits and calls
// processVoxelBlobCPU. The hit*/x1..z2 computations are missing from view.
1033 template<
bool useFast,
int blobOrder,
bool useFastKaiser,
bool usePrecomputedInterpolation>
1035 float2* tempVolumeGPU,
float *tempWeightsGPU,
1036 const int xSize,
const int ySize,
1037 const float2* __restrict__ FFT,
1039 const float* blobTableSqrt) {
1041 if (tSpace->
XY == tSpace->
dir) {
1042 for (
int idy = tSpace->
minY; idy <= tSpace->maxY; idy++) {
1043 for (
int idx = tSpace->
minX; idx <= tSpace->maxX; idx++) {
1046 int z = (int)(hitZ + 0.5
f);
1047 processVoxelCPU(tempVolumeGPU, tempWeightsGPU, idx, idy, z, xSize, ySize, FFT, tSpace);
1053 int lower =
static_cast<int>(floorf(fminf(z1, z2)));
1054 int upper =
static_cast<int>(ceilf(fmaxf(z1, z2)));
1055 for (
int z = lower;
z <= upper;
z++) {
1056 processVoxelBlobCPU<blobOrder, useFastKaiser, usePrecomputedInterpolation>(tempVolumeGPU, tempWeightsGPU, idx, idy,
z, xSize, ySize, FFT, tSpace, blobTableSqrt);
1061 }
else if (tSpace->
XZ == tSpace->
dir) {
1062 for (
int idy = tSpace->
minZ; idy <= tSpace->maxZ; idy++) {
1063 for (
int idx = tSpace->
minX; idx <= tSpace->maxX; idx++) {
1066 int y = (int)(hitY + 0.5
f);
1067 processVoxelCPU(tempVolumeGPU, tempWeightsGPU, idx, y, idy, xSize, ySize, FFT, tSpace);
1073 int lower =
static_cast<int>(floorf(fminf(y1, y2)));
1074 int upper =
static_cast<int>(ceilf(fmaxf(y1, y2)));
1075 for (
int y = lower;
y <= upper;
y++) {
1076 processVoxelBlobCPU<blobOrder, useFastKaiser, usePrecomputedInterpolation>(tempVolumeGPU, tempWeightsGPU, idx,
y, idy, xSize, ySize, FFT, tSpace, blobTableSqrt);
1082 for (
int idy = tSpace->
minZ; idy <= tSpace->maxZ; idy++) {
1083 for (
int idx = tSpace->
minY; idx <= tSpace->maxY; idx++) {
1086 int x = (int)(hitX + 0.5
f);
1087 processVoxelCPU(tempVolumeGPU, tempWeightsGPU, x, idx, idy, xSize, ySize, FFT, tSpace);
1093 int lower =
static_cast<int>(floorf(fminf(x1, x2)));
1094 int upper =
static_cast<int>(ceilf(fmaxf(x1, x2)));
1095 for (
int x = lower;
x <= upper;
x++) {
1096 processVoxelBlobCPU<blobOrder, useFastKaiser, usePrecomputedInterpolation>(tempVolumeGPU, tempWeightsGPU,
x, idx, idy, xSize, ySize, FFT, tSpace, blobTableSqrt);
// NOTE(review): garbled fragment of processBufferCPU. Visible logic: split
// the traverse spaces across the StarPU combined worker by (rank, size)
// strided loop; offset into inFFTs by the space's projectionIndex; then
// dispatch to processProjectionCPU<true/false, ...> depending on
// fastLateBlobbing. The loop body's 'space' binding and call arguments are
// partially missing from this view.
1104 template<
int blobOrder,
bool useFastKaiser,
bool usePrecomputedInterpolation>
1106 float2 *outVolumeBuffer,
float *outWeightsBuffer,
1107 const int fftSizeX,
const int fftSizeY,
1109 const float2 *inFFTs,
1110 const float *blobTableSqrt,
1111 const bool fastLateBlobbing) {
1113 const int groupSize = starpu_combined_worker_get_size();
1114 const int groupRank = starpu_combined_worker_get_rank();
1116 for (
int i = groupRank;
i < traverseSpaceCount;
i += groupSize) {
1119 const float2* spaceFFT = inFFTs + fftSizeX * fftSizeY * space.
projectionIndex;
1122 if (fastLateBlobbing) {
1123 processProjectionCPU<true, blobOrder, useFastKaiser, usePrecomputedInterpolation>(
1124 outVolumeBuffer, outWeightsBuffer,
1130 processProjectionCPU<false, blobOrder, useFastKaiser, usePrecomputedInterpolation>(
1131 outVolumeBuffer, outWeightsBuffer,
// NOTE(review): garbled fragments of func_reconstruct_cpu_template (unpacks
// StarPU buffers and dispatches to processBufferCPU per blobOrder /
// useFastKaiser) and of the two public entry points that instantiate it with
// usePrecomputedInterpolation = true (lookup) / false (dynamic). Dispatch
// conditions between the instantiation calls are missing from this view.
1140 template<
bool usePrecomputedInterpolation>
1143 const float2* inFFTs = (float2*)STARPU_VECTOR_GET_PTR(buffers[0]);
1145 const float* inBlobTableSqrt = (
float*)(STARPU_VECTOR_GET_PTR(buffers[2]));
1146 float2* outVolumeBuffer = (float2*)(STARPU_VECTOR_GET_PTR(buffers[3]));
1147 float* outWeightsBuffer = (
float*)(STARPU_VECTOR_GET_PTR(buffers[4]));
1148 const uint32_t noOfImages = ((
LoadedImagesBuffer*) STARPU_VARIABLE_GET_PTR(buffers[5]))->noOfImages;
1153 processBufferCPU<0, true, usePrecomputedInterpolation>(outVolumeBuffer, outWeightsBuffer,
1160 processBufferCPU<0, false, usePrecomputedInterpolation>(outVolumeBuffer, outWeightsBuffer,
1169 processBufferCPU<1, false, usePrecomputedInterpolation>(outVolumeBuffer, outWeightsBuffer,
1177 processBufferCPU<2, false, usePrecomputedInterpolation>(outVolumeBuffer, outWeightsBuffer,
1185 processBufferCPU<3, false, usePrecomputedInterpolation>(outVolumeBuffer, outWeightsBuffer,
1193 processBufferCPU<4, false, usePrecomputedInterpolation>(outVolumeBuffer, outWeightsBuffer,
1206 func_reconstruct_cpu_template<true>(buffers, cl_arg);
1210 func_reconstruct_cpu_template<false>(buffers, cl_arg);
enum RecFourierProjectionTraverseSpace::Direction dir
__device__ void processVoxel(float2 *tempVolumeGPU, float *tempWeightsGPU, int x, int y, int z, int xSize, int ySize, const float2 *__restrict__ FFT, const RecFourierProjectionTraverseSpace *const space)
__host__ __device__ float kaiserValue(float r, float a)
#define REPORT_ERROR(nerr, ErrormMsg)
void processVoxelBlobCPU(float2 *const tempVolumeGPU, float *const tempWeightsGPU, const int x, const int y, const int z, const int xSize, const int ySize, const float2 *const __restrict__ FFT, const RecFourierProjectionTraverseSpace *const space, const float *blobTableSqrt)
__device__ void processVoxelBlob(float2 *tempVolumeGPU, float *tempWeightsGPU, const int x, const int y, const int z, const int xSize, const int ySize, const float2 *__restrict__ FFT, const RecFourierProjectionTraverseSpace *const space, const float *blobTableSqrt, const int imgCacheDim)
void sqrt(Image< double > &op)
void func_reconstruct_cpu_template(void *buffers[], void *cl_arg)
__host__ __device__ float bessi3(float x)
__shared__ float BLOB_TABLE[BLOB_TABLE_SIZE_SQRT]
__host__ __device__ float getZ(float x, float y, const Point3D< float > &n, const Point3D< float > &p0)
__device__ __constant__ CodeletConstants gpuC
__global__ void processBufferKernel(float2 *outVolumeBuffer, float *outWeightsBuffer, const int fftSizeX, const int fftSizeY, const int traverseSpaceCount, const RecFourierProjectionTraverseSpace *traverseSpaces, const float2 *FFTs, const float *blobTableSqrt, int imgCacheDim)
float cOneOverBessiOrderAlpha
void reconstruct_cuda_initialize_constants(int maxVolIndexX, int maxVolIndexYZ, float blobRadius, float blobAlpha, float iDeltaSqrt, float iw0, float oneOverBessiOrderAlpha)
#define BLOB_TABLE_SIZE_SQRT
void processBufferCPU(float2 *outVolumeBuffer, float *outWeightsBuffer, const int fftSizeX, const int fftSizeY, const int traverseSpaceCount, const RecFourierProjectionTraverseSpace *traverseSpaces, const float2 *inFFTs, const float *blobTableSqrt, const bool fastLateBlobbing)
float cOneOverBlobRadiusSqr
__device__ void calculateAABB(const RecFourierProjectionTraverseSpace *tSpace, const RecFourierBufferDataGPU *buffer, Point3D< float > *dest)
void atomicAddFloat(volatile float *ptr, float addedValue)
if(fabs(c[*nmax+ *nmax *c_dim1])==0.e0)
__host__ __device__ float bessi2(float x)
__host__ __device__ float getY(float x, float z, const Point3D< float > &n, const Point3D< float > &p0)
__host__ __device__ float bessi1(float x)
__host__ __device__ float kaiserValueFast(float distSqr)
Point3D< float > bottomOrigin
void processProjectionCPU(float2 *tempVolumeGPU, float *tempWeightsGPU, const int xSize, const int ySize, const float2 *__restrict__ FFT, const RecFourierProjectionTraverseSpace *const tSpace, const float *blobTableSqrt)
Point3D< float > topOrigin
__host__ __device__ void rotate(Point3D< float > box[8], const float transform[3][3])
__host__ __device__ void multiply(const float transform[3][3], Point3D< float > &inOut)
void func_reconstruct_cuda(void *buffers[], void *cl_arg)
__host__ __device__ float bessi4(float x)
__host__ __device__ float bessi0(float x)
__device__ void copyImgToCache(float2 *dest, const Point3D< float > AABB[2], const float2 *FFTs, const int fftSizeX, const int fftSizeY, const int imgIndex, const int imgCacheDim)
Point3D< float > unitNormal
void processBufferGPU(float2 *outVolumeBuffer, float *outWeightsBuffer, const int fftSizeX, const int fftSizeY, const int traverseSpaceCount, const RecFourierProjectionTraverseSpace *traverseSpaces, const float2 *inFFTs, const float *blobTableSqrt, const bool fastLateBlobbing, const float blobRadius, const int maxVolIndexYZ)
__device__ bool isWithin(Point3D< float > *AABB, int imgXSize, int imgYSize)
void func_reconstruct_cpu_lookup_interpolation(void *buffers[], void *cl_arg)
void func_reconstruct_cpu_dynamic_interpolation(void *buffers[], void *cl_arg)
__device__ double atomicAdd(double *address, double val)
__device__ void processProjection(float2 *tempVolumeGPU, float *tempWeightsGPU, const int xSize, const int ySize, const float2 *__restrict__ FFT, const RecFourierProjectionTraverseSpace *const tSpace, const float *blobTableSqrt, const int imgCacheDim)
__device__ void getImgData(const Point3D< float > AABB[2], const int tXindex, const int tYindex, const float2 *FFTs, const int fftSizeX, const int fftSizeY, const int imgIndex, float2 &vComplex)
__host__ __device__ float bessi0Fast(float x)
Incorrect value received.
void processVoxelCPU(float2 *const tempVolumeGPU, float *const tempWeightsGPU, const int x, const int y, const int z, const int xSize, const int ySize, const float2 *const __restrict__ FFT, const RecFourierProjectionTraverseSpace *const space)
__host__ __device__ void computeAABB(Point3D< float > AABB[2], const Point3D< float > cuboid[8])
__host__ __device__ float getX(float y, float z, const Point3D< float > &n, const Point3D< float > &p0)