#include <atomic>
#include <core/multidim_array.h>
#include <core/xmipp_fft.h>
#include <core/xmipp_fftw.h>
#include <reconstruction/reconstruct_fourier_projection_traverse_space.h>
#include <reconstruction_cuda/cuda_basic_math.h>
#include <reconstruction_cuda/cuda_xmipp_utils.h>
#include <reconstruction_cuda/cuda_asserts.h>
#include <cuda_runtime_api.h>
#include <starpu.h>
#include "reconstruct_fourier_codelets.h"
#include "reconstruct_fourier_defines.h"

Include dependency graph for reconstruct_fourier_codelet_reconstruct.cpp:

Classes
struct	CodeletConstants

Functions
void	reconstruct_cuda_initialize_constants (int maxVolIndexX, int maxVolIndexYZ, float blobRadius, float blobAlpha, float iDeltaSqrt, float iw0, float oneOverBessiOrderAlpha)

__host__ __device__ float	bessi0Fast (float x)

__host__ __device__ float	bessi0 (float x)

__host__ __device__ float	bessi1 (float x)

__host__ __device__ float	bessi2 (float x)

__host__ __device__ float	bessi3 (float x)

__host__ __device__ float	bessi4 (float x)

template<int order>
__host__ __device__ float	kaiserValue (float r, float a)

__host__ __device__ float	kaiserValueFast (float distSqr)

__host__ __device__ float	getZ (float x, float y, const Point3D< float > &n, const Point3D< float > &p0)

__host__ __device__ float	getY (float x, float z, const Point3D< float > &n, const Point3D< float > &p0)

__host__ __device__ float	getX (float y, float z, const Point3D< float > &n, const Point3D< float > &p0)

__host__ __device__ void	multiply (const float transform[3][3], Point3D< float > &inOut)

__host__ __device__ void	rotate (Point3D< float > box[8], const float transform[3][3])

__host__ __device__ void	computeAABB (Point3D< float > AABB[2], const Point3D< float > cuboid[8])

__device__ void	processVoxel (float2 tempVolumeGPU, float tempWeightsGPU, int x, int y, int z, int xSize, int ySize, const float2 __restrict__ FFT, const RecFourierProjectionTraverseSpace const space)

template<int blobOrder, bool useFastKaiser>
__device__ void	processVoxelBlob (float2 tempVolumeGPU, float tempWeightsGPU, const int x, const int y, const int z, const int xSize, const int ySize, const float2 __restrict__ FFT, const RecFourierProjectionTraverseSpace const space, const float *blobTableSqrt, const int imgCacheDim)

template<bool useFast, int blobOrder, bool useFastKaiser>
__device__ void	processProjection (float2 *tempVolumeGPU, float *tempWeightsGPU, const int xSize, const int ySize, const float2 *__restrict__ FFT, const RecFourierProjectionTraverseSpace *const tSpace, const float *blobTableSqrt, const int imgCacheDim)

__device__ void	getImgData (const Point3D< float > AABB[2], const int tXindex, const int tYindex, const float2 *FFTs, const int fftSizeX, const int fftSizeY, const int imgIndex, float2 &vComplex)

__device__ void	copyImgToCache (float2 *dest, const Point3D< float > AABB[2], const float2 *FFTs, const int fftSizeX, const int fftSizeY, const int imgIndex, const int imgCacheDim)

template<bool fastLateBlobbing, int blobOrder, bool useFastKaiser>
__global__ void	processBufferKernel (float2 *outVolumeBuffer, float *outWeightsBuffer, const int fftSizeX, const int fftSizeY, const int traverseSpaceCount, const RecFourierProjectionTraverseSpace *traverseSpaces, const float2 *FFTs, const float *blobTableSqrt, int imgCacheDim)

template<int blobOrder, bool useFastKaiser>
void	processBufferGPU (float2 *outVolumeBuffer, float *outWeightsBuffer, const int fftSizeX, const int fftSizeY, const int traverseSpaceCount, const RecFourierProjectionTraverseSpace *traverseSpaces, const float2 *inFFTs, const float *blobTableSqrt, const bool fastLateBlobbing, const float blobRadius, const int maxVolIndexYZ)

void	func_reconstruct_cuda (void *buffers[], void *cl_arg)

void	atomicAddFloat (volatile float *ptr, float addedValue)

void	processVoxelCPU (float2 const tempVolumeGPU, float const tempWeightsGPU, const int x, const int y, const int z, const int xSize, const int ySize, const float2 const __restrict__ FFT, const RecFourierProjectionTraverseSpace const space)

template<int blobOrder, bool useFastKaiser, bool usePrecomputedInterpolation>
void	processVoxelBlobCPU (float2 const tempVolumeGPU, float const tempWeightsGPU, const int x, const int y, const int z, const int xSize, const int ySize, const float2 const __restrict__ FFT, const RecFourierProjectionTraverseSpace const space, const float *blobTableSqrt)

template<bool useFast, int blobOrder, bool useFastKaiser, bool usePrecomputedInterpolation>
void	processProjectionCPU (float2 tempVolumeGPU, float tempWeightsGPU, const int xSize, const int ySize, const float2 __restrict__ FFT, const RecFourierProjectionTraverseSpace const tSpace, const float *blobTableSqrt)

template<int blobOrder, bool useFastKaiser, bool usePrecomputedInterpolation>
void	processBufferCPU (float2 *outVolumeBuffer, float *outWeightsBuffer, const int fftSizeX, const int fftSizeY, const int traverseSpaceCount, const RecFourierProjectionTraverseSpace *traverseSpaces, const float2 *inFFTs, const float *blobTableSqrt, const bool fastLateBlobbing)

template<bool usePrecomputedInterpolation>
void	func_reconstruct_cpu_template (void *buffers[], void *cl_arg)

void	func_reconstruct_cpu_lookup_interpolation (void *buffers[], void *cl_arg)

void	func_reconstruct_cpu_dynamic_interpolation (void *buffers[], void *cl_arg)

Variables
__shared__ float	BLOB_TABLE [BLOB_TABLE_SIZE_SQRT]

__device__ __constant__ CodeletConstants	gpuC

CodeletConstants	cpuC

Function Documentation

◆ atomicAddFloat()

void atomicAddFloat	(	volatile float *	ptr,
		float	addedValue
	)

Atomically increments the value pointed to by ptr by addedValue. Uses relaxed memory ordering, which provides no ordering guarantees with respect to other memory operations.

Definition at line 892 of file reconstruct_fourier_codelet_reconstruct.cpp.

                                                            {
     static_assert(sizeof(float) == sizeof(uint32_t), "atomicAddFloat requires floats to be 32bit");
 
     // This is probably fine, since the constructor/destructor should be trivial
     // (As of C++11, this is guaranteed only for integral type specializations, but it is probably reasonably safe to assume
     // that this will hold for floats as well. C++20 requies that by spec.)
     volatile std::atomic<float>& atomicPtr = *reinterpret_cast<volatile std::atomic<float>*>(ptr);
     float current = atomicPtr.load(std::memory_order::memory_order_relaxed);
     while (true) {
         const float newValue = current + addedValue;
         // Since x86 does not allow atomic add of floats (only integers), we have to implement it through CAS
         if (atomicPtr.compare_exchange_weak(current, newValue, std::memory_order::memory_order_relaxed)) {
             // Current was still current and was replaced with the newValue. Done.
             return;
         }
         // Comparison failed. current now contains the new value and we try again.
     }
 }

◆ bessi0()

__host__ __device__ float bessi0 ( float x )

Definition at line 123 of file reconstruct_fourier_codelet_reconstruct.cpp.

                       {
     float y, ax, ans;
     if ((ax = fabsf(x)) < 3.75f)
     {
         y = x / 3.75f;
         y *= y;
         ans = 1.f + y * (3.5156229f + y * (3.0899424f + y * (1.2067492f
                                                              + y * (0.2659732f + y * (0.360768e-1f + y * 0.45813e-2f)))));
     }
     else
     {
         y = 3.75f / ax;
         ans = (expf(ax) * rsqrtf(ax)) * (0.39894228f + y * (0.1328592e-1f
                                                             + y * (0.225319e-2f + y * (-0.157565e-2f + y * (0.916281e-2f
                                                                                                             + y * (-0.2057706e-1f + y * (0.2635537e-1f + y * (-0.1647633e-1f
                                                                                                                                                               + y * 0.392377e-2f))))))));
     }
     return ans;
 }

◆ bessi0Fast()

__host__ __device__ float bessi0Fast ( float x )

Definition at line 99 of file reconstruct_fourier_codelet_reconstruct.cpp.

                           { // X must be <= 15
     // stable rational minimax approximations to the modified bessel functions, blair, edwards
     // from table 5
     float x2 = x*x;
     float num = -0.8436825781374849e-19f; // p11
     num = fmaf(num, x2, -0.93466495199548700e-17f); // p10
     num = fmaf(num, x2, -0.15716375332511895e-13f); // p09
     num = fmaf(num, x2, -0.42520971595532318e-11f); // p08
     num = fmaf(num, x2, -0.13704363824102120e-8f);  // p07
     num = fmaf(num, x2, -0.28508770483148419e-6f);  // p06
     num = fmaf(num, x2, -0.44322160233346062e-4f);  // p05
     num = fmaf(num, x2, -0.46703811755736946e-2f);  // p04
     num = fmaf(num, x2, -0.31112484643702141e-0f);  // p03
     num = fmaf(num, x2, -0.11512633616429962e+2f);  // p02
     num = fmaf(num, x2, -0.18720283332732112e+3f);  // p01
     num = fmaf(num, x2, -0.75281108169006924e+3f);  // p00
 
     float den = 1.f; // q01
     den = fmaf(den, x2, -0.75281109410939403e+3f); // q00
 
     return num/den;
 }

◆ bessi1()

__host__ __device__ float bessi1 ( float x )

Definition at line 144 of file reconstruct_fourier_codelet_reconstruct.cpp.

                       {
     float ax, ans;
     float y;
     if ((ax = fabsf(x)) < 3.75f)
     {
         y = x / 3.75f;
         y *= y;
         ans = ax * (0.5f + y * (0.87890594f + y * (0.51498869f + y * (0.15084934f
                                                                       + y * (0.2658733e-1f + y * (0.301532e-2f + y * 0.32411e-3f))))));
     }
     else
     {
         y = 3.75f / ax;
         ans = 0.2282967e-1f + y * (-0.2895312e-1f + y * (0.1787654e-1f
                                                          - y * 0.420059e-2f));
         ans = 0.39894228f + y * (-0.3988024e-1f + y * (-0.362018e-2f
                                                        + y * (0.163801e-2f + y * (-0.1031555e-1f + y * ans))));
         ans *= (expf(ax) * rsqrtf(ax));
     }
     return x < 0.0 ? -ans : ans;
 }

◆ bessi2()

__host__ __device__ float bessi2 ( float x )

Definition at line 167 of file reconstruct_fourier_codelet_reconstruct.cpp.

                       {
     return (x == 0) ? 0 : bessi0(x) - ((2*1) / x) * bessi1(x);
 }

◆ bessi3()

__host__ __device__ float bessi3 ( float x )

Definition at line 172 of file reconstruct_fourier_codelet_reconstruct.cpp.

                       {
     return (x == 0) ? 0 : bessi1(x) - ((2*2) / x) * bessi2(x);
 }

◆ bessi4()

__host__ __device__ float bessi4 ( float x )

Definition at line 177 of file reconstruct_fourier_codelet_reconstruct.cpp.

                       {
     return (x == 0) ? 0 : bessi2(x) - ((2*3) / x) * bessi3(x);
 }

◆ computeAABB()

__host__ __device__ void computeAABB	(	Point3D< float >	AABB[2],
		const Point3D< float >	cuboid[8]
	)

Compute Axis Aligned Bounding Box of given cuboid

Definition at line 317 of file reconstruct_fourier_codelet_reconstruct.cpp.

                                                                          {
     AABB[0].x = AABB[0].y = AABB[0].z = INFINITY;
     AABB[1].x = AABB[1].y = AABB[1].z = -INFINITY;
     for (int i = 0; i < 8; i++) {
         Point3D<float> tmp = cuboid[i];
         if (AABB[0].x > tmp.x) AABB[0].x = tmp.x;
         if (AABB[0].y > tmp.y) AABB[0].y = tmp.y;
         if (AABB[0].z > tmp.z) AABB[0].z = tmp.z;
         if (AABB[1].x < tmp.x) AABB[1].x = tmp.x;
         if (AABB[1].y < tmp.y) AABB[1].y = tmp.y;
         if (AABB[1].z < tmp.z) AABB[1].z = tmp.z;
     }
     AABB[0].x = ceilf(AABB[0].x);
     AABB[0].y = ceilf(AABB[0].y);
     AABB[0].z = ceilf(AABB[0].z);
 
     AABB[1].x = floorf(AABB[1].x);
     AABB[1].y = floorf(AABB[1].y);
     AABB[1].z = floorf(AABB[1].z);
 }

◆ copyImgToCache()

__device__ void copyImgToCache	(	float2 *	dest,
		const Point3D< float >	AABB[2],
		const float2 *	FFTs,
		const int	fftSizeX,
		const int	fftSizeY,
		const int	imgIndex,
		const int	imgCacheDim
	)

Method will copy the data of the imgIndex-th image from the buffer to the given destination (shared memory). Only data within the AABB will be copied. The destination is expected to be a contiguous array of sufficient size (imgCacheDim^2).

Definition at line 695 of file reconstruct_fourier_codelet_reconstruct.cpp.

                                            {
     for (int y = threadIdx.y; y < imgCacheDim; y += blockDim.y) {
         for (int x = threadIdx.x; x < imgCacheDim; x += blockDim.x) {
             int memIndex = y * imgCacheDim + x;
             getImgData(AABB, x, y, FFTs, fftSizeX, fftSizeY, imgIndex, dest[memIndex]);
         }
     }
 }

◆ func_reconstruct_cpu_dynamic_interpolation()

void func_reconstruct_cpu_dynamic_interpolation	(	void *	buffers[],
		void *	cl_arg
	)

Definition at line 1209 of file reconstruct_fourier_codelet_reconstruct.cpp.

                                                                                {
     func_reconstruct_cpu_template<false>(buffers, cl_arg);
 }

◆ func_reconstruct_cpu_lookup_interpolation()

void func_reconstruct_cpu_lookup_interpolation	(	void *	buffers[],
		void *	cl_arg
	)

Definition at line 1205 of file reconstruct_fourier_codelet_reconstruct.cpp.

                                                                               {
     func_reconstruct_cpu_template<true>(buffers, cl_arg);
 }

◆ func_reconstruct_cpu_template()

template<bool usePrecomputedInterpolation>

void func_reconstruct_cpu_template	(	void *	buffers[],
		void *	cl_arg
	)

Definition at line 1141 of file reconstruct_fourier_codelet_reconstruct.cpp.

                                                                   {
     const ReconstructFftArgs& arg = *(ReconstructFftArgs*) cl_arg;
     const float2* inFFTs = (float2*)STARPU_VECTOR_GET_PTR(buffers[0]);
     const RecFourierProjectionTraverseSpace* inSpaces = (RecFourierProjectionTraverseSpace*)STARPU_MATRIX_GET_PTR(buffers[1]);
     const float* inBlobTableSqrt = (float*)(STARPU_VECTOR_GET_PTR(buffers[2]));
     float2* outVolumeBuffer = (float2*)(STARPU_VECTOR_GET_PTR(buffers[3])); // Actually std::complex<float>
     float* outWeightsBuffer = (float*)(STARPU_VECTOR_GET_PTR(buffers[4]));
     const uint32_t noOfImages = ((LoadedImagesBuffer*) STARPU_VARIABLE_GET_PTR(buffers[5]))->noOfImages;
 
     switch (arg.blobOrder) {
         case 0:
             if (arg.blobAlpha <= 15.0) {
                 processBufferCPU<0, true, usePrecomputedInterpolation>(outVolumeBuffer, outWeightsBuffer,
                                           arg.fftSizeX, arg.fftSizeY,
                                           arg.noOfSymmetries * noOfImages, inSpaces,
                                           inFFTs,
                                           inBlobTableSqrt,
                                           arg.fastLateBlobbing);
             } else {
                 processBufferCPU<0, false, usePrecomputedInterpolation>(outVolumeBuffer, outWeightsBuffer,
                                            arg.fftSizeX, arg.fftSizeY,
                                            arg.noOfSymmetries * noOfImages, inSpaces,
                                            inFFTs,
                                            inBlobTableSqrt,
                                            arg.fastLateBlobbing);
             }
             break;
         case 1:
             processBufferCPU<1, false, usePrecomputedInterpolation>(outVolumeBuffer, outWeightsBuffer,
                                        arg.fftSizeX, arg.fftSizeY,
                                        arg.noOfSymmetries * noOfImages, inSpaces,
                                        inFFTs,
                                        inBlobTableSqrt,
                                        arg.fastLateBlobbing);
             break;
         case 2:
             processBufferCPU<2, false, usePrecomputedInterpolation>(outVolumeBuffer, outWeightsBuffer,
                                        arg.fftSizeX, arg.fftSizeY,
                                        arg.noOfSymmetries * noOfImages, inSpaces,
                                        inFFTs,
                                        inBlobTableSqrt,
                                        arg.fastLateBlobbing);
             break;
         case 3:
             processBufferCPU<3, false, usePrecomputedInterpolation>(outVolumeBuffer, outWeightsBuffer,
                                        arg.fftSizeX, arg.fftSizeY,
                                        arg.noOfSymmetries * noOfImages, inSpaces,
                                        inFFTs,
                                        inBlobTableSqrt,
                                        arg.fastLateBlobbing);
             break;
         case 4:
             processBufferCPU<4, false, usePrecomputedInterpolation>(outVolumeBuffer, outWeightsBuffer,
                                        arg.fftSizeX, arg.fftSizeY,
                                        arg.noOfSymmetries * noOfImages, inSpaces,
                                        inFFTs,
                                        inBlobTableSqrt,
                                        arg.fastLateBlobbing);
             break;
         default:
             REPORT_ERROR(ERR_VALUE_INCORRECT, "m out of range [0..4] in kaiser_value()");
     }
 }

◆ func_reconstruct_cuda()

void func_reconstruct_cuda	(	void *	buffers[],
		void *	cl_arg
	)

Definition at line 815 of file reconstruct_fourier_codelet_reconstruct.cpp.

                                                           {
     const ReconstructFftArgs& arg = *(ReconstructFftArgs*) cl_arg;
     const float2* inFFTs = (float2*)STARPU_VECTOR_GET_PTR(buffers[0]);
     const RecFourierProjectionTraverseSpace* inSpaces = (RecFourierProjectionTraverseSpace*)STARPU_MATRIX_GET_PTR(buffers[1]);
     const float* inBlobTableSqrt = (float*)(STARPU_VECTOR_GET_PTR(buffers[2]));
     float2* outVolumeBuffer = (float2*)(STARPU_VECTOR_GET_PTR(buffers[3])); // Actually std::complex<float>
     float* outWeightsBuffer = (float*)(STARPU_VECTOR_GET_PTR(buffers[4]));
     const uint32_t noOfImages = ((LoadedImagesBuffer*) STARPU_VARIABLE_GET_PTR(buffers[5]))->noOfImages;
 
     switch (arg.blobOrder) {
         case 0:
             if (arg.blobAlpha <= 15.0) {
                 processBufferGPU<0, true>(outVolumeBuffer, outWeightsBuffer,
                                           arg.fftSizeX, arg.fftSizeY,
                                           arg.noOfSymmetries * noOfImages, inSpaces,
                                           inFFTs,
                                           inBlobTableSqrt,
                                           arg.fastLateBlobbing,
                                           arg.blobRadius, arg.maxVolIndexYZ);
             } else {
                 processBufferGPU<0, false>(outVolumeBuffer, outWeightsBuffer,
                                            arg.fftSizeX, arg.fftSizeY,
                                            arg.noOfSymmetries * noOfImages, inSpaces,
                                            inFFTs,
                                            inBlobTableSqrt,
                                            arg.fastLateBlobbing,
                                            arg.blobRadius, arg.maxVolIndexYZ);
             }
             break;
         case 1:
             processBufferGPU<1, false>(outVolumeBuffer, outWeightsBuffer,
                                        arg.fftSizeX, arg.fftSizeY,
                                        arg.noOfSymmetries * noOfImages, inSpaces,
                                        inFFTs,
                                        inBlobTableSqrt,
                                        arg.fastLateBlobbing,
                                        arg.blobRadius, arg.maxVolIndexYZ);
             break;
         case 2:
             processBufferGPU<2, false>(outVolumeBuffer, outWeightsBuffer,
                                        arg.fftSizeX, arg.fftSizeY,
                                        arg.noOfSymmetries * noOfImages, inSpaces,
                                        inFFTs,
                                        inBlobTableSqrt,
                                        arg.fastLateBlobbing,
                                        arg.blobRadius, arg.maxVolIndexYZ);
             break;
         case 3:
             processBufferGPU<3, false>(outVolumeBuffer, outWeightsBuffer,
                                        arg.fftSizeX, arg.fftSizeY,
                                        arg.noOfSymmetries * noOfImages, inSpaces,
                                        inFFTs,
                                        inBlobTableSqrt,
                                        arg.fastLateBlobbing,
                                        arg.blobRadius, arg.maxVolIndexYZ);
             break;
         case 4:
             processBufferGPU<4, false>(outVolumeBuffer, outWeightsBuffer,
                                        arg.fftSizeX, arg.fftSizeY,
                                        arg.noOfSymmetries * noOfImages, inSpaces,
                                        inFFTs,
                                        inBlobTableSqrt,
                                        arg.fastLateBlobbing,
                                        arg.blobRadius, arg.maxVolIndexYZ);
             break;
         default:
             REPORT_ERROR(ERR_VALUE_INCORRECT, "m out of range [0..4] in kaiser_value()");
     }
 
     // gpuErrchk(cudaStreamSynchronize(starpu_cuda_get_local_stream())); disabled because codelet is async
 }

◆ getImgData()

__device__ void getImgData	(	const Point3D< float >	AABB[2],
		const int	tXindex,
		const int	tYindex,
		const float2 *	FFTs,
		const int	fftSizeX,
		const int	fftSizeY,
		const int	imgIndex,
		float2 &	vComplex
	)

Method will load data from image at position tXindex, tYindex and return them. In case the data lies outside of the image boundaries, zeros (0,0) are returned

Definition at line 670 of file reconstruct_fourier_codelet_reconstruct.cpp.

                                   {
     int imgXindex = tXindex + static_cast<int>(AABB[0].x);
     int imgYindex = tYindex + static_cast<int>(AABB[0].y);
     if ((imgXindex >= 0)
         && (imgXindex < fftSizeX)
         && (imgYindex >=0)
         && (imgYindex < fftSizeY))  {
         int index = imgYindex * fftSizeX + imgXindex; // copy data from image
         vComplex = (FFTs + fftSizeX * fftSizeY * imgIndex)[index];
     } else {
         vComplex = {0.f, 0.f}; // out of image bound, so return zero
     }
 }

◆ getX()

__host__ __device__ float getX	(	float	y,
		float	z,
		const Point3D< float > &	n,
		const Point3D< float > &	p0
	)

Calculates X coordinate of the point [y, z] on the plane defined by p0 (origin) and normal

Definition at line 269 of file reconstruct_fourier_codelet_reconstruct.cpp.

                                                                                {
     // from a(x-x0)+b(y-y0)+c(z-z0)=0
     return (-n.y*(y-p0.y)-n.z*(z-p0.z))/n.x + p0.x;
 }

◆ getY()

__host__ __device__ float getY	(	float	x,
		float	z,
		const Point3D< float > &	n,
		const Point3D< float > &	p0
	)

Calculates Y coordinate of the point [x, z] on the plane defined by p0 (origin) and normal

Definition at line 261 of file reconstruct_fourier_codelet_reconstruct.cpp.

                                                                                {
     // from a(x-x0)+b(y-y0)+c(z-z0)=0
     return (-n.x*(x-p0.x)-n.z*(z-p0.z))/n.y + p0.y;
 }

◆ getZ()

__host__ __device__ float getZ	(	float	x,
		float	y,
		const Point3D< float > &	n,
		const Point3D< float > &	p0
	)

Calculates Z coordinate of the point [x, y] on the plane defined by p0 (origin) and normal

Definition at line 254 of file reconstruct_fourier_codelet_reconstruct.cpp.

                                                                                 {
     // from a(x-x0)+b(y-y0)+c(z-z0)=0
     return (-n.x*(x-p0.x)-n.y*(y-p0.y))/n.z + p0.z;
 }

◆ kaiserValue()

template<int order>

__host__ __device__ float kaiserValue	(	float	r,
		float	a
	)

Definition at line 183 of file reconstruct_fourier_codelet_reconstruct.cpp.

                                     {
     const CodeletConstants& c =
 #ifdef __CUDA_ARCH__
         gpuC;
 #else
         cpuC;
 #endif
 
     float w;
     float rda = r / a;
     if (rda <= 1.f)
     {
         float rdas = rda * rda;
         float arg = c.cBlobAlpha * sqrtf(1.f - rdas);
         if (order == 0)
         {
             w = bessi0(arg) * c.cOneOverBessiOrderAlpha;
         }
         else if (order == 1)
         {
             w = sqrtf (1.f - rdas);
             w *= bessi1(arg) * c.cOneOverBessiOrderAlpha;
         }
         else if (order == 2)
         {
             w = sqrtf (1.f - rdas);
             w = w * w;
             w *= bessi2(arg) * c.cOneOverBessiOrderAlpha;
         }
         else if (order == 3)
         {
             w = sqrtf (1.f - rdas);
             w = w * w * w;
             w *= bessi3(arg) * c.cOneOverBessiOrderAlpha;
         }
         else if (order == 4)
         {
             w = sqrtf (1.f - rdas);
             w = w * w * w *w;
             w *= bessi4(arg) * c.cOneOverBessiOrderAlpha;
         }
         else {
             printf("order (%d) out of range in kaiser_value(): %s, %d\n", order, __FILE__, __LINE__);
             w = 0.f;
         }
     }
     else
         w = 0.f;
 
     return w;
 }

◆ kaiserValueFast()

__host__ __device__ float kaiserValueFast ( float distSqr )

Definition at line 236 of file reconstruct_fourier_codelet_reconstruct.cpp.

                                      {
     const CodeletConstants& c =
 #ifdef __CUDA_ARCH__
             gpuC;
 #else
             cpuC;
 #endif
 
     float arg = c.cBlobAlpha * sqrtf(1.f - (distSqr * c.cOneOverBlobRadiusSqr)); // alpha * sqrt(1-(dist/blobRadius^2))
     return bessi0Fast(arg) * c.cOneOverBessiOrderAlpha * c.cIw0;
 }

◆ multiply()

__host__ __device__ void multiply	(	const float	transform[3][3],
		Point3D< float > &	inOut
	)

Multiplies the 3x3 matrix transform by the 3x1 column vector inOut; the result is stored back in inOut.

Definition at line 276 of file reconstruct_fourier_codelet_reconstruct.cpp.

                                                                   {
     float tmp0 = transform[0][0] * inOut.x + transform[0][1] * inOut.y + transform[0][2] * inOut.z;
     float tmp1 = transform[1][0] * inOut.x + transform[1][1] * inOut.y + transform[1][2] * inOut.z;
     float tmp2 = transform[2][0] * inOut.x + transform[2][1] * inOut.y + transform[2][2] * inOut.z;
     inOut.x = tmp0;
     inOut.y = tmp1;
     inOut.z = tmp2;
 }

◆ processBufferCPU()

template<int blobOrder, bool useFastKaiser, bool usePrecomputedInterpolation>

void processBufferCPU	(	float2 *	outVolumeBuffer,
		float *	outWeightsBuffer,
		const int	fftSizeX,
		const int	fftSizeY,
		const int	traverseSpaceCount,
		const RecFourierProjectionTraverseSpace *	traverseSpaces,
		const float2 *	inFFTs,
		const float *	blobTableSqrt,
		const bool	fastLateBlobbing
	)

Definition at line 1105 of file reconstruct_fourier_codelet_reconstruct.cpp.

                                      {
 
     const int groupSize = starpu_combined_worker_get_size();
     const int groupRank = starpu_combined_worker_get_rank();
 
     for (int i = groupRank; i < traverseSpaceCount; i += groupSize) {
         const RecFourierProjectionTraverseSpace &space = traverseSpaces[i];
 
         const float2* spaceFFT = inFFTs + fftSizeX * fftSizeY * space.projectionIndex;
 
         // by using templates, we can save some registers, especially for 'fast' version
         if (fastLateBlobbing) {
             processProjectionCPU<true, blobOrder, useFastKaiser, usePrecomputedInterpolation>(
                     outVolumeBuffer, outWeightsBuffer,
                     fftSizeX, fftSizeY,
                     spaceFFT,
                     &space,
                     blobTableSqrt);
         } else {
             processProjectionCPU<false, blobOrder, useFastKaiser, usePrecomputedInterpolation>(
                     outVolumeBuffer, outWeightsBuffer,
                     fftSizeX, fftSizeY,
                     spaceFFT,
                     &space,
                     blobTableSqrt);
         }
     }
 }

◆ processBufferGPU()

template<int blobOrder, bool useFastKaiser>

void processBufferGPU	(	float2 *	outVolumeBuffer,
		float *	outWeightsBuffer,
		const int	fftSizeX,
		const int	fftSizeY,
		const int	traverseSpaceCount,
		const RecFourierProjectionTraverseSpace *	traverseSpaces,
		const float2 *	inFFTs,
		const float *	blobTableSqrt,
		const bool	fastLateBlobbing,
		const float	blobRadius,
		const int	maxVolIndexYZ
	)

Method will use the data stored in the buffer and update the temporary storage appropriately. The actual calculation is done asynchronously, but 'buffer' can be reused once the method returns.

Definition at line 774 of file reconstruct_fourier_codelet_reconstruct.cpp.

                                                          {
 
     // enqueue kernel and return control
     const int imgCacheDim = static_cast<int>(ceil(sqrt(2.f) * sqrt(3.f) * (BLOCK_DIM + 2 * blobRadius)));
     dim3 dimBlock(BLOCK_DIM, BLOCK_DIM);
 
     const int size2D = maxVolIndexYZ + 1;
     dim3 dimGrid(static_cast<unsigned int>(ceil(size2D / (float)dimBlock.x)),
                  static_cast<unsigned int>(ceil(size2D / (float)dimBlock.y)),
                  GRID_DIM_Z);
 
     // by using templates, we can save some registers, especially for 'fast' version
     if (fastLateBlobbing) {
         processBufferKernel<true, blobOrder,useFastKaiser><<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>>(
                 outVolumeBuffer, outWeightsBuffer,
                 fftSizeX, fftSizeY,
                 traverseSpaceCount, traverseSpaces,
                 inFFTs,
                 blobTableSqrt,
                 imgCacheDim);
     } else {
         // if making copy of the image in shared memory, allocate enough space
         int sharedMemSize = SHARED_IMG ? (imgCacheDim*imgCacheDim*sizeof(float2)) : 0;
         processBufferKernel<false, blobOrder,useFastKaiser><<<dimGrid, dimBlock, sharedMemSize, starpu_cuda_get_local_stream()>>>(
                 outVolumeBuffer, outWeightsBuffer,
                 fftSizeX, fftSizeY,
                 traverseSpaceCount, traverseSpaces,
                 inFFTs,
                 blobTableSqrt,
                 imgCacheDim);
     }
     gpuErrchk(cudaPeekAtLastError());
 }

◆ processBufferKernel()

template<bool fastLateBlobbing, int blobOrder, bool useFastKaiser>

__global__ void processBufferKernel	(	float2 *	outVolumeBuffer,
		float *	outWeightsBuffer,
		const int	fftSizeX,
		const int	fftSizeY,
		const int	traverseSpaceCount,
		const RecFourierProjectionTraverseSpace *	traverseSpaces,
		const float2 *	FFTs,
		const float *	blobTableSqrt,
		int	imgCacheDim
	)

Method will use the data stored in the buffer and update the temporary storage appropriately.

Definition at line 712 of file reconstruct_fourier_codelet_reconstruct.cpp.

                          {
 
 #if SHARED_BLOB_TABLE
     if ( ! fastLateBlobbing) {
         // copy blob table to shared memory
         volatile int id = threadIdx.y*blockDim.x + threadIdx.x;
         volatile int blockSize = blockDim.x * blockDim.y;
         for (int i = id; i < BLOB_TABLE_SIZE_SQRT; i+= blockSize)
             BLOB_TABLE[i] = blobTableSqrt[i];
         __syncthreads();
     }
 #endif
 
     for (int i = blockIdx.z; i < traverseSpaceCount; i += gridDim.z) {
         const RecFourierProjectionTraverseSpace& space = traverseSpaces[i];
 
 #if SHARED_IMG
         if ( ! fastLateBlobbing) {
             // make sure that all threads start at the same time
             // as they can come from previous iteration
             __syncthreads();
             if ((threadIdx.x == 0) && (threadIdx.y == 0)) {
                 // first thread calculates which part of the image should be shared
                 calculateAABB(&space, SHARED_AABB);
             }
             __syncthreads();
             // check if the block will have to copy data from image
             if (isWithin(SHARED_AABB, fftSizeX, fftSizeY)) {
                 // all threads copy image data to shared memory
                 copyImgToCache(IMG, SHARED_AABB, FFTs, fftSizeX, fftSizeY, space.projectionIndex, imgCacheDim);
                 __syncthreads();
             } else {
                 continue; // whole block can exit, as it's not reading from image
             }
         }
 #endif
 
         processProjection<fastLateBlobbing, blobOrder, useFastKaiser>(
                 outVolumeBuffer, outWeightsBuffer,
                 fftSizeX, fftSizeY,
                 FFTs + fftSizeX * fftSizeY * space.projectionIndex,
                 &space,
                 blobTableSqrt,
                 imgCacheDim);
 
         __syncthreads(); // sync threads to avoid write after read problems
     }
 }

◆ processProjection()

template<bool useFast, int blobOrder, bool useFastKaiser>

__device__ void processProjection	(	float2 *	tempVolumeGPU,
		float *	tempWeightsGPU,
		const int	xSize,
		const int	ySize,
		const float2 *__restrict__	FFT,
		const RecFourierProjectionTraverseSpace *const	tSpace,
		const float *	blobTableSqrt,
		const int	imgCacheDim
	)

Processes one projection image and adds the result to the temporary volume and weight buffers.

Definition at line 579 of file reconstruct_fourier_codelet_reconstruct.cpp.

 {
     // map thread to each (2D) voxel
 #if TILE > 1
     int id = threadIdx.y * blockDim.x + threadIdx.x;
     int tidX = threadIdx.x % TILE + (id / (blockDim.y * TILE)) * TILE;
     int tidY = (id / TILE) % blockDim.y;
     int idx = blockIdx.x * blockDim.x + tidX;
     int idy = blockIdx.y * blockDim.y + tidY;
 #else
     // map thread to each (2D) voxel
     volatile int idx = blockIdx.x*blockDim.x + threadIdx.x;
     volatile int idy = blockIdx.y*blockDim.y + threadIdx.y;
 #endif
 
     if (tSpace->XY == tSpace->dir) { // iterate XY plane
         if (idy >= tSpace->minY && idy <= tSpace->maxY) {
             if (idx >= tSpace->minX && idx <= tSpace->maxX) {
                 if (useFast) {
                     float hitZ = getZ(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin);
                     int z = (int)(hitZ + 0.5f); // rounding
                     processVoxel(tempVolumeGPU, tempWeightsGPU, idx, idy, z, xSize, ySize, FFT, tSpace);
                 } else {
                     float z1 = getZ(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin); // lower plane
                     float z2 = getZ(idx, idy, tSpace->unitNormal, tSpace->topOrigin); // upper plane
                     z1 = clamp(z1, 0, gpuC.cMaxVolumeIndexYZ);
                     z2 = clamp(z2, 0, gpuC.cMaxVolumeIndexYZ);
                     int lower = static_cast<int>(floorf(fminf(z1, z2)));
                     int upper = static_cast<int>(ceilf(fmaxf(z1, z2)));
                     for (int z = lower; z <= upper; z++) {
                         processVoxelBlob<blobOrder, useFastKaiser>(tempVolumeGPU, tempWeightsGPU, idx, idy, z, xSize, ySize, FFT, tSpace, blobTableSqrt, imgCacheDim);
                     }
                 }
             }
         }
     } else if (tSpace->XZ == tSpace->dir) { // iterate XZ plane
         if (idy >= tSpace->minZ && idy <= tSpace->maxZ) { // map z -> y
             if (idx >= tSpace->minX && idx <= tSpace->maxX) {
                 if (useFast) {
                     float hitY =getY(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin);
                     int y = (int)(hitY + 0.5f); // rounding
                     processVoxel(tempVolumeGPU, tempWeightsGPU, idx, y, idy, xSize, ySize, FFT, tSpace);
                 } else {
                     float y1 = getY(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin); // lower plane
                     float y2 = getY(idx, idy, tSpace->unitNormal, tSpace->topOrigin); // upper plane
                     y1 = clamp(y1, 0, gpuC.cMaxVolumeIndexYZ);
                     y2 = clamp(y2, 0, gpuC.cMaxVolumeIndexYZ);
                     int lower = static_cast<int>(floorf(fminf(y1, y2)));
                     int upper = static_cast<int>(ceilf(fmaxf(y1, y2)));
                     for (int y = lower; y <= upper; y++) {
                         processVoxelBlob<blobOrder, useFastKaiser>(tempVolumeGPU, tempWeightsGPU, idx, y, idy, xSize, ySize, FFT, tSpace, blobTableSqrt, imgCacheDim);
                     }
                 }
             }
         }
     } else { // iterate YZ plane
         if (idy >= tSpace->minZ && idy <= tSpace->maxZ) { // map z -> y
             if (idx >= tSpace->minY && idx <= tSpace->maxY) { // map y > x
                 if (useFast) {
                     float hitX = getX(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin);
                     int x = (int)(hitX + 0.5f); // rounding
                     processVoxel(tempVolumeGPU, tempWeightsGPU, x, idx, idy, xSize, ySize, FFT, tSpace);
                 } else {
                     float x1 = getX(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin); // lower plane
                     float x2 = getX(idx, idy, tSpace->unitNormal, tSpace->topOrigin); // upper plane
                     x1 = clamp(x1, 0, gpuC.cMaxVolumeIndexX);
                     x2 = clamp(x2, 0, gpuC.cMaxVolumeIndexX);
                     int lower = static_cast<int>(floorf(fminf(x1, x2)));
                     int upper = static_cast<int>(ceilf(fmaxf(x1, x2)));
                     for (int x = lower; x <= upper; x++) {
                         processVoxelBlob<blobOrder, useFastKaiser>(tempVolumeGPU, tempWeightsGPU, x, idx, idy, xSize, ySize, FFT, tSpace, blobTableSqrt, imgCacheDim);
                     }
                 }
             }
         }
     }
 }

◆ processProjectionCPU()

template<bool useFast, int blobOrder, bool useFastKaiser, bool usePrecomputedInterpolation>

void processProjectionCPU	(	float2 *	tempVolumeGPU,
		float *	tempWeightsGPU,
		const int	xSize,
		const int	ySize,
		const float2 *__restrict__	FFT,
		const RecFourierProjectionTraverseSpace *const	tSpace,
		const float *	blobTableSqrt
	)

Definition at line 1034 of file reconstruct_fourier_codelet_reconstruct.cpp.

                                     {
 
     if (tSpace->XY == tSpace->dir) { // iterate XY plane
         for (int idy = tSpace->minY; idy <= tSpace->maxY; idy++) {
             for (int idx = tSpace->minX; idx <= tSpace->maxX; idx++) {
                 if (useFast) {
                     float hitZ = getZ(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin);
                     int z = (int)(hitZ + 0.5f); // rounding
                     processVoxelCPU(tempVolumeGPU, tempWeightsGPU, idx, idy, z, xSize, ySize, FFT, tSpace);
                 } else {
                     float z1 = getZ(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin); // lower plane
                     float z2 = getZ(idx, idy, tSpace->unitNormal, tSpace->topOrigin); // upper plane
                     z1 = clamp(z1, 0, cpuC.cMaxVolumeIndexYZ);
                     z2 = clamp(z2, 0, cpuC.cMaxVolumeIndexYZ);
                     int lower = static_cast<int>(floorf(fminf(z1, z2)));
                     int upper = static_cast<int>(ceilf(fmaxf(z1, z2)));
                     for (int z = lower; z <= upper; z++) {
                         processVoxelBlobCPU<blobOrder, useFastKaiser, usePrecomputedInterpolation>(tempVolumeGPU, tempWeightsGPU, idx, idy, z, xSize, ySize, FFT, tSpace, blobTableSqrt);
                     }
                 }
             }
         }
     } else if (tSpace->XZ == tSpace->dir) { // iterate XZ plane
         for (int idy = tSpace->minZ; idy <= tSpace->maxZ; idy++) { // map z -> y
             for (int idx = tSpace->minX; idx <= tSpace->maxX; idx++) {
                 if (useFast) {
                     float hitY =getY(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin);
                     int y = (int)(hitY + 0.5f); // rounding
                     processVoxelCPU(tempVolumeGPU, tempWeightsGPU, idx, y, idy, xSize, ySize, FFT, tSpace);
                 } else {
                     float y1 = getY(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin); // lower plane
                     float y2 = getY(idx, idy, tSpace->unitNormal, tSpace->topOrigin); // upper plane
                     y1 = clamp(y1, 0, cpuC.cMaxVolumeIndexYZ);
                     y2 = clamp(y2, 0, cpuC.cMaxVolumeIndexYZ);
                     int lower = static_cast<int>(floorf(fminf(y1, y2)));
                     int upper = static_cast<int>(ceilf(fmaxf(y1, y2)));
                     for (int y = lower; y <= upper; y++) {
                         processVoxelBlobCPU<blobOrder, useFastKaiser, usePrecomputedInterpolation>(tempVolumeGPU, tempWeightsGPU, idx, y, idy, xSize, ySize, FFT, tSpace, blobTableSqrt);
                     }
                 }
             }
         }
     } else { // iterate YZ plane
         for (int idy = tSpace->minZ; idy <= tSpace->maxZ; idy++) { // map z -> y
             for (int idx = tSpace->minY; idx <= tSpace->maxY; idx++) { // map y > x
                 if (useFast) {
                     float hitX = getX(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin);
                     int x = (int)(hitX + 0.5f); // rounding
                     processVoxelCPU(tempVolumeGPU, tempWeightsGPU, x, idx, idy, xSize, ySize, FFT, tSpace);
                 } else {
                     float x1 = getX(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin); // lower plane
                     float x2 = getX(idx, idy, tSpace->unitNormal, tSpace->topOrigin); // upper plane
                     x1 = clamp(x1, 0, cpuC.cMaxVolumeIndexX);
                     x2 = clamp(x2, 0, cpuC.cMaxVolumeIndexX);
                     int lower = static_cast<int>(floorf(fminf(x1, x2)));
                     int upper = static_cast<int>(ceilf(fmaxf(x1, x2)));
                     for (int x = lower; x <= upper; x++) {
                         processVoxelBlobCPU<blobOrder, useFastKaiser, usePrecomputedInterpolation>(tempVolumeGPU, tempWeightsGPU, x, idx, idy, xSize, ySize, FFT, tSpace, blobTableSqrt);
                     }
                 }
             }
         }
     }
 }

◆ processVoxel()

__device__ void processVoxel	(	float2 *	tempVolumeGPU,
		float *	tempWeightsGPU,
		int	x,
		int	y,
		int	z,
		int	xSize,
		int	ySize,
		const float2 *__restrict__	FFT,
		const RecFourierProjectionTraverseSpace *const	space
	)

Maps one voxel of the temporary volume onto the given projection and updates the temporary volume and weight buffers using the value of the matching projection pixel.

Definition at line 435 of file reconstruct_fourier_codelet_reconstruct.cpp.

 {
     Point3D<float> imgPos;
     float wBlob = 1.f;
 
     float dataWeight = space->weight;
 
     // transform current point to center
     imgPos.x = x - gpuC.cMaxVolumeIndexX/2;
     imgPos.y = y - gpuC.cMaxVolumeIndexYZ/2;
     imgPos.z = z - gpuC.cMaxVolumeIndexYZ/2;
     if (imgPos.x*imgPos.x + imgPos.y*imgPos.y + imgPos.z*imgPos.z > space->maxDistanceSqr) {
         return; // discard iterations that would access pixel with too high frequency
     }
     // rotate around center
     multiply(space->transformInv, imgPos);
     if (imgPos.x < 0.f) return; // reading outside of the image boundary. Z is always correct and Y is checked by the condition above
 
     // transform back and round
     // just Y coordinate needs adjusting, since X now matches to picture and Z is irrelevant
     int imgX = clamp((int)(imgPos.x + 0.5f), 0, xSize - 1);
     int imgY = clamp((int)(imgPos.y + 0.5f + gpuC.cMaxVolumeIndexYZ / 2), 0, ySize - 1);
 
     int index3D = z * (gpuC.cMaxVolumeIndexYZ+1) * (gpuC.cMaxVolumeIndexX+1) + y * (gpuC.cMaxVolumeIndexX+1) + x;
     int index2D = imgY * xSize + imgX;
 
     float weight = wBlob * dataWeight;
 
     // use atomic as two blocks can write to same voxel
     atomicAdd(&tempVolumeGPU[index3D].x, FFT[index2D].x * weight);
     atomicAdd(&tempVolumeGPU[index3D].y, FFT[index2D].y * weight);
     atomicAdd(&tempWeightsGPU[index3D], weight);
 }

◆ processVoxelBlob()

template<int blobOrder, bool useFastKaiser>

__device__ void processVoxelBlob	(	float2 *	tempVolumeGPU,
		float *	tempWeightsGPU,
		const int	x,
		const int	y,
		const int	z,
		const int	xSize,
		const int	ySize,
		const float2 *__restrict__	FFT,
		const RecFourierProjectionTraverseSpace *const	space,
		const float *	blobTableSqrt,
		const int	imgCacheDim
	)

Maps one voxel of the temporary volume onto the given projection and updates the temporary volume and weight buffers using the projection pixels that lie within the blob distance.

Definition at line 481 of file reconstruct_fourier_codelet_reconstruct.cpp.

 {
     Point3D<float> imgPos;
     // transform current point to center
     imgPos.x = x - gpuC.cMaxVolumeIndexX/2;
     imgPos.y = y - gpuC.cMaxVolumeIndexYZ/2;
     imgPos.z = z - gpuC.cMaxVolumeIndexYZ/2;
     if ((imgPos.x*imgPos.x + imgPos.y*imgPos.y + imgPos.z*imgPos.z) > space->maxDistanceSqr) {
         return; // discard iterations that would access pixel with too high frequency
     }
     // rotate around center
     multiply(space->transformInv, imgPos);
     if (imgPos.x < -gpuC.cBlobRadius) return; // reading outside of the image boundary. Z is always correct and Y is checked by the condition above
     // transform back just Y coordinate, since X now matches to picture and Z is irrelevant
     imgPos.y += gpuC.cMaxVolumeIndexYZ / 2;
 
     // check that we don't want to collect data from far far away ...
     float radiusSqr = gpuC.cBlobRadius * gpuC.cBlobRadius;
     float zSqr = imgPos.z * imgPos.z;
     if (zSqr > radiusSqr) return;
 
     // create blob bounding box
     int minX = ceilf(imgPos.x - gpuC.cBlobRadius);
     int maxX = floorf(imgPos.x + gpuC.cBlobRadius);
     int minY = ceilf(imgPos.y - gpuC.cBlobRadius);
     int maxY = floorf(imgPos.y + gpuC.cBlobRadius);
     minX = fmaxf(minX, 0);
     minY = fmaxf(minY, 0);
     maxX = fminf(maxX, xSize-1);
     maxY = fminf(maxY, ySize-1);
 
     int index3D = z * (gpuC.cMaxVolumeIndexYZ+1) * (gpuC.cMaxVolumeIndexX+1) + y * (gpuC.cMaxVolumeIndexX+1) + x;
     float2 vol;
     float w;
     vol.x = vol.y = w = 0.f;
     float dataWeight = space->weight;
 
     // check which pixel in the vicinity should contribute
     for (int i = minY; i <= maxY; i++) {
         float ySqr = (imgPos.y - i) * (imgPos.y - i);
         float yzSqr = ySqr + zSqr;
         if (yzSqr > radiusSqr) continue;
         for (int j = minX; j <= maxX; j++) {
             float xD = imgPos.x - j;
             float distanceSqr = xD*xD + yzSqr;
             if (distanceSqr > radiusSqr) continue;
 
 #if SHARED_IMG
             int index2D = (i - SHARED_AABB[0].y) * imgCacheDim + (j-SHARED_AABB[0].x); // position in img - offset of the AABB
 #else
             int index2D = i * xSize + j;
 #endif
 
 #if PRECOMPUTE_BLOB_VAL
             int aux = (int) ((distanceSqr * gpuC.cIDeltaSqrt + 0.5f));
 #if SHARED_BLOB_TABLE
             float wBlob = BLOB_TABLE[aux];
 #else
             float wBlob = blobTableSqrt[aux];
 #endif
 #else
             float wBlob;
                 if (useFastKaiser) {
                     wBlob = kaiserValueFast(distanceSqr);
                 }
                 else {
                     wBlob = kaiserValue<blobOrder>(sqrtf(distanceSqr), gpuC.cBlobRadius) * gpuC.cIw0;
                 }
 #endif
             float weight = wBlob * dataWeight;
             w += weight;
 #if SHARED_IMG
             vol += IMG[index2D] * weight;
 #else
             vol += FFT[index2D] * weight;
 #endif
         }
     }
 
     // use atomic as two blocks can write to same voxel
     atomicAdd(&tempVolumeGPU[index3D].x, vol.x);
     atomicAdd(&tempVolumeGPU[index3D].y, vol.y);
     atomicAdd(&tempWeightsGPU[index3D], w);
 }

◆ processVoxelBlobCPU()

template<int blobOrder, bool useFastKaiser, bool usePrecomputedInterpolation>

void processVoxelBlobCPU	(	float2 *const	tempVolumeGPU,
		float *const	tempWeightsGPU,
		const int	x,
		const int	y,
		const int	z,
		const int	xSize,
		const int	ySize,
		const float2 *const __restrict__	FFT,
		const RecFourierProjectionTraverseSpace *const	space,
		const float *	blobTableSqrt
	)

Definition at line 954 of file reconstruct_fourier_codelet_reconstruct.cpp.

 {
     Point3D<float> imgPos;
     // transform current point to center
     imgPos.x = x - cpuC.cMaxVolumeIndexX/2;
     imgPos.y = y - cpuC.cMaxVolumeIndexYZ/2;
     imgPos.z = z - cpuC.cMaxVolumeIndexYZ/2;
     if ((imgPos.x*imgPos.x + imgPos.y*imgPos.y + imgPos.z*imgPos.z) > space->maxDistanceSqr) {
         return; // discard iterations that would access pixel with too high frequency
     }
     // rotate around center
     multiply(space->transformInv, imgPos);
     if (imgPos.x < -cpuC.cBlobRadius) return; // reading outside of the image boundary. Z is always correct and Y is checked by the condition above
     // transform back just Y coordinate, since X now matches to picture and Z is irrelevant
     imgPos.y += cpuC.cMaxVolumeIndexYZ / 2;
 
     // check that we don't want to collect data from far far away ...
     float radiusSqr = cpuC.cBlobRadius * cpuC.cBlobRadius;
     float zSqr = imgPos.z * imgPos.z;
     if (zSqr > radiusSqr) return;
 
     // create blob bounding box
     int minX = ceilf(imgPos.x - cpuC.cBlobRadius);
     int maxX = floorf(imgPos.x + cpuC.cBlobRadius);
     int minY = ceilf(imgPos.y - cpuC.cBlobRadius);
     int maxY = floorf(imgPos.y + cpuC.cBlobRadius);
     minX = fmaxf(minX, 0);
     minY = fmaxf(minY, 0);
     maxX = fminf(maxX, xSize-1);
     maxY = fminf(maxY, ySize-1);
 
     int index3D = z * (cpuC.cMaxVolumeIndexYZ+1) * (cpuC.cMaxVolumeIndexX+1) + y * (cpuC.cMaxVolumeIndexX+1) + x;
     float2 vol;
     float w;
     vol.x = vol.y = w = 0.f;
     float dataWeight = space->weight;
 
     // check which pixel in the vicinity should contribute
     for (int i = minY; i <= maxY; i++) {
         float ySqr = (imgPos.y - i) * (imgPos.y - i);
         float yzSqr = ySqr + zSqr;
         if (yzSqr > radiusSqr) continue;
         for (int j = minX; j <= maxX; j++) {
             float xD = imgPos.x - j;
             float distanceSqr = xD*xD + yzSqr;
             if (distanceSqr > radiusSqr) continue;
 
             int index2D = i * xSize + j;
 
             float wBlob;
             if (usePrecomputedInterpolation) {
                 int aux = (int) ((distanceSqr * cpuC.cIDeltaSqrt + 0.5f));
                 wBlob = blobTableSqrt[aux];
             } else if (useFastKaiser) {
                 wBlob = kaiserValueFast(distanceSqr);
             } else {
                 wBlob = kaiserValue<blobOrder>(sqrtf(distanceSqr), cpuC.cBlobRadius) * cpuC.cIw0;
             }
 
             float weight = wBlob * dataWeight;
             w += weight;
             vol += FFT[index2D] * weight;
         }
     }
 
     atomicAddFloat(&tempVolumeGPU[index3D].x, vol.x);
     atomicAddFloat(&tempVolumeGPU[index3D].y, vol.y);
     atomicAddFloat(&tempWeightsGPU[index3D], w);
     //tempVolumeGPU[index3D].x += vol.x;
     //tempVolumeGPU[index3D].y += vol.y;
     //tempWeightsGPU[index3D] += w;
 }

◆ processVoxelCPU()

void processVoxelCPU	(	float2 *const	tempVolumeGPU,
		float *const	tempWeightsGPU,
		const int	x,
		const int	y,
		const int	z,
		const int	xSize,
		const int	ySize,
		const float2 *const __restrict__	FFT,
		const RecFourierProjectionTraverseSpace *const	space
	)

Definition at line 911 of file reconstruct_fourier_codelet_reconstruct.cpp.

 {
     Point3D<float> imgPos;
     float wBlob = 1.f;
 
     float dataWeight = space->weight;
 
     // transform current point to center
     imgPos.x = x - cpuC.cMaxVolumeIndexX/2;
     imgPos.y = y - cpuC.cMaxVolumeIndexYZ/2;
     imgPos.z = z - cpuC.cMaxVolumeIndexYZ/2;
     if (imgPos.x*imgPos.x + imgPos.y*imgPos.y + imgPos.z*imgPos.z > space->maxDistanceSqr) {
         return; // discard iterations that would access pixel with too high frequency
     }
     // rotate around center
     multiply(space->transformInv, imgPos);
     if (imgPos.x < 0.f) return; // reading outside of the image boundary. Z is always correct and Y is checked by the condition above
 
     // transform back and round
     // just Y coordinate needs adjusting, since X now matches to picture and Z is irrelevant
     int imgX = clamp((int)(imgPos.x + 0.5f), 0, xSize - 1);
     int imgY = clamp((int)(imgPos.y + 0.5f + cpuC.cMaxVolumeIndexYZ / 2), 0, ySize - 1);
 
     int index3D = z * (cpuC.cMaxVolumeIndexYZ+1) * (cpuC.cMaxVolumeIndexX+1) + y * (cpuC.cMaxVolumeIndexX+1) + x;
     int index2D = imgY * xSize + imgX;
 
     float weight = wBlob * dataWeight;
 
     // use atomic as two blocks can write to same voxel
     atomicAddFloat(&tempVolumeGPU[index3D].x, FFT[index2D].x * weight);
     atomicAddFloat(&tempVolumeGPU[index3D].y, FFT[index2D].y * weight);
     atomicAddFloat(&tempWeightsGPU[index3D], weight);
     //tempVolumeGPU[index3D].x += FFT[index2D].x * weight;
     //tempVolumeGPU[index3D].y += FFT[index2D].y * weight;
     //tempWeightsGPU[index3D] += weight;
 }

◆ reconstruct_cuda_initialize_constants()

void reconstruct_cuda_initialize_constants	(	int	maxVolIndexX,
		int	maxVolIndexYZ,
		float	blobRadius,
		float	blobAlpha,
		float	iDeltaSqrt,
		float	iw0,
		float	oneOverBessiOrderAlpha
	)

Copies the constants used for the calculation to GPU memory (on every StarPU CUDA worker) and to the host-side copy `cpuC`. Blocking operation.

Definition at line 74 of file reconstruct_fourier_codelet_reconstruct.cpp.

                                                                    {
     CodeletConstants constants = {0};
     constants.cMaxVolumeIndexX = maxVolIndexX;
     constants.cMaxVolumeIndexYZ = maxVolIndexYZ;
     constants.cBlobRadius = blobRadius;
     constants.cOneOverBlobRadiusSqr = 1.f / (blobRadius * blobRadius);
     constants.cBlobAlpha = blobAlpha;
     constants.cIw0 = iw0;
     constants.cIDeltaSqrt = iDeltaSqrt;
     constants.cOneOverBessiOrderAlpha = oneOverBessiOrderAlpha;
 
     // Fill GPU side
     // http://starpu.gforge.inria.fr/doc/html/FrequentlyAskedQuestions.html#HowToInitializeAComputationLibraryOnceForEachWorker
     starpu_execute_on_each_worker(&cuda_set_constants, &constants, STARPU_CUDA);
 
     // Fill CPU side
     memcpy(&cpuC, &constants, sizeof(CodeletConstants));
 }

◆ rotate()

__host__ __device__ void rotate	(	Point3D< float >	box[8],
		const float	transform[3][3]
	)

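Rotates the eight corners of the box around the center of the working space using the given transformation matrix; afterwards only the Y coordinate is shifted back from the centered system.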

Definition at line 290 of file reconstruct_fourier_codelet_reconstruct.cpp.

                                                                 {
     const CodeletConstants& c =
 #ifdef __CUDA_ARCH__
             gpuC;
 #else
             cpuC;
 #endif
 
     for (int i = 0; i < 8; i++) {
         Point3D<float> imgPos;
         // transform current point to center
         imgPos.x = box[i].x - c.cMaxVolumeIndexX/2;
         imgPos.y = box[i].y - c.cMaxVolumeIndexYZ/2;
         imgPos.z = box[i].z - c.cMaxVolumeIndexYZ/2;
         // rotate around center
         multiply(transform, imgPos);
         // transform back just Y coordinate, since X now matches to picture and Z is irrelevant
         imgPos.y += c.cMaxVolumeIndexYZ / 2;
 
         box[i] = imgPos;
     }
 }

Variable Documentation

◆ BLOB_TABLE

__shared__ float BLOB_TABLE[BLOB_TABLE_SIZE_SQRT]

Definition at line 46 of file reconstruct_fourier_codelet_reconstruct.cpp.

◆ cpuC

CodeletConstants cpuC

Definition at line 68 of file reconstruct_fourier_codelet_reconstruct.cpp.

◆ gpuC

__device__ __constant__ CodeletConstants gpuC

Definition at line 67 of file reconstruct_fourier_codelet_reconstruct.cpp.

Classes

Functions

Variables