#include <cuda_runtime_api.h>
#include "reconstruction_cuda/cuda_asserts.h"
#include "cuda_gpu_reconstruct_fourier.h"
#include "reconstruction_cuda/cuda_basic_math.h"
#include "gpu.h"

Include dependency graph for cuda_gpu_reconstruct_fourier.cpp:

Classes
struct	RecFourierBufferDataGPU

Functions
__device__ float	bessi0Fast (float x)

__device__ float	bessi0 (float x)

__device__ float	bessi1 (float x)

__device__ float	bessi2 (float x)

__device__ float	bessi3 (float x)

__device__ float	bessi4 (float x)

template<int order>
__device__ float	kaiserValue (float r, float a)

__device__ float	kaiserValueFast (float distSqr)

float *	allocateTempVolumeGPU (float *&ptr, int size, int typeSize)

void	copyTempVolumes (std::complex< float > *tempVol, float tempWeights, float tempVolGPU, float *tempWeightsGPU, int size)

void	releaseTempVolumeGPU (float *&ptr)

__device__ float	FFT_IDX2DIGFREQ (int idx, int size)

__device__ float	getZ (float x, float y, const Point3D< float > &n, const Point3D< float > &p0)

__device__ float	getY (float x, float z, const Point3D< float > &n, const Point3D< float > &p0)

__device__ float	getX (float y, float z, const Point3D< float > &n, const Point3D< float > &p0)

__device__ void	multiply (const float transform[3][3], Point3D< float > &inOut)

__device__ void	computeAABB (Point3D< float > *AABB, Point3D< float > *cuboid)

template<bool hasCTF>
__device__ void	processVoxel (float2 *tempVolumeGPU, float *tempWeightsGPU, int x, int y, int z, int xSize, int ySize, const float *__restrict__ CTF, const float *__restrict__ modulator, const float2 *__restrict__ FFT, const RecFourierProjectionTraverseSpace *const space)

template<bool hasCTF, int blobOrder, bool useFastKaiser>
__device__ void	processVoxelBlob (float2 *tempVolumeGPU, float *tempWeightsGPU, int x, int y, int z, int xSize, int ySize, const float *__restrict__ CTF, const float *__restrict__ modulator, const float2 *__restrict__ FFT, const RecFourierProjectionTraverseSpace *const space, const float *blobTableSqrt, int imgCacheDim)

template<bool useFast, bool hasCTF, int blobOrder, bool useFastKaiser>
__device__ void	processProjection (float2 *tempVolumeGPU, float *tempWeightsGPU, int xSize, int ySize, const float *__restrict__ CTF, const float *__restrict__ modulator, const float2 *__restrict__ FFT, const RecFourierProjectionTraverseSpace *const tSpace, const float *devBlobTableSqrt, int imgCacheDim)

__device__ void	rotate (Point3D< float > *box, const float transform[3][3])

__device__ void	calculateAABB (const RecFourierProjectionTraverseSpace *tSpace, const RecFourierBufferDataGPU *buffer, Point3D< float > *dest)

__device__ bool	isWithin (Point3D< float > *AABB, int imgXSize, int imgYSize)

__device__ void	getImgData (Point3D< float > *AABB, int tXindex, int tYindex, RecFourierBufferDataGPU *const buffer, int imgIndex, float &vReal, float &vImag)

__device__ void	copyImgToCache (float2 *dest, Point3D< float > *AABB, RecFourierBufferDataGPU *const buffer, int imgIndex, int imgCacheDim)

template<bool useFast, bool hasCTF, int blobOrder, bool useFastKaiser>
__global__ void	processBufferKernel (float *tempVolumeGPU, float *tempWeightsGPU, RecFourierBufferDataGPU *buffer, float *devBlobTableSqrt, int imgCacheDim)

__global__ void	convertImagesKernel (std::complex< float > *iFouriers, int iSizeX, int iSizeY, int iLength, RecFourierBufferDataGPU *oBuffer, float maxResolutionSqr)

void	convertImages (FRecBufferDataGPUWrapper *wrapper, float maxResolutionSqr, int streamIndex)

void	waitForGPU ()

void	createStreams (int count)

void	deleteStreams (int count)

void	pinMemory (RecFourierBufferData *buffer)

void	unpinMemory (RecFourierBufferData *buffer)

void	allocateWrapper (RecFourierBufferData *buffer, int streamIndex)

void	copyBlobTable (float *blobTableSqrt, int blobTableSize)

void	releaseBlobTable ()

void	releaseWrapper (int streamIndex)

void	copyConstants (int maxVolIndexX, int maxVolIndexYZ, float blobRadius, float blobAlpha, float iDeltaSqrt, float iw0, float oneOverBessiOrderAlpha)

template<int blobOrder, bool useFastKaiser>
void	processBufferGPU_ (float *tempVolumeGPU, float *tempWeightsGPU, RecFourierBufferData *buffer, float blobRadius, int maxVolIndexYZ, bool useFast, float maxResolutionSqr, int streamIndex)

void	processBufferGPU (float *tempVolumeGPU, float *tempWeightsGPU, RecFourierBufferData *buffer, float blobRadius, int maxVolIndexYZ, bool useFast, float maxResolutionSqr, int streamIndex, int blobOrder, float blobAlpha)

Variables
cudaStream_t *	streams

FRecBufferDataGPUWrapper **	wrappers

float *	devBlobTableSqrt = NULL

__device__ __constant__ int	cMaxVolumeIndexX = 0

__device__ __constant__ int	cMaxVolumeIndexYZ = 0

__device__ __constant__ float	cBlobRadius = 0.f

__device__ __constant__ float	cOneOverBlobRadiusSqr = 0.f

__device__ __constant__ float	cBlobAlpha = 0.f

__device__ __constant__ float	cIw0 = 0.f

__device__ __constant__ float	cIDeltaSqrt = 0.f

__device__ __constant__ float	cOneOverBessiOrderAlpha = 0.f

Function Documentation

◆ bessi0()

__device__ float bessi0 ( float x )

Definition at line 86 of file cuda_gpu_reconstruct_fourier.cpp.

 {
     float y, ax, ans;
     if ((ax = fabsf(x)) < 3.75f)
     {
         y = x / 3.75f;
         y *= y;
         ans = 1.f + y * (3.5156229f + y * (3.0899424f + y * (1.2067492f
                                           + y * (0.2659732f + y * (0.360768e-1f + y * 0.45813e-2f)))));
     }
     else
     {
         y = 3.75f / ax;
         ans = (expf(ax) * rsqrtf(ax)) * (0.39894228f + y * (0.1328592e-1f
                                       + y * (0.225319e-2f + y * (-0.157565e-2f + y * (0.916281e-2f
                                                                 + y * (-0.2057706e-1f + y * (0.2635537e-1f + y * (-0.1647633e-1f
                                                                                             + y * 0.392377e-2f))))))));
     }
     return ans;
 }

◆ bessi0Fast()

__device__ float bessi0Fast ( float x )

Definition at line 62 of file cuda_gpu_reconstruct_fourier.cpp.

                           { // X must be <= 15
     // stable rational minimax approximations to the modified bessel functions, blair, edwards
     // from table 5
     float x2 = x*x;
     float num = -0.8436825781374849e-19f; // p11
     num = fmaf(num, x2, -0.93466495199548700e-17f); // p10
     num = fmaf(num, x2, -0.15716375332511895e-13f); // p09
     num = fmaf(num, x2, -0.42520971595532318e-11f); // p08
     num = fmaf(num, x2, -0.13704363824102120e-8f);  // p07
     num = fmaf(num, x2, -0.28508770483148419e-6f);  // p06
     num = fmaf(num, x2, -0.44322160233346062e-4f);  // p05
     num = fmaf(num, x2, -0.46703811755736946e-2f);  // p04
     num = fmaf(num, x2, -0.31112484643702141e-0f);  // p03
     num = fmaf(num, x2, -0.11512633616429962e+2f);  // p02
     num = fmaf(num, x2, -0.18720283332732112e+3f);  // p01
     num = fmaf(num, x2, -0.75281108169006924e+3f);  // p00
 
     float den = 1.f; // q01
     den = fmaf(den, x2, -0.75281109410939403e+3f); // q00
 
     return num/den;
 }

◆ bessi1()

__device__ float bessi1 ( float x )

Definition at line 109 of file cuda_gpu_reconstruct_fourier.cpp.

 {
     float ax, ans;
     float y;
     if ((ax = fabsf(x)) < 3.75f)
     {
         y = x / 3.75f;
         y *= y;
         ans = ax * (0.5f + y * (0.87890594f + y * (0.51498869f + y * (0.15084934f
                                + y * (0.2658733e-1f + y * (0.301532e-2f + y * 0.32411e-3f))))));
     }
     else
     {
         y = 3.75f / ax;
         ans = 0.2282967e-1f + y * (-0.2895312e-1f + y * (0.1787654e-1f
                                   - y * 0.420059e-2f));
         ans = 0.39894228f + y * (-0.3988024e-1f + y * (-0.362018e-2f
                                 + y * (0.163801e-2f + y * (-0.1031555e-1f + y * ans))));
         ans *= (expf(ax) * rsqrtf(ax));
     }
     return x < 0.0 ? -ans : ans;
 }

◆ bessi2()

__device__ float bessi2 ( float x )

Definition at line 133 of file cuda_gpu_reconstruct_fourier.cpp.

 {
     return (x == 0) ? 0 : bessi0(x) - ((2*1) / x) * bessi1(x);
 }

◆ bessi3()

__device__ float bessi3 ( float x )

Definition at line 139 of file cuda_gpu_reconstruct_fourier.cpp.

 {
     return (x == 0) ? 0 : bessi1(x) - ((2*2) / x) * bessi2(x);
 }

◆ bessi4()

__device__ float bessi4 ( float x )

Definition at line 145 of file cuda_gpu_reconstruct_fourier.cpp.

 {
     return (x == 0) ? 0 : bessi2(x) - ((2*3) / x) * bessi3(x);
 }

◆ calculateAABB()

__device__ void calculateAABB	(	const RecFourierProjectionTraverseSpace *	tSpace,
		const RecFourierBufferDataGPU *	buffer,
		Point3D< float > *	dest
	)

Method calculates an Axis Aligned Bounding Box in the image space. AABB is guaranteed to be big enough that all threads in the block, while processing the traverse space, will not read image data outside of the AABB

Definition at line 774 of file cuda_gpu_reconstruct_fourier.cpp.

                                                                                                                                  {
     Point3D<float> box[8];
     // calculate AABB for the whole working block
     if (tSpace->XY == tSpace->dir) { // iterate XY plane
         box[0].x = box[3].x = box[4].x = box[7].x = blockIdx.x*blockDim.x - cBlobRadius;
         box[1].x = box[2].x = box[5].x = box[6].x = (blockIdx.x+1)*blockDim.x + cBlobRadius - 1.f;
 
         box[2].y = box[3].y = box[6].y = box[7].y = (blockIdx.y+1)*blockDim.y + cBlobRadius - 1.f;
         box[0].y = box[1].y = box[4].y = box[5].y = blockIdx.y*blockDim.y- cBlobRadius;
 
         box[0].z = getZ(box[0].x, box[0].y, tSpace->unitNormal, tSpace->bottomOrigin);
         box[4].z = getZ(box[4].x, box[4].y, tSpace->unitNormal, tSpace->topOrigin);
 
         box[3].z = getZ(box[3].x, box[3].y, tSpace->unitNormal, tSpace->bottomOrigin);
         box[7].z = getZ(box[7].x, box[7].y, tSpace->unitNormal, tSpace->topOrigin);
 
         box[2].z = getZ(box[2].x, box[2].y, tSpace->unitNormal, tSpace->bottomOrigin);
         box[6].z = getZ(box[6].x, box[6].y, tSpace->unitNormal, tSpace->topOrigin);
 
         box[1].z = getZ(box[1].x, box[1].y, tSpace->unitNormal, tSpace->bottomOrigin);
         box[5].z = getZ(box[5].x, box[5].y, tSpace->unitNormal, tSpace->topOrigin);
     } else if (tSpace->XZ == tSpace->dir) { // iterate XZ plane
         box[0].x = box[3].x = box[4].x = box[7].x = blockIdx.x*blockDim.x - cBlobRadius;
         box[1].x = box[2].x = box[5].x = box[6].x = (blockIdx.x+1)*blockDim.x + cBlobRadius - 1.f;
 
         box[2].z = box[3].z = box[6].z = box[7].z = (blockIdx.y+1)*blockDim.y + cBlobRadius - 1.f;
         box[0].z = box[1].z = box[4].z = box[5].z = blockIdx.y*blockDim.y- cBlobRadius;
 
         box[0].y = getY(box[0].x, box[0].z, tSpace->unitNormal, tSpace->bottomOrigin);
         box[4].y = getY(box[4].x, box[4].z, tSpace->unitNormal, tSpace->topOrigin);
 
         box[3].y = getY(box[3].x, box[3].z, tSpace->unitNormal, tSpace->bottomOrigin);
         box[7].y = getY(box[7].x, box[7].z, tSpace->unitNormal, tSpace->topOrigin);
 
         box[2].y = getY(box[2].x, box[2].z, tSpace->unitNormal, tSpace->bottomOrigin);
         box[6].y = getY(box[6].x, box[6].z, tSpace->unitNormal, tSpace->topOrigin);
 
         box[1].y = getY(box[1].x, box[1].z, tSpace->unitNormal, tSpace->bottomOrigin);
         box[5].y = getY(box[5].x, box[5].z, tSpace->unitNormal, tSpace->topOrigin);
     } else { // iterate YZ plane
         box[0].y = box[3].y = box[4].y = box[7].y = blockIdx.x*blockDim.x - cBlobRadius;
         box[1].y = box[2].y = box[5].y = box[6].y = (blockIdx.x+1)*blockDim.x + cBlobRadius - 1.f;
 
         box[2].z = box[3].z = box[6].z = box[7].z = (blockIdx.y+1)*blockDim.y + cBlobRadius - 1.f;
         box[0].z = box[1].z = box[4].z = box[5].z = blockIdx.y*blockDim.y- cBlobRadius;
 
         box[0].x = getX(box[0].y, box[0].z, tSpace->unitNormal, tSpace->bottomOrigin);
         box[4].x = getX(box[4].y, box[4].z, tSpace->unitNormal, tSpace->topOrigin);
 
         box[3].x = getX(box[3].y, box[3].z, tSpace->unitNormal, tSpace->bottomOrigin);
         box[7].x = getX(box[7].y, box[7].z, tSpace->unitNormal, tSpace->topOrigin);
 
         box[2].x = getX(box[2].y, box[2].z, tSpace->unitNormal, tSpace->bottomOrigin);
         box[6].x = getX(box[6].y, box[6].z, tSpace->unitNormal, tSpace->topOrigin);
 
         box[1].x = getX(box[1].y, box[1].z, tSpace->unitNormal, tSpace->bottomOrigin);
         box[5].x = getX(box[5].y, box[5].z, tSpace->unitNormal, tSpace->topOrigin);
     }
     // transform AABB to the image domain
     rotate(box, tSpace->transformInv);
     // AABB is projected on image. Create new AABB that will encompass all vertices
     computeAABB(dest, box);
 }

◆ computeAABB()

__device__ void computeAABB	(	Point3D< float > *	AABB,
		Point3D< float > *	cuboid
	)

Compute Axis Aligned Bounding Box of given cuboid

Definition at line 428 of file cuda_gpu_reconstruct_fourier.cpp.

                                                                {
     AABB[0].x = AABB[0].y = AABB[0].z = INFINITY;
     AABB[1].x = AABB[1].y = AABB[1].z = -INFINITY;
     Point3D<float> tmp;
     for (int i = 0; i < 8; i++) {
         tmp = cuboid[i];
         if (AABB[0].x > tmp.x) AABB[0].x = tmp.x;
         if (AABB[0].y > tmp.y) AABB[0].y = tmp.y;
         if (AABB[0].z > tmp.z) AABB[0].z = tmp.z;
         if (AABB[1].x < tmp.x) AABB[1].x = tmp.x;
         if (AABB[1].y < tmp.y) AABB[1].y = tmp.y;
         if (AABB[1].z < tmp.z) AABB[1].z = tmp.z;
     }
     AABB[0].x = ceilf(AABB[0].x);
     AABB[0].y = ceilf(AABB[0].y);
     AABB[0].z = ceilf(AABB[0].z);
 
     AABB[1].x = floorf(AABB[1].x);
     AABB[1].y = floorf(AABB[1].y);
     AABB[1].z = floorf(AABB[1].z);
 }

◆ convertImages()

void convertImages	(	FRecBufferDataGPUWrapper *	wrapper,
		float	maxResolutionSqr,
		int	streamIndex
	)

Method takes padded input pictures, performs FFT and convert resulting images as necessary for the algorithm. Asynchronous method.

Definition at line 1008 of file cuda_gpu_reconstruct_fourier.cpp.

                          {
 
     cudaStream_t stream = streams[streamIndex];
 
     RecFourierBufferDataGPU* hostBuffer = wrapper->cpuCopy;
     // store to proper structure
     GpuMultidimArrayAtGpu<float> imagesGPU(
             hostBuffer->paddedImgSize, hostBuffer->paddedImgSize, 1, hostBuffer->noOfImages, hostBuffer->paddedImages);
     // perform FFT
     GpuMultidimArrayAtGpu<std::complex<float> > resultingFFT;
     mycufftHandle myhandle;
     imagesGPU.fft(resultingFFT, myhandle);
     myhandle.clear(); // release unnecessary memory
     imagesGPU.d_data = NULL; // unbind the data
 
     // now we have performed FFTs of the input images
     // buffers have to be updated accordingly
     hostBuffer->hasFFTs = true;
     cudaMemsetAsync(hostBuffer->FFTs, 0.f, hostBuffer->getFFTsByteSize(), stream); // clear it, as kernel writes only to some parts
     wrapper->copyToDevice(streamIndex);
     gpuErrchk( cudaPeekAtLastError() );
 
     // run kernel, one thread for each pixel of input FFT
     dim3 dimBlock(BLOCK_DIM, BLOCK_DIM);
     dim3 dimGrid(ceil(resultingFFT.Xdim/(float)dimBlock.x), ceil(resultingFFT.Ydim/(float)dimBlock.y));
     convertImagesKernel<<<dimGrid, dimBlock, 0, stream>>>(
             resultingFFT.d_data, resultingFFT.Xdim, resultingFFT.Ydim, resultingFFT.Ndim,
             wrapper->gpuCopy, maxResolutionSqr);
     // now we have converted input images to FFTs in the required format
 }

◆ convertImagesKernel()

__global__ void convertImagesKernel	(	std::complex< float > *	iFouriers,
		int	iSizeX,
		int	iSizeY,
		int	iLength,
		RecFourierBufferDataGPU *	oBuffer,
		float	maxResolutionSqr
	)

Method will process the 'paddedFourier' (not shifted, i.e. low frequencies are in corners) in the following way: (1) high frequencies are skipped (replaced by zero); (2) the space is shifted so that low frequencies are in the middle of the Y axis; (3) the resulting space is cropped. The method produces a 2D array of Fourier coefficients, shifted so that low frequencies are in the center of the Y axis (i.e. a semicircle).

Definition at line 963 of file cuda_gpu_reconstruct_fourier.cpp.

                                                                    {
     // assign pixel to thread
     volatile int idx = blockIdx.x*blockDim.x + threadIdx.x;
     volatile int idy = blockIdx.y*blockDim.y + threadIdx.y;
 
     int halfY = iSizeY / 2;
     float normFactor = iSizeY*iSizeY;
     int oSizeX = oBuffer->fftSizeX;
 
     // input is an image in Fourier space (not normalized)
     // with low frequencies in the inner corners
     for (int n = 0; n < iLength; n++) {
         float2 freq;
         if ((idy < iSizeY) // for all input lines
                 && (idx < oSizeX)) { // for all output pixels in the line
             // process line only if it can hold sufficiently high frequency, i.e. process only
             // first and last N lines
             if (idy < oSizeX || idy >= (iSizeY - oSizeX)) {
                 // check the frequency
                 freq.x = FFT_IDX2DIGFREQ(idx, iSizeY);
                 freq.y = FFT_IDX2DIGFREQ(idy, iSizeY);
                 if ((freq.x * freq.x + freq.y * freq.y) > maxResolutionSqr) {
                     continue;
                 }
                 // do the shift (lower line will move up, upper down)
                 int newY = (idy < halfY) ? (idy + oSizeX) : (idy - iSizeY + oSizeX);
                 int oIndex = newY*oSizeX + idx;
 
                 int iIndex = n*iSizeY*iSizeX + idy*iSizeX + idx;
                 float* iValue = (float*)&(iFouriers[iIndex]);
 
                 // copy data and perform normalization
                 oBuffer->getNthItem(oBuffer->FFTs, n)[2*oIndex] = iValue[0] / normFactor;
                 oBuffer->getNthItem(oBuffer->FFTs, n)[2*oIndex + 1] = iValue[1] / normFactor;
             }
         }
     }
 }

◆ copyImgToCache()

__device__ void copyImgToCache	(	float2 *	dest,
		Point3D< float > *	AABB,
		RecFourierBufferDataGPU *const	buffer,
		int	imgIndex,
		int	imgCacheDim
	)

Method will copy imgIndex(th) data from buffer to given destination (shared memory). Only data within AABB will be copied. Destination is expected to be continuous array of sufficient size (imgCacheDim^2)

Definition at line 883 of file cuda_gpu_reconstruct_fourier.cpp.

                           {
     for (int y = threadIdx.y; y < imgCacheDim; y += blockDim.y) {
         for (int x = threadIdx.x; x < imgCacheDim; x += blockDim.x) {
             int memIndex = y * imgCacheDim + x;
             getImgData(AABB, x, y, buffer, imgIndex, dest[memIndex].x, dest[memIndex].y);
         }
     }
 }

◆ FFT_IDX2DIGFREQ()

__device__ float FFT_IDX2DIGFREQ	(	int	idx,
		int	size
	)

Index to frequency

Given an index and a size of the FFT, this function returns the corresponding digital frequency (-1/2 to 1/2)

Definition at line 382 of file cuda_gpu_reconstruct_fourier.cpp.

                                          {
     if (size <= 1) return 0;
     return ((idx <= (size / 2)) ? idx : (-size + idx)) / (float)size;
 }

◆ getImgData()

__device__ void getImgData	(	Point3D< float > *	AABB,
		int	tXindex,
		int	tYindex,
		RecFourierBufferDataGPU *const	buffer,
		int	imgIndex,
		float &	vReal,
		float &	vImag
	)

Method will load data from image at position tXindex, tYindex and return them. In case the data lies outside of the image boundaries, zeros (0,0) are returned

Definition at line 856 of file cuda_gpu_reconstruct_fourier.cpp.

                                     {
     int imgXindex = tXindex + AABB[0].x;
     int imgYindex = tYindex + AABB[0].y;
     if ((imgXindex >=0)
             && (imgXindex < buffer->fftSizeX)
             && (imgYindex >=0)
             && (imgYindex < buffer->fftSizeY))  {
         int index = imgYindex * buffer->fftSizeX + imgXindex; // copy data from image
         vReal = buffer->getNthItem(buffer->FFTs, imgIndex)[2*index];
         vImag = buffer->getNthItem(buffer->FFTs, imgIndex)[2*index + 1];
 
     } else {
         vReal = vImag = 0.f; // out of image bound, so return zero
     }
 }

◆ getX()

__device__ float getX	(	float	y,
		float	z,
		const Point3D< float > &	n,
		const Point3D< float > &	p0
	)

Calculates X coordinate of the point [y, z] on the plane defined by p0 (origin) and normal

Definition at line 410 of file cuda_gpu_reconstruct_fourier.cpp.

                                                                                {
     // from a(x-x0)+b(y-y0)+c(z-z0)=0
     return (-n.y*(y-p0.y)-n.z*(z-p0.z))/n.x + p0.x;
 }

◆ getY()

__device__ float getY	(	float	x,
		float	z,
		const Point3D< float > &	n,
		const Point3D< float > &	p0
	)

Calculates Y coordinate of the point [x, z] on the plane defined by p0 (origin) and normal

Definition at line 400 of file cuda_gpu_reconstruct_fourier.cpp.

                                                                                {
     // from a(x-x0)+b(y-y0)+c(z-z0)=0
     return (-n.x*(x-p0.x)-n.z*(z-p0.z))/n.y + p0.y;
 }

◆ getZ()

__device__ float getZ	(	float	x,
		float	y,
		const Point3D< float > &	n,
		const Point3D< float > &	p0
	)

Calculates Z coordinate of the point [x, y] on the plane defined by p0 (origin) and normal

Definition at line 391 of file cuda_gpu_reconstruct_fourier.cpp.

                                                                                 {
     // from a(x-x0)+b(y-y0)+c(z-z0)=0
     return (-n.x*(x-p0.x)-n.y*(y-p0.y))/n.z + p0.z;
 }

◆ isWithin()

__device__ bool isWithin	(	Point3D< float > *	AABB,
		int	imgXSize,
		int	imgYSize
	)

Method returns true if AABB lies within the image boundaries

Definition at line 842 of file cuda_gpu_reconstruct_fourier.cpp.

                                                                 {
     return (AABB[0].x < imgXSize)
             && (AABB[1].x >= 0)
             && (AABB[0].y < imgYSize)
             && (AABB[1].y >= 0);
 }

◆ kaiserValue()

template<int order>

__device__ float kaiserValue	(	float	r,
		float	a
	)

Definition at line 153 of file cuda_gpu_reconstruct_fourier.cpp.

 {
     float rda, rdas, arg, w;
 
     rda = r / a;
     if (rda <= 1.f)
     {
         rdas = rda * rda;
         arg = cBlobAlpha * sqrtf(1.f - rdas);
         if (order == 0)
         {
             w = bessi0(arg) * cOneOverBessiOrderAlpha;
         }
         else if (order == 1)
         {
             w = sqrtf (1.f - rdas);
             w *= bessi1(arg) * cOneOverBessiOrderAlpha;
         }
         else if (order == 2)
         {
             w = sqrtf (1.f - rdas);
             w = w * w;
             w *= bessi2(arg) * cOneOverBessiOrderAlpha;
         }
         else if (order == 3)
         {
             w = sqrtf (1.f - rdas);
             w = w * w * w;
             w *= bessi3(arg) * cOneOverBessiOrderAlpha;
         }
         else if (order == 4)
         {
             w = sqrtf (1.f - rdas);
             w = w * w * w *w;
             w *= bessi4(arg) * cOneOverBessiOrderAlpha;
         }
         else {
             printf("order (%d) out of range in kaiser_value(): %s, %d\n", order, __FILE__, __LINE__);
         }
     }
     else
         w = 0.f;
 
     return w;
 }

◆ kaiserValueFast()

__device__ float kaiserValueFast ( float distSqr )

Definition at line 200 of file cuda_gpu_reconstruct_fourier.cpp.

                                      {
     float arg = cBlobAlpha * sqrtf(1.f - (distSqr * cOneOverBlobRadiusSqr)); // alpha * sqrt(1-(dist/blobRadius^2))
     return bessi0Fast(arg) * cOneOverBessiOrderAlpha * cIw0;
 }

◆ multiply()

__device__ void multiply	(	const float	transform[3][3],
		Point3D< float > &	inOut
	)

Do 3x3 x 1x3 matrix-vector multiplication

Definition at line 417 of file cuda_gpu_reconstruct_fourier.cpp.

                                                                   {
     float tmp0 = transform[0][0] * inOut.x + transform[0][1] * inOut.y + transform[0][2] * inOut.z;
     float tmp1 = transform[1][0] * inOut.x + transform[1][1] * inOut.y + transform[1][2] * inOut.z;
     float tmp2 = transform[2][0] * inOut.x + transform[2][1] * inOut.y + transform[2][2] * inOut.z;
     inOut.x = tmp0;
     inOut.y = tmp1;
     inOut.z = tmp2;
 }

◆ processBufferGPU_()

template<int blobOrder, bool useFastKaiser>

void processBufferGPU_	(	float *	tempVolumeGPU,
		float *	tempWeightsGPU,
		RecFourierBufferData *	buffer,
		float	blobRadius,
		int	maxVolIndexYZ,
		bool	useFast,
		float	maxResolutionSqr,
		int	streamIndex
	)

Method will use data stored in the buffer and update temporal storages appropriately. Actual calculation is done asynchronously, but 'buffer' can be reused once the method returns.

Definition at line 1136 of file cuda_gpu_reconstruct_fourier.cpp.

                                                  {
 
     cudaStream_t stream = streams[streamIndex];
 
     // copy all data to gpu
     FRecBufferDataGPUWrapper* wrapper = wrappers[streamIndex];
     wrapper->copyFrom(buffer, streamIndex);
     wrapper->copyToDevice(streamIndex);
 
     // process input data if necessary
     if ( ! wrapper->cpuCopy->hasFFTs) {
         convertImages(wrapper, maxResolutionSqr, streamIndex);
     }
     // now wait till all necessary data are loaded to GPU (so that host can continue in work)
     cudaStreamSynchronize(stream);
 
     // enqueue kernel and return control
     int size2D = maxVolIndexYZ + 1;
     int imgCacheDim = ceil(sqrt(2.f) * sqrt(3.f) *(BLOCK_DIM + 2*blobRadius));
     dim3 dimBlock(BLOCK_DIM, BLOCK_DIM);
     dim3 dimGrid(ceil(size2D/(float)dimBlock.x),ceil(size2D/(float)dimBlock.y), GRID_DIM_Z);
 
     // by using templates, we can save some registers, especially for 'fast' version
     if (useFast && buffer->hasCTFs) {
         processBufferKernel<true, true, blobOrder,useFastKaiser><<<dimGrid, dimBlock, 0, stream>>>(
             tempVolumeGPU, tempWeightsGPU,
             wrapper->gpuCopy,
             devBlobTableSqrt,
             imgCacheDim);
            return;
    }
    if (useFast && !buffer->hasCTFs) {
        processBufferKernel<true, false, blobOrder,useFastKaiser><<<dimGrid, dimBlock, 0, stream>>>(
                 tempVolumeGPU, tempWeightsGPU,
                 wrapper->gpuCopy,
                 devBlobTableSqrt,
                 imgCacheDim);
        return;
    }
    // if making copy of the image in shared memory, allocate enough space
    int sharedMemSize = SHARED_IMG ? (imgCacheDim*imgCacheDim*sizeof(float2)) : 0;
    if (!useFast && buffer->hasCTFs) {
        processBufferKernel<false, true, blobOrder,useFastKaiser><<<dimGrid, dimBlock, sharedMemSize, stream>>>(
             tempVolumeGPU, tempWeightsGPU,
             wrapper->gpuCopy,
             devBlobTableSqrt,
             imgCacheDim);
        return;
    }
    if (!useFast && !buffer->hasCTFs) {
        processBufferKernel<false, false, blobOrder,useFastKaiser><<<dimGrid, dimBlock, sharedMemSize, stream>>>(
             tempVolumeGPU, tempWeightsGPU,
             wrapper->gpuCopy,
             devBlobTableSqrt,
             imgCacheDim);
        return;
    }
 }

◆ processBufferKernel()

template<bool useFast, bool hasCTF, int blobOrder, bool useFastKaiser>

__global__ void processBufferKernel	(	float *	tempVolumeGPU,
		float *	tempWeightsGPU,
		RecFourierBufferDataGPU *	buffer,
		float *	devBlobTableSqrt,
		int	imgCacheDim
	)

Method will use data stored in the buffer and update temporal storages appropriately.

Definition at line 900 of file cuda_gpu_reconstruct_fourier.cpp.

                          {
     // Each block handles the traverse spaces blockIdx.z, blockIdx.z + gridDim.z, ...
     // of 'buffer' and forwards every one of them to processProjection.
     // 'useFast' is a template parameter, so the branches guarded by it (and the barriers
     // inside them) are taken uniformly by all threads of the block.
 #if SHARED_BLOB_TABLE
     if ( ! useFast) {
         // copy blob table to shared memory
         // (threads of the block copy the entries cooperatively, each starting at its linear id)
         volatile int id = threadIdx.y*blockDim.x + threadIdx.x;
         volatile int blockSize = blockDim.x * blockDim.y;
         for (int i = id; i < BLOB_TABLE_SIZE_SQRT; i+= blockSize)
             BLOB_TABLE[i] = devBlobTableSqrt[i];
         // the table must be complete before any thread reads it
         __syncthreads();
     }
 #endif
 
     for (int i = blockIdx.z; i < buffer->getNoOfSpaces(); i += gridDim.z) {
         RecFourierProjectionTraverseSpace* space = &buffer->spaces[i];
 
 #if SHARED_IMG
         if ( ! useFast) {
             // make sure that all threads start at the same time
             // as they can come from previous iteration
             __syncthreads();
             if ((threadIdx.x == 0) && (threadIdx.y == 0)) {
                 // first thread calculates which part of the image should be shared
                 calculateAABB(space, buffer, SHARED_AABB);
             }
             // SHARED_AABB is written by one thread only; wait until it is available to all
             __syncthreads();
             // check if the block will have to copy data from image
             // (the test reads the same SHARED_AABB in every thread, so the whole block
             // takes the same branch)
             if (isWithin(SHARED_AABB, buffer->fftSizeX, buffer->fftSizeY)) {
                 // all threads copy image data to shared memory
                 copyImgToCache(IMG, SHARED_AABB, buffer, space->projectionIndex, imgCacheDim);
                 __syncthreads();
             } else {
                 continue; // whole block can exit, as it's not reading from image
             }
         }
 #endif
 
         // CTF, modulator and FFT data are selected by the projection index of the current space
         processProjection<useFast, hasCTF, blobOrder, useFastKaiser>(
             (float2*)tempVolumeGPU, tempWeightsGPU,
             buffer->fftSizeX, buffer->fftSizeY,
             buffer->getNthItem(buffer->CTFs, space->projectionIndex),
             buffer->getNthItem(buffer->modulators, space->projectionIndex),
             (float2*)buffer->getNthItem(buffer->FFTs, space->projectionIndex),
             space,
             devBlobTableSqrt,
             imgCacheDim);
         __syncthreads(); // sync threads to avoid write after read problems
     }
 }

◆ processProjection()

template<bool useFast, bool hasCTF, int blobOrder, bool useFastKaiser>

__device__ void processProjection	(	float2 *	tempVolumeGPU,
		float *	tempWeightsGPU,
		int	xSize,
		int	ySize,
		const float *__restrict__	CTF,
		const float *__restrict__	modulator,
		const float2 *__restrict__	FFT,
		const RecFourierProjectionTraverseSpace *const	tSpace,
		const float *	devBlobTableSqrt,
		int	imgCacheDim
	)

Method will process one projection image and add result to temporal spaces.

Definition at line 660 of file cuda_gpu_reconstruct_fourier.cpp.

 {
     // Maps the calling thread to one (idx, idy) position of the plane selected by
     // tSpace->dir and, if that position lies inside the min/max range of the traverse
     // space, processes the voxel(s) along the remaining axis:
     //  - useFast:  a single voxel at the rounded intersection with the plane through bottomOrigin
     //  - !useFast: all voxels between the planes through bottomOrigin and topOrigin
     // map thread to each (2D) voxel
 #if TILE > 1
     // remap the linear thread id so that threads are arranged in groups TILE columns wide
     // NOTE(review): presumably done to improve memory access locality - confirm
     int id = threadIdx.y * blockDim.x + threadIdx.x;
     int tidX = threadIdx.x % TILE + (id / (blockDim.y * TILE)) * TILE;
     int tidY = (id / TILE) % blockDim.y;
     int idx = blockIdx.x*blockDim.x + tidX;
     int idy = blockIdx.y*blockDim.y + tidY;
 #else
     // map thread to each (2D) voxel
     volatile int idx = blockIdx.x*blockDim.x + threadIdx.x;
     volatile int idy = blockIdx.y*blockDim.y + threadIdx.y;
 #endif
 
     if (tSpace->XY == tSpace->dir) { // iterate XY plane
         // idx -> x, idy -> y, z is computed from the plane equation
         if (idy >= tSpace->minY && idy <= tSpace->maxY) {
             if (idx >= tSpace->minX && idx <= tSpace->maxX) {
                 if (useFast) {
                     float hitZ = getZ(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin);
                     int z = (int)(hitZ + 0.5f); // rounding
                     processVoxel<hasCTF>(tempVolumeGPU, tempWeightsGPU, idx, idy, z, xSize, ySize , CTF, modulator, FFT, tSpace);
                 } else {
                     float z1 = getZ(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin); // lower plane
                     float z2 = getZ(idx, idy, tSpace->unitNormal, tSpace->topOrigin); // upper plane
                     // keep both intersections inside the volume
                     z1 = clamp(z1, 0, cMaxVolumeIndexYZ);
                     z2 = clamp(z2, 0, cMaxVolumeIndexYZ);
                     // the planes may come in either order, hence min/max
                     int lower = floorf(fminf(z1, z2));
                     int upper = ceilf(fmaxf(z1, z2));
                     for (int z = lower; z <= upper; z++) {
                         processVoxelBlob<hasCTF, blobOrder, useFastKaiser>(tempVolumeGPU, tempWeightsGPU, idx, idy, z, xSize, ySize , CTF, modulator, FFT, tSpace, devBlobTableSqrt, imgCacheDim);
                     }
                 }
             }
         }
     } else if (tSpace->XZ == tSpace->dir) { // iterate XZ plane
         // idx -> x, idy -> z, y is computed from the plane equation
         if (idy >= tSpace->minZ && idy <= tSpace->maxZ) { // map z -> y
             if (idx >= tSpace->minX && idx <= tSpace->maxX) {
                 if (useFast) {
                     float hitY =getY(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin);
                     int y = (int)(hitY + 0.5f); // rounding
                     processVoxel<hasCTF>(tempVolumeGPU, tempWeightsGPU, idx, y, idy, xSize, ySize , CTF, modulator, FFT, tSpace);
                 } else {
                     float y1 = getY(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin); // lower plane
                     float y2 = getY(idx, idy, tSpace->unitNormal, tSpace->topOrigin); // upper plane
                     // keep both intersections inside the volume
                     y1 = clamp(y1, 0, cMaxVolumeIndexYZ);
                     y2 = clamp(y2, 0, cMaxVolumeIndexYZ);
                     // the planes may come in either order, hence min/max
                     int lower = floorf(fminf(y1, y2));
                     int upper = ceilf(fmaxf(y1, y2));
                     for (int y = lower; y <= upper; y++) {
                         processVoxelBlob<hasCTF, blobOrder, useFastKaiser>(tempVolumeGPU, tempWeightsGPU, idx, y, idy, xSize, ySize , CTF, modulator, FFT, tSpace, devBlobTableSqrt, imgCacheDim);
                     }
                 }
             }
         }
     } else { // iterate YZ plane
         // idx -> y, idy -> z, x is computed from the plane equation
         if (idy >= tSpace->minZ && idy <= tSpace->maxZ) { // map z -> y
             if (idx >= tSpace->minY && idx <= tSpace->maxY) { // map y > x
                 if (useFast) {
                     float hitX = getX(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin);
                     int x = (int)(hitX + 0.5f); // rounding
                     processVoxel<hasCTF>(tempVolumeGPU, tempWeightsGPU, x, idx, idy, xSize, ySize , CTF, modulator, FFT, tSpace);
                 } else {
                     float x1 = getX(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin); // lower plane
                     float x2 = getX(idx, idy, tSpace->unitNormal, tSpace->topOrigin); // upper plane
                     // keep both intersections inside the volume (the X axis has its own limit)
                     x1 = clamp(x1, 0, cMaxVolumeIndexX);
                     x2 = clamp(x2, 0, cMaxVolumeIndexX);
                     // the planes may come in either order, hence min/max
                     int lower = floorf(fminf(x1, x2));
                     int upper = ceilf(fmaxf(x1, x2));
                     for (int x = lower; x <= upper; x++) {
                         processVoxelBlob<hasCTF, blobOrder, useFastKaiser>(tempVolumeGPU, tempWeightsGPU, x, idx, idy, xSize, ySize , CTF, modulator, FFT, tSpace, devBlobTableSqrt, imgCacheDim);
                     }
                 }
             }
         }
     }
 }

◆ processVoxel()

template<bool hasCTF>

__device__ void processVoxel	(	float2 *	tempVolumeGPU,
		float *	tempWeightsGPU,
		int	x,
		int	y,
		int	z,
		int	xSize,
		int	ySize,
		const float *__restrict__	CTF,
		const float *__restrict__	modulator,
		const float2 *__restrict__	FFT,
		const RecFourierProjectionTraverseSpace *const	space
	)

Method will map one voxel from the temporary spaces to the given projection and update the temporary spaces using the pixel value of the projection.

Definition at line 457 of file cuda_gpu_reconstruct_fourier.cpp.

 {
     Point3D<float> imgPos;
     float wBlob = 1.f;
     float wCTF = 1.f;
     float wModulator = 1.f;
 
     float dataWeight = space->weight;
 
     // transform current point to center
     imgPos.x = x - cMaxVolumeIndexX/2;
     imgPos.y = y - cMaxVolumeIndexYZ/2;
     imgPos.z = z - cMaxVolumeIndexYZ/2;
     if (imgPos.x*imgPos.x + imgPos.y*imgPos.y + imgPos.z*imgPos.z > space->maxDistanceSqr) {
         return; // discard iterations that would access pixel with too high frequency
     }
     // rotate around center
     multiply(space->transformInv, imgPos);
     if (imgPos.x < 0.f) return; // reading outside of the image boundary. Z is always correct and Y is checked by the condition above
 
     // transform back and round
     // just Y coordinate needs adjusting, since X now matches to picture and Z is irrelevant
     int imgX = clamp((int)(imgPos.x + 0.5f), 0, xSize - 1);
     int imgY = clamp((int)(imgPos.y + 0.5f + cMaxVolumeIndexYZ / 2), 0, ySize - 1);
 
     int index3D = z * (cMaxVolumeIndexYZ+1) * (cMaxVolumeIndexX+1) + y * (cMaxVolumeIndexX+1) + x;
     int index2D = imgY * xSize + imgX;
 
     if (hasCTF) {
         wCTF = CTF[index2D];
         wModulator = modulator[index2D];
     }
 
     float weight = wBlob * wModulator * dataWeight;
 
      // use atomic as two blocks can write to same voxel
     atomicAdd(&tempVolumeGPU[index3D].x, FFT[index2D].x * weight * wCTF);
     atomicAdd(&tempVolumeGPU[index3D].y, FFT[index2D].y * weight * wCTF);
     atomicAdd(&tempWeightsGPU[index3D], weight);
 }

◆ processVoxelBlob()

template<bool hasCTF, int blobOrder, bool useFastKaiser>

__device__ void processVoxelBlob	(	float2 *	tempVolumeGPU,
		float *	tempWeightsGPU,
		int	x,
		int	y,
		int	z,
		int	xSize,
		int	ySize,
		const float *__restrict__	CTF,
		const float *__restrict__	modulator,
		const float2 *__restrict__	FFT,
		const RecFourierProjectionTraverseSpace *const	space,
		const float *	blobTableSqrt,
		int	imgCacheDim
	)

Method will map one voxel from the temporary spaces to the given projection and update the temporary spaces using the pixel values of the projection within the blob distance.

Definition at line 512 of file cuda_gpu_reconstruct_fourier.cpp.

 {
     Point3D<float> imgPos;
     // transform current point to center
     imgPos.x = x - cMaxVolumeIndexX/2;
     imgPos.y = y - cMaxVolumeIndexYZ/2;
     imgPos.z = z - cMaxVolumeIndexYZ/2;
     if ((imgPos.x*imgPos.x + imgPos.y*imgPos.y + imgPos.z*imgPos.z) > space->maxDistanceSqr) {
         return; // discard iterations that would access pixel with too high frequency
     }
     // rotate around center
     multiply(space->transformInv, imgPos);
     if (imgPos.x < -cBlobRadius) return; // reading outside of the image boundary. Z is always correct and Y is checked by the condition above
     // transform back just Y coordinate, since X now matches to picture and Z is irrelevant
     imgPos.y += cMaxVolumeIndexYZ / 2;
 
     // check that we don't want to collect data from far far away ...
     float radiusSqr = cBlobRadius * cBlobRadius;
     float zSqr = imgPos.z * imgPos.z;
     if (zSqr > radiusSqr) return;
 
     // create blob bounding box
     int minX = ceilf(imgPos.x - cBlobRadius);
     int maxX = floorf(imgPos.x + cBlobRadius);
     int minY = ceilf(imgPos.y - cBlobRadius);
     int maxY = floorf(imgPos.y + cBlobRadius);
     minX = fmaxf(minX, 0);
     minY = fmaxf(minY, 0);
     maxX = fminf(maxX, xSize-1);
     maxY = fminf(maxY, ySize-1);
 
     int index3D = z * (cMaxVolumeIndexYZ+1) * (cMaxVolumeIndexX+1) + y * (cMaxVolumeIndexX+1) + x;
     float2 vol;
     float w;
     vol.x = vol.y = w = 0.f;
 #if !SHARED_IMG
 #endif
     float dataWeight = space->weight;
 
     // ugly spaghetti code, but improves performance by app. 10%
     if (hasCTF) {
         // check which pixel in the vicinity should contribute
         for (int i = minY; i <= maxY; i++) {
             float ySqr = (imgPos.y - i) * (imgPos.y - i);
             float yzSqr = ySqr + zSqr;
             if (yzSqr > radiusSqr) continue;
             for (int j = minX; j <= maxX; j++) {
                 float xD = imgPos.x - j;
                 float distanceSqr = xD*xD + yzSqr;
                 if (distanceSqr > radiusSqr) continue;
 
 #if SHARED_IMG
                 int index2D = (i - SHARED_AABB[0].y) * imgCacheDim + (j-SHARED_AABB[0].x); // position in img - offset of the AABB
 #else
                 int index2D = i * xSize + j;
 #endif
 
                 float wCTF = CTF[index2D];
                 float wModulator = modulator[index2D];
 #if PRECOMPUTE_BLOB_VAL
                 int aux = (int) ((distanceSqr * cIDeltaSqrt + 0.5f));
     #if SHARED_BLOB_TABLE
                 float wBlob = BLOB_TABLE[aux];
     #else
                 float wBlob = blobTableSqrt[aux];
     #endif
 #else
                 float wBlob;
                 if (useFastKaiser) {
                     wBlob = kaiserValueFast(distanceSqr);
                 }
                 else {
                     wBlob = kaiserValue<blobOrder>(sqrtf(distanceSqr),cBlobRadius) * cIw0;
                 }
 #endif
                 float weight = wBlob * wModulator * dataWeight;
                 w += weight;
 #if SHARED_IMG
                 vol += IMG[index2D] * weight * wCTF;
 #else
                 vol += FFT[index2D] * weight * wCTF;
 #endif
             }
         }
     } else {
         // check which pixel in the vicinity should contribute
         for (int i = minY; i <= maxY; i++) {
             float ySqr = (imgPos.y - i) * (imgPos.y - i);
             float yzSqr = ySqr + zSqr;
             if (yzSqr > radiusSqr) continue;
             for (int j = minX; j <= maxX; j++) {
                 float xD = imgPos.x - j;
                 float distanceSqr = xD*xD + yzSqr;
                 if (distanceSqr > radiusSqr) continue;
 
 #if SHARED_IMG
                 int index2D = (i - SHARED_AABB[0].y) * imgCacheDim + (j-SHARED_AABB[0].x); // position in img - offset of the AABB
 #else
                 int index2D = i * xSize + j;
 #endif
 
 #if PRECOMPUTE_BLOB_VAL
                 int aux = (int) ((distanceSqr * cIDeltaSqrt + 0.5f));
 #if SHARED_BLOB_TABLE
                 float wBlob = BLOB_TABLE[aux];
 #else
                 float wBlob = blobTableSqrt[aux];
 #endif
 #else
                 float wBlob;
                 if (useFastKaiser) {
                     wBlob = kaiserValueFast(distanceSqr);
                 }
                 else {
                     wBlob = kaiserValue<blobOrder>(sqrtf(distanceSqr),cBlobRadius) * cIw0;
                 }
 #endif
                 float weight = wBlob * dataWeight;
                 w += weight;
 #if SHARED_IMG
                 vol += IMG[index2D] * weight;
 #else
                 vol += FFT[index2D] * weight;
 #endif
             }
         }
     }
     // use atomic as two blocks can write to same voxel
     atomicAdd(&tempVolumeGPU[index3D].x, vol.x);
     atomicAdd(&tempVolumeGPU[index3D].y, vol.y);
     atomicAdd(&tempWeightsGPU[index3D], w);
 }

◆ rotate()

__device__ void rotate	(	Point3D< float > *	box,
		const float	transform[3][3]
	)

Method will rotate the box using the transformation matrix around the center of the working space.

Definition at line 751 of file cuda_gpu_reconstruct_fourier.cpp.

                                                               {
     for (int i = 0; i < 8; i++) {
         Point3D<float> imgPos;
         // transform current point to center
         imgPos.x = box[i].x - cMaxVolumeIndexX/2;
         imgPos.y = box[i].y - cMaxVolumeIndexYZ/2;
         imgPos.z = box[i].z - cMaxVolumeIndexYZ/2;
         // rotate around center
         multiply(transform, imgPos);
         // transform back just Y coordinate, since X now matches to picture and Z is irrelevant
         imgPos.y += cMaxVolumeIndexYZ / 2;
 
         box[i] = imgPos;
     }
 }