Xmipp  v3.23.11-Nereus
Classes | Functions | Variables
cuda_gpu_reconstruct_fourier.cpp File Reference
#include <cuda_runtime_api.h>
#include "reconstruction_cuda/cuda_asserts.h"
#include "cuda_gpu_reconstruct_fourier.h"
#include "reconstruction_cuda/cuda_basic_math.h"
#include "gpu.h"
Include dependency graph for cuda_gpu_reconstruct_fourier.cpp:

Go to the source code of this file.

Classes

struct  RecFourierBufferDataGPU
 

Functions

__device__ float bessi0Fast (float x)
 
__device__ float bessi0 (float x)
 
__device__ float bessi1 (float x)
 
__device__ float bessi2 (float x)
 
__device__ float bessi3 (float x)
 
__device__ float bessi4 (float x)
 
template<int order>
__device__ float kaiserValue (float r, float a)
 
__device__ float kaiserValueFast (float distSqr)
 
float * allocateTempVolumeGPU (float *&ptr, int size, int typeSize)
 
void copyTempVolumes (std::complex< float > ***tempVol, float ***tempWeights, float *tempVolGPU, float *tempWeightsGPU, int size)
 
void releaseTempVolumeGPU (float *&ptr)
 
__device__ float FFT_IDX2DIGFREQ (int idx, int size)
 
__device__ float getZ (float x, float y, const Point3D< float > &n, const Point3D< float > &p0)
 
__device__ float getY (float x, float z, const Point3D< float > &n, const Point3D< float > &p0)
 
__device__ float getX (float y, float z, const Point3D< float > &n, const Point3D< float > &p0)
 
__device__ void multiply (const float transform[3][3], Point3D< float > &inOut)
 
__device__ void computeAABB (Point3D< float > *AABB, Point3D< float > *cuboid)
 
template<bool hasCTF>
__device__ void processVoxel (float2 *tempVolumeGPU, float *tempWeightsGPU, int x, int y, int z, int xSize, int ySize, const float *__restrict__ CTF, const float *__restrict__ modulator, const float2 *__restrict__ FFT, const RecFourierProjectionTraverseSpace *const space)
 
template<bool hasCTF, int blobOrder, bool useFastKaiser>
__device__ void processVoxelBlob (float2 *tempVolumeGPU, float *tempWeightsGPU, int x, int y, int z, int xSize, int ySize, const float *__restrict__ CTF, const float *__restrict__ modulator, const float2 *__restrict__ FFT, const RecFourierProjectionTraverseSpace *const space, const float *blobTableSqrt, int imgCacheDim)
 
template<bool useFast, bool hasCTF, int blobOrder, bool useFastKaiser>
__device__ void processProjection (float2 *tempVolumeGPU, float *tempWeightsGPU, int xSize, int ySize, const float *__restrict__ CTF, const float *__restrict__ modulator, const float2 *__restrict__ FFT, const RecFourierProjectionTraverseSpace *const tSpace, const float *devBlobTableSqrt, int imgCacheDim)
 
__device__ void rotate (Point3D< float > *box, const float transform[3][3])
 
__device__ void calculateAABB (const RecFourierProjectionTraverseSpace *tSpace, const RecFourierBufferDataGPU *buffer, Point3D< float > *dest)
 
__device__ bool isWithin (Point3D< float > *AABB, int imgXSize, int imgYSize)
 
__device__ void getImgData (Point3D< float > *AABB, int tXindex, int tYindex, RecFourierBufferDataGPU *const buffer, int imgIndex, float &vReal, float &vImag)
 
__device__ void copyImgToCache (float2 *dest, Point3D< float > *AABB, RecFourierBufferDataGPU *const buffer, int imgIndex, int imgCacheDim)
 
template<bool useFast, bool hasCTF, int blobOrder, bool useFastKaiser>
__global__ void processBufferKernel (float *tempVolumeGPU, float *tempWeightsGPU, RecFourierBufferDataGPU *buffer, float *devBlobTableSqrt, int imgCacheDim)
 
__global__ void convertImagesKernel (std::complex< float > *iFouriers, int iSizeX, int iSizeY, int iLength, RecFourierBufferDataGPU *oBuffer, float maxResolutionSqr)
 
void convertImages (FRecBufferDataGPUWrapper *wrapper, float maxResolutionSqr, int streamIndex)
 
void waitForGPU ()
 
void createStreams (int count)
 
void deleteStreams (int count)
 
void pinMemory (RecFourierBufferData *buffer)
 
void unpinMemory (RecFourierBufferData *buffer)
 
void allocateWrapper (RecFourierBufferData *buffer, int streamIndex)
 
void copyBlobTable (float *blobTableSqrt, int blobTableSize)
 
void releaseBlobTable ()
 
void releaseWrapper (int streamIndex)
 
void copyConstants (int maxVolIndexX, int maxVolIndexYZ, float blobRadius, float blobAlpha, float iDeltaSqrt, float iw0, float oneOverBessiOrderAlpha)
 
template<int blobOrder, bool useFastKaiser>
void processBufferGPU_ (float *tempVolumeGPU, float *tempWeightsGPU, RecFourierBufferData *buffer, float blobRadius, int maxVolIndexYZ, bool useFast, float maxResolutionSqr, int streamIndex)
 
void processBufferGPU (float *tempVolumeGPU, float *tempWeightsGPU, RecFourierBufferData *buffer, float blobRadius, int maxVolIndexYZ, bool useFast, float maxResolutionSqr, int streamIndex, int blobOrder, float blobAlpha)
 

Variables

cudaStream_t * streams
 
FRecBufferDataGPUWrapper ** wrappers
 
float * devBlobTableSqrt = NULL
 
__device__ __constant__ int cMaxVolumeIndexX = 0
 
__device__ __constant__ int cMaxVolumeIndexYZ = 0
 
__device__ __constant__ float cBlobRadius = 0.f
 
__device__ __constant__ float cOneOverBlobRadiusSqr = 0.f
 
__device__ __constant__ float cBlobAlpha = 0.f
 
__device__ __constant__ float cIw0 = 0.f
 
__device__ __constant__ float cIDeltaSqrt = 0.f
 
__device__ __constant__ float cOneOverBessiOrderAlpha = 0.f
 

Function Documentation

◆ bessi0()

__device__ float bessi0 ( float  x)

Definition at line 86 of file cuda_gpu_reconstruct_fourier.cpp.

87 {
88  float y, ax, ans;
89  if ((ax = fabsf(x)) < 3.75f)
90  {
91  y = x / 3.75f;
92  y *= y;
93  ans = 1.f + y * (3.5156229f + y * (3.0899424f + y * (1.2067492f
94  + y * (0.2659732f + y * (0.360768e-1f + y * 0.45813e-2f)))));
95  }
96  else
97  {
98  y = 3.75f / ax;
99  ans = (expf(ax) * rsqrtf(ax)) * (0.39894228f + y * (0.1328592e-1f
100  + y * (0.225319e-2f + y * (-0.157565e-2f + y * (0.916281e-2f
101  + y * (-0.2057706e-1f + y * (0.2635537e-1f + y * (-0.1647633e-1f
102  + y * 0.392377e-2f))))))));
103  }
104  return ans;
105 }
static double * y
doublereal * x
double * f

◆ bessi0Fast()

__device__ float bessi0Fast ( float  x)

Definition at line 62 of file cuda_gpu_reconstruct_fourier.cpp.

62  { // X must be <= 15
63  // stable rational minimax approximations to the modified bessel functions, blair, edwards
64  // from table 5
65  float x2 = x*x;
66  float num = -0.8436825781374849e-19f; // p11
67  num = fmaf(num, x2, -0.93466495199548700e-17f); // p10
68  num = fmaf(num, x2, -0.15716375332511895e-13f); // p09
69  num = fmaf(num, x2, -0.42520971595532318e-11f); // p08
70  num = fmaf(num, x2, -0.13704363824102120e-8f); // p07
71  num = fmaf(num, x2, -0.28508770483148419e-6f); // p06
72  num = fmaf(num, x2, -0.44322160233346062e-4f); // p05
73  num = fmaf(num, x2, -0.46703811755736946e-2f); // p04
74  num = fmaf(num, x2, -0.31112484643702141e-0f); // p03
75  num = fmaf(num, x2, -0.11512633616429962e+2f); // p02
76  num = fmaf(num, x2, -0.18720283332732112e+3f); // p01
77  num = fmaf(num, x2, -0.75281108169006924e+3f); // p00
78 
79  float den = 1.f; // q01
80  den = fmaf(den, x2, -0.75281109410939403e+3f); // q00
81 
82  return num/den;
83 }
doublereal * x
double * f

◆ bessi1()

__device__ float bessi1 ( float  x)

Definition at line 109 of file cuda_gpu_reconstruct_fourier.cpp.

110 {
111  float ax, ans;
112  float y;
113  if ((ax = fabsf(x)) < 3.75f)
114  {
115  y = x / 3.75f;
116  y *= y;
117  ans = ax * (0.5f + y * (0.87890594f + y * (0.51498869f + y * (0.15084934f
118  + y * (0.2658733e-1f + y * (0.301532e-2f + y * 0.32411e-3f))))));
119  }
120  else
121  {
122  y = 3.75f / ax;
123  ans = 0.2282967e-1f + y * (-0.2895312e-1f + y * (0.1787654e-1f
124  - y * 0.420059e-2f));
125  ans = 0.39894228f + y * (-0.3988024e-1f + y * (-0.362018e-2f
126  + y * (0.163801e-2f + y * (-0.1031555e-1f + y * ans))));
127  ans *= (expf(ax) * rsqrtf(ax));
128  }
129  return x < 0.0 ? -ans : ans;
130 }
static double * y
doublereal * x
double * f

◆ bessi2()

__device__ float bessi2 ( float  x)

Definition at line 133 of file cuda_gpu_reconstruct_fourier.cpp.

134 {
135  return (x == 0) ? 0 : bessi0(x) - ((2*1) / x) * bessi1(x);
136 }
__device__ float bessi1(float x)
doublereal * x
__device__ float bessi0(float x)

◆ bessi3()

__device__ float bessi3 ( float  x)

Definition at line 139 of file cuda_gpu_reconstruct_fourier.cpp.

140 {
141  return (x == 0) ? 0 : bessi1(x) - ((2*2) / x) * bessi2(x);
142 }
__device__ float bessi1(float x)
doublereal * x
__device__ float bessi2(float x)

◆ bessi4()

__device__ float bessi4 ( float  x)

Definition at line 145 of file cuda_gpu_reconstruct_fourier.cpp.

146 {
147  return (x == 0) ? 0 : bessi2(x) - ((2*3) / x) * bessi3(x);
148 }
doublereal * x
__device__ float bessi3(float x)
__device__ float bessi2(float x)

◆ calculateAABB()

__device__ void calculateAABB ( const RecFourierProjectionTraverseSpace tSpace,
const RecFourierBufferDataGPU buffer,
Point3D< float > *  dest 
)

Method calculates an Axis Aligned Bounding Box in the image space. AABB is guaranteed to be big enough that all threads in the block, while processing the traverse space, will not read image data outside of the AABB

Definition at line 774 of file cuda_gpu_reconstruct_fourier.cpp.

774  {
775  Point3D<float> box[8];
776  // calculate AABB for the whole working block
777  if (tSpace->XY == tSpace->dir) { // iterate XY plane
778  box[0].x = box[3].x = box[4].x = box[7].x = blockIdx.x*blockDim.x - cBlobRadius;
779  box[1].x = box[2].x = box[5].x = box[6].x = (blockIdx.x+1)*blockDim.x + cBlobRadius - 1.f;
780 
781  box[2].y = box[3].y = box[6].y = box[7].y = (blockIdx.y+1)*blockDim.y + cBlobRadius - 1.f;
782  box[0].y = box[1].y = box[4].y = box[5].y = blockIdx.y*blockDim.y- cBlobRadius;
783 
784  box[0].z = getZ(box[0].x, box[0].y, tSpace->unitNormal, tSpace->bottomOrigin);
785  box[4].z = getZ(box[4].x, box[4].y, tSpace->unitNormal, tSpace->topOrigin);
786 
787  box[3].z = getZ(box[3].x, box[3].y, tSpace->unitNormal, tSpace->bottomOrigin);
788  box[7].z = getZ(box[7].x, box[7].y, tSpace->unitNormal, tSpace->topOrigin);
789 
790  box[2].z = getZ(box[2].x, box[2].y, tSpace->unitNormal, tSpace->bottomOrigin);
791  box[6].z = getZ(box[6].x, box[6].y, tSpace->unitNormal, tSpace->topOrigin);
792 
793  box[1].z = getZ(box[1].x, box[1].y, tSpace->unitNormal, tSpace->bottomOrigin);
794  box[5].z = getZ(box[5].x, box[5].y, tSpace->unitNormal, tSpace->topOrigin);
795  } else if (tSpace->XZ == tSpace->dir) { // iterate XZ plane
796  box[0].x = box[3].x = box[4].x = box[7].x = blockIdx.x*blockDim.x - cBlobRadius;
797  box[1].x = box[2].x = box[5].x = box[6].x = (blockIdx.x+1)*blockDim.x + cBlobRadius - 1.f;
798 
799  box[2].z = box[3].z = box[6].z = box[7].z = (blockIdx.y+1)*blockDim.y + cBlobRadius - 1.f;
800  box[0].z = box[1].z = box[4].z = box[5].z = blockIdx.y*blockDim.y- cBlobRadius;
801 
802  box[0].y = getY(box[0].x, box[0].z, tSpace->unitNormal, tSpace->bottomOrigin);
803  box[4].y = getY(box[4].x, box[4].z, tSpace->unitNormal, tSpace->topOrigin);
804 
805  box[3].y = getY(box[3].x, box[3].z, tSpace->unitNormal, tSpace->bottomOrigin);
806  box[7].y = getY(box[7].x, box[7].z, tSpace->unitNormal, tSpace->topOrigin);
807 
808  box[2].y = getY(box[2].x, box[2].z, tSpace->unitNormal, tSpace->bottomOrigin);
809  box[6].y = getY(box[6].x, box[6].z, tSpace->unitNormal, tSpace->topOrigin);
810 
811  box[1].y = getY(box[1].x, box[1].z, tSpace->unitNormal, tSpace->bottomOrigin);
812  box[5].y = getY(box[5].x, box[5].z, tSpace->unitNormal, tSpace->topOrigin);
813  } else { // iterate YZ plane
814  box[0].y = box[3].y = box[4].y = box[7].y = blockIdx.x*blockDim.x - cBlobRadius;
815  box[1].y = box[2].y = box[5].y = box[6].y = (blockIdx.x+1)*blockDim.x + cBlobRadius - 1.f;
816 
817  box[2].z = box[3].z = box[6].z = box[7].z = (blockIdx.y+1)*blockDim.y + cBlobRadius - 1.f;
818  box[0].z = box[1].z = box[4].z = box[5].z = blockIdx.y*blockDim.y- cBlobRadius;
819 
820  box[0].x = getX(box[0].y, box[0].z, tSpace->unitNormal, tSpace->bottomOrigin);
821  box[4].x = getX(box[4].y, box[4].z, tSpace->unitNormal, tSpace->topOrigin);
822 
823  box[3].x = getX(box[3].y, box[3].z, tSpace->unitNormal, tSpace->bottomOrigin);
824  box[7].x = getX(box[7].y, box[7].z, tSpace->unitNormal, tSpace->topOrigin);
825 
826  box[2].x = getX(box[2].y, box[2].z, tSpace->unitNormal, tSpace->bottomOrigin);
827  box[6].x = getX(box[6].y, box[6].z, tSpace->unitNormal, tSpace->topOrigin);
828 
829  box[1].x = getX(box[1].y, box[1].z, tSpace->unitNormal, tSpace->bottomOrigin);
830  box[5].x = getX(box[5].y, box[5].z, tSpace->unitNormal, tSpace->topOrigin);
831  }
832  // transform AABB to the image domain
833  rotate(box, tSpace->transformInv);
834  // AABB is projected on image. Create new AABB that will encompass all vertices
835  computeAABB(dest, box);
836 }
__device__ __constant__ float cBlobRadius
enum RecFourierProjectionTraverseSpace::Direction dir
__device__ float getX(float y, float z, const Point3D< float > &n, const Point3D< float > &p0)
__device__ void computeAABB(Point3D< float > *AABB, Point3D< float > *cuboid)
static double * y
doublereal * x
__device__ void rotate(Point3D< float > *box, const float transform[3][3])
T z
Definition: point3D.h:56
T x
Definition: point3D.h:54
double z
__device__ float getZ(float x, float y, const Point3D< float > &n, const Point3D< float > &p0)
__device__ float getY(float x, float z, const Point3D< float > &n, const Point3D< float > &p0)
T y
Definition: point3D.h:55

◆ computeAABB()

__device__ void computeAABB ( Point3D< float > *  AABB,
Point3D< float > *  cuboid 
)

Compute Axis Aligned Bounding Box of given cuboid

Definition at line 428 of file cuda_gpu_reconstruct_fourier.cpp.

428  {
429  AABB[0].x = AABB[0].y = AABB[0].z = INFINITY;
430  AABB[1].x = AABB[1].y = AABB[1].z = -INFINITY;
431  Point3D<float> tmp;
432  for (int i = 0; i < 8; i++) {
433  tmp = cuboid[i];
434  if (AABB[0].x > tmp.x) AABB[0].x = tmp.x;
435  if (AABB[0].y > tmp.y) AABB[0].y = tmp.y;
436  if (AABB[0].z > tmp.z) AABB[0].z = tmp.z;
437  if (AABB[1].x < tmp.x) AABB[1].x = tmp.x;
438  if (AABB[1].y < tmp.y) AABB[1].y = tmp.y;
439  if (AABB[1].z < tmp.z) AABB[1].z = tmp.z;
440  }
441  AABB[0].x = ceilf(AABB[0].x);
442  AABB[0].y = ceilf(AABB[0].y);
443  AABB[0].z = ceilf(AABB[0].z);
444 
445  AABB[1].x = floorf(AABB[1].x);
446  AABB[1].y = floorf(AABB[1].y);
447  AABB[1].z = floorf(AABB[1].z);
448 }
static double * y
doublereal * x
#define i
T z
Definition: point3D.h:56
T x
Definition: point3D.h:54
double z
T y
Definition: point3D.h:55

◆ convertImages()

void convertImages ( FRecBufferDataGPUWrapper wrapper,
float  maxResolutionSqr,
int  streamIndex 
)

Method takes padded input pictures, performs FFT and convert resulting images as necessary for the algorithm. Asynchronous method.

Definition at line 1008 of file cuda_gpu_reconstruct_fourier.cpp.

1011  {
1012 
1013  cudaStream_t stream = streams[streamIndex];
1014 
1015  RecFourierBufferDataGPU* hostBuffer = wrapper->cpuCopy;
1016  // store to proper structure
1017  GpuMultidimArrayAtGpu<float> imagesGPU(
1018  hostBuffer->paddedImgSize, hostBuffer->paddedImgSize, 1, hostBuffer->noOfImages, hostBuffer->paddedImages);
1019  // perform FFT
1021  mycufftHandle myhandle;
1022  imagesGPU.fft(resultingFFT, myhandle);
1023  myhandle.clear(); // release unnecessary memory
1024  imagesGPU.d_data = NULL; // unbind the data
1025 
1026  // now we have performed FFTs of the input images
1027  // buffers have to be updated accordingly
1028  hostBuffer->hasFFTs = true;
1029  cudaMemsetAsync(hostBuffer->FFTs, 0.f, hostBuffer->getFFTsByteSize(), stream); // clear it, as kernel writes only to some parts
1030  wrapper->copyToDevice(streamIndex);
1031  gpuErrchk( cudaPeekAtLastError() );
1032 
1033  // run kernel, one thread for each pixel of input FFT
1034  dim3 dimBlock(BLOCK_DIM, BLOCK_DIM);
1035  dim3 dimGrid(ceil(resultingFFT.Xdim/(float)dimBlock.x), ceil(resultingFFT.Ydim/(float)dimBlock.y));
1036  convertImagesKernel<<<dimGrid, dimBlock, 0, stream>>>(
1037  resultingFFT.d_data, resultingFFT.Xdim, resultingFFT.Ydim, resultingFFT.Ndim,
1038  wrapper->gpuCopy, maxResolutionSqr);
1039  // now we have converted input images to FFTs in the required format
1040 }
#define gpuErrchk(code)
Definition: cuda_asserts.h:31
constexpr int BLOCK_DIM
RecFourierBufferDataGPU * cpuCopy
cudaStream_t * streams
RecFourierBufferDataGPU * gpuCopy

◆ convertImagesKernel()

__global__ void convertImagesKernel ( std::complex< float > *  iFouriers,
int  iSizeX,
int  iSizeY,
int  iLength,
RecFourierBufferDataGPU oBuffer,
float  maxResolutionSqr 
)

Method will process the 'paddedFourier' (not shifted, i.e. low frequencies are in corners) in the following way: high frequencies are skipped (replaced by zero (0)) space is shifted, so that low frequencies are in the middle of the Y axis resulting space is cropped. Method returns a 2D array with Fourier coefficients, shifted so that low frequencies are in the center of the Y axis (i.e. semicircle)

Definition at line 963 of file cuda_gpu_reconstruct_fourier.cpp.

964  {
965  // assign pixel to thread
966  volatile int idx = blockIdx.x*blockDim.x + threadIdx.x;
967  volatile int idy = blockIdx.y*blockDim.y + threadIdx.y;
968 
969  int halfY = iSizeY / 2;
970  float normFactor = iSizeY*iSizeY;
971  int oSizeX = oBuffer->fftSizeX;
972 
973  // input is an image in Fourier space (not normalized)
974  // with low frequencies in the inner corners
975  for (int n = 0; n < iLength; n++) {
976  float2 freq;
977  if ((idy < iSizeY) // for all input lines
978  && (idx < oSizeX)) { // for all output pixels in the line
979  // process line only if it can hold sufficiently high frequency, i.e. process only
980  // first and last N lines
981  if (idy < oSizeX || idy >= (iSizeY - oSizeX)) {
982  // check the frequency
983  freq.x = FFT_IDX2DIGFREQ(idx, iSizeY);
984  freq.y = FFT_IDX2DIGFREQ(idy, iSizeY);
985  if ((freq.x * freq.x + freq.y * freq.y) > maxResolutionSqr) {
986  continue;
987  }
988  // do the shift (lower line will move up, upper down)
989  int newY = (idy < halfY) ? (idy + oSizeX) : (idy - iSizeY + oSizeX);
990  int oIndex = newY*oSizeX + idx;
991 
992  int iIndex = n*iSizeY*iSizeX + idy*iSizeX + idx;
993  float* iValue = (float*)&(iFouriers[iIndex]);
994 
995  // copy data and perform normalization
996  oBuffer->getNthItem(oBuffer->FFTs, n)[2*oIndex] = iValue[0] / normFactor;
997  oBuffer->getNthItem(oBuffer->FFTs, n)[2*oIndex + 1] = iValue[1] / normFactor;
998  }
999  }
1000  }
1001 }
__device__ float FFT_IDX2DIGFREQ(int idx, int size)
__device__ float * getNthItem(float *array, int itemIndex)
int * n

◆ copyImgToCache()

__device__ void copyImgToCache ( float2 *  dest,
Point3D< float > *  AABB,
RecFourierBufferDataGPU *const  buffer,
int  imgIndex,
int  imgCacheDim 
)

Method will copy imgIndex(th) data from buffer to given destination (shared memory). Only data within AABB will be copied. Destination is expected to be continuous array of sufficient size (imgCacheDim^2)

Definition at line 883 of file cuda_gpu_reconstruct_fourier.cpp.

885  {
886  for (int y = threadIdx.y; y < imgCacheDim; y += blockDim.y) {
887  for (int x = threadIdx.x; x < imgCacheDim; x += blockDim.x) {
888  int memIndex = y * imgCacheDim + x;
889  getImgData(AABB, x, y, buffer, imgIndex, dest[memIndex].x, dest[memIndex].y);
890  }
891  }
892 }
static double * y
__device__ void getImgData(Point3D< float > *AABB, int tXindex, int tYindex, RecFourierBufferDataGPU *const buffer, int imgIndex, float &vReal, float &vImag)
doublereal * x

◆ FFT_IDX2DIGFREQ()

__device__ float FFT_IDX2DIGFREQ ( int  idx,
int  size 
)

Index to frequency

Given an index and a size of the FFT, this function returns the corresponding digital frequency (-1/2 to 1/2)

Definition at line 382 of file cuda_gpu_reconstruct_fourier.cpp.

382  {
383  if (size <= 1) return 0;
384  return ((idx <= (size / 2)) ? idx : (-size + idx)) / (float)size;
385 }

◆ getImgData()

__device__ void getImgData ( Point3D< float > *  AABB,
int  tXindex,
int  tYindex,
RecFourierBufferDataGPU *const  buffer,
int  imgIndex,
float &  vReal,
float &  vImag 
)

Method will load data from image at position tXindex, tYindex and return them. In case the data lies outside of the image boundaries, zeros (0,0) are returned

Definition at line 856 of file cuda_gpu_reconstruct_fourier.cpp.

859  {
860  int imgXindex = tXindex + AABB[0].x;
861  int imgYindex = tYindex + AABB[0].y;
862  if ((imgXindex >=0)
863  && (imgXindex < buffer->fftSizeX)
864  && (imgYindex >=0)
865  && (imgYindex < buffer->fftSizeY)) {
866  int index = imgYindex * buffer->fftSizeX + imgXindex; // copy data from image
867  vReal = buffer->getNthItem(buffer->FFTs, imgIndex)[2*index];
868  vImag = buffer->getNthItem(buffer->FFTs, imgIndex)[2*index + 1];
869 
870  } else {
871  vReal = vImag = 0.f; // out of image bound, so return zero
872  }
873 }
viol index
T x
Definition: point3D.h:54
__device__ float * getNthItem(float *array, int itemIndex)
T y
Definition: point3D.h:55

◆ getX()

__device__ float getX ( float  y,
float  z,
const Point3D< float > &  n,
const Point3D< float > &  p0 
)

Calculates X coordinate of the point [y, z] on the plane defined by p0 (origin) and normal

Definition at line 410 of file cuda_gpu_reconstruct_fourier.cpp.

410  {
411  // from a(x-x0)+b(y-y0)+c(z-z0)=0
412  return (-n.y*(y-p0.y)-n.z*(z-p0.z))/n.x + p0.x;
413 }
static double * y
T z
Definition: point3D.h:56
T x
Definition: point3D.h:54
double z
T y
Definition: point3D.h:55

◆ getY()

__device__ float getY ( float  x,
float  z,
const Point3D< float > &  n,
const Point3D< float > &  p0 
)

Calculates Y coordinate of the point [x, z] on the plane defined by p0 (origin) and normal

Definition at line 400 of file cuda_gpu_reconstruct_fourier.cpp.

400  {
401  // from a(x-x0)+b(y-y0)+c(z-z0)=0
402  return (-n.x*(x-p0.x)-n.z*(z-p0.z))/n.y + p0.y;
403 }
doublereal * x
T z
Definition: point3D.h:56
T x
Definition: point3D.h:54
double z
T y
Definition: point3D.h:55

◆ getZ()

__device__ float getZ ( float  x,
float  y,
const Point3D< float > &  n,
const Point3D< float > &  p0 
)

Calculates Z coordinate of the point [x, y] on the plane defined by p0 (origin) and normal

Definition at line 391 of file cuda_gpu_reconstruct_fourier.cpp.

391  {
392  // from a(x-x0)+b(y-y0)+c(z-z0)=0
393  return (-n.x*(x-p0.x)-n.y*(y-p0.y))/n.z + p0.z;
394 }
static double * y
doublereal * x
T z
Definition: point3D.h:56
T x
Definition: point3D.h:54
T y
Definition: point3D.h:55

◆ isWithin()

__device__ bool isWithin ( Point3D< float > *  AABB,
int  imgXSize,
int  imgYSize 
)

Method returns true if AABB lies within the image boundaries

Definition at line 842 of file cuda_gpu_reconstruct_fourier.cpp.

842  {
843  return (AABB[0].x < imgXSize)
844  && (AABB[1].x >= 0)
845  && (AABB[0].y < imgYSize)
846  && (AABB[1].y >= 0);
847 }
static double * y
doublereal * x
T x
Definition: point3D.h:54
T y
Definition: point3D.h:55

◆ kaiserValue()

template<int order>
__device__ float kaiserValue ( float  r,
float  a 
)

Definition at line 153 of file cuda_gpu_reconstruct_fourier.cpp.

154 {
155  float rda, rdas, arg, w;
156 
157  rda = r / a;
158  if (rda <= 1.f)
159  {
160  rdas = rda * rda;
161  arg = cBlobAlpha * sqrtf(1.f - rdas);
162  if (order == 0)
163  {
164  w = bessi0(arg) * cOneOverBessiOrderAlpha;
165  }
166  else if (order == 1)
167  {
168  w = sqrtf (1.f - rdas);
169  w *= bessi1(arg) * cOneOverBessiOrderAlpha;
170  }
171  else if (order == 2)
172  {
173  w = sqrtf (1.f - rdas);
174  w = w * w;
175  w *= bessi2(arg) * cOneOverBessiOrderAlpha;
176  }
177  else if (order == 3)
178  {
179  w = sqrtf (1.f - rdas);
180  w = w * w * w;
181  w *= bessi3(arg) * cOneOverBessiOrderAlpha;
182  }
183  else if (order == 4)
184  {
185  w = sqrtf (1.f - rdas);
186  w = w * w * w *w;
187  w *= bessi4(arg) * cOneOverBessiOrderAlpha;
188  }
189  else {
190  printf("order (%d) out of range in kaiser_value(): %s, %d\n", order, __FILE__, __LINE__);
191  }
192  }
193  else
194  w = 0.f;
195 
196  return w;
197 }
__device__ float bessi1(float x)
__device__ float bessi4(float x)
doublereal * w
__device__ float bessi0(float x)
__device__ float bessi3(float x)
double * f
__device__ __constant__ float cBlobAlpha
__device__ float bessi2(float x)
int order
Derivation order and Bessel function order.
Definition: blobs.h:118
__device__ __constant__ float cOneOverBessiOrderAlpha
doublereal * a

◆ kaiserValueFast()

__device__ float kaiserValueFast ( float  distSqr)

Definition at line 200 of file cuda_gpu_reconstruct_fourier.cpp.

200  {
201  float arg = cBlobAlpha * sqrtf(1.f - (distSqr * cOneOverBlobRadiusSqr)); // alpha * sqrt(1-(dist/blobRadius^2))
202  return bessi0Fast(arg) * cOneOverBessiOrderAlpha * cIw0;
203 }
__device__ float bessi0Fast(float x)
__device__ __constant__ float cOneOverBlobRadiusSqr
double * f
__device__ __constant__ float cBlobAlpha
__device__ __constant__ float cIw0
__device__ __constant__ float cOneOverBessiOrderAlpha

◆ multiply()

__device__ void multiply ( const float  transform[3][3],
Point3D< float > &  inOut 
)

Do 3x3 x 1x3 matrix-vector multiplication

Definition at line 417 of file cuda_gpu_reconstruct_fourier.cpp.

417  {
418  float tmp0 = transform[0][0] * inOut.x + transform[0][1] * inOut.y + transform[0][2] * inOut.z;
419  float tmp1 = transform[1][0] * inOut.x + transform[1][1] * inOut.y + transform[1][2] * inOut.z;
420  float tmp2 = transform[2][0] * inOut.x + transform[2][1] * inOut.y + transform[2][2] * inOut.z;
421  inOut.x = tmp0;
422  inOut.y = tmp1;
423  inOut.z = tmp2;
424 }
T z
Definition: point3D.h:56
T x
Definition: point3D.h:54
T y
Definition: point3D.h:55

◆ processBufferGPU_()

template<int blobOrder, bool useFastKaiser>
void processBufferGPU_ ( float *  tempVolumeGPU,
float *  tempWeightsGPU,
RecFourierBufferData buffer,
float  blobRadius,
int  maxVolIndexYZ,
bool  useFast,
float  maxResolutionSqr,
int  streamIndex 
)

Method will use data stored in the buffer and update temporal storages appropriately. Actual calculation is done asynchronously, but 'buffer' can be reused once the method returns.

Definition at line 1136 of file cuda_gpu_reconstruct_fourier.cpp.

1139  {
1140 
1141  cudaStream_t stream = streams[streamIndex];
1142 
1143  // copy all data to gpu
1144  FRecBufferDataGPUWrapper* wrapper = wrappers[streamIndex];
1145  wrapper->copyFrom(buffer, streamIndex);
1146  wrapper->copyToDevice(streamIndex);
1147 
1148  // process input data if necessary
1149  if ( ! wrapper->cpuCopy->hasFFTs) {
1150  convertImages(wrapper, maxResolutionSqr, streamIndex);
1151  }
1152  // now wait till all necessary data are loaded to GPU (so that host can continue in work)
1153  cudaStreamSynchronize(stream);
1154 
1155  // enqueue kernel and return control
1156  int size2D = maxVolIndexYZ + 1;
1157  int imgCacheDim = ceil(sqrt(2.f) * sqrt(3.f) *(BLOCK_DIM + 2*blobRadius));
1158  dim3 dimBlock(BLOCK_DIM, BLOCK_DIM);
1159  dim3 dimGrid(ceil(size2D/(float)dimBlock.x),ceil(size2D/(float)dimBlock.y), GRID_DIM_Z);
1160 
1161  // by using templates, we can save some registers, especially for 'fast' version
1162  if (useFast && buffer->hasCTFs) {
1163  processBufferKernel<true, true, blobOrder,useFastKaiser><<<dimGrid, dimBlock, 0, stream>>>(
1164  tempVolumeGPU, tempWeightsGPU,
1165  wrapper->gpuCopy,
1167  imgCacheDim);
1168  return;
1169  }
1170  if (useFast && !buffer->hasCTFs) {
1171  processBufferKernel<true, false, blobOrder,useFastKaiser><<<dimGrid, dimBlock, 0, stream>>>(
1172  tempVolumeGPU, tempWeightsGPU,
1173  wrapper->gpuCopy,
1175  imgCacheDim);
1176  return;
1177  }
1178  // if making copy of the image in shared memory, allocate enough space
1179  int sharedMemSize = SHARED_IMG ? (imgCacheDim*imgCacheDim*sizeof(float2)) : 0;
1180  if (!useFast && buffer->hasCTFs) {
1181  processBufferKernel<false, true, blobOrder,useFastKaiser><<<dimGrid, dimBlock, sharedMemSize, stream>>>(
1182  tempVolumeGPU, tempWeightsGPU,
1183  wrapper->gpuCopy,
1185  imgCacheDim);
1186  return;
1187  }
1188  if (!useFast && !buffer->hasCTFs) {
1189  processBufferKernel<false, false, blobOrder,useFastKaiser><<<dimGrid, dimBlock, sharedMemSize, stream>>>(
1190  tempVolumeGPU, tempWeightsGPU,
1191  wrapper->gpuCopy,
1193  imgCacheDim);
1194  return;
1195  }
1196 }
constexpr int BLOCK_DIM
void sqrt(Image< double > &op)
constexpr int SHARED_IMG
FRecBufferDataGPUWrapper ** wrappers
constexpr int GRID_DIM_Z
RecFourierBufferDataGPU * cpuCopy
cudaStream_t * streams
double * f
void convertImages(FRecBufferDataGPUWrapper *wrapper, float maxResolutionSqr, int streamIndex)
RecFourierBufferDataGPU * gpuCopy
void copyFrom(RecFourierBufferData *orig, int stream)
float * devBlobTableSqrt

◆ processBufferKernel()

template<bool useFast, bool hasCTF, int blobOrder, bool useFastKaiser>
__global__ void processBufferKernel ( float *  tempVolumeGPU,
float *  tempWeightsGPU,
RecFourierBufferDataGPU buffer,
float *  devBlobTableSqrt,
int  imgCacheDim 
)

Method will use data stored in the buffer and update temporal storages appropriately.

Definition at line 900 of file cuda_gpu_reconstruct_fourier.cpp.

904  {
905 #if SHARED_BLOB_TABLE
906  if ( ! useFast) {
907  // copy blob table to shared memory
908  volatile int id = threadIdx.y*blockDim.x + threadIdx.x;
909  volatile int blockSize = blockDim.x * blockDim.y;
910  for (int i = id; i < BLOB_TABLE_SIZE_SQRT; i+= blockSize)
912  __syncthreads();
913  }
914 #endif
915 
916  for (int i = blockIdx.z; i < buffer->getNoOfSpaces(); i += gridDim.z) {
918 
919 #if SHARED_IMG
920  if ( ! useFast) {
921  // make sure that all threads start at the same time
922  // as they can come from previous iteration
923  __syncthreads();
924  if ((threadIdx.x == 0) && (threadIdx.y == 0)) {
925  // first thread calculates which part of the image should be shared
926  calculateAABB(space, buffer, SHARED_AABB);
927  }
928  __syncthreads();
929  // check if the block will have to copy data from image
930  if (isWithin(SHARED_AABB, buffer->fftSizeX, buffer->fftSizeY)) {
931  // all threads copy image data to shared memory
932  copyImgToCache(IMG, SHARED_AABB, buffer, space->projectionIndex, imgCacheDim);
933  __syncthreads();
934  } else {
935  continue; // whole block can exit, as it's not reading from image
936  }
937  }
938 #endif
939 
940  processProjection<useFast, hasCTF, blobOrder, useFastKaiser>(
941  (float2*)tempVolumeGPU, tempWeightsGPU,
942  buffer->fftSizeX, buffer->fftSizeY,
943  buffer->getNthItem(buffer->CTFs, space->projectionIndex),
944  buffer->getNthItem(buffer->modulators, space->projectionIndex),
945  (float2*)buffer->getNthItem(buffer->FFTs, space->projectionIndex),
946  space,
948  imgCacheDim);
949  __syncthreads(); // sync threads to avoid write after read problems
950  }
951 }
__shared__ float BLOB_TABLE[BLOB_TABLE_SIZE_SQRT]
#define i
#define BLOB_TABLE_SIZE_SQRT
__device__ void copyImgToCache(float2 *dest, Point3D< float > *AABB, RecFourierBufferDataGPU *const buffer, int imgIndex, int imgCacheDim)
__device__ void calculateAABB(const RecFourierProjectionTraverseSpace *tSpace, const RecFourierBufferDataGPU *buffer, Point3D< float > *dest)
int space
Definition: rwDM3.cpp:394
RecFourierProjectionTraverseSpace * spaces
__device__ bool isWithin(Point3D< float > *AABB, int imgXSize, int imgYSize)
__device__ float * getNthItem(float *array, int itemIndex)
float * devBlobTableSqrt

◆ processProjection()

template<bool useFast, bool hasCTF, int blobOrder, bool useFastKaiser>
__device__ void processProjection ( float2 *  tempVolumeGPU,
float *  tempWeightsGPU,
int  xSize,
int  ySize,
const float *__restrict__  CTF,
const float *__restrict__  modulator,
const float2 *__restrict__  FFT,
const RecFourierProjectionTraverseSpace *const  tSpace,
const float *  devBlobTableSqrt,
int  imgCacheDim 
)

Method will process one projection image and add result to temporal spaces.

Definition at line 660 of file cuda_gpu_reconstruct_fourier.cpp.

669 {
670  // map thread to each (2D) voxel
671 #if TILE > 1
672  int id = threadIdx.y * blockDim.x + threadIdx.x;
673  int tidX = threadIdx.x % TILE + (id / (blockDim.y * TILE)) * TILE;
674  int tidY = (id / TILE) % blockDim.y;
675  int idx = blockIdx.x*blockDim.x + tidX;
676  int idy = blockIdx.y*blockDim.y + tidY;
677 #else
678  // map thread to each (2D) voxel
679  volatile int idx = blockIdx.x*blockDim.x + threadIdx.x;
680  volatile int idy = blockIdx.y*blockDim.y + threadIdx.y;
681 #endif
682 
683  if (tSpace->XY == tSpace->dir) { // iterate XY plane
684  if (idy >= tSpace->minY && idy <= tSpace->maxY) {
685  if (idx >= tSpace->minX && idx <= tSpace->maxX) {
686  if (useFast) {
687  float hitZ = getZ(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin);
688  int z = (int)(hitZ + 0.5f); // rounding
689  processVoxel<hasCTF>(tempVolumeGPU, tempWeightsGPU, idx, idy, z, xSize, ySize , CTF, modulator, FFT, tSpace);
690  } else {
691  float z1 = getZ(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin); // lower plane
692  float z2 = getZ(idx, idy, tSpace->unitNormal, tSpace->topOrigin); // upper plane
693  z1 = clamp(z1, 0, cMaxVolumeIndexYZ);
694  z2 = clamp(z2, 0, cMaxVolumeIndexYZ);
695  int lower = floorf(fminf(z1, z2));
696  int upper = ceilf(fmaxf(z1, z2));
697  for (int z = lower; z <= upper; z++) {
698  processVoxelBlob<hasCTF, blobOrder, useFastKaiser>(tempVolumeGPU, tempWeightsGPU, idx, idy, z, xSize, ySize , CTF, modulator, FFT, tSpace, devBlobTableSqrt, imgCacheDim);
699  }
700  }
701  }
702  }
703  } else if (tSpace->XZ == tSpace->dir) { // iterate XZ plane
704  if (idy >= tSpace->minZ && idy <= tSpace->maxZ) { // map z -> y
705  if (idx >= tSpace->minX && idx <= tSpace->maxX) {
706  if (useFast) {
707  float hitY =getY(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin);
708  int y = (int)(hitY + 0.5f); // rounding
709  processVoxel<hasCTF>(tempVolumeGPU, tempWeightsGPU, idx, y, idy, xSize, ySize , CTF, modulator, FFT, tSpace);
710  } else {
711  float y1 = getY(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin); // lower plane
712  float y2 = getY(idx, idy, tSpace->unitNormal, tSpace->topOrigin); // upper plane
713  y1 = clamp(y1, 0, cMaxVolumeIndexYZ);
714  y2 = clamp(y2, 0, cMaxVolumeIndexYZ);
715  int lower = floorf(fminf(y1, y2));
716  int upper = ceilf(fmaxf(y1, y2));
717  for (int y = lower; y <= upper; y++) {
718  processVoxelBlob<hasCTF, blobOrder, useFastKaiser>(tempVolumeGPU, tempWeightsGPU, idx, y, idy, xSize, ySize , CTF, modulator, FFT, tSpace, devBlobTableSqrt, imgCacheDim);
719  }
720  }
721  }
722  }
723  } else { // iterate YZ plane
724  if (idy >= tSpace->minZ && idy <= tSpace->maxZ) { // map z -> y
725  if (idx >= tSpace->minY && idx <= tSpace->maxY) { // map y > x
726  if (useFast) {
727  float hitX = getX(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin);
728  int x = (int)(hitX + 0.5f); // rounding
729  processVoxel<hasCTF>(tempVolumeGPU, tempWeightsGPU, x, idx, idy, xSize, ySize , CTF, modulator, FFT, tSpace);
730  } else {
731  float x1 = getX(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin); // lower plane
732  float x2 = getX(idx, idy, tSpace->unitNormal, tSpace->topOrigin); // upper plane
733  x1 = clamp(x1, 0, cMaxVolumeIndexX);
734  x2 = clamp(x2, 0, cMaxVolumeIndexX);
735  int lower = floorf(fminf(x1, x2));
736  int upper = ceilf(fmaxf(x1, x2));
737  for (int x = lower; x <= upper; x++) {
738  processVoxelBlob<hasCTF, blobOrder, useFastKaiser>(tempVolumeGPU, tempWeightsGPU, x, idx, idy, xSize, ySize , CTF, modulator, FFT, tSpace, devBlobTableSqrt, imgCacheDim);
739  }
740  }
741  }
742  }
743  }
744 }
enum RecFourierProjectionTraverseSpace::Direction dir
__device__ float getX(float y, float z, const Point3D< float > &n, const Point3D< float > &p0)
static double * y
__device__ __constant__ int cMaxVolumeIndexX
__device__ __constant__ int cMaxVolumeIndexYZ
doublereal * x
if(fabs(c[*nmax+ *nmax *c_dim1])==0.e0)
constexpr int TILE
#define CTF
double * f
double z
__device__ float getZ(float x, float y, const Point3D< float > &n, const Point3D< float > &p0)
__device__ float getY(float x, float z, const Point3D< float > &n, const Point3D< float > &p0)
float * devBlobTableSqrt

◆ processVoxel()

template<bool hasCTF>
__device__ void processVoxel ( float2 *  tempVolumeGPU,
float *  tempWeightsGPU,
int  x,
int  y,
int  z,
int  xSize,
int  ySize,
const float *__restrict__  CTF,
const float *__restrict__  modulator,
const float2 *__restrict__  FFT,
const RecFourierProjectionTraverseSpace *const  space 
)

Method will map one voxel from the temporal spaces to the given projection and update temporal spaces using the pixel value of the projection.

Definition at line 457 of file cuda_gpu_reconstruct_fourier.cpp.

465 {
466  Point3D<float> imgPos;
467  float wBlob = 1.f;
468  float wCTF = 1.f;
469  float wModulator = 1.f;
470 
471  float dataWeight = space->weight;
472 
473  // transform current point to center
474  imgPos.x = x - cMaxVolumeIndexX/2;
475  imgPos.y = y - cMaxVolumeIndexYZ/2;
476  imgPos.z = z - cMaxVolumeIndexYZ/2;
477  if (imgPos.x*imgPos.x + imgPos.y*imgPos.y + imgPos.z*imgPos.z > space->maxDistanceSqr) {
478  return; // discard iterations that would access pixel with too high frequency
479  }
480  // rotate around center
481  multiply(space->transformInv, imgPos);
482  if (imgPos.x < 0.f) return; // reading outside of the image boundary. Z is always correct and Y is checked by the condition above
483 
484  // transform back and round
485  // just Y coordinate needs adjusting, since X now matches to picture and Z is irrelevant
486  int imgX = clamp((int)(imgPos.x + 0.5f), 0, xSize - 1);
487  int imgY = clamp((int)(imgPos.y + 0.5f + cMaxVolumeIndexYZ / 2), 0, ySize - 1);
488 
489  int index3D = z * (cMaxVolumeIndexYZ+1) * (cMaxVolumeIndexX+1) + y * (cMaxVolumeIndexX+1) + x;
490  int index2D = imgY * xSize + imgX;
491 
492  if (hasCTF) {
493  wCTF = CTF[index2D];
494  wModulator = modulator[index2D];
495  }
496 
497  float weight = wBlob * wModulator * dataWeight;
498 
499  // use atomic as two blocks can write to same voxel
500  atomicAdd(&tempVolumeGPU[index3D].x, FFT[index2D].x * weight * wCTF);
501  atomicAdd(&tempVolumeGPU[index3D].y, FFT[index2D].y * weight * wCTF);
502  atomicAdd(&tempWeightsGPU[index3D], weight);
503 }
static double * y
__device__ __constant__ int cMaxVolumeIndexX
__device__ __constant__ int cMaxVolumeIndexYZ
doublereal * x
T z
Definition: point3D.h:56
#define CTF
T x
Definition: point3D.h:54
double z
__device__ double atomicAdd(double *address, double val)
__device__ void multiply(const float transform[3][3], Point3D< float > &inOut)
T y
Definition: point3D.h:55

◆ processVoxelBlob()

template<bool hasCTF, int blobOrder, bool useFastKaiser>
__device__ void processVoxelBlob ( float2 *  tempVolumeGPU,
float *  tempWeightsGPU,
int  x,
int  y,
int  z,
int  xSize,
int  ySize,
const float *__restrict__  CTF,
const float *__restrict__  modulator,
const float2 *__restrict__  FFT,
const RecFourierProjectionTraverseSpace *const  space,
const float *  blobTableSqrt,
int  imgCacheDim 
)

Method will map one voxel from the temporal spaces to the given projection and update temporal spaces using the pixel values of the projection withing the blob distance.

Definition at line 512 of file cuda_gpu_reconstruct_fourier.cpp.

522 {
523  Point3D<float> imgPos;
524  // transform current point to center
525  imgPos.x = x - cMaxVolumeIndexX/2;
526  imgPos.y = y - cMaxVolumeIndexYZ/2;
527  imgPos.z = z - cMaxVolumeIndexYZ/2;
528  if ((imgPos.x*imgPos.x + imgPos.y*imgPos.y + imgPos.z*imgPos.z) > space->maxDistanceSqr) {
529  return; // discard iterations that would access pixel with too high frequency
530  }
531  // rotate around center
532  multiply(space->transformInv, imgPos);
533  if (imgPos.x < -cBlobRadius) return; // reading outside of the image boundary. Z is always correct and Y is checked by the condition above
534  // transform back just Y coordinate, since X now matches to picture and Z is irrelevant
535  imgPos.y += cMaxVolumeIndexYZ / 2;
536 
537  // check that we don't want to collect data from far far away ...
538  float radiusSqr = cBlobRadius * cBlobRadius;
539  float zSqr = imgPos.z * imgPos.z;
540  if (zSqr > radiusSqr) return;
541 
542  // create blob bounding box
543  int minX = ceilf(imgPos.x - cBlobRadius);
544  int maxX = floorf(imgPos.x + cBlobRadius);
545  int minY = ceilf(imgPos.y - cBlobRadius);
546  int maxY = floorf(imgPos.y + cBlobRadius);
547  minX = fmaxf(minX, 0);
548  minY = fmaxf(minY, 0);
549  maxX = fminf(maxX, xSize-1);
550  maxY = fminf(maxY, ySize-1);
551 
552  int index3D = z * (cMaxVolumeIndexYZ+1) * (cMaxVolumeIndexX+1) + y * (cMaxVolumeIndexX+1) + x;
553  float2 vol;
554  float w;
555  vol.x = vol.y = w = 0.f;
556 #if !SHARED_IMG
557 #endif
558  float dataWeight = space->weight;
559 
560  // ugly spaghetti code, but improves performance by app. 10%
561  if (hasCTF) {
562  // check which pixel in the vicinity should contribute
563  for (int i = minY; i <= maxY; i++) {
564  float ySqr = (imgPos.y - i) * (imgPos.y - i);
565  float yzSqr = ySqr + zSqr;
566  if (yzSqr > radiusSqr) continue;
567  for (int j = minX; j <= maxX; j++) {
568  float xD = imgPos.x - j;
569  float distanceSqr = xD*xD + yzSqr;
570  if (distanceSqr > radiusSqr) continue;
571 
572 #if SHARED_IMG
573  int index2D = (i - SHARED_AABB[0].y) * imgCacheDim + (j-SHARED_AABB[0].x); // position in img - offset of the AABB
574 #else
575  int index2D = i * xSize + j;
576 #endif
577 
578  float wCTF = CTF[index2D];
579  float wModulator = modulator[index2D];
580 #if PRECOMPUTE_BLOB_VAL
581  int aux = (int) ((distanceSqr * cIDeltaSqrt + 0.5f));
582  #if SHARED_BLOB_TABLE
583  float wBlob = BLOB_TABLE[aux];
584  #else
585  float wBlob = blobTableSqrt[aux];
586  #endif
587 #else
588  float wBlob;
589  if (useFastKaiser) {
590  wBlob = kaiserValueFast(distanceSqr);
591  }
592  else {
593  wBlob = kaiserValue<blobOrder>(sqrtf(distanceSqr),cBlobRadius) * cIw0;
594  }
595 #endif
596  float weight = wBlob * wModulator * dataWeight;
597  w += weight;
598 #if SHARED_IMG
599  vol += IMG[index2D] * weight * wCTF;
600 #else
601  vol += FFT[index2D] * weight * wCTF;
602 #endif
603  }
604  }
605  } else {
606  // check which pixel in the vicinity should contribute
607  for (int i = minY; i <= maxY; i++) {
608  float ySqr = (imgPos.y - i) * (imgPos.y - i);
609  float yzSqr = ySqr + zSqr;
610  if (yzSqr > radiusSqr) continue;
611  for (int j = minX; j <= maxX; j++) {
612  float xD = imgPos.x - j;
613  float distanceSqr = xD*xD + yzSqr;
614  if (distanceSqr > radiusSqr) continue;
615 
616 #if SHARED_IMG
617  int index2D = (i - SHARED_AABB[0].y) * imgCacheDim + (j-SHARED_AABB[0].x); // position in img - offset of the AABB
618 #else
619  int index2D = i * xSize + j;
620 #endif
621 
622 #if PRECOMPUTE_BLOB_VAL
623  int aux = (int) ((distanceSqr * cIDeltaSqrt + 0.5f));
624 #if SHARED_BLOB_TABLE
625  float wBlob = BLOB_TABLE[aux];
626 #else
627  float wBlob = blobTableSqrt[aux];
628 #endif
629 #else
630  float wBlob;
631  if (useFastKaiser) {
632  wBlob = kaiserValueFast(distanceSqr);
633  }
634  else {
635  wBlob = kaiserValue<blobOrder>(sqrtf(distanceSqr),cBlobRadius) * cIw0;
636  }
637 #endif
638  float weight = wBlob * dataWeight;
639  w += weight;
640 #if SHARED_IMG
641  vol += IMG[index2D] * weight;
642 #else
643  vol += FFT[index2D] * weight;
644 #endif
645  }
646  }
647  }
648  // use atomic as two blocks can write to same voxel
649  atomicAdd(&tempVolumeGPU[index3D].x, vol.x);
650  atomicAdd(&tempVolumeGPU[index3D].y, vol.y);
651  atomicAdd(&tempWeightsGPU[index3D], w);
652 }
__device__ __constant__ float cBlobRadius
static double * y
__shared__ float BLOB_TABLE[BLOB_TABLE_SIZE_SQRT]
doublereal * w
__device__ __constant__ int cMaxVolumeIndexX
__device__ float kaiserValueFast(float distSqr)
__device__ __constant__ int cMaxVolumeIndexYZ
doublereal * x
#define i
__device__ __constant__ float cIDeltaSqrt
T z
Definition: point3D.h:56
#define CTF
double * f
T x
Definition: point3D.h:54
double z
#define j
__device__ __constant__ float cIw0
__device__ double atomicAdd(double *address, double val)
__device__ void multiply(const float transform[3][3], Point3D< float > &inOut)
T y
Definition: point3D.h:55

◆ rotate()

__device__ void rotate ( Point3D< float > *  box,
const float  transform[3][3] 
)

Method will rotate box using transformation matrix around center of the working space

Definition at line 751 of file cuda_gpu_reconstruct_fourier.cpp.

751  {
752  for (int i = 0; i < 8; i++) {
753  Point3D<float> imgPos;
754  // transform current point to center
755  imgPos.x = box[i].x - cMaxVolumeIndexX/2;
756  imgPos.y = box[i].y - cMaxVolumeIndexYZ/2;
757  imgPos.z = box[i].z - cMaxVolumeIndexYZ/2;
758  // rotate around center
759  multiply(transform, imgPos);
760  // transform back just Y coordinate, since X now matches to picture and Z is irrelevant
761  imgPos.y += cMaxVolumeIndexYZ / 2;
762 
763  box[i] = imgPos;
764  }
765 }
__device__ __constant__ int cMaxVolumeIndexX
__device__ __constant__ int cMaxVolumeIndexYZ
#define i
T z
Definition: point3D.h:56
T x
Definition: point3D.h:54
__device__ void multiply(const float transform[3][3], Point3D< float > &inOut)
T y
Definition: point3D.h:55

Variable Documentation

◆ cBlobAlpha

__device__ __constant__ float cBlobAlpha = 0.f

Definition at line 56 of file cuda_gpu_reconstruct_fourier.cpp.

◆ cBlobRadius

__device__ __constant__ float cBlobRadius = 0.f

Definition at line 54 of file cuda_gpu_reconstruct_fourier.cpp.

◆ cIDeltaSqrt

__device__ __constant__ float cIDeltaSqrt = 0.f

Definition at line 58 of file cuda_gpu_reconstruct_fourier.cpp.

◆ cIw0

__device__ __constant__ float cIw0 = 0.f

Definition at line 57 of file cuda_gpu_reconstruct_fourier.cpp.

◆ cMaxVolumeIndexX

__device__ __constant__ int cMaxVolumeIndexX = 0

Definition at line 52 of file cuda_gpu_reconstruct_fourier.cpp.

◆ cMaxVolumeIndexYZ

__device__ __constant__ int cMaxVolumeIndexYZ = 0

Definition at line 53 of file cuda_gpu_reconstruct_fourier.cpp.

◆ cOneOverBessiOrderAlpha

__device__ __constant__ float cOneOverBessiOrderAlpha = 0.f

Definition at line 59 of file cuda_gpu_reconstruct_fourier.cpp.

◆ cOneOverBlobRadiusSqr

__device__ __constant__ float cOneOverBlobRadiusSqr = 0.f

Definition at line 55 of file cuda_gpu_reconstruct_fourier.cpp.

◆ devBlobTableSqrt

float* devBlobTableSqrt = NULL

Definition at line 50 of file cuda_gpu_reconstruct_fourier.cpp.

◆ streams

cudaStream_t* streams

Definition at line 44 of file cuda_gpu_reconstruct_fourier.cpp.

◆ wrappers

Definition at line 47 of file cuda_gpu_reconstruct_fourier.cpp.