Xmipp  v3.23.11-Nereus
Classes | Functions | Variables
reconstruct_fourier_codelet_reconstruct.cpp File Reference
#include <atomic>
#include <core/multidim_array.h>
#include <core/xmipp_fft.h>
#include <core/xmipp_fftw.h>
#include <reconstruction/reconstruct_fourier_projection_traverse_space.h>
#include <reconstruction_cuda/cuda_basic_math.h>
#include <reconstruction_cuda/cuda_xmipp_utils.h>
#include <reconstruction_cuda/cuda_asserts.h>
#include <cuda_runtime_api.h>
#include <starpu.h>
#include "reconstruct_fourier_codelets.h"
#include "reconstruct_fourier_defines.h"
Include dependency graph for reconstruct_fourier_codelet_reconstruct.cpp:

Go to the source code of this file.

Classes

struct  CodeletConstants
 

Functions

void reconstruct_cuda_initialize_constants (int maxVolIndexX, int maxVolIndexYZ, float blobRadius, float blobAlpha, float iDeltaSqrt, float iw0, float oneOverBessiOrderAlpha)
 
__host__ __device__ float bessi0Fast (float x)
 
__host__ __device__ float bessi0 (float x)
 
__host__ __device__ float bessi1 (float x)
 
__host__ __device__ float bessi2 (float x)
 
__host__ __device__ float bessi3 (float x)
 
__host__ __device__ float bessi4 (float x)
 
template<int order>
__host__ __device__ float kaiserValue (float r, float a)
 
__host__ __device__ float kaiserValueFast (float distSqr)
 
__host__ __device__ float getZ (float x, float y, const Point3D< float > &n, const Point3D< float > &p0)
 
__host__ __device__ float getY (float x, float z, const Point3D< float > &n, const Point3D< float > &p0)
 
__host__ __device__ float getX (float y, float z, const Point3D< float > &n, const Point3D< float > &p0)
 
__host__ __device__ void multiply (const float transform[3][3], Point3D< float > &inOut)
 
__host__ __device__ void rotate (Point3D< float > box[8], const float transform[3][3])
 
__host__ __device__ void computeAABB (Point3D< float > AABB[2], const Point3D< float > cuboid[8])
 
__device__ void processVoxel (float2 *tempVolumeGPU, float *tempWeightsGPU, int x, int y, int z, int xSize, int ySize, const float2 *__restrict__ FFT, const RecFourierProjectionTraverseSpace *const space)
 
template<int blobOrder, bool useFastKaiser>
__device__ void processVoxelBlob (float2 *tempVolumeGPU, float *tempWeightsGPU, const int x, const int y, const int z, const int xSize, const int ySize, const float2 *__restrict__ FFT, const RecFourierProjectionTraverseSpace *const space, const float *blobTableSqrt, const int imgCacheDim)
 
template<bool useFast, int blobOrder, bool useFastKaiser>
__device__ void processProjection (float2 *tempVolumeGPU, float *tempWeightsGPU, const int xSize, const int ySize, const float2 *__restrict__ FFT, const RecFourierProjectionTraverseSpace *const tSpace, const float *blobTableSqrt, const int imgCacheDim)
 
__device__ void getImgData (const Point3D< float > AABB[2], const int tXindex, const int tYindex, const float2 *FFTs, const int fftSizeX, const int fftSizeY, const int imgIndex, float2 &vComplex)
 
__device__ void copyImgToCache (float2 *dest, const Point3D< float > AABB[2], const float2 *FFTs, const int fftSizeX, const int fftSizeY, const int imgIndex, const int imgCacheDim)
 
template<bool fastLateBlobbing, int blobOrder, bool useFastKaiser>
__global__ void processBufferKernel (float2 *outVolumeBuffer, float *outWeightsBuffer, const int fftSizeX, const int fftSizeY, const int traverseSpaceCount, const RecFourierProjectionTraverseSpace *traverseSpaces, const float2 *FFTs, const float *blobTableSqrt, int imgCacheDim)
 
template<int blobOrder, bool useFastKaiser>
void processBufferGPU (float2 *outVolumeBuffer, float *outWeightsBuffer, const int fftSizeX, const int fftSizeY, const int traverseSpaceCount, const RecFourierProjectionTraverseSpace *traverseSpaces, const float2 *inFFTs, const float *blobTableSqrt, const bool fastLateBlobbing, const float blobRadius, const int maxVolIndexYZ)
 
void func_reconstruct_cuda (void *buffers[], void *cl_arg)
 
void atomicAddFloat (volatile float *ptr, float addedValue)
 
void processVoxelCPU (float2 *const tempVolumeGPU, float *const tempWeightsGPU, const int x, const int y, const int z, const int xSize, const int ySize, const float2 *const __restrict__ FFT, const RecFourierProjectionTraverseSpace *const space)
 
template<int blobOrder, bool useFastKaiser, bool usePrecomputedInterpolation>
void processVoxelBlobCPU (float2 *const tempVolumeGPU, float *const tempWeightsGPU, const int x, const int y, const int z, const int xSize, const int ySize, const float2 *const __restrict__ FFT, const RecFourierProjectionTraverseSpace *const space, const float *blobTableSqrt)
 
template<bool useFast, int blobOrder, bool useFastKaiser, bool usePrecomputedInterpolation>
void processProjectionCPU (float2 *tempVolumeGPU, float *tempWeightsGPU, const int xSize, const int ySize, const float2 *__restrict__ FFT, const RecFourierProjectionTraverseSpace *const tSpace, const float *blobTableSqrt)
 
template<int blobOrder, bool useFastKaiser, bool usePrecomputedInterpolation>
void processBufferCPU (float2 *outVolumeBuffer, float *outWeightsBuffer, const int fftSizeX, const int fftSizeY, const int traverseSpaceCount, const RecFourierProjectionTraverseSpace *traverseSpaces, const float2 *inFFTs, const float *blobTableSqrt, const bool fastLateBlobbing)
 
template<bool usePrecomputedInterpolation>
void func_reconstruct_cpu_template (void *buffers[], void *cl_arg)
 
void func_reconstruct_cpu_lookup_interpolation (void *buffers[], void *cl_arg)
 
void func_reconstruct_cpu_dynamic_interpolation (void *buffers[], void *cl_arg)
 

Variables

__shared__ float BLOB_TABLE [BLOB_TABLE_SIZE_SQRT]
 
__device__ __constant__ CodeletConstants gpuC
 
CodeletConstants cpuC
 

Function Documentation

◆ atomicAddFloat()

void atomicAddFloat ( volatile float *  ptr,
float  addedValue 
)

Atomically increments the value pointed at by ptr by value. Uses relaxed memory model with no reordering guarantees.

Definition at line 892 of file reconstruct_fourier_codelet_reconstruct.cpp.

892  {
893  static_assert(sizeof(float) == sizeof(uint32_t), "atomicAddFloat requires floats to be 32bit");
894 
895  // This is probably fine, since the constructor/destructor should be trivial
896  // (As of C++11, this is guaranteed only for integral type specializations, but it is probably reasonably safe to assume
897  // that this will hold for floats as well. C++20 requies that by spec.)
898  volatile std::atomic<float>& atomicPtr = *reinterpret_cast<volatile std::atomic<float>*>(ptr);
899  float current = atomicPtr.load(std::memory_order::memory_order_relaxed);
900  while (true) {
901  const float newValue = current + addedValue;
902  // Since x86 does not allow atomic add of floats (only integers), we have to implement it through CAS
903  if (atomicPtr.compare_exchange_weak(current, newValue, std::memory_order::memory_order_relaxed)) {
904  // Current was still current and was replaced with the newValue. Done.
905  return;
906  }
907  // Comparison failed. current now contains the new value and we try again.
908  }
909 }

◆ bessi0()

__host__ __device__ float bessi0 ( float  x)

Definition at line 123 of file reconstruct_fourier_codelet_reconstruct.cpp.

123  {
124  float y, ax, ans;
125  if ((ax = fabsf(x)) < 3.75f)
126  {
127  y = x / 3.75f;
128  y *= y;
129  ans = 1.f + y * (3.5156229f + y * (3.0899424f + y * (1.2067492f
130  + y * (0.2659732f + y * (0.360768e-1f + y * 0.45813e-2f)))));
131  }
132  else
133  {
134  y = 3.75f / ax;
135  ans = (expf(ax) * rsqrtf(ax)) * (0.39894228f + y * (0.1328592e-1f
136  + y * (0.225319e-2f + y * (-0.157565e-2f + y * (0.916281e-2f
137  + y * (-0.2057706e-1f + y * (0.2635537e-1f + y * (-0.1647633e-1f
138  + y * 0.392377e-2f))))))));
139  }
140  return ans;
141 }
static double * y
doublereal * x
double * f

◆ bessi0Fast()

__host__ __device__ float bessi0Fast ( float  x)

Definition at line 99 of file reconstruct_fourier_codelet_reconstruct.cpp.

99  { // X must be <= 15
100  // stable rational minimax approximations to the modified bessel functions, blair, edwards
101  // from table 5
102  float x2 = x*x;
103  float num = -0.8436825781374849e-19f; // p11
104  num = fmaf(num, x2, -0.93466495199548700e-17f); // p10
105  num = fmaf(num, x2, -0.15716375332511895e-13f); // p09
106  num = fmaf(num, x2, -0.42520971595532318e-11f); // p08
107  num = fmaf(num, x2, -0.13704363824102120e-8f); // p07
108  num = fmaf(num, x2, -0.28508770483148419e-6f); // p06
109  num = fmaf(num, x2, -0.44322160233346062e-4f); // p05
110  num = fmaf(num, x2, -0.46703811755736946e-2f); // p04
111  num = fmaf(num, x2, -0.31112484643702141e-0f); // p03
112  num = fmaf(num, x2, -0.11512633616429962e+2f); // p02
113  num = fmaf(num, x2, -0.18720283332732112e+3f); // p01
114  num = fmaf(num, x2, -0.75281108169006924e+3f); // p00
115 
116  float den = 1.f; // q01
117  den = fmaf(den, x2, -0.75281109410939403e+3f); // q00
118 
119  return num/den;
120 }
doublereal * x
double * f

◆ bessi1()

__host__ __device__ float bessi1 ( float  x)

Definition at line 144 of file reconstruct_fourier_codelet_reconstruct.cpp.

144  {
145  float ax, ans;
146  float y;
147  if ((ax = fabsf(x)) < 3.75f)
148  {
149  y = x / 3.75f;
150  y *= y;
151  ans = ax * (0.5f + y * (0.87890594f + y * (0.51498869f + y * (0.15084934f
152  + y * (0.2658733e-1f + y * (0.301532e-2f + y * 0.32411e-3f))))));
153  }
154  else
155  {
156  y = 3.75f / ax;
157  ans = 0.2282967e-1f + y * (-0.2895312e-1f + y * (0.1787654e-1f
158  - y * 0.420059e-2f));
159  ans = 0.39894228f + y * (-0.3988024e-1f + y * (-0.362018e-2f
160  + y * (0.163801e-2f + y * (-0.1031555e-1f + y * ans))));
161  ans *= (expf(ax) * rsqrtf(ax));
162  }
163  return x < 0.0 ? -ans : ans;
164 }
static double * y
doublereal * x
double * f

◆ bessi2()

__host__ __device__ float bessi2 ( float  x)

Definition at line 167 of file reconstruct_fourier_codelet_reconstruct.cpp.

167  {
168  return (x == 0) ? 0 : bessi0(x) - ((2*1) / x) * bessi1(x);
169 }
doublereal * x
__host__ __device__ float bessi1(float x)
__host__ __device__ float bessi0(float x)

◆ bessi3()

__host__ __device__ float bessi3 ( float  x)

Definition at line 172 of file reconstruct_fourier_codelet_reconstruct.cpp.

172  {
173  return (x == 0) ? 0 : bessi1(x) - ((2*2) / x) * bessi2(x);
174 }
doublereal * x
__host__ __device__ float bessi2(float x)
__host__ __device__ float bessi1(float x)

◆ bessi4()

__host__ __device__ float bessi4 ( float  x)

Definition at line 177 of file reconstruct_fourier_codelet_reconstruct.cpp.

177  {
178  return (x == 0) ? 0 : bessi2(x) - ((2*3) / x) * bessi3(x);
179 }
__host__ __device__ float bessi3(float x)
doublereal * x
__host__ __device__ float bessi2(float x)

◆ computeAABB()

__host__ __device__ void computeAABB ( Point3D< float >  AABB[2],
const Point3D< float >  cuboid[8] 
)

Compute Axis Aligned Bounding Box of given cuboid

Definition at line 317 of file reconstruct_fourier_codelet_reconstruct.cpp.

317  {
318  AABB[0].x = AABB[0].y = AABB[0].z = INFINITY;
319  AABB[1].x = AABB[1].y = AABB[1].z = -INFINITY;
320  for (int i = 0; i < 8; i++) {
321  Point3D<float> tmp = cuboid[i];
322  if (AABB[0].x > tmp.x) AABB[0].x = tmp.x;
323  if (AABB[0].y > tmp.y) AABB[0].y = tmp.y;
324  if (AABB[0].z > tmp.z) AABB[0].z = tmp.z;
325  if (AABB[1].x < tmp.x) AABB[1].x = tmp.x;
326  if (AABB[1].y < tmp.y) AABB[1].y = tmp.y;
327  if (AABB[1].z < tmp.z) AABB[1].z = tmp.z;
328  }
329  AABB[0].x = ceilf(AABB[0].x);
330  AABB[0].y = ceilf(AABB[0].y);
331  AABB[0].z = ceilf(AABB[0].z);
332 
333  AABB[1].x = floorf(AABB[1].x);
334  AABB[1].y = floorf(AABB[1].y);
335  AABB[1].z = floorf(AABB[1].z);
336 }
static double * y
doublereal * x
#define i
T z
Definition: point3D.h:56
T x
Definition: point3D.h:54
double z
T y
Definition: point3D.h:55

◆ copyImgToCache()

__device__ void copyImgToCache ( float2 *  dest,
const Point3D< float >  AABB[2],
const float2 *  FFTs,
const int  fftSizeX,
const int  fftSizeY,
const int  imgIndex,
const int  imgCacheDim 
)

Method will copy imgIndex(th) data from buffer to given destination (shared memory). Only data within AABB will be copied. Destination is expected to be continuous array of sufficient size (imgCacheDim^2)

Definition at line 695 of file reconstruct_fourier_codelet_reconstruct.cpp.

697  {
698  for (int y = threadIdx.y; y < imgCacheDim; y += blockDim.y) {
699  for (int x = threadIdx.x; x < imgCacheDim; x += blockDim.x) {
700  int memIndex = y * imgCacheDim + x;
701  getImgData(AABB, x, y, FFTs, fftSizeX, fftSizeY, imgIndex, dest[memIndex]);
702  }
703  }
704 }
static double * y
doublereal * x
__device__ void getImgData(const Point3D< float > AABB[2], const int tXindex, const int tYindex, const float2 *FFTs, const int fftSizeX, const int fftSizeY, const int imgIndex, float2 &vComplex)

◆ func_reconstruct_cpu_dynamic_interpolation()

void func_reconstruct_cpu_dynamic_interpolation ( void *  buffers[],
void *  cl_arg 
)

Definition at line 1209 of file reconstruct_fourier_codelet_reconstruct.cpp.

1209  {
1210  func_reconstruct_cpu_template<false>(buffers, cl_arg);
1211 }

◆ func_reconstruct_cpu_lookup_interpolation()

void func_reconstruct_cpu_lookup_interpolation ( void *  buffers[],
void *  cl_arg 
)

Definition at line 1205 of file reconstruct_fourier_codelet_reconstruct.cpp.

1205  {
1206  func_reconstruct_cpu_template<true>(buffers, cl_arg);
1207 }

◆ func_reconstruct_cpu_template()

template<bool usePrecomputedInterpolation>
void func_reconstruct_cpu_template ( void *  buffers[],
void *  cl_arg 
)

Definition at line 1141 of file reconstruct_fourier_codelet_reconstruct.cpp.

1141  {
1142  const ReconstructFftArgs& arg = *(ReconstructFftArgs*) cl_arg;
1143  const float2* inFFTs = (float2*)STARPU_VECTOR_GET_PTR(buffers[0]);
1144  const RecFourierProjectionTraverseSpace* inSpaces = (RecFourierProjectionTraverseSpace*)STARPU_MATRIX_GET_PTR(buffers[1]);
1145  const float* inBlobTableSqrt = (float*)(STARPU_VECTOR_GET_PTR(buffers[2]));
1146  float2* outVolumeBuffer = (float2*)(STARPU_VECTOR_GET_PTR(buffers[3])); // Actually std::complex<float>
1147  float* outWeightsBuffer = (float*)(STARPU_VECTOR_GET_PTR(buffers[4]));
1148  const uint32_t noOfImages = ((LoadedImagesBuffer*) STARPU_VARIABLE_GET_PTR(buffers[5]))->noOfImages;
1149 
1150  switch (arg.blobOrder) {
1151  case 0:
1152  if (arg.blobAlpha <= 15.0) {
1153  processBufferCPU<0, true, usePrecomputedInterpolation>(outVolumeBuffer, outWeightsBuffer,
1154  arg.fftSizeX, arg.fftSizeY,
1155  arg.noOfSymmetries * noOfImages, inSpaces,
1156  inFFTs,
1157  inBlobTableSqrt,
1158  arg.fastLateBlobbing);
1159  } else {
1160  processBufferCPU<0, false, usePrecomputedInterpolation>(outVolumeBuffer, outWeightsBuffer,
1161  arg.fftSizeX, arg.fftSizeY,
1162  arg.noOfSymmetries * noOfImages, inSpaces,
1163  inFFTs,
1164  inBlobTableSqrt,
1165  arg.fastLateBlobbing);
1166  }
1167  break;
1168  case 1:
1169  processBufferCPU<1, false, usePrecomputedInterpolation>(outVolumeBuffer, outWeightsBuffer,
1170  arg.fftSizeX, arg.fftSizeY,
1171  arg.noOfSymmetries * noOfImages, inSpaces,
1172  inFFTs,
1173  inBlobTableSqrt,
1174  arg.fastLateBlobbing);
1175  break;
1176  case 2:
1177  processBufferCPU<2, false, usePrecomputedInterpolation>(outVolumeBuffer, outWeightsBuffer,
1178  arg.fftSizeX, arg.fftSizeY,
1179  arg.noOfSymmetries * noOfImages, inSpaces,
1180  inFFTs,
1181  inBlobTableSqrt,
1182  arg.fastLateBlobbing);
1183  break;
1184  case 3:
1185  processBufferCPU<3, false, usePrecomputedInterpolation>(outVolumeBuffer, outWeightsBuffer,
1186  arg.fftSizeX, arg.fftSizeY,
1187  arg.noOfSymmetries * noOfImages, inSpaces,
1188  inFFTs,
1189  inBlobTableSqrt,
1190  arg.fastLateBlobbing);
1191  break;
1192  case 4:
1193  processBufferCPU<4, false, usePrecomputedInterpolation>(outVolumeBuffer, outWeightsBuffer,
1194  arg.fftSizeX, arg.fftSizeY,
1195  arg.noOfSymmetries * noOfImages, inSpaces,
1196  inFFTs,
1197  inBlobTableSqrt,
1198  arg.fastLateBlobbing);
1199  break;
1200  default:
1201  REPORT_ERROR(ERR_VALUE_INCORRECT, "m out of range [0..4] in kaiser_value()");
1202  }
1203 }
#define REPORT_ERROR(nerr, ErrormMsg)
Definition: xmipp_error.h:211
Incorrect value received.
Definition: xmipp_error.h:195

◆ func_reconstruct_cuda()

void func_reconstruct_cuda ( void *  buffers[],
void *  cl_arg 
)

Definition at line 815 of file reconstruct_fourier_codelet_reconstruct.cpp.

815  {
816  const ReconstructFftArgs& arg = *(ReconstructFftArgs*) cl_arg;
817  const float2* inFFTs = (float2*)STARPU_VECTOR_GET_PTR(buffers[0]);
818  const RecFourierProjectionTraverseSpace* inSpaces = (RecFourierProjectionTraverseSpace*)STARPU_MATRIX_GET_PTR(buffers[1]);
819  const float* inBlobTableSqrt = (float*)(STARPU_VECTOR_GET_PTR(buffers[2]));
820  float2* outVolumeBuffer = (float2*)(STARPU_VECTOR_GET_PTR(buffers[3])); // Actually std::complex<float>
821  float* outWeightsBuffer = (float*)(STARPU_VECTOR_GET_PTR(buffers[4]));
822  const uint32_t noOfImages = ((LoadedImagesBuffer*) STARPU_VARIABLE_GET_PTR(buffers[5]))->noOfImages;
823 
824  switch (arg.blobOrder) {
825  case 0:
826  if (arg.blobAlpha <= 15.0) {
827  processBufferGPU<0, true>(outVolumeBuffer, outWeightsBuffer,
828  arg.fftSizeX, arg.fftSizeY,
829  arg.noOfSymmetries * noOfImages, inSpaces,
830  inFFTs,
831  inBlobTableSqrt,
832  arg.fastLateBlobbing,
833  arg.blobRadius, arg.maxVolIndexYZ);
834  } else {
835  processBufferGPU<0, false>(outVolumeBuffer, outWeightsBuffer,
836  arg.fftSizeX, arg.fftSizeY,
837  arg.noOfSymmetries * noOfImages, inSpaces,
838  inFFTs,
839  inBlobTableSqrt,
840  arg.fastLateBlobbing,
841  arg.blobRadius, arg.maxVolIndexYZ);
842  }
843  break;
844  case 1:
845  processBufferGPU<1, false>(outVolumeBuffer, outWeightsBuffer,
846  arg.fftSizeX, arg.fftSizeY,
847  arg.noOfSymmetries * noOfImages, inSpaces,
848  inFFTs,
849  inBlobTableSqrt,
850  arg.fastLateBlobbing,
851  arg.blobRadius, arg.maxVolIndexYZ);
852  break;
853  case 2:
854  processBufferGPU<2, false>(outVolumeBuffer, outWeightsBuffer,
855  arg.fftSizeX, arg.fftSizeY,
856  arg.noOfSymmetries * noOfImages, inSpaces,
857  inFFTs,
858  inBlobTableSqrt,
859  arg.fastLateBlobbing,
860  arg.blobRadius, arg.maxVolIndexYZ);
861  break;
862  case 3:
863  processBufferGPU<3, false>(outVolumeBuffer, outWeightsBuffer,
864  arg.fftSizeX, arg.fftSizeY,
865  arg.noOfSymmetries * noOfImages, inSpaces,
866  inFFTs,
867  inBlobTableSqrt,
868  arg.fastLateBlobbing,
869  arg.blobRadius, arg.maxVolIndexYZ);
870  break;
871  case 4:
872  processBufferGPU<4, false>(outVolumeBuffer, outWeightsBuffer,
873  arg.fftSizeX, arg.fftSizeY,
874  arg.noOfSymmetries * noOfImages, inSpaces,
875  inFFTs,
876  inBlobTableSqrt,
877  arg.fastLateBlobbing,
878  arg.blobRadius, arg.maxVolIndexYZ);
879  break;
880  default:
881  REPORT_ERROR(ERR_VALUE_INCORRECT, "m out of range [0..4] in kaiser_value()");
882  }
883 
884  // gpuErrchk(cudaStreamSynchronize(starpu_cuda_get_local_stream())); disabled because codelet is async
885 }
#define REPORT_ERROR(nerr, ErrormMsg)
Definition: xmipp_error.h:211
Incorrect value received.
Definition: xmipp_error.h:195

◆ getImgData()

__device__ void getImgData ( const Point3D< float >  AABB[2],
const int  tXindex,
const int  tYindex,
const float2 *  FFTs,
const int  fftSizeX,
const int  fftSizeY,
const int  imgIndex,
float2 &  vComplex 
)

Method will load data from image at position tXindex, tYindex and return them. In case the data lies outside of the image boundaries, zeros (0,0) are returned

Definition at line 670 of file reconstruct_fourier_codelet_reconstruct.cpp.

673  {
674  int imgXindex = tXindex + static_cast<int>(AABB[0].x);
675  int imgYindex = tYindex + static_cast<int>(AABB[0].y);
676  if ((imgXindex >= 0)
677  && (imgXindex < fftSizeX)
678  && (imgYindex >=0)
679  && (imgYindex < fftSizeY)) {
680  int index = imgYindex * fftSizeX + imgXindex; // copy data from image
681  vComplex = (FFTs + fftSizeX * fftSizeY * imgIndex)[index];
682  } else {
683  vComplex = {0.f, 0.f}; // out of image bound, so return zero
684  }
685 }
viol index
T x
Definition: point3D.h:54
T y
Definition: point3D.h:55

◆ getX()

__host__ __device__ float getX ( float  y,
float  z,
const Point3D< float > &  n,
const Point3D< float > &  p0 
)

Calculates X coordinate of the point [y, z] on the plane defined by p0 (origin) and normal

Definition at line 269 of file reconstruct_fourier_codelet_reconstruct.cpp.

269  {
270  // from a(x-x0)+b(y-y0)+c(z-z0)=0
271  return (-n.y*(y-p0.y)-n.z*(z-p0.z))/n.x + p0.x;
272 }
static double * y
T z
Definition: point3D.h:56
T x
Definition: point3D.h:54
double z
T y
Definition: point3D.h:55

◆ getY()

__host__ __device__ float getY ( float  x,
float  z,
const Point3D< float > &  n,
const Point3D< float > &  p0 
)

Calculates Y coordinate of the point [x, z] on the plane defined by p0 (origin) and normal

Definition at line 261 of file reconstruct_fourier_codelet_reconstruct.cpp.

261  {
262  // from a(x-x0)+b(y-y0)+c(z-z0)=0
263  return (-n.x*(x-p0.x)-n.z*(z-p0.z))/n.y + p0.y;
264 }
doublereal * x
T z
Definition: point3D.h:56
T x
Definition: point3D.h:54
double z
T y
Definition: point3D.h:55

◆ getZ()

__host__ __device__ float getZ ( float  x,
float  y,
const Point3D< float > &  n,
const Point3D< float > &  p0 
)

Calculates Z coordinate of the point [x, y] on the plane defined by p0 (origin) and normal

Definition at line 254 of file reconstruct_fourier_codelet_reconstruct.cpp.

254  {
255  // from a(x-x0)+b(y-y0)+c(z-z0)=0
256  return (-n.x*(x-p0.x)-n.y*(y-p0.y))/n.z + p0.z;
257 }
static double * y
doublereal * x
T z
Definition: point3D.h:56
T x
Definition: point3D.h:54
T y
Definition: point3D.h:55

◆ kaiserValue()

template<int order>
__host__ __device__ float kaiserValue ( float  r,
float  a 
)

Definition at line 183 of file reconstruct_fourier_codelet_reconstruct.cpp.

183  {
184  const CodeletConstants& c =
185 #ifdef __CUDA_ARCH__
186  gpuC;
187 #else
188  cpuC;
189 #endif
190 
191  float w;
192  float rda = r / a;
193  if (rda <= 1.f)
194  {
195  float rdas = rda * rda;
196  float arg = c.cBlobAlpha * sqrtf(1.f - rdas);
197  if (order == 0)
198  {
199  w = bessi0(arg) * c.cOneOverBessiOrderAlpha;
200  }
201  else if (order == 1)
202  {
203  w = sqrtf (1.f - rdas);
204  w *= bessi1(arg) * c.cOneOverBessiOrderAlpha;
205  }
206  else if (order == 2)
207  {
208  w = sqrtf (1.f - rdas);
209  w = w * w;
210  w *= bessi2(arg) * c.cOneOverBessiOrderAlpha;
211  }
212  else if (order == 3)
213  {
214  w = sqrtf (1.f - rdas);
215  w = w * w * w;
216  w *= bessi3(arg) * c.cOneOverBessiOrderAlpha;
217  }
218  else if (order == 4)
219  {
220  w = sqrtf (1.f - rdas);
221  w = w * w * w *w;
222  w *= bessi4(arg) * c.cOneOverBessiOrderAlpha;
223  }
224  else {
225  printf("order (%d) out of range in kaiser_value(): %s, %d\n", order, __FILE__, __LINE__);
226  w = 0.f;
227  }
228  }
229  else
230  w = 0.f;
231 
232  return w;
233 }
doublereal * c
__host__ __device__ float bessi3(float x)
doublereal * w
__device__ __constant__ CodeletConstants gpuC
__host__ __device__ float bessi2(float x)
double * f
__host__ __device__ float bessi1(float x)
__host__ __device__ float bessi4(float x)
__host__ __device__ float bessi0(float x)
int order
Derivation order and Bessel function order.
Definition: blobs.h:118
doublereal * a

◆ kaiserValueFast()

__host__ __device__ float kaiserValueFast ( float  distSqr)

Definition at line 236 of file reconstruct_fourier_codelet_reconstruct.cpp.

236  {
237  const CodeletConstants& c =
238 #ifdef __CUDA_ARCH__
239  gpuC;
240 #else
241  cpuC;
242 #endif
243 
244  float arg = c.cBlobAlpha * sqrtf(1.f - (distSqr * c.cOneOverBlobRadiusSqr)); // alpha * sqrt(1-(dist/blobRadius^2))
245  return bessi0Fast(arg) * c.cOneOverBessiOrderAlpha * c.cIw0;
246 }
doublereal * c
__device__ __constant__ CodeletConstants gpuC
double * f
__host__ __device__ float bessi0Fast(float x)

◆ multiply()

__host__ __device__ void multiply ( const float  transform[3][3],
Point3D< float > &  inOut 
)

Do 3x3 x 1x3 matrix-vector multiplication

Definition at line 276 of file reconstruct_fourier_codelet_reconstruct.cpp.

276  {
277  float tmp0 = transform[0][0] * inOut.x + transform[0][1] * inOut.y + transform[0][2] * inOut.z;
278  float tmp1 = transform[1][0] * inOut.x + transform[1][1] * inOut.y + transform[1][2] * inOut.z;
279  float tmp2 = transform[2][0] * inOut.x + transform[2][1] * inOut.y + transform[2][2] * inOut.z;
280  inOut.x = tmp0;
281  inOut.y = tmp1;
282  inOut.z = tmp2;
283 }
T z
Definition: point3D.h:56
T x
Definition: point3D.h:54
T y
Definition: point3D.h:55

◆ processBufferCPU()

template<int blobOrder, bool useFastKaiser, bool usePrecomputedInterpolation>
void processBufferCPU ( float2 *  outVolumeBuffer,
float *  outWeightsBuffer,
const int  fftSizeX,
const int  fftSizeY,
const int  traverseSpaceCount,
const RecFourierProjectionTraverseSpace traverseSpaces,
const float2 *  inFFTs,
const float *  blobTableSqrt,
const bool  fastLateBlobbing 
)

Definition at line 1105 of file reconstruct_fourier_codelet_reconstruct.cpp.

1111  {
1112 
1113  const int groupSize = starpu_combined_worker_get_size();
1114  const int groupRank = starpu_combined_worker_get_rank();
1115 
1116  for (int i = groupRank; i < traverseSpaceCount; i += groupSize) {
1117  const RecFourierProjectionTraverseSpace &space = traverseSpaces[i];
1118 
1119  const float2* spaceFFT = inFFTs + fftSizeX * fftSizeY * space.projectionIndex;
1120 
1121  // by using templates, we can save some registers, especially for 'fast' version
1122  if (fastLateBlobbing) {
1123  processProjectionCPU<true, blobOrder, useFastKaiser, usePrecomputedInterpolation>(
1124  outVolumeBuffer, outWeightsBuffer,
1125  fftSizeX, fftSizeY,
1126  spaceFFT,
1127  &space,
1128  blobTableSqrt);
1129  } else {
1130  processProjectionCPU<false, blobOrder, useFastKaiser, usePrecomputedInterpolation>(
1131  outVolumeBuffer, outWeightsBuffer,
1132  fftSizeX, fftSizeY,
1133  spaceFFT,
1134  &space,
1135  blobTableSqrt);
1136  }
1137  }
1138 }
#define i
int space
Definition: rwDM3.cpp:394

◆ processBufferGPU()

template<int blobOrder, bool useFastKaiser>
void processBufferGPU ( float2 *  outVolumeBuffer,
float *  outWeightsBuffer,
const int  fftSizeX,
const int  fftSizeY,
const int  traverseSpaceCount,
const RecFourierProjectionTraverseSpace traverseSpaces,
const float2 *  inFFTs,
const float *  blobTableSqrt,
const bool  fastLateBlobbing,
const float  blobRadius,
const int  maxVolIndexYZ 
)

Method will use data stored in the buffer and update temporal storages appropriately. Actual calculation is done asynchronously, but 'buffer' can be reused once the method returns.

Definition at line 774 of file reconstruct_fourier_codelet_reconstruct.cpp.

781  {
782 
783  // enqueue kernel and return control
784  const int imgCacheDim = static_cast<int>(ceil(sqrt(2.f) * sqrt(3.f) * (BLOCK_DIM + 2 * blobRadius)));
785  dim3 dimBlock(BLOCK_DIM, BLOCK_DIM);
786 
787  const int size2D = maxVolIndexYZ + 1;
788  dim3 dimGrid(static_cast<unsigned int>(ceil(size2D / (float)dimBlock.x)),
789  static_cast<unsigned int>(ceil(size2D / (float)dimBlock.y)),
790  GRID_DIM_Z);
791 
792  // by using templates, we can save some registers, especially for 'fast' version
793  if (fastLateBlobbing) {
794  processBufferKernel<true, blobOrder,useFastKaiser><<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>>(
795  outVolumeBuffer, outWeightsBuffer,
796  fftSizeX, fftSizeY,
797  traverseSpaceCount, traverseSpaces,
798  inFFTs,
799  blobTableSqrt,
800  imgCacheDim);
801  } else {
802  // if making copy of the image in shared memory, allocate enough space
803  int sharedMemSize = SHARED_IMG ? (imgCacheDim*imgCacheDim*sizeof(float2)) : 0;
804  processBufferKernel<false, blobOrder,useFastKaiser><<<dimGrid, dimBlock, sharedMemSize, starpu_cuda_get_local_stream()>>>(
805  outVolumeBuffer, outWeightsBuffer,
806  fftSizeX, fftSizeY,
807  traverseSpaceCount, traverseSpaces,
808  inFFTs,
809  blobTableSqrt,
810  imgCacheDim);
811  }
812  gpuErrchk(cudaPeekAtLastError());
813 }
#define gpuErrchk(code)
Definition: cuda_asserts.h:31
constexpr int BLOCK_DIM
void sqrt(Image< double > &op)
constexpr int SHARED_IMG
constexpr int GRID_DIM_Z
double * f

◆ processBufferKernel()

template<bool fastLateBlobbing, int blobOrder, bool useFastKaiser>
__global__ void processBufferKernel ( float2 *  outVolumeBuffer,
float *  outWeightsBuffer,
const int  fftSizeX,
const int  fftSizeY,
const int  traverseSpaceCount,
const RecFourierProjectionTraverseSpace traverseSpaces,
const float2 *  FFTs,
const float *  blobTableSqrt,
int  imgCacheDim 
)

Method will use data stored in the buffer and update temporal storages appropriately.

Definition at line 712 of file reconstruct_fourier_codelet_reconstruct.cpp.

718  {
719 
720 #if SHARED_BLOB_TABLE
721  if ( ! fastLateBlobbing) {
722  // copy blob table to shared memory
723  volatile int id = threadIdx.y*blockDim.x + threadIdx.x;
724  volatile int blockSize = blockDim.x * blockDim.y;
725  for (int i = id; i < BLOB_TABLE_SIZE_SQRT; i+= blockSize)
726  BLOB_TABLE[i] = blobTableSqrt[i];
727  __syncthreads();
728  }
729 #endif
730 
731  for (int i = blockIdx.z; i < traverseSpaceCount; i += gridDim.z) {
732  const RecFourierProjectionTraverseSpace& space = traverseSpaces[i];
733 
734 #if SHARED_IMG
735  if ( ! fastLateBlobbing) {
736  // make sure that all threads start at the same time
737  // as they can come from previous iteration
738  __syncthreads();
739  if ((threadIdx.x == 0) && (threadIdx.y == 0)) {
740  // first thread calculates which part of the image should be shared
741  calculateAABB(&space, SHARED_AABB);
742  }
743  __syncthreads();
744  // check if the block will have to copy data from image
745  if (isWithin(SHARED_AABB, fftSizeX, fftSizeY)) {
746  // all threads copy image data to shared memory
747  copyImgToCache(IMG, SHARED_AABB, FFTs, fftSizeX, fftSizeY, space.projectionIndex, imgCacheDim);
748  __syncthreads();
749  } else {
750  continue; // whole block can exit, as it's not reading from image
751  }
752  }
753 #endif
754 
755  processProjection<fastLateBlobbing, blobOrder, useFastKaiser>(
756  outVolumeBuffer, outWeightsBuffer,
757  fftSizeX, fftSizeY,
758  FFTs + fftSizeX * fftSizeY * space.projectionIndex,
759  &space,
760  blobTableSqrt,
761  imgCacheDim);
762 
763  __syncthreads(); // sync threads to avoid write after read problems
764  }
765 }
__shared__ float BLOB_TABLE[BLOB_TABLE_SIZE_SQRT]
#define i
#define BLOB_TABLE_SIZE_SQRT
__device__ void calculateAABB(const RecFourierProjectionTraverseSpace *tSpace, const RecFourierBufferDataGPU *buffer, Point3D< float > *dest)
int space
Definition: rwDM3.cpp:394
__device__ void copyImgToCache(float2 *dest, const Point3D< float > AABB[2], const float2 *FFTs, const int fftSizeX, const int fftSizeY, const int imgIndex, const int imgCacheDim)
__device__ bool isWithin(Point3D< float > *AABB, int imgXSize, int imgYSize)

◆ processProjection()

template<bool useFast, int blobOrder, bool useFastKaiser>
__device__ void processProjection ( float2 *  tempVolumeGPU,
float *  tempWeightsGPU,
const int  xSize,
const int  ySize,
const float2 *__restrict__  FFT,
const RecFourierProjectionTraverseSpace *const  tSpace,
const float *  blobTableSqrt,
const int  imgCacheDim 
)

Method will process one projection image and add result to temporal spaces.

Definition at line 579 of file reconstruct_fourier_codelet_reconstruct.cpp.

586 {
587  // map thread to each (2D) voxel
588 #if TILE > 1
589  int id = threadIdx.y * blockDim.x + threadIdx.x;
590  int tidX = threadIdx.x % TILE + (id / (blockDim.y * TILE)) * TILE;
591  int tidY = (id / TILE) % blockDim.y;
592  int idx = blockIdx.x * blockDim.x + tidX;
593  int idy = blockIdx.y * blockDim.y + tidY;
594 #else
595  // map thread to each (2D) voxel
596  volatile int idx = blockIdx.x*blockDim.x + threadIdx.x;
597  volatile int idy = blockIdx.y*blockDim.y + threadIdx.y;
598 #endif
599 
600  if (tSpace->XY == tSpace->dir) { // iterate XY plane
601  if (idy >= tSpace->minY && idy <= tSpace->maxY) {
602  if (idx >= tSpace->minX && idx <= tSpace->maxX) {
603  if (useFast) {
604  float hitZ = getZ(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin);
605  int z = (int)(hitZ + 0.5f); // rounding
606  processVoxel(tempVolumeGPU, tempWeightsGPU, idx, idy, z, xSize, ySize, FFT, tSpace);
607  } else {
608  float z1 = getZ(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin); // lower plane
609  float z2 = getZ(idx, idy, tSpace->unitNormal, tSpace->topOrigin); // upper plane
610  z1 = clamp(z1, 0, gpuC.cMaxVolumeIndexYZ);
611  z2 = clamp(z2, 0, gpuC.cMaxVolumeIndexYZ);
612  int lower = static_cast<int>(floorf(fminf(z1, z2)));
613  int upper = static_cast<int>(ceilf(fmaxf(z1, z2)));
614  for (int z = lower; z <= upper; z++) {
615  processVoxelBlob<blobOrder, useFastKaiser>(tempVolumeGPU, tempWeightsGPU, idx, idy, z, xSize, ySize, FFT, tSpace, blobTableSqrt, imgCacheDim);
616  }
617  }
618  }
619  }
620  } else if (tSpace->XZ == tSpace->dir) { // iterate XZ plane
621  if (idy >= tSpace->minZ && idy <= tSpace->maxZ) { // map z -> y
622  if (idx >= tSpace->minX && idx <= tSpace->maxX) {
623  if (useFast) {
624  float hitY =getY(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin);
625  int y = (int)(hitY + 0.5f); // rounding
626  processVoxel(tempVolumeGPU, tempWeightsGPU, idx, y, idy, xSize, ySize, FFT, tSpace);
627  } else {
628  float y1 = getY(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin); // lower plane
629  float y2 = getY(idx, idy, tSpace->unitNormal, tSpace->topOrigin); // upper plane
630  y1 = clamp(y1, 0, gpuC.cMaxVolumeIndexYZ);
631  y2 = clamp(y2, 0, gpuC.cMaxVolumeIndexYZ);
632  int lower = static_cast<int>(floorf(fminf(y1, y2)));
633  int upper = static_cast<int>(ceilf(fmaxf(y1, y2)));
634  for (int y = lower; y <= upper; y++) {
635  processVoxelBlob<blobOrder, useFastKaiser>(tempVolumeGPU, tempWeightsGPU, idx, y, idy, xSize, ySize, FFT, tSpace, blobTableSqrt, imgCacheDim);
636  }
637  }
638  }
639  }
640  } else { // iterate YZ plane
641  if (idy >= tSpace->minZ && idy <= tSpace->maxZ) { // map z -> y
642  if (idx >= tSpace->minY && idx <= tSpace->maxY) { // map y > x
643  if (useFast) {
644  float hitX = getX(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin);
645  int x = (int)(hitX + 0.5f); // rounding
646  processVoxel(tempVolumeGPU, tempWeightsGPU, x, idx, idy, xSize, ySize, FFT, tSpace);
647  } else {
648  float x1 = getX(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin); // lower plane
649  float x2 = getX(idx, idy, tSpace->unitNormal, tSpace->topOrigin); // upper plane
650  x1 = clamp(x1, 0, gpuC.cMaxVolumeIndexX);
651  x2 = clamp(x2, 0, gpuC.cMaxVolumeIndexX);
652  int lower = static_cast<int>(floorf(fminf(x1, x2)));
653  int upper = static_cast<int>(ceilf(fmaxf(x1, x2)));
654  for (int x = lower; x <= upper; x++) {
655  processVoxelBlob<blobOrder, useFastKaiser>(tempVolumeGPU, tempWeightsGPU, x, idx, idy, xSize, ySize, FFT, tSpace, blobTableSqrt, imgCacheDim);
656  }
657  }
658  }
659  }
660  }
661 }
enum RecFourierProjectionTraverseSpace::Direction dir
__device__ void processVoxel(float2 *tempVolumeGPU, float *tempWeightsGPU, int x, int y, int z, int xSize, int ySize, const float2 *__restrict__ FFT, const RecFourierProjectionTraverseSpace *const space)
static double * y
__host__ __device__ float getZ(float x, float y, const Point3D< float > &n, const Point3D< float > &p0)
__device__ __constant__ CodeletConstants gpuC
doublereal * x
if(fabs(c[*nmax+ *nmax *c_dim1])==0.e0)
constexpr int TILE
__host__ __device__ float getY(float x, float z, const Point3D< float > &n, const Point3D< float > &p0)
double * f
double z
__host__ __device__ float getX(float y, float z, const Point3D< float > &n, const Point3D< float > &p0)

◆ processProjectionCPU()

template<bool useFast, int blobOrder, bool useFastKaiser, bool usePrecomputedInterpolation>
void processProjectionCPU ( float2 *  tempVolumeGPU,
float *  tempWeightsGPU,
const int  xSize,
const int  ySize,
const float2 *__restrict__  FFT,
const RecFourierProjectionTraverseSpace *const  tSpace,
const float *  blobTableSqrt 
)

Definition at line 1034 of file reconstruct_fourier_codelet_reconstruct.cpp.

1039  {
1040 
1041  if (tSpace->XY == tSpace->dir) { // iterate XY plane
1042  for (int idy = tSpace->minY; idy <= tSpace->maxY; idy++) {
1043  for (int idx = tSpace->minX; idx <= tSpace->maxX; idx++) {
1044  if (useFast) {
1045  float hitZ = getZ(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin);
1046  int z = (int)(hitZ + 0.5f); // rounding
1047  processVoxelCPU(tempVolumeGPU, tempWeightsGPU, idx, idy, z, xSize, ySize, FFT, tSpace);
1048  } else {
1049  float z1 = getZ(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin); // lower plane
1050  float z2 = getZ(idx, idy, tSpace->unitNormal, tSpace->topOrigin); // upper plane
1051  z1 = clamp(z1, 0, cpuC.cMaxVolumeIndexYZ);
1052  z2 = clamp(z2, 0, cpuC.cMaxVolumeIndexYZ);
1053  int lower = static_cast<int>(floorf(fminf(z1, z2)));
1054  int upper = static_cast<int>(ceilf(fmaxf(z1, z2)));
1055  for (int z = lower; z <= upper; z++) {
1056  processVoxelBlobCPU<blobOrder, useFastKaiser, usePrecomputedInterpolation>(tempVolumeGPU, tempWeightsGPU, idx, idy, z, xSize, ySize, FFT, tSpace, blobTableSqrt);
1057  }
1058  }
1059  }
1060  }
1061  } else if (tSpace->XZ == tSpace->dir) { // iterate XZ plane
1062  for (int idy = tSpace->minZ; idy <= tSpace->maxZ; idy++) { // map z -> y
1063  for (int idx = tSpace->minX; idx <= tSpace->maxX; idx++) {
1064  if (useFast) {
1065  float hitY =getY(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin);
1066  int y = (int)(hitY + 0.5f); // rounding
1067  processVoxelCPU(tempVolumeGPU, tempWeightsGPU, idx, y, idy, xSize, ySize, FFT, tSpace);
1068  } else {
1069  float y1 = getY(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin); // lower plane
1070  float y2 = getY(idx, idy, tSpace->unitNormal, tSpace->topOrigin); // upper plane
1071  y1 = clamp(y1, 0, cpuC.cMaxVolumeIndexYZ);
1072  y2 = clamp(y2, 0, cpuC.cMaxVolumeIndexYZ);
1073  int lower = static_cast<int>(floorf(fminf(y1, y2)));
1074  int upper = static_cast<int>(ceilf(fmaxf(y1, y2)));
1075  for (int y = lower; y <= upper; y++) {
1076  processVoxelBlobCPU<blobOrder, useFastKaiser, usePrecomputedInterpolation>(tempVolumeGPU, tempWeightsGPU, idx, y, idy, xSize, ySize, FFT, tSpace, blobTableSqrt);
1077  }
1078  }
1079  }
1080  }
1081  } else { // iterate YZ plane
1082  for (int idy = tSpace->minZ; idy <= tSpace->maxZ; idy++) { // map z -> y
1083  for (int idx = tSpace->minY; idx <= tSpace->maxY; idx++) { // map y > x
1084  if (useFast) {
1085  float hitX = getX(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin);
1086  int x = (int)(hitX + 0.5f); // rounding
1087  processVoxelCPU(tempVolumeGPU, tempWeightsGPU, x, idx, idy, xSize, ySize, FFT, tSpace);
1088  } else {
1089  float x1 = getX(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin); // lower plane
1090  float x2 = getX(idx, idy, tSpace->unitNormal, tSpace->topOrigin); // upper plane
1091  x1 = clamp(x1, 0, cpuC.cMaxVolumeIndexX);
1092  x2 = clamp(x2, 0, cpuC.cMaxVolumeIndexX);
1093  int lower = static_cast<int>(floorf(fminf(x1, x2)));
1094  int upper = static_cast<int>(ceilf(fmaxf(x1, x2)));
1095  for (int x = lower; x <= upper; x++) {
1096  processVoxelBlobCPU<blobOrder, useFastKaiser, usePrecomputedInterpolation>(tempVolumeGPU, tempWeightsGPU, x, idx, idy, xSize, ySize, FFT, tSpace, blobTableSqrt);
1097  }
1098  }
1099  }
1100  }
1101  }
1102 }
enum RecFourierProjectionTraverseSpace::Direction dir
static double * y
__host__ __device__ float getZ(float x, float y, const Point3D< float > &n, const Point3D< float > &p0)
doublereal * x
__host__ __device__ float getY(float x, float z, const Point3D< float > &n, const Point3D< float > &p0)
double * f
double z
void processVoxelCPU(float2 *const tempVolumeGPU, float *const tempWeightsGPU, const int x, const int y, const int z, const int xSize, const int ySize, const float2 *const __restrict__ FFT, const RecFourierProjectionTraverseSpace *const space)
__host__ __device__ float getX(float y, float z, const Point3D< float > &n, const Point3D< float > &p0)

◆ processVoxel()

__device__ void processVoxel ( float2 *  tempVolumeGPU,
float *  tempWeightsGPU,
int  x,
int  y,
int  z,
int  xSize,
int  ySize,
const float2 *__restrict__  FFT,
const RecFourierProjectionTraverseSpace *const  space 
)

Method will map one voxel from the temporal spaces to the given projection and update temporal spaces using the pixel value of the projection.

Definition at line 435 of file reconstruct_fourier_codelet_reconstruct.cpp.

441 {
442  Point3D<float> imgPos;
443  float wBlob = 1.f;
444 
445  float dataWeight = space->weight;
446 
447  // transform current point to center
448  imgPos.x = x - gpuC.cMaxVolumeIndexX/2;
449  imgPos.y = y - gpuC.cMaxVolumeIndexYZ/2;
450  imgPos.z = z - gpuC.cMaxVolumeIndexYZ/2;
451  if (imgPos.x*imgPos.x + imgPos.y*imgPos.y + imgPos.z*imgPos.z > space->maxDistanceSqr) {
452  return; // discard iterations that would access pixel with too high frequency
453  }
454  // rotate around center
455  multiply(space->transformInv, imgPos);
456  if (imgPos.x < 0.f) return; // reading outside of the image boundary. Z is always correct and Y is checked by the condition above
457 
458  // transform back and round
459  // just Y coordinate needs adjusting, since X now matches to picture and Z is irrelevant
460  int imgX = clamp((int)(imgPos.x + 0.5f), 0, xSize - 1);
461  int imgY = clamp((int)(imgPos.y + 0.5f + gpuC.cMaxVolumeIndexYZ / 2), 0, ySize - 1);
462 
463  int index3D = z * (gpuC.cMaxVolumeIndexYZ+1) * (gpuC.cMaxVolumeIndexX+1) + y * (gpuC.cMaxVolumeIndexX+1) + x;
464  int index2D = imgY * xSize + imgX;
465 
466  float weight = wBlob * dataWeight;
467 
468  // use atomic as two blocks can write to same voxel
469  atomicAdd(&tempVolumeGPU[index3D].x, FFT[index2D].x * weight);
470  atomicAdd(&tempVolumeGPU[index3D].y, FFT[index2D].y * weight);
471  atomicAdd(&tempWeightsGPU[index3D], weight);
472 }
static double * y
__device__ __constant__ CodeletConstants gpuC
doublereal * x
T z
Definition: point3D.h:56
T x
Definition: point3D.h:54
double z
__host__ __device__ void multiply(const float transform[3][3], Point3D< float > &inOut)
__device__ double atomicAdd(double *address, double val)
T y
Definition: point3D.h:55

◆ processVoxelBlob()

template<int blobOrder, bool useFastKaiser>
__device__ void processVoxelBlob ( float2 *  tempVolumeGPU,
float *  tempWeightsGPU,
const int  x,
const int  y,
const int  z,
const int  xSize,
const int  ySize,
const float2 *__restrict__  FFT,
const RecFourierProjectionTraverseSpace *const  space,
const float *  blobTableSqrt,
const int  imgCacheDim 
)

Method will map one voxel from the temporal spaces to the given projection and update temporal spaces using the pixel values of the projection withing the blob distance.

Definition at line 481 of file reconstruct_fourier_codelet_reconstruct.cpp.

489 {
490  Point3D<float> imgPos;
491  // transform current point to center
492  imgPos.x = x - gpuC.cMaxVolumeIndexX/2;
493  imgPos.y = y - gpuC.cMaxVolumeIndexYZ/2;
494  imgPos.z = z - gpuC.cMaxVolumeIndexYZ/2;
495  if ((imgPos.x*imgPos.x + imgPos.y*imgPos.y + imgPos.z*imgPos.z) > space->maxDistanceSqr) {
496  return; // discard iterations that would access pixel with too high frequency
497  }
498  // rotate around center
499  multiply(space->transformInv, imgPos);
500  if (imgPos.x < -gpuC.cBlobRadius) return; // reading outside of the image boundary. Z is always correct and Y is checked by the condition above
501  // transform back just Y coordinate, since X now matches to picture and Z is irrelevant
502  imgPos.y += gpuC.cMaxVolumeIndexYZ / 2;
503 
504  // check that we don't want to collect data from far far away ...
505  float radiusSqr = gpuC.cBlobRadius * gpuC.cBlobRadius;
506  float zSqr = imgPos.z * imgPos.z;
507  if (zSqr > radiusSqr) return;
508 
509  // create blob bounding box
510  int minX = ceilf(imgPos.x - gpuC.cBlobRadius);
511  int maxX = floorf(imgPos.x + gpuC.cBlobRadius);
512  int minY = ceilf(imgPos.y - gpuC.cBlobRadius);
513  int maxY = floorf(imgPos.y + gpuC.cBlobRadius);
514  minX = fmaxf(minX, 0);
515  minY = fmaxf(minY, 0);
516  maxX = fminf(maxX, xSize-1);
517  maxY = fminf(maxY, ySize-1);
518 
519  int index3D = z * (gpuC.cMaxVolumeIndexYZ+1) * (gpuC.cMaxVolumeIndexX+1) + y * (gpuC.cMaxVolumeIndexX+1) + x;
520  float2 vol;
521  float w;
522  vol.x = vol.y = w = 0.f;
523  float dataWeight = space->weight;
524 
525  // check which pixel in the vicinity should contribute
526  for (int i = minY; i <= maxY; i++) {
527  float ySqr = (imgPos.y - i) * (imgPos.y - i);
528  float yzSqr = ySqr + zSqr;
529  if (yzSqr > radiusSqr) continue;
530  for (int j = minX; j <= maxX; j++) {
531  float xD = imgPos.x - j;
532  float distanceSqr = xD*xD + yzSqr;
533  if (distanceSqr > radiusSqr) continue;
534 
535 #if SHARED_IMG
536  int index2D = (i - SHARED_AABB[0].y) * imgCacheDim + (j-SHARED_AABB[0].x); // position in img - offset of the AABB
537 #else
538  int index2D = i * xSize + j;
539 #endif
540 
541 #if PRECOMPUTE_BLOB_VAL
542  int aux = (int) ((distanceSqr * gpuC.cIDeltaSqrt + 0.5f));
543 #if SHARED_BLOB_TABLE
544  float wBlob = BLOB_TABLE[aux];
545 #else
546  float wBlob = blobTableSqrt[aux];
547 #endif
548 #else
549  float wBlob;
550  if (useFastKaiser) {
551  wBlob = kaiserValueFast(distanceSqr);
552  }
553  else {
554  wBlob = kaiserValue<blobOrder>(sqrtf(distanceSqr), gpuC.cBlobRadius) * gpuC.cIw0;
555  }
556 #endif
557  float weight = wBlob * dataWeight;
558  w += weight;
559 #if SHARED_IMG
560  vol += IMG[index2D] * weight;
561 #else
562  vol += FFT[index2D] * weight;
563 #endif
564  }
565  }
566 
567  // use atomic as two blocks can write to same voxel
568  atomicAdd(&tempVolumeGPU[index3D].x, vol.x);
569  atomicAdd(&tempVolumeGPU[index3D].y, vol.y);
570  atomicAdd(&tempWeightsGPU[index3D], w);
571 }
static double * y
__shared__ float BLOB_TABLE[BLOB_TABLE_SIZE_SQRT]
doublereal * w
__device__ __constant__ CodeletConstants gpuC
doublereal * x
#define i
T z
Definition: point3D.h:56
T x
Definition: point3D.h:54
double z
__host__ __device__ float kaiserValueFast(float distSqr)
__host__ __device__ void multiply(const float transform[3][3], Point3D< float > &inOut)
#define j
__device__ double atomicAdd(double *address, double val)
T y
Definition: point3D.h:55

◆ processVoxelBlobCPU()

template<int blobOrder, bool useFastKaiser, bool usePrecomputedInterpolation>
void processVoxelBlobCPU ( float2 *const  tempVolumeGPU,
float *const  tempWeightsGPU,
const int  x,
const int  y,
const int  z,
const int  xSize,
const int  ySize,
const float2 *const __restrict__  FFT,
const RecFourierProjectionTraverseSpace *const  space,
const float *  blobTableSqrt 
)

Definition at line 954 of file reconstruct_fourier_codelet_reconstruct.cpp.

961 {
962  Point3D<float> imgPos;
963  // transform current point to center
964  imgPos.x = x - cpuC.cMaxVolumeIndexX/2;
965  imgPos.y = y - cpuC.cMaxVolumeIndexYZ/2;
966  imgPos.z = z - cpuC.cMaxVolumeIndexYZ/2;
967  if ((imgPos.x*imgPos.x + imgPos.y*imgPos.y + imgPos.z*imgPos.z) > space->maxDistanceSqr) {
968  return; // discard iterations that would access pixel with too high frequency
969  }
970  // rotate around center
971  multiply(space->transformInv, imgPos);
972  if (imgPos.x < -cpuC.cBlobRadius) return; // reading outside of the image boundary. Z is always correct and Y is checked by the condition above
973  // transform back just Y coordinate, since X now matches to picture and Z is irrelevant
974  imgPos.y += cpuC.cMaxVolumeIndexYZ / 2;
975 
976  // check that we don't want to collect data from far far away ...
977  float radiusSqr = cpuC.cBlobRadius * cpuC.cBlobRadius;
978  float zSqr = imgPos.z * imgPos.z;
979  if (zSqr > radiusSqr) return;
980 
981  // create blob bounding box
982  int minX = ceilf(imgPos.x - cpuC.cBlobRadius);
983  int maxX = floorf(imgPos.x + cpuC.cBlobRadius);
984  int minY = ceilf(imgPos.y - cpuC.cBlobRadius);
985  int maxY = floorf(imgPos.y + cpuC.cBlobRadius);
986  minX = fmaxf(minX, 0);
987  minY = fmaxf(minY, 0);
988  maxX = fminf(maxX, xSize-1);
989  maxY = fminf(maxY, ySize-1);
990 
991  int index3D = z * (cpuC.cMaxVolumeIndexYZ+1) * (cpuC.cMaxVolumeIndexX+1) + y * (cpuC.cMaxVolumeIndexX+1) + x;
992  float2 vol;
993  float w;
994  vol.x = vol.y = w = 0.f;
995  float dataWeight = space->weight;
996 
997  // check which pixel in the vicinity should contribute
998  for (int i = minY; i <= maxY; i++) {
999  float ySqr = (imgPos.y - i) * (imgPos.y - i);
1000  float yzSqr = ySqr + zSqr;
1001  if (yzSqr > radiusSqr) continue;
1002  for (int j = minX; j <= maxX; j++) {
1003  float xD = imgPos.x - j;
1004  float distanceSqr = xD*xD + yzSqr;
1005  if (distanceSqr > radiusSqr) continue;
1006 
1007  int index2D = i * xSize + j;
1008 
1009  float wBlob;
1010  if (usePrecomputedInterpolation) {
1011  int aux = (int) ((distanceSqr * cpuC.cIDeltaSqrt + 0.5f));
1012  wBlob = blobTableSqrt[aux];
1013  } else if (useFastKaiser) {
1014  wBlob = kaiserValueFast(distanceSqr);
1015  } else {
1016  wBlob = kaiserValue<blobOrder>(sqrtf(distanceSqr), cpuC.cBlobRadius) * cpuC.cIw0;
1017  }
1018 
1019  float weight = wBlob * dataWeight;
1020  w += weight;
1021  vol += FFT[index2D] * weight;
1022  }
1023  }
1024 
1025  atomicAddFloat(&tempVolumeGPU[index3D].x, vol.x);
1026  atomicAddFloat(&tempVolumeGPU[index3D].y, vol.y);
1027  atomicAddFloat(&tempWeightsGPU[index3D], w);
1028  //tempVolumeGPU[index3D].x += vol.x;
1029  //tempVolumeGPU[index3D].y += vol.y;
1030  //tempWeightsGPU[index3D] += w;
1031 }
static double * y
doublereal * w
doublereal * x
#define i
void atomicAddFloat(volatile float *ptr, float addedValue)
T z
Definition: point3D.h:56
T x
Definition: point3D.h:54
double z
__host__ __device__ float kaiserValueFast(float distSqr)
__host__ __device__ void multiply(const float transform[3][3], Point3D< float > &inOut)
#define j
T y
Definition: point3D.h:55

◆ processVoxelCPU()

void processVoxelCPU ( float2 *const  tempVolumeGPU,
float *const  tempWeightsGPU,
const int  x,
const int  y,
const int  z,
const int  xSize,
const int  ySize,
const float2 *const __restrict__  FFT,
const RecFourierProjectionTraverseSpace *const  space 
)

Definition at line 911 of file reconstruct_fourier_codelet_reconstruct.cpp.

917 {
918  Point3D<float> imgPos;
919  float wBlob = 1.f;
920 
921  float dataWeight = space->weight;
922 
923  // transform current point to center
924  imgPos.x = x - cpuC.cMaxVolumeIndexX/2;
925  imgPos.y = y - cpuC.cMaxVolumeIndexYZ/2;
926  imgPos.z = z - cpuC.cMaxVolumeIndexYZ/2;
927  if (imgPos.x*imgPos.x + imgPos.y*imgPos.y + imgPos.z*imgPos.z > space->maxDistanceSqr) {
928  return; // discard iterations that would access pixel with too high frequency
929  }
930  // rotate around center
931  multiply(space->transformInv, imgPos);
932  if (imgPos.x < 0.f) return; // reading outside of the image boundary. Z is always correct and Y is checked by the condition above
933 
934  // transform back and round
935  // just Y coordinate needs adjusting, since X now matches to picture and Z is irrelevant
936  int imgX = clamp((int)(imgPos.x + 0.5f), 0, xSize - 1);
937  int imgY = clamp((int)(imgPos.y + 0.5f + cpuC.cMaxVolumeIndexYZ / 2), 0, ySize - 1);
938 
939  int index3D = z * (cpuC.cMaxVolumeIndexYZ+1) * (cpuC.cMaxVolumeIndexX+1) + y * (cpuC.cMaxVolumeIndexX+1) + x;
940  int index2D = imgY * xSize + imgX;
941 
942  float weight = wBlob * dataWeight;
943 
944  // use atomic as two blocks can write to same voxel
945  atomicAddFloat(&tempVolumeGPU[index3D].x, FFT[index2D].x * weight);
946  atomicAddFloat(&tempVolumeGPU[index3D].y, FFT[index2D].y * weight);
947  atomicAddFloat(&tempWeightsGPU[index3D], weight);
948  //tempVolumeGPU[index3D].x += FFT[index2D].x * weight;
949  //tempVolumeGPU[index3D].y += FFT[index2D].y * weight;
950  //tempWeightsGPU[index3D] += weight;
951 }
static double * y
doublereal * x
void atomicAddFloat(volatile float *ptr, float addedValue)
T z
Definition: point3D.h:56
T x
Definition: point3D.h:54
double z
__host__ __device__ void multiply(const float transform[3][3], Point3D< float > &inOut)
T y
Definition: point3D.h:55

◆ reconstruct_cuda_initialize_constants()

void reconstruct_cuda_initialize_constants ( int  maxVolIndexX,
int  maxVolIndexYZ,
float  blobRadius,
float  blobAlpha,
float  iDeltaSqrt,
float  iw0,
float  oneOverBessiOrderAlpha 
)

Copy constants used for calculation to GPU memory. Blocking operation.

Definition at line 74 of file reconstruct_fourier_codelet_reconstruct.cpp.

77  {
78  CodeletConstants constants = {0};
79  constants.cMaxVolumeIndexX = maxVolIndexX;
80  constants.cMaxVolumeIndexYZ = maxVolIndexYZ;
81  constants.cBlobRadius = blobRadius;
82  constants.cOneOverBlobRadiusSqr = 1.f / (blobRadius * blobRadius);
83  constants.cBlobAlpha = blobAlpha;
84  constants.cIw0 = iw0;
85  constants.cIDeltaSqrt = iDeltaSqrt;
86  constants.cOneOverBessiOrderAlpha = oneOverBessiOrderAlpha;
87 
88  // Fill GPU side
89  // http://starpu.gforge.inria.fr/doc/html/FrequentlyAskedQuestions.html#HowToInitializeAComputationLibraryOnceForEachWorker
90  starpu_execute_on_each_worker(&cuda_set_constants, &constants, STARPU_CUDA);
91 
92  // Fill CPU side
93  memcpy(&cpuC, &constants, sizeof(CodeletConstants));
94 }

◆ rotate()

__host__ __device__ void rotate ( Point3D< float >  box[8],
const float  transform[3][3] 
)

Method will rotate box using transformation matrix around center of the working space

Definition at line 290 of file reconstruct_fourier_codelet_reconstruct.cpp.

290  {
291  const CodeletConstants& c =
292 #ifdef __CUDA_ARCH__
293  gpuC;
294 #else
295  cpuC;
296 #endif
297 
298  for (int i = 0; i < 8; i++) {
299  Point3D<float> imgPos;
300  // transform current point to center
301  imgPos.x = box[i].x - c.cMaxVolumeIndexX/2;
302  imgPos.y = box[i].y - c.cMaxVolumeIndexYZ/2;
303  imgPos.z = box[i].z - c.cMaxVolumeIndexYZ/2;
304  // rotate around center
305  multiply(transform, imgPos);
306  // transform back just Y coordinate, since X now matches to picture and Z is irrelevant
307  imgPos.y += c.cMaxVolumeIndexYZ / 2;
308 
309  box[i] = imgPos;
310  }
311 }
doublereal * c
__device__ __constant__ CodeletConstants gpuC
#define i
T z
Definition: point3D.h:56
T x
Definition: point3D.h:54
__host__ __device__ void multiply(const float transform[3][3], Point3D< float > &inOut)
T y
Definition: point3D.h:55

Variable Documentation

◆ BLOB_TABLE

__shared__ float BLOB_TABLE[BLOB_TABLE_SIZE_SQRT]

Definition at line 46 of file reconstruct_fourier_codelet_reconstruct.cpp.

◆ cpuC

Definition at line 68 of file reconstruct_fourier_codelet_reconstruct.cpp.

◆ gpuC

__device__ __constant__ CodeletConstants gpuC

Definition at line 67 of file reconstruct_fourier_codelet_reconstruct.cpp.