Xmipp  v3.23.11-Nereus
reconstruct_fourier_codelet_reconstruct.cpp
1 /***************************************************************************
2  *
3  * Authors: David Strelak (davidstrelak@gmail.com)
4  * Jan Polak (456647@mail.muni.cz)
5  *
6  * Unidad de Bioinformatica of Centro Nacional de Biotecnologia , CSIC
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation; either version 2 of the License, or
11  * (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program; if not, write to the Free Software
20  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21  * 02111-1307 USA
22  *
23  * All comments concerning this program package may be sent to the
24  * e-mail address 'xmipp@cnb.csic.es'
25  ***************************************************************************/
26 
27 #include <atomic>
28 #include <core/multidim_array.h>
29 #include <core/xmipp_fft.h>
30 #include <core/xmipp_fftw.h>
32 
36 
37 #include <cuda_runtime_api.h>
38 #include <starpu.h>
39 
42 
43 // ======================================= Fields and their manipulation =====================================
44 
45 #if SHARED_BLOB_TABLE
46 __shared__ float BLOB_TABLE[BLOB_TABLE_SIZE_SQRT];
47 #endif
48 
49 #if SHARED_IMG
50 __shared__ Point3D<float> SHARED_AABB[2];
51 extern __shared__ float2 IMG[];
52 #endif
53 
54  // NOTE(jp): These constants are used exclusively in this file. They were originally kept in constant storage, so that is where they have stayed.
55  // StarPU would probably prefer to receive them as codelet arguments, but that would complicate the code considerably, and the performance impact of doing so is unclear.
56 struct CodeletConstants {
57  int cMaxVolumeIndexX;
58  int cMaxVolumeIndexYZ;
59  float cBlobRadius;
60  float cOneOverBlobRadiusSqr;
61  float cBlobAlpha;
62  float cIw0;
63  float cIDeltaSqrt;
64  float cOneOverBessiOrderAlpha;
65 };
66 
67 __device__ __constant__ CodeletConstants gpuC;
68 CodeletConstants cpuC;
69 
70 static void cuda_set_constants(void *gpuConstants) {
71  gpuErrchk(cudaMemcpyToSymbol(gpuC, gpuConstants, sizeof(CodeletConstants)));
72 }
73 
74 void reconstruct_cuda_initialize_constants(
75  int maxVolIndexX, int maxVolIndexYZ,
76  float blobRadius, float blobAlpha,
77  float iDeltaSqrt, float iw0, float oneOverBessiOrderAlpha) {
78  CodeletConstants constants = {0};
79  constants.cMaxVolumeIndexX = maxVolIndexX;
80  constants.cMaxVolumeIndexYZ = maxVolIndexYZ;
81  constants.cBlobRadius = blobRadius;
82  constants.cOneOverBlobRadiusSqr = 1.f / (blobRadius * blobRadius);
83  constants.cBlobAlpha = blobAlpha;
84  constants.cIw0 = iw0;
85  constants.cIDeltaSqrt = iDeltaSqrt;
86  constants.cOneOverBessiOrderAlpha = oneOverBessiOrderAlpha;
87 
88  // Fill GPU side
89  // http://starpu.gforge.inria.fr/doc/html/FrequentlyAskedQuestions.html#HowToInitializeAComputationLibraryOnceForEachWorker
90  starpu_execute_on_each_worker(&cuda_set_constants, &constants, STARPU_CUDA);
91 
92  // Fill CPU side
93  memcpy(&cpuC, &constants, sizeof(CodeletConstants));
94 }
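// Usage note: the constants above must be uploaded before any reconstruction codelet is submitted,
// since both the CUDA kernels (via gpuC) and the CPU codelets (via cpuC) read them. A minimal
// sketch of the call (the argument names here are illustrative, not taken from the actual caller):
//
//   reconstruct_cuda_initialize_constants(maxVolIndexX, maxVolIndexYZ,
//                                         blobRadius, blobAlpha,
//                                         iDeltaSqrt, iw0, oneOverBessiOrderAlpha);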
95 
96 // ========================================= Bessi Kaiser math ====================================
97 
98 __host__ __device__
99 float bessi0Fast(float x) { // X must be <= 15
100  // stable rational minimax approximations to the modified bessel functions, blair, edwards
101  // from table 5
102  float x2 = x*x;
103  float num = -0.8436825781374849e-19f; // p11
104  num = fmaf(num, x2, -0.93466495199548700e-17f); // p10
105  num = fmaf(num, x2, -0.15716375332511895e-13f); // p09
106  num = fmaf(num, x2, -0.42520971595532318e-11f); // p08
107  num = fmaf(num, x2, -0.13704363824102120e-8f); // p07
108  num = fmaf(num, x2, -0.28508770483148419e-6f); // p06
109  num = fmaf(num, x2, -0.44322160233346062e-4f); // p05
110  num = fmaf(num, x2, -0.46703811755736946e-2f); // p04
111  num = fmaf(num, x2, -0.31112484643702141e-0f); // p03
112  num = fmaf(num, x2, -0.11512633616429962e+2f); // p02
113  num = fmaf(num, x2, -0.18720283332732112e+3f); // p01
114  num = fmaf(num, x2, -0.75281108169006924e+3f); // p00
115 
116  float den = 1.f; // q01
117  den = fmaf(den, x2, -0.75281109410939403e+3f); // q00
118 
119  return num/den;
120 }
121 
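// Modified Bessel functions of the first kind, I0..I4. bessi0 and bessi1 use the classic
// Abramowitz & Stegun polynomial approximations; the higher orders below follow from the
// recurrence I_{n+1}(x) = I_{n-1}(x) - (2n/x) * I_n(x), as implemented by bessi2..bessi4.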
122 __host__ __device__
123 float bessi0(float x) {
124  float y, ax, ans;
125  if ((ax = fabsf(x)) < 3.75f)
126  {
127  y = x / 3.75f;
128  y *= y;
129  ans = 1.f + y * (3.5156229f + y * (3.0899424f + y * (1.2067492f
130  + y * (0.2659732f + y * (0.360768e-1f + y * 0.45813e-2f)))));
131  }
132  else
133  {
134  y = 3.75f / ax;
135  ans = (expf(ax) * rsqrtf(ax)) * (0.39894228f + y * (0.1328592e-1f
136  + y * (0.225319e-2f + y * (-0.157565e-2f + y * (0.916281e-2f
137  + y * (-0.2057706e-1f + y * (0.2635537e-1f + y * (-0.1647633e-1f
138  + y * 0.392377e-2f))))))));
139  }
140  return ans;
141 }
142 
143 __host__ __device__
144 float bessi1(float x) {
145  float ax, ans;
146  float y;
147  if ((ax = fabsf(x)) < 3.75f)
148  {
149  y = x / 3.75f;
150  y *= y;
151  ans = ax * (0.5f + y * (0.87890594f + y * (0.51498869f + y * (0.15084934f
152  + y * (0.2658733e-1f + y * (0.301532e-2f + y * 0.32411e-3f))))));
153  }
154  else
155  {
156  y = 3.75f / ax;
157  ans = 0.2282967e-1f + y * (-0.2895312e-1f + y * (0.1787654e-1f
158  - y * 0.420059e-2f));
159  ans = 0.39894228f + y * (-0.3988024e-1f + y * (-0.362018e-2f
160  + y * (0.163801e-2f + y * (-0.1031555e-1f + y * ans))));
161  ans *= (expf(ax) * rsqrtf(ax));
162  }
163  return x < 0.0 ? -ans : ans;
164 }
165 
166 __host__ __device__
167 float bessi2(float x) {
168  return (x == 0) ? 0 : bessi0(x) - ((2*1) / x) * bessi1(x);
169 }
170 
171 __host__ __device__
172 float bessi3(float x) {
173  return (x == 0) ? 0 : bessi1(x) - ((2*2) / x) * bessi2(x);
174 }
175 
176 __host__ __device__
177 float bessi4(float x) {
178  return (x == 0) ? 0 : bessi2(x) - ((2*3) / x) * bessi3(x);
179 }
180 
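// Kaiser-Bessel (blob) window of the given order m, evaluated at radius r with support a:
// w(r) = (sqrt(1 - (r/a)^2))^m * I_m(alpha * sqrt(1 - (r/a)^2)) * cOneOverBessiOrderAlpha,
// where cOneOverBessiOrderAlpha holds the precomputed 1/I_m(alpha). For r > a the window is 0.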
181 template<int order>
182 __host__ __device__
183 float kaiserValue(float r, float a) {
184  const CodeletConstants& c =
185 #ifdef __CUDA_ARCH__
186  gpuC;
187 #else
188  cpuC;
189 #endif
190 
191  float w;
192  float rda = r / a;
193  if (rda <= 1.f)
194  {
195  float rdas = rda * rda;
196  float arg = c.cBlobAlpha * sqrtf(1.f - rdas);
197  if (order == 0)
198  {
199  w = bessi0(arg) * c.cOneOverBessiOrderAlpha;
200  }
201  else if (order == 1)
202  {
203  w = sqrtf (1.f - rdas);
204  w *= bessi1(arg) * c.cOneOverBessiOrderAlpha;
205  }
206  else if (order == 2)
207  {
208  w = sqrtf (1.f - rdas);
209  w = w * w;
210  w *= bessi2(arg) * c.cOneOverBessiOrderAlpha;
211  }
212  else if (order == 3)
213  {
214  w = sqrtf (1.f - rdas);
215  w = w * w * w;
216  w *= bessi3(arg) * c.cOneOverBessiOrderAlpha;
217  }
218  else if (order == 4)
219  {
220  w = sqrtf (1.f - rdas);
221  w = w * w * w * w;
222  w *= bessi4(arg) * c.cOneOverBessiOrderAlpha;
223  }
224  else {
225  printf("order (%d) out of range in kaiser_value(): %s, %d\n", order, __FILE__, __LINE__);
226  w = 0.f;
227  }
228  }
229  else
230  w = 0.f;
231 
232  return w;
233 }
234 
235 __host__ __device__
236 float kaiserValueFast(float distSqr) {
237  const CodeletConstants& c =
238 #ifdef __CUDA_ARCH__
239  gpuC;
240 #else
241  cpuC;
242 #endif
243 
244  float arg = c.cBlobAlpha * sqrtf(1.f - (distSqr * c.cOneOverBlobRadiusSqr)); // alpha * sqrt(1 - (dist/blobRadius)^2)
245  return bessi0Fast(arg) * c.cOneOverBessiOrderAlpha * c.cIw0;
246 }
247 
248 // ========================================= Math utilities ====================================
249 
250 
251 
253 __host__ __device__
254 float getZ(float x, float y, const Point3D<float>& n, const Point3D<float>& p0) {
255  // from a(x-x0)+b(y-y0)+c(z-z0)=0
256  return (-n.x*(x-p0.x)-n.y*(y-p0.y))/n.z + p0.z;
257 }
258 
260 __host__ __device__
261 float getY(float x, float z, const Point3D<float>& n, const Point3D<float>& p0){
262  // from a(x-x0)+b(y-y0)+c(z-z0)=0
263  return (-n.x*(x-p0.x)-n.z*(z-p0.z))/n.y + p0.y;
264 }
265 
266 
268 __host__ __device__
269 float getX(float y, float z, const Point3D<float>& n, const Point3D<float>& p0){
270  // from a(x-x0)+b(y-y0)+c(z-z0)=0
271  return (-n.y*(y-p0.y)-n.z*(z-p0.z))/n.x + p0.x;
272 }
273 
275 __host__ __device__
276 void multiply(const float transform[3][3], Point3D<float>& inOut) {
277  float tmp0 = transform[0][0] * inOut.x + transform[0][1] * inOut.y + transform[0][2] * inOut.z;
278  float tmp1 = transform[1][0] * inOut.x + transform[1][1] * inOut.y + transform[1][2] * inOut.z;
279  float tmp2 = transform[2][0] * inOut.x + transform[2][1] * inOut.y + transform[2][2] * inOut.z;
280  inOut.x = tmp0;
281  inOut.y = tmp1;
282  inOut.z = tmp2;
283 }
284 
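// Rotate the eight corners of the working cuboid by the given transform. Each corner is first
// shifted so that the rotation happens around the volume centre; afterwards only the Y coordinate
// is shifted back, since X already matches the image and Z is irrelevant for the projection.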
289 __host__ __device__
290 void rotate(Point3D<float> box[8], const float transform[3][3]) {
291  const CodeletConstants& c =
292 #ifdef __CUDA_ARCH__
293  gpuC;
294 #else
295  cpuC;
296 #endif
297 
298  for (int i = 0; i < 8; i++) {
299  Point3D<float> imgPos;
300  // transform current point to center
301  imgPos.x = box[i].x - c.cMaxVolumeIndexX/2;
302  imgPos.y = box[i].y - c.cMaxVolumeIndexYZ/2;
303  imgPos.z = box[i].z - c.cMaxVolumeIndexYZ/2;
304  // rotate around center
305  multiply(transform, imgPos);
306  // transform back just the Y coordinate, since X already matches the image and Z is irrelevant
307  imgPos.y += c.cMaxVolumeIndexYZ / 2;
308 
309  box[i] = imgPos;
310  }
311 }
312 
313 // ========================================= AABBs ====================================
314 
316 __host__ __device__
317 void computeAABB(Point3D<float> AABB[2], const Point3D<float> cuboid[8]) {
318  AABB[0].x = AABB[0].y = AABB[0].z = INFINITY;
319  AABB[1].x = AABB[1].y = AABB[1].z = -INFINITY;
320  for (int i = 0; i < 8; i++) {
321  Point3D<float> tmp = cuboid[i];
322  if (AABB[0].x > tmp.x) AABB[0].x = tmp.x;
323  if (AABB[0].y > tmp.y) AABB[0].y = tmp.y;
324  if (AABB[0].z > tmp.z) AABB[0].z = tmp.z;
325  if (AABB[1].x < tmp.x) AABB[1].x = tmp.x;
326  if (AABB[1].y < tmp.y) AABB[1].y = tmp.y;
327  if (AABB[1].z < tmp.z) AABB[1].z = tmp.z;
328  }
329  AABB[0].x = ceilf(AABB[0].x);
330  AABB[0].y = ceilf(AABB[0].y);
331  AABB[0].z = ceilf(AABB[0].z);
332 
333  AABB[1].x = floorf(AABB[1].x);
334  AABB[1].y = floorf(AABB[1].y);
335  AABB[1].z = floorf(AABB[1].z);
336 }
337 
338 #if SHARED_IMG
339 
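// Compute the axis-aligned bounding box (in image space) of the area that the current thread
// block will read: the block's footprint, padded by the blob radius, is intersected with the
// bottom/top planes of the traverse space, rotated to the image domain and enclosed in an AABB.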
345 __device__
346 void calculateAABB(const RecFourierProjectionTraverseSpace* const tSpace, Point3D<float>* dest) {
347  const CodeletConstants& c =
348 #ifdef __CUDA_ARCH__
349  gpuC;
350 #else
351  cpuC;
352 #endif
353 
354  Point3D<float> box[8];
355  // calculate AABB for the whole working block
356  if (tSpace->XY == tSpace->dir) { // iterate XY plane
357  box[0].x = box[3].x = box[4].x = box[7].x = blockIdx.x*blockDim.x - c.cBlobRadius;
358  box[1].x = box[2].x = box[5].x = box[6].x = (blockIdx.x+1)*blockDim.x + c.cBlobRadius - 1.f;
359 
360  box[2].y = box[3].y = box[6].y = box[7].y = (blockIdx.y+1)*blockDim.y + c.cBlobRadius - 1.f;
361  box[0].y = box[1].y = box[4].y = box[5].y = blockIdx.y*blockDim.y - c.cBlobRadius;
362 
363  box[0].z = getZ(box[0].x, box[0].y, tSpace->unitNormal, tSpace->bottomOrigin);
364  box[4].z = getZ(box[4].x, box[4].y, tSpace->unitNormal, tSpace->topOrigin);
365 
366  box[3].z = getZ(box[3].x, box[3].y, tSpace->unitNormal, tSpace->bottomOrigin);
367  box[7].z = getZ(box[7].x, box[7].y, tSpace->unitNormal, tSpace->topOrigin);
368 
369  box[2].z = getZ(box[2].x, box[2].y, tSpace->unitNormal, tSpace->bottomOrigin);
370  box[6].z = getZ(box[6].x, box[6].y, tSpace->unitNormal, tSpace->topOrigin);
371 
372  box[1].z = getZ(box[1].x, box[1].y, tSpace->unitNormal, tSpace->bottomOrigin);
373  box[5].z = getZ(box[5].x, box[5].y, tSpace->unitNormal, tSpace->topOrigin);
374  } else if (tSpace->XZ == tSpace->dir) { // iterate XZ plane
375  box[0].x = box[3].x = box[4].x = box[7].x = blockIdx.x*blockDim.x - c.cBlobRadius;
376  box[1].x = box[2].x = box[5].x = box[6].x = (blockIdx.x+1)*blockDim.x + c.cBlobRadius - 1.f;
377 
378  box[2].z = box[3].z = box[6].z = box[7].z = (blockIdx.y+1)*blockDim.y + c.cBlobRadius - 1.f;
379  box[0].z = box[1].z = box[4].z = box[5].z = blockIdx.y*blockDim.y - c.cBlobRadius;
380 
381  box[0].y = getY(box[0].x, box[0].z, tSpace->unitNormal, tSpace->bottomOrigin);
382  box[4].y = getY(box[4].x, box[4].z, tSpace->unitNormal, tSpace->topOrigin);
383 
384  box[3].y = getY(box[3].x, box[3].z, tSpace->unitNormal, tSpace->bottomOrigin);
385  box[7].y = getY(box[7].x, box[7].z, tSpace->unitNormal, tSpace->topOrigin);
386 
387  box[2].y = getY(box[2].x, box[2].z, tSpace->unitNormal, tSpace->bottomOrigin);
388  box[6].y = getY(box[6].x, box[6].z, tSpace->unitNormal, tSpace->topOrigin);
389 
390  box[1].y = getY(box[1].x, box[1].z, tSpace->unitNormal, tSpace->bottomOrigin);
391  box[5].y = getY(box[5].x, box[5].z, tSpace->unitNormal, tSpace->topOrigin);
392  } else { // iterate YZ plane
393  box[0].y = box[3].y = box[4].y = box[7].y = blockIdx.x*blockDim.x - c.cBlobRadius;
394  box[1].y = box[2].y = box[5].y = box[6].y = (blockIdx.x+1)*blockDim.x + c.cBlobRadius - 1.f;
395 
396  box[2].z = box[3].z = box[6].z = box[7].z = (blockIdx.y+1)*blockDim.y + c.cBlobRadius - 1.f;
397  box[0].z = box[1].z = box[4].z = box[5].z = blockIdx.y*blockDim.y - c.cBlobRadius;
398 
399  box[0].x = getX(box[0].y, box[0].z, tSpace->unitNormal, tSpace->bottomOrigin);
400  box[4].x = getX(box[4].y, box[4].z, tSpace->unitNormal, tSpace->topOrigin);
401 
402  box[3].x = getX(box[3].y, box[3].z, tSpace->unitNormal, tSpace->bottomOrigin);
403  box[7].x = getX(box[7].y, box[7].z, tSpace->unitNormal, tSpace->topOrigin);
404 
405  box[2].x = getX(box[2].y, box[2].z, tSpace->unitNormal, tSpace->bottomOrigin);
406  box[6].x = getX(box[6].y, box[6].z, tSpace->unitNormal, tSpace->topOrigin);
407 
408  box[1].x = getX(box[1].y, box[1].z, tSpace->unitNormal, tSpace->bottomOrigin);
409  box[5].x = getX(box[5].y, box[5].z, tSpace->unitNormal, tSpace->topOrigin);
410  }
411  // transform AABB to the image domain
412  rotate(box, tSpace->transformInv);
413  // AABB is projected on image. Create new AABB that will encompass all vertices
414  computeAABB(dest, box);
415 }
416 
418 __device__
419 bool isWithin(Point3D<float> AABB[2], int imgXSize, int imgYSize) {
420  return (AABB[0].x < imgXSize)
421  && (AABB[1].x >= 0)
422  && (AABB[0].y < imgYSize)
423  && (AABB[1].y >= 0);
424 }
425 #endif
426 
427 // ========================================= Actual processing ====================================
428 
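// 'Fast' voxel update: back-project voxel (x, y, z) onto the image plane and add the single
// nearest FFT sample, weighted by the projection weight, to the voxel and its weight counter.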
434 __device__
435 void processVoxel(
436  float2* tempVolumeGPU, float* tempWeightsGPU,
437  int x, int y, int z,
438  int xSize, int ySize,
439  const float2* __restrict__ FFT,
440  const RecFourierProjectionTraverseSpace* const space)
441 {
442  Point3D<float> imgPos;
443  float wBlob = 1.f;
444 
445  float dataWeight = space->weight;
446 
447  // transform current point to center
448  imgPos.x = x - gpuC.cMaxVolumeIndexX/2;
449  imgPos.y = y - gpuC.cMaxVolumeIndexYZ/2;
450  imgPos.z = z - gpuC.cMaxVolumeIndexYZ/2;
451  if (imgPos.x*imgPos.x + imgPos.y*imgPos.y + imgPos.z*imgPos.z > space->maxDistanceSqr) {
452  return; // discard iterations that would access pixel with too high frequency
453  }
454  // rotate around center
455  multiply(space->transformInv, imgPos);
456  if (imgPos.x < 0.f) return; // reading outside of the image boundary. Z is always correct and Y is checked by the condition above
457 
458  // transform back and round
459  // only the Y coordinate needs adjusting, since X already matches the image and Z is irrelevant
460  int imgX = clamp((int)(imgPos.x + 0.5f), 0, xSize - 1);
461  int imgY = clamp((int)(imgPos.y + 0.5f + gpuC.cMaxVolumeIndexYZ / 2), 0, ySize - 1);
462 
463  int index3D = z * (gpuC.cMaxVolumeIndexYZ+1) * (gpuC.cMaxVolumeIndexX+1) + y * (gpuC.cMaxVolumeIndexX+1) + x;
464  int index2D = imgY * xSize + imgX;
465 
466  float weight = wBlob * dataWeight;
467 
468  // use atomic as two blocks can write to same voxel
469  atomicAdd(&tempVolumeGPU[index3D].x, FFT[index2D].x * weight);
470  atomicAdd(&tempVolumeGPU[index3D].y, FFT[index2D].y * weight);
471  atomicAdd(&tempWeightsGPU[index3D], weight);
472 }
473 
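// Blob-based voxel update: back-project voxel (x, y, z) onto the image plane and accumulate all
// FFT samples within the blob radius, each weighted by the Kaiser-Bessel window (taken from the
// precomputed table, the fast Kaiser approximation, or evaluated directly) and the projection weight.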
479 template<int blobOrder, bool useFastKaiser>
480 __device__
481 void processVoxelBlob(
482  float2* tempVolumeGPU, float *tempWeightsGPU,
483  const int x, const int y, const int z,
484  const int xSize, const int ySize,
485  const float2* __restrict__ FFT,
486  const RecFourierProjectionTraverseSpace* const space,
487  const float* blobTableSqrt,
488  const int imgCacheDim)
489 {
490  Point3D<float> imgPos;
491  // transform current point to center
492  imgPos.x = x - gpuC.cMaxVolumeIndexX/2;
493  imgPos.y = y - gpuC.cMaxVolumeIndexYZ/2;
494  imgPos.z = z - gpuC.cMaxVolumeIndexYZ/2;
495  if ((imgPos.x*imgPos.x + imgPos.y*imgPos.y + imgPos.z*imgPos.z) > space->maxDistanceSqr) {
496  return; // discard iterations that would access pixel with too high frequency
497  }
498  // rotate around center
499  multiply(space->transformInv, imgPos);
500  if (imgPos.x < -gpuC.cBlobRadius) return; // reading outside of the image boundary. Z is always correct and Y is checked by the condition above
501  // transform back just the Y coordinate, since X already matches the image and Z is irrelevant
502  imgPos.y += gpuC.cMaxVolumeIndexYZ / 2;
503 
504  // check that we don't want to collect data from far far away ...
505  float radiusSqr = gpuC.cBlobRadius * gpuC.cBlobRadius;
506  float zSqr = imgPos.z * imgPos.z;
507  if (zSqr > radiusSqr) return;
508 
509  // create blob bounding box
510  int minX = ceilf(imgPos.x - gpuC.cBlobRadius);
511  int maxX = floorf(imgPos.x + gpuC.cBlobRadius);
512  int minY = ceilf(imgPos.y - gpuC.cBlobRadius);
513  int maxY = floorf(imgPos.y + gpuC.cBlobRadius);
514  minX = fmaxf(minX, 0);
515  minY = fmaxf(minY, 0);
516  maxX = fminf(maxX, xSize-1);
517  maxY = fminf(maxY, ySize-1);
518 
519  int index3D = z * (gpuC.cMaxVolumeIndexYZ+1) * (gpuC.cMaxVolumeIndexX+1) + y * (gpuC.cMaxVolumeIndexX+1) + x;
520  float2 vol;
521  float w;
522  vol.x = vol.y = w = 0.f;
523  float dataWeight = space->weight;
524 
525  // check which pixel in the vicinity should contribute
526  for (int i = minY; i <= maxY; i++) {
527  float ySqr = (imgPos.y - i) * (imgPos.y - i);
528  float yzSqr = ySqr + zSqr;
529  if (yzSqr > radiusSqr) continue;
530  for (int j = minX; j <= maxX; j++) {
531  float xD = imgPos.x - j;
532  float distanceSqr = xD*xD + yzSqr;
533  if (distanceSqr > radiusSqr) continue;
534 
535 #if SHARED_IMG
536  int index2D = (i - SHARED_AABB[0].y) * imgCacheDim + (j-SHARED_AABB[0].x); // position in img - offset of the AABB
537 #else
538  int index2D = i * xSize + j;
539 #endif
540 
541 #if PRECOMPUTE_BLOB_VAL
542  int aux = (int) ((distanceSqr * gpuC.cIDeltaSqrt + 0.5f));
543 #if SHARED_BLOB_TABLE
544  float wBlob = BLOB_TABLE[aux];
545 #else
546  float wBlob = blobTableSqrt[aux];
547 #endif
548 #else
549  float wBlob;
550  if (useFastKaiser) {
551  wBlob = kaiserValueFast(distanceSqr);
552  }
553  else {
554  wBlob = kaiserValue<blobOrder>(sqrtf(distanceSqr), gpuC.cBlobRadius) * gpuC.cIw0;
555  }
556 #endif
557  float weight = wBlob * dataWeight;
558  w += weight;
559 #if SHARED_IMG
560  vol += IMG[index2D] * weight;
561 #else
562  vol += FFT[index2D] * weight;
563 #endif
564  }
565  }
566 
567  // use atomic as two blocks can write to same voxel
568  atomicAdd(&tempVolumeGPU[index3D].x, vol.x);
569  atomicAdd(&tempVolumeGPU[index3D].y, vol.y);
570  atomicAdd(&tempWeightsGPU[index3D], w);
571 }
572 
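// Process one projection: each thread handles one 2D position of the traversed plane (XY, XZ or YZ,
// as dictated by the traverse space) and walks the column of voxels between the bottom and top
// planes, using either the fast nearest-sample update or the blob update above.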
577 template<bool useFast, int blobOrder, bool useFastKaiser>
578 __device__
579 void processProjection(
580  float2* tempVolumeGPU, float *tempWeightsGPU,
581  const int xSize, const int ySize,
582  const float2* __restrict__ FFT,
583  const RecFourierProjectionTraverseSpace* const tSpace,
584  const float* blobTableSqrt,
585  const int imgCacheDim)
586 {
587  // map thread to each (2D) voxel
588 #if TILE > 1
589  int id = threadIdx.y * blockDim.x + threadIdx.x;
590  int tidX = threadIdx.x % TILE + (id / (blockDim.y * TILE)) * TILE;
591  int tidY = (id / TILE) % blockDim.y;
592  int idx = blockIdx.x * blockDim.x + tidX;
593  int idy = blockIdx.y * blockDim.y + tidY;
594 #else
595  // map thread to each (2D) voxel
596  volatile int idx = blockIdx.x*blockDim.x + threadIdx.x;
597  volatile int idy = blockIdx.y*blockDim.y + threadIdx.y;
598 #endif
599 
600  if (tSpace->XY == tSpace->dir) { // iterate XY plane
601  if (idy >= tSpace->minY && idy <= tSpace->maxY) {
602  if (idx >= tSpace->minX && idx <= tSpace->maxX) {
603  if (useFast) {
604  float hitZ = getZ(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin);
605  int z = (int)(hitZ + 0.5f); // rounding
606  processVoxel(tempVolumeGPU, tempWeightsGPU, idx, idy, z, xSize, ySize, FFT, tSpace);
607  } else {
608  float z1 = getZ(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin); // lower plane
609  float z2 = getZ(idx, idy, tSpace->unitNormal, tSpace->topOrigin); // upper plane
610  z1 = clamp(z1, 0, gpuC.cMaxVolumeIndexYZ);
611  z2 = clamp(z2, 0, gpuC.cMaxVolumeIndexYZ);
612  int lower = static_cast<int>(floorf(fminf(z1, z2)));
613  int upper = static_cast<int>(ceilf(fmaxf(z1, z2)));
614  for (int z = lower; z <= upper; z++) {
615  processVoxelBlob<blobOrder, useFastKaiser>(tempVolumeGPU, tempWeightsGPU, idx, idy, z, xSize, ySize, FFT, tSpace, blobTableSqrt, imgCacheDim);
616  }
617  }
618  }
619  }
620  } else if (tSpace->XZ == tSpace->dir) { // iterate XZ plane
621  if (idy >= tSpace->minZ && idy <= tSpace->maxZ) { // map z -> y
622  if (idx >= tSpace->minX && idx <= tSpace->maxX) {
623  if (useFast) {
624  float hitY = getY(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin);
625  int y = (int)(hitY + 0.5f); // rounding
626  processVoxel(tempVolumeGPU, tempWeightsGPU, idx, y, idy, xSize, ySize, FFT, tSpace);
627  } else {
628  float y1 = getY(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin); // lower plane
629  float y2 = getY(idx, idy, tSpace->unitNormal, tSpace->topOrigin); // upper plane
630  y1 = clamp(y1, 0, gpuC.cMaxVolumeIndexYZ);
631  y2 = clamp(y2, 0, gpuC.cMaxVolumeIndexYZ);
632  int lower = static_cast<int>(floorf(fminf(y1, y2)));
633  int upper = static_cast<int>(ceilf(fmaxf(y1, y2)));
634  for (int y = lower; y <= upper; y++) {
635  processVoxelBlob<blobOrder, useFastKaiser>(tempVolumeGPU, tempWeightsGPU, idx, y, idy, xSize, ySize, FFT, tSpace, blobTableSqrt, imgCacheDim);
636  }
637  }
638  }
639  }
640  } else { // iterate YZ plane
641  if (idy >= tSpace->minZ && idy <= tSpace->maxZ) { // map z -> y
642  if (idx >= tSpace->minY && idx <= tSpace->maxY) { // map y -> x
643  if (useFast) {
644  float hitX = getX(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin);
645  int x = (int)(hitX + 0.5f); // rounding
646  processVoxel(tempVolumeGPU, tempWeightsGPU, x, idx, idy, xSize, ySize, FFT, tSpace);
647  } else {
648  float x1 = getX(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin); // lower plane
649  float x2 = getX(idx, idy, tSpace->unitNormal, tSpace->topOrigin); // upper plane
650  x1 = clamp(x1, 0, gpuC.cMaxVolumeIndexX);
651  x2 = clamp(x2, 0, gpuC.cMaxVolumeIndexX);
652  int lower = static_cast<int>(floorf(fminf(x1, x2)));
653  int upper = static_cast<int>(ceilf(fmaxf(x1, x2)));
654  for (int x = lower; x <= upper; x++) {
655  processVoxelBlob<blobOrder, useFastKaiser>(tempVolumeGPU, tempWeightsGPU, x, idx, idy, xSize, ySize, FFT, tSpace, blobTableSqrt, imgCacheDim);
656  }
657  }
658  }
659  }
660  }
661 }
662 
669 __device__
670 void getImgData(const Point3D<float> AABB[2],
671  const int tXindex, const int tYindex,
672  const float2* FFTs, const int fftSizeX, const int fftSizeY, const int imgIndex,
673  float2& vComplex) {
674  int imgXindex = tXindex + static_cast<int>(AABB[0].x);
675  int imgYindex = tYindex + static_cast<int>(AABB[0].y);
676  if ((imgXindex >= 0)
677  && (imgXindex < fftSizeX)
678  && (imgYindex >=0)
679  && (imgYindex < fftSizeY)) {
680  int index = imgYindex * fftSizeX + imgXindex; // copy data from image
681  vComplex = (FFTs + fftSizeX * fftSizeY * imgIndex)[index];
682  } else {
683  vComplex = {0.f, 0.f}; // out of image bound, so return zero
684  }
685 }
686 
694 __device__
695 void copyImgToCache(float2* dest, const Point3D<float> AABB[2],
696  const float2* FFTs, const int fftSizeX, const int fftSizeY, const int imgIndex,
697  const int imgCacheDim) {
698  for (int y = threadIdx.y; y < imgCacheDim; y += blockDim.y) {
699  for (int x = threadIdx.x; x < imgCacheDim; x += blockDim.x) {
700  int memIndex = y * imgCacheDim + x;
701  getImgData(AABB, x, y, FFTs, fftSizeX, fftSizeY, imgIndex, dest[memIndex]);
702  }
703  }
704 }
705 
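// Main reconstruction kernel. Optionally caches the blob table in shared memory, then each block
// iterates over the traverse spaces assigned to its blockIdx.z, optionally copies the relevant
// image window to shared memory, and back-projects it into the output volume and weight buffers.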
710 template<bool fastLateBlobbing, int blobOrder, bool useFastKaiser>
711 __global__
712 void processBufferKernel(
713  float2* outVolumeBuffer, float *outWeightsBuffer,
714  const int fftSizeX, const int fftSizeY,
715  const int traverseSpaceCount, const RecFourierProjectionTraverseSpace* traverseSpaces,
716  const float2* FFTs,
717  const float* blobTableSqrt,
718  int imgCacheDim) {
719 
720 #if SHARED_BLOB_TABLE
721  if ( ! fastLateBlobbing) {
722  // copy blob table to shared memory
723  volatile int id = threadIdx.y*blockDim.x + threadIdx.x;
724  volatile int blockSize = blockDim.x * blockDim.y;
725  for (int i = id; i < BLOB_TABLE_SIZE_SQRT; i+= blockSize)
726  BLOB_TABLE[i] = blobTableSqrt[i];
727  __syncthreads();
728  }
729 #endif
730 
731  for (int i = blockIdx.z; i < traverseSpaceCount; i += gridDim.z) {
732  const RecFourierProjectionTraverseSpace& space = traverseSpaces[i];
733 
734 #if SHARED_IMG
735  if ( ! fastLateBlobbing) {
736  // make sure that all threads start at the same time
737  // as they can come from previous iteration
738  __syncthreads();
739  if ((threadIdx.x == 0) && (threadIdx.y == 0)) {
740  // first thread calculates which part of the image should be shared
741  calculateAABB(&space, SHARED_AABB);
742  }
743  __syncthreads();
744  // check if the block will have to copy data from image
745  if (isWithin(SHARED_AABB, fftSizeX, fftSizeY)) {
746  // all threads copy image data to shared memory
747  copyImgToCache(IMG, SHARED_AABB, FFTs, fftSizeX, fftSizeY, space.projectionIndex, imgCacheDim);
748  __syncthreads();
749  } else {
750  continue; // whole block can exit, as it's not reading from image
751  }
752  }
753 #endif
754 
755  processProjection<fastLateBlobbing, blobOrder, useFastKaiser>(
756  outVolumeBuffer, outWeightsBuffer,
757  fftSizeX, fftSizeY,
758  FFTs + fftSizeX * fftSizeY * space.projectionIndex,
759  &space,
760  blobTableSqrt,
761  imgCacheDim);
762 
763  __syncthreads(); // sync threads to avoid write after read problems
764  }
765 }
766 
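// Host-side launcher: derives the block/grid dimensions and the shared-memory image cache size
// from the volume size and blob radius, then enqueues processBufferKernel on the worker's local
// CUDA stream (obtained from StarPU) without waiting for it to finish.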
773 template<int blobOrder, bool useFastKaiser>
774 void processBufferGPU(
775  float2 *outVolumeBuffer, float *outWeightsBuffer,
776  const int fftSizeX, const int fftSizeY,
777  const int traverseSpaceCount, const RecFourierProjectionTraverseSpace *traverseSpaces,
778  const float2 *inFFTs,
779  const float *blobTableSqrt,
780  const bool fastLateBlobbing,
781  const float blobRadius, const int maxVolIndexYZ) {
782 
783  // enqueue kernel and return control
784  const int imgCacheDim = static_cast<int>(ceil(sqrt(2.f) * sqrt(3.f) * (BLOCK_DIM + 2 * blobRadius)));
785  dim3 dimBlock(BLOCK_DIM, BLOCK_DIM);
786 
787  const int size2D = maxVolIndexYZ + 1;
788  dim3 dimGrid(static_cast<unsigned int>(ceil(size2D / (float)dimBlock.x)),
789  static_cast<unsigned int>(ceil(size2D / (float)dimBlock.y)),
790  GRID_DIM_Z);
791 
792  // by using templates, we can save some registers, especially for 'fast' version
793  if (fastLateBlobbing) {
794  processBufferKernel<true, blobOrder,useFastKaiser><<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>>(
795  outVolumeBuffer, outWeightsBuffer,
796  fftSizeX, fftSizeY,
797  traverseSpaceCount, traverseSpaces,
798  inFFTs,
799  blobTableSqrt,
800  imgCacheDim);
801  } else {
802  // if making copy of the image in shared memory, allocate enough space
803  int sharedMemSize = SHARED_IMG ? (imgCacheDim*imgCacheDim*sizeof(float2)) : 0;
804  processBufferKernel<false, blobOrder,useFastKaiser><<<dimGrid, dimBlock, sharedMemSize, starpu_cuda_get_local_stream()>>>(
805  outVolumeBuffer, outWeightsBuffer,
806  fftSizeX, fftSizeY,
807  traverseSpaceCount, traverseSpaces,
808  inFFTs,
809  blobTableSqrt,
810  imgCacheDim);
811  }
812  gpuErrchk(cudaPeekAtLastError());
813 }
814 
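// StarPU CUDA codelet entry point. Buffer layout as read below: buffers[0] holds the input FFTs,
// buffers[1] the traverse spaces, buffers[2] the precomputed blob table, buffers[3] and buffers[4]
// the output volume and weights, and buffers[5] the LoadedImagesBuffer giving the number of images
// actually present. The blob order and alpha select the kernel template instantiation.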
815 void func_reconstruct_cuda(void* buffers[], void* cl_arg) {
816  const ReconstructFftArgs& arg = *(ReconstructFftArgs*) cl_arg;
817  const float2* inFFTs = (float2*)STARPU_VECTOR_GET_PTR(buffers[0]);
818  const RecFourierProjectionTraverseSpace* inSpaces = (RecFourierProjectionTraverseSpace*)STARPU_MATRIX_GET_PTR(buffers[1]);
819  const float* inBlobTableSqrt = (float*)(STARPU_VECTOR_GET_PTR(buffers[2]));
820  float2* outVolumeBuffer = (float2*)(STARPU_VECTOR_GET_PTR(buffers[3])); // Actually std::complex<float>
821  float* outWeightsBuffer = (float*)(STARPU_VECTOR_GET_PTR(buffers[4]));
822  const uint32_t noOfImages = ((LoadedImagesBuffer*) STARPU_VARIABLE_GET_PTR(buffers[5]))->noOfImages;
823 
824  switch (arg.blobOrder) {
825  case 0:
826  if (arg.blobAlpha <= 15.0) {
827  processBufferGPU<0, true>(outVolumeBuffer, outWeightsBuffer,
828  arg.fftSizeX, arg.fftSizeY,
829  arg.noOfSymmetries * noOfImages, inSpaces,
830  inFFTs,
831  inBlobTableSqrt,
832  arg.fastLateBlobbing,
833  arg.blobRadius, arg.maxVolIndexYZ);
834  } else {
835  processBufferGPU<0, false>(outVolumeBuffer, outWeightsBuffer,
836  arg.fftSizeX, arg.fftSizeY,
837  arg.noOfSymmetries * noOfImages, inSpaces,
838  inFFTs,
839  inBlobTableSqrt,
840  arg.fastLateBlobbing,
841  arg.blobRadius, arg.maxVolIndexYZ);
842  }
843  break;
844  case 1:
845  processBufferGPU<1, false>(outVolumeBuffer, outWeightsBuffer,
846  arg.fftSizeX, arg.fftSizeY,
847  arg.noOfSymmetries * noOfImages, inSpaces,
848  inFFTs,
849  inBlobTableSqrt,
850  arg.fastLateBlobbing,
851  arg.blobRadius, arg.maxVolIndexYZ);
852  break;
853  case 2:
854  processBufferGPU<2, false>(outVolumeBuffer, outWeightsBuffer,
855  arg.fftSizeX, arg.fftSizeY,
856  arg.noOfSymmetries * noOfImages, inSpaces,
857  inFFTs,
858  inBlobTableSqrt,
859  arg.fastLateBlobbing,
860  arg.blobRadius, arg.maxVolIndexYZ);
861  break;
862  case 3:
863  processBufferGPU<3, false>(outVolumeBuffer, outWeightsBuffer,
864  arg.fftSizeX, arg.fftSizeY,
865  arg.noOfSymmetries * noOfImages, inSpaces,
866  inFFTs,
867  inBlobTableSqrt,
868  arg.fastLateBlobbing,
869  arg.blobRadius, arg.maxVolIndexYZ);
870  break;
871  case 4:
872  processBufferGPU<4, false>(outVolumeBuffer, outWeightsBuffer,
873  arg.fftSizeX, arg.fftSizeY,
874  arg.noOfSymmetries * noOfImages, inSpaces,
875  inFFTs,
876  inBlobTableSqrt,
877  arg.fastLateBlobbing,
878  arg.blobRadius, arg.maxVolIndexYZ);
879  break;
880  default:
881  REPORT_ERROR(ERR_VALUE_INCORRECT, "m out of range [0..4] in kaiser_value()");
882  }
883 
884  // gpuErrchk(cudaStreamSynchronize(starpu_cuda_get_local_stream())); disabled because codelet is async
885 }
886 
887 //------------------------------------------------ CPU -----------------------------------------------------------------
888 // uses copies of the GPU functions, rewritten for single thread runtime
889 
892 void atomicAddFloat(volatile float* ptr, float addedValue) {
893  static_assert(sizeof(float) == sizeof(uint32_t), "atomicAddFloat requires floats to be 32bit");
894 
895  // This is probably fine, since the constructor/destructor should be trivial
896  // (As of C++11, this is guaranteed only for integral type specializations, but it is probably reasonably safe to assume
897  // that this will hold for floats as well. C++20 requires that by spec.)
898  volatile std::atomic<float>& atomicPtr = *reinterpret_cast<volatile std::atomic<float>*>(ptr);
899  float current = atomicPtr.load(std::memory_order::memory_order_relaxed);
900  while (true) {
901  const float newValue = current + addedValue;
902  // Since x86 does not allow atomic add of floats (only integers), we have to implement it through CAS
903  if (atomicPtr.compare_exchange_weak(current, newValue, std::memory_order::memory_order_relaxed)) {
904  // Current was still current and was replaced with the newValue. Done.
905  return;
906  }
907  // Comparison failed. current now contains the new value and we try again.
908  }
909 }
910 
912  float2* const tempVolumeGPU, float* const tempWeightsGPU,
913  const int x, const int y, const int z,
914  const int xSize, const int ySize,
915  const float2* const __restrict__ FFT,
917 {
918  Point3D<float> imgPos;
919  float wBlob = 1.f;
920 
921  float dataWeight = space->weight;
922 
923  // transform current point to center
924  imgPos.x = x - cpuC.cMaxVolumeIndexX/2;
925  imgPos.y = y - cpuC.cMaxVolumeIndexYZ/2;
926  imgPos.z = z - cpuC.cMaxVolumeIndexYZ/2;
927  if (imgPos.x*imgPos.x + imgPos.y*imgPos.y + imgPos.z*imgPos.z > space->maxDistanceSqr) {
928  return; // discard iterations that would access pixel with too high frequency
929  }
930  // rotate around center
931  multiply(space->transformInv, imgPos);
932  if (imgPos.x < 0.f) return; // reading outside of the image boundary. Z is always correct and Y is checked by the condition above
933 
934  // transform back and round
935  // only the Y coordinate needs adjusting, since X already matches the image and Z is irrelevant
936  int imgX = clamp((int)(imgPos.x + 0.5f), 0, xSize - 1);
937  int imgY = clamp((int)(imgPos.y + 0.5f + cpuC.cMaxVolumeIndexYZ / 2), 0, ySize - 1);
938 
939  int index3D = z * (cpuC.cMaxVolumeIndexYZ+1) * (cpuC.cMaxVolumeIndexX+1) + y * (cpuC.cMaxVolumeIndexX+1) + x;
940  int index2D = imgY * xSize + imgX;
941 
942  float weight = wBlob * dataWeight;
943 
944  // use atomic as two blocks can write to same voxel
945  atomicAddFloat(&tempVolumeGPU[index3D].x, FFT[index2D].x * weight);
946  atomicAddFloat(&tempVolumeGPU[index3D].y, FFT[index2D].y * weight);
947  atomicAddFloat(&tempWeightsGPU[index3D], weight);
948  //tempVolumeGPU[index3D].x += FFT[index2D].x * weight;
949  //tempVolumeGPU[index3D].y += FFT[index2D].y * weight;
950  //tempWeightsGPU[index3D] += weight;
951 }
952 
953 template<int blobOrder, bool useFastKaiser, bool usePrecomputedInterpolation>
954 void processVoxelBlobCPU(
955  float2* const tempVolumeGPU, float* const tempWeightsGPU,
956  const int x, const int y, const int z,
957  const int xSize, const int ySize,
958  const float2* const __restrict__ FFT,
959  const RecFourierProjectionTraverseSpace* const space,
960  const float* blobTableSqrt)
961 {
962  Point3D<float> imgPos;
963  // transform current point to center
964  imgPos.x = x - cpuC.cMaxVolumeIndexX/2;
965  imgPos.y = y - cpuC.cMaxVolumeIndexYZ/2;
966  imgPos.z = z - cpuC.cMaxVolumeIndexYZ/2;
967  if ((imgPos.x*imgPos.x + imgPos.y*imgPos.y + imgPos.z*imgPos.z) > space->maxDistanceSqr) {
968  return; // discard iterations that would access pixel with too high frequency
969  }
970  // rotate around center
971  multiply(space->transformInv, imgPos);
972  if (imgPos.x < -cpuC.cBlobRadius) return; // reading outside of the image boundary. Z is always correct and Y is checked by the condition above
973  // transform back just the Y coordinate, since X already matches the image and Z is irrelevant
974  imgPos.y += cpuC.cMaxVolumeIndexYZ / 2;
975 
976  // check that we don't want to collect data from far far away ...
977  float radiusSqr = cpuC.cBlobRadius * cpuC.cBlobRadius;
978  float zSqr = imgPos.z * imgPos.z;
979  if (zSqr > radiusSqr) return;
980 
981  // create blob bounding box
982  int minX = ceilf(imgPos.x - cpuC.cBlobRadius);
983  int maxX = floorf(imgPos.x + cpuC.cBlobRadius);
984  int minY = ceilf(imgPos.y - cpuC.cBlobRadius);
985  int maxY = floorf(imgPos.y + cpuC.cBlobRadius);
986  minX = fmaxf(minX, 0);
987  minY = fmaxf(minY, 0);
988  maxX = fminf(maxX, xSize-1);
989  maxY = fminf(maxY, ySize-1);
990 
991  int index3D = z * (cpuC.cMaxVolumeIndexYZ+1) * (cpuC.cMaxVolumeIndexX+1) + y * (cpuC.cMaxVolumeIndexX+1) + x;
992  float2 vol;
993  float w;
994  vol.x = vol.y = w = 0.f;
995  float dataWeight = space->weight;
996 
997  // check which pixel in the vicinity should contribute
998  for (int i = minY; i <= maxY; i++) {
999  float ySqr = (imgPos.y - i) * (imgPos.y - i);
1000  float yzSqr = ySqr + zSqr;
1001  if (yzSqr > radiusSqr) continue;
1002  for (int j = minX; j <= maxX; j++) {
1003  float xD = imgPos.x - j;
1004  float distanceSqr = xD*xD + yzSqr;
1005  if (distanceSqr > radiusSqr) continue;
1006 
1007  int index2D = i * xSize + j;
1008 
1009  float wBlob;
1010  if (usePrecomputedInterpolation) {
1011  int aux = (int) ((distanceSqr * cpuC.cIDeltaSqrt + 0.5f));
1012  wBlob = blobTableSqrt[aux];
1013  } else if (useFastKaiser) {
1014  wBlob = kaiserValueFast(distanceSqr);
1015  } else {
1016  wBlob = kaiserValue<blobOrder>(sqrtf(distanceSqr), cpuC.cBlobRadius) * cpuC.cIw0;
1017  }
1018 
1019  float weight = wBlob * dataWeight;
1020  w += weight;
1021  vol += FFT[index2D] * weight;
1022  }
1023  }
1024 
1025  atomicAddFloat(&tempVolumeGPU[index3D].x, vol.x);
1026  atomicAddFloat(&tempVolumeGPU[index3D].y, vol.y);
1027  atomicAddFloat(&tempWeightsGPU[index3D], w);
1028  //tempVolumeGPU[index3D].x += vol.x;
1029  //tempVolumeGPU[index3D].y += vol.y;
1030  //tempWeightsGPU[index3D] += w;
1031 }
1032 
1033 template<bool useFast, int blobOrder, bool useFastKaiser, bool usePrecomputedInterpolation>
1034 void processProjectionCPU(
1035  float2* tempVolumeGPU, float *tempWeightsGPU,
1036  const int xSize, const int ySize,
1037  const float2* __restrict__ FFT,
1038  const RecFourierProjectionTraverseSpace* const tSpace,
1039  const float* blobTableSqrt) {
1040 
1041  if (tSpace->XY == tSpace->dir) { // iterate XY plane
1042  for (int idy = tSpace->minY; idy <= tSpace->maxY; idy++) {
1043  for (int idx = tSpace->minX; idx <= tSpace->maxX; idx++) {
1044  if (useFast) {
1045  float hitZ = getZ(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin);
1046  int z = (int)(hitZ + 0.5f); // rounding
1047  processVoxelCPU(tempVolumeGPU, tempWeightsGPU, idx, idy, z, xSize, ySize, FFT, tSpace);
1048  } else {
1049  float z1 = getZ(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin); // lower plane
1050  float z2 = getZ(idx, idy, tSpace->unitNormal, tSpace->topOrigin); // upper plane
1051  z1 = clamp(z1, 0, cpuC.cMaxVolumeIndexYZ);
1052  z2 = clamp(z2, 0, cpuC.cMaxVolumeIndexYZ);
1053  int lower = static_cast<int>(floorf(fminf(z1, z2)));
1054  int upper = static_cast<int>(ceilf(fmaxf(z1, z2)));
1055  for (int z = lower; z <= upper; z++) {
1056  processVoxelBlobCPU<blobOrder, useFastKaiser, usePrecomputedInterpolation>(tempVolumeGPU, tempWeightsGPU, idx, idy, z, xSize, ySize, FFT, tSpace, blobTableSqrt);
1057  }
1058  }
1059  }
1060  }
1061  } else if (tSpace->XZ == tSpace->dir) { // iterate XZ plane
1062  for (int idy = tSpace->minZ; idy <= tSpace->maxZ; idy++) { // map z -> y
1063  for (int idx = tSpace->minX; idx <= tSpace->maxX; idx++) {
1064  if (useFast) {
1065  float hitY = getY(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin);
1066  int y = (int)(hitY + 0.5f); // rounding
1067  processVoxelCPU(tempVolumeGPU, tempWeightsGPU, idx, y, idy, xSize, ySize, FFT, tSpace);
1068  } else {
1069  float y1 = getY(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin); // lower plane
1070  float y2 = getY(idx, idy, tSpace->unitNormal, tSpace->topOrigin); // upper plane
1071  y1 = clamp(y1, 0, cpuC.cMaxVolumeIndexYZ);
1072  y2 = clamp(y2, 0, cpuC.cMaxVolumeIndexYZ);
1073  int lower = static_cast<int>(floorf(fminf(y1, y2)));
1074  int upper = static_cast<int>(ceilf(fmaxf(y1, y2)));
1075  for (int y = lower; y <= upper; y++) {
1076  processVoxelBlobCPU<blobOrder, useFastKaiser, usePrecomputedInterpolation>(tempVolumeGPU, tempWeightsGPU, idx, y, idy, xSize, ySize, FFT, tSpace, blobTableSqrt);
1077  }
1078  }
1079  }
1080  }
1081  } else { // iterate YZ plane
1082  for (int idy = tSpace->minZ; idy <= tSpace->maxZ; idy++) { // map z -> y
1083  for (int idx = tSpace->minY; idx <= tSpace->maxY; idx++) { // map y -> x
1084  if (useFast) {
1085  float hitX = getX(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin);
1086  int x = (int)(hitX + 0.5f); // rounding
1087  processVoxelCPU(tempVolumeGPU, tempWeightsGPU, x, idx, idy, xSize, ySize, FFT, tSpace);
1088  } else {
1089  float x1 = getX(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin); // lower plane
1090  float x2 = getX(idx, idy, tSpace->unitNormal, tSpace->topOrigin); // upper plane
1091  x1 = clamp(x1, 0, cpuC.cMaxVolumeIndexX);
1092  x2 = clamp(x2, 0, cpuC.cMaxVolumeIndexX);
1093  int lower = static_cast<int>(floorf(fminf(x1, x2)));
1094  int upper = static_cast<int>(ceilf(fmaxf(x1, x2)));
1095  for (int x = lower; x <= upper; x++) {
1096  processVoxelBlobCPU<blobOrder, useFastKaiser, usePrecomputedInterpolation>(tempVolumeGPU, tempWeightsGPU, x, idx, idy, xSize, ySize, FFT, tSpace, blobTableSqrt);
1097  }
1098  }
1099  }
1100  }
1101  }
1102 }
1103 
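// CPU counterpart of processBufferGPU. It may run inside a StarPU combined (parallel) worker:
// each participating CPU worker takes every groupSize-th traverse space, so the work is split
// across the team and the atomic adds above resolve any write conflicts between workers.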
1104 template<int blobOrder, bool useFastKaiser, bool usePrecomputedInterpolation>
1105 void processBufferCPU(
1106  float2 *outVolumeBuffer, float *outWeightsBuffer,
1107  const int fftSizeX, const int fftSizeY,
1108  const int traverseSpaceCount, const RecFourierProjectionTraverseSpace *traverseSpaces,
1109  const float2 *inFFTs,
1110  const float *blobTableSqrt,
1111  const bool fastLateBlobbing) {
1112 
1113  const int groupSize = starpu_combined_worker_get_size();
1114  const int groupRank = starpu_combined_worker_get_rank();
1115 
1116  for (int i = groupRank; i < traverseSpaceCount; i += groupSize) {
1117  const RecFourierProjectionTraverseSpace &space = traverseSpaces[i];
1118 
1119  const float2* spaceFFT = inFFTs + fftSizeX * fftSizeY * space.projectionIndex;
1120 
1121  // by using templates, we can save some registers, especially for 'fast' version
1122  if (fastLateBlobbing) {
1123  processProjectionCPU<true, blobOrder, useFastKaiser, usePrecomputedInterpolation>(
1124  outVolumeBuffer, outWeightsBuffer,
1125  fftSizeX, fftSizeY,
1126  spaceFFT,
1127  &space,
1128  blobTableSqrt);
1129  } else {
1130  processProjectionCPU<false, blobOrder, useFastKaiser, usePrecomputedInterpolation>(
1131  outVolumeBuffer, outWeightsBuffer,
1132  fftSizeX, fftSizeY,
1133  spaceFFT,
1134  &space,
1135  blobTableSqrt);
1136  }
1137  }
1138 }
1139 
1140 template<bool usePrecomputedInterpolation>
1141 void func_reconstruct_cpu_template(void* buffers[], void* cl_arg) {
1142  const ReconstructFftArgs& arg = *(ReconstructFftArgs*) cl_arg;
1143  const float2* inFFTs = (float2*)STARPU_VECTOR_GET_PTR(buffers[0]);
1144  const RecFourierProjectionTraverseSpace* inSpaces = (RecFourierProjectionTraverseSpace*)STARPU_MATRIX_GET_PTR(buffers[1]);
1145  const float* inBlobTableSqrt = (float*)(STARPU_VECTOR_GET_PTR(buffers[2]));
1146  float2* outVolumeBuffer = (float2*)(STARPU_VECTOR_GET_PTR(buffers[3])); // Actually std::complex<float>
1147  float* outWeightsBuffer = (float*)(STARPU_VECTOR_GET_PTR(buffers[4]));
1148  const uint32_t noOfImages = ((LoadedImagesBuffer*) STARPU_VARIABLE_GET_PTR(buffers[5]))->noOfImages;
1149 
1150  switch (arg.blobOrder) {
1151  case 0:
1152  if (arg.blobAlpha <= 15.0) {
1153  processBufferCPU<0, true, usePrecomputedInterpolation>(outVolumeBuffer, outWeightsBuffer,
1154  arg.fftSizeX, arg.fftSizeY,
1155  arg.noOfSymmetries * noOfImages, inSpaces,
1156  inFFTs,
1157  inBlobTableSqrt,
1158  arg.fastLateBlobbing);
1159  } else {
1160  processBufferCPU<0, false, usePrecomputedInterpolation>(outVolumeBuffer, outWeightsBuffer,
1161  arg.fftSizeX, arg.fftSizeY,
1162  arg.noOfSymmetries * noOfImages, inSpaces,
1163  inFFTs,
1164  inBlobTableSqrt,
1165  arg.fastLateBlobbing);
1166  }
1167  break;
1168  case 1:
1169  processBufferCPU<1, false, usePrecomputedInterpolation>(outVolumeBuffer, outWeightsBuffer,
1170  arg.fftSizeX, arg.fftSizeY,
1171  arg.noOfSymmetries * noOfImages, inSpaces,
1172  inFFTs,
1173  inBlobTableSqrt,
1174  arg.fastLateBlobbing);
1175  break;
1176  case 2:
1177  processBufferCPU<2, false, usePrecomputedInterpolation>(outVolumeBuffer, outWeightsBuffer,
1178  arg.fftSizeX, arg.fftSizeY,
1179  arg.noOfSymmetries * noOfImages, inSpaces,
1180  inFFTs,
1181  inBlobTableSqrt,
1182  arg.fastLateBlobbing);
1183  break;
1184  case 3:
1185  processBufferCPU<3, false, usePrecomputedInterpolation>(outVolumeBuffer, outWeightsBuffer,
1186  arg.fftSizeX, arg.fftSizeY,
1187  arg.noOfSymmetries * noOfImages, inSpaces,
1188  inFFTs,
1189  inBlobTableSqrt,
1190  arg.fastLateBlobbing);
1191  break;
1192  case 4:
1193  processBufferCPU<4, false, usePrecomputedInterpolation>(outVolumeBuffer, outWeightsBuffer,
1194  arg.fftSizeX, arg.fftSizeY,
1195  arg.noOfSymmetries * noOfImages, inSpaces,
1196  inFFTs,
1197  inBlobTableSqrt,
1198  arg.fastLateBlobbing);
1199  break;
1200  default:
1201  REPORT_ERROR(ERR_VALUE_INCORRECT, "m out of range [0..4] in kaiser_value()");
1202  }
1203 }
1204 
1205 void func_reconstruct_cpu_lookup_interpolation(void* buffers[], void* cl_arg) {
1206  func_reconstruct_cpu_template<true>(buffers, cl_arg);
1207 }
1208 
1209 void func_reconstruct_cpu_dynamic_interpolation(void* buffers[], void* cl_arg) {
1210  func_reconstruct_cpu_template<false>(buffers, cl_arg);
1211 }