Xmipp v3.23.11-Nereus
cuda_gpu_reconstruct_fourier.cpp
1 /***************************************************************************
2  *
3  * Authors: David Strelak (davidstrelak@gmail.com)
4  *
5  * Unidad de Bioinformatica of Centro Nacional de Biotecnologia , CSIC
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 2 of the License, or
10  * (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
20  * 02111-1307 USA
21  *
22  * All comments concerning this program package may be sent to the
23  * e-mail address 'xmipp@cnb.csic.es'
24  ***************************************************************************/
25 
26 #include <cuda_runtime_api.h>
27 #include "reconstruction_cuda/cuda_asserts.h" // cannot be in header as it includes cuda headers
30 #include "gpu.h"
31 
32 #if SHARED_BLOB_TABLE
33 __shared__ float BLOB_TABLE[BLOB_TABLE_SIZE_SQRT];
34 #endif
35 
36 #if SHARED_IMG
37 __shared__ Point3D<float> SHARED_AABB[2];
38 extern __shared__ float2 IMG[];
39 #endif
40 
41 // FIELDS
42 
43 // Holding streams used for calculation. Present on CPU
44 cudaStream_t* streams;
45 
46 // Wrapper to hold pointers to GPU memory (and have it also accessible from CPU)
47 FRecBufferDataGPUWrapper** wrappers;
48 
49 // Holding blob coefficient table. Present on GPU
50 float* devBlobTableSqrt = NULL;
51 
52 __device__ __constant__ int cMaxVolumeIndexX = 0;
53 __device__ __constant__ int cMaxVolumeIndexYZ = 0;
54 __device__ __constant__ float cBlobRadius = 0.f;
55 __device__ __constant__ float cOneOverBlobRadiusSqr = 0.f;
56 __device__ __constant__ float cBlobAlpha = 0.f;
57 __device__ __constant__ float cIw0 = 0.f;
58 __device__ __constant__ float cIDeltaSqrt = 0.f;
59 __device__ __constant__ float cOneOverBessiOrderAlpha = 0.f;
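// Note: the constants above are filled once per reconstruction via
// copyConstants() (defined near the end of this file); __constant__ memory is
// served through the constant cache, so every thread can read them cheaply.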
60 
61 __device__
62 float bessi0Fast(float x) { // X must be <= 15
63  // stable rational minimax approximations to the modified bessel functions, blair, edwards
64  // from table 5
65  float x2 = x*x;
66  float num = -0.8436825781374849e-19f; // p11
67  num = fmaf(num, x2, -0.93466495199548700e-17f); // p10
68  num = fmaf(num, x2, -0.15716375332511895e-13f); // p09
69  num = fmaf(num, x2, -0.42520971595532318e-11f); // p08
70  num = fmaf(num, x2, -0.13704363824102120e-8f); // p07
71  num = fmaf(num, x2, -0.28508770483148419e-6f); // p06
72  num = fmaf(num, x2, -0.44322160233346062e-4f); // p05
73  num = fmaf(num, x2, -0.46703811755736946e-2f); // p04
74  num = fmaf(num, x2, -0.31112484643702141e-0f); // p03
75  num = fmaf(num, x2, -0.11512633616429962e+2f); // p02
76  num = fmaf(num, x2, -0.18720283332732112e+3f); // p01
77  num = fmaf(num, x2, -0.75281108169006924e+3f); // p00
78 
79  float den = 1.f; // q01
80  den = fmaf(den, x2, -0.75281109410939403e+3f); // q00
81 
82  return num/den;
83 }
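// Compared to bessi0() below, this variant avoids expf()/rsqrtf(); per the
// cited Blair & Edwards report, the table-5 minimax fit should be accurate to
// roughly single precision on the supported domain x <= 15.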
84 
85 __device__
86 float bessi0(float x)
87 {
88  float y, ax, ans;
89  if ((ax = fabsf(x)) < 3.75f)
90  {
91  y = x / 3.75f;
92  y *= y;
93  ans = 1.f + y * (3.5156229f + y * (3.0899424f + y * (1.2067492f
94  + y * (0.2659732f + y * (0.360768e-1f + y * 0.45813e-2f)))));
95  }
96  else
97  {
98  y = 3.75f / ax;
99  ans = (expf(ax) * rsqrtf(ax)) * (0.39894228f + y * (0.1328592e-1f
100  + y * (0.225319e-2f + y * (-0.157565e-2f + y * (0.916281e-2f
101  + y * (-0.2057706e-1f + y * (0.2635537e-1f + y * (-0.1647633e-1f
102  + y * 0.392377e-2f))))))));
103  }
104  return ans;
105 }
106 
107 
108 __device__
109 float bessi1(float x)
110 {
111  float ax, ans;
112  float y;
113  if ((ax = fabsf(x)) < 3.75f)
114  {
115  y = x / 3.75f;
116  y *= y;
117  ans = ax * (0.5f + y * (0.87890594f + y * (0.51498869f + y * (0.15084934f
118  + y * (0.2658733e-1f + y * (0.301532e-2f + y * 0.32411e-3f))))));
119  }
120  else
121  {
122  y = 3.75f / ax;
123  ans = 0.2282967e-1f + y * (-0.2895312e-1f + y * (0.1787654e-1f
124  - y * 0.420059e-2f));
125  ans = 0.39894228f + y * (-0.3988024e-1f + y * (-0.362018e-2f
126  + y * (0.163801e-2f + y * (-0.1031555e-1f + y * ans))));
127  ans *= (expf(ax) * rsqrtf(ax));
128  }
129  return x < 0.0 ? -ans : ans;
130 }
131 
132 __device__
133 float bessi2(float x)
134 {
135  return (x == 0) ? 0 : bessi0(x) - ((2*1) / x) * bessi1(x);
136 }
137 
138 __device__
139 float bessi3(float x)
140 {
141  return (x == 0) ? 0 : bessi1(x) - ((2*2) / x) * bessi2(x);
142 }
143 
144 __device__
145 float bessi4(float x)
146 {
147  return (x == 0) ? 0 : bessi2(x) - ((2*3) / x) * bessi3(x);
148 }
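// bessi2()..bessi4() use the standard recurrence for modified Bessel functions,
//   I_{n+1}(x) = I_{n-1}(x) - (2n/x) * I_n(x),
// so only I0 and I1 need explicit polynomial approximations.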
149 
150 
151 template<int order>
152 __device__
153 float kaiserValue(float r, float a)
154 {
155  float rda, rdas, arg, w;
156 
157  rda = r / a;
158  if (rda <= 1.f)
159  {
160  rdas = rda * rda;
161  arg = cBlobAlpha * sqrtf(1.f - rdas);
162  if (order == 0)
163  {
164  w = bessi0(arg) * cOneOverBessiOrderAlpha;
165  }
166  else if (order == 1)
167  {
168  w = sqrtf (1.f - rdas);
169  w *= bessi1(arg) * cOneOverBessiOrderAlpha;
170  }
171  else if (order == 2)
172  {
173  w = sqrtf (1.f - rdas);
174  w = w * w;
175  w *= bessi2(arg) * cOneOverBessiOrderAlpha;
176  }
177  else if (order == 3)
178  {
179  w = sqrtf (1.f - rdas);
180  w = w * w * w;
181  w *= bessi3(arg) * cOneOverBessiOrderAlpha;
182  }
183  else if (order == 4)
184  {
185  w = sqrtf (1.f - rdas);
186  w = w * w * w * w;
187  w *= bessi4(arg) * cOneOverBessiOrderAlpha;
188  }
189  else {
190  printf("order (%d) out of range in kaiser_value(): %s, %d\n", order, __FILE__, __LINE__);
191  }
192  }
193  else
194  w = 0.f;
195 
196  return w;
197 }
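// The value computed above is the Kaiser-Bessel (blob) window
//   w(r) = (1 - (r/a)^2)^(order/2) * I_order(alpha*sqrt(1 - (r/a)^2)) / I_order(alpha)
// for r <= a, with 1/I_order(alpha) precomputed in cOneOverBessiOrderAlpha.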
198 
199 __device__
200 float kaiserValueFast(float distSqr) {
201  float arg = cBlobAlpha * sqrtf(1.f - (distSqr * cOneOverBlobRadiusSqr)); // alpha * sqrt(1 - (dist/blobRadius)^2)
202  return bessi0Fast(arg) * cOneOverBessiOrderAlpha * cIw0;
203 }
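// Order-0 variant of kaiserValue(): takes the squared distance directly, uses
// the fast Bessel approximation and already folds in the cIw0 normalization,
// so callers must not multiply by cIw0 again.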
204 
211 struct RecFourierBufferDataGPU : public RecFourierBufferData { // declaration reconstructed; missing from this listing
212 private: // private to prevent unintended initialization
215 public:
216 
217  void create(RecFourierBufferData* orig) {
218  copyMetadata(orig);
219  FFTs = CTFs = paddedImages = modulators = NULL;
220 
221  // allocate space at GPU
222  alloc(orig->FFTs, FFTs, orig); // FFT are always necessary
223  alloc(orig->spaces, spaces, orig);
224  if ( ! hasFFTs) {
225  alloc(orig->paddedImages, paddedImages, orig);
226  }
227  if (hasCTFs) {
228  alloc(orig->CTFs, CTFs, orig);
229  alloc(orig->modulators, modulators, orig);
230  }
231  }
232 
233  void destroy() {
234  cudaFree(FFTs);
235  cudaFree(CTFs);
236  cudaFree(paddedImages);
237  cudaFree(modulators);
238  cudaFree(spaces);
239  gpuErrchk( cudaPeekAtLastError() );
240 
241  invalidate();
242  }
243 
247  __device__
248  float* getNthItem(float* array, int itemIndex) {
249  if (array == FFTs) return array + (fftSizeX * fftSizeY * itemIndex * 2); // *2 since it's complex
250  if (array == CTFs) return array + (fftSizeX * fftSizeY * itemIndex);
251  if (array == modulators) return array + (fftSizeX * fftSizeY * itemIndex);
252  if (array == paddedImages) return array + (paddedImgSize * paddedImgSize * itemIndex);
253  return NULL; // undefined
254  }
255 
259  void copyDataFrom(RecFourierBufferData* orig, int stream) {
260  copyMetadata(orig);
261  copy(orig->FFTs, FFTs, orig, stream);
262  copy(orig->CTFs, CTFs, orig, stream);
263  copy(orig->paddedImages, paddedImages, orig, stream);
264  copy(orig->modulators, modulators, orig, stream);
265  copy(orig->spaces, spaces, orig, stream);
266  }
267 
271  __device__
272  int getNoOfSpaces() {
273  return noOfImages * noOfSymmetries;
274  }
275 
276 private:
282  template<typename T>
283  void copy(T* srcArray, T*& dstArray, RecFourierBufferData* orig, int stream) {
284  if (NULL != srcArray) {
285  size_t bytes = sizeof(T) * orig->getNoOfElements(srcArray);
286  cudaMemcpyAsync(dstArray, srcArray, bytes, cudaMemcpyHostToDevice, streams[stream]);
287  gpuErrchk( cudaPeekAtLastError() );
288  }
289  }
290 
295  template<typename T>
296  void alloc(T* srcArray, T*& dstArray, RecFourierBufferData* orig) {
297  size_t bytes = orig->getMaxByteSize(srcArray);
298  cudaMalloc((void **) &dstArray, bytes);
299  gpuErrchk( cudaPeekAtLastError() );
300  }
301 
306  void copyMetadata(RecFourierBufferData* orig) {
307  hasCTFs = orig->hasCTFs;
308  hasFFTs = orig->hasFFTs;
309  noOfImages = orig->noOfImages;
310  noOfSymmetries = orig->noOfSymmetries; // reconstructed; used by getNoOfSpaces()
311  fftSizeX = orig->fftSizeX;
312  fftSizeY = orig->fftSizeY;
313  paddedImgSize = orig->paddedImgSize; // reconstructed; used by convertImages()
315  }
316 };
317 
318 
319 FRecBufferDataGPUWrapper::FRecBufferDataGPUWrapper(RecFourierBufferData* orig) {
320  void* ptr;
321  cudaMallocHost(&ptr, sizeof(RecFourierBufferDataGPU)); // allocate page-locked
322  cpuCopy = (RecFourierBufferDataGPU*)ptr;
323  cpuCopy->create(orig);
324  gpuCopy = NULL;
325 }
326 
327 FRecBufferDataGPUWrapper::~FRecBufferDataGPUWrapper() {
328  cudaFree(gpuCopy);
329  gpuErrchk( cudaPeekAtLastError() );
330  cpuCopy->destroy();
331  cudaFreeHost(cpuCopy);
332 }
333 
334 void FRecBufferDataGPUWrapper::copyFrom(RecFourierBufferData* orig, int stream) {
335  cpuCopy->copyDataFrom(orig, stream);
336 }
337 
338 void FRecBufferDataGPUWrapper::copyToDevice(int stream) {
339  if (NULL == gpuCopy) {
340  cudaMalloc((void **) &gpuCopy, sizeof(RecFourierBufferDataGPU));
341  gpuErrchk( cudaPeekAtLastError() );
342  }
343  cudaMemcpyAsync(gpuCopy, cpuCopy, sizeof(RecFourierBufferDataGPU), cudaMemcpyHostToDevice, streams[stream]);
344  gpuErrchk( cudaPeekAtLastError() );
345 }
346 
347 float* allocateTempVolumeGPU(float*& ptr, int size, int typeSize) {
348  size_t bytes = (size_t)size * size * size * typeSize;
349  cudaMalloc((void**)&ptr, bytes);
350  cudaMemset(ptr, 0, bytes); // cudaMemset takes an int fill value, not a float
351  gpuErrchk( cudaPeekAtLastError() );
352 
353  return ptr;
354 }
355 
356 void copyTempVolumes(std::complex<float>*** tempVol, float*** tempWeights,
357  float* tempVolGPU, float* tempWeightsGPU,
358  int size) {
359  for (int z = 0; z < size; z++) {
360  for (int y = 0; y < size; y++) {
361  int index = (z * size * size) + (y * size);
362  cudaMemcpy(tempVol[z][y], &tempVolGPU[2 * index], 2 * size * sizeof(float), cudaMemcpyDeviceToHost);
363  cudaMemcpy(tempWeights[z][y] , &tempWeightsGPU[index], size * sizeof(float), cudaMemcpyDeviceToHost);
364  }
365  }
366  gpuErrchk(cudaPeekAtLastError());
367 }
368 
369 void releaseTempVolumeGPU(float*& ptr) {
370  cudaFree(ptr);
371  ptr = NULL;
372  gpuErrchk(cudaPeekAtLastError());
373 }
374 
375  // FIXME unify with xmipp_fft.h::FFT_IDX2DIGFREQ
381 __device__
382 float FFT_IDX2DIGFREQ(int idx, int size) {
383  if (size <= 1) return 0;
384  return ((idx <= (size / 2)) ? idx : (-size + idx)) / (float)size;
385 }
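// Example: for size == 8, indices 0..4 map to digital frequencies
// 0, 0.125, 0.25, 0.375, 0.5 and indices 5..7 map to -0.375, -0.25, -0.125.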
386 
390 __device__
391 float getZ(float x, float y, const Point3D<float>& n, const Point3D<float>& p0) {
392  // from a(x-x0)+b(y-y0)+c(z-z0)=0
393  return (-n.x*(x-p0.x)-n.y*(y-p0.y))/n.z + p0.z;
394 }
395 
399 __device__
400 float getY(float x, float z, const Point3D<float>& n, const Point3D<float>& p0){
401  // from a(x-x0)+b(y-y0)+c(z-z0)=0
402  return (-n.x*(x-p0.x)-n.z*(z-p0.z))/n.y + p0.y;
403 }
404 
405 
409 __device__
410 float getX(float y, float z, const Point3D<float>& n, const Point3D<float>& p0){
411  // from a(x-x0)+b(y-y0)+c(z-z0)=0
412  return (-n.y*(y-p0.y)-n.z*(z-p0.z))/n.x + p0.x;
413 }
414 
416 __device__
417 void multiply(const float transform[3][3], Point3D<float>& inOut) {
418  float tmp0 = transform[0][0] * inOut.x + transform[0][1] * inOut.y + transform[0][2] * inOut.z;
419  float tmp1 = transform[1][0] * inOut.x + transform[1][1] * inOut.y + transform[1][2] * inOut.z;
420  float tmp2 = transform[2][0] * inOut.x + transform[2][1] * inOut.y + transform[2][2] * inOut.z;
421  inOut.x = tmp0;
422  inOut.y = tmp1;
423  inOut.z = tmp2;
424 }
425 
427 __device__
428 void computeAABB(Point3D<float>* AABB, Point3D<float>* cuboid) {
429  AABB[0].x = AABB[0].y = AABB[0].z = INFINITY;
430  AABB[1].x = AABB[1].y = AABB[1].z = -INFINITY;
431  Point3D<float> tmp;
432  for (int i = 0; i < 8; i++) {
433  tmp = cuboid[i];
434  if (AABB[0].x > tmp.x) AABB[0].x = tmp.x;
435  if (AABB[0].y > tmp.y) AABB[0].y = tmp.y;
436  if (AABB[0].z > tmp.z) AABB[0].z = tmp.z;
437  if (AABB[1].x < tmp.x) AABB[1].x = tmp.x;
438  if (AABB[1].y < tmp.y) AABB[1].y = tmp.y;
439  if (AABB[1].z < tmp.z) AABB[1].z = tmp.z;
440  }
441  AABB[0].x = ceilf(AABB[0].x);
442  AABB[0].y = ceilf(AABB[0].y);
443  AABB[0].z = ceilf(AABB[0].z);
444 
445  AABB[1].x = floorf(AABB[1].x);
446  AABB[1].y = floorf(AABB[1].y);
447  AABB[1].z = floorf(AABB[1].z);
448 }
449 
455 template<bool hasCTF>
456 __device__
457 void processVoxel(
458  float2* tempVolumeGPU, float* tempWeightsGPU,
459  int x, int y, int z,
460  int xSize, int ySize,
461  const float* __restrict__ CTF,
462  const float* __restrict__ modulator,
463  const float2* __restrict__ FFT,
464  const RecFourierProjectionTraverseSpace* const space)
465 {
466  Point3D<float> imgPos;
467  float wBlob = 1.f;
468  float wCTF = 1.f;
469  float wModulator = 1.f;
470 
471  float dataWeight = space->weight;
472 
473  // transform current point to center
474  imgPos.x = x - cMaxVolumeIndexX/2;
475  imgPos.y = y - cMaxVolumeIndexYZ/2;
476  imgPos.z = z - cMaxVolumeIndexYZ/2;
477  if (imgPos.x*imgPos.x + imgPos.y*imgPos.y + imgPos.z*imgPos.z > space->maxDistanceSqr) {
478  return; // discard iterations that would access pixel with too high frequency
479  }
480  // rotate around center
481  multiply(space->transformInv, imgPos);
482  if (imgPos.x < 0.f) return; // reading outside of the image boundary. Z is always correct and Y is checked by the condition above
483 
484  // transform back and round
485  // just Y coordinate needs adjusting, since X now matches to picture and Z is irrelevant
486  int imgX = clamp((int)(imgPos.x + 0.5f), 0, xSize - 1);
487  int imgY = clamp((int)(imgPos.y + 0.5f + cMaxVolumeIndexYZ / 2), 0, ySize - 1);
488 
489  int index3D = z * (cMaxVolumeIndexYZ+1) * (cMaxVolumeIndexX+1) + y * (cMaxVolumeIndexX+1) + x;
490  int index2D = imgY * xSize + imgX;
491 
492  if (hasCTF) {
493  wCTF = CTF[index2D];
494  wModulator = modulator[index2D];
495  }
496 
497  float weight = wBlob * wModulator * dataWeight;
498 
499  // use atomic as two blocks can write to same voxel
500  atomicAdd(&tempVolumeGPU[index3D].x, FFT[index2D].x * weight * wCTF);
501  atomicAdd(&tempVolumeGPU[index3D].y, FFT[index2D].y * weight * wCTF);
502  atomicAdd(&tempWeightsGPU[index3D], weight);
503 }
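// processVoxel() above is the 'fast' path: the voxel is projected onto the
// image and only the single nearest FFT sample is accumulated, i.e.
// nearest-neighbor interpolation with no blob weighting (wBlob stays 1.f).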
504 
510 template<bool hasCTF, int blobOrder, bool useFastKaiser>
511 __device__
512 void processVoxelBlob(
513  float2* tempVolumeGPU, float *tempWeightsGPU,
514  int x, int y, int z,
515  int xSize, int ySize,
516  const float* __restrict__ CTF,
517  const float* __restrict__ modulator,
518  const float2* __restrict__ FFT,
519  const RecFourierProjectionTraverseSpace* const space,
520  const float* blobTableSqrt,
521  int imgCacheDim)
522 {
523  Point3D<float> imgPos;
524  // transform current point to center
525  imgPos.x = x - cMaxVolumeIndexX/2;
526  imgPos.y = y - cMaxVolumeIndexYZ/2;
527  imgPos.z = z - cMaxVolumeIndexYZ/2;
528  if ((imgPos.x*imgPos.x + imgPos.y*imgPos.y + imgPos.z*imgPos.z) > space->maxDistanceSqr) {
529  return; // discard iterations that would access pixel with too high frequency
530  }
531  // rotate around center
532  multiply(space->transformInv, imgPos);
533  if (imgPos.x < -cBlobRadius) return; // reading outside of the image boundary. Z is always correct and Y is checked by the condition above
534  // transform back just Y coordinate, since X now matches to picture and Z is irrelevant
535  imgPos.y += cMaxVolumeIndexYZ / 2;
536 
537  // check that we don't want to collect data from far far away ...
538  float radiusSqr = cBlobRadius * cBlobRadius;
539  float zSqr = imgPos.z * imgPos.z;
540  if (zSqr > radiusSqr) return;
541 
542  // create blob bounding box
543  int minX = ceilf(imgPos.x - cBlobRadius);
544  int maxX = floorf(imgPos.x + cBlobRadius);
545  int minY = ceilf(imgPos.y - cBlobRadius);
546  int maxY = floorf(imgPos.y + cBlobRadius);
547  minX = fmaxf(minX, 0);
548  minY = fmaxf(minY, 0);
549  maxX = fminf(maxX, xSize-1);
550  maxY = fminf(maxY, ySize-1);
551 
552  int index3D = z * (cMaxVolumeIndexYZ+1) * (cMaxVolumeIndexX+1) + y * (cMaxVolumeIndexX+1) + x;
553  float2 vol;
554  float w;
555  vol.x = vol.y = w = 0.f;
556 #if !SHARED_IMG
557 #endif
558  float dataWeight = space->weight;
559 
560  // ugly spaghetti code, but improves performance by approx. 10%
561  if (hasCTF) {
562  // check which pixel in the vicinity should contribute
563  for (int i = minY; i <= maxY; i++) {
564  float ySqr = (imgPos.y - i) * (imgPos.y - i);
565  float yzSqr = ySqr + zSqr;
566  if (yzSqr > radiusSqr) continue;
567  for (int j = minX; j <= maxX; j++) {
568  float xD = imgPos.x - j;
569  float distanceSqr = xD*xD + yzSqr;
570  if (distanceSqr > radiusSqr) continue;
571 
572 #if SHARED_IMG
573  int index2D = (i - SHARED_AABB[0].y) * imgCacheDim + (j-SHARED_AABB[0].x); // position in img - offset of the AABB
574 #else
575  int index2D = i * xSize + j;
576 #endif
577 
578  float wCTF = CTF[index2D];
579  float wModulator = modulator[index2D];
580 #if PRECOMPUTE_BLOB_VAL
581  int aux = (int) ((distanceSqr * cIDeltaSqrt + 0.5f));
582  #if SHARED_BLOB_TABLE
583  float wBlob = BLOB_TABLE[aux];
584  #else
585  float wBlob = blobTableSqrt[aux];
586  #endif
587 #else
588  float wBlob;
589  if (useFastKaiser) {
590  wBlob = kaiserValueFast(distanceSqr);
591  }
592  else {
593  wBlob = kaiserValue<blobOrder>(sqrtf(distanceSqr),cBlobRadius) * cIw0;
594  }
595 #endif
596  float weight = wBlob * wModulator * dataWeight;
597  w += weight;
598 #if SHARED_IMG
599  vol += IMG[index2D] * weight * wCTF;
600 #else
601  vol += FFT[index2D] * weight * wCTF;
602 #endif
603  }
604  }
605  } else {
606  // check which pixel in the vicinity should contribute
607  for (int i = minY; i <= maxY; i++) {
608  float ySqr = (imgPos.y - i) * (imgPos.y - i);
609  float yzSqr = ySqr + zSqr;
610  if (yzSqr > radiusSqr) continue;
611  for (int j = minX; j <= maxX; j++) {
612  float xD = imgPos.x - j;
613  float distanceSqr = xD*xD + yzSqr;
614  if (distanceSqr > radiusSqr) continue;
615 
616 #if SHARED_IMG
617  int index2D = (i - SHARED_AABB[0].y) * imgCacheDim + (j-SHARED_AABB[0].x); // position in img - offset of the AABB
618 #else
619  int index2D = i * xSize + j;
620 #endif
621 
622 #if PRECOMPUTE_BLOB_VAL
623  int aux = (int) ((distanceSqr * cIDeltaSqrt + 0.5f));
624 #if SHARED_BLOB_TABLE
625  float wBlob = BLOB_TABLE[aux];
626 #else
627  float wBlob = blobTableSqrt[aux];
628 #endif
629 #else
630  float wBlob;
631  if (useFastKaiser) {
632  wBlob = kaiserValueFast(distanceSqr);
633  }
634  else {
635  wBlob = kaiserValue<blobOrder>(sqrtf(distanceSqr),cBlobRadius) * cIw0;
636  }
637 #endif
638  float weight = wBlob * dataWeight;
639  w += weight;
640 #if SHARED_IMG
641  vol += IMG[index2D] * weight;
642 #else
643  vol += FFT[index2D] * weight;
644 #endif
645  }
646  }
647  }
648  // use atomic as two blocks can write to same voxel
649  atomicAdd(&tempVolumeGPU[index3D].x, vol.x);
650  atomicAdd(&tempVolumeGPU[index3D].y, vol.y);
651  atomicAdd(&tempWeightsGPU[index3D], w);
652 }
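// processVoxelBlob() is the precise path: every FFT sample closer to the
// projected voxel than cBlobRadius contributes, weighted by the Kaiser-Bessel
// window (looked up in the precomputed table or evaluated on the fly), and the
// accumulated sums are committed with three atomicAdd() calls per voxel.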
653 
658 template<bool useFast, bool hasCTF, int blobOrder, bool useFastKaiser>
659 __device__
660 void processProjection(
661  float2* tempVolumeGPU, float *tempWeightsGPU,
662  int xSize, int ySize,
663  const float* __restrict__ CTF,
664  const float* __restrict__ modulator,
665  const float2* __restrict__ FFT,
666  const RecFourierProjectionTraverseSpace* const tSpace,
667  const float* devBlobTableSqrt,
668  int imgCacheDim)
669 {
670  // map thread to each (2D) voxel
671 #if TILE > 1
672  int id = threadIdx.y * blockDim.x + threadIdx.x;
673  int tidX = threadIdx.x % TILE + (id / (blockDim.y * TILE)) * TILE;
674  int tidY = (id / TILE) % blockDim.y;
675  int idx = blockIdx.x*blockDim.x + tidX;
676  int idy = blockIdx.y*blockDim.y + tidY;
677 #else
678  // map thread to each (2D) voxel
679  volatile int idx = blockIdx.x*blockDim.x + threadIdx.x;
680  volatile int idy = blockIdx.y*blockDim.y + threadIdx.y;
681 #endif
682 
683  if (tSpace->XY == tSpace->dir) { // iterate XY plane
684  if (idy >= tSpace->minY && idy <= tSpace->maxY) {
685  if (idx >= tSpace->minX && idx <= tSpace->maxX) {
686  if (useFast) {
687  float hitZ = getZ(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin);
688  int z = (int)(hitZ + 0.5f); // rounding
689  processVoxel<hasCTF>(tempVolumeGPU, tempWeightsGPU, idx, idy, z, xSize, ySize , CTF, modulator, FFT, tSpace);
690  } else {
691  float z1 = getZ(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin); // lower plane
692  float z2 = getZ(idx, idy, tSpace->unitNormal, tSpace->topOrigin); // upper plane
693  z1 = clamp(z1, 0, cMaxVolumeIndexYZ);
694  z2 = clamp(z2, 0, cMaxVolumeIndexYZ);
695  int lower = floorf(fminf(z1, z2));
696  int upper = ceilf(fmaxf(z1, z2));
697  for (int z = lower; z <= upper; z++) {
698  processVoxelBlob<hasCTF, blobOrder, useFastKaiser>(tempVolumeGPU, tempWeightsGPU, idx, idy, z, xSize, ySize , CTF, modulator, FFT, tSpace, devBlobTableSqrt, imgCacheDim);
699  }
700  }
701  }
702  }
703  } else if (tSpace->XZ == tSpace->dir) { // iterate XZ plane
704  if (idy >= tSpace->minZ && idy <= tSpace->maxZ) { // map z -> y
705  if (idx >= tSpace->minX && idx <= tSpace->maxX) {
706  if (useFast) {
707  float hitY = getY(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin);
708  int y = (int)(hitY + 0.5f); // rounding
709  processVoxel<hasCTF>(tempVolumeGPU, tempWeightsGPU, idx, y, idy, xSize, ySize , CTF, modulator, FFT, tSpace);
710  } else {
711  float y1 = getY(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin); // lower plane
712  float y2 = getY(idx, idy, tSpace->unitNormal, tSpace->topOrigin); // upper plane
713  y1 = clamp(y1, 0, cMaxVolumeIndexYZ);
714  y2 = clamp(y2, 0, cMaxVolumeIndexYZ);
715  int lower = floorf(fminf(y1, y2));
716  int upper = ceilf(fmaxf(y1, y2));
717  for (int y = lower; y <= upper; y++) {
718  processVoxelBlob<hasCTF, blobOrder, useFastKaiser>(tempVolumeGPU, tempWeightsGPU, idx, y, idy, xSize, ySize , CTF, modulator, FFT, tSpace, devBlobTableSqrt, imgCacheDim);
719  }
720  }
721  }
722  }
723  } else { // iterate YZ plane
724  if (idy >= tSpace->minZ && idy <= tSpace->maxZ) { // map z -> y
725  if (idx >= tSpace->minY && idx <= tSpace->maxY) { // map y -> x
726  if (useFast) {
727  float hitX = getX(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin);
728  int x = (int)(hitX + 0.5f); // rounding
729  processVoxel<hasCTF>(tempVolumeGPU, tempWeightsGPU, x, idx, idy, xSize, ySize , CTF, modulator, FFT, tSpace);
730  } else {
731  float x1 = getX(idx, idy, tSpace->unitNormal, tSpace->bottomOrigin); // lower plane
732  float x2 = getX(idx, idy, tSpace->unitNormal, tSpace->topOrigin); // upper plane
733  x1 = clamp(x1, 0, cMaxVolumeIndexX);
734  x2 = clamp(x2, 0, cMaxVolumeIndexX);
735  int lower = floorf(fminf(x1, x2));
736  int upper = ceilf(fmaxf(x1, x2));
737  for (int x = lower; x <= upper; x++) {
738  processVoxelBlob<hasCTF, blobOrder, useFastKaiser>(tempVolumeGPU, tempWeightsGPU, x, idx, idy, xSize, ySize , CTF, modulator, FFT, tSpace, devBlobTableSqrt, imgCacheDim);
739  }
740  }
741  }
742  }
743  }
744 }
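// Each traverse space is swept along its dominant plane (XY, XZ or YZ): a
// thread owns one 2D position and visits only the short run of voxels between
// the bottom and top plane of the space, which keeps the per-thread work
// roughly uniform regardless of the projection direction.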
745 
750 __device__
751 void rotate(Point3D<float>* box, const float transform[3][3]) {
752  for (int i = 0; i < 8; i++) {
753  Point3D<float> imgPos;
754  // transform current point to center
755  imgPos.x = box[i].x - cMaxVolumeIndexX/2;
756  imgPos.y = box[i].y - cMaxVolumeIndexYZ/2;
757  imgPos.z = box[i].z - cMaxVolumeIndexYZ/2;
758  // rotate around center
759  multiply(transform, imgPos);
760  // transform back just Y coordinate, since X now matches to picture and Z is irrelevant
761  imgPos.y += cMaxVolumeIndexYZ / 2;
762 
763  box[i] = imgPos;
764  }
765 }
766 
773 __device__
774 void calculateAABB(const RecFourierProjectionTraverseSpace* tSpace, const RecFourierBufferDataGPU* buffer, Point3D<float>* dest) {
775  Point3D<float> box[8];
776  // calculate AABB for the whole working block
777  if (tSpace->XY == tSpace->dir) { // iterate XY plane
778  box[0].x = box[3].x = box[4].x = box[7].x = blockIdx.x*blockDim.x - cBlobRadius;
779  box[1].x = box[2].x = box[5].x = box[6].x = (blockIdx.x+1)*blockDim.x + cBlobRadius - 1.f;
780 
781  box[2].y = box[3].y = box[6].y = box[7].y = (blockIdx.y+1)*blockDim.y + cBlobRadius - 1.f;
782  box[0].y = box[1].y = box[4].y = box[5].y = blockIdx.y*blockDim.y - cBlobRadius;
783 
784  box[0].z = getZ(box[0].x, box[0].y, tSpace->unitNormal, tSpace->bottomOrigin);
785  box[4].z = getZ(box[4].x, box[4].y, tSpace->unitNormal, tSpace->topOrigin);
786 
787  box[3].z = getZ(box[3].x, box[3].y, tSpace->unitNormal, tSpace->bottomOrigin);
788  box[7].z = getZ(box[7].x, box[7].y, tSpace->unitNormal, tSpace->topOrigin);
789 
790  box[2].z = getZ(box[2].x, box[2].y, tSpace->unitNormal, tSpace->bottomOrigin);
791  box[6].z = getZ(box[6].x, box[6].y, tSpace->unitNormal, tSpace->topOrigin);
792 
793  box[1].z = getZ(box[1].x, box[1].y, tSpace->unitNormal, tSpace->bottomOrigin);
794  box[5].z = getZ(box[5].x, box[5].y, tSpace->unitNormal, tSpace->topOrigin);
795  } else if (tSpace->XZ == tSpace->dir) { // iterate XZ plane
796  box[0].x = box[3].x = box[4].x = box[7].x = blockIdx.x*blockDim.x - cBlobRadius;
797  box[1].x = box[2].x = box[5].x = box[6].x = (blockIdx.x+1)*blockDim.x + cBlobRadius - 1.f;
798 
799  box[2].z = box[3].z = box[6].z = box[7].z = (blockIdx.y+1)*blockDim.y + cBlobRadius - 1.f;
800  box[0].z = box[1].z = box[4].z = box[5].z = blockIdx.y*blockDim.y - cBlobRadius;
801 
802  box[0].y = getY(box[0].x, box[0].z, tSpace->unitNormal, tSpace->bottomOrigin);
803  box[4].y = getY(box[4].x, box[4].z, tSpace->unitNormal, tSpace->topOrigin);
804 
805  box[3].y = getY(box[3].x, box[3].z, tSpace->unitNormal, tSpace->bottomOrigin);
806  box[7].y = getY(box[7].x, box[7].z, tSpace->unitNormal, tSpace->topOrigin);
807 
808  box[2].y = getY(box[2].x, box[2].z, tSpace->unitNormal, tSpace->bottomOrigin);
809  box[6].y = getY(box[6].x, box[6].z, tSpace->unitNormal, tSpace->topOrigin);
810 
811  box[1].y = getY(box[1].x, box[1].z, tSpace->unitNormal, tSpace->bottomOrigin);
812  box[5].y = getY(box[5].x, box[5].z, tSpace->unitNormal, tSpace->topOrigin);
813  } else { // iterate YZ plane
814  box[0].y = box[3].y = box[4].y = box[7].y = blockIdx.x*blockDim.x - cBlobRadius;
815  box[1].y = box[2].y = box[5].y = box[6].y = (blockIdx.x+1)*blockDim.x + cBlobRadius - 1.f;
816 
817  box[2].z = box[3].z = box[6].z = box[7].z = (blockIdx.y+1)*blockDim.y + cBlobRadius - 1.f;
818  box[0].z = box[1].z = box[4].z = box[5].z = blockIdx.y*blockDim.y - cBlobRadius;
819 
820  box[0].x = getX(box[0].y, box[0].z, tSpace->unitNormal, tSpace->bottomOrigin);
821  box[4].x = getX(box[4].y, box[4].z, tSpace->unitNormal, tSpace->topOrigin);
822 
823  box[3].x = getX(box[3].y, box[3].z, tSpace->unitNormal, tSpace->bottomOrigin);
824  box[7].x = getX(box[7].y, box[7].z, tSpace->unitNormal, tSpace->topOrigin);
825 
826  box[2].x = getX(box[2].y, box[2].z, tSpace->unitNormal, tSpace->bottomOrigin);
827  box[6].x = getX(box[6].y, box[6].z, tSpace->unitNormal, tSpace->topOrigin);
828 
829  box[1].x = getX(box[1].y, box[1].z, tSpace->unitNormal, tSpace->bottomOrigin);
830  box[5].x = getX(box[5].y, box[5].z, tSpace->unitNormal, tSpace->topOrigin);
831  }
832  // transform AABB to the image domain
833  rotate(box, tSpace->transformInv);
834  // AABB is projected on image. Create new AABB that will encompass all vertices
835  computeAABB(dest, box);
836 }
837 
841 __device__
842 bool isWithin(Point3D<float>* AABB, int imgXSize, int imgYSize) {
843  return (AABB[0].x < imgXSize)
844  && (AABB[1].x >= 0)
845  && (AABB[0].y < imgYSize)
846  && (AABB[1].y >= 0);
847 }
848 
855 __device__
856 void getImgData(Point3D<float>* AABB,
857  int tXindex, int tYindex,
858  RecFourierBufferDataGPU* const buffer, int imgIndex,
859  float& vReal, float& vImag) {
860  int imgXindex = tXindex + AABB[0].x;
861  int imgYindex = tYindex + AABB[0].y;
862  if ((imgXindex >=0)
863  && (imgXindex < buffer->fftSizeX)
864  && (imgYindex >=0)
865  && (imgYindex < buffer->fftSizeY)) {
866  int index = imgYindex * buffer->fftSizeX + imgXindex; // copy data from image
867  vReal = buffer->getNthItem(buffer->FFTs, imgIndex)[2*index];
868  vImag = buffer->getNthItem(buffer->FFTs, imgIndex)[2*index + 1];
869 
870  } else {
871  vReal = vImag = 0.f; // out of image bound, so return zero
872  }
873 }
874 
882 __device__
883 void copyImgToCache(float2* dest, Point3D<float>* AABB,
884  RecFourierBufferDataGPU* const buffer, int imgIndex,
885  int imgCacheDim) {
886  for (int y = threadIdx.y; y < imgCacheDim; y += blockDim.y) {
887  for (int x = threadIdx.x; x < imgCacheDim; x += blockDim.x) {
888  int memIndex = y * imgCacheDim + x;
889  getImgData(AABB, x, y, buffer, imgIndex, dest[memIndex].x, dest[memIndex].y);
890  }
891  }
892 }
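// All threads of the block cooperate on this copy, striding by blockDim in
// both directions; positions outside the image are stored as zeros by
// getImgData(), so the cache can be read without further bounds checks.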
893 
898 template<bool useFast, bool hasCTF, int blobOrder, bool useFastKaiser>
899 __global__
900 void processBufferKernel(
901  float* tempVolumeGPU, float *tempWeightsGPU,
902  RecFourierBufferDataGPU* buffer,
903  float* devBlobTableSqrt,
904  int imgCacheDim) {
905 #if SHARED_BLOB_TABLE
906  if ( ! useFast) {
907  // copy blob table to shared memory
908  volatile int id = threadIdx.y*blockDim.x + threadIdx.x;
909  volatile int blockSize = blockDim.x * blockDim.y;
910  for (int i = id; i < BLOB_TABLE_SIZE_SQRT; i+= blockSize)
911  BLOB_TABLE[i] = devBlobTableSqrt[i];
912  __syncthreads();
913  }
914 #endif
915 
916  for (int i = blockIdx.z; i < buffer->getNoOfSpaces(); i += gridDim.z) {
917  RecFourierProjectionTraverseSpace* space = &buffer->spaces[i];
918 
919 #if SHARED_IMG
920  if ( ! useFast) {
921  // make sure that all threads start at the same time
922  // as they can come from previous iteration
923  __syncthreads();
924  if ((threadIdx.x == 0) && (threadIdx.y == 0)) {
925  // first thread calculates which part of the image should be shared
926  calculateAABB(space, buffer, SHARED_AABB);
927  }
928  __syncthreads();
929  // check if the block will have to copy data from image
930  if (isWithin(SHARED_AABB, buffer->fftSizeX, buffer->fftSizeY)) {
931  // all threads copy image data to shared memory
932  copyImgToCache(IMG, SHARED_AABB, buffer, space->projectionIndex, imgCacheDim);
933  __syncthreads();
934  } else {
935  continue; // whole block can exit, as it's not reading from image
936  }
937  }
938 #endif
939 
940  processProjection<useFast, hasCTF, blobOrder, useFastKaiser>(
941  (float2*)tempVolumeGPU, tempWeightsGPU,
942  buffer->fftSizeX, buffer->fftSizeY,
943  buffer->getNthItem(buffer->CTFs, space->projectionIndex),
944  buffer->getNthItem(buffer->modulators, space->projectionIndex),
945  (float2*)buffer->getNthItem(buffer->FFTs, space->projectionIndex),
946  space,
947  devBlobTableSqrt,
948  imgCacheDim);
949  __syncthreads(); // sync threads to avoid write after read problems
950  }
951 }
952 
962 __global__
963 void convertImagesKernel(std::complex<float>* iFouriers, int iSizeX, int iSizeY, int iLength,
964  RecFourierBufferDataGPU* oBuffer, float maxResolutionSqr) {
965  // assign pixel to thread
966  volatile int idx = blockIdx.x*blockDim.x + threadIdx.x;
967  volatile int idy = blockIdx.y*blockDim.y + threadIdx.y;
968 
969  int halfY = iSizeY / 2;
970  float normFactor = iSizeY*iSizeY;
971  int oSizeX = oBuffer->fftSizeX;
972 
973  // input is an image in Fourier space (not normalized)
974  // with low frequencies in the inner corners
975  for (int n = 0; n < iLength; n++) {
976  float2 freq;
977  if ((idy < iSizeY) // for all input lines
978  && (idx < oSizeX)) { // for all output pixels in the line
979  // process line only if it can hold sufficiently high frequency, i.e. process only
980  // first and last N lines
981  if (idy < oSizeX || idy >= (iSizeY - oSizeX)) {
982  // check the frequency
983  freq.x = FFT_IDX2DIGFREQ(idx, iSizeY);
984  freq.y = FFT_IDX2DIGFREQ(idy, iSizeY);
985  if ((freq.x * freq.x + freq.y * freq.y) > maxResolutionSqr) {
986  continue;
987  }
988  // do the shift (lower line will move up, upper down)
989  int newY = (idy < halfY) ? (idy + oSizeX) : (idy - iSizeY + oSizeX);
990  int oIndex = newY*oSizeX + idx;
991 
992  int iIndex = n*iSizeY*iSizeX + idy*iSizeX + idx;
993  float* iValue = (float*)&(iFouriers[iIndex]);
994 
995  // copy data and perform normalization
996  oBuffer->getNthItem(oBuffer->FFTs, n)[2*oIndex] = iValue[0] / normFactor;
997  oBuffer->getNthItem(oBuffer->FFTs, n)[2*oIndex + 1] = iValue[1] / normFactor;
998  }
999  }
1000  }
1001 }
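// In effect the kernel crops each input FFT to the first/last oSizeX lines
// (the only ones that can hold frequencies below the resolution limit),
// fftshift-s the halves along Y so the low frequencies meet in the middle,
// drops samples beyond maxResolutionSqr and normalizes by 1/(iSizeY*iSizeY).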
1002 
1008 void convertImages(
1009  FRecBufferDataGPUWrapper* wrapper,
1010  float maxResolutionSqr,
1011  int streamIndex) {
1012 
1013  cudaStream_t stream = streams[streamIndex];
1014 
1015  RecFourierBufferDataGPU* hostBuffer = wrapper->cpuCopy;
1016  // store to proper structure
1017  GpuMultidimArrayAtGpu<float> imagesGPU(
1018  hostBuffer->paddedImgSize, hostBuffer->paddedImgSize, 1, hostBuffer->noOfImages, hostBuffer->paddedImages);
1019  // perform FFT
1020  GpuMultidimArrayAtGpu<std::complex<float> > resultingFFT;
1021  mycufftHandle myhandle;
1022  imagesGPU.fft(resultingFFT, myhandle);
1023  myhandle.clear(); // release unnecessary memory
1024  imagesGPU.d_data = NULL; // unbind the data
1025 
1026  // now we have performed FFTs of the input images
1027  // buffers have to be updated accordingly
1028  hostBuffer->hasFFTs = true;
1029  cudaMemsetAsync(hostBuffer->FFTs, 0, hostBuffer->getFFTsByteSize(), stream); // clear it, as kernel writes only to some parts
1030  wrapper->copyToDevice(streamIndex);
1031  gpuErrchk( cudaPeekAtLastError() );
1032 
1033  // run kernel, one thread for each pixel of input FFT
1034  dim3 dimBlock(BLOCK_DIM, BLOCK_DIM);
1035  dim3 dimGrid(ceil(resultingFFT.Xdim/(float)dimBlock.x), ceil(resultingFFT.Ydim/(float)dimBlock.y));
1036  convertImagesKernel<<<dimGrid, dimBlock, 0, stream>>>(
1037  resultingFFT.d_data, resultingFFT.Xdim, resultingFFT.Ydim, resultingFFT.Ndim,
1038  wrapper->gpuCopy, maxResolutionSqr);
1039  // now we have converted input images to FFTs in the required format
1040 }
1041 
1042 void waitForGPU() {
1043  gpuErrchk( cudaPeekAtLastError() );
1044  gpuErrchk( cudaDeviceSynchronize() );
1045 }
1046 
1047 void createStreams(int count) {
1048  wrappers = new FRecBufferDataGPUWrapper*[count];
1049  streams = new cudaStream_t[count];
1050  for (int i = 0; i < count; i++) {
1051  cudaStreamCreate(&streams[i]);
1052  }
1053 }
1054 
1055 void deleteStreams(int count) {
1056  for (int i = 0; i < count; i++) {
1057  cudaStreamDestroy(streams[i]);
1058  }
1059  delete[] streams;
1060  delete[] wrappers;
1061 }
1062 
1063 
1064 void pinMemory(RecFourierBufferData* buffer) {
1065  if (buffer->hasCTFs) {
1066  GPU::pinMemory(buffer->CTFs, buffer->getMaxByteSize(buffer->CTFs));
1067  GPU::pinMemory(buffer->modulators, buffer->getMaxByteSize(buffer->modulators));
1068  }
1069  if (buffer->hasFFTs) {
1070  GPU::pinMemory(buffer->FFTs, buffer->getMaxByteSize(buffer->FFTs));
1071  } else {
1072  GPU::pinMemory(buffer->paddedImages, buffer->getMaxByteSize(buffer->paddedImages));
1073  }
1074  GPU::pinMemory(buffer->spaces, buffer->getMaxByteSize(buffer->spaces));
1075  GPU::pinMemory(buffer, sizeof(*buffer));
1076 }
1077 
1078 void unpinMemory(RecFourierBufferData* buffer) {
1079  if (buffer->hasCTFs) {
1080  GPU::unpinMemory(buffer->CTFs);
1081  GPU::unpinMemory(buffer->modulators);
1082  }
1083  if (buffer->hasFFTs) {
1084  GPU::unpinMemory(buffer->FFTs);
1085  } else {
1086  GPU::unpinMemory(buffer->paddedImages);
1087  }
1088  GPU::unpinMemory(buffer->spaces);
1089  GPU::unpinMemory(buffer);
1090 }
1091 
1092 
1093 void allocateWrapper(RecFourierBufferData* buffer, int streamIndex) {
1094  wrappers[streamIndex] = new FRecBufferDataGPUWrapper(buffer);
1095  gpuErrchk( cudaPeekAtLastError() );
1096 }
1097 
1098 void copyBlobTable(float* blobTableSqrt, int blobTableSize) {
1099  cudaMalloc((void **) &devBlobTableSqrt, blobTableSize*sizeof(float));
1100  cudaMemcpy(devBlobTableSqrt, blobTableSqrt, blobTableSize*sizeof(float), cudaMemcpyHostToDevice);
1101  gpuErrchk( cudaPeekAtLastError() );
1102 }
1103 
1104 void releaseBlobTable() { // reconstructed: original signature missing from this listing
1105  cudaFree(devBlobTableSqrt);
1106  gpuErrchk( cudaPeekAtLastError() );
1107 }
1108 
1109 void releaseWrapper(int streamIndex) {
1110  delete wrappers[streamIndex];
1111 }
1112 
1113 void copyConstants(
1114  int maxVolIndexX, int maxVolIndexYZ,
1115  float blobRadius, float blobAlpha,
1116  float iDeltaSqrt, float iw0, float oneOverBessiOrderAlpha) {
1117  cudaMemcpyToSymbol(cMaxVolumeIndexX, &maxVolIndexX,sizeof(maxVolIndexX));
1118  cudaMemcpyToSymbol(cMaxVolumeIndexYZ, &maxVolIndexYZ,sizeof(maxVolIndexYZ));
1119  cudaMemcpyToSymbol(cBlobRadius, &blobRadius,sizeof(blobRadius));
1120  cudaMemcpyToSymbol(cBlobAlpha, &blobAlpha,sizeof(blobAlpha));
1121  cudaMemcpyToSymbol(cIw0, &iw0,sizeof(iw0));
1122  cudaMemcpyToSymbol(cIDeltaSqrt, &iDeltaSqrt,sizeof(iDeltaSqrt));
1123  cudaMemcpyToSymbol(cOneOverBessiOrderAlpha, &oneOverBessiOrderAlpha,sizeof(oneOverBessiOrderAlpha));
1124  float oneOverBlobRadiusSqr = 1.f / (blobRadius * blobRadius);
1125  cudaMemcpyToSymbol(cOneOverBlobRadiusSqr, &oneOverBlobRadiusSqr,sizeof(oneOverBlobRadiusSqr));
1126  gpuErrchk( cudaPeekAtLastError() );
1127 }
1128 
1135 template<int blobOrder, bool useFastKaiser>
1136 void processBufferGPU_(float* tempVolumeGPU, float* tempWeightsGPU,
1137  RecFourierBufferData* buffer,
1138  float blobRadius, int maxVolIndexYZ, bool useFast,
1139  float maxResolutionSqr, int streamIndex) {
1140 
1141  cudaStream_t stream = streams[streamIndex];
1142 
1143  // copy all data to gpu
1144  FRecBufferDataGPUWrapper* wrapper = wrappers[streamIndex];
1145  wrapper->copyFrom(buffer, streamIndex);
1146  wrapper->copyToDevice(streamIndex);
1147 
1148  // process input data if necessary
1149  if ( ! wrapper->cpuCopy->hasFFTs) {
1150  convertImages(wrapper, maxResolutionSqr, streamIndex);
1151  }
1152  // now wait till all necessary data are loaded to GPU (so that host can continue in work)
1153  cudaStreamSynchronize(stream);
1154 
1155  // enqueue kernel and return control
1156  int size2D = maxVolIndexYZ + 1;
1157  int imgCacheDim = ceil(sqrt(2.f) * sqrt(3.f) *(BLOCK_DIM + 2*blobRadius));
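 // the sqrt(2.f)*sqrt(3.f) factor appears to be a conservative bound on how
 // much the block footprint (BLOCK_DIM plus a blob apron on each side) can
 // stretch when its AABB is rotated into the image plane (see calculateAABB)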
1158  dim3 dimBlock(BLOCK_DIM, BLOCK_DIM);
1159  dim3 dimGrid(ceil(size2D/(float)dimBlock.x),ceil(size2D/(float)dimBlock.y), GRID_DIM_Z);
1160 
1161  // by using templates, we can save some registers, especially for 'fast' version
1162  if (useFast && buffer->hasCTFs) {
1163  processBufferKernel<true, true, blobOrder,useFastKaiser><<<dimGrid, dimBlock, 0, stream>>>(
1164  tempVolumeGPU, tempWeightsGPU,
1165  wrapper->gpuCopy,
1166  devBlobTableSqrt,
1167  imgCacheDim);
1168  return;
1169  }
1170  if (useFast && !buffer->hasCTFs) {
1171  processBufferKernel<true, false, blobOrder,useFastKaiser><<<dimGrid, dimBlock, 0, stream>>>(
1172  tempVolumeGPU, tempWeightsGPU,
1173  wrapper->gpuCopy,
1174  devBlobTableSqrt,
1175  imgCacheDim);
1176  return;
1177  }
1178  // if making copy of the image in shared memory, allocate enough space
1179  int sharedMemSize = SHARED_IMG ? (imgCacheDim*imgCacheDim*sizeof(float2)) : 0;
1180  if (!useFast && buffer->hasCTFs) {
1181  processBufferKernel<false, true, blobOrder,useFastKaiser><<<dimGrid, dimBlock, sharedMemSize, stream>>>(
1182  tempVolumeGPU, tempWeightsGPU,
1183  wrapper->gpuCopy,
1184  devBlobTableSqrt,
1185  imgCacheDim);
1186  return;
1187  }
1188  if (!useFast && !buffer->hasCTFs) {
1189  processBufferKernel<false, false, blobOrder,useFastKaiser><<<dimGrid, dimBlock, sharedMemSize, stream>>>(
1190  tempVolumeGPU, tempWeightsGPU,
1191  wrapper->gpuCopy,
1192  devBlobTableSqrt,
1193  imgCacheDim);
1194  return;
1195  }
1196 }
1197 
1198 void processBufferGPU(float* tempVolumeGPU, float* tempWeightsGPU,
1199  RecFourierBufferData* buffer,
1200  float blobRadius, int maxVolIndexYZ, bool useFast,
1201  float maxResolutionSqr, int streamIndex, int blobOrder, float blobAlpha) {
1202  switch (blobOrder) {
1203  case 0:
1204  if (blobAlpha <= 15.0) {
1205  processBufferGPU_<0, true>(tempVolumeGPU, tempWeightsGPU,
1206  buffer,
1207  blobRadius, maxVolIndexYZ, useFast,
1208  maxResolutionSqr,
1209  streamIndex);
1210  } else {
1211  processBufferGPU_<0, false>(tempVolumeGPU, tempWeightsGPU,
1212  buffer,
1213  blobRadius, maxVolIndexYZ, useFast,
1214  maxResolutionSqr,
1215  streamIndex);
1216  }
1217  break;
1218  case 1:
1219  processBufferGPU_<1, false>(tempVolumeGPU, tempWeightsGPU,
1220  buffer,
1221  blobRadius, maxVolIndexYZ, useFast,
1222  maxResolutionSqr,
1223  streamIndex);
1224  break;
1225  case 2:
1226  processBufferGPU_<2, false>(tempVolumeGPU, tempWeightsGPU,
1227  buffer,
1228  blobRadius, maxVolIndexYZ, useFast,
1229  maxResolutionSqr,
1230  streamIndex);
1231  break;
1232  case 3:
1233  processBufferGPU_<3, false>(tempVolumeGPU, tempWeightsGPU,
1234  buffer,
1235  blobRadius, maxVolIndexYZ, useFast,
1236  maxResolutionSqr,
1237  streamIndex);
1238  break;
1239  case 4:
1240  processBufferGPU_<4, false>(tempVolumeGPU, tempWeightsGPU,
1241  buffer,
1242  blobRadius, maxVolIndexYZ, useFast,
1243  maxResolutionSqr,
1244  streamIndex);
1245  break;
1246  default:
1247  REPORT_ERROR(ERR_VALUE_INCORRECT, "blobOrder out of range [0..4]");
1248  }
1249 }
1250 
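For orientation, the host-side entry points defined in this file are meant to be driven roughly in the order below. This is a minimal sketch, not code from this file: reconstructSketch and all parameter values are illustrative assumptions, buffer is assumed to be prepared by the caller, and error handling is omitted.

// Illustrative driver only; every name except the functions defined above is an assumption.
void reconstructSketch(RecFourierBufferData* buffer,
        int maxVolIndexX, int maxVolIndexYZ,
        float blobRadius, float blobAlpha, float iDeltaSqrt,
        float iw0, float oneOverBessiOrderAlpha,
        float* blobTableSqrt, int blobTableSize,
        float maxResolutionSqr, int blobOrder) {
    const int noOfStreams = 1, streamIndex = 0;
    createStreams(noOfStreams);
    allocateWrapper(buffer, streamIndex);
    pinMemory(buffer); // page-lock host data so cudaMemcpyAsync can overlap
    copyConstants(maxVolIndexX, maxVolIndexYZ, blobRadius, blobAlpha,
            iDeltaSqrt, iw0, oneOverBessiOrderAlpha);
    copyBlobTable(blobTableSqrt, blobTableSize);

    int size = maxVolIndexYZ + 1;
    float* volGPU = NULL;     // complex volume -> 2 floats per voxel
    float* weightsGPU = NULL; // real weights   -> 1 float per voxel
    allocateTempVolumeGPU(volGPU, size, 2 * (int)sizeof(float));
    allocateTempVolumeGPU(weightsGPU, size, (int)sizeof(float));

    // typically called repeatedly, once per loaded buffer of projections
    processBufferGPU(volGPU, weightsGPU, buffer, blobRadius, maxVolIndexYZ,
            false /*useFast*/, maxResolutionSqr, streamIndex,
            blobOrder, blobAlpha);
    waitForGPU();

    // ... copyTempVolumes(...) would now download both volumes to the host ...

    releaseTempVolumeGPU(volGPU);
    releaseTempVolumeGPU(weightsGPU);
    releaseBlobTable(); // frees devBlobTableSqrt (see above)
    unpinMemory(buffer);
    releaseWrapper(streamIndex);
    deleteStreams(noOfStreams);
}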