Xmipp  v3.23.11-Nereus
cuda_gpu_correlation.cpp
Go to the documentation of this file.
1 //Host includes
2 #include "cuda_gpu_correlation.h"
3 
4 #include <iostream>
5 #include <stdio.h>
6 #include <math.h>
7 #include <algorithm>
8 
9 
10 //CUDA includes
11 #include <cuda_runtime.h>
12 #include <cufft.h>
13 #include "cuda_asserts.h"
14 #include "cuda_basic_math.h"
15 #include <time.h>
16 #include <sys/time.h>
17 #include <vector>
18 
19 #define PI 3.14159265359
20 
21 
// Copies the real component of each complex element of d_in into d_out.
// NOTE: despite the name, no modulus is computed — only the .x (real) part
// of each cufftComplex is stored.
// One thread per element; `dim` is the total element count.
__global__ void calcAbsKernel(cufftComplex *d_in, float *d_out, int dim){

    int gid = blockDim.x * blockIdx.x + threadIdx.x;

    if (gid < dim)
        d_out[gid] = d_in[gid].x;

}
32 
// For each (image, angle) pair, accumulates the values of d_in over all radii
// and tracks their maximum, skipping sentinel entries equal to -1.0f.
// d_in layout: [image][radius][angle], 360 angles per radius row.
//   d_out      : per-(image,angle) sum over the radii
//   d_out_max  : per-(image,angle) maximum over the radii
//   d_out_zero : the radius-0 value (written only when it is not the sentinel)
// dim is the total number of (image,angle) pairs (360 * number of images);
// ndim is unused but kept for interface compatibility.
__global__ void sumRadiusKernel(float *d_in, float *d_out, float *d_out_max, float *d_out_zero,
        int dim, int radius, int ndim){

    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    int numIm = floorf(idx/360.0f);
    int angle = idx%360;

    if(idx>=dim)
        return;

    float out = 0.0f;
    float out_max = -100000.0f;
    int idxRead=360*radius*numIm;
    for(int i=0; i<radius; i++){
        // Read once per iteration instead of four global loads of the same element.
        float val = d_in[idxRead+(360*i)+angle];
        if(val==-1.0f){
            continue;
        }
        out += val;
        // BUGFIX: compare against the local running maximum (out_max), not the
        // output buffer d_out_max[idx], whose previous contents are stale or
        // uninitialized; the old comparison could leave out_max at the sentinel.
        if(val>out_max){
            out_max = val;
        }

        if(i==0)
            d_out_zero[idx] = val;
    }
    d_out[idx] = out;
    d_out_max[idx] = out_max;

}
62 
63 
// Block-wise max-reduction with argmax tracking over Ndim stacked images of
// yxdim elements each. Dynamic shared memory holds 2*blockDim.x floats:
//   sdata[0 .. blockSize)             -> candidate values
//   sdata[blockSize .. 2*blockSize)   -> source positions of those candidates
// On the first call positions start as the thread indices; on later passes
// they are carried over from `position`. Each block writes one (value,
// position) pair per image into d_out / position.
// NOTE(review): the idx<32 tail omits __syncthreads()/__syncwarp() and relies
// on implicit warp-lockstep execution — unsafe under Volta+ independent
// thread scheduling; confirm the targeted compute capabilities.
__global__ void calculateMax2(float *d_in, float *d_out, float *position, int yxdim, int Ndim, bool firstCall){

    extern __shared__ float sdata[];

    int idx = threadIdx.x;
    int blockSize = blockDim.x;


    //Version 6
    // Flat input index of this thread within one image's data.
    int i = blockIdx.x * blockSize + idx;

    //printf("d_in[%i] %f \n", i, d_in[i]);

    // `index` advances by yxdim per image, so i+index walks image imN.
    int index = 0;
    for(int imN=0; imN<Ndim; imN++){

        if(i<yxdim*gridDim.x){
            sdata[idx]=d_in[i+index];

            if(firstCall)
                sdata[idx+blockSize] = (float)idx; //AJ position
            else
                sdata[idx+blockSize] = position[i+index];
        }
        //if (idx==0)
        //printf("i %i, sdata %f \n", i, sdata[idx]);

        __syncthreads();

        // Out-of-range lanes take a sentinel lower than any stored value.
        if(i>=yxdim*gridDim.x)
            sdata[idx]=-1.0;
        __syncthreads();


        // Tree reduction: halve the active range each step, keeping the max
        // and the position it came from.
        if(blockSize >= 1024){
            if(idx<512){
                sdata[idx] = fmaxf(sdata[idx], sdata[idx+512]);
                sdata[idx+blockSize] = (sdata[idx]==sdata[idx+512]) ? sdata[idx+blockSize+512] : sdata[idx+blockSize];
            }
            __syncthreads();
        }
        if(blockSize >= 512){
            if(idx<256){
                sdata[idx] = fmaxf(sdata[idx], sdata[idx+256]);
                sdata[idx+blockSize] = (sdata[idx]==sdata[idx+256]) ? sdata[idx+blockSize+256] : sdata[idx+blockSize];
            }
            __syncthreads();
        }
        if(blockSize >= 256){
            if(idx<128){
                sdata[idx] = fmaxf(sdata[idx], sdata[idx+128]);
                sdata[idx+blockSize] = (sdata[idx]==sdata[idx+128]) ? sdata[idx+blockSize+128] : sdata[idx+blockSize];
            }
            __syncthreads();
        }
        if(blockSize >= 128){
            if(idx<64){
                sdata[idx] = fmaxf(sdata[idx], sdata[idx+64]);
                sdata[idx+blockSize] = (sdata[idx]==sdata[idx+64]) ? sdata[idx+blockSize+64] : sdata[idx+blockSize];
            }
            __syncthreads();
        }
        // Final warp-level steps without barriers (see NOTE above).
        if(idx<32){
            if(blockSize>=64){
                if(idx<32){
                    sdata[idx] = fmaxf(sdata[idx], sdata[idx+32]);
                    sdata[idx+blockSize] = (sdata[idx]==sdata[idx+32]) ? sdata[idx+blockSize+32] : sdata[idx+blockSize];
                }
            }
            if(blockSize>=32){
                if(idx<16){
                    sdata[idx] = fmaxf(sdata[idx], sdata[idx+16]);
                    sdata[idx+blockSize] = (sdata[idx]==sdata[idx+16]) ? sdata[idx+blockSize+16] : sdata[idx+blockSize];
                }
            }
            if(blockSize>=16){
                if(idx<8){
                    sdata[idx] = fmaxf(sdata[idx], sdata[idx+8]);
                    sdata[idx+blockSize] = (sdata[idx]==sdata[idx+8]) ? sdata[idx+blockSize+8] : sdata[idx+blockSize];
                }
            }
            if(blockSize>=8){
                if(idx<4){
                    sdata[idx] = fmaxf(sdata[idx], sdata[idx+4]);
                    sdata[idx+blockSize] = (sdata[idx]==sdata[idx+4]) ? sdata[idx+blockSize+4] : sdata[idx+blockSize];
                }
            }
            if(blockSize>=4){
                if(idx<2){
                    sdata[idx] = fmaxf(sdata[idx], sdata[idx+2]);
                    sdata[idx+blockSize] = (sdata[idx]==sdata[idx+2]) ? sdata[idx+blockSize+2] : sdata[idx+blockSize];
                }
            }
            if(blockSize>=2){
                if(idx<1){
                    sdata[idx] = fmaxf(sdata[idx], sdata[idx+1]);
                    sdata[idx+blockSize] = (sdata[idx]==sdata[idx+1]) ? sdata[idx+blockSize+1] : sdata[idx+blockSize];
                }
            }
        }

        // Thread 0 publishes this block's (max, position) for image imN.
        if(idx==0){
            //printf("idx %i sdata[0] %f sdata[blockSize] %f \n", blockIdx.x+(gridDim.x*imN), sdata[0], sdata[blockSize]);
            d_out[blockIdx.x+(gridDim.x*imN)] = sdata[0];
            position[blockIdx.x+(gridDim.x*imN)] = sdata[blockSize];
        }

        __syncthreads();

        index = index+(int)yxdim;

    }

}
178 
179 
        // NOTE(review): the opening line of this kernel's signature is missing
        // from this extract; judging from the host-side launch it is
        // pointwiseMultiplicationComplexOneManyKernel_three(cufftComplex *M, ...
        // taking the mask FFT M as first argument — confirm against the repo.
        // One block per image; threads stride over the image's yxdim
        // coefficients, multiplying by M and scaling by 1/yxdim. Block 0 also
        // fills maskAux with M*M (the mask autocorrelation spectrum).
        cufftComplex *manyF, cufftComplex *MmanyF, cufftComplex *manyF_sq, cufftComplex *MmanyF_sq,
        cufftComplex *maskAux, int nzyxdim, int yxdim, int ndim, bool power2)
{

    int idx = threadIdx.x;
    int nIm = blockIdx.x;
    int blockSize = blockDim.x;

    // Base index of this thread within image nIm, and iteration count.
    int posTh = nIm*yxdim + idx;
    int n = ceilf(((float)yxdim/(float)blockSize));

    if (idx>=yxdim)
        return;

    // 1/yxdim compensates cuFFT's unnormalized inverse transform.
    float normFactor = 1.0f/yxdim;

    int myIdx = posTh;
    int myIdxMask = idx;

    // First (unguarded) iteration: idx < yxdim is guaranteed above.
    cufftComplex myM = M[myIdxMask];
    cuComplex mulOut = cuCmulf(manyF[myIdx], myM);
    cuComplex mulOut_sq = cuCmulf(manyF_sq[myIdx], myM);

    MmanyF[myIdx] = mulOut*normFactor;
    MmanyF_sq[myIdx] = mulOut_sq*normFactor;
    if(nIm==0)
        maskAux[myIdx] = cuCmulf(myM, myM)*normFactor;

    myIdx+=blockSize;
    myIdxMask+=blockSize;

    // Remaining strided iterations, guarded against running past this image.
    for (int i=1; i<n; i++){

        if (posTh+i*blockSize < yxdim*(nIm+1)){

            myM = M[myIdxMask];
            mulOut = cuCmulf(manyF[myIdx], myM);
            mulOut_sq = cuCmulf(manyF_sq[myIdx], myM);

            MmanyF[myIdx] = mulOut*normFactor;
            MmanyF_sq[myIdx] = mulOut_sq*normFactor;
            if(nIm==0)
                maskAux[myIdx] = cuCmulf(myM, myM)*normFactor;

            myIdx+=blockSize;
            myIdxMask+=blockSize;

        }
    }
}
231 
232 
// Multiplies every image FFT in manyF (and the FFT of the squared image in
// manyF_sq) by the single mask FFT M, scaling by 1/yxdim (cuFFT inverse
// normalization). One block per image; each thread strides over the image's
// yxdim coefficients. nzyxdim, ndim and power2 are unused here but kept for
// interface compatibility.
__global__ void pointwiseMultiplicationComplexOneManyKernel_two(cufftComplex *M,
        cufftComplex *manyF, cufftComplex *MmanyF, cufftComplex *manyF_sq, cufftComplex *MmanyF_sq,
        int nzyxdim, int yxdim, int ndim, bool power2)
{

    int tid = threadIdx.x;
    int img = blockIdx.x;
    int stride = blockDim.x;

    if (tid >= yxdim)
        return;

    float scale = 1.0f / yxdim;
    int base = img * yxdim;

    // Strided walk over this image's coefficients; k indexes the shared mask,
    // base+k the per-image arrays.
    for (int k = tid; k < yxdim; k += stride)
    {
        cufftComplex m = M[k];
        cuComplex prod = cuCmulf(manyF[base + k], m);
        cuComplex prod_sq = cuCmulf(manyF_sq[base + k], m);

        MmanyF[base + k] = prod * scale;
        MmanyF_sq[base + k] = prod_sq * scale;
    }
}
280 
281 
282 
// Multiplies each element of the stacked FFTs in manyF by the corresponding
// element of the single mask FFT M (repeated every yxdim entries), scaling
// by 1/yxdim (cuFFT inverse normalization). power2 selects a cheap bitmask
// in place of % when yxdim is a power of two.
__global__ void pointwiseMultiplicationComplexOneManyKernel(cufftComplex *M, cufftComplex *manyF, cufftComplex *MmanyF,
        int nzyxdim, int yxdim, bool power2)
{

    int gid = blockDim.x * blockIdx.x + threadIdx.x;

    if (gid >= nzyxdim)
        return;

    // Index into the single (one-image) mask spectrum.
    int maskIdx = power2 ? (gid & (yxdim - 1)) : (gid % yxdim);

    float scale = 1.0f / yxdim;
    RefOut: ;
    MmanyF[gid] = cuCmulf(manyF[gid], M[maskIdx]) * scale;

}
303 
304 //NOW WE DONT USE THIS KERNEL
// Element-wise NCC denominator term: sqrt(E[x^2] - E[x]^2 / maskAC), where
// MFrealSpace / MF2realSpace are the masked image and squared-image
// correlations and maskAutocorrelation (one image, repeated every yxdim
// elements) is the mask overlap count. power2 enables bitmask indexing when
// yxdim is a power of two. (Marked unused by the original authors.)
__global__ void calculateDenomFunctionKernel(float *MFrealSpace, float *MF2realSpace, float *maskAutocorrelation,
        float *out, int nzyxdim, int yxdim, bool power2)
{
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    int idxLow;
    if (power2==true)
        idxLow = idx & (yxdim-1);
    else
        idxLow = idx%yxdim;

    if (idx>=nzyxdim)
        return;

    // FIX: use single-precision sqrtf — the double-precision sqrt silently
    // promoted the whole expression to double inside this float kernel.
    out[idx] = sqrtf(MF2realSpace[idx] - (MFrealSpace[idx]*MFrealSpace[idx]/maskAutocorrelation[idxLow]));

}
321 
322 
// Computes the normalized cross-correlation (NCC) map between reference and
// experimental images from their mask-windowed correlation terms.
//   RefExpRealSpace      : real-space cross-correlation of ref and exp
//   MFrealSpace*/MF2*    : masked correlation terms and their squared versions
//   mask                 : mask autocorrelation (single image, shared by all)
//   NCC                  : output map; -1 marks rejected/invalid positions
// Positions whose wrap-around shift exceeds max_shift are rejected, as are
// positions with an unstable denominator or mask overlap below 90% of
// maskCount. power2yx / power2x enable bitmask indexing when yxdim / xdim
// are powers of two.
__global__ void calculateNccKernel(float *RefExpRealSpace, float *MFrealSpaceRef, float *MFrealSpaceExp,
        float *MF2realSpaceRef, float *MF2realSpaceExp, float *mask, float *NCC,
        int nzyxdim, int yxdim, int xdim, int ydim, int maskCount, int max_shift, bool power2yx, bool power2x)
{

    //change unsigned... and size_t
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    int idxLow;
    if (power2yx==true)
        idxLow = idx & (yxdim-1);
    else
        idxLow = idx%yxdim;

    if(idx>=nzyxdim)
        return;

    int idx_x;
    if (power2x==true)
        idx_x = idxLow & (xdim-1);
    else
        idx_x = idxLow%xdim;

    // Shifts live at the borders of the wrapped correlation map; the central
    // band corresponds to shifts larger than max_shift and is rejected.
    int idx_y=idxLow/xdim;
    if(idx_x>=max_shift && idx_x<xdim-max_shift){
        NCC[idx] = -1;
        return;
    }
    if(idx_y>=max_shift && idx_y<ydim-max_shift){
        NCC[idx] = -1;
        return;
    }

    float maskTh = mask[idxLow];
    float MF2realSpaceRefTh = MF2realSpaceRef[idx];
    float MFrealSpaceRefTh = MFrealSpaceRef[idx];
    float MF2realSpaceExpTh = MF2realSpaceExp[idx];
    float MFrealSpaceExpTh = MFrealSpaceExp[idx];
    float RefExpRealSpaceTh = RefExpRealSpace[idx];

    // FIX: single-precision sqrtf and float literals avoid silently promoting
    // these expressions to double inside a float kernel.
    float den1 = sqrtf(MF2realSpaceRefTh - (MFrealSpaceRefTh*MFrealSpaceRefTh/maskTh));
    float den2 = sqrtf(MF2realSpaceExpTh - (MFrealSpaceExpTh*MFrealSpaceExpTh/maskTh));

    float num;
    if(den1!=0.0f && den2!=0.0f && !isnan(den1) && !isnan(den2) && maskTh>maskCount*0.9f){
        num = (RefExpRealSpaceTh - ((MFrealSpaceRefTh*MFrealSpaceExpTh)/(maskTh)) );
        NCC[idx] = num/(den1*den2);
    }else
        NCC[idx] = -1;

}
373 
// Mirror-wraps a 2D coordinate (x, y) back into [0, xdim) x [0, ydim):
// negative values reflect about 0, values past the edge reflect about the
// last valid index.
__device__ void wrapping (int &x, int &y, int xdim, int ydim){
    if (x < 0)
        x = -x;
    else if (x >= xdim)
        x = 2*xdim - x - 1;   // same as xdim-(x-xdim)-1

    if (y < 0)
        y = -y;
    else if (y >= ydim)
        y = 2*ydim - y - 1;
}
385 
// Applies a per-image 3x3 affine transform (transMat, row-major, 9 floats per
// image) to each image of d_in using bilinear interpolation with mirror
// wrapping at the borders, writing the result to d_out.
// Grid: blockIdx.y selects the image; one thread per output pixel.
// Dynamic shared memory: 9 floats staging this image's transform.
// power2x enables bitmask indexing when xdim is a power of two; power2yx and
// nzyxdim are unused but kept for interface compatibility.
__global__ void applyTransformKernel(float *d_in, float *d_out, float *transMat, int nzyxdim, int yxdim,
        int xdim, int ydim, bool power2yx, bool power2x)
{

    extern __shared__ float transMatSD[];

    int idxIm = blockDim.x * blockIdx.x + threadIdx.x;
    int idxTh = threadIdx.x;
    int numIm = blockIdx.y;

    // Stage the 3x3 matrix in shared memory.
    if(idxTh<9)
        transMatSD[idxTh] = transMat[idxTh+(numIm*9)];
    // BUGFIX: the barrier must be reached by every thread of the block, so the
    // out-of-range bounds exit may only happen AFTER __syncthreads(); the old
    // code returned first, leaving a divergent barrier (undefined behavior).
    __syncthreads();

    if(idxIm>=yxdim)
        return;

    float x;
    if (power2x==true)
        x = idxIm & (xdim-1);
    else
        x = idxIm%xdim;

    float y = idxIm/xdim;
    float x_orig = 0;
    float y_orig = 0;

    // Undo the translation, then rotate about the image center.
    x -= transMatSD[2];
    y -= transMatSD[5];

    x = x - xdim/2.0f;
    y = y - ydim/2.0f;

    x_orig += transMatSD[0]*x - transMatSD[1]*y + xdim/2.0f;
    // BUGFIX: recenter y with ydim (was xdim), matching the commented-out
    // reference implementation; the old code was wrong for non-square images.
    y_orig += -transMatSD[3]*x + transMatSD[4]*y + ydim/2.0f;

    // Four neighbouring source pixels for bilinear interpolation.
    int x_orig00 = (int)floorf(x_orig);
    int y_orig00 = (int)floorf(y_orig);
    int x_orig01 = x_orig00+1;
    int y_orig01 = y_orig00;
    int x_orig10 = x_orig00;
    int y_orig10 = y_orig00+1;
    int x_orig11 = x_orig00+1;
    int y_orig11 = y_orig00+1;

    float x_x_low=x_orig-x_orig00;
    float y_y_low=y_orig-y_orig00;
    float one_x=1.0f-x_x_low;
    float one_y=1.0f-y_y_low;
    float w00=one_y*one_x;
    float w01=one_y*x_x_low;
    float w10=y_y_low*one_x;
    float w11=y_y_low*x_x_low;

    // Mirror out-of-range neighbours back into the image.
    wrapping (x_orig00, y_orig00, xdim, ydim);
    wrapping (x_orig01, y_orig01, xdim, ydim);
    wrapping (x_orig10, y_orig10, xdim, ydim);
    wrapping (x_orig11, y_orig11, xdim, ydim);

    int imgIdx00=y_orig00 * xdim + x_orig00;
    int imgIdx01=y_orig01 * xdim + x_orig01;
    int imgIdx10=y_orig10 * xdim + x_orig10;
    int imgIdx11=y_orig11 * xdim + x_orig11;

    int imgOffset = numIm*yxdim;
    float I00 = d_in[imgIdx00+imgOffset];
    float I01 = d_in[imgIdx01+imgOffset];
    float I10 = d_in[imgIdx10+imgOffset];
    float I11 = d_in[imgIdx11+imgOffset];
    float imVal = I00*w00 + I01*w01 + I10*w10 + I11*w11;

    d_out[idxIm+imgOffset] = imVal;

}
464 
465 
466 
// Computes the rotational NCC for every angular shift from the polar FFTs.
// idxLow picks one coefficient per image (stride yxdimFFT) from the polar
// spectra — presumably the DC term holding the polar-domain sums; confirm
// against the FFT layout used by the caller. maskFFTPolarReal is the scalar
// mask term; everything is normalized by 1/yxdimFFT (cuFFT inverse scale).
// NCC[idx] is set to -1 when the denominator is zero or NaN.
__global__ void calculateNccRotationKernel(float *RefExpRealSpace, cufftComplex *polarFFTRef, cufftComplex *polarFFTExp,
        cufftComplex *polarSquaredFFTRef, cufftComplex *polarSquaredFFTExp, float maskFFTPolarReal, float *NCC,
        int yxdimFFT, int nzyxdim, int yxdim)
{

    //change unsigned lont int for int and all the size_t
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    int idxLow = (idx/yxdim)*yxdimFFT;

    if(idx>=nzyxdim)
        return;

    float normValue = 1.0f/yxdimFFT;
    float maskNorm = maskFFTPolarReal*normValue;

    float M1M2Polar = maskFFTPolarReal*maskNorm;
    float polarValRef = cuCrealf(polarFFTRef[idxLow])*maskNorm;
    float polarSqValRef = cuCrealf(polarSquaredFFTRef[idxLow])*maskNorm;

    float polarValExp = cuCrealf(polarFFTExp[idxLow])*maskNorm;
    float polarSqValExp = cuCrealf(polarSquaredFFTExp[idxLow])*maskNorm;

    float num = (RefExpRealSpace[idx] - (polarValRef*polarValExp/M1M2Polar) );
    // FIX: single-precision sqrtf avoids promoting these float expressions
    // to double inside the kernel.
    float den1 = sqrtf(polarSqValRef - (polarValRef*polarValRef/M1M2Polar) );
    float den2 = sqrtf(polarSqValExp - (polarValExp*polarValExp/M1M2Polar) );

    if(den1!=0.0f && den2!=0.0f && !isnan(den1) && !isnan(den2))
        NCC[idx] = num/(den1*den2);
    else
        NCC[idx] = -1.0f;

}
499 
500 
// Element-wise complex product of the reference and experimental FFT stacks,
// scaled by 1/yxdim (cuFFT inverse normalization), stored in RefExpFourier.
__global__ void pointwiseMultiplicationComplexKernel(cufftComplex *reference, cufftComplex *experimental,
        cufftComplex *RefExpFourier, int nzyxdim, int yxdim)
{
    int gid = blockDim.x * blockIdx.x + threadIdx.x;

    if (gid >= nzyxdim)
        return;

    float scale = 1.0f / yxdim;
    cuComplex prod = cuCmulf(reference[gid], experimental[gid]);
    RefExpFourier[gid] = prod * scale;
}
514 
515 //TODO: try this instead with blockIdx.y being the number of images maybe with a subset,
516 //i.e., every thread processing a bunch of images, instead of just one
// Masks every input image, squares the masked value, and writes both into the
// center of the (pre-zeroed) padded buffers. Experimental images are written
// mirrored (180-degree rotation) so that FFT correlation becomes convolution,
// with a one-pixel origin shift when xdim is even to stay aligned with the
// unmirrored mask padding. padded_mask_gpu is written once (image 0) and may
// be NULL. Grid: blockIdx.y selects the image; one thread per input pixel.
// power2x enables bitmask indexing when xdim is a power of two.
__global__ void maskingPaddingKernel(float *d_in, float *mask, float *padded_image_gpu,
        float *padded_image2_gpu, float *padded_mask_gpu, int xdim, int ydim, int yxdim,
        int numImag, int pad_xdim, int pad_ydim, int pad_yxdim, bool experimental, bool power2x){

    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    int numIm = blockIdx.y;

    if(idx>=yxdim)
        return;

    int x_idx1;
    if (power2x==true)
        x_idx1 = idx & (xdim-1);
    else
        x_idx1 = idx%xdim;

    int y_idx1 = idx/xdim;
    // Destination pixel inside the (unpadded) image: mirrored for
    // experimental images, identity otherwise.
    int idxWriteToMask;
    if(experimental)
        idxWriteToMask = (ydim-1 - y_idx1)*xdim + (xdim-1 - x_idx1);
    else
        idxWriteToMask = y_idx1*xdim + x_idx1;

    // Top-left corner of the centered region inside the padded buffer.
    int xdim2Im = (int)floorf((pad_xdim-xdim)/2.0f);
    int ydim2Im = (int)floorf((pad_ydim-ydim)/2.0f);
    int xdim2Mask = xdim2Im;
    int ydim2Mask = ydim2Im;
    // Even-sized experimental (mirrored) images need a one-pixel shift.
    if(experimental && (xdim&1)==0){
        xdim2Im+=1;
        ydim2Im+=1;
    }

    int x_idx;
    if (power2x==true)
        x_idx = idxWriteToMask & (xdim-1);
    else
        x_idx = idxWriteToMask%xdim;

    int y_idx = idxWriteToMask/xdim;
    int idxWrite;
    int idxWriteMask;
    float d_out, d_out2;

    int offset=0;
    //for(int j=0; j<numImag; j++){

    // One image per grid row (blockIdx.y) replaces the old j-loop.
    int j = numIm;

    offset=j*yxdim;

    // Masked value and its square.
    d_out = d_in[idx+offset]*mask[idx];
    d_out2 = d_out*d_out;

    idxWrite = (pad_yxdim*j) + (ydim2Im*pad_xdim) + (y_idx*pad_xdim) + xdim2Im + x_idx;
    if((xdim&1)==0)
        idxWriteMask = (pad_yxdim*j) + (ydim2Mask*pad_xdim) + (y_idx*pad_xdim) + xdim2Mask + x_idx;
    else
        idxWriteMask = idxWrite;
    padded_image_gpu[idxWrite] = d_out;
    padded_image2_gpu[idxWrite] = d_out2;
    if(j==0 && padded_mask_gpu!=NULL)
        padded_mask_gpu[idxWriteMask] = mask[idx];

    //}
}
582 
583 
// Converts each image's NCC argmax position (d_pos) into a signed (x, y)
// shift and composes the resulting translation with the image's previous
// transform (lastMat, 3x3 row-major, 9 floats per image), storing it in
// `result`. The padded correlation map wraps around, so each quadrant of the
// position maps to a different signed shift; fixPadding compensates the
// padding offset. If the accumulated shift (squared) exceeds maxShift2 the
// previous transform is kept and maxGpu[idx] is replaced by the NCC value at
// the map origin. One thread per image. power2x enables bitmask indexing
// when Xdim is a power of two.
__global__ void buildTranslationMatrix (float *d_pos, float* lastMat, float* result,
        float *maxGpu, float *NCC, int Xdim, int Ydim, int Ndim, int NCC_yxdim,
        int fixPadding, double maxShift2, bool power2x){

    int idx = blockDim.x * blockIdx.x + threadIdx.x;

    if(idx>=Ndim)
        return;

    int position = (int)d_pos[idx];

    float posX_aux;
    if (power2x==true)
        posX_aux = position & (Xdim-1);
    else
        posX_aux = position%Xdim;

    float posY_aux = (float)(position/Xdim);
    float Xdim2 = (Xdim/2.0f);
    float Ydim2 = (Ydim/2.0f);
    float posX, posY;

    // Map the wrapped map position to a signed shift, quadrant by quadrant
    // (the four branches are exhaustive).
    if(posX_aux>=Xdim2 && posY_aux>=Ydim2){
        posX = Xdim-1-posX_aux;
        posY = Ydim-1-posY_aux;
    }else if(posX_aux<Xdim2 && posY_aux>=Ydim2){
        posX = -(posX_aux+1);
        posY = Ydim-1-posY_aux;
    }else if(posX_aux<Xdim2 && posY_aux<Ydim2){
        posX = -(posX_aux+1);
        posY = -(posY_aux+1);
    }else if(posX_aux>=Xdim2 && posY_aux<Ydim2){
        posX = Xdim-1-posX_aux;
        posY = -(posY_aux+1);
    }

    //Fixing padding problem
    posX+=fixPadding;
    posY+=fixPadding;

    int idx9 = idx*9;

    // New transform = pure translation by (-posX, -posY); bottom row of the
    // 3x3 matrix is the implicit (0, 0, 1).
    float new0 = 1.0f;
    float new1 = 0.0f;
    float new2 = -posX;
    float new3 = 0.0f;
    float new4 = 1.0f;
    float new5 = -posY;
    //float new6 = 0.0f;
    //float new7 = 0.0f;
    //float new8 = 1.0f;

    float last0 = lastMat[idx9];
    float last1 = lastMat[idx9+1];
    float last2 = lastMat[idx9+2];
    float last3 = lastMat[idx9+3];
    float last4 = lastMat[idx9+4];
    float last5 = lastMat[idx9+5];
    float last6 = lastMat[idx9+6];
    float last7 = lastMat[idx9+7];
    float last8 = lastMat[idx9+8];

    // Translation column of the composed matrix (new * last).
    float shiftx = new0*last2 + new1*last5 + new2*last8;
    float shifty = new3*last2 + new4*last5 + new5*last8;
    float radShift = shiftx*shiftx + shifty*shifty;
    if(radShift > maxShift2){
        // Shift too large: keep the previous transform and report the NCC
        // value at zero shift instead of the rejected maximum.
        result[idx9] = last0;
        result[idx9+1] = last1;
        result[idx9+2] = last2;
        result[idx9+3] = last3;
        result[idx9+4] = last4;
        result[idx9+5] = last5;
        maxGpu[idx] = NCC[idx*NCC_yxdim];
    }else{
        // Store the upper 2x3 of new*last (bottom row stays 0 0 1).
        result[idx9] = new0*last0 + new1*last3 + new2*last6;
        result[idx9+2] = shiftx;
        result[idx9+1] = new0*last1 + new1*last4 + new2*last7;
        result[idx9+3] = new3*last0 + new4*last3 + new5*last6;
        result[idx9+4] = new3*last1 + new4*last4 + new5*last7;
        result[idx9+5] = shifty;
    }

}
667 
668 
669 
// Converts each image's best rotation angle (d_pos, index into the 360-angle
// NCC profile) into a rotation matrix and composes it with the image's
// previous transform (lastMat, 3x3 row-major, 9 floats per image), storing
// the result in `result`. maxGpu[idx] receives the best-angle score from
// auxMax. The wrapped angle index maps to a signed angle; fixPadding
// compensates the padding offset. If the composed transform's accumulated
// shift (squared) exceeds maxShift2 the previous transform is kept and
// maxGpu[idx] is replaced by the NCC value at the profile origin. One thread
// per image. power2x enables bitmask indexing when Xdim is a power of two.
__global__ void buildRotationMatrix (float *d_pos, float* lastMat, float* result,
        float *maxGpu, float *auxMax, float *NCC, int Xdim, int Ndim, int NCC_yxdim,
        int fixPadding, double maxShift2, bool power2x){

    int idx = blockDim.x * blockIdx.x + threadIdx.x;

    if(idx>=Ndim)
        return;

    float posX;
    int position = (int)d_pos[idx];
    // Score of the winning angle for this image (360 entries per image).
    maxGpu[idx] = auxMax[position+(idx*360)];

    float posX_aux;
    if (power2x==true)
        posX_aux = position & (Xdim-1);
    else
        posX_aux = position%Xdim;

    float Xdim2 = (Xdim/2.0f);


    // Map the wrapped index to a signed angle (the two branches are exhaustive).
    if(posX_aux<Xdim2){
        posX = -(posX_aux+1);
    }else if(posX_aux>=Xdim2){
        posX = Xdim-1-posX_aux;
    }

    //Fixing padding problem
    posX+=fixPadding;

    // Degrees to radians, with the sign flipped (rotate back).
    float rad = (float)(-posX*PI/180);

    int idx9 = idx*9;

    // New transform = pure rotation by `rad`; bottom row is the implicit (0,0,1).
    float new0 = cosf(rad);
    float new1 = -sinf(rad);
    float new2 = 0.0f;
    float new3 = sinf(rad);
    float new4 = cosf(rad);
    float new5 = 0.0f;
    //float new6 = 0.0f;
    //float new7 = 0.0f;
    //float new8 = 1.0f;

    float last0 = lastMat[idx9];
    float last1 = lastMat[idx9+1];
    float last2 = lastMat[idx9+2];
    float last3 = lastMat[idx9+3];
    float last4 = lastMat[idx9+4];
    float last5 = lastMat[idx9+5];
    float last6 = lastMat[idx9+6];
    float last7 = lastMat[idx9+7];
    float last8 = lastMat[idx9+8];

    // Translation column of the composed matrix (new * last).
    float shiftx = new0*last2 + new1*last5 + new2*last8;
    float shifty = new3*last2 + new4*last5 + new5*last8;
    float radShift = shiftx*shiftx + shifty*shifty;
    if(radShift > maxShift2){
        // Accumulated shift too large: keep the previous transform and report
        // the NCC value at the profile origin instead.
        result[idx9] = last0;
        result[idx9+1] = last1;
        result[idx9+2] = last2;
        result[idx9+3] = last3;
        result[idx9+4] = last4;
        result[idx9+5] = last5;
        maxGpu[idx] = NCC[idx*NCC_yxdim];
    }else{
        // Store the upper 2x3 of new*last (bottom row stays 0 0 1).
        result[idx9] = new0*last0 + new1*last3 + new2*last6;
        result[idx9+2] = shiftx;
        result[idx9+1] = new0*last1 + new1*last4 + new2*last7;
        result[idx9+3] = new3*last0 + new4*last3 + new5*last6;
        result[idx9+4] = new3*last1 + new4*last4 + new5*last7;
        result[idx9+5] = shifty;
    }

}
746 
747 
748 
// Resamples each Cartesian image into polar coordinates (maxAng angles x
// maxRadius radii) with bilinear interpolation, writing the sampled value to
// `polar` and its square to `polar2`. When `rotate` is set, the polar output
// is mirrored in both angle and radius. Grid: x -> angle, y -> radius,
// z (blockIdx.z) -> image index; Nimgs is unused in the kernel body.
// NOTE(review): border samples read x_low+1 / y_low+1 without clamping —
// assumes the sampling circle stays strictly inside the image; confirm
// maxRadius at the call site.
__global__ void cart2polar(float *image, float *polar, float *polar2, int maxRadius, int maxAng,
        int Nimgs, int Ydim, int Xdim, bool rotate)
{
    int angle = blockDim.x * blockIdx.x + threadIdx.x;
    int radius = blockDim.y * blockIdx.y + threadIdx.y;

    int numIm = blockIdx.z;

    if (radius>=maxRadius || angle>=maxAng)
        return;

    // Cartesian coordinates of this polar sample, centered on the image.
    float x = (float)(radius)*cosf((float)angle*(float)PI/180.0f) + (float)Xdim/2.0f;
    float y = (float)(radius)*sinf((float)angle*(float)PI/180.0f) + (float)Ydim/2.0f;

    // Bilinear interpolation weights over the four surrounding pixels.
    float dx_low = floorf(x);
    float dy_low = floorf(y);
    int x_low = (int)dx_low;
    int y_low = (int)dy_low;
    float x_x_low=x-dx_low;
    float y_y_low=y-dy_low;
    float one_x=1.0-x_x_low;
    float one_y=1.0-y_y_low;
    float w00=one_y*one_x;
    float w01=one_y*x_x_low;
    float w10=y_y_low*one_x;
    float w11=y_y_low*x_x_low;

    int NXY=Xdim*Ydim;
    int NXYpolar=maxAng*maxRadius;
    int imgIdx00=y_low * Xdim + x_low;
    int imgIdx01=imgIdx00+1;
    int imgIdx10=imgIdx00+Xdim;
    int imgIdx11=imgIdx10+1;
    int imgOffset=0;
    int polarOffset=0;
    int polarIdx;
    // Destination index: angle is the fast axis; `rotate` mirrors both axes.
    if(!rotate)
        polarIdx=angle+(radius*maxAng);
    else
        polarIdx = (maxAng-angle-1)+((maxRadius-radius-1)*maxAng);

    int n=numIm;

    imgOffset = n*NXY;
    polarOffset = n*NXYpolar;
    float I00 = image[imgIdx00+imgOffset];
    float I01 = image[imgIdx01+imgOffset];
    float I10 = image[imgIdx10+imgOffset];
    float I11 = image[imgIdx11+imgOffset];
    float imVal = I00*w00 + I01*w01 + I10*w10 + I11*w11;
    int finalPolarIndex=polarIdx+polarOffset;
    polar[finalPolarIndex] = imVal;
    polar2[finalPolarIndex] = imVal*imVal;

}
804 
805 
806 
// Two-phase per-image max + argmax over the yxdim elements of d_in.
// Phase 1: each thread of block nIm strides over its image and keeps a
// private (max, position) pair, written to d_out / position (one slot per
// thread; ties between equal values pick the later index).
// Phase 2: the same shared-memory tree reduction as calculateMax2 runs over
// those per-thread partials (yxdim2 = blockDim.x slots, Ndim2 passes),
// leaving one (max, position) pair per block per image.
// Dynamic shared memory: 2*blockDim.x floats (values, then positions).
// NOTE(review): the idx<32 tail omits barriers and relies on implicit warp
// synchrony — unsafe under Volta+ independent thread scheduling; confirm
// target architectures.
__global__ void calculateMaxThreads (float *d_in, float *d_out, float *position,
        int yxdim, int Ndim, int yxdim2, int Ndim2){

    int idx = threadIdx.x;
    int nIm = blockIdx.x;
    int blockSize = blockDim.x;

    int posTh = nIm*yxdim + idx;
    int n = ceilf(((float)yxdim/(float)blockSize));

    float tmp, tmpPos;

    // Phase 1: per-thread strided scan; lanes past yxdim get a sentinel.
    if(idx>=yxdim){
        tmp = -1.0f;
        tmpPos = -1.0f;
    }else{
        tmp = d_in[posTh];
        tmpPos = idx;
        for (int i=1; i<n; i++){
            if (posTh+i*blockSize < yxdim*(nIm+1)){
                tmp = fmaxf(tmp, d_in[posTh+i*blockSize]);
                tmpPos = (tmp==d_in[posTh+i*blockSize]) ? idx+i*blockSize : tmpPos;
            }
        }
    }
    int posOut = nIm*blockSize + idx;
    d_out[posOut] = tmp;
    position[posOut] = tmpPos;

    // Make all phase-1 slots of this block visible before phase 2 reads them
    // back (each block only reads slots its own threads wrote).
    __syncthreads();


    extern __shared__ float sdata[];


    //Version 6
    int i = blockIdx.x * blockSize + idx;


    // Phase 2: tree reduction over the per-thread partials in d_out/position.
    int index = 0;
    for(int imN=0; imN<Ndim2; imN++){

        if(i<yxdim2*gridDim.x){
            sdata[idx]=d_out[i+index];
            sdata[idx+blockSize] = position[i+index];
        }

        __syncthreads();

        if(i>=yxdim2*gridDim.x)
            sdata[idx]=-1.0;
        __syncthreads();


        if(blockSize >= 1024){
            if(idx<512){
                sdata[idx] = fmaxf(sdata[idx], sdata[idx+512]);
                sdata[idx+blockSize] = (sdata[idx]==sdata[idx+512]) ? sdata[idx+blockSize+512] : sdata[idx+blockSize];
            }
            __syncthreads();
        }
        if(blockSize >= 512){
            if(idx<256){
                sdata[idx] = fmaxf(sdata[idx], sdata[idx+256]);
                sdata[idx+blockSize] = (sdata[idx]==sdata[idx+256]) ? sdata[idx+blockSize+256] : sdata[idx+blockSize];
            }
            __syncthreads();
        }
        if(blockSize >= 256){
            if(idx<128){
                sdata[idx] = fmaxf(sdata[idx], sdata[idx+128]);
                sdata[idx+blockSize] = (sdata[idx]==sdata[idx+128]) ? sdata[idx+blockSize+128] : sdata[idx+blockSize];
            }
            __syncthreads();
        }
        if(blockSize >= 128){
            if(idx<64){
                sdata[idx] = fmaxf(sdata[idx], sdata[idx+64]);
                sdata[idx+blockSize] = (sdata[idx]==sdata[idx+64]) ? sdata[idx+blockSize+64] : sdata[idx+blockSize];
            }
            __syncthreads();
        }
        // Warp-level tail without barriers (see NOTE above).
        if(idx<32){
            if(blockSize>=64){
                if(idx<32){
                    sdata[idx] = fmaxf(sdata[idx], sdata[idx+32]);
                    sdata[idx+blockSize] = (sdata[idx]==sdata[idx+32]) ? sdata[idx+blockSize+32] : sdata[idx+blockSize];
                }
            }
            if(blockSize>=32){
                if(idx<16){
                    sdata[idx] = fmaxf(sdata[idx], sdata[idx+16]);
                    sdata[idx+blockSize] = (sdata[idx]==sdata[idx+16]) ? sdata[idx+blockSize+16] : sdata[idx+blockSize];
                }
            }
            if(blockSize>=16){
                if(idx<8){
                    sdata[idx] = fmaxf(sdata[idx], sdata[idx+8]);
                    sdata[idx+blockSize] = (sdata[idx]==sdata[idx+8]) ? sdata[idx+blockSize+8] : sdata[idx+blockSize];
                }
            }
            if(blockSize>=8){
                if(idx<4){
                    sdata[idx] = fmaxf(sdata[idx], sdata[idx+4]);
                    sdata[idx+blockSize] = (sdata[idx]==sdata[idx+4]) ? sdata[idx+blockSize+4] : sdata[idx+blockSize];
                }
            }
            if(blockSize>=4){
                if(idx<2){
                    sdata[idx] = fmaxf(sdata[idx], sdata[idx+2]);
                    sdata[idx+blockSize] = (sdata[idx]==sdata[idx+2]) ? sdata[idx+blockSize+2] : sdata[idx+blockSize];
                }
            }
            if(blockSize>=2){
                if(idx<1){
                    sdata[idx] = fmaxf(sdata[idx], sdata[idx+1]);
                    sdata[idx+blockSize] = (sdata[idx]==sdata[idx+1]) ? sdata[idx+blockSize+1] : sdata[idx+blockSize];
                }
            }
        }

        // Thread 0 publishes this block's final (max, position).
        if(idx==0){
            d_out[blockIdx.x+(gridDim.x*imN)] = sdata[0];
            position[blockIdx.x+(gridDim.x*imN)] = sdata[blockSize];
        }

        __syncthreads();

        index = index+(int)yxdim2;

    }

}
940 
941 
        // NOTE(review): the opening line of this host function's signature is
        // missing from this extract; the body references d_orig_image and
        // mask, so the full signature presumably starts with those two
        // GpuMultidimArrayAtGpu<float> arguments — confirm against the repo.
        // Zeroes the padded buffers on the stream, then launches
        // maskingPaddingKernel with one 1024-thread block row per image.
        GpuMultidimArrayAtGpu<float> &padded_image_gpu, GpuMultidimArrayAtGpu<float> &padded_image2_gpu,
        GpuMultidimArrayAtGpu<float> &padded_mask_gpu, bool experimental, myStreamHandle &myStream){

    // Ceil-divide pixels per image by the block size.
    int numTh = 1024;
    int numBlk = d_orig_image.yxdim/numTh;
    if(d_orig_image.yxdim%numTh > 0)
        numBlk++;

    // One grid row (y) per image.
    dim3 blockSize(numTh,1,1);
    dim3 gridSize(numBlk, d_orig_image.Ndim, 1);

    // Async zero-fill of the padded outputs on the caller's stream.
    cudaStream_t *stream = (cudaStream_t*) myStream.ptr;
    gpuErrchk(cudaMemsetAsync(padded_image_gpu.d_data, 0, padded_image_gpu.nzyxdim*sizeof(float), *stream));
    gpuErrchk(cudaMemsetAsync(padded_image2_gpu.d_data, 0, padded_image2_gpu.nzyxdim*sizeof(float), *stream));
    if(padded_mask_gpu.d_data!=NULL)
        gpuErrchk(cudaMemsetAsync(padded_mask_gpu.d_data, 0, padded_mask_gpu.nzyxdim*sizeof(float), *stream));

    // Power-of-two Xdim lets the kernel use a bitmask instead of %.
    bool power2;
    if (d_orig_image.Xdim & (d_orig_image.Xdim-1))
        power2 = false;
    else
        power2 = true;
    maskingPaddingKernel<<< gridSize, blockSize, 0, *stream>>>(d_orig_image.d_data, mask.d_data,
            padded_image_gpu.d_data, padded_image2_gpu.d_data, padded_mask_gpu.d_data,
            d_orig_image.Xdim, d_orig_image.Ydim, d_orig_image.yxdim, d_orig_image.Ndim,
            padded_image_gpu.Xdim, padded_image_gpu.Ydim, padded_image_gpu.yxdim, experimental, power2);

}
971 
972 
// Host wrapper: multiplies every FFT in manyF element-wise by the single mask
// FFT M on the given stream (result in MmanyF), using a vectorized 1D launch
// over all nzyxdim coefficients.
void pointwiseMultiplicationFourier(const GpuMultidimArrayAtGpu< std::complex<float> > &M, const GpuMultidimArrayAtGpu < std::complex<float> >& manyF,
		GpuMultidimArrayAtGpu< std::complex<float> > &MmanyF, myStreamHandle &myStream)
{
    cudaStream_t *stream = (cudaStream_t*) myStream.ptr;

    const int threadsPerBlock = 1024;
    XmippDim3 blockSize(threadsPerBlock, 1, 1), gridSize;
    manyF.calculateGridSizeVectorized(blockSize, gridSize);

    // Power-of-two yxdim lets the kernel replace % with a bitmask.
    const bool isPow2 = ((manyF.yxdim & (manyF.yxdim - 1)) == 0);

    pointwiseMultiplicationComplexOneManyKernel <<< CONVERT2DIM3(gridSize), CONVERT2DIM3(blockSize), 0, *stream >>>
            ((cufftComplex*)M.d_data, (cufftComplex*)manyF.d_data, (cufftComplex*) MmanyF.d_data, manyF.nzyxdim, manyF.yxdim, isPow2);
}
990 
// Host wrapper: multiplies every image FFT in manyF and manyF_sq by the
// single mask FFT M on the given stream, writing to MmanyF / MmanyF_sq.
// Launches one 1024-thread block per image (the kernel strides internally).
void pointwiseMultiplicationFourier_two(const GpuMultidimArrayAtGpu< std::complex<float> > &M,
		const GpuMultidimArrayAtGpu < std::complex<float> >& manyF,
		GpuMultidimArrayAtGpu< std::complex<float> > &MmanyF,
		const GpuMultidimArrayAtGpu < std::complex<float> >& manyF_sq,
		GpuMultidimArrayAtGpu< std::complex<float> > &MmanyF_sq,
		myStreamHandle &myStream)
{
    cudaStream_t *stream = (cudaStream_t*) myStream.ptr;

    const int threadsPerBlock = 1024;
    const int blocks = manyF.Ndim;

    // Power-of-two yxdim lets the kernel replace % with a bitmask.
    const bool isPow2 = ((manyF.yxdim & (manyF.yxdim - 1)) == 0);

    pointwiseMultiplicationComplexOneManyKernel_two <<< blocks, threadsPerBlock, 0, *stream >>>
            ((cufftComplex*)M.d_data, (cufftComplex*)manyF.d_data, (cufftComplex*) MmanyF.d_data,
             (cufftComplex*)manyF_sq.d_data, (cufftComplex*) MmanyF_sq.d_data,
             manyF.nzyxdim, manyF.yxdim, manyF.Ndim, isPow2);
}
1014 
        // NOTE(review): the opening line of this host function's signature is
        // missing from this extract; from the kernel it launches, it is
        // presumably pointwiseMultiplicationFourier_three(const
        // GpuMultidimArrayAtGpu<std::complex<float>> &M, ... — confirm
        // against the repo. Multiplies manyF and manyF_sq by the mask FFT M
        // and also produces the mask autocorrelation spectrum in maskAux.
        const GpuMultidimArrayAtGpu < std::complex<float> >& manyF,
        GpuMultidimArrayAtGpu< std::complex<float> > &MmanyF,
        const GpuMultidimArrayAtGpu < std::complex<float> >& manyF_sq,
        GpuMultidimArrayAtGpu< std::complex<float> > &MmanyF_sq,
        myStreamHandle &myStream,
        GpuMultidimArrayAtGpu< std::complex<float> > &maskAux)
{
    cudaStream_t *stream = (cudaStream_t*) myStream.ptr;
    // One 1024-thread block per image; the kernel strides internally.
    int numTh = 1024;
    int numBlk = manyF.Ndim;

    // Power-of-two yxdim lets the kernel use a bitmask instead of %.
    bool power2;
    if (manyF.yxdim & (manyF.yxdim-1))
        power2 = false;
    else
        power2 = true;

    pointwiseMultiplicationComplexOneManyKernel_three <<< numBlk, numTh, 0, *stream >>>
            ((cufftComplex*)M.d_data, (cufftComplex*)manyF.d_data, (cufftComplex*) MmanyF.d_data,
            (cufftComplex*)manyF_sq.d_data, (cufftComplex*) MmanyF_sq.d_data,
            (cufftComplex*) maskAux.d_data,
            manyF.nzyxdim, manyF.yxdim, manyF.Ndim, power2);

}
1040 
1041 
1043  StructuresAux &myStructureAux, myStreamHandle &myStream)
1044 {
1045  myStructureAux.MF.resize(d_projFFT);
1046  myStructureAux.MF2.resize(d_projSquaredFFT);
1048 
1050  myStructureAux.MF2, myStream, maskAux);
1051 
1054 
1056 
1057  myStructureAux.MF.ifftStream(MFrealSpace, myhandlePaddedB, myStream, false, dull);
1058  myStructureAux.MF2.ifftStream(MF2realSpace, myhandlePaddedB, myStream, false, dull);
1059 
1061  maskAux.ifftStream(maskAutocorrelation, myhandleMaskB, myStream, false, dull);
1062  maskAux.clear();
1063 
1064 }
1065 
1066 
1067 
1069  StructuresAux &myStructureAux, GpuMultidimArrayAtGpu<float> &maskAutocorr, myStreamHandle &myStream)
1070 {
1071 
1072  myStructureAux.MF.resize(d_projFFT);
1073  myStructureAux.MF2.resize(d_projSquaredFFT);
1074 
1075 
1077  myStructureAux.MF2, myStream);
1078 
1080 
1083 
1084  myStructureAux.MF.ifftStream(MFrealSpace, myhandlePaddedB, myStream, false, dull);
1085  myStructureAux.MF2.ifftStream(MF2realSpace, myhandlePaddedB, myStream, false, dull);
1086 
1087  maskAutocorr.copyGpuToGpuStream(maskAutocorrelation, myStream);
1088 
1089 }
1090 
1091 
// NOTE(review): partial view — the opening line (orig. 1092) and interior
// lines 1097, 1099-1100 were lost in extraction. Recover the full body from
// the repository before editing.
// Visible flow: callback-enabled variant. Inverse-FFTs d_projFFT and
// d_projSquaredFFT directly into MFrealSpace/MF2realSpace using the shared
// plan 'ifftcb' with useCallback == true and d_maskFFT as the callback data,
// then copies the caller-supplied mask autocorrelation device-to-device.
// The 'skip' parameter is not referenced in the visible lines — its use (if
// any) was in the lost lines; confirm against the repository.
1093  StructuresAux &myStructureAux, GpuMultidimArrayAtGpu<float> &maskAutocorr, myStreamHandle &myStream,
1094  bool skip, mycufftHandle &ifftcb)
1095 {
1096 
1098 
1101 
1102  d_projFFT.ifftStream(MFrealSpace, ifftcb, myStream, true, d_maskFFT);
1103  d_projSquaredFFT.ifftStream(MF2realSpace, ifftcb, myStream, true, d_maskFFT);
1104 
1105  maskAutocorr.copyGpuToGpuStream(maskAutocorrelation, myStream);
1106 
1107 
1108 }
1109 
1110 
/**
 * Launch the per-image maximum search over a stack of Ndim images of yxdim
 * elements each, stored contiguously in d_data on the device.
 *
 * One block of 1024 threads is launched per image; calculateMaxThreads
 * performs a shared-memory reduction (2*numTh floats: values + positions,
 * matching its dynamic shared-memory allocation). d_out / d_pos are resized
 * to one slot per launched thread and receive the partial maxima and their
 * flat positions; consumers (buildTranslationMatrix / buildRotationMatrix)
 * finish the reduction.
 *
 * Asynchronous: all work is enqueued on myStream; no host synchronization.
 *
 * NOTE(review): the second signature line was lost in extraction and has been
 * reconstructed from this file's declaration index — confirm against the header.
 */
void calculateMaxNew2DNew(int yxdim, int Ndim, float *d_data,
		GpuMultidimArrayAtGpu<float> &d_out, GpuMultidimArrayAtGpu<float> &d_pos, myStreamHandle &myStream){

	cudaStream_t *stream = (cudaStream_t*) myStream.ptr;

	int numTh = 1024;
	int numBlk = Ndim; // one block per image

	// Scratch: one partial result per thread.
	d_out.resize(numTh * numBlk);
	d_pos.resize(numTh * numBlk);

	calculateMaxThreads<<<numBlk, numTh, 2*numTh * sizeof(float), *stream>>>(d_data, d_out.d_data, d_pos.d_data, yxdim, Ndim, numTh, 1);

}
1125 
1126 
/**
 * Rotational alignment of an experimental image stack against a reference,
 * done in polar coordinates entirely on the GPU stream 'myStream'.
 *
 * Pipeline: pointwise product of polar FFTs -> inverse FFT -> NCC per
 * (angle, radius) -> sum over radius (sumRadiusKernel, also tracking max and
 * radius-0 values) -> per-image argmax over angle -> buildRotationMatrix
 * updates transMat/resultRT. The per-image best correlation is copied
 * asynchronously into max_vector (caller must sync the stream before reading).
 *
 * 'mirror' is accepted for interface parity with the translation variant; it
 * is not referenced in this body.
 *
 * NOTE(review): the opening signature line and the 'dull' declaration were
 * lost in extraction and reconstructed from this file's declaration index —
 * confirm against the header.
 */
void cuda_calculate_correlation_rotation(GpuCorrelationAux &referenceAux, GpuCorrelationAux &experimentalAux, TransformMatrix<float> &transMat,
		float *max_vector, int maxShift, mycufftHandle &myhandlePadded, bool mirror,
		StructuresAux &myStructureAux, myStreamHandle &myStream, TransformMatrix<float> &resultRT)
{

	cudaStream_t *stream = (cudaStream_t*) myStream.ptr;

	myStructureAux.RefExpFourierPolar.resize(referenceAux.d_projPolarFFT.Xdim, referenceAux.d_projPolarFFT.Ydim,
			referenceAux.d_projPolarFFT.Zdim, referenceAux.d_projPolarFFT.Ndim);

	int numTh = 1024;
	XmippDim3 blockSize(numTh, 1, 1), gridSize;
	referenceAux.d_projPolarFFT.calculateGridSizeVectorized(blockSize, gridSize);

	// Correlation theorem: multiply the polar-space FFTs elementwise.
	pointwiseMultiplicationComplexKernel<<< CONVERT2DIM3(gridSize), CONVERT2DIM3(blockSize), 0, *stream >>>
			((cufftComplex*)referenceAux.d_projPolarFFT.d_data, (cufftComplex*)experimentalAux.d_projPolarFFT.d_data,
			(cufftComplex*)myStructureAux.RefExpFourierPolar.d_data, referenceAux.d_projPolarFFT.nzyxdim,
			referenceAux.d_projPolarFFT.yxdim);

	// Dummy callback-data argument (useCallback == false).
	GpuMultidimArrayAtGpu< std::complex<float> > dull;
	myStructureAux.RefExpRealSpacePolar.resize(referenceAux.XdimPolar, referenceAux.YdimPolar, referenceAux.d_projPolarFFT.Zdim,
			referenceAux.d_projPolarFFT.Ndim);
	myStructureAux.RefExpFourierPolar.ifftStream(myStructureAux.RefExpRealSpacePolar, myhandlePadded, myStream, false, dull);

	XmippDim3 blockSize2(numTh, 1, 1), gridSize2;
	myStructureAux.RefExpRealSpacePolar.calculateGridSizeVectorized(blockSize2, gridSize2);

	myStructureAux.d_NCCPolar.resize(referenceAux.XdimPolar, referenceAux.YdimPolar, referenceAux.d_projPolarFFT.Zdim,
			referenceAux.d_projPolarFFT.Ndim);

	// Normalization constant: number of polar samples per image.
	double maskFFTPolar = (referenceAux.XdimPolar*referenceAux.YdimPolar);
	calculateNccRotationKernel<<< CONVERT2DIM3(gridSize2), CONVERT2DIM3(blockSize2), 0, *stream >>>
			(myStructureAux.RefExpRealSpacePolar.d_data, (cufftComplex*)referenceAux.d_projPolarFFT.d_data, (cufftComplex*)experimentalAux.d_projPolarFFT.d_data,
			(cufftComplex*)referenceAux.d_projPolarSquaredFFT.d_data, (cufftComplex*)experimentalAux.d_projPolarSquaredFFT.d_data,
			maskFFTPolar, myStructureAux.d_NCCPolar.d_data, referenceAux.d_projPolarFFT.yxdim, myStructureAux.RefExpRealSpacePolar.nzyxdim,
			myStructureAux.RefExpRealSpacePolar.yxdim);

	//AJ sum along the radius
	numTh = 1024;
	int numBlk = (myStructureAux.d_NCCPolar.Xdim*myStructureAux.d_NCCPolar.Ndim)/numTh;
	if((myStructureAux.d_NCCPolar.Xdim*myStructureAux.d_NCCPolar.Ndim)%numTh!=0)
		numBlk++;

	myStructureAux.d_NCCPolar1D.resize(myStructureAux.d_NCCPolar.Xdim,1,1,myStructureAux.d_NCCPolar.Ndim);
	myStructureAux.auxMax.resize(myStructureAux.d_NCCPolar.Xdim,1,1,myStructureAux.d_NCCPolar.Ndim);
	myStructureAux.auxZero.resize(myStructureAux.d_NCCPolar.Xdim,1,1,myStructureAux.d_NCCPolar.Ndim);
	sumRadiusKernel<<< numBlk, numTh, 0, *stream >>>(myStructureAux.d_NCCPolar.d_data, myStructureAux.d_NCCPolar1D.d_data, myStructureAux.auxMax.d_data,
			myStructureAux.auxZero.d_data, myStructureAux.d_NCCPolar.Xdim*myStructureAux.d_NCCPolar.Ndim, myStructureAux.d_NCCPolar.Ydim,
			myStructureAux.d_NCCPolar.Ndim);

	// Per-image argmax over the 360 angle bins of the 1D (radius-summed) NCC.
	calculateMaxNew2DNew(myStructureAux.d_NCCPolar1D.Xdim, myStructureAux.d_NCCPolar1D.Ndim, myStructureAux.d_NCCPolar1D.d_data,
			myStructureAux.d_out_polar_max, myStructureAux.d_pos_polar_max, myStream);

	numTh = 1024;
	numBlk = transMat.Ndim/numTh;
	if(transMat.Ndim%numTh > 0)
		numBlk++;

	// Tell the kernel whether Xdim is a power of two (fast modulo path).
	bool _power2x;
	if (myStructureAux.d_NCCPolar1D.Xdim & (myStructureAux.d_NCCPolar1D.Xdim-1))
		_power2x = false;
	else
		_power2x = true;
	double maxShift2 = (2*maxShift)*(2*maxShift);
	myStructureAux.maxGpu.resize(myStructureAux.d_NCCPolar1D.Ndim);
	buildRotationMatrix<<<numBlk, numTh, 0, *stream>>> (myStructureAux.d_pos_polar_max.d_data, transMat.d_data,
			resultRT.d_data, myStructureAux.maxGpu.d_data, myStructureAux.auxMax.d_data, myStructureAux.auxZero.d_data,
			myStructureAux.d_NCCPolar1D.Xdim, myStructureAux.d_NCCPolar1D.Ndim,
			myStructureAux.d_NCCPolar1D.yxdim, 0, maxShift2, _power2x);

	// Accumulate the new rotation into the running transform.
	resultRT.copyMatrix(transMat, myStream);

	// Async D2H copy of the per-image best correlation values.
	gpuErrchk(cudaMemcpyAsync(max_vector, myStructureAux.maxGpu.d_data, myStructureAux.maxGpu.Ndim*sizeof(float), cudaMemcpyDeviceToHost, *stream));

}
1203 
1204 
/**
 * Translational alignment of an experimental image stack against a reference
 * via masked NCC in Cartesian coordinates, on GPU stream 'myStream'.
 *
 * Pipeline: pointwise product of padded FFTs -> inverse FFT -> calculateNccKernel
 * (using MF / MF^2 real-space images and the mask autocorrelation for
 * normalization, shifts limited by maxShift) -> per-image argmax ->
 * buildTranslationMatrix updates transMat/resultTR. If saveMaxVector is set,
 * the per-image best NCC values are copied asynchronously into max_vector
 * (caller must sync the stream before reading).
 *
 * fixPadding compensates the half-pixel origin shift introduced when the
 * original and padded X dimensions differ in parity.
 *
 * 'mirror' is accepted for interface parity; it is not referenced in this body.
 *
 * NOTE(review): the opening signature line and the 'dull' declaration were
 * lost in extraction and reconstructed from this file's declaration index —
 * confirm against the header.
 */
void cuda_calculate_correlation(GpuCorrelationAux &referenceAux, GpuCorrelationAux &experimentalAux, TransformMatrix<float> &transMat,
		float *max_vector, int maxShift, mycufftHandle &myhandlePadded, bool mirror,
		StructuresAux &myStructureAux, myStreamHandle &myStream, TransformMatrix<float> &resultTR,
		bool saveMaxVector)
{

	cudaStream_t *stream = (cudaStream_t*) myStream.ptr;

	myStructureAux.RefExpFourier.resize(referenceAux.d_projFFT.Xdim, referenceAux.d_projFFT.Ydim,
			referenceAux.d_projFFT.Zdim, referenceAux.d_projFFT.Ndim);

	int numTh = 1024;
	XmippDim3 blockSize(numTh, 1, 1), gridSize;
	referenceAux.d_projFFT.calculateGridSizeVectorized(blockSize, gridSize);

	// Correlation theorem: elementwise product of the two FFT stacks.
	pointwiseMultiplicationComplexKernel<<< CONVERT2DIM3(gridSize), CONVERT2DIM3(blockSize), 0, *stream >>>
			((cufftComplex*)referenceAux.d_projFFT.d_data, (cufftComplex*)experimentalAux.d_projFFT.d_data, (cufftComplex*)myStructureAux.RefExpFourier.d_data,
			referenceAux.d_projFFT.nzyxdim, referenceAux.d_projFFT.yxdim);

	myStructureAux.RefExpRealSpace.resize(referenceAux.Xdim, referenceAux.Ydim, referenceAux.d_projFFT.Zdim,
			referenceAux.d_projFFT.Ndim);

	// Dummy callback-data argument (useCallback == false).
	GpuMultidimArrayAtGpu< std::complex<float> > dull;
	myStructureAux.RefExpFourier.ifftStream(myStructureAux.RefExpRealSpace, myhandlePadded, myStream, false, dull);

	XmippDim3 blockSize2(numTh, 1, 1), gridSize2;
	myStructureAux.RefExpRealSpace.calculateGridSizeVectorized(blockSize2, gridSize2);

	myStructureAux.d_NCC.resize(referenceAux.Xdim, referenceAux.Ydim, referenceAux.d_projFFT.Zdim,
			referenceAux.d_projFFT.Ndim);

	// Power-of-two flags let the kernel replace %/ with bit ops.
	bool power2yx, power2x;
	if (referenceAux.MFrealSpace.yxdim & (referenceAux.MFrealSpace.yxdim-1))
		power2yx = false;
	else
		power2yx = true;
	if (referenceAux.MFrealSpace.Xdim & (referenceAux.MFrealSpace.Xdim-1))
		power2x = false;
	else
		power2x = true;
	calculateNccKernel<<< CONVERT2DIM3(gridSize2), CONVERT2DIM3(blockSize2), 0, *stream >>>
			(myStructureAux.RefExpRealSpace.d_data, referenceAux.MFrealSpace.d_data, experimentalAux.MFrealSpace.d_data, referenceAux.MF2realSpace.d_data,
			experimentalAux.MF2realSpace.d_data, referenceAux.maskAutocorrelation.d_data, myStructureAux.d_NCC.d_data, referenceAux.MFrealSpace.nzyxdim,
			referenceAux.MFrealSpace.yxdim, referenceAux.MFrealSpace.Xdim, referenceAux.MFrealSpace.Ydim, referenceAux.maskCount, maxShift, power2yx, power2x);

	// Parity-dependent origin correction between original and padded sizes.
	int fixPadding=0;
	if(referenceAux.XdimOrig%2==0 && referenceAux.Xdim%2==0)
		fixPadding=1;
	if(referenceAux.XdimOrig%2==0 && referenceAux.Xdim%2!=0)
		fixPadding=0;
	if(referenceAux.XdimOrig%2!=0 && referenceAux.Xdim%2==0)
		fixPadding=-1;
	if(referenceAux.XdimOrig%2!=0 && referenceAux.Xdim%2!=0)
		fixPadding=0;

	// Per-image argmax of the NCC map.
	calculateMaxNew2DNew(myStructureAux.d_NCC.yxdim, myStructureAux.d_NCC.Ndim,
			myStructureAux.d_NCC.d_data, myStructureAux.d_out_max, myStructureAux.d_pos_max, myStream);

	numTh = 1024;
	int numBlk = transMat.Ndim/numTh;
	if(transMat.Ndim%numTh > 0)
		numBlk++;

	bool _power2x;
	if (myStructureAux.d_NCC.Xdim & (myStructureAux.d_NCC.Xdim-1))
		_power2x = false;
	else
		_power2x = true;
	double maxShift2 = (2*maxShift)*(2*maxShift);
	buildTranslationMatrix<<<numBlk, numTh, 0, *stream>>> (myStructureAux.d_pos_max.d_data, transMat.d_data, resultTR.d_data,
			myStructureAux.d_out_max.d_data, myStructureAux.d_NCC.d_data, myStructureAux.d_NCC.Xdim, myStructureAux.d_NCC.Ydim,
			myStructureAux.d_NCC.Ndim, myStructureAux.d_NCC.yxdim, fixPadding, maxShift2, _power2x);

	// Accumulate the new translation into the running transform.
	resultTR.copyMatrix(transMat, myStream);

	if(saveMaxVector)
		gpuErrchk(cudaMemcpyAsync(max_vector, myStructureAux.d_out_max.d_data, myStructureAux.d_NCC.Ndim*sizeof(float), cudaMemcpyDeviceToHost, *stream));

}
1286 
1287 
/**
 * Fused translation + rotation alignment running concurrently on two streams:
 * the TR pipeline (Cartesian masked NCC -> best shift) on myStreamTR and the
 * RT pipeline (polar NCC -> best angle) on myStreamRT, both against the same
 * reference. The per-pipeline steps mirror cuda_calculate_correlation and
 * cuda_calculate_correlation_rotation; interleaving the launches lets the two
 * independent streams overlap on the device.
 *
 * Results: transMatTR/resultTR and transMatRT/resultRT are updated on their
 * respective streams; if saveMaxVector is set, per-image best correlations
 * are copied asynchronously into max_vectorTR / max_vectorRT (caller must
 * synchronize both streams before reading).
 *
 * 'mirror' and 'ifftcb' are accepted for interface parity; they are not
 * referenced in this body.
 *
 * NOTE(review): the opening signature line and the 'dull' declaration were
 * lost in extraction and reconstructed from this file's declaration index —
 * confirm against the header.
 */
void cuda_calculate_correlation_two(GpuCorrelationAux &referenceAux, GpuCorrelationAux &experimentalAuxTR,
		TransformMatrix<float> &transMatTR, float *max_vectorTR, int maxShift,
		mycufftHandle &myhandlePaddedTR, bool mirror, StructuresAux &myStructureAuxTR,
		myStreamHandle &myStreamTR,
		GpuCorrelationAux &experimentalAuxRT, TransformMatrix<float> &transMatRT,
		float *max_vectorRT, mycufftHandle &myhandlePaddedRT,
		StructuresAux &myStructureAuxRT, myStreamHandle &myStreamRT,
		TransformMatrix<float> &resultTR, TransformMatrix<float> &resultRT,
		mycufftHandle &ifftcb, bool saveMaxVector)
{

	cudaStream_t *streamTR = (cudaStream_t*) myStreamTR.ptr;
	cudaStream_t *streamRT = (cudaStream_t*) myStreamRT.ptr;

	// Allocate all work buffers for both pipelines up front.
	myStructureAuxTR.RefExpFourier.resize(referenceAux.d_projFFT.Xdim, referenceAux.d_projFFT.Ydim,
			referenceAux.d_projFFT.Zdim, referenceAux.d_projFFT.Ndim);
	myStructureAuxTR.RefExpRealSpace.resize(referenceAux.Xdim, referenceAux.Ydim, referenceAux.d_projFFT.Zdim,
			referenceAux.d_projFFT.Ndim);
	myStructureAuxTR.d_NCC.resize(referenceAux.Xdim, referenceAux.Ydim, referenceAux.d_projFFT.Zdim,
			referenceAux.d_projFFT.Ndim);

	myStructureAuxRT.RefExpFourierPolar.resize(referenceAux.d_projPolarFFT.Xdim, referenceAux.d_projPolarFFT.Ydim,
			referenceAux.d_projPolarFFT.Zdim, referenceAux.d_projPolarFFT.Ndim);
	myStructureAuxRT.RefExpRealSpacePolar.resize(referenceAux.XdimPolar, referenceAux.YdimPolar, referenceAux.d_projPolarFFT.Zdim,
			referenceAux.d_projPolarFFT.Ndim);
	myStructureAuxRT.d_NCCPolar.resize(referenceAux.XdimPolar, referenceAux.YdimPolar, referenceAux.d_projPolarFFT.Zdim,
			referenceAux.d_projPolarFFT.Ndim);
	myStructureAuxRT.d_NCCPolar1D.resize(myStructureAuxRT.d_NCCPolar.Xdim,1,1,myStructureAuxRT.d_NCCPolar.Ndim);
	myStructureAuxRT.auxMax.resize(myStructureAuxRT.d_NCCPolar.Xdim,1,1,myStructureAuxRT.d_NCCPolar.Ndim);
	myStructureAuxRT.auxZero.resize(myStructureAuxRT.d_NCCPolar.Xdim,1,1,myStructureAuxRT.d_NCCPolar.Ndim);
	myStructureAuxRT.maxGpu.resize(myStructureAuxRT.d_NCCPolar1D.Ndim);

	int numTh = 1024;
	XmippDim3 blockSize(numTh, 1, 1), gridSize;
	referenceAux.d_projFFT.calculateGridSizeVectorized(blockSize, gridSize);

	// TR pipeline: elementwise product of the Cartesian FFT stacks.
	pointwiseMultiplicationComplexKernel<<< CONVERT2DIM3(gridSize), CONVERT2DIM3(blockSize), 0, *streamTR >>>
			((cufftComplex*)referenceAux.d_projFFT.d_data, (cufftComplex*)experimentalAuxTR.d_projFFT.d_data,
			(cufftComplex*)myStructureAuxTR.RefExpFourier.d_data,
			referenceAux.d_projFFT.nzyxdim, referenceAux.d_projFFT.yxdim);

	XmippDim3 blockSize3(numTh, 1, 1), gridSize3;
	referenceAux.d_projPolarFFT.calculateGridSizeVectorized(blockSize3, gridSize3);

	// RT pipeline: elementwise product of the polar FFT stacks.
	pointwiseMultiplicationComplexKernel<<< CONVERT2DIM3(gridSize3), CONVERT2DIM3(blockSize3), 0, *streamRT >>>
			((cufftComplex*)referenceAux.d_projPolarFFT.d_data, (cufftComplex*)experimentalAuxRT.d_projPolarFFT.d_data,
			(cufftComplex*)myStructureAuxRT.RefExpFourierPolar.d_data, referenceAux.d_projPolarFFT.nzyxdim,
			referenceAux.d_projPolarFFT.yxdim);

	// Dummy callback-data argument (useCallback == false).
	GpuMultidimArrayAtGpu< std::complex<float> > dull;
	myStructureAuxTR.RefExpFourier.ifftStream(myStructureAuxTR.RefExpRealSpace, myhandlePaddedTR, myStreamTR, false, dull);

	myStructureAuxRT.RefExpFourierPolar.ifftStream(myStructureAuxRT.RefExpRealSpacePolar, myhandlePaddedRT, myStreamRT, false, dull);

	XmippDim3 blockSize2(numTh, 1, 1), gridSize2;
	myStructureAuxTR.RefExpRealSpace.calculateGridSizeVectorized(blockSize2, gridSize2);

	// Power-of-two flags let the kernels replace %/ with bit ops.
	bool power2yx, power2x;
	if (referenceAux.MFrealSpace.yxdim & (referenceAux.MFrealSpace.yxdim-1))
		power2yx = false;
	else
		power2yx = true;
	if (referenceAux.MFrealSpace.Xdim & (referenceAux.MFrealSpace.Xdim-1))
		power2x = false;
	else
		power2x = true;
	calculateNccKernel<<< CONVERT2DIM3(gridSize2), CONVERT2DIM3(blockSize2), 0, *streamTR >>>
			(myStructureAuxTR.RefExpRealSpace.d_data, referenceAux.MFrealSpace.d_data, experimentalAuxTR.MFrealSpace.d_data, referenceAux.MF2realSpace.d_data,
			experimentalAuxTR.MF2realSpace.d_data, referenceAux.maskAutocorrelation.d_data, myStructureAuxTR.d_NCC.d_data, referenceAux.MFrealSpace.nzyxdim,
			referenceAux.MFrealSpace.yxdim, referenceAux.MFrealSpace.Xdim, referenceAux.MFrealSpace.Ydim, referenceAux.maskCount, maxShift, power2yx, power2x);

	// Parity-dependent origin correction between original and padded sizes.
	int fixPadding=0;
	if(referenceAux.XdimOrig%2==0 && referenceAux.Xdim%2==0)
		fixPadding=1;
	if(referenceAux.XdimOrig%2==0 && referenceAux.Xdim%2!=0)
		fixPadding=0;
	if(referenceAux.XdimOrig%2!=0 && referenceAux.Xdim%2==0)
		fixPadding=-1;
	if(referenceAux.XdimOrig%2!=0 && referenceAux.Xdim%2!=0)
		fixPadding=0;

	numTh = 1024;
	XmippDim3 blockSize4(numTh, 1, 1), gridSize4;
	myStructureAuxRT.RefExpRealSpacePolar.calculateGridSizeVectorized(blockSize4, gridSize4);

	// Normalization constant: number of polar samples per image.
	double maskFFTPolar = (referenceAux.XdimPolar*referenceAux.YdimPolar);
	calculateNccRotationKernel<<< CONVERT2DIM3(gridSize4), CONVERT2DIM3(blockSize4), 0, *streamRT >>>
			(myStructureAuxRT.RefExpRealSpacePolar.d_data, (cufftComplex*)referenceAux.d_projPolarFFT.d_data, (cufftComplex*)experimentalAuxRT.d_projPolarFFT.d_data,
			(cufftComplex*)referenceAux.d_projPolarSquaredFFT.d_data, (cufftComplex*)experimentalAuxRT.d_projPolarSquaredFFT.d_data,
			maskFFTPolar, myStructureAuxRT.d_NCCPolar.d_data, referenceAux.d_projPolarFFT.yxdim, myStructureAuxRT.RefExpRealSpacePolar.nzyxdim,
			myStructureAuxRT.RefExpRealSpacePolar.yxdim);

	//AJ sum along the radius
	numTh = 1024;
	int numBlk = (myStructureAuxRT.d_NCCPolar.Xdim*myStructureAuxRT.d_NCCPolar.Ndim)/numTh;
	if((myStructureAuxRT.d_NCCPolar.Xdim*myStructureAuxRT.d_NCCPolar.Ndim)%numTh!=0)
		numBlk++;

	sumRadiusKernel<<< numBlk, numTh, 0, *streamRT >>>(myStructureAuxRT.d_NCCPolar.d_data, myStructureAuxRT.d_NCCPolar1D.d_data, myStructureAuxRT.auxMax.d_data,
			myStructureAuxRT.auxZero.d_data, myStructureAuxRT.d_NCCPolar.Xdim*myStructureAuxRT.d_NCCPolar.Ndim, myStructureAuxRT.d_NCCPolar.Ydim,
			myStructureAuxRT.d_NCCPolar.Ndim);

	// Per-image argmax in each pipeline.
	calculateMaxNew2DNew(myStructureAuxTR.d_NCC.yxdim, myStructureAuxTR.d_NCC.Ndim,
			myStructureAuxTR.d_NCC.d_data, myStructureAuxTR.d_out_max, myStructureAuxTR.d_pos_max, myStreamTR);

	calculateMaxNew2DNew(myStructureAuxRT.d_NCCPolar1D.Xdim, myStructureAuxRT.d_NCCPolar1D.Ndim, myStructureAuxRT.d_NCCPolar1D.d_data,
			myStructureAuxRT.d_out_polar_max, myStructureAuxRT.d_pos_polar_max, myStreamRT);

	numTh = 1024;
	numBlk = transMatTR.Ndim/numTh;
	if(transMatTR.Ndim%numTh > 0)
		numBlk++;

	bool _power2x;
	if (myStructureAuxTR.d_NCC.Xdim & (myStructureAuxTR.d_NCC.Xdim-1))
		_power2x = false;
	else
		_power2x = true;
	double maxShift2 = (2*maxShift)*(2*maxShift);
	buildTranslationMatrix<<<numBlk, numTh, 0, *streamTR>>> (myStructureAuxTR.d_pos_max.d_data, transMatTR.d_data, resultTR.d_data,
			myStructureAuxTR.d_out_max.d_data, myStructureAuxTR.d_NCC.d_data, myStructureAuxTR.d_NCC.Xdim, myStructureAuxTR.d_NCC.Ydim,
			myStructureAuxTR.d_NCC.Ndim, myStructureAuxTR.d_NCC.yxdim, fixPadding, maxShift2, _power2x);

	numBlk = transMatRT.Ndim/numTh;
	if(transMatRT.Ndim%numTh > 0)
		numBlk++;

	bool __power2x;
	if (myStructureAuxRT.d_NCCPolar1D.Xdim & (myStructureAuxRT.d_NCCPolar1D.Xdim-1))
		__power2x = false;
	else
		__power2x = true;
	buildRotationMatrix<<<numBlk, numTh, 0, *streamRT>>> (myStructureAuxRT.d_pos_polar_max.d_data, transMatRT.d_data,
			resultRT.d_data, myStructureAuxRT.maxGpu.d_data, myStructureAuxRT.auxMax.d_data, myStructureAuxRT.auxZero.d_data,
			myStructureAuxRT.d_NCCPolar1D.Xdim, myStructureAuxRT.d_NCCPolar1D.Ndim,
			myStructureAuxRT.d_NCCPolar1D.yxdim, 0, maxShift2, __power2x);

	// Accumulate results into the running transforms, each on its own stream.
	resultTR.copyMatrix(transMatTR, myStreamTR);

	resultRT.copyMatrix(transMatRT, myStreamRT);

	if(saveMaxVector){
		gpuErrchk(cudaMemcpyAsync(max_vectorTR, myStructureAuxTR.d_out_max.d_data, myStructureAuxTR.d_NCC.Ndim*sizeof(float), cudaMemcpyDeviceToHost, *streamTR));
		gpuErrchk(cudaMemcpyAsync(max_vectorRT, myStructureAuxRT.maxGpu.d_data, myStructureAuxRT.maxGpu.Ndim*sizeof(float), cudaMemcpyDeviceToHost, *streamRT));
	}

}
1443 
1444 
1445 
1446 
/**
 * Apply the per-image affine transforms in transMat to a stack of images.
 *
 * Launch layout: grid.x covers one image plane (yxdim elements, 1024 threads
 * per block), grid.y indexes the image in the stack. The 9*sizeof(float)
 * dynamic shared memory holds one 3x3 transform matrix per block (see
 * applyTransformKernel). Asynchronous on myStream.
 *
 * NOTE(review): the opening signature line was lost in extraction and
 * reconstructed from this file's declaration index — confirm against the header.
 */
void apply_transform(GpuMultidimArrayAtGpu<float> &d_original_image, GpuMultidimArrayAtGpu<float> &d_transform_image,
		TransformMatrix<float> &transMat, myStreamHandle &myStream){

	cudaStream_t *stream = (cudaStream_t*) myStream.ptr;

	int numTh = 1024;

	int numBlk = d_transform_image.yxdim/numTh;
	if(d_transform_image.yxdim%numTh > 0)
		numBlk++;
	dim3 blockSize(numTh, 1, 1);
	dim3 gridSize(numBlk, d_transform_image.Ndim, 1);

	// Power-of-two flags let the kernel replace %/ with bit ops.
	bool power2yx, power2x;
	if (d_original_image.yxdim & (d_original_image.yxdim-1))
		power2yx = false;
	else
		power2yx = true;
	if (d_original_image.Xdim & (d_original_image.Xdim-1))
		power2x = false;
	else
		power2x = true;
	applyTransformKernel<<< gridSize, blockSize, 9*sizeof(float), *stream >>>
			(d_original_image.d_data, d_transform_image.d_data, transMat.d_data,
			d_original_image.nzyxdim, d_original_image.yxdim, d_original_image.Xdim,
			d_original_image.Ydim, power2yx, power2x);

}
1475 
1476 
1477 
/**
 * Convert a stack of Cartesian images into polar coordinates on the GPU.
 *
 * Launch layout: 32x32 thread blocks tiling (Xdim = angles, Ydim = radii) of
 * the polar output, grid.z indexing the image. cart2polar fills both
 * polar_image and polar2_image (per the kernel's two outputs); 'rotate' is
 * forwarded to the kernel. Asynchronous on myStream.
 *
 * NOTE(review): the opening signature line was lost in extraction and
 * reconstructed from this file's declaration index — confirm against the header.
 */
void cuda_cart2polar(GpuMultidimArrayAtGpu<float> &image, GpuMultidimArrayAtGpu<float> &polar_image,
		GpuMultidimArrayAtGpu<float> &polar2_image, bool rotate, myStreamHandle &myStream)
{
	cudaStream_t *stream = (cudaStream_t*) myStream.ptr;
	int numTh = 32;
	int numBlkx = polar_image.Xdim/numTh;
	if(polar_image.Xdim%numTh > 0)
		numBlkx++;
	int numBlky = polar_image.Ydim/numTh;
	if(polar_image.Ydim%numTh > 0)
		numBlky++;

	dim3 blockSize(numTh, numTh, 1);
	dim3 gridSize(numBlkx, numBlky, polar_image.Ndim);

	cart2polar <<< gridSize, blockSize, 0, *stream>>>
			(image.d_data, polar_image.d_data, polar2_image.d_data, polar_image.Ydim, polar_image.Xdim, polar_image.Ndim, image.Ydim, image.Xdim, rotate);
}
1496 
// Block the host until pending GPU work completes.
// allStreams == true : synchronize the whole device.
// allStreams == false: synchronize only the stream wrapped by myStream.
void waitGpu (myStreamHandle &myStream, bool allStreams){
	if (allStreams) {
		gpuErrchk(cudaDeviceSynchronize());
	} else {
		cudaStream_t *stream = (cudaStream_t*) myStream.ptr;
		gpuErrchk(cudaStreamSynchronize(*stream));
	}
}
1504 
// Copy one float per complex element of 'data' into 'out' on the device,
// asynchronously on myStream.
// NOTE(review): despite the name, calcAbsKernel writes d_in[idx].x (the real
// component), not the modulus — see the kernel definition at the top of this
// file; confirm that is the intended behavior.
void calculateAbs (std::complex<float> *data, float *out, int size, myStreamHandle &myStream){

	cudaStream_t *stream = (cudaStream_t*) myStream.ptr;
	const int numTh = 1024;
	const int numBlk = (size + numTh - 1) / numTh;  // ceil(size / numTh)
	calcAbsKernel <<< numBlk, numTh, 0, *stream>>> ((cufftComplex*)data, out, size);

}
#define gpuErrchk(code)
Definition: cuda_asserts.h:31
void cuda_calculate_correlation(GpuCorrelationAux &referenceAux, GpuCorrelationAux &experimentalAux, TransformMatrix< float > &transMat, float *max_vector, int maxShift, mycufftHandle &myhandlePadded, bool mirror, StructuresAux &myStructureAux, myStreamHandle &myStream, TransformMatrix< float > &resultTR, bool saveMaxVector)
void pointwiseMultiplicationFourier(const GpuMultidimArrayAtGpu< std::complex< float > > &M, const GpuMultidimArrayAtGpu< std::complex< float > > &manyF, GpuMultidimArrayAtGpu< std::complex< float > > &MmanyF, myStreamHandle &myStream)
void resize(const GpuMultidimArrayAtGpu< T1 > &array)
__global__ void calculateNccKernel(float *RefExpRealSpace, float *MFrealSpaceRef, float *MFrealSpaceExp, float *MF2realSpaceRef, float *MF2realSpaceExp, float *mask, float *NCC, int nzyxdim, int yxdim, int xdim, int ydim, int maskCount, int max_shift, bool power2yx, bool power2x)
__global__ void buildRotationMatrix(float *d_pos, float *lastMat, float *result, float *maxGpu, float *auxMax, float *NCC, int Xdim, int Ndim, int NCC_yxdim, int fixPadding, double maxShift2, bool power2x)
void padding_masking(GpuMultidimArrayAtGpu< float > &d_orig_image, GpuMultidimArrayAtGpu< float > &mask, GpuMultidimArrayAtGpu< float > &padded_image_gpu, GpuMultidimArrayAtGpu< float > &padded_image2_gpu, GpuMultidimArrayAtGpu< float > &padded_mask_gpu, bool experimental, myStreamHandle &myStream)
GpuMultidimArrayAtGpu< float > RefExpRealSpacePolar
__global__ void sumRadiusKernel(float *d_in, float *d_out, float *d_out_max, float *d_out_zero, int dim, int radius, int ndim)
__global__ void pointwiseMultiplicationComplexKernel(cufftComplex *reference, cufftComplex *experimental, cufftComplex *RefExpFourier, int nzyxdim, int yxdim)
void apply_transform(GpuMultidimArrayAtGpu< float > &d_original_image, GpuMultidimArrayAtGpu< float > &d_transform_image, TransformMatrix< float > &transMat, myStreamHandle &myStream)
GpuMultidimArrayAtGpu< std::complex< float > > d_projPolarSquaredFFT
void sqrt(Image< double > &op)
static double * y
__device__ void wrapping(int &x, int &y, int xdim, int ydim)
GpuMultidimArrayAtGpu< float > d_pos_polar_max
GpuMultidimArrayAtGpu< float > auxMax
__global__ void applyTransformKernel(float *d_in, float *d_out, float *transMat, int nzyxdim, int yxdim, int xdim, int ydim, bool power2yx, bool power2x)
GpuMultidimArrayAtGpu< float > RefExpRealSpace
void calculateMaxNew2DNew(int yxdim, int Ndim, float *d_data, GpuMultidimArrayAtGpu< float > &d_out, GpuMultidimArrayAtGpu< float > &d_pos, myStreamHandle &myStream)
__global__ void calcAbsKernel(cufftComplex *d_in, float *d_out, int dim)
doublereal * x
#define i
#define rotate(a, i, j, k, l)
__global__ void pointwiseMultiplicationComplexOneManyKernel_two(cufftComplex *M, cufftComplex *manyF, cufftComplex *MmanyF, cufftComplex *manyF_sq, cufftComplex *MmanyF_sq, int nzyxdim, int yxdim, int ndim, bool power2)
GpuMultidimArrayAtGpu< std::complex< float > > RefExpFourier
GpuMultidimArrayAtGpu< float > maskAutocorrelation
GpuMultidimArrayAtGpu< std::complex< float > > d_projFFT
void ifftStream(GpuMultidimArrayAtGpu< T1 > &realSpace, mycufftHandle &myhandle, myStreamHandle &myStream, bool useCallback, GpuMultidimArrayAtGpu< std::complex< float > > &dataExp)
GpuMultidimArrayAtGpu< float > d_transform_image
GpuMultidimArrayAtGpu< std::complex< float > > RefExpFourierPolar
void cuda_calculate_correlation_two(GpuCorrelationAux &referenceAux, GpuCorrelationAux &experimentalAuxTR, TransformMatrix< float > &transMatTR, float *max_vectorTR, int maxShift, mycufftHandle &myhandlePaddedTR, bool mirror, StructuresAux &myStructureAuxTR, myStreamHandle &myStreamTR, GpuCorrelationAux &experimentalAuxRT, TransformMatrix< float > &transMatRT, float *max_vectorRT, mycufftHandle &myhandlePaddedRT, StructuresAux &myStructureAuxRT, myStreamHandle &myStreamRT, TransformMatrix< float > &resultTR, TransformMatrix< float > &resultRT, mycufftHandle &ifftcb, bool saveMaxVector)
if(fabs(c[*nmax+ *nmax *c_dim1])==0.e0)
GpuMultidimArrayAtGpu< float > d_out_max
viol index
__global__ void calculateDenomFunctionKernel(float *MFrealSpace, float *MF2realSpace, float *maskAutocorrelation, float *out, int nzyxdim, int yxdim, bool power2)
GpuMultidimArrayAtGpu< std::complex< float > > d_projPolarFFT
double * f
GpuMultidimArrayAtGpu< float > d_original_image
GpuMultidimArrayAtGpu< float > d_NCCPolar
void calculateGridSizeVectorized(const XmippDim3 &blockSize, XmippDim3 &gridSize) const
GpuMultidimArrayAtGpu< float > d_NCCPolar1D
GpuMultidimArrayAtGpu< std::complex< float > > MF2
void cuda_cart2polar(GpuMultidimArrayAtGpu< float > &image, GpuMultidimArrayAtGpu< float > &polar_image, GpuMultidimArrayAtGpu< float > &polar2_image, bool rotate, myStreamHandle &myStream)
void pointwiseMultiplicationFourier_two(const GpuMultidimArrayAtGpu< std::complex< float > > &M, const GpuMultidimArrayAtGpu< std::complex< float > > &manyF, GpuMultidimArrayAtGpu< std::complex< float > > &MmanyF, const GpuMultidimArrayAtGpu< std::complex< float > > &manyF_sq, GpuMultidimArrayAtGpu< std::complex< float > > &MmanyF_sq, myStreamHandle &myStream)
__global__ void cart2polar(float *image, float *polar, float *polar2, int maxRadius, int maxAng, int Nimgs, int Ydim, int Xdim, bool rotate)
GpuMultidimArrayAtGpu< std::complex< float > > d_projSquaredFFT
#define j
void produceSideInfo(mycufftHandle &myhandlePadded, mycufftHandle &myhandleMask, StructuresAux &myStructureAux, myStreamHandle &myStream)
GpuMultidimArrayAtGpu< float > d_pos_max
GpuMultidimArrayAtGpu< float > MF2realSpace
GpuMultidimArrayAtGpu< float > maxGpu
GpuMultidimArrayAtGpu< float > MFrealSpace
__global__ void calculateNccRotationKernel(float *RefExpRealSpace, cufftComplex *polarFFTRef, cufftComplex *polarFFTExp, cufftComplex *polarSquaredFFTRef, cufftComplex *polarSquaredFFTExp, float maskFFTPolarReal, float *NCC, int yxdimFFT, int nzyxdim, int yxdim)
__global__ void calculateMaxThreads(float *d_in, float *d_out, float *position, int yxdim, int Ndim, int yxdim2, int Ndim2)
void pointwiseMultiplicationFourier_three(const GpuMultidimArrayAtGpu< std::complex< float > > &M, const GpuMultidimArrayAtGpu< std::complex< float > > &manyF, GpuMultidimArrayAtGpu< std::complex< float > > &MmanyF, const GpuMultidimArrayAtGpu< std::complex< float > > &manyF_sq, GpuMultidimArrayAtGpu< std::complex< float > > &MmanyF_sq, myStreamHandle &myStream, GpuMultidimArrayAtGpu< std::complex< float > > &maskAux)
void calculateAbs(std::complex< float > *data, float *out, int size, myStreamHandle &myStream)
GpuMultidimArrayAtGpu< float > d_NCC
void copyMatrix(TransformMatrix< float > &lastMatrix, myStreamHandle &myStream)
GpuMultidimArrayAtGpu< float > auxZero
GpuMultidimArrayAtGpu< float > d_out_polar_max
void cuda_calculate_correlation_rotation(GpuCorrelationAux &referenceAux, GpuCorrelationAux &experimentalAux, TransformMatrix< float > &transMat, float *max_vector, int maxShift, mycufftHandle &myhandlePadded, bool mirror, StructuresAux &myStructureAux, myStreamHandle &myStream, TransformMatrix< float > &resultRT)
__global__ void calculateMax2(float *d_in, float *d_out, float *position, int yxdim, int Ndim, bool firstCall)
GpuMultidimArrayAtGpu< std::complex< float > > MF
#define PI
void copyGpuToGpuStream(GpuMultidimArrayAtGpu< T > &gpuArray, myStreamHandle &myStream)
GpuMultidimArrayAtGpu< std::complex< float > > d_maskFFT
int * n
__global__ void buildTranslationMatrix(float *d_pos, float *lastMat, float *result, float *maxGpu, float *NCC, int Xdim, int Ydim, int Ndim, int NCC_yxdim, int fixPadding, double maxShift2, bool power2x)
__global__ void pointwiseMultiplicationComplexOneManyKernel_three(cufftComplex *M, cufftComplex *manyF, cufftComplex *MmanyF, cufftComplex *manyF_sq, cufftComplex *MmanyF_sq, cufftComplex *maskAux, int nzyxdim, int yxdim, int ndim, bool power2)
void waitGpu(myStreamHandle &myStream, bool allStreams)
__global__ void maskingPaddingKernel(float *d_in, float *mask, float *padded_image_gpu, float *padded_image2_gpu, float *padded_mask_gpu, int xdim, int ydim, int yxdim, int numImag, int pad_xdim, int pad_ydim, int pad_yxdim, bool experimental, bool power2x)
__global__ void pointwiseMultiplicationComplexOneManyKernel(cufftComplex *M, cufftComplex *manyF, cufftComplex *MmanyF, int nzyxdim, int yxdim, bool power2)