11 #include <cuda_runtime.h> 19 #define PI 3.14159265359 22 __global__
// Kernel: one thread per element; writes the real component of each
// cufftComplex input to the float output array.
// NOTE(review): original lines 23 and 25-28 are missing from this extract;
// the bounds check against `dim` (and any magnitude computation the kernel
// name suggests) is not visible here -- confirm against the full source.
void calcAbsKernel(cufftComplex *d_in,
float *d_out,
int dim){
24 int idx = blockDim.x * blockIdx.x + threadIdx.x;
29 d_out[idx]=d_in[idx].x;
33 __global__
// Kernel: one thread per (image, angle) pair. Accumulates d_in along the
// radius for that angle, tracking a running maximum, and stores the
// radius-0 sample into d_out_zero and the maximum into d_out_max.
// Layout implied by the indexing below: d_in holds 360 angles per ring,
// `radius` rings per image.
// NOTE(review): original lines 35, 38-43, 48-49, 53-55, 57-58 and 60-63 are
// missing from this extract; the declarations of `angle` and `out`, the
// early-exit guard, the -1.0 sentinel handling and the write of the summed
// value into d_out are not visible -- confirm against the full source.
void sumRadiusKernel(
float *d_in,
float *d_out,
float *d_out_max,
float *d_out_zero,
34 int dim,
int radius,
int ndim){
36 int idx = blockDim.x * blockIdx.x + threadIdx.x;
// Image index: 360 consecutive angle-threads belong to one image.
37 int numIm = floorf(idx/360.0
f);
44 float out_max = -100000;
// Start of this image's data: 360 angles * `radius` rings per image.
45 int idxRead=360*radius*numIm;
46 for(
int i=0;
i<radius;
i++){
47 if(d_in[idxRead+(360*
i)+angle]==-1.0){
50 out += d_in[idxRead+(360*
i)+angle];
51 if(d_in[idxRead+(360*
i)+angle]>d_out_max[idx]){
52 out_max = d_in[idxRead+(360*
i)+angle];
// Sample at radius index 0 for this angle.
56 d_out_zero[idx] = d_in[idxRead+angle];
59 d_out_max[idx] = out_max;
64 __global__
// Kernel: shared-memory max-reduction over each image. sdata holds value
// and position pairs: values in sdata[0..blockSize), the index of each
// value in sdata[blockSize..2*blockSize). The per-image loop reduces one
// image per iteration, writing the block maximum and its position to
// d_out/position at blockIdx.x + gridDim.x*imN.
// NOTE(review): many original lines are missing from this extract (the
// declaration of `index`, the __syncthreads() barriers between reduction
// steps, the idx-range guards around each step, and the if(idx==0) guard
// before the final writes are not visible) -- confirm against the full
// source before relying on this listing.
void calculateMax2(
float *d_in,
float *d_out,
float *position,
int yxdim,
int Ndim,
bool firstCall){
66 extern __shared__
float sdata[];
68 int idx = threadIdx.x;
69 int blockSize = blockDim.x;
73 int i = blockIdx.x * blockSize + idx;
78 for(
int imN=0; imN<Ndim; imN++){
80 if(i<yxdim*gridDim.x){
81 sdata[idx]=d_in[i+
index];
// On the first call positions are initialised to the thread index;
// otherwise the previously computed positions are carried forward.
84 sdata[idx+blockSize] = (float)idx;
86 sdata[idx+blockSize] = position[i+
index];
93 if(i>=yxdim*gridDim.x)
// Tree reduction: halve the active range each step, keeping the larger
// value and carrying its stored position alongside it.
98 if(blockSize >= 1024){
100 sdata[idx] = fmaxf(sdata[idx], sdata[idx+512]);
101 sdata[idx+blockSize] = (sdata[idx]==sdata[idx+512]) ? sdata[idx+blockSize+512] : sdata[idx+blockSize];
105 if(blockSize >= 512){
107 sdata[idx] = fmaxf(sdata[idx], sdata[idx+256]);
108 sdata[idx+blockSize] = (sdata[idx]==sdata[idx+256]) ? sdata[idx+blockSize+256] : sdata[idx+blockSize];
112 if(blockSize >= 256){
114 sdata[idx] = fmaxf(sdata[idx], sdata[idx+128]);
115 sdata[idx+blockSize] = (sdata[idx]==sdata[idx+128]) ? sdata[idx+blockSize+128] : sdata[idx+blockSize];
119 if(blockSize >= 128){
121 sdata[idx] = fmaxf(sdata[idx], sdata[idx+64]);
122 sdata[idx+blockSize] = (sdata[idx]==sdata[idx+64]) ? sdata[idx+blockSize+64] : sdata[idx+blockSize];
// Final warp-width steps (offsets 32..1).
129 sdata[idx] = fmaxf(sdata[idx], sdata[idx+32]);
130 sdata[idx+blockSize] = (sdata[idx]==sdata[idx+32]) ? sdata[idx+blockSize+32] : sdata[idx+blockSize];
135 sdata[idx] = fmaxf(sdata[idx], sdata[idx+16]);
136 sdata[idx+blockSize] = (sdata[idx]==sdata[idx+16]) ? sdata[idx+blockSize+16] : sdata[idx+blockSize];
141 sdata[idx] = fmaxf(sdata[idx], sdata[idx+8]);
142 sdata[idx+blockSize] = (sdata[idx]==sdata[idx+8]) ? sdata[idx+blockSize+8] : sdata[idx+blockSize];
147 sdata[idx] = fmaxf(sdata[idx], sdata[idx+4]);
148 sdata[idx+blockSize] = (sdata[idx]==sdata[idx+4]) ? sdata[idx+blockSize+4] : sdata[idx+blockSize];
153 sdata[idx] = fmaxf(sdata[idx], sdata[idx+2]);
154 sdata[idx+blockSize] = (sdata[idx]==sdata[idx+2]) ? sdata[idx+blockSize+2] : sdata[idx+blockSize];
159 sdata[idx] = fmaxf(sdata[idx], sdata[idx+1]);
160 sdata[idx+blockSize] = (sdata[idx]==sdata[idx+1]) ? sdata[idx+blockSize+1] : sdata[idx+blockSize];
// Per-image result: block maximum and its position.
167 d_out[blockIdx.x+(gridDim.x*imN)] = sdata[0];
168 position[blockIdx.x+(gridDim.x*imN)] = sdata[blockSize];
// Advance to the next image's data.
173 index = index+(int)yxdim;
181 cufftComplex *manyF, cufftComplex *MmanyF, cufftComplex *manyF_sq, cufftComplex *MmanyF_sq,
182 cufftComplex *maskAux,
int nzyxdim,
int yxdim,
int ndim,
bool power2)
// Body of pointwiseMultiplicationComplexOneManyKernel_three (the kernel
// header and the leading `cufftComplex *M` parameter precede this extract).
// One block per image; each thread strides over the image in steps of
// blockSize, multiplying manyF and manyF_sq by the shared mask M and
// normalising by 1/yxdim; maskAux receives M*M*normFactor.
// NOTE(review): declarations of myIdx/myIdxMask, the myIdx increments, the
// loads of myM inside the loop and the closing braces are in lines missing
// from this extract; `mulOut*normFactor` relies on a cuComplex*float
// operator defined elsewhere in the project.
185 int idx = threadIdx.x;
186 int nIm = blockIdx.x;
187 int blockSize = blockDim.x;
189 int posTh = nIm*yxdim + idx;
// Number of strided passes needed to cover one image.
190 int n = ceilf(((
float)yxdim/(
float)blockSize));
195 float normFactor = 1.0f/yxdim;
// First pass (i == 0).
200 cufftComplex myM = M[myIdxMask];
201 cuComplex mulOut = cuCmulf(manyF[myIdx], myM);
202 cuComplex mulOut_sq = cuCmulf(manyF_sq[myIdx], myM);
204 MmanyF[myIdx] = mulOut*normFactor;
205 MmanyF_sq[myIdx] = mulOut_sq*normFactor;
207 maskAux[myIdx] = cuCmulf(myM, myM)*normFactor;
210 myIdxMask+=blockSize;
// Remaining strided passes.
212 for (
int i=1;
i<
n;
i++){
214 if (posTh+
i*blockSize < yxdim*(nIm+1)){
217 mulOut = cuCmulf(manyF[myIdx], myM);
218 mulOut_sq = cuCmulf(manyF_sq[myIdx], myM);
220 MmanyF[myIdx] = mulOut*normFactor;
221 MmanyF_sq[myIdx] = mulOut_sq*normFactor;
223 maskAux[myIdx] = cuCmulf(myM, myM)*normFactor;
226 myIdxMask+=blockSize;
234 cufftComplex *manyF, cufftComplex *MmanyF, cufftComplex *manyF_sq, cufftComplex *MmanyF_sq,
235 int nzyxdim,
int yxdim,
int ndim,
bool power2)
// Body of pointwiseMultiplicationComplexOneManyKernel_two (header and the
// leading `cufftComplex *M` parameter precede this extract). Same strided
// per-image mask multiplication as the _three variant, but without the
// maskAux output.
// NOTE(review): declarations of myIdx/myIdxMask, the in-loop myM loads and
// closing braces are in lines missing from this extract.
238 int idx = threadIdx.x;
239 int nIm = blockIdx.x;
240 int blockSize = blockDim.x;
242 int posTh = nIm*yxdim + idx;
// Number of strided passes needed to cover one image.
243 int n = ceilf(((
float)yxdim/(
float)blockSize));
248 float normFactor = 1.0f/yxdim;
// First pass (i == 0).
253 cufftComplex myM = M[myIdxMask];
254 cuComplex mulOut = cuCmulf(manyF[myIdx], myM);
255 cuComplex mulOut_sq = cuCmulf(manyF_sq[myIdx], myM);
257 MmanyF[myIdx] = mulOut*normFactor;
258 MmanyF_sq[myIdx] = mulOut_sq*normFactor;
261 myIdxMask+=blockSize;
// Remaining strided passes.
263 for (
int i=1;
i<
n;
i++){
265 if (posTh+
i*blockSize < yxdim*(nIm+1)){
268 mulOut = cuCmulf(manyF[myIdx], myM);
269 mulOut_sq = cuCmulf(manyF_sq[myIdx], myM);
271 MmanyF[myIdx] = mulOut*normFactor;
272 MmanyF_sq[myIdx] = mulOut_sq*normFactor;
275 myIdxMask+=blockSize;
284 int nzyxdim,
int yxdim,
bool power2)
// Body of pointwiseMultiplicationComplexOneManyKernel (header and the
// M/manyF/MmanyF parameters precede this extract). One thread per element
// of the stacked images; multiplies each element by the single mask M
// (index reduced modulo yxdim) and normalises by 1/yxdim.
// NOTE(review): the idx>=nzyxdim guard and the non-power-of-two else branch
// (idx % yxdim) are in lines missing from this extract; the bitmask form
// below is only valid when yxdim is a power of two (power2 flag).
287 int idx = blockDim.x * blockIdx.x + threadIdx.x;
290 idxLow = idx & (yxdim-1);
297 float normFactor = (1.0f/yxdim);
299 cuComplex mulOut = cuCmulf(manyF[idx], M[idxLow]);
300 MmanyF[idx] = mulOut*normFactor;
306 float *out,
int nzyxdim,
int yxdim,
bool power2)
// Body of calculateDenomFunctionKernel (header and the MFrealSpace /
// MF2realSpace / maskAutocorrelation parameters precede this extract).
// Computes the per-pixel NCC denominator sqrt(E[x^2] - E[x]^2/mask).
// NOTE(review): bounds guard and the non-power-of-two modulo branch are in
// missing lines; `sqrt` here is the double overload -- sqrtf would avoid
// the float->double round trip (cannot verify intent from this extract).
308 int idx = blockDim.x * blockIdx.x + threadIdx.x;
311 idxLow = idx & (yxdim-1);
318 out[idx] =
sqrt(MF2realSpace[idx] - (MFrealSpace[idx]*MFrealSpace[idx]/maskAutocorrelation[idxLow]));
323 __global__
// Kernel: one thread per pixel of the stacked correlation images. Computes
// the normalized cross-correlation value
//   NCC = (RefExp - RefMean*ExpMean/mask) / (den1*den2)
// only for pixels inside the allowed shift window (max_shift border on each
// side) and where the mask overlap exceeds 90% of maskCount.
// NOTE(review): the idx>=nzyxdim guard, the non-power-of-two modulo
// branches selected by power2yx/power2x, the declarations of idxLow/idx_x/
// num and the else branches writing a default NCC value are in lines
// missing from this extract. `sqrt` is the double overload; sqrtf would
// avoid float->double promotion.
void calculateNccKernel(
float *RefExpRealSpace,
float *MFrealSpaceRef,
float *MFrealSpaceExp,
324 float *MF2realSpaceRef,
float *MF2realSpaceExp,
float *mask,
float *NCC,
325 int nzyxdim,
int yxdim,
int xdim,
int ydim,
int maskCount,
int max_shift,
bool power2yx,
bool power2x)
329 int idx = blockDim.x * blockIdx.x + threadIdx.x;
// Pixel index within one image (bitmask form valid only for power-of-two yxdim).
332 idxLow = idx & (yxdim-1);
341 idx_x = idxLow & (xdim-1);
345 int idx_y=idxLow/xdim;
// Restrict evaluation to the central region reachable within max_shift.
346 if(idx_x>=max_shift && idx_x<xdim-max_shift){
350 if(idx_y>=max_shift && idx_y<ydim-max_shift){
355 float maskTh = mask[idxLow];
356 float MF2realSpaceRefTh = MF2realSpaceRef[idx];
357 float MFrealSpaceRefTh = MFrealSpaceRef[idx];
358 float MF2realSpaceExpTh = MF2realSpaceExp[idx];
359 float MFrealSpaceExpTh = MFrealSpaceExp[idx];
360 float RefExpRealSpaceTh = RefExpRealSpace[idx];
// Per-image standard deviations under the mask.
362 float den1 =
sqrt(MF2realSpaceRefTh - (MFrealSpaceRefTh*MFrealSpaceRefTh/maskTh));
363 float den2 =
sqrt(MF2realSpaceExpTh - (MFrealSpaceExpTh*MFrealSpaceExpTh/maskTh));
// Guard against degenerate denominators and insufficient mask overlap.
366 if(den1!=0.0 && den2!=0.0 && !isnan(den1) && !isnan(den2) && maskTh>maskCount*0.9){
367 num = (RefExpRealSpaceTh - ((MFrealSpaceRefTh*MFrealSpaceExpTh)/(maskTh)) );
368 NCC[idx] = num/(den1*den2);
374 __device__
// Device helper: adjusts the (x, y) coordinate pair in place with respect
// to the image dimensions (x and y are passed by reference and modified).
// NOTE(review): the entire body (original lines 375-386) is missing from
// this extract; presumably it wraps out-of-range coordinates back into
// [0, xdim) x [0, ydim) -- confirm against the full source.
void wrapping (
int &
x,
int &
y,
int xdim,
int ydim){
387 int xdim,
int ydim,
bool power2yx,
bool power2x)
// Body of applyTransformKernel (header and the d_in/d_out/transMat/nzyxdim/
// yxdim parameters precede this extract). Applies a 3x3 affine transform
// (one 9-float matrix per image, staged into shared memory) to each pixel
// by inverse mapping plus bilinear interpolation, with coordinate wrapping
// at the borders.
// NOTE(review): the __syncthreads() after the shared-memory load, the
// bounds guard, the declarations/initialisation of x, x_orig and y_orig,
// and the non-power-of-two index branches are in lines missing from this
// extract. Also note line 422 adds xdim/2.0f (not ydim/2.0f) to the y
// coordinate -- correct only for square images; flag for verification.
390 extern __shared__
float transMatSD[];
392 int idxIm = blockDim.x * blockIdx.x + threadIdx.x;
393 int idxTh = threadIdx.x;
394 int numIm = blockIdx.y;
// First 9 threads stage this image's transform matrix into shared memory.
401 transMatSD[idxTh] = transMat[idxTh+(numIm*9)];
406 x = idxIm & (xdim-1);
410 float y = idxIm/xdim;
// Inverse mapping into the source image, recentred on the image middle.
421 x_orig += transMatSD[0]*x - transMatSD[1]*y + xdim/2.0f;
422 y_orig += -transMatSD[3]*x + transMatSD[4]*y + xdim/2.0f;
// Bilinear interpolation: four neighbouring source pixels and weights.
426 int x_orig00 = (int)floorf(x_orig);
427 int y_orig00 = (int)floorf(y_orig);
428 int x_orig01 = x_orig00+1;
429 int y_orig01 = y_orig00;
430 int x_orig10 = x_orig00;
431 int y_orig10 = y_orig00+1;
432 int x_orig11 = x_orig00+1;
433 int y_orig11 = y_orig00+1;
435 float x_x_low=x_orig-x_orig00;
436 float y_y_low=y_orig-y_orig00;
437 float one_x=1.0-x_x_low;
438 float one_y=1.0-y_y_low;
439 float w00=one_y*one_x;
440 float w01=one_y*x_x_low;
441 float w10=y_y_low*one_x;
442 float w11=y_y_low*x_x_low;
// Wrap each neighbour coordinate back inside the image.
444 wrapping (x_orig00, y_orig00, xdim, ydim);
445 wrapping (x_orig01, y_orig01, xdim, ydim);
446 wrapping (x_orig10, y_orig10, xdim, ydim);
447 wrapping (x_orig11, y_orig11, xdim, ydim);
449 int imgIdx00=y_orig00 * xdim + x_orig00;
450 int imgIdx01=y_orig01 * xdim + x_orig01;
451 int imgIdx10=y_orig10 * xdim + x_orig10;
452 int imgIdx11=y_orig11 * xdim + x_orig11;
454 int imgOffset = numIm*yxdim;
455 float I00 = d_in[imgIdx00+imgOffset];
456 float I01 = d_in[imgIdx01+imgOffset];
457 float I10 = d_in[imgIdx10+imgOffset];
458 float I11 = d_in[imgIdx11+imgOffset];
459 float imVal = I00*w00 + I01*w01 + I10*w10 + I11*w11;
461 d_out[idxIm+(numIm*yxdim)] = imVal;
468 cufftComplex *polarSquaredFFTRef, cufftComplex *polarSquaredFFTExp,
float maskFFTPolarReal,
float *NCC,
469 int yxdimFFT,
int nzyxdim,
int yxdim)
// Body of calculateNccRotationKernel (header and the RefExpRealSpace /
// polarFFTRef / polarFFTExp parameters precede this extract). Computes the
// rotational NCC per element: means and second moments come from the DC
// (real) component of the polar FFTs, normalised by the polar mask value.
// NOTE(review): the idx>=nzyxdim guard and any else branch writing a
// default NCC value are in lines missing from this extract; `sqrt` is the
// double overload (sqrtf would avoid float->double promotion).
473 int idx = blockDim.x * blockIdx.x + threadIdx.x;
// Index of the DC term of this element's image in the FFT layout.
474 int idxLow = (idx/yxdim)*yxdimFFT;
479 float normValue = 1.0f/yxdimFFT;
480 float maskNorm = maskFFTPolarReal*normValue;
482 float M1M2Polar = maskFFTPolarReal*maskNorm;
483 float polarValRef = cuCrealf(polarFFTRef[idxLow])*maskNorm;
484 float polarSqValRef = cuCrealf(polarSquaredFFTRef[idxLow])*maskNorm;
486 float polarValExp = cuCrealf(polarFFTExp[idxLow])*maskNorm;
487 float polarSqValExp = cuCrealf(polarSquaredFFTExp[idxLow])*maskNorm;
489 float num = (RefExpRealSpace[idx] - (polarValRef*polarValExp/M1M2Polar) );
490 float den1 =
sqrt(polarSqValRef - (polarValRef*polarValRef/M1M2Polar) );
491 float den2 =
sqrt(polarSqValExp - (polarValExp*polarValExp/M1M2Polar) );
// Only write when both denominators are valid and non-zero.
493 if(den1!=0.0 && den2!=0.0 && !isnan(den1) && !isnan(den2))
494 NCC[idx] = num/(den1*den2);
502 cufftComplex *RefExpFourier,
int nzyxdim,
int yxdim)
// Body of pointwiseMultiplicationComplexKernel (header and the reference /
// experimental parameters precede this extract). One thread per element:
// complex product of reference and experimental spectra, normalised by
// 1/yxdim (cross-correlation in Fourier space).
// NOTE(review): the idx>=nzyxdim guard is in lines missing from this
// extract; `mulOut*normFactor` relies on a cuComplex*float operator
// defined elsewhere in the project.
504 int idx = blockDim.x * blockIdx.x + threadIdx.x;
509 float normFactor = (1.0f/yxdim);
511 cuComplex mulOut = cuCmulf(reference[idx], experimental[idx]);
512 RefExpFourier[idx] = mulOut*normFactor;
518 float *padded_image2_gpu,
float *padded_mask_gpu,
int xdim,
int ydim,
int yxdim,
519 int numImag,
int pad_xdim,
int pad_ydim,
int pad_yxdim,
bool experimental,
bool power2x){
// Body of maskingPaddingKernel (header and the d_in/mask/padded_image_gpu
// parameters precede this extract). One thread per source pixel, one
// blockIdx.y per image: masks the input pixel, squares it, and writes both
// into the centre of the padded output arrays. For experimental images the
// write position is mirrored (180-degree rotation). The mask itself is
// padded once (j==0) when padded_mask_gpu is provided.
// NOTE(review): the bounds guard, declarations of x_idx1/idxWriteToMask/
// x_idx/d_out/d_out2/offset/idxWrite/idxWriteMask/j, the experimental
// centre-offset adjustment (orig. 544-550) and the loop over j are in
// lines missing from this extract; the bitmask index forms are valid only
// when xdim is a power of two (power2x flag).
521 int idx = blockDim.x * blockIdx.x + threadIdx.x;
522 int numIm = blockIdx.y;
529 x_idx1 = idx & (xdim-1);
533 int y_idx1 = idx/xdim;
// Mirrored destination for experimental images; identity otherwise.
536 idxWriteToMask = (ydim-1 - y_idx1)*xdim + (xdim-1 - x_idx1);
538 idxWriteToMask = y_idx1*xdim + x_idx1;
// Top-left corner of the centred image inside the padded frame.
540 int xdim2Im = (int)floorf((pad_xdim-xdim)/2.0f);
541 int ydim2Im = (int)floorf((pad_ydim-ydim)/2.0f);
542 int xdim2Mask = xdim2Im;
543 int ydim2Mask = ydim2Im;
544 if(experimental && (xdim&1)==0){
551 x_idx = idxWriteToMask & (xdim-1);
553 x_idx = idxWriteToMask%xdim;
555 int y_idx = idxWriteToMask/xdim;
// Masked value and its square for this image (offset selects the image).
567 d_out = d_in[idx+offset]*mask[idx];
568 d_out2 = d_out*d_out;
570 idxWrite = (pad_yxdim*
j) + (ydim2Im*pad_xdim) + (y_idx*pad_xdim) + xdim2Im + x_idx;
572 idxWriteMask = (pad_yxdim*j) + (ydim2Mask*pad_xdim) + (y_idx*pad_xdim) + xdim2Mask + x_idx;
574 idxWriteMask = idxWrite;
575 padded_image_gpu[idxWrite] = d_out;
576 padded_image2_gpu[idxWrite] = d_out2;
// The padded mask is written only once, on the first image slot.
577 if(j==0 && padded_mask_gpu!=NULL)
578 padded_mask_gpu[idxWriteMask] = mask[idx];
585 float *maxGpu,
float *NCC,
int Xdim,
int Ydim,
int Ndim,
int NCC_yxdim,
586 int fixPadding,
double maxShift2,
bool power2x){
// Body of buildTranslationMatrix (header and the d_pos/lastMat/result
// parameters precede this extract). One thread per image: converts the
// argmax NCC position into a signed (posX, posY) shift by quadrant, builds
// the new translation matrix, composes it with the previous per-image 3x3
// matrix lastMat, and writes the composed matrix to result. If the
// accumulated shift exceeds maxShift2 the previous matrix is kept and
// maxGpu falls back to the NCC origin value.
// NOTE(review): the idx>=Ndim guard, declarations of posX/posY/idx9 and of
// new0..new5, the fixPadding handling and closing braces are in lines
// missing from this extract; the bitmask form of posX_aux requires Xdim to
// be a power of two (power2x flag).
588 int idx = blockDim.x * blockIdx.x + threadIdx.x;
593 int position = (int)d_pos[idx];
597 posX_aux = position & (Xdim-1);
599 posX_aux = position%Xdim;
601 float posY_aux = (float)(position/Xdim);
602 float Xdim2 = (Xdim/2.0f);
603 float Ydim2 = (Ydim/2.0f);
// Map the FFT-shift quadrant of (posX_aux, posY_aux) to a signed shift.
606 if(posX_aux>=Xdim2 && posY_aux>=Ydim2){
607 posX = Xdim-1-posX_aux;
608 posY = Ydim-1-posY_aux;
609 }
else if(posX_aux<Xdim2 && posY_aux>=Ydim2){
610 posX = -(posX_aux+1);
611 posY = Ydim-1-posY_aux;
612 }
else if(posX_aux<Xdim2 && posY_aux<Ydim2){
613 posX = -(posX_aux+1);
614 posY = -(posY_aux+1);
615 }
else if(posX_aux>=Xdim2 && posY_aux<Ydim2){
616 posX = Xdim-1-posX_aux;
617 posY = -(posY_aux+1);
// Previous accumulated 3x3 transform for this image (row-major, 9 floats).
636 float last0 = lastMat[idx9];
637 float last1 = lastMat[idx9+1];
638 float last2 = lastMat[idx9+2];
639 float last3 = lastMat[idx9+3];
640 float last4 = lastMat[idx9+4];
641 float last5 = lastMat[idx9+5];
642 float last6 = lastMat[idx9+6];
643 float last7 = lastMat[idx9+7];
644 float last8 = lastMat[idx9+8];
// Accumulated shift after composing the new translation.
646 float shiftx = new0*last2 + new1*last5 + new2*last8;
647 float shifty = new3*last2 + new4*last5 + new5*last8;
648 float radShift = shiftx*shiftx + shifty*shifty;
// Reject updates whose total shift leaves the allowed radius.
649 if(radShift > maxShift2){
650 result[idx9] = last0;
651 result[idx9+1] = last1;
652 result[idx9+2] = last2;
653 result[idx9+3] = last3;
654 result[idx9+4] = last4;
655 result[idx9+5] = last5;
656 maxGpu[idx] = NCC[idx*NCC_yxdim];
// Otherwise store the composed matrix (3x3 product, bottom row implicit).
658 result[idx9] = new0*last0 + new1*last3 + new2*last6;
659 result[idx9+2] = shiftx;
660 result[idx9+1] = new0*last1 + new1*last4 + new2*last7;
661 result[idx9+3] = new3*last0 + new4*last3 + new5*last6;
662 result[idx9+4] = new3*last1 + new4*last4 + new5*last7;
663 result[idx9+5] = shifty;
671 float *maxGpu,
float *auxMax,
float *NCC,
int Xdim,
int Ndim,
int NCC_yxdim,
672 int fixPadding,
double maxShift2,
bool power2x){
// Body of buildRotationMatrix (header and the d_pos/lastMat/result
// parameters precede this extract). One thread per image: converts the
// argmax angular position into a signed rotation angle, builds the 2D
// rotation matrix, composes it with the previous per-image 3x3 matrix
// lastMat, and writes the result. If the accumulated shift exceeds
// maxShift2 the previous matrix is kept and maxGpu falls back to the NCC
// origin value.
// NOTE(review): the idx>=Ndim guard, declarations of posX/posX_aux/idx9,
// new2/new5, the fixPadding handling and closing braces are in lines
// missing from this extract; the bitmask form of posX_aux requires Xdim to
// be a power of two (power2x flag).
674 int idx = blockDim.x * blockIdx.x + threadIdx.x;
// Pick the maximum over the 360 angular bins found for this image.
680 int position = (int)d_pos[idx];
681 maxGpu[idx] = auxMax[position+(idx*360)];
685 posX_aux = position & (Xdim-1);
687 posX_aux = position%Xdim;
689 float Xdim2 = (Xdim/2.0f);
// Map the wrapped angular index to a signed rotation.
693 posX = -(posX_aux+1);
694 }
else if(posX_aux>=Xdim2){
695 posX = Xdim-1-posX_aux;
// Rotation angle in radians (NOTE: PI/180 here is a double expression).
701 float rad = (float)(-posX*
PI/180);
705 float new0 = cosf(rad);
706 float new1 = -sinf(rad);
708 float new3 = sinf(rad);
709 float new4 = cosf(rad);
// Previous accumulated 3x3 transform for this image (row-major, 9 floats).
715 float last0 = lastMat[idx9];
716 float last1 = lastMat[idx9+1];
717 float last2 = lastMat[idx9+2];
718 float last3 = lastMat[idx9+3];
719 float last4 = lastMat[idx9+4];
720 float last5 = lastMat[idx9+5];
721 float last6 = lastMat[idx9+6];
722 float last7 = lastMat[idx9+7];
723 float last8 = lastMat[idx9+8];
// Accumulated shift after composing the new rotation.
725 float shiftx = new0*last2 + new1*last5 + new2*last8;
726 float shifty = new3*last2 + new4*last5 + new5*last8;
727 float radShift = shiftx*shiftx + shifty*shifty;
// Reject updates whose total shift leaves the allowed radius.
728 if(radShift > maxShift2){
729 result[idx9] = last0;
730 result[idx9+1] = last1;
731 result[idx9+2] = last2;
732 result[idx9+3] = last3;
733 result[idx9+4] = last4;
734 result[idx9+5] = last5;
735 maxGpu[idx] = NCC[idx*NCC_yxdim];
// Otherwise store the composed matrix (3x3 product, bottom row implicit).
737 result[idx9] = new0*last0 + new1*last3 + new2*last6;
738 result[idx9+2] = shiftx;
739 result[idx9+1] = new0*last1 + new1*last4 + new2*last7;
740 result[idx9+3] = new3*last0 + new4*last3 + new5*last6;
741 result[idx9+4] = new3*last1 + new4*last4 + new5*last7;
742 result[idx9+5] = shifty;
749 __global__
// Kernel: cartesian-to-polar resampling. Launch is 2D (x = angle,
// y = radius) with blockIdx.z selecting the image. Each thread samples the
// cartesian image at (radius, angle) around the image centre using
// bilinear interpolation and writes both the value and its square (for
// later variance computation). When `rotate` is set the polar index is
// reversed in both angle and radius.
// NOTE(review): declarations of polarIdx/polarOffset/imgOffset/n, the loop
// over n (images) and closing braces are in lines missing from this
// extract; the 1.0 literals in the weight computation are doubles
// (promoting the arithmetic) -- cannot change in a doc-only pass.
void cart2polar(
float *image,
float *polar,
float *polar2,
int maxRadius,
int maxAng,
750 int Nimgs,
int Ydim,
int Xdim,
bool rotate)
752 int angle = blockDim.x * blockIdx.x + threadIdx.x;
753 int radius = blockDim.y * blockIdx.y + threadIdx.y;
755 int numIm = blockIdx.z;
// Out-of-range threads exit early.
757 if (radius>=maxRadius || angle>=maxAng)
// Cartesian sample point for this (radius, angle), centred on the image.
760 float x = (float)(radius)*cosf((
float)angle*(
float)
PI/180.0
f) + (float)Xdim/2.0
f;
761 float y = (float)(radius)*sinf((
float)angle*(
float)
PI/180.0
f) + (float)Ydim/2.0
f;
// Bilinear interpolation weights from the fractional position.
763 float dx_low = floorf(x);
764 float dy_low = floorf(y);
765 int x_low = (int)dx_low;
766 int y_low = (int)dy_low;
767 float x_x_low=x-dx_low;
768 float y_y_low=y-dy_low;
769 float one_x=1.0-x_x_low;
770 float one_y=1.0-y_y_low;
771 float w00=one_y*one_x;
772 float w01=one_y*x_x_low;
773 float w10=y_y_low*one_x;
774 float w11=y_y_low*x_x_low;
// Indices of the four neighbouring cartesian pixels.
777 int NXYpolar=maxAng*maxRadius;
778 int imgIdx00=y_low * Xdim + x_low;
779 int imgIdx01=imgIdx00+1;
780 int imgIdx10=imgIdx00+Xdim;
781 int imgIdx11=imgIdx10+1;
// Destination index; reversed layout when rotate is requested.
786 polarIdx=angle+(radius*maxAng);
788 polarIdx = (maxAng-angle-1)+((maxRadius-radius-1)*maxAng);
793 polarOffset = n*NXYpolar;
794 float I00 = image[imgIdx00+imgOffset];
795 float I01 = image[imgIdx01+imgOffset];
796 float I10 = image[imgIdx10+imgOffset];
797 float I11 = image[imgIdx11+imgOffset];
798 float imVal = I00*w00 + I01*w01 + I10*w10 + I11*w11;
799 int finalPolarIndex=polarIdx+polarOffset;
800 polar[finalPolarIndex] = imVal;
801 polar2[finalPolarIndex] = imVal*imVal;
808 int yxdim,
int Ndim,
int yxdim2,
int Ndim2){
// Body of calculateMaxThreads (header and the d_in/d_out/position
// parameters precede this extract). Two phases: (1) each thread computes a
// strided per-thread maximum (and its index) over one image and writes it
// to d_out/position; (2) a shared-memory tree reduction (same value+position
// pairing as calculateMax2) collapses those per-thread results down to one
// maximum per image.
// NOTE(review): declarations of tmp/tmpPos/index, the d_out write of phase
// 1, the __syncthreads() barriers, the idx-range guards around each
// reduction step and the if(idx==0) guard before the final writes are in
// lines missing from this extract.
810 int idx = threadIdx.x;
811 int nIm = blockIdx.x;
812 int blockSize = blockDim.x;
814 int posTh = nIm*yxdim + idx;
// Number of strided passes needed to cover one image.
815 int n = ceilf(((
float)yxdim/(
float)blockSize));
// Phase 1: strided per-thread maximum and its flat index within the image.
825 for (
int i=1;
i<
n;
i++){
826 if (posTh+
i*blockSize < yxdim*(nIm+1)){
827 tmp = fmaxf(tmp, d_in[posTh+
i*blockSize]);
828 tmpPos = (tmp==d_in[posTh+
i*blockSize]) ? idx+
i*blockSize : tmpPos;
832 int posOut = nIm*blockSize + idx;
834 position[posOut] = tmpPos;
// Phase 2: shared-memory reduction over the per-thread results.
839 extern __shared__
float sdata[];
843 int i = blockIdx.x * blockSize + idx;
847 for(
int imN=0; imN<Ndim2; imN++){
849 if(i<yxdim2*gridDim.x){
850 sdata[idx]=d_out[i+
index];
851 sdata[idx+blockSize] = position[i+
index];
856 if(i>=yxdim2*gridDim.x)
// Tree reduction keeping the larger value and its stored position.
861 if(blockSize >= 1024){
863 sdata[idx] = fmaxf(sdata[idx], sdata[idx+512]);
864 sdata[idx+blockSize] = (sdata[idx]==sdata[idx+512]) ? sdata[idx+blockSize+512] : sdata[idx+blockSize];
868 if(blockSize >= 512){
870 sdata[idx] = fmaxf(sdata[idx], sdata[idx+256]);
871 sdata[idx+blockSize] = (sdata[idx]==sdata[idx+256]) ? sdata[idx+blockSize+256] : sdata[idx+blockSize];
875 if(blockSize >= 256){
877 sdata[idx] = fmaxf(sdata[idx], sdata[idx+128]);
878 sdata[idx+blockSize] = (sdata[idx]==sdata[idx+128]) ? sdata[idx+blockSize+128] : sdata[idx+blockSize];
882 if(blockSize >= 128){
884 sdata[idx] = fmaxf(sdata[idx], sdata[idx+64]);
885 sdata[idx+blockSize] = (sdata[idx]==sdata[idx+64]) ? sdata[idx+blockSize+64] : sdata[idx+blockSize];
// Final warp-width steps (offsets 32..1).
892 sdata[idx] = fmaxf(sdata[idx], sdata[idx+32]);
893 sdata[idx+blockSize] = (sdata[idx]==sdata[idx+32]) ? sdata[idx+blockSize+32] : sdata[idx+blockSize];
898 sdata[idx] = fmaxf(sdata[idx], sdata[idx+16]);
899 sdata[idx+blockSize] = (sdata[idx]==sdata[idx+16]) ? sdata[idx+blockSize+16] : sdata[idx+blockSize];
904 sdata[idx] = fmaxf(sdata[idx], sdata[idx+8]);
905 sdata[idx+blockSize] = (sdata[idx]==sdata[idx+8]) ? sdata[idx+blockSize+8] : sdata[idx+blockSize];
910 sdata[idx] = fmaxf(sdata[idx], sdata[idx+4]);
911 sdata[idx+blockSize] = (sdata[idx]==sdata[idx+4]) ? sdata[idx+blockSize+4] : sdata[idx+blockSize];
916 sdata[idx] = fmaxf(sdata[idx], sdata[idx+2]);
917 sdata[idx+blockSize] = (sdata[idx]==sdata[idx+2]) ? sdata[idx+blockSize+2] : sdata[idx+blockSize];
922 sdata[idx] = fmaxf(sdata[idx], sdata[idx+1]);
923 sdata[idx+blockSize] = (sdata[idx]==sdata[idx+1]) ? sdata[idx+blockSize+1] : sdata[idx+blockSize];
// Per-image result: block maximum and its position.
929 d_out[blockIdx.x+(gridDim.x*imN)] = sdata[0];
930 position[blockIdx.x+(gridDim.x*imN)] = sdata[blockSize];
// Advance to the next image's per-thread results.
935 index = index+(int)yxdim2;
947 int numBlk = d_orig_image.
// Host-side body of padding_masking (signature precedes this extract):
// computes the launch geometry (one y-block per image), zeroes the squared
// padded buffer asynchronously, detects whether Xdim is a power of two
// (power2 flag for the kernel's bitmask indexing), and launches
// maskingPaddingKernel on the caller's stream.
// NOTE(review): several lines are missing from this extract (the numBlk
// increment for the remainder, the memset of padded_image_gpu /
// padded_mask_gpu, the power2 assignment and part of the kernel argument
// list); all work is stream-ordered -- no synchronisation happens here.
yxdim/numTh;
948 if(d_orig_image.
yxdim%numTh > 0)
951 dim3 blockSize(numTh,1,1);
952 dim3 gridSize(numBlk, d_orig_image.
Ndim, 1);
954 cudaStream_t *stream = (cudaStream_t*) myStream.
ptr;
956 gpuErrchk(cudaMemsetAsync(padded_image2_gpu.
d_data, 0, padded_image2_gpu.
nzyxdim*
sizeof(
float), *stream));
957 if(padded_mask_gpu.
d_data!=NULL)
// Power-of-two test: non-zero when Xdim has more than one bit set.
961 if (d_orig_image.
Xdim & (d_orig_image.
Xdim-1))
965 maskingPaddingKernel<<< gridSize, blockSize, 0, *stream>>>(d_orig_image.
d_data, mask.
d_data,
968 padded_image_gpu.
Xdim, padded_image_gpu.
Ydim, padded_image_gpu.
yxdim, experimental, power2);
976 cudaStream_t *stream = (cudaStream_t*) myStream.
// Host-side bodies of the three pointwiseMultiplicationFourier wrapper
// overloads (signatures precede each fragment in the full source). Each
// resolves the caller's stream, derives a launch geometry, tests whether
// yxdim is a power of two (power2 flag for bitmask indexing in the
// kernels), and launches the matching kernel variant. The trailing
// `myStructureAux.MF2` lines are argument fragments of calls made by the
// surrounding correlation routines (orig. lines 1050 and 1077).
// NOTE(review): the function headers, power2 assignments and several
// argument lines are missing from this extract.
ptr;
978 XmippDim3 blockSize(numTh, 1, 1), gridSize;
979 manyF.calculateGridSizeVectorized(blockSize, gridSize);
982 if (manyF.yxdim & (manyF.yxdim-1))
986 pointwiseMultiplicationComplexOneManyKernel <<< CONVERT2DIM3(gridSize), CONVERT2DIM3(blockSize), 0, *stream >>>
987 ((cufftComplex*)M.d_data, (cufftComplex*)manyF.d_data, (cufftComplex*) MmanyF.d_data, manyF.nzyxdim, manyF.yxdim, power2);
998 cudaStream_t *stream = (cudaStream_t*) myStream.
ptr;
1000 int numBlk = manyF.Ndim;
1003 if (manyF.yxdim & (manyF.yxdim-1))
1008 pointwiseMultiplicationComplexOneManyKernel_two <<< numBlk, numTh, 0, *stream >>>
1009 ((cufftComplex*)M.d_data, (cufftComplex*)manyF.d_data, (cufftComplex*) MmanyF.d_data,
1010 (cufftComplex*)manyF_sq.d_data, (cufftComplex*) MmanyF_sq.d_data,
1011 manyF.nzyxdim, manyF.yxdim, manyF.Ndim, power2);
1023 cudaStream_t *stream = (cudaStream_t*) myStream.
ptr;
1025 int numBlk = manyF.Ndim;
1028 if (manyF.yxdim & (manyF.yxdim-1))
1033 pointwiseMultiplicationComplexOneManyKernel_three <<< numBlk, numTh, 0, *stream >>>
1034 ((cufftComplex*)M.d_data, (cufftComplex*)manyF.d_data, (cufftComplex*) MmanyF.d_data,
1035 (cufftComplex*)manyF_sq.d_data, (cufftComplex*) MmanyF_sq.d_data,
1036 (cufftComplex*) maskAux.d_data,
1037 manyF.nzyxdim, manyF.yxdim, manyF.Ndim, power2);
1050 myStructureAux.
MF2, myStream, maskAux);
1077 myStructureAux.
MF2, myStream);
1114 cudaStream_t *stream = (cudaStream_t*) myStream.
// Host-side body of calculateMaxNew2DNew (signature precedes this
// extract): resizes the output/position buffers to one slot per thread per
// block and launches the two-phase max reduction with 2*numTh floats of
// dynamic shared memory (value + position pairs).
// NOTE(review): the numTh/numBlk computation (orig. 1115-1118) is missing
// from this extract.
ptr;
1119 d_out.
resize(numTh * numBlk);
1120 d_pos.
resize(numTh * numBlk);
1122 calculateMaxThreads<<<numBlk, numTh, 2*numTh * sizeof(float), *stream>>>(d_data, d_out.
d_data, d_pos.
d_data, yxdim, Ndim, numTh, 1);
1128 float *max_vector,
// Host-side fragments of cuda_calculate_correlation_rotation (orig.
// ~1128-1191) and cuda_calculate_correlation (orig. ~1206-1275). Both
// follow the same shape: multiply the reference and experimental spectra
// (pointwiseMultiplicationComplexKernel), inverse-transform (not visible
// here), evaluate the NCC kernel, then find the argmax and build the
// per-image transform matrix; maxShift2 is the squared allowed shift
// radius used by the build*Matrix kernels.
// NOTE(review): most of each routine (headers, ifft calls, grid-size
// computations, kernel argument lists, fixPadding selection and the
// build*Matrix launches) is in lines missing from this extract -- this
// listing cannot be treated as complete control flow.
int maxShift,
mycufftHandle &myhandlePadded,
bool mirror,
1133 cudaStream_t *stream = (cudaStream_t*) myStream.
ptr;
1139 XmippDim3 blockSize(numTh, 1, 1), gridSize;
1142 pointwiseMultiplicationComplexKernel<<< CONVERT2DIM3(gridSize), CONVERT2DIM3(blockSize), 0, *stream >>>
1152 XmippDim3 blockSize2(numTh, 1, 1), gridSize2;
1159 calculateNccRotationKernel<<< CONVERT2DIM3(gridSize2), CONVERT2DIM3(blockSize2), 0, *stream >>>
1182 numBlk = transMat.
Ndim/numTh;
1183 if(transMat.
Ndim%numTh > 0)
// Squared maximum shift radius passed to buildRotationMatrix.
1191 double maxShift2 = (2*maxShift)*(2*maxShift);
1206 float *max_vector,
int maxShift,
mycufftHandle &myhandlePadded,
bool mirror,
1211 cudaStream_t *stream = (cudaStream_t*) myStream.
ptr;
1217 XmippDim3 blockSize(numTh, 1, 1), gridSize;
1220 pointwiseMultiplicationComplexKernel<<< CONVERT2DIM3(gridSize), CONVERT2DIM3(blockSize), 0, *stream >>>
1232 XmippDim3 blockSize2(numTh, 1, 1), gridSize2;
1238 bool power2yx, power2x;
1247 calculateNccKernel<<< CONVERT2DIM3(gridSize2), CONVERT2DIM3(blockSize2), 0, *stream >>>
// fixPadding selection from the parity of the original vs padded Xdim.
1253 if(referenceAux.
XdimOrig%2==0 && referenceAux.
Xdim%2==0)
1255 if(referenceAux.
XdimOrig%2==0 && referenceAux.
Xdim%2!=0)
1257 if(referenceAux.
XdimOrig%2!=0 && referenceAux.
Xdim%2==0)
1259 if(referenceAux.
XdimOrig%2!=0 && referenceAux.
Xdim%2!=0)
1266 int numBlk = transMat.
Ndim/numTh;
1267 if(transMat.
Ndim%numTh > 0)
// Squared maximum shift radius passed to buildTranslationMatrix.
1275 double maxShift2 = (2*maxShift)*(2*maxShift);
1299 cudaStream_t *streamTR = (cudaStream_t*) myStreamTR.
// Host-side fragment of cuda_calculate_correlation_two (orig.
// ~1299-1439): runs the translation (TR) and rotation (RT) pipelines on
// two independent streams -- pointwise spectrum multiplications, the NCC
// kernels, fixPadding selection, the build*Matrix launches, and finally an
// async device-to-host copy of the RT maxima.
// NOTE(review): the routine header, ifft calls, most kernel argument
// lists, the buildTranslationMatrix/buildRotationMatrix launch lines and
// any stream synchronisation are in lines missing from this extract; the
// copy at orig. 1439 is asynchronous, so the caller must synchronise
// streamRT before reading max_vectorRT.
ptr;
1300 cudaStream_t *streamRT = (cudaStream_t*) myStreamRT.
ptr;
1324 XmippDim3 blockSize(numTh, 1, 1), gridSize;
1328 pointwiseMultiplicationComplexKernel<<< CONVERT2DIM3(gridSize), CONVERT2DIM3(blockSize), 0, *streamTR >>>
1333 XmippDim3 blockSize3(numTh, 1, 1), gridSize3;
1337 pointwiseMultiplicationComplexKernel<<< CONVERT2DIM3(gridSize3), CONVERT2DIM3(blockSize3), 0, *streamRT >>>
1348 XmippDim3 blockSize2(numTh, 1, 1), gridSize2;
1351 bool power2yx, power2x;
1360 calculateNccKernel<<< CONVERT2DIM3(gridSize2), CONVERT2DIM3(blockSize2), 0, *streamTR >>>
// fixPadding selection from the parity of the original vs padded Xdim.
1367 if(referenceAux.
XdimOrig%2==0 && referenceAux.
Xdim%2==0)
1369 if(referenceAux.
XdimOrig%2==0 && referenceAux.
Xdim%2!=0)
1371 if(referenceAux.
XdimOrig%2!=0 && referenceAux.
Xdim%2==0)
1373 if(referenceAux.
XdimOrig%2!=0 && referenceAux.
Xdim%2!=0)
1377 XmippDim3 blockSize4(numTh, 1, 1), gridSize4;
1381 calculateNccRotationKernel<<< CONVERT2DIM3(gridSize4), CONVERT2DIM3(blockSize4), 0, *streamRT >>>
1404 numBlk = transMatTR.
Ndim/numTh;
1405 if(transMatTR.
Ndim%numTh > 0)
// Squared maximum shift radius for the TR matrix build.
1413 double maxShift2 = (2*maxShift)*(2*maxShift);
1416 myStructureAuxTR.
d_NCC.
Ndim, myStructureAuxTR.
d_NCC.
yxdim, fixPadding, maxShift2, _power2x);
1418 numBlk = transMatRT.
Ndim/numTh;
1419 if(transMatRT.
Ndim%numTh > 0)
1439 gpuErrchk(cudaMemcpyAsync(max_vectorRT, myStructureAuxRT.
maxGpu.
d_data, myStructureAuxRT.
maxGpu.
Ndim*
sizeof(
float), cudaMemcpyDeviceToHost, *streamRT));
1450 cudaStream_t *stream = (cudaStream_t*) myStream.
// Host-side fragments of apply_transform (orig. ~1450-1472),
// cuda_cart2polar (orig. ~1481-1493), waitGpu (orig. ~1499-1500) and
// calculateAbs (orig. ~1507-1512). Each resolves the caller's stream,
// derives a launch geometry and launches the matching kernel;
// apply_transform also tests power-of-two dimensions for the kernel's
// bitmask indexing and reserves 9 floats of dynamic shared memory for the
// per-image transform matrix. waitGpu blocks on the stream.
// NOTE(review): the function headers, several numBlk increments, the
// power2 assignments and most kernel argument lists are in lines missing
// from this extract; calculateAbs's numBlk has no visible remainder
// handling -- confirm size%numTh is handled in the full source.
ptr;
1454 int numBlk = d_transform_image.
yxdim/numTh;
1455 if(d_transform_image.
yxdim%numTh > 0)
1457 dim3 blockSize(numTh, 1, 1);
1458 dim3 gridSize(numBlk, d_transform_image.
Ndim, 1);
1460 bool power2yx, power2x;
1461 if (d_original_image.
yxdim & (d_original_image.
yxdim-1))
1465 if (d_original_image.
Xdim & (d_original_image.
Xdim-1))
1469 applyTransformKernel<<< gridSize, blockSize, 9*sizeof(float), *stream >>>
1472 d_original_image.
Ydim, power2yx, power2x);
1481 cudaStream_t *stream = (cudaStream_t*) myStream.
ptr;
// 2D launch: x covers angles, y covers radii, z covers images.
1483 int numBlkx = polar_image.
Xdim/numTh;
1484 if(polar_image.
Xdim%numTh > 0)
1486 int numBlky = polar_image.
Ydim/numTh;
1487 if(polar_image.
Ydim%numTh > 0)
1490 dim3 blockSize(numTh, numTh, 1);
1491 dim3 gridSize(numBlkx, numBlky, polar_image.
Ndim);
1493 cart2polar <<< gridSize, blockSize, 0, *stream>>>
1499 cudaStream_t *stream = (cudaStream_t*) myStream.
ptr;
// Blocks the host until all work queued on this stream completes.
1500 gpuErrchk(cudaStreamSynchronize(*stream));
1507 cudaStream_t *stream = (cudaStream_t*) myStream.
ptr;
1509 int numBlk = size/numTh;
1512 calcAbsKernel <<< numBlk, numTh, 0, *stream>>> ((cufftComplex*)data, out, size);
void cuda_calculate_correlation(GpuCorrelationAux &referenceAux, GpuCorrelationAux &experimentalAux, TransformMatrix< float > &transMat, float *max_vector, int maxShift, mycufftHandle &myhandlePadded, bool mirror, StructuresAux &myStructureAux, myStreamHandle &myStream, TransformMatrix< float > &resultTR, bool saveMaxVector)
void pointwiseMultiplicationFourier(const GpuMultidimArrayAtGpu< std::complex< float > > &M, const GpuMultidimArrayAtGpu< std::complex< float > > &manyF, GpuMultidimArrayAtGpu< std::complex< float > > &MmanyF, myStreamHandle &myStream)
void resize(const GpuMultidimArrayAtGpu< T1 > &array)
__global__ void calculateNccKernel(float *RefExpRealSpace, float *MFrealSpaceRef, float *MFrealSpaceExp, float *MF2realSpaceRef, float *MF2realSpaceExp, float *mask, float *NCC, int nzyxdim, int yxdim, int xdim, int ydim, int maskCount, int max_shift, bool power2yx, bool power2x)
__global__ void buildRotationMatrix(float *d_pos, float *lastMat, float *result, float *maxGpu, float *auxMax, float *NCC, int Xdim, int Ndim, int NCC_yxdim, int fixPadding, double maxShift2, bool power2x)
void padding_masking(GpuMultidimArrayAtGpu< float > &d_orig_image, GpuMultidimArrayAtGpu< float > &mask, GpuMultidimArrayAtGpu< float > &padded_image_gpu, GpuMultidimArrayAtGpu< float > &padded_image2_gpu, GpuMultidimArrayAtGpu< float > &padded_mask_gpu, bool experimental, myStreamHandle &myStream)
GpuMultidimArrayAtGpu< float > RefExpRealSpacePolar
__global__ void sumRadiusKernel(float *d_in, float *d_out, float *d_out_max, float *d_out_zero, int dim, int radius, int ndim)
__global__ void pointwiseMultiplicationComplexKernel(cufftComplex *reference, cufftComplex *experimental, cufftComplex *RefExpFourier, int nzyxdim, int yxdim)
void apply_transform(GpuMultidimArrayAtGpu< float > &d_original_image, GpuMultidimArrayAtGpu< float > &d_transform_image, TransformMatrix< float > &transMat, myStreamHandle &myStream)
GpuMultidimArrayAtGpu< std::complex< float > > d_projPolarSquaredFFT
void sqrt(Image< double > &op)
__device__ void wrapping(int &x, int &y, int xdim, int ydim)
GpuMultidimArrayAtGpu< float > d_pos_polar_max
GpuMultidimArrayAtGpu< float > auxMax
__global__ void applyTransformKernel(float *d_in, float *d_out, float *transMat, int nzyxdim, int yxdim, int xdim, int ydim, bool power2yx, bool power2x)
GpuMultidimArrayAtGpu< float > RefExpRealSpace
void calculateMaxNew2DNew(int yxdim, int Ndim, float *d_data, GpuMultidimArrayAtGpu< float > &d_out, GpuMultidimArrayAtGpu< float > &d_pos, myStreamHandle &myStream)
__global__ void calcAbsKernel(cufftComplex *d_in, float *d_out, int dim)
#define rotate(a, i, j, k, l)
__global__ void pointwiseMultiplicationComplexOneManyKernel_two(cufftComplex *M, cufftComplex *manyF, cufftComplex *MmanyF, cufftComplex *manyF_sq, cufftComplex *MmanyF_sq, int nzyxdim, int yxdim, int ndim, bool power2)
GpuMultidimArrayAtGpu< std::complex< float > > RefExpFourier
GpuMultidimArrayAtGpu< float > maskAutocorrelation
GpuMultidimArrayAtGpu< std::complex< float > > d_projFFT
void ifftStream(GpuMultidimArrayAtGpu< T1 > &realSpace, mycufftHandle &myhandle, myStreamHandle &myStream, bool useCallback, GpuMultidimArrayAtGpu< std::complex< float > > &dataExp)
GpuMultidimArrayAtGpu< float > d_transform_image
GpuMultidimArrayAtGpu< std::complex< float > > RefExpFourierPolar
void cuda_calculate_correlation_two(GpuCorrelationAux &referenceAux, GpuCorrelationAux &experimentalAuxTR, TransformMatrix< float > &transMatTR, float *max_vectorTR, int maxShift, mycufftHandle &myhandlePaddedTR, bool mirror, StructuresAux &myStructureAuxTR, myStreamHandle &myStreamTR, GpuCorrelationAux &experimentalAuxRT, TransformMatrix< float > &transMatRT, float *max_vectorRT, mycufftHandle &myhandlePaddedRT, StructuresAux &myStructureAuxRT, myStreamHandle &myStreamRT, TransformMatrix< float > &resultTR, TransformMatrix< float > &resultRT, mycufftHandle &ifftcb, bool saveMaxVector)
if(fabs(c[*nmax+ *nmax *c_dim1])==0.e0)
GpuMultidimArrayAtGpu< float > d_out_max
__global__ void calculateDenomFunctionKernel(float *MFrealSpace, float *MF2realSpace, float *maskAutocorrelation, float *out, int nzyxdim, int yxdim, bool power2)
GpuMultidimArrayAtGpu< std::complex< float > > d_projPolarFFT
GpuMultidimArrayAtGpu< float > d_original_image
GpuMultidimArrayAtGpu< float > d_NCCPolar
void calculateGridSizeVectorized(const XmippDim3 &blockSize, XmippDim3 &gridSize) const
GpuMultidimArrayAtGpu< float > d_NCCPolar1D
GpuMultidimArrayAtGpu< std::complex< float > > MF2
void cuda_cart2polar(GpuMultidimArrayAtGpu< float > &image, GpuMultidimArrayAtGpu< float > &polar_image, GpuMultidimArrayAtGpu< float > &polar2_image, bool rotate, myStreamHandle &myStream)
void pointwiseMultiplicationFourier_two(const GpuMultidimArrayAtGpu< std::complex< float > > &M, const GpuMultidimArrayAtGpu< std::complex< float > > &manyF, GpuMultidimArrayAtGpu< std::complex< float > > &MmanyF, const GpuMultidimArrayAtGpu< std::complex< float > > &manyF_sq, GpuMultidimArrayAtGpu< std::complex< float > > &MmanyF_sq, myStreamHandle &myStream)
__global__ void cart2polar(float *image, float *polar, float *polar2, int maxRadius, int maxAng, int Nimgs, int Ydim, int Xdim, bool rotate)
GpuMultidimArrayAtGpu< std::complex< float > > d_projSquaredFFT
void produceSideInfo(mycufftHandle &myhandlePadded, mycufftHandle &myhandleMask, StructuresAux &myStructureAux, myStreamHandle &myStream)
GpuMultidimArrayAtGpu< float > d_pos_max
GpuMultidimArrayAtGpu< float > MF2realSpace
GpuMultidimArrayAtGpu< float > maxGpu
GpuMultidimArrayAtGpu< float > MFrealSpace
__global__ void calculateNccRotationKernel(float *RefExpRealSpace, cufftComplex *polarFFTRef, cufftComplex *polarFFTExp, cufftComplex *polarSquaredFFTRef, cufftComplex *polarSquaredFFTExp, float maskFFTPolarReal, float *NCC, int yxdimFFT, int nzyxdim, int yxdim)
__global__ void calculateMaxThreads(float *d_in, float *d_out, float *position, int yxdim, int Ndim, int yxdim2, int Ndim2)
void pointwiseMultiplicationFourier_three(const GpuMultidimArrayAtGpu< std::complex< float > > &M, const GpuMultidimArrayAtGpu< std::complex< float > > &manyF, GpuMultidimArrayAtGpu< std::complex< float > > &MmanyF, const GpuMultidimArrayAtGpu< std::complex< float > > &manyF_sq, GpuMultidimArrayAtGpu< std::complex< float > > &MmanyF_sq, myStreamHandle &myStream, GpuMultidimArrayAtGpu< std::complex< float > > &maskAux)
void calculateAbs(std::complex< float > *data, float *out, int size, myStreamHandle &myStream)
GpuMultidimArrayAtGpu< float > d_NCC
GpuMultidimArrayAtGpu< float > auxZero
GpuMultidimArrayAtGpu< float > d_out_polar_max
void cuda_calculate_correlation_rotation(GpuCorrelationAux &referenceAux, GpuCorrelationAux &experimentalAux, TransformMatrix< float > &transMat, float *max_vector, int maxShift, mycufftHandle &myhandlePadded, bool mirror, StructuresAux &myStructureAux, myStreamHandle &myStream, TransformMatrix< float > &resultRT)
__global__ void calculateMax2(float *d_in, float *d_out, float *position, int yxdim, int Ndim, bool firstCall)
GpuMultidimArrayAtGpu< std::complex< float > > MF
void copyGpuToGpuStream(GpuMultidimArrayAtGpu< T > &gpuArray, myStreamHandle &myStream)
GpuMultidimArrayAtGpu< std::complex< float > > d_maskFFT
__global__ void buildTranslationMatrix(float *d_pos, float *lastMat, float *result, float *maxGpu, float *NCC, int Xdim, int Ydim, int Ndim, int NCC_yxdim, int fixPadding, double maxShift2, bool power2x)
__global__ void pointwiseMultiplicationComplexOneManyKernel_three(cufftComplex *M, cufftComplex *manyF, cufftComplex *MmanyF, cufftComplex *manyF_sq, cufftComplex *MmanyF_sq, cufftComplex *maskAux, int nzyxdim, int yxdim, int ndim, bool power2)
void waitGpu(myStreamHandle &myStream, bool allStreams)
__global__ void maskingPaddingKernel(float *d_in, float *mask, float *padded_image_gpu, float *padded_image2_gpu, float *padded_mask_gpu, int xdim, int ydim, int yxdim, int numImag, int pad_xdim, int pad_ydim, int pad_yxdim, bool experimental, bool power2x)
__global__ void pointwiseMultiplicationComplexOneManyKernel(cufftComplex *M, cufftComplex *manyF, cufftComplex *MmanyF, int nzyxdim, int yxdim, bool power2)