Xmipp  v3.23.11-Nereus
Macros | Functions
cuda_gpu_correlation.cpp File Reference
#include "cuda_gpu_correlation.h"
#include <iostream>
#include <stdio.h>
#include <math.h>
#include <algorithm>
#include <cuda_runtime.h>
#include <cufft.h>
#include "cuda_asserts.h"
#include "cuda_basic_math.h"
#include <time.h>
#include <sys/time.h>
#include <vector>
Include dependency graph for cuda_gpu_correlation.cpp:

Go to the source code of this file.

Macros

#define PI   3.14159265359
 

Functions

__global__ void calcAbsKernel (cufftComplex *d_in, float *d_out, int dim)
 
__global__ void sumRadiusKernel (float *d_in, float *d_out, float *d_out_max, float *d_out_zero, int dim, int radius, int ndim)
 
__global__ void calculateMax2 (float *d_in, float *d_out, float *position, int yxdim, int Ndim, bool firstCall)
 
__global__ void pointwiseMultiplicationComplexOneManyKernel_three (cufftComplex *M, cufftComplex *manyF, cufftComplex *MmanyF, cufftComplex *manyF_sq, cufftComplex *MmanyF_sq, cufftComplex *maskAux, int nzyxdim, int yxdim, int ndim, bool power2)
 
__global__ void pointwiseMultiplicationComplexOneManyKernel_two (cufftComplex *M, cufftComplex *manyF, cufftComplex *MmanyF, cufftComplex *manyF_sq, cufftComplex *MmanyF_sq, int nzyxdim, int yxdim, int ndim, bool power2)
 
__global__ void pointwiseMultiplicationComplexOneManyKernel (cufftComplex *M, cufftComplex *manyF, cufftComplex *MmanyF, int nzyxdim, int yxdim, bool power2)
 
__global__ void calculateDenomFunctionKernel (float *MFrealSpace, float *MF2realSpace, float *maskAutocorrelation, float *out, int nzyxdim, int yxdim, bool power2)
 
__global__ void calculateNccKernel (float *RefExpRealSpace, float *MFrealSpaceRef, float *MFrealSpaceExp, float *MF2realSpaceRef, float *MF2realSpaceExp, float *mask, float *NCC, int nzyxdim, int yxdim, int xdim, int ydim, int maskCount, int max_shift, bool power2yx, bool power2x)
 
__device__ void wrapping (int &x, int &y, int xdim, int ydim)
 
__global__ void applyTransformKernel (float *d_in, float *d_out, float *transMat, int nzyxdim, int yxdim, int xdim, int ydim, bool power2yx, bool power2x)
 
__global__ void calculateNccRotationKernel (float *RefExpRealSpace, cufftComplex *polarFFTRef, cufftComplex *polarFFTExp, cufftComplex *polarSquaredFFTRef, cufftComplex *polarSquaredFFTExp, float maskFFTPolarReal, float *NCC, int yxdimFFT, int nzyxdim, int yxdim)
 
__global__ void pointwiseMultiplicationComplexKernel (cufftComplex *reference, cufftComplex *experimental, cufftComplex *RefExpFourier, int nzyxdim, int yxdim)
 
__global__ void maskingPaddingKernel (float *d_in, float *mask, float *padded_image_gpu, float *padded_image2_gpu, float *padded_mask_gpu, int xdim, int ydim, int yxdim, int numImag, int pad_xdim, int pad_ydim, int pad_yxdim, bool experimental, bool power2x)
 
__global__ void buildTranslationMatrix (float *d_pos, float *lastMat, float *result, float *maxGpu, float *NCC, int Xdim, int Ydim, int Ndim, int NCC_yxdim, int fixPadding, double maxShift2, bool power2x)
 
__global__ void buildRotationMatrix (float *d_pos, float *lastMat, float *result, float *maxGpu, float *auxMax, float *NCC, int Xdim, int Ndim, int NCC_yxdim, int fixPadding, double maxShift2, bool power2x)
 
__global__ void cart2polar (float *image, float *polar, float *polar2, int maxRadius, int maxAng, int Nimgs, int Ydim, int Xdim, bool rotate)
 
__global__ void calculateMaxThreads (float *d_in, float *d_out, float *position, int yxdim, int Ndim, int yxdim2, int Ndim2)
 
void padding_masking (GpuMultidimArrayAtGpu< float > &d_orig_image, GpuMultidimArrayAtGpu< float > &mask, GpuMultidimArrayAtGpu< float > &padded_image_gpu, GpuMultidimArrayAtGpu< float > &padded_image2_gpu, GpuMultidimArrayAtGpu< float > &padded_mask_gpu, bool experimental, myStreamHandle &myStream)
 
void pointwiseMultiplicationFourier (const GpuMultidimArrayAtGpu< std::complex< float > > &M, const GpuMultidimArrayAtGpu< std::complex< float > > &manyF, GpuMultidimArrayAtGpu< std::complex< float > > &MmanyF, myStreamHandle &myStream)
 
void pointwiseMultiplicationFourier_two (const GpuMultidimArrayAtGpu< std::complex< float > > &M, const GpuMultidimArrayAtGpu< std::complex< float > > &manyF, GpuMultidimArrayAtGpu< std::complex< float > > &MmanyF, const GpuMultidimArrayAtGpu< std::complex< float > > &manyF_sq, GpuMultidimArrayAtGpu< std::complex< float > > &MmanyF_sq, myStreamHandle &myStream)
 
void pointwiseMultiplicationFourier_three (const GpuMultidimArrayAtGpu< std::complex< float > > &M, const GpuMultidimArrayAtGpu< std::complex< float > > &manyF, GpuMultidimArrayAtGpu< std::complex< float > > &MmanyF, const GpuMultidimArrayAtGpu< std::complex< float > > &manyF_sq, GpuMultidimArrayAtGpu< std::complex< float > > &MmanyF_sq, myStreamHandle &myStream, GpuMultidimArrayAtGpu< std::complex< float > > &maskAux)
 
void calculateMaxNew2DNew (int yxdim, int Ndim, float *d_data, GpuMultidimArrayAtGpu< float > &d_out, GpuMultidimArrayAtGpu< float > &d_pos, myStreamHandle &myStream)
 
void cuda_calculate_correlation_rotation (GpuCorrelationAux &referenceAux, GpuCorrelationAux &experimentalAux, TransformMatrix< float > &transMat, float *max_vector, int maxShift, mycufftHandle &myhandlePadded, bool mirror, StructuresAux &myStructureAux, myStreamHandle &myStream, TransformMatrix< float > &resultRT)
 
void cuda_calculate_correlation (GpuCorrelationAux &referenceAux, GpuCorrelationAux &experimentalAux, TransformMatrix< float > &transMat, float *max_vector, int maxShift, mycufftHandle &myhandlePadded, bool mirror, StructuresAux &myStructureAux, myStreamHandle &myStream, TransformMatrix< float > &resultTR, bool saveMaxVector)
 
void cuda_calculate_correlation_two (GpuCorrelationAux &referenceAux, GpuCorrelationAux &experimentalAuxTR, TransformMatrix< float > &transMatTR, float *max_vectorTR, int maxShift, mycufftHandle &myhandlePaddedTR, bool mirror, StructuresAux &myStructureAuxTR, myStreamHandle &myStreamTR, GpuCorrelationAux &experimentalAuxRT, TransformMatrix< float > &transMatRT, float *max_vectorRT, mycufftHandle &myhandlePaddedRT, StructuresAux &myStructureAuxRT, myStreamHandle &myStreamRT, TransformMatrix< float > &resultTR, TransformMatrix< float > &resultRT, mycufftHandle &ifftcb, bool saveMaxVector)
 
void apply_transform (GpuMultidimArrayAtGpu< float > &d_original_image, GpuMultidimArrayAtGpu< float > &d_transform_image, TransformMatrix< float > &transMat, myStreamHandle &myStream)
 
void cuda_cart2polar (GpuMultidimArrayAtGpu< float > &image, GpuMultidimArrayAtGpu< float > &polar_image, GpuMultidimArrayAtGpu< float > &polar2_image, bool rotate, myStreamHandle &myStream)
 
void waitGpu (myStreamHandle &myStream, bool allStreams)
 
void calculateAbs (std::complex< float > *data, float *out, int size, myStreamHandle &myStream)
 

Macro Definition Documentation

◆ PI

#define PI   3.14159265359

Definition at line 19 of file cuda_gpu_correlation.cpp.

Function Documentation

◆ applyTransformKernel()

__global__ void applyTransformKernel ( float *  d_in,
float *  d_out,
float *  transMat,
int  nzyxdim,
int  yxdim,
int  xdim,
int  ydim,
bool  power2yx,
bool  power2x 
)

Definition at line 386 of file cuda_gpu_correlation.cpp.

388 {
389 
390  extern __shared__ float transMatSD[];
391 
392  int idxIm = blockDim.x * blockIdx.x + threadIdx.x;
393  int idxTh = threadIdx.x;
394  int numIm = blockIdx.y;
395 
396 
397  if(idxIm>=yxdim)
398  return;
399 
400  if(idxTh<9)
401  transMatSD[idxTh] = transMat[idxTh+(numIm*9)];
402  __syncthreads();
403 
404  float x;
405  if (power2x==true)
406  x = idxIm & (xdim-1);
407  else
408  x = idxIm%xdim;
409 
410  float y = idxIm/xdim;
411  float x_orig = 0;
412  float y_orig = 0;
413 
414  //transMat into shared memory
415  x -= transMatSD[2]; //transMat[2+(numIm*9)];
416  y -= transMatSD[5]; //transMat[5+(numIm*9)];
417 
418  x = x - xdim/2.0f;
419  y = y - ydim/2.0f;
420 
421  x_orig += transMatSD[0]*x - transMatSD[1]*y + xdim/2.0f;
422  y_orig += -transMatSD[3]*x + transMatSD[4]*y + xdim/2.0f;
423  // x_orig += transMat[(numIm*9)]*x - transMat[1+(numIm*9)]*y + xdim/2.0f;
424  //y_orig += -transMat[3+(numIm*9)]*x + transMat[4+(numIm*9)]*y + ydim/2.0f;
425 
426  int x_orig00 = (int)floorf(x_orig);
427  int y_orig00 = (int)floorf(y_orig);
428  int x_orig01 = x_orig00+1;
429  int y_orig01 = y_orig00;
430  int x_orig10 = x_orig00;
431  int y_orig10 = y_orig00+1;
432  int x_orig11 = x_orig00+1;
433  int y_orig11 = y_orig00+1;
434 
435  float x_x_low=x_orig-x_orig00;
436  float y_y_low=y_orig-y_orig00;
437  float one_x=1.0-x_x_low;
438  float one_y=1.0-y_y_low;
439  float w00=one_y*one_x;
440  float w01=one_y*x_x_low;
441  float w10=y_y_low*one_x;
442  float w11=y_y_low*x_x_low;
443 
444  wrapping (x_orig00, y_orig00, xdim, ydim);
445  wrapping (x_orig01, y_orig01, xdim, ydim);
446  wrapping (x_orig10, y_orig10, xdim, ydim);
447  wrapping (x_orig11, y_orig11, xdim, ydim);
448 
449  int imgIdx00=y_orig00 * xdim + x_orig00;
450  int imgIdx01=y_orig01 * xdim + x_orig01;
451  int imgIdx10=y_orig10 * xdim + x_orig10;
452  int imgIdx11=y_orig11 * xdim + x_orig11;
453 
454  int imgOffset = numIm*yxdim;
455  float I00 = d_in[imgIdx00+imgOffset];
456  float I01 = d_in[imgIdx01+imgOffset];
457  float I10 = d_in[imgIdx10+imgOffset];
458  float I11 = d_in[imgIdx11+imgOffset];
459  float imVal = I00*w00 + I01*w01 + I10*w10 + I11*w11;
460 
461  d_out[idxIm+(numIm*yxdim)] = imVal;
462 
463 }
static double * y
__device__ void wrapping(int &x, int &y, int xdim, int ydim)
doublereal * x

◆ buildRotationMatrix()

__global__ void buildRotationMatrix ( float *  d_pos,
float *  lastMat,
float *  result,
float *  maxGpu,
float *  auxMax,
float *  NCC,
int  Xdim,
int  Ndim,
int  NCC_yxdim,
int  fixPadding,
double  maxShift2,
bool  power2x 
)

Definition at line 670 of file cuda_gpu_correlation.cpp.

672  {
673 
674  int idx = blockDim.x * blockIdx.x + threadIdx.x;
675 
676  if(idx>=Ndim)
677  return;
678 
679  float posX;
680  int position = (int)d_pos[idx];
681  maxGpu[idx] = auxMax[position+(idx*360)];
682 
683  float posX_aux;
684  if (power2x==true)
685  posX_aux = position & (Xdim-1);
686  else
687  posX_aux = position%Xdim;
688 
689  float Xdim2 = (Xdim/2.0f);
690 
691 
692  if(posX_aux<Xdim2){
693  posX = -(posX_aux+1);
694  }else if(posX_aux>=Xdim2){
695  posX = Xdim-1-posX_aux;
696  }
697 
698  //Fixing padding problem
699  posX+=fixPadding;
700 
701  float rad = (float)(-posX*PI/180);
702 
703  int idx9 = idx*9;
704 
705  float new0 = cosf(rad);
706  float new1 = -sinf(rad);
707  float new2 = 0.0f;
708  float new3 = sinf(rad);
709  float new4 = cosf(rad);
710  float new5 = 0.0f;
711  //float new6 = 0.0f;
712  //float new7 = 0.0f;
713  //float new8 = 1.0f;
714 
715  float last0 = lastMat[idx9];
716  float last1 = lastMat[idx9+1];
717  float last2 = lastMat[idx9+2];
718  float last3 = lastMat[idx9+3];
719  float last4 = lastMat[idx9+4];
720  float last5 = lastMat[idx9+5];
721  float last6 = lastMat[idx9+6];
722  float last7 = lastMat[idx9+7];
723  float last8 = lastMat[idx9+8];
724 
725  float shiftx = new0*last2 + new1*last5 + new2*last8;
726  float shifty = new3*last2 + new4*last5 + new5*last8;
727  float radShift = shiftx*shiftx + shifty*shifty;
728  if(radShift > maxShift2){
729  result[idx9] = last0;
730  result[idx9+1] = last1;
731  result[idx9+2] = last2;
732  result[idx9+3] = last3;
733  result[idx9+4] = last4;
734  result[idx9+5] = last5;
735  maxGpu[idx] = NCC[idx*NCC_yxdim];
736  }else{
737  result[idx9] = new0*last0 + new1*last3 + new2*last6;
738  result[idx9+2] = shiftx;
739  result[idx9+1] = new0*last1 + new1*last4 + new2*last7;
740  result[idx9+3] = new3*last0 + new4*last3 + new5*last6;
741  result[idx9+4] = new3*last1 + new4*last4 + new5*last7;
742  result[idx9+5] = shifty;
743  }
744 
745 }
#define PI

◆ buildTranslationMatrix()

__global__ void buildTranslationMatrix ( float *  d_pos,
float *  lastMat,
float *  result,
float *  maxGpu,
float *  NCC,
int  Xdim,
int  Ydim,
int  Ndim,
int  NCC_yxdim,
int  fixPadding,
double  maxShift2,
bool  power2x 
)

Definition at line 584 of file cuda_gpu_correlation.cpp.

586  {
587 
588  int idx = blockDim.x * blockIdx.x + threadIdx.x;
589 
590  if(idx>=Ndim)
591  return;
592 
593  int position = (int)d_pos[idx];
594 
595  float posX_aux;
596  if (power2x==true)
597  posX_aux = position & (Xdim-1);
598  else
599  posX_aux = position%Xdim;
600 
601  float posY_aux = (float)(position/Xdim);
602  float Xdim2 = (Xdim/2.0f);
603  float Ydim2 = (Ydim/2.0f);
604  float posX, posY;
605 
606  if(posX_aux>=Xdim2 && posY_aux>=Ydim2){
607  posX = Xdim-1-posX_aux;
608  posY = Ydim-1-posY_aux;
609  }else if(posX_aux<Xdim2 && posY_aux>=Ydim2){
610  posX = -(posX_aux+1);
611  posY = Ydim-1-posY_aux;
612  }else if(posX_aux<Xdim2 && posY_aux<Ydim2){
613  posX = -(posX_aux+1);
614  posY = -(posY_aux+1);
615  }else if(posX_aux>=Xdim2 && posY_aux<Ydim2){
616  posX = Xdim-1-posX_aux;
617  posY = -(posY_aux+1);
618  }
619 
620  //Fixing padding problem
621  posX+=fixPadding;
622  posY+=fixPadding;
623 
624  int idx9 = idx*9;
625 
626  float new0 = 1.0f;
627  float new1 = 0.0f;
628  float new2 = -posX;
629  float new3 = 0.0f;
630  float new4 = 1.0f;
631  float new5 = -posY;
632  //float new6 = 0.0f;
633  //float new7 = 0.0f;
634  //float new8 = 1.0f;
635 
636  float last0 = lastMat[idx9];
637  float last1 = lastMat[idx9+1];
638  float last2 = lastMat[idx9+2];
639  float last3 = lastMat[idx9+3];
640  float last4 = lastMat[idx9+4];
641  float last5 = lastMat[idx9+5];
642  float last6 = lastMat[idx9+6];
643  float last7 = lastMat[idx9+7];
644  float last8 = lastMat[idx9+8];
645 
646  float shiftx = new0*last2 + new1*last5 + new2*last8;
647  float shifty = new3*last2 + new4*last5 + new5*last8;
648  float radShift = shiftx*shiftx + shifty*shifty;
649  if(radShift > maxShift2){
650  result[idx9] = last0;
651  result[idx9+1] = last1;
652  result[idx9+2] = last2;
653  result[idx9+3] = last3;
654  result[idx9+4] = last4;
655  result[idx9+5] = last5;
656  maxGpu[idx] = NCC[idx*NCC_yxdim];
657  }else{
658  result[idx9] = new0*last0 + new1*last3 + new2*last6;
659  result[idx9+2] = shiftx;
660  result[idx9+1] = new0*last1 + new1*last4 + new2*last7;
661  result[idx9+3] = new3*last0 + new4*last3 + new5*last6;
662  result[idx9+4] = new3*last1 + new4*last4 + new5*last7;
663  result[idx9+5] = shifty;
664  }
665 
666 }

◆ calcAbsKernel()

__global__ void calcAbsKernel ( cufftComplex *  d_in,
float *  d_out,
int  dim 
)

Definition at line 22 of file cuda_gpu_correlation.cpp.

22  {
23 
24  int idx = blockDim.x * blockIdx.x + threadIdx.x;
25 
26  if(idx>=dim)
27  return;
28 
29  d_out[idx]=d_in[idx].x;
30 
31 }

◆ calculateAbs()

void calculateAbs ( std::complex< float > *  data,
float *  out,
int  size,
myStreamHandle myStream 
)

Definition at line 1505 of file cuda_gpu_correlation.cpp.

1505  {
1506 
1507  cudaStream_t *stream = (cudaStream_t*) myStream.ptr;
1508  int numTh = 1024;
1509  int numBlk = size/numTh;
1510  if(size%numTh > 0)
1511  numBlk++;
1512  calcAbsKernel <<< numBlk, numTh, 0, *stream>>> ((cufftComplex*)data, out, size);
1513 
1514 }

◆ calculateDenomFunctionKernel()

__global__ void calculateDenomFunctionKernel ( float *  MFrealSpace,
float *  MF2realSpace,
float *  maskAutocorrelation,
float *  out,
int  nzyxdim,
int  yxdim,
bool  power2 
)

Definition at line 305 of file cuda_gpu_correlation.cpp.

307 {
308  int idx = blockDim.x * blockIdx.x + threadIdx.x;
309  int idxLow;
310  if (power2==true)
311  idxLow = idx & (yxdim-1);
312  else
313  idxLow = idx%yxdim;
314 
315  if (idx>=nzyxdim)
316  return;
317 
318  out[idx] = sqrt(MF2realSpace[idx] - (MFrealSpace[idx]*MFrealSpace[idx]/maskAutocorrelation[idxLow]));
319 
320 }
void sqrt(Image< double > &op)

◆ calculateMax2()

__global__ void calculateMax2 ( float *  d_in,
float *  d_out,
float *  position,
int  yxdim,
int  Ndim,
bool  firstCall 
)

Definition at line 64 of file cuda_gpu_correlation.cpp.

64  {
65 
66  extern __shared__ float sdata[];
67 
68  int idx = threadIdx.x;
69  int blockSize = blockDim.x;
70 
71 
72  //Version 6
73  int i = blockIdx.x * blockSize + idx;
74 
75  //printf("d_in[%i] %f \n", i, d_in[i]);
76 
77  int index = 0;
78  for(int imN=0; imN<Ndim; imN++){
79 
80  if(i<yxdim*gridDim.x){
81  sdata[idx]=d_in[i+index];
82 
83  if(firstCall)
84  sdata[idx+blockSize] = (float)idx; //AJ position
85  else
86  sdata[idx+blockSize] = position[i+index];
87  }
88  //if (idx==0)
89  //printf("i %i, sdata %f \n", i, sdata[idx]);
90 
91  __syncthreads();
92 
93  if(i>=yxdim*gridDim.x)
94  sdata[idx]=-1.0;
95  __syncthreads();
96 
97 
98  if(blockSize >= 1024){
99  if(idx<512){
100  sdata[idx] = fmaxf(sdata[idx], sdata[idx+512]);
101  sdata[idx+blockSize] = (sdata[idx]==sdata[idx+512]) ? sdata[idx+blockSize+512] : sdata[idx+blockSize];
102  }
103  __syncthreads();
104  }
105  if(blockSize >= 512){
106  if(idx<256){
107  sdata[idx] = fmaxf(sdata[idx], sdata[idx+256]);
108  sdata[idx+blockSize] = (sdata[idx]==sdata[idx+256]) ? sdata[idx+blockSize+256] : sdata[idx+blockSize];
109  }
110  __syncthreads();
111  }
112  if(blockSize >= 256){
113  if(idx<128){
114  sdata[idx] = fmaxf(sdata[idx], sdata[idx+128]);
115  sdata[idx+blockSize] = (sdata[idx]==sdata[idx+128]) ? sdata[idx+blockSize+128] : sdata[idx+blockSize];
116  }
117  __syncthreads();
118  }
119  if(blockSize >= 128){
120  if(idx<64){
121  sdata[idx] = fmaxf(sdata[idx], sdata[idx+64]);
122  sdata[idx+blockSize] = (sdata[idx]==sdata[idx+64]) ? sdata[idx+blockSize+64] : sdata[idx+blockSize];
123  }
124  __syncthreads();
125  }
126  if(idx<32){
127  if(blockSize>=64){
128  if(idx<32){
129  sdata[idx] = fmaxf(sdata[idx], sdata[idx+32]);
130  sdata[idx+blockSize] = (sdata[idx]==sdata[idx+32]) ? sdata[idx+blockSize+32] : sdata[idx+blockSize];
131  }
132  }
133  if(blockSize>=32){
134  if(idx<16){
135  sdata[idx] = fmaxf(sdata[idx], sdata[idx+16]);
136  sdata[idx+blockSize] = (sdata[idx]==sdata[idx+16]) ? sdata[idx+blockSize+16] : sdata[idx+blockSize];
137  }
138  }
139  if(blockSize>=16){
140  if(idx<8){
141  sdata[idx] = fmaxf(sdata[idx], sdata[idx+8]);
142  sdata[idx+blockSize] = (sdata[idx]==sdata[idx+8]) ? sdata[idx+blockSize+8] : sdata[idx+blockSize];
143  }
144  }
145  if(blockSize>=8){
146  if(idx<4){
147  sdata[idx] = fmaxf(sdata[idx], sdata[idx+4]);
148  sdata[idx+blockSize] = (sdata[idx]==sdata[idx+4]) ? sdata[idx+blockSize+4] : sdata[idx+blockSize];
149  }
150  }
151  if(blockSize>=4){
152  if(idx<2){
153  sdata[idx] = fmaxf(sdata[idx], sdata[idx+2]);
154  sdata[idx+blockSize] = (sdata[idx]==sdata[idx+2]) ? sdata[idx+blockSize+2] : sdata[idx+blockSize];
155  }
156  }
157  if(blockSize>=2){
158  if(idx<1){
159  sdata[idx] = fmaxf(sdata[idx], sdata[idx+1]);
160  sdata[idx+blockSize] = (sdata[idx]==sdata[idx+1]) ? sdata[idx+blockSize+1] : sdata[idx+blockSize];
161  }
162  }
163  }
164 
165  if(idx==0){
166  //printf("idx %i sdata[0] %f sdata[blockSize] %f \n", blockIdx.x+(gridDim.x*imN), sdata[0], sdata[blockSize]);
167  d_out[blockIdx.x+(gridDim.x*imN)] = sdata[0];
168  position[blockIdx.x+(gridDim.x*imN)] = sdata[blockSize];
169  }
170 
171  __syncthreads();
172 
173  index = index+(int)yxdim;
174 
175  }
176 
177 }
#define i
viol index

◆ calculateMaxNew2DNew()

void calculateMaxNew2DNew ( int  yxdim,
int  Ndim,
float *  d_data,
GpuMultidimArrayAtGpu< float > &  d_out,
GpuMultidimArrayAtGpu< float > &  d_pos,
myStreamHandle myStream 
)

Definition at line 1111 of file cuda_gpu_correlation.cpp.

1112  {
1113 
1114  cudaStream_t *stream = (cudaStream_t*) myStream.ptr;
1115 
1116  int numTh = 1024;
1117  int numBlk = Ndim;
1118 
1119  d_out.resize(numTh * numBlk);
1120  d_pos.resize(numTh * numBlk);
1121 
1122  calculateMaxThreads<<<numBlk, numTh, 2*numTh * sizeof(float), *stream>>>(d_data, d_out.d_data, d_pos.d_data, yxdim, Ndim, numTh, 1);
1123 
1124 }
void resize(const GpuMultidimArrayAtGpu< T1 > &array)

◆ calculateMaxThreads()

__global__ void calculateMaxThreads ( float *  d_in,
float *  d_out,
float *  position,
int  yxdim,
int  Ndim,
int  yxdim2,
int  Ndim2 
)

Definition at line 807 of file cuda_gpu_correlation.cpp.

808  {
809 
810  int idx = threadIdx.x;
811  int nIm = blockIdx.x;
812  int blockSize = blockDim.x;
813 
814  int posTh = nIm*yxdim + idx;
815  int n = ceilf(((float)yxdim/(float)blockSize));
816 
817  float tmp, tmpPos;
818 
819  if(idx>=yxdim){
820  tmp = -1.0f;
821  tmpPos = -1.0f;
822  }else{
823  tmp = d_in[posTh];
824  tmpPos = idx;
825  for (int i=1; i<n; i++){
826  if (posTh+i*blockSize < yxdim*(nIm+1)){
827  tmp = fmaxf(tmp, d_in[posTh+i*blockSize]);
828  tmpPos = (tmp==d_in[posTh+i*blockSize]) ? idx+i*blockSize : tmpPos;
829  }
830  }
831  }
832  int posOut = nIm*blockSize + idx;
833  d_out[posOut] = tmp;
834  position[posOut] = tmpPos;
835 
836  __syncthreads();
837 
838 
839  extern __shared__ float sdata[];
840 
841 
842  //Version 6
843  int i = blockIdx.x * blockSize + idx;
844 
845 
846  int index = 0;
847  for(int imN=0; imN<Ndim2; imN++){
848 
849  if(i<yxdim2*gridDim.x){
850  sdata[idx]=d_out[i+index];
851  sdata[idx+blockSize] = position[i+index];
852  }
853 
854  __syncthreads();
855 
856  if(i>=yxdim2*gridDim.x)
857  sdata[idx]=-1.0;
858  __syncthreads();
859 
860 
861  if(blockSize >= 1024){
862  if(idx<512){
863  sdata[idx] = fmaxf(sdata[idx], sdata[idx+512]);
864  sdata[idx+blockSize] = (sdata[idx]==sdata[idx+512]) ? sdata[idx+blockSize+512] : sdata[idx+blockSize];
865  }
866  __syncthreads();
867  }
868  if(blockSize >= 512){
869  if(idx<256){
870  sdata[idx] = fmaxf(sdata[idx], sdata[idx+256]);
871  sdata[idx+blockSize] = (sdata[idx]==sdata[idx+256]) ? sdata[idx+blockSize+256] : sdata[idx+blockSize];
872  }
873  __syncthreads();
874  }
875  if(blockSize >= 256){
876  if(idx<128){
877  sdata[idx] = fmaxf(sdata[idx], sdata[idx+128]);
878  sdata[idx+blockSize] = (sdata[idx]==sdata[idx+128]) ? sdata[idx+blockSize+128] : sdata[idx+blockSize];
879  }
880  __syncthreads();
881  }
882  if(blockSize >= 128){
883  if(idx<64){
884  sdata[idx] = fmaxf(sdata[idx], sdata[idx+64]);
885  sdata[idx+blockSize] = (sdata[idx]==sdata[idx+64]) ? sdata[idx+blockSize+64] : sdata[idx+blockSize];
886  }
887  __syncthreads();
888  }
889  if(idx<32){
890  if(blockSize>=64){
891  if(idx<32){
892  sdata[idx] = fmaxf(sdata[idx], sdata[idx+32]);
893  sdata[idx+blockSize] = (sdata[idx]==sdata[idx+32]) ? sdata[idx+blockSize+32] : sdata[idx+blockSize];
894  }
895  }
896  if(blockSize>=32){
897  if(idx<16){
898  sdata[idx] = fmaxf(sdata[idx], sdata[idx+16]);
899  sdata[idx+blockSize] = (sdata[idx]==sdata[idx+16]) ? sdata[idx+blockSize+16] : sdata[idx+blockSize];
900  }
901  }
902  if(blockSize>=16){
903  if(idx<8){
904  sdata[idx] = fmaxf(sdata[idx], sdata[idx+8]);
905  sdata[idx+blockSize] = (sdata[idx]==sdata[idx+8]) ? sdata[idx+blockSize+8] : sdata[idx+blockSize];
906  }
907  }
908  if(blockSize>=8){
909  if(idx<4){
910  sdata[idx] = fmaxf(sdata[idx], sdata[idx+4]);
911  sdata[idx+blockSize] = (sdata[idx]==sdata[idx+4]) ? sdata[idx+blockSize+4] : sdata[idx+blockSize];
912  }
913  }
914  if(blockSize>=4){
915  if(idx<2){
916  sdata[idx] = fmaxf(sdata[idx], sdata[idx+2]);
917  sdata[idx+blockSize] = (sdata[idx]==sdata[idx+2]) ? sdata[idx+blockSize+2] : sdata[idx+blockSize];
918  }
919  }
920  if(blockSize>=2){
921  if(idx<1){
922  sdata[idx] = fmaxf(sdata[idx], sdata[idx+1]);
923  sdata[idx+blockSize] = (sdata[idx]==sdata[idx+1]) ? sdata[idx+blockSize+1] : sdata[idx+blockSize];
924  }
925  }
926  }
927 
928  if(idx==0){
929  d_out[blockIdx.x+(gridDim.x*imN)] = sdata[0];
930  position[blockIdx.x+(gridDim.x*imN)] = sdata[blockSize];
931  }
932 
933  __syncthreads();
934 
935  index = index+(int)yxdim2;
936 
937  }
938 
939 }
#define i
viol index
int * n

◆ calculateNccKernel()

__global__ void calculateNccKernel ( float *  RefExpRealSpace,
float *  MFrealSpaceRef,
float *  MFrealSpaceExp,
float *  MF2realSpaceRef,
float *  MF2realSpaceExp,
float *  mask,
float *  NCC,
int  nzyxdim,
int  yxdim,
int  xdim,
int  ydim,
int  maskCount,
int  max_shift,
bool  power2yx,
bool  power2x 
)

Definition at line 323 of file cuda_gpu_correlation.cpp.

326 {
327 
328  //change unsigned... and size_t
329  int idx = blockDim.x * blockIdx.x + threadIdx.x;
330  int idxLow;
331  if (power2yx==true)
332  idxLow = idx & (yxdim-1);
333  else
334  idxLow = idx%yxdim;
335 
336  if(idx>=nzyxdim)
337  return;
338 
339  int idx_x;
340  if (power2x==true)
341  idx_x = idxLow & (xdim-1);
342  else
343  idx_x = idxLow%xdim;
344 
345  int idx_y=idxLow/xdim;
346  if(idx_x>=max_shift && idx_x<xdim-max_shift){
347  NCC[idx] = -1;
348  return;
349  }
350  if(idx_y>=max_shift && idx_y<ydim-max_shift){
351  NCC[idx] = -1;
352  return;
353  }
354 
355  float maskTh = mask[idxLow];
356  float MF2realSpaceRefTh = MF2realSpaceRef[idx];
357  float MFrealSpaceRefTh = MFrealSpaceRef[idx];
358  float MF2realSpaceExpTh = MF2realSpaceExp[idx];
359  float MFrealSpaceExpTh = MFrealSpaceExp[idx];
360  float RefExpRealSpaceTh = RefExpRealSpace[idx];
361 
362  float den1 = sqrt(MF2realSpaceRefTh - (MFrealSpaceRefTh*MFrealSpaceRefTh/maskTh));
363  float den2 = sqrt(MF2realSpaceExpTh - (MFrealSpaceExpTh*MFrealSpaceExpTh/maskTh));
364 
365  float num;
366  if(den1!=0.0 && den2!=0.0 && !isnan(den1) && !isnan(den2) && maskTh>maskCount*0.9){
367  num = (RefExpRealSpaceTh - ((MFrealSpaceRefTh*MFrealSpaceExpTh)/(maskTh)) );
368  NCC[idx] = num/(den1*den2);
369  }else
370  NCC[idx] = -1;
371 
372 }
void sqrt(Image< double > &op)

◆ calculateNccRotationKernel()

__global__ void calculateNccRotationKernel ( float *  RefExpRealSpace,
cufftComplex *  polarFFTRef,
cufftComplex *  polarFFTExp,
cufftComplex *  polarSquaredFFTRef,
cufftComplex *  polarSquaredFFTExp,
float  maskFFTPolarReal,
float *  NCC,
int  yxdimFFT,
int  nzyxdim,
int  yxdim 
)

Definition at line 467 of file cuda_gpu_correlation.cpp.

470 {
471 
472  //change unsigned lont int for int and all the size_t
473  int idx = blockDim.x * blockIdx.x + threadIdx.x;
474  int idxLow = (idx/yxdim)*yxdimFFT;
475 
476  if(idx>=nzyxdim)
477  return;
478 
479  float normValue = 1.0f/yxdimFFT;
480  float maskNorm = maskFFTPolarReal*normValue;
481 
482  float M1M2Polar = maskFFTPolarReal*maskNorm;
483  float polarValRef = cuCrealf(polarFFTRef[idxLow])*maskNorm;
484  float polarSqValRef = cuCrealf(polarSquaredFFTRef[idxLow])*maskNorm;
485 
486  float polarValExp = cuCrealf(polarFFTExp[idxLow])*maskNorm;
487  float polarSqValExp = cuCrealf(polarSquaredFFTExp[idxLow])*maskNorm;
488 
489  float num = (RefExpRealSpace[idx] - (polarValRef*polarValExp/M1M2Polar) );
490  float den1 = sqrt(polarSqValRef - (polarValRef*polarValRef/M1M2Polar) );
491  float den2 = sqrt(polarSqValExp - (polarValExp*polarValExp/M1M2Polar) );
492 
493  if(den1!=0.0 && den2!=0.0 && !isnan(den1) && !isnan(den2))
494  NCC[idx] = num/(den1*den2);
495  else
496  NCC[idx] = -1.0;
497 
498 }
void sqrt(Image< double > &op)

◆ cart2polar()

__global__ void cart2polar ( float *  image,
float *  polar,
float *  polar2,
int  maxRadius,
int  maxAng,
int  Nimgs,
int  Ydim,
int  Xdim,
bool  rotate 
)

Definition at line 749 of file cuda_gpu_correlation.cpp.

751 {
752  int angle = blockDim.x * blockIdx.x + threadIdx.x;
753  int radius = blockDim.y * blockIdx.y + threadIdx.y;
754 
755  int numIm = blockIdx.z;
756 
757  if (radius>=maxRadius || angle>=maxAng)
758  return;
759 
760  float x = (float)(radius)*cosf((float)angle*(float)PI/180.0f) + (float)Xdim/2.0f;
761  float y = (float)(radius)*sinf((float)angle*(float)PI/180.0f) + (float)Ydim/2.0f;
762 
763  float dx_low = floorf(x);
764  float dy_low = floorf(y);
765  int x_low = (int)dx_low;
766  int y_low = (int)dy_low;
767  float x_x_low=x-dx_low;
768  float y_y_low=y-dy_low;
769  float one_x=1.0-x_x_low;
770  float one_y=1.0-y_y_low;
771  float w00=one_y*one_x;
772  float w01=one_y*x_x_low;
773  float w10=y_y_low*one_x;
774  float w11=y_y_low*x_x_low;
775 
776  int NXY=Xdim*Ydim;
777  int NXYpolar=maxAng*maxRadius;
778  int imgIdx00=y_low * Xdim + x_low;
779  int imgIdx01=imgIdx00+1;
780  int imgIdx10=imgIdx00+Xdim;
781  int imgIdx11=imgIdx10+1;
782  int imgOffset=0;
783  int polarOffset=0;
784  int polarIdx;
785  if(!rotate)
786  polarIdx=angle+(radius*maxAng);
787  else
788  polarIdx = (maxAng-angle-1)+((maxRadius-radius-1)*maxAng);
789 
790  int n=numIm;
791 
792  imgOffset = n*NXY;
793  polarOffset = n*NXYpolar;
794  float I00 = image[imgIdx00+imgOffset];
795  float I01 = image[imgIdx01+imgOffset];
796  float I10 = image[imgIdx10+imgOffset];
797  float I11 = image[imgIdx11+imgOffset];
798  float imVal = I00*w00 + I01*w01 + I10*w10 + I11*w11;
799  int finalPolarIndex=polarIdx+polarOffset;
800  polar[finalPolarIndex] = imVal;
801  polar2[finalPolarIndex] = imVal*imVal;
802 
803 }
static double * y
doublereal * x
#define rotate(a, i, j, k, l)
double * f
#define PI
double radius
Spatial radius in Universal System units.
Definition: blobs.h:115
int * n

◆ cuda_cart2polar()

void cuda_cart2polar ( GpuMultidimArrayAtGpu< float > &  image,
GpuMultidimArrayAtGpu< float > &  polar_image,
GpuMultidimArrayAtGpu< float > &  polar2_image,
bool  rotate,
myStreamHandle myStream 
)

Definition at line 1478 of file cuda_gpu_correlation.cpp.

1480 {
1481  cudaStream_t *stream = (cudaStream_t*) myStream.ptr;
1482  int numTh = 32;
1483  int numBlkx = polar_image.Xdim/numTh;
1484  if(polar_image.Xdim%numTh > 0)
1485  numBlkx++;
1486  int numBlky = polar_image.Ydim/numTh;
1487  if(polar_image.Ydim%numTh > 0)
1488  numBlky++;
1489 
1490  dim3 blockSize(numTh, numTh, 1);
1491  dim3 gridSize(numBlkx, numBlky, polar_image.Ndim);
1492 
1493  cart2polar <<< gridSize, blockSize, 0, *stream>>>
1494  (image.d_data, polar_image.d_data, polar2_image.d_data, polar_image.Ydim, polar_image.Xdim, polar_image.Ndim, image.Ydim, image.Xdim, rotate);
1495 }
#define rotate(a, i, j, k, l)

◆ maskingPaddingKernel()

__global__ void maskingPaddingKernel ( float *  d_in,
float *  mask,
float *  padded_image_gpu,
float *  padded_image2_gpu,
float *  padded_mask_gpu,
int  xdim,
int  ydim,
int  yxdim,
int  numImag,
int  pad_xdim,
int  pad_ydim,
int  pad_yxdim,
bool  experimental,
bool  power2x 
)

Definition at line 517 of file cuda_gpu_correlation.cpp.

519  {
520 
521  int idx = blockDim.x * blockIdx.x + threadIdx.x;
522  int numIm = blockIdx.y;
523 
524  if(idx>=yxdim)
525  return;
526 
527  int x_idx1;
528  if (power2x==true)
529  x_idx1 = idx & (xdim-1);
530  else
531  x_idx1 = idx%xdim;
532 
533  int y_idx1 = idx/xdim;
534  int idxWriteToMask;
535  if(experimental)
536  idxWriteToMask = (ydim-1 - y_idx1)*xdim + (xdim-1 - x_idx1);
537  else
538  idxWriteToMask = y_idx1*xdim + x_idx1;
539 
540  int xdim2Im = (int)floorf((pad_xdim-xdim)/2.0f);
541  int ydim2Im = (int)floorf((pad_ydim-ydim)/2.0f);
542  int xdim2Mask = xdim2Im;
543  int ydim2Mask = ydim2Im;
544  if(experimental && (xdim&1)==0){
545  xdim2Im+=1;
546  ydim2Im+=1;
547  }
548 
549  int x_idx;
550  if (power2x==true)
551  x_idx = idxWriteToMask & (xdim-1);
552  else
553  x_idx = idxWriteToMask%xdim;
554 
555  int y_idx = idxWriteToMask/xdim;
556  int idxWrite;
557  int idxWriteMask;
558  float d_out, d_out2;
559 
560  int offset=0;
561  //for(int j=0; j<numImag; j++){
562 
563  int j = numIm;
564 
565  offset=j*yxdim;
566 
567  d_out = d_in[idx+offset]*mask[idx];
568  d_out2 = d_out*d_out;
569 
570  idxWrite = (pad_yxdim*j) + (ydim2Im*pad_xdim) + (y_idx*pad_xdim) + xdim2Im + x_idx;
571  if((xdim&1)==0)
572  idxWriteMask = (pad_yxdim*j) + (ydim2Mask*pad_xdim) + (y_idx*pad_xdim) + xdim2Mask + x_idx;
573  else
574  idxWriteMask = idxWrite;
575  padded_image_gpu[idxWrite] = d_out;
576  padded_image2_gpu[idxWrite] = d_out2;
577  if(j==0 && padded_mask_gpu!=NULL)
578  padded_mask_gpu[idxWriteMask] = mask[idx];
579 
580  //}
581 }
#define j

◆ pointwiseMultiplicationComplexKernel()

__global__ void pointwiseMultiplicationComplexKernel ( cufftComplex *  reference,
cufftComplex *  experimental,
cufftComplex *  RefExpFourier,
int  nzyxdim,
int  yxdim 
)

Definition at line 501 of file cuda_gpu_correlation.cpp.

503 {
504  int idx = blockDim.x * blockIdx.x + threadIdx.x;
505 
506  if(idx>=nzyxdim)
507  return;
508 
509  float normFactor = (1.0f/yxdim);
510 
511  cuComplex mulOut = cuCmulf(reference[idx], experimental[idx]);
512  RefExpFourier[idx] = mulOut*normFactor;
513 }

◆ pointwiseMultiplicationComplexOneManyKernel()

__global__ void pointwiseMultiplicationComplexOneManyKernel ( cufftComplex *  M,
cufftComplex *  manyF,
cufftComplex *  MmanyF,
int  nzyxdim,
int  yxdim,
bool  power2 
)

Definition at line 283 of file cuda_gpu_correlation.cpp.

285 {
286 
287  int idx = blockDim.x * blockIdx.x + threadIdx.x;
288  int idxLow;
289  if (power2==true)
290  idxLow = idx & (yxdim-1);
291  else
292  idxLow = idx%yxdim;
293 
294  if (idx>=nzyxdim)
295  return;
296 
297  float normFactor = (1.0f/yxdim);
298 
299  cuComplex mulOut = cuCmulf(manyF[idx], M[idxLow]);
300  MmanyF[idx] = mulOut*normFactor;
301 
302 }

◆ pointwiseMultiplicationComplexOneManyKernel_three()

__global__ void pointwiseMultiplicationComplexOneManyKernel_three ( cufftComplex *  M,
cufftComplex *  manyF,
cufftComplex *  MmanyF,
cufftComplex *  manyF_sq,
cufftComplex *  MmanyF_sq,
cufftComplex *  maskAux,
int  nzyxdim,
int  yxdim,
int  ndim,
bool  power2 
)

Definition at line 180 of file cuda_gpu_correlation.cpp.

183 {
184 
185  int idx = threadIdx.x;
186  int nIm = blockIdx.x;
187  int blockSize = blockDim.x;
188 
189  int posTh = nIm*yxdim + idx;
190  int n = ceilf(((float)yxdim/(float)blockSize));
191 
192  if (idx>=yxdim)
193  return;
194 
195  float normFactor = 1.0f/yxdim;
196 
197  int myIdx = posTh;
198  int myIdxMask = idx;
199 
200  cufftComplex myM = M[myIdxMask];
201  cuComplex mulOut = cuCmulf(manyF[myIdx], myM);
202  cuComplex mulOut_sq = cuCmulf(manyF_sq[myIdx], myM);
203 
204  MmanyF[myIdx] = mulOut*normFactor;
205  MmanyF_sq[myIdx] = mulOut_sq*normFactor;
206  if(nIm==0)
207  maskAux[myIdx] = cuCmulf(myM, myM)*normFactor;
208 
209  myIdx+=blockSize;
210  myIdxMask+=blockSize;
211 
212  for (int i=1; i<n; i++){
213 
214  if (posTh+i*blockSize < yxdim*(nIm+1)){
215 
216  myM = M[myIdxMask];
217  mulOut = cuCmulf(manyF[myIdx], myM);
218  mulOut_sq = cuCmulf(manyF_sq[myIdx], myM);
219 
220  MmanyF[myIdx] = mulOut*normFactor;
221  MmanyF_sq[myIdx] = mulOut_sq*normFactor;
222  if(nIm==0)
223  maskAux[myIdx] = cuCmulf(myM, myM)*normFactor;
224 
225  myIdx+=blockSize;
226  myIdxMask+=blockSize;
227 
228  }
229  }
230 }
#define i
int * n

◆ pointwiseMultiplicationComplexOneManyKernel_two()

__global__ void pointwiseMultiplicationComplexOneManyKernel_two ( cufftComplex *  M,
cufftComplex *  manyF,
cufftComplex *  MmanyF,
cufftComplex *  manyF_sq,
cufftComplex *  MmanyF_sq,
int  nzyxdim,
int  yxdim,
int  ndim,
bool  power2 
)

Definition at line 233 of file cuda_gpu_correlation.cpp.

236 {
237 
238  int idx = threadIdx.x;
239  int nIm = blockIdx.x;
240  int blockSize = blockDim.x;
241 
242  int posTh = nIm*yxdim + idx;
243  int n = ceilf(((float)yxdim/(float)blockSize));
244 
245  if (idx>=yxdim)
246  return;
247 
248  float normFactor = 1.0f/yxdim;
249 
250  int myIdx = posTh;
251  int myIdxMask = idx;
252 
253  cufftComplex myM = M[myIdxMask];
254  cuComplex mulOut = cuCmulf(manyF[myIdx], myM);
255  cuComplex mulOut_sq = cuCmulf(manyF_sq[myIdx], myM);
256 
257  MmanyF[myIdx] = mulOut*normFactor;
258  MmanyF_sq[myIdx] = mulOut_sq*normFactor;
259 
260  myIdx+=blockSize;
261  myIdxMask+=blockSize;
262 
263  for (int i=1; i<n; i++){
264 
265  if (posTh+i*blockSize < yxdim*(nIm+1)){
266 
267  myM = M[myIdxMask];
268  mulOut = cuCmulf(manyF[myIdx], myM);
269  mulOut_sq = cuCmulf(manyF_sq[myIdx], myM);
270 
271  MmanyF[myIdx] = mulOut*normFactor;
272  MmanyF_sq[myIdx] = mulOut_sq*normFactor;
273 
274  myIdx+=blockSize;
275  myIdxMask+=blockSize;
276 
277  }
278  }
279 }
#define i
int * n

◆ pointwiseMultiplicationFourier()

void pointwiseMultiplicationFourier ( const GpuMultidimArrayAtGpu< std::complex< float > > &  M,
const GpuMultidimArrayAtGpu< std::complex< float > > &  manyF,
GpuMultidimArrayAtGpu< std::complex< float > > &  MmanyF,
myStreamHandle myStream 
)

Definition at line 973 of file cuda_gpu_correlation.cpp.

975 {
976  cudaStream_t *stream = (cudaStream_t*) myStream.ptr;
977  int numTh = 1024;
978  XmippDim3 blockSize(numTh, 1, 1), gridSize;
979  manyF.calculateGridSizeVectorized(blockSize, gridSize);
980 
981  bool power2;
982  if (manyF.yxdim & (manyF.yxdim-1))
983  power2 = false;
984  else
985  power2 = true;
986  pointwiseMultiplicationComplexOneManyKernel <<< CONVERT2DIM3(gridSize), CONVERT2DIM3(blockSize), 0, *stream >>>
987  ((cufftComplex*)M.d_data, (cufftComplex*)manyF.d_data, (cufftComplex*) MmanyF.d_data, manyF.nzyxdim, manyF.yxdim, power2);
988 
989 }
void calculateGridSizeVectorized(const XmippDim3 &blockSize, XmippDim3 &gridSize) const

◆ pointwiseMultiplicationFourier_three()

void pointwiseMultiplicationFourier_three ( const GpuMultidimArrayAtGpu< std::complex< float > > &  M,
const GpuMultidimArrayAtGpu< std::complex< float > > &  manyF,
GpuMultidimArrayAtGpu< std::complex< float > > &  MmanyF,
const GpuMultidimArrayAtGpu< std::complex< float > > &  manyF_sq,
GpuMultidimArrayAtGpu< std::complex< float > > &  MmanyF_sq,
myStreamHandle myStream,
GpuMultidimArrayAtGpu< std::complex< float > > &  maskAux 
)

Definition at line 1015 of file cuda_gpu_correlation.cpp.

1022 {
1023  cudaStream_t *stream = (cudaStream_t*) myStream.ptr;
1024  int numTh = 1024;
1025  int numBlk = manyF.Ndim;
1026 
1027  bool power2;
1028  if (manyF.yxdim & (manyF.yxdim-1))
1029  power2 = false;
1030  else
1031  power2 = true;
1032 
1033  pointwiseMultiplicationComplexOneManyKernel_three <<< numBlk, numTh, 0, *stream >>>
1034  ((cufftComplex*)M.d_data, (cufftComplex*)manyF.d_data, (cufftComplex*) MmanyF.d_data,
1035  (cufftComplex*)manyF_sq.d_data, (cufftComplex*) MmanyF_sq.d_data,
1036  (cufftComplex*) maskAux.d_data,
1037  manyF.nzyxdim, manyF.yxdim, manyF.Ndim, power2);
1038 
1039 }
if(fabs(c[*nmax+ *nmax *c_dim1])==0.e0)

◆ pointwiseMultiplicationFourier_two()

void pointwiseMultiplicationFourier_two ( const GpuMultidimArrayAtGpu< std::complex< float > > &  M,
const GpuMultidimArrayAtGpu< std::complex< float > > &  manyF,
GpuMultidimArrayAtGpu< std::complex< float > > &  MmanyF,
const GpuMultidimArrayAtGpu< std::complex< float > > &  manyF_sq,
GpuMultidimArrayAtGpu< std::complex< float > > &  MmanyF_sq,
myStreamHandle myStream 
)

Definition at line 991 of file cuda_gpu_correlation.cpp.

997 {
998  cudaStream_t *stream = (cudaStream_t*) myStream.ptr;
999  int numTh = 1024;
1000  int numBlk = manyF.Ndim;
1001 
1002  bool power2;
1003  if (manyF.yxdim & (manyF.yxdim-1))
1004  power2 = false;
1005  else
1006  power2 = true;
1007 
1008  pointwiseMultiplicationComplexOneManyKernel_two <<< numBlk, numTh, 0, *stream >>>
1009  ((cufftComplex*)M.d_data, (cufftComplex*)manyF.d_data, (cufftComplex*) MmanyF.d_data,
1010  (cufftComplex*)manyF_sq.d_data, (cufftComplex*) MmanyF_sq.d_data,
1011  manyF.nzyxdim, manyF.yxdim, manyF.Ndim, power2);
1012 
1013 }
if(fabs(c[*nmax+ *nmax *c_dim1])==0.e0)

◆ sumRadiusKernel()

__global__ void sumRadiusKernel ( float *  d_in,
float *  d_out,
float *  d_out_max,
float *  d_out_zero,
int  dim,
int  radius,
int  ndim 
)

Definition at line 33 of file cuda_gpu_correlation.cpp.

34  {
35 
36  int idx = blockDim.x * blockIdx.x + threadIdx.x;
37  int numIm = floorf(idx/360.0f);
38  int angle = idx%360;
39 
40  if(idx>=dim)
41  return;
42 
43  float out = 0.0;
44  float out_max = -100000;
45  int idxRead=360*radius*numIm;
46  for(int i=0; i<radius; i++){
47  if(d_in[idxRead+(360*i)+angle]==-1.0){
48  continue;
49  }
50  out += d_in[idxRead+(360*i)+angle];
51  if(d_in[idxRead+(360*i)+angle]>d_out_max[idx]){
52  out_max = d_in[idxRead+(360*i)+angle];
53  }
54 
55  if(i==0)
56  d_out_zero[idx] = d_in[idxRead+angle];
57  }
58  d_out[idx] = out;
59  d_out_max[idx] = out_max;
60 
61 }
#define i
double * f
double radius
Spatial radius in Universal System units.
Definition: blobs.h:115

◆ waitGpu()

void waitGpu ( myStreamHandle myStream,
bool  allStreams 
)

Definition at line 1497 of file cuda_gpu_correlation.cpp.

1497  {
1498  if(!allStreams){
1499  cudaStream_t *stream = (cudaStream_t*) myStream.ptr;
1500  gpuErrchk(cudaStreamSynchronize(*stream));
1501  }else
1502  gpuErrchk(cudaDeviceSynchronize());
1503 }
#define gpuErrchk(code)
Definition: cuda_asserts.h:31

◆ wrapping()

__device__ void wrapping ( int &  x,
int &  y,
int  xdim,
int  ydim 
)

Definition at line 374 of file cuda_gpu_correlation.cpp.

374  {
375  //mirror wrapping
376  if(x<0)
377  x=-x;
378  else if(x>=xdim)
379  x=xdim-(x-xdim)-1;
380  if(y<0)
381  y=-y;
382  else if(y>=ydim)
383  y=ydim-(y-ydim)-1;
384 }
static double * y
doublereal * x