Xmipp  v3.23.11-Nereus
cuda_xmipp_utils.cpp
Go to the documentation of this file.
1 
2 
3 #include "cuda_xmipp_utils.h"
4 #include "cuda_asserts.h"
5 
6 #include <cuda_runtime.h>
7 #include <cufft.h>
8 #include <cufftXt.h>
9 #include <cuComplex.h>
10 #include <nvml.h>
11 
12 #include <time.h>
13 #include <sys/time.h>
14 
17  cufftComplex *data;
18 };
19 
20 template<typename T>
22  size_t _Ndim, size_t _Xdim, size_t _Ydim, size_t _Zdim) {
23  if (_Xdim*_Ydim*_Zdim*_Ndim==nzyxdim)
24  return;
25 
26  clear();
27 
28  Xdim=_Xdim;
29  Ydim=_Ydim;
30  Zdim=_Zdim;
31  Ndim=_Ndim;
32  yxdim=_Ydim*_Xdim;
33  zyxdim=yxdim*_Zdim;
34  nzyxdim=zyxdim*_Ndim;
35  gpuErrchk(cudaMalloc(&d_data,nzyxdim*sizeof(T)));
36  gpuErrchk(cudaMallocHost(&h_data,nzyxdim*sizeof(T)));
37  initializeIdentity(d_data, h_data, Ndim, myStream);
38 }
39 
40 template void TransformMatrix<float>::resize(myStreamHandle &myStream,
41  size_t _Ndim, size_t _Xdim, size_t _Ydim, size_t _Zdim);
42 
43 template<typename T>
44 void GpuMultidimArrayAtGpu<T>::resize(size_t _Xdim, size_t _Ydim, size_t _Zdim, size_t _Ndim)
45 {
46  if (_Xdim*_Ydim*_Zdim*_Ndim==nzyxdim){
47 
48  return;
49  }
50 
51  clear();
52 
53  setDims(_Xdim, _Ydim, _Zdim, _Ndim);
54  gpuErrchk(cudaMalloc(&d_data,nzyxdim*sizeof(T)));
55 }
56 
57 void myStreamDestroy(void *ptr)
58 {
59  cudaStream_t *streamPtr = (cudaStream_t *)ptr;
60  cudaStreamDestroy(*streamPtr);
61 }
62 
64 {
65  cudaStream_t *streamPtr = new cudaStream_t;
66  gpuErrchk(cudaStreamCreate(streamPtr));
67  myStream.ptr = (void*)streamPtr;
68  //printf("ptr %p\n", myStream.ptr);
69  //printf("streamPtr %p\n", streamPtr);
70 }
71 
72 void mycufftDestroy(void* ptr)
73 {
74  cufftHandle *planPtr = (cufftHandle *)ptr;
75  cufftDestroy(*planPtr);
76  delete planPtr;
77 }
78 
80  printf("calculateFFTPlanSize myhandle.ptr: %p\n",myhandle.ptr);
81  size_t ws2;
82  cufftHandle *planFptr=(cufftHandle *)myhandle.ptr;
83  cufftGetSize(*planFptr, &ws2);
84  printf("calculateFFTPlanSize size %i\n", (int)ws2);
85 }
86 
87 
88 void createPlanFFT(int Xdim, int Ydim, int Ndim, int Zdim, bool forward, cufftHandle *plan){
89 
90  int Xfdim=(Xdim/2)+1;
91 
92  int nr1[] = {Xdim}; // --- Size of the image in real space
93  int nr2[] = {Ydim, Xdim}; // --- Size of the image in real space
94  int nr3[] = {Zdim, Ydim, Xdim}; // --- Size of the image in real space
95 
96  int nf1[] = {Xfdim}; // --- Size of the Fourier transform
97  int nf2[] = {Ydim, Xfdim}; // --- Size of the Fourier transform
98  int nf3[] = {Zdim, Ydim, Xfdim}; // --- Size of the Fourier transform
99  int *nr=NULL, *nf=NULL;
100  int NRANK; // 1D, 2D or 3D FFTs
101  if (Ydim==1 && Zdim==1)
102  {
103  NRANK=1;
104  nr=nr1;
105  nf=nf1;
106  }
107  else if (Zdim==1)
108  {
109  NRANK=2;
110  nr=nr2;
111  nf=nf2;
112  }
113  else
114  {
115  NRANK=3;
116  nr=nr3;
117  nf=nf3;
118  }
119 
120  int rstride = 1; // --- Distance between two successive input/output elements
121  int fstride = 1;
122  int rdist = Xdim*Ydim*Zdim; // --- Distance between batches
123  int fdist = Xfdim*Ydim*Zdim;
124 
125  if(forward){
126  gpuErrchkFFT(cufftPlanMany(plan, NRANK, nr, nr, rstride, rdist, nf, fstride, fdist, CUFFT_R2C, Ndim));
127  }else{
128  gpuErrchkFFT(cufftPlanMany(plan, NRANK, nr, nf, fstride, fdist, nr, rstride, rdist, CUFFT_C2R, Ndim));
129  }
130 
131 }
132 
133 void createPlanFFTStream(int Xdim, int Ydim, int Ndim, int Zdim,
134  bool forward, cufftHandle *plan, myStreamHandle &myStream){
135 
136  int Xfdim=(Xdim/2)+1;
137 
138  int nr1[] = {Xdim}; // --- Size of the image in real space
139  int nr2[] = {Ydim, Xdim}; // --- Size of the image in real space
140  int nr3[] = {Zdim, Ydim, Xdim}; // --- Size of the image in real space
141 
142  int nf1[] = {Xfdim}; // --- Size of the Fourier transform
143  int nf2[] = {Ydim, Xfdim}; // --- Size of the Fourier transform
144  int nf3[] = {Zdim, Ydim, Xfdim}; // --- Size of the Fourier transform
145  int *nr=NULL, *nf=NULL;
146  int NRANK; // 1D, 2D or 3D FFTs
147  if (Ydim==1 && Zdim==1)
148  {
149  NRANK=1;
150  nr=nr1;
151  nf=nf1;
152  }
153  else if (Zdim==1)
154  {
155  NRANK=2;
156  nr=nr2;
157  nf=nf2;
158  }
159  else
160  {
161  NRANK=3;
162  nr=nr3;
163  nf=nf3;
164  }
165 
166  int rstride = 1; // --- Distance between two successive input/output elements
167  int fstride = 1;
168  int rdist = Xdim*Ydim*Zdim; // --- Distance between batches
169  int fdist = Xfdim*Ydim*Zdim;
170 
171  cudaStream_t *stream = (cudaStream_t*) myStream.ptr;
172  if(forward){
173  gpuErrchkFFT(cufftPlanMany(plan, NRANK, nr, nr, rstride, rdist, nf, fstride, fdist, CUFFT_R2C, Ndim));
174  gpuErrchkFFT(cufftSetStream(*plan, *stream));
175  }else{
176  gpuErrchkFFT(cufftPlanMany(plan, NRANK, nr, nf, fstride, fdist, nr, rstride, rdist, CUFFT_C2R, Ndim));
177  gpuErrchkFFT(cufftSetStream(*plan, *stream));
178  }
179 
180 }
181 
182 template<typename T>
184 {
185  if (d_data!=NULL)
186  gpuFree((void*) d_data);
187  if (h_data!=NULL)
188  gpuErrchk(cudaFreeHost(h_data));
189  Xdim=Ydim=Zdim=Ndim=yxdim=zyxdim=nzyxdim=0;
190  d_data=NULL;
191  h_data=NULL;
192 }
193 // explicit instantiation
194 template void TransformMatrix<float>::clear();
195 
196 void gpuMalloc(void** d_data, size_t Nbytes)
197 {
198  gpuErrchk(cudaMalloc(d_data, Nbytes));
199 }
200 
201 void gpuFree(void* d_data)
202 {
203  gpuErrchk(cudaFree(d_data));
204 }
205 
206 void cpuMalloc(void** h_data, size_t Nbytes)
207 {
208  gpuErrchk(cudaMallocHost(h_data, Nbytes));
209 }
210 
211 void cpuFree(void* h_data)
212 {
213  gpuErrchk(cudaFreeHost(h_data));
214 }
215 
216 void initializeIdentity(float* d_data, float *h_data, int Ndim, myStreamHandle &myStream)
217 {
218  //float *matrices = new float[Ndim*9];
219  for(int i=0; i<Ndim; i++){
220  float aux_matrix[9] = {1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0};
221  int offset=i*9;
222  for (int j=0; j<9; j++)
223  h_data[offset+j] = aux_matrix[j];
224  }
225  cudaStream_t *stream = (cudaStream_t*) myStream.ptr;
226  gpuErrchk(cudaMemcpyAsync((void*)d_data, h_data, Ndim*9*sizeof(float), cudaMemcpyHostToDevice, *stream));
227  //delete []matrices;
228 
229 }
230 
231 /*void setTranslationMatrix(float* d_data, float posX, float posY, int n)
232 {
233  float matrix[9] = {1, 0, posX, 0, 1, posY, 0, 0, 1};
234  gpuErrchk(cudaMemcpy((void*)&d_data[n*9], &matrix[0], 9*sizeof(float), cudaMemcpyHostToDevice));
235 }*/
236 
237 void setTranslationMatrix(float* d_data, float* posX, float* posY, int Ndim, myStreamHandle &myStream)
238 {
239  float *matrices;
240  gpuErrchk(cudaMallocHost((void**)&matrices, sizeof(float)*Ndim*9));
241 
242  for(int i=0; i<Ndim; i++){
243  float aux_matrix[9] = {1, 0, -posX[i], 0, 1, -posY[i], 0, 0, 1};
244  int offset=i*9;
245  //memcpy(&matrices[offset], &aux_matrix, 9*sizeof(float));
246  for (int j=0; j<9; j++)
247  matrices[offset+j] = aux_matrix[j];
248  }
249  cudaStream_t *stream = (cudaStream_t*) myStream.ptr;
250  gpuErrchk(cudaMemcpyAsync((void*)d_data, matrices, Ndim*9*sizeof(float), cudaMemcpyHostToDevice, *stream));
251  delete []matrices;
252 }
253 
254 /*void setRotationMatrix(float* d_data, float ang, int n)
255 {
256  float rad = (float)(ang*PI/180);
257  float matrix[9] = {cosf(rad), -sinf(rad), 0, sinf(rad), cosf(rad), 0, 0, 0, 1};
258  gpuErrchk(cudaMemcpy((void*)&d_data[n*9], &matrix[0], 9*sizeof(float), cudaMemcpyHostToDevice));
259 }*/
260 void setRotationMatrix(float* d_data, float* ang, int Ndim, myStreamHandle &myStream)
261 {
262 
263  float *rad_vector;
264  gpuErrchk(cudaMallocHost((void**)&rad_vector, sizeof(float)*Ndim*9));
265 
266  for(int i=0; i<Ndim; i++){
267  float rad = (float)(-ang[i]*M_PI/180);
268  float matrix[9] = {cosf(rad), -sinf(rad), 0, sinf(rad), cosf(rad), 0, 0, 0, 1};
269  int offset=i*9;
270  for (int j=0; j<9; j++)
271  rad_vector[offset+j] = matrix[j];
272  }
273  cudaStream_t *stream = (cudaStream_t*) myStream.ptr;
274  gpuErrchk(cudaMemcpyAsync((void*)d_data, rad_vector, Ndim*9*sizeof(float), cudaMemcpyHostToDevice, *stream));
275  delete []rad_vector;
276 }
277 
278 void gpuCopyFromCPUToGPU(void* data, void* d_data, size_t Nbytes)
279 {
280  gpuErrchk(cudaMemcpy(d_data, data, Nbytes, cudaMemcpyHostToDevice));
281 }
282 
283 void gpuCopyFromGPUToCPU(void* d_data, void* data, size_t Nbytes)
284 {
285  gpuErrchk(cudaMemcpy(data, d_data, Nbytes, cudaMemcpyDeviceToHost));
286 }
287 
288 void gpuCopyFromGPUToGPU(void* d_dataFrom, void* d_dataTo, size_t Nbytes)
289 {
290  gpuErrchk(cudaMemcpy(d_dataTo, d_dataFrom, Nbytes, cudaMemcpyDeviceToDevice));
291 }
292 
293 void gpuCopyFromCPUToGPUStream(void* data, void* d_data, size_t Nbytes, myStreamHandle &myStream)
294 {
295  cudaStream_t *stream = (cudaStream_t*) myStream.ptr;
296  gpuErrchk(cudaMemcpyAsync(d_data, data, Nbytes, cudaMemcpyHostToDevice, *stream));
297 
298  //gpuErrchk(cudaStreamSynchronize(*stream));
299 }
300 
301 void gpuCopyFromGPUToCPUStream(void* d_data, void* data, size_t Nbytes, myStreamHandle &myStream)
302 {
303  cudaStream_t *stream = (cudaStream_t*) myStream.ptr;
304  gpuErrchk(cudaMemcpyAsync(data, d_data, Nbytes, cudaMemcpyDeviceToHost, *stream));
305 
306  gpuErrchk(cudaStreamSynchronize(*stream));
307  //cudaDeviceSynchronize();
308 }
309 
310 void gpuCopyFromGPUToGPUStream(void* d_dataFrom, void* d_dataTo, size_t Nbytes, myStreamHandle &myStream)
311 {
312  cudaStream_t *stream = (cudaStream_t*) myStream.ptr;
313  gpuErrchk(cudaMemcpyAsync(d_dataTo, d_dataFrom, Nbytes, cudaMemcpyDeviceToDevice, *stream));
314 }
315 
316 int gridFromBlock(int tasks, int Nthreads)
317 {
318  int numBlk = tasks/Nthreads;
319  if(tasks%Nthreads>0)
320  numBlk++;
321  return numBlk;
322 }
324 {
325  size_t free_byte, total_byte;
326  gpuErrchk(cudaMemGetInfo(&free_byte, &total_byte));
327 
328  float free_db = (float)free_byte;
329  float total_db = (float)total_byte;
330  float used_db = total_db - free_db;
331 
332  data[0]=total_db;
333  data[1]=free_db;
334  data[2]=used_db;
335 }
336 
338 {
339  cudaDeviceProp prop;
340  cudaGetDeviceProperties(&prop, 0);
341  grid[0] = prop.maxGridSize[0];
342  grid[1] = prop.maxGridSize[1];
343  grid[2] = prop.maxGridSize[2];
344 }
345 
346 
347 __device__ cufftComplex CB_pointwiseMultiplicationComplexKernelLoad(void *dataIn, size_t offset,
348  void *callerInfo, void *sharedPtr)
349 {
350 
351  //printf("INSIDEEEE IFFT\n");
352  pointwiseMult *myData = (pointwiseMult*)callerInfo;
353 
354  cufftComplex reference = ((cufftComplex*)dataIn)[offset];
355  cufftComplex *mask = (cufftComplex*)myData->data;
356 
357  int normFactor = myData->normFactor;
358  int indexM = offset%normFactor;
359 
360  float factor = 1.0f / normFactor;
361 
362  cufftComplex mulOut = cuCmulf((cuComplex)reference, (cuComplex)mask[indexM]);
363  cufftComplex out;
364  out.x = mulOut.x*factor;
365  out.y = mulOut.y*factor;
366 
367  //if(offset>9000 && offset<9100)
368  // printf("offset %i, mask %f, data %f, mul %f, factor %f\n", offset, mask[indexM].x, reference.x, out.x, factor);
369 
370  return out;
371 }
373 
374 
375 
376 __device__ void CB_pointwiseMultiplicationComplexKernelStore(void *dataOut, size_t offset, cufftComplex element,
377  void *callerInfo, void *sharedPtr)
378 {
379 
380  pointwiseMult *myData = (pointwiseMult*)callerInfo;
381 
382  cufftComplex *mask = myData->data;
383  int normFactor = myData->normFactor;
384  int indexM = offset%normFactor;
385 
386  float factor = 1.0f / normFactor;
387 
388  cufftComplex mulOut = cuCmulf((cuComplex)element, (cuComplex)mask[indexM]);
389  cufftComplex out;
390  out.x = mulOut.x*factor;
391  out.y = mulOut.y*factor;
392  ((cufftComplex*)dataOut)[offset] = out;
393 
394 }
396 
397 
398 template float* loadToGPU<float>(const float* data, size_t items);
399 template std::complex<float>* loadToGPU<std::complex<float> >(const std::complex<float>* data, size_t items);
400 template<typename T>
401 T* loadToGPU(const T* data, size_t items) {
402 T* d_data;
403 size_t bytes = items * sizeof(T);
404 gpuErrchk(cudaMalloc(&d_data, bytes));
405 gpuErrchk(cudaMemcpy(d_data, data, bytes, cudaMemcpyHostToDevice));
406 return d_data;
407 }
408 
409 template<>
410 void GpuMultidimArrayAtGpu<float>::fftStream(GpuMultidimArrayAtGpu< std::complex<float> > &fourierTransform,
411  mycufftHandle &myhandle, myStreamHandle &myStream, bool useCallback,
412  GpuMultidimArrayAtGpu< std::complex<float> > &dataMask)
413 {
414 
415  int Xfdim=(Xdim/2)+1;
416  //if(fourierTransform.d_data==NULL)
417  fourierTransform.resize(Xfdim,Ydim,Zdim,Ndim);
418  //printf("FFT Xdim %i, Ydim %i, Ndim %i, Zdim %i \n", Xfdim, Ydim, Zdim, Ndim);
419 
420  int positionReal=0;
421  int positionFFT=0;
422  int NdimNew, auxNdim;
423  NdimNew = Ndim;
424  int aux=Ndim;
425 
426  auxNdim=Ndim;
427 
428  cudaStream_t *stream = (cudaStream_t*) myStream.ptr;
429 
430  if(myhandle.ptr!=NULL)
431  NdimNew = Ndim;
432 
433  while(aux>0){
434 
437  if(NdimNew!=Ndim){
438  auxInFFT.resize(Xdim,Ydim,Zdim,NdimNew);
439  gpuCopyFromGPUToGPUStream((cufftReal*)&d_data[positionReal], auxInFFT.d_data, Xdim*Ydim*Zdim*NdimNew*sizeof(cufftReal), myStream);
440  auxOutFFT.resize(Xfdim,Ydim,Zdim,NdimNew);
441  }
442 
443  cufftHandle *planFptr = new cufftHandle;
444  cufftHandle *planAuxFptr = new cufftHandle;
445  if(auxNdim!=NdimNew){
446  createPlanFFTStream(Xdim, Ydim, NdimNew, Zdim, true, planAuxFptr, myStream);
447  }else{
448  if(myhandle.ptr == NULL){
449  //printf("Creating plan FFT\n");
450  createPlanFFTStream(Xdim, Ydim, NdimNew, Zdim, true, planFptr, myStream);
451  myhandle.ptr = (void *)planFptr;
452  planFptr=(cufftHandle *)myhandle.ptr;
453  }else{
454  planFptr=(cufftHandle *)myhandle.ptr;
455  }
456  }
457 
458  //AJ using callbacks
459  if(useCallback){
460 
461  pointwiseMult *dataCB;
462  gpuErrchk(cudaMallocHost((void **)&dataCB, sizeof(pointwiseMult)));
463  dataCB->normFactor = fourierTransform.yxdim;
464  dataCB->data = (cufftComplex*)dataMask.d_data;
465 
466  //printf("Using callbacks %i \n", dataCB->normFactor);
467  //fflush(stdout);
468 
469  // Allocate device memory for parameters
470  pointwiseMult *d_params;
471  gpuErrchk(cudaMalloc((void **)&d_params, sizeof(pointwiseMult)));
472 
473  // Copy host memory to device
474  gpuErrchk(cudaMemcpyAsync(d_params, dataCB, sizeof(pointwiseMult), cudaMemcpyHostToDevice, *stream));
475 
476  cufftCallbackStoreC h_pointwiseMultiplicationComplexKernel;
477  gpuErrchk(cudaMemcpyFromSymbolAsync(&h_pointwiseMultiplicationComplexKernel,
479  sizeof(h_pointwiseMultiplicationComplexKernel), 0, cudaMemcpyDeviceToHost, *stream));
480 
481  cufftResult status = cufftXtSetCallback(*planFptr,
482  (void **)&h_pointwiseMultiplicationComplexKernel,
483  CUFFT_CB_ST_COMPLEX,
484  (void **)&d_params);
485  }
486  //END AJ
487 
488  if(auxNdim==NdimNew){
489  if(NdimNew!=Ndim){
490  gpuErrchkFFT(cufftExecR2C(*planFptr, auxInFFT.d_data, auxOutFFT.d_data));
491  }else{
492  gpuErrchkFFT(cufftExecR2C(*planFptr, (cufftReal*)&d_data[positionReal], (cufftComplex*)&fourierTransform.d_data[positionFFT]));
493  }
494  }else{
495  if(NdimNew!=Ndim){
496  gpuErrchkFFT(cufftExecR2C(*planAuxFptr, auxInFFT.d_data, auxOutFFT.d_data));
497  }else{
498  gpuErrchkFFT(cufftExecR2C(*planAuxFptr, (cufftReal*)&d_data[positionReal], (cufftComplex*)&fourierTransform.d_data[positionFFT]));
499  }
500  }
501 
502 
503  if(NdimNew!=Ndim){
504  gpuCopyFromGPUToGPUStream(auxOutFFT.d_data, (cufftComplex*)&fourierTransform.d_data[positionFFT], Xfdim*Ydim*Zdim*NdimNew*sizeof(cufftComplex), myStream);
505  auxOutFFT.clear();
506  auxInFFT.clear();
507  }
508 
509  auxNdim=NdimNew;
510 
511  positionReal+=(NdimNew*Xdim*Ydim*Zdim);
512  positionFFT+=(NdimNew*Xfdim*Ydim*Zdim);
513  aux-=NdimNew;
514  if(aux<NdimNew)
515  NdimNew=aux;
516 
517  if(auxNdim!=NdimNew && NdimNew!=0)
518  cufftDestroy(*planAuxFptr);
519 
520 
521  gpuErrchk(cudaStreamSynchronize(*stream));
522 
523  }//AJ end while
524 }
525 
526 
527 template<>
528 template<>
529 void GpuMultidimArrayAtGpu<float>::fft(GpuMultidimArrayAtGpu< std::complex<float> > &fourierTransform, mycufftHandle &myhandle)
530 {
531 
532  int Xfdim=(Xdim/2)+1;
533  fourierTransform.resize(Xfdim,Ydim,Zdim,Ndim);
534 
535  int positionReal=0;
536  int positionFFT=0;
537  size_t NdimNew, auxNdim;
538  NdimNew = Ndim;
539  size_t aux=Ndim;
540 
541  auxNdim=Ndim;
542 
543  if(myhandle.ptr!=NULL) {
544  NdimNew = Ndim;
545  }
546 
547  while(aux>0){
548 
551  if(NdimNew!=Ndim){
552  auxInFFT.resize(Xdim,Ydim,Zdim,NdimNew);
553  gpuCopyFromGPUToGPU((cufftReal*)&d_data[positionReal], auxInFFT.d_data, Xdim*Ydim*Zdim*NdimNew*sizeof(cufftReal));
554  auxOutFFT.resize(Xfdim,Ydim,Zdim,NdimNew);
555  }
556 
557  cufftHandle *planFptr = NULL;
558  cufftHandle *planAuxFptr = NULL;
559  if(auxNdim!=NdimNew){
560  planAuxFptr = new cufftHandle;
561  createPlanFFT(Xdim, Ydim, NdimNew, Zdim, true, planAuxFptr);
562  }else{
563  if(myhandle.ptr == NULL){
564  myhandle.ptr = planFptr = new cufftHandle;
565  createPlanFFT(Xdim, Ydim, NdimNew, Zdim, true, planFptr);
566  }
567  planFptr=(cufftHandle *)myhandle.ptr;
568  }
569 
570  if(auxNdim==NdimNew){
571  if(NdimNew!=Ndim){
572  gpuErrchkFFT(cufftExecR2C(*planFptr, auxInFFT.d_data, auxOutFFT.d_data));
573  }else{
574  gpuErrchkFFT(cufftExecR2C(*planFptr, (cufftReal*)&d_data[positionReal], (cufftComplex*)&fourierTransform.d_data[positionFFT]));
575  }
576  }else{
577  if(NdimNew!=Ndim){
578  gpuErrchkFFT(cufftExecR2C(*planAuxFptr, auxInFFT.d_data, auxOutFFT.d_data));
579  }else{
580  gpuErrchkFFT(cufftExecR2C(*planAuxFptr, (cufftReal*)&d_data[positionReal], (cufftComplex*)&fourierTransform.d_data[positionFFT]));
581  }
582  }
583 
584  gpuErrchk(cudaDeviceSynchronize());
585 
586  if(NdimNew!=Ndim){
587  gpuCopyFromGPUToGPU(auxOutFFT.d_data, (cufftComplex*)&fourierTransform.d_data[positionFFT], Xfdim*Ydim*Zdim*NdimNew*sizeof(cufftComplex));
588  auxOutFFT.clear();
589  auxInFFT.clear();
590  }
591 
592  auxNdim=NdimNew;
593 
594  positionReal+=(NdimNew*Xdim*Ydim*Zdim);
595  positionFFT+=(NdimNew*Xfdim*Ydim*Zdim);
596  aux-=NdimNew;
597  if(aux<NdimNew)
598  NdimNew=aux;
599 
600  if (NULL != planAuxFptr) {
601  cufftDestroy(*planAuxFptr); // destroy if created
602  }
603 
604  }//AJ end while
605 
606 }
607 
608 template<>
609 template<>
611  mycufftHandle &myhandle, myStreamHandle &myStream, bool useCallback,
612  GpuMultidimArrayAtGpu< std::complex<float> > &dataMask)
613 {
614 
615  int Xfdim=(realSpace.Xdim/2)+1;
616 
617  //printf("FFT Xdim %i, Ydim %i, Ndim %i, Zdim %i Xfdim %i \n", Xdim, Ydim, Zdim, Ndim, Xfdim);
618 
619  int positionReal=0;
620  int positionFFT=0;
621  int NdimNew, auxNdim;
622  NdimNew = realSpace.Ndim;
623  int aux=realSpace.Ndim;
624 
625  auxNdim=realSpace.Ndim;
626 
627  cudaStream_t *stream = (cudaStream_t*) myStream.ptr;
628 
629  if(myhandle.ptr!=NULL)
630  NdimNew = Ndim;
631 
632  while(aux>0){
633 
636  if(NdimNew!=Ndim){
637  auxInFFT.resize(Xfdim,realSpace.Ydim,realSpace.Zdim,NdimNew);
638  gpuCopyFromGPUToGPUStream((cufftComplex*)&d_data[positionFFT], auxInFFT.d_data, Xfdim*realSpace.Ydim*realSpace.Zdim*NdimNew*sizeof(cufftComplex), myStream);
639  auxOutFFT.resize(realSpace.Xdim,realSpace.Ydim,realSpace.Zdim, NdimNew);
640  }
641 
642  cufftHandle *planBptr = new cufftHandle;
643  cufftHandle *planAuxBptr = new cufftHandle;
644  if(auxNdim!=NdimNew){
645  createPlanFFTStream(Xdim, Ydim, NdimNew, Zdim, false, planAuxBptr, myStream);
646  }else{
647  if(myhandle.ptr == NULL){
648  //printf("Creating plan IFFT\n");
649  createPlanFFTStream(realSpace.Xdim, realSpace.Ydim, NdimNew, Zdim, false, planBptr, myStream);
650  myhandle.ptr = (void *)planBptr;
651  planBptr=(cufftHandle *)myhandle.ptr;
652  }else{
653  planBptr=(cufftHandle *)myhandle.ptr;
654  }
655  }
656 
657  //AJ using callbacks
658  if(useCallback){
659  //printf("Using callbacks\n");
660  pointwiseMult *dataCB;
661  gpuErrchk(cudaMallocHost((void **)&dataCB, sizeof(pointwiseMult)));
662  dataCB->normFactor = dataMask.yxdim;
663  dataCB->data = (cufftComplex*)dataMask.d_data;
664 
665  // Allocate device memory for parameters
666  pointwiseMult *d_params;
667  gpuErrchk(cudaMalloc((void **)&d_params, sizeof(pointwiseMult)));
668 
669  // Copy host memory to device
670  gpuErrchk(cudaMemcpyAsync(d_params, dataCB, sizeof(pointwiseMult), cudaMemcpyHostToDevice, *stream));
671 
672 
673  cufftCallbackLoadC h_pointwiseMultiplicationComplexKernel;
674  gpuErrchk(cudaMemcpyFromSymbolAsync(&h_pointwiseMultiplicationComplexKernel,
676  sizeof(h_pointwiseMultiplicationComplexKernel), 0, cudaMemcpyDeviceToHost, *stream));
677 
678  cufftResult status = cufftXtSetCallback(*planBptr,
679  (void **)&h_pointwiseMultiplicationComplexKernel,
680  CUFFT_CB_LD_COMPLEX,
681  (void **)&d_params);
682  }
683  //END AJ
684 
685  if(auxNdim==NdimNew){
686  if(NdimNew!=Ndim){
687  gpuErrchkFFT(cufftExecC2R(*planBptr, auxInFFT.d_data, auxOutFFT.d_data));
688  }else{
689  gpuErrchkFFT(cufftExecC2R(*planBptr, (cufftComplex *)&d_data[positionFFT], (cufftReal*)&realSpace.d_data[positionReal]));
690  }
691  }else{
692  if(NdimNew!=Ndim){
693  gpuErrchkFFT(cufftExecC2R(*planAuxBptr, auxInFFT.d_data, auxOutFFT.d_data));
694  }else{
695  gpuErrchkFFT(cufftExecC2R(*planAuxBptr, (cufftComplex *)&d_data[positionFFT], (cufftReal*)&realSpace.d_data[positionReal]));
696  }
697  }
698 
699  gpuErrchk(cudaStreamSynchronize(*stream));
700 
701  if(NdimNew!=Ndim){
702  gpuCopyFromGPUToGPUStream(auxOutFFT.d_data, (cufftReal*)&realSpace.d_data[positionReal], realSpace.Xdim*realSpace.Ydim*realSpace.Zdim*NdimNew*sizeof(cufftReal), myStream);
703  auxOutFFT.clear();
704  auxInFFT.clear();
705  }
706 
707  auxNdim=NdimNew;
708 
709  positionReal+=(NdimNew*realSpace.Xdim*realSpace.Ydim*realSpace.Zdim);
710  positionFFT+=(NdimNew*Xfdim*realSpace.Ydim*realSpace.Zdim);
711  aux-=NdimNew;
712  if(aux<NdimNew)
713  NdimNew=aux;
714 
715  if(auxNdim!=NdimNew && NdimNew!=0)
716  cufftDestroy(*planAuxBptr);
717 
718  }//AJ end while
719 
720 }
721 
722 template<>
723 template<>
725 {
726 
727  int Xfdim=(realSpace.Xdim/2)+1;
728 
729  int positionReal=0;
730  int positionFFT=0;
731  size_t NdimNew, auxNdim;
732  NdimNew = realSpace.Ndim;
733  size_t aux=realSpace.Ndim;
734 
735  auxNdim=realSpace.Ndim;
736 
737  if(myhandle.ptr!=NULL)
738  NdimNew = Ndim;
739 
740  while(aux>0){
741 
744  if(NdimNew!=Ndim){
745  auxInFFT.resize(Xfdim,realSpace.Ydim,realSpace.Zdim,NdimNew);
746  gpuCopyFromGPUToGPU((cufftComplex*)&d_data[positionFFT], auxInFFT.d_data, Xfdim*realSpace.Ydim*realSpace.Zdim*NdimNew*sizeof(cufftComplex));
747  auxOutFFT.resize(realSpace.Xdim,realSpace.Ydim,realSpace.Zdim, NdimNew);
748  }
749 
750  cufftHandle *planBptr = nullptr;
751  cufftHandle *planAuxBptr = nullptr;
752  if(auxNdim!=NdimNew){
753  planAuxBptr = new cufftHandle;
754  createPlanFFT(Xdim, Ydim, NdimNew, Zdim, false, planAuxBptr);
755  }else{
756  if(nullptr == myhandle.ptr){
757  myhandle.ptr = planBptr = new cufftHandle;
758  createPlanFFT(realSpace.Xdim, realSpace.Ydim, NdimNew, Zdim, false, planBptr);
759  }
760  planBptr=(cufftHandle *)myhandle.ptr;
761  }
762 
763  if(auxNdim==NdimNew){
764  if(NdimNew!=Ndim){
765  gpuErrchkFFT(cufftExecC2R(*planBptr, auxInFFT.d_data, auxOutFFT.d_data));
766  }else{
767  gpuErrchkFFT(cufftExecC2R(*planBptr, (cufftComplex *)&d_data[positionFFT], (cufftReal*)&realSpace.d_data[positionReal]));
768  }
769  }else{
770  if(NdimNew!=Ndim){
771  gpuErrchkFFT(cufftExecC2R(*planAuxBptr, auxInFFT.d_data, auxOutFFT.d_data));
772  }else{
773  gpuErrchkFFT(cufftExecC2R(*planAuxBptr, (cufftComplex *)&d_data[positionFFT], (cufftReal*)&realSpace.d_data[positionReal]));
774  }
775  }
776 
777 
778  gpuErrchk(cudaDeviceSynchronize());
779 
780  if(NdimNew!=Ndim){
781  gpuCopyFromGPUToGPU(auxOutFFT.d_data, (cufftReal*)&realSpace.d_data[positionReal], realSpace.Xdim*realSpace.Ydim*realSpace.Zdim*NdimNew*sizeof(cufftReal));
782  auxOutFFT.clear();
783  auxInFFT.clear();
784  }
785 
786  auxNdim=NdimNew;
787 
788  positionReal+=(NdimNew*realSpace.Xdim*realSpace.Ydim*realSpace.Zdim);
789  positionFFT+=(NdimNew*Xfdim*realSpace.Ydim*realSpace.Zdim);
790  aux-=NdimNew;
791  if(aux<NdimNew)
792  NdimNew=aux;
793 
794  if(nullptr != planAuxBptr)
795  cufftDestroy(*planAuxBptr); // destroy if created
796 
797  }//AJ end while
798 
799 }
#define gpuErrchk(code)
Definition: cuda_asserts.h:31
void resize(const GpuMultidimArrayAtGpu< T1 > &array)
void gpuMalloc(void **d_data, size_t Nbytes)
void mycufftDestroy(void *ptr)
__device__ cufftComplex CB_pointwiseMultiplicationComplexKernelLoad(void *dataIn, size_t offset, void *callerInfo, void *sharedPtr)
void gpuFree(void *d_data)
T * loadToGPU(const T *data, size_t items)
cufftComplex * data
void calculateFFTPlanSize(mycufftHandle &myhandle)
void createPlanFFTStream(int Xdim, int Ydim, int Ndim, int Zdim, bool forward, cufftHandle *plan, myStreamHandle &myStream)
void resize(const TransformMatrix< T1 > &array, myStreamHandle &myStream)
#define i
int cufftHandle
Definition: cuda_fft.h:41
void nf
void gpuCopyFromGPUToCPUStream(void *d_data, void *data, size_t Nbytes, myStreamHandle &myStream)
void cuda_check_gpu_memory(float *data)
#define gpuErrchkFFT(code)
Definition: cuda_asserts.h:32
void gpuCopyFromCPUToGPU(void *data, void *d_data, size_t Nbytes)
__device__ cufftCallbackLoadC d_pointwiseMultiplicationComplexKernelLoad
void setRotationMatrix(float *d_data, float *ang, int Ndim, myStreamHandle &myStream)
void gpuCopyFromGPUToGPUStream(void *d_dataFrom, void *d_dataTo, size_t Nbytes, myStreamHandle &myStream)
void myStreamCreate(myStreamHandle &myStream)
void cpuFree(void *h_data)
int gridFromBlock(int tasks, int Nthreads)
#define j
void gpuCopyFromCPUToGPUStream(void *data, void *d_data, size_t Nbytes, myStreamHandle &myStream)
void fftStream(GpuMultidimArrayAtGpu< std::complex< float >> &fourierTransform, mycufftHandle &myhandle, myStreamHandle &myStream, bool useCallback, GpuMultidimArrayAtGpu< std::complex< float >> &dataRef)
void initializeIdentity(float *d_data, float *h_data, int Ndim, myStreamHandle &myStream)
void setTranslationMatrix(float *d_data, float *posX, float *posY, int Ndim, myStreamHandle &myStream)
void createPlanFFT(int Xdim, int Ydim, int Ndim, int Zdim, bool forward, cufftHandle *plan)
void cuda_check_gpu_properties(int *grid)
void gpuCopyFromGPUToGPU(void *d_dataFrom, void *d_dataTo, size_t Nbytes)
void fft(GpuMultidimArrayAtGpu< T1 > &fourierTransform, mycufftHandle &myhandle)
void cpuMalloc(void **h_data, size_t Nbytes)
template float * loadToGPU< float >(const float *data, size_t items)
__device__ void CB_pointwiseMultiplicationComplexKernelStore(void *dataOut, size_t offset, cufftComplex element, void *callerInfo, void *sharedPtr)
__device__ cufftCallbackStoreC d_pointwiseMultiplicationComplexKernelStore
void myStreamDestroy(void *ptr)
void gpuCopyFromGPUToCPU(void *d_data, void *data, size_t Nbytes)