6 #include <cuda_runtime.h> 22 size_t _Ndim,
size_t _Xdim,
size_t _Ydim,
size_t _Zdim) {
23 if (_Xdim*_Ydim*_Zdim*_Ndim==nzyxdim)
35 gpuErrchk(cudaMalloc(&d_data,nzyxdim*
sizeof(T)));
36 gpuErrchk(cudaMallocHost(&h_data,nzyxdim*
sizeof(T)));
41 size_t _Ndim,
size_t _Xdim,
size_t _Ydim,
size_t _Zdim);
46 if (_Xdim*_Ydim*_Zdim*_Ndim==nzyxdim){
53 setDims(_Xdim, _Ydim, _Zdim, _Ndim);
54 gpuErrchk(cudaMalloc(&d_data,nzyxdim*
sizeof(T)));
// Destroy a CUDA stream previously created by myStreamCreate.
// `ptr` is the opaque handle stored in myStreamHandle::ptr, i.e. a
// heap-allocated cudaStream_t* (myStreamCreate allocates it with `new`).
void myStreamDestroy(void *ptr)
{
	cudaStream_t *streamPtr = (cudaStream_t *)ptr;
	// Check the return code; the original call ignored it.
	gpuErrchk(cudaStreamDestroy(*streamPtr));
	// Release the handle itself: myStreamCreate allocates it with `new`,
	// so without this delete every create/destroy cycle leaks a cudaStream_t.
	delete streamPtr;
}
// Create a CUDA stream and publish it through the opaque handle.
// The cudaStream_t is heap-allocated here; myStreamDestroy must be called
// to destroy the stream and free the allocation.
void myStreamCreate(myStreamHandle &myStream)
{
	cudaStream_t *streamPtr = new cudaStream_t;
	// NOTE(review): the stream-creation call was garbled out of the source
	// under review; cudaStreamCreate is the standard completion between
	// allocating the handle and storing it — confirm against version control.
	gpuErrchk(cudaStreamCreate(streamPtr));
	myStream.ptr = (void *)streamPtr;
}
75 cufftDestroy(*planPtr);
80 printf(
"calculateFFTPlanSize myhandle.ptr: %p\n",myhandle.
ptr);
83 cufftGetSize(*planFptr, &ws2);
84 printf(
"calculateFFTPlanSize size %i\n", (
int)ws2);
93 int nr2[] = {Ydim, Xdim};
94 int nr3[] = {Zdim, Ydim, Xdim};
97 int nf2[] = {Ydim, Xfdim};
98 int nf3[] = {Zdim, Ydim, Xfdim};
99 int *nr=NULL, *
nf=NULL;
101 if (Ydim==1 && Zdim==1)
122 int rdist = Xdim*Ydim*Zdim;
123 int fdist = Xfdim*Ydim*Zdim;
126 gpuErrchkFFT(cufftPlanMany(plan, NRANK, nr, nr, rstride, rdist, nf, fstride, fdist, CUFFT_R2C, Ndim));
128 gpuErrchkFFT(cufftPlanMany(plan, NRANK, nr, nf, fstride, fdist, nr, rstride, rdist, CUFFT_C2R, Ndim));
136 int Xfdim=(Xdim/2)+1;
139 int nr2[] = {Ydim, Xdim};
140 int nr3[] = {Zdim, Ydim, Xdim};
143 int nf2[] = {Ydim, Xfdim};
144 int nf3[] = {Zdim, Ydim, Xfdim};
145 int *nr=NULL, *
nf=NULL;
147 if (Ydim==1 && Zdim==1)
168 int rdist = Xdim*Ydim*Zdim;
169 int fdist = Xfdim*Ydim*Zdim;
171 cudaStream_t *stream = (cudaStream_t*) myStream.
ptr;
173 gpuErrchkFFT(cufftPlanMany(plan, NRANK, nr, nr, rstride, rdist, nf, fstride, fdist, CUFFT_R2C, Ndim));
176 gpuErrchkFFT(cufftPlanMany(plan, NRANK, nr, nf, fstride, fdist, nr, rstride, rdist, CUFFT_C2R, Ndim));
189 Xdim=Ydim=Zdim=Ndim=yxdim=zyxdim=nzyxdim=0;
208 gpuErrchk(cudaMallocHost(h_data, Nbytes));
219 for(
int i=0;
i<Ndim;
i++){
220 float aux_matrix[9] = {1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0};
222 for (
int j=0;
j<9;
j++)
223 h_data[offset+
j] = aux_matrix[
j];
225 cudaStream_t *stream = (cudaStream_t*) myStream.
ptr;
226 gpuErrchk(cudaMemcpyAsync((
void*)d_data, h_data, Ndim*9*
sizeof(
float), cudaMemcpyHostToDevice, *stream));
240 gpuErrchk(cudaMallocHost((
void**)&matrices,
sizeof(
float)*Ndim*9));
242 for(
int i=0;
i<Ndim;
i++){
243 float aux_matrix[9] = {1, 0, -posX[
i], 0, 1, -posY[
i], 0, 0, 1};
246 for (
int j=0;
j<9;
j++)
247 matrices[offset+
j] = aux_matrix[
j];
249 cudaStream_t *stream = (cudaStream_t*) myStream.
ptr;
250 gpuErrchk(cudaMemcpyAsync((
void*)d_data, matrices, Ndim*9*
sizeof(
float), cudaMemcpyHostToDevice, *stream));
264 gpuErrchk(cudaMallocHost((
void**)&rad_vector,
sizeof(
float)*Ndim*9));
266 for(
int i=0;
i<Ndim;
i++){
267 float rad = (float)(-ang[
i]*M_PI/180);
268 float matrix[9] = {cosf(rad), -sinf(rad), 0, sinf(rad), cosf(rad), 0, 0, 0, 1};
270 for (
int j=0;
j<9;
j++)
271 rad_vector[offset+
j] = matrix[
j];
273 cudaStream_t *stream = (cudaStream_t*) myStream.
ptr;
274 gpuErrchk(cudaMemcpyAsync((
void*)d_data, rad_vector, Ndim*9*
sizeof(
float), cudaMemcpyHostToDevice, *stream));
280 gpuErrchk(cudaMemcpy(d_data, data, Nbytes, cudaMemcpyHostToDevice));
285 gpuErrchk(cudaMemcpy(data, d_data, Nbytes, cudaMemcpyDeviceToHost));
290 gpuErrchk(cudaMemcpy(d_dataTo, d_dataFrom, Nbytes, cudaMemcpyDeviceToDevice));
// Asynchronous host -> device copy of `Nbytes` on the stream wrapped by
// `myStream`. Returns immediately; the caller must synchronize the stream
// before reusing `data`. NOTE(review): the copy is only truly asynchronous
// if `data` is pinned (cudaMallocHost) — confirm at the call sites.
void gpuCopyFromCPUToGPUStream(void *data, void *d_data, size_t Nbytes, myStreamHandle &myStream)
{
	cudaStream_t *streamHandle = static_cast<cudaStream_t *>(myStream.ptr);
	gpuErrchk(cudaMemcpyAsync(d_data, data, Nbytes, cudaMemcpyHostToDevice, *streamHandle));
}
// Device -> host copy on the given stream. Unlike the other *Stream copies,
// this one synchronizes the stream before returning, so the host buffer
// `data` is guaranteed valid to read once the call completes.
void gpuCopyFromGPUToCPUStream(void *d_data, void *data, size_t Nbytes, myStreamHandle &myStream)
{
	cudaStream_t *streamHandle = static_cast<cudaStream_t *>(myStream.ptr);
	gpuErrchk(cudaMemcpyAsync(data, d_data, Nbytes, cudaMemcpyDeviceToHost, *streamHandle));
	// Wait for the copy (and all prior work on this stream) to finish.
	gpuErrchk(cudaStreamSynchronize(*streamHandle));
}
// Asynchronous device -> device copy of `Nbytes` from `d_dataFrom` to
// `d_dataTo` on the stream wrapped by `myStream`. No synchronization is
// performed here; ordering is guaranteed only with respect to this stream.
void gpuCopyFromGPUToGPUStream(void *d_dataFrom, void *d_dataTo, size_t Nbytes, myStreamHandle &myStream)
{
	cudaStream_t *streamHandle = static_cast<cudaStream_t *>(myStream.ptr);
	gpuErrchk(cudaMemcpyAsync(d_dataTo, d_dataFrom, Nbytes, cudaMemcpyDeviceToDevice, *streamHandle));
}
318 int numBlk = tasks/Nthreads;
// Query current GPU memory usage and report it through `data`.
// data[0] = total bytes, data[1] = free bytes, data[2] = used bytes,
// all converted to float.
void cuda_check_gpu_memory(float *data)
{
	size_t free_byte, total_byte;
	gpuErrchk(cudaMemGetInfo(&free_byte, &total_byte));

	float free_db = (float)free_byte;
	float total_db = (float)total_byte;
	float used_db = total_db - free_db;

	// NOTE(review): the lines writing the results into `data` were garbled
	// out of the source under review; the total/free/used ordering below is
	// the natural completion — confirm against callers/version control.
	data[0] = total_db;
	data[1] = free_db;
	data[2] = used_db;
}
// Fill grid[0..2] with the maximum grid dimensions (x, y, z) of device 0.
// NOTE(review): the device index is hard-coded to 0; if the application
// selects another device, this should query the current device instead.
void cuda_check_gpu_properties(int *grid)
{
	cudaDeviceProp prop;
	// Check the return code; the original call ignored it.
	gpuErrchk(cudaGetDeviceProperties(&prop, 0));
	grid[0] = prop.maxGridSize[0];
	grid[1] = prop.maxGridSize[1];
	grid[2] = prop.maxGridSize[2];
}
348 void *callerInfo,
void *sharedPtr)
354 cufftComplex reference = ((cufftComplex*)dataIn)[offset];
355 cufftComplex *mask = (cufftComplex*)myData->
data;
362 cufftComplex mulOut = cuCmulf((cuComplex)reference, (cuComplex)mask[indexM]);
364 out.x = mulOut.x*factor;
365 out.y = mulOut.y*factor;
377 void *callerInfo,
void *sharedPtr)
382 cufftComplex *mask = myData->
data;
388 cufftComplex mulOut = cuCmulf((cuComplex)element, (cuComplex)mask[indexM]);
390 out.x = mulOut.x*factor;
391 out.y = mulOut.y*factor;
392 ((cufftComplex*)dataOut)[offset] = out;
399 template std::complex<float>* loadToGPU<std::complex<float> >(
const std::complex<float>*
data,
size_t items);
403 size_t bytes = items *
sizeof(T);
405 gpuErrchk(cudaMemcpy(d_data, data, bytes, cudaMemcpyHostToDevice));
415 int Xfdim=(Xdim/2)+1;
417 fourierTransform.resize(Xfdim,Ydim,Zdim,Ndim);
422 int NdimNew, auxNdim;
428 cudaStream_t *stream = (cudaStream_t*) myStream.
ptr;
430 if(myhandle.
ptr!=NULL)
438 auxInFFT.
resize(Xdim,Ydim,Zdim,NdimNew);
440 auxOutFFT.
resize(Xfdim,Ydim,Zdim,NdimNew);
445 if(auxNdim!=NdimNew){
448 if(myhandle.
ptr == NULL){
451 myhandle.
ptr = (
void *)planFptr;
463 dataCB->normFactor = fourierTransform.yxdim;
464 dataCB->data = (cufftComplex*)dataMask.d_data;
476 cufftCallbackStoreC h_pointwiseMultiplicationComplexKernel;
477 gpuErrchk(cudaMemcpyFromSymbolAsync(&h_pointwiseMultiplicationComplexKernel,
479 sizeof(h_pointwiseMultiplicationComplexKernel), 0, cudaMemcpyDeviceToHost, *stream));
481 cufftResult status = cufftXtSetCallback(*planFptr,
482 (
void **)&h_pointwiseMultiplicationComplexKernel,
488 if(auxNdim==NdimNew){
492 gpuErrchkFFT(cufftExecR2C(*planFptr, (cufftReal*)&d_data[positionReal], (cufftComplex*)&fourierTransform.d_data[positionFFT]));
498 gpuErrchkFFT(cufftExecR2C(*planAuxFptr, (cufftReal*)&d_data[positionReal], (cufftComplex*)&fourierTransform.d_data[positionFFT]));
504 gpuCopyFromGPUToGPUStream(auxOutFFT.
d_data, (cufftComplex*)&fourierTransform.d_data[positionFFT], Xfdim*Ydim*Zdim*NdimNew*
sizeof(cufftComplex), myStream);
511 positionReal+=(NdimNew*Xdim*Ydim*Zdim);
512 positionFFT+=(NdimNew*Xfdim*Ydim*Zdim);
517 if(auxNdim!=NdimNew && NdimNew!=0)
518 cufftDestroy(*planAuxFptr);
521 gpuErrchk(cudaStreamSynchronize(*stream));
532 int Xfdim=(Xdim/2)+1;
533 fourierTransform.resize(Xfdim,Ydim,Zdim,Ndim);
537 size_t NdimNew, auxNdim;
543 if(myhandle.
ptr!=NULL) {
552 auxInFFT.
resize(Xdim,Ydim,Zdim,NdimNew);
554 auxOutFFT.
resize(Xfdim,Ydim,Zdim,NdimNew);
559 if(auxNdim!=NdimNew){
563 if(myhandle.
ptr == NULL){
570 if(auxNdim==NdimNew){
574 gpuErrchkFFT(cufftExecR2C(*planFptr, (cufftReal*)&d_data[positionReal], (cufftComplex*)&fourierTransform.d_data[positionFFT]));
580 gpuErrchkFFT(cufftExecR2C(*planAuxFptr, (cufftReal*)&d_data[positionReal], (cufftComplex*)&fourierTransform.d_data[positionFFT]));
587 gpuCopyFromGPUToGPU(auxOutFFT.
d_data, (cufftComplex*)&fourierTransform.d_data[positionFFT], Xfdim*Ydim*Zdim*NdimNew*
sizeof(cufftComplex));
594 positionReal+=(NdimNew*Xdim*Ydim*Zdim);
595 positionFFT+=(NdimNew*Xfdim*Ydim*Zdim);
600 if (NULL != planAuxFptr) {
601 cufftDestroy(*planAuxFptr);
615 int Xfdim=(realSpace.
Xdim/2)+1;
621 int NdimNew, auxNdim;
622 NdimNew = realSpace.
Ndim;
623 int aux=realSpace.
Ndim;
625 auxNdim=realSpace.
Ndim;
627 cudaStream_t *stream = (cudaStream_t*) myStream.
ptr;
629 if(myhandle.
ptr!=NULL)
644 if(auxNdim!=NdimNew){
647 if(myhandle.
ptr == NULL){
650 myhandle.
ptr = (
void *)planBptr;
662 dataCB->normFactor = dataMask.yxdim;
663 dataCB->data = (cufftComplex*)dataMask.d_data;
673 cufftCallbackLoadC h_pointwiseMultiplicationComplexKernel;
674 gpuErrchk(cudaMemcpyFromSymbolAsync(&h_pointwiseMultiplicationComplexKernel,
676 sizeof(h_pointwiseMultiplicationComplexKernel), 0, cudaMemcpyDeviceToHost, *stream));
678 cufftResult status = cufftXtSetCallback(*planBptr,
679 (
void **)&h_pointwiseMultiplicationComplexKernel,
685 if(auxNdim==NdimNew){
689 gpuErrchkFFT(cufftExecC2R(*planBptr, (cufftComplex *)&d_data[positionFFT], (cufftReal*)&realSpace.
d_data[positionReal]));
695 gpuErrchkFFT(cufftExecC2R(*planAuxBptr, (cufftComplex *)&d_data[positionFFT], (cufftReal*)&realSpace.
d_data[positionReal]));
699 gpuErrchk(cudaStreamSynchronize(*stream));
709 positionReal+=(NdimNew*realSpace.
Xdim*realSpace.
Ydim*realSpace.
Zdim);
710 positionFFT+=(NdimNew*Xfdim*realSpace.
Ydim*realSpace.
Zdim);
715 if(auxNdim!=NdimNew && NdimNew!=0)
716 cufftDestroy(*planAuxBptr);
727 int Xfdim=(realSpace.
Xdim/2)+1;
731 size_t NdimNew, auxNdim;
732 NdimNew = realSpace.
Ndim;
733 size_t aux=realSpace.
Ndim;
735 auxNdim=realSpace.
Ndim;
737 if(myhandle.
ptr!=NULL)
752 if(auxNdim!=NdimNew){
754 createPlanFFT(Xdim, Ydim, NdimNew, Zdim,
false, planAuxBptr);
756 if(
nullptr == myhandle.
ptr){
763 if(auxNdim==NdimNew){
767 gpuErrchkFFT(cufftExecC2R(*planBptr, (cufftComplex *)&d_data[positionFFT], (cufftReal*)&realSpace.
d_data[positionReal]));
773 gpuErrchkFFT(cufftExecC2R(*planAuxBptr, (cufftComplex *)&d_data[positionFFT], (cufftReal*)&realSpace.
d_data[positionReal]));
788 positionReal+=(NdimNew*realSpace.
Xdim*realSpace.
Ydim*realSpace.
Zdim);
789 positionFFT+=(NdimNew*Xfdim*realSpace.
Ydim*realSpace.
Zdim);
794 if(
nullptr != planAuxBptr)
795 cufftDestroy(*planAuxBptr);
void resize(const GpuMultidimArrayAtGpu< T1 > &array)
void gpuMalloc(void **d_data, size_t Nbytes)
void mycufftDestroy(void *ptr)
__device__ cufftComplex CB_pointwiseMultiplicationComplexKernelLoad(void *dataIn, size_t offset, void *callerInfo, void *sharedPtr)
void gpuFree(void *d_data)
T * loadToGPU(const T *data, size_t items)
void calculateFFTPlanSize(mycufftHandle &myhandle)
void createPlanFFTStream(int Xdim, int Ydim, int Ndim, int Zdim, bool forward, cufftHandle *plan, myStreamHandle &myStream)
void gpuCopyFromGPUToCPUStream(void *d_data, void *data, size_t Nbytes, myStreamHandle &myStream)
void cuda_check_gpu_memory(float *data)
#define gpuErrchkFFT(code)
void gpuCopyFromCPUToGPU(void *data, void *d_data, size_t Nbytes)
__device__ cufftCallbackLoadC d_pointwiseMultiplicationComplexKernelLoad
void setRotationMatrix(float *d_data, float *ang, int Ndim, myStreamHandle &myStream)
void gpuCopyFromGPUToGPUStream(void *d_dataFrom, void *d_dataTo, size_t Nbytes, myStreamHandle &myStream)
void myStreamCreate(myStreamHandle &myStream)
void cpuFree(void *h_data)
int gridFromBlock(int tasks, int Nthreads)
void gpuCopyFromCPUToGPUStream(void *data, void *d_data, size_t Nbytes, myStreamHandle &myStream)
void fftStream(GpuMultidimArrayAtGpu< std::complex< float >> &fourierTransform, mycufftHandle &myhandle, myStreamHandle &myStream, bool useCallback, GpuMultidimArrayAtGpu< std::complex< float >> &dataRef)
void initializeIdentity(float *d_data, float *h_data, int Ndim, myStreamHandle &myStream)
void setTranslationMatrix(float *d_data, float *posX, float *posY, int Ndim, myStreamHandle &myStream)
void createPlanFFT(int Xdim, int Ydim, int Ndim, int Zdim, bool forward, cufftHandle *plan)
void cuda_check_gpu_properties(int *grid)
void gpuCopyFromGPUToGPU(void *d_dataFrom, void *d_dataTo, size_t Nbytes)
void fft(GpuMultidimArrayAtGpu< T1 > &fourierTransform, mycufftHandle &myhandle)
void cpuMalloc(void **h_data, size_t Nbytes)
template float * loadToGPU< float >(const float *data, size_t items)
__device__ void CB_pointwiseMultiplicationComplexKernelStore(void *dataOut, size_t offset, cufftComplex element, void *callerInfo, void *sharedPtr)
__device__ cufftCallbackStoreC d_pointwiseMultiplicationComplexKernelStore
void myStreamDestroy(void *ptr)
void gpuCopyFromGPUToCPU(void *d_data, void *data, size_t Nbytes)