Xmipp  v3.23.11-Nereus
CudaFFT< T > Class Template Reference

#include <cuda_fft.h>

Inheritance diagram for CudaFFT< T >: CudaFFT< T > derives from AFT< T >.

Public Member Functions

 CudaFFT ()
 
 ~CudaFFT ()
 
void init (const HW &gpu, const FFTSettings< T > &settings, bool reuse=true)
 
void release () final
 
std::complex< T > * fft (T *h_inOut)
 
std::complex< T > * fft (const T *h_in, std::complex< T > *h_out)
 
T * ifft (std::complex< T > *h_inOut)
 
T * ifft (const std::complex< T > *h_in, T *h_out)
 
size_t estimatePlanBytes (const FFTSettings< T > &settings)
 
- Public Member Functions inherited from AFT< T >
virtual ~AFT ()
 
virtual size_t estimateTotalBytes (const FFTSettings< T > &settings)
 

Static Public Member Functions

static std::complex< T > * fft (cufftHandle plan, T *d_inOut)
 
static std::complex< T > * fft (cufftHandle plan, const T *d_in, std::complex< T > *d_out)
 
static T * ifft (cufftHandle plan, std::complex< T > *d_inOut)
 
static T * ifft (cufftHandle plan, const std::complex< T > *d_in, T *d_out)
 
static cufftHandle * createPlan (const GPU &gpu, const FFTSettings< T > &settings)
 
static FFTSettings< T > * findOptimal (const GPU &gpu, const FFTSettings< T > &settings, size_t reserveBytes, bool squareOnly, int sigPercChange, bool crop, bool verbose)
 
static FFTSettings< T > findMaxBatch (const FFTSettings< T > &settings, size_t maxBytes)
 
static FFTSettings< T > findOptimalSizeOrMaxBatch (GPU &gpu, const FFTSettings< T > &settings, size_t reserveBytes, bool squareOnly, int sigPercChange, bool crop, bool verbose)
 
static void release (cufftHandle *plan)
 

Additional Inherited Members

Detailed Description

template<typename T>
class CudaFFT< T >

Definition at line 47 of file cuda_fft.h.
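
CudaFFT< T > is the cuFFT-backed implementation of the AFT< T > interface: init() creates the cufftHandle and the device buffers, and fft()/ifft() then stream host data through the GPU batch by batch. The following is a minimal, hedged usage sketch (not taken from the sources): it assumes single-precision 2D signals of 256x256 samples and that the GPU wrapper can be prepared with a default constructor plus set(); consult gpu.h for the exact interface.

    #include <cuda_fft.h>
    #include <complex>

    void forwardExample(const float *h_in, std::complex<float> *h_out) {
        GPU gpu;                 // assumed: default-constructed GPU wrapper
        gpu.set();               // assumed: binds the device and creates a stream
        // 100 signals of 256x256x1, processed in batches of 10, out-of-place, forward
        auto settings = FFTSettings<float>(256, 256, 1, 100, 10, false, true);
        CudaFFT<float> ft;
        ft.init(gpu, settings);  // creates the cuFFT plan and allocates device buffers
        ft.fft(h_in, h_out);     // copies each batch in, transforms it, copies it back
        ft.release();            // optional; the destructor releases everything as well
    }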

Constructor & Destructor Documentation

◆ CudaFFT()

template<typename T >
CudaFFT< T >::CudaFFT ( )
inline

Definition at line 49 of file cuda_fft.h.

49  {
50  setDefault();
51  };

◆ ~CudaFFT()

template<typename T >
CudaFFT< T >::~CudaFFT ( )
inline

Definition at line 52 of file cuda_fft.h.

52  {
53  release();
54  }

Member Function Documentation

◆ createPlan()

template<typename T >
cufftHandle * CudaFFT< T >::createPlan ( const GPU &  gpu,
const FFTSettings< T > &  settings 
)
static

Definition at line 276 of file cuda_fft.cpp.

276  {
277  if (settings.sElemsBatch() > std::numeric_limits<int>::max()) {
278  REPORT_ERROR(ERR_ARG_INCORRECT, "Too many elements for Fourier Transformation. "
279  "It would cause int overflow in the cuda kernel. Try to decrease batch size");
280  }
281  auto plan = new cufftHandle;
282  auto f = [&] (int rank, int *n, int *inembed,
283  int istride, int idist, int *onembed, int ostride,
284  int odist, cufftType type, int batch) {
285  gpuErrchkFFT(cufftPlanMany(plan, rank, n, inembed,
286  istride, idist, onembed, ostride,
287  odist, type, batch));
288  };
289  manyHelper(settings, f);
290  gpuErrchkFFT(cufftSetStream(*plan, *(cudaStream_t*)gpu.stream()));
291  return plan;
292 }

◆ estimatePlanBytes()

template<typename T >
size_t CudaFFT< T >::estimatePlanBytes ( const FFTSettings< T > &  settings)
virtual

Implements AFT< T >.

Definition at line 211 of file cuda_fft.cpp.

211  {
212  size_t size = 0;
213  auto f = [&] (int rank, int *n, int *inembed,
214  int istride, int idist, int *onembed, int ostride,
215  int odist, cufftType type, int batch) {
216  gpuErrchkFFT(cufftEstimateMany(rank, n, inembed,
217  istride, idist, onembed, ostride,
218  odist, type, batch, &size));
219  };
220  manyHelper(settings, f);
221  return size;
222 }
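
estimatePlanBytes() asks cuFFT (via cufftEstimateMany) how much workspace the plan alone would need; together with estimateTotalBytes(), inherited from AFT< T >, it lets the caller check a configuration before init(). A hedged sketch (gpu and settings are placeholders defined elsewhere):

    size_t planBytes  = CudaFFT<float>().estimatePlanBytes(settings);
    size_t totalBytes = CudaFFT<float>().estimateTotalBytes(settings); // plan + device buffers
    bool fits = totalBytes <= gpu.lastFreeBytes();                     // decide before calling init()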

◆ fft() [1/4]

template<typename T >
std::complex< T > * CudaFFT< T >::fft ( T *  h_inOut)
virtual

Implements AFT< T >.

Definition at line 119 of file cuda_fft.cpp.

119  {
120  return fft(h_inOut, (std::complex<T>*) h_inOut);
121 }

◆ fft() [2/4]

template<typename T >
std::complex< T > * CudaFFT< T >::fft ( const T *  h_in,
std::complex< T > *  h_out 
)
virtual

Implements AFT< T >.

Definition at line 139 of file cuda_fft.cpp.

140  {
141  auto isReady = m_isInit && m_settings->isForward();
142  if ( ! isReady) {
143  REPORT_ERROR(ERR_LOGIC_ERROR, "Not ready to perform Fourier Transform. "
144  "Call init() function first");
145  }
146 
147  // process signals in batches
148  for (size_t offset = 0; offset < m_settings->sDim().n(); offset += m_settings->batch()) {
149  // how many signals to process
150  size_t toProcess = std::min(m_settings->batch(), m_settings->sDim().n() - offset);
151 
152  // copy memory
153  gpuErrchk(cudaMemcpyAsync(
154  m_d_SD,
155  h_in + offset * m_settings->sDim().xyzPadded(),
156  toProcess * m_settings->sBytesSingle(),
157  cudaMemcpyHostToDevice, *(cudaStream_t*)m_gpu->stream()));
158 
159  // Wipe out memory before calling transformation
160  gpuErrchk(cudaMemset(m_d_FD, 0., m_settings->fBytesBatch()));
161 
162  fft(*m_plan, m_d_SD, m_d_FD);
163 
164  // copy data back
165  gpuErrchk(cudaMemcpyAsync(
166  h_out + offset * m_settings->fDim().xyzPadded(),
167  m_d_FD,
168  toProcess * m_settings->fBytesSingle(),
169  cudaMemcpyDeviceToHost, *(cudaStream_t*)m_gpu->stream()));
170  }
171  return h_out;
172 }
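
As the loop above shows, h_in and h_out are plain host buffers holding all n() signals; the transform itself is executed batch by batch on the device. A sketch of the expected buffer sizes (std::vector used purely for illustration; ft is an already initialized CudaFFT<float>):

    std::vector<float> h_in(settings.sDim().n() * settings.sDim().xyzPadded());
    std::vector<std::complex<float>> h_out(settings.fDim().n() * settings.fDim().xyzPadded());
    // ... fill h_in with the spatial-domain signals ...
    ft.fft(h_in.data(), h_out.data());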

◆ fft() [3/4]

template<typename T >
std::complex< T > * CudaFFT< T >::fft ( cufftHandle  plan,
T *  d_inOut 
)
static

Definition at line 124 of file cuda_fft.cpp.

124  {
125  return fft(plan, d_inOut, (std::complex<T>*)d_inOut);
126 }

◆ fft() [4/4]

template<typename T >
std::complex< T > * CudaFFT< T >::fft ( cufftHandle  plan,
const T *  d_in,
std::complex< T > *  d_out 
)
static

Definition at line 225 of file cuda_fft.cpp.

226  {
227  if (std::is_same<T, float>::value) {
228  gpuErrchkFFT(cufftExecR2C(plan, (cufftReal*)d_in, (cufftComplex*)d_out));
229  } else if (std::is_same<T, double>::value){
230  gpuErrchkFFT(cufftExecD2Z(plan, (cufftDoubleReal*)d_in, (cufftDoubleComplex*)d_out));
231  } else {
232  REPORT_ERROR(ERR_TYPE_INCORRECT, "Not implemented");
233  }
234  return d_out;
235 }
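
The static overloads skip the host-device copies entirely and operate on device pointers, which is useful when the data already lives on the GPU. A hedged sketch combining createPlan(), the static fft() and release(); gpu, settings, d_in and d_out are assumed to exist, with the device buffers sized for the padded dimensions:

    cufftHandle *plan = CudaFFT<float>::createPlan(gpu, settings);
    CudaFFT<float>::fft(*plan, d_in, d_out);   // cufftExecR2C/D2Z on the plan's stream
    // synchronize the GPU stream before reading d_out (not shown here)
    CudaFFT<float>::release(plan);             // destroys the cuFFT handle and frees it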

◆ findMaxBatch()

template<typename T >
FFTSettings< T > CudaFFT< T >::findMaxBatch ( const FFTSettings< T > &  settings,
size_t  maxBytes 
)
static

Definition at line 295 of file cuda_fft.cpp.

296  {
297  size_t singleBytes = settings.sBytesSingle() + (settings.isInPlace() ? 0 : settings.fBytesSingle());
298  size_t batch = min((maxBytes / singleBytes), settings.batch()) + 1; // + 1 will be deducted in the while loop
299  while (batch > 1) {
300  batch--;
301  auto tmp = FFTSettings<T>(settings.sDim(), batch, settings.isInPlace(), settings.isForward());
302  size_t totalBytes = CudaFFT<T>().estimateTotalBytes(tmp);
303  if (totalBytes <= maxBytes) {
304  return tmp;
305  }
306  }
307  REPORT_ERROR(ERR_GPU_MEMORY, "Estimated batch size is 0(zero). "
308  "This probably means you don't have enough GPU memory for even a single transformation.");
309 }
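
findMaxBatch() keeps the requested signal dimensions and only lowers the batch size until the whole transformation fits into maxBytes; it reports ERR_GPU_MEMORY when not even a single signal fits. A short sketch, with reserveBytes standing for whatever safety margin the caller wants to keep free:

    size_t budget = gpu.lastFreeBytes() - reserveBytes;
    auto batched  = CudaFFT<float>::findMaxBatch(settings, budget);
    ft.init(gpu, batched);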

◆ findOptimal()

template<typename T >
FFTSettings< T > * CudaFFT< T >::findOptimal ( const GPU &  gpu,
const FFTSettings< T > &  settings,
size_t  reserveBytes,
bool  squareOnly,
int  sigPercChange,
bool  crop,
bool  verbose 
)
static

Definition at line 312 of file cuda_fft.cpp.

315  {
318  size_t freeBytes = gpu.lastFreeBytes();
319  std::vector<cuFFTAdvisor::BenchmarkResult const *> *options =
320  cuFFTAdvisor::Advisor::find(10, gpu.device(), // FIXME DS this should be configurable
321  settings.sDim().x(), settings.sDim().y(), settings.sDim().z(), settings.sDim().n(),
322  TRUE, // use batch
323  std::is_same<T, float>::value ? TRUE : FALSE,
324  settings.isForward() ? TRUE : FALSE,
325  settings.isInPlace() ? TRUE : FALSE,
326  cuFFTAdvisor::Tristate::TRUE, // is real
327  sigPercChange, memoryUtils::MB(freeBytes - reserveBytes),
328  false, // allow transposition
329  squareOnly, crop);
330 
331  FFTSettings<T> *result = nullptr;
332  if (0 != options->size()) {
333  auto res = options->at(0);
334  auto optSetting = FFTSettings<T>(
335  res->transform->X,
336  res->transform->Y,
337  res->transform->Z,
338  settings.sDim().n(),
339  res->transform->N / res->transform->repetitions,
340  settings.isInPlace(),
341  settings.isForward());
342  result = new FFTSettings<T>(optSetting);
343  }
344  if (verbose) {
345  if (nullptr != result) {
346  options->at(0)->printHeader(stdout); printf("\n");
347  options->at(0)->print(stdout); printf("\n");
348  } else {
349  std::cout << "No result obtained. Maybe too strict search?" << std::endl;
350  }
351  }
352  for (auto& it : *options) delete it;
353  delete options;
354  return result;
355 }

◆ findOptimalSizeOrMaxBatch()

template<typename T >
FFTSettings< T > CudaFFT< T >::findOptimalSizeOrMaxBatch ( GPU &  gpu,
const FFTSettings< T > &  settings,
size_t  reserveBytes,
bool  squareOnly,
int  sigPercChange,
bool  crop,
bool  verbose 
)
static

Definition at line 358 of file cuda_fft.cpp.

361  {
362  auto *candidate = findOptimal(gpu, settings, reserveBytes, squareOnly, sigPercChange, crop, verbose);
363  if (nullptr != candidate) {
364  return *candidate;
365  }
366  if (gpu.lastFreeBytes() <= reserveBytes) {
367  REPORT_ERROR(ERR_GPU_MEMORY, "You have less GPU memory than you want to use");
368  }
369  return findMaxBatch(settings, gpu.lastFreeBytes() - reserveBytes);
370 }
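
In practice findOptimalSizeOrMaxBatch() is the convenient entry point: it first tries findOptimal() and, when no better-performing size is found, falls back to findMaxBatch() with the memory remaining after the reserve. A hedged sketch keeping roughly 100 MiB in reserve:

    auto tuned = CudaFFT<float>::findOptimalSizeOrMaxBatch(gpu, settings,
            100 * (1 << 20) /*reserveBytes*/, false /*squareOnly*/,
            10 /*sigPercChange*/, true /*crop*/, false /*verbose*/);
    CudaFFT<float> ft;
    ft.init(gpu, tuned);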

◆ ifft() [1/4]

template<typename T >
T * CudaFFT< T >::ifft ( std::complex< T > *  h_inOut)
virtual

Implements AFT< T >.

Definition at line 129 of file cuda_fft.cpp.

129  {
130  return ifft(h_inOut, (T*)h_inOut);
131 }

◆ ifft() [2/4]

template<typename T >
T * CudaFFT< T >::ifft ( const std::complex< T > *  h_in,
T *  h_out 
)
virtual

Implements AFT< T >.

Definition at line 175 of file cuda_fft.cpp.

176  {
177  auto isReady = m_isInit && ( ! m_settings->isForward());
178  if ( ! isReady) {
179  REPORT_ERROR(ERR_LOGIC_ERROR, "Not ready to perform Inverse Fourier Transform. "
180  "Call init() function first");
181  }
182 
183  // process signals in batches
184  for (size_t offset = 0; offset < m_settings->fDim().n(); offset += m_settings->batch()) {
185  // how many signals to process
186  size_t toProcess = std::min(m_settings->batch(), m_settings->fDim().n() - offset);
187 
188  // copy memory
189  gpuErrchk(cudaMemcpyAsync(
190  m_d_FD,
191  h_in + offset * m_settings->fDim().xyzPadded(),
192  toProcess * m_settings->fBytesSingle(),
193  cudaMemcpyHostToDevice, *(cudaStream_t*)m_gpu->stream()));
194 
195  // Wipe out memory before calling transformation
196  gpuErrchk(cudaMemset(m_d_SD, 0., m_settings->sBytesBatch()));
197 
198  ifft(*m_plan, m_d_FD, m_d_SD);
199 
200  // copy data back
201  gpuErrchk(cudaMemcpyAsync(
202  h_out + offset * m_settings->sDim().xyzPadded(),
203  m_d_SD,
204  toProcess * m_settings->sBytesSingle(),
205  cudaMemcpyDeviceToHost, *(cudaStream_t*)m_gpu->stream()));
206  }
207  return h_out;
208 }
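
The inverse direction mirrors fft(): the settings passed to init() must be created with isForward == false, otherwise ifft() reports ERR_LOGIC_ERROR. A minimal sketch (dimensions are illustrative; h_freq and h_spatial are host buffers of the padded frequency and spatial sizes):

    auto inverse = FFTSettings<float>(256, 256, 1, 100, 10, false, false /*isForward*/);
    CudaFFT<float> ift;
    ift.init(gpu, inverse);
    ift.ifft(h_freq, h_spatial);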

◆ ifft() [3/4]

template<typename T >
T * CudaFFT< T >::ifft ( cufftHandle  plan,
std::complex< T > *  d_inOut 
)
static

Definition at line 134 of file cuda_fft.cpp.

134  {
135  return ifft(plan, d_inOut, (T*)d_inOut);
136 }

◆ ifft() [4/4]

template<typename T >
T * CudaFFT< T >::ifft ( cufftHandle  plan,
const std::complex< T > *  d_in,
T *  d_out 
)
static

Definition at line 238 of file cuda_fft.cpp.

239  {
240  if (std::is_same<T, float>::value) {
241  gpuErrchkFFT(cufftExecC2R(plan, (cufftComplex*)d_in, (cufftReal*)d_out));
242  } else if (std::is_same<T, double>::value){
243  gpuErrchkFFT(cufftExecZ2D(plan, (cufftDoubleComplex*)d_in, (cufftDoubleReal*)d_out));
244  } else {
245  REPORT_ERROR(ERR_TYPE_INCORRECT, "Not implemented");
246  }
247  return d_out;
248 }

◆ init()

template<typename T >
void CudaFFT< T >::init ( const HW &  gpu,
const FFTSettings< T > &  settings,
bool  reuse = true 
)
virtual

Implements AFT< T >.

Definition at line 34 of file cuda_fft.cpp.

34  {
35  bool canReuse = m_isInit
36  && reuse
37  && (m_settings->sBytesBatch() >= settings.sBytesBatch())
38  && (m_settings->fBytesBatch() >= settings.fBytesBatch());
39  bool mustAllocate = !canReuse;
40  if (mustAllocate) {
41  release();
42  }
43  // previous plan and settings have to be released,
44  // otherwise we will get a GPU/CPU memory leak
45  release(m_plan);
46  delete m_settings;
47 
48  m_settings = new FFTSettings<T>(settings);
49  try {
50  m_gpu = &dynamic_cast<const GPU&>(gpu);
51  } catch (std::bad_cast&) {
52  REPORT_ERROR(ERR_ARG_INCORRECT, "Instance of GPU expected");
53  }
54 
55  check();
56 
57  m_plan = createPlan(*m_gpu, *m_settings);
58  if (mustAllocate) {
59  // allocate input data storage
60  gpuErrchk(cudaMalloc(&m_d_SD, m_settings->sBytesBatch()));
61  if (m_settings->isInPlace()) {
62  // input data holds also the output
63  m_d_FD = (std::complex<T>*)m_d_SD;
64  } else {
65  // allocate also the output buffer
66  gpuErrchk(cudaMalloc(&m_d_FD, m_settings->fBytesBatch()));
67  }
68  }
69 
70  m_isInit = true;
71 }
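
When reuse is left at its default (true) and the new settings need no more device memory than the current ones (compared via sBytesBatch() and fBytesBatch()), init() keeps the already allocated buffers and only rebuilds the cuFFT plan and the stored settings. A sketch of the intended re-initialization pattern (bigSettings/smallSettings and the host buffers are placeholders):

    ft.init(gpu, bigSettings);    // allocates device buffers for the larger batch
    ft.fft(h_bigIn, h_bigOut);
    ft.init(gpu, smallSettings);  // buffers are reused, only the plan is recreated
    ft.fft(h_smallIn, h_smallOut);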

◆ release() [1/2]

template<typename T >
void CudaFFT< T >::release ( )
final virtual

Implements AFT< T >.

Definition at line 108 of file cuda_fft.cpp.

108  {
109  gpuErrchk(cudaFree(m_d_SD));
110  if ((void*)m_d_FD != (void*)m_d_SD) {
111  gpuErrchk(cudaFree(m_d_FD));
112  }
113  release(m_plan);
114  delete m_settings;
115  setDefault();
116 }

◆ release() [2/2]

template<typename T >
void CudaFFT< T >::release ( cufftHandle plan)
static

Definition at line 74 of file cuda_fft.cpp.

74  {
75  if (nullptr != plan) {
76  cufftDestroy(*plan);
77  delete plan;
78  plan = nullptr;
79  }
80 }

The documentation for this class was generated from the following files: