Xmipp  v3.23.11-Nereus
CudaFFT< T > Class Template Reference

#include <cuda_fft.h>

Inheritance diagram for CudaFFT< T >: CudaFFT< T > derives from AFT< T >.

Public Member Functions

 CudaFFT ()
 
 ~CudaFFT ()
 
void init (const HW &gpu, const FFTSettings< T > &settings, bool reuse=true)
 
void release () final
 
std::complex< T > * fft (T *h_inOut)
 
std::complex< T > * fft (const T *h_in, std::complex< T > *h_out)
 
T * ifft (std::complex< T > *h_inOut)
 
T * ifft (const std::complex< T > *h_in, T *h_out)
 
size_t estimatePlanBytes (const FFTSettings< T > &settings)
 
- Public Member Functions inherited from AFT< T >
virtual ~AFT ()
 
virtual size_t estimateTotalBytes (const FFTSettings< T > &settings)
 

Static Public Member Functions

static std::complex< T > * fft (cufftHandle plan, T *d_inOut)
 
static std::complex< T > * fft (cufftHandle plan, const T *d_in, std::complex< T > *d_out)
 
static T * ifft (cufftHandle plan, std::complex< T > *d_inOut)
 
static T * ifft (cufftHandle plan, const std::complex< T > *d_in, T *d_out)
 
static cufftHandle * createPlan (const GPU &gpu, const FFTSettings< T > &settings)
 
static FFTSettings< T > * findOptimal (const GPU &gpu, const FFTSettings< T > &settings, size_t reserveBytes, bool squareOnly, int sigPercChange, bool crop, bool verbose)
 
static FFTSettings< T > findMaxBatch (const FFTSettings< T > &settings, size_t maxBytes)
 
static FFTSettings< T > findOptimalSizeOrMaxBatch (GPU &gpu, const FFTSettings< T > &settings, size_t reserveBytes, bool squareOnly, int sigPercChange, bool crop, bool verbose)
 
static void release (cufftHandle *plan)
 

Additional Inherited Members

Detailed Description

template<typename T>
class CudaFFT< T >

Definition at line 47 of file cuda_fft.h.
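
CudaFFT< T > is the cuFFT-backed implementation of the AFT< T > interface: init() creates the cufftHandle and the device buffers, and fft()/ifft() then stream host data through the GPU batch by batch. The following is a minimal, hedged usage sketch (not taken from the sources): it assumes single-precision 2D signals of 256x256 samples and that the GPU wrapper can be prepared with a default constructor plus set(); consult gpu.h for the exact interface.

    #include <cuda_fft.h>
    #include <complex>

    void forwardExample(const float *h_in, std::complex<float> *h_out) {
        GPU gpu;                 // assumed: default-constructed GPU wrapper
        gpu.set();               // assumed: binds the device and creates a stream
        // 100 signals of 256x256x1, processed in batches of 10, out-of-place, forward
        auto settings = FFTSettings<float>(256, 256, 1, 100, 10, false, true);
        CudaFFT<float> ft;
        ft.init(gpu, settings);  // creates the cuFFT plan and allocates device buffers
        ft.fft(h_in, h_out);     // copies each batch in, transforms it, copies it back
        ft.release();            // optional; the destructor releases everything as well
    }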

Constructor & Destructor Documentation

◆ CudaFFT()

template<typename T >
CudaFFT< T >::CudaFFT ( )
inline

Definition at line 49 of file cuda_fft.h.

49  {
50  setDefault();
51  };

◆ ~CudaFFT()

template<typename T >
CudaFFT< T >::~CudaFFT ( )
inline

Definition at line 52 of file cuda_fft.h.

52  {
53  release();
54  }

Member Function Documentation

◆ createPlan()

template<typename T >
cufftHandle * CudaFFT< T >::createPlan ( const GPU &  gpu,
const FFTSettings< T > &  settings 
)
static

Definition at line 276 of file cuda_fft.cpp.

276  {
277  if (settings.sElemsBatch() > std::numeric_limits<int>::max()) {
278  REPORT_ERROR(ERR_ARG_INCORRECT, "Too many elements for Fourier Transformation. "
279  "It would cause int overflow in the cuda kernel. Try to decrease batch size");
280  }
281  auto plan = new cufftHandle;
282  auto f = [&] (int rank, int *n, int *inembed,
283  int istride, int idist, int *onembed, int ostride,
284  int odist, cufftType type, int batch) {
285  gpuErrchkFFT(cufftPlanMany(plan, rank, n, inembed,
286  istride, idist, onembed, ostride,
287  odist, type, batch));
288  };
289  manyHelper(settings, f);
290  gpuErrchkFFT(cufftSetStream(*plan, *(cudaStream_t*)gpu.stream()));
291  return plan;
292 }

◆ estimatePlanBytes()

template<typename T >
size_t CudaFFT< T >::estimatePlanBytes ( const FFTSettings< T > &  settings)
virtual

Implements AFT< T >.

Definition at line 211 of file cuda_fft.cpp.

211  {
212  size_t size = 0;
213  auto f = [&] (int rank, int *n, int *inembed,
214  int istride, int idist, int *onembed, int ostride,
215  int odist, cufftType type, int batch) {
216  gpuErrchkFFT(cufftEstimateMany(rank, n, inembed,
217  istride, idist, onembed, ostride,
218  odist, type, batch, &size));
219  };
220  manyHelper(settings, f);
221  return size;
222 }
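
estimatePlanBytes() asks cuFFT (via cufftEstimateMany) how much workspace the plan alone would need; together with estimateTotalBytes(), inherited from AFT< T >, it lets the caller check a configuration before init(). A hedged sketch (gpu and settings are placeholders defined elsewhere):

    size_t planBytes  = CudaFFT<float>().estimatePlanBytes(settings);
    size_t totalBytes = CudaFFT<float>().estimateTotalBytes(settings); // plan + device buffers
    bool fits = totalBytes <= gpu.lastFreeBytes();                     // decide before calling init()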

◆ fft() [1/4]

template<typename T >
std::complex< T > * CudaFFT< T >::fft ( T *  h_inOut)
virtual

Implements AFT< T >.

Definition at line 119 of file cuda_fft.cpp.

119  {
120  return fft(h_inOut, (std::complex<T>*) h_inOut);
121 }

◆ fft() [2/4]

template<typename T >
std::complex< T > * CudaFFT< T >::fft ( const T *  h_in,
std::complex< T > *  h_out 
)
virtual

Implements AFT< T >.

Definition at line 139 of file cuda_fft.cpp.

140  {
141  auto isReady = m_isInit && m_settings->isForward();
142  if ( ! isReady) {
143  REPORT_ERROR(ERR_LOGIC_ERROR, "Not ready to perform Fourier Transform. "
144  "Call init() function first");
145  }
146 
147  // process signals in batches
148  for (size_t offset = 0; offset < m_settings->sDim().n(); offset += m_settings->batch()) {
149  // how many signals to process
150  size_t toProcess = std::min(m_settings->batch(), m_settings->sDim().n() - offset);
151 
152  // copy memory
153  gpuErrchk(cudaMemcpyAsync(
154  m_d_SD,
155  h_in + offset * m_settings->sDim().xyzPadded(),
156  toProcess * m_settings->sBytesSingle(),
157  cudaMemcpyHostToDevice, *(cudaStream_t*)m_gpu->stream()));
158 
159  // Wipe out memory before calling transformation
160  gpuErrchk(cudaMemset(m_d_FD, 0., m_settings->fBytesBatch()));
161 
162  fft(*m_plan, m_d_SD, m_d_FD);
163 
164  // copy data back
165  gpuErrchk(cudaMemcpyAsync(
166  h_out + offset * m_settings->fDim().xyzPadded(),
167  m_d_FD,
168  toProcess * m_settings->fBytesSingle(),
169  cudaMemcpyDeviceToHost, *(cudaStream_t*)m_gpu->stream()));
170  }
171  return h_out;
172 }
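
As the loop above shows, h_in and h_out are plain host buffers holding all n() signals; the transform itself is executed batch by batch on the device. A sketch of the expected buffer sizes (std::vector used purely for illustration; ft is an already initialized CudaFFT<float>):

    std::vector<float> h_in(settings.sDim().n() * settings.sDim().xyzPadded());
    std::vector<std::complex<float>> h_out(settings.fDim().n() * settings.fDim().xyzPadded());
    // ... fill h_in with the spatial-domain signals ...
    ft.fft(h_in.data(), h_out.data());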

◆ fft() [3/4]

template<typename T >
std::complex< T > * CudaFFT< T >::fft ( cufftHandle  plan,
T *  d_inOut 
)
static

Definition at line 124 of file cuda_fft.cpp.

124  {
125  return fft(plan, d_inOut, (std::complex<T>*)d_inOut);
126 }

◆ fft() [4/4]

template<typename T >
std::complex< T > * CudaFFT< T >::fft ( cufftHandle  plan,
const T *  d_in,
std::complex< T > *  d_out 
)
static

Definition at line 225 of file cuda_fft.cpp.

226  {
227  if (std::is_same<T, float>::value) {
228  gpuErrchkFFT(cufftExecR2C(plan, (cufftReal*)d_in, (cufftComplex*)d_out));
229  } else if (std::is_same<T, double>::value){
230  gpuErrchkFFT(cufftExecD2Z(plan, (cufftDoubleReal*)d_in, (cufftDoubleComplex*)d_out));
231  } else {
232  REPORT_ERROR(ERR_TYPE_INCORRECT, "Not implemented");
233  }
234  return d_out;
235 }
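
The static overloads skip the host-device copies entirely and operate on device pointers, which is useful when the data already lives on the GPU. A hedged sketch combining createPlan(), the static fft() and release(); gpu, settings, d_in and d_out are assumed to exist, with the device buffers sized for the padded dimensions:

    cufftHandle *plan = CudaFFT<float>::createPlan(gpu, settings);
    CudaFFT<float>::fft(*plan, d_in, d_out);   // cufftExecR2C/D2Z on the plan's stream
    // synchronize the GPU stream before reading d_out (not shown here)
    CudaFFT<float>::release(plan);             // destroys the cuFFT handle and frees it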

◆ findMaxBatch()

template<typename T >
FFTSettings< T > CudaFFT< T >::findMaxBatch ( const FFTSettings< T > &  settings,
size_t  maxBytes 
)
static

Definition at line 295 of file cuda_fft.cpp.

296  {
297  size_t singleBytes = settings.sBytesSingle() + (settings.isInPlace() ? 0 : settings.fBytesSingle());
298  size_t batch = min((maxBytes / singleBytes), settings.batch()) + 1; // + 1 will be deducted in the while loop
299  while (batch > 1) {
300  batch--;
301  auto tmp = FFTSettings<T>(settings.sDim(), batch, settings.isInPlace(), settings.isForward());
302  size_t totalBytes = CudaFFT<T>().estimateTotalBytes(tmp);
303  if (totalBytes <= maxBytes) {
304  return tmp;
305  }
306  }
307  REPORT_ERROR(ERR_GPU_MEMORY, "Estimated batch size is 0(zero). "
308  "This probably means you don't have enough GPU memory for even a single transformation.");
309 }
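
findMaxBatch() keeps the requested signal dimensions and only lowers the batch size until the whole transformation fits into maxBytes; it reports ERR_GPU_MEMORY when not even a single signal fits. A short sketch, with reserveBytes standing for whatever safety margin the caller wants to keep free:

    size_t budget = gpu.lastFreeBytes() - reserveBytes;
    auto batched  = CudaFFT<float>::findMaxBatch(settings, budget);
    ft.init(gpu, batched);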

◆ findOptimal()

template<typename T >
FFTSettings< T > * CudaFFT< T >::findOptimal ( const GPU &  gpu,
const FFTSettings< T > &  settings,
size_t  reserveBytes,
bool  squareOnly,
int  sigPercChange,
bool  crop,
bool  verbose 
)
static

Definition at line 312 of file cuda_fft.cpp.

315  {
318  size_t freeBytes = gpu.lastFreeBytes();
319  std::vector<cuFFTAdvisor::BenchmarkResult const *> *options =
320  cuFFTAdvisor::Advisor::find(10, gpu.device(), // FIXME DS this should be configurable
321  settings.sDim().x(), settings.sDim().y(), settings.sDim().z(), settings.sDim().n(),
322  TRUE, // use batch
323  std::is_same<T, float>::value ? TRUE : FALSE,
324  settings.isForward() ? TRUE : FALSE,
325  settings.isInPlace() ? TRUE : FALSE,
326  cuFFTAdvisor::Tristate::TRUE, // is real
327  sigPercChange, memoryUtils::MB(freeBytes - reserveBytes),
328  false, // allow transposition
329  squareOnly, crop);
330 
331  FFTSettings<T> *result = nullptr;
332  if (0 != options->size()) {
333  auto res = options->at(0);
334  auto optSetting = FFTSettings<T>(
335  res->transform->X,
336  res->transform->Y,
337  res->transform->Z,
338  settings.sDim().n(),
339  res->transform->N / res->transform->repetitions,
340  settings.isInPlace(),
341  settings.isForward());
342  result = new FFTSettings<T>(optSetting);
343  }
344  if (verbose) {
345  if (nullptr != result) {
346  options->at(0)->printHeader(stdout); printf("\n");
347  options->at(0)->print(stdout); printf("\n");
348  } else {
349  std::cout << "No result obtained. Maybe too strict search?" << std::endl;
350  }
351  }
352  for (auto& it : *options) delete it;
353  delete options;
354  return result;
355 }

◆ findOptimalSizeOrMaxBatch()

template<typename T >
FFTSettings< T > CudaFFT< T >::findOptimalSizeOrMaxBatch ( GPU &  gpu,
const FFTSettings< T > &  settings,
size_t  reserveBytes,
bool  squareOnly,
int  sigPercChange,
bool  crop,
bool  verbose 
)
static

Definition at line 358 of file cuda_fft.cpp.

361  {
362  auto *candidate = findOptimal(gpu, settings, reserveBytes, squareOnly, sigPercChange, crop, verbose);
363  if (nullptr != candidate) {
364  return *candidate;
365  }
366  if (gpu.lastFreeBytes() <= reserveBytes) {
367  REPORT_ERROR(ERR_GPU_MEMORY, "You have less GPU memory than you want to use");
368  }
369  return findMaxBatch(settings, gpu.lastFreeBytes() - reserveBytes);
370 }
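
In practice findOptimalSizeOrMaxBatch() is the convenient entry point: it first tries findOptimal() and, when no better-performing size is found, falls back to findMaxBatch() with the memory remaining after the reserve. A hedged sketch keeping roughly 100 MiB in reserve:

    auto tuned = CudaFFT<float>::findOptimalSizeOrMaxBatch(gpu, settings,
            100 * (1 << 20) /*reserveBytes*/, false /*squareOnly*/,
            10 /*sigPercChange*/, true /*crop*/, false /*verbose*/);
    CudaFFT<float> ft;
    ft.init(gpu, tuned);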

◆ ifft() [1/4]

template<typename T >
T * CudaFFT< T >::ifft ( std::complex< T > *  h_inOut)
virtual

Implements AFT< T >.

Definition at line 129 of file cuda_fft.cpp.

129  {
130  return ifft(h_inOut, (T*)h_inOut);
131 }

◆ ifft() [2/4]

template<typename T >
T * CudaFFT< T >::ifft ( const std::complex< T > *  h_in,
T *  h_out 
)
virtual

Implements AFT< T >.

Definition at line 175 of file cuda_fft.cpp.

176  {
177  auto isReady = m_isInit && ( ! m_settings->isForward());
178  if ( ! isReady) {
179  REPORT_ERROR(ERR_LOGIC_ERROR, "Not ready to perform Inverse Fourier Transform. "
180  "Call init() function first");
181  }
182 
183  // process signals in batches
184  for (size_t offset = 0; offset < m_settings->fDim().n(); offset += m_settings->batch()) {
185  // how many signals to process
186  size_t toProcess = std::min(m_settings->batch(), m_settings->fDim().n() - offset);
187 
188  // copy memory
189  gpuErrchk(cudaMemcpyAsync(
190  m_d_FD,
191  h_in + offset * m_settings->fDim().xyzPadded(),
192  toProcess * m_settings->fBytesSingle(),
193  cudaMemcpyHostToDevice, *(cudaStream_t*)m_gpu->stream()));
194 
195  // Wipe out memory before calling transformation
196  gpuErrchk(cudaMemset(m_d_SD, 0., m_settings->sBytesBatch()));
197 
198  ifft(*m_plan, m_d_FD, m_d_SD);
199 
200  // copy data back
201  gpuErrchk(cudaMemcpyAsync(
202  h_out + offset * m_settings->sDim().xyzPadded(),
203  m_d_SD,
204  toProcess * m_settings->sBytesSingle(),
205  cudaMemcpyDeviceToHost, *(cudaStream_t*)m_gpu->stream()));
206  }
207  return h_out;
208 }
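
The inverse direction mirrors fft(): the settings passed to init() must be created with isForward == false, otherwise ifft() reports ERR_LOGIC_ERROR. A minimal sketch (dimensions are illustrative; h_freq and h_spatial are host buffers of the padded frequency and spatial sizes):

    auto inverse = FFTSettings<float>(256, 256, 1, 100, 10, false, false /*isForward*/);
    CudaFFT<float> ift;
    ift.init(gpu, inverse);
    ift.ifft(h_freq, h_spatial);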

◆ ifft() [3/4]

template<typename T >
T * CudaFFT< T >::ifft ( cufftHandle  plan,
std::complex< T > *  d_inOut 
)
static

Definition at line 134 of file cuda_fft.cpp.

134  {
135  return ifft(plan, d_inOut, (T*)d_inOut);
136 }

◆ ifft() [4/4]

template<typename T >
T * CudaFFT< T >::ifft ( cufftHandle  plan,
const std::complex< T > *  d_in,
T *  d_out 
)
static

Definition at line 238 of file cuda_fft.cpp.

239  {
240  if (std::is_same<T, float>::value) {
241  gpuErrchkFFT(cufftExecC2R(plan, (cufftComplex*)d_in, (cufftReal*)d_out));
242  } else if (std::is_same<T, double>::value){
243  gpuErrchkFFT(cufftExecZ2D(plan, (cufftDoubleComplex*)d_in, (cufftDoubleReal*)d_out));
244  } else {
245  REPORT_ERROR(ERR_TYPE_INCORRECT, "Not implemented");
246  }
247  return d_out;
248 }

◆ init()

template<typename T >
void CudaFFT< T >::init ( const HW &  gpu,
const FFTSettings< T > &  settings,
bool  reuse = true 
)
virtual

Implements AFT< T >.

Definition at line 34 of file cuda_fft.cpp.

34  {
35  bool canReuse = m_isInit
36  && reuse
37  && (m_settings->sBytesBatch() >= settings.sBytesBatch())
38  && (m_settings->fBytesBatch() >= settings.fBytesBatch());
39  bool mustAllocate = !canReuse;
40  if (mustAllocate) {
41  release();
42  }
43  // previous plan and settings have to be released,
44  // otherwise we will get a GPU/CPU memory leak
45  release(m_plan);
46  delete m_settings;
47 
48  m_settings = new FFTSettings<T>(settings);
49  try {
50  m_gpu = &dynamic_cast<const GPU&>(gpu);
51  } catch (std::bad_cast&) {
52  REPORT_ERROR(ERR_ARG_INCORRECT, "Instance of GPU expected");
53  }
54 
55  check();
56 
57  m_plan = createPlan(*m_gpu, *m_settings);
58  if (mustAllocate) {
59  // allocate input data storage
60  gpuErrchk(cudaMalloc(&m_d_SD, m_settings->sBytesBatch()));
61  if (m_settings->isInPlace()) {
62  // input data holds also the output
63  m_d_FD = (std::complex<T>*)m_d_SD;
64  } else {
65  // allocate also the output buffer
66  gpuErrchk(cudaMalloc(&m_d_FD, m_settings->fBytesBatch()));
67  }
68  }
69 
70  m_isInit = true;
71 }
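
When reuse is left at its default (true) and the new settings need no more device memory than the current ones (compared via sBytesBatch() and fBytesBatch()), init() keeps the already allocated buffers and only rebuilds the cuFFT plan and the stored settings. A sketch of the intended re-initialization pattern (bigSettings/smallSettings and the host buffers are placeholders):

    ft.init(gpu, bigSettings);    // allocates device buffers for the larger batch
    ft.fft(h_bigIn, h_bigOut);
    ft.init(gpu, smallSettings);  // buffers are reused, only the plan is recreated
    ft.fft(h_smallIn, h_smallOut);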

◆ release() [1/2]

template<typename T >
void CudaFFT< T >::release ( )
final virtual

Implements AFT< T >.

Definition at line 108 of file cuda_fft.cpp.

108  {
109  gpuErrchk(cudaFree(m_d_SD));
110  if ((void*)m_d_FD != (void*)m_d_SD) {
111  gpuErrchk(cudaFree(m_d_FD));
112  }
113  release(m_plan);
114  delete m_settings;
115  setDefault();
116 }

◆ release() [2/2]

template<typename T >
void CudaFFT< T >::release ( cufftHandle plan)
static

Definition at line 74 of file cuda_fft.cpp.

74  {
75  if (nullptr != plan) {
76  cufftDestroy(*plan);
77  delete plan;
78  plan = nullptr;
79  }
80 }

The documentation for this class was generated from the following files: