26 #include "../data/fftwT.h" 45 bool canReuse = m_isInit
48 && (m_settings->sBytesBatch() >= settings.
sBytesBatch())
49 && (m_settings->fBytesBatch() >= settings.
fBytesBatch())
50 && (needsAuxArray(*m_settings) == needsAuxArray(settings));
51 bool mustAllocate = !canReuse;
61 m_cpu = &
dynamic_cast<const CPU&
>(cpu);
62 }
catch (std::bad_cast&) {
68 m_plan = createPlan(*m_cpu, *m_settings,
true);
85 return fftwf_malloc(bytes);
90 return fftw_malloc(bytes);
96 fftw_free(alignedData);
102 fftwf_free(alignedData);
108 fftw_free(alignedData);
114 fftwf_free(alignedData);
119 if (needsAuxArray(*m_settings)) {
120 m_SD = (
float*)allocateAligned(m_settings->sBytesBatch());
121 if (m_settings->isInPlace()) {
123 m_FD = (std::complex<float>*)m_SD;
126 m_FD = (std::complex<float>*)allocateAligned(m_settings->fBytesBatch());
133 if (needsAuxArray(*m_settings)) {
134 m_SD = (
double*)allocateAligned(m_settings->sBytesBatch());
135 if (m_settings->isInPlace()) {
137 m_FD = (std::complex<double>*)m_SD;
140 m_FD = (std::complex<double>*)allocateAligned(m_settings->fBytesBatch());
147 if (
nullptr != plan) {
154 if (m_settings->sDim().x() < 1) {
157 if ((m_settings->sDim().y() > 1)
158 && (m_settings->sDim().x() < 2)) {
161 if ((m_settings->sDim().z() > 1)
162 && (m_settings->sDim().y() < 2)) {
165 if (m_settings->batch() > m_settings->sDim().n()) {
173 m_settings =
nullptr;
190 if ((
void*)m_FD != (
void*)m_SD) {
198 if ((
void*)m_FD != (
void*)m_SD) {
205 return fft(inOut, (std::complex<T>*) inOut);
210 std::complex<T> *out) {
211 auto isReady = m_isInit && m_settings->isForward();
214 "Call init() function first");
218 for (
size_t offset = 0; offset < m_settings->sDim().n(); offset += m_settings->batch()) {
219 size_t signalsToProcess =
std::min(m_settings->batch(), m_settings->sDim().n() - offset);
220 size_t signalsToSkip = m_settings->batch() - signalsToProcess;
221 bool useAux = (m_settings->batch() != signalsToProcess) && needsAuxArray(*m_settings);
224 const T *batchSrc = in
225 + (offset - signalsToSkip) * m_settings->sDim().xyzPadded();
226 std::complex<T> *batchDest = out
227 + (offset - signalsToSkip) * m_settings->fDim().xyzPadded();
232 in + offset * m_settings->sDim().xyzPadded(),
233 signalsToProcess * m_settings->sBytesSingle());
235 batchSrc = (
const T*)m_SD;
236 batchDest = (std::complex<T>*)m_SD;
239 fft(cast(m_plan), batchSrc, batchDest);
243 memcpy(out + offset * m_settings->fDim().xyzPadded(),
245 signalsToProcess * m_settings->fBytesSingle());
253 return ifft(inOut, (T*) inOut);
259 auto isReady = m_isInit && ( ! m_settings->isForward());
262 "Call init() function first");
266 for (
size_t offset = 0; offset < m_settings->fDim().n(); offset += m_settings->batch()) {
267 size_t signalsToProcess =
std::min(m_settings->batch(), m_settings->fDim().n() - offset);
268 size_t signalsToSkip = m_settings->batch() - signalsToProcess;
269 bool useAux = (m_settings->batch() != signalsToProcess) || needsAuxArray(*m_settings);
273 in + offset * m_settings->fDim().xyzPadded(),
274 signalsToProcess * m_settings->fBytesSingle());
277 T *batchDest = useAux
279 : out + (offset - signalsToSkip) * m_settings->sDim().xyzPadded();
281 ifft(cast(m_plan), m_FD, batchDest);
285 memcpy(out + offset * m_settings->sDim().xyzPadded(),
287 signalsToProcess * m_settings->sBytesSingle());
296 bool isDataAligned) {
297 auto f = [&] (
int rank,
const int *
n,
int howmany,
298 void *
in,
const int *inembed,
299 int istride,
int idist,
300 void *out,
const int *onembed,
301 int ostride,
int odist,
304 return fftwf_plan_many_dft_r2c(rank, n, howmany,
305 (
float *)in,
nullptr,
307 (fftwf_complex *)out,
nullptr,
311 return fftwf_plan_many_dft_c2r(rank, n, howmany,
312 (fftwf_complex *)in,
nullptr,
314 (
float *)out,
nullptr,
319 return planHelper<const fftwf_plan>(settings,
f, cpu.
noOfParallUnits(), isDataAligned);
325 bool isDataAligned) {
326 auto f = [&] (
int rank,
const int *
n,
int howmany,
327 void *
in,
const int *inembed,
328 int istride,
int idist,
329 void *out,
const int *onembed,
330 int ostride,
int odist,
333 return fftw_plan_many_dft_r2c(rank, n, howmany,
334 (
double *)in,
nullptr,
336 (fftw_complex *)out,
nullptr,
340 return fftw_plan_many_dft_c2r(rank, n, howmany,
341 (fftw_complex *)in,
nullptr,
343 (
double *)out,
nullptr,
348 auto result = planHelper<const fftw_plan>(settings,
f, cpu.
noOfParallUnits(), isDataAligned);
353 template<
typename U,
typename F>
356 bool isDataAligned) {
357 auto n = std::array<int, 3>{(int)settings.
sDim().
z(), (int)settings.
sDim().
y(), (int)settings.
sDim().
x()};
359 if (settings.
sDim().
z() == 1) rank--;
360 if ((2 == rank) && (settings.
sDim().
y() == 1)) rank--;
361 int offset = 3 - rank;
364 void *out = settings.
isInPlace() ?
in : &m_mockOut;
368 auto flags = FFTW_ESTIMATE
369 | (settings.
isForward() ? FFTW_PRESERVE_INPUT : FFTW_DESTROY_INPUT);
370 if ( ! isDataAligned) {
371 flags = flags | FFTW_UNALIGNED;
385 std::lock_guard lck(fftwtMutex);
388 fftw_plan_with_nthreads(threads);
389 fftwf_plan_with_nthreads(threads);
391 auto tmp =
function(rank, &
n[offset], settings.
batch(),
406 std::lock_guard lck(fftwtMutex);
407 fftwf_destroy_plan(plan);
414 std::lock_guard lck(fftwtMutex);
415 fftw_destroy_plan(plan);
421 std::complex<float>*
FFTwT<float>::fft(
const fftwf_plan plan,
const float *
in, std::complex<float> *out) {
423 fftwf_execute_dft_r2c(plan, (
float*)in, (fftwf_complex*) out);
431 fftw_execute_dft_r2c(plan, (
double*)in, (fftw_complex*) out);
438 return fft(plan, inOut, (std::complex<T>*)inOut);
445 fftwf_execute_dft_c2r(plan, (fftwf_complex*)in, (
float*) out);
453 fftw_execute_dft_c2r(plan, (fftw_complex*)in, (
double*) out);
460 return ifft(plan, inOut, (T*)inOut);
constexpr size_t xyzPadded() const
void min(Image< double > &op1, const Image< double > &op2)
size_t estimatePlanBytes(const FFTSettings< T > &settings)
constexpr bool isInPlace() const
void init(const HW &cpu, const FFTSettings< T > &settings, bool reuse=true)
#define REPORT_ERROR(nerr, ErrormMsg)
CUDA_HD constexpr size_t z() const
constexpr size_t fBytesBatch() const
static const fftw_plan createPlan(const CPU &cpu, const FFTSettings< double > &settings, bool isDataAligned=false)
constexpr Dimensions fDim() const
std::complex< T > * fft(T *inOut)
unsigned noOfParallUnits() const
constexpr size_t batch() const
CUDA_HD constexpr size_t x() const
Incorrect argument received.
static void * allocateAligned(size_t bytes)
FFTwT_Startup fftwt_startup
CUDA_HD constexpr size_t y() const
constexpr size_t maxBytesBatch() const
CUDA_HD constexpr size_t n() const
constexpr Dimensions sDim() const
T * ifft(std::complex< T > *inOut)
constexpr bool isForward() const
constexpr size_t sBytesBatch() const
check(nparam, nf, nfsr, &Linfty, nineq, nineqn, neq, neqn, ncsrl, ncsrn, mode, &modem, eps, bgbnd, param)
Some logical error in the pipeline.