Xmipp  v3.23.11-Nereus
Functions
reconstruct_fourier_codelet_padded_image_to_fft.cpp File Reference
#include <core/multidim_array.h>
#include <reconstruction_cuda/cuda_xmipp_utils.h>
#include <reconstruction_cuda/cuda_asserts.h>
#include <fftw3.h>
#include <starpu.h>
#include <pthread.h>
#include <core/xmipp_fft.h>
#include <core/xmipp_fftw.h>
#include "reconstruct_fourier_codelets.h"
#include "reconstruct_fourier_defines.h"
#include "reconstruct_fourier_util.h"
Include dependency graph for reconstruct_fourier_codelet_padded_image_to_fft.cpp:

Go to the source code of this file.

Functions

__host__ __device__ float fft_IDX2DIGFREQ (int idx, int size)
 
void func_padded_image_to_fft_cpu (void **buffers, void *cl_arg)
 
void padded_image_to_fft_cuda_initialize (uint32_t paddedImageSize)
 
void padded_image_to_fft_cuda_deinitialize ()
 
__global__ void convertImagesKernel (const float2 *input, uint32_t inputSizeX, uint32_t inputSizeY, uint32_t outputSizeX, float2 *output, const float maxResolutionSqr, const float normFactor)
 
void func_padded_image_to_fft_cuda (void **buffers, void *cl_arg)
 

Function Documentation

◆ convertImagesKernel()

__global__ void convertImagesKernel ( const float2 *  input,
uint32_t  inputSizeX,
uint32_t  inputSizeY,
uint32_t  outputSizeX,
float2 *  output,
const float  maxResolutionSqr,
const float  normFactor 
)

Definition at line 178 of file reconstruct_fourier_codelet_padded_image_to_fft.cpp.

__global__ void convertImagesKernel(
		const float2* input, uint32_t inputSizeX, uint32_t inputSizeY,
		uint32_t outputSizeX, float2* output,
		const float maxResolutionSqr, const float normFactor) {
	// One thread per pixel of the raw FFT; the grid is rounded up to a
	// multiple of the block size, so both axes are bounds-checked below.
	// (Dropped the original `volatile` on the index locals: they are
	// thread-private scalars, volatile only blocked register allocation.)
	const int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
	const int idy = (blockIdx.y * blockDim.y) + threadIdx.y;

	const int halfY = inputSizeY / 2;

	// Input is an image in Fourier space (not normalized)
	// with low frequencies in the inner corners (standard R2C layout).
	if ((idy < inputSizeY)        // for all input lines
			&& (idx < outputSizeX)) { // for all output pixels in the line
		// Process a line only if it can hold a sufficiently high frequency,
		// i.e. process only the first and last `outputSizeX` lines.
		if (idy < outputSizeX || idy >= (inputSizeY - outputSizeX)) {
			// Check the frequency against the resolution cutoff.
			// NOTE(review): both axes use inputSizeY as the FFT size, which
			// assumes the padded image is square — confirm at call sites.
			float2 freq;
			freq.x = fft_IDX2DIGFREQ(idx, inputSizeY);
			freq.y = fft_IDX2DIGFREQ(idy, inputSizeY);
			if ((freq.x * freq.x + freq.y * freq.y) > maxResolutionSqr) {
				return;
			}
			// Do the shift (lower line will move up, upper down).
			int newY = (idy < halfY) ? (idy + outputSizeX) : (idy - inputSizeY + outputSizeX);
			int oIndex = newY * outputSizeX + idx;

			// Copy data and perform normalization in one pass.
			int iIndex = idy * inputSizeX + idx;
			float2 value = input[iIndex];
			value.x *= normFactor;
			value.y *= normFactor;
			output[oIndex] = value;
		}
	}
}
__host__ __device__ float fft_IDX2DIGFREQ(int idx, int size)

◆ fft_IDX2DIGFREQ()

__host__ __device__ float fft_IDX2DIGFREQ ( int  idx,
int  size 
)
inline

Index to frequency. Same as xmipp_fft.h::FFT_IDX2DIGFREQ, but compatible with CUDA and not using doubles. Given an index and a size of the FFT, this function returns the corresponding digital frequency (-1/2 to 1/2).

Definition at line 58 of file reconstruct_fourier_codelet_padded_image_to_fft.cpp.

__host__ __device__ float fft_IDX2DIGFREQ(int idx, int size) {
	// Maps an FFT bin index to its digital frequency in [-1/2, 1/2].
	// CUDA-compatible, float-only variant of xmipp_fft.h::FFT_IDX2DIGFREQ.
	if (size <= 1) {
		// A 0- or 1-sample transform has only the zero frequency.
		return 0;
	}
	// Bins up to size/2 are non-negative frequencies; the rest wrap negative.
	const int centeredIdx = (idx <= size / 2) ? idx : (idx - size);
	return centeredIdx / (float) size;
}

◆ func_padded_image_to_fft_cpu()

void func_padded_image_to_fft_cpu ( void **  buffers,
void *  cl_arg 
)

Definition at line 101 of file reconstruct_fourier_codelet_padded_image_to_fft.cpp.

void func_padded_image_to_fft_cpu(void** buffers, void* cl_arg) {
	// CPU codelet: FFT each padded image of the batch, then crop/shift/
	// normalize the result into the output buffer.
	// buffers[0]: batch of padded real-space images (input)
	// buffers[1]: batch of processed FFTs (output)
	// buffers[2]: scratch for one raw FFT (registered as a StarPU matrix)
	// buffers[3]: LoadedImagesBuffer holding the batch size
	float* inPaddedImage = (float*)STARPU_VECTOR_GET_PTR(buffers[0]);
	float2* outProcessedFft = (float2*)STARPU_VECTOR_GET_PTR(buffers[1]);
	float2* temporaryFftScratch = (float2*)STARPU_MATRIX_GET_PTR(buffers[2]);
	const uint32_t noOfImages = ((LoadedImagesBuffer*) STARPU_VARIABLE_GET_PTR(buffers[3]))->noOfImages;
	const PaddedImageToFftArgs& arg = *(PaddedImageToFftArgs*)(cl_arg);

	const uint32_t rawFftSizeX = STARPU_MATRIX_GET_NX(buffers[2]);
	const uint32_t rawFftSizeY = STARPU_MATRIX_GET_NY(buffers[2]);

	// Per-image strides, in elements (ELEMSIZE here is the per-image byte size).
	const size_t imageStridePaddedImage = STARPU_VECTOR_GET_ELEMSIZE(buffers[0]) / sizeof(float);
	// NOTE(review): buffers[2] is registered as a matrix but sized with VECTOR
	// getters — verify this yields the full elemsize * nx * ny byte count.
	const size_t temporaryFftScratchSizeBytes = STARPU_VECTOR_GET_ELEMSIZE(buffers[2]) * STARPU_VECTOR_GET_NX(buffers[2]);
	const size_t imageStrideOutput = STARPU_VECTOR_GET_ELEMSIZE(buffers[1]) / sizeof(float2);

	if (alignmentOf(inPaddedImage) < ALIGNMENT || alignmentOf(temporaryFftScratch) < ALIGNMENT) {
		// %u: alignmentOf returns uint32_t (was %d, a signed/unsigned mismatch).
		fprintf(stderr,"Bad alignments of buffers for FFT: %u, %u\n", alignmentOf(inPaddedImage), alignmentOf(temporaryFftScratch));
		assert(false);
	}

	static pthread_mutex_t fftw_plan_mutex = PTHREAD_MUTEX_INITIALIZER;
	// Planning API of FFTW is not thread safe (execution is), hence the mutex.
	pthread_mutex_lock(&fftw_plan_mutex);
	fftwf_plan plan = fftwf_plan_dft_r2c_2d(arg.paddedImageSize, arg.paddedImageSize,
			inPaddedImage, (fftwf_complex*) temporaryFftScratch, FFTW_ESTIMATE);
	pthread_mutex_unlock(&fftw_plan_mutex);

	assert(plan != nullptr);

	// Cast before multiplying: size*size would overflow uint32_t for
	// paddedImageSize >= 65536; identical result for in-range sizes.
	const float normalizationFactor = 1.0f / ((float) arg.paddedImageSize * (float) arg.paddedImageSize);

	float* in = inPaddedImage;
	float2* out = outProcessedFft;
	for (uint32_t i = 0; i < noOfImages; i++) {
		// FFTW r2c leaves padding bytes untouched; start from a clean scratch.
		memset(temporaryFftScratch, 0, temporaryFftScratchSizeBytes);

		fftwf_execute_dft_r2c(plan, in, (fftwf_complex*) temporaryFftScratch);

		// Crop to the requested resolution, shift and normalize into `out`.
		cropAndShift(temporaryFftScratch, rawFftSizeX, rawFftSizeY, arg.fftSizeX,
				out, arg.maxResolutionSqr, normalizationFactor);

		in += imageStridePaddedImage;
		out += imageStrideOutput;
	}

	// Destruction also goes through the (non-thread-safe) planner API.
	pthread_mutex_lock(&fftw_plan_mutex);
	fftwf_destroy_plan(plan);
	pthread_mutex_unlock(&fftw_plan_mutex);
}
uint32_t alignmentOf(size_t ptr)
#define i
int in
const uint32_t ALIGNMENT
fprintf(glob_prnt.io, "\)

◆ func_padded_image_to_fft_cuda()

void func_padded_image_to_fft_cuda ( void **  buffers,
void *  cl_arg 
)

Definition at line 220 of file reconstruct_fourier_codelet_padded_image_to_fft.cpp.

void func_padded_image_to_fft_cuda(void** buffers, void* cl_arg) {
	// CUDA codelet: same contract as func_padded_image_to_fft_cpu, executed on
	// the worker's local stream (codelet is async — no final synchronize here).
	// buffers[0]: batch of padded real-space images (input, device)
	// buffers[1]: batch of processed FFTs (output, device)
	// buffers[2]: scratch for one raw FFT (registered as a StarPU matrix)
	// buffers[3]: LoadedImagesBuffer holding the batch size
	float* inPaddedImage = (float*)STARPU_VECTOR_GET_PTR(buffers[0]);
	float2* outProcessedFft = (float2*)STARPU_VECTOR_GET_PTR(buffers[1]);
	float2* temporaryFftScratch = (float2*)STARPU_MATRIX_GET_PTR(buffers[2]);
	const uint32_t noOfImages = ((LoadedImagesBuffer*) STARPU_VARIABLE_GET_PTR(buffers[3]))->noOfImages;
	const PaddedImageToFftArgs& arg = *(PaddedImageToFftArgs*)(cl_arg);

	const uint32_t rawFftSizeX = STARPU_MATRIX_GET_NX(buffers[2]);
	const uint32_t rawFftSizeY = STARPU_MATRIX_GET_NY(buffers[2]);

	// Per-image strides, in elements (ELEMSIZE here is the per-image byte size).
	const size_t imageStridePaddedImage = STARPU_VECTOR_GET_ELEMSIZE(buffers[0]) / sizeof(float);
	// NOTE(review): buffers[2] is registered as a matrix but sized with VECTOR
	// getters — verify this yields the full elemsize * nx * ny byte count.
	const size_t temporaryFftScratchSizeBytes = STARPU_VECTOR_GET_ELEMSIZE(buffers[2]) * STARPU_VECTOR_GET_NX(buffers[2]);
	const size_t imageStrideOutput = STARPU_VECTOR_GET_ELEMSIZE(buffers[1]) / sizeof(float2);

	if (alignmentOf(inPaddedImage) < ALIGNMENT || alignmentOf(temporaryFftScratch) < ALIGNMENT) {
		// %u: alignmentOf returns uint32_t (was %d, a signed/unsigned mismatch).
		fprintf(stderr,"Bad alignments of buffers for FFT: %u, %u\n", alignmentOf(inPaddedImage), alignmentOf(temporaryFftScratch));
		assert(false);
	}

	//TODO Investigate the use of cuFFTAdvisor to achieve better performance
	// Plans are created per CUDA worker by padded_image_to_fft_cuda_initialize.
	cufftHandle& plan = cudaFftPlan[starpu_worker_get_id_check()];

	// Cast before multiplying: size*size would overflow uint32_t for
	// paddedImageSize >= 65536; identical result for in-range sizes.
	const float normalizationFactor = 1.0f / ((float) arg.paddedImageSize * (float) arg.paddedImageSize);

	float* in = inPaddedImage;
	float2* out = outProcessedFft;
	for (uint32_t i = 0; i < noOfImages; i++) {
		// Clear the memory to which we will write.
		// Clear FFT output
		gpuErrchk(cudaMemsetAsync(temporaryFftScratch, 0, temporaryFftScratchSizeBytes, starpu_cuda_get_local_stream()));
		// Clear cropAndShift output (as the kernel writes only somewhere)
		//TODO It could be cheaper to just write everywhere...
		gpuErrchk(cudaMemsetAsync(out, 0, imageStrideOutput * sizeof(float2), starpu_cuda_get_local_stream()));

		// Execute FFT plan (asynchronous, on the worker's local stream).
		gpuErrchkFFT(cufftExecR2C(plan, in, temporaryFftScratch));

		// Process results, one thread for each pixel of the raw FFT.
		dim3 dimBlock(BLOCK_DIM, BLOCK_DIM);
		dim3 dimGrid((rawFftSizeX + dimBlock.x - 1) / dimBlock.x, (rawFftSizeY + dimBlock.y - 1) / dimBlock.y);
		convertImagesKernel<<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>>(
				temporaryFftScratch, rawFftSizeX, rawFftSizeY, arg.fftSizeX,
				out, arg.maxResolutionSqr, normalizationFactor);
		// Peek (not Get): report launch-config errors without clearing state.
		gpuErrchk(cudaPeekAtLastError());

		in += imageStridePaddedImage;
		out += imageStrideOutput;
	}

	// gpuErrchk(cudaStreamSynchronize(starpu_cuda_get_local_stream())); disabled because codelet is async
}
#define gpuErrchk(code)
Definition: cuda_asserts.h:31
uint32_t alignmentOf(size_t ptr)
constexpr int BLOCK_DIM
#define i
int cufftHandle
Definition: cuda_fft.h:41
#define gpuErrchkFFT(code)
Definition: cuda_asserts.h:32
int in
const uint32_t ALIGNMENT
fprintf(glob_prnt.io, "\)

◆ padded_image_to_fft_cuda_deinitialize()

void padded_image_to_fft_cuda_deinitialize ( )

Definition at line 173 of file reconstruct_fourier_codelet_padded_image_to_fft.cpp.

{
	// Tear down the per-worker cuFFT plans created by
	// padded_image_to_fft_cuda_initialize: runs cuda_deinitialize_fft_plan
	// on every STARPU_CUDA worker thread.
	starpu_execute_on_each_worker(&cuda_deinitialize_fft_plan, nullptr, STARPU_CUDA);
}

◆ padded_image_to_fft_cuda_initialize()

void padded_image_to_fft_cuda_initialize ( uint32_t  paddedImageSize)

Definition at line 169 of file reconstruct_fourier_codelet_padded_image_to_fft.cpp.

{
	// Create a cuFFT plan for paddedImageSize-sized images on every
	// STARPU_CUDA worker. Passing the address of a by-value parameter is
	// safe here because starpu_execute_on_each_worker blocks until all
	// workers have run the callback.
	starpu_execute_on_each_worker(&cuda_initialize_fft_plan, &paddedImageSize, STARPU_CUDA);
}