Xmipp  v3.23.11-Nereus
ExtremaFinder::CudaExtremaFinder< T > Class Template Reference

#include <cuda_single_extrema_finder.h>

Inheritance: ExtremaFinder::CudaExtremaFinder< T > inherits from ExtremaFinder::AExtremaFinder< T >. (Inheritance and collaboration diagrams omitted.)

Public Member Functions

 CudaExtremaFinder ()
 
virtual ~CudaExtremaFinder ()
 
 CudaExtremaFinder (CudaExtremaFinder &o)=delete
 
CudaExtremaFinder & operator= (const CudaExtremaFinder &other)=delete
 
CudaExtremaFinder const & operator= (CudaExtremaFinder &&o)=delete
 
 CudaExtremaFinder (CudaExtremaFinder &&o)
 
template<typename KERNEL >
void findBasic (const T *__restrict__ h_data, const KERNEL &k)
 
template<typename C >
void sFindUniversal (const C &comp, T startVal, const GPU &gpu, const Dimensions &dims, const T *__restrict__ d_data, float *__restrict__ d_positions, T *__restrict__ d_values)
 
- Public Member Functions inherited from ExtremaFinder::AExtremaFinder< T >
 AExtremaFinder ()
 
virtual ~AExtremaFinder ()
 
void init (const ExtremaFinderSettings &settings, bool reuse)
 
void find (const T *data)
 
HW & getHW () const
 
const ExtremaFinderSettings & getSettings () const
 
const std::vector< T > & getValues () const
 
const std::vector< float > & getPositions () const
 

Static Public Member Functions

template<typename C >
static void sFindUniversal (const C &comp, T startVal, const GPU &gpu, const Dimensions &dims, const T *d_data, float *d_positions, T *d_values)
 
static void sFindMax (const GPU &gpu, const Dimensions &dims, const T *d_data, float *d_positions, T *d_values)
 
static void sFindLowest (const GPU &gpu, const Dimensions &dims, const T *d_data, float *d_positions, T *d_values)
 
template<typename C >
static void sFindUniversal2DAroundCenter (const C &comp, T startVal, const GPU &gpu, const Dimensions &dims, const T *data, float *d_positions, T *d_values, size_t maxDist)
 
static void sFindMax2DAroundCenter (const GPU &gpu, const Dimensions &dims, const T *d_data, float *d_positions, T *d_values, size_t maxDist)
 
static void sFindLowest2DAroundCenter (const GPU &gpu, const Dimensions &dims, const T *d_data, float *d_positions, T *d_values, size_t maxDist)
 
static void sRefineLocation (const GPU &gpu, const Dimensions &dims, const float *d_indices, float *d_positions, const T *d_data)
 
static size_t ceilPow2 (size_t x)
 

Additional Inherited Members

- Protected Member Functions inherited from ExtremaFinder::AExtremaFinder< T >
std::vector< T > & getValues ()
 
std::vector< float > & getPositions ()
 
constexpr bool isInitialized () const
 

Detailed Description

template<typename T>
class ExtremaFinder::CudaExtremaFinder< T >

Definition at line 42 of file cuda_single_extrema_finder.h.
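This class is the CUDA-based implementation of ExtremaFinder::AExtremaFinder< T >: for a batch of signals it locates an extreme value (maximum or lowest) in each signal and reports its value and/or position, either through the inherited init()/find() interface or through the static helpers below, which operate directly on device memory. What follows is a minimal, hedged sketch of the static path; the GPU default constructor and the Dimensions(x, y, z, n) constructor are assumptions, since neither is documented on this page.

    // Sketch only: find the maximum of each of n 1D signals that already reside in device memory.
    #include <cuda_runtime.h>
    #include "cuda_single_extrema_finder.h"   // include path assumed

    void findMaximaSketch(const float *d_signals, size_t signalLen, size_t n) {
        GPU gpu;        // assumption: default-constructible
        gpu.set();      // prepare the device/stream, as findBasic() does with its streams
        Dimensions dims(signalLen, 1, 1, n);   // assumption: (x, y, z, n) constructor

        float *d_positions = nullptr;   // one position per signal
        float *d_values = nullptr;      // one value per signal
        cudaMalloc(&d_positions, n * sizeof(float));
        cudaMalloc(&d_values, n * sizeof(float));

        // results are written to device memory; either output pointer may be nullptr
        // if only positions or only values are needed (see the asserts in sFindUniversal())
        ExtremaFinder::CudaExtremaFinder<float>::sFindMax(gpu, dims, d_signals, d_positions, d_values);
        gpu.synch();    // wait for the kernel before using the results

        cudaFree(d_positions);
        cudaFree(d_values);
    }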

Constructor & Destructor Documentation

◆ CudaExtremaFinder() [1/3]

template<typename T>
ExtremaFinder::CudaExtremaFinder< T >::CudaExtremaFinder ( )
inline

Definition at line 45 of file cuda_single_extrema_finder.h.

{
    setDefault();
}

◆ ~CudaExtremaFinder()

template<typename T>
virtual ExtremaFinder::CudaExtremaFinder< T >::~CudaExtremaFinder ( )
inline virtual

Definition at line 49 of file cuda_single_extrema_finder.h.

{
    release();
}

◆ CudaExtremaFinder() [2/3]

template<typename T>
ExtremaFinder::CudaExtremaFinder< T >::CudaExtremaFinder ( CudaExtremaFinder< T > &  o)
delete

◆ CudaExtremaFinder() [3/3]

template<typename T>
ExtremaFinder::CudaExtremaFinder< T >::CudaExtremaFinder ( CudaExtremaFinder< T > &&  o)
inline

Definition at line 56 of file cuda_single_extrema_finder.h.

{
    m_loadStream = o.m_loadStream;
    m_workStream = o.m_workStream;

    // device memory
    m_d_values = o.m_d_values;
    m_d_positions = o.m_d_positions;
    m_d_batch = o.m_d_batch;

    // synch primitives
    m_mutex = o.m_mutex;
    m_cv = o.m_cv;
    m_isDataReady = o.m_isDataReady;

    // host memory
    m_h_batchResult = o.m_h_batchResult;

    // clean original
    o.setDefault();
}

Member Function Documentation

◆ ceilPow2()

template<typename T >
size_t ExtremaFinder::CudaExtremaFinder< T >::ceilPow2 ( size_t  x)
static

Definition at line 334 of file cuda_single_extrema_finder.cpp.

{
    if (x <= 1) return 1;
    int power = 2;
    x--;
    while (x >>= 1) power <<= 1;
    return power;
}
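ceilPow2() rounds its argument up to the nearest power of two; it is used by sFindUniversal() and sFindUniversal2DAroundCenter() when choosing the number of threads per block. A few illustrative values, consistent with the implementation above (the include path in this snippet is an assumption):

    #include <cassert>
    #include "cuda_single_extrema_finder.h"   // include path assumed

    int main() {
        using ExtremaFinder::CudaExtremaFinder;
        assert(CudaExtremaFinder<float>::ceilPow2(1) == 1);   // values <= 1 map to 1
        assert(CudaExtremaFinder<float>::ceilPow2(5) == 8);   // rounded up
        assert(CudaExtremaFinder<float>::ceilPow2(8) == 8);   // exact powers are kept
        assert(CudaExtremaFinder<float>::ceilPow2(9) == 16);
        return 0;
    }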

◆ findBasic()

template<typename T>
template<typename KERNEL >
void ExtremaFinder::CudaExtremaFinder< T >::findBasic (const T *__restrict__ h_data, const KERNEL &k)

Definition at line 206 of file cuda_single_extrema_finder.cpp.

{
    bool isReady = this->isInitialized();
    if ( ! isReady) {
        REPORT_ERROR(ERR_LOGIC_ERROR, "Not ready to execute. Call init() first");
    }
    if ( ! GPU::isMemoryPinned(h_data)) {
        REPORT_ERROR(ERR_LOGIC_ERROR, "Input memory has to be pinned (page-locked)");
    }
    m_workStream->set();
    m_loadStream->set();
    // start loading data in the background
    m_isDataReady = false;
    auto loadingThread = std::thread(&CudaExtremaFinder<T>::loadThreadRoutine, this, h_data);

    auto s = this->getSettings();
    // process signals in batches
    for (size_t offset = 0; offset < s.dims.n(); offset += s.batch) {
        // how many signals to process
        size_t toProcess = std::min(s.batch, s.dims.n() - offset);
        auto batchDims = s.dims.copyForN(toProcess);
        {
            // block until data is loaded
            // mutex will be freed once leaving this block
            std::unique_lock<std::mutex> lk(*m_mutex);
            m_cv->wait(lk, [&]{return m_isDataReady;});
            // call finding kernel
            k(*m_workStream, batchDims, m_d_batch,
                m_d_positions, m_d_values);

            // notify that buffer is processed (new will be loaded in background)
            m_workStream->synch();
            m_isDataReady = false;
            m_cv->notify_one();
        }
        downloadPositionsFromGPU(offset, toProcess);
        downloadValuesFromGPU(offset, toProcess);
    }
    loadingThread.join();
}
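findBasic() overlaps host-to-device transfers (done by a background loading thread) with kernel execution, synchronising the two through the mutex and condition variable. The KERNEL callable is invoked once per batch with the work stream, the batch dimensions and the device buffers, exactly as in the call to k(...) above. A hedged sketch of such a callable, delegating to sFindMax():

    // Sketch only: a kernel functor with the parameter list implied by
    // k(*m_workStream, batchDims, m_d_batch, m_d_positions, m_d_values) above.
    template<typename T>
    struct MaxKernel {
        void operator()(const GPU &workStream, const Dimensions &batchDims,
                        const T *d_batch, float *d_positions, T *d_values) const {
            ExtremaFinder::CudaExtremaFinder<T>::sFindMax(
                    workStream, batchDims, d_batch, d_positions, d_values);
        }
    };

    // usage (the finder must be init()-ialized and h_data pinned, as checked above):
    // finder.findBasic(h_data, MaxKernel<float>());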

◆ operator=() [1/2]

template<typename T>
CudaExtremaFinder& ExtremaFinder::CudaExtremaFinder< T >::operator= ( const CudaExtremaFinder< T > &  other)
delete

◆ operator=() [2/2]

template<typename T>
CudaExtremaFinder const& ExtremaFinder::CudaExtremaFinder< T >::operator= ( CudaExtremaFinder< T > &&  o)
delete

◆ sFindLowest()

template<typename T>
void ExtremaFinder::CudaExtremaFinder< T >::sFindLowest (const GPU &gpu, const Dimensions &dims, const T *d_data, float *d_positions, T *d_values)
static

Definition at line 355 of file cuda_single_extrema_finder.cpp.

{
    return sFindUniversal([] __device__ (T l, T r) { return l < r; },
            std::numeric_limits<T>::max(),
            gpu, dims, d_data, d_positions, d_values);
}

◆ sFindLowest2DAroundCenter()

template<typename T >
void ExtremaFinder::CudaExtremaFinder< T >::sFindLowest2DAroundCenter (const GPU &gpu, const Dimensions &dims, const T *d_data, float *d_positions, T *d_values, size_t maxDist)
static

Definition at line 454 of file cuda_single_extrema_finder.cpp.

{
    return sFindUniversal2DAroundCenter([] __device__ (T l, T r) { return l < r; },
            std::numeric_limits<T>::max(),
            gpu, dims, d_data, d_positions, d_values, maxDist);
}

◆ sFindMax()

template<typename T>
void ExtremaFinder::CudaExtremaFinder< T >::sFindMax (const GPU &gpu, const Dimensions &dims, const T *d_data, float *d_positions, T *d_values)
static

Definition at line 344 of file cuda_single_extrema_finder.cpp.

{
    return sFindUniversal([] __device__ (T l, T r) { return l > r; },
            std::numeric_limits<T>::lowest(),
            gpu, dims, d_data, d_positions, d_values);
}

◆ sFindMax2DAroundCenter()

template<typename T >
void ExtremaFinder::CudaExtremaFinder< T >::sFindMax2DAroundCenter (const GPU &gpu, const Dimensions &dims, const T *d_data, float *d_positions, T *d_values, size_t maxDist)
static

Definition at line 427 of file cuda_single_extrema_finder.cpp.

{
    return sFindUniversal2DAroundCenter([] __device__ (T l, T r) { return l > r; },
            std::numeric_limits<T>::lowest(),
            gpu, dims, d_data, d_positions, d_values, maxDist);
}

◆ sFindUniversal() [1/2]

template<typename T>
template<typename C >
static void ExtremaFinder::CudaExtremaFinder< T >::sFindUniversal (const C &comp, T startVal, const GPU &gpu, const Dimensions &dims, const T *d_data, float *d_positions, T *d_values)
static

◆ sFindUniversal() [2/2]

template<typename T>
template<typename C >
void ExtremaFinder::CudaExtremaFinder< T >::sFindUniversal (const C &comp, T startVal, const GPU &gpu, const Dimensions &dims, const T *__restrict__ d_data, float *__restrict__ d_positions, T *__restrict__ d_values)

Definition at line 367 of file cuda_single_extrema_finder.cpp.

{
    // check input
    assert(dims.sizeSingle() > 0);
    assert(dims.n() > 0);
    assert(nullptr != d_data);
    assert((nullptr != d_positions) || (nullptr != d_values));
    assert(dims.size() <= std::numeric_limits<unsigned>::max()); // indexing overflow in the kernel

    // create threads / blocks
    size_t maxThreads = 512;
    size_t threads = (dims.sizeSingle() < maxThreads) ? ceilPow2(dims.sizeSingle()) : maxThreads;
    dim3 dimBlock(threads, 1, 1);
    dim3 dimGrid(dims.n(), 1, 1);
    auto stream = *(cudaStream_t*)gpu.stream();

    // for each thread, we need two variables in shared memory
    size_t smemSize = 2 * threads * sizeof(T);
    switch (threads) {
        case 512:
            return findUniversal<T, 512><<< dimGrid, dimBlock, smemSize, stream>>>(
                comp, startVal, d_data, d_positions, d_values, dims.sizeSingle());
        case 256:
            return findUniversal<T, 256><<< dimGrid, dimBlock, smemSize, stream>>>(
                comp, startVal, d_data, d_positions, d_values, dims.sizeSingle());
        case 128:
            return findUniversal<T, 128><<< dimGrid, dimBlock, smemSize, stream>>>(
                comp, startVal, d_data, d_positions, d_values, dims.sizeSingle());
        case 64:
            return findUniversal<T, 64><<< dimGrid, dimBlock, smemSize, stream>>>(
                comp, startVal, d_data, d_positions, d_values, dims.sizeSingle());
        case 32:
            return findUniversal<T, 32><<< dimGrid, dimBlock, smemSize, stream>>>(
                comp, startVal, d_data, d_positions, d_values, dims.sizeSingle());
        case 16:
            return findUniversal<T, 16><<< dimGrid, dimBlock, smemSize, stream>>>(
                comp, startVal, d_data, d_positions, d_values, dims.sizeSingle());
        case 8:
            return findUniversal<T, 8><<< dimGrid, dimBlock, smemSize, stream>>>(
                comp, startVal, d_data, d_positions, d_values, dims.sizeSingle());
        case 4:
            return findUniversal<T, 4><<< dimGrid, dimBlock, smemSize, stream>>>(
                comp, startVal, d_data, d_positions, d_values, dims.sizeSingle());
        case 2:
            return findUniversal<T, 2><<< dimGrid, dimBlock, smemSize, stream>>>(
                comp, startVal, d_data, d_positions, d_values, dims.sizeSingle());
        case 1:
            return findUniversal<T, 1><<< dimGrid, dimBlock, smemSize, stream>>>(
                comp, startVal, d_data, d_positions, d_values, dims.sizeSingle());
        default: REPORT_ERROR(ERR_NOT_IMPLEMENTED, "Unsupported number of threads");
    }
}

◆ sFindUniversal2DAroundCenter()

template<typename T >
template<typename C >
void ExtremaFinder::CudaExtremaFinder< T >::sFindUniversal2DAroundCenter (const C &comp, T startVal, const GPU &gpu, const Dimensions &dims, const T *data, float *d_positions, T *d_values, size_t maxDist)
static

Definition at line 468 of file cuda_single_extrema_finder.cpp.

{
    // check input
    assert(dims.is2D());
    assert( ! dims.isPadded());
    assert(dims.sizeSingle() > 0);
    assert(dims.n() > 0);
    assert(nullptr != d_data);
    assert((nullptr != d_positions) || (nullptr != d_values));
    assert(0 < maxDist);
    int xHalf = dims.x() / 2;
    int yHalf = dims.y() / 2;
    assert((2 * xHalf) > maxDist);
    assert((2 * yHalf) > maxDist);

    // prepare threads / blocks
    size_t maxThreads = 512;
    size_t windowWidth = 2 * maxDist;
    // threads should process a single row of the signal
    size_t threads = (windowWidth < maxThreads) ? ceilPow2(windowWidth) : maxThreads;
    dim3 dimBlock(threads, 1, 1);
    dim3 dimGrid(dims.n(), 1, 1);
    auto stream = *(cudaStream_t*)gpu.stream();

    // for each thread, we need two variables in shared memory
    int smemSize = 2 * threads * sizeof(T);
    switch (threads) {
        case 512:
            return findUniversal2DNearCenter<T, 512><<< dimGrid, dimBlock, smemSize, stream>>>(
                comp, startVal, d_data, d_positions, d_values, dims.x(), dims.y(), maxDist);
        case 256:
            return findUniversal2DNearCenter<T, 256><<< dimGrid, dimBlock, smemSize, stream>>>(
                comp, startVal, d_data, d_positions, d_values, dims.x(), dims.y(), maxDist);
        case 128:
            return findUniversal2DNearCenter<T, 128><<< dimGrid, dimBlock, smemSize, stream>>>(
                comp, startVal, d_data, d_positions, d_values, dims.x(), dims.y(), maxDist);
        case 64:
            return findUniversal2DNearCenter<T, 64><<< dimGrid, dimBlock, smemSize, stream>>>(
                comp, startVal, d_data, d_positions, d_values, dims.x(), dims.y(), maxDist);
        case 32:
            return findUniversal2DNearCenter<T, 32><<< dimGrid, dimBlock, smemSize, stream>>>(
                comp, startVal, d_data, d_positions, d_values, dims.x(), dims.y(), maxDist);
        case 16:
            return findUniversal2DNearCenter<T, 16><<< dimGrid, dimBlock, smemSize, stream>>>(
                comp, startVal, d_data, d_positions, d_values, dims.x(), dims.y(), maxDist);
        case 8:
            return findUniversal2DNearCenter<T, 8><<< dimGrid, dimBlock, smemSize, stream>>>(
                comp, startVal, d_data, d_positions, d_values, dims.x(), dims.y(), maxDist);
        case 4:
            return findUniversal2DNearCenter<T, 4><<< dimGrid, dimBlock, smemSize, stream>>>(
                comp, startVal, d_data, d_positions, d_values, dims.x(), dims.y(), maxDist);
        case 2:
            return findUniversal2DNearCenter<T, 2><<< dimGrid, dimBlock, smemSize, stream>>>(
                comp, startVal, d_data, d_positions, d_values, dims.x(), dims.y(), maxDist);
        case 1:
            return findUniversal2DNearCenter<T, 1><<< dimGrid, dimBlock, smemSize, stream>>>(
                comp, startVal, d_data, d_positions, d_values, dims.x(), dims.y(), maxDist);
        default: REPORT_ERROR(ERR_NOT_IMPLEMENTED, "Unsupported number of threads");
    }
}
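The *2DAroundCenter variants restrict the search to a window of 2 * maxDist pixels in each dimension around the centre of every 2D signal, which is why the asserts above require maxDist to be positive and smaller than the signal extent. A hedged call sketch (the Dimensions constructor and the buffer setup are assumptions, as in the earlier sketches):

    // Sketch only: per-image maximum restricted to a 64 x 64 window around the image centre.
    Dimensions dims(512, 512, 1, n);   // n images of 512 x 512 pixels (constructor assumed)
    size_t maxDist = 32;               // search +/- 32 pixels around the centre in x and y
    ExtremaFinder::CudaExtremaFinder<float>::sFindMax2DAroundCenter(
            gpu, dims, d_images, d_positions, d_values, maxDist);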

◆ sRefineLocation()

template<typename T >
void ExtremaFinder::CudaExtremaFinder< T >::sRefineLocation (const GPU &gpu, const Dimensions &dims, const float *d_indices, float *d_positions, const T *d_data)
static

Definition at line 440 of file cuda_single_extrema_finder.cpp.

{
    assert(dims.n() > 0);
    dim3 dimBlock(std::min(dims.n(), 1024LU)); // 1024 is max threads per block, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications__technical-specifications-per-compute-capability
    dim3 dimGrid(dims.n() / 1024 + 1);
    auto stream = *(cudaStream_t*)gpu.stream();
    return refineLocation<T, 3><<< dimGrid, dimBlock, 0, stream>>>(d_indices, d_positions, d_data, dims);
}

The documentation for this class was generated from the following files: