Xmipp  v3.23.11-Nereus
cuda_correlation_computer.cpp
Go to the documentation of this file.
1 /***************************************************************************
2  *
3  * Authors: David Strelak (davidstrelak@gmail.com)
4  *
5  * Unidad de Bioinformatica of Centro Nacional de Biotecnologia , CSIC
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 2 of the License, or
10  * (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
20  * 02111-1307 USA
21  *
22  * All comments concerning this program package may be sent to the
23  * e-mail address 'xmipp@cnb.csic.es'
24  ***************************************************************************/
25 #include <cuda_runtime_api.h>
28 #include "cuda_correlation.cu"
29 
30 template<typename T>
31 template<bool NORMALIZE>
// NOTE(review): the function-signature line (doxygen line 32) was lost in extraction;
// presumably `void CudaCorrelationComputer<T>::computeOneToN()` — confirm against the header.
// Runs the 1:N merit computation: the GPU pass gathers raw sums + correlation index,
// then storeResultOneToN() turns them into figures of merit on the host.
33  if (NORMALIZE) {
// Normalized path: kernel accumulates sum/sumSqr/corr per signal (see
// computeCorrStatOneToNNormalize()); host-side normalization happens later.
34  computeCorrStatOneToNNormalize();
35  } else {
// Non-normalized path is not implemented; check() also rejects it up front.
36  REPORT_ERROR(ERR_NOT_IMPLEMENTED, "Funnily, this should be easier than what we already can do :)");
37  }
// Convert the device results (already copied to m_h_corrRes) into figures of merit.
38  storeResultOneToN<NORMALIZE>();
39 }
40 
41 template<typename T>
42 template<bool NORMALIZE>
// NOTE(review): the function-signature line (doxygen line 43) was lost in extraction;
// presumably `void CudaCorrelationComputer<T>::storeResultOneToN()` — confirm against the header.
// Host-side post-processing: converts the per-signal results in m_h_corrRes
// (filled by the GPU pass) into final figures of merit, one per 'other' signal.
44  auto &res = this->getFiguresOfMerit();
45  const size_t noOfSignals = this->getSettings().otherDims.n();
// number of pixels in a single (2D) signal; used as the normalization factor
46  const size_t elems = this->getSettings().refDims.sizeSingle();
47  if (NORMALIZE) { // we must normalize the output
// Reference statistics were computed once in computeAvgStddevForRef().
48  auto ref = computeStat(m_h_ref_corrRes[0], elems); // single result
// NOTE(review): the normalized branch reads ResRaw and the other branch ResNorm —
// the names look swapped relative to their use; verify against the struct definitions.
49  auto others = (ResRaw*) m_h_corrRes; // array of results
50  for(size_t n = 0; n < noOfSignals; ++n) {
51  auto o = computeStat(others[n], elems);
// Pearson-style normalization: (sum(x*y) - N*avgX*avgY) / (N*stdX*stdY).
// NOTE(review): denom is zero for a constant signal (stddev == 0) -> inf/NaN result.
52  T num = others[n].corr - (o.avg * ref.avg * elems);
53  T denom = o.stddev * ref.stddev * elems;
54  res.at(n) = num / denom;
55  }
56  } else { // we can directly use the correlation index
57  auto others = (ResNorm*) m_h_corrRes; // array of results
58  for(size_t n = 0; n < noOfSignals; ++n) {
// Only divide by the pixel count; no mean/stddev correction.
59  res.at(n) = others[n].corr / elems;
60  }
61  }
62 }
63 
64 template<typename T>
65 template<typename U>
// NOTE(review): the function-signature line (doxygen line 66) was lost in extraction;
// presumably `Stat CudaCorrelationComputer<T>::computeStat(U r, size_t norm)` where U is
// a result struct exposing .sum and .sumSqr — confirm against the header.
// Derives average and standard deviation from accumulated sum / sum-of-squares,
// with 'norm' being the number of accumulated elements.
67  Stat s;
68  s.avg = r.sum / norm;
69  T sumSqrNorm = r.sumSqr / norm;
// E[x^2] - E[x]^2 can go slightly negative due to floating-point cancellation;
// abs() clamps that before the sqrt.
70  s.stddev = sqrt(abs(sumSqrNorm - (s.avg * s.avg)));
71  return s;
72 }
73 
74 template<typename T>
// NOTE(review): the function-signature line (doxygen line 75) was lost in extraction;
// the doxygen index shows `void compute(T *others) override` — confirm against the header.
// Entry point: correlates the preloaded reference against 'others' (device pointer
// expected) and fills this->getFiguresOfMerit().
76  bool isReady = this->isInitialized() && this->isRefLoaded();
77  if ( ! isReady) {
78  REPORT_ERROR(ERR_LOGIC_ERROR, "Not ready to execute. Call init() and load reference");
79  }
// Only GPU-resident input is supported; the pointer is borrowed, not copied.
80  if ( ! m_stream->isGpuPointer(others)) {
81  REPORT_ERROR(ERR_NOT_IMPLEMENTED, "Processing data from host is not yet supported");
82  } else {
83  m_d_others = others;
84  }
85 
86  const auto &s = this->getSettings();
// One figure of merit per 'other' signal.
87  this->getFiguresOfMerit().resize(s.otherDims.n());
88  switch(s.type) {
89  case MeritType::OneToN: {
// Dispatch the normalization decision to a compile-time template flag.
90  if (s.normalizeResult) {
91  computeOneToN<true>();
92  } else {
93  computeOneToN<false>();
94  }
95  break;
96  }
97  default:
98  REPORT_ERROR(ERR_NOT_IMPLEMENTED, "This case is not implemented");
99  }
100 }
101 
102 template<typename T>
// NOTE(review): the function-signature line (doxygen line 103) was lost in extraction;
// the doxygen index shows `void loadReference(const T *ref) override` — confirm against the header.
// Copies the host-side reference signal to the preallocated device buffer m_d_ref
// and, when normalization is requested, precomputes its avg/stddev statistics.
104  const auto &s = this->getSettings();
// A null 'ref' marks the reference as not loaded; compute() checks this flag.
105  this->setIsRefLoaded(nullptr != ref);
106 
107  if (m_stream->isGpuPointer(ref)) {
108  REPORT_ERROR(ERR_NOT_IMPLEMENTED, "Only reference data on CPU are currently supported");
109  } else {
110  size_t bytes = s.refDims.size() * sizeof(T);
// Pin the host buffer (if not already pinned) so the async copy is a true DMA.
111  bool hasToPin = ! m_stream->isMemoryPinned(ref);
112  if (hasToPin) {
113  m_stream->pinMemory(ref, bytes);
114  }
115  auto stream = *(cudaStream_t*)m_stream->stream();
116  // copy data to GPU
117  gpuErrchk(cudaMemcpyAsync(
118  m_d_ref,
119  ref,
120  bytes,
121  cudaMemcpyHostToDevice, stream));
// NOTE(review): unpinning immediately after cudaMemcpyAsync looks racy — the H2D
// transfer on 'stream' may still be in flight when the memory is unregistered.
// A stream synchronization before unpin would be safer, unless unpinMemory()
// synchronizes internally — confirm its implementation.
122  if (hasToPin) {
123  m_stream->unpinMemory(ref);
124  }
125  }
126  if (s.normalizeResult) {
// This also synchronizes the stream (see computeAvgStddevForRef()).
127  computeAvgStddevForRef();
128  } else {
129  // nothing to do, assume that stddev = 1 and avg = 0 for each signal
130  }
131 }
132 
133 template<typename T>
// NOTE(review): the function-signature line (doxygen line 134) was lost in extraction;
// presumably `void CudaCorrelationComputer<T>::computeCorrStatOneToNNormalize()` — confirm.
// GPU pass for the normalized 1:N case: one kernel accumulates, per 'other' signal,
// the raw statistics (packed into a float3/double3 ResRaw), then copies them to the
// pinned host buffer m_h_corrRes and synchronizes so the host may read them.
135  const auto &dims = this->getSettings().otherDims;
136  auto stream = *(cudaStream_t*)m_stream->stream();
137 
138  dim3 dimBlock(64);
// Ceil-divide total columns (x * n signals) by the block size.
// NOTE(review): float ceil-division can misround for very large sizes; the usual
// integer (a + b - 1) / b form would be exact.
139  dim3 dimGrid(
140  ceil((dims.x() * dims.n()) / (float)dimBlock.x));
141  // zero-init the result
142  size_t bytes = dims.n() * sizeof(ResRaw);
// NOTE(review): cudaMemset is issued outside 'stream' (null-stream, synchronous
// w.r.t. the device) while the kernel below runs on 'stream'; cudaMemsetAsync on the
// same stream would keep everything stream-ordered — confirm the intended semantics.
143  gpuErrchk(cudaMemset(m_d_corrRes, 0, bytes)); // reuse data
144 
// Compile-time dispatch to the proper precision instantiation of the kernel.
145  if (std::is_same<T, float>::value) {
146  computeCorrIndexStat2DOneToN<float, float3>
147  <<<dimGrid, dimBlock, 0, stream>>> (
148  (float*)m_d_ref,
149  (float*)m_d_others,
150  dims.x(), dims.y(), dims.n(),
151  (float3*)m_d_corrRes);
152  } else if (std::is_same<T, double>::value) {
153  computeCorrIndexStat2DOneToN<double, double3>
154  <<<dimGrid, dimBlock, 0, stream>>> (
155  (double*)m_d_ref,
156  (double*)m_d_others,
157  dims.x(), dims.y(), dims.n(),
158  (double3*)m_d_corrRes);
159  }
160  // get result from device
161  gpuErrchk(cudaMemcpyAsync(
162  m_h_corrRes,
163  m_d_corrRes, // reuse data
164  bytes,
165  cudaMemcpyDeviceToHost, stream));
// Block until the copy lands; callers read m_h_corrRes right after this returns.
166  m_stream->synch();
167 }
168 
169 template<typename T>
// NOTE(review): the function-signature line (doxygen line 170) was lost in extraction;
// presumably `void CudaCorrelationComputer<T>::computeAvgStddevForRef()` — confirm.
// GPU pass over the reference signal(s): accumulates sum and sum-of-squares
// (packed into a float2/double2 ResRef) per signal, copies the result to the pinned
// host buffer m_h_ref_corrRes, and synchronizes. storeResultOneToN() later turns the
// sums into avg/stddev via computeStat().
171  const auto &dims = this->getSettings().refDims;
172  auto stream = *(cudaStream_t*)m_stream->stream();
173 
174  dim3 dimBlock(64);
// Ceil-divide total columns by block size (same pattern, and same float-rounding
// caveat, as in computeCorrStatOneToNNormalize()).
175  dim3 dimGrid(
176  ceil((dims.x() * dims.n()) / (float)dimBlock.x));
177  // zero-init the result
178  size_t bytes = dims.n() * sizeof(ResRef);
// NOTE(review): null-stream cudaMemset alongside stream-ordered work — see the
// matching note in computeCorrStatOneToNNormalize().
179  gpuErrchk(cudaMemset(m_d_corrRes, 0, bytes)); // reuse data
180  if (std::is_same<T, float>::value) {
181  computeSumSumSqr2D<float, float2>
182  <<<dimGrid, dimBlock, 0, stream>>> (
183  (float*)m_d_ref,
184  dims.x(), dims.y(), dims.n(),
185  (float2*)m_d_corrRes);
186  } else if (std::is_same<T, double>::value) {
187  computeSumSumSqr2D<double, double2>
188  <<<dimGrid, dimBlock, 0, stream>>> (
189  (double*)m_d_ref,
190  dims.x(), dims.y(), dims.n(),
191  (double2*)m_d_corrRes);
192  }
193  // get result from device
194  gpuErrchk(cudaMemcpyAsync(
195  m_h_ref_corrRes,
196  m_d_corrRes, // reuse data
197  bytes,
198  cudaMemcpyDeviceToHost, stream));
// Block until the copy lands; the host reads m_h_ref_corrRes afterwards.
199  m_stream->synch();
200 }
201 
template<typename T>
void CudaCorrelationComputer<T>::initialize(bool doAllocation) {
    // Bind this computer to the GPU described in the settings and (optionally)
    // (re)allocate all device/host buffers. 'doAllocation == false' only rebinds
    // the stream without touching memory.
    const auto &settings = this->getSettings();

    // Drop any buffers from a previous configuration before grabbing new ones.
    if (doAllocation) {
        release();
    }

    // The single HW item is expected to be a GPU stream wrapper.
    m_stream = dynamic_cast<GPU*>(settings.hw.at(0));
    if (nullptr == m_stream) {
        REPORT_ERROR(ERR_LOGIC_ERROR, "Instance of GPU is expected");
    }
    // Make this GPU/stream current before any CUDA calls in allocate().
    m_stream->set();

    if (doAllocation) {
        allocate();
    }
}
219 
220 template<typename T>
// NOTE(review): the function-signature line (doxygen line 221) was lost in extraction;
// presumably `void CudaCorrelationComputer<T>::release()` — confirm against the header.
// Frees all device and host buffers owned by this instance and resets members to
// null via setDefault(). Safe on never-allocated state: cudaFree(nullptr) and
// free(nullptr) are both no-ops.
222  // GPU memory
223  gpuErrchk(cudaFree(m_d_ref));
// m_d_others is only borrowed from the caller (see compute()), so it is not freed.
224 // gpuErrchk(cudaFree(m_d_others));
225  gpuErrchk(cudaFree(m_d_corrRes));
226  // CPU memory
227  if (this->isInitialized()) { // otherwise these pointer are null and unpin is not happy
228  m_stream->unpinMemory(m_h_corrRes);
229  m_stream->unpinMemory(m_h_ref_corrRes);
230  }
// Buffers came from page_aligned_alloc() in allocate(); plain free() releases them.
231  free(m_h_ref_corrRes);
232  free(m_h_corrRes);
233 
234  setDefault();
235 }
236 
237 template<typename T>
// NOTE(review): the function-signature line(s) (doxygen lines 238-239) were lost in
// extraction; presumably `void CudaCorrelationComputer<T>::allocate()` — confirm.
// Allocates the device buffers (reference + per-signal results) and the pinned,
// page-aligned host mirrors of the result buffers. Sizes depend on whether results
// are normalized (ResRaw + ResRef) or not (ResNorm only).
240  const auto& s = this->getSettings();
241 
242  gpuErrchk(cudaMalloc(&m_d_ref, s.refDims.size() * sizeof(T)));
// 'others' data stays owned by the caller (compute() borrows the pointer).
243 // gpuErrchk(cudaMalloc(&m_d_others, s.otherDims.size() * sizeof(T)));
244 
245  size_t bytesResOthers;
246  size_t bytesResRef;
247  if (s.normalizeResult) {
// Normalized path needs raw stats per 'other' plus stats for the reference.
248  bytesResOthers = s.otherDims.n() * sizeof(ResRaw);
249  bytesResRef = s.refDims.n() * sizeof(ResRef);
250  } else {
251  bytesResOthers = s.otherDims.n() * sizeof(ResNorm);
252  bytesResRef = 1; // we actually don't need anything, but this makes code cleaner
253  }
254  gpuErrchk(cudaMalloc(&m_d_corrRes, bytesResOthers));
255 
// Page-aligned + pinned host buffers enable true async D2H copies of the results.
256  m_h_ref_corrRes = (ResRef*)page_aligned_alloc(bytesResRef);
257  memset(m_h_ref_corrRes, 0, bytesResRef); // to make valgrind happy
258  m_stream->pinMemory(m_h_ref_corrRes, bytesResRef);
259 
260  m_h_corrRes = page_aligned_alloc(bytesResOthers);
261  memset(m_h_corrRes, 0, bytesResOthers); // to make valgrind happy
262  m_stream->pinMemory(m_h_corrRes, bytesResOthers);
263 }
264 
265 template<typename T>
// NOTE(review): the function-signature line (doxygen line 266) was lost in extraction;
// presumably `void CudaCorrelationComputer<T>::setDefault()` — confirm against the header.
// Resets all owned/borrowed pointers to null; does NOT free anything (release()
// must run first when buffers are held).
267  // GPU memory
268  m_d_ref = nullptr;
269  m_d_others = nullptr;
270  m_d_corrRes = nullptr;
271  // CPU memory
272  m_h_ref_corrRes = nullptr;
273  m_h_corrRes = nullptr;
274  // others
275  m_stream = nullptr;
276 }
277 
278 template<typename T>
// NOTE(review): the function-signature line (doxygen line 279) was lost in extraction;
// from the body this appears to be a reuse check such as
// `bool CudaCorrelationComputer<T>::canBeReusedImpl(const Settings &s) const` — confirm.
// Decides whether the current allocation can serve new settings 's': the merit type
// must match and the previous 'others' buffer must be at least as large.
280  bool result = true;
281  if ( ! this->isInitialized()) {
282  return false;
283  }
284  auto &sOrig = this->getSettings();
285  result = result && sOrig.type == s.type;
286  result = result && (sOrig.otherDims.size() >= s.otherDims.size()); // previously, we needed more space
287 
288  return result;
289 }
290 
291 template<typename T>
// NOTE(review): the function-signature line (doxygen line 292) was lost in extraction;
// presumably `void CudaCorrelationComputer<T>::check()` — confirm against the header.
// Validates the settings against the currently supported feature set; each
// violation aborts via REPORT_ERROR.
293  const auto &s = this->getSettings();
294  if (1 != s.hw.size()) {
295  REPORT_ERROR(ERR_LOGIC_ERROR, "Single GPU (stream) expected");
296  }
// Matches computeOneToN(): only the normalized path is implemented.
297  if ( ! s.normalizeResult) {
298  REPORT_ERROR(ERR_LOGIC_ERROR, "Non-normalized output not yet supported");
299  }
300  if (MeritType::OneToN != s.type) {
301  REPORT_ERROR(ERR_LOGIC_ERROR, "Only 1:N correlations are currently supported");
302  }
303  if ( ! s.otherDims.is2D()) {
304  REPORT_ERROR(ERR_NOT_IMPLEMENTED, "Only 2D input is currently supported");
305  }
306 }
307 
// explicit instantiation
// All template member definitions live in this .cpp, so the two supported
// precisions are instantiated here for the linker.
309 template class CudaCorrelationComputer<float>;
310 template class CudaCorrelationComputer<double>;
#define gpuErrchk(code)
Definition: cuda_asserts.h:31
CUDA_HD constexpr bool is2D() const
Definition: dimensions.h:162
void * page_aligned_alloc(size_t bytes)
Definition: memory_utils.h:66
Case or algorithm not implemented yet.
Definition: xmipp_error.h:177
#define REPORT_ERROR(nerr, ErrormMsg)
Definition: xmipp_error.h:211
void sqrt(Image< double > &op)
void abs(Image< double > &op)
T norm(const std::vector< T > &v)
Definition: vector_ops.h:399
MeritType type
std::vector< HW * > hw
free((char *) ob)
void loadReference(const T *ref) override
Dimensions otherDims
constexpr size_t size() const
Definition: dimensions.h:104
Definition: gpu.h:36
struct stat Stat
void compute(T *others) override
int * n
Some logical error in the pipeline.
Definition: xmipp_error.h:147