Xmipp  v3.23.11-Nereus
movie_alignment_correlation_gpu.cpp
Go to the documentation of this file.
1 /***************************************************************************
2  *
3  * Authors: David Strelak (davidstrelak@gmail.com)
4  *
5  * Unidad de Bioinformatica of Centro Nacional de Biotecnologia , CSIC
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 2 of the License, or
10  * (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
20  * 02111-1307 USA
21  *
22  * All comments concerning this program package may be sent to the
23  * e-mail address 'xmipp@cnb.csic.es'
24  ***************************************************************************/
25 
28 #include "core/userSettings.h"
31 #include <CTPL/ctpl_stl.h>
32 
33 
34 
35 template<typename T>
38  this->addParamsLine(" [--device <dev=0>] : GPU device to use. 0th by default");
39  this->addParamsLine(" [--storage <fn=\"\">] : Path to file that can be used to store results of the benchmark");
40  this->addParamsLine(" [--patchesAvg <avg=3>] : Number of near frames used for averaging a single patch");
41  this->addExampleLine(
42  "xmipp_cuda_movie_alignment_correlation -i movie.xmd --oaligned alignedMovie.stk --oavg alignedMicrograph.mrc --device 0");
43  this->addSeeAlsoLine("xmipp_movie_alignment_correlation");
44 }
45 
46 template<typename T>
49  std::cout << "Device: " << mGpu.value().device() << " (" << mGpu.value().getUUID() << ")" << "\n";
50  std::cout << "Benchmark storage " << (storage.empty() ? "Default" : storage) << "\n";
51  std::cout << "Patches avg: " << patchesAvg << "\n";
52 }
53 
54 template<typename T>
57 
58  // read GPU
59  int device = this->getIntParam("--device");
60  if (device < 0) REPORT_ERROR(ERR_ARG_INCORRECT, "Invalid GPU device");
61  mGpu = core::optional<GPU>(device);
62  mGpu.value().set();
63 
64  // read permanent storage
65  storage = this->getParam("--storage");
66 
67  // read patch averaging
68  patchesAvg = this->getIntParam("--patchesAvg");
69  if (patchesAvg < 1) REPORT_ERROR(ERR_ARG_INCORRECT, "Patch averaging has to be at least 1 (one).");
70 }
71 
72 template<typename T>
74  const bool crop = true;
75  if (auto optDim = getStoredSizes(ref, crop); optDim) {
76  return optDim.value().copyForN(ref.n());
77  }
78  std::cout << "Benchmarking cuFFT ..." << std::endl;
79  auto hint = FFTSettings<T>(ref.createSingle()); // movie frame is big enought to give us an idea
80  auto candidate = std::unique_ptr<FFTSettings<T>>(CudaFFT<T>::findOptimal(mGpu.value(), hint, 0, hint.sDim().x() == hint.sDim().y(), 10, crop, true));
81  if (!candidate) {
82  REPORT_ERROR(ERR_GPU_MEMORY, "Insufficient GPU memory for processing a single frame of the movie.");
83  }
84  storeSizes(ref, candidate->sDim(), crop);
85  return candidate->sDim().copyForN(ref.n());
86 }
87 
88 template<typename T>
90  const bool crop = false;
91  if (auto optDim = this->getStoredSizes(ref, crop); optDim) {
92  return optDim.value().copyForN(ref.n());
93  }
94  std::cout << "Benchmarking cuFFT ..." << std::endl;
95  auto hint = FFTSettings<T>(ref, 1, false, false);
96  auto candidate = std::unique_ptr<FFTSettings<T>>(CudaFFT<T>::findOptimal(mGpu.value(), hint, 0, hint.sDim().x() == hint.sDim().y(), 20, crop, true));
97  if (!candidate) {
98  REPORT_ERROR(ERR_GPU_MEMORY, "Insufficient GPU memory for processing a correlations of the movie.");
99  }
100  this->storeSizes(ref, candidate->sDim(), crop);
101  return candidate->sDim().copyForN(ref.n());
102 }
103 
104 template<typename T>
106  const bool crop = false;
107  const auto reqPatchSize = this->getRequestedPatchSize();
108  auto ref = Dimensions(reqPatchSize.first, reqPatchSize.second, 1, this->getMovieSize().n());
109  if (auto optDim = this->getStoredSizes(ref, crop); optDim) {
110  return optDim.value().copyForN(ref.n());
111  }
112  std::cout << "Benchmarking cuFFT ..." << std::endl;
113  auto hint = FFTSettings<T>(ref);
114  auto candidate = std::unique_ptr<FFTSettings<T>>(CudaFFT<T>::findOptimal(mGpu.value(), hint, 0, hint.sDim().x() == hint.sDim().y(), 20, crop, true));
115  if (!candidate) {
116  REPORT_ERROR(ERR_GPU_MEMORY, "Insufficient GPU memory for processing a correlations of the movie.");
117  }
118  this->storeSizes(ref, candidate->sDim(), crop);
119  return candidate->sDim();
120 }
121 
122 template<typename T>
124  auto getNearestEven = [] (size_t v, T minScale, size_t shift) { // scale is less than 1
125  size_t size = 2; // to get even sizes
126  while ((size / (float)v) < minScale) {
127  size += 2;
128  }
129  return size;
130  };
131  const T requestedScale = this->getScaleFactor();
132  // hint, possibly bigger then requested, so that it fits max shift window
133  Dimensions hint(getNearestEven(d.x(), requestedScale, static_cast<size_t>(this->maxShift)),
134  getNearestEven(d.y(), requestedScale, static_cast<size_t>(this->maxShift)),
135  d.z(), d.n());
136  return hint;
137 }
138 
139 template<typename T>
140 std::vector<FramePatchMeta<T>> ProgMovieAlignmentCorrelationGPU<T>::getPatchesLocation(
141  const std::pair<T, T> &borders, const Dimensions &patch) {
142  size_t patchesX = this->localAlignPatches.first;
143  size_t patchesY = this->localAlignPatches.second;
144  T windowXSize = this->getMovieSize().x() - 2 * borders.first;
145  T windowYSize = this->getMovieSize().y() - 2 * borders.second;
146  T corrX = std::ceil(((patchesX * patch.x()) - windowXSize) / (T) (patchesX - 1));
147  T corrY = std::ceil(((patchesY * patch.y()) - windowYSize) / (T) (patchesY - 1));
148  T stepX = (T)patch.x() - corrX;
149  T stepY = (T)patch.y() - corrY;
150  std::vector<FramePatchMeta<T>> result;
151  for (size_t y = 0; y < patchesY; ++y) {
152  for (size_t x = 0; x < patchesX; ++x) {
153  T tlx = borders.first + x * stepX; // Top Left
154  T tly = borders.second + y * stepY;
155  T brx = tlx + patch.x() - 1; // Bottom Right
156  T bry = tly + patch.y() - 1; // -1 for indexing
157  Point2D<T> tl(tlx, tly);
158  Point2D<T> br(brx, bry);
159  Rectangle<Point2D<T>> r(tl, br);
160  result.emplace_back(FramePatchMeta<T> { .rec = r, .id_x = x, .id_y = y });
161  }
162  }
163  return result;
164 }
165 
166 template<typename T>
168  const AlignmentResult<T> &globAlignment, T *result) {
169  const auto &movieDim = movie.getDim();
170  const int n = static_cast<int>(movieDim.n());
171  const auto patchSize = patch.getSize();
172  const auto bufferBytes = patchSize.x * sizeof(T);
173  auto buffer = std::make_unique<T[]>(patchSize.x); // faster than memory manager
174  for (int t = 0; t < n; ++t) {// for each patch
175  for (int y = 0; y < patchSize.y; ++y) { // for each row
176  bool copy = true;
177  // while averaging odd num of frames, use copy equally from previous and following frames
178  // otherwise prefer following frames
179  for (int f = std::max(0, t - ((patchesAvg - 1) / 2)); f <= std::min(n - 1, t + (patchesAvg / 2)); ++f) {
180  const auto *frame = movie.getFrame(f).data;
181  const auto xShift = static_cast<int>(std::round(globAlignment.shifts[f].x));
182  const auto yShift = static_cast<int>(std::round(globAlignment.shifts[f].y));
183  // notice we don't test any boundaries - it should always be within the range of the frame
184  // see implementation of patch position generation and frame border computation
185  const int srcY = patch.tl.y + y + yShift;
186  const int srcX = patch.tl.x + xShift;
187  auto *src = frame + srcY * movieDim.x() + srcX;
188  if (copy) {
189  memcpy(buffer.get(), src, bufferBytes);
190  } else {
191  for (int x = 0; x < patchSize.x; ++x) {
192  buffer[x] += src[x];
193  }
194  }
195  copy = false;
196  }
197  const int patchOffset = t * patchSize.x * patchSize.y;
198  const int destIndex = patchOffset + y * patchSize.x;
199  memcpy(result + destIndex, buffer.get(), bufferBytes); // write result
200  }
201  }
202 }
203 
204 template<typename T>
206  const AlignmentResult<T> &globAlignment, int verbose) {
207  T minX = std::numeric_limits<T>::max();
208  T maxX = std::numeric_limits<T>::min();
209  T minY = std::numeric_limits<T>::max();
210  T maxY = std::numeric_limits<T>::min();
211  for (const auto& s : globAlignment.shifts) {
212  minX = std::min(std::floor(s.x), minX);
213  maxX = std::max(std::ceil(s.x), maxX);
214  minY = std::min(std::floor(s.y), minY);
215  maxY = std::max(std::ceil(s.y), maxY);
216  }
217  auto res = std::make_pair(std::abs(maxX - minX), std::abs(maxY - minY));
218  if (verbose > 1) {
219  std::cout << "Movie borders: x=" << res.first << " y=" << res.second << std::endl;
220  }
221  return res;
222 }
223 
224 template<typename T>
226  const auto pSize = findGoodPatchSize();
227  LASP.doBinning = false;
228  LASP.raw = Dimensions(0);
229  LASP.movie = pSize;
230  const auto cSize = this->findGoodCorrelationSize(this->getCorrelationHint(pSize));
231  LASP.out = LACP.dim = cSize;
232  const auto maxBytes = static_cast<size_t>(static_cast<float>(mGpu.value().lastFreeBytes()) * 0.9f); // leave some buffer in case of memory fragmentation
233  auto getMemReq = [&LASP=LASP, &LACP=LACP, &gpu=mGpu.value()]() {
234  auto scale = CUDAFlexAlignScale<T>(LASP, gpu).estimateBytes();
235  auto correlate = CUDAFlexAlignCorrelate<T>(LACP, gpu).estimateBytes();
236  return scale + correlate;
237  };
238  auto cond = [&LASP=LASP]() {
239  // we want only no. of batches that can process the patches without extra invocations
240  return 0 == LASP.movie.n() % LASP.batch;
241  };
242 
243  // we're gonna run both scaling and correlation in two streams to overlap memory transfers and computations
244  // more streams do not make sense because we're limited by the transfers
245  size_t bufferDivider = 1;
246  size_t corrBatchDivider = 0;
247  const size_t correlations = cSize.n() * (cSize.n() - 1) / 2;
248  do {
249  corrBatchDivider++;
250  auto corrBatch = correlations / corrBatchDivider;
251  LACP.batch = corrBatch;
252  if ((pSize.n() / bufferDivider) > corrBatch) {
253  bufferDivider += (bufferDivider == 1) ? 2 : 1; // we use two buffers, so we need the same memory for batch == 1 and == 2
254  }
255  LACP.bufferSize = pSize.n() / bufferDivider;
256  for (auto scaleBatch = pSize.n(); scaleBatch > 0; --scaleBatch) {
257  LASP.batch = scaleBatch;
258  if (cond() && getMemReq() <= maxBytes) {
259  return;
260  }
261  }
262  } while (true);
263 }
264 
265 template<typename T>
266 T* ProgMovieAlignmentCorrelationGPU<T>::setFilter(float scale, const Dimensions &dims) {
267  MultidimArray<T> tmp = this->createLPF(this->getPixelResolution(scale), dims);
268  auto *filter = reinterpret_cast<T*>(BasicMemManager::instance().get(tmp.nzyxdim *sizeof(T), MemType::CUDA_MANAGED));
269  memcpy(filter, tmp.data, tmp.nzyxdim *sizeof(T));
270  return filter;
271 }
272 
273 template<typename T>
275  typename CUDAFlexAlignScale<T>::Params &SP,
276  typename CUDAFlexAlignCorrelate<T>::Params &CP, int streams, int threads) {
277  if (this->verbose) {
278  std::cout << "Actual scale factor (X): " << scale << "\n";
279  std::cout << "Size of the patch : " << SP.movie << "\n";
280  std::cout << "Size of the correlation: " << SP.out << "\n";
281  std::cout << "Correlation batch : " << CP.batch << "\n";
282  std::cout << "GPU streams : " << streams << "\n";
283  std::cout << "CPU threads : " << threads << "\n";
284  }
285 }
286 
287 
288 template<typename T>
290  const MetaData &movieMD, const Image<T> &dark, const Image<T> &igain,
291  const AlignmentResult<T> &globAlignment) {
292  LAOptimize();
293  const auto movieSize = this->getMovieSize();
294  if ((movieSize.x() < LASP.movie.x())
295  || (movieSize.y() < LASP.movie.y())) {
296  REPORT_ERROR(ERR_PARAM_INCORRECT, "Movie is too small for local alignment.");
297  }
298 
299  auto borders = getMovieBorders(globAlignment, this->verbose > 1);
300  auto patchesLocation = this->getPatchesLocation(borders, LASP.movie);
301  const auto actualScale = static_cast<float>(LASP.out.x()) / static_cast<float>(LASP.movie.x()); // assuming we use square patches
302 
303  auto streams = std::array<GPU, 2>{GPU(mGpu.value().device(), 1), GPU(mGpu.value().device(), 2)};
304  auto loadPool = ctpl::thread_pool(static_cast<int>(std::min(this->localAlignPatches.first, this->localAlignPatches.second)));
305 
306  report(actualScale, LASP, LACP, sizeof(streams) / sizeof(streams[0]), loadPool.size());
307 
308  // prepare memory
309  auto *filter = setFilter(actualScale, LASP.out);
310  std::vector<float*> corrPositions(loadPool.size()); // initializes to nullptrs
311  std::vector<T*> patches(loadPool.size()); // initializes to nullptrs
312  std::vector<std::complex<T>*> scalledPatches(loadPool.size()); // initializes to nullptrs
313 
314  // prepare control structures
315  const AlignmentContext context {
316  .verbose = this->verbose,
317  .maxShift = this->maxShift * actualScale,
318  .N = LASP.movie.n(),
319  .scale = std::make_pair(LASP.movie.x() / (T) LASP.out.x(), LASP.movie.y() / (T) LASP.out.y()),
320  .refFrame = globAlignment.refFrame,
321  .out = LASP.out,
322  };
323  auto mutexes = std::array<std::mutex, 2>(); // one for each step, as they need to happen 'atomically'
324  streams[0].set(); streams[1].set();
325  auto scaler = CUDAFlexAlignScale<T>(LASP, streams[0]);
326  scaler.init();
327  auto correlater = CUDAFlexAlignCorrelate<T>(LACP, streams[1]);
328  correlater.init();
329 
330  // prepare result
331  LocalAlignmentResult<T> result { globalHint:globAlignment, movieDim:movieSize};
332  result.shifts.reserve(patchesLocation.size() * movieSize.n());
333  std::vector<std::future<void>> futures;
334  futures.reserve(patchesLocation.size());
335 
336  // helper lambdas
337  auto alloc = [](size_t bytes) {
338  // it's faster to allocate CPU memory and then pin it, because registering can run in parallel
340  GPU::pinMemory(ptr, bytes);
341  return ptr;
342  };
343 
344  // process each patch
345  for (auto &p : patchesLocation) {
346  // prefill some info about patch
347  const auto shiftsOffset = result.shifts.size();
348  for (size_t i = 0;i < movieSize.n();++i) {
349  // keep this consistent with data loading
350  float globShiftX = std::round(globAlignment.shifts.at(i).x);
351  float globShiftY = std::round(globAlignment.shifts.at(i).y);
352  p.id_t = i;
353  // total shift (i.e. global shift + local shift) will be computed later on
354  result.shifts.emplace_back(p, Point2D<T>(globShiftX, globShiftY));
355  }
356  // parallel routine
357  auto routine = [&, p, shiftsOffset](int thrId) { // p and shiftsOffset by copy to avoid race condition
358  // alllocate memory for patch data
359  if (nullptr == patches.at(thrId)) {
360  patches[thrId] = reinterpret_cast<T*>(alloc(scaler.getMovieSettings().sBytes()));
361  scalledPatches[thrId] = reinterpret_cast<std::complex<T>*>(alloc(scaler.getOutputSettings().fBytes()));
362  }
363  auto *data = patches.at(thrId);
364 
365  // get data
366  getPatchData(p.rec, globAlignment, data);
367 
368  // downscale patches, result is in FD
369  for (auto i = 0; i < LASP.movie.n(); i += LASP.batch) {
370  std::unique_lock lock(mutexes[0]);
371  scaler.run(nullptr,
372  data + i * scaler.getMovieSettings().sDim().sizeSingle(),
373  scalledPatches[thrId] + i * scaler.getOutputSettings().fDim().sizeSingle(),
374  filter);
375  }
376  scaler.synch();
377 
378  // allocate memory for positions of the correlation maxima
379  if (nullptr == corrPositions.at(thrId)) {
380  auto bytes = context.alignmentBytes();
381  corrPositions[thrId] = reinterpret_cast<T*>(alloc(bytes));
382  memset(corrPositions[thrId], 0, bytes); // otherwise valgrind complains
383  }
384  auto &correlations = corrPositions.at(thrId);
385 
386  // compute correlations
387  {
388  std::unique_lock lock(mutexes[1]);
389  correlater.run(scalledPatches[thrId], correlations, context.maxShift);
390  correlater.synch();
391  }
392 
393  // compute resulting shifts
394  auto res = computeAlignment(correlations, context);
395  for (size_t i = 0;i < context.N;++i) {
396  // update total shift (i.e. global shift + local shift)
397  result.shifts[i + shiftsOffset].second += res.shifts[i];
398  }
399  };
400  futures.emplace_back(loadPool.push(routine));
401  }
402 
403  // wait till everything is done
404  for (auto &f : futures) { f.get(); }
405 
406  // clean the memory
407  for (auto *ptr : corrPositions) {
408  GPU::unpinMemory(ptr);
410  for (auto *ptr : patches) {
411  GPU::unpinMemory(ptr);
413  }
414  for (auto *ptr : scalledPatches) {
415  GPU::unpinMemory(ptr);
417  }
420 
421  // compute coefficients for BSpline
422  auto coeffs = BSplineHelper::computeBSplineCoeffs(movieSize, result,
423  this->localAlignmentControlPoints, this->localAlignPatches,
424  this->verbose, this->solverIterations);
425  result.bsplineRep = core::optional<BSplineGrid<T>>(
426  BSplineGrid<T>(this->localAlignmentControlPoints, coeffs.first, coeffs.second));
427 
428  return result;
429 }
430 
431 template<typename T>
433  const AlignmentResult<T> &globAlignment) {
434  // this is basically the computeLocalAlignment method without actuall computing
435  // we use the value of global shift for all patches instead
436  LAOptimize();
437  const auto movieSize = this->getMovieSize();
438  const auto borders = getMovieBorders(globAlignment, this->verbose > 1);
439  auto patchesLocation = this->getPatchesLocation(borders, LASP.movie);
440  LocalAlignmentResult<T> result { globalHint:globAlignment, movieDim:movieSize };
441  // get alignment for all patches
442  for (auto &p : patchesLocation) {
443  for (size_t i = 0; i < movieSize.n(); ++i) {
444  p.id_t = i;
445  result.shifts.emplace_back(p, Point2D<T>(globAlignment.shifts.at(i).x, globAlignment.shifts.at(i).y));
446  }
447  }
448 
449  auto coeffs = BSplineHelper::computeBSplineCoeffs(movieSize, result,
450  this->localAlignmentControlPoints, this->localAlignPatches,
451  this->verbose, this->solverIterations);
452  result.bsplineRep = core::optional<BSplineGrid<T>>(
453  BSplineGrid<T>(this->localAlignmentControlPoints, coeffs.first, coeffs.second));
454 
455  return result;
456 }
457 
458 template<typename T>
460  const MetaData& movieMD, const Image<T>& dark, const Image<T>& igain,
461  Image<T>& initialMic, size_t& Ninitial, Image<T>& averageMicrograph,
462  size_t& N, const AlignmentResult<T> &globAlignment) {
463  applyShiftsComputeAverage(movieMD, dark, igain, initialMic, Ninitial, averageMicrograph,
464  N, localFromGlobal(globAlignment));
465 }
466 
467 template <typename T>
469 {
470  mGpu.value().updateMemoryInfo();
471  const auto freeBytes = mGpu.value().lastFreeBytes();
472  const auto frameBytes = movie.getDim().xy() * sizeof(T);
473  auto streams = std::min(4UL, freeBytes / (frameBytes * 2)); // // upper estimation is 2 full frames of GPU data per stream
474  if (this->verbose > 1)
475  std::cout << "GPU streams used for output generation: " << streams << "\n";
476  return static_cast<int>(streams);
477 }
478 
479 template<typename T>
481  const MetaData& movieMD, const Image<T>& dark, const Image<T>& igain,
482  Image<T>& initialMic, size_t& Ninitial, Image<T>& averageMicrograph,
483  size_t& N, const LocalAlignmentResult<T> &alignment) {
484  Ninitial = N = 0;
485  if ( ! alignment.bsplineRep) {
487  "Missing BSpline representation. This should not happen. Please contact developers.");
488  }
489 
490  struct AuxData {
491  MultidimArray<T> shiftedFrame;
492  GeoTransformer<T> transformer;
493  GPU stream;
494  T *hOut;
495  };
496 
497  auto coeffs = std::make_pair(alignment.bsplineRep.value().getCoeffsX(),
498  alignment.bsplineRep.value().getCoeffsY());
499 
500  // prepare data for each thread
501  auto pool = ctpl::thread_pool(getOutputStreamCount());
502  auto aux = std::vector<AuxData>(pool.size());
503  auto futures = std::vector<std::future<void>>();
504  for (auto i = 0; i < pool.size(); ++i) {
505  aux[i].stream = GPU(mGpu.value().device(), i + 1);
506  }
507 
508  int frameIndex = -1;
509  auto mutexes = std::array<std::mutex, 3>(); // protecting access to unique resourcese
511  {
512  frameIndex++;
513  if ((frameIndex >= this->nfirstSum) && (frameIndex <= this->nlastSum)) {
514  // user might want to align frames 3..10, but sum only 4..6
515  // by deducting the first frame that was aligned, we get proper offset to the stored memory
516  auto routine = [&, frameIndex](int threadId) { // all by reference, frameIndex by copy
517  int frameOffset = frameIndex - this->nfirst;
518  auto &a = aux[threadId];
519  a.stream.set();
520  auto &frame = movie.getFrame(frameIndex);
521 
522  if ( ! this->fnInitialAvg.isEmpty()) {
523  std::unique_lock lock(mutexes[0]);
524  if (0 == initialMic().yxdim)
525  initialMic() = frame;
526  else
527  initialMic() += frame;
528  Ninitial++;
529  }
530 
531  if ( ! this->fnAligned.isEmpty() || ! this->fnAvg.isEmpty()) {
532  if (nullptr == a.hOut) {
533  a.hOut = reinterpret_cast<T*>(BasicMemManager::instance().get(frame.yxdim * sizeof(T), MemType::CUDA_HOST));
534  }
535  auto shiftedFrame = MultidimArray<T>(1, 1, frame.ydim, frame.xdim, a.hOut);
536  a.transformer.initLazyForBSpline(frame.xdim, frame.ydim, alignment.movieDim.n(),
537  this->localAlignmentControlPoints.x(), this->localAlignmentControlPoints.y(), this->localAlignmentControlPoints.n(), a.stream);
538  a.transformer.applyBSplineTransform(3, shiftedFrame, frame, coeffs, frameOffset);
539  a.stream.synch(); // make sure that data is fetched from GPU
540 
541  if (this->fnAligned != "") {
542  std::unique_lock lock(mutexes[1]);
543  Image<T> tmp(shiftedFrame);
544  tmp.write(this->fnAligned, frameOffset + 1, true,
545  WRITE_REPLACE);
546  }
547  if (this->fnAvg != "") {
548  std::unique_lock lock(mutexes[2]);
549  if (0 == averageMicrograph().yxdim)
550  averageMicrograph() = shiftedFrame;
551  else
552  averageMicrograph() += shiftedFrame;
553  N++;
554  }
555  }
556 
557  };
558  futures.emplace_back(pool.push(routine));
559  }
560  }
561  for (auto &t : futures) {
562  t.get();
563  }
564  for (auto i = 0; i < pool.size(); ++i) {
565  BasicMemManager::instance().give(aux[i].hOut);
566  }
567 }
568 
569 
570 template<typename T>
572  const Dimensions &opt, bool applyCrop) {
573  auto single = orig.copyForN(1);
574  UserSettings::get(storage).insert(*this, getKey(optSizeXStr, single, applyCrop), opt.x());
575  UserSettings::get(storage).insert(*this, getKey(optSizeYStr, single, applyCrop), opt.y());
576  UserSettings::get(storage).store(); // write changes immediately
577 }
578 
579 template<typename T>
581  const Dimensions &dim, bool applyCrop) {
582  size_t x, y, batch;
583  auto single = dim.copyForN(1);
584  bool res = true;
585  res = res && UserSettings::get(storage).find(*this, getKey(optSizeXStr, single, applyCrop), x);
586  res = res && UserSettings::get(storage).find(*this, getKey(optSizeYStr, single, applyCrop), y);
587  if (res) {
588  return std::optional(Dimensions(x, y, 1, dim.n()));
589  } else {
590  return {};
591  }
592 }
593 
594 
595 template<typename T>
597  const auto rSize = this->getMovieSizeRaw();
598  GASP.batch = 1; // always 1
599  GASP.raw = rSize;
600  GASP.doBinning = this->applyBinning();
601  const auto mSize = GASP.doBinning ? movie.getDim() : findGoodCropSize(rSize);
602  GASP.movie = mSize;
603  const auto cSize = findGoodCorrelationSize(getCorrelationHint(movie.getDim()));
604  GASP.out = GACP.dim = cSize;
605  const auto maxBytes = static_cast<size_t>(static_cast<float>(mGpu.value().lastFreeBytes()) * 0.9f); // leave some buffer in case of memory fragmentation
606  // first, we prepare all frames for correlation - we crop / binned them, convert to FD and then crop again
607  // ideally we want two streams to overlap memory transfers and computations
608  // more streams do not make sense because we're limited by the transfers
609  GAStreams = 2;
610  if (GAStreams * CUDAFlexAlignScale<T>(GASP, mGpu.value()).estimateBytes() >= maxBytes) {
611  GAStreams = 1;
612  }
613  if (CUDAFlexAlignScale<T>(GASP, mGpu.value()).estimateBytes() >= maxBytes) {
614  REPORT_ERROR(ERR_GPU_MEMORY, "Insufficient GPU memory for processing global alignment.");
615  }
616  // once that is done, we compute correlation of all frames
617  size_t bufferDivider = 1;
618  size_t batchDivider = 0;
619  const size_t correlations = cSize.n() * (cSize.n() - 1) / 2;
620  do {
621  batchDivider++;
622  auto batch = correlations / batchDivider;
623  GACP.batch = batch;
624  if ((mSize.n() / bufferDivider) > batch) {
625  bufferDivider += (bufferDivider == 1) ? 2 : 1; // we use two buffers, so we need the same memory for batch == 1 and == 2
626  }
627  GACP.bufferSize = mSize.n() / bufferDivider;
628  } while (CUDAFlexAlignCorrelate<T>(GACP, mGpu.value()).estimateBytes() >= maxBytes);
629 }
630 
631 
632 template<typename T>
634  const MetaData &movieMD, const Image<T> &dark, const Image<T> &igain) {
635  // prepare storage for the movie
636  movie.set(this->getMovieSize());
637  // find optimal settings
638  GAOptimize();
639  const auto noOfThreads = 2 * GAStreams; // to keep them saturated
640 
641  const auto actualScale = static_cast<float>(GASP.out.x()) / static_cast<float>(this->getMovieSize().x());
642  report(actualScale, GASP, GACP, GAStreams, noOfThreads);
643 
644 
645  // prepare control structures
646  auto cpuPool = ctpl::thread_pool(noOfThreads);
647  auto gpuPool = ctpl::thread_pool(GAStreams);
648  auto scalers = std::vector<CUDAFlexAlignScale<T>>();
649  auto streams = std::vector<GPU>();
650  streams.reserve(gpuPool.size());
651  scalers.reserve(gpuPool.size());
652  for (auto i = 0; i < GAStreams; ++i) { // fill vectors in serial fashion
653  streams.emplace_back(mGpu.value().device(), i + 1);
654  scalers.emplace_back(GASP, streams[i]);
655  gpuPool.push([&streams, &scalers, i](int) {
656  // allocate memory in parallel
657  streams[i].set();
658  scalers[i].init();
659  });
660  }
661 
662  // prepare memory
663  auto *filter = setFilter(actualScale, GASP.out);
664  auto *scaledFrames = reinterpret_cast<std::complex<T>*>(BasicMemManager::instance().get(scalers[0].getOutputSettings().fBytes(), MemType::CPU_PAGE_ALIGNED));
665  const auto frameBytes = scalers[0].getMovieSettings().sBytesSingle(); // either it's the binned size, or cropped size
666 
667  // load movie, perform binning, FT, and downscale
668  for (auto i = 0; i < GASP.movie.n(); i += GASP.batch) {
669  auto routine = [&, index=i](int) // all by reference, i by copy
670  {
671  auto *rawFrame = loadFrame(movieMD, dark, igain, index);
672  auto *frame = reinterpret_cast<T *>(BasicMemManager::instance().get(frameBytes, MemType::CUDA_HOST));
673  if (this->applyBinning()) {
674  movie.setFrameData(index, frame); // we want to store the binned frame
675  } else {
676  movie.setFrameData(index, rawFrame); // we want to store the raw frame
677  getCroppedFrame(GASP.movie, frame, rawFrame);
678  }
679  gpuPool.push([&](int stream) {
680  auto &scaler = scalers[stream];
681  scaler.run(rawFrame, frame, scaledFrames + index * scaler.getOutputSettings().fDim().sizeSingle(), filter);
682  scaler.synch();
683 
684  }).get();
685  // if we bin, get rid of the raw frame. If we crop, get rid of the cropped frame
686  BasicMemManager::instance().give(this->applyBinning() ? rawFrame : frame);
687  };
688  cpuPool.push(routine);
689  }
690  // wait till done
691  cpuPool.stop(true);
692  gpuPool.stop(true);
693 
694  // release unused memory
695  scalers.clear();
697 
698  // prepare structures / memory for correlation
699  const AlignmentContext context {
700  .verbose = this->verbose,
701  .maxShift = this->maxShift * actualScale,
702  .N = GASP.movie.n(),
703  .scale = std::make_pair(movie.getDim().x() / (T) GACP.dim.x(), movie.getDim().y() / (T) GACP.dim.y()),
704  .refFrame = std::nullopt,
705  .out = GASP.out,
706  };
707  auto *correlations = reinterpret_cast<float*>(BasicMemManager::instance().get(context.alignmentBytes(), MemType::CUDA_HOST));
708 
709  // correlate
710  auto correlator = CUDAFlexAlignCorrelate<T>(GACP, mGpu.value());
711  correlator.init();
712  correlator.run(scaledFrames, correlations, context.maxShift);
713  mGpu.value().synch();
714  // result is a centered correlation function with (hopefully) a cross
715  // indicating the requested shift
716  auto result = computeAlignment(correlations, context);
717 
718  // release memory
721  BasicMemManager::instance().give(scaledFrames);
723  return result;
724 }
725 
726 template<typename T>
728  T *dest, T *src) {
729  for (size_t y = 0; y < dim.y(); ++y) {
730  memcpy(dest + (dim.x() * y),
731  src + (movie.getDim().x() * y),
732  dim.x() * sizeof(T));
733  }
734 }
735 
736 template<typename T>
738  const auto dim = movie.getDim();
739  for (auto i = 0; i < dim.n(); ++i) {
740  BasicMemManager::instance().give(movie.getFrame(i).data);
741  }
742 }
743 
744 template<typename T>
746  const Image<T>& dark, const Image<T>& igain, size_t index) {
747  const auto &movieDim = this->getMovieSizeRaw();
748  int frameIndex = -1;
749  size_t counter = 0;
750  for (size_t objId : movieMD.ids())
751  {
752  // get to correct index
753  frameIndex++;
754  if (frameIndex < this->nfirst) continue;
755  if (frameIndex > this->nlast) break;
756 
757  if (counter == index) {
758  // load frame
759  auto *ptr = reinterpret_cast<T*>(BasicMemManager::instance().get(movieDim.xy() * sizeof(T), MemType::CPU_PAGE_ALIGNED));
760  auto mda = MultidimArray<T>(1, 1, movieDim.y(), movieDim.x(), ptr);
761  Image<T> frame(mda);
762  AProgMovieAlignmentCorrelation<T>::loadFrame(movieMD, dark, igain, objId, frame);
763  return ptr;
764  }
765  counter++;
766  }
767  return nullptr;
768 }
769 
770 template <typename T>
772  float *positions, const AlignmentContext &context)
773 {
774  auto noOfCorrelations = static_cast<int>(context.N * (context.N - 1) / 2);
775  // matrices describing observations
776  Matrix2D<float> A(noOfCorrelations, static_cast<int>(context.N) - 1);
777  Matrix1D<float> bX(noOfCorrelations), bY(noOfCorrelations);
778 
779  auto idx = 0;
780  for (size_t i = 0; i < context.N - 1; ++i) {
781  for (size_t j = i + 1; j < context.N; ++j) {
782  // X and Y positions of the maxima. To find the shift, we need to deduct the center of the frame
783  bX(idx) = positions[2*idx] - (static_cast<float>(context.out.x()) / 2.f);
784  bY(idx) = positions[2*idx+1] - (static_cast<float>(context.out.y()) / 2.f);
785  bX(idx) *= context.scale.first; // scale to expected size
786  bY(idx) *= context.scale.second;
787  if (context.verbose > 1) {
788  std::cerr << "Frame " << i << " to Frame " << j << " -> ("
789  << bX(idx) << "," << bY(idx) << ")" << std::endl;
790  }
791  for (int ij = i; ij < j; ij++) {
792  A(idx, ij) = 1.f; // mark which pairs of frames we measured
793  }
794  idx++;
795  }
796  }
797  return AProgMovieAlignmentCorrelation<T>::computeAlignment(bX, bY, A, context.refFrame, context.N, context.verbose);
798 }
799 
800 // explicit instantiation
void min(Image< double > &op1, const Image< double > &op2)
Parameter incorrect.
Definition: xmipp_error.h:181
constexpr Dimensions copyForN(size_t n) const
Definition: dimensions.h:46
__host__ __device__ float2 floor(const float2 v)
std::vector< Point2D< T > > shifts
AlignmentResult< T > computeAlignment(Matrix1D< T > &bX, Matrix1D< T > &bY, Matrix2D< T > &A, const core::optional< size_t > &refFrame, size_t N, int verbose)
#define REPORT_ERROR(nerr, ErrormMsg)
Definition: xmipp_error.h:211
CUDA_HD constexpr size_t z() const
Definition: dimensions.h:69
HBITMAP buffer
Definition: svm-toy.cpp:37
static double * y
T & value() const
Definition: optional.h:67
void write(const FileName &name="", size_t select_img=ALL_IMAGES, bool isStack=false, int mode=WRITE_OVERWRITE, CastWriteMode castMode=CW_CAST, int _swapWrite=0)
constexpr Dimensions createSingle() const
Definition: dimensions.h:42
void loadFrame(const MetaData &movie, const Image< T > &dark, const Image< T > &igain, size_t objId, Image< T > &out) const
void abs(Image< double > &op)
static UserSettings & get(const std::string &path=std::string())
Definition: userSettings.h:49
virtual IdIteratorProxy< false > ids()
void give(void *ptr)
bool insert(const T &obj, const std::string &key, const U &value)
Definition: userSettings.h:91
doublereal * x
#define i
doublereal * d
CUDA_HD constexpr size_t x() const
Definition: dimensions.h:51
void * get(size_t bytes, MemType type)
static void pinMemory(const void *h_mem, size_t bytes, unsigned int flags=0)
Definition: gpu.cpp:92
cudaStream_t * streams
void readParams()
Read argument from command line.
viol index
double * f
GPU memory related issues.
Definition: xmipp_error.h:124
Incorrect argument received.
Definition: xmipp_error.h:113
core::optional< BSplineGrid< T > > bsplineRep
static BasicMemManager & instance()
#define FOR_ALL_OBJECTS_IN_METADATA(__md)
Definition: metadata_base.h:64
void max(Image< double > &op1, const Image< double > &op2)
CUDA_HD constexpr size_t y() const
Definition: dimensions.h:60
void set()
Definition: gpu.cpp:50
CUDA_HD constexpr size_t n() const
Definition: dimensions.h:78
#define j
static std::pair< Matrix1D< T >, Matrix1D< T > > computeBSplineCoeffs(const Dimensions &movieSize, const LocalAlignmentResult< T > &alignment, const Dimensions &controlPoints, const std::pair< size_t, size_t > &noOfPatches, int verbosity, int solverIters)
size_t correlations(const Dimensions &d)
virtual void defineParams()
Define parameters.
constexpr Dimensions sDim() const
Definition: fft_settings.h:78
int round(double x)
Definition: ap.cpp:7245
bool find(const T &obj, const std::string &key, U &value)
Definition: userSettings.h:116
static FFTSettings< T > * findOptimal(const GPU &gpu, const FFTSettings< T > &settings, size_t reserveBytes, bool squareOnly, int sigPercChange, bool crop, bool verbose)
Definition: cuda_fft.cpp:312
Rectangle< Point2D< T > > rec
Definition: gpu.h:36
static void unpinMemory(const void *h_mem)
Definition: gpu.cpp:108
Incorrect value received.
Definition: xmipp_error.h:195
int * n
doublereal * a
virtual void readParams()
Read argument from command line.