38 this->addParamsLine(
" [--device <dev=0>] : GPU device to use. 0th by default");
39 this->addParamsLine(
" [--storage <fn=\"\">] : Path to file that can be used to store results of the benchmark");
40 this->addParamsLine(
" [--patchesAvg <avg=3>] : Number of near frames used for averaging a single patch");
42 "xmipp_cuda_movie_alignment_correlation -i movie.xmd --oaligned alignedMovie.stk --oavg alignedMicrograph.mrc --device 0");
43 this->addSeeAlsoLine(
"xmipp_movie_alignment_correlation");
49 std::cout <<
"Device: " << mGpu.value().device() <<
" (" << mGpu.value().getUUID() <<
")" <<
"\n";
50 std::cout <<
"Benchmark storage " << (storage.empty() ?
"Default" : storage) <<
"\n";
51 std::cout <<
"Patches avg: " << patchesAvg <<
"\n";
59 int device = this->getIntParam(
"--device");
65 storage = this->getParam(
"--storage");
68 patchesAvg = this->getIntParam(
"--patchesAvg");
74 const bool crop =
true;
75 if (
auto optDim = getStoredSizes(ref, crop); optDim) {
76 return optDim.value().copyForN(ref.
n());
78 std::cout <<
"Benchmarking cuFFT ..." << std::endl;
80 auto candidate = std::unique_ptr<FFTSettings<T>>(
CudaFFT<T>::findOptimal(mGpu.value(), hint, 0, hint.sDim().x() == hint.sDim().y(), 10, crop,
true));
84 storeSizes(ref, candidate->
sDim(), crop);
90 const bool crop =
false;
91 if (
auto optDim = this->getStoredSizes(ref, crop); optDim) {
92 return optDim.value().copyForN(ref.
n());
94 std::cout <<
"Benchmarking cuFFT ..." << std::endl;
96 auto candidate = std::unique_ptr<FFTSettings<T>>(
CudaFFT<T>::findOptimal(mGpu.value(), hint, 0, hint.sDim().x() == hint.sDim().y(), 20, crop,
true));
100 this->storeSizes(ref, candidate->
sDim(), crop);
106 const bool crop =
false;
107 const auto reqPatchSize = this->getRequestedPatchSize();
108 auto ref =
Dimensions(reqPatchSize.first, reqPatchSize.second, 1, this->getMovieSize().n());
109 if (
auto optDim = this->getStoredSizes(ref, crop); optDim) {
112 std::cout <<
"Benchmarking cuFFT ..." << std::endl;
114 auto candidate = std::unique_ptr<FFTSettings<T>>(
CudaFFT<T>::findOptimal(mGpu.value(), hint, 0, hint.sDim().x() == hint.sDim().y(), 20, crop,
true));
118 this->storeSizes(ref, candidate->
sDim(), crop);
119 return candidate->
sDim();
124 auto getNearestEven = [] (
size_t v, T minScale,
size_t shift) {
126 while ((size / (
float)v) < minScale) {
131 const T requestedScale = this->getScaleFactor();
133 Dimensions hint(getNearestEven(d.
x(), requestedScale,
static_cast<size_t>(this->maxShift)),
134 getNearestEven(d.
y(), requestedScale,
static_cast<size_t>(this->maxShift)),
141 const std::pair<T, T> &borders,
const Dimensions &patch) {
142 size_t patchesX = this->localAlignPatches.first;
143 size_t patchesY = this->localAlignPatches.second;
144 T windowXSize = this->getMovieSize().x() - 2 * borders.first;
145 T windowYSize = this->getMovieSize().y() - 2 * borders.second;
146 T corrX = std::ceil(((patchesX * patch.
x()) - windowXSize) / (T) (patchesX - 1));
147 T corrY = std::ceil(((patchesY * patch.
y()) - windowYSize) / (T) (patchesY - 1));
148 T stepX = (T)patch.
x() - corrX;
149 T stepY = (T)patch.
y() - corrY;
150 std::vector<FramePatchMeta<T>> result;
151 for (
size_t y = 0;
y < patchesY; ++
y) {
152 for (
size_t x = 0;
x < patchesX; ++
x) {
153 T tlx = borders.first +
x * stepX;
154 T tly = borders.second +
y * stepY;
155 T brx = tlx + patch.
x() - 1;
156 T bry = tly + patch.
y() - 1;
169 const auto &movieDim = movie.getDim();
170 const int n =
static_cast<int>(movieDim.n());
171 const auto patchSize = patch.getSize();
172 const auto bufferBytes = patchSize.
x *
sizeof(T);
173 auto buffer = std::make_unique<T[]>(patchSize.x);
174 for (
int t = 0; t <
n; ++t) {
175 for (
int y = 0;
y < patchSize.y; ++
y) {
179 for (
int f =
std::max(0, t - ((patchesAvg - 1) / 2));
f <=
std::min(n - 1, t + (patchesAvg / 2)); ++
f) {
180 const auto *frame = movie.getFrame(
f).data;
185 const int srcY = patch.tl.
y +
y + yShift;
186 const int srcX = patch.tl.
x + xShift;
187 auto *src = frame + srcY * movieDim.x() + srcX;
189 memcpy(
buffer.get(), src, bufferBytes);
191 for (
int x = 0;
x < patchSize.x; ++
x) {
197 const int patchOffset = t * patchSize.x * patchSize.y;
198 const int destIndex = patchOffset +
y * patchSize.x;
199 memcpy(result + destIndex,
buffer.get(), bufferBytes);
211 for (
const auto& s : globAlignment.
shifts) {
213 maxX =
std::max(std::ceil(s.x), maxX);
215 maxY =
std::max(std::ceil(s.y), maxY);
219 std::cout <<
"Movie borders: x=" << res.first <<
" y=" << res.second << std::endl;
226 const auto pSize = findGoodPatchSize();
227 LASP.doBinning =
false;
230 const auto cSize = this->findGoodCorrelationSize(this->getCorrelationHint(pSize));
231 LASP.out = LACP.dim = cSize;
232 const auto maxBytes =
static_cast<size_t>(
static_cast<float>(mGpu.value().lastFreeBytes()) * 0.9
f);
233 auto getMemReq = [&LASP=LASP, &LACP=LACP, &gpu=mGpu.value()]() {
236 return scale + correlate;
238 auto cond = [&LASP=LASP]() {
240 return 0 == LASP.movie.n() % LASP.batch;
245 size_t bufferDivider = 1;
246 size_t corrBatchDivider = 0;
247 const size_t correlations = cSize.n() * (cSize.n() - 1) / 2;
250 auto corrBatch = correlations / corrBatchDivider;
251 LACP.batch = corrBatch;
252 if ((pSize.n() / bufferDivider) > corrBatch) {
253 bufferDivider += (bufferDivider == 1) ? 2 : 1;
255 LACP.bufferSize = pSize.n() / bufferDivider;
256 for (
auto scaleBatch = pSize.n(); scaleBatch > 0; --scaleBatch) {
257 LASP.batch = scaleBatch;
258 if (cond() && getMemReq() <= maxBytes) {
267 MultidimArray<T> tmp = this->createLPF(this->getPixelResolution(scale), dims);
278 std::cout <<
"Actual scale factor (X): " << scale <<
"\n";
279 std::cout <<
"Size of the patch : " << SP.
movie <<
"\n";
280 std::cout <<
"Size of the correlation: " << SP.
out <<
"\n";
281 std::cout <<
"Correlation batch : " << CP.
batch <<
"\n";
282 std::cout <<
"GPU streams : " <<
streams <<
"\n";
283 std::cout <<
"CPU threads : " << threads <<
"\n";
293 const auto movieSize = this->getMovieSize();
294 if ((movieSize.x() < LASP.movie.x())
295 || (movieSize.y() < LASP.movie.y())) {
299 auto borders = getMovieBorders(globAlignment, this->verbose > 1);
300 auto patchesLocation = this->getPatchesLocation(borders, LASP.movie);
301 const auto actualScale =
static_cast<float>(LASP.out.x()) / static_cast<float>(LASP.movie.x());
303 auto streams = std::array<GPU, 2>{
GPU(mGpu.value().device(), 1),
GPU(mGpu.value().device(), 2)};
304 auto loadPool =
ctpl::thread_pool(static_cast<int>(
std::min(this->localAlignPatches.first, this->localAlignPatches.second)));
306 report(actualScale, LASP, LACP,
sizeof(
streams) /
sizeof(
streams[0]), loadPool.size());
309 auto *filter = setFilter(actualScale, LASP.out);
310 std::vector<float*> corrPositions(loadPool.size());
311 std::vector<T*> patches(loadPool.size());
312 std::vector<std::complex<T>*> scalledPatches(loadPool.size());
315 const AlignmentContext context {
316 .verbose = this->verbose,
317 .maxShift = this->maxShift * actualScale,
319 .scale = std::make_pair(LASP.movie.x() / (T) LASP.out.x(), LASP.movie.y() / (T) LASP.out.y()),
323 auto mutexes = std::array<std::mutex, 2>();
332 result.
shifts.reserve(patchesLocation.size() * movieSize.n());
333 std::vector<std::future<void>> futures;
334 futures.reserve(patchesLocation.size());
337 auto alloc = [](
size_t bytes) {
345 for (
auto &p : patchesLocation) {
347 const auto shiftsOffset = result.shifts.size();
348 for (
size_t i = 0;
i < movieSize.n();++
i) {
354 result.shifts.emplace_back(p,
Point2D<T>(globShiftX, globShiftY));
357 auto routine = [&, p, shiftsOffset](
int thrId) {
359 if (
nullptr == patches.at(thrId)) {
360 patches[thrId] =
reinterpret_cast<T*
>(alloc(scaler.getMovieSettings().sBytes()));
361 scalledPatches[thrId] =
reinterpret_cast<std::complex<T>*
>(alloc(scaler.getOutputSettings().fBytes()));
363 auto *data = patches.at(thrId);
366 getPatchData(p.rec, globAlignment, data);
369 for (
auto i = 0;
i < LASP.movie.n();
i += LASP.batch) {
370 std::unique_lock lock(mutexes[0]);
372 data +
i * scaler.getMovieSettings().sDim().sizeSingle(),
373 scalledPatches[thrId] +
i * scaler.getOutputSettings().fDim().sizeSingle(),
379 if (
nullptr == corrPositions.at(thrId)) {
380 auto bytes = context.alignmentBytes();
381 corrPositions[thrId] =
reinterpret_cast<T*
>(alloc(bytes));
382 memset(corrPositions[thrId], 0, bytes);
388 std::unique_lock lock(mutexes[1]);
389 correlater.run(scalledPatches[thrId],
correlations, context.maxShift);
395 for (
size_t i = 0;
i < context.N;++
i) {
397 result.shifts[
i + shiftsOffset].second += res.shifts[
i];
400 futures.emplace_back(loadPool.push(routine));
404 for (
auto &
f : futures) {
f.get(); }
407 for (
auto *ptr : corrPositions) {
410 for (
auto *ptr : patches) {
414 for (
auto *ptr : scalledPatches) {
423 this->localAlignmentControlPoints, this->localAlignPatches,
424 this->verbose, this->solverIterations);
426 BSplineGrid<T>(this->localAlignmentControlPoints, coeffs.first, coeffs.second));
437 const auto movieSize = this->getMovieSize();
438 const auto borders = getMovieBorders(globAlignment, this->verbose > 1);
439 auto patchesLocation = this->getPatchesLocation(borders, LASP.movie);
442 for (
auto &p : patchesLocation) {
443 for (
size_t i = 0;
i < movieSize.n(); ++
i) {
450 this->localAlignmentControlPoints, this->localAlignPatches,
451 this->verbose, this->solverIterations);
453 BSplineGrid<T>(this->localAlignmentControlPoints, coeffs.first, coeffs.second));
463 applyShiftsComputeAverage(movieMD, dark, igain, initialMic, Ninitial, averageMicrograph,
464 N, localFromGlobal(globAlignment));
467 template <
typename T>
470 mGpu.value().updateMemoryInfo();
471 const auto freeBytes = mGpu.value().lastFreeBytes();
472 const auto frameBytes = movie.getDim().xy() *
sizeof(T);
474 if (this->verbose > 1)
475 std::cout <<
"GPU streams used for output generation: " <<
streams <<
"\n";
476 return static_cast<int>(
streams);
487 "Missing BSpline representation. This should not happen. Please contact developers.");
497 auto coeffs = std::make_pair(alignment.
bsplineRep.
value().getCoeffsX(),
502 auto aux = std::vector<AuxData>(pool.size());
503 auto futures = std::vector<std::future<void>>();
504 for (
auto i = 0;
i < pool.size(); ++
i) {
505 aux[
i].stream =
GPU(mGpu.value().device(),
i + 1);
509 auto mutexes = std::array<std::mutex, 3>();
513 if ((frameIndex >= this->nfirstSum) && (frameIndex <= this->nlastSum)) {
516 auto routine = [&, frameIndex](
int threadId) {
517 int frameOffset = frameIndex - this->nfirst;
518 auto &
a = aux[threadId];
520 auto &frame = movie.getFrame(frameIndex);
522 if ( ! this->fnInitialAvg.isEmpty()) {
523 std::unique_lock lock(mutexes[0]);
524 if (0 == initialMic().yxdim)
525 initialMic() = frame;
527 initialMic() += frame;
531 if ( ! this->fnAligned.isEmpty() || ! this->fnAvg.isEmpty()) {
532 if (
nullptr ==
a.hOut) {
536 a.transformer.initLazyForBSpline(frame.xdim, frame.ydim, alignment.
movieDim.
n(),
537 this->localAlignmentControlPoints.x(), this->localAlignmentControlPoints.y(), this->localAlignmentControlPoints.n(),
a.stream);
538 a.transformer.applyBSplineTransform(3, shiftedFrame, frame, coeffs, frameOffset);
541 if (this->fnAligned !=
"") {
542 std::unique_lock lock(mutexes[1]);
544 tmp.
write(this->fnAligned, frameOffset + 1,
true,
547 if (this->fnAvg !=
"") {
548 std::unique_lock lock(mutexes[2]);
549 if (0 == averageMicrograph().yxdim)
550 averageMicrograph() = shiftedFrame;
552 averageMicrograph() += shiftedFrame;
558 futures.emplace_back(pool.push(routine));
561 for (
auto &t : futures) {
564 for (
auto i = 0;
i < pool.size(); ++
i) {
588 return std::optional(
Dimensions(x, y, 1, dim.
n()));
597 const auto rSize = this->getMovieSizeRaw();
600 GASP.doBinning = this->applyBinning();
601 const auto mSize = GASP.doBinning ? movie.getDim() : findGoodCropSize(rSize);
603 const auto cSize = findGoodCorrelationSize(getCorrelationHint(movie.getDim()));
604 GASP.out = GACP.dim = cSize;
605 const auto maxBytes =
static_cast<size_t>(
static_cast<float>(mGpu.value().lastFreeBytes()) * 0.9
f);
617 size_t bufferDivider = 1;
618 size_t batchDivider = 0;
619 const size_t correlations = cSize.n() * (cSize.n() - 1) / 2;
622 auto batch = correlations / batchDivider;
624 if ((mSize.n() / bufferDivider) > batch) {
625 bufferDivider += (bufferDivider == 1) ? 2 : 1;
627 GACP.bufferSize = mSize.n() / bufferDivider;
636 movie.set(this->getMovieSize());
639 const auto noOfThreads = 2 * GAStreams;
641 const auto actualScale =
static_cast<float>(GASP.out.x()) / static_cast<float>(this->getMovieSize().x());
642 report(actualScale, GASP, GACP, GAStreams, noOfThreads);
648 auto scalers = std::vector<CUDAFlexAlignScale<T>>();
649 auto streams = std::vector<GPU>();
650 streams.reserve(gpuPool.size());
651 scalers.reserve(gpuPool.size());
652 for (
auto i = 0;
i < GAStreams; ++
i) {
653 streams.emplace_back(mGpu.value().device(),
i + 1);
654 scalers.emplace_back(GASP, streams[
i]);
655 gpuPool.push([&streams, &scalers, i](
int) {
663 auto *filter = setFilter(actualScale, GASP.out);
665 const auto frameBytes = scalers[0].getMovieSettings().sBytesSingle();
668 for (
auto i = 0;
i < GASP.movie.n();
i += GASP.batch) {
669 auto routine = [&,
index=
i](int)
671 auto *rawFrame = loadFrame(movieMD, dark, igain,
index);
673 if (this->applyBinning()) {
674 movie.setFrameData(
index, frame);
676 movie.setFrameData(
index, rawFrame);
677 getCroppedFrame(GASP.movie, frame, rawFrame);
679 gpuPool.push([&](
int stream) {
680 auto &scaler = scalers[stream];
681 scaler.run(rawFrame, frame, scaledFrames +
index * scaler.getOutputSettings().fDim().sizeSingle(), filter);
688 cpuPool.push(routine);
699 const AlignmentContext context {
700 .verbose = this->verbose,
701 .maxShift = this->maxShift * actualScale,
703 .scale = std::make_pair(movie.getDim().x() / (T) GACP.dim.x(), movie.getDim().y() / (T) GACP.dim.y()),
704 .refFrame = std::nullopt,
712 correlator.run(scaledFrames,
correlations, context.maxShift);
713 mGpu.value().synch();
729 for (
size_t y = 0;
y < dim.
y(); ++
y) {
730 memcpy(dest + (dim.
x() *
y),
731 src + (movie.getDim().x() *
y),
732 dim.
x() *
sizeof(T));
738 const auto dim = movie.getDim();
739 for (
auto i = 0;
i < dim.
n(); ++
i) {
747 const auto &movieDim = this->getMovieSizeRaw();
750 for (
size_t objId : movieMD.
ids())
754 if (frameIndex < this->nfirst)
continue;
755 if (frameIndex > this->nlast)
break;
757 if (counter == index) {
770 template <
typename T>
772 float *positions,
const AlignmentContext &context)
774 auto noOfCorrelations =
static_cast<int>(context.N * (context.N - 1) / 2);
780 for (
size_t i = 0;
i < context.N - 1; ++
i) {
781 for (
size_t j =
i + 1;
j < context.N; ++
j) {
783 bX(idx) = positions[2*idx] - (
static_cast<float>(context.out.x()) / 2.
f);
784 bY(idx) = positions[2*idx+1] - (
static_cast<float>(context.out.y()) / 2.
f);
785 bX(idx) *= context.scale.first;
786 bY(idx) *= context.scale.second;
787 if (context.verbose > 1) {
788 std::cerr <<
"Frame " <<
i <<
" to Frame " <<
j <<
" -> (" 789 << bX(idx) <<
"," << bY(idx) <<
")" << std::endl;
791 for (
int ij =
i; ij <
j; ij++) {
void min(Image< double > &op1, const Image< double > &op2)
constexpr Dimensions copyForN(size_t n) const
__host__ __device__ float2 floor(const float2 v)
std::vector< Point2D< T > > shifts
AlignmentResult< T > computeAlignment(Matrix1D< T > &bX, Matrix1D< T > &bY, Matrix2D< T > &A, const core::optional< size_t > &refFrame, size_t N, int verbose)
#define REPORT_ERROR(nerr, ErrormMsg)
CUDA_HD constexpr size_t z() const
void write(const FileName &name="", size_t select_img=ALL_IMAGES, bool isStack=false, int mode=WRITE_OVERWRITE, CastWriteMode castMode=CW_CAST, int _swapWrite=0)
constexpr Dimensions createSingle() const
void loadFrame(const MetaData &movie, const Image< T > &dark, const Image< T > &igain, size_t objId, Image< T > &out) const
void abs(Image< double > &op)
static UserSettings & get(const std::string &path=std::string())
bool insert(const T &obj, const std::string &key, const U &value)
void defineParams()
Define parameters.
CUDA_HD constexpr size_t x() const
void * get(size_t bytes, MemType type)
static void pinMemory(const void *h_mem, size_t bytes, unsigned int flags=0)
void readParams()
Read argument from command line.
GPU memory related issues.
Incorrect argument received.
core::optional< BSplineGrid< T > > bsplineRep
static BasicMemManager & instance()
void max(Image< double > &op1, const Image< double > &op2)
CUDA_HD constexpr size_t y() const
CUDA_HD constexpr size_t n() const
static std::pair< Matrix1D< T >, Matrix1D< T > > computeBSplineCoeffs(const Dimensions &movieSize, const LocalAlignmentResult< T > &alignment, const Dimensions &controlPoints, const std::pair< size_t, size_t > &noOfPatches, int verbosity, int solverIters)
size_t correlations(const Dimensions &d)
virtual void defineParams()
Define parameters.
constexpr Dimensions sDim() const
bool find(const T &obj, const std::string &key, U &value)
static FFTSettings< T > * findOptimal(const GPU &gpu, const FFTSettings< T > &settings, size_t reserveBytes, bool squareOnly, int sigPercChange, bool crop, bool verbose)
static void unpinMemory(const void *h_mem)
Incorrect value received.
virtual void readParams()
Read argument from command line.