Xmipp  v3.23.11-Nereus
cuda_gpu_geo_transformer.cpp
Go to the documentation of this file.
1 /***************************************************************************
2  *
3  * Authors: David Strelak (davidstrelak@gmail.com)
4  *
5  * Unidad de Bioinformatica of Centro Nacional de Biotecnologia , CSIC
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 2 of the License, or
10  * (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
20  * 02111-1307 USA
21  *
22  * All comments concerning this program package may be sent to the
23  * e-mail address 'xmipp@cnb.csic.es'
24  ***************************************************************************/
25 
27 #include "cuda_asserts.h"
28 #include <cuda_runtime_api.h>
29 #include "cuda_all.cpp"
30 
31 template<typename T>
33  cudaFree(d_in);
34  cudaFree(d_out);
35  cudaFree(d_trInv);
36  cudaFree(d_coeffsX);
37  cudaFree(d_coeffsY);
38  setDefaultValues();
39 }
40 
41 template<typename T>
43  isReadyForBspline = isReadyForMatrix = false;
44  d_trInv = d_in = d_out = d_coeffsX = d_coeffsY = nullptr;
45  inX = inY = inZ = splineX = splineY = splineN;
46 }
47 
48 template<typename T>
49 void GeoTransformer<T>::initForMatrix(size_t x, size_t y, size_t z) {
50  release();
51 
52  inX = x;
53  inY = y;
54  inZ = z;
55  size_t matSize = (0 == z) ? 9 : 16;
56  gpuErrchk(cudaMalloc((void** ) &d_trInv, matSize * sizeof(T)));
57  gpuErrchk(cudaMalloc((void** ) &d_in, x * y * z * sizeof(T)));
58  gpuErrchk(cudaMalloc((void** ) &d_out, x * y * z * sizeof(T)));
59 
60  isReadyForMatrix = true;
61 }
62 
63 template<typename T>
64 void GeoTransformer<T>::initLazyForMatrix(size_t x, size_t y, size_t z) {
65  if (!isReadyForMatrix) {
66  initForMatrix(x, y, z);
67  }
68 }
69 
70 template <typename T>
71 void GeoTransformer<T>::initForBSpline(size_t inX, size_t inY, size_t inN,
72  size_t splineX, size_t splineY, size_t splineN, const GPU &gpu)
73 {
74  release();
75 
76  this->inX = inX;
77  this->inY = inY;
78  this->inZ = 1;
79  this->inN = inN;
80  this->splineX = splineX;
81  this->splineY = splineY;
82  this->splineN = splineN;
83  // take into account end control points
84 
85  // padding for produceAndLoadCoeffs; Y dimension has to be a multiple of BLOCK_SIZE
86  const int BLOCK_SIZE = iirConvolve2D_Cardinal_BSpline_3_MirrorOffBoundKernels::BLOCK_SIZE;
87  const int Y_padded = (inY / BLOCK_SIZE) * BLOCK_SIZE + BLOCK_SIZE * (inY % BLOCK_SIZE != 0);
88 
89  size_t inOutSize = inX * inY;
90  size_t inOutSize_padded = inX* Y_padded;
91  size_t coeffsSize = splineX * splineY * splineN;
92  gpuErrchk(cudaMalloc((void** ) &d_coeffsX, coeffsSize * sizeof(T)));
93  gpuErrchk(cudaMalloc((void** ) &d_coeffsY, coeffsSize * sizeof(T)));
94  gpuErrchk(cudaMalloc((void** ) &d_in, inOutSize_padded * sizeof(T)));
95  gpuErrchk(cudaMalloc((void** ) &d_out, inOutSize * sizeof(T)));
96 
97  this->gpu = &gpu;
98 
99  isReadyForBspline = true;
100 }
101 
102 template <typename T>
103 void GeoTransformer<T>::initLazyForBSpline(size_t inX, size_t inY, size_t inZ,
104  size_t splineX, size_t splineY, size_t splineN, const GPU &gpu)
105 {
106  if (!isReadyForBspline)
107  {
108  initForBSpline(inX, inY, inZ, splineX, splineY, splineN, gpu);
109  }
110 }
111 
112 template<typename T>
114  Matrix1D<T> shift(2);
115  shift.vdata[0] = 0.45;
116  shift.vdata[1] = 0.62;
117  Matrix2D<T> transform;
118  translation2DMatrix(shift, transform, true);
119  test(transform);
120 }
121 
122 template<typename T>
123 void GeoTransformer<T>::test(const Matrix2D<T> &transform) {
124  MultidimArray<T> resGpu, resCpu;
125  MultidimArray<T> input(32, 32);
126  for (int i = 0; i < input.ydim; ++i) {
127  for (int j = 0; j < input.xdim; ++j) {
128  input.data[i * input.xdim + j] = i * 10 + j;
129  }
130  }
131 
132  this->initForMatrix(input.xdim, input.ydim, input.zdim);
133  this->applyGeometry(3, resGpu, input, transform, false, true);
134  ::applyGeometry(3, resCpu, input, transform, false, true);
135 
136  bool failed = false;
137  for (int i = 0; i < input.ydim; ++i) {
138  for (int j = 0; j < input.xdim; ++j) {
139  int index = i * input.xdim + j;
140  T gpu = resGpu[index];
141  T cpu = resCpu[index];
142  if (std::abs(cpu - gpu) > 0.001) {
143  failed = true;
144  fprintf(stderr, "error[%d]: GPU %.4f CPU %.4f\n", index, gpu,
145  cpu);
146  }
147  }
148  }
149 
150  fprintf(stderr, "test transform result: %s\n", failed ? "FAIL" : "OK");
151  this->release();
152 }
153 
154 template<typename T>
156  int splineDegree,
157  MultidimArray<T> &output, const MultidimArray<T> &input,
158  const std::pair<Matrix1D<T>, Matrix1D<T>> &coeffs, size_t imageIdx, T outside) {
159  checkRestrictions(3, output, input, coeffs, imageIdx);
160 
161  loadOutput(output, outside);
162  produceAndLoadCoeffs(input);
163 
164  loadCoefficients(coeffs.first, coeffs.second);
165 
166  dim3 dimBlock(BLOCK_DIM_X, BLOCK_DIM_X);
167  dim3 dimGrid(ceil(inX / (T) dimBlock.x), ceil(inY / (T) dimBlock.y));
168 
169  switch (splineDegree) {
170  case 3:
171  applyLocalShiftGeometryKernel<T, 3><<<dimGrid, dimBlock>>>(d_coeffsX, d_coeffsY,
172  d_out, (int)inX, (int)inY, (int)inN,
173  d_in, imageIdx, (int)splineX, (int)splineY, (int)splineN);
174  gpuErrchk(cudaPeekAtLastError());
175  break;
176  default:
177  throw std::logic_error("not implemented");
178  }
179 
180  gpuErrchk(
181  cudaMemcpy(output.data, d_out, output.zyxdim * sizeof(T),
182  cudaMemcpyDeviceToHost));
183 
184 }
185 
186 template<typename T>
188  int splineDegree,
189  MultidimArray<T> &output, const MultidimArray<T> &input,
190  const std::pair<Matrix1D<T>, Matrix1D<T>> &coeffs, size_t imageIdx, T outside) {
191  checkRestrictions(3, output, input, coeffs, imageIdx);
192  auto stream = *(cudaStream_t*)gpu->stream();
193 
194  setOutputSize(output);
195  if ( splineDegree > 1 ) {
196  produceAndLoadCoeffs(input);
197  }
198  else
199  {
200  gpuErrchk(cudaMemcpyAsync(d_in, input.data, input.yxdim * sizeof(T), cudaMemcpyHostToDevice, stream));
201  }
202 
203  loadCoefficients(coeffs.first, coeffs.second);
204 
205  dim3 dimBlock(16, 16);
206  dim3 dimGrid(ceil(inX / (T) dimBlock.x), ceil((inY / (T) dimBlock.y) / (T) pixelsPerThread)); //more pixels
207 
208  // take into account end points
209  T hX = (splineX == 3) ? inX : (inX / (T) ((splineX - 3)));
210  T hY = (splineY == 3) ? inY : (inY / (T) ((splineY - 3)));
211  T hT = (splineN == 3) ? inN : (inN / (T) ((splineN - 3)));
212  T tPos = imageIdx / hT;
213 
214  switch (splineDegree) {
215  case 1:
216  applyLocalShiftGeometryKernelMorePixels<T, 1, pixelsPerThread><<<dimGrid, dimBlock, 0, stream>>>(d_coeffsX, d_coeffsY,
217  d_out, (int)inX, (int)inY, (int)inN,
218  d_in, imageIdx, (int)splineX, (int)splineY, (int)splineN,
219  hX, hY, tPos);
220  gpuErrchk(cudaPeekAtLastError());
221  break;
222  case 3:
223  applyLocalShiftGeometryKernelMorePixels<T, 3, pixelsPerThread><<<dimGrid, dimBlock, 0, stream>>>(d_coeffsX, d_coeffsY,
224  d_out, (int)inX, (int)inY, (int)inN,
225  d_in, imageIdx, (int)splineX, (int)splineY, (int)splineN,
226  hX, hY, tPos);
227  gpuErrchk(cudaPeekAtLastError());
228  break;
229  default:
230  REPORT_ERROR(ERR_NOT_IMPLEMENTED, formatString("applyBSplineTransform not implemented for spline degree %d.", splineDegree));
231  }
232 
233  gpuErrchk(
234  cudaMemcpyAsync(output.data, d_out, output.zyxdim * sizeof(T),
235  cudaMemcpyDeviceToHost, stream));
236 }
237 
238 template<typename T>
239 template<typename T_MAT>
240 void GeoTransformer<T>::applyGeometry(int splineDegree,
241  MultidimArray<T> &output, const MultidimArray<T> &input,
242  const Matrix2D<T_MAT> &transform, bool isInv, bool wrap, T outside,
243  const MultidimArray<T> *bCoeffsPtr) {
244  checkRestrictions(splineDegree, output, input, transform);
245  if (transform.isIdentity()) {
246  typeCast(input, output);
247  return;
248  }
249 
250  loadTransform(transform, isInv);
251  loadOutput(output, outside);
252 
253  if (splineDegree > 1) {
254  if (NULL != bCoeffsPtr) {
255  loadInput(*bCoeffsPtr);
256  } else {
257  produceAndLoadCoeffs(input);
258  }
259  } else {
260  loadInput(input);
261  }
262 
263  if (input.getDim() == 2) {
264  if (wrap) {
265  applyGeometry_2D_wrap(splineDegree);
266  } else {
267  throw std::logic_error("Not implemented yet");
268  }
269  } else {
270  throw std::logic_error("Not implemented yet");
271  }
272 
273  gpuErrchk(
274  cudaMemcpy(output.data, d_out, output.zyxdim * sizeof(T),
275  cudaMemcpyDeviceToHost));
276 }
277 
278 template<typename T>
279 template<typename T_MAT>
281  bool isInv) {
282  Matrix2D<T_MAT> trInv = isInv ? transform : transform.inv();
283  Matrix2D<T> tmp;
284  typeCast(trInv, tmp);
285  gpuErrchk(
286  cudaMemcpy(d_trInv, tmp.mdata, tmp.mdim * sizeof(T),
287  cudaMemcpyHostToDevice));
288 }
289 
290 template <typename T>
292  const Matrix1D<T> &Y)
293 {
294  gpuErrchk(
295  cudaMemcpyAsync(d_coeffsX, X.vdata, X.vdim * sizeof(T),
296  cudaMemcpyHostToDevice, *(cudaStream_t*)gpu->stream()));
297  gpuErrchk(
298  cudaMemcpyAsync(d_coeffsY, Y.vdata, Y.vdim * sizeof(T),
299  cudaMemcpyHostToDevice, *(cudaStream_t*)gpu->stream()));
300 }
301 
302 template <typename T>
304  const MultidimArray<T> &input)
305 {
306  gpuErrchk(
307  cudaMemcpyAsync(d_in, input.data, input.yxdim * sizeof(T), cudaMemcpyHostToDevice, *(cudaStream_t*)gpu->stream()));
308 
309  iirConvolve2D_Cardinal_Bspline_3_MirrorOffBoundInplace(d_in, input.xdim, input.ydim, *(cudaStream_t*)gpu->stream());
310 }
311 
312 
313 template<typename T>
314 void GeoTransformer<T>::applyGeometry_2D_wrap(int splineDegree) {
315  T minxp = 0;
316  T minyp = 0;
317  T minxpp = minxp - XMIPP_EQUAL_ACCURACY;
318  T minypp = minyp - XMIPP_EQUAL_ACCURACY;
319  T maxxp = inX - 1;
320  T maxyp = inY - 1;
321  T maxxpp = maxxp + XMIPP_EQUAL_ACCURACY;
322  T maxypp = maxyp + XMIPP_EQUAL_ACCURACY;
323 
324  dim3 dimBlock(BLOCK_DIM_X, BLOCK_DIM_X);
325  dim3 dimGrid(ceil(inX / (T) dimBlock.x), ceil(inY / (T) dimBlock.y));
326 
327  switch (splineDegree) {
328  case 3:
329  applyGeometryKernel_2D_wrap<T, 3,true><<<dimGrid, dimBlock>>>(d_trInv,
330  minxpp, maxxpp, minypp, maxypp,
331  minxp, maxxp, minyp, maxyp,
332  d_out, (int)inX, (int)inY, d_in, (int)inX, (int)inY);
333  gpuErrchk(cudaPeekAtLastError());
334  break;
335  default:
336  throw std::logic_error("not implemented");
337  }
338 }
339 
340 template<typename T>
342  gpuErrchk(
343  cudaMemcpy(d_in, input.data, input.zyxdim * sizeof(T),
344  cudaMemcpyHostToDevice));
345 }
346 
347 template<typename T>
348 void GeoTransformer<T>::loadOutput(MultidimArray<T> &output, T outside) {
349  setOutputSize(output);
350 
351  if (outside != (T) 0) {
352  // Initialize output matrix with value=outside
354  {
355  DIRECT_MULTIDIM_ELEM(output, n) = outside;
356  }
357  gpuErrchk(
358  cudaMemcpy(d_out, output.data, output.zyxdim * sizeof(T),
359  cudaMemcpyHostToDevice));
360  } else {
361  gpuErrchk(cudaMemset(d_out, 0, output.zyxdim * sizeof(T)));
362  }
363 }
364 template<typename T>
366  if (output.xdim == 0) {
367  output.resizeNoCopy(inZ, inY, inX);
368  }
369 }
370 
371 template<typename T>
372 template<typename T_MAT>
373 void GeoTransformer<T>::checkRestrictions(int splineDegree,
374  MultidimArray<T> &output, const MultidimArray<T> &input,
375  const Matrix2D<T_MAT> &transform) {
376  if (!isReadyForMatrix)
377  throw std::logic_error("Transformer is not ready yet.");
378 
379  checkRestrictions(output, input);
380 
381  if ((input.getDim() == 2)
382  && ((transform.Xdim() != 3) || (transform.Ydim() != 3)))
383  throw std::invalid_argument("2D transformation matrix is not 3x3");
384  if ((input.getDim() == 3)
385  && ((transform.Xdim() != 4) || (transform.Ydim() != 4)))
386  throw std::invalid_argument("3D transformation matrix is not 4x4");
387 }
388 
389 
390 template<typename T>
391 void GeoTransformer<T>::checkRestrictions(int splineDegree,
392  MultidimArray<T> &output, const MultidimArray<T> &input,
393  const std::pair<Matrix1D<T>, Matrix1D<T>> &coeffs, size_t frameIdx) {
394  if (!isReadyForBspline)
395  throw std::logic_error("Transformer is not ready yet.");
396 
397  checkRestrictions(output, input);
398 
399  if (frameIdx > inN)
400  throw std::invalid_argument("Frame index is out of bound");
401  size_t coeffsElems = splineX * splineY * splineN;
402  if ((coeffs.first.size() != coeffsElems) || (coeffs.second.size() != coeffsElems))
403  throw std::invalid_argument("Number of coefficients does not fit. "
404  "To init function, pass N control points.");
405 }
406 
407 template<typename T>
409  const MultidimArray<T> &input) {
410  if (!input.xdim)
411  throw std::invalid_argument("Input is empty");
412  if ((inX != input.xdim) || (inY != input.ydim) || (inZ != input.zdim))
413  throw std::logic_error(
414  "Transformer has been initialized for a different size of the input");
415  if (&input == &output)
416  throw std::invalid_argument(
417  "The input array cannot be the same as the output array");
418 
419  if (input.xdim < 64) {
420  throw std::invalid_argument("Xdim should be at least 64");
421  }
422 
423  if (input.ydim <= 1) {
424  throw std::invalid_argument("Ydim should be at least 2");
425  }
426 }
427 
428 template<typename T>
429 std::unique_ptr<T[]> GeoTransformer<T>::copy_out_d_in(size_t size) const {
430  auto copy_in = std::unique_ptr<T[]>(new T[size]);
431 
432  cudaMemcpy(copy_in.get(), d_in , sizeof(T) * size, cudaMemcpyDeviceToHost);
433 
434  return copy_in;
435  }
436 
437 template class GeoTransformer<float>;
438 template class GeoTransformer<double>;
#define gpuErrchk(code)
Definition: cuda_asserts.h:31
size_t Xdim() const
Definition: matrix2d.h:575
bool isIdentity() const
Definition: matrix2d.cpp:1323
Case or algorithm not implemented yet.
Definition: xmipp_error.h:177
template void translation2DMatrix(const Matrix1D< float > &, Matrix2D< float > &, bool inverse)
#define REPORT_ERROR(nerr, ErrormMsg)
Definition: xmipp_error.h:211
void produceAndLoadCoeffs(const MultidimArray< T > &input)
void resizeNoCopy(const MultidimArray< T1 > &v)
static double * y
void applyGeometry(int SplineDegree, MultidimArray< std::complex< double > > &V2, const MultidimArray< std::complex< double > > &V1, const Matrix2D< double > &A, bool inv, bool wrap, std::complex< double > outside, MultidimArray< double > *BcoeffsPtr)
void initForMatrix(size_t x, size_t y, size_t z)
void inv(Matrix2D< T > &result) const
Definition: matrix2d.cpp:663
void abs(Image< double > &op)
T * mdata
Definition: matrix2d.h:395
#define BLOCK_DIM_X
doublereal * x
#define i
Definition: mask.h:36
viol index
void initLazyForMatrix(size_t x, size_t y=1, size_t z=1)
void applyBSplineTransformRef(int splineDegree, MultidimArray< T > &output, const MultidimArray< T > &input, const std::pair< Matrix1D< T >, Matrix1D< T >> &coeffs, size_t imageIdx, T outside=0)
void initLazyForBSpline(size_t inX, size_t inY, size_t inN, size_t splineX, size_t splineY, size_t splineN, const GPU &gpu)
#define XMIPP_EQUAL_ACCURACY
Definition: xmipp_macros.h:119
#define FOR_ALL_DIRECT_ELEMENTS_IN_MULTIDIMARRAY(v)
double z
#define DIRECT_MULTIDIM_ELEM(v, n)
size_t Ydim() const
Definition: matrix2d.h:584
#define j
std::unique_ptr< T[]> copy_out_d_in(size_t size) const
Definition: ctf.h:38
void typeCast(const Matrix1D< T1 > &v1, Matrix1D< T2 > &v2)
Definition: matrix1d.h:1227
size_t mdim
Definition: matrix2d.h:416
String formatString(const char *format,...)
void initForBSpline(size_t inX, size_t inY, size_t inN, size_t splineX, size_t splineY, size_t splineN, const GPU &gpu)
Definition: gpu.h:36
fprintf(glob_prnt.io, "\)
T * vdata
The array itself.
Definition: matrix1d.h:258
void applyBSplineTransform(int splineDegree, MultidimArray< T > &output, const MultidimArray< T > &input, const std::pair< Matrix1D< T >, Matrix1D< T >> &coeffs, size_t imageIdx, T outside=0)
void applyGeometry(int splineDegree, MultidimArray< T > &output, const MultidimArray< T > &input, const Matrix2D< T_MAT > &transform, bool isInv, bool wrap, T outside=0, const MultidimArray< T > *bCoeffsPtr=NULL)
int * n
size_t vdim
Number of elements.
Definition: matrix1d.h:264