28 #include <cuda_runtime_api.h> 43 isReadyForBspline = isReadyForMatrix =
false;
44 d_trInv = d_in = d_out = d_coeffsX = d_coeffsY =
nullptr;
45 inX = inY = inZ = splineX = splineY = splineN;
55 size_t matSize = (0 ==
z) ? 9 : 16;
56 gpuErrchk(cudaMalloc((
void** ) &d_trInv, matSize *
sizeof(T)));
57 gpuErrchk(cudaMalloc((
void** ) &d_in, x * y * z *
sizeof(T)));
58 gpuErrchk(cudaMalloc((
void** ) &d_out, x * y * z *
sizeof(T)));
60 isReadyForMatrix =
true;
65 if (!isReadyForMatrix) {
66 initForMatrix(x, y, z);
72 size_t splineX,
size_t splineY,
size_t splineN,
const GPU &gpu)
80 this->splineX = splineX;
81 this->splineY = splineY;
82 this->splineN = splineN;
86 const int BLOCK_SIZE = iirConvolve2D_Cardinal_BSpline_3_MirrorOffBoundKernels::BLOCK_SIZE;
87 const int Y_padded = (inY / BLOCK_SIZE) * BLOCK_SIZE + BLOCK_SIZE * (inY % BLOCK_SIZE != 0);
89 size_t inOutSize = inX * inY;
90 size_t inOutSize_padded = inX* Y_padded;
91 size_t coeffsSize = splineX * splineY * splineN;
92 gpuErrchk(cudaMalloc((
void** ) &d_coeffsX, coeffsSize *
sizeof(T)));
93 gpuErrchk(cudaMalloc((
void** ) &d_coeffsY, coeffsSize *
sizeof(T)));
94 gpuErrchk(cudaMalloc((
void** ) &d_in, inOutSize_padded *
sizeof(T)));
95 gpuErrchk(cudaMalloc((
void** ) &d_out, inOutSize *
sizeof(T)));
99 isReadyForBspline =
true;
102 template <
typename T>
104 size_t splineX,
size_t splineY,
size_t splineN,
const GPU &gpu)
106 if (!isReadyForBspline)
108 initForBSpline(inX, inY, inZ, splineX, splineY, splineN, gpu);
115 shift.
vdata[0] = 0.45;
116 shift.
vdata[1] = 0.62;
126 for (
int i = 0;
i < input.
ydim; ++
i) {
127 for (
int j = 0;
j < input.
xdim; ++
j) {
132 this->initForMatrix(input.
xdim, input.
ydim, input.
zdim);
133 this->
applyGeometry(3, resGpu, input, transform,
false,
true);
137 for (
int i = 0;
i < input.
ydim; ++
i) {
138 for (
int j = 0;
j < input.
xdim; ++
j) {
140 T gpu = resGpu[
index];
141 T cpu = resCpu[
index];
144 fprintf(stderr,
"error[%d]: GPU %.4f CPU %.4f\n", index, gpu,
150 fprintf(stderr,
"test transform result: %s\n", failed ?
"FAIL" :
"OK");
159 checkRestrictions(3, output, input, coeffs, imageIdx);
161 loadOutput(output, outside);
162 produceAndLoadCoeffs(input);
164 loadCoefficients(coeffs.first, coeffs.second);
167 dim3 dimGrid(ceil(inX / (T) dimBlock.x), ceil(inY / (T) dimBlock.y));
169 switch (splineDegree) {
171 applyLocalShiftGeometryKernel<T, 3><<<dimGrid, dimBlock>>>(d_coeffsX, d_coeffsY,
172 d_out, (int)inX, (
int)inY, (int)inN,
173 d_in, imageIdx, (
int)splineX, (int)splineY, (
int)splineN);
177 throw std::logic_error(
"not implemented");
181 cudaMemcpy(output.
data, d_out, output.
zyxdim *
sizeof(T),
182 cudaMemcpyDeviceToHost));
191 checkRestrictions(3, output, input, coeffs, imageIdx);
192 auto stream = *(cudaStream_t*)gpu->stream();
194 setOutputSize(output);
195 if ( splineDegree > 1 ) {
196 produceAndLoadCoeffs(input);
200 gpuErrchk(cudaMemcpyAsync(d_in, input.
data, input.
yxdim *
sizeof(T), cudaMemcpyHostToDevice, stream));
203 loadCoefficients(coeffs.first, coeffs.second);
205 dim3 dimBlock(16, 16);
206 dim3 dimGrid(ceil(inX / (T) dimBlock.x), ceil((inY / (T) dimBlock.y) / (T) pixelsPerThread));
209 T hX = (splineX == 3) ? inX : (inX / (T) ((splineX - 3)));
210 T hY = (splineY == 3) ? inY : (inY / (T) ((splineY - 3)));
211 T hT = (splineN == 3) ? inN : (inN / (T) ((splineN - 3)));
212 T tPos = imageIdx / hT;
214 switch (splineDegree) {
216 applyLocalShiftGeometryKernelMorePixels<T, 1, pixelsPerThread><<<dimGrid, dimBlock, 0, stream>>>(d_coeffsX, d_coeffsY,
217 d_out, (int)inX, (
int)inY, (int)inN,
218 d_in, imageIdx, (
int)splineX, (int)splineY, (
int)splineN,
223 applyLocalShiftGeometryKernelMorePixels<T, 3, pixelsPerThread><<<dimGrid, dimBlock, 0, stream>>>(d_coeffsX, d_coeffsY,
224 d_out, (int)inX, (
int)inY, (int)inN,
225 d_in, imageIdx, (
int)splineX, (int)splineY, (
int)splineN,
234 cudaMemcpyAsync(output.
data, d_out, output.
zyxdim *
sizeof(T),
235 cudaMemcpyDeviceToHost, stream));
239 template<
typename T_MAT>
244 checkRestrictions(splineDegree, output, input, transform);
250 loadTransform(transform, isInv);
251 loadOutput(output, outside);
253 if (splineDegree > 1) {
254 if (NULL != bCoeffsPtr) {
255 loadInput(*bCoeffsPtr);
257 produceAndLoadCoeffs(input);
263 if (input.
getDim() == 2) {
265 applyGeometry_2D_wrap(splineDegree);
267 throw std::logic_error(
"Not implemented yet");
270 throw std::logic_error(
"Not implemented yet");
274 cudaMemcpy(output.
data, d_out, output.
zyxdim *
sizeof(T),
275 cudaMemcpyDeviceToHost));
279 template<
typename T_MAT>
286 cudaMemcpy(d_trInv, tmp.
mdata, tmp.
mdim *
sizeof(T),
287 cudaMemcpyHostToDevice));
290 template <
typename T>
295 cudaMemcpyAsync(d_coeffsX, X.
vdata, X.
vdim *
sizeof(T),
296 cudaMemcpyHostToDevice, *(cudaStream_t*)gpu->stream()));
298 cudaMemcpyAsync(d_coeffsY, Y.
vdata, Y.
vdim *
sizeof(T),
299 cudaMemcpyHostToDevice, *(cudaStream_t*)gpu->stream()));
302 template <
typename T>
307 cudaMemcpyAsync(d_in, input.
data, input.
yxdim *
sizeof(T), cudaMemcpyHostToDevice, *(cudaStream_t*)gpu->stream()));
309 iirConvolve2D_Cardinal_Bspline_3_MirrorOffBoundInplace(d_in, input.
xdim, input.
ydim, *(cudaStream_t*)gpu->stream());
325 dim3 dimGrid(ceil(inX / (T) dimBlock.x), ceil(inY / (T) dimBlock.y));
327 switch (splineDegree) {
329 applyGeometryKernel_2D_wrap<T, 3,true><<<dimGrid, dimBlock>>>(d_trInv,
330 minxpp, maxxpp, minypp, maxypp,
331 minxp, maxxp, minyp, maxyp,
332 d_out, (int)inX, (
int)inY, d_in, (int)inX, (
int)inY);
336 throw std::logic_error(
"not implemented");
343 cudaMemcpy(d_in, input.
data, input.
zyxdim *
sizeof(T),
344 cudaMemcpyHostToDevice));
349 setOutputSize(output);
351 if (outside != (T) 0) {
358 cudaMemcpy(d_out, output.
data, output.
zyxdim *
sizeof(T),
359 cudaMemcpyHostToDevice));
366 if (output.
xdim == 0) {
372 template<
typename T_MAT>
376 if (!isReadyForMatrix)
377 throw std::logic_error(
"Transformer is not ready yet.");
379 checkRestrictions(output, input);
382 && ((transform.
Xdim() != 3) || (transform.
Ydim() != 3)))
383 throw std::invalid_argument(
"2D transformation matrix is not 3x3");
385 && ((transform.
Xdim() != 4) || (transform.
Ydim() != 4)))
386 throw std::invalid_argument(
"3D transformation matrix is not 4x4");
394 if (!isReadyForBspline)
395 throw std::logic_error(
"Transformer is not ready yet.");
397 checkRestrictions(output, input);
400 throw std::invalid_argument(
"Frame index is out of bound");
401 size_t coeffsElems = splineX * splineY * splineN;
402 if ((coeffs.first.size() != coeffsElems) || (coeffs.second.size() != coeffsElems))
403 throw std::invalid_argument(
"Number of coefficients does not fit. " 404 "To init function, pass N control points.");
411 throw std::invalid_argument(
"Input is empty");
412 if ((inX != input.
xdim) || (inY != input.
ydim) || (inZ != input.
zdim))
413 throw std::logic_error(
414 "Transformer has been initialized for a different size of the input");
415 if (&input == &output)
416 throw std::invalid_argument(
417 "The input array cannot be the same as the output array");
419 if (input.
xdim < 64) {
420 throw std::invalid_argument(
"Xdim should be at least 64");
423 if (input.
ydim <= 1) {
424 throw std::invalid_argument(
"Ydim should be at least 2");
430 auto copy_in = std::unique_ptr<T[]>(
new T[size]);
432 cudaMemcpy(copy_in.get(), d_in ,
sizeof(T) * size, cudaMemcpyDeviceToHost);
Case or algorithm not implemented yet.
#define REPORT_ERROR(nerr, ErrormMsg)
void resizeNoCopy(const MultidimArray< T1 > &v)
void inv(Matrix2D< T > &result) const
void abs(Image< double > &op)
#define XMIPP_EQUAL_ACCURACY
#define FOR_ALL_DIRECT_ELEMENTS_IN_MULTIDIMARRAY(v)
#define DIRECT_MULTIDIM_ELEM(v, n)
void typeCast(const Matrix1D< T1 > &v1, Matrix1D< T2 > &v2)
String formatString(const char *format,...)
fprintf(glob_prnt.io, "\)
T * vdata
The array itself.
size_t vdim
Number of elements.