7 #include "cuda_angular_sph_alignment.cu" 16 #include <thrust/reduce.h> 17 #include <thrust/device_vector.h> 24 cudaError
cudaMallocAndCopy(T** target,
const T* source,
size_t numberOfElements,
size_t memSize = 0)
26 size_t elemSize = numberOfElements *
sizeof(T);
27 memSize = memSize == 0 ? elemSize : memSize *
sizeof(T);
29 cudaError err = cudaSuccess;
30 if ((err = cudaMalloc(target, memSize)) != cudaSuccess) {
35 if ((err = cudaMemcpy(*target, source, elemSize, cudaMemcpyHostToDevice)) != cudaSuccess) {
40 if (memSize > elemSize) {
41 cudaMemset((*target) + numberOfElements, 0, memSize - elemSize);
47 #define processCudaError() (_processCudaError(__FILE__, __LINE__)) 50 cudaError_t err = cudaGetLastError();
51 if (err != cudaSuccess) {
52 fprintf(stderr,
"File: %s: line %d\nCuda error: %s\n", file, line, cudaGetErrorString(err));
59 template<
typename Target,
typename Source>
60 void transformData(Target** dest, Source* source,
size_t n,
bool mallocMem =
true)
62 std::vector<Target> tmp(source, source + n);
65 if (cudaMalloc(dest,
sizeof(Target) * n) != cudaSuccess){
70 if (cudaMemcpy(*dest, tmp.data(),
sizeof(Target) * n, cudaMemcpyHostToDevice) != cudaSuccess){
90 cudaFree(dProjectionPlane);
104 if (program ==
nullptr)
105 throw(std::runtime_error(
"AngularSphAlignment not associated with the program!"));
109 this->iRmax = 1.0 / program->
RmaxDef;
110 setupImageMetaData(program->
V);
113 setupVolumeDataCpu();
114 setupVolumeMaskCpu();
126 grid.x = ((imageMetaData.
xDim + block.x - 1) / block.x);
127 grid.y = ((imageMetaData.
yDim + block.y - 1) / block.y);
128 grid.z = ((imageMetaData.
zDim + block.z - 1) / block.z);
130 totalGridSize = grid.x * grid.y * grid.z;
133 constantSharedMemSize = 0;
138 if (program ==
nullptr)
139 throw(std::runtime_error(
"AngularSphAlignment not associated with the program!"));
144 setupProjectionPlaneCpu();
148 setupProjectionPlane();
153 changingSharedMemSize = 0;
154 changingSharedMemSize +=
sizeof(int4) * steps;
158 void AngularSphAlignment::setupClnm()
160 clnmVec.resize(program->
vL1.
size());
162 for (
unsigned i = 0;
i < program->
vL1.
size(); ++
i) {
163 clnmVec[
i].x = program->
clnm[
i];
168 if (dClnm ==
nullptr) {
172 if (cudaMemcpy(dClnm, clnmVec.data(), clnmVec.size() *
sizeof(
PrecisionType3),
173 cudaMemcpyHostToDevice) != cudaSuccess)
178 void AngularSphAlignment::setupClnmCpu()
180 clnmVec.resize(program->
vL1.
size());
182 for (
unsigned i = 0;
i < program->
vL1.
size(); ++
i) {
183 clnmVec[
i].x = program->
clnm[
i];
187 dClnm = clnmVec.data();
197 size_t elements = imageMetaData.
xDim * imageMetaData.
yDim * imageMetaData.
zDim;
198 std::vector<PrecisionType> tVec(elements);
199 cudaMemcpy(tVec.data(), inputData,
sizeof(
PrecisionType) * elements, cudaMemcpyDeviceToHost);
200 std::vector<double> dVec(tVec.begin(), tVec.end());
201 memcpy(outputImage().data, dVec.
data(),
sizeof(double) * elements);
204 void AngularSphAlignment::setupVolumeData()
206 const auto& vol = program->
V();
207 transformData(&dVolData, vol.data, vol.zyxdim, dVolData ==
nullptr);
210 void AngularSphAlignment::setupVolumeDataCpu()
212 const auto& vol = program->
V();
213 volDataVec.assign(vol.data, vol.data + vol.zyxdim);
214 dVolData = volDataVec.data();
217 void AngularSphAlignment::setupRotation()
222 void AngularSphAlignment::setupRotationCpu()
225 dRotation = rotationVec.data();
228 void AngularSphAlignment::setupVolumeMask()
230 if (dVolMask ==
nullptr) {
236 cudaMemcpyHostToDevice) != cudaSuccess)
241 void AngularSphAlignment::setupVolumeMaskCpu()
246 void AngularSphAlignment::setupProjectionPlane()
248 const auto& projPlane = program->
P();
249 if (dProjectionPlane ==
nullptr) {
250 if (cudaMalloc(&dProjectionPlane, projPlane.yxdim *
sizeof(
PrecisionType)) != cudaSuccess)
253 cudaMemset(dProjectionPlane, 0, projPlane.yxdim *
sizeof(
PrecisionType));
256 void AngularSphAlignment::setupProjectionPlaneCpu()
258 const auto& projPlane = program->
P();
259 projectionPlaneVec.assign(projPlane.data, projPlane.data + projPlane.yxdim);
260 dProjectionPlane = projectionPlaneVec.data();
294 thrust::device_vector<PrecisionType> thrustVec(totalGridSize * 3, 0.0);
297 cudaDeviceSynchronize();
300 projectionKernel<<<grid, block, constantSharedMemSize + changingSharedMemSize>>>(
311 thrust::raw_pointer_cast(thrustVec.data())
314 cudaDeviceSynchronize();
316 auto countIt = thrustVec.begin();
317 auto sumVDIt = countIt + totalGridSize;
318 auto modgIt = sumVDIt + totalGridSize;
320 outputs.
count = thrust::reduce(countIt, sumVDIt);
321 outputs.
sumVD = thrust::reduce(sumVDIt, modgIt);
322 outputs.
modg = thrust::reduce(modgIt, thrustVec.end());
326 void AngularSphAlignment::transferProjectionPlane()
330 std::vector<PrecisionType> tmp(program->
P().zyxdim);
331 cudaMemcpy(tmp.data(), dProjectionPlane, tmp.size() *
sizeof(
PrecisionType),
332 cudaMemcpyDeviceToHost);
333 std::vector<double> tmpDouble(tmp.begin(), tmp.end());
334 memcpy(program->
P().
data, tmpDouble.
data(), tmpDouble.size() *
sizeof(double));
337 void AngularSphAlignment::transferProjectionPlaneCpu()
339 std::vector<double> tmp(projectionPlaneVec.begin(), projectionPlaneVec.end());
340 memcpy(program->
P().
data, tmp.
data(), tmp.size() *
sizeof(double));
346 transferProjectionPlaneCpu();
348 transferProjectionPlane();
352 void AngularSphAlignment::setupZSHparams()
354 zshparamsVec.resize(program->
vL1.
size());
356 for (
unsigned i = 0;
i < zshparamsVec.size(); ++
i) {
357 zshparamsVec[
i].w = program->
vL1[
i];
358 zshparamsVec[
i].x = program->
vN[
i];
359 zshparamsVec[
i].y = program->
vL2[
i];
360 zshparamsVec[
i].z = program->
vM[
i];
363 if (dZshParams ==
nullptr) {
364 if (
cudaMallocAndCopy(&dZshParams, zshparamsVec.data(), zshparamsVec.size()) != cudaSuccess)
367 if (cudaMemcpy(dZshParams, zshparamsVec.data(), zshparamsVec.size() *
sizeof(int4),
368 cudaMemcpyHostToDevice) != cudaSuccess)
373 void AngularSphAlignment::setupZSHparamsCpu()
375 zshparamsVec.resize(program->
vL1.
size());
377 for (
unsigned i = 0;
i < zshparamsVec.size(); ++
i) {
378 zshparamsVec[
i].w = program->
vL1[
i];
379 zshparamsVec[
i].x = program->
vN[
i];
380 zshparamsVec[
i].y = program->
vL2[
i];
381 zshparamsVec[
i].z = program->
vM[
i];
383 dZshParams = zshparamsVec.data();
388 auto& mda = inputImage();
389 transformData(outputImageData, mda.data, mda.xdim * mda.ydim * mda.zdim);
392 void AngularSphAlignment::setupImageMetaData(
const Image<double>& mda)
394 imageMetaData.
xShift = mda().xinit;
395 imageMetaData.
yShift = mda().yinit;
396 imageMetaData.
zShift = mda().zinit;
397 imageMetaData.
xDim = mda().xdim;
398 imageMetaData.
yDim = mda().ydim;
399 imageMetaData.
zDim = mda().zdim;
404 auto& mda = inputImage();
411 if (cudaMalloc(outputImageData, size) != cudaSuccess)
431 size_t idxZ0=2*idxY0;
456 double gx=0.0, gy=0.0, gz=0.0;
457 double k2=
ZZ(pos)*
ZZ(pos);
458 double kr=
ZZ(pos)*iRmaxF;
459 double k2i2=k2+
YY(pos)*
YY(pos);
460 double ir=
YY(pos)*iRmaxF;
461 double r2=k2i2+
XX(pos)*
XX(pos);
462 double jr=
XX(pos)*iRmaxF;
463 double rr=
sqrt(r2)*iRmaxF;
465 for (
size_t idx=0; idx<idxY0; idx++) {
475 gy +=
VEC_ELEM(clnm,idx+idxY0) *(zsph);
476 gz +=
VEC_ELEM(clnm,idx+idxZ0) *(zsph);
481 int k_mask, i_mask, j_mask;
483 k_mask = (int)(
ZZ(pos)+gz);
484 i_mask = (int)(
YY(pos)+gy);
485 j_mask = (int)(
XX(pos)+gx);
487 if (V_mask.
outside(k_mask, i_mask, j_mask)) {
490 voxelI_mask =
A3D_ELEM(V_mask, k_mask, i_mask, j_mask);
493 if (voxelI_mask == 1) {
496 outputs.
sumVD += voxelI;
497 outputs.
modg += gx*gx+gy*gy+gz*gz;
510 for (
size_t i = 0;
i < 3;
i++)
511 for (
size_t j = 0;
j < 3;
j++)
512 tmp[
i] += rotation[3 *
i +
j] * pos[
j];
536 outside_value : ELEM_3D_SHIFTED(data, ImD, z0, y0, x0);
538 outside_value : ELEM_3D_SHIFTED(data, ImD, z0, y0, x1);
540 outside_value : ELEM_3D_SHIFTED(data, ImD, z0, y1, x0);
542 outside_value : ELEM_3D_SHIFTED(data, ImD, z0, y1, x1);
544 outside_value : ELEM_3D_SHIFTED(data, ImD, z1, y0, x0);
546 outside_value : ELEM_3D_SHIFTED(data, ImD, z1, y0, x1);
548 outside_value : ELEM_3D_SHIFTED(data, ImD, z1, y1, x0);
550 outside_value : ELEM_3D_SHIFTED(data, ImD, z1, y1, x1);
576 outputs->
sumVD = 0.0;
578 outputs->
count = 0.0;
592 for (
int k = P2L_Z_IDX(volMeta, 0);
k < P2L_Z_IDX(volMeta, volMeta.
zDim);
k++) {
593 for (
int i = P2L_Y_IDX(volMeta, 0);
i < P2L_Y_IDX(volMeta, volMeta.
yDim);
i++) {
594 for (
int j = P2L_X_IDX(volMeta, 0);
j < P2L_X_IDX(volMeta, volMeta.
xDim);
j++) {
595 pos[2] =
k; pos[1] =
i; pos[0] =
j;
599 double gx = 0.0, gy = 0.0, gz = 0.0;
600 double k2= pos[2] * pos[2];
601 double kr= pos[2] * iRmax;
602 double k2i2 =k2 + pos[1] * pos[1];
603 double ir= pos[1] * iRmax;
604 double r2= k2i2 + pos[0] * pos[0];
605 double jr= pos[0] * iRmax;
606 double rr =
sqrt(r2) * iRmax;
609 for (
int idx = 0; idx < steps; idx++) {
610 int l1 = zshparams[idx].w;
611 int n = zshparams[idx].x;
612 int l2 = zshparams[idx].y;
613 int m = zshparams[idx].z;
616 gx += zsph * clnm[idx].x;
617 gy += zsph * clnm[idx].y;
618 gz += zsph * clnm[idx].z;
622 int k_mask, i_mask, j_mask;
624 k_mask = (int)(pos[2] + gz);
625 i_mask = (int)(pos[1] + gy);
626 j_mask = (int)(pos[0] + gx);
628 if (IS_OUTSIDE(volMeta, k_mask, i_mask, j_mask)) {
631 voxelI_mask = ELEM_3D_SHIFTED(Vmask, volMeta, k_mask, i_mask, j_mask);
634 if (voxelI_mask == 1) {
636 pos[0] + gx, pos[1] + gy, pos[2] + gz);
637 ELEM_2D_SHIFTED(projectionPlane, volMeta,
i,
j) += voxelI;
638 outputs->
sumVD += voxelI;
639 outputs->
modg += gx*gx + gy*gy + gz*gz;
#define A2D_ELEM(v, i, j)
void rotate(PrecisionType *pos, PrecisionType *rotation)
void _processCudaError(const char *file, int line)
void setupConstantParameters()
void sqrt(Image< double > &op)
MultidimArray< int > V_mask
cudaError cudaMallocAndCopy(T **target, const T *source, size_t numberOfElements, size_t memSize=0)
ql0001_ & k(htemp+1),(cvec+1),(atemp+1),(bj+1),(bl+1),(bu+1),(x+1),(clamda+1), &iout, infoqp, &zero,(w+1), &lenw,(iw+1), &leniw, &glob_grd.epsmac
#define A3D_ELEM(V, k, i, j)
#define processCudaError()
void setupImageNew(Image< double > &inputImage, PrecisionType **outputImageData)
void associateWith(ProgAngularSphAlignmentGpu *prog)
void fakeKernel(PrecisionType Rmax2, PrecisionType iRmax, ImageMetaData volMeta, PrecisionType *volData, PrecisionType *rotation, int steps, int4 *zshparams, PrecisionType3 *clnm, int *Vmask, PrecisionType *projectionPlane, KernelOutputs *outputs)
void runKernelTest(Matrix1D< double > &clnm, size_t idxY0, double RmaxF2, double iRmaxF, Matrix2D< double > R, const MultidimArray< double > &mV, Matrix1D< double > &steps_cp, Matrix1D< int > &vL1, Matrix1D< int > &vN, Matrix1D< int > &vL2, Matrix1D< int > &vM, MultidimArray< int > &V_mask, MultidimArray< double > &mP)
T interpolatedElement3D(double x, double y, double z, T outside_value=(T) 0) const
PrecisionType interpolatedElement3DCpu(PrecisionType *data, ImageMetaData ImD, PrecisionType x, PrecisionType y, PrecisionType z, PrecisionType outside_value=0)
void setupChangingParameters()
bool outside(int k, int i, int j) const
void transformData(Target **dest, Source *source, size_t n, bool mallocMem=true)
fprintf(glob_prnt.io, "\)
#define LIN_INTERP(a, l, h)
KernelOutputs getOutputs()