// NOTE(review): this file is a garbled extraction of a larger .cpp/.cu source.
// The leading integers (25, 28, 31, ...) are line numbers from the original
// file; enclosing function signatures and braces were lost in extraction.
// All code below is kept byte-identical; only comments are added.
//
// Includes: CUDA runtime API, plus "cuda_correlation.cu" so the kernels used
// later (computeCorrIndexStat2DOneToN, computeSumSumSqr2D) are compiled into
// this translation unit.
25 #include <cuda_runtime_api.h> 28 #include "cuda_correlation.cu" 31 template<
bool NORMALIZE>
// Fragment of a method templated on NORMALIZE (presumably computeOneToN):
// first launches the per-signal correlation-statistics computation, then
// stores the (possibly normalized) results into the figures of merit.
// NOTE(review): the call name below looks fused by the extraction — likely
// computeCorrStatOneToN<NORMALIZE>() in the original; confirm against upstream.
34 computeCorrStatOneToNNormalize();
38 storeResultOneToN<NORMALIZE>();
// Fragment of storeResultOneToN<NORMALIZE>(): converts the raw per-signal
// correlation sums (already copied to the host buffer m_h_corrRes) into the
// final figures of merit, one value per "other" signal.
42 template<
bool NORMALIZE>
// res        — destination vector of figures of merit (resized by the caller).
// noOfSignals— number of "other" signals compared against the single reference.
// elems      — number of elements in one reference signal (single-item size).
44 auto &res = this->getFiguresOfMerit();
45 const size_t noOfSignals = this->getSettings().otherDims.n();
46 const size_t elems = this->getSettings().refDims.sizeSingle();
// Raw (un-normalized kernel output) path: reconstruct a Pearson-style
// correlation coefficient on the host. ref holds avg/stddev of the reference,
// computed from the first (only) entry of m_h_ref_corrRes.
48 auto ref = computeStat(m_h_ref_corrRes[0], elems);
49 auto others = (ResRaw*) m_h_corrRes;
50 for(
size_t n = 0;
n < noOfSignals; ++
n) {
// o holds avg/stddev of the n-th other signal derived from its raw sums.
51 auto o = computeStat(others[
n], elems);
// Numerator: sum(x*y) - N*avg(x)*avg(y); denominator: N*std(x)*std(y).
// NOTE(review): no guard against a zero denominator (constant signal) is
// visible here — result would be inf/NaN; confirm whether callers rely on that.
52 T num = others[
n].corr - (o.avg * ref.avg * elems);
53 T denom = o.stddev * ref.stddev * elems;
54 res.at(
n) = num / denom;
// Pre-normalized path (kernel already worked on normalized data): the final
// figure of merit is simply the accumulated correlation divided by the
// number of elements.
57 auto others = (ResNorm*) m_h_corrRes;
58 for(
size_t n = 0;
n < noOfSignals; ++
n) {
59 res.at(
n) = others[
n].corr / elems;
// Fragment of computeStat(): derives the standard deviation from accumulated
// sum-of-squares. stddev = sqrt(|E[x^2] - E[x]^2|); the abs() guards against
// a tiny negative argument caused by floating-point rounding when the true
// variance is ~0.
// NOTE(review): 'norm' here is presumably the element count used to turn the
// raw sum-of-squares into a mean — confirm against the missing lines.
69 T sumSqrNorm = r.sumSqr /
norm;
70 s.stddev =
sqrt(
abs(sumSqrNorm - (s.avg * s.avg)));
// Fragment of compute(T *others): the public entry point that correlates the
// previously loaded reference against a batch of other signals.
// Precondition: the computer must be initialized and a reference loaded
// (the code handling a false isReady is not visible in this extraction).
76 bool isReady = this->isInitialized() && this->isRefLoaded();
// 'others' is expected on the device; branch body (likely an error report or
// a host->device upload) was lost in extraction — TODO confirm.
80 if ( ! m_stream->isGpuPointer(others)) {
// One figure of merit per "other" signal.
86 const auto &s = this->getSettings();
87 this->getFiguresOfMerit().resize(s.otherDims.n());
// Dispatch to the templated implementation: NORMALIZE as a compile-time flag
// avoids a per-element runtime branch in the downstream code.
90 if (s.normalizeResult) {
91 computeOneToN<true>();
93 computeOneToN<false>();
// Fragment of loadReference(const T *ref): uploads the reference signal to the
// device and, when results must be normalized, precomputes its avg/stddev.
104 const auto &s = this->getSettings();
// A null ref clears the "reference loaded" state.
105 this->setIsRefLoaded(
nullptr != ref);
// If 'ref' already lives on the GPU the copy below is presumably skipped or
// replaced by a device-to-device copy — that branch body was lost. TODO confirm.
107 if (m_stream->isGpuPointer(ref)) {
110 size_t bytes = s.refDims.size() *
sizeof(T);
// Pin the host buffer (if not already pinned) so the async H2D copy on the
// worker stream is actually asynchronous; unpin afterwards to restore the
// caller's buffer to its original state.
111 bool hasToPin = ! m_stream->isMemoryPinned(ref);
113 m_stream->pinMemory(ref, bytes);
115 auto stream = *(cudaStream_t*)m_stream->stream();
// (cudaMemcpyAsync call — source/destination arguments lost in extraction.)
121 cudaMemcpyHostToDevice, stream));
123 m_stream->unpinMemory(ref);
// Reference avg/stddev are only needed for the normalized figure of merit.
126 if (s.normalizeResult) {
127 computeAvgStddevForRef();
// Fragment of computeCorrStatOneToN(): launches the correlation-statistics
// kernel over all "other" signals at once and copies the per-signal results
// back to the host asynchronously on the worker stream.
135 const auto &dims = this->getSettings().otherDims;
136 auto stream = *(cudaStream_t*)m_stream->stream();
// 1D grid covering x*n elements; ceil-division so the tail is not dropped.
// (dimBlock definition lost in extraction.)
140 ceil((dims.x() * dims.n()) / (
float)dimBlock.x));
// Zero the device accumulator: one ResRaw per signal; the kernel presumably
// accumulates into it with atomics — TODO confirm against the kernel source.
142 size_t bytes = dims.n() *
sizeof(ResRaw);
143 gpuErrchk(cudaMemset(m_d_corrRes, 0, bytes));
// Compile-time dispatch on the scalar type; the result triple (corr, sum,
// sumSqr?) is packed into the matching CUDA vector type (float3/double3).
145 if (std::is_same<T, float>::value) {
146 computeCorrIndexStat2DOneToN<float, float3>
147 <<<dimGrid, dimBlock, 0, stream>>> (
150 dims.x(), dims.y(), dims.n(),
151 (float3*)m_d_corrRes);
152 }
else if (std::is_same<T, double>::value) {
153 computeCorrIndexStat2DOneToN<double, double3>
154 <<<dimGrid, dimBlock, 0, stream>>> (
157 dims.x(), dims.y(), dims.n(),
158 (double3*)m_d_corrRes);
// NOTE(review): no cudaGetLastError() after the launch is visible here.
// Async D2H copy of the results (arguments lost in extraction); m_h_corrRes
// is pinned during allocation, so this overlaps correctly with the stream.
165 cudaMemcpyDeviceToHost, stream));
// Fragment of computeAvgStddevForRef(): same launch pattern as
// computeCorrStatOneToN(), but runs the sum/sum-of-squares kernel over the
// reference signal(s) so avg/stddev can be derived on the host.
171 const auto &dims = this->getSettings().refDims;
172 auto stream = *(cudaStream_t*)m_stream->stream();
// 1D grid with ceil-division over x*n elements (dimBlock lost in extraction).
176 ceil((dims.x() * dims.n()) / (
float)dimBlock.x));
// One ResRef (sum, sumSqr) per reference; zeroed before accumulation.
// NOTE(review): m_d_corrRes is reused here for the reference statistics —
// callers must not overlap this with the one-to-N computation.
178 size_t bytes = dims.n() *
sizeof(ResRef);
179 gpuErrchk(cudaMemset(m_d_corrRes, 0, bytes));
// Compile-time type dispatch; the (sum, sumSqr) pair maps to float2/double2.
180 if (std::is_same<T, float>::value) {
181 computeSumSumSqr2D<float, float2>
182 <<<dimGrid, dimBlock, 0, stream>>> (
184 dims.x(), dims.y(), dims.n(),
185 (float2*)m_d_corrRes);
186 }
else if (std::is_same<T, double>::value) {
187 computeSumSumSqr2D<double, double2>
188 <<<dimGrid, dimBlock, 0, stream>>> (
190 dims.x(), dims.y(), dims.n(),
191 (double2*)m_d_corrRes);
// Async D2H copy into the pinned host buffer (arguments lost in extraction).
198 cudaMemcpyDeviceToHost, stream));
// Fragment of an init/check routine: obtains the GPU worker from settings.
204 const auto &s = this->getSettings();
// The first hardware entry must actually be a GPU; dynamic_cast yields
// nullptr otherwise, which the check below turns into an error (error body
// lost in extraction).
209 m_stream =
dynamic_cast<GPU*
>(s.hw.at(0));
210 if (
nullptr == m_stream) {
// Fragment of a cleanup routine (destructor or release()): undo the pinning
// done in allocate() before freeing the host buffers. Only safe to touch
// m_stream when initialization completed.
227 if (this->isInitialized()) {
228 m_stream->unpinMemory(m_h_corrRes);
229 m_stream->unpinMemory(m_h_ref_corrRes);
// NOTE(review): only m_h_ref_corrRes is freed in the visible lines — the
// free of m_h_corrRes and cudaFree of the device buffers were presumably
// lost in extraction; confirm no leak in the original.
231 free(m_h_ref_corrRes);
// Fragment of allocate(): sizes and allocates all device/host buffers.
240 const auto& s = this->getSettings();
// Device copy of the reference signal.
242 gpuErrchk(cudaMalloc(&m_d_ref, s.refDims.size() *
sizeof(T)));
245 size_t bytesResOthers;
// Buffer sizes depend on the result mode: the non-normalized path needs the
// larger raw triple per "other" signal PLUS per-reference sum/sumSqr storage;
// the normalized path only needs one ResNorm per signal.
247 if (s.normalizeResult) {
248 bytesResOthers = s.otherDims.n() *
sizeof(ResRaw);
249 bytesResRef = s.refDims.n() *
sizeof(ResRef);
251 bytesResOthers = s.otherDims.n() *
sizeof(ResNorm);
// Single device result buffer, reused for both the reference statistics and
// the one-to-N correlation results.
254 gpuErrchk(cudaMalloc(&m_d_corrRes, bytesResOthers));
// Host result buffers are zeroed and pinned so the async D2H copies above
// can overlap with computation. (Allocations lost in extraction.)
257 memset(m_h_ref_corrRes, 0, bytesResRef);
258 m_stream->pinMemory(m_h_ref_corrRes, bytesResRef);
261 memset(m_h_corrRes, 0, bytesResOthers);
262 m_stream->pinMemory(m_h_corrRes, bytesResOthers);
// Fragment of a default/reset routine: null all owned pointers so a later
// cleanup can run safely on a never-allocated instance.
269 m_d_others =
nullptr;
270 m_d_corrRes =
nullptr;
272 m_h_ref_corrRes =
nullptr;
273 m_h_corrRes =
nullptr;
// Fragment of a reuse/validation check: existing buffers can be reused only
// when the instance is initialized, the scalar/settings type matches, and the
// previously allocated "other" buffers are at least as large as now required.
281 if ( ! this->isInitialized()) {
284 auto &sOrig = this->getSettings();
285 result = result && sOrig.type == s.
type;
// >= (not ==): a bigger existing allocation can serve a smaller batch.
286 result = result && (sOrig.otherDims.size() >= s.
otherDims.
size());
// Fragment of a settings-sanity check: this computer drives exactly one
// hardware unit (the single GPU stream); anything else is rejected
// (error body lost in extraction).
293 const auto &s = this->getSettings();
294 if (1 != s.
hw.size()) {
CUDA_HD constexpr bool is2D() const
void * page_aligned_alloc(size_t bytes)
Case or algorithm not implemented yet.
#define REPORT_ERROR(nerr, ErrormMsg)
void sqrt(Image< double > &op)
void abs(Image< double > &op)
T norm(const std::vector< T > &v)
void loadReference(const T *ref) override
constexpr size_t size() const
void compute(T *others) override
Some logical error in the pipeline.