Xmipp  v3.23.11-Nereus
dimred_tools.cpp
Go to the documentation of this file.
1 /***************************************************************************
2  *
3  * Authors: Carlos Oscar Sanchez Sorzano coss@cnb.csic.es (2013)
4  *
5  * Unidad de Bioinformatica of Centro Nacional de Biotecnologia , CSIC
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 2 of the License, or
10  * (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
20  * 02111-1307 USA
21  *
22  * All comments concerning this program package may be sent to the
23  * e-mail address 'xmipp@cnb.csic.es'
24  ***************************************************************************/
25 
26 #include <random>
27 #include <algorithm>
28 #include "dimred_tools.h"
29 #include "core/xmipp_funcs.h"
30 
31 void GenerateData::generateNewDataset(const DatasetType &type, int N, double noise)
32 {
34  X.resizeNoCopy(N,3);
35  t.resizeNoCopy(N,2);
36 
37  auto g = std::mt19937();
38  if (0 == noise) {
39  g.seed(42);
40  } else {
41  std::random_device rd;
42  g.seed(rd());
43  }
44 
45  auto distUniform = std::uniform_real_distribution<>(0.0, 1.0);
46  auto distGauss = std::normal_distribution<>(0.0, 1.0);
47  switch (type) {
48  case DatasetType::SWISS:
49  {
50  for (int i=0; i<N; ++i)
51  {
52  // Generate t
53  MAT_ELEM(t,i,0)=(3 * PI / 2) * (1 + 2 * distUniform(g)); // t
54  MAT_ELEM(t,i,1) = 30 * distUniform(g); // height
55  double localT=MAT_ELEM(t,i,0);
56  double localHeight=MAT_ELEM(t,i,1);
57 
58  // Generate X
59  double s,c;
60  //sincos(localT,&s,&c);
61  s = sin(localT);
62  c = cos(localT);
63  MAT_ELEM(X,i,0)=localT * c + noise * distGauss(g);
64  MAT_ELEM(X,i,1)=localHeight + noise * distGauss(g);
65  MAT_ELEM(X,i,2)=localT * s + noise * distGauss(g);
66 
67  // Generate label
68  VEC_ELEM(label,i)=(unsigned char)(round(localT/2)+round(localHeight/12))%2;
69  }
70  break;
71  }
72  case DatasetType::HELIX:
73  {
74  double iN=1.0/N;
75  for (int i=0; i<N; ++i)
76  {
77  // Generate t
78  MAT_ELEM(t,i,0)=2 * PI * i*iN;
79  MAT_ELEM(t,i,1)=30 * distUniform(g); // height
80  double localT=MAT_ELEM(t,i,0);
81 
82  // Generate X
83  double s,c;
84  //sincos(localT,&s,&c);
85  s = sin(localT);
86  c = cos(localT);
87  double s8,c8;
88  //sincos(8*localT,&s8,&c8);
89  s8 = sin(8*localT);
90  c8 = cos(8*localT);
91  MAT_ELEM(X,i,0)=(2 + c8)*c+noise*distGauss(g);
92  MAT_ELEM(X,i,1)=(2 + c8)*s+noise*distGauss(g);
93  MAT_ELEM(X,i,2)=s8+noise*distGauss(g);
94 
95  // Generate label
96  VEC_ELEM(label,i)=(unsigned char)(round(localT * 1.5))%2;
97  }
98  break;
99  }
101  {
102  int actualN=round(sqrt(N));
103  X.resizeNoCopy(actualN*actualN,3);
104  t.resizeNoCopy(actualN*actualN,2);
105  label.resizeNoCopy(actualN*actualN);
106  for (int ii=0; ii<actualN; ii++)
107  {
108  for (int jj=0; jj<actualN; jj++)
109  {
110  int i=ii*actualN+jj;
111 
112  // Generate t
113  double x = 1 - 2 * distUniform(g);
114  double y = 1 - 2 * distUniform(g);
115  MAT_ELEM(t,i,0)=x;
116  MAT_ELEM(t,i,1)=y;
117 
118  // Generate X
119  MAT_ELEM(X,i,0)=x+noise*distGauss(g);
120  MAT_ELEM(X,i,1)=y+noise*distGauss(g);
121  double z=10*sin(PI * x) * tanh(3 * y);
122  MAT_ELEM(X,i,2)=z+noise*distGauss(g);
123 
124  // Generate label
125  VEC_ELEM(label,i)=(unsigned char)(round(0.1*(x+y+z-3)))%2;
126  }
127  }
128  break;
129  }
131  {
132  // Create centers
133  std::vector<Matrix1D<double> > centers;
134  Matrix1D<double> center(3);
135  const int Nclusters=5;
136  for (int i=0; i<Nclusters; i++)
137  {
139  VEC_ELEM(center,i)=10*distUniform(g);
140  centers.push_back(center);
141  }
142 
143  // Measure the minimum distance between centers
144  Matrix1D<double> diff;
145  double minDistance=1e38;
146  for (int i=0; i<Nclusters-1; ++i)
147  for (int j=i+1; j<Nclusters; ++j)
148  {
149  diff=centers[i]-centers[j];
150  double distance=diff.module();
151  minDistance=std::min(minDistance,distance);
152  }
153 
154  // Create clusters
155  t.initZeros();
156  double sigma=minDistance/sqrt(12);
157  for (int n=0; n<N; ++n)
158  {
159  int i=(Nclusters*n)/N;
160  const Matrix1D<double> &center=centers[i];
161  MAT_ELEM(X,n,0)=XX(center)+(distUniform(g)-0.5)*sigma+noise*distGauss(g);
162  MAT_ELEM(X,n,1)=YY(center)+(distUniform(g)-0.5)*sigma+noise*distGauss(g);
163  MAT_ELEM(X,n,2)=ZZ(center)+(distUniform(g)-0.5)*sigma+noise*distGauss(g);
164  }
165  break;
166  }
168  {
169  double iN=1.0/N;
170  for (int i=0; i<N; ++i)
171  {
172  // Generate t
173  MAT_ELEM(t,i,0)=2 * PI * i*iN;
174  MAT_ELEM(t,i,1)=5*distUniform(g);
175  double localT=MAT_ELEM(t,i,0);
176  double height=MAT_ELEM(t,i,1);
177 
178  // Generate X
179  double s,c;
180  //sincos(localT,&s,&c);
181  s = sin(localT);
182  c = cos(localT);
183  MAT_ELEM(X,i,0)=c+noise*distGauss(g);
184  MAT_ELEM(X,i,1)=c*s+noise*distGauss(g);
185  MAT_ELEM(X,i,2)=height+noise*distGauss(g);
186 
187  // Generate label
188  VEC_ELEM(label,i)=(unsigned char)(round(localT *0.5)+round(height*0.5))%2;
189  }
190  break;
191  }
192  default:
193  REPORT_ERROR(ERR_ARG_INCORRECT,"Incorrect method passed to generate data");
194  }
195 }
196 
197 void insertNeighbour(Matrix2D<int> &idx, Matrix2D<double> &distance, int i1, int i2, double d)
198 {
199  int K=MAT_XSIZE(idx);
200  int kInsert=K;
201  for (int k=K-1; k>=0; --k)
202  if (MAT_ELEM(distance,i1,k)>d)
203  kInsert=k;
204  else
205  break;
206  if (kInsert<K)
207  {
208  for (int kp=K-1; kp>kInsert; --kp)
209  {
210  int kp_1=kp-1;
211  MAT_ELEM(distance,i1,kp)=MAT_ELEM(distance,i1,kp_1);
212  MAT_ELEM(idx,i1,kp)=MAT_ELEM(idx,i1,kp_1);
213  }
214  MAT_ELEM(distance,i1,kInsert)=d;
215  MAT_ELEM(idx,i1,kInsert)=i2;
216  }
217 }
218 
220 {
221  K=std::min(K,(int)MAT_YSIZE(X)-1);
222  idx.initConstant(MAT_YSIZE(X),K,-1);
223  distance.initConstant(MAT_YSIZE(X),K,1e38);
224  for (size_t i1=0; i1<MAT_YSIZE(X)-1; ++i1)
225  for (size_t i2=i1+1; i2<MAT_YSIZE(X); ++i2)
226  {
227  // Compute the distance between i1 and i2
228  double d=0;
229  if (f==NULL)
230  for (int j=0; j<MAT_XSIZE(X); ++j)
231  {
232  double diff=MAT_ELEM(X,i1,j)-MAT_ELEM(X,i2,j);
233  d+=diff*diff;
234  }
235  else
236  d=(*f)(X,i1,i2);
237 
238  // Check if they are nearest neighbours
239  insertNeighbour(idx,distance,i1,i2,d);
240  insertNeighbour(idx,distance,i2,i1,d);
241  }
242  if (computeSqrt)
244  MAT_ELEM(distance,i,j)=sqrt(MAT_ELEM(distance,i,j));
245 }
246 
248 {
249  distance.initZeros();
250  int ii=0;
251  for (size_t i1=0; i1<VEC_XSIZE(ind1); ++i1)
252  {
253  // Compute the distance between ind1[i] and ind2[i]
254  double d=0;
255  if (f==NULL){
256  for (size_t j=0; j<MAT_XSIZE(X); ++j)
257  {
258  double diff=MAT_ELEM(X,ind1(i1),j)-MAT_ELEM(X,ind2(i1),j);
259  d+=diff*diff;
260  }
261  }
262  else
263  d=(*f)(X,ind1(i1),ind2(i1));
264 
265  if (computeSqrt){
266  d=sqrt(d);
267  }
268 
269  distance(ii) = d;
270  ii++;
271  }
272 }
273 
275 {
276  distance.initZeros(MAT_YSIZE(X),MAT_YSIZE(X));
277  for (size_t i1=0; i1<MAT_YSIZE(X)-1; ++i1)
278  for (size_t i2=i1+1; i2<MAT_YSIZE(X); ++i2)
279  {
280  // Compute the distance between i1 and i2
281  double d=0;
282  if (f==NULL)
283  for (int j=0; j<MAT_XSIZE(X); ++j)
284  {
285  double diff=MAT_ELEM(X,i1,j)-MAT_ELEM(X,i2,j);
286  d+=diff*diff;
287  }
288  else
289  d=(*f)(X,i1,i2);
290 
291  if (computeSqrt)
292  d=sqrt(d);
293  MAT_ELEM(distance,i2,i1)=MAT_ELEM(distance,i1,i2)=d;
294  }
295 }
296 
298 {
299  Matrix2D<int> idx;
300  Matrix2D<double> kDistance;
301  kNearestNeighbours(X, K, idx, kDistance, f, computeSqrt);
302  distance.initZeros(MAT_YSIZE(X),MAT_YSIZE(X));
304  {
305  int idx_ij=MAT_ELEM(idx,i,j);
306  MAT_ELEM(distance,idx_ij,i)=MAT_ELEM(distance,i,idx_ij)=MAT_ELEM(kDistance,i,j);
307  }
308 }
309 
310 void computeSimilarityMatrix(Matrix2D<double> &D2, double sigma, bool skipZeros, bool normalize)
311 {
312  double maxDistance=1.0;
313  if (normalize)
314  maxDistance=D2.computeMax();
315  double K=-0.5/(sigma*sigma*maxDistance);
316  if (skipZeros)
317  {
319  if (MAT_ELEM(D2,i,j)!=0)
320  MAT_ELEM(D2,i,j)=exp(MAT_ELEM(D2,i,j)*K);
321  }
322  else
323  {
325  MAT_ELEM(D2,i,j)=exp(MAT_ELEM(D2,i,j)*K);
326  }
327 }
328 
330 {
332  G.rowSum(d);
333  L.resizeNoCopy(G);
335  if (i==j)
336  MAT_ELEM(L,i,i)=VEC_ELEM(d,i)-MAT_ELEM(G,i,i);
337  else
338  MAT_ELEM(L,i,j)=-MAT_ELEM(G,i,j);
339 }
340 
342 {
343  int k1=5;
344  int k2=12;
345  if (k2>MAT_YSIZE(X))
346  {
347  k2=MAT_YSIZE(X)-1;
348  k1=k2/2;
349  }
350  Matrix2D<int> idx;
352  kNearestNeighbours(X,k2,idx,distance,f);
353 
354  // Estimate d
355  double dsum=0;
356  std::cerr << "Estimating dimensionality ... " << std::endl;
357  init_progress_bar(MAT_YSIZE(distance));
358  for (int i=0; i<MAT_YSIZE(distance); ++i)
359  {
360  double dist=log(MAT_ELEM(distance,i,0));
361  double S=dist;
362  for (int k=1; k<MAT_XSIZE(distance); ++k)
363  {
364  dist=log(MAT_ELEM(distance,i,k));
365  S+=dist;
366  if (k>=k1)
367  {
368  double d=(k-1)/(S-dist*(k+1));
369  dsum+=d;
370  }
371  }
372  progress_bar(i);
373  }
374  progress_bar(MAT_YSIZE(distance));
375  return -dsum/((k2-k1)*MAT_YSIZE(distance));
376 }
377 
378 // By correlation dimension
380 {
381  int K=3;
382  Matrix2D<int> idx;
384  kNearestNeighbours(X,K,idx,distance,f);
385 
386  // Compute median and maximum
387  size_t N=MAT_XSIZE(distance)*MAT_YSIZE(distance);
388  std::sort(MATRIX2D_ARRAY(distance),MATRIX2D_ARRAY(distance)+N);
389  double median=MATRIX2D_ARRAY(distance)[N/2];
390  double maxVal=MATRIX2D_ARRAY(distance)[N-1];
391  median*=median; // Because we compute dist^2 instead of dist
392  maxVal*=maxVal;
393  if (maxVal==0)
394  return 0;
395 
396  // Compute the distance of all versus all
397  double countLessMedianK=0, countLessMaxK=0;
398  std::cerr << "Estimating dimensionality ... " << std::endl;
400  for (size_t i1=0; i1<MAT_YSIZE(X)-1; ++i1)
401  {
402  for (size_t i2=i1+1; i2<MAT_YSIZE(X); ++i2)
403  {
404  // Compute the distance between i1 and i2
405  double d=0;
406  if (f==NULL)
407  for (int j=0; j<MAT_XSIZE(X); ++j)
408  {
409  double diff=MAT_ELEM(X,i1,j)-MAT_ELEM(X,i2,j);
410  d+=diff*diff;
411  }
412  else
413  d=(*f)(X,i1,i2);
414 
415  // Check d
416  if (d<=maxVal)
417  {
418  countLessMaxK+=1;
419  if (d<=median)
420  countLessMedianK+=1;
421  }
422  }
423  progress_bar(i1);
424  }
425  progress_bar(MAT_YSIZE(X)-1);
426  double iCombinations=2.0/(MAT_YSIZE(X)*(MAT_YSIZE(X)-1));
427  double probLessMedianK=countLessMedianK*iCombinations; // Probability of a distance being less than medianK
428  double probLessMaxK=countLessMaxK*iCombinations; // Probability of a distance being less than maxK
429  return 2*log(probLessMaxK/probLessMedianK)/log(maxVal/median);
430 }
431 
433 {
434  if (normalize)
435  normalizeColumns(X);
436 
437  if (method=="MLE")
438  return intrinsicDimensionalityMLE(X,f);
439  else if (method=="CorrDim")
440  return intrinsicDimensionalityCorrDim(X,f);
441  else
442  REPORT_ERROR(ERR_ARG_INCORRECT,"Unknown dimensionality estimate method");
443 
444  // I do not implement NearNbDim because it gives a wrong answer on the swiss roll
445  // I do not implement PackingNumbers because it is too slow on the swiss roll and gives a wrong answer
446  // I do not implement EigValue because it only provides integer dimensions
447  // I do not implement GMST because it is slower than MLE and CorrDim
448 }
449 
451  Matrix2D<double> &Xi)
452 {
453  Xi.resizeNoCopy(MAT_XSIZE(idx),MAT_XSIZE(X));
454  for (int j=0; j<MAT_XSIZE(idx); ++j)
455  {
456  int jNeighbour=MAT_ELEM(idx,i,j);
457  memcpy(&MAT_ELEM(Xi,j,0),&MAT_ELEM(X,jNeighbour,0),MAT_XSIZE(X)*sizeof(double));
458  }
459 }
460 
462 {
463  X=NULL;
464  distance=NULL;
465 }
466 
468 {
469  this->X=&X;
470 }
471 
473 {
474  this->outputDim=outputDim;
475 }
476 
478 {
479  return Y;
480 }
#define FOR_ALL_ELEMENTS_IN_MATRIX2D(m)
Definition: matrix2d.h:104
#define MAT_YSIZE(m)
Definition: matrix2d.h:124
void init_progress_bar(long total)
void min(Image< double > &op1, const Image< double > &op2)
#define VEC_ELEM(v, i)
Definition: matrix1d.h:245
void rowSum(Matrix1D< T > &sum) const
Definition: matrix2d.cpp:779
double module() const
Definition: matrix1d.h:983
void insertNeighbour(Matrix2D< int > &idx, Matrix2D< double > &distance, int i1, int i2, double d)
void computeSimilarityMatrix(Matrix2D< double > &D2, double sigma, bool skipZeros, bool normalize)
void normalizeColumns(Matrix2D< double > &A)
Definition: matrix2d.cpp:188
#define REPORT_ERROR(nerr, ErrormMsg)
Definition: xmipp_error.h:211
void kNearestNeighbours(const Matrix2D< double > &X, int K, Matrix2D< int > &idx, Matrix2D< double > &distance, DimRedDistance2 f, bool computeSqrt)
#define VEC_XSIZE(m)
Definition: matrix1d.h:77
doublereal * c
doublereal * g
void computeDistance(const Matrix2D< double > &X, Matrix2D< double > &distance, DimRedDistance2 f, bool computeSqrt)
void setInputData(Matrix2D< double > &X)
Set input data.
void initConstant(T val)
Definition: matrix2d.h:602
DatasetType
Definition: dimred_tools.h:42
void sqrt(Image< double > &op)
static double * y
Matrix1D< unsigned char > label
Definition: dimred_tools.h:56
void extractNearestNeighbours(const Matrix2D< double > &X, Matrix2D< int > &idx, int i, Matrix2D< double > &Xi)
double intrinsicDimensionalityCorrDim(const Matrix2D< double > &X, DimRedDistance2 f)
const Matrix2D< double > & getReducedData()
Get reduced data.
void computeGraphLaplacian(const Matrix2D< double > &G, Matrix2D< double > &L)
void generateNewDataset(const DatasetType &type, int N=1000, double noise=0.05)
void computeRandomPointsDistance(const Matrix2D< double > &X, Matrix1D< double > &distance, Matrix1D< int > ind1, Matrix1D< int > ind2, DimRedDistance2 f, bool computeSqrt)
doublereal * x
#define i
ql0001_ & k(htemp+1),(cvec+1),(atemp+1),(bj+1),(bl+1),(bu+1),(x+1),(clamda+1), &iout, infoqp, &zero,(w+1), &lenw,(iw+1), &leniw, &glob_grd.epsmac
doublereal * d
void median(MultidimArray< T > &x, MultidimArray< T > &y, T &m)
Definition: filters.h:1055
void resizeNoCopy(int Ydim, int Xdim)
Definition: matrix2d.h:534
#define MAT_ELEM(m, i, j)
Definition: matrix2d.h:116
#define FOR_ALL_ELEMENTS_IN_MATRIX1D(v)
Definition: matrix1d.h:72
Matrix2D< double > t
Definition: dimred_tools.h:53
void log(Image< double > &op)
#define XX(v)
Definition: matrix1d.h:85
viol type
double * f
Incorrect argument received.
Definition: xmipp_error.h:113
double intrinsicDimensionalityMLE(const Matrix2D< double > &X, DimRedDistance2 f)
void computeDistanceToNeighbours(const Matrix2D< double > &X, int K, Matrix2D< double > &distance, DimRedDistance2 f, bool computeSqrt)
void progress_bar(long rlen)
Matrix2D< double > X
Definition: dimred_tools.h:48
quaternion_type< T > normalize(quaternion_type< T > q)
Definition: point.cpp:278
double z
double(* DimRedDistance2)(const Matrix2D< double > &X, size_t i1, size_t i2)
Definition: dimred_tools.h:75
void initZeros()
Definition: matrix1d.h:592
void sort(struct DCEL_T *dcel)
Definition: sorting.cpp:18
#define j
#define YY(v)
Definition: matrix1d.h:93
TYPE distance(struct Point_T *p, struct Point_T *q)
Definition: point.cpp:28
int round(double x)
Definition: ap.cpp:7245
#define MAT_XSIZE(m)
Definition: matrix2d.h:120
DimRedAlgorithm()
Empty constructor.
std::string String
Definition: xmipp_strings.h:34
void initZeros()
Definition: matrix2d.h:626
void resizeNoCopy(int Xdim)
Definition: matrix1d.h:458
void setOutputDimensionality(size_t outputDim)
Set output dimensionality.
constexpr int K
T computeMax() const
Definition: matrix2d.cpp:1236
#define PI
Definition: tools.h:43
#define MATRIX2D_ARRAY(m)
Definition: matrix2d.h:89
double intrinsicDimensionality(Matrix2D< double > &X, const String &method, bool normalize, DimRedDistance2 f)
int * n
#define ZZ(v)
Definition: matrix1d.h:101