Xmipp  v3.23.11-Nereus
metadata_split.cpp
Go to the documentation of this file.
1 /***************************************************************************
2  *
3  * Authors: Sjors Scheres (scheres@cnb.csic.es)
4  *
5  * Unidad de Bioinformatica of Centro Nacional de Biotecnologia , CSIC
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 2 of the License, or
10  * (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
20  * 02111-1307 USA
21  *
22  * All comments concerning this program package may be sent to the
23  * e-mail address 'xmipp@cnb.csic.es'
24  ***************************************************************************/
25 
26 #include "core/matrix2d.h"
27 #include "core/metadata_vec.h"
28 #include "core/xmipp_image.h"
29 #include "core/xmipp_program.h"
30 #include "data/numerical_tools.h"
32 
34 {
35  FileName fn_in, fn_out, fn_root, fn_cc;
36  String sortLabelStr, extension;
37  MDLabel sortLabel;
38  MetaDataVec mdIn;
39  MetaDataVec *mdPtr, *mdPtr2;
40  std::vector<MetaDataVec> mdVector;
41  bool dont_randomize, dont_remove_disabled, dont_sort, use_cc;
42  size_t N;
43  int AHCiter, AHCsubset;
44 
45 protected:
46 
47  void defineParams()
48  {
49  addUsageLine("Split a metadata (randomly by default) in any number of equally sized output metadata.");
50  addUsageLine("By default, only enabled entries in the input file will be written to the output files.");
51 
52  addParamsLine(" -i <inputSelfile> : Input MetaData File");
53  addParamsLine(" [ -n <parts=2> ] : Number of output MetaDatas");
54  addParamsLine(" [ --oroot <rootname=\"\"> ] : Rootname for output MetaDatas");
55  addParamsLine(" : output will be rootname_<n>.xmd");
56  addParamsLine(" [--dont_randomize ] : Do not generate random groups");
57  addParamsLine(" [--dont_sort ] : Do not sort the outputs MetaData");
58  addParamsLine(" [--dont_remove_disabled]: Do not remove disabled images from MetaData");
59  addParamsLine(" [--use_correlation <fnCC=\"\"> <iter=100> <subset=16>] : Use correlation to a single volume.");
60  addParamsLine(" : This is the output of xmipp_reconstruct_significant");
61  addParamsLine(" : with a single reference volume.");
62  addParamsLine(" :+ The metadata is splitted by taking random subsets of the projection");
63  addParamsLine(" :+ directions and applying hierarchical clustering a number of");
64  addParamsLine(" :+ iterations on these random subsets");
65  addParamsLine(" [-l <label=\"image\">] : sort using a label, default image");
66 
67  addSeeAlsoLine("reconstruct_significant");
68 
69  addExampleLine("Splits input.sel in two parts:", false);
70  addExampleLine(" xmipp_metadata_split -i input.sel --oroot output_part");
71  addExampleLine("Splits input.sel in 4 output files without randomizing input metdata:", false);
72  addExampleLine(" xmipp_metadata_split -i input.sel -n 4 --dont_randomize");
73  addExampleLine("Splits input.sel in two parts with sel extension:", false);
74  addExampleLine(" xmipp_metadata_split -i input.sel --oroot output_part:sel");
75  }
76 
77  void readParams()
78  {
79  fn_in = getParam("-i");
80  N = getIntParam("-n");
81  fn_root = getParam("--oroot");
82  extension = fn_root.getFileFormat();
83  fn_root = fn_root.removeFileFormat();
84  if (fn_root.empty())
85  fn_root = fn_in.withoutExtension();
86  if (extension.empty())
87  extension = "xmd";
88  dont_randomize = checkParam("--dont_randomize");
89  dont_sort = checkParam("--dont_sort");
90  dont_remove_disabled = checkParam("--dont_remove_disabled");
91  use_cc = checkParam("--use_correlation");
92  if (use_cc)
93  {
94  fn_cc = getParam("--use_correlation",0);
95  AHCiter = getIntParam("--use_correlation",1);
96  AHCsubset = getIntParam("--use_correlation",2);
97  dont_randomize=true;
98  }
99 
100  sortLabelStr = "objId"; //if not sort, by default is objId
101  if(!dont_sort)
102  sortLabelStr = getParam("-l");
103 
104  sortLabel = MDL::str2Label(sortLabelStr);
105  if (sortLabel == MDL_UNDEFINED)
106  REPORT_ERROR(ERR_MD_UNDEFINED, (String)"Unrecognized label '" + sortLabelStr + "'");
107  }
108 
109 public:
110  void run()
111  {
112  mdIn.read(fn_in);
113 
114  if (dont_randomize)
115  mdPtr = &mdIn;
116  else
117  {
118  mdPtr = new MetaDataVec();
119  mdPtr->randomize(mdIn);
120  }
121 
122  if (!dont_remove_disabled && mdPtr->containsLabel(MDL_ENABLED)) //remove disabled entries by default
123  mdPtr->removeObjects(MDValueEQ(MDL_ENABLED, -1));
124 
125  size_t Num_images = mdPtr->size();
126  size_t Num_groups = XMIPP_MIN(N, Num_images);
127  if (!use_cc)
128  mdPtr->split(Num_groups, mdVector); //Split MD in Num_groups
129  else
130  {
131  Image<double> cc;
132  cc.read(fn_cc);
133  MultidimArray<double> &mcc=cc();
134  auto Ndirs=(int)XSIZE(mcc);
135  auto Nvols=(int)YSIZE(mcc);
136  auto Nimgs=(int)ZSIZE(mcc);
137  if (Nvols!=1)
138  REPORT_ERROR(ERR_ARG_INCORRECT,"This program is meant only for cross-correlation volumes of a single reference model");
139  Matrix2D<int> coocurrence(Nimgs,Nimgs);
140  Matrix2D<double> X(Nimgs,AHCsubset);
141  MultidimArray<int> permutation;
142  AHCClassifier ahc;
143 
144  for (int iter=0; iter<AHCiter; ++iter)
145  {
146  // Take the correlation with random directions that will serve as probe functions
147  randomPermutation(Ndirs,permutation);
149  MAT_ELEM(X,i,j)=A3D_ELEM(mcc,i,0,A1D_ELEM(permutation,j));
150 
151  // Classify
152  ahc.clusterData(X,(int)Num_groups);
153 
154  // Update coocurrence
155  for (int i=0; i<Nimgs-1; ++i)
156  for (int j=i+1; j<Nimgs; ++j)
158  {
159  MAT_ELEM(coocurrence,i,j)+=1;
160  MAT_ELEM(coocurrence,j,i)=MAT_ELEM(coocurrence,i,j);
161  }
162  }
163 
164  // Convert the coocurrence matrix into a distance matrix
165  int maxCoocurrence=coocurrence.computeMax();
167  D.resizeNoCopy(coocurrence);
168  FOR_ALL_ELEMENTS_IN_MATRIX2D(coocurrence)
169  MAT_ELEM(D,i,j)=maxCoocurrence-MAT_ELEM(coocurrence,i,j);
170 
171  // Now perform the clustering with this distance
172  ahc.clusterWithDistance(D,(int)Num_groups);
173 
174  // Construct output metadata
176  for (size_t k=0; k<Num_groups; ++k)
177  mdVector.push_back(dummy);
178 
179  size_t i=0;
180  for (auto& row : *mdPtr)
181  {
182  mdVector[VEC_ELEM(ahc.clusterAssigned,i)].addRow(dynamic_cast<MDRowVec&>(row));
183  i++;
184  }
185  }
186 
187  //Write splitted groups to disk
188  for (size_t i = 0; i < Num_groups; ++i)
189  {
190  fn_out.compose(fn_root, i + 1, extension);
191 
192  if(!dont_sort)
193  mdPtr->sort(mdVector[i], sortLabel);
194  else
195  mdPtr = &(mdVector[i]);
196 
197  mdPtr->write(fn_out);
198  }
199  }
200 
201 }
202 ;//end of class ProgMetadataSplit
#define FOR_ALL_ELEMENTS_IN_MATRIX2D(m)
Definition: matrix2d.h:104
void removeObjects(const std::vector< size_t > &toRemove) override
#define YSIZE(v)
#define VEC_ELEM(v, i)
Definition: matrix1d.h:245
void sort(const MetaDataVec &MDin, const MDLabel sortLabel, bool asc=true, int limit=-1, int offset=0)
static MDLabel str2Label(const String &labelName)
void read(const FileName &inFile, const std::vector< MDLabel > *desiredLabels=nullptr, bool decomposeStack=true) override
#define REPORT_ERROR(nerr, ErrormMsg)
Definition: xmipp_error.h:211
FileName removeFileFormat() const
#define A1D_ELEM(v, i)
void compose(const String &str, const size_t no, const String &ext="")
void split(size_t n, std::vector< MetaDataVec > &results, const MDLabel sortLabel=MDL_OBJID) const
void write(const FileName &outFile, WriteModeMetaData mode=MD_OVERWRITE) const
glob_prnt iter
void randomize(const MetaData &MDin)
size_t size() const override
#define i
Is this image enabled? (int [-1 or 1])
ql0001_ & k(htemp+1),(cvec+1),(atemp+1),(bj+1),(bl+1),(bu+1),(x+1),(clamda+1), &iout, infoqp, &zero,(w+1), &lenw,(iw+1), &leniw, &glob_grd.epsmac
void randomPermutation(int N, MultidimArray< int > &result)
void addSeeAlsoLine(const char *seeAlso)
void resizeNoCopy(int Ydim, int Xdim)
Definition: matrix2d.h:534
Undefined label.
Definition: xmipp_error.h:162
#define MAT_ELEM(m, i, j)
Definition: matrix2d.h:116
void clusterData(const Matrix2D< double > &X, int numberOfClusters=2, int distance=2, int linkageType=1)
#define A3D_ELEM(V, k, i, j)
Matrix1D< int > clusterAssigned
const char * getParam(const char *param, int arg=0)
Incorrect argument received.
Definition: xmipp_error.h:113
#define XSIZE(v)
#define ZSIZE(v)
void addExampleLine(const char *example, bool verbatim=true)
double dummy
#define XMIPP_MIN(x, y)
Definition: xmipp_macros.h:181
#define j
void clusterWithDistance(const Matrix2D< double > &D, int numberOfClusters=2, int linkageType=1)
FileName withoutExtension() const
std::string String
Definition: xmipp_strings.h:34
String getFileFormat() const
bool checkParam(const char *param)
int read(const FileName &name, DataMode datamode=DATA, size_t select_img=ALL_IMAGES, bool mapData=false, int mode=WRITE_READONLY)
T computeMax() const
Definition: matrix2d.cpp:1236
void addUsageLine(const char *line, bool verbatim=false)
bool containsLabel(const MDLabel label) const override
int getIntParam(const char *param, int arg=0)
MDLabel
void addParamsLine(const String &line)