Xmipp  v3.23.11-Nereus
pdb2cif.cpp
Go to the documentation of this file.
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2020 NKI/AVL, Netherlands Cancer Institute
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  *
9  * 1. Redistributions of source code must retain the above copyright notice, this
10  * list of conditions and the following disclaimer
11  * 2. Redistributions in binary form must reproduce the above copyright notice,
12  * this list of conditions and the following disclaimer in the documentation
13  * and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  */
26 
27 #include <cif++.hpp>
28 #include <cif++/pdb/pdb2cif.hpp>
29 #include <cif++/pdb/pdb2cif_remark_3.hpp>
30 #include <cif++/gzio.hpp>
31 
32 #include <iomanip>
33 #include <map>
34 #include <set>
35 #include <stack>
36 
37 using cif::category;
38 using cif::datablock;
39 using cif::iequals;
40 using cif::key;
41 // using cif::row;
42 using cif::to_lower;
43 using cif::to_lower_copy;
44 // using cif::compound_factory;
45 
46 // --------------------------------------------------------------------
47 // attempt to come up with better error handling
48 
49 namespace error
50 {
51  enum pdbErrors
52  {
55  };
56 
57  namespace detail
58  {
59  class pdbCategory : public std::error_category
60  {
61  public:
62  const char *name() const noexcept
63  {
64  return "pdb";
65  }
66 
67  std::string message(int value) const
68  {
69  switch (value)
70  {
71  case residueNotFound:
72  return "Residue not found";
73 
74  case invalidDate:
75  return "Invalid date";
76 
77  default:
78  return "Error in PDB format";
79  }
80  }
81  };
82  } // namespace detail
83 
84  std::error_category &pdbCategory()
85  {
86  static detail::pdbCategory impl;
87  return impl;
88  }
89 
90  inline std::error_code make_error_code(pdbErrors e)
91  {
92  return std::error_code(static_cast<int>(e), pdbCategory());
93  }
94 } // namespace error
95 
96 namespace std
97 {
98 
99  template <>
100  struct is_error_code_enum<error::pdbErrors>
101  {
102  static const bool value = true;
103  };
104 
105 } // namespace std
106 
107 namespace cif::pdb
108 {
109 
110 // --------------------------------------------------------------------
111 
112 const std::map<std::string, int> kMonths{
113  { "JAN", 1 },
114  { "FEB", 2 },
115  { "MAR", 3 },
116  { "APR", 4 },
117  { "MAY", 5 },
118  { "JUN", 6 },
119  { "JUL", 7 },
120  { "AUG", 8 },
121  { "SEP", 9 },
122  { "OCT", 10 },
123  { "NOV", 11 },
124  { "DEC", 12 },
125 };
126 
127 const std::set<std::string> kSupportedRecords{
128  "HEADER", "OBSLTE", "TITLE ", "SPLIT ", "CAVEAT", "COMPND", "SOURCE",
129  "KEYWDS", "EXPDTA", "NUMMDL", "MDLTYP", "AUTHOR", "REVDAT", "SPRSDE",
130  "JRNL ", "REMARK", "DBREF ", "DBREF1", "DBREF2", "SEQADV", "SEQRES",
131  "MODRES", "HET ", "HETNAM", "HETSYN", "FORMUL", "HELIX ", "SHEET ",
132  "SSBOND", "LINK ", "CISPEP", "SITE ", "CRYST1", "ORIGX1", "SCALE1",
133  "MTRIX1", "ORIGX2", "SCALE2", "MTRIX2", "ORIGX3", "SCALE3", "MTRIX3",
134  "MODEL ", "ATOM ", "ANISOU", "TER ", "HETATM", "ENDMDL", "CONECT",
135  "MASTER", "END ",
136 
137  // bah...
138  "LINKR "
139 };
140 
141 bool isWater(const std::string &resname)
142 {
143  return resname == "HOH" or resname == "H2O" or resname == "OH2" or resname == "WAT" or resname == "DOD" or resname == "WAT";
144 }
145 
146 // --------------------------------------------------------------------
147 // Unfortunately, parsing a PDB file requires several passes over the
148 // data. Therefore we first obtain all records where a record has the
149 // value flattened out for continuation.
150 
151 PDBRecord::PDBRecord(uint32_t lineNr, const std::string &name, const std::string &value)
152  : mNext(nullptr)
153  , mLineNr(lineNr)
154  , mVlen(value.length())
155 {
156  assert(name.length() <= 10);
157 
158  strcpy(mName, name.c_str());
159  strcpy(mValue, value.c_str());
160 }
161 
162 PDBRecord::~PDBRecord()
163 {
164 }
165 
166 void *PDBRecord::operator new(size_t size, size_t vLen)
167 {
168  return malloc(size + vLen + 1);
169 }
170 
171 void PDBRecord::operator delete(void *p)
172 {
173  free(p);
174 }
175 
176 void PDBRecord::operator delete(void *p, size_t vLen)
177 {
178  free(p);
179 }
180 
181 bool PDBRecord::is(const char *name) const
182 {
183  return iequals(mName, name);
184 }
185 
186 char PDBRecord::vC(size_t column)
187 {
188  char result = ' ';
189  if (column - 7 < mVlen)
190  result = mValue[column - 7];
191  return result;
192 }
193 
194 std::string PDBRecord::vS(size_t columnFirst, size_t columnLast)
195 {
196  std::string result;
197 
198  if (columnLast > mVlen + 6)
199  columnLast = mVlen + 6;
200 
201  if (columnFirst < mVlen + 7)
202  {
203  result = std::string{ mValue + columnFirst - 7, mValue + columnLast - 7 + 1 };
204  cif::trim(result);
205  }
206 
207  return result;
208 }
209 
210 int PDBRecord::vI(int columnFirst, int columnLast)
211 {
212  int result = 0;
213 
214  const char *e = mValue + mVlen;
215  if (e > mValue + columnLast - 7 + 1)
216  e = mValue + columnLast - 7 + 1;
217 
218  enum
219  {
220  start,
221  digit,
222  tail
223  } state = start;
224  bool negate = false;
225 
226  try
227  {
228  for (const char *p = mValue + columnFirst - 7; p < e; ++p)
229  {
230  switch (state)
231  {
232  case start:
233  if (*p == '+')
234  state = digit;
235  else if (*p == '-')
236  {
237  negate = true;
238  state = digit;
239  }
240  else if (isdigit(*p))
241  {
242  result = *p - '0';
243  state = digit;
244  }
245  else if (not isspace(*p))
246  throw std::runtime_error("Not a valid integer in PDB record");
247  break;
248 
249  case digit:
250  if (isspace(*p))
251  state = tail;
252  else if (not isdigit(*p))
253  throw std::runtime_error("Not a valid integer in PDB record");
254  else
255  result = result * 10 + *p - '0';
256  break;
257 
258  case tail:
259  if (not isspace(*p))
260  throw std::runtime_error("Not a valid integer in PDB record");
261  break;
262  }
263  }
264  }
265  catch (const std::exception &ex)
266  {
267  if (cif::VERBOSE >= 0)
268  std::cerr << "Trying to parse '" << std::string(mValue + columnFirst - 7, mValue + columnLast - 7) << '\'' << std::endl;
269  throw;
270  }
271 
272  if (negate)
273  result = -result;
274 
275  return result;
276 }
277 
278 std::string PDBRecord::vF(size_t columnFirst, size_t columnLast)
279 {
280  // for now... TODO: check format?
281  return vS(columnFirst, columnLast);
282 }
283 
284 // --------------------------------------------------------------------
285 
287 {
288  public:
289  SpecificationListParser(const std::string &text)
290  : mText(text)
291  , mP(mText.begin())
292  {
293  }
294 
295  std::tuple<std::string, std::string> GetNextSpecification();
296 
297  private:
298  std::string mText;
299  std::string::iterator mP;
300 };
301 
302 std::tuple<std::string, std::string> SpecificationListParser::GetNextSpecification()
303 {
304  std::string id, value;
305 
306  std::string::iterator start = mP, backup;
307 
308  enum
309  {
310  eStart,
311  eID,
312  eColon,
313  eValue,
314  eNL,
315  eNL_ID,
316  eSemiColon,
317  eError,
318  eDone
319  } state = eStart;
320 
321  while (mP != mText.end() and state != eDone)
322  {
323  char ch = *mP++;
324 
325  switch (state)
326  {
327  case eStart:
328  if (isalnum(ch) or ch == '_')
329  {
330  id = { ch };
331  value.clear();
332  state = eID;
333  start = mP;
334  }
335  else if (not isspace(ch))
336  {
337  if (cif::VERBOSE > 0)
338  std::cerr << "skipping invalid character in SOURCE ID: " << ch << std::endl;
339  }
340  break;
341 
342  case eID:
343  if (isalnum(ch) or ch == '_')
344  id += ch;
345  else if (ch == ':')
346  state = eColon;
347  else
348  state = eError;
349  break;
350 
351  case eColon:
352  if (ch == ';')
353  {
354  if (cif::VERBOSE > 0)
355  std::cerr << "Empty value for SOURCE: " << id << std::endl;
356  state = eStart;
357  }
358  else if (not isspace(ch))
359  {
360  value = { ch };
361  state = eValue;
362  }
363  break;
364 
365  case eValue:
366  if (ch == '\n')
367  {
368  backup = mP;
369  state = eNL;
370  }
371  else if (ch == ';')
372  {
373  backup = mP;
374  state = eSemiColon;
375  }
376  else
377  value += ch;
378  break;
379 
380  case eSemiColon:
381  if (ch == '\n')
382  state = eDone;
383  else if (ch != ' ')
384  {
385  value.insert(value.end(), backup, mP);
386  state = eValue;
387  }
388  break;
389 
390  case eNL:
391  if (isalnum(ch))
392  {
393  value += ' ';
394  state = eNL_ID;
395  }
396  else if (isspace(ch))
397  state = eValue;
398  break;
399 
400  case eNL_ID:
401  if (ch == ':')
402  {
403  mP = backup;
404  state = eDone;
405  }
406  else if (ch == ';')
407  state = eSemiColon;
408  else if (not(isalnum(ch) or ch == '_'))
409  {
410  value.insert(value.end(), backup, mP);
411  state = eValue;
412  }
413  break;
414 
415  case eError:
416  if (ch == ';')
417  {
418  if (cif::VERBOSE > 0)
419  std::cerr << "Skipping invalid header line: '" << std::string(start, mP) << std::endl;
420  state = eStart;
421  }
422  break;
423 
424  case eDone: break; // keep compiler happy
425  }
426  }
427 
428  cif::trim(value);
429 
430  return std::make_tuple(id, value);
431 }
432 
433 // --------------------------------------------------------------------
434 
436 {
437  public:
439  : mData(nullptr)
440  , mRec(nullptr)
441  {
442  }
443 
445  {
446  PDBRecord *r = mData;
447  while (r != nullptr)
448  {
449  PDBRecord *d = r;
450  r = d->mNext;
451  delete d;
452  }
453  }
454 
455  void Parse(std::istream &is, cif::file &result);
456 
457  private:
458  // ----------------------------------------------------------------
459 
460  struct DBREF
461  {
462  std::string PDBIDCode;
463  char chainID;
464  int seqBegin;
465  char insertBegin = ' ';
466  int seqEnd;
467  char insertEnd = ' ';
468  std::string database;
469  std::string dbAccession;
470  std::string dbIdCode;
471  int dbSeqBegin;
472  char dbinsBeg;
473  int dbSeqEnd;
474  char dbinsEnd;
475  };
476 
477  struct HET
478  {
479  std::string hetID;
480  char chainID;
481  int seqNum;
482  char iCode;
483  int numHetAtoms = 0;
484  std::string text;
485  std::string asymID;
486  std::vector<PDBRecord *> atoms;
487  bool processed = false;
488  bool branch = false;
489  PDBRecord *asn = nullptr;
490 
491  HET(const std::string &hetID, char chainID, int seqNum, char iCode, int numHetAtoms = 0, const std::string &text = {})
492  : hetID(hetID)
493  , chainID(chainID)
494  , seqNum(seqNum)
495  , iCode(iCode)
496  , numHetAtoms(numHetAtoms)
497  , text(text)
498  {
499  }
500  };
501 
502  struct UNOBS
503  {
504  int modelNr;
505  std::string res;
506  char chain;
507  int seq;
508  char iCode;
509  std::vector<std::string> atoms;
510  };
511 
512  struct ATOM_REF
513  {
514  std::string name;
515  std::string resName;
516  int resSeq;
517  char chainID;
518  char iCode;
519  char altLoc;
520 
521  bool operator==(const ATOM_REF &rhs) const
522  {
523  return name == rhs.name and
524  resName == rhs.resName and
525  resSeq == rhs.resSeq and
526  (altLoc == rhs.altLoc or altLoc == ' ' or rhs.altLoc == ' ') and
527  chainID == rhs.chainID and
528  iCode == rhs.iCode;
529  }
530 
531  bool operator!=(const ATOM_REF &rhs) const
532  {
533  return not operator==(rhs);
534  }
535 
536  bool operator<(const ATOM_REF &rhs) const
537  {
538  int d = chainID - rhs.chainID;
539  if (d == 0)
540  d = resSeq - rhs.resSeq;
541  if (d == 0)
542  d = iCode - rhs.iCode;
543  // if (d == 0) d = resName.compare(rhs.resName);
544  if (d == 0)
545  d = name.compare(rhs.name);
546  if (d == 0 and altLoc != ' ' and rhs.altLoc != ' ')
547  d = altLoc - rhs.altLoc;
548  return d < 0;
549  }
550 
551  friend std::ostream &operator<<(std::ostream &os, const ATOM_REF &a)
552  {
553  os << a.name << ' ' << a.resName << ' ' << a.chainID << ' ' << a.resSeq << (a.iCode == ' ' ? "" : std::string{ a.iCode }) << (a.altLoc != ' ' ? std::string{ ' ', a.altLoc } : "");
554  return os;
555  }
556  };
557 
558  struct LINK
559  {
560  ATOM_REF a, b;
561  std::string symOpA, symOpB;
562  float distance;
563  };
564 
565  struct SUGAR
566  {
567  ATOM_REF c1;
568  int leaving_o;
569  ATOM_REF next;
570  };
571 
572  class SUGAR_TREE : public std::vector<SUGAR>
573  {
574  public:
575  std::string entityName() const
576  {
577  return empty() ? "" : entityName(begin());
578  }
579 
580  private:
581  std::string entityName(const_iterator sugar) const
582  {
583  std::string result;
584 
585  for (auto i = begin(); i != end(); ++i)
586  {
587  if (i->next != sugar->c1)
588  continue;
589 
590  auto n = entityName(i) + "-(1-" + std::to_string(i->leaving_o) + ")";
591 
592  if (result.empty())
593  result = n;
594  else
595  result += "-[" + n + ']';
596  }
597 
598  if (not result.empty() and result.back() != ']')
599  result += '-';
600 
601  auto compound = cif::compound_factory::instance().create(sugar->c1.resName);
602  if (compound)
603  result += compound->name();
604  else if (sugar->c1.resName == "MAN")
605  result += "alpha-D-mannopyranose";
606  else if (sugar->c1.resName == "BMA")
607  result += "beta-D-mannopyranose";
608  else if (sugar->c1.resName == "NAG")
609  result += "2-acetamido-2-deoxy-beta-D-glucopyranose";
610  else if (sugar->c1.resName == "NDG")
611  result += "2-acetamido-2-deoxy-alpha-D-glucopyranose";
612  else if (sugar->c1.resName == "FUC")
613  result += "alpha-L-fucopyranose";
614  else if (sugar->c1.resName == "FUL")
615  result += "beta-L-fucopyranose";
616  else
617  result += sugar->c1.resName;
618 
619  return result;
620  }
621  };
622 
623  // ----------------------------------------------------------------
624 
625  /*
626  To get from PDB chains to CIF entity and poly records we take the following steps:
627 
628  First check if there is a Primary Structure Section. If there is, it should contain
629  a valid DBREF/SEQRES pair that allows the reconstruction of numbering of residues.
630 
631  If that fails, we fall back to:
632 
633  1. Collect the chains from the PDB file.
634  2. For each chain, split out the residues and waters, assign those to new entities
635  3. If there are multiple chains containing residues, align those to find unique polymers
636  4. Annotate the entity records with available information in the PDB file (COMPND e.g.)
637  5. Create the mapping structures from PDB numbering to CIF numbering.
638  */
639 
640  struct PDBCompound
641  {
642  int mMolID;
643  std::string mTitle;
644  std::set<char> mChains;
645  std::map<std::string, std::string> mInfo;
646  std::map<std::string, std::string> mSource;
647  int mCount = 0;
648  };
649 
650  struct PDBSeqRes
651  {
652  std::string mMonID;
653  int mSeqNum;
654  char mIcode;
655 
656  int mDbSeqNum = 0;
657  bool mSeen = false;
658  std::set<std::string> mAlts;
659 
660  bool operator==(const PDBSeqRes &rhs) const
661  {
662  return mSeqNum == rhs.mSeqNum and mMonID == rhs.mMonID and mIcode == rhs.mIcode;
663  }
664  };
665 
666  struct PDBChain
667  {
668  PDBChain(const std::string &structureID, char chainID, int molID)
669  : mDbref{ structureID, chainID }
670  , mWaters(0)
671  , mTerIndex(0)
672  , mMolID(molID)
673  , mNextSeqNum(1)
674  , mNextDbSeqNum(1)
675  {
676  }
677 
678  DBREF mDbref;
679  std::vector<PDBSeqRes> mSeqres, mHet;
680  int mWaters;
681  int mTerIndex;
682 
683  int mMolID;
684 
685  // scratch values for reading SEQRES records
686  int mNextSeqNum;
687  int mNextDbSeqNum;
688 
689  // scratch value for aligning
690  struct AtomRes
691  {
692  std::string mMonID;
693  int mSeqNum;
694  char mIcode;
695 
696  bool operator==(const AtomRes &rhs) const { return mSeqNum == rhs.mSeqNum and mIcode == rhs.mIcode; }
697  bool operator!=(const AtomRes &rhs) const { return mSeqNum != rhs.mSeqNum or mIcode != rhs.mIcode; }
698  };
699  std::vector<AtomRes> mResiduesSeen;
700 
701  int AlignResToSeqRes();
702  bool SameSequence(const PDBChain &rhs) const;
703  };
704 
705  // ----------------------------------------------------------------
706 
707  PDBCompound &GetOrCreateCompound(int molID)
708  {
709  auto i = std::find_if(mCompounds.begin(), mCompounds.end(), [molID](PDBCompound &comp) -> bool
710  { return comp.mMolID == molID; });
711  if (i == mCompounds.end())
712  {
713  mCompounds.push_back(PDBCompound{ molID });
714 
715  mMolID2EntityID[molID] = std::to_string(mNextEntityNr++);
716 
717  i = prev(mCompounds.end());
718  }
719 
720  return *i;
721  }
722 
723  // locate the PDBChain record for a chain ID, or create it with dummy data if missing
724  PDBChain &GetChainForID(char chainID, int numRes = 0)
725  {
726  auto i = std::find_if(mChains.begin(), mChains.end(), [chainID](PDBChain &ch) -> bool
727  { return ch.mDbref.chainID == chainID; });
728 
729  if (i == mChains.end())
730  {
731  // locate the compound for this chain, if any (does that happen?)
732  int molID = 0;
733  for (auto &cmp : mCompounds)
734  {
735  if (cmp.mChains.count(chainID) > 0)
736  {
737  molID = cmp.mMolID;
738  break;
739  }
740  }
741 
742  mChains.emplace_back(mStructureID, chainID, molID);
743 
744  i = prev(mChains.end());
745  }
746 
747  return *i;
748  };
749 
750  void InsertChemComp(const std::string &chemComp)
751  {
752  if (find(mChemComp.begin(), mChemComp.end(), chemComp) == mChemComp.end())
753  mChemComp.push_back(chemComp);
754  }
755 
756  void InsertAtomType(const std::string &atomType)
757  {
758  if (find(mAtomTypes.begin(), mAtomTypes.end(), atomType) == mAtomTypes.end())
759  mAtomTypes.push_back(atomType);
760  }
761 
762  // ----------------------------------------------------------------
763 
764  template <typename Predicate>
765  PDBRecord *FindRecord(Predicate &&pred)
766  {
767  PDBRecord *result;
768 
769  for (result = mData; result != nullptr; result = result->mNext)
770  {
771  if (pred(*result))
772  break;
773  }
774 
775  return result;
776  }
777 
778  PDBRecord *FindRecord(const char *name)
779  {
780  return FindRecord([name](PDBRecord &rec) -> bool
781  { return rec.is(name); });
782  }
783 
784  // ----------------------------------------------------------------
785 
786  char vC(size_t column) const
787  {
788  return mRec->vC(column);
789  }
790 
791  std::string vS(size_t columnFirst, size_t columnLast = std::numeric_limits<size_t>::max()) const
792  {
793  return mRec->vS(columnFirst, columnLast);
794  }
795 
796  std::string vF(size_t columnFirst, size_t columnLast) const
797  {
798  return mRec->vF(columnFirst, columnLast);
799  }
800 
801  int vI(int columnFirst, int columnLast) const
802  {
803  return mRec->vI(columnFirst, columnLast);
804  }
805 
806  // ----------------------------------------------------------------
807 
808  // Map a PDB residue location to a seqnum in a struct_asym
809  std::tuple<std::string, int, bool> MapResidue(char chainID, int resSeq, char iCode) const
810  {
811  auto key = std::make_tuple(chainID, resSeq, iCode);
812 
813  try
814  {
815  return mChainSeq2AsymSeq.at(key);
816  }
817  catch (const std::exception &ex)
818  {
819  throw_with_nested(std::runtime_error(std::string("Residue ") + chainID + std::to_string(resSeq) + iCode + " could not be mapped"));
820  }
821  }
822 
823  std::tuple<std::string, int, bool> MapResidue(char chainID, int resSeq, char iCode, std::error_code &ec) const
824  {
825  auto key = std::make_tuple(chainID, resSeq, iCode);
826 
827  std::tuple<std::string, int, bool> result;
828 
829  if (not mChainSeq2AsymSeq.count(key))
830  {
832  if (cif::VERBOSE > 0)
833  std::cerr << "Residue " << chainID << resSeq << iCode << " could not be mapped" << std::endl;
834  }
835  else
836  result = mChainSeq2AsymSeq.at(key);
837 
838  return result;
839  }
840 
841  std::tuple<std::string, int, bool> MapResidue(char chainID, int resSeq, char iCode, const std::string &resName);
842 
843  // ----------------------------------------------------------------
844 
845  void PreParseInput(std::istream &is);
846 
847  void GetNextRecord();
848  void Match(const std::string &expected, bool throwIfMissing);
849 
850  void ParseTitle();
851  void ParseCitation(const std::string &id);
852  void ParseRemarks();
853 
854  // void ParseRemark3();
855  // size_t ParseRemark3(const std::string& program, const Remark3Template templ[], size_t N);
856  // std::string NextRemark3Line();
857 
858  void ParseRemark200();
859  void ParseRemark350();
860 
861  void ParsePrimaryStructure();
862  void ParseHeterogen();
863  void ConstructEntities();
864  void ConstructSugarTrees(int &asymNr);
865  void ParseSecondaryStructure();
866  void ParseConnectivtyAnnotation();
867  void ParseMiscellaneousFeatures();
868  void ParseCrystallographic();
869  void ParseCoordinateTransformation();
870  void ParseCoordinate(int modelNr);
871  void ParseConnectivty();
872  void ParseBookkeeping();
873 
874  // ----------------------------------------------------------------
875 
876  category *getCategory(std::string name)
877  {
878  return &mDatablock[name];
879  }
880 
881  std::vector<std::string> SplitCSV(const std::string &value);
882 
883  std::string pdb2cifDate(std::string s, std::error_code &ec)
884  {
885  std::smatch m;
886  const std::regex
887  rx1(R"((\d{2})-(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)-(\d{2}))"),
888  rx2(R"((JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)-(\d{2}))");
889 
890  try
891  {
892  if (regex_match(s, m, rx1))
893  {
894  int day = stoi(m[1].str());
895  auto mi = kMonths.find(m[2].str());
896  if (mi == kMonths.end())
897  throw std::runtime_error("Invalid month: '" + m[2].str() + '\'');
898  int month = mi->second;
899  int year = 1900 + stoi(m[3].str());
900  if (year < 1950)
901  year += 100;
902 
903  std::stringstream ss;
904  ss << std::setw(4) << std::setfill('0') << year << '-'
905  << std::setw(2) << std::setfill('0') << month << '-'
906  << std::setw(2) << std::setfill('0') << day;
907 
908  s = ss.str();
909  }
910  else if (regex_match(s, m, rx2))
911  {
912  auto mi = kMonths.find(m[1].str());
913  if (mi == kMonths.end())
914  throw std::runtime_error("Invalid month: '" + m[1].str() + '\'');
915  int month = mi->second;
916  int year = 1900 + stoi(m[2].str());
917  if (year < 1950)
918  year += 100;
919 
920  s = cif::format("%04d-%02d", year, month).str();
921  }
922  else
924  }
925  catch (const std::exception &ex)
926  {
927  if (cif::VERBOSE > 0)
928  std::cerr << ex.what() << std::endl;
930  }
931 
932  return s;
933  }
934 
935  std::string pdb2cifDate(std::string s)
936  {
937  std::error_code ec;
938  auto result = pdb2cifDate(s, ec);
939  if (ec and cif::VERBOSE > 0)
940  std::cerr << "Invalid date(" << s << "): " << ec.message() << std::endl;
941  return result;
942  }
943 
944  std::string pdb2cifAuth(std::string author)
945  {
946  cif::trim(author);
947 
948  const std::regex rx(R"(((?:[A-Z]+\.)+)(.+))");
949  std::smatch m;
950  if (regex_match(author, m, rx))
951  author = m[2].str() + ", " + m[1].str();
952 
953  bool upper = true;
954  for (auto &c : author)
955  {
956  if (ispunct(c) or isspace(c))
957  upper = true;
958  else if (upper)
959  upper = false;
960  else
961  c = cif::tolower(c);
962  }
963 
964  return author;
965  }
966 
967  std::string pdb2cifSymmetry(std::string s)
968  {
969  static const std::regex sgRx(R"((\d{1,3})(\d{3}))");
970 
971  if (not s.empty())
972  {
973  std::smatch m;
974  if (not std::regex_match(s, m, sgRx))
975  throw std::runtime_error("invalid symmetry value '" + s + '\'');
976 
977  s = m[1].str() + "_" + m[2].str();
978  }
979 
980  return s;
981  }
982 
983  std::string pdb2cifCharge(std::string c)
984  {
985  std::regex rx(R"((\d+)(\+|-))");
986  std::smatch m;
987 
988  if (std::regex_match(c, m, rx))
989  {
990  if (m[2].str() == "-")
991  c = '-' + m[1].str();
992  else
993  c = m[1].str();
994  }
995 
996  return c;
997  }
998 
999  std::vector<char> altLocsForAtom(char chainID, int seqNum, char iCode, std::string atomName);
1000  void MapChainID2AsymIDS(char chainID, std::vector<std::string> &asymIds);
1001 
1002  std::tuple<ATOM_REF, bool> FindLink(const std::string &name1, const std::string &resName1, int resSeq1, char altLoc1, char chainID1, char iCode1,
1003  const std::string &name2, const std::string &resName2 = "")
1004  {
1005  return FindLink(ATOM_REF{ name1, resName1, resSeq1, altLoc1, chainID1, iCode1 }, name2, resName2);
1006  }
1007 
1008  std::tuple<ATOM_REF, bool> FindLink(const ATOM_REF &atom, const std::string &name2, const std::string &resName2 = "") const
1009  {
1010  auto i = std::find_if(mLinks.begin(), mLinks.end(), [&](const LINK &link)
1011  { return (link.a == atom and link.b.name == name2 and (resName2.empty() or link.b.resName == resName2)) or
1012  (link.b == atom and link.a.name == name2 and (resName2.empty() or link.a.resName == resName2)); });
1013 
1014  if (i != mLinks.end())
1015  return { i->a == atom ? i->b : i->a, true };
1016 
1017  return {};
1018  }
1019 
1020  // ----------------------------------------------------------------
1021 
1022  PDBRecord *mData;
1023  PDBRecord *mRec;
1024  cif::datablock mDatablock;
1025 
1026  std::string mStructureID;
1027  std::string mModelTypeDetails;
1028  std::string mOriginalDate;
1029  std::string mExpMethod = "X-RAY DIFFRACTION";
1030  int mCitationAuthorNr = 1, mCitationEditorNr = 1;
1031  int mNextMolID = 1, mNextEntityNr = 1;
1032  int mNextSoftwareOrd = 1;
1033 
1034  struct SEQADV
1035  {
1036  std::string resName;
1037  char chainID;
1038  int seqNum;
1039  char iCode;
1040  std::string database;
1041  std::string dbAccession;
1042  std::string dbRes;
1043  int dbSeq;
1044  std::string conflict;
1045  };
1046 
1047  std::vector<SEQADV> mSeqadvs;
1048 
1049  std::list<PDBCompound> mCompounds;
1050  std::list<PDBChain> mChains;
1051  std::vector<HET> mHets;
1052  std::map<std::string, std::string> mHetnams;
1053  std::map<std::string, std::string> mHetsyns;
1054  std::map<std::string, std::string> mFormuls;
1055  std::string mWaterHetID;
1056  std::vector<std::string> mChemComp, mAtomTypes;
1057 
1058  std::map<std::string, std::string> mRemark200;
1059  std::string mRefinementSoftware;
1060  int mAtomID = 0;
1061  int mPdbxDifOrdinal = 0;
1062 
1063  std::vector<UNOBS> mUnobs;
1064  std::vector<LINK> mLinks;
1065 
1066  // various maps between numbering schemes
1067  std::map<std::tuple<char, int, char>, std::tuple<std::string, int, bool>> mChainSeq2AsymSeq;
1068 
1069  std::map<int, std::string> mMolID2EntityID;
1070  std::map<std::string, std::string> mHet2EntityID;
1071  std::map<std::string, std::string> mBranch2EntityID;
1072  std::map<std::string, std::string> mAsymID2EntityID;
1073  std::map<std::string, std::string> mMod2parent;
1074  std::set<std::string> mSugarEntities;
1075 };
1076 
1077 // --------------------------------------------------------------------
1078 
1079 std::vector<char> PDBFileParser::altLocsForAtom(char inChainID, int inResSeq, char inICode, std::string inAtomName)
1080 {
1081  // well, maybe this could be optimized...
1082  std::set<char> result;
1083 
1084  for (auto r = mData; r != nullptr; r = r->mNext)
1085  {
1086  if (r->is("ATOM ") or r->is("HETATM")) // 1 - 6 Record name "ATOM "
1087  { // ...
1088  std::string name = r->vS(13, 16); // 13 - 16 Atom name Atom name.
1089  char altLoc = r->vC(17); // 17 Character altLoc Alternate location indicator.
1090  char chainID = r->vC(22); // 22 Character chainID Chain identifier.
1091  int resSeq = r->vI(23, 26); // 23 - 26 Integer resSeq Residue sequence number.
1092  char iCode = r->vC(27); // 27 AChar iCode Code for insertion of residues.
1093 
1094  if (chainID == inChainID and resSeq == inResSeq and iCode == inICode and name == inAtomName and altLoc != ' ')
1095  result.insert(altLoc);
1096  }
1097  }
1098 
1099  return { result.begin(), result.end() };
1100 }
1101 
1102 void PDBFileParser::MapChainID2AsymIDS(char chainID, std::vector<std::string> &asymIds)
1103 {
1104  for (const auto &[key, value] : mChainSeq2AsymSeq)
1105  {
1106  if (std::get<0>(key) == chainID)
1107  asymIds.push_back(std::get<0>(value));
1108  }
1109 
1110  std::sort(asymIds.begin(), asymIds.end(), [](const std::string &a, const std::string &b)
1111  {
1112  int d = static_cast<int>(a.length() - b.length());
1113  if (d == 0)
1114  d = a.compare(b);
1115  return d < 0; });
1116 
1117  asymIds.erase(std::unique(asymIds.begin(), asymIds.end()), asymIds.end());
1118 }
1119 
1120 // --------------------------------------------------------------------
1121 
1122 void PDBFileParser::PreParseInput(std::istream &is)
1123 {
1124  std::string lookahead;
1125  uint32_t lineNr = 1;
1126  getline(is, lookahead);
1127 
1128  if (lookahead.back() == '\r')
1129  lookahead.pop_back();
1130 
1131  // if (cif::starts_with(lookahead, "HEADER") == false)
1132  // throw std::runtime_error("This does not look like a PDB file, should start with a HEADER line");
1133 
1134  auto contNr = [&lookahead](int offset, int len) -> int
1135  {
1136  std::string cs = lookahead.substr(offset, len);
1137  cif::trim(cs);
1138  int result = 0;
1139 
1140  if (not cs.empty())
1141  {
1142  auto r = std::from_chars(cs.data(), cs.data() + cs.length(), result);
1143  if (r.ec != std::errc())
1144  throw std::runtime_error("Continuation std::string '" + cs + "' is not valid");
1145  }
1146 
1147  return result;
1148  };
1149 
1150  PDBRecord *last = nullptr;
1151  std::set<std::string> dropped;
1152 
1153  for (;;)
1154  {
1155  if (lookahead.empty())
1156  {
1157  if (is.eof())
1158  break;
1159 
1160  if (cif::VERBOSE > 0)
1161  std::cerr << "Line number " << lineNr << " is empty!" << std::endl;
1162 
1163  getline(is, lookahead);
1164  ++lineNr;
1165 
1166  continue;
1167  }
1168 
1169  std::string type = lookahead.substr(0, 6);
1170  std::string value;
1171  if (lookahead.length() > 6)
1172  value = cif::trim_right_copy(lookahead.substr(6));
1173 
1174  uint32_t curLineNr = lineNr;
1175  getline(is, lookahead);
1176  ++lineNr;
1177 
1178  if (kSupportedRecords.count(type) == 0)
1179  {
1180  cif::trim(type);
1181 
1182  if (type != "END") // special case
1183  dropped.insert(type);
1184 
1185  lookahead.clear();
1186 
1187  continue;
1188  }
1189 
1190  // see if we need to append continuation values
1191  if (type == "AUTHOR" or
1192  type == "EXPDTA" or
1193  type == "MDLTYP" or
1194  type == "KEYWDS" or
1195  type == "SPLIT " or
1196  type == "SPRSDE" or
1197  type == "TITLE ")
1198  {
1199  int n = 2;
1200  while (lookahead.substr(0, 6) == type and contNr(7, 3) == n)
1201  {
1202  value += cif::trim_right_copy(lookahead.substr(10));
1203  getline(is, lookahead);
1204  ++lineNr;
1205  ++n;
1206  }
1207  }
1208  else if (type == "COMPND")
1209  {
1210  int n = 2;
1211  value += '\n';
1212  while (lookahead.substr(0, 6) == type and contNr(7, 3) == n)
1213  {
1214  value += cif::trim_right_copy(lookahead.substr(10));
1215  value += '\n';
1216  getline(is, lookahead);
1217  ++lineNr;
1218  ++n;
1219  }
1220  }
1221  else if (type == "REVDAT")
1222  {
1223  int revNr = stoi(value.substr(1, 3));
1224  int n = 2;
1225  while (lookahead.substr(0, 6) == type and
1226  stoi(lookahead.substr(7, 3)) == revNr and
1227  contNr(10, 2) == n)
1228  {
1229  value += lookahead.substr(38);
1230  getline(is, lookahead);
1231  ++lineNr;
1232  ++n;
1233  }
1234  }
1235  else if (type == "CAVEAT")
1236  {
1237  int n = 2;
1238  while (lookahead.substr(0, 6) == type and contNr(7, 3) == n)
1239  {
1240  value += cif::trim_right_copy(lookahead.substr(13));
1241  getline(is, lookahead);
1242  ++lineNr;
1243  ++n;
1244  }
1245  }
1246  else if (type == "OBSLTE")
1247  {
1248  while (lookahead.substr(0, 6) == type)
1249  {
1250  value += lookahead.substr(31);
1251  getline(is, lookahead);
1252  ++lineNr;
1253  }
1254  }
1255  else if (type == "SOURCE")
1256  {
1257  value += '\n';
1258  int n = 2;
1259  while (lookahead.substr(0, 6) == type and contNr(7, 3) == n)
1260  {
1261  value += cif::trim_copy(lookahead.substr(10));
1262  value += '\n';
1263  getline(is, lookahead);
1264  ++lineNr;
1265  ++n;
1266  }
1267  }
1268  else if (type == "FORMUL")
1269  {
1270  try
1271  {
1272  int compNr;
1273  try
1274  {
1275  compNr = stoi(value.substr(1, 3));
1276  }
1277  catch (const std::exception &ex)
1278  {
1279  if (cif::VERBOSE >= 0)
1280  std::cerr << "Dropping FORMUL line (" << (lineNr - 1) << ") with invalid component number '" << value.substr(1, 3) << '\'' << std::endl;
1281  continue;
1282  // throw_with_nested(std::runtime_error("Invalid component number '" + value.substr(1, 3) + '\''));
1283  }
1284 
1285  int n = 2;
1286  try
1287  {
1288  while (lookahead.substr(0, 6) == type and
1289  stoi(lookahead.substr(7, 3)) == compNr and
1290  contNr(16, 2) == n)
1291  {
1292  value += cif::trim_right_copy(lookahead.substr(19));
1293  ;
1294  getline(is, lookahead);
1295  ++lineNr;
1296  ++n;
1297  }
1298  }
1299  catch (const std::invalid_argument &ex)
1300  {
1301  continue;
1302  // throw_with_nested(std::runtime_error("Invalid component number '" + lookahead.substr(7, 3) + '\''));
1303  }
1304  }
1305  catch (const std::exception &ex)
1306  {
1307  if (cif::VERBOSE >= 0)
1308  std::cerr << "Error parsing FORMUL at line " << lineNr << std::endl;
1309  throw;
1310  }
1311  }
1312  else if (type == "HETNAM" or
1313  type == "HETSYN")
1314  {
1315  int n = 2;
1316  while (lookahead.substr(0, 6) == type and contNr(8, 2) == n)
1317  {
1318  value += cif::trim_right_copy(lookahead.substr(16));
1319  ;
1320  getline(is, lookahead);
1321  ++lineNr;
1322  ++n;
1323  }
1324  }
1325  else if (type == "SITE ")
1326  {
1327  std::string siteName = value.substr(5, 3);
1328  cif::trim_right(value);
1329  size_t n = value.length() - 12;
1330  value += std::string(11 - (n % 11), ' ');
1331 
1332  while (lookahead.substr(0, 6) == type and lookahead.substr(11, 3) == siteName)
1333  {
1334  std::string s = lookahead.substr(18);
1335  cif::trim_right(s);
1336  s += std::string(11 - (s.length() % 11), ' ');
1337  value += s;
1338 
1339  // TODO: improve this... either use numRes or don't lump together all text
1340  // value += " " + cif::trim_right_copy();
1341  getline(is, lookahead);
1342  ++lineNr;
1343  }
1344  }
1345  else if (type == "REMARK")
1346  {
1347  type += value.substr(0, 4);
1348 
1349  // parse it now, makes life easier later on
1350  if (type == "REMARK 200" or type == "REMARK 240")
1351  {
1352  auto i = value.find(":");
1353 
1354  if (i != std::string::npos)
1355  {
1356  std::string k = value.substr(4, i - 4);
1357  std::string v = value.substr(i + 1);
1358 
1359  cif::trim(k);
1360  while (k.find(" ") != std::string::npos)
1361  cif::replace_all(k, " ", " ");
1362  cif::trim(v);
1363 
1364  if (iequals(v, "NONE") or iequals(v, "N/A") or iequals(v, "NAN"))
1365  mRemark200[k] = ".";
1366  else if (not iequals(v, "NULL"))
1367  mRemark200[k] = v;
1368  }
1369  }
1370  }
1371 
1372  PDBRecord *cur = new (value.length()) PDBRecord(curLineNr, type, value);
1373 
1374  if (last == nullptr)
1375  last = mData = cur;
1376  else
1377  last->mNext = cur;
1378 
1379  last = cur;
1380 
1381  cif::trim(type);
1382 
1383  if (type == "LINK" or type == "LINKR")
1384  {
1385  LINK link = {};
1386 
1387  link.a.name = cur->vS(13, 16); // 13 - 16 Atom name1 Atom name.
1388  link.a.altLoc = cur->vC(17); // 17 Character altLoc1 Alternate location indicator.
1389  link.a.resName = cur->vS(18, 20); // 18 - 20 Residue name resName1 Residue name.
1390  link.a.chainID = cur->vC(22); // 22 Character chainID1 Chain identifier.
1391  link.a.resSeq = cur->vI(23, 26); // 23 - 26 Integer resSeq1 Residue sequence number.
1392  link.a.iCode = cur->vC(27); // 27 AChar iCode1 Insertion code.
1393  link.b.name = cur->vS(43, 46); // 43 - 46 Atom name2 Atom name.
1394  link.b.altLoc = cur->vC(47); // 47 Character altLoc2 Alternate location indicator.
1395  link.b.resName = cur->vS(48, 50); // 48 - 50 Residue name resName2 Residue name.
1396  link.b.chainID = cur->vC(52); // 52 Character chainID2 Chain identifier.
1397  link.b.resSeq = cur->vI(53, 56); // 53 - 56 Integer resSeq2 Residue sequence number.
1398  link.b.iCode = cur->vC(57); // 57 AChar iCode2 Insertion code.
1399  link.symOpA = cur->vS(60, 65); // 60 - 65 SymOP sym1 Symmetry operator atom 1.
1400  link.symOpB = cur->vS(67, 72); // 67 - 72 SymOP sym2 Symmetry operator atom 2.
1401 
1402  if (type == "LINK") // 1 - 6 Record name "LINK "
1403  {
1404  auto f = cur->vF(74, 78);
1405  auto r = cif::from_chars(f.data(), f.data() + f.length(), link.distance);
1406  if (r.ec != std::errc() and cif::VERBOSE > 0)
1407  std::cerr << "Error parsing link distance at line " << cur->mLineNr << std::endl;
1408  }
1409  // 74 – 78 Real(5.2) Length Link distance
1410 
1411  mLinks.push_back(link);
1412  }
1413 
1414  if (type == "END")
1415  break;
1416  }
1417 
1418  if (not dropped.empty())
1419  {
1420  if (cif::VERBOSE >= 0)
1421  std::cerr << "Dropped unsupported records: " << cif::join(dropped, ", ") << std::endl;
1422  }
1423 
1424  if (mData == nullptr)
1425  throw std::runtime_error("Empty file?");
1426 
1427  mRec = mData;
1428 }
1429 
1430 void PDBFileParser::GetNextRecord()
1431 {
1432  if (mRec != nullptr)
1433  mRec = mRec->mNext;
1434 
1435  if (mRec == nullptr)
1436  {
1437  static PDBRecord *end = new (0) PDBRecord({ 0, "END ", "" });
1438  mRec = end;
1439  }
1440 }
1441 
1442 void PDBFileParser::Match(const std::string &expected, bool throwIfMissing)
1443 {
1444  assert(mRec);
1445  if (mRec->mName != expected)
1446  {
1447  if (throwIfMissing)
1448  throw std::runtime_error("Expected record " + expected + " but found " + mRec->mName);
1449  if (cif::VERBOSE > 0)
1450  std::cerr << "Expected record " + expected + " but found " + mRec->mName << std::endl;
1451  }
1452 }
1453 
1454 std::vector<std::string> PDBFileParser::SplitCSV(const std::string &value)
1455 {
1456  auto vs = cif::split<std::string>(value, ",");
1457  for (auto &v : vs)
1458  cif::trim(v);
1459  return vs;
1460 }
1461 
1462 void PDBFileParser::ParseTitle()
1463 {
1464  // strict ordering required
1465 
1466  // HEADER
1467  // 1 - 6 Record name "HEADER"
1468  // 11 - 50 String(40) classification Classifies the molecule(s).
1469  // 51 - 59 Date depDate Deposition date. This is the date the
1470  // coordinates were received at the PDB.
1471  // 63 - 66 IDcode idCode This identifier is unique within the PDB.
1472 
1473  Match("HEADER", false);
1474 
1475  std::string keywords;
1476 
1477  if (mRec->is("HEADER"))
1478  {
1479  mStructureID = vS(63, 66);
1480  keywords = vS(11, 50);
1481  mOriginalDate = pdb2cifDate(vS(51, 59));
1482 
1483  cif::trim(keywords);
1484 
1485  GetNextRecord();
1486  }
1487 
1488  cif::trim(mStructureID);
1489  if (mStructureID.empty())
1490  mStructureID = "nohd";
1491 
1492  mDatablock.set_name(mStructureID);
1493 
1494  auto cat = getCategory("entry");
1495  // cat->addColumn("id");
1496  cat->emplace({
1497  { "id", mStructureID } });
1498 
1499  // OBSLTE
1500  if (mRec->is("OBSLTE"))
1501  {
1502  // 1 - 6 Record name "OBSLTE"
1503  // 9 - 10 Continuation continuation Allows concatenation of multiple records
1504  // 12 - 20 Date repDate Date that this datablock was replaced.
1505  // 22 - 25 IDcode idCode ID code of this datablock.
1506  // 32 - 35 IDcode rIdCode ID code of datablock that replaced this one.
1507  // 37 - 40 ...
1508 
1509  std::string old = vS(22, 25);
1510  std::string date = pdb2cifDate(vS(12, 20));
1511  cat = getCategory("pdbx_database_PDB_obs");
1512 
1513  std::string value = mRec->vS(32);
1514  for (auto i : cif::split<std::string>(value, " ", true))
1515  {
1516  cat->emplace({
1517  { "id", "OBSLTE" },
1518  { "date", date },
1519  { "replace_pdb_id", old },
1520  { "pdb_id", i } });
1521  }
1522 
1523  GetNextRecord();
1524  }
1525 
1526  // TITLE
1527  Match("TITLE ", false);
1528  std::string title;
1529  if (mRec->is("TITLE ")) // 1 - 6 Record name "TITLE "
1530  { // 9 - 10 Continuation continuation Allows concatenation of multiple records.
1531  title = vS(11); // 11 - 80 String title Title of the experiment.
1532  GetNextRecord();
1533  }
1534 
1535  // SPLIT
1536  if (mRec->is("SPLIT "))
1537  {
1538  // 1 - 6 Record name "SPLIT "
1539  // 9 - 10 Continuation continuation Allows concatenation of multiple records.
1540  // 12 - 15 IDcode idCode ID code of related datablock.
1541 
1542  throw std::runtime_error("SPLIT PDB files are not supported");
1543  }
1544 
1545  // CAVEAT
1546  int caveatID = 1;
1547  while (mRec->is("CAVEAT")) // 1 - 6 Record name "CAVEAT"
1548  {
1549  getCategory("database_PDB_caveat")->emplace({
1550  { "id", caveatID++ },
1551  { "text", std::string{ mRec->vS(20) } } // 20 - 79 String comment Free text giving the reason for the CAVEAT.
1552  });
1553 
1554  GetNextRecord();
1555  }
1556 
1557  // COMPND
1558  Match("COMPND", false);
1559  // 1 - 6 Record name "COMPND"
1560  // 8 - 10 Continuation continuation Allows concatenation of multiple records.
1561  // 11 - 80 Specification compound Description of the molecular components.
1562  // list
1563 
1564  std::string value{ mRec->vS(11) };
1565  if (value.find(':') == std::string::npos)
1566  {
1567  // special case for dumb, stripped files
1568  auto &comp = GetOrCreateCompound(1);
1569  comp.mInfo["MOLECULE"] = value;
1570  }
1571  else
1572  {
1573  SpecificationListParser p(value);
1574 
1575  for (;;)
1576  {
1577  std::string key, val;
1578  std::tie(key, val) = p.GetNextSpecification();
1579 
1580  if (key.empty())
1581  break;
1582 
1583  if (not iequals(key, "MOL_ID") and mCompounds.empty())
1584  {
1585  if (cif::VERBOSE > 0)
1586  std::cerr << "Ignoring invalid COMPND record" << std::endl;
1587  break;
1588  }
1589 
1590  if (key == "MOL_ID")
1591  {
1592  auto &comp = GetOrCreateCompound(stoi(val));
1593  comp.mTitle = title;
1594  }
1595  else if (key == "CHAIN")
1596  {
1597  for (auto c : cif::split<std::string>(val, ","))
1598  {
1599  cif::trim(c);
1600  mCompounds.back().mChains.insert(c[0]);
1601  }
1602  }
1603  else
1604  mCompounds.back().mInfo[key] = val;
1605  }
1606  }
1607 
1608  if (mRec->is("COMPND"))
1609  GetNextRecord();
1610 
1611  // SOURCE
1612  Match("SOURCE", false);
1613 
1614  if (mRec->is("SOURCE"))
1615  {
1616  // 1 - 6 Record name "SOURCE"
1617  // 8 - 10 Continuation continuation Allows concatenation of multiple records.
1618  // 11 - 79 Specification srcName Identifies the source of the
1619  // List macromolecule in a token: value format.
1620 
1621  std::map<std::string, std::string> *source = nullptr;
1622 
1623  // value = { mRec->vS(11) };
1624  // for (auto si = ba::make_split_iterator(value, ba::token_finder(ba::is_any_of(";"), ba::token_compress_on)); not si.eof(); ++si)
1625  // {
1626  // std::string s(si->begin(), si->end());
1627  // if (s.empty())
1628  // continue;
1629  //
1630  // auto colon = s.find(": ");
1631  // if (colon == std::string::npos)
1632  // {
1633  // if (cif::VERBOSE > 0)
1634  // std::cerr << "invalid source field, missing colon (" << s << ')' << std::endl;
1635  // continue;
1636  // }
1637  SpecificationListParser p(vS(11));
1638 
1639  for (;;)
1640  {
1641  std::string key, val;
1642  std::tie(key, val) = p.GetNextSpecification();
1643 
1644  if (key.empty())
1645  break;
1646 
1647  if (key == "MOL_ID")
1648  {
1649  for (auto &c : mCompounds)
1650  {
1651  if (c.mMolID == stoi(val))
1652  {
1653  source = &c.mSource;
1654  break;
1655  }
1656  }
1657 
1658  continue;
1659  }
1660 
1661  if (source == nullptr)
1662  throw std::runtime_error("At line " + std::to_string(mRec->mLineNr) + ": missing MOL_ID in SOURCE");
1663 
1664  (*source)[key] = val;
1665  }
1666 
1667  GetNextRecord();
1668  }
1669 
1670  // KEYWDS
1671  Match("KEYWDS", false);
1672  std::string pdbxKeywords;
1673 
1674  if (mRec->is("KEYWDS")) // 1 - 6 Record name "KEYWDS"
1675  { // 9 - 10 Continuation continuation Allows concatenation of records if necessary.
1676  pdbxKeywords = vS(11); // 11 - 79 List keywds Comma-separated list of keywords relevant
1677  // to the datablock.
1678  GetNextRecord();
1679  }
1680 
1681  if (not(keywords.empty() and pdbxKeywords.empty()))
1682  {
1683  getCategory("struct_keywords")->emplace({
1684  { "entry_id", mStructureID },
1685  { "pdbx_keywords", keywords },
1686  { "text", pdbxKeywords } });
1687  }
1688 
1689  // EXPDTA
1690  Match("EXPDTA", false);
1691  if (mRec->is("EXPDTA"))
1692  {
1693  mExpMethod = vS(11);
1694 
1695  cat = getCategory("exptl");
1696 
1697  auto crystals = cif::split<std::string>(mRemark200["NUMBER OF CRYSTALS USED"], "; ");
1698  if (crystals.empty())
1699  crystals.push_back("");
1700  auto ci = crystals.begin();
1701 
1702  for (auto expMethod : cif::split<std::string>(mExpMethod, ";"))
1703  {
1704  cif::trim(expMethod);
1705 
1706  if (expMethod.empty())
1707  continue;
1708 
1709  cat->emplace({
1710  { "entry_id", mStructureID },
1711  { "method", expMethod },
1712  { "crystals_number", ci != crystals.end() ? *ci : "" } });
1713  }
1714 
1715  GetNextRecord();
1716  }
1717 
1718  // NUMMDL
1719  if (mRec->is("NUMMDL"))
1720  {
1721  if (cif::VERBOSE > 0)
1722  std::cerr << "skipping unimplemented NUMMDL record" << std::endl;
1723  GetNextRecord();
1724  }
1725 
1726  // MDLTYP
1727  if (mRec->is("MDLTYP"))
1728  {
1729  mModelTypeDetails = vS(11);
1730  GetNextRecord();
1731  }
1732 
1733  // AUTHOR
1734  Match("AUTHOR", false);
1735  if (mRec->is("AUTHOR"))
1736  {
1737  int n = 1;
1738  cat = getCategory("audit_author");
1739 
1740  value = { mRec->vS(11) };
1741  for (auto author : cif::split<std::string>(value, ",", true))
1742  {
1743  cat->emplace({
1744  { "name", pdb2cifAuth(author) },
1745  { "pdbx_ordinal", n } });
1746  ++n;
1747  }
1748 
1749  GetNextRecord();
1750  }
1751 
1752  // REVDAT
1753  bool firstRevDat = true;
1754  struct RevDat
1755  {
1756  int revNum;
1757  std::string date, dateOriginal, replaces;
1758  int modType;
1759  std::vector<std::string> types;
1760 
1761  bool operator<(const RevDat &rhs) const { return revNum < rhs.revNum; }
1762  };
1763  std::vector<RevDat> revdats;
1764 
1765  while (mRec->is("REVDAT"))
1766  {
1767  // 1 - 6 Record name "REVDAT"
1768  int revNum = vI(8, 10); // 8 - 10 Integer modNum Modification number.
1769  // 11 - 12 Continuation continuation Allows concatenation of multiple records.
1770  std::string date = pdb2cifDate(vS(14, 22)); // 14 - 22 Date modDate Date of modification (or release for
1771  // new entries) in DD-MMM-YY format. This is
1772  // not repeated on continued lines.
1773  std::string modID = vS(24, 27); // 24 - 27 IDCode modID ID code of this datablock. This is not repeated on
1774  // continuation lines.
1775  int modType = vI(32, 32); // 32 Integer modType An integer identifying the type of
1776  // modification. For all revisions, the
1777  // modification type is listed as 1
1778  std::string detail = vS(40); // 40 - 45 LString(6) record Modification detail.
1779  // 47 - 52 LString(6) record Modification detail.
1780  // 54 - 59 LString(6) record Modification detail.
1781  // 61 - 66 LString(6) record Modification detail.
1782 
1783  revdats.push_back({ revNum, date, modType == 0 ? mOriginalDate : "", modID, modType });
1784 
1785  revdats.back().types = cif::split<std::string>(detail, " ");
1786 
1787  if (firstRevDat)
1788  {
1789  cat = getCategory("database_2");
1790  cat->emplace({
1791  { "database_id", "PDB" },
1792  { "database_code", modID } });
1793  }
1794 
1795  GetNextRecord();
1796  firstRevDat = false;
1797  }
1798 
1799  /*
1800  This is internal stuff for PDB, don't write it ???
1801 */
1802  sort(revdats.begin(), revdats.end());
1803  for (auto &revdat : revdats)
1804  {
1805  getCategory("database_PDB_rev")->emplace({
1806  { "num", revdat.revNum },
1807  { "date", revdat.date },
1808  { "date_original", revdat.dateOriginal },
1809  { "replaces", revdat.replaces },
1810  { "mod_type", revdat.modType } });
1811 
1812  for (auto &type : revdat.types)
1813  {
1814  if (type.empty())
1815  continue;
1816  getCategory("database_PDB_rev_record")->emplace({
1817  { "rev_num", revdat.revNum },
1818  { "type", type } });
1819  }
1820  }
1821  //*/
1822 
1823  // SPRSDE
1824  if (mRec->is("SPRSDE"))
1825  {
1826  if (cif::VERBOSE > 0)
1827  std::cerr << "skipping unimplemented SPRSDE record" << std::endl;
1828  GetNextRecord();
1829  }
1830 
1831  // JRNL
1832  if (mRec->is("JRNL "))
1833  ParseCitation("primary");
1834 }
1835 
1836 void PDBFileParser::ParseCitation(const std::string &id)
1837 {
1838  const char *rec = mRec->mName;
1839 
1840  std::string auth, titl, edit, publ, refn, pmid, doi;
1841  std::string pubname, volume, astm, country, issn, csd;
1842  std::string pageFirst;
1843  int year = 0;
1844 
1845  auto extend = [](std::string &s, const std::string &p)
1846  {
1847  if (not s.empty())
1848  s += ' ';
1849  s += cif::trim_copy(p);
1850  };
1851 
1852  while (mRec->is(rec) and (id == "primary" or vC(12) == ' '))
1853  {
1854  std::string k = vS(13, 16);
1855  if (k == "AUTH")
1856  extend(auth, vS(20, 79));
1857  else if (k == "TITL")
1858  extend(titl, vS(20, 79));
1859  else if (k == "EDIT")
1860  extend(edit, vS(20, 79));
1861  else if (k == "REF")
1862  {
1863  if (pubname.empty())
1864  {
1865  extend(pubname, vS(20, 47));
1866  if (vS(50, 51) == "V.")
1867  volume = cif::trim_copy(vS(52, 55));
1868  pageFirst = vS(57, 61);
1869  year = vI(63, 66);
1870  }
1871  else
1872  extend(pubname, vS(20, 47));
1873  }
1874  else if (k == "PUBL")
1875  extend(publ, vS(20, 70));
1876  else if (k == "REFN")
1877  {
1878  if (vS(20, 23) == "ASTN")
1879  astm = vS(25, 30);
1880  country = vS(33, 34);
1881  if (vS(36, 39) == "ISSN")
1882  issn = vS(41, 65);
1883  }
1884  else if (k == "PMID")
1885  pmid = vS(20, 79);
1886  else if (k == "DOI")
1887  doi = vS(20, 79);
1888 
1889  GetNextRecord();
1890  }
1891 
1892  auto cat = getCategory("citation");
1893  cat->emplace({
1894  { "id", id },
1895  { "title", titl },
1896  { "journal_abbrev", pubname },
1897  { "journal_volume", volume },
1898  { "page_first", pageFirst },
1899  { "year", year > 0 ? std::to_string(year) : "" },
1900  { "journal_id_ASTM", astm },
1901  { "country", country },
1902  { "journal_id_ISSN", issn },
1903  { "journal_id_CSD", csd },
1904  { "book_publisher", publ },
1905  { "pdbx_database_id_PubMed", pmid },
1906  { "pdbx_database_id_DOI", doi } });
1907 
1908  if (not auth.empty())
1909  {
1910  cat = getCategory("citation_author");
1911  for (auto author : cif::split<std::string>(auth, ",", true))
1912  {
1913  cat->emplace({
1914  { "citation_id", id },
1915  { "name", pdb2cifAuth(author) },
1916  { "ordinal", mCitationAuthorNr } });
1917 
1918  ++mCitationAuthorNr;
1919  }
1920  }
1921 
1922  if (not edit.empty())
1923  {
1924  cat = getCategory("citation_editor");
1925  for (auto editor : cif::split<std::string>(edit, ",", true))
1926  {
1927  cat->emplace({
1928  { "citation_id", id },
1929  { "name", pdb2cifAuth(editor) },
1930  { "ordinal", mCitationEditorNr } });
1931 
1932  ++mCitationEditorNr;
1933  }
1934  }
1935 }
1936 
1937 void PDBFileParser::ParseRemarks()
1938 {
1939  std::string sequenceDetails, compoundDetails, sourceDetails;
1940 
1941  while (cif::starts_with(mRec->mName, "REMARK"))
1942  {
1943  int remarkNr = vI(8, 10);
1944 
1945  try
1946  {
1947  switch (remarkNr)
1948  {
1949  case 1:
1950  while (mRec->is("REMARK 1"))
1951  {
1952  if (mRec->mVlen > 15 and vS(12, 20) == "REFERENCE")
1953  {
1954  std::string id = vS(22, 70);
1955  GetNextRecord();
1956 
1957  ParseCitation(id);
1958  }
1959  else
1960  GetNextRecord();
1961  }
1962  break;
1963 
1964  case 3:
1965  // we skip REMARK 3 until we know the mapping
1966  while (mRec->is("REMARK 3"))
1967  GetNextRecord();
1968  break;
1969 
1970  case 4:
1971  // who cares...
1972  while (mRec->is("REMARK 4"))
1973  GetNextRecord();
1974  break;
1975 
1976  case 100:
1977  {
1978  const std::regex rx(R"(THE (\S+) ID CODE IS (\S+?)\.?\s*)");
1979  std::smatch m;
1980  std::string r = vS(12);
1981 
1982  if (std::regex_match(r, m, rx))
1983  {
1984  auto cat = getCategory("database_2");
1985  cat->emplace({
1986  { "database_id", m[1].str() },
1987  { "database_code", m[2].str() } });
1988  }
1989 
1990  GetNextRecord();
1991  break;
1992  }
1993 
1994  case 200:
1995  {
1996  // we already parsed most of this remark, but the "REMARK:" part might have been split.
1997 
1998  bool remark = false;
1999 
2000  do
2001  {
2002  std::string r = mRec->vS(12);
2003 
2004  if (cif::starts_with(r, "REMARK: "))
2005  {
2006  mRemark200["REMARK"] = r.substr(8);
2007  remark = true;
2008  }
2009  else if (remark)
2010  {
2011  if (r.empty())
2012  remark = false;
2013  else
2014  mRemark200["REMARK"] += r;
2015  }
2016 
2017  GetNextRecord();
2018  } while (mRec->is("REMARK 200"));
2019  break;
2020  }
2021 
2022  case 280:
2023  {
2024  std::string density_Matthews, densityPercentSol, conditions;
2025 
2026  const std::regex rx1(R"(SOLVENT CONTENT, VS +\(%\): *(.+))"),
2027  rx2(R"(MATTHEWS COEFFICIENT, VM \(ANGSTROMS\*\*3/DA\): *(.+))");
2028 
2029  std::smatch m;
2030 
2031  do
2032  {
2033  std::string r = vS(12);
2034 
2035  if (conditions.empty())
2036  {
2037  if (std::regex_match(r, m, rx1))
2038  densityPercentSol = m[1].str();
2039  else if (std::regex_match(r, m, rx2))
2040  density_Matthews = m[1].str();
2041  else if (cif::starts_with(r, "CRYSTALLIZATION CONDITIONS: "))
2042  conditions = r.substr(28);
2043  }
2044  else
2045  conditions = conditions + ' ' + r;
2046 
2047  GetNextRecord();
2048  } while (mRec->is("REMARK 280"));
2049 
2050  std::string desc = mRemark200["REMARK"];
2051  if (desc == "NULL")
2052  desc.clear();
2053 
2054  getCategory("exptl_crystal")->emplace({
2055  { "id", 1 },
2056  { "density_Matthews", iequals(density_Matthews, "NULL") ? "" : density_Matthews },
2057  { "density_percent_sol", iequals(densityPercentSol, "NULL") ? "" : densityPercentSol },
2058  { "description", desc } });
2059 
2060  // now try to parse the conditions
2061  const std::regex rx3(R"(TEMPERATURE +(\d+)K)"), rx4(R"(PH *(?:: *)?(\d+(?:\.\d+)?))") /*, rx5(R"(\b(\d+)C\b)")*/;
2062 
2063  std::string temp, ph, method;
2064 
2065  for (auto s : cif::split<std::string>(conditions, ",", true))
2066  {
2067  cif::trim(s);
2068 
2069  if (std::regex_search(s, m, rx3))
2070  temp = m[1].str();
2071  if (std::regex_search(s, m, rx4))
2072  ph = m[1].str();
2073  if (s.length() < 60 and
2074  (cif::icontains(s, "drop") or cif::icontains(s, "vapor") or cif::icontains(s, "batch")))
2075  {
2076  if (not method.empty())
2077  method = method + ", " + s;
2078  else
2079  method = s;
2080  }
2081  }
2082 
2083  if (not(method.empty() and temp.empty() and ph.empty() and (conditions.empty() or conditions == "NULL")))
2084  {
2085  getCategory("exptl_crystal_grow")->emplace({
2086  { "crystal_id", 1 },
2087  { "method", method },
2088  { "temp", temp },
2089  { "pH", ph },
2090  { "pdbx_details", conditions } });
2091  }
2092 
2093  break;
2094  }
2095 
2096  // case 290:
2097  //
2098  // break;
2099 
2100  case 350:
2101  // postponed since we don't have the required information yet
2102  for (; mRec->is("REMARK 350"); GetNextRecord())
2103  ;
2104  break;
2105 
2106  case 400:
2107  {
2108  std::stringstream s;
2109  GetNextRecord();
2110  if (vS(12) == "COMPOUND")
2111  GetNextRecord();
2112 
2113  while (mRec->is("REMARK 400"))
2114  {
2115  s << vS(12) << std::endl;
2116  GetNextRecord();
2117  }
2118 
2119  compoundDetails = s.str();
2120  break;
2121  }
2122 
2123  case 450:
2124  {
2125  std::stringstream s;
2126  GetNextRecord();
2127  if (vS(12) == "SOURCE")
2128  GetNextRecord();
2129 
2130  while (mRec->is("REMARK 450"))
2131  {
2132  s << vS(12) << std::endl;
2133  GetNextRecord();
2134  }
2135 
2136  sourceDetails = s.str();
2137  break;
2138  }
2139 
2140  case 465:
2141  {
2142  bool headerSeen = false;
2143  std::regex rx(R"( *MODELS *(\d+)-(\d+))");
2144  int models[2] = { -1, -1 };
2145 
2146  for (; mRec->is("REMARK 465"); GetNextRecord())
2147  {
2148  if (not headerSeen)
2149  {
2150  std::string line = vS(12);
2151  std::smatch m;
2152 
2153  if (std::regex_match(line, m, rx))
2154  {
2155  models[0] = std::stoi(m[1].str());
2156  models[1] = stoi(m[2].str());
2157  }
2158  else
2159  headerSeen = cif::contains(line, "RES C SSSEQI");
2160  continue;
2161  }
2162 
2163  if (models[0] == models[1])
2164  models[0] = models[1] = vI(12, 14);
2165 
2166  std::string res = vS(16, 18);
2167  char chain = vC(20);
2168  int seq = vI(22, 26);
2169  char iCode = vC(27);
2170 
2171  for (int modelNr = models[0]; modelNr <= models[1]; ++modelNr)
2172  mUnobs.push_back({ modelNr, res, chain, seq, iCode });
2173  }
2174 
2175  break;
2176  }
2177 
2178  case 470:
2179  {
2180  bool headerSeen = false;
2181  std::regex rx(R"( *MODELS *(\d+)-(\d+))");
2182  int models[2] = { -1, -1 };
2183 
2184  for (; mRec->is("REMARK 470"); GetNextRecord())
2185  {
2186  if (not headerSeen)
2187  {
2188  std::string line = vS(12);
2189  std::smatch m;
2190 
2191  if (std::regex_match(line, m, rx))
2192  {
2193  models[0] = stoi(m[1].str());
2194  models[1] = stoi(m[2].str());
2195  }
2196  else
2197  headerSeen = cif::contains(line, "RES CSSEQI ATOMS");
2198  continue;
2199  }
2200 
2201  if (models[0] == models[1])
2202  models[0] = models[1] = vI(12, 14);
2203 
2204  std::string res = vS(16, 18);
2205  char chain = vC(20);
2206  int seq = vI(21, 24);
2207  char iCode = vC(25);
2208 
2209  std::string atomStr = mRec->vS(29);
2210  auto atoms = cif::split<std::string>(atomStr, " ", true);
2211 
2212  for (int modelNr = models[0]; modelNr <= models[1]; ++modelNr)
2213  mUnobs.push_back({ modelNr, res, chain, seq, iCode, atoms });
2214  }
2215 
2216  break;
2217  }
2218 
2219  case 500:
2220  {
2221  GetNextRecord();
2222 
2223  enum State
2224  {
2225  eStart,
2226  eCCinSAU,
2227  eCC,
2228  eCBL,
2229  eCBA,
2230  eTA,
2231  eCTg,
2232  ePG,
2233  eMCP,
2234  eChC
2235  } state = eStart;
2236  bool headerSeen = false;
2237  int id = 0;
2238 
2239  for (; mRec->is("REMARK 500"); GetNextRecord())
2240  {
2241  std::string line = vS(12);
2242 
2243  if (line == "GEOMETRY AND STEREOCHEMISTRY")
2244  continue;
2245 
2246  switch (state)
2247  {
2248  case eStart:
2249  {
2250  if (line.empty() or not cif::starts_with(line, "SUBTOPIC: "))
2251  continue;
2252 
2253  std::string subtopic = line.substr(10);
2254 
2255  if (subtopic == "CLOSE CONTACTS IN SAME ASYMMETRIC UNIT")
2256  state = eCCinSAU;
2257  else if (subtopic == "CLOSE CONTACTS")
2258  state = eCC;
2259  else if (subtopic == "COVALENT BOND LENGTHS")
2260  state = eCBL;
2261  else if (subtopic == "COVALENT BOND ANGLES")
2262  state = eCBA;
2263  else if (subtopic == "TORSION ANGLES")
2264  state = eTA;
2265  else if (subtopic == "NON-CIS, NON-TRANS")
2266  state = eCTg;
2267  else if (subtopic == "PLANAR GROUPS")
2268  state = ePG;
2269  else if (subtopic == "MAIN CHAIN PLANARITY")
2270  state = eMCP;
2271  else if (subtopic == "CHIRAL CENTERS")
2272  state = eChC;
2273  else if (cif::VERBOSE > 0)
2274  throw std::runtime_error("Unknown subtopic in REMARK 500: " + subtopic);
2275 
2276  headerSeen = false;
2277  id = 0;
2278  break;
2279  }
2280 
2281  case eCCinSAU:
2282  {
2283  if (not headerSeen)
2284  headerSeen =
2285  line == "ATM1 RES C SSEQI ATM2 RES C SSEQI DISTANCE";
2286  else if (line.empty())
2287  state = eStart;
2288  else
2289  {
2290  std::string atom1 = vS(13, 16);
2291  std::string res1 = vS(19, 21);
2292  std::string alt1 = vS(17, 17);
2293  char chain1 = vC(23);
2294  int seq1 = vI(25, 29);
2295  std::string iCode1 = vS(30, 30);
2296 
2297  std::string atom2 = vS(34, 37);
2298  std::string alt2 = vS(38, 38);
2299  std::string res2 = vS(40, 42);
2300  char chain2 = vC(44);
2301  int seq2 = vI(46, 50);
2302  std::string iCode2 = vS(51, 51);
2303 
2304  std::string distance = vF(63, 71);
2305 
2306  getCategory("pdbx_validate_close_contact")->emplace({
2307  { "id", std::to_string(++id) },
2308  { "PDB_model_num", 1 },
2309  { "auth_atom_id_1", atom1 },
2310  { "auth_asym_id_1", std::string{ chain1 } },
2311  { "auth_comp_id_1", res1 },
2312  { "auth_seq_id_1", seq1 },
2313  { "PDB_ins_code_1", iCode1 },
2314  { "label_alt_id_1", alt1 },
2315  { "auth_atom_id_2", atom2 },
2316  { "auth_asym_id_2", std::string{ chain2 } },
2317  { "auth_comp_id_2", res2 },
2318  { "auth_seq_id_2", seq2 },
2319  { "PDB_ins_code_2", iCode2 },
2320  { "label_alt_id_2", alt2 },
2321  { "dist", distance } });
2322  }
2323  break;
2324  }
2325 
2326  case eCC:
2327  {
2328  if (not headerSeen)
2329  headerSeen = line == "ATM1 RES C SSEQI ATM2 RES C SSEQI SSYMOP DISTANCE";
2330  else if (line.empty())
2331  state = eStart;
2332  else
2333  {
2334  std::string atom1 = vS(13, 16);
2335  std::string res1 = vS(19, 21);
2336  char chain1 = vC(23);
2337  int seq1 = vI(25, 29);
2338 
2339  std::string atom2 = vS(34, 37);
2340  std::string res2 = vS(40, 42);
2341  char chain2 = vC(44);
2342  int seq2 = vI(46, 50);
2343 
2344  std::string symop;
2345  try
2346  {
2347  symop = pdb2cifSymmetry(vS(54, 59));
2348  }
2349  catch (const std::exception &ex)
2350  {
2351  if (cif::VERBOSE > 0)
2352  std::cerr << "Dropping REMARK 500 at line " << mRec->mLineNr << " due to invalid symmetry operation" << std::endl;
2353  continue;
2354  }
2355 
2356  std::string distance = vF(63, 71);
2357 
2358  getCategory("pdbx_validate_symm_contact")->emplace({
2359  { "id", std::to_string(++id) },
2360  { "PDB_model_num", 1 },
2361  { "auth_atom_id_1", atom1 },
2362  { "auth_asym_id_1", std::string{ chain1 } },
2363  { "auth_comp_id_1", res1 },
2364  { "auth_seq_id_1", seq1 },
2365  // { "PDB_ins_code_1", "" },
2366  // { "label_alt_id_1", "" },
2367  { "site_symmetry_1", "1_555" },
2368  { "auth_atom_id_2", atom2 },
2369  { "auth_asym_id_2", std::string{ chain2 } },
2370  { "auth_comp_id_2", res2 },
2371  { "auth_seq_id_2", seq2 },
2372  // { "PDB_ins_code_2", "" },
2373  // { "label_alt_id_2", "" },
2374  { "site_symmetry_2", symop },
2375  { "dist", distance } });
2376  }
2377  break;
2378  }
2379 
2380  case eCBL:
2381  {
2382  if (not headerSeen)
2383  {
2384  if (cif::starts_with(line, "FORMAT: ") and line != "FORMAT: (10X,I3,1X,2(A3,1X,A1,I4,A1,1X,A4,3X),1X,F6.3)")
2385  throw std::runtime_error("Unexpected format in REMARK 500");
2386 
2387  headerSeen = line == "M RES CSSEQI ATM1 RES CSSEQI ATM2 DEVIATION";
2388  }
2389  else if (line.empty())
2390  state = eStart;
2391  else
2392  {
2393  int model = vI(11, 13);
2394  std::string resNam1 = vS(15, 17);
2395  std::string chainID1{ vC(19) };
2396  int seqNum1 = vI(20, 23);
2397  std::string iCode1{ vC(24) };
2398  std::string alt1 = vS(30, 30);
2399  std::string atm1 = vS(26, 29);
2400 
2401  std::string resNam2 = vS(33, 35);
2402  std::string chainID2{ vC(37) };
2403  int seqNum2 = vI(38, 41);
2404  std::string iCode2{ vC(42) };
2405  std::string alt2 = vS(48, 48);
2406  std::string atm2 = vS(44, 47);
2407 
2408  std::string deviation = vF(51, 57);
2409 
2410  if (iCode1 == " ")
2411  iCode1.clear();
2412  if (iCode2 == " ")
2413  iCode2.clear();
2414 
2415  getCategory("pdbx_validate_rmsd_bond")->emplace({
2416  { "id", std::to_string(++id) },
2417  { "PDB_model_num", model ? model : 1 },
2418  { "auth_atom_id_1", atm1 },
2419  { "auth_asym_id_1", chainID1 },
2420  { "auth_comp_id_1", resNam1 },
2421  { "auth_seq_id_1", seqNum1 },
2422  { "PDB_ins_code_1", iCode1 },
2423  { "label_alt_id_1", alt1 },
2424  { "auth_atom_id_2", atm2 },
2425  { "auth_asym_id_2", chainID2 },
2426  { "auth_comp_id_2", resNam2 },
2427  { "auth_seq_id_2", seqNum2 },
2428  { "PDB_ins_code_2", iCode2 },
2429  { "label_alt_id_2", alt2 },
2430  { "bond_deviation", deviation } });
2431  }
2432 
2433  break;
2434  }
2435 
2436  case eCBA:
2437  if (not headerSeen)
2438  {
2439  if (cif::starts_with(line, "FORMAT: ") and line != "FORMAT: (10X,I3,1X,A3,1X,A1,I4,A1,3(1X,A4,2X),12X,F5.1)")
2440  throw std::runtime_error("Unexpected format in REMARK 500");
2441 
2442  headerSeen = line == "M RES CSSEQI ATM1 ATM2 ATM3";
2443  }
2444  else if (line.empty())
2445  state = eStart;
2446  else if (vS(64) == "DEGREES")
2447  {
2448  int model = vI(11, 13);
2449  std::string resNam = vS(15, 17);
2450  std::string chainID{ vC(19) };
2451  int seqNum = vI(20, 23);
2452  std::string iCode{ vC(24) };
2453 
2454  if (iCode == " ")
2455  iCode.clear();
2456 
2457  std::string atoms[3] = { vS(27, 30), vS(34, 37), vS(41, 44) };
2458  std::string deviation = vF(57, 62);
2459  if (deviation == "*****")
2460  deviation.clear();
2461 
2462  getCategory("pdbx_validate_rmsd_angle")->emplace({
2463  { "id", std::to_string(++id) },
2464  { "PDB_model_num", model ? model : 1 },
2465  { "auth_atom_id_1", atoms[0] },
2466  { "auth_asym_id_1", chainID },
2467  { "auth_comp_id_1", resNam },
2468  { "auth_seq_id_1", seqNum },
2469  { "PDB_ins_code_1", iCode },
2470  { "auth_atom_id_2", atoms[1] },
2471  { "auth_asym_id_2", chainID },
2472  { "auth_comp_id_2", resNam },
2473  { "auth_seq_id_2", seqNum },
2474  { "PDB_ins_code_2", iCode },
2475  { "auth_atom_id_3", atoms[2] },
2476  { "auth_asym_id_3", chainID },
2477  { "auth_comp_id_3", resNam },
2478  { "auth_seq_id_3", seqNum },
2479  { "PDB_ins_code_3", iCode },
2480  { "angle_deviation", deviation } });
2481  }
2482 
2483  break;
2484 
2485  case eTA:
2486  if (not headerSeen)
2487  {
2488  if (cif::starts_with(line, "FORMAT: ") and line != "FORMAT:(10X,I3,1X,A3,1X,A1,I4,A1,4X,F7.2,3X,F7.2)")
2489  throw std::runtime_error("Unexpected format in REMARK 500");
2490 
2491  headerSeen = line == "M RES CSSEQI PSI PHI";
2492  }
2493  else if (line.empty())
2494  state = eStart;
2495  else
2496  {
2497  int model = vI(11, 13);
2498  std::string resNam = vS(15, 17);
2499  std::string chainID{ vC(19) };
2500  int seqNum = vI(20, 23);
2501  std::string iCode{ vC(24) };
2502 
2503  if (iCode == " ")
2504  iCode.clear();
2505 
2506  std::string psi = vF(27, 35);
2507  std::string phi = vF(37, 45);
2508 
2509  getCategory("pdbx_validate_torsion")->emplace({
2510  { "id", std::to_string(++id) },
2511  { "PDB_model_num", model ? model : 1 },
2512  { "auth_comp_id", resNam },
2513  { "auth_asym_id", chainID },
2514  { "auth_seq_id", seqNum },
2515  { "PDB_ins_code", iCode },
2516  { "phi", phi },
2517  { "psi", psi } });
2518  }
2519  break;
2520 
2521  case eCTg:
2522  if (not headerSeen)
2523  headerSeen = line == "MODEL OMEGA";
2524  else if (line.empty())
2525  state = eStart;
2526  else
2527  {
2528  int model = vI(45, 48);
2529 
2530  std::string resNam1 = vS(12, 14);
2531  std::string chainID1{ vC(16) };
2532  int seqNum1 = vI(17, 21);
2533  std::string iCode1{ vC(22) };
2534 
2535  if (iCode1 == " ")
2536  iCode1.clear();
2537 
2538  std::string resNam2 = vS(27, 29);
2539  std::string chainID2{ vC(31) };
2540  int seqNum2 = vI(32, 36);
2541  std::string iCode2{ vC(37) };
2542 
2543  if (iCode2 == " ")
2544  iCode2.clear();
2545 
2546  std::string omega = vF(54, 60);
2547 
2548  getCategory("pdbx_validate_peptide_omega")->emplace({
2549  { "id", std::to_string(++id) },
2550  { "PDB_model_num", model ? model : 1 },
2551  { "auth_comp_id_1", resNam1 },
2552  { "auth_asym_id_1", chainID1 },
2553  { "auth_seq_id_1", seqNum1 },
2554  { "PDB_ins_code_1", iCode1 },
2555  { "auth_comp_id_2", resNam2 },
2556  { "auth_asym_id_2", chainID2 },
2557  { "auth_seq_id_2", seqNum2 },
2558  { "PDB_ins_code_2", iCode2 },
2559  { "omega", omega } });
2560  }
2561  break;
2562 
2563  case ePG:
2564  if (not headerSeen)
2565  headerSeen = line == "M RES CSSEQI RMS TYPE";
2566  else if (line.empty())
2567  state = eStart;
2568  else
2569  {
2570  int model = vI(11, 13);
2571  std::string resNam = vS(15, 17);
2572  std::string chainID{ vC(19) };
2573  int seqNum = vI(20, 23);
2574  std::string iCode{ vC(24) };
2575 
2576  if (iCode == " ")
2577  iCode.clear();
2578 
2579  std::string rmsd = vF(32, 36);
2580  std::string type = vS(41);
2581 
2582  getCategory("pdbx_validate_planes")->emplace({
2583  { "id", std::to_string(++id) },
2584  { "PDB_model_num", model ? model : 1 },
2585  { "auth_comp_id", resNam },
2586  { "auth_asym_id", chainID },
2587  { "auth_seq_id", seqNum },
2588  { "PDB_ins_code", iCode },
2589  { "rmsd", rmsd },
2590  { "type", type } });
2591  }
2592  break;
2593 
2594  default:
2595  state = eStart;
2596  break;
2597  }
2598  }
2599 
2600  break;
2601  }
2602 
2603  case 610:
2604  {
2605  bool headerSeen = false;
2606 
2607  for (; mRec->is("REMARK 610"); GetNextRecord())
2608  {
2609  if (not headerSeen)
2610  {
2611  std::string line = vS(12);
2612  headerSeen = cif::contains(line, "RES C SSEQI");
2613  continue;
2614  }
2615 
2616  int modelNr = vI(12, 14);
2617  if (modelNr == 0)
2618  modelNr = 1;
2619  std::string res = vS(16, 18);
2620  char chain = vC(20);
2621  int seq = vI(22, 25);
2622  char iCode = vC(26);
2623 
2624  auto compound = cif::compound_factory::instance().create(res);
2625  if (compound == nullptr)
2626  continue;
2627 
2628  std::vector<std::string> atoms;
2629  for (auto atom : compound->atoms())
2630  {
2631  if (atom.type_symbol != cif::H)
2632  atoms.push_back(atom.id);
2633  }
2634 
2635  mUnobs.push_back({ modelNr, res, chain, seq, iCode, { atoms } });
2636  }
2637 
2638  break;
2639  }
2640 
2641  case 800:
2642  {
2643  const std::regex rx1(R"(SITE_IDENTIFIER: (.+))"),
2644  rx2(R"(EVIDENCE_CODE: (.+))"),
2645  rx3(R"(SITE_DESCRIPTION: (binding site for residue ([[:alnum:]]{1,3}) ([[:alnum:]]) (\d+)|.+))", std::regex_constants::icase);
2646 
2647  std::string id, evidence, desc;
2648  std::string pdbxAuthAsymID, pdbxAuthCompID, pdbxAuthSeqID, pdbxAuthInsCode;
2649  std::smatch m;
2650 
2651  enum State
2652  {
2653  sStart,
2654  sID,
2655  sEvidence,
2656  sDesc,
2657  sDesc2
2658  } state = sStart;
2659 
2660  auto store = [&]()
2661  {
2662  // Find the matching SITE record
2663  auto site = FindRecord([id](PDBRecord &r) -> bool
2664  { return r.is("SITE ") and r.vS(12, 14) == id; });
2665 
2666  if (site == nullptr)
2667  throw std::runtime_error("Invalid REMARK 800, no SITE record for id " + id);
2668 
2669  // next record, store what we have
2670  getCategory("struct_site")->emplace({
2671  { "id", id },
2672  { "details", desc },
2673  { "pdbx_auth_asym_id", pdbxAuthAsymID },
2674  { "pdbx_auth_comp_id", pdbxAuthCompID },
2675  { "pdbx_auth_seq_id", pdbxAuthSeqID },
2676  { "pdbx_num_residues", site->vI(16, 17) },
2677  { "pdbx_evidence_code", evidence } });
2678  };
2679 
2680  for (; mRec->is("REMARK 800"); GetNextRecord())
2681  {
2682  std::string s = mRec->vS(12);
2683  if (s.empty())
2684  continue;
2685 
2686  switch (state)
2687  {
2688  case sStart:
2689  if (s == "SITE")
2690  state = sID;
2691  else if (cif::VERBOSE > 0)
2692  throw std::runtime_error("Invalid REMARK 800 record, expected SITE");
2693  break;
2694 
2695  case sID:
2696  if (std::regex_match(s, m, rx1))
2697  {
2698  id = m[1].str();
2699  state = sEvidence;
2700  }
2701  else if (cif::VERBOSE > 0)
2702  throw std::runtime_error("Invalid REMARK 800 record, expected SITE_IDENTIFIER");
2703  break;
2704 
2705  case sEvidence:
2706  if (regex_match(s, m, rx2))
2707  {
2708  evidence = m[1].str();
2709  state = sDesc;
2710  }
2711  else if (cif::VERBOSE > 0)
2712  throw std::runtime_error("Invalid REMARK 800 record, expected SITE_IDENTIFIER");
2713  break;
2714 
2715  case sDesc:
2716  if (regex_match(s, m, rx3))
2717  {
2718  desc = m[1].str();
2719  pdbxAuthCompID = m[2].str();
2720  pdbxAuthAsymID = m[3].str();
2721  pdbxAuthSeqID = m[4].str();
2722 
2723  state = sDesc2;
2724  }
2725  break;
2726 
2727  case sDesc2:
2728  if (regex_match(s, m, rx1))
2729  {
2730  store();
2731 
2732  id = m[1].str();
2733  state = sEvidence;
2734  evidence.clear();
2735  desc.clear();
2736  }
2737  else
2738  desc = desc + ' ' + s;
2739  break;
2740  }
2741  }
2742 
2743  if (not id.empty())
2744  store();
2745 
2746  break;
2747  }
2748 
2749  case 999:
2750  {
2751  std::stringstream s;
2752  GetNextRecord();
2753  if (vS(12) == "SEQUENCE")
2754  GetNextRecord();
2755 
2756  while (mRec->is("REMARK 999"))
2757  {
2758  s << vS(12) << std::endl;
2759  GetNextRecord();
2760  }
2761 
2762  sequenceDetails = s.str();
2763  break;
2764  }
2765 
2766  // these are skipped
2767 
2768  case 2:
2769  case 290:
2770  case 300:
2771  case 620:
2772  GetNextRecord();
2773  break;
2774 
2775  default:
2776  {
2777  std::string skipped = mRec->mName;
2778 
2779  std::stringstream s;
2780 
2781  if (not mRec->vS(11).empty())
2782  s << mRec->vS(11) << std::endl;
2783  GetNextRecord();
2784 
2785  while (mRec->is(skipped.c_str()))
2786  {
2787  s << mRec->vS(11) << std::endl;
2788  GetNextRecord();
2789  }
2790 
2791  getCategory("pdbx_database_remark")->emplace({
2792  { "id", remarkNr },
2793  { "text", s.str() } });
2794 
2795  break;
2796  }
2797  }
2798  }
2799  catch (const std::exception &ex)
2800  {
2801  std::throw_with_nested(std::runtime_error("Error parsing REMARK " + std::to_string(remarkNr)));
2802  }
2803  }
2804 
2805  if (not(compoundDetails.empty() and sequenceDetails.empty() and sourceDetails.empty()))
2806  {
2807  getCategory("pdbx_entry_details")->emplace({
2808  { "entry_id", mStructureID },
2809  { "compound_details", compoundDetails },
2810  { "sequence_details", sequenceDetails },
2811  { "source_details", sourceDetails } });
2812  }
2813 
2814  // store remark 200 info (special case)
2815  if (not mRemark200.empty())
2816  ParseRemark200();
2817 }
2818 
2819 void PDBFileParser::ParseRemark200()
2820 {
2821  auto rm200 = [&](const char *name, int diffrnNr) -> std::string
2822  {
2823  int nr = 0;
2824  std::string result;
2825 
2826  for (auto s : cif::split<std::string>(mRemark200[name], ";"))
2827  {
2828  if (++nr != diffrnNr)
2829  continue;
2830 
2831  cif::trim(s);
2832 
2833  if (s == "NULL")
2834  s.clear();
2835 
2836  result = std::move(s);
2837  break;
2838  }
2839 
2840  return result;
2841  };
2842 
2843  auto inRM200 = [this](std::initializer_list<const char *> s) -> bool
2844  {
2845  bool result = false;
2846 
2847  for (auto *n : s)
2848  {
2849  if (not this->mRemark200[n].empty())
2850  {
2851  result = true;
2852  break;
2853  }
2854  }
2855 
2856  return result;
2857  };
2858 
2859  /*
2860  The category computing is no longer used.
2861 
2862  if (inRM200({"INTENSITY-INTEGRATION SOFTWARE", "DATA SCALING SOFTWARE", "SOFTWARE USED"}) or
2863  not mRefinementSoftware.empty())
2864  getCategory("computing")->emplace({
2865  { "entry_id", mStructureID },
2866  { "pdbx_data_reduction_ii", mRemark200["INTENSITY-INTEGRATION SOFTWARE"] },
2867  { "pdbx_data_reduction_ds", mRemark200["DATA SCALING SOFTWARE"] },
2868  { "structure_solution", mRemark200["SOFTWARE USED"] },
2869  { "structure_refinement", mRefinementSoftware }
2870  });
2871 */
2872 
2873  struct
2874  {
2875  const char *a;
2876  const char *b;
2877  } kSWMap[] = {
2878  { "data reduction", "INTENSITY-INTEGRATION SOFTWARE" },
2879  { "data scaling", "DATA SCALING SOFTWARE" },
2880  { "phasing", "SOFTWARE USED" },
2881  };
2882 
2883  for (auto &sw : kSWMap)
2884  {
2885  if (mRemark200[sw.b].empty())
2886  continue;
2887 
2888  getCategory("software")->emplace({
2889  { "name", mRemark200[sw.b] },
2890  { "classification", sw.a },
2891  { "version", "." },
2892  { "pdbx_ordinal", mNextSoftwareOrd++ } });
2893  }
2894 
2895  std::string scatteringType;
2896  if (mRemark200["EXPERIMENT TYPE"] == "X-RAY DIFFRACTION")
2897  scatteringType = "x-ray";
2898  else if (mRemark200["EXPERIMENT TYPE"] == "NEUTRON DIFFRACTION")
2899  scatteringType = "neutron";
2900 
2901  std::set<std::string> diffrnWaveLengths;
2902 
2903  for (int diffrnNr = 1;; ++diffrnNr)
2904  {
2905  std::string ambientTemp = rm200("TEMPERATURE (KELVIN)", diffrnNr);
2906  if (ambientTemp.empty())
2907  break;
2908 
2909  if (cif::ends_with(ambientTemp, "K"))
2910  ambientTemp.erase(ambientTemp.length() - 1, 1);
2911 
2912  getCategory("diffrn")->emplace({
2913  { "id", diffrnNr },
2914  { "ambient_temp", ambientTemp },
2915  // { "ambient_temp_details", seqID },
2916  { "crystal_id", 1 } });
2917 
2918  std::string collectionDate;
2919  std::error_code ec;
2920  collectionDate = pdb2cifDate(rm200("DATE OF DATA COLLECTION", diffrnNr), ec);
2921  if (ec)
2922  {
2923  if (cif::VERBOSE > 0)
2924  std::cerr << ec.message() << " for pdbx_collection_date" << std::endl;
2925 
2926  // The date field can become truncated when multiple values are available
2927  if (diffrnNr != 1)
2928  collectionDate.clear();
2929  }
2930 
2931  getCategory("diffrn_detector")->emplace({
2932  { "diffrn_id", diffrnNr },
2933  { "detector", rm200("DETECTOR TYPE", diffrnNr) },
2934  { "type", rm200("DETECTOR MANUFACTURER", diffrnNr) },
2935  { "pdbx_collection_date", collectionDate },
2936  { "details", rm200("OPTICS", diffrnNr) } });
2937 
2938  if (inRM200({ "MONOCHROMATIC OR LAUE (M/L)", "MONOCHROMATOR", "DIFFRACTION PROTOCOL" }) or not scatteringType.empty())
2939  getCategory("diffrn_radiation")->emplace({
2940  { "diffrn_id", diffrnNr },
2941  { "wavelength_id", 1 },
2942  { "pdbx_monochromatic_or_laue_m_l", rm200("MONOCHROMATIC OR LAUE (M/L)", diffrnNr) },
2943  { "monochromator", rm200("MONOCHROMATOR", diffrnNr) },
2944  { "pdbx_diffrn_protocol", rm200("DIFFRACTION PROTOCOL", diffrnNr) },
2945  { "pdbx_scattering_type", scatteringType } });
2946 
2947  std::string wl = rm200("WAVELENGTH OR RANGE (A)", diffrnNr);
2948  auto wavelengths = cif::split<std::string>(wl, ", -", true);
2949 
2950  diffrnWaveLengths.insert(wavelengths.begin(), wavelengths.end());
2951 
2952  std::string source;
2953  if (rm200("SYNCHROTRON (Y/N)", diffrnNr) == "Y")
2954  {
2955  getCategory("diffrn_source")->emplace({
2956  { "diffrn_id", diffrnNr },
2957  { "source", "SYNCHROTRON" },
2958  { "type", rm200("RADIATION SOURCE", diffrnNr) + " BEAMLINE " + rm200("BEAMLINE", diffrnNr) },
2959  { "pdbx_synchrotron_site", rm200("RADIATION SOURCE", diffrnNr) },
2960  { "pdbx_synchrotron_beamline", rm200("BEAMLINE", diffrnNr) },
2961 
2962  { "pdbx_wavelength", wavelengths.size() == 1 ? wavelengths[0] : "" },
2963  { "pdbx_wavelength_list", wavelengths.size() == 1 ? "" : cif::join(wavelengths, ", ") },
2964  });
2965  }
2966  else if (inRM200({ "X-RAY GENERATOR MODEL", "RADIATION SOURCE", "BEAMLINE", "WAVELENGTH OR RANGE (A)" }))
2967  {
2968  getCategory("diffrn_source")->emplace({
2969  { "diffrn_id", diffrnNr },
2970  { "source", rm200("RADIATION SOURCE", diffrnNr) },
2971  { "type", rm200("X-RAY GENERATOR MODEL", diffrnNr) },
2972 
2973  { "pdbx_wavelength", wavelengths.size() == 1 ? wavelengths[0] : "" },
2974  { "pdbx_wavelength_list", wavelengths.size() == 1 ? "" : cif::join(wavelengths, ", ") },
2975  });
2976  }
2977  }
2978 
2979  int wavelengthNr = 1;
2980  for (auto wl : diffrnWaveLengths)
2981  {
2982  if (cif::ends_with(wl, "A"))
2983  wl.erase(wl.length() - 1, 1);
2984 
2985  getCategory("diffrn_radiation_wavelength")->emplace({
2986  { "id", wavelengthNr++ },
2987  { "wavelength", wl.empty() ? "." : wl },
2988  { "wt", "1.0" } });
2989  }
2990 
2991  if (inRM200({ "METHOD USED TO DETERMINE THE STRUCTURE", "STARTING MODEL" }))
2992  {
2993  auto cat = getCategory("refine");
2994  assert(cat->empty());
2995 
2996  std::string resolution = mRemark200["RESOLUTION RANGE HIGH (A)"];
2997  if (resolution.empty())
2998  resolution = ".";
2999 
3000  cat->emplace({
3001  { "pdbx_method_to_determine_struct", mRemark200["METHOD USED TO DETERMINE THE STRUCTURE"] },
3002  { "pdbx_starting_model", mRemark200["STARTING MODEL"] },
3003  { "ls_d_res_high", resolution },
3004  { "pdbx_diffrn_id", 1 },
3005  { "pdbx_refine_id", mExpMethod },
3006  { "entry_id", mStructureID } });
3007  }
3008 
3009  if (inRM200({ "REJECTION CRITERIA (SIGMA(I))", "RESOLUTION RANGE HIGH (A)", "RESOLUTION RANGE LOW (A)", "NUMBER OF UNIQUE REFLECTIONS", "COMPLETENESS FOR RANGE (%)", "<I/SIGMA(I)> FOR THE DATA SET", "R MERGE (I)", "R SYM (I)", "DATA REDUNDANCY" }))
3010  {
3011  auto cat = getCategory("reflns");
3012  cat->emplace({
3013  { "entry_id", mStructureID },
3014  { "observed_criterion_sigma_I", mRemark200["REJECTION CRITERIA (SIGMA(I))"] },
3015  { "d_resolution_high", mRemark200["RESOLUTION RANGE HIGH (A)"] },
3016  { "d_resolution_low", mRemark200["RESOLUTION RANGE LOW (A)"] },
3017  { "number_obs", mRemark200["NUMBER OF UNIQUE REFLECTIONS"] },
3018  { "percent_possible_obs", mRemark200["COMPLETENESS FOR RANGE (%)"] },
3019  { "pdbx_netI_over_sigmaI", mRemark200["<I/SIGMA(I)> FOR THE DATA SET"] },
3020  { "pdbx_Rmerge_I_obs", mRemark200["R MERGE (I)"] },
3021  { "pdbx_Rsym_value", mRemark200["R SYM (I)"] },
3022  { "pdbx_redundancy", mRemark200["DATA REDUNDANCY"] },
3023  { "pdbx_ordinal", 1 },
3024  { "pdbx_diffrn_id", 1 }
3025  });
3026  }
3027 
3028  if (inRM200({ "HIGHEST RESOLUTION SHELL, RANGE HIGH (A)" })) // that one field is mandatory...
3029  {
3030  getCategory("reflns_shell")->emplace({
3031  { "d_res_high", mRemark200["HIGHEST RESOLUTION SHELL, RANGE HIGH (A)"] },
3032  { "d_res_low", mRemark200["HIGHEST RESOLUTION SHELL, RANGE LOW (A)"] },
3033  { "percent_possible_all", mRemark200["COMPLETENESS FOR SHELL (%)"] },
3034  { "Rmerge_I_obs", mRemark200["R MERGE FOR SHELL (I)"] },
3035  { "pdbx_Rsym_value", mRemark200["R SYM FOR SHELL (I)"] },
3036  { "meanI_over_sigI_obs", mRemark200["<I/SIGMA(I)> FOR SHELL"] },
3037  { "pdbx_redundancy", mRemark200["DATA REDUNDANCY IN SHELL"] },
3038  { "pdbx_ordinal", 1 },
3039  { "pdbx_diffrn_id", 1 } });
3040  }
3041  else if (inRM200({ "HIGHEST RESOLUTION SHELL, RANGE LOW (A)", "COMPLETENESS FOR SHELL (%)",
3042  "R MERGE FOR SHELL (I)", "R SYM FOR SHELL (I)", "<I/SIGMA(I)> FOR SHELL", "DATA REDUNDANCY IN SHELL" }))
3043  {
3044  if (cif::VERBOSE > 0)
3045  std::cerr << "Not writing reflns_shell record since d_res_high is missing" << std::endl;
3046  }
3047 }
3048 
3049 void PDBFileParser::ParseRemark350()
3050 {
3051  auto saved = mRec;
3052 
3053  enum State
3054  {
3055  eStart,
3056  eInfo,
3057  eAnd,
3058  eApply,
3059  eBioMT
3060  } state = eStart;
3061 
3062  const std::regex
3063  kRX1(R"(BIOMOLECULE: (\d+))"),
3064  kRX2(R"(([^:]+): (.+?)(?: (ANGSTROM\*\*2|KCAL/MOL))?)"),
3065  kRX8(R"(APPLY THE FOLLOWING TO CHAINS: (.+))"),
3066  kRX9(R"(AND CHAINS: (.+))"),
3067  kRX10(R"(BIOMT([123])\s+(\d+)\s+(-?\d+(?:\.\d+)?)\s+(-?\d+(?:\.\d+)?)\s+(-?\d+(?:\.\d+)?)\s+(-?\d+(?:\.\d+)?))");
3068 
3069  int biomolecule = 0, operID = 0;
3070  std::vector<std::string> operExpression;
3071  std::map<std::string, std::string> values;
3072  std::vector<std::string> asymIdList;
3073  std::smatch m;
3074  cif::row_handle genR;
3075 
3076  std::vector<double> mat, vec;
3077 
3078  for (mRec = FindRecord("REMARK 350"); mRec != nullptr and mRec->is("REMARK 350"); GetNextRecord())
3079  {
3080  std::string line = vS(11);
3081 
3082  switch (state)
3083  {
3084  case eStart:
3085  if (regex_match(line, m, kRX1))
3086  {
3087  biomolecule = stoi(m[1].str());
3088  state = eInfo;
3089  }
3090  break;
3091 
3092  case eInfo:
3093  if (regex_match(line, m, kRX8))
3094  {
3095  state = eApply;
3096 
3097  std::string value = m[1].str();
3098 
3099  for (auto chain : cif::split<std::string>(value, ", ", true))
3100  {
3101  if (chain.empty()) // happens when we have a AND CHAIN line
3102  {
3103  state = eAnd;
3104  break;
3105  }
3106 
3107  if (chain.length() != 1)
3108  throw std::runtime_error("Invalid REMARK 350");
3109 
3110  MapChainID2AsymIDS(chain[0], asymIdList);
3111  }
3112  }
3113  else if (regex_match(line, m, kRX2))
3114  values[m[1].str()] = m[2].str();
3115  break;
3116 
3117  case eAnd:
3118  if (regex_match(line, m, kRX9))
3119  {
3120  state = eApply;
3121 
3122  std::string value = m[1].str();
3123 
3124  for (auto chain : cif::split<std::string>(value, ", ", true))
3125  {
3126  if (chain.empty()) // happens when we have another AND CHAIN line
3127  {
3128  state = eAnd;
3129  break;
3130  }
3131 
3132  MapChainID2AsymIDS(chain[0], asymIdList);
3133  }
3134 
3135  continue;
3136  }
3137  // fall through
3138 
3139  case eApply:
3140  if (regex_match(line, m, kRX10))
3141  {
3142  int mt = stoi(m[1].str());
3143  if (mt != 1)
3144  throw std::runtime_error("Invalid REMARK 350");
3145 
3146  operID = stoi(m[2].str());
3147  operExpression.push_back(std::to_string(operID));
3148 
3149  mat.push_back(stod(m[3].str()));
3150  mat.push_back(stod(m[4].str()));
3151  mat.push_back(stod(m[5].str()));
3152  vec.push_back(stod(m[6].str()));
3153  state = eBioMT;
3154  }
3155  break;
3156 
3157  case eBioMT:
3158  if (regex_match(line, m, kRX10))
3159  {
3160  int mt = stoi(m[1].str());
3161 
3162  if (mt == 1)
3163  {
3164  operID = stoi(m[2].str());
3165  operExpression.push_back(std::to_string(operID));
3166  }
3167  else if (operID != stoi(m[2].str()))
3168  throw std::runtime_error("Invalid REMARK 350");
3169 
3170  mat.push_back(stod(m[3].str()));
3171  mat.push_back(stod(m[4].str()));
3172  mat.push_back(stod(m[5].str()));
3173  vec.push_back(stod(m[6].str()));
3174 
3175  if (mt == 3)
3176  {
3177  if (vec.size() != 3 or mat.size() != 9)
3178  throw std::runtime_error("Invalid REMARK 350");
3179 
3180  if (operID == 1)
3181  {
3182  std::string oligomer = values["AUTHOR DETERMINED BIOLOGICAL UNIT"];
3183  if (oligomer.empty())
3184  oligomer = values["SOFTWARE DETERMINED QUATERNARY STRUCTURE"];
3185  to_lower(oligomer);
3186 
3187  int count = 0;
3188  std::smatch m2;
3189 
3190  if (std::regex_match(oligomer, m2, std::regex(R"((\d+)-meric)")))
3191  {
3192  count = stoi(m2[1].str());
3193  }
3194  else if (cif::ends_with(oligomer, "meric"))
3195  {
3196  std::string cs = oligomer.substr(0, oligomer.length() - 5);
3197  if (cs == "mono")
3198  count = 1;
3199  else if (cs == "di")
3200  count = 2;
3201  else if (cs == "tri")
3202  count = 3;
3203  else if (cs == "tetra")
3204  count = 4;
3205  else if (cs == "hexa")
3206  count = 6;
3207  else if (cs == "octa")
3208  count = 8;
3209  else if (cs == "dodeca")
3210  count = 12;
3211  }
3212 
3213  std::string details;
3214  if (values["AUTHOR DETERMINED BIOLOGICAL UNIT"].empty())
3215  {
3216  if (not values["SOFTWARE DETERMINED QUATERNARY STRUCTURE"].empty())
3217  details = "software_defined_assembly";
3218  }
3219  else if (values["SOFTWARE DETERMINED QUATERNARY STRUCTURE"].empty())
3220  details = "author_defined_assembly";
3221  else
3222  details = "author_and_software_defined_assembly";
3223 
3224  getCategory("pdbx_struct_assembly")->emplace({
3225  { "id", biomolecule },
3226  { "details", details },
3227  { "method_details", values["SOFTWARE USED"] },
3228  { "oligomeric_details", oligomer },
3229  { "oligomeric_count", count > 0 ? std::to_string(count) : "" } });
3230 
3231  auto cat = getCategory("pdbx_struct_assembly_prop");
3232 
3233  if (not values["TOTAL BURIED SURFACE AREA"].empty())
3234  cat->emplace({
3235  { "biol_id", biomolecule },
3236  { "type", "ABSA (A^2)" },
3237  { "value", values["TOTAL BURIED SURFACE AREA"] } });
3238 
3239  if (not values["CHANGE IN SOLVENT FREE ENERGY"].empty())
3240  cat->emplace({
3241  { "biol_id", biomolecule },
3242  { "type", "MORE" },
3243  { "value", values["CHANGE IN SOLVENT FREE ENERGY"] } });
3244 
3245  if (not values["SURFACE AREA OF THE COMPLEX"].empty())
3246  cat->emplace({
3247  { "biol_id", biomolecule },
3248  { "type", "SSA (A^2)" },
3249  { "value", values["SURFACE AREA OF THE COMPLEX"] } });
3250 
3251  values.clear();
3252  }
3253 
3254  std::string type = mat == std::vector<double>{ 1, 0, 0, 0, 1, 0, 0, 0, 1 } and vec == std::vector<double>{ 0, 0, 0 } ? "identity operation" : "crystal symmetry operation";
3255 
3256  // if (type == "identity operation")
3257  // {
3258 
3259  // }
3260  // else
3261  try
3262  {
3263  getCategory("pdbx_struct_oper_list")->emplace({
3264  { "id", operID },
3265  { "type", type },
3266  // { "name", "" },
3267  // { "symmetryOperation", "" },
3268  { "matrix[1][1]", cif::format("%12.10f", mat[0]).str() },
3269  { "matrix[1][2]", cif::format("%12.10f", mat[1]).str() },
3270  { "matrix[1][3]", cif::format("%12.10f", mat[2]).str() },
3271  { "vector[1]", cif::format("%12.10f", vec[0]).str() },
3272  { "matrix[2][1]", cif::format("%12.10f", mat[3]).str() },
3273  { "matrix[2][2]", cif::format("%12.10f", mat[4]).str() },
3274  { "matrix[2][3]", cif::format("%12.10f", mat[5]).str() },
3275  { "vector[2]", cif::format("%12.10f", vec[1]).str() },
3276  { "matrix[3][1]", cif::format("%12.10f", mat[6]).str() },
3277  { "matrix[3][2]", cif::format("%12.10f", mat[7]).str() },
3278  { "matrix[3][3]", cif::format("%12.10f", mat[8]).str() },
3279  { "vector[3]", cif::format("%12.10f", vec[2]).str() } });
3280  }
3281  catch (duplicate_key_error &ex)
3282  {
3283  // so what?
3284  }
3285 
3286  mat.clear();
3287  vec.clear();
3288  }
3289  }
3290  else if (regex_match(line, m, kRX1))
3291  {
3292  if (not(vec.empty() and mat.empty()))
3293  throw std::runtime_error("Invalid REMARK 350");
3294 
3295  getCategory("pdbx_struct_assembly_gen")->emplace({
3296  { "assembly_id", biomolecule },
3297  { "oper_expression", cif::join(operExpression, ",") },
3298  { "asym_id_list", cif::join(asymIdList, ",") } });
3299 
3300  biomolecule = stoi(m[1].str());
3301  asymIdList.clear();
3302  operExpression.clear();
3303 
3304  state = eInfo;
3305  }
3306  break;
3307  }
3308  }
3309 
3310  if (not operExpression.empty())
3311  {
3312  getCategory("pdbx_struct_assembly_gen")->emplace({
3313  { "assembly_id", biomolecule },
3314  { "oper_expression", cif::join(operExpression, ",") },
3315  { "asym_id_list", cif::join(asymIdList, ",") } });
3316  }
3317 
3318  mRec = saved;
3319 }
3320 
3321 void PDBFileParser::ParsePrimaryStructure()
3322 {
3323  // First locate the DBREF record. Might be missing
3324  DBREF cur = { mStructureID };
3325 
3326  while (cif::starts_with(mRec->mName, "DBREF"))
3327  {
3328  if (mRec->is("DBREF ")) // 1 - 6 Record name "DBREF "
3329  {
3330  cur.PDBIDCode = vS(8, 11); // 8 - 11 IDcode idCode ID code of this datablock.
3331  cur.chainID = vC(13); // 13 Character chainID Chain identifier.
3332  cur.seqBegin = vI(15, 18); // 15 - 18 Integer seqBegin Initial sequence number of the
3333  // PDB sequence segment.
3334  cur.insertBegin = vC(19); // 19 AChar insertBegin Initial insertion code of the
3335  // PDB sequence segment.
3336  cur.seqEnd = vI(21, 24); // 21 - 24 Integer seqEnd Ending sequence number of the
3337  // PDB sequence segment.
3338  cur.insertEnd = vC(25); // 25 AChar insertEnd Ending insertion code of the
3339  // PDB sequence segment.
3340  cur.database = vS(27, 32); // 27 - 32 LString database Sequence database name.
3341  cur.dbAccession = vS(34, 41); // 34 - 41 LString dbAccession Sequence database accession code.
3342  cur.dbIdCode = vS(43, 54); // 43 - 54 LString dbIdCode Sequence database identification code.
3343  cur.dbSeqBegin = vI(56, 60); // 56 - 60 Integer dbseqBegin Initial sequence number of the
3344  // database seqment.
3345  cur.dbinsBeg = vC(61); // 61 AChar idbnsBeg Insertion code of initial residue of the
3346  // segment, if PDB is the reference.
3347  cur.dbSeqEnd = vI(63, 67); // 63 - 67 Integer dbseqEnd Ending sequence number of the
3348  // database segment.
3349  cur.dbinsEnd = vC(68); // 68 AChar dbinsEnd Insertion code of the ending residue of
3350  // the segment, if PDB is the reference.
3351  auto &chain = GetChainForID(cur.chainID);
3352  chain.mDbref = cur;
3353  }
3354  else if (mRec->is("DBREF1")) // 1 - 6 Record name "DBREF1"
3355  {
3356  cur.PDBIDCode = vS(8, 11); // 8 - 11 IDcode idCode ID code of this datablock.
3357  cur.chainID = vC(13); // 13 Character chainID Chain identifier.
3358  cur.seqBegin = vI(15, 18); // 15 - 18 Integer seqBegin Initial sequence number of the
3359  // PDB sequence segment, right justified.
3360  cur.insertBegin = vC(19); // 19 AChar insertBegin Initial insertion code of the
3361  // PDB sequence segment.
3362  cur.seqEnd = vI(21, 24); // 21 - 24 Integer seqEnd Ending sequence number of the
3363  // PDB sequence segment, right justified.
3364  cur.insertEnd = vC(25); // 25 AChar insertEnd Ending insertion code of the
3365  // PDB sequence segment.
3366  cur.database = vS(27, 32); // 27 - 32 LString database Sequence database name.
3367  cur.dbIdCode = vS(48, 67); // 48 - 67 LString dbIdCode Sequence database identification code,
3368  }
3369  else if (mRec->is("DBREF2")) // 1 - 6 Record name "DBREF2"
3370  { // 8 - 11 IDcode idCode ID code of this datablock.
3371  if (vC(13) != cur.chainID) // 13 Character chainID Chain identifier.
3372  throw std::runtime_error("Chain ID's for DBREF1/DBREF2 records do not match");
3373  cur.dbAccession = vS(19, 40); // 19 - 40 LString dbAccession Sequence database accession code,
3374  // left justified.
3375  cur.dbSeqBegin = vI(46, 55); // 46 - 55 Integer seqBegin Initial sequence number of the
3376  // Database segment, right justified.
3377  cur.dbSeqEnd = vI(58, 67); // 58 - 67 Integer seqEnd Ending sequence number of the
3378  // Database segment, right justified.
3379  auto &chain = GetChainForID(cur.chainID);
3380  chain.mDbref = cur;
3381  }
3382 
3383  GetNextRecord();
3384  }
3385 
3386  // update chains
3387  for (auto &chain : mChains)
3388  {
3389  chain.mNextSeqNum = chain.mDbref.seqBegin;
3390  chain.mNextDbSeqNum = chain.mDbref.dbSeqBegin;
3391  }
3392 
3393  while (mRec->is("SEQADV"))
3394  { // 1 - 6 Record name "SEQADV"
3395  mSeqadvs.push_back({
3396  // 8 - 11 IDcode idCode ID code of this datablock.
3397  vS(13, 15), // 13 - 15 Residue name resName Name of the PDB residue in conflict.
3398  vC(17), // 17 Character chainID PDB chain identifier.
3399  vI(19, 22), // 19 - 22 Integer seqNum PDB sequence number.
3400  vC(23), // 23 AChar iCode PDB insertion code.
3401  vS(25, 28), // 25 - 28 LString database
3402  vS(30, 38), // 30 - 38 LString dbAccession Sequence database accession number.
3403  vS(40, 42), // 40 - 42 Residue name dbRes Sequence database residue name.
3404  vI(44, 48), // 44 - 48 Integer dbSeq Sequence database sequence number.
3405  vS(50, 70) // 50 - 70 LString conflict Conflict comment.
3406  });
3407 
3408  GetNextRecord();
3409  }
3410 
3411  while (mRec->is("SEQRES")) // 1 - 6 Record name "SEQRES"
3412  { // 8 - 10 Integer serNum Serial number of the SEQRES record for the
3413  // current chain. Starts at 1 and increments
3414  // by one each line. Reset to 1 for each chain.
3415  char chainID = vC(12); // 12 Character chainID Chain identifier. This may be any single
3416  // legal character, including a blank which is
3417  // is used if there is only one chain.
3418  int numRes = vI(14, 17); // 14 - 17 Integer numRes Number of residues in the chain.
3419  // This value is repeated on every record.
3420  std::string monomers = vS(20, 70); // 20 - 22 Residue name resName Residue name.
3421  // ...
3422 
3423  auto &chain = GetChainForID(chainID, numRes);
3424 
3425  for (auto monID : cif::split<std::string>(monomers, " ", true))
3426  {
3427  if (monID.empty())
3428  continue;
3429 
3430  chain.mSeqres.push_back({ monID, chain.mNextSeqNum++, ' ', chain.mNextDbSeqNum++ });
3431 
3432  InsertChemComp(monID);
3433  }
3434 
3435  GetNextRecord();
3436  }
3437 
3438  // First pass over MODRES, only store relevant information required in ConstructEntities
3439  while (mRec->is("MODRES")) // 1 - 6 Record name "MODRES"
3440  { // 8 - 11 IDcode idCode ID code of this datablock.
3441  std::string resName = vS(13, 15); // 13 - 15 Residue name resName Residue name used in this datablock.
3442  // char chainID = vC(17); // 17 Character chainID Chain identifier.
3443  // int seqNum = vI(19, 22); // 19 - 22 Integer seqNum Sequence number.
3444  // char iCode = vC(23); // 23 AChar iCode Insertion code.
3445  std::string stdRes = vS(25, 27); // 25 - 27 Residue name stdRes Standard residue name.
3446  // std::string comment = vS(30, 70); // 30 - 70 String comment Description of the residue modification.
3447 
3448  mMod2parent[resName] = stdRes;
3449 
3450  GetNextRecord();
3451  }
3452 }
3453 
3454 void PDBFileParser::ParseHeterogen()
3455 {
3456  while (mRec->is("HET "))
3457  { // 1 - 6 Record name "HET "
3458  std::string hetID = vS(8, 10); // 8 - 10 LString(3) hetID Het identifier, right-justified.
3459  char chainID = vC(13); // 13 Character ChainID Chain identifier.
3460  int seqNum = vI(14, 17); // 14 - 17 Integer seqNum Sequence number.
3461  char iCode = vC(18); // 18 AChar iCode Insertion code.
3462  int numHetAtoms = vI(21, 25); // 21 - 25 Integer numHetAtoms Number of HETATM records for the group
3463  // present in the datablock.
3464  std::string text = vS(31, 70); // 31 - 70 String text Text describing Het group.
3465 
3466  mHets.emplace_back(hetID, chainID, seqNum, iCode, numHetAtoms, text);
3467 
3468  GetNextRecord();
3469  }
3470 
3471  for (;;)
3472  {
3473  if (mRec->is("HETNAM")) // 1 - 6 Record name "HETNAM"
3474  { // 9 - 10 Continuation continuation Allows concatenation of multiple records.
3475  std::string hetID = vS(12, 14); // 12 - 14 LString(3) hetID Het identifier, right-justified.
3476  std::string text = vS(16); // 16 - 70 String text Chemical name.
3477 
3478  mHetnams[hetID] = text;
3479  InsertChemComp(hetID);
3480 
3481  GetNextRecord();
3482  continue;
3483  }
3484 
3485  if (mRec->is("HETSYN")) // 1 - 6 Record name "HETSYN"
3486  { // 9 - 10 Continuation continuation Allows concatenation of multiple records.
3487  std::string hetID = vS(12, 14); // 12 - 14 LString(3) hetID Het identifier, right-justified.
3488  std::string syn = vS(16); // 16 - 70 SList hetSynonyms List of synonyms.
3489 
3490  mHetsyns[hetID] = syn;
3491 
3492  GetNextRecord();
3493  continue;
3494  }
3495 
3496  break;
3497  }
3498 
3499  while (mRec->is("FORMUL")) // 1 - 6 Record name "FORMUL"
3500  { // 9 - 10 Integer compNum Component number.
3501  std::string hetID = vS(13, 15); // 13 - 15 LString(3) hetID Het identifier.
3502  // 17 - 18 Integer continuation Continuation number.
3503  char waterMark = vC(19); // 19 Character asterisk "*" for water.
3504  std::string formula = vS(20); // 20 - 70 String text Chemical formula.
3505 
3506  mFormuls[hetID] = formula;
3507 
3508  if (waterMark == '*')
3509  mWaterHetID = hetID;
3510 
3511  GetNextRecord();
3512  }
3513 }
3514 
3515 void PDBFileParser::ConstructEntities()
3516 {
3517  // We parsed the Primary Structure and Heterogen sections, if available.
3518  // But if we didn't parse anything, we need to fake the data based on residues in ATOM records
3519 
3520  // First iterate all ATOM records and store the residues as found in these records
3521  int modelNr = 1;
3522 
3523  typedef std::map<std::tuple<char, int, char, char>, std::string> CompTypeMap;
3524  CompTypeMap residuesSeen; // used to validate PDB files...
3525 
3526  for (auto r = mData; r != nullptr; r = r->mNext)
3527  {
3528  if (r->is("MODEL "))
3529  {
3530  modelNr = r->vI(11, 14);
3531  if (modelNr != 1)
3532  break;
3533  continue;
3534  }
3535 
3536  if (r->is("ATOM ") or r->is("HETATM")) // 1 - 6 Record name "ATOM "
3537  { // ...
3538  std::string name = r->vS(13, 16); // 13 - 16 Atom name Atom name.
3539  char altLoc = r->vC(17); // 17 Character altLoc Alternate location indicator.
3540  std::string resName = r->vS(18, 20); // 18 - 20 Residue name resName Residue name.
3541  char chainID = r->vC(22); // 22 Character chainID Chain identifier.
3542  int resSeq = r->vI(23, 26); // 23 - 26 Integer resSeq Residue sequence number.
3543  char iCode = r->vC(27); // 27 AChar iCode Code for insertion of residues.
3544 
3545  // first validate, too sad this is required...
3546  CompTypeMap::key_type k = std::make_tuple(chainID, resSeq, iCode, altLoc);
3547  if (residuesSeen.count(k) == 0)
3548  residuesSeen[k] = resName;
3549  else if (residuesSeen[k] != resName)
3550  throw std::runtime_error("inconsistent residue type for " + std::string{ chainID } + std::to_string(resSeq) + iCode + altLoc + "\n" +
3551  " (" + residuesSeen[k] + " != " + resName + ")");
3552 
3553  auto &chain = GetChainForID(chainID);
3554 
3555  PDBChain::AtomRes ar{ resName, resSeq, iCode };
3556 
3557  if ((chain.mResiduesSeen.empty() or chain.mResiduesSeen.back() != ar) and
3558  (cif::compound_factory::instance().is_known_peptide(resName) or cif::compound_factory::instance().is_known_base(resName)))
3559  {
3560  chain.mResiduesSeen.push_back(ar);
3561  }
3562 
3563  // now that we're iterating atoms anyway, clean up the mUnobs array
3564  mUnobs.erase(remove_if(mUnobs.begin(), mUnobs.end(), [=](UNOBS &a)
3565  {
3566  bool result = false;
3567 
3568  if (modelNr == a.modelNr and
3569  resName == a.res and
3570  chainID == a.chain and
3571  resSeq == a.seq and
3572  iCode == a.iCode)
3573  {
3574  auto i = find(a.atoms.begin(), a.atoms.end(), name);
3575  if (i != a.atoms.end())
3576  {
3577  a.atoms.erase(i);
3578  result = a.atoms.empty();
3579  }
3580  }
3581 
3582  return result; }),
3583  mUnobs.end());
3584 
3585  continue;
3586  }
3587 
3588  if (r->is("TER ")) // 1 - 6 Record name "TER "
3589  { // 7 - 11 Integer serial Serial number.
3590  // 18 - 20 Residue name resName Residue name.
3591  char chainID = r->vC(22); // 22 Character chainID Chain identifier.
3592  // 23 - 26 Integer resSeq Residue sequence number.
3593  // 27 AChar iCode Insertion code.
3594  auto &chain = GetChainForID(chainID);
3595  if (chain.mTerIndex == 0) // Is this the first TER record? (Refmac writes out multiple TER records...)
3596  chain.mTerIndex = static_cast<int>(chain.mResiduesSeen.size());
3597  continue;
3598  }
3599  }
3600 
3601  // prune completely empty chains?
3602  mChains.erase(remove_if(mChains.begin(), mChains.end(), [](auto &chain)
3603  { return chain.mResiduesSeen.empty() and chain.mSeqres.empty(); }),
3604  mChains.end());
3605 
3606  for (auto &chain : mChains)
3607  {
3608  if (not(chain.mSeqres.empty() or chain.mResiduesSeen.empty()))
3609  {
3610  // seems safe to assume TER record is at the right location...
3611  // However, some files don't have them at all.
3612  // When mTerIndex == 0 this is most likely the case. Right?
3613 
3614  if (chain.mTerIndex > 0)
3615  chain.mResiduesSeen.erase(chain.mResiduesSeen.begin() + chain.mTerIndex, chain.mResiduesSeen.end());
3616 
3617  int lastResidueIndex = chain.AlignResToSeqRes();
3618 
3619  if (lastResidueIndex > 0 and lastResidueIndex + 1 < static_cast<int>(chain.mResiduesSeen.size()))
3620  {
3621  auto &r = chain.mResiduesSeen[lastResidueIndex + 1];
3622 
3623  if (cif::VERBOSE > 0)
3624  {
3625  std::cerr << "Detected residues that cannot be aligned to SEQRES" << std::endl
3626  << "First residue is " << chain.mDbref.chainID << ':' << r.mSeqNum << r.mIcode << std::endl;
3627  }
3628 
3629  chain.mTerIndex = lastResidueIndex + 1;
3630  }
3631  }
3632  else
3633  {
3634  // So, we did not have a SEQRES for this chain. Try to reconstruct it.
3635  // Problem here is that TER records may be located incorrectly. So
3636  // first lets shift the ter index until it is past the last known
3637  // aminoacid or base.
3638 
3639  for (int ix = chain.mTerIndex; ix < static_cast<int>(chain.mResiduesSeen.size()); ++ix)
3640  {
3641  std::string resName = chain.mResiduesSeen[ix].mMonID;
3642 
3643  if (cif::compound_factory::instance().is_known_peptide(resName) or
3644  cif::compound_factory::instance().is_known_base(resName))
3645  {
3646  chain.mTerIndex = ix + 1;
3647  }
3648 
3649  InsertChemComp(resName);
3650  }
3651 
3652  // And now construct our 'SEQRES'...
3653  for (int ix = 0; ix < chain.mTerIndex; ++ix)
3654  {
3655  auto &ar = chain.mResiduesSeen[ix];
3656  chain.mSeqres.push_back({ ar.mMonID, ar.mSeqNum, ar.mIcode, ar.mSeqNum, true });
3657  }
3658  }
3659  }
3660 
3661  std::set<char> terminatedChains;
3662  std::map<char, int> residuePerChainCounter;
3663 
3664  for (auto r = mData; r != nullptr; r = r->mNext)
3665  {
3666  if (r->is("MODEL "))
3667  {
3668  modelNr = r->vI(11, 14);
3669  if (modelNr != 1)
3670  break;
3671  continue;
3672  }
3673 
3674  if (r->is("ATOM ") or r->is("HETATM"))
3675  { // 1 - 6 Record name "ATOM "
3676  // int serial = r->vI(7, 11); // 7 - 11 Integer serial Atom serial number.
3677  // ...
3678  char altLoc = vC(17); // 17 Character altLoc Alternate location indicator.
3679  std::string resName = r->vS(18, 20); // 18 - 20 Residue name resName Residue name.
3680  char chainID = r->vC(22); // 22 Character chainID Chain identifier.
3681  int resSeq = r->vI(23, 26); // 23 - 26 Integer resSeq Residue sequence number.
3682  char iCode = r->vC(27); // 27 AChar iCode Code for insertion of residues.
3683 
3684  auto &chain = GetChainForID(chainID);
3685 
3686  auto i = find(chain.mSeqres.begin(), chain.mSeqres.end(), PDBSeqRes{ resName, resSeq, iCode });
3687 
3688  // might be a hetero
3689  if (altLoc != ' ' and i == chain.mSeqres.end())
3690  {
3691  i = find_if(chain.mSeqres.begin(), chain.mSeqres.end(),
3692  [resSeq, iCode](const PDBSeqRes &r) -> bool
3693  {
3694  return r.mSeqNum == resSeq and r.mIcode == iCode;
3695  });
3696  }
3697 
3698  if (i != chain.mSeqres.end())
3699  {
3700  i->mSeen = true;
3701  if (i->mMonID != resName)
3702  i->mAlts.insert(resName);
3703  }
3704  else
3705  {
3706  auto &residues = chain.mHet;
3707 
3708  if (residues.empty() or residues.back().mSeqNum != resSeq)
3709  {
3710  i = lower_bound(residues.begin(), residues.end(),
3711  PDBSeqRes{ resName, resSeq, iCode },
3712  [=](const PDBSeqRes &r1, const PDBSeqRes &r2) -> bool
3713  {
3714  return r1.mSeqNum < r2.mSeqNum;
3715  });
3716 
3717  residues.insert(i, { resName, resSeq, iCode, resSeq, true });
3718 
3719  InsertChemComp(resName);
3720  }
3721  }
3722 
3723  int residueCount = (residuePerChainCounter[chainID] += 1);
3724 
3725  // There appears to be a program that writes out HETATM records as ATOM records....
3726  if (not(cif::compound_factory::instance().is_known_peptide(resName) or cif::compound_factory::instance().is_known_base(resName)) or
3727  terminatedChains.count(chainID) or
3728  (chain.mTerIndex > 0 and residueCount >= chain.mTerIndex))
3729  {
3730  if (isWater(resName))
3731  mWaterHetID = resName;
3732 
3733  auto h = find_if(mHets.begin(), mHets.end(), [=](const HET &het) -> bool
3734  { return het.hetID == resName and het.chainID == chainID and
3735  het.seqNum == resSeq and het.iCode == iCode; });
3736 
3737  if (h == mHets.end())
3738  {
3739  mHets.push_back({ resName, chainID, resSeq, iCode, 0 }); // double perhaps, but that does not care
3740  h = prev(mHets.end());
3741  }
3742 
3743  h->atoms.push_back(r);
3744  }
3745 
3746  continue;
3747  }
3748 
3749  if (r->is("TER "))
3750  {
3751  char chainID = r->vC(22); // 22 Character chainID Chain identifier.
3752  terminatedChains.insert(chainID);
3753  }
3754  }
3755 
3756  // Create missing compounds
3757  for (auto &chain : mChains)
3758  {
3759  if (chain.mMolID != 0 or chain.mSeqres.empty())
3760  continue;
3761 
3762  // now this chain may contain the same residues as another one
3763  for (auto &other : mChains)
3764  {
3765  if (&other == &chain or other.mMolID == 0)
3766  continue;
3767 
3768  if (chain.SameSequence(other))
3769  {
3770  chain.mMolID = other.mMolID;
3771  break;
3772  }
3773  }
3774 
3775  if (chain.mMolID != 0)
3776  continue;
3777 
3778  auto &comp = GetOrCreateCompound(mNextMolID++);
3779  comp.mChains.insert(chain.mDbref.chainID);
3780 
3781  chain.mMolID = comp.mMolID;
3782  }
3783 
3784  std::set<std::string> structTitle, structDescription;
3785 
3786  // Create poly_scheme and write pdbx_poly_seq_scheme and create mapping table
3787 
3788  auto cat = getCategory("pdbx_poly_seq_scheme");
3789  int asymNr = 0;
3790  for (auto &chain : mChains)
3791  {
3792  std::string asymID = cif::cif_id_for_number(asymNr++);
3793 
3794  if (mMolID2EntityID.count(chain.mMolID) == 0)
3795  continue;
3796 
3797  std::string entityID = mMolID2EntityID[chain.mMolID];
3798 
3799  mAsymID2EntityID[asymID] = entityID;
3800 
3801  getCategory("struct_asym")->emplace({
3802  { "id", asymID },
3803  { "pdbx_blank_PDB_chainid_flag", chain.mDbref.chainID == ' ' ? "Y" : "N" },
3804  // pdbx_modified
3805  { "entity_id", entityID },
3806  // details
3807  });
3808 
3809  int seqNr = 1;
3810  for (auto &res : chain.mSeqres)
3811  {
3812  mChainSeq2AsymSeq[std::make_tuple(chain.mDbref.chainID, res.mSeqNum, res.mIcode)] = std::make_tuple(asymID, seqNr, true);
3813 
3814  std::string seqID = std::to_string(seqNr);
3815  ++seqNr;
3816 
3817  std::set<std::string> monIds = { res.mMonID };
3818  monIds.insert(res.mAlts.begin(), res.mAlts.end());
3819 
3820  for (std::string monID : monIds)
3821  {
3822  std::string authMonID, authSeqNum, authInsCode{'.'};
3823 
3824  if (res.mSeen)
3825  {
3826  authMonID = monID;
3827  authSeqNum = std::to_string(res.mSeqNum);
3828  if (res.mIcode != ' ' and res.mIcode != 0)
3829  authInsCode = std::string{ res.mIcode };
3830 
3831  cat->emplace({
3832  { "asym_id", asymID },
3833  { "entity_id", mMolID2EntityID[chain.mMolID] },
3834  { "seq_id", seqID },
3835  { "mon_id", monID },
3836  { "ndb_seq_num", seqID },
3837  { "pdb_seq_num", res.mSeqNum },
3838  { "auth_seq_num", authSeqNum },
3839  { "pdb_mon_id", authMonID },
3840  { "auth_mon_id", authMonID },
3841  { "pdb_strand_id", std::string{ chain.mDbref.chainID } },
3842  { "pdb_ins_code", authInsCode },
3843  { "hetero", res.mAlts.empty() ? "n" : "y" } });
3844  }
3845  else
3846  {
3847  if (res.mIcode != ' ' and res.mIcode != 0)
3848  authInsCode = std::string{ res.mIcode } + "A";
3849 
3850  cat->emplace({
3851  { "asym_id", asymID },
3852  { "entity_id", mMolID2EntityID[chain.mMolID] },
3853  { "seq_id", seqID },
3854  { "mon_id", monID },
3855  { "ndb_seq_num", seqID },
3856  { "pdb_seq_num", res.mSeqNum },
3857  { "auth_seq_num", "." },
3858  { "pdb_mon_id", "." },
3859  { "auth_mon_id", "." },
3860  { "pdb_strand_id", std::string{ chain.mDbref.chainID } },
3861  { "pdb_ins_code", authInsCode },
3862  { "hetero", res.mAlts.empty() ? "n" : "y" } });
3863  }
3864  }
3865  }
3866  }
3867 
3868  // We have now created all compounds, write them out
3869  uint32_t structRefID = 0, structRefSeqAlignID = 0;
3870 
3871  for (auto &cmp : mCompounds)
3872  {
3873  ++structRefID;
3874 
3875  std::string srcMethod;
3876 
3877  if (not cmp.mSource["SYNTHETIC"].empty())
3878  {
3879  srcMethod = "syn";
3880 
3881  getCategory("pdbx_entity_src_syn")->emplace({
3882  { "entity_id", mMolID2EntityID[cmp.mMolID] },
3883  { "pdbx_src_id", structRefID },
3884  { "organism_scientific", cmp.mSource["ORGANISM_SCIENTIFIC"] },
3885  { "ncbi_taxonomy_id", cmp.mSource["ORGANISM_TAXID"] },
3886  });
3887  }
3888  else if (cmp.mInfo["ENGINEERED"] == "YES" or
3889  not cmp.mSource["EXPRESSION_SYSTEM"].empty())
3890  {
3891  srcMethod = "man";
3892 
3893  getCategory("entity_src_gen")->emplace({
3894  { "entity_id", mMolID2EntityID[cmp.mMolID] },
3895  { "pdbx_src_id", structRefID },
3896  { "gene_src_common_name", cmp.mSource["ORGANISM_COMMON"] },
3897  { "pdbx_gene_src_gene", cmp.mSource["GENE"] },
3898  { "gene_src_strain", cmp.mSource["STRAIN"] },
3899  { "gene_src_tissue", cmp.mSource["TISSUE"] },
3900  { "gene_src_tissue_fraction", cmp.mSource["TISSUE_FRACTION"] },
3901  { "pdbx_gene_src_cell_line", cmp.mSource["CELL_LINE"] },
3902  { "pdbx_gene_src_organelle", cmp.mSource["ORGANELLE"] },
3903  { "pdbx_gene_src_cell", cmp.mSource["CELL"] },
3904  { "pdbx_gene_src_cellular_location", cmp.mSource["CELLULAR_LOCATION"] },
3905  { "host_org_common_name", cmp.mSource["EXPRESSION_SYSTEM_COMMON"] },
3906  { "pdbx_gene_src_scientific_name", cmp.mSource["ORGANISM_SCIENTIFIC"] },
3907  { "pdbx_gene_src_ncbi_taxonomy_id", cmp.mSource["ORGANISM_TAXID"] },
3908  { "pdbx_host_org_scientific_name", cmp.mSource["EXPRESSION_SYSTEM"] },
3909  { "pdbx_host_org_ncbi_taxonomy_id", cmp.mSource["EXPRESSION_SYSTEM_TAXID"] },
3910  { "pdbx_host_org_strain", cmp.mSource["EXPRESSION_SYSTEM_STRAIN"] },
3911  { "pdbx_host_org_variant", cmp.mSource["EXPRESSION_SYSTEM_VARIANT"] },
3912  { "pdbx_host_org_cell_line", cmp.mSource["EXPRESSION_SYSTEM_CELL_LINE"] },
3913  { "pdbx_host_org_cellular_location", cmp.mSource["EXPRESSION_SYSTEM_CELLULAR_LOCATION"] },
3914  { "pdbx_host_org_vector_type", cmp.mSource["EXPRESSION_SYSTEM_VECTOR_TYPE"] },
3915  { "pdbx_host_org_vector", cmp.mSource["EXPRESSION_SYSTEM_VECTOR"] },
3916  { "pdbx_host_org_gene", cmp.mSource["EXPRESSION_SYSTEM_GENE"] },
3917  { "plasmid_name", cmp.mSource["EXPRESSION_SYSTEM_PLASMID"] },
3918  { "pdbx_description", cmp.mSource["OTHER_DETAILS"] } });
3919  }
3920  else if (not cmp.mSource["ORGANISM_SCIENTIFIC"].empty())
3921  {
3922  srcMethod = "nat";
3923 
3924  getCategory("entity_src_nat")->emplace({
3925  { "entity_id", mMolID2EntityID[cmp.mMolID] },
3926  { "pdbx_src_id", structRefID },
3927  { "common_name", cmp.mSource["ORGANISM_COMMON"] },
3928  { "strain", cmp.mSource["STRAIN"] },
3929  { "pdbx_secretion", cmp.mSource["SECRETION"] },
3930  { "pdbx_organism_scientific", cmp.mSource["ORGANISM_SCIENTIFIC"] },
3931  { "pdbx_ncbi_taxonomy_id", cmp.mSource["ORGANISM_TAXID"] },
3932  { "pdbx_cellular_location", cmp.mSource["CELLULAR_LOCATION"] },
3933  { "pdbx_plasmid_name", cmp.mSource["PLASMID"] },
3934  { "pdbx_organ", cmp.mSource["ORGAN"] },
3935  });
3936  }
3937 
3938  getCategory("entity")->emplace({
3939  { "id", mMolID2EntityID[cmp.mMolID] },
3940  { "type", "polymer" },
3941  { "src_method", srcMethod },
3942  { "pdbx_description", cmp.mInfo["MOLECULE"] },
3943  // { "pdbx_formula_weight", },
3944  { "pdbx_number_of_molecules", cmp.mChains.size() },
3945  { "details", cmp.mInfo["OTHER_DETAILS"] },
3946  { "pdbx_mutation", cmp.mInfo["MUTATION"] },
3947  { "pdbx_fragment", cmp.mInfo["FRAGMENT"] },
3948  { "pdbx_ec", cmp.mInfo["EC"] } });
3949 
3950  if (not cmp.mInfo["SYNONYM"].empty())
3951  {
3952  getCategory("entity_name_com")->emplace({
3953  { "entity_id", mMolID2EntityID[cmp.mMolID] },
3954  { "name", cmp.mInfo["SYNONYM"] } });
3955  }
3956 
3957  std::string desc = cmp.mInfo["MOLECULE"];
3958  if (not cmp.mInfo["EC"].empty())
3959  desc += " (E.C." + cmp.mInfo["EC"] + ")";
3960 
3961  if (not cmp.mTitle.empty())
3962  structTitle.insert(cmp.mTitle);
3963 
3964  if (not desc.empty())
3965  structDescription.insert(desc);
3966 
3967  auto ci = find_if(mChains.begin(), mChains.end(),
3968  [cmp](PDBChain &c) -> bool
3969  { return cmp.mChains.count(c.mDbref.chainID); });
3970 
3971  if (ci != mChains.end() and not ci->mDbref.dbIdCode.empty())
3972  {
3973  getCategory("struct_ref")->emplace({
3974  { "id", structRefID },
3975  { "entity_id", mMolID2EntityID[cmp.mMolID] },
3976  { "db_name", ci->mDbref.database },
3977  { "db_code", ci->mDbref.dbIdCode },
3978  { "pdbx_db_accession", ci->mDbref.dbAccession },
3979  // { "pdbx_align_begin", ci->mDbref.dbSeqBegin }
3980  });
3981  }
3982 
3983  bool nstdMonomer = false, nonstandardLinkage = false;
3984  bool mightBePolyPeptide = true, mightBeDNA = true;
3985 
3986  std::vector<std::string> chains;
3987  std::string seq, seqCan;
3988 
3989  // write out the chains for this compound
3990  for (auto &chain : mChains)
3991  {
3992  if (chain.mMolID != cmp.mMolID)
3993  continue;
3994 
3995  // chain.mEntityID = cmp.mEntityID;
3996 
3997  ++structRefSeqAlignID;
3998  DBREF &dbref = chain.mDbref;
3999 
4000  if (not dbref.database.empty())
4001  {
4002  auto insToStr = [](char i) -> std::string
4003  { return i == ' ' or not isprint(i) ? "" : std::string{ i }; };
4004 
4005  auto &pdbxPolySeqScheme = *getCategory("pdbx_poly_seq_scheme");
4006 
4007  int seqAlignBeg = 0, seqAlignEnd = 0;
4008 
4009  try
4010  {
4011  seqAlignBeg = pdbxPolySeqScheme.find1<int>(key("pdb_strand_id") == std::string { dbref.chainID } and
4012  key("pdb_seq_num") == dbref.seqBegin and
4013  (key("pdb_ins_code") == insToStr(dbref.insertBegin) or key("pdb_ins_code") == cif::null),
4014  "seq_id");
4015 
4016  seqAlignEnd = pdbxPolySeqScheme.find1<int>(key("pdb_strand_id") == std::string { dbref.chainID } and
4017  key("pdb_seq_num") == dbref.seqEnd and
4018  (key("pdb_ins_code") == insToStr(dbref.insertEnd) or key("pdb_ins_code") == cif::null),
4019  "seq_id");
4020  }
4021  catch (...)
4022  {
4023  }
4024 
4025  getCategory("struct_ref_seq")->emplace({
4026  { "align_id", structRefSeqAlignID },
4027  { "ref_id", structRefID },
4028  { "pdbx_PDB_id_code", dbref.PDBIDCode },
4029  { "pdbx_strand_id", std::string{ chain.mDbref.chainID } },
4030  { "seq_align_beg", seqAlignBeg },
4031  { "pdbx_seq_align_beg_ins_code", insToStr(dbref.insertBegin) },
4032  { "seq_align_end", seqAlignEnd },
4033  { "pdbx_seq_align_end_ins_code", insToStr(dbref.insertEnd) },
4034  { "pdbx_db_accession", dbref.dbAccession },
4035  { "db_align_beg", dbref.dbSeqBegin },
4036  { "pdbx_db_align_beg_ins_code", insToStr(dbref.dbinsBeg) },
4037  { "db_align_end", dbref.dbSeqEnd },
4038  { "pdbx_db_align_end_ins_code", insToStr(dbref.dbinsEnd) },
4039  { "pdbx_auth_seq_align_beg", dbref.seqBegin },
4040  { "pdbx_auth_seq_align_end", dbref.seqEnd } });
4041 
4042  // write the struct_ref_seq_dif
4043  for (auto &seqadv : mSeqadvs)
4044  {
4045  if (seqadv.chainID != chain.mDbref.chainID or seqadv.resName.empty())
4046  continue;
4047 
4048  std::string asym, seqNum;
4049  int labelSeq = -1;
4050  std::error_code ec;
4051 
4052  std::tie(asym, labelSeq, std::ignore) = MapResidue(seqadv.chainID, seqadv.seqNum, seqadv.iCode, ec);
4053  if (ec)
4054  {
4055  if (cif::VERBOSE > 0)
4056  std::cerr << "dropping unmatched SEQADV record" << std::endl;
4057  continue;
4058  }
4059 
4060  seqNum = std::to_string(labelSeq);
4061 
4062  getCategory("struct_ref_seq_dif")->emplace({
4063  { "align_id", structRefSeqAlignID },
4064  { "pdbx_PDB_id_code", dbref.PDBIDCode },
4065  { "mon_id", seqadv.resName },
4066  { "pdbx_pdb_strand_id", seqadv.chainID },
4067  { "seq_num", seqNum },
4068  { "pdbx_pdb_ins_code", seqadv.iCode == ' ' ? std::string{} : std::string{ seqadv.iCode } },
4069  { "pdbx_seq_db_name", seqadv.database },
4070  { "pdbx_seq_db_accession_code", seqadv.dbAccession },
4071  { "db_mon_id", seqadv.dbRes },
4072  { "pdbx_seq_db_seq_num", seqadv.dbSeq },
4073  { "details", seqadv.conflict },
4074  { "pdbx_auth_seq_num", seqadv.seqNum },
4075  { "pdbx_ordinal", ++mPdbxDifOrdinal } });
4076  }
4077  }
4078 
4079  if (not chains.empty()) // not the first one for this molID
4080  {
4081  chains.push_back(std::string{ chain.mDbref.chainID });
4082  continue;
4083  }
4084 
4085  chains.push_back(std::string{ chain.mDbref.chainID });
4086 
4087  size_t seqLen = 0, seqCanLen = 0;
4088 
4089  for (auto &res : chain.mSeqres)
4090  {
4091  std::string letter, stdRes;
4092 
4093  if (mMod2parent.count(res.mMonID))
4094  stdRes = mMod2parent.at(res.mMonID);
4095 
4096  if (cif::compound_factory::kAAMap.count(res.mMonID))
4097  {
4098  letter = cif::compound_factory::kAAMap.at(res.mMonID);
4099  mightBeDNA = false;
4100  }
4101  else if (cif::compound_factory::kBaseMap.count(res.mMonID))
4102  {
4103  letter = cif::compound_factory::kBaseMap.at(res.mMonID);
4104  mightBePolyPeptide = false;
4105  }
4106  else
4107  {
4108  nstdMonomer = true;
4109  letter = '(' + res.mMonID + ')';
4110 
4111  // sja...
4112  auto compound = cif::compound_factory::instance().create(stdRes.empty() ? res.mMonID : stdRes);
4113  if (compound != nullptr and
4114  not iequals(compound->type(), "L-peptide linking") and
4115  not iequals(compound->type(), "RNA linking"))
4116  {
4117  nonstandardLinkage = true;
4118  }
4119  }
4120 
4121  if (seqLen + letter.length() > 80)
4122  {
4123  seq += '\n';
4124  seqLen = 0;
4125  }
4126 
4127  seq += letter;
4128  seqLen += letter.length();
4129 
4130  if (letter.length() > 1)
4131  {
4132  if (not stdRes.empty() and cif::compound_factory::kAAMap.count(stdRes))
4133  letter = cif::compound_factory::kAAMap.at(stdRes);
4134  else if (cif::compound_factory::kBaseMap.count(res.mMonID))
4135  letter = cif::compound_factory::kBaseMap.at(res.mMonID);
4136  else
4137  letter = 'X';
4138  }
4139 
4140  if (seqCanLen + letter.length() > 80)
4141  {
4142  seqCan += '\n';
4143  seqCanLen = 0;
4144  }
4145  seqCan += letter;
4146  seqCanLen += letter.length();
4147  }
4148 
4149  auto cat_ps = getCategory("entity_poly_seq");
4150  for (size_t i = 0; i < chain.mSeqres.size(); ++i)
4151  {
4152  auto &rs = chain.mSeqres[i];
4153 
4154  if (std::find(mChemComp.begin(), mChemComp.end(), rs.mMonID) == mChemComp.end())
4155  mChemComp.emplace_back(rs.mMonID);
4156 
4157  cat_ps->emplace({
4158  { "entity_id", mMolID2EntityID[cmp.mMolID] },
4159  { "num", i + 1 },
4160  { "mon_id", rs.mMonID },
4161  { "hetero", rs.mAlts.empty() ? "n" : "y" } });
4162 
4163  for (auto &a : rs.mAlts)
4164  {
4165  cat_ps->emplace({
4166  { "entity_id", mMolID2EntityID[cmp.mMolID] },
4167  { "num", i + 1 },
4168  { "mon_id", a },
4169  { "hetero", "y" } });
4170  }
4171  }
4172  }
4173 
4174  std::string type;
4175  if (mightBePolyPeptide and not mightBeDNA)
4176  type = "polypeptide(L)";
4177  else if (mightBeDNA and not mightBePolyPeptide)
4178  type = "polyribonucleotide";
4179 
4180  getCategory("entity_poly")->emplace({
4181  { "entity_id", mMolID2EntityID[cmp.mMolID] },
4182  { "pdbx_seq_one_letter_code", seq },
4183  { "pdbx_seq_one_letter_code_can", seqCan },
4184  { "nstd_monomer", (nstdMonomer ? "yes" : "no") },
4185  { "pdbx_strand_id", cif::join(chains, ",") },
4186  { "nstd_linkage", nonstandardLinkage ? "yes" : "no" },
4187  { "type", type } });
4188  }
4189 
4190  if (not(structTitle.empty() and structDescription.empty()))
4191  {
4192  getCategory("struct")->emplace({
4193  { "entry_id", mStructureID },
4194  { "title", cif::join(structTitle, ", ") },
4195  { "pdbx_descriptor", cif::join(structDescription, ", ") },
4196  { "pdbx_model_type_details", mModelTypeDetails } });
4197  }
4198 
4199  // build sugar trees first
4200  ConstructSugarTrees(asymNr);
4201 
4202  // done with the sugar, resume operation as before
4203 
4204  std::map<char, std::string> waterChains;
4205  std::map<std::tuple<std::string, std::string>, int> ndbSeqNum; // for nonpoly scheme
4206  std::map<std::string,int> entityAuthSeqNum; // for nonpoly scheme too
4207 
4208  for (size_t i = 0; i < mHets.size(); ++i)
4209  {
4210  auto &heti = mHets[i];
4211 
4212  if (not heti.asymID.empty())
4213  continue;
4214 
4215  if (heti.hetID == mWaterHetID or isWater(heti.hetID))
4216  continue;
4217 
4218  // See if this residue is part of SEQRES
4219  auto &chain = GetChainForID(heti.chainID);
4220  auto ih = find(chain.mSeqres.begin(), chain.mSeqres.end(), PDBSeqRes{ heti.hetID, heti.seqNum, heti.iCode });
4221 
4222  // If so, skip it, it is not an entity then
4223  if (ih != chain.mSeqres.end())
4224  continue;
4225 
4226  heti.asymID = cif::cif_id_for_number(asymNr++);
4227  }
4228 
4229  std::set<std::string> writtenAsyms;
4230 
4231  std::map<std::string, int> hetCount; // for pdbx_number_of_molecules
4232  for (auto &het : mHets)
4233  hetCount[het.hetID] += 1;
4234 
4235  for (auto &het : mHets)
4236  {
4237  std::string hetID = het.hetID;
4238 
4239  auto &chain = GetChainForID(het.chainID);
4240 
4241  // See if this residue is part of SEQRES
4242  auto i = find(chain.mSeqres.begin(), chain.mSeqres.end(), PDBSeqRes{ hetID, het.seqNum, het.iCode });
4243 
4244  // If so, skip it, it is not an entity then
4245  if (i != chain.mSeqres.end())
4246  continue;
4247 
4248  // See if we've already added it to the entities
4249  if (mHet2EntityID.count(hetID) == 0)
4250  {
4251  std::string entityID = std::to_string(mNextEntityNr++);
4252  mHet2EntityID[hetID] = entityID;
4253 
4254  if (hetID == mWaterHetID)
4255  {
4256  getCategory("entity")->emplace({
4257  { "id", entityID },
4258  { "type", "water" },
4259  { "src_method", "nat" },
4260  { "pdbx_description", "water" },
4261  { "pdbx_number_of_molecules", hetCount[hetID] } });
4262  }
4263  else
4264  {
4265  if (mHetnams[hetID].empty())
4266  {
4267  auto compound = cif::compound_factory::instance().create(hetID);
4268  if (compound != nullptr)
4269  mHetnams[hetID] = compound->name();
4270  }
4271 
4272  getCategory("entity")->emplace({
4273  { "id", entityID },
4274  { "type", "non-polymer" },
4275  { "src_method", "syn" },
4276  { "pdbx_description", mHetnams[hetID] },
4277  { "details", mHetsyns[hetID] },
4278  { "pdbx_number_of_molecules", hetCount[hetID] } });
4279  }
4280 
4281  // write a pdbx_entity_nonpoly record
4282  std::string name = mHetnams[hetID];
4283  if (name.empty() and hetID == mWaterHetID)
4284  name = "water";
4285  getCategory("pdbx_entity_nonpoly")->emplace({
4286  { "entity_id", entityID },
4287  { "name", name },
4288  { "comp_id", hetID } });
4289  }
4290 
4291  // create an asym for this het/chain combo, if needed
4292 
4293  std::string asymID = het.asymID;
4294 
4295  auto k = std::make_tuple(het.chainID, het.seqNum, het.iCode);
4296  if (mChainSeq2AsymSeq.count(k) == 0)
4297  {
4298  if (hetID == mWaterHetID or isWater(hetID))
4299  {
4300  if (waterChains.count(het.chainID) == 0)
4301  {
4302  asymID = cif::cif_id_for_number(asymNr++);
4303  waterChains[het.chainID] = asymID;
4304  }
4305  else
4306  asymID = waterChains[het.chainID];
4307  }
4308  else
4309  asymID = het.asymID;
4310 
4311  assert(asymID.empty() == false);
4312 
4313  mAsymID2EntityID[asymID] = mHet2EntityID[hetID];
4314 
4315  // NOTE, a nonpoly residue has no label_seq_id
4316  // but in pdbx_nonpoly_scheme there is such a number.
4317  // Since this number is not used anywhere else we
4318  // just use it here and do not store it in the table
4319  mChainSeq2AsymSeq[k] = std::make_tuple(asymID, 0, false);
4320 
4321  if (writtenAsyms.count(asymID) == 0)
4322  {
4323  writtenAsyms.insert(asymID);
4324  getCategory("struct_asym")->emplace({
4325  { "id", asymID },
4326  { "pdbx_blank_PDB_chainid_flag", het.chainID == ' ' ? "Y" : "N" },
4327  // pdbx_modified
4328  { "entity_id", mHet2EntityID[hetID] },
4329  // details
4330  });
4331  }
4332  }
4333 
4334  int seqNr = ++ndbSeqNum[std::make_tuple(hetID, asymID)];
4335  int authSeqNr = ++entityAuthSeqNum[hetID];
4336 
4337  std::string iCode{ het.iCode };
4338  cif::trim(iCode);
4339  if (iCode.empty())
4340  iCode = { '.' };
4341 
4342  getCategory("pdbx_nonpoly_scheme")->emplace({
4343  { "asym_id", asymID },
4344  { "entity_id", mHet2EntityID[hetID] },
4345  { "mon_id", hetID },
4346  { "ndb_seq_num", seqNr },
4347  { "pdb_seq_num", het.seqNum },
4348  { "auth_seq_num", authSeqNr }, // Yes
4349  { "pdb_mon_id", hetID },
4350  { "auth_mon_id", hetID },
4351  { "pdb_strand_id", std::string{ het.chainID } },
4352  { "pdb_ins_code", iCode } });
4353 
4354  // mapping needed?
4355  mChainSeq2AsymSeq[std::make_tuple(het.chainID, het.seqNum, het.iCode)] = std::make_tuple(asymID, seqNr, false);
4356  }
4357 
4358  int modResID = 1;
4359  std::set<std::string> modResSet;
4360  for (auto rec = FindRecord("MODRES"); rec != nullptr and rec->is("MODRES");
4361  rec = rec->mNext) // 1 - 6 Record name "MODRES"
4362  { // 8 - 11 IDcode idCode ID code of this datablock.
4363  std::string resName = rec->vS(13, 15); // 13 - 15 Residue name resName Residue name used in this datablock.
4364  char chainID = rec->vC(17); // 17 Character chainID Chain identifier.
4365  int seqNum = rec->vI(19, 22); // 19 - 22 Integer seqNum Sequence number.
4366  char iCode = rec->vC(23); // 23 AChar iCode Insertion code.
4367  std::string stdRes = rec->vS(25, 27); // 25 - 27 Residue name stdRes Standard residue name.
4368  std::string comment = rec->vS(30, 70); // 30 - 70 String comment Description of the residue modification.
4369 
4370  std::string asymID;
4371  int seq;
4372  std::error_code ec;
4373 
4374  std::tie(asymID, seq, std::ignore) = MapResidue(chainID, seqNum, iCode, ec);
4375  if (ec) // no need to write a modres if it could not be found
4376  {
4377  if (cif::VERBOSE > 0)
4378  std::cerr << "dropping unmapped MODRES record" << std::endl;
4379  continue;
4380  }
4381 
4382  getCategory("pdbx_struct_mod_residue")->emplace({
4383  { "id", modResID++ },
4384  { "label_asym_id", asymID },
4385  { "label_seq_id", seq },
4386  { "label_comp_id", resName },
4387  { "auth_asym_id", std::string(1, chainID) },
4388  { "auth_seq_id", seqNum },
4389  { "auth_comp_id", resName },
4390  { "PDB_ins_code", iCode == ' ' ? "" : std::string{ iCode } },
4391  { "parent_comp_id", stdRes },
4392  { "details", comment }
4393  });
4394 
4395  modResSet.insert(resName);
4396  }
4397 
4398  // // chem compounds
4399 
4400  for (auto cc : mChemComp)
4401  {
4402  auto compound = cif::compound_factory::instance().create(
4403  mMod2parent.count(cc) ? mMod2parent[cc] : cc);
4404 
4405  std::string name;
4406  std::string formula;
4407  std::string type;
4408  std::string nstd = ".";
4409  std::string formulaWeight;
4410 
4411  if (compound != nullptr)
4412  {
4413  name = compound->name();
4414  type = compound->type();
4415 
4416  if (iequals(type, "L-peptide linking") or iequals(type, "peptide linking"))
4417  nstd = "y";
4418 
4419  formula = compound->formula();
4420  formulaWeight = std::to_string(compound->formula_weight());
4421  }
4422 
4423  if (name.empty())
4424  name = mHetnams[cc];
4425 
4426  if (type.empty())
4427  type = "NON-POLYMER";
4428 
4429  if (formula.empty())
4430  {
4431  formula = mFormuls[cc];
4432 
4433  const std::regex rx(R"(\d+\((.+)\))");
4434  std::smatch m;
4435  if (std::regex_match(formula, m, rx))
4436  formula = m[1].str();
4437  }
4438 
4439  if (modResSet.count(cc))
4440  nstd = "n";
4441 
4442  getCategory("chem_comp")->emplace({
4443  { "id", cc },
4444  { "name", name },
4445  { "formula", formula },
4446  { "formula_weight", formulaWeight },
4447  { "mon_nstd_flag", nstd },
4448  { "type", type }
4449  });
4450  }
4451 
4452  getCategory("chem_comp")->reorder_by_index();
4453 
4454  // unobserved can now be written as well
4455 
4456  int idRes = 0, idAtom = 0;
4457  sort(mUnobs.begin(), mUnobs.end(), [](const UNOBS &a, const UNOBS &b) -> bool
4458  {
4459  int d = a.modelNr - b.modelNr;
4460  if (d == 0)
4461  d = a.seq - b.seq;
4462  return d < 0; });
4463 
4464  for (auto &unobs : mUnobs)
4465  {
4466  bool isPolymer = false;
4467  std::string asymID, compID = unobs.res;
4468  int seqNr = 0;
4469  std::error_code ec;
4470 
4471  std::tie(asymID, seqNr, isPolymer) = MapResidue(unobs.chain, unobs.seq, unobs.iCode, ec);
4472  if (ec)
4473  {
4474  if (cif::VERBOSE > 0)
4475  std::cerr << "error mapping unobserved residue" << std::endl;
4476  continue;
4477  }
4478 
4479  if (unobs.atoms.empty())
4480  {
4481  getCategory("pdbx_unobs_or_zero_occ_residues")->emplace({
4482  { "id", std::to_string(++idRes) },
4483  { "polymer_flag", isPolymer ? "Y" : "N" },
4484  { "occupancy_flag", 1 },
4485  { "PDB_model_num", unobs.modelNr ? unobs.modelNr : 1 },
4486  { "auth_asym_id", std::string{ unobs.chain } },
4487  { "auth_comp_id", unobs.res },
4488  { "auth_seq_id", unobs.seq },
4489  { "PDB_ins_code", unobs.iCode == ' ' ? "" : std::string{ unobs.iCode } },
4490  { "label_asym_id", asymID },
4491  { "label_comp_id", compID }, // TODO: change to correct comp_id
4492  { "label_seq_id", seqNr > 0 ? std::to_string(seqNr) : "" } });
4493  }
4494  else
4495  {
4496  for (auto &atom : unobs.atoms)
4497  {
4498  getCategory("pdbx_unobs_or_zero_occ_atoms")->emplace({
4499  { "id", std::to_string(++idAtom) },
4500  { "polymer_flag", isPolymer ? "Y" : "N" },
4501  { "occupancy_flag", 1 },
4502  { "PDB_model_num", unobs.modelNr ? unobs.modelNr : 1 },
4503  { "auth_asym_id", std::string{ unobs.chain } },
4504  { "auth_comp_id", unobs.res },
4505  { "auth_seq_id", unobs.seq },
4506  { "PDB_ins_code", unobs.iCode == ' ' ? "" : std::string{ unobs.iCode } },
4507  { "auth_atom_id", atom },
4508  { "label_asym_id", asymID },
4509  { "label_comp_id", compID }, // TODO: change to correct comp_id
4510  { "label_seq_id", seqNr > 0 ? std::to_string(seqNr) : "" },
4511  { "label_atom_id", atom } });
4512  }
4513  }
4514  }
4515 }
4516 
4517 void PDBFileParser::ConstructSugarTrees(int &asymNr)
4518 {
4519  for (;;)
4520  {
4521  // find a first NAG/NDG
4522  auto si = std::find_if(mHets.begin(), mHets.end(), [](const HET &h)
4523  { return (h.hetID == "NAG" or h.hetID == "NDG") and not(h.processed or h.branch); });
4524  if (si != mHets.end())
4525  {
4526  si->processed = true;
4527 
4528  // take the location of the C1 atom(s?)
4529  std::set<char> ci;
4530 
4531  for (auto a : si->atoms)
4532  {
4533  std::string name = a->vS(13, 16); // 13 - 16 Atom name Atom name.
4534 
4535  if (name != "C1")
4536  continue;
4537 
4538  ci.insert(a->vC(17)); // 17 Character altLoc Alternate location indicator.
4539  }
4540 
4541  if (ci.empty())
4542  continue;
4543 
4544  for (auto alt : ci)
4545  {
4546  ATOM_REF c1{ "C1", si->hetID, si->seqNum, si->chainID, si->iCode, alt };
4547 
4548  const auto &[asn, linked] = FindLink(c1, "ND2", "ASN");
4549  if (not linked)
4550  continue;
4551 
4552  std::stack<ATOM_REF> c1s;
4553  c1s.push(c1);
4554 
4555  SUGAR_TREE sugarTree;
4556  sugarTree.push_back({ c1 });
4557 
4558  // naive implementation
4559  while (not c1s.empty())
4560  {
4561  c1 = c1s.top();
4562  c1s.pop();
4563 
4564  for (auto o : { "O1", "O2", "O3", "O4", "O5", "O6" })
4565  {
4566  ATOM_REF leaving = c1;
4567  leaving.name = o;
4568 
4569  const auto &[nc1, linked_c1] = FindLink(leaving, "C1");
4570  if (linked_c1)
4571  {
4572  sugarTree.push_back({ nc1, o[1] - '0', c1 });
4573  c1s.push(nc1);
4574  }
4575  }
4576  }
4577 
4578  if (sugarTree.size() < 2) // not really a tree
4579  continue;
4580 
4581  auto branchName = sugarTree.entityName();
4582  auto entityID = mBranch2EntityID[branchName];
4583 
4584  // See if we've already added it to the entities
4585  if (entityID.empty())
4586  {
4587  entityID = std::to_string(mNextEntityNr++);
4588  mBranch2EntityID[branchName] = entityID;
4589 
4590  getCategory("entity")->emplace({
4591  { "id", entityID },
4592  { "type", "branched" },
4593  { "src_method", "man" },
4594  { "pdbx_description", branchName } });
4595 
4596  getCategory("pdbx_entity_branch")->emplace({
4597  { "entity_id", entityID },
4598  { "type", "oligosaccharide" } });
4599 
4600  int num = 0;
4601  std::map<ATOM_REF, int> branch_list;
4602 
4603  for (auto &s : sugarTree)
4604  {
4605  getCategory("pdbx_entity_branch_list")->emplace({
4606  { "entity_id", entityID },
4607  { "comp_id", s.c1.resName },
4608  { "num", ++num },
4609  { "hetero", ci.size() == 1 ? "n" : "y" } });
4610 
4611  branch_list[s.c1] = num;
4612  }
4613 
4614  auto &branch_link = *getCategory("pdbx_entity_branch_link");
4615 
4616  for (auto &s : sugarTree)
4617  {
4618  if (s.leaving_o == 0)
4619  continue;
4620 
4621  branch_link.emplace({
4622  { "link_id", branch_link.size() + 1 },
4623  { "entity_id", entityID },
4624  { "entity_branch_list_num_1", branch_list[s.c1] },
4625  { "comp_id_1", s.c1.resName },
4626  { "atom_id_1", s.c1.name },
4627  { "leaving_atom_id_1", "O1" },
4628  { "entity_branch_list_num_2", branch_list[s.next] },
4629  { "comp_id_2", s.next.resName },
4630  { "atom_id_2", "O" + std::to_string(s.leaving_o) },
4631  { "leaving_atom_id_2", "HO" + std::to_string(s.leaving_o) },
4632  { "value_order", "sing" }
4633  });
4634  }
4635  }
4636 
4637  mSugarEntities.insert(entityID);
4638 
4639  // create an asym for this sugar tree
4640 
4641  std::string asymID = cif::cif_id_for_number(asymNr++);
4642 
4643  mAsymID2EntityID[asymID] = entityID;
4644 
4645  getCategory("struct_asym")->emplace({
4646  { "id", asymID },
4647  { "pdbx_blank_PDB_chainid_flag", si->chainID == ' ' ? "Y" : "N" },
4648  { "pdbx_modified", "N" },
4649  { "entity_id", entityID } });
4650 
4651  std::string iCode{ si->iCode };
4652  cif::trim(iCode);
4653  if (iCode.empty())
4654  iCode = { '.' };
4655 
4656  int num = 0;
4657  for (auto s : sugarTree)
4658  {
4659  getCategory("pdbx_branch_scheme")->emplace({
4660  { "asym_id", asymID },
4661  { "entity_id", entityID },
4662  { "mon_id", s.c1.resName },
4663  { "num", ++num },
4664  { "pdb_asym_id", asymID },
4665  { "pdb_mon_id", s.c1.resName },
4666  { "pdb_seq_num", num },
4667  { "auth_asym_id", std::string{ s.c1.chainID } },
4668  { "auth_mon_id", s.next.resName },
4669  { "auth_seq_num", s.c1.resSeq },
4670  { "hetero", ci.size() == 1 ? "n" : "y" } });
4671 
4672  auto k = std::make_tuple(s.c1.chainID, s.c1.resSeq, s.c1.iCode);
4673  assert(mChainSeq2AsymSeq.count(k) == 0);
4674 
4675  mChainSeq2AsymSeq[k] = std::make_tuple(asymID, num, false);
4676 
4677  // mark all hets as part of tree
4678 
4679  for (auto &h : mHets)
4680  {
4681  if (h.hetID == s.c1.resName and h.chainID == s.c1.chainID and h.seqNum == s.c1.resSeq and h.iCode == s.c1.iCode)
4682  {
4683  h.branch = true;
4684  break; // should be only one of course... right?
4685  }
4686  }
4687  }
4688 
4689  break;
4690  }
4691 
4692  continue;
4693  }
4694 
4695  break;
4696  }
4697 
4698  // remove the branched HET's
4699  mHets.erase(std::remove_if(mHets.begin(), mHets.end(), [](auto &h)
4700  { return h.branch; }),
4701  mHets.end());
4702 }
4703 
4704 void PDBFileParser::ParseSecondaryStructure()
4705 {
4706  bool firstHelix = true;
4707 
4708  while (mRec->is("HELIX "))
4709  {
4710  // 1 - 6 Record name "HELIX "
4711  // 8 - 10 Integer serNum Serial number of the helix. This starts
4712  // at 1 and increases incrementally.
4713  // 12 - 14 LString(3) helixID Helix identifier. In addition to a serial
4714  // number, each helix is given an
4715  // alphanumeric character helix identifier.
4716  // 16 - 18 Residue name initResName Name of the initial residue.
4717  // 20 Character initChainID Chain identifier for the chain containing
4718  // this helix.
4719  // 22 - 25 Integer initSeqNum Sequence number of the initial residue.
4720  // 26 AChar initICode Insertion code of the initial residue.
4721  // 28 - 30 Residue name endResName Name of the terminal residue of the helix.
4722  // 32 Character endChainID Chain identifier for the chain containing
4723  // this helix.
4724  // 34 - 37 Integer endSeqNum Sequence number of the terminal residue.
4725  // 38 AChar endICode Insertion code of the terminal residue.
4726  // 39 - 40 Integer helixClass Helix class (see below).
4727  // 41 - 70 String comment Comment about this helix.
4728  // 72 - 76 Integer length Length of this helix.
4729 
4730  std::string begAsymID, endAsymID;
4731  int begSeq, endSeq;
4732  std::error_code ec;
4733 
4734  std::tie(begAsymID, begSeq, std::ignore) = MapResidue(vC(20), vI(22, 25), vC(26), ec);
4735  if (not ec)
4736  std::tie(endAsymID, endSeq, std::ignore) = MapResidue(vC(32), vI(34, 37), vC(38), ec);
4737 
4738  if (ec)
4739  {
4740  if (cif::VERBOSE > 0)
4741  std::cerr << "Could not map residue for HELIX " << vI(8, 10) << std::endl;
4742  }
4743  else
4744  {
4745  auto cat = getCategory("struct_conf");
4746  cat->emplace({
4747  { "conf_type_id", "HELX_P" },
4748  { "id", "HELX_P" + std::to_string(vI(8, 10)) },
4749  { "pdbx_PDB_helix_id", vS(12, 14) },
4750  { "beg_label_comp_id", vS(16, 18) },
4751  { "beg_label_asym_id", begAsymID },
4752  { "beg_label_seq_id", begSeq },
4753  { "pdbx_beg_PDB_ins_code", vS(26, 26) },
4754  { "end_label_comp_id", vS(28, 30) },
4755  { "end_label_asym_id", endAsymID },
4756  { "end_label_seq_id", endSeq },
4757  { "pdbx_end_PDB_ins_code", vS(38, 38) },
4758 
4759  { "beg_auth_comp_id", vS(16, 18) },
4760  { "beg_auth_asym_id", vS(20, 20) },
4761  { "beg_auth_seq_id", vI(22, 25) },
4762  { "end_auth_comp_id", vS(28, 30) },
4763  { "end_auth_asym_id", vS(32, 32) },
4764  { "end_auth_seq_id", vI(34, 37) },
4765 
4766  { "pdbx_PDB_helix_class", vS(39, 40) },
4767  { "details", vS(41, 70) },
4768  { "pdbx_PDB_helix_length", vI(72, 76) } });
4769 
4770  if (firstHelix)
4771  {
4772  cat = getCategory("struct_conf_type");
4773  cat->emplace({
4774  { "id", "HELX_P" } });
4775  firstHelix = false;
4776  }
4777  }
4778 
4779  GetNextRecord();
4780  }
4781 
4782  std::set<std::string> sheetsSeen;
4783  int rangeID = 1;
4784 
4785  while (mRec->is("SHEET "))
4786  {
4787  // 1 - 6 Record name "SHEET "
4788  // 8 - 10 Integer strand Strand number which starts at 1 for each
4789  // strand within a sheet and increases by one.
4790  // 12 - 14 LString(3) sheetID Sheet identifier.
4791  // 15 - 16 Integer numStrands Number of strands in sheet.
4792  // 18 - 20 Residue name initResName Residue name of initial residue.
4793  // 22 Character initChainID Chain identifier of initial residue
4794  // in strand.
4795  // 23 - 26 Integer initSeqNum Sequence number of initial residue
4796  // in strand.
4797  // 27 AChar initICode Insertion code of initial residue
4798  // in strand.
4799  // 29 - 31 Residue name endResName Residue name of terminal residue.
4800  // 33 Character endChainID Chain identifier of terminal residue.
4801  // 34 - 37 Integer endSeqNum Sequence number of terminal residue.
4802  // 38 AChar endICode Insertion code of terminal residue.
4803  // 39 - 40 Integer sense Sense of strand with respect to previous
4804  // strand in the sheet. 0 if first strand,
4805  // 1 if parallel,and -1 if anti-parallel.
4806  // 42 - 45 Atom curAtom Registration. Atom name in current strand.
4807  // 46 - 48 Residue name curResName Registration. Residue name in current strand
4808  // 50 Character curChainID Registration. Chain identifier in
4809  // current strand.
4810  // 51 - 54 Integer curResSeq Registration. Residue sequence number
4811  // in current strand.
4812  // 55 AChar curICode Registration. Insertion code in
4813  // current strand.
4814  // 57 - 60 Atom prevAtom Registration. Atom name in previous strand.
4815  // 61 - 63 Residue name prevResName Registration. Residue name in
4816  // previous strand.
4817  // 65 Character prevChainID Registration. Chain identifier in
4818  // previous strand.
4819  // 66 - 69 Integer prevResSeq Registration. Residue sequence number
4820  // in previous strand.
4821  // 70 AChar prevICode Registration. Insertion code in
4822  // previous strand.
4823 
4824  std::string sheetID = cif::trim_copy(vS(12, 14));
4825  if (sheetsSeen.count(sheetID) == 0)
4826  {
4827  sheetsSeen.insert(sheetID);
4828 
4829  rangeID = 1;
4830 
4831  getCategory("struct_sheet")->emplace({
4832  { "id", sheetID },
4833  { "number_strands", vI(15, 16) },
4834  });
4835  }
4836 
4837  int sense = vI(39, 40);
4838 
4839  if (sense != 0)
4840  {
4841  getCategory("struct_sheet_order")->emplace({
4842  { "sheet_id", sheetID },
4843  { "range_id_1", rangeID },
4844  { "range_id_2", rangeID + 1 },
4845  { "sense", sense == -1 ? "anti-parallel" : "parallel" } });
4846  }
4847 
4848  std::string begAsymID, endAsymID;
4849  int begSeq, endSeq;
4850  std::error_code ec;
4851 
4852  std::tie(begAsymID, begSeq, std::ignore) = MapResidue(vC(22), vI(23, 26), vC(27), ec);
4853  if (not ec)
4854  std::tie(endAsymID, endSeq, std::ignore) = MapResidue(vC(33), vI(34, 37), vC(38), ec);
4855 
4856  if (ec)
4857  {
4858  if (cif::VERBOSE > 0)
4859  std::cerr << "Dropping SHEET record " << vI(8, 10) << std::endl;
4860  }
4861  else
4862  {
4863  getCategory("struct_sheet_range")->emplace({
4864  { "sheet_id", sheetID },
4865  { "id", vI(8, 10) },
4866  { "beg_label_comp_id", vS(18, 20) },
4867  { "beg_label_asym_id", begAsymID },
4868  { "beg_label_seq_id", begSeq },
4869  { "pdbx_beg_PDB_ins_code", vS(27, 27) },
4870  { "end_label_comp_id", vS(29, 31) },
4871  { "end_label_asym_id", endAsymID },
4872  { "end_label_seq_id", endSeq },
4873  { "pdbx_end_PDB_ins_code", vS(38, 38) },
4874 
4875  { "beg_auth_comp_id", vS(18, 20) },
4876  { "beg_auth_asym_id", vS(22, 22) },
4877  { "beg_auth_seq_id", vI(23, 26) },
4878  { "end_auth_comp_id", vS(29, 31) },
4879  { "end_auth_asym_id", vS(33, 33) },
4880  { "end_auth_seq_id", vI(34, 37) },
4881  });
4882 
4883  if (sense != 0 and mRec->mVlen > 34)
4884  {
4885  std::string r1AsymID, r2AsymID;
4886  int r1Seq, r2Seq;
4887 
4888  std::tie(r1AsymID, r1Seq, std::ignore) = MapResidue(vC(65), vI(66, 69), vC(70), ec);
4889  if (not ec)
4890  std::tie(r2AsymID, r2Seq, std::ignore) = MapResidue(vC(50), vI(51, 54), vC(55), ec);
4891 
4892  if (ec)
4893  {
4894  if (cif::VERBOSE > 0)
4895  std::cerr << "skipping unmatched pdbx_struct_sheet_hbond record" << std::endl;
4896  }
4897  else
4898  getCategory("pdbx_struct_sheet_hbond")->emplace({
4899  { "sheet_id", sheetID },
4900  { "range_id_1", rangeID },
4901  { "range_id_2", rangeID + 1 },
4902  { "range_1_label_atom_id", vS(57, 60) },
4903  { "range_1_label_comp_id", vS(61, 63) },
4904  { "range_1_label_asym_id", r1AsymID },
4905  { "range_1_label_seq_id", r1Seq },
4906  { "range_1_PDB_ins_code", vS(70, 70) },
4907  { "range_1_auth_atom_id", vS(57, 60) },
4908  { "range_1_auth_comp_id", vS(61, 63) },
4909  { "range_1_auth_asym_id", vS(65, 65) },
4910  { "range_1_auth_seq_id", vI(66, 69) },
4911 
4912  { "range_2_label_atom_id", vS(42, 45) },
4913  { "range_2_label_comp_id", vS(46, 48) },
4914  { "range_2_label_asym_id", r2AsymID },
4915  { "range_2_label_seq_id", r2Seq },
4916  { "range_2_PDB_ins_code", vS(55, 55) },
4917  { "range_2_auth_atom_id", vS(42, 45) },
4918  { "range_2_auth_comp_id", vS(46, 48) },
4919  { "range_2_auth_asym_id", vS(50, 50) },
4920  { "range_2_auth_seq_id", vI(51, 54) } });
4921  }
4922 
4923  if (sense != 0)
4924  ++rangeID;
4925  }
4926 
4927  GetNextRecord();
4928  }
4929 }
4930 
4931 static bool IsMetal(const std::string &resName, const std::string &atomID)
4932 {
4933  bool result = false;
4934 
4935  try
4936  {
4937  auto compound = cif::compound_factory::instance().create(resName);
4938  if (compound != nullptr)
4939  {
4940  auto at = cif::atom_type_traits(compound->get_atom_by_atom_id(atomID).type_symbol);
4941  result = at.is_metal();
4942  }
4943  }
4944  catch (...)
4945  {
4946  }
4947 
4948  return result;
4949 }
4950 
4951 void PDBFileParser::ParseConnectivtyAnnotation()
4952 {
4953  int ssBondNr = 0;
4954  int linkNr = 0;
4955  bool firstCovale = true, firstMetalc = true;
4956 
4957  // Aaargh... Coot writes the records in the wrong order...
4958  for (;; GetNextRecord())
4959  {
4960  if (mRec->is("SSBOND"))
4961  {
4962  if (ssBondNr == 0)
4963  {
4964  getCategory("struct_conn_type")->emplace({
4965  { "id", "disulf" },
4966  });
4967  }
4968 
4969  // 1 - 6 Record name "SSBOND"
4970  // 8 - 10 Integer serNum Serial number.
4971  // 12 - 14 LString(3) "CYS" Residue name.
4972  // 16 Character chainID1 Chain identifier.
4973  // 18 - 21 Integer seqNum1 Residue sequence number.
4974  // 22 AChar icode1 Insertion code.
4975  // 26 - 28 LString(3) "CYS" Residue name.
4976  // 30 Character chainID2 Chain identifier.
4977  // 32 - 35 Integer seqNum2 Residue sequence number.
4978  // 36 AChar icode2 Insertion code.
4979  // 60 - 65 SymOP sym1 Symmetry operator for residue 1.
4980  // 67 - 72 SymOP sym2 Symmetry operator for residue 2.
4981  // 74 – 78 Real(5.2) Length Disulfide bond distance
4982 
4983  std::string p1Asym, p2Asym;
4984  int p1Seq = 0, p2Seq = 0;
4985  std::error_code ec;
4986 
4987  std::tie(p1Asym, p1Seq, std::ignore) = MapResidue(vC(16), vI(18, 21), vC(22), ec);
4988  if (not ec)
4989  std::tie(p2Asym, p2Seq, std::ignore) = MapResidue(vC(30), vI(32, 35), vC(36), ec);
4990 
4991  if (ec)
4992  {
4993  if (cif::VERBOSE > 0)
4994  std::cerr << "Dropping SSBOND " << vI(8, 10) << std::endl;
4995  continue;
4996  }
4997 
4998  std::vector<char> alt1 = altLocsForAtom(vC(16), vI(18, 21), vC(22), "SG");
4999  std::vector<char> alt2 = altLocsForAtom(vC(30), vI(32, 35), vC(36), "SG");
5000 
5001  if (alt1.empty())
5002  alt1.push_back(0);
5003  if (alt2.empty())
5004  alt2.push_back(0);
5005 
5006  std::string sym1, sym2;
5007  try
5008  {
5009  sym1 = pdb2cifSymmetry(vS(60, 65));
5010  sym2 = pdb2cifSymmetry(vS(67, 72));
5011  }
5012  catch (const std::exception &ex)
5013  {
5014  if (cif::VERBOSE > 0)
5015  std::cerr << "Dropping SSBOND " << vI(8, 10) << " due to invalid symmetry operation" << std::endl;
5016  continue;
5017  }
5018 
5019  for (auto a1 : alt1)
5020  {
5021  for (auto a2 : alt2)
5022  {
5023  getCategory("struct_conn")->emplace({
5024  { "id", "disulf" + std::to_string(++ssBondNr) },
5025  { "conn_type_id", "disulf" },
5026 
5027  { "ptnr1_label_asym_id", p1Asym },
5028  { "pdbx_ptnr1_label_alt_id", a1 ? std::string{ a1 } : std::string() },
5029  { "ptnr1_label_comp_id", vS(12, 14) },
5030  { "ptnr1_label_seq_id", p1Seq ? std::to_string(p1Seq) : "." },
5031  { "ptnr1_label_atom_id", "SG" },
5032  { "ptnr1_symmetry", sym1 },
5033 
5034  { "ptnr2_label_asym_id", p2Asym },
5035  { "pdbx_ptnr2_label_alt_id", a2 ? std::string{ a2 } : std::string() },
5036  { "ptnr2_label_comp_id", vS(26, 28) },
5037  { "ptnr2_label_seq_id", p2Seq ? std::to_string(p2Seq) : "." },
5038  { "ptnr2_label_atom_id", "SG" },
5039 
5040  { "ptnr1_auth_asym_id", vS(16, 16) },
5041  { "ptnr1_auth_comp_id", vS(12, 14) },
5042  { "ptnr1_auth_seq_id", vI(18, 21) },
5043  { "ptnr2_auth_asym_id", vS(30, 30) },
5044  { "ptnr2_auth_comp_id", vS(26, 28) },
5045  { "ptnr2_auth_seq_id", vI(32, 35) },
5046 
5047  { "ptnr2_symmetry", sym2 },
5048 
5049  { "pdbx_dist_value", vS(74, 78) },
5050  });
5051  }
5052  }
5053 
5054  continue;
5055  }
5056 
5057  if (mRec->is("LINK ") or mRec->is("LINKR "))
5058  {
5059  if (cif::VERBOSE > 0 and mRec->is("LINKR "))
5060  std::cerr << "Accepting non-standard LINKR record, but ignoring extra information" << std::endl;
5061 
5062  // 1 - 6 Record name "LINK "
5063  std::string name1 = vS(13, 16); // 13 - 16 Atom name1 Atom name.
5064  // 17 Character altLoc1 Alternate location indicator.
5065  std::string resName1 = vS(18, 20); // 18 - 20 Residue name resName1 Residue name.
5066  // 22 Character chainID1 Chain identifier.
5067  // 23 - 26 Integer resSeq1 Residue sequence number.
5068  // 27 AChar iCode1 Insertion code.
5069  std::string name2 = vS(43, 46); // 43 - 46 Atom name2 Atom name.
5070  // 47 Character altLoc2 Alternate location indicator.
5071  std::string resName2 = vS(48, 50); // 48 - 50 Residue name resName2 Residue name.
5072  // 52 Character chainID2 Chain identifier.
5073  // 53 - 56 Integer resSeq2 Residue sequence number.
5074  // 57 AChar iCode2 Insertion code.
5075  // 60 - 65 SymOP sym1 Symmetry operator atom 1.
5076  // 67 - 72 SymOP sym2 Symmetry operator atom 2.
5077  // 74 – 78 Real(5.2) Length Link distance
5078 
5079  std::string type = "covale";
5080  if (IsMetal(resName1, name1) or IsMetal(resName2, name2))
5081  type = "metalc";
5082 
5083  if (type == "covale" and firstCovale)
5084  {
5085  getCategory("struct_conn_type")->emplace({
5086  { "id", type },
5087  });
5088  firstCovale = false;
5089  }
5090 
5091  if (type == "metalc" and firstMetalc)
5092  {
5093  getCategory("struct_conn_type")->emplace({
5094  { "id", type },
5095  });
5096  firstMetalc = false;
5097  }
5098 
5099  ++linkNr;
5100 
5101  std::string p1Asym, p2Asym;
5102  int p1Seq = 0, p2Seq = 0;
5103  bool isResseq1 = false, isResseq2 = false;
5104  std::error_code ec;
5105 
5106  std::tie(p1Asym, p1Seq, isResseq1) = MapResidue(vC(22), vI(23, 26), vC(27), ec);
5107  if (not ec)
5108  std::tie(p2Asym, p2Seq, isResseq2) = MapResidue(vC(52), vI(53, 56), vC(57), ec);
5109 
5110  if (ec)
5111  {
5112  if (cif::VERBOSE > 0)
5113  std::cerr << "Dropping LINK record at line " << mRec->mLineNr << std::endl;
5114  continue;
5115  }
5116 
5117  std::string distance, ccp4LinkID;
5118 
5119  if (mRec->is("LINK "))
5120  {
5121  distance = vS(74, 78);
5122 
5123  double d;
5124  auto r = cif::from_chars(distance.data(), distance.data() + distance.length(), d);
5125  if (r.ec != std::errc())
5126  {
5127  if (cif::VERBOSE > 0)
5128  std::cerr << "Distance value '" << distance << "' is not a valid float in LINK record" << std::endl;
5129  swap(ccp4LinkID, distance); // assume this is a ccp4_link_id... oh really?
5130  }
5131  }
5132  else // LINKR
5133  ccp4LinkID = vS(74, 78); // the link ID
5134 
5135  std::string sym1, sym2;
5136  try
5137  {
5138  sym1 = pdb2cifSymmetry(vS(60, 65));
5139  sym2 = pdb2cifSymmetry(vS(67, 72));
5140  }
5141  catch (const std::exception &ex)
5142  {
5143  if (cif::VERBOSE > 0)
5144  std::cerr << "Dropping LINK record at line " << mRec->mLineNr << " due to invalid symmetry operation" << std::endl;
5145  continue;
5146  }
5147 
5148  getCategory("struct_conn")->emplace({
5149  { "id", type + std::to_string(linkNr) },
5150  { "conn_type_id", type },
5151 
5152  // { "ccp4_link_id", ccp4LinkID },
5153 
5154  { "ptnr1_label_asym_id", p1Asym },
5155  { "ptnr1_label_comp_id", vS(18, 20) },
5156  { "ptnr1_label_seq_id", (isResseq1 and p1Seq) ? std::to_string(p1Seq) : "." },
5157  { "ptnr1_label_atom_id", vS(13, 16) },
5158  { "pdbx_ptnr1_label_alt_id", vS(17, 17) },
5159  { "pdbx_ptnr1_PDB_ins_code", vS(27, 27) },
5160  { "pdbx_ptnr1_standard_comp_id", "" },
5161  { "ptnr1_symmetry", sym1 },
5162 
5163  { "ptnr2_label_asym_id", p2Asym },
5164  { "ptnr2_label_comp_id", vS(48, 50) },
5165  { "ptnr2_label_seq_id", (isResseq2 and p2Seq) ? std::to_string(p2Seq) : "." },
5166  { "ptnr2_label_atom_id", vS(43, 46) },
5167  { "pdbx_ptnr2_label_alt_id", vS(47, 47) },
5168  { "pdbx_ptnr2_PDB_ins_code", vS(57, 57) },
5169 
5170  { "ptnr1_auth_asym_id", vS(22, 22) },
5171  { "ptnr1_auth_comp_id", vS(18, 20) },
5172  { "ptnr1_auth_seq_id", vI(23, 26) },
5173  { "ptnr2_auth_asym_id", vS(52, 52) },
5174  { "ptnr2_auth_comp_id", vS(48, 50) },
5175  { "ptnr2_auth_seq_id", vI(53, 56) },
5176 
5177  // { "ptnr1_auth_atom_id", vS(13, 16) },
5178  // { "ptnr2_auth_atom_id", vS(43, 46) },
5179 
5180  { "ptnr2_symmetry", sym2 },
5181 
5182  { "pdbx_dist_value", distance } });
5183 
5184  continue;
5185  }
5186 
5187  if (mRec->is("CISPEP"))
5188  {
5189  // 1 - 6 Record name "CISPEP"
5190  int serNum = vI(8, 10); // 8 - 10 Integer serNum Record serial number.
5191  std::string pep1 = vS(12, 14); // 12 - 14 LString(3) pep1 Residue name.
5192  char chainID1 = vC(16); // 16 Character chainID1 Chain identifier.
5193  int seqNum1 = vI(18, 21); // 18 - 21 Integer seqNum1 Residue sequence number.
5194  char iCode1 = vC(22); // 22 AChar icode1 Insertion code.
5195  std::string pep2 = vS(26, 28); // 26 - 28 LString(3) pep2 Residue name.
5196  char chainID2 = vC(30); // 30 Character chainID2 Chain identifier.
5197  int seqNum2 = vI(32, 35); // 32 - 35 Integer seqNum2 Residue sequence number.
5198  char iCode2 = vC(36); // 36 AChar icode2 Insertion code.
5199  int modNum = vI(44, 46); // 44 - 46 Integer modNum Identifies the specific model.
5200  std::string measure = vF(54, 59); // 54 - 59 Real(6.2) measure Angle measurement in degrees.
5201 
5202  if (modNum == 0)
5203  modNum = 1;
5204 
5205  std::string lAsym1, lAsym2;
5206  int lResSeq1, lResSeq2;
5207  std::error_code ec;
5208 
5209  std::tie(lAsym1, lResSeq1, std::ignore) = MapResidue(chainID1, seqNum1, iCode1, ec);
5210  if (not ec)
5211  std::tie(lAsym2, lResSeq2, std::ignore) = MapResidue(chainID2, seqNum2, iCode2, ec);
5212 
5213  if (ec)
5214  {
5215  if (cif::VERBOSE > 0)
5216  std::cerr << "Dropping CISPEP record at line " << mRec->mLineNr << std::endl;
5217  continue;
5218  }
5219 
5220  std::string iCode1str = iCode1 == ' ' ? std::string() : std::string{ iCode1 };
5221  std::string iCode2str = iCode2 == ' ' ? std::string() : std::string{ iCode2 };
5222 
5223  getCategory("struct_mon_prot_cis")->emplace({
5224  { "pdbx_id", serNum },
5225  { "label_comp_id", pep1 },
5226  { "label_seq_id", lResSeq1 },
5227  { "label_asym_id", lAsym1 },
5228  { "label_alt_id", "." },
5229  { "pdbx_PDB_ins_code", iCode1str },
5230  { "auth_comp_id", pep1 },
5231  { "auth_seq_id", seqNum1 },
5232  { "auth_asym_id", std::string{ chainID1 } },
5233  { "pdbx_label_comp_id_2", pep2 },
5234  { "pdbx_label_seq_id_2", lResSeq2 },
5235  { "pdbx_label_asym_id_2", lAsym2 },
5236  { "pdbx_PDB_ins_code_2", iCode2str },
5237  { "pdbx_auth_comp_id_2", pep2 },
5238  { "pdbx_auth_seq_id_2", seqNum2 },
5239  { "pdbx_auth_asym_id_2", std::string{ chainID2 } },
5240  { "pdbx_PDB_model_num", modNum },
5241  { "pdbx_omega_angle", measure } });
5242 
5243  continue;
5244  }
5245 
5246  break;
5247  }
5248 }
5249 
5250 void PDBFileParser::ParseMiscellaneousFeatures()
5251 {
5252  int structSiteGenID = 1;
5253 
5254  while (mRec->is("SITE "))
5255  { // 1 - 6 Record name "SITE "
5256  // 8 - 10 Integer seqNum Sequence number.
5257  std::string siteID = vS(12, 14); // 12 - 14 LString(3) siteID Site name.
5258  int numRes = vI(16, 17); // 16 - 17 Integer numRes Number of residues that compose the site.
5259 
5260  int o = 19;
5261 
5262  auto cat = getCategory("struct_site_gen");
5263 
5264  for (int i = 0; i < numRes; ++i)
5265  {
5266  std::string resName = vS(o, o + 2); // 19 - 21 Residue name resName1 Residue name for first residue that
5267  // creates the site.
5268  char chainID = vC(o + 4); // 23 Character chainID1 Chain identifier for first residue of site.
5269  int seq = vI(o + 5, o + 8); // 24 - 27 Integer seq1 Residue sequence number for first residue
5270  // of the site.
5271  char iCode = vC(o + 9); // 28 AChar iCode1 Insertion code for first residue of the site.
5272 
5273  int labelSeq;
5274  std::string asym;
5275  bool isResseq;
5276  std::error_code ec;
5277 
5278  std::tie(asym, labelSeq, isResseq) = MapResidue(chainID, seq, iCode, ec);
5279 
5280  if (ec)
5281  {
5282  if (cif::VERBOSE > 0)
5283  std::cerr << "skipping struct_site_gen record" << std::endl;
5284  }
5285  else
5286  cat->emplace({
5287  { "id", structSiteGenID++ },
5288  { "site_id", siteID },
5289  { "pdbx_num_res", numRes },
5290  { "label_comp_id", resName },
5291  { "label_asym_id", asym },
5292  { "label_seq_id", (labelSeq > 0 and isResseq) ? std::to_string(labelSeq) : std::string(".") },
5293  { "pdbx_auth_ins_code", iCode == ' ' ? "" : std::string{ iCode } },
5294  { "auth_comp_id", resName },
5295  { "auth_asym_id", std::string{ chainID } },
5296  { "auth_seq_id", seq },
5297  { "label_atom_id", "." },
5298  { "label_alt_id", "." },
5299  });
5300 
5301  o += 11;
5302  }
5303 
5304  GetNextRecord();
5305  }
5306 }
5307 
5308 void PDBFileParser::ParseCrystallographic()
5309 {
5310  if (mRec->is("CRYST1"))
5311  {
5312  Match("CRYST1", true);
5313 
5314  getCategory("cell")->emplace({
5315  { "entry_id", mStructureID }, // 1 - 6 Record name "CRYST1"
5316  { "length_a", vF(7, 15) }, // 7 - 15 Real(9.3) a a (Angstroms).
5317  { "length_b", vF(16, 24) }, // 16 - 24 Real(9.3) b b (Angstroms).
5318  { "length_c", vF(25, 33) }, // 25 - 33 Real(9.3) c c (Angstroms).
5319  { "angle_alpha", vF(34, 40) }, // 34 - 40 Real(7.2) alpha alpha (degrees).
5320  { "angle_beta", vF(41, 47) }, // 41 - 47 Real(7.2) beta beta (degrees).
5321  { "angle_gamma", vF(48, 54) }, // 48 - 54 Real(7.2) gamma gamma (degrees).
5322  /* goes into symmetry */ // 56 - 66 LString sGroup Space group.
5323  { "Z_PDB", vF(67, 70) } // 67 - 70 Integer z Z value.
5324  });
5325 
5326  std::string spaceGroup, intTablesNr;
5327  try
5328  {
5329  spaceGroup = vS(56, 66);
5330  intTablesNr = std::to_string(get_space_group_number(spaceGroup));
5331  }
5332  catch (...)
5333  {
5334  }
5335 
5336  getCategory("symmetry")->emplace({
5337  { "entry_id", mStructureID },
5338  { "space_group_name_H-M", spaceGroup },
5339  { "Int_Tables_number", intTablesNr } });
5340 
5341  GetNextRecord();
5342  }
5343  else
5344  {
5345  // no cryst1, make a simple one, like this:
5346  // CRYST1 1.000 1.000 1.000 90.00 90.00 90.00 P 1 1
5347  getCategory("cell")->emplace({
5348  { "entry_id", mStructureID }, // 1 - 6 Record name "CRYST1"
5349  { "length_a", 1 }, // 7 - 15 Real(9.3) a a (Angstroms).
5350  { "length_b", 1 }, // 16 - 24 Real(9.3) b b (Angstroms).
5351  { "length_c", 1 }, // 25 - 33 Real(9.3) c c (Angstroms).
5352  { "angle_alpha", 90 }, // 34 - 40 Real(7.2) alpha alpha (degrees).
5353  { "angle_beta", 90 }, // 41 - 47 Real(7.2) beta beta (degrees).
5354  { "angle_gamma", 90 }, // 48 - 54 Real(7.2) gamma gamma (degrees).
5355  /* goes into symmetry */ // 56 - 66 LString sGroup Space group.
5356  { "Z_PDB", 1 } // 67 - 70 Integer z Z value.
5357  });
5358 
5359  getCategory("symmetry")->emplace({
5360  { "entry_id", mStructureID },
5361  { "space_group_name_H-M", "P 1" },
5362  { "Int_Tables_number", 1 }
5363  });
5364  }
5365 }
5366 
5367 void PDBFileParser::ParseCoordinateTransformation()
5368 {
5369  std::string m[3][3], v[3];
5370 
5371  if (cif::starts_with(mRec->mName, "ORIGX"))
5372  {
5373  for (std::string n : { "1", "2", "3" })
5374  {
5375  int x = stoi(n) - 1;
5376 
5377  Match("ORIGX" + n, true); // 1 - 6 Record name "ORIGXn" n=1, 2, or 3
5378  m[x][0] = vF(11, 20); // 11 - 20 Real(10.6) o[n][1] On1
5379  m[x][1] = vF(21, 30); // 21 - 30 Real(10.6) o[n][2] On2
5380  m[x][2] = vF(31, 40); // 31 - 40 Real(10.6) o[n][3] On3
5381  v[x] = vF(46, 55); // 46 - 55 Real(10.5) t[n] Tn
5382 
5383  GetNextRecord();
5384  }
5385 
5386  getCategory("database_PDB_matrix")->emplace({
5387  { "entry_id", mStructureID },
5388  { "origx[1][1]", m[0][0] },
5389  { "origx[1][2]", m[0][1] },
5390  { "origx[1][3]", m[0][2] },
5391  { "origx[2][1]", m[1][0] },
5392  { "origx[2][2]", m[1][1] },
5393  { "origx[2][3]", m[1][2] },
5394  { "origx[3][1]", m[2][0] },
5395  { "origx[3][2]", m[2][1] },
5396  { "origx[3][3]", m[2][2] },
5397  { "origx_vector[1]", v[0] },
5398  { "origx_vector[2]", v[1] },
5399  { "origx_vector[3]", v[2] },
5400  });
5401  }
5402 
5403  if (cif::starts_with(mRec->mName, "SCALE"))
5404  {
5405  for (std::string n : { "1", "2", "3" })
5406  {
5407  int x = stoi(n) - 1;
5408 
5409  Match("SCALE" + n, true); // 1 - 6 Record name "SCALEn" n=1, 2, or 3
5410  m[x][0] = vF(11, 20); // 11 - 20 Real(10.6) s[n][1] Sn1
5411  m[x][1] = vF(21, 30); // 21 - 30 Real(10.6) s[n][2] Sn2
5412  m[x][2] = vF(31, 40); // 31 - 40 Real(10.6) s[n][3] Sn3
5413  v[x] = vF(46, 55); // 46 - 55 Real(10.5) u[n] Un
5414 
5415  GetNextRecord();
5416  }
5417 
5418  getCategory("atom_sites")->emplace({
5419  { "entry_id", mStructureID },
5420  { "fract_transf_matrix[1][1]", m[0][0] },
5421  { "fract_transf_matrix[1][2]", m[0][1] },
5422  { "fract_transf_matrix[1][3]", m[0][2] },
5423  { "fract_transf_matrix[2][1]", m[1][0] },
5424  { "fract_transf_matrix[2][2]", m[1][1] },
5425  { "fract_transf_matrix[2][3]", m[1][2] },
5426  { "fract_transf_matrix[3][1]", m[2][0] },
5427  { "fract_transf_matrix[3][2]", m[2][1] },
5428  { "fract_transf_matrix[3][3]", m[2][2] },
5429  { "fract_transf_vector[1]", v[0] },
5430  { "fract_transf_vector[2]", v[1] },
5431  { "fract_transf_vector[3]", v[2] },
5432  });
5433  }
5434 
5435  while (cif::starts_with(mRec->mName, "MTRIX1"))
5436  {
5437  int serial = 0, igiven = 0;
5438 
5439  for (std::string n : { "1", "2", "3" })
5440  {
5441  int x = stoi(n) - 1;
5442 
5443  Match("MTRIX" + n, true); // 1 - 6 Record name "MTRIXn" n=1, 2, or 3
5444  serial = vI(8, 10); // 8 - 10 Integer serial Serial number.
5445  m[x][0] = vF(11, 20); // 11 - 20 Real(10.6) m[n][1] Mn1
5446  m[x][1] = vF(21, 30); // 21 - 30 Real(10.6) m[n][2] Mn2
5447  m[x][2] = vF(31, 40); // 31 - 40 Real(10.6) m[n][3] Mn3
5448  v[x] = vF(46, 55); // 46 - 55 Real(10.5) v[n] Vn
5449  igiven = vC(60) == '1'; // 60 Integer iGiven 1 if coordinates for the representations
5450  // which are approximately related by the
5451  GetNextRecord(); // transformations of the molecule are
5452  } // contained in the datablock. Otherwise, blank.
5453 
5454  getCategory("struct_ncs_oper")->emplace({
5455  { "id", serial },
5456  { "matrix[1][1]", m[0][0] },
5457  { "matrix[1][2]", m[0][1] },
5458  { "matrix[1][3]", m[0][2] },
5459  { "matrix[2][1]", m[1][0] },
5460  { "matrix[2][2]", m[1][1] },
5461  { "matrix[2][3]", m[1][2] },
5462  { "matrix[3][1]", m[2][0] },
5463  { "matrix[3][2]", m[2][1] },
5464  { "matrix[3][3]", m[2][2] },
5465  { "vector[1]", v[0] },
5466  { "vector[2]", v[1] },
5467  { "vector[3]", v[2] },
5468  { "code", igiven ? "given" : "" } });
5469  }
5470 }
5471 
5472 void PDBFileParser::ParseCoordinate(int modelNr)
5473 {
5474  // oh oh, we have to sort our atom_site records by ascending asym_id
5475  // This routine used to be so trivial...
5476 
5477  typedef std::tuple<std::string, int, bool, PDBRecord *, PDBRecord *> atomRec;
5478 
5479  std::vector<atomRec> atoms;
5480  while (mRec->is("ATOM ") or mRec->is("HETATM")) // 1 - 6 Record name "ATOM "
5481  {
5482  char chainID = vC(22); // 22 Character chainID Chain identifier.
5483  int resSeq = vI(23, 26); // 23 - 26 Integer resSeq Residue sequence number.
5484  char iCode = vC(27);
5485 
5486  std::string asymID;
5487  int seqID;
5488  bool isResseq;
5489 
5490  std::tie(asymID, seqID, isResseq) = MapResidue(chainID, resSeq, iCode);
5491 
5492  PDBRecord *atom = mRec;
5493  PDBRecord *anisou = nullptr;
5494 
5495  GetNextRecord();
5496  if (mRec->is("ANISOU"))
5497  {
5498  anisou = mRec;
5499  GetNextRecord();
5500  }
5501 
5502  atoms.emplace_back(asymID, seqID, isResseq, atom, anisou);
5503 
5504  /*if?... */ while (mRec->is("TER "))
5505  {
5506  Match("TER ", true);
5507  GetNextRecord();
5508  }
5509  }
5510 
5511  auto last = mRec;
5512 
5513  // use stable sort here
5514  auto rLess = [](const atomRec &a, const atomRec &b) -> bool
5515  {
5516  int d;
5517 
5518  std::string chainA = std::get<0>(a);
5519  std::string chainB = std::get<0>(b);
5520 
5521  if (chainA.length() != chainB.length())
5522  d = static_cast<int>(chainA.length() - chainB.length());
5523  else
5524  d = std::get<0>(a).compare(std::get<0>(b));
5525 
5526  if (d == 0)
5527  d = std::get<1>(a) - std::get<1>(b);
5528  return d < 0;
5529  };
5530 
5531  stable_sort(atoms.begin(), atoms.end(), rLess);
5532 
5533  // now reiterate the atoms to reorder alternates
5534  for (size_t i = 0; i + 1 < atoms.size(); ++i)
5535  {
5536  char altLoc = std::get<3>(atoms[i])->vC(17);
5537 
5538  if (altLoc == ' ' or altLoc == 0)
5539  continue;
5540 
5541  auto b = atoms.begin() + i;
5542  auto e = b;
5543 
5544  std::map<std::string, int> atomIndex; // index number of first occurrence of a atom name
5545 
5546  while (e != atoms.end() and rLess(*b, *e) == false)
5547  {
5548  std::string name = std::get<3>(*e)->vS(13, 16);
5549 
5550  if (atomIndex.count(name) == 0)
5551  atomIndex[name] = static_cast<int>(atomIndex.size() + 1);
5552 
5553  ++e;
5554  }
5555 
5556  auto aLess = [&](atomRec &a, atomRec &b) -> bool
5557  {
5558  std::string na = std::get<3>(a)->vS(13, 16);
5559  std::string nb = std::get<3>(b)->vS(13, 16);
5560 
5561  int d = atomIndex[na] - atomIndex[nb];
5562  if (d == 0)
5563  d = std::get<3>(a)->vC(17) - std::get<3>(b)->vC(17);
5564  assert(d != 0);
5565  return d < 0;
5566  };
5567 
5568  sort(b, e, aLess);
5569 
5570  i += distance(b, e) - 1;
5571  }
5572 
5573  // while (mRec->is("ATOM ") or mRec->is("HETATM")) // 1 - 6 Record name "ATOM "
5574  for (auto &a : atoms)
5575  {
5576  std::string asymID;
5577  int seqID;
5578  bool isResseq;
5579  PDBRecord *atom;
5580  PDBRecord *anisou;
5581  std::tie(asymID, seqID, isResseq, atom, anisou) = a;
5582 
5583  mRec = atom;
5584 
5585  ++mAtomID;
5586 
5587  std::string groupPDB = mRec->is("ATOM ") ? "ATOM" : "HETATM";
5588  // int serial = vI(7, 11); // 7 - 11 Integer serial Atom serial number.
5589  std::string name = vS(13, 16); // 13 - 16 Atom name Atom name.
5590  char altLoc = vC(17); // 17 Character altLoc Alternate location indicator.
5591  std::string resName = vS(18, 20); // 18 - 20 Residue name resName Residue name.
5592  char chainID = vC(22); // 22 Character chainID Chain identifier.
5593  int resSeq = vI(23, 26); // 23 - 26 Integer resSeq Residue sequence number.
5594  char iCode = vC(27); // 27 AChar iCode Code for insertion of residues.
5595  std::string x = vF(31, 38); // 31 - 38 Real(8.3) x Orthogonal coordinates for X in Angstroms.
5596  std::string y = vF(39, 46); // 39 - 46 Real(8.3) y Orthogonal coordinates for Y in Angstroms.
5597  std::string z = vF(47, 54); // 47 - 54 Real(8.3) z Orthogonal coordinates for Z in Angstroms.
5598  std::string occupancy = vF(55, 60); // 55 - 60 Real(6.2) occupancy Occupancy.
5599  std::string tempFactor = vF(61, 66); // 61 - 66 Real(6.2) tempFactor Temperature factor.
5600  std::string element = vS(77, 78); // 77 - 78 LString(2) element Element symbol, right-justified.
5601  std::string charge = vS(79, 80); // 79 - 80 LString(2) charge Charge on the atom.
5602 
5603  std::string entityID = mAsymID2EntityID[asymID];
5604 
5605  charge = pdb2cifCharge(charge);
5606 
5607  // if (cif::compound_factory::instance().is_known_peptide(resName) or cif::compound_factory::instance().is_known_base(resName))
5608  if (resName == "UNK" or cif::compound_factory::kAAMap.count(resName) or cif::compound_factory::kBaseMap.count(resName))
5609  {
5610  if (groupPDB == "HETATM")
5611  {
5612  if (cif::VERBOSE > 0)
5613  std::cerr << "Changing atom from HETATM to ATOM at line " << mRec->mLineNr << std::endl;
5614  groupPDB = "ATOM";
5615  }
5616  }
5617  else
5618  {
5619  if (groupPDB == "ATOM")
5620  {
5621  if (cif::VERBOSE > 0)
5622  std::cerr << "Changing atom from ATOM to HETATM at line " << mRec->mLineNr << std::endl;
5623  groupPDB = "HETATM";
5624  }
5625  }
5626 
5627  // if the atom is part of a sugar, we need to replace the auth_seq_id/resSeq
5628  if (mSugarEntities.count(entityID))
5629  {
5630  using namespace cif::literals;
5631 
5632  auto &branch_scheme = *getCategory("pdbx_branch_scheme");
5633  resSeq = branch_scheme.find1<int>("asym_id"_key == asymID and "auth_seq_num"_key == resSeq, "pdb_seq_num");
5634  }
5635 
5636  getCategory("atom_site")->emplace({
5637  { "group_PDB", groupPDB },
5638  { "id", mAtomID },
5639  { "type_symbol", element },
5640  { "label_atom_id", name },
5641  { "label_alt_id", altLoc != ' ' ? std::string{ altLoc } : "." },
5642  { "label_comp_id", resName },
5643  { "label_asym_id", asymID },
5644  { "label_entity_id", entityID },
5645  { "label_seq_id", (isResseq and seqID > 0) ? std::to_string(seqID) : "." },
5646  { "pdbx_PDB_ins_code", iCode == ' ' ? "" : std::string{ iCode } },
5647  { "Cartn_x", x },
5648  { "Cartn_y", y },
5649  { "Cartn_z", z },
5650  { "occupancy", occupancy },
5651  { "B_iso_or_equiv", tempFactor },
5652  { "pdbx_formal_charge", charge },
5653  { "auth_seq_id", resSeq },
5654  { "auth_comp_id", resName },
5655  { "auth_asym_id", std::string{ chainID } },
5656  { "auth_atom_id", name },
5657  { "pdbx_PDB_model_num", modelNr } });
5658 
5659  InsertAtomType(element);
5660 
5661  std::string check = vS(7, 11) + vS(77, 80);
5662 
5663  if (anisou != nullptr)
5664  {
5665  mRec = anisou; // 1 - 6 Record name "ANISOU"
5666  int u11 = vI(29, 35); // 29 - 35 Integer u[0][0] U(1,1)
5667  int u22 = vI(36, 42); // 36 - 42 Integer u[1][1] U(2,2)
5668  int u33 = vI(43, 49); // 43 - 49 Integer u[2][2] U(3,3)
5669  int u12 = vI(50, 56); // 50 - 56 Integer u[0][1] U(1,2)
5670  int u13 = vI(57, 63); // 57 - 63 Integer u[0][2] U(1,3)
5671  int u23 = vI(64, 70); // 64 - 70 Integer u[1][2] U(2,3)
5672 
5673  if (vS(7, 11) + vS(77, 80) != check)
5674  throw std::runtime_error("ANISOU record should follow corresponding ATOM record");
5675 
5676  auto f = [](float f) -> std::string
5677  { return cif::format("%6.4f", f).str(); };
5678 
5679  getCategory("atom_site_anisotrop")->emplace({
5680  { "id", mAtomID },
5681  { "type_symbol", element },
5682  { "pdbx_label_atom_id", name },
5683  { "pdbx_label_alt_id", altLoc != ' ' ? std::string{ altLoc } : "." },
5684  { "pdbx_label_comp_id", resName },
5685  { "pdbx_label_asym_id", asymID },
5686  { "pdbx_label_seq_id", (isResseq and seqID > 0) ? std::to_string(seqID) : "." },
5687  { "U[1][1]", f(u11 / 10000.f) },
5688  { "U[2][2]", f(u22 / 10000.f) },
5689  { "U[3][3]", f(u33 / 10000.f) },
5690  { "U[1][2]", f(u12 / 10000.f) },
5691  { "U[1][3]", f(u13 / 10000.f) },
5692  { "U[2][3]", f(u23 / 10000.f) },
5693  { "pdbx_auth_seq_id", resSeq },
5694  { "pdbx_auth_comp_id", resName },
5695  { "pdbx_auth_asym_id", std::string{ chainID } },
5696  { "pdbx_auth_atom_id", name } });
5697  }
5698  }
5699 
5700  mRec = last;
5701 }
5702 
5703 void PDBFileParser::ParseConnectivty()
5704 {
5705  while (mRec->is("CONECT"))
5706  GetNextRecord();
5707 }
5708 
5709 void PDBFileParser::ParseBookkeeping()
5710 {
5711  if (mRec->is("MASTER"))
5712  {
5713  Match("MASTER", false);
5714  GetNextRecord();
5715  }
5716  Match("END ", false);
5717 }
5718 
5719 void PDBFileParser::Parse(std::istream &is, cif::file &result)
5720 {
5721  try
5722  {
5723  mDatablock.set_validator(result.get_validator());
5724 
5725  PreParseInput(is);
5726 
5727  mRec = mData;
5728 
5729  ParseTitle();
5730 
5731  ParseRemarks();
5732  ParsePrimaryStructure();
5733  ParseHeterogen();
5734 
5735  ConstructEntities();
5736 
5737  ParseRemark350();
5738 
5739  ParseSecondaryStructure();
5740  ParseConnectivtyAnnotation();
5741  ParseMiscellaneousFeatures();
5742  ParseCrystallographic();
5743  ParseCoordinateTransformation();
5744 
5745  uint32_t modelNr = 1;
5746  bool hasAtoms = false;
5747 
5748  while (mRec->is("MODEL ") or mRec->is("ATOM ") or mRec->is("HETATM"))
5749  {
5750  bool model = false;
5751  if (mRec->is("MODEL "))
5752  {
5753  model = true;
5754 
5755  modelNr = vI(11, 14);
5756 
5757  GetNextRecord();
5758  }
5759 
5760  hasAtoms = hasAtoms or mRec->is("ATOM ") or mRec->is("HETATM");
5761 
5762  ParseCoordinate(modelNr);
5763 
5764  if (model)
5765  {
5766  Match("ENDMDL", true);
5767  GetNextRecord();
5768  }
5769  }
5770 
5771  if (not hasAtoms)
5772  throw std::runtime_error("Either the PDB file has no atom records, or the field " + std::string(mRec->mName) + " is not at the correct location");
5773 
5774  for (auto e : mAtomTypes)
5775  getCategory("atom_type")->emplace({
5776  { "symbol", e } });
5777 
5778  // in V5, atom_type is sorted
5779  getCategory("atom_type")->reorder_by_index();
5780 
5781  ParseConnectivty();
5782  ParseBookkeeping();
5783 
5784  // almost done, now fix some outstanding issued that could not be done before
5785 
5786  try
5787  {
5788  auto r = FindRecord("REMARK 3");
5789 
5790  if (r != nullptr and Remark3Parser::parse(mExpMethod, r, mDatablock))
5791  {
5792  // make sure the "exptl" category is created
5793  auto exptl = getCategory("exptl");
5794  if (exptl->empty())
5795  {
5796  exptl->emplace({
5797  { "entry_id", mStructureID },
5798  { "method", mExpMethod },
5799  { "crystals_number", mRemark200["NUMBER OF CRYSTALS USED"] } });
5800  }
5801  }
5802  }
5803  catch (const std::exception &ex)
5804  {
5805  if (cif::VERBOSE >= 0)
5806  std::cerr << "Error parsing REMARK 3" << std::endl;
5807  throw;
5808  }
5809  //
5810  // auto cat = getCategory("pdbx_refine_tls_group");
5811  // for (Row r: *cat)
5812  // {
5813  // // add the mapped locations
5814  //
5815  // try
5816  // {
5817  // std::string asymID;
5818  // int resNum;
5819  //
5820  // cif::tie(asymID, resNum) = r.get("beg_auth_asym_id", "beg_auth_seq_id");
5821  //
5822  // r["beg_label_asym_id"] = asymID;
5823  // r["beg_label_seq_id"] = resNum;
5824  //
5825  // cif::tie(asymID, resNum) = r.get("end_auth_asym_id", "end_auth_seq_id");
5826  //
5827  // r["end_label_asym_id"] = asymID;
5828  // r["end_label_seq_id"] = resNum;
5829  // }
5830  // catch (const std::exception& ex)
5831  // {
5832  // continue;
5833  // }
5834  // }
5835 
5836  using namespace cif::literals;
5837 
5838  auto &atom_site = *getCategory("atom_site");
5839 
5840  for (auto r : getCategory("struct_conn")->find("pdbx_dist_value"_key == 0 or "pdbx_dist_value"_key == cif::null))
5841  {
5842  const auto &[asym1, seq1, atom1, symm1, asym2, seq2, atom2, symm2] = r.get<std::string, std::string, std::string, std::string, std::string, std::string, std::string, std::string>(
5843  "ptnr1_label_asym_id", "ptnr1_label_seq_id", "ptnr1_label_atom_id", "ptnr1_symmetry",
5844  "ptnr2_label_asym_id", "ptnr2_label_seq_id", "ptnr2_label_atom_id", "ptnr2_symmetry");
5845 
5846  float distance = 1.0f;
5847 
5848  try
5849  {
5850  auto a1 = atom_site.find1("label_asym_id"_key == asym1 and "label_seq_id"_key == seq1 and "label_atom_id"_key == atom1);
5851  auto a2 = atom_site.find1("label_asym_id"_key == asym2 and "label_seq_id"_key == seq2 and "label_atom_id"_key == atom2);
5852 
5853  if (not a1 or not a2)
5854  throw std::runtime_error("cannot find atom");
5855 
5856  const auto &[x1, y1, z1] = a1.get<float, float, float>("cartn_x", "cartn_y", "cartn_z");
5857  const auto &[x2, y2, z2] = a2.get<float, float, float>("cartn_x", "cartn_y", "cartn_z");
5858 
5859  if ((symm1.empty() or symm1 == "1_555") and (symm2.empty() or symm2 == "1_555"))
5860  distance = std::sqrt(
5861  (x1 - x2) * (x1 - x2) +
5862  (y1 - y2) * (y1 - y2) +
5863  (z1 - z2) * (z1 - z2)
5864  );
5865  else if (cif::VERBOSE > 0)
5866  std::cerr << "Cannot calculate distance for link since one of the atoms is in another dimension" << std::endl;
5867  }
5868  catch (std::exception &ex)
5869  {
5870  if (cif::VERBOSE > 0)
5871  std::cerr << "Error finding atom for LINK distance calculation: " << ex.what() << std::endl;
5872  }
5873 
5874  r["pdbx_dist_value"] = distance;
5875  }
5876 
5877  result.emplace_back(std::move(mDatablock));
5878  }
5879  catch (const std::exception &ex)
5880  {
5881  if (cif::VERBOSE >= 0)
5882  {
5883  std::cerr << "Error parsing PDB";
5884  if (mRec != nullptr)
5885  std::cerr << " at line " << mRec->mLineNr;
5886  std::cerr << std::endl;
5887  }
5888  throw;
5889  }
5890 }
5891 
5892 // ----------------------------------------------------------------
5893 // A blast like alignment. Returns index of last aligned residue.
5894 
5895 // matrix is m x n, addressing i,j is 0 <= i < m and 0 <= j < n
5896 // element m i,j is mapped to [i * n + j] and thus storage is row major
5897 
5898 template <typename T>
5899 class matrix
5900 {
5901  public:
5902  using value_type = T;
5903 
5904  matrix() = delete;
5905  matrix(const matrix &) = delete;
5906  matrix &operator=(const matrix &) = delete;
5907 
5908  matrix(uint32_t m, uint32_t n, T v = T())
5909  : m_m(m)
5910  , m_n(n)
5911  {
5912  m_data = new value_type[m_m * m_n];
5913  std::fill(m_data, m_data + (m_m * m_n), v);
5914  }
5915 
5917  {
5918  delete[] m_data;
5919  }
5920 
5921  uint32_t dim_m() const { return m_m; }
5922  uint32_t dim_n() const { return m_n; }
5923 
5924  value_type operator()(uint32_t i, uint32_t j) const
5925  {
5926  assert(i < m_m);
5927  assert(j < m_n);
5928  return m_data[i * m_n + j];
5929  }
5930 
5931  value_type &operator()(uint32_t i, uint32_t j)
5932  {
5933  assert(i < m_m);
5934  assert(j < m_n);
5935  return m_data[i * m_n + j];
5936  }
5937 
5938  private:
5939  value_type *m_data;
5940  uint32_t m_m, m_n;
5941 };
5942 
5943 int PDBFileParser::PDBChain::AlignResToSeqRes()
5944 {
5945  // Use dynamic programming to align the found residues (in ATOM records) against
5946  // the residues in the SEQRES records in order to obtain the residue numbering.
5947  // sigh...
5948 
5949  auto &rx = mSeqres;
5950  auto &ry = mResiduesSeen;
5951 
5952  int dimX = static_cast<int>(mSeqres.size());
5953  if (dimX == 0)
5954  throw std::runtime_error(std::string("SEQRES for chain ") + mDbref.chainID + " is empty");
5955 
5956  int dimY = static_cast<int>(mResiduesSeen.size());
5957  if (dimY == 0)
5958  throw std::runtime_error(std::string("Number of residues in ATOM records for chain ") + mDbref.chainID + " is zero");
5959 
5960  matrix<float> B(dimX, dimY), Ix(dimX, dimY), Iy(dimX, dimY);
5961  matrix<int8_t> tb(dimX, dimY);
5962 
5963  int x, y;
5964 
5965  const float
5966  kMatchReward = 5,
5967  kMismatchCost = -10,
5968  kGapOpen = 10, gapExtend = 0.1f;
5969 
5970  float high = 0;
5971  int highX = 0, highY = 0;
5972 
5973  for (x = 0; x < dimX; ++x)
5974  {
5975  for (y = 0; y < dimY; ++y)
5976  {
5977  auto &a = rx[x];
5978  auto &b = ry[y];
5979 
5980  float Ix1 = x > 0 ? Ix(x - 1, y) : 0;
5981  float Iy1 = y > 0 ? Iy(x, y - 1) : 0;
5982 
5983  // score for alignment
5984  float M;
5985  if (a.mMonID == b.mMonID)
5986  M = kMatchReward;
5987  else
5988  M = kMismatchCost;
5989 
5990  // gap open cost is zero if the PDB ATOM records indicate that a gap
5991  // should be here.
5992  float gapOpen = kGapOpen;
5993  if (y == 0 or (y + 1 < dimY and ry[y + 1].mSeqNum > ry[y].mSeqNum + 1))
5994  gapOpen = 0;
5995 
5996  if (x > 0 and y > 0)
5997  M += B(x - 1, y - 1);
5998 
5999  float s;
6000  if (M >= Ix1 and M >= Iy1)
6001  {
6002  tb(x, y) = 0;
6003  B(x, y) = s = M;
6004 
6005  Ix(x, y) = M - (x < dimX - 1 ? gapOpen : 0);
6006  Iy(x, y) = M - (y < dimY - 1 ? gapOpen : 0);
6007  }
6008  else if (Ix1 >= Iy1)
6009  {
6010  tb(x, y) = 1;
6011  B(x, y) = s = Ix1;
6012 
6013  Ix(x, y) = Ix1 - gapExtend;
6014  Iy(x, y) = M - (y < dimY - 1 ? gapOpen : 0);
6015  if (Iy(x, y) < Iy1 - gapExtend)
6016  Iy(x, y) = Iy1 - gapExtend;
6017  }
6018  else
6019  {
6020  tb(x, y) = -1;
6021  B(x, y) = s = Iy1;
6022 
6023  Ix(x, y) = M - (x < dimX - 1 ? gapOpen : 0);
6024  if (Ix(x, y) < Ix1 - gapExtend)
6025  Ix(x, y) = Ix1 - gapExtend;
6026  Iy(x, y) = Iy1 - gapExtend;
6027  }
6028 
6029  if (/*(x == dimX - 1 or y == dimY - 1) and */ high < s)
6030  {
6031  high = s;
6032  highX = x;
6033  highY = y;
6034  }
6035  }
6036  }
6037 
6038  const int kFlagSeqNr = std::numeric_limits<int>::min();
6039 
6040  // reset positions of seqres
6041  for (auto &sr : rx)
6042  {
6043  sr.mSeqNum = kFlagSeqNr;
6044  sr.mIcode = ' ';
6045  }
6046 
6047  // assign numbers
6048  x = highX;
6049  y = highY;
6050 
6051  // C++ is getting closer to Pascal :-)
6052  auto printAlignment = [&tb, highX, highY, &rx, &ry, this]()
6053  {
6054  std::cerr << std::string(cif::get_terminal_width(), '-') << std::endl
6055  << "Alignment for chain " << mDbref.chainID << std::endl
6056  << std::endl;
6057  std::vector<std::pair<std::string, std::string>> alignment;
6058 
6059  int x = highX;
6060  int y = highY;
6061 
6062  for (x = highX, y = highY; x >= 0 and y >= 0;)
6063  {
6064  switch (tb(x, y))
6065  {
6066  case -1:
6067  alignment.push_back(make_pair("...", ry[y].mMonID));
6068  --y;
6069  break;
6070 
6071  case 1:
6072  alignment.push_back(make_pair(rx[x].mMonID, "..."));
6073  --x;
6074  break;
6075 
6076  case 0:
6077  alignment.push_back(make_pair(rx[x].mMonID, ry[y].mMonID));
6078  --x;
6079  --y;
6080  break;
6081  }
6082  }
6083 
6084  while (x >= 0)
6085  {
6086  alignment.push_back(make_pair(rx[x].mMonID, "..."));
6087  --x;
6088  }
6089 
6090  while (y >= 0)
6091  {
6092  alignment.push_back(make_pair("...", ry[y].mMonID));
6093  --y;
6094  }
6095 
6096  reverse(alignment.begin(), alignment.end());
6097  for (auto a : alignment)
6098  std::cerr << " " << a.first << " -- " << a.second << std::endl;
6099 
6100  std::cerr << std::endl;
6101  };
6102 
6103  if (cif::VERBOSE > 1)
6104  printAlignment();
6105 
6106  try
6107  {
6108  while (x >= 0 and y >= 0)
6109  {
6110  switch (tb(x, y))
6111  {
6112  case -1:
6113  throw std::runtime_error("A residue found in the ATOM records (" + ry[y].mMonID +
6114  " @ " + std::string{ mDbref.chainID } + ":" + std::to_string(ry[y].mSeqNum) +
6115  ((ry[y].mIcode == ' ' or ry[y].mIcode == 0) ? "" : std::string{ ry[y].mIcode }) +
6116  ") was not found in the SEQRES records");
6117  break;
6118 
6119  case 1:
6120  if (cif::VERBOSE > 3)
6121  std::cerr << "Missing residue in ATOM records: " << rx[x].mMonID << " at " << rx[x].mSeqNum << std::endl;
6122 
6123  --x;
6124  break;
6125 
6126  case 0:
6127  if (rx[x].mMonID != ry[y].mMonID)
6128  {
6129  std::cerr << "Warning, unaligned residues at " << x << "/" << y << "(" << rx[x].mMonID << '/' << ry[y].mMonID << ") SEQRES does not agree with ATOM records" << std::endl;
6130  rx[x].mMonID = ry[y].mMonID;
6131  }
6132 
6133  rx[x].mSeqNum = ry[y].mSeqNum;
6134  rx[x].mIcode = ry[y].mIcode;
6135 
6136  --x;
6137  --y;
6138  }
6139  }
6140  }
6141  catch (const std::exception &ex)
6142  {
6143  if (cif::VERBOSE == 1)
6144  printAlignment();
6145 
6146  throw;
6147  }
6148 
6149  // assign numbers to the residues that don't have them yet
6150  std::stack<int> unnumbered;
6151  for (x = 0; x < dimX; ++x)
6152  {
6153  if (rx[x].mSeqNum == kFlagSeqNr)
6154  {
6155  if (x > 0 and rx[x - 1].mSeqNum != kFlagSeqNr)
6156  rx[x].mSeqNum = rx[x - 1].mSeqNum + 1;
6157  else
6158  unnumbered.push(x);
6159  }
6160  }
6161 
6162  while (unnumbered.empty() == false)
6163  {
6164  x = unnumbered.top();
6165  if (x >= dimX - 1)
6166  throw std::runtime_error("Could not assign sequence numbers");
6167  rx[x].mSeqNum = rx[x + 1].mSeqNum - 1;
6168  unnumbered.pop();
6169  }
6170 
6171  return highY;
6172 }
6173 
6174 bool PDBFileParser::PDBChain::SameSequence(const PDBChain &rhs) const
6175 {
6176  bool result = mSeqres.size() == rhs.mSeqres.size();
6177 
6178  for (size_t i = 0; result and i < mSeqres.size(); ++i)
6179  result = mSeqres[i].mMonID == rhs.mSeqres[i].mMonID;
6180 
6181  return result;
6182 }
6183 
6184 // --------------------------------------------------------------------
6185 
6186 void ReadPDBFile(std::istream &pdbFile, cif::file &cifFile)
6187 {
6188  PDBFileParser p;
6189 
6190  cifFile.load_dictionary("mmcif_pdbx.dic");
6191 
6192  p.Parse(pdbFile, cifFile);
6193 
6194  if (not cifFile.is_valid() and cif::VERBOSE >= 0)
6195  std::cerr << "Resulting mmCIF file is not valid!" << std::endl;
6196 }
6197 
6198 // --------------------------------------------------------------------
6199 
6201 {
6202  file result;
6203 
6204  auto *buffer = is.rdbuf();
6205  if (buffer)
6206  {
6207  char ch = std::char_traits<char>::to_char_type(buffer->sgetc());
6208 
6209  // All PDB files should always start with a HEADER line
6210  // and so the very first character in a valid PDB file
6211  // is 'H'. It is as simple as that.
6212 
6213  if (ch == 'h' or ch == 'H')
6214  ReadPDBFile(is, result);
6215  else
6216  {
6217  try
6218  {
6219  result.load(is);
6220  }
6221  catch (const std::exception &ex)
6222  {
6223  std::throw_with_nested(std::runtime_error("Since the file did not start with a valid PDB HEADER line mmCIF was assumed, but that failed."));
6224  }
6225  }
6226  }
6227 
6228  // Must be a PDB like file, right?
6229  if (result.get_validator() == nullptr)
6230  result.load_dictionary("mmcif_pdbx.dic");
6231 
6232  return result;
6233 }
6234 
6235 file read(const std::filesystem::path &file)
6236 {
6237  try
6238  {
6239  gzio::ifstream in(file);
6240  if (not in.is_open())
6241  throw std::runtime_error("Could not open file " + file.string() + " for input");
6242 
6243  return read(in);
6244  }
6245  catch (const std::exception &ex)
6246  {
6247  throw_with_nested(std::runtime_error("Error reading file " + file.string()));
6248  }
6249 }
6250 
6251 } // namespace pdbx
uint32_t get_terminal_width()
Definition: utilities.cpp:110
void to_lower(std::string &s)
Definition: text.cpp:114
void min(Image< double > &op1, const Image< double > &op2)
std::string trim_copy(std::string_view s)
Definition: text.cpp:211
void ReadPDBFile(std::istream &pdbFile, cif::file &cifFile)
Definition: pdb2cif.cpp:6186
void replace_all(std::string &s, std::string_view what, std::string_view with)
Definition: text.cpp:134
uint32_t dim_n() const
Definition: pdb2cif.cpp:5922
const std::set< std::string > kSupportedRecords
Definition: pdb2cif.cpp:127
doublereal * c
bool operator==(faketype, faketype)
Definition: gtest.h:1366
void sqrt(Image< double > &op)
std::vector< SelLine >::iterator find(std::vector< SelLine > &text, const std::string &img_name)
Definition: selfile.cpp:553
HBITMAP buffer
Definition: svm-toy.cpp:37
static double * y
void trim(std::string &s)
Definition: text.cpp:205
file read(const std::filesystem::path &file)
Definition: pdb2cif.cpp:6235
const std::map< std::string, int > kMonths
Definition: pdb2cif.cpp:112
bool icontains(std::string_view s, std::string_view q)
Definition: text.cpp:143
bool operator!=(faketype, faketype)
Definition: gtest.h:1367
void compare(Image< double > &op1, const Image< double > &op2)
matrix(uint32_t m, uint32_t n, T v=T())
Definition: pdb2cif.cpp:5908
std::ostream & operator<<(std::ostream &os, const Message &sb)
bool operator==(const AtomRes &rhs) const
Definition: pdb2cif.cpp:696
bool iequals(std::string_view a, std::string_view b)
Definition: text.cpp:59
doublereal * x
#define i
ql0001_ & k(htemp+1),(cvec+1),(atemp+1),(bj+1),(bl+1),(bu+1),(x+1),(clamda+1), &iout, infoqp, &zero,(w+1), &lenw,(iw+1), &leniw, &glob_grd.epsmac
HWND edit
Definition: svm-toy.cpp:41
doublereal * d
bool operator!=(const AtomRes &rhs) const
Definition: pdb2cif.cpp:697
doublereal * b
std::string to_lower_copy(std::string_view s)
Definition: text.cpp:120
struct _constraint * cs
if(fabs(c[*nmax+ *nmax *c_dim1])==0.e0)
std::error_code make_error_code(pdbErrors e)
Definition: pdb2cif.cpp:90
viol type
int in
double * f
void max(Image< double > &op1, const Image< double > &op2)
free((char *) ob)
double z
uint32_t dim_m() const
Definition: pdb2cif.cpp:5921
int mt
int VERBOSE
Definition: utilities.cpp:58
basic_istream< char, std::char_traits< char > > istream
Definition: utilities.cpp:815
std::string trim_right_copy(std::string_view s)
Definition: text.cpp:163
std::string message(int value) const
Definition: pdb2cif.cpp:67
void sort(struct DCEL_T *dcel)
Definition: sorting.cpp:18
#define j
value_type operator()(uint32_t i, uint32_t j) const
Definition: pdb2cif.cpp:5924
int m
void Parse(std::istream &is, cif::file &result)
Definition: pdb2cif.cpp:5719
std::string cif_id_for_number(int number)
Definition: text.cpp:235
TYPE distance(struct Point_T *p, struct Point_T *q)
Definition: point.cpp:28
#define len
void trim_right(std::string &s)
Definition: text.cpp:148
double psi(const double x)
pdbErrors
Definition: pdb2cif.cpp:51
value_type & operator()(uint32_t i, uint32_t j)
Definition: pdb2cif.cpp:5931
float r2
std::string to_string(bond_type bondType)
Definition: compound.cpp:43
SpecificationListParser(const std::string &text)
Definition: pdb2cif.cpp:289
const char * name() const noexcept
Definition: pdb2cif.cpp:62
bool isWater(const std::string &resname)
Definition: pdb2cif.cpp:141
std::error_category & pdbCategory()
Definition: pdb2cif.cpp:84
int * n
int get_space_group_number(std::string spacegroup)
Definition: symmetry.cpp:43
doublereal * a
bool operator<(const SparseElement &_x, const SparseElement &_y)
std::tuple< std::string, std::string > GetNextSpecification()
Definition: pdb2cif.cpp:302
double * backup
float r1
check(nparam, nf, nfsr, &Linfty, nineq, nineqn, neq, neqn, ncsrl, ncsrn, mode, &modem, eps, bgbnd, param)