28 #include <cif++/pdb/pdb2cif.hpp> 29 #include <cif++/pdb/pdb2cif_remark_3.hpp> 30 #include <cif++/gzio.hpp> 62 const char *
name() const noexcept
72 return "Residue not found";
75 return "Invalid date";
78 return "Error in PDB format";
92 return std::error_code(static_cast<int>(e),
pdbCategory());
102 static const bool value =
true;
128 "HEADER",
"OBSLTE",
"TITLE ",
"SPLIT ",
"CAVEAT",
"COMPND",
"SOURCE",
129 "KEYWDS",
"EXPDTA",
"NUMMDL",
"MDLTYP",
"AUTHOR",
"REVDAT",
"SPRSDE",
130 "JRNL ",
"REMARK",
"DBREF ",
"DBREF1",
"DBREF2",
"SEQADV",
"SEQRES",
131 "MODRES",
"HET ",
"HETNAM",
"HETSYN",
"FORMUL",
"HELIX ",
"SHEET ",
132 "SSBOND",
"LINK ",
"CISPEP",
"SITE ",
"CRYST1",
"ORIGX1",
"SCALE1",
133 "MTRIX1",
"ORIGX2",
"SCALE2",
"MTRIX2",
"ORIGX3",
"SCALE3",
"MTRIX3",
134 "MODEL ",
"ATOM ",
"ANISOU",
"TER ",
"HETATM",
"ENDMDL",
"CONECT",
143 return resname ==
"HOH" or resname ==
"H2O" or resname ==
"OH2" or resname ==
"WAT" or resname ==
"DOD" or resname ==
"WAT";
151 PDBRecord::PDBRecord(uint32_t lineNr,
const std::string &
name,
const std::string &value)
154 , mVlen(value.length())
156 assert(name.length() <= 10);
158 strcpy(mName, name.c_str());
159 strcpy(mValue, value.c_str());
162 PDBRecord::~PDBRecord()
166 void *PDBRecord::operator
new(
size_t size,
size_t vLen)
168 return malloc(size + vLen + 1);
171 void PDBRecord::operator
delete(
void *p)
176 void PDBRecord::operator
delete(
void *p,
size_t vLen)
181 bool PDBRecord::is(
const char *name)
const 186 char PDBRecord::vC(
size_t column)
189 if (column - 7 < mVlen)
190 result = mValue[column - 7];
194 std::string PDBRecord::vS(
size_t columnFirst,
size_t columnLast)
198 if (columnLast > mVlen + 6)
199 columnLast = mVlen + 6;
201 if (columnFirst < mVlen + 7)
203 result = std::string{ mValue + columnFirst - 7, mValue + columnLast - 7 + 1 };
210 int PDBRecord::vI(
int columnFirst,
int columnLast)
214 const char *e = mValue + mVlen;
215 if (e > mValue + columnLast - 7 + 1)
216 e = mValue + columnLast - 7 + 1;
228 for (
const char *p = mValue + columnFirst - 7; p < e; ++p)
240 else if (isdigit(*p))
245 else if (not isspace(*p))
246 throw std::runtime_error(
"Not a valid integer in PDB record");
252 else if (not isdigit(*p))
253 throw std::runtime_error(
"Not a valid integer in PDB record");
255 result = result * 10 + *p -
'0';
260 throw std::runtime_error(
"Not a valid integer in PDB record");
265 catch (
const std::exception &ex)
268 std::cerr <<
"Trying to parse '" << std::string(mValue + columnFirst - 7, mValue + columnLast - 7) <<
'\'' << std::endl;
278 std::string PDBRecord::vF(
size_t columnFirst,
size_t columnLast)
281 return vS(columnFirst, columnLast);
295 std::tuple<std::string, std::string> GetNextSpecification();
299 std::string::iterator mP;
302 std::tuple<std::string, std::string> SpecificationListParser::GetNextSpecification()
304 std::string id, value;
306 std::string::iterator start = mP,
backup;
321 while (mP != mText.end() and state != eDone)
328 if (isalnum(ch)
or ch ==
'_')
335 else if (not isspace(ch))
338 std::cerr <<
"skipping invalid character in SOURCE ID: " << ch << std::endl;
343 if (isalnum(ch)
or ch ==
'_')
355 std::cerr <<
"Empty value for SOURCE: " <<
id << std::endl;
358 else if (not isspace(ch))
385 value.insert(value.end(),
backup, mP);
396 else if (isspace(ch))
408 else if (not(isalnum(ch)
or ch ==
'_'))
410 value.insert(value.end(),
backup, mP);
419 std::cerr <<
"Skipping invalid header line: '" << std::string(start, mP) << std::endl;
430 return std::make_tuple(
id, value);
446 PDBRecord *r = mData;
462 std::string PDBIDCode;
465 char insertBegin =
' ';
467 char insertEnd =
' ';
468 std::string database;
469 std::string dbAccession;
470 std::string dbIdCode;
486 std::vector<PDBRecord *> atoms;
487 bool processed =
false;
489 PDBRecord *asn =
nullptr;
491 HET(
const std::string &hetID,
char chainID,
int seqNum,
char iCode,
int numHetAtoms = 0,
const std::string &text = {})
496 , numHetAtoms(numHetAtoms)
509 std::vector<std::string> atoms;
523 return name == rhs.name and
524 resName == rhs.resName and
525 resSeq == rhs.resSeq and
526 (altLoc == rhs.altLoc
or altLoc ==
' ' or rhs.altLoc ==
' ') and
527 chainID == rhs.chainID and
536 bool operator<(
const ATOM_REF &rhs)
const 538 int d = chainID - rhs.chainID;
540 d = resSeq - rhs.resSeq;
542 d = iCode - rhs.iCode;
545 d = name.compare(rhs.name);
546 if (d == 0 and altLoc !=
' ' and rhs.altLoc !=
' ')
547 d = altLoc - rhs.altLoc;
551 friend std::ostream &
operator<<(std::ostream &os,
const ATOM_REF &
a)
553 os << a.name <<
' ' << a.resName <<
' ' << a.chainID <<
' ' << a.resSeq << (a.iCode ==
' ' ?
"" : std::string{ a.iCode }) << (a.altLoc !=
' ' ? std::string{
' ', a.altLoc } :
"");
561 std::string symOpA, symOpB;
572 class SUGAR_TREE :
public std::vector<SUGAR>
575 std::string entityName()
const 577 return empty() ?
"" : entityName(begin());
581 std::string entityName(const_iterator sugar)
const 585 for (
auto i = begin();
i != end(); ++
i)
587 if (
i->next != sugar->c1)
595 result +=
"-[" +
n +
']';
598 if (not result.empty() and result.back() !=
']')
601 auto compound = cif::compound_factory::instance().create(sugar->c1.resName);
603 result += compound->name();
604 else if (sugar->c1.resName ==
"MAN")
605 result +=
"alpha-D-mannopyranose";
606 else if (sugar->c1.resName ==
"BMA")
607 result +=
"beta-D-mannopyranose";
608 else if (sugar->c1.resName ==
"NAG")
609 result +=
"2-acetamido-2-deoxy-beta-D-glucopyranose";
610 else if (sugar->c1.resName ==
"NDG")
611 result +=
"2-acetamido-2-deoxy-alpha-D-glucopyranose";
612 else if (sugar->c1.resName ==
"FUC")
613 result +=
"alpha-L-fucopyranose";
614 else if (sugar->c1.resName ==
"FUL")
615 result +=
"beta-L-fucopyranose";
617 result += sugar->c1.resName;
644 std::set<char> mChains;
645 std::map<std::string, std::string> mInfo;
646 std::map<std::string, std::string> mSource;
658 std::set<std::string> mAlts;
662 return mSeqNum == rhs.mSeqNum and mMonID == rhs.mMonID and mIcode == rhs.mIcode;
668 PDBChain(
const std::string &structureID,
char chainID,
int molID)
669 : mDbref{ structureID, chainID }
679 std::vector<PDBSeqRes> mSeqres, mHet;
699 std::vector<AtomRes> mResiduesSeen;
701 int AlignResToSeqRes();
702 bool SameSequence(
const PDBChain &rhs)
const;
707 PDBCompound &GetOrCreateCompound(
int molID)
709 auto i = std::find_if(mCompounds.begin(), mCompounds.end(), [molID](PDBCompound &comp) ->
bool 710 {
return comp.mMolID == molID; });
711 if (
i == mCompounds.end())
713 mCompounds.push_back(PDBCompound{ molID });
717 i = prev(mCompounds.end());
724 PDBChain &GetChainForID(
char chainID,
int numRes = 0)
726 auto i = std::find_if(mChains.begin(), mChains.end(), [chainID](PDBChain &ch) ->
bool 727 {
return ch.mDbref.chainID == chainID; });
729 if (
i == mChains.end())
733 for (
auto &cmp : mCompounds)
735 if (cmp.mChains.count(chainID) > 0)
742 mChains.emplace_back(mStructureID, chainID, molID);
744 i = prev(mChains.end());
750 void InsertChemComp(
const std::string &chemComp)
752 if (
find(mChemComp.begin(), mChemComp.end(), chemComp) == mChemComp.end())
753 mChemComp.push_back(chemComp);
756 void InsertAtomType(
const std::string &atomType)
758 if (
find(mAtomTypes.begin(), mAtomTypes.end(), atomType) == mAtomTypes.end())
759 mAtomTypes.push_back(atomType);
764 template <
typename Predicate>
765 PDBRecord *FindRecord(Predicate &&pred)
769 for (result = mData; result !=
nullptr; result = result->mNext)
778 PDBRecord *FindRecord(
const char *name)
780 return FindRecord([name](PDBRecord &rec) ->
bool 781 {
return rec.is(name); });
786 char vC(
size_t column)
const 788 return mRec->vC(column);
793 return mRec->vS(columnFirst, columnLast);
796 std::string vF(
size_t columnFirst,
size_t columnLast)
const 798 return mRec->vF(columnFirst, columnLast);
801 int vI(
int columnFirst,
int columnLast)
const 803 return mRec->vI(columnFirst, columnLast);
809 std::tuple<std::string, int, bool> MapResidue(
char chainID,
int resSeq,
char iCode)
const 811 auto key = std::make_tuple(chainID, resSeq, iCode);
815 return mChainSeq2AsymSeq.at(key);
817 catch (
const std::exception &ex)
819 throw_with_nested(std::runtime_error(std::string(
"Residue ") + chainID +
std::to_string(resSeq) + iCode +
" could not be mapped"));
823 std::tuple<std::string, int, bool> MapResidue(
char chainID,
int resSeq,
char iCode, std::error_code &ec)
const 825 auto key = std::make_tuple(chainID, resSeq, iCode);
827 std::tuple<std::string, int, bool> result;
829 if (not mChainSeq2AsymSeq.count(key))
833 std::cerr <<
"Residue " << chainID << resSeq << iCode <<
" could not be mapped" << std::endl;
836 result = mChainSeq2AsymSeq.at(key);
841 std::tuple<std::string, int, bool> MapResidue(
char chainID,
int resSeq,
char iCode,
const std::string &resName);
847 void GetNextRecord();
848 void Match(
const std::string &expected,
bool throwIfMissing);
851 void ParseCitation(
const std::string &
id);
858 void ParseRemark200();
859 void ParseRemark350();
861 void ParsePrimaryStructure();
862 void ParseHeterogen();
863 void ConstructEntities();
864 void ConstructSugarTrees(
int &asymNr);
865 void ParseSecondaryStructure();
866 void ParseConnectivtyAnnotation();
867 void ParseMiscellaneousFeatures();
868 void ParseCrystallographic();
869 void ParseCoordinateTransformation();
870 void ParseCoordinate(
int modelNr);
871 void ParseConnectivty();
872 void ParseBookkeeping();
876 category *getCategory(std::string name)
878 return &mDatablock[
name];
881 std::vector<std::string> SplitCSV(
const std::string &value);
883 std::string pdb2cifDate(std::string s, std::error_code &ec)
887 rx1(R
"((\d{2})-(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)-(\d{2}))"), 888 rx2(R"((JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)-(\d{2}))"); 892 if (regex_match(s, m, rx1))
894 int day = stoi(m[1].str());
895 auto mi =
kMonths.find(m[2].str());
897 throw std::runtime_error(
"Invalid month: '" + m[2].str() +
'\'');
898 int month = mi->second;
899 int year = 1900 + stoi(m[3].str());
903 std::stringstream ss;
904 ss << std::setw(4) << std::setfill(
'0') << year <<
'-' 905 << std::setw(2) << std::setfill(
'0') << month <<
'-' 906 << std::setw(2) << std::setfill(
'0') << day;
910 else if (regex_match(s, m, rx2))
912 auto mi =
kMonths.find(m[1].str());
914 throw std::runtime_error(
"Invalid month: '" + m[1].str() +
'\'');
915 int month = mi->second;
916 int year = 1900 + stoi(m[2].str());
920 s = cif::format(
"%04d-%02d", year, month).str();
925 catch (
const std::exception &ex)
928 std::cerr << ex.what() << std::endl;
935 std::string pdb2cifDate(std::string s)
938 auto result = pdb2cifDate(s, ec);
940 std::cerr <<
"Invalid date(" << s <<
"): " << ec.message() << std::endl;
944 std::string pdb2cifAuth(std::string author)
948 const std::regex rx(R
"(((?:[A-Z]+\.)+)(.+))"); 950 if (regex_match(author, m, rx))
951 author = m[2].str() +
", " + m[1].str();
954 for (
auto &
c : author)
956 if (ispunct(
c)
or isspace(
c))
967 std::string pdb2cifSymmetry(std::string s)
969 static const std::regex sgRx(R
"((\d{1,3})(\d{3}))"); 974 if (not std::regex_match(s, m, sgRx))
975 throw std::runtime_error(
"invalid symmetry value '" + s +
'\'');
977 s = m[1].str() +
"_" + m[2].str();
983 std::string pdb2cifCharge(std::string
c)
985 std::regex rx(R
"((\d+)(\+|-))"); 988 if (std::regex_match(c, m, rx))
990 if (m[2].str() ==
"-")
991 c =
'-' + m[1].str();
999 std::vector<char> altLocsForAtom(
char chainID,
int seqNum,
char iCode, std::string atomName);
1000 void MapChainID2AsymIDS(
char chainID, std::vector<std::string> &asymIds);
1002 std::tuple<ATOM_REF, bool> FindLink(
const std::string &name1,
const std::string &resName1,
int resSeq1,
char altLoc1,
char chainID1,
char iCode1,
1003 const std::string &name2,
const std::string &resName2 =
"")
1005 return FindLink(ATOM_REF{ name1, resName1, resSeq1, altLoc1, chainID1, iCode1 }, name2, resName2);
1008 std::tuple<ATOM_REF, bool> FindLink(
const ATOM_REF &atom,
const std::string &name2,
const std::string &resName2 =
"")
const 1010 auto i = std::find_if(mLinks.begin(), mLinks.end(), [&](
const LINK &link)
1011 {
return (link.a == atom and link.b.name == name2 and (resName2.empty()
or link.b.resName == resName2))
or 1012 (link.b == atom and link.a.name == name2 and (resName2.empty()
or link.a.resName == resName2)); });
1014 if (
i != mLinks.end())
1015 return {
i->a == atom ?
i->b :
i->a,
true };
1024 cif::datablock mDatablock;
1026 std::string mStructureID;
1027 std::string mModelTypeDetails;
1028 std::string mOriginalDate;
1029 std::string mExpMethod =
"X-RAY DIFFRACTION";
1030 int mCitationAuthorNr = 1, mCitationEditorNr = 1;
1031 int mNextMolID = 1, mNextEntityNr = 1;
1032 int mNextSoftwareOrd = 1;
1036 std::string resName;
1040 std::string database;
1041 std::string dbAccession;
1044 std::string conflict;
1047 std::vector<SEQADV> mSeqadvs;
1049 std::list<PDBCompound> mCompounds;
1050 std::list<PDBChain> mChains;
1051 std::vector<HET> mHets;
1052 std::map<std::string, std::string> mHetnams;
1053 std::map<std::string, std::string> mHetsyns;
1054 std::map<std::string, std::string> mFormuls;
1055 std::string mWaterHetID;
1056 std::vector<std::string> mChemComp, mAtomTypes;
1058 std::map<std::string, std::string> mRemark200;
1059 std::string mRefinementSoftware;
1061 int mPdbxDifOrdinal = 0;
1063 std::vector<UNOBS> mUnobs;
1064 std::vector<LINK> mLinks;
1067 std::map<std::tuple<char, int, char>, std::tuple<std::string, int, bool>> mChainSeq2AsymSeq;
1069 std::map<int, std::string> mMolID2EntityID;
1070 std::map<std::string, std::string> mHet2EntityID;
1071 std::map<std::string, std::string> mBranch2EntityID;
1072 std::map<std::string, std::string> mAsymID2EntityID;
1073 std::map<std::string, std::string> mMod2parent;
1074 std::set<std::string> mSugarEntities;
1079 std::vector<char> PDBFileParser::altLocsForAtom(
char inChainID,
int inResSeq,
char inICode, std::string inAtomName)
1082 std::set<char> result;
1084 for (
auto r = mData; r !=
nullptr; r = r->mNext)
1086 if (r->is(
"ATOM ")
or r->is(
"HETATM"))
1088 std::string
name = r->vS(13, 16);
1089 char altLoc = r->vC(17);
1090 char chainID = r->vC(22);
1091 int resSeq = r->vI(23, 26);
1092 char iCode = r->vC(27);
1094 if (chainID == inChainID and resSeq == inResSeq and iCode == inICode and name == inAtomName and altLoc !=
' ')
1095 result.insert(altLoc);
1099 return { result.begin(), result.end() };
1102 void PDBFileParser::MapChainID2AsymIDS(
char chainID, std::vector<std::string> &asymIds)
1104 for (
const auto &[key, value] : mChainSeq2AsymSeq)
1106 if (std::get<0>(key) == chainID)
1107 asymIds.push_back(std::get<0>(value));
1110 std::sort(asymIds.begin(), asymIds.end(), [](
const std::string &
a,
const std::string &
b)
1112 int d =
static_cast<int>(a.length() -
b.length());
1117 asymIds.erase(std::unique(asymIds.begin(), asymIds.end()), asymIds.end());
1124 std::string lookahead;
1125 uint32_t lineNr = 1;
1126 getline(is, lookahead);
1128 if (lookahead.back() ==
'\r')
1129 lookahead.pop_back();
1134 auto contNr = [&lookahead](
int offset,
int len) ->
int 1136 std::string
cs = lookahead.substr(offset,
len);
1142 auto r = std::from_chars(cs.data(), cs.data() + cs.length(), result);
1143 if (r.ec != std::errc())
1144 throw std::runtime_error(
"Continuation std::string '" + cs +
"' is not valid");
1150 PDBRecord *last =
nullptr;
1151 std::set<std::string> dropped;
1155 if (lookahead.empty())
1161 std::cerr <<
"Line number " << lineNr <<
" is empty!" << std::endl;
1163 getline(is, lookahead);
1169 std::string
type = lookahead.substr(0, 6);
1171 if (lookahead.length() > 6)
1174 uint32_t curLineNr = lineNr;
1175 getline(is, lookahead);
1183 dropped.insert(type);
1191 if (type ==
"AUTHOR" or 1200 while (lookahead.substr(0, 6) == type and contNr(7, 3) ==
n)
1203 getline(is, lookahead);
1208 else if (type ==
"COMPND")
1212 while (lookahead.substr(0, 6) == type and contNr(7, 3) ==
n)
1216 getline(is, lookahead);
1221 else if (type ==
"REVDAT")
1223 int revNr = stoi(value.substr(1, 3));
1225 while (lookahead.substr(0, 6) == type and
1226 stoi(lookahead.substr(7, 3)) == revNr and
1229 value += lookahead.substr(38);
1230 getline(is, lookahead);
1235 else if (type ==
"CAVEAT")
1238 while (lookahead.substr(0, 6) == type and contNr(7, 3) ==
n)
1241 getline(is, lookahead);
1246 else if (type ==
"OBSLTE")
1248 while (lookahead.substr(0, 6) ==
type)
1250 value += lookahead.substr(31);
1251 getline(is, lookahead);
1255 else if (type ==
"SOURCE")
1259 while (lookahead.substr(0, 6) == type and contNr(7, 3) ==
n)
1263 getline(is, lookahead);
1268 else if (type ==
"FORMUL")
1275 compNr = stoi(value.substr(1, 3));
1277 catch (
const std::exception &ex)
1280 std::cerr <<
"Dropping FORMUL line (" << (lineNr - 1) <<
") with invalid component number '" << value.substr(1, 3) <<
'\'' << std::endl;
1288 while (lookahead.substr(0, 6) == type and
1289 stoi(lookahead.substr(7, 3)) == compNr and
1294 getline(is, lookahead);
1299 catch (
const std::invalid_argument &ex)
1305 catch (
const std::exception &ex)
1308 std::cerr <<
"Error parsing FORMUL at line " << lineNr << std::endl;
1312 else if (type ==
"HETNAM" or 1316 while (lookahead.substr(0, 6) == type and contNr(8, 2) ==
n)
1320 getline(is, lookahead);
1325 else if (type ==
"SITE ")
1327 std::string siteName = value.substr(5, 3);
1329 size_t n = value.length() - 12;
1330 value += std::string(11 - (n % 11),
' ');
1332 while (lookahead.substr(0, 6) == type and lookahead.substr(11, 3) == siteName)
1334 std::string s = lookahead.substr(18);
1336 s += std::string(11 - (s.length() % 11),
' ');
1341 getline(is, lookahead);
1345 else if (type ==
"REMARK")
1347 type += value.substr(0, 4);
1350 if (type ==
"REMARK 200" or type ==
"REMARK 240")
1352 auto i = value.find(
":");
1354 if (
i != std::string::npos)
1356 std::string
k = value.substr(4,
i - 4);
1357 std::string v = value.substr(
i + 1);
1360 while (k.find(
" ") != std::string::npos)
1365 mRemark200[
k] =
".";
1366 else if (not
iequals(v,
"NULL"))
1372 PDBRecord *cur =
new (value.length()) PDBRecord(curLineNr, type, value);
1374 if (last ==
nullptr)
1383 if (type ==
"LINK" or type ==
"LINKR")
1387 link.a.name = cur->vS(13, 16);
1388 link.a.altLoc = cur->vC(17);
1389 link.a.resName = cur->vS(18, 20);
1390 link.a.chainID = cur->vC(22);
1391 link.a.resSeq = cur->vI(23, 26);
1392 link.a.iCode = cur->vC(27);
1393 link.b.name = cur->vS(43, 46);
1394 link.b.altLoc = cur->vC(47);
1395 link.b.resName = cur->vS(48, 50);
1396 link.b.chainID = cur->vC(52);
1397 link.b.resSeq = cur->vI(53, 56);
1398 link.b.iCode = cur->vC(57);
1399 link.symOpA = cur->vS(60, 65);
1400 link.symOpB = cur->vS(67, 72);
1404 auto f = cur->vF(74, 78);
1405 auto r = cif::from_chars(
f.data(),
f.data() +
f.length(), link.distance);
1407 std::cerr <<
"Error parsing link distance at line " << cur->mLineNr << std::endl;
1411 mLinks.push_back(link);
1418 if (not dropped.empty())
1421 std::cerr <<
"Dropped unsupported records: " << cif::join(dropped,
", ") << std::endl;
1424 if (mData ==
nullptr)
1425 throw std::runtime_error(
"Empty file?");
1430 void PDBFileParser::GetNextRecord()
1432 if (mRec !=
nullptr)
1435 if (mRec ==
nullptr)
1437 static PDBRecord *end =
new (0) PDBRecord({ 0,
"END ",
"" });
1442 void PDBFileParser::Match(
const std::string &expected,
bool throwIfMissing)
1445 if (mRec->mName != expected)
1448 throw std::runtime_error(
"Expected record " + expected +
" but found " + mRec->mName);
1450 std::cerr <<
"Expected record " + expected +
" but found " + mRec->mName << std::endl;
1454 std::vector<std::string> PDBFileParser::SplitCSV(
const std::string &value)
1456 auto vs = cif::split<std::string>(value,
",");
1462 void PDBFileParser::ParseTitle()
1473 Match(
"HEADER",
false);
1475 std::string keywords;
1477 if (mRec->is(
"HEADER"))
1479 mStructureID = vS(63, 66);
1480 keywords = vS(11, 50);
1481 mOriginalDate = pdb2cifDate(vS(51, 59));
1489 if (mStructureID.empty())
1490 mStructureID =
"nohd";
1492 mDatablock.set_name(mStructureID);
1494 auto cat = getCategory(
"entry");
1497 {
"id", mStructureID } });
1500 if (mRec->is(
"OBSLTE"))
1509 std::string old = vS(22, 25);
1510 std::string date = pdb2cifDate(vS(12, 20));
1511 cat = getCategory(
"pdbx_database_PDB_obs");
1513 std::string value = mRec->vS(32);
1514 for (
auto i : cif::split<std::string>(value,
" ",
true))
1519 {
"replace_pdb_id", old },
1527 Match(
"TITLE ",
false);
1529 if (mRec->is(
"TITLE "))
1536 if (mRec->is(
"SPLIT "))
1542 throw std::runtime_error(
"SPLIT PDB files are not supported");
1547 while (mRec->is(
"CAVEAT"))
1549 getCategory(
"database_PDB_caveat")->emplace({
1550 {
"id", caveatID++ },
1551 {
"text", std::string{ mRec->vS(20) } }
1558 Match(
"COMPND",
false);
1564 std::string value{ mRec->vS(11) };
1565 if (value.find(
':') == std::string::npos)
1568 auto &comp = GetOrCreateCompound(1);
1569 comp.mInfo[
"MOLECULE"] = value;
1577 std::string key, val;
1583 if (not
iequals(key,
"MOL_ID") and mCompounds.empty())
1586 std::cerr <<
"Ignoring invalid COMPND record" << std::endl;
1590 if (key ==
"MOL_ID")
1592 auto &comp = GetOrCreateCompound(stoi(val));
1593 comp.mTitle = title;
1595 else if (key ==
"CHAIN")
1597 for (
auto c : cif::split<std::string>(val,
","))
1600 mCompounds.back().mChains.insert(
c[0]);
1604 mCompounds.back().mInfo[key] = val;
1608 if (mRec->is(
"COMPND"))
1612 Match(
"SOURCE",
false);
1614 if (mRec->is(
"SOURCE"))
1621 std::map<std::string, std::string> *source =
nullptr;
1641 std::string key, val;
1647 if (key ==
"MOL_ID")
1649 for (
auto &
c : mCompounds)
1651 if (
c.mMolID == stoi(val))
1653 source = &
c.mSource;
1661 if (source ==
nullptr)
1662 throw std::runtime_error(
"At line " +
std::to_string(mRec->mLineNr) +
": missing MOL_ID in SOURCE");
1664 (*source)[key] = val;
1671 Match(
"KEYWDS",
false);
1672 std::string pdbxKeywords;
1674 if (mRec->is(
"KEYWDS"))
1676 pdbxKeywords = vS(11);
1681 if (not(keywords.empty() and pdbxKeywords.empty()))
1683 getCategory(
"struct_keywords")->emplace({
1684 {
"entry_id", mStructureID },
1685 {
"pdbx_keywords", keywords },
1686 {
"text", pdbxKeywords } });
1690 Match(
"EXPDTA",
false);
1691 if (mRec->is(
"EXPDTA"))
1693 mExpMethod = vS(11);
1695 cat = getCategory(
"exptl");
1697 auto crystals = cif::split<std::string>(mRemark200[
"NUMBER OF CRYSTALS USED"],
"; ");
1698 if (crystals.empty())
1699 crystals.push_back(
"");
1700 auto ci = crystals.begin();
1702 for (
auto expMethod : cif::split<std::string>(mExpMethod,
";"))
1706 if (expMethod.empty())
1710 {
"entry_id", mStructureID },
1711 {
"method", expMethod },
1712 {
"crystals_number", ci != crystals.end() ? *ci :
"" } });
1719 if (mRec->is(
"NUMMDL"))
1722 std::cerr <<
"skipping unimplemented NUMMDL record" << std::endl;
1727 if (mRec->is(
"MDLTYP"))
1729 mModelTypeDetails = vS(11);
1734 Match(
"AUTHOR",
false);
1735 if (mRec->is(
"AUTHOR"))
1738 cat = getCategory(
"audit_author");
1740 value = { mRec->vS(11) };
1741 for (
auto author : cif::split<std::string>(value,
",",
true))
1744 {
"name", pdb2cifAuth(author) },
1745 {
"pdbx_ordinal", n } });
1753 bool firstRevDat =
true;
1757 std::string date, dateOriginal, replaces;
1759 std::vector<std::string> types;
1761 bool operator<(
const RevDat &rhs)
const {
return revNum < rhs.revNum; }
1763 std::vector<RevDat> revdats;
1765 while (mRec->is(
"REVDAT"))
1768 int revNum = vI(8, 10);
1770 std::string date = pdb2cifDate(vS(14, 22));
1773 std::string modID = vS(24, 27);
1775 int modType = vI(32, 32);
1778 std::string detail = vS(40);
1783 revdats.push_back({ revNum, date, modType == 0 ? mOriginalDate :
"", modID, modType });
1785 revdats.back().types = cif::split<std::string>(detail,
" ");
1789 cat = getCategory(
"database_2");
1791 {
"database_id",
"PDB" },
1792 {
"database_code", modID } });
1796 firstRevDat =
false;
1802 sort(revdats.begin(), revdats.end());
1803 for (
auto &revdat : revdats)
1805 getCategory(
"database_PDB_rev")->emplace({
1806 {
"num", revdat.revNum },
1807 {
"date", revdat.date },
1808 {
"date_original", revdat.dateOriginal },
1809 {
"replaces", revdat.replaces },
1810 {
"mod_type", revdat.modType } });
1812 for (
auto &
type : revdat.types)
1816 getCategory(
"database_PDB_rev_record")->emplace({
1817 {
"rev_num", revdat.revNum },
1818 {
"type",
type } });
1824 if (mRec->is(
"SPRSDE"))
1827 std::cerr <<
"skipping unimplemented SPRSDE record" << std::endl;
1832 if (mRec->is(
"JRNL "))
1833 ParseCitation(
"primary");
1836 void PDBFileParser::ParseCitation(
const std::string &
id)
1838 const char *rec = mRec->mName;
1840 std::string auth, titl,
edit, publ, refn, pmid, doi;
1841 std::string pubname, volume, astm, country, issn, csd;
1842 std::string pageFirst;
1845 auto extend = [](std::string &s,
const std::string &p)
1852 while (mRec->is(rec) and (
id ==
"primary" or vC(12) ==
' '))
1854 std::string
k = vS(13, 16);
1856 extend(auth, vS(20, 79));
1857 else if (k ==
"TITL")
1858 extend(titl, vS(20, 79));
1859 else if (k ==
"EDIT")
1860 extend(edit, vS(20, 79));
1861 else if (k ==
"REF")
1863 if (pubname.empty())
1865 extend(pubname, vS(20, 47));
1866 if (vS(50, 51) ==
"V.")
1868 pageFirst = vS(57, 61);
1872 extend(pubname, vS(20, 47));
1874 else if (k ==
"PUBL")
1875 extend(publ, vS(20, 70));
1876 else if (k ==
"REFN")
1878 if (vS(20, 23) ==
"ASTN")
1880 country = vS(33, 34);
1881 if (vS(36, 39) ==
"ISSN")
1884 else if (k ==
"PMID")
1886 else if (k ==
"DOI")
1892 auto cat = getCategory(
"citation");
1896 {
"journal_abbrev", pubname },
1897 {
"journal_volume", volume },
1898 {
"page_first", pageFirst },
1900 {
"journal_id_ASTM", astm },
1901 {
"country", country },
1902 {
"journal_id_ISSN", issn },
1903 {
"journal_id_CSD", csd },
1904 {
"book_publisher", publ },
1905 {
"pdbx_database_id_PubMed", pmid },
1906 {
"pdbx_database_id_DOI", doi } });
1908 if (not auth.empty())
1910 cat = getCategory(
"citation_author");
1911 for (
auto author : cif::split<std::string>(auth,
",",
true))
1914 {
"citation_id",
id },
1915 {
"name", pdb2cifAuth(author) },
1916 {
"ordinal", mCitationAuthorNr } });
1918 ++mCitationAuthorNr;
1922 if (not edit.empty())
1924 cat = getCategory(
"citation_editor");
1925 for (
auto editor : cif::split<std::string>(edit,
",",
true))
1928 {
"citation_id",
id },
1929 {
"name", pdb2cifAuth(editor) },
1930 {
"ordinal", mCitationEditorNr } });
1932 ++mCitationEditorNr;
1937 void PDBFileParser::ParseRemarks()
1939 std::string sequenceDetails, compoundDetails, sourceDetails;
1941 while (cif::starts_with(mRec->mName,
"REMARK"))
1943 int remarkNr = vI(8, 10);
1950 while (mRec->is(
"REMARK 1"))
1952 if (mRec->mVlen > 15 and vS(12, 20) ==
"REFERENCE")
1954 std::string
id = vS(22, 70);
1966 while (mRec->is(
"REMARK 3"))
1972 while (mRec->is(
"REMARK 4"))
1978 const std::regex rx(R
"(THE (\S+) ID CODE IS (\S+?)\.?\s*)"); 1980 std::string r = vS(12); 1982 if (std::regex_match(r, m, rx))
1984 auto cat = getCategory(
"database_2");
1986 {
"database_id", m[1].str() },
1987 {
"database_code", m[2].str() } });
1998 bool remark =
false;
2002 std::string r = mRec->vS(12);
2004 if (cif::starts_with(r,
"REMARK: "))
2006 mRemark200[
"REMARK"] = r.substr(8);
2014 mRemark200[
"REMARK"] += r;
2018 }
while (mRec->is(
"REMARK 200"));
2024 std::string density_Matthews, densityPercentSol, conditions;
2026 const std::regex rx1(R
"(SOLVENT CONTENT, VS +\(%\): *(.+))"), 2027 rx2(R"(MATTHEWS COEFFICIENT, VM \(ANGSTROMS\*\*3/DA\): *(.+))"); 2033 std::string r = vS(12);
2035 if (conditions.empty())
2037 if (std::regex_match(r, m, rx1))
2038 densityPercentSol = m[1].str();
2039 else if (std::regex_match(r, m, rx2))
2040 density_Matthews = m[1].str();
2041 else if (cif::starts_with(r,
"CRYSTALLIZATION CONDITIONS: "))
2042 conditions = r.substr(28);
2045 conditions = conditions +
' ' + r;
2048 }
while (mRec->is(
"REMARK 280"));
2050 std::string desc = mRemark200[
"REMARK"];
2054 getCategory(
"exptl_crystal")->emplace({
2056 {
"density_Matthews",
iequals(density_Matthews,
"NULL") ?
"" : density_Matthews },
2057 {
"density_percent_sol",
iequals(densityPercentSol,
"NULL") ?
"" : densityPercentSol },
2058 {
"description", desc } });
2061 const std::regex rx3(R
"(TEMPERATURE +(\d+)K)"), rx4(R"(PH *(?:: *)?(\d+(?:\.\d+)?))") ;
2063 std::string temp, ph, method;
2065 for (
auto s : cif::split<std::string>(conditions,
",",
true))
2069 if (std::regex_search(s, m, rx3))
2071 if (std::regex_search(s, m, rx4))
2073 if (s.length() < 60 and
2076 if (not method.empty())
2077 method = method +
", " + s;
2083 if (not(method.empty() and temp.empty() and ph.empty() and (conditions.empty()
or conditions ==
"NULL")))
2085 getCategory(
"exptl_crystal_grow")->emplace({
2086 {
"crystal_id", 1 },
2087 {
"method", method },
2090 {
"pdbx_details", conditions } });
2102 for (; mRec->is(
"REMARK 350"); GetNextRecord())
2108 std::stringstream s;
2110 if (vS(12) ==
"COMPOUND")
2113 while (mRec->is(
"REMARK 400"))
2115 s << vS(12) << std::endl;
2119 compoundDetails = s.str();
2125 std::stringstream s;
2127 if (vS(12) ==
"SOURCE")
2130 while (mRec->is(
"REMARK 450"))
2132 s << vS(12) << std::endl;
2136 sourceDetails = s.str();
2142 bool headerSeen =
false;
2143 std::regex rx(R
"( *MODELS *(\d+)-(\d+))"); 2144 int models[2] = { -1, -1 };
2146 for (; mRec->is(
"REMARK 465"); GetNextRecord())
2150 std::string line = vS(12);
2153 if (std::regex_match(line, m, rx))
2155 models[0] = std::stoi(m[1].str());
2156 models[1] = stoi(m[2].str());
2159 headerSeen = cif::contains(line,
"RES C SSSEQI");
2163 if (models[0] == models[1])
2164 models[0] = models[1] = vI(12, 14);
2166 std::string res = vS(16, 18);
2167 char chain = vC(20);
2168 int seq = vI(22, 26);
2169 char iCode = vC(27);
2171 for (
int modelNr = models[0]; modelNr <= models[1]; ++modelNr)
2172 mUnobs.push_back({ modelNr, res, chain, seq, iCode });
2180 bool headerSeen =
false;
2181 std::regex rx(R
"( *MODELS *(\d+)-(\d+))"); 2182 int models[2] = { -1, -1 };
2184 for (; mRec->is(
"REMARK 470"); GetNextRecord())
2188 std::string line = vS(12);
2191 if (std::regex_match(line, m, rx))
2193 models[0] = stoi(m[1].str());
2194 models[1] = stoi(m[2].str());
2197 headerSeen = cif::contains(line,
"RES CSSEQI ATOMS");
2201 if (models[0] == models[1])
2202 models[0] = models[1] = vI(12, 14);
2204 std::string res = vS(16, 18);
2205 char chain = vC(20);
2206 int seq = vI(21, 24);
2207 char iCode = vC(25);
2209 std::string atomStr = mRec->vS(29);
2210 auto atoms = cif::split<std::string>(atomStr,
" ",
true);
2212 for (
int modelNr = models[0]; modelNr <= models[1]; ++modelNr)
2213 mUnobs.push_back({ modelNr, res, chain, seq, iCode, atoms });
2236 bool headerSeen =
false;
2239 for (; mRec->is(
"REMARK 500"); GetNextRecord())
2241 std::string line = vS(12);
2243 if (line ==
"GEOMETRY AND STEREOCHEMISTRY")
2250 if (line.empty()
or not cif::starts_with(line,
"SUBTOPIC: "))
2253 std::string subtopic = line.substr(10);
2255 if (subtopic ==
"CLOSE CONTACTS IN SAME ASYMMETRIC UNIT")
2257 else if (subtopic ==
"CLOSE CONTACTS")
2259 else if (subtopic ==
"COVALENT BOND LENGTHS")
2261 else if (subtopic ==
"COVALENT BOND ANGLES")
2263 else if (subtopic ==
"TORSION ANGLES")
2265 else if (subtopic ==
"NON-CIS, NON-TRANS")
2267 else if (subtopic ==
"PLANAR GROUPS")
2269 else if (subtopic ==
"MAIN CHAIN PLANARITY")
2271 else if (subtopic ==
"CHIRAL CENTERS")
2274 throw std::runtime_error(
"Unknown subtopic in REMARK 500: " + subtopic);
2285 line ==
"ATM1 RES C SSEQI ATM2 RES C SSEQI DISTANCE";
2286 else if (line.empty())
2290 std::string atom1 = vS(13, 16);
2291 std::string res1 = vS(19, 21);
2292 std::string alt1 = vS(17, 17);
2293 char chain1 = vC(23);
2294 int seq1 = vI(25, 29);
2295 std::string iCode1 = vS(30, 30);
2297 std::string atom2 = vS(34, 37);
2298 std::string alt2 = vS(38, 38);
2299 std::string res2 = vS(40, 42);
2300 char chain2 = vC(44);
2301 int seq2 = vI(46, 50);
2302 std::string iCode2 = vS(51, 51);
2306 getCategory(
"pdbx_validate_close_contact")->emplace({
2308 {
"PDB_model_num", 1 },
2309 {
"auth_atom_id_1", atom1 },
2310 {
"auth_asym_id_1", std::string{ chain1 } },
2311 {
"auth_comp_id_1", res1 },
2312 {
"auth_seq_id_1", seq1 },
2313 {
"PDB_ins_code_1", iCode1 },
2314 {
"label_alt_id_1", alt1 },
2315 {
"auth_atom_id_2", atom2 },
2316 {
"auth_asym_id_2", std::string{ chain2 } },
2317 {
"auth_comp_id_2", res2 },
2318 {
"auth_seq_id_2", seq2 },
2319 {
"PDB_ins_code_2", iCode2 },
2320 {
"label_alt_id_2", alt2 },
2321 {
"dist", distance } });
2329 headerSeen = line ==
"ATM1 RES C SSEQI ATM2 RES C SSEQI SSYMOP DISTANCE";
2330 else if (line.empty())
2334 std::string atom1 = vS(13, 16);
2335 std::string res1 = vS(19, 21);
2336 char chain1 = vC(23);
2337 int seq1 = vI(25, 29);
2339 std::string atom2 = vS(34, 37);
2340 std::string res2 = vS(40, 42);
2341 char chain2 = vC(44);
2342 int seq2 = vI(46, 50);
2347 symop = pdb2cifSymmetry(vS(54, 59));
2349 catch (
const std::exception &ex)
2352 std::cerr <<
"Dropping REMARK 500 at line " << mRec->mLineNr <<
" due to invalid symmetry operation" << std::endl;
2358 getCategory(
"pdbx_validate_symm_contact")->emplace({
2360 {
"PDB_model_num", 1 },
2361 {
"auth_atom_id_1", atom1 },
2362 {
"auth_asym_id_1", std::string{ chain1 } },
2363 {
"auth_comp_id_1", res1 },
2364 {
"auth_seq_id_1", seq1 },
2367 {
"site_symmetry_1",
"1_555" },
2368 {
"auth_atom_id_2", atom2 },
2369 {
"auth_asym_id_2", std::string{ chain2 } },
2370 {
"auth_comp_id_2", res2 },
2371 {
"auth_seq_id_2", seq2 },
2374 {
"site_symmetry_2", symop },
2375 {
"dist", distance } });
2384 if (cif::starts_with(line,
"FORMAT: ") and line !=
"FORMAT: (10X,I3,1X,2(A3,1X,A1,I4,A1,1X,A4,3X),1X,F6.3)")
2385 throw std::runtime_error(
"Unexpected format in REMARK 500");
2387 headerSeen = line ==
"M RES CSSEQI ATM1 RES CSSEQI ATM2 DEVIATION";
2389 else if (line.empty())
2393 int model = vI(11, 13);
2394 std::string resNam1 = vS(15, 17);
2395 std::string chainID1{ vC(19) };
2396 int seqNum1 = vI(20, 23);
2397 std::string iCode1{ vC(24) };
2398 std::string alt1 = vS(30, 30);
2399 std::string atm1 = vS(26, 29);
2401 std::string resNam2 = vS(33, 35);
2402 std::string chainID2{ vC(37) };
2403 int seqNum2 = vI(38, 41);
2404 std::string iCode2{ vC(42) };
2405 std::string alt2 = vS(48, 48);
2406 std::string atm2 = vS(44, 47);
2408 std::string deviation = vF(51, 57);
2415 getCategory(
"pdbx_validate_rmsd_bond")->emplace({
2417 {
"PDB_model_num", model ? model : 1 },
2418 {
"auth_atom_id_1", atm1 },
2419 {
"auth_asym_id_1", chainID1 },
2420 {
"auth_comp_id_1", resNam1 },
2421 {
"auth_seq_id_1", seqNum1 },
2422 {
"PDB_ins_code_1", iCode1 },
2423 {
"label_alt_id_1", alt1 },
2424 {
"auth_atom_id_2", atm2 },
2425 {
"auth_asym_id_2", chainID2 },
2426 {
"auth_comp_id_2", resNam2 },
2427 {
"auth_seq_id_2", seqNum2 },
2428 {
"PDB_ins_code_2", iCode2 },
2429 {
"label_alt_id_2", alt2 },
2430 {
"bond_deviation", deviation } });
2439 if (cif::starts_with(line,
"FORMAT: ") and line !=
"FORMAT: (10X,I3,1X,A3,1X,A1,I4,A1,3(1X,A4,2X),12X,F5.1)")
2440 throw std::runtime_error(
"Unexpected format in REMARK 500");
2442 headerSeen = line ==
"M RES CSSEQI ATM1 ATM2 ATM3";
2444 else if (line.empty())
2446 else if (vS(64) ==
"DEGREES")
2448 int model = vI(11, 13);
2449 std::string resNam = vS(15, 17);
2450 std::string chainID{ vC(19) };
2451 int seqNum = vI(20, 23);
2452 std::string iCode{ vC(24) };
2457 std::string atoms[3] = { vS(27, 30), vS(34, 37), vS(41, 44) };
2458 std::string deviation = vF(57, 62);
2459 if (deviation ==
"*****")
2462 getCategory(
"pdbx_validate_rmsd_angle")->emplace({
2464 {
"PDB_model_num", model ? model : 1 },
2465 {
"auth_atom_id_1", atoms[0] },
2466 {
"auth_asym_id_1", chainID },
2467 {
"auth_comp_id_1", resNam },
2468 {
"auth_seq_id_1", seqNum },
2469 {
"PDB_ins_code_1", iCode },
2470 {
"auth_atom_id_2", atoms[1] },
2471 {
"auth_asym_id_2", chainID },
2472 {
"auth_comp_id_2", resNam },
2473 {
"auth_seq_id_2", seqNum },
2474 {
"PDB_ins_code_2", iCode },
2475 {
"auth_atom_id_3", atoms[2] },
2476 {
"auth_asym_id_3", chainID },
2477 {
"auth_comp_id_3", resNam },
2478 {
"auth_seq_id_3", seqNum },
2479 {
"PDB_ins_code_3", iCode },
2480 {
"angle_deviation", deviation } });
2488 if (cif::starts_with(line,
"FORMAT: ") and line !=
"FORMAT:(10X,I3,1X,A3,1X,A1,I4,A1,4X,F7.2,3X,F7.2)")
2489 throw std::runtime_error(
"Unexpected format in REMARK 500");
2491 headerSeen = line ==
"M RES CSSEQI PSI PHI";
2493 else if (line.empty())
2497 int model = vI(11, 13);
2498 std::string resNam = vS(15, 17);
2499 std::string chainID{ vC(19) };
2500 int seqNum = vI(20, 23);
2501 std::string iCode{ vC(24) };
2506 std::string
psi = vF(27, 35);
2507 std::string phi = vF(37, 45);
2509 getCategory(
"pdbx_validate_torsion")->emplace({
2511 {
"PDB_model_num", model ? model : 1 },
2512 {
"auth_comp_id", resNam },
2513 {
"auth_asym_id", chainID },
2514 {
"auth_seq_id", seqNum },
2515 {
"PDB_ins_code", iCode },
2523 headerSeen = line ==
"MODEL OMEGA";
2524 else if (line.empty())
2528 int model = vI(45, 48);
2530 std::string resNam1 = vS(12, 14);
2531 std::string chainID1{ vC(16) };
2532 int seqNum1 = vI(17, 21);
2533 std::string iCode1{ vC(22) };
2538 std::string resNam2 = vS(27, 29);
2539 std::string chainID2{ vC(31) };
2540 int seqNum2 = vI(32, 36);
2541 std::string iCode2{ vC(37) };
2546 std::string omega = vF(54, 60);
2548 getCategory(
"pdbx_validate_peptide_omega")->emplace({
2550 {
"PDB_model_num", model ? model : 1 },
2551 {
"auth_comp_id_1", resNam1 },
2552 {
"auth_asym_id_1", chainID1 },
2553 {
"auth_seq_id_1", seqNum1 },
2554 {
"PDB_ins_code_1", iCode1 },
2555 {
"auth_comp_id_2", resNam2 },
2556 {
"auth_asym_id_2", chainID2 },
2557 {
"auth_seq_id_2", seqNum2 },
2558 {
"PDB_ins_code_2", iCode2 },
2559 {
"omega", omega } });
2565 headerSeen = line ==
"M RES CSSEQI RMS TYPE";
2566 else if (line.empty())
2570 int model = vI(11, 13);
2571 std::string resNam = vS(15, 17);
2572 std::string chainID{ vC(19) };
2573 int seqNum = vI(20, 23);
2574 std::string iCode{ vC(24) };
2579 std::string rmsd = vF(32, 36);
2580 std::string
type = vS(41);
2582 getCategory(
"pdbx_validate_planes")->emplace({
2584 {
"PDB_model_num", model ? model : 1 },
2585 {
"auth_comp_id", resNam },
2586 {
"auth_asym_id", chainID },
2587 {
"auth_seq_id", seqNum },
2588 {
"PDB_ins_code", iCode },
2590 {
"type", type } });
2605 bool headerSeen =
false;
2607 for (; mRec->is(
"REMARK 610"); GetNextRecord())
2611 std::string line = vS(12);
2612 headerSeen = cif::contains(line,
"RES C SSEQI");
2616 int modelNr = vI(12, 14);
2619 std::string res = vS(16, 18);
2620 char chain = vC(20);
2621 int seq = vI(22, 25);
2622 char iCode = vC(26);
2624 auto compound = cif::compound_factory::instance().create(res);
2625 if (compound ==
nullptr)
2628 std::vector<std::string> atoms;
2629 for (
auto atom : compound->atoms())
2631 if (atom.type_symbol != cif::H)
2632 atoms.push_back(atom.id);
2635 mUnobs.push_back({ modelNr, res, chain, seq, iCode, { atoms } });
2643 const std::regex rx1(R
"(SITE_IDENTIFIER: (.+))"), 2644 rx2(R"(EVIDENCE_CODE: (.+))"), 2645 rx3(R"(SITE_DESCRIPTION: (binding site for residue ([[:alnum:]]{1,3}) ([[:alnum:]]) (\d+)|.+))", std::regex_constants::icase); 2647 std::string id, evidence, desc; 2648 std::string pdbxAuthAsymID, pdbxAuthCompID, pdbxAuthSeqID, pdbxAuthInsCode; 2663 auto site = FindRecord([
id](PDBRecord &r) ->
bool 2664 {
return r.is(
"SITE ") and r.vS(12, 14) == id; });
2666 if (site ==
nullptr)
2667 throw std::runtime_error(
"Invalid REMARK 800, no SITE record for id " +
id);
2670 getCategory(
"struct_site")->emplace({
2672 {
"details", desc },
2673 {
"pdbx_auth_asym_id", pdbxAuthAsymID },
2674 {
"pdbx_auth_comp_id", pdbxAuthCompID },
2675 {
"pdbx_auth_seq_id", pdbxAuthSeqID },
2676 {
"pdbx_num_residues", site->vI(16, 17) },
2677 {
"pdbx_evidence_code", evidence } });
2680 for (; mRec->is(
"REMARK 800"); GetNextRecord())
2682 std::string s = mRec->vS(12);
2692 throw std::runtime_error(
"Invalid REMARK 800 record, expected SITE");
2696 if (std::regex_match(s, m, rx1))
2702 throw std::runtime_error(
"Invalid REMARK 800 record, expected SITE_IDENTIFIER");
2706 if (regex_match(s, m, rx2))
2708 evidence = m[1].str();
2712 throw std::runtime_error(
"Invalid REMARK 800 record, expected SITE_IDENTIFIER");
2716 if (regex_match(s, m, rx3))
2719 pdbxAuthCompID = m[2].str();
2720 pdbxAuthAsymID = m[3].str();
2721 pdbxAuthSeqID = m[4].str();
2728 if (regex_match(s, m, rx1))
2738 desc = desc +
' ' + s;
2751 std::stringstream s;
2753 if (vS(12) ==
"SEQUENCE")
2756 while (mRec->is(
"REMARK 999"))
2758 s << vS(12) << std::endl;
2762 sequenceDetails = s.str();
2777 std::string skipped = mRec->mName;
2779 std::stringstream s;
2781 if (not mRec->vS(11).empty())
2782 s << mRec->vS(11) << std::endl;
2785 while (mRec->is(skipped.c_str()))
2787 s << mRec->vS(11) << std::endl;
2791 getCategory(
"pdbx_database_remark")->emplace({
2793 {
"text", s.str() } });
2799 catch (
const std::exception &ex)
2801 std::throw_with_nested(std::runtime_error(
"Error parsing REMARK " +
std::to_string(remarkNr)));
2805 if (not(compoundDetails.empty() and sequenceDetails.empty() and sourceDetails.empty()))
2807 getCategory(
"pdbx_entry_details")->emplace({
2808 {
"entry_id", mStructureID },
2809 {
"compound_details", compoundDetails },
2810 {
"sequence_details", sequenceDetails },
2811 {
"source_details", sourceDetails } });
2815 if (not mRemark200.empty())
2819 void PDBFileParser::ParseRemark200()
2821 auto rm200 = [&](
const char *
name,
int diffrnNr) -> std::string
2826 for (
auto s : cif::split<std::string>(mRemark200[name],
";"))
2828 if (++nr != diffrnNr)
2836 result = std::move(s);
2843 auto inRM200 = [
this](std::initializer_list<const char *> s) ->
bool 2845 bool result =
false;
2849 if (not this->mRemark200[
n].empty())
2878 {
"data reduction",
"INTENSITY-INTEGRATION SOFTWARE" },
2879 {
"data scaling",
"DATA SCALING SOFTWARE" },
2880 {
"phasing",
"SOFTWARE USED" },
2883 for (
auto &sw : kSWMap)
2885 if (mRemark200[sw.b].empty())
2888 getCategory(
"software")->emplace({
2889 {
"name", mRemark200[sw.b] },
2890 {
"classification", sw.a },
2892 {
"pdbx_ordinal", mNextSoftwareOrd++ } });
2895 std::string scatteringType;
2896 if (mRemark200[
"EXPERIMENT TYPE"] ==
"X-RAY DIFFRACTION")
2897 scatteringType =
"x-ray";
2898 else if (mRemark200[
"EXPERIMENT TYPE"] ==
"NEUTRON DIFFRACTION")
2899 scatteringType =
"neutron";
2901 std::set<std::string> diffrnWaveLengths;
2903 for (
int diffrnNr = 1;; ++diffrnNr)
2905 std::string ambientTemp = rm200(
"TEMPERATURE (KELVIN)", diffrnNr);
2906 if (ambientTemp.empty())
2909 if (cif::ends_with(ambientTemp,
"K"))
2910 ambientTemp.erase(ambientTemp.length() - 1, 1);
2912 getCategory(
"diffrn")->emplace({
2914 {
"ambient_temp", ambientTemp },
2916 {
"crystal_id", 1 } });
2918 std::string collectionDate;
2920 collectionDate = pdb2cifDate(rm200(
"DATE OF DATA COLLECTION", diffrnNr), ec);
2924 std::cerr << ec.message() <<
" for pdbx_collection_date" << std::endl;
2928 collectionDate.clear();
2931 getCategory(
"diffrn_detector")->emplace({
2932 {
"diffrn_id", diffrnNr },
2933 {
"detector", rm200(
"DETECTOR TYPE", diffrnNr) },
2934 {
"type", rm200(
"DETECTOR MANUFACTURER", diffrnNr) },
2935 {
"pdbx_collection_date", collectionDate },
2936 {
"details", rm200(
"OPTICS", diffrnNr) } });
2938 if (inRM200({
"MONOCHROMATIC OR LAUE (M/L)",
"MONOCHROMATOR",
"DIFFRACTION PROTOCOL" })
or not scatteringType.empty())
2939 getCategory(
"diffrn_radiation")->emplace({
2940 {
"diffrn_id", diffrnNr },
2941 {
"wavelength_id", 1 },
2942 {
"pdbx_monochromatic_or_laue_m_l", rm200(
"MONOCHROMATIC OR LAUE (M/L)", diffrnNr) },
2943 {
"monochromator", rm200(
"MONOCHROMATOR", diffrnNr) },
2944 {
"pdbx_diffrn_protocol", rm200(
"DIFFRACTION PROTOCOL", diffrnNr) },
2945 {
"pdbx_scattering_type", scatteringType } });
2947 std::string wl = rm200(
"WAVELENGTH OR RANGE (A)", diffrnNr);
2948 auto wavelengths = cif::split<std::string>(wl,
", -",
true);
2950 diffrnWaveLengths.insert(wavelengths.begin(), wavelengths.end());
2953 if (rm200(
"SYNCHROTRON (Y/N)", diffrnNr) ==
"Y")
2955 getCategory(
"diffrn_source")->emplace({
2956 {
"diffrn_id", diffrnNr },
2957 {
"source",
"SYNCHROTRON" },
2958 {
"type", rm200(
"RADIATION SOURCE", diffrnNr) +
" BEAMLINE " + rm200(
"BEAMLINE", diffrnNr) },
2959 {
"pdbx_synchrotron_site", rm200(
"RADIATION SOURCE", diffrnNr) },
2960 {
"pdbx_synchrotron_beamline", rm200(
"BEAMLINE", diffrnNr) },
2962 {
"pdbx_wavelength", wavelengths.size() == 1 ? wavelengths[0] :
"" },
2963 {
"pdbx_wavelength_list", wavelengths.size() == 1 ?
"" : cif::join(wavelengths,
", ") },
2966 else if (inRM200({
"X-RAY GENERATOR MODEL",
"RADIATION SOURCE",
"BEAMLINE",
"WAVELENGTH OR RANGE (A)" }))
2968 getCategory(
"diffrn_source")->emplace({
2969 {
"diffrn_id", diffrnNr },
2970 {
"source", rm200(
"RADIATION SOURCE", diffrnNr) },
2971 {
"type", rm200(
"X-RAY GENERATOR MODEL", diffrnNr) },
2973 {
"pdbx_wavelength", wavelengths.size() == 1 ? wavelengths[0] :
"" },
2974 {
"pdbx_wavelength_list", wavelengths.size() == 1 ?
"" : cif::join(wavelengths,
", ") },
2979 int wavelengthNr = 1;
2980 for (
auto wl : diffrnWaveLengths)
2982 if (cif::ends_with(wl,
"A"))
2983 wl.erase(wl.length() - 1, 1);
2985 getCategory(
"diffrn_radiation_wavelength")->emplace({
2986 {
"id", wavelengthNr++ },
2987 {
"wavelength", wl.empty() ?
"." : wl },
2991 if (inRM200({
"METHOD USED TO DETERMINE THE STRUCTURE",
"STARTING MODEL" }))
2993 auto cat = getCategory(
"refine");
2994 assert(cat->empty());
2996 std::string resolution = mRemark200[
"RESOLUTION RANGE HIGH (A)"];
2997 if (resolution.empty())
3001 {
"pdbx_method_to_determine_struct", mRemark200[
"METHOD USED TO DETERMINE THE STRUCTURE"] },
3002 {
"pdbx_starting_model", mRemark200[
"STARTING MODEL"] },
3003 {
"ls_d_res_high", resolution },
3004 {
"pdbx_diffrn_id", 1 },
3005 {
"pdbx_refine_id", mExpMethod },
3006 {
"entry_id", mStructureID } });
3009 if (inRM200({
"REJECTION CRITERIA (SIGMA(I))",
"RESOLUTION RANGE HIGH (A)",
"RESOLUTION RANGE LOW (A)",
"NUMBER OF UNIQUE REFLECTIONS",
"COMPLETENESS FOR RANGE (%)",
"<I/SIGMA(I)> FOR THE DATA SET",
"R MERGE (I)",
"R SYM (I)",
"DATA REDUNDANCY" }))
3011 auto cat = getCategory(
"reflns");
3013 {
"entry_id", mStructureID },
3014 {
"observed_criterion_sigma_I", mRemark200[
"REJECTION CRITERIA (SIGMA(I))"] },
3015 {
"d_resolution_high", mRemark200[
"RESOLUTION RANGE HIGH (A)"] },
3016 {
"d_resolution_low", mRemark200[
"RESOLUTION RANGE LOW (A)"] },
3017 {
"number_obs", mRemark200[
"NUMBER OF UNIQUE REFLECTIONS"] },
3018 {
"percent_possible_obs", mRemark200[
"COMPLETENESS FOR RANGE (%)"] },
3019 {
"pdbx_netI_over_sigmaI", mRemark200[
"<I/SIGMA(I)> FOR THE DATA SET"] },
3020 {
"pdbx_Rmerge_I_obs", mRemark200[
"R MERGE (I)"] },
3021 {
"pdbx_Rsym_value", mRemark200[
"R SYM (I)"] },
3022 {
"pdbx_redundancy", mRemark200[
"DATA REDUNDANCY"] },
3023 {
"pdbx_ordinal", 1 },
3024 {
"pdbx_diffrn_id", 1 }
3028 if (inRM200({
"HIGHEST RESOLUTION SHELL, RANGE HIGH (A)" }))
3030 getCategory(
"reflns_shell")->emplace({
3031 {
"d_res_high", mRemark200[
"HIGHEST RESOLUTION SHELL, RANGE HIGH (A)"] },
3032 {
"d_res_low", mRemark200[
"HIGHEST RESOLUTION SHELL, RANGE LOW (A)"] },
3033 {
"percent_possible_all", mRemark200[
"COMPLETENESS FOR SHELL (%)"] },
3034 {
"Rmerge_I_obs", mRemark200[
"R MERGE FOR SHELL (I)"] },
3035 {
"pdbx_Rsym_value", mRemark200[
"R SYM FOR SHELL (I)"] },
3036 {
"meanI_over_sigI_obs", mRemark200[
"<I/SIGMA(I)> FOR SHELL"] },
3037 {
"pdbx_redundancy", mRemark200[
"DATA REDUNDANCY IN SHELL"] },
3038 {
"pdbx_ordinal", 1 },
3039 {
"pdbx_diffrn_id", 1 } });
3041 else if (inRM200({
"HIGHEST RESOLUTION SHELL, RANGE LOW (A)",
"COMPLETENESS FOR SHELL (%)",
3042 "R MERGE FOR SHELL (I)",
"R SYM FOR SHELL (I)",
"<I/SIGMA(I)> FOR SHELL",
"DATA REDUNDANCY IN SHELL" }))
3045 std::cerr <<
"Not writing reflns_shell record since d_res_high is missing" << std::endl;
3049 void PDBFileParser::ParseRemark350()
3063 kRX1(R
"(BIOMOLECULE: (\d+))"), 3064 kRX2(R"(([^:]+): (.+?)(?: (ANGSTROM\*\*2|KCAL/MOL))?)"), 3065 kRX8(R"(APPLY THE FOLLOWING TO CHAINS: (.+))"), 3066 kRX9(R"(AND CHAINS: (.+))"), 3067 kRX10(R"(BIOMT([123])\s+(\d+)\s+(-?\d+(?:\.\d+)?)\s+(-?\d+(?:\.\d+)?)\s+(-?\d+(?:\.\d+)?)\s+(-?\d+(?:\.\d+)?))"); 3069 int biomolecule = 0, operID = 0;
3070 std::vector<std::string> operExpression;
3071 std::map<std::string, std::string> values;
3072 std::vector<std::string> asymIdList;
3074 cif::row_handle genR;
3076 std::vector<double> mat, vec;
3078 for (mRec = FindRecord(
"REMARK 350"); mRec !=
nullptr and mRec->is(
"REMARK 350"); GetNextRecord())
3080 std::string line = vS(11);
3085 if (regex_match(line, m, kRX1))
3087 biomolecule = stoi(m[1].str());
3093 if (regex_match(line, m, kRX8))
3097 std::string value = m[1].str();
3099 for (
auto chain : cif::split<std::string>(value,
", ",
true))
3107 if (chain.length() != 1)
3108 throw std::runtime_error(
"Invalid REMARK 350");
3110 MapChainID2AsymIDS(chain[0], asymIdList);
3113 else if (regex_match(line, m, kRX2))
3114 values[m[1].str()] = m[2].str();
3118 if (regex_match(line, m, kRX9))
3122 std::string value = m[1].str();
3124 for (
auto chain : cif::split<std::string>(value,
", ",
true))
3132 MapChainID2AsymIDS(chain[0], asymIdList);
3140 if (regex_match(line, m, kRX10))
3142 int mt = stoi(m[1].str());
3144 throw std::runtime_error(
"Invalid REMARK 350");
3146 operID = stoi(m[2].str());
3149 mat.push_back(stod(m[3].str()));
3150 mat.push_back(stod(m[4].str()));
3151 mat.push_back(stod(m[5].str()));
3152 vec.push_back(stod(m[6].str()));
3158 if (regex_match(line, m, kRX10))
3160 int mt = stoi(m[1].str());
3164 operID = stoi(m[2].str());
3167 else if (operID != stoi(m[2].str()))
3168 throw std::runtime_error(
"Invalid REMARK 350");
3170 mat.push_back(stod(m[3].str()));
3171 mat.push_back(stod(m[4].str()));
3172 mat.push_back(stod(m[5].str()));
3173 vec.push_back(stod(m[6].str()));
3177 if (vec.size() != 3
or mat.size() != 9)
3178 throw std::runtime_error(
"Invalid REMARK 350");
3182 std::string oligomer = values[
"AUTHOR DETERMINED BIOLOGICAL UNIT"];
3183 if (oligomer.empty())
3184 oligomer = values[
"SOFTWARE DETERMINED QUATERNARY STRUCTURE"];
3190 if (std::regex_match(oligomer, m2, std::regex(R
"((\d+)-meric)"))) 3192 count = stoi(m2[1].str()); 3194 else if (cif::ends_with(oligomer,
"meric"))
3196 std::string
cs = oligomer.substr(0, oligomer.length() - 5);
3199 else if (cs ==
"di")
3201 else if (cs ==
"tri")
3203 else if (cs ==
"tetra")
3205 else if (cs ==
"hexa")
3207 else if (cs ==
"octa")
3209 else if (cs ==
"dodeca")
3213 std::string details;
3214 if (values[
"AUTHOR DETERMINED BIOLOGICAL UNIT"].empty())
3216 if (not values[
"SOFTWARE DETERMINED QUATERNARY STRUCTURE"].empty())
3217 details =
"software_defined_assembly";
3219 else if (values[
"SOFTWARE DETERMINED QUATERNARY STRUCTURE"].empty())
3220 details =
"author_defined_assembly";
3222 details =
"author_and_software_defined_assembly";
3224 getCategory(
"pdbx_struct_assembly")->emplace({
3225 {
"id", biomolecule },
3226 {
"details", details },
3227 {
"method_details", values[
"SOFTWARE USED"] },
3228 {
"oligomeric_details", oligomer },
3229 {
"oligomeric_count", count > 0 ?
std::to_string(count) :
"" } });
3231 auto cat = getCategory(
"pdbx_struct_assembly_prop");
3233 if (not values[
"TOTAL BURIED SURFACE AREA"].empty())
3235 {
"biol_id", biomolecule },
3236 {
"type",
"ABSA (A^2)" },
3237 {
"value", values[
"TOTAL BURIED SURFACE AREA"] } });
3239 if (not values[
"CHANGE IN SOLVENT FREE ENERGY"].empty())
3241 {
"biol_id", biomolecule },
3243 {
"value", values[
"CHANGE IN SOLVENT FREE ENERGY"] } });
3245 if (not values[
"SURFACE AREA OF THE COMPLEX"].empty())
3247 {
"biol_id", biomolecule },
3248 {
"type",
"SSA (A^2)" },
3249 {
"value", values[
"SURFACE AREA OF THE COMPLEX"] } });
3254 std::string
type = mat == std::vector<double>{ 1, 0, 0, 0, 1, 0, 0, 0, 1 } and vec == std::vector<double>{ 0, 0, 0 } ?
"identity operation" :
"crystal symmetry operation";
3263 getCategory(
"pdbx_struct_oper_list")->emplace({
3268 {
"matrix[1][1]", cif::format(
"%12.10f", mat[0]).str() },
3269 {
"matrix[1][2]", cif::format(
"%12.10f", mat[1]).str() },
3270 {
"matrix[1][3]", cif::format(
"%12.10f", mat[2]).str() },
3271 {
"vector[1]", cif::format(
"%12.10f", vec[0]).str() },
3272 {
"matrix[2][1]", cif::format(
"%12.10f", mat[3]).str() },
3273 {
"matrix[2][2]", cif::format(
"%12.10f", mat[4]).str() },
3274 {
"matrix[2][3]", cif::format(
"%12.10f", mat[5]).str() },
3275 {
"vector[2]", cif::format(
"%12.10f", vec[1]).str() },
3276 {
"matrix[3][1]", cif::format(
"%12.10f", mat[6]).str() },
3277 {
"matrix[3][2]", cif::format(
"%12.10f", mat[7]).str() },
3278 {
"matrix[3][3]", cif::format(
"%12.10f", mat[8]).str() },
3279 {
"vector[3]", cif::format(
"%12.10f", vec[2]).str() } });
3281 catch (duplicate_key_error &ex)
3290 else if (regex_match(line, m, kRX1))
3292 if (not(vec.empty() and mat.empty()))
3293 throw std::runtime_error(
"Invalid REMARK 350");
3295 getCategory(
"pdbx_struct_assembly_gen")->emplace({
3296 {
"assembly_id", biomolecule },
3297 {
"oper_expression", cif::join(operExpression,
",") },
3298 {
"asym_id_list", cif::join(asymIdList,
",") } });
3300 biomolecule = stoi(m[1].str());
3302 operExpression.clear();
3310 if (not operExpression.empty())
3312 getCategory(
"pdbx_struct_assembly_gen")->emplace({
3313 {
"assembly_id", biomolecule },
3314 {
"oper_expression", cif::join(operExpression,
",") },
3315 {
"asym_id_list", cif::join(asymIdList,
",") } });
3321 void PDBFileParser::ParsePrimaryStructure()
3324 DBREF cur = { mStructureID };
3326 while (cif::starts_with(mRec->mName,
"DBREF"))
3328 if (mRec->is(
"DBREF "))
3330 cur.PDBIDCode = vS(8, 11);
3331 cur.chainID = vC(13);
3332 cur.seqBegin = vI(15, 18);
3334 cur.insertBegin = vC(19);
3336 cur.seqEnd = vI(21, 24);
3338 cur.insertEnd = vC(25);
3340 cur.database = vS(27, 32);
3341 cur.dbAccession = vS(34, 41);
3342 cur.dbIdCode = vS(43, 54);
3343 cur.dbSeqBegin = vI(56, 60);
3345 cur.dbinsBeg = vC(61);
3347 cur.dbSeqEnd = vI(63, 67);
3349 cur.dbinsEnd = vC(68);
3351 auto &chain = GetChainForID(cur.chainID);
3354 else if (mRec->is(
"DBREF1"))
3356 cur.PDBIDCode = vS(8, 11);
3357 cur.chainID = vC(13);
3358 cur.seqBegin = vI(15, 18);
3360 cur.insertBegin = vC(19);
3362 cur.seqEnd = vI(21, 24);
3364 cur.insertEnd = vC(25);
3366 cur.database = vS(27, 32);
3367 cur.dbIdCode = vS(48, 67);
3369 else if (mRec->is(
"DBREF2"))
3371 if (vC(13) != cur.chainID)
3372 throw std::runtime_error(
"Chain ID's for DBREF1/DBREF2 records do not match");
3373 cur.dbAccession = vS(19, 40);
3375 cur.dbSeqBegin = vI(46, 55);
3377 cur.dbSeqEnd = vI(58, 67);
3379 auto &chain = GetChainForID(cur.chainID);
3387 for (
auto &chain : mChains)
3389 chain.mNextSeqNum = chain.mDbref.seqBegin;
3390 chain.mNextDbSeqNum = chain.mDbref.dbSeqBegin;
3393 while (mRec->is(
"SEQADV"))
3395 mSeqadvs.push_back({
3411 while (mRec->is(
"SEQRES"))
3415 char chainID = vC(12);
3418 int numRes = vI(14, 17);
3420 std::string monomers = vS(20, 70);
3423 auto &chain = GetChainForID(chainID, numRes);
3425 for (
auto monID : cif::split<std::string>(monomers,
" ",
true))
3430 chain.mSeqres.push_back({ monID, chain.mNextSeqNum++,
' ', chain.mNextDbSeqNum++ });
3432 InsertChemComp(monID);
3439 while (mRec->is(
"MODRES"))
3441 std::string resName = vS(13, 15);
3445 std::string stdRes = vS(25, 27);
3448 mMod2parent[resName] = stdRes;
3454 void PDBFileParser::ParseHeterogen()
3456 while (mRec->is(
"HET "))
3458 std::string hetID = vS(8, 10);
3459 char chainID = vC(13);
3460 int seqNum = vI(14, 17);
3461 char iCode = vC(18);
3462 int numHetAtoms = vI(21, 25);
3464 std::string text = vS(31, 70);
3466 mHets.emplace_back(hetID, chainID, seqNum, iCode, numHetAtoms, text);
3473 if (mRec->is(
"HETNAM"))
3475 std::string hetID = vS(12, 14);
3476 std::string text = vS(16);
3478 mHetnams[hetID] = text;
3479 InsertChemComp(hetID);
3485 if (mRec->is(
"HETSYN"))
3487 std::string hetID = vS(12, 14);
3488 std::string syn = vS(16);
3490 mHetsyns[hetID] = syn;
3499 while (mRec->is(
"FORMUL"))
3501 std::string hetID = vS(13, 15);
3503 char waterMark = vC(19);
3504 std::string formula = vS(20);
3506 mFormuls[hetID] = formula;
3508 if (waterMark ==
'*')
3509 mWaterHetID = hetID;
3515 void PDBFileParser::ConstructEntities()
3523 typedef std::map<std::tuple<char, int, char, char>, std::string> CompTypeMap;
3524 CompTypeMap residuesSeen;
3526 for (
auto r = mData; r !=
nullptr; r = r->mNext)
3528 if (r->is(
"MODEL "))
3530 modelNr = r->vI(11, 14);
3536 if (r->is(
"ATOM ")
or r->is(
"HETATM"))
3538 std::string
name = r->vS(13, 16);
3539 char altLoc = r->vC(17);
3540 std::string resName = r->vS(18, 20);
3541 char chainID = r->vC(22);
3542 int resSeq = r->vI(23, 26);
3543 char iCode = r->vC(27);
3546 CompTypeMap::key_type
k = std::make_tuple(chainID, resSeq, iCode, altLoc);
3547 if (residuesSeen.count(k) == 0)
3548 residuesSeen[k] = resName;
3549 else if (residuesSeen[k] != resName)
3550 throw std::runtime_error(
"inconsistent residue type for " + std::string{ chainID } +
std::to_string(resSeq) + iCode + altLoc +
"\n" +
3551 " (" + residuesSeen[
k] +
" != " + resName +
")");
3553 auto &chain = GetChainForID(chainID);
3557 if ((chain.mResiduesSeen.empty()
or chain.mResiduesSeen.back() != ar) and
3558 (cif::compound_factory::instance().is_known_peptide(resName)
or cif::compound_factory::instance().is_known_base(resName)))
3560 chain.mResiduesSeen.push_back(ar);
3564 mUnobs.erase(remove_if(mUnobs.begin(), mUnobs.end(), [=](UNOBS &
a)
3566 bool result =
false;
3568 if (modelNr ==
a.modelNr and
3569 resName ==
a.res and
3570 chainID ==
a.chain and
3574 auto i =
find(
a.atoms.begin(),
a.atoms.end(),
name);
3575 if (
i !=
a.atoms.end())
3578 result =
a.atoms.empty();
3591 char chainID = r->vC(22);
3594 auto &chain = GetChainForID(chainID);
3595 if (chain.mTerIndex == 0)
3596 chain.mTerIndex =
static_cast<int>(chain.mResiduesSeen.size());
3602 mChains.erase(remove_if(mChains.begin(), mChains.end(), [](
auto &chain)
3603 {
return chain.mResiduesSeen.empty() and chain.mSeqres.empty(); }),
3606 for (
auto &chain : mChains)
3608 if (not(chain.mSeqres.empty()
or chain.mResiduesSeen.empty()))
3614 if (chain.mTerIndex > 0)
3615 chain.mResiduesSeen.erase(chain.mResiduesSeen.begin() + chain.mTerIndex, chain.mResiduesSeen.end());
3617 int lastResidueIndex = chain.AlignResToSeqRes();
3619 if (lastResidueIndex > 0 and lastResidueIndex + 1 < static_cast<int>(chain.mResiduesSeen.size()))
3621 auto &r = chain.mResiduesSeen[lastResidueIndex + 1];
3625 std::cerr <<
"Detected residues that cannot be aligned to SEQRES" << std::endl
3626 <<
"First residue is " << chain.mDbref.chainID <<
':' << r.mSeqNum << r.mIcode << std::endl;
3629 chain.mTerIndex = lastResidueIndex + 1;
3639 for (
int ix = chain.mTerIndex; ix < static_cast<int>(chain.mResiduesSeen.size()); ++ix)
3641 std::string resName = chain.mResiduesSeen[ix].mMonID;
3643 if (cif::compound_factory::instance().is_known_peptide(resName)
or 3644 cif::compound_factory::instance().is_known_base(resName))
3646 chain.mTerIndex = ix + 1;
3649 InsertChemComp(resName);
3653 for (
int ix = 0; ix < chain.mTerIndex; ++ix)
3655 auto &ar = chain.mResiduesSeen[ix];
3656 chain.mSeqres.push_back({ ar.mMonID, ar.mSeqNum, ar.mIcode, ar.mSeqNum,
true });
3661 std::set<char> terminatedChains;
3662 std::map<char, int> residuePerChainCounter;
3664 for (
auto r = mData; r !=
nullptr; r = r->mNext)
3666 if (r->is(
"MODEL "))
3668 modelNr = r->vI(11, 14);
3674 if (r->is(
"ATOM ")
or r->is(
"HETATM"))
3678 char altLoc = vC(17);
3679 std::string resName = r->vS(18, 20);
3680 char chainID = r->vC(22);
3681 int resSeq = r->vI(23, 26);
3682 char iCode = r->vC(27);
3684 auto &chain = GetChainForID(chainID);
3686 auto i =
find(chain.mSeqres.begin(), chain.mSeqres.end(), PDBSeqRes{ resName, resSeq, iCode });
3689 if (altLoc !=
' ' and
i == chain.mSeqres.end())
3691 i = find_if(chain.mSeqres.begin(), chain.mSeqres.end(),
3692 [resSeq, iCode](
const PDBSeqRes &r) ->
bool 3694 return r.mSeqNum == resSeq and r.mIcode == iCode;
3698 if (
i != chain.mSeqres.end())
3701 if (
i->mMonID != resName)
3702 i->mAlts.insert(resName);
3706 auto &residues = chain.mHet;
3708 if (residues.empty()
or residues.back().mSeqNum != resSeq)
3710 i = lower_bound(residues.begin(), residues.end(),
3711 PDBSeqRes{ resName, resSeq, iCode },
3712 [=](
const PDBSeqRes &
r1,
const PDBSeqRes &
r2) ->
bool 3714 return r1.mSeqNum <
r2.mSeqNum;
3717 residues.insert(
i, { resName, resSeq, iCode, resSeq,
true });
3719 InsertChemComp(resName);
3723 int residueCount = (residuePerChainCounter[chainID] += 1);
3726 if (not(cif::compound_factory::instance().is_known_peptide(resName)
or cif::compound_factory::instance().is_known_base(resName))
or 3727 terminatedChains.count(chainID)
or 3728 (chain.mTerIndex > 0 and residueCount >= chain.mTerIndex))
3731 mWaterHetID = resName;
3733 auto h = find_if(mHets.begin(), mHets.end(), [=](
const HET &het) ->
bool 3734 {
return het.hetID == resName and het.chainID == chainID and
3735 het.seqNum == resSeq and het.iCode == iCode; });
3737 if (h == mHets.end())
3739 mHets.push_back({ resName, chainID, resSeq, iCode, 0 });
3740 h = prev(mHets.end());
3743 h->atoms.push_back(r);
3751 char chainID = r->vC(22);
3752 terminatedChains.insert(chainID);
3757 for (
auto &chain : mChains)
3759 if (chain.mMolID != 0
or chain.mSeqres.empty())
3763 for (
auto &other : mChains)
3765 if (&other == &chain
or other.mMolID == 0)
3768 if (chain.SameSequence(other))
3770 chain.mMolID = other.mMolID;
3775 if (chain.mMolID != 0)
3778 auto &comp = GetOrCreateCompound(mNextMolID++);
3779 comp.mChains.insert(chain.mDbref.chainID);
3781 chain.mMolID = comp.mMolID;
3784 std::set<std::string> structTitle, structDescription;
3788 auto cat = getCategory(
"pdbx_poly_seq_scheme");
3790 for (
auto &chain : mChains)
3794 if (mMolID2EntityID.count(chain.mMolID) == 0)
3797 std::string entityID = mMolID2EntityID[chain.mMolID];
3799 mAsymID2EntityID[asymID] = entityID;
3801 getCategory(
"struct_asym")->emplace({
3803 {
"pdbx_blank_PDB_chainid_flag", chain.mDbref.chainID ==
' ' ?
"Y" :
"N" },
3805 {
"entity_id", entityID },
3810 for (
auto &res : chain.mSeqres)
3812 mChainSeq2AsymSeq[std::make_tuple(chain.mDbref.chainID, res.mSeqNum, res.mIcode)] = std::make_tuple(asymID, seqNr,
true);
3817 std::set<std::string> monIds = { res.mMonID };
3818 monIds.insert(res.mAlts.begin(), res.mAlts.end());
3820 for (std::string monID : monIds)
3822 std::string authMonID, authSeqNum, authInsCode{
'.'};
3828 if (res.mIcode !=
' ' and res.mIcode != 0)
3829 authInsCode = std::string{ res.mIcode };
3832 {
"asym_id", asymID },
3833 {
"entity_id", mMolID2EntityID[chain.mMolID] },
3834 {
"seq_id", seqID },
3835 {
"mon_id", monID },
3836 {
"ndb_seq_num", seqID },
3837 {
"pdb_seq_num", res.mSeqNum },
3838 {
"auth_seq_num", authSeqNum },
3839 {
"pdb_mon_id", authMonID },
3840 {
"auth_mon_id", authMonID },
3841 {
"pdb_strand_id", std::string{ chain.mDbref.chainID } },
3842 {
"pdb_ins_code", authInsCode },
3843 {
"hetero", res.mAlts.empty() ?
"n" :
"y" } });
3847 if (res.mIcode !=
' ' and res.mIcode != 0)
3848 authInsCode = std::string{ res.mIcode } +
"A";
3851 {
"asym_id", asymID },
3852 {
"entity_id", mMolID2EntityID[chain.mMolID] },
3853 {
"seq_id", seqID },
3854 {
"mon_id", monID },
3855 {
"ndb_seq_num", seqID },
3856 {
"pdb_seq_num", res.mSeqNum },
3857 {
"auth_seq_num",
"." },
3858 {
"pdb_mon_id",
"." },
3859 {
"auth_mon_id",
"." },
3860 {
"pdb_strand_id", std::string{ chain.mDbref.chainID } },
3861 {
"pdb_ins_code", authInsCode },
3862 {
"hetero", res.mAlts.empty() ?
"n" :
"y" } });
3869 uint32_t structRefID = 0, structRefSeqAlignID = 0;
3871 for (
auto &cmp : mCompounds)
3875 std::string srcMethod;
3877 if (not cmp.mSource[
"SYNTHETIC"].empty())
3881 getCategory(
"pdbx_entity_src_syn")->emplace({
3882 {
"entity_id", mMolID2EntityID[cmp.mMolID] },
3883 {
"pdbx_src_id", structRefID },
3884 {
"organism_scientific", cmp.mSource[
"ORGANISM_SCIENTIFIC"] },
3885 {
"ncbi_taxonomy_id", cmp.mSource[
"ORGANISM_TAXID"] },
3888 else if (cmp.mInfo[
"ENGINEERED"] ==
"YES" or 3889 not cmp.mSource[
"EXPRESSION_SYSTEM"].empty())
3893 getCategory(
"entity_src_gen")->emplace({
3894 {
"entity_id", mMolID2EntityID[cmp.mMolID] },
3895 {
"pdbx_src_id", structRefID },
3896 {
"gene_src_common_name", cmp.mSource[
"ORGANISM_COMMON"] },
3897 {
"pdbx_gene_src_gene", cmp.mSource[
"GENE"] },
3898 {
"gene_src_strain", cmp.mSource[
"STRAIN"] },
3899 {
"gene_src_tissue", cmp.mSource[
"TISSUE"] },
3900 {
"gene_src_tissue_fraction", cmp.mSource[
"TISSUE_FRACTION"] },
3901 {
"pdbx_gene_src_cell_line", cmp.mSource[
"CELL_LINE"] },
3902 {
"pdbx_gene_src_organelle", cmp.mSource[
"ORGANELLE"] },
3903 {
"pdbx_gene_src_cell", cmp.mSource[
"CELL"] },
3904 {
"pdbx_gene_src_cellular_location", cmp.mSource[
"CELLULAR_LOCATION"] },
3905 {
"host_org_common_name", cmp.mSource[
"EXPRESSION_SYSTEM_COMMON"] },
3906 {
"pdbx_gene_src_scientific_name", cmp.mSource[
"ORGANISM_SCIENTIFIC"] },
3907 {
"pdbx_gene_src_ncbi_taxonomy_id", cmp.mSource[
"ORGANISM_TAXID"] },
3908 {
"pdbx_host_org_scientific_name", cmp.mSource[
"EXPRESSION_SYSTEM"] },
3909 {
"pdbx_host_org_ncbi_taxonomy_id", cmp.mSource[
"EXPRESSION_SYSTEM_TAXID"] },
3910 {
"pdbx_host_org_strain", cmp.mSource[
"EXPRESSION_SYSTEM_STRAIN"] },
3911 {
"pdbx_host_org_variant", cmp.mSource[
"EXPRESSION_SYSTEM_VARIANT"] },
3912 {
"pdbx_host_org_cell_line", cmp.mSource[
"EXPRESSION_SYSTEM_CELL_LINE"] },
3913 {
"pdbx_host_org_cellular_location", cmp.mSource[
"EXPRESSION_SYSTEM_CELLULAR_LOCATION"] },
3914 {
"pdbx_host_org_vector_type", cmp.mSource[
"EXPRESSION_SYSTEM_VECTOR_TYPE"] },
3915 {
"pdbx_host_org_vector", cmp.mSource[
"EXPRESSION_SYSTEM_VECTOR"] },
3916 {
"pdbx_host_org_gene", cmp.mSource[
"EXPRESSION_SYSTEM_GENE"] },
3917 {
"plasmid_name", cmp.mSource[
"EXPRESSION_SYSTEM_PLASMID"] },
3918 {
"pdbx_description", cmp.mSource[
"OTHER_DETAILS"] } });
3920 else if (not cmp.mSource[
"ORGANISM_SCIENTIFIC"].empty())
3924 getCategory(
"entity_src_nat")->emplace({
3925 {
"entity_id", mMolID2EntityID[cmp.mMolID] },
3926 {
"pdbx_src_id", structRefID },
3927 {
"common_name", cmp.mSource[
"ORGANISM_COMMON"] },
3928 {
"strain", cmp.mSource[
"STRAIN"] },
3929 {
"pdbx_secretion", cmp.mSource[
"SECRETION"] },
3930 {
"pdbx_organism_scientific", cmp.mSource[
"ORGANISM_SCIENTIFIC"] },
3931 {
"pdbx_ncbi_taxonomy_id", cmp.mSource[
"ORGANISM_TAXID"] },
3932 {
"pdbx_cellular_location", cmp.mSource[
"CELLULAR_LOCATION"] },
3933 {
"pdbx_plasmid_name", cmp.mSource[
"PLASMID"] },
3934 {
"pdbx_organ", cmp.mSource[
"ORGAN"] },
3938 getCategory(
"entity")->emplace({
3939 {
"id", mMolID2EntityID[cmp.mMolID] },
3940 {
"type",
"polymer" },
3941 {
"src_method", srcMethod },
3942 {
"pdbx_description", cmp.mInfo[
"MOLECULE"] },
3944 {
"pdbx_number_of_molecules", cmp.mChains.size() },
3945 {
"details", cmp.mInfo[
"OTHER_DETAILS"] },
3946 {
"pdbx_mutation", cmp.mInfo[
"MUTATION"] },
3947 {
"pdbx_fragment", cmp.mInfo[
"FRAGMENT"] },
3948 {
"pdbx_ec", cmp.mInfo[
"EC"] } });
3950 if (not cmp.mInfo[
"SYNONYM"].empty())
3952 getCategory(
"entity_name_com")->emplace({
3953 {
"entity_id", mMolID2EntityID[cmp.mMolID] },
3954 {
"name", cmp.mInfo[
"SYNONYM"] } });
3957 std::string desc = cmp.mInfo[
"MOLECULE"];
3958 if (not cmp.mInfo[
"EC"].empty())
3959 desc +=
" (E.C." + cmp.mInfo[
"EC"] +
")";
3961 if (not cmp.mTitle.empty())
3962 structTitle.insert(cmp.mTitle);
3964 if (not desc.empty())
3965 structDescription.insert(desc);
3967 auto ci = find_if(mChains.begin(), mChains.end(),
3968 [cmp](PDBChain &
c) ->
bool 3969 {
return cmp.mChains.count(
c.mDbref.chainID); });
3971 if (ci != mChains.end() and not ci->mDbref.dbIdCode.empty())
3973 getCategory(
"struct_ref")->emplace({
3974 {
"id", structRefID },
3975 {
"entity_id", mMolID2EntityID[cmp.mMolID] },
3976 {
"db_name", ci->mDbref.database },
3977 {
"db_code", ci->mDbref.dbIdCode },
3978 {
"pdbx_db_accession", ci->mDbref.dbAccession },
3983 bool nstdMonomer =
false, nonstandardLinkage =
false;
3984 bool mightBePolyPeptide =
true, mightBeDNA =
true;
3986 std::vector<std::string> chains;
3987 std::string seq, seqCan;
3990 for (
auto &chain : mChains)
3992 if (chain.mMolID != cmp.mMolID)
3997 ++structRefSeqAlignID;
3998 DBREF &dbref = chain.mDbref;
4000 if (not dbref.database.empty())
4002 auto insToStr = [](
char i) -> std::string
4003 {
return i ==
' ' or not isprint(
i) ?
"" : std::string{
i }; };
4005 auto &pdbxPolySeqScheme = *getCategory(
"pdbx_poly_seq_scheme");
4007 int seqAlignBeg = 0, seqAlignEnd = 0;
4011 seqAlignBeg = pdbxPolySeqScheme.find1<
int>(key(
"pdb_strand_id") == std::string { dbref.chainID } and
4012 key(
"pdb_seq_num") == dbref.seqBegin and
4013 (key(
"pdb_ins_code") == insToStr(dbref.insertBegin)
or key(
"pdb_ins_code") == cif::null),
4016 seqAlignEnd = pdbxPolySeqScheme.find1<
int>(key(
"pdb_strand_id") == std::string { dbref.chainID } and
4017 key(
"pdb_seq_num") == dbref.seqEnd and
4018 (key(
"pdb_ins_code") == insToStr(dbref.insertEnd)
or key(
"pdb_ins_code") == cif::null),
4025 getCategory(
"struct_ref_seq")->emplace({
4026 {
"align_id", structRefSeqAlignID },
4027 {
"ref_id", structRefID },
4028 {
"pdbx_PDB_id_code", dbref.PDBIDCode },
4029 {
"pdbx_strand_id", std::string{ chain.mDbref.chainID } },
4030 {
"seq_align_beg", seqAlignBeg },
4031 {
"pdbx_seq_align_beg_ins_code", insToStr(dbref.insertBegin) },
4032 {
"seq_align_end", seqAlignEnd },
4033 {
"pdbx_seq_align_end_ins_code", insToStr(dbref.insertEnd) },
4034 {
"pdbx_db_accession", dbref.dbAccession },
4035 {
"db_align_beg", dbref.dbSeqBegin },
4036 {
"pdbx_db_align_beg_ins_code", insToStr(dbref.dbinsBeg) },
4037 {
"db_align_end", dbref.dbSeqEnd },
4038 {
"pdbx_db_align_end_ins_code", insToStr(dbref.dbinsEnd) },
4039 {
"pdbx_auth_seq_align_beg", dbref.seqBegin },
4040 {
"pdbx_auth_seq_align_end", dbref.seqEnd } });
4043 for (
auto &seqadv : mSeqadvs)
4045 if (seqadv.chainID != chain.mDbref.chainID
or seqadv.resName.empty())
4048 std::string asym, seqNum;
4052 std::tie(asym, labelSeq, std::ignore) = MapResidue(seqadv.chainID, seqadv.seqNum, seqadv.iCode, ec);
4056 std::cerr <<
"dropping unmatched SEQADV record" << std::endl;
4062 getCategory(
"struct_ref_seq_dif")->emplace({
4063 {
"align_id", structRefSeqAlignID },
4064 {
"pdbx_PDB_id_code", dbref.PDBIDCode },
4065 {
"mon_id", seqadv.resName },
4066 {
"pdbx_pdb_strand_id", seqadv.chainID },
4067 {
"seq_num", seqNum },
4068 {
"pdbx_pdb_ins_code", seqadv.iCode ==
' ' ? std::string{} : std::string{ seqadv.iCode } },
4069 {
"pdbx_seq_db_name", seqadv.database },
4070 {
"pdbx_seq_db_accession_code", seqadv.dbAccession },
4071 {
"db_mon_id", seqadv.dbRes },
4072 {
"pdbx_seq_db_seq_num", seqadv.dbSeq },
4073 {
"details", seqadv.conflict },
4074 {
"pdbx_auth_seq_num", seqadv.seqNum },
4075 {
"pdbx_ordinal", ++mPdbxDifOrdinal } });
4079 if (not chains.empty())
4081 chains.push_back(std::string{ chain.mDbref.chainID });
4085 chains.push_back(std::string{ chain.mDbref.chainID });
4087 size_t seqLen = 0, seqCanLen = 0;
4089 for (
auto &res : chain.mSeqres)
4091 std::string letter, stdRes;
4093 if (mMod2parent.count(res.mMonID))
4094 stdRes = mMod2parent.at(res.mMonID);
4096 if (cif::compound_factory::kAAMap.count(res.mMonID))
4098 letter = cif::compound_factory::kAAMap.at(res.mMonID);
4101 else if (cif::compound_factory::kBaseMap.count(res.mMonID))
4103 letter = cif::compound_factory::kBaseMap.at(res.mMonID);
4104 mightBePolyPeptide =
false;
4109 letter =
'(' + res.mMonID +
')';
4112 auto compound = cif::compound_factory::instance().create(stdRes.empty() ? res.mMonID : stdRes);
4113 if (compound !=
nullptr and
4114 not
iequals(compound->type(),
"L-peptide linking") and
4115 not
iequals(compound->type(),
"RNA linking"))
4117 nonstandardLinkage =
true;
4121 if (seqLen + letter.length() > 80)
4128 seqLen += letter.length();
4130 if (letter.length() > 1)
4132 if (not stdRes.empty() and cif::compound_factory::kAAMap.count(stdRes))
4133 letter = cif::compound_factory::kAAMap.at(stdRes);
4134 else if (cif::compound_factory::kBaseMap.count(res.mMonID))
4135 letter = cif::compound_factory::kBaseMap.at(res.mMonID);
4140 if (seqCanLen + letter.length() > 80)
4146 seqCanLen += letter.length();
4149 auto cat_ps = getCategory(
"entity_poly_seq");
4150 for (
size_t i = 0;
i < chain.mSeqres.size(); ++
i)
4152 auto &rs = chain.mSeqres[
i];
4154 if (
std::find(mChemComp.begin(), mChemComp.end(), rs.mMonID) == mChemComp.end())
4155 mChemComp.emplace_back(rs.mMonID);
4158 {
"entity_id", mMolID2EntityID[cmp.mMolID] },
4160 {
"mon_id", rs.mMonID },
4161 {
"hetero", rs.mAlts.empty() ?
"n" :
"y" } });
4163 for (
auto &
a : rs.mAlts)
4166 {
"entity_id", mMolID2EntityID[cmp.mMolID] },
4169 {
"hetero",
"y" } });
4175 if (mightBePolyPeptide and not mightBeDNA)
4176 type =
"polypeptide(L)";
4177 else if (mightBeDNA and not mightBePolyPeptide)
4178 type =
"polyribonucleotide";
4180 getCategory(
"entity_poly")->emplace({
4181 {
"entity_id", mMolID2EntityID[cmp.mMolID] },
4182 {
"pdbx_seq_one_letter_code", seq },
4183 {
"pdbx_seq_one_letter_code_can", seqCan },
4184 {
"nstd_monomer", (nstdMonomer ?
"yes" :
"no") },
4185 {
"pdbx_strand_id", cif::join(chains,
",") },
4186 {
"nstd_linkage", nonstandardLinkage ?
"yes" :
"no" },
4187 {
"type", type } });
4190 if (not(structTitle.empty() and structDescription.empty()))
4192 getCategory(
"struct")->emplace({
4193 {
"entry_id", mStructureID },
4194 {
"title", cif::join(structTitle,
", ") },
4195 {
"pdbx_descriptor", cif::join(structDescription,
", ") },
4196 {
"pdbx_model_type_details", mModelTypeDetails } });
4200 ConstructSugarTrees(asymNr);
4204 std::map<char, std::string> waterChains;
4205 std::map<std::tuple<std::string, std::string>,
int> ndbSeqNum;
4206 std::map<std::string,int> entityAuthSeqNum;
4208 for (
size_t i = 0;
i < mHets.size(); ++
i)
4210 auto &heti = mHets[
i];
4212 if (not heti.asymID.empty())
4215 if (heti.hetID == mWaterHetID
or isWater(heti.hetID))
4219 auto &chain = GetChainForID(heti.chainID);
4220 auto ih =
find(chain.mSeqres.begin(), chain.mSeqres.end(), PDBSeqRes{ heti.hetID, heti.seqNum, heti.iCode });
4223 if (ih != chain.mSeqres.end())
4229 std::set<std::string> writtenAsyms;
4231 std::map<std::string, int> hetCount;
4232 for (
auto &het : mHets)
4233 hetCount[het.hetID] += 1;
4235 for (
auto &het : mHets)
4237 std::string hetID = het.hetID;
4239 auto &chain = GetChainForID(het.chainID);
4242 auto i =
find(chain.mSeqres.begin(), chain.mSeqres.end(), PDBSeqRes{ hetID, het.seqNum, het.iCode });
4245 if (
i != chain.mSeqres.end())
4249 if (mHet2EntityID.count(hetID) == 0)
4252 mHet2EntityID[hetID] = entityID;
4254 if (hetID == mWaterHetID)
4256 getCategory(
"entity")->emplace({
4258 {
"type",
"water" },
4259 {
"src_method",
"nat" },
4260 {
"pdbx_description",
"water" },
4261 {
"pdbx_number_of_molecules", hetCount[hetID] } });
4265 if (mHetnams[hetID].empty())
4267 auto compound = cif::compound_factory::instance().create(hetID);
4268 if (compound !=
nullptr)
4269 mHetnams[hetID] = compound->name();
4272 getCategory(
"entity")->emplace({
4274 {
"type",
"non-polymer" },
4275 {
"src_method",
"syn" },
4276 {
"pdbx_description", mHetnams[hetID] },
4277 {
"details", mHetsyns[hetID] },
4278 {
"pdbx_number_of_molecules", hetCount[hetID] } });
4282 std::string
name = mHetnams[hetID];
4283 if (name.empty() and hetID == mWaterHetID)
4285 getCategory(
"pdbx_entity_nonpoly")->emplace({
4286 {
"entity_id", entityID },
4288 {
"comp_id", hetID } });
4293 std::string asymID = het.asymID;
4295 auto k = std::make_tuple(het.chainID, het.seqNum, het.iCode);
4296 if (mChainSeq2AsymSeq.count(
k) == 0)
4298 if (hetID == mWaterHetID
or isWater(hetID))
4300 if (waterChains.count(het.chainID) == 0)
4303 waterChains[het.chainID] = asymID;
4306 asymID = waterChains[het.chainID];
4309 asymID = het.asymID;
4311 assert(asymID.empty() ==
false);
4313 mAsymID2EntityID[asymID] = mHet2EntityID[hetID];
4319 mChainSeq2AsymSeq[
k] = std::make_tuple(asymID, 0,
false);
4321 if (writtenAsyms.count(asymID) == 0)
4323 writtenAsyms.insert(asymID);
4324 getCategory(
"struct_asym")->emplace({
4326 {
"pdbx_blank_PDB_chainid_flag", het.chainID ==
' ' ?
"Y" :
"N" },
4328 {
"entity_id", mHet2EntityID[hetID] },
4334 int seqNr = ++ndbSeqNum[std::make_tuple(hetID, asymID)];
4335 int authSeqNr = ++entityAuthSeqNum[hetID];
4337 std::string iCode{ het.iCode };
4342 getCategory(
"pdbx_nonpoly_scheme")->emplace({
4343 {
"asym_id", asymID },
4344 {
"entity_id", mHet2EntityID[hetID] },
4345 {
"mon_id", hetID },
4346 {
"ndb_seq_num", seqNr },
4347 {
"pdb_seq_num", het.seqNum },
4348 {
"auth_seq_num", authSeqNr },
4349 {
"pdb_mon_id", hetID },
4350 {
"auth_mon_id", hetID },
4351 {
"pdb_strand_id", std::string{ het.chainID } },
4352 {
"pdb_ins_code", iCode } });
4355 mChainSeq2AsymSeq[std::make_tuple(het.chainID, het.seqNum, het.iCode)] = std::make_tuple(asymID, seqNr,
false);
4359 std::set<std::string> modResSet;
4360 for (
auto rec = FindRecord(
"MODRES"); rec !=
nullptr and rec->is(
"MODRES");
4363 std::string resName = rec->vS(13, 15);
4364 char chainID = rec->vC(17);
4365 int seqNum = rec->vI(19, 22);
4366 char iCode = rec->vC(23);
4367 std::string stdRes = rec->vS(25, 27);
4368 std::string comment = rec->vS(30, 70);
4374 std::tie(asymID, seq, std::ignore) = MapResidue(chainID, seqNum, iCode, ec);
4378 std::cerr <<
"dropping unmapped MODRES record" << std::endl;
4382 getCategory(
"pdbx_struct_mod_residue")->emplace({
4383 {
"id", modResID++ },
4384 {
"label_asym_id", asymID },
4385 {
"label_seq_id", seq },
4386 {
"label_comp_id", resName },
4387 {
"auth_asym_id", std::string(1, chainID) },
4388 {
"auth_seq_id", seqNum },
4389 {
"auth_comp_id", resName },
4390 {
"PDB_ins_code", iCode ==
' ' ?
"" : std::string{ iCode } },
4391 {
"parent_comp_id", stdRes },
4392 {
"details", comment }
4395 modResSet.insert(resName);
4400 for (
auto cc : mChemComp)
4402 auto compound = cif::compound_factory::instance().create(
4403 mMod2parent.count(cc) ? mMod2parent[cc] : cc);
4406 std::string formula;
4408 std::string nstd =
".";
4409 std::string formulaWeight;
4411 if (compound !=
nullptr)
4413 name = compound->name();
4414 type = compound->type();
4416 if (
iequals(type,
"L-peptide linking")
or iequals(type,
"peptide linking"))
4419 formula = compound->formula();
4424 name = mHetnams[cc];
4427 type =
"NON-POLYMER";
4429 if (formula.empty())
4431 formula = mFormuls[cc];
4433 const std::regex rx(R
"(\d+\((.+)\))"); 4435 if (std::regex_match(formula, m, rx))
4436 formula = m[1].str();
4439 if (modResSet.count(cc))
4442 getCategory(
"chem_comp")->emplace({
4445 {
"formula", formula },
4446 {
"formula_weight", formulaWeight },
4447 {
"mon_nstd_flag", nstd },
4452 getCategory(
"chem_comp")->reorder_by_index();
4456 int idRes = 0, idAtom = 0;
4457 sort(mUnobs.begin(), mUnobs.end(), [](
const UNOBS &
a,
const UNOBS &
b) ->
bool 4459 int d = a.modelNr -
b.modelNr;
4464 for (
auto &unobs : mUnobs)
4466 bool isPolymer =
false;
4467 std::string asymID, compID = unobs.res;
4471 std::tie(asymID, seqNr, isPolymer) = MapResidue(unobs.chain, unobs.seq, unobs.iCode, ec);
4475 std::cerr <<
"error mapping unobserved residue" << std::endl;
4479 if (unobs.atoms.empty())
4481 getCategory(
"pdbx_unobs_or_zero_occ_residues")->emplace({
4483 {
"polymer_flag", isPolymer ?
"Y" :
"N" },
4484 {
"occupancy_flag", 1 },
4485 {
"PDB_model_num", unobs.modelNr ? unobs.modelNr : 1 },
4486 {
"auth_asym_id", std::string{ unobs.chain } },
4487 {
"auth_comp_id", unobs.res },
4488 {
"auth_seq_id", unobs.seq },
4489 {
"PDB_ins_code", unobs.iCode ==
' ' ?
"" : std::string{ unobs.iCode } },
4490 {
"label_asym_id", asymID },
4491 {
"label_comp_id", compID },
4496 for (
auto &atom : unobs.atoms)
4498 getCategory(
"pdbx_unobs_or_zero_occ_atoms")->emplace({
4500 {
"polymer_flag", isPolymer ?
"Y" :
"N" },
4501 {
"occupancy_flag", 1 },
4502 {
"PDB_model_num", unobs.modelNr ? unobs.modelNr : 1 },
4503 {
"auth_asym_id", std::string{ unobs.chain } },
4504 {
"auth_comp_id", unobs.res },
4505 {
"auth_seq_id", unobs.seq },
4506 {
"PDB_ins_code", unobs.iCode ==
' ' ?
"" : std::string{ unobs.iCode } },
4507 {
"auth_atom_id", atom },
4508 {
"label_asym_id", asymID },
4509 {
"label_comp_id", compID },
4511 {
"label_atom_id", atom } });
4517 void PDBFileParser::ConstructSugarTrees(
int &asymNr)
4522 auto si = std::find_if(mHets.begin(), mHets.end(), [](
const HET &h)
4523 {
return (h.hetID ==
"NAG" or h.hetID ==
"NDG") and not(h.processed
or h.branch); });
4524 if (si != mHets.end())
4526 si->processed =
true;
4531 for (
auto a : si->atoms)
4533 std::string
name =
a->vS(13, 16);
4538 ci.insert(
a->vC(17));
4546 ATOM_REF c1{
"C1", si->hetID, si->seqNum, si->chainID, si->iCode, alt };
4548 const auto &[asn, linked] = FindLink(c1,
"ND2",
"ASN");
4552 std::stack<ATOM_REF> c1s;
4555 SUGAR_TREE sugarTree;
4556 sugarTree.push_back({ c1 });
4559 while (not c1s.empty())
4564 for (
auto o : {
"O1",
"O2",
"O3",
"O4",
"O5",
"O6" })
4566 ATOM_REF leaving = c1;
4569 const auto &[nc1, linked_c1] = FindLink(leaving,
"C1");
4572 sugarTree.push_back({ nc1, o[1] -
'0', c1 });
4578 if (sugarTree.size() < 2)
4581 auto branchName = sugarTree.entityName();
4582 auto entityID = mBranch2EntityID[branchName];
4585 if (entityID.empty())
4588 mBranch2EntityID[branchName] = entityID;
4590 getCategory(
"entity")->emplace({
4592 {
"type",
"branched" },
4593 {
"src_method",
"man" },
4594 {
"pdbx_description", branchName } });
4596 getCategory(
"pdbx_entity_branch")->emplace({
4597 {
"entity_id", entityID },
4598 {
"type",
"oligosaccharide" } });
4601 std::map<ATOM_REF, int> branch_list;
4603 for (
auto &s : sugarTree)
4605 getCategory(
"pdbx_entity_branch_list")->emplace({
4606 {
"entity_id", entityID },
4607 {
"comp_id", s.c1.resName },
4609 {
"hetero", ci.size() == 1 ?
"n" :
"y" } });
4611 branch_list[s.c1] = num;
4614 auto &branch_link = *getCategory(
"pdbx_entity_branch_link");
4616 for (
auto &s : sugarTree)
4618 if (s.leaving_o == 0)
4621 branch_link.emplace({
4622 {
"link_id", branch_link.size() + 1 },
4623 {
"entity_id", entityID },
4624 {
"entity_branch_list_num_1", branch_list[s.c1] },
4625 {
"comp_id_1", s.c1.resName },
4626 {
"atom_id_1", s.c1.name },
4627 {
"leaving_atom_id_1",
"O1" },
4628 {
"entity_branch_list_num_2", branch_list[s.next] },
4629 {
"comp_id_2", s.next.resName },
4632 {
"value_order",
"sing" }
4637 mSugarEntities.insert(entityID);
4643 mAsymID2EntityID[asymID] = entityID;
4645 getCategory(
"struct_asym")->emplace({
4647 {
"pdbx_blank_PDB_chainid_flag", si->chainID ==
' ' ?
"Y" :
"N" },
4648 {
"pdbx_modified",
"N" },
4649 {
"entity_id", entityID } });
4651 std::string iCode{ si->iCode };
4657 for (
auto s : sugarTree)
4659 getCategory(
"pdbx_branch_scheme")->emplace({
4660 {
"asym_id", asymID },
4661 {
"entity_id", entityID },
4662 {
"mon_id", s.c1.resName },
4664 {
"pdb_asym_id", asymID },
4665 {
"pdb_mon_id", s.c1.resName },
4666 {
"pdb_seq_num", num },
4667 {
"auth_asym_id", std::string{ s.c1.chainID } },
4668 {
"auth_mon_id", s.next.resName },
4669 {
"auth_seq_num", s.c1.resSeq },
4670 {
"hetero", ci.size() == 1 ?
"n" :
"y" } });
4672 auto k = std::make_tuple(s.c1.chainID, s.c1.resSeq, s.c1.iCode);
4673 assert(mChainSeq2AsymSeq.count(
k) == 0);
4675 mChainSeq2AsymSeq[
k] = std::make_tuple(asymID, num,
false);
4679 for (
auto &h : mHets)
4681 if (h.hetID == s.c1.resName and h.chainID == s.c1.chainID and h.seqNum == s.c1.resSeq and h.iCode == s.c1.iCode)
4699 mHets.erase(std::remove_if(mHets.begin(), mHets.end(), [](
auto &h)
4700 {
return h.branch; }),
4704 void PDBFileParser::ParseSecondaryStructure()
4706 bool firstHelix =
true;
4708 while (mRec->is(
"HELIX "))
4730 std::string begAsymID, endAsymID;
4734 std::tie(begAsymID, begSeq, std::ignore) = MapResidue(vC(20), vI(22, 25), vC(26), ec);
4736 std::tie(endAsymID, endSeq, std::ignore) = MapResidue(vC(32), vI(34, 37), vC(38), ec);
4741 std::cerr <<
"Could not map residue for HELIX " << vI(8, 10) << std::endl;
4745 auto cat = getCategory(
"struct_conf");
4747 {
"conf_type_id",
"HELX_P" },
4749 {
"pdbx_PDB_helix_id", vS(12, 14) },
4750 {
"beg_label_comp_id", vS(16, 18) },
4751 {
"beg_label_asym_id", begAsymID },
4752 {
"beg_label_seq_id", begSeq },
4753 {
"pdbx_beg_PDB_ins_code", vS(26, 26) },
4754 {
"end_label_comp_id", vS(28, 30) },
4755 {
"end_label_asym_id", endAsymID },
4756 {
"end_label_seq_id", endSeq },
4757 {
"pdbx_end_PDB_ins_code", vS(38, 38) },
4759 {
"beg_auth_comp_id", vS(16, 18) },
4760 {
"beg_auth_asym_id", vS(20, 20) },
4761 {
"beg_auth_seq_id", vI(22, 25) },
4762 {
"end_auth_comp_id", vS(28, 30) },
4763 {
"end_auth_asym_id", vS(32, 32) },
4764 {
"end_auth_seq_id", vI(34, 37) },
4766 {
"pdbx_PDB_helix_class", vS(39, 40) },
4767 {
"details", vS(41, 70) },
4768 {
"pdbx_PDB_helix_length", vI(72, 76) } });
4772 cat = getCategory(
"struct_conf_type");
4774 {
"id",
"HELX_P" } });
4782 std::set<std::string> sheetsSeen;
4785 while (mRec->is(
"SHEET "))
4825 if (sheetsSeen.count(sheetID) == 0)
4827 sheetsSeen.insert(sheetID);
4831 getCategory(
"struct_sheet")->emplace({
4833 {
"number_strands", vI(15, 16) },
4837 int sense = vI(39, 40);
4841 getCategory(
"struct_sheet_order")->emplace({
4842 {
"sheet_id", sheetID },
4843 {
"range_id_1", rangeID },
4844 {
"range_id_2", rangeID + 1 },
4845 {
"sense", sense == -1 ?
"anti-parallel" :
"parallel" } });
4848 std::string begAsymID, endAsymID;
4852 std::tie(begAsymID, begSeq, std::ignore) = MapResidue(vC(22), vI(23, 26), vC(27), ec);
4854 std::tie(endAsymID, endSeq, std::ignore) = MapResidue(vC(33), vI(34, 37), vC(38), ec);
4859 std::cerr <<
"Dropping SHEET record " << vI(8, 10) << std::endl;
4863 getCategory(
"struct_sheet_range")->emplace({
4864 {
"sheet_id", sheetID },
4865 {
"id", vI(8, 10) },
4866 {
"beg_label_comp_id", vS(18, 20) },
4867 {
"beg_label_asym_id", begAsymID },
4868 {
"beg_label_seq_id", begSeq },
4869 {
"pdbx_beg_PDB_ins_code", vS(27, 27) },
4870 {
"end_label_comp_id", vS(29, 31) },
4871 {
"end_label_asym_id", endAsymID },
4872 {
"end_label_seq_id", endSeq },
4873 {
"pdbx_end_PDB_ins_code", vS(38, 38) },
4875 {
"beg_auth_comp_id", vS(18, 20) },
4876 {
"beg_auth_asym_id", vS(22, 22) },
4877 {
"beg_auth_seq_id", vI(23, 26) },
4878 {
"end_auth_comp_id", vS(29, 31) },
4879 {
"end_auth_asym_id", vS(33, 33) },
4880 {
"end_auth_seq_id", vI(34, 37) },
4883 if (sense != 0 and mRec->mVlen > 34)
4885 std::string r1AsymID, r2AsymID;
4888 std::tie(r1AsymID, r1Seq, std::ignore) = MapResidue(vC(65), vI(66, 69), vC(70), ec);
4890 std::tie(r2AsymID, r2Seq, std::ignore) = MapResidue(vC(50), vI(51, 54), vC(55), ec);
4895 std::cerr <<
"skipping unmatched pdbx_struct_sheet_hbond record" << std::endl;
4898 getCategory(
"pdbx_struct_sheet_hbond")->emplace({
4899 {
"sheet_id", sheetID },
4900 {
"range_id_1", rangeID },
4901 {
"range_id_2", rangeID + 1 },
4902 {
"range_1_label_atom_id", vS(57, 60) },
4903 {
"range_1_label_comp_id", vS(61, 63) },
4904 {
"range_1_label_asym_id", r1AsymID },
4905 {
"range_1_label_seq_id", r1Seq },
4906 {
"range_1_PDB_ins_code", vS(70, 70) },
4907 {
"range_1_auth_atom_id", vS(57, 60) },
4908 {
"range_1_auth_comp_id", vS(61, 63) },
4909 {
"range_1_auth_asym_id", vS(65, 65) },
4910 {
"range_1_auth_seq_id", vI(66, 69) },
4912 {
"range_2_label_atom_id", vS(42, 45) },
4913 {
"range_2_label_comp_id", vS(46, 48) },
4914 {
"range_2_label_asym_id", r2AsymID },
4915 {
"range_2_label_seq_id", r2Seq },
4916 {
"range_2_PDB_ins_code", vS(55, 55) },
4917 {
"range_2_auth_atom_id", vS(42, 45) },
4918 {
"range_2_auth_comp_id", vS(46, 48) },
4919 {
"range_2_auth_asym_id", vS(50, 50) },
4920 {
"range_2_auth_seq_id", vI(51, 54) } });
4931 static bool IsMetal(
const std::string &resName,
const std::string &atomID)
4933 bool result =
false;
4937 auto compound = cif::compound_factory::instance().create(resName);
4938 if (compound !=
nullptr)
4940 auto at = cif::atom_type_traits(compound->get_atom_by_atom_id(atomID).type_symbol);
4941 result = at.is_metal();
4951 void PDBFileParser::ParseConnectivtyAnnotation()
4955 bool firstCovale =
true, firstMetalc =
true;
4958 for (;; GetNextRecord())
4960 if (mRec->is(
"SSBOND"))
4964 getCategory(
"struct_conn_type")->emplace({
4983 std::string p1Asym, p2Asym;
4984 int p1Seq = 0, p2Seq = 0;
4987 std::tie(p1Asym, p1Seq, std::ignore) = MapResidue(vC(16), vI(18, 21), vC(22), ec);
4989 std::tie(p2Asym, p2Seq, std::ignore) = MapResidue(vC(30), vI(32, 35), vC(36), ec);
4994 std::cerr <<
"Dropping SSBOND " << vI(8, 10) << std::endl;
4998 std::vector<char> alt1 = altLocsForAtom(vC(16), vI(18, 21), vC(22),
"SG");
4999 std::vector<char> alt2 = altLocsForAtom(vC(30), vI(32, 35), vC(36),
"SG");
5006 std::string sym1, sym2;
5009 sym1 = pdb2cifSymmetry(vS(60, 65));
5010 sym2 = pdb2cifSymmetry(vS(67, 72));
5012 catch (
const std::exception &ex)
5015 std::cerr <<
"Dropping SSBOND " << vI(8, 10) <<
" due to invalid symmetry operation" << std::endl;
5019 for (
auto a1 : alt1)
5021 for (
auto a2 : alt2)
5023 getCategory(
"struct_conn")->emplace({
5025 {
"conn_type_id",
"disulf" },
5027 {
"ptnr1_label_asym_id", p1Asym },
5028 {
"pdbx_ptnr1_label_alt_id", a1 ? std::string{ a1 } : std::string() },
5029 {
"ptnr1_label_comp_id", vS(12, 14) },
5031 {
"ptnr1_label_atom_id",
"SG" },
5032 {
"ptnr1_symmetry", sym1 },
5034 {
"ptnr2_label_asym_id", p2Asym },
5035 {
"pdbx_ptnr2_label_alt_id", a2 ? std::string{ a2 } : std::string() },
5036 {
"ptnr2_label_comp_id", vS(26, 28) },
5038 {
"ptnr2_label_atom_id",
"SG" },
5040 {
"ptnr1_auth_asym_id", vS(16, 16) },
5041 {
"ptnr1_auth_comp_id", vS(12, 14) },
5042 {
"ptnr1_auth_seq_id", vI(18, 21) },
5043 {
"ptnr2_auth_asym_id", vS(30, 30) },
5044 {
"ptnr2_auth_comp_id", vS(26, 28) },
5045 {
"ptnr2_auth_seq_id", vI(32, 35) },
5047 {
"ptnr2_symmetry", sym2 },
5049 {
"pdbx_dist_value", vS(74, 78) },
5057 if (mRec->is(
"LINK ")
or mRec->is(
"LINKR "))
5060 std::cerr <<
"Accepting non-standard LINKR record, but ignoring extra information" << std::endl;
5063 std::string name1 = vS(13, 16);
5065 std::string resName1 = vS(18, 20);
5069 std::string name2 = vS(43, 46);
5071 std::string resName2 = vS(48, 50);
5079 std::string
type =
"covale";
5080 if (IsMetal(resName1, name1)
or IsMetal(resName2, name2))
5083 if (type ==
"covale" and firstCovale)
5085 getCategory(
"struct_conn_type")->emplace({
5088 firstCovale =
false;
5091 if (type ==
"metalc" and firstMetalc)
5093 getCategory(
"struct_conn_type")->emplace({
5096 firstMetalc =
false;
5101 std::string p1Asym, p2Asym;
5102 int p1Seq = 0, p2Seq = 0;
5103 bool isResseq1 =
false, isResseq2 =
false;
5106 std::tie(p1Asym, p1Seq, isResseq1) = MapResidue(vC(22), vI(23, 26), vC(27), ec);
5108 std::tie(p2Asym, p2Seq, isResseq2) = MapResidue(vC(52), vI(53, 56), vC(57), ec);
5113 std::cerr <<
"Dropping LINK record at line " << mRec->mLineNr << std::endl;
5119 if (mRec->is(
"LINK "))
5121 distance = vS(74, 78);
5124 auto r = cif::from_chars(distance.data(), distance.data() + distance.length(),
d);
5125 if (r.ec != std::errc())
5128 std::cerr <<
"Distance value '" << distance <<
"' is not a valid float in LINK record" << std::endl;
5129 swap(ccp4LinkID, distance);
5133 ccp4LinkID = vS(74, 78);
5135 std::string sym1, sym2;
5138 sym1 = pdb2cifSymmetry(vS(60, 65));
5139 sym2 = pdb2cifSymmetry(vS(67, 72));
5141 catch (
const std::exception &ex)
5144 std::cerr <<
"Dropping LINK record at line " << mRec->mLineNr <<
" due to invalid symmetry operation" << std::endl;
5148 getCategory(
"struct_conn")->emplace({
5150 {
"conn_type_id", type },
5154 {
"ptnr1_label_asym_id", p1Asym },
5155 {
"ptnr1_label_comp_id", vS(18, 20) },
5156 {
"ptnr1_label_seq_id", (isResseq1 and p1Seq) ?
std::to_string(p1Seq) :
"." },
5157 {
"ptnr1_label_atom_id", vS(13, 16) },
5158 {
"pdbx_ptnr1_label_alt_id", vS(17, 17) },
5159 {
"pdbx_ptnr1_PDB_ins_code", vS(27, 27) },
5160 {
"pdbx_ptnr1_standard_comp_id",
"" },
5161 {
"ptnr1_symmetry", sym1 },
5163 {
"ptnr2_label_asym_id", p2Asym },
5164 {
"ptnr2_label_comp_id", vS(48, 50) },
5165 {
"ptnr2_label_seq_id", (isResseq2 and p2Seq) ?
std::to_string(p2Seq) :
"." },
5166 {
"ptnr2_label_atom_id", vS(43, 46) },
5167 {
"pdbx_ptnr2_label_alt_id", vS(47, 47) },
5168 {
"pdbx_ptnr2_PDB_ins_code", vS(57, 57) },
5170 {
"ptnr1_auth_asym_id", vS(22, 22) },
5171 {
"ptnr1_auth_comp_id", vS(18, 20) },
5172 {
"ptnr1_auth_seq_id", vI(23, 26) },
5173 {
"ptnr2_auth_asym_id", vS(52, 52) },
5174 {
"ptnr2_auth_comp_id", vS(48, 50) },
5175 {
"ptnr2_auth_seq_id", vI(53, 56) },
5180 {
"ptnr2_symmetry", sym2 },
5182 {
"pdbx_dist_value", distance } });
5187 if (mRec->is(
"CISPEP"))
5190 int serNum = vI(8, 10);
5191 std::string pep1 = vS(12, 14);
5192 char chainID1 = vC(16);
5193 int seqNum1 = vI(18, 21);
5194 char iCode1 = vC(22);
5195 std::string pep2 = vS(26, 28);
5196 char chainID2 = vC(30);
5197 int seqNum2 = vI(32, 35);
5198 char iCode2 = vC(36);
5199 int modNum = vI(44, 46);
5200 std::string measure = vF(54, 59);
5205 std::string lAsym1, lAsym2;
5206 int lResSeq1, lResSeq2;
5209 std::tie(lAsym1, lResSeq1, std::ignore) = MapResidue(chainID1, seqNum1, iCode1, ec);
5211 std::tie(lAsym2, lResSeq2, std::ignore) = MapResidue(chainID2, seqNum2, iCode2, ec);
5216 std::cerr <<
"Dropping CISPEP record at line " << mRec->mLineNr << std::endl;
5220 std::string iCode1str = iCode1 ==
' ' ? std::string() : std::string{ iCode1 };
5221 std::string iCode2str = iCode2 ==
' ' ? std::string() : std::string{ iCode2 };
5223 getCategory(
"struct_mon_prot_cis")->emplace({
5224 {
"pdbx_id", serNum },
5225 {
"label_comp_id", pep1 },
5226 {
"label_seq_id", lResSeq1 },
5227 {
"label_asym_id", lAsym1 },
5228 {
"label_alt_id",
"." },
5229 {
"pdbx_PDB_ins_code", iCode1str },
5230 {
"auth_comp_id", pep1 },
5231 {
"auth_seq_id", seqNum1 },
5232 {
"auth_asym_id", std::string{ chainID1 } },
5233 {
"pdbx_label_comp_id_2", pep2 },
5234 {
"pdbx_label_seq_id_2", lResSeq2 },
5235 {
"pdbx_label_asym_id_2", lAsym2 },
5236 {
"pdbx_PDB_ins_code_2", iCode2str },
5237 {
"pdbx_auth_comp_id_2", pep2 },
5238 {
"pdbx_auth_seq_id_2", seqNum2 },
5239 {
"pdbx_auth_asym_id_2", std::string{ chainID2 } },
5240 {
"pdbx_PDB_model_num", modNum },
5241 {
"pdbx_omega_angle", measure } });
5250 void PDBFileParser::ParseMiscellaneousFeatures()
5252 int structSiteGenID = 1;
5254 while (mRec->is(
"SITE "))
5257 std::string siteID = vS(12, 14);
5258 int numRes = vI(16, 17);
5262 auto cat = getCategory(
"struct_site_gen");
5264 for (
int i = 0;
i < numRes; ++
i)
5266 std::string resName = vS(o, o + 2);
5268 char chainID = vC(o + 4);
5269 int seq = vI(o + 5, o + 8);
5271 char iCode = vC(o + 9);
5278 std::tie(asym, labelSeq, isResseq) = MapResidue(chainID, seq, iCode, ec);
5283 std::cerr <<
"skipping struct_site_gen record" << std::endl;
5287 {
"id", structSiteGenID++ },
5288 {
"site_id", siteID },
5289 {
"pdbx_num_res", numRes },
5290 {
"label_comp_id", resName },
5291 {
"label_asym_id", asym },
5292 {
"label_seq_id", (labelSeq > 0 and isResseq) ?
std::to_string(labelSeq) : std::string(
".") },
5293 {
"pdbx_auth_ins_code", iCode ==
' ' ?
"" : std::string{ iCode } },
5294 {
"auth_comp_id", resName },
5295 {
"auth_asym_id", std::string{ chainID } },
5296 {
"auth_seq_id", seq },
5297 {
"label_atom_id",
"." },
5298 {
"label_alt_id",
"." },
5308 void PDBFileParser::ParseCrystallographic()
5310 if (mRec->is(
"CRYST1"))
5312 Match(
"CRYST1",
true);
5314 getCategory(
"cell")->emplace({
5315 {
"entry_id", mStructureID },
5316 {
"length_a", vF(7, 15) },
5317 {
"length_b", vF(16, 24) },
5318 {
"length_c", vF(25, 33) },
5319 {
"angle_alpha", vF(34, 40) },
5320 {
"angle_beta", vF(41, 47) },
5321 {
"angle_gamma", vF(48, 54) },
5323 {
"Z_PDB", vF(67, 70) }
5326 std::string spaceGroup, intTablesNr;
5329 spaceGroup = vS(56, 66);
5336 getCategory(
"symmetry")->emplace({
5337 {
"entry_id", mStructureID },
5338 {
"space_group_name_H-M", spaceGroup },
5339 {
"Int_Tables_number", intTablesNr } });
5347 getCategory(
"cell")->emplace({
5348 {
"entry_id", mStructureID },
5352 {
"angle_alpha", 90 },
5353 {
"angle_beta", 90 },
5354 {
"angle_gamma", 90 },
5359 getCategory(
"symmetry")->emplace({
5360 {
"entry_id", mStructureID },
5361 {
"space_group_name_H-M",
"P 1" },
5362 {
"Int_Tables_number", 1 }
5367 void PDBFileParser::ParseCoordinateTransformation()
5369 std::string
m[3][3], v[3];
5371 if (cif::starts_with(mRec->mName,
"ORIGX"))
5373 for (std::string
n : {
"1",
"2",
"3" })
5375 int x = stoi(
n) - 1;
5377 Match(
"ORIGX" +
n,
true);
5378 m[
x][0] = vF(11, 20);
5379 m[
x][1] = vF(21, 30);
5380 m[
x][2] = vF(31, 40);
5386 getCategory(
"database_PDB_matrix")->emplace({
5387 {
"entry_id", mStructureID },
5388 {
"origx[1][1]", m[0][0] },
5389 {
"origx[1][2]", m[0][1] },
5390 {
"origx[1][3]", m[0][2] },
5391 {
"origx[2][1]", m[1][0] },
5392 {
"origx[2][2]", m[1][1] },
5393 {
"origx[2][3]", m[1][2] },
5394 {
"origx[3][1]", m[2][0] },
5395 {
"origx[3][2]", m[2][1] },
5396 {
"origx[3][3]", m[2][2] },
5397 {
"origx_vector[1]", v[0] },
5398 {
"origx_vector[2]", v[1] },
5399 {
"origx_vector[3]", v[2] },
5403 if (cif::starts_with(mRec->mName,
"SCALE"))
5405 for (std::string
n : {
"1",
"2",
"3" })
5407 int x = stoi(
n) - 1;
5409 Match(
"SCALE" +
n,
true);
5410 m[
x][0] = vF(11, 20);
5411 m[
x][1] = vF(21, 30);
5412 m[
x][2] = vF(31, 40);
5418 getCategory(
"atom_sites")->emplace({
5419 {
"entry_id", mStructureID },
5420 {
"fract_transf_matrix[1][1]", m[0][0] },
5421 {
"fract_transf_matrix[1][2]", m[0][1] },
5422 {
"fract_transf_matrix[1][3]", m[0][2] },
5423 {
"fract_transf_matrix[2][1]", m[1][0] },
5424 {
"fract_transf_matrix[2][2]", m[1][1] },
5425 {
"fract_transf_matrix[2][3]", m[1][2] },
5426 {
"fract_transf_matrix[3][1]", m[2][0] },
5427 {
"fract_transf_matrix[3][2]", m[2][1] },
5428 {
"fract_transf_matrix[3][3]", m[2][2] },
5429 {
"fract_transf_vector[1]", v[0] },
5430 {
"fract_transf_vector[2]", v[1] },
5431 {
"fract_transf_vector[3]", v[2] },
5435 while (cif::starts_with(mRec->mName,
"MTRIX1"))
5437 int serial = 0, igiven = 0;
5439 for (std::string
n : {
"1",
"2",
"3" })
5441 int x = stoi(
n) - 1;
5443 Match(
"MTRIX" +
n,
true);
5445 m[
x][0] = vF(11, 20);
5446 m[
x][1] = vF(21, 30);
5447 m[
x][2] = vF(31, 40);
5449 igiven = vC(60) ==
'1';
5454 getCategory(
"struct_ncs_oper")->emplace({
5456 {
"matrix[1][1]", m[0][0] },
5457 {
"matrix[1][2]", m[0][1] },
5458 {
"matrix[1][3]", m[0][2] },
5459 {
"matrix[2][1]", m[1][0] },
5460 {
"matrix[2][2]", m[1][1] },
5461 {
"matrix[2][3]", m[1][2] },
5462 {
"matrix[3][1]", m[2][0] },
5463 {
"matrix[3][2]", m[2][1] },
5464 {
"matrix[3][3]", m[2][2] },
5465 {
"vector[1]", v[0] },
5466 {
"vector[2]", v[1] },
5467 {
"vector[3]", v[2] },
5468 {
"code", igiven ?
"given" :
"" } });
5472 void PDBFileParser::ParseCoordinate(
int modelNr)
5477 typedef std::tuple<std::string, int, bool, PDBRecord *, PDBRecord *> atomRec;
5479 std::vector<atomRec> atoms;
5480 while (mRec->is(
"ATOM ")
or mRec->is(
"HETATM"))
5482 char chainID = vC(22);
5483 int resSeq = vI(23, 26);
5484 char iCode = vC(27);
5490 std::tie(asymID, seqID, isResseq) = MapResidue(chainID, resSeq, iCode);
5492 PDBRecord *atom = mRec;
5493 PDBRecord *anisou =
nullptr;
5496 if (mRec->is(
"ANISOU"))
5502 atoms.emplace_back(asymID, seqID, isResseq, atom, anisou);
5504 while (mRec->is(
"TER "))
5506 Match(
"TER ",
true);
5514 auto rLess = [](
const atomRec &
a,
const atomRec &
b) ->
bool 5518 std::string chainA = std::get<0>(
a);
5519 std::string chainB = std::get<0>(
b);
5521 if (chainA.length() != chainB.length())
5522 d = static_cast<int>(chainA.length() - chainB.length());
5524 d = std::get<0>(
a).
compare(std::get<0>(
b));
5527 d = std::get<1>(
a) - std::get<1>(
b);
5531 stable_sort(atoms.begin(), atoms.end(), rLess);
5534 for (
size_t i = 0;
i + 1 < atoms.size(); ++
i)
5536 char altLoc = std::get<3>(atoms[
i])->vC(17);
5538 if (altLoc ==
' ' or altLoc == 0)
5541 auto b = atoms.begin() +
i;
5544 std::map<std::string, int> atomIndex;
5546 while (e != atoms.end() and rLess(*
b, *e) ==
false)
5548 std::string
name = std::get<3>(*e)->vS(13, 16);
5550 if (atomIndex.count(name) == 0)
5551 atomIndex[name] = static_cast<int>(atomIndex.size() + 1);
5556 auto aLess = [&](atomRec &
a, atomRec &
b) ->
bool 5558 std::string na = std::get<3>(
a)->vS(13, 16);
5559 std::string nb = std::get<3>(
b)->vS(13, 16);
5561 int d = atomIndex[na] - atomIndex[nb];
5563 d = std::get<3>(
a)->vC(17) - std::get<3>(
b)->vC(17);
5574 for (
auto &a : atoms)
5581 std::tie(asymID, seqID, isResseq, atom, anisou) =
a;
5587 std::string groupPDB = mRec->is(
"ATOM ") ?
"ATOM" :
"HETATM";
5589 std::string
name = vS(13, 16);
5590 char altLoc = vC(17);
5591 std::string resName = vS(18, 20);
5592 char chainID = vC(22);
5593 int resSeq = vI(23, 26);
5594 char iCode = vC(27);
5595 std::string
x = vF(31, 38);
5596 std::string
y = vF(39, 46);
5597 std::string
z = vF(47, 54);
5598 std::string occupancy = vF(55, 60);
5599 std::string tempFactor = vF(61, 66);
5600 std::string element = vS(77, 78);
5601 std::string charge = vS(79, 80);
5603 std::string entityID = mAsymID2EntityID[asymID];
5605 charge = pdb2cifCharge(charge);
5608 if (resName ==
"UNK" or cif::compound_factory::kAAMap.count(resName)
or cif::compound_factory::kBaseMap.count(resName))
5610 if (groupPDB ==
"HETATM")
5613 std::cerr <<
"Changing atom from HETATM to ATOM at line " << mRec->mLineNr << std::endl;
5619 if (groupPDB ==
"ATOM")
5622 std::cerr <<
"Changing atom from ATOM to HETATM at line " << mRec->mLineNr << std::endl;
5623 groupPDB =
"HETATM";
5628 if (mSugarEntities.count(entityID))
5630 using namespace cif::literals;
5632 auto &branch_scheme = *getCategory(
"pdbx_branch_scheme");
5633 resSeq = branch_scheme.find1<
int>(
"asym_id"_key == asymID and
"auth_seq_num"_key == resSeq,
"pdb_seq_num");
5636 getCategory(
"atom_site")->emplace({
5637 {
"group_PDB", groupPDB },
5639 {
"type_symbol", element },
5640 {
"label_atom_id", name },
5641 {
"label_alt_id", altLoc !=
' ' ? std::string{ altLoc } :
"." },
5642 {
"label_comp_id", resName },
5643 {
"label_asym_id", asymID },
5644 {
"label_entity_id", entityID },
5645 {
"label_seq_id", (isResseq and seqID > 0) ?
std::to_string(seqID) :
"." },
5646 {
"pdbx_PDB_ins_code", iCode ==
' ' ?
"" : std::string{ iCode } },
5650 {
"occupancy", occupancy },
5651 {
"B_iso_or_equiv", tempFactor },
5652 {
"pdbx_formal_charge", charge },
5653 {
"auth_seq_id", resSeq },
5654 {
"auth_comp_id", resName },
5655 {
"auth_asym_id", std::string{ chainID } },
5656 {
"auth_atom_id", name },
5657 {
"pdbx_PDB_model_num", modelNr } });
5659 InsertAtomType(element);
5661 std::string
check = vS(7, 11) + vS(77, 80);
5663 if (anisou !=
nullptr)
5666 int u11 = vI(29, 35);
5667 int u22 = vI(36, 42);
5668 int u33 = vI(43, 49);
5669 int u12 = vI(50, 56);
5670 int u13 = vI(57, 63);
5671 int u23 = vI(64, 70);
5673 if (vS(7, 11) + vS(77, 80) != check)
5674 throw std::runtime_error(
"ANISOU record should follow corresponding ATOM record");
5676 auto f = [](
float f) -> std::string
5677 {
return cif::format(
"%6.4f",
f).str(); };
5679 getCategory(
"atom_site_anisotrop")->emplace({
5681 {
"type_symbol", element },
5682 {
"pdbx_label_atom_id", name },
5683 {
"pdbx_label_alt_id", altLoc !=
' ' ? std::string{ altLoc } :
"." },
5684 {
"pdbx_label_comp_id", resName },
5685 {
"pdbx_label_asym_id", asymID },
5686 {
"pdbx_label_seq_id", (isResseq and seqID > 0) ?
std::to_string(seqID) :
"." },
5687 {
"U[1][1]",
f(u11 / 10000.
f) },
5688 {
"U[2][2]",
f(u22 / 10000.
f) },
5689 {
"U[3][3]",
f(u33 / 10000.
f) },
5690 {
"U[1][2]",
f(u12 / 10000.
f) },
5691 {
"U[1][3]",
f(u13 / 10000.
f) },
5692 {
"U[2][3]",
f(u23 / 10000.
f) },
5693 {
"pdbx_auth_seq_id", resSeq },
5694 {
"pdbx_auth_comp_id", resName },
5695 {
"pdbx_auth_asym_id", std::string{ chainID } },
5696 {
"pdbx_auth_atom_id", name } });
5703 void PDBFileParser::ParseConnectivty()
5705 while (mRec->is(
"CONECT"))
5709 void PDBFileParser::ParseBookkeeping()
5711 if (mRec->is(
"MASTER"))
5713 Match(
"MASTER",
false);
5716 Match(
"END ",
false);
5723 mDatablock.set_validator(result.get_validator());
5732 ParsePrimaryStructure();
5735 ConstructEntities();
5739 ParseSecondaryStructure();
5740 ParseConnectivtyAnnotation();
5741 ParseMiscellaneousFeatures();
5742 ParseCrystallographic();
5743 ParseCoordinateTransformation();
5745 uint32_t modelNr = 1;
5746 bool hasAtoms =
false;
5748 while (mRec->is(
"MODEL ")
or mRec->is(
"ATOM ")
or mRec->is(
"HETATM"))
5751 if (mRec->is(
"MODEL "))
5755 modelNr = vI(11, 14);
5760 hasAtoms = hasAtoms
or mRec->is(
"ATOM ")
or mRec->is(
"HETATM");
5762 ParseCoordinate(modelNr);
5766 Match(
"ENDMDL",
true);
5772 throw std::runtime_error(
"Either the PDB file has no atom records, or the field " + std::string(mRec->mName) +
" is not at the correct location");
5774 for (
auto e : mAtomTypes)
5775 getCategory(
"atom_type")->emplace({
5779 getCategory(
"atom_type")->reorder_by_index();
5788 auto r = FindRecord(
"REMARK 3");
5790 if (r !=
nullptr and Remark3Parser::parse(mExpMethod, r, mDatablock))
5793 auto exptl = getCategory(
"exptl");
5797 {
"entry_id", mStructureID },
5798 {
"method", mExpMethod },
5799 {
"crystals_number", mRemark200[
"NUMBER OF CRYSTALS USED"] } });
5803 catch (
const std::exception &ex)
5806 std::cerr <<
"Error parsing REMARK 3" << std::endl;
5836 using namespace cif::literals;
5838 auto &atom_site = *getCategory(
"atom_site");
5840 for (
auto r : getCategory(
"struct_conn")->
find(
"pdbx_dist_value"_key == 0
or "pdbx_dist_value"_key == cif::null))
5842 const auto &[asym1, seq1, atom1, symm1, asym2, seq2, atom2, symm2] = r.get<std::string, std::string, std::string, std::string, std::string, std::string, std::string, std::string>(
5843 "ptnr1_label_asym_id",
"ptnr1_label_seq_id",
"ptnr1_label_atom_id",
"ptnr1_symmetry",
5844 "ptnr2_label_asym_id",
"ptnr2_label_seq_id",
"ptnr2_label_atom_id",
"ptnr2_symmetry");
5850 auto a1 = atom_site.find1(
"label_asym_id"_key == asym1 and
"label_seq_id"_key == seq1 and
"label_atom_id"_key == atom1);
5851 auto a2 = atom_site.find1(
"label_asym_id"_key == asym2 and
"label_seq_id"_key == seq2 and
"label_atom_id"_key == atom2);
5853 if (not a1
or not a2)
5854 throw std::runtime_error(
"cannot find atom");
5856 const auto &[x1, y1, z1] = a1.get<float, float,
float>(
"cartn_x",
"cartn_y",
"cartn_z");
5857 const auto &[x2, y2, z2] = a2.get<float, float,
float>(
"cartn_x",
"cartn_y",
"cartn_z");
5859 if ((symm1.empty()
or symm1 ==
"1_555") and (symm2.empty()
or symm2 ==
"1_555"))
5861 (x1 - x2) * (x1 - x2) +
5862 (y1 - y2) * (y1 - y2) +
5863 (z1 - z2) * (z1 - z2)
5866 std::cerr <<
"Cannot calculate distance for link since one of the atoms is in another dimension" << std::endl;
5868 catch (std::exception &ex)
5871 std::cerr <<
"Error finding atom for LINK distance calculation: " << ex.what() << std::endl;
5877 result.emplace_back(std::move(mDatablock));
5879 catch (
const std::exception &ex)
5883 std::cerr <<
"Error parsing PDB";
5884 if (mRec !=
nullptr)
5885 std::cerr <<
" at line " << mRec->mLineNr;
5886 std::cerr << std::endl;
5898 template <
typename T>
5913 std::fill(m_data, m_data + (m_m * m_n), v);
5928 return m_data[i * m_n +
j];
5935 return m_data[i * m_n +
j];
5943 int PDBFileParser::PDBChain::AlignResToSeqRes()
5950 auto &ry = mResiduesSeen;
5952 int dimX =
static_cast<int>(mSeqres.size());
5954 throw std::runtime_error(std::string(
"SEQRES for chain ") + mDbref.chainID +
" is empty");
5956 int dimY =
static_cast<int>(mResiduesSeen.size());
5958 throw std::runtime_error(std::string(
"Number of residues in ATOM records for chain ") + mDbref.chainID +
" is zero");
5960 matrix<float> B(dimX, dimY), Ix(dimX, dimY), Iy(dimX, dimY);
5967 kMismatchCost = -10,
5968 kGapOpen = 10, gapExtend = 0.1f;
5971 int highX = 0, highY = 0;
5973 for (x = 0; x < dimX; ++
x)
5975 for (y = 0; y < dimY; ++
y)
5980 float Ix1 = x > 0 ? Ix(x - 1, y) : 0;
5981 float Iy1 = y > 0 ? Iy(x, y - 1) : 0;
5985 if (
a.mMonID ==
b.mMonID)
5992 float gapOpen = kGapOpen;
5993 if (y == 0
or (y + 1 < dimY and ry[y + 1].mSeqNum > ry[y].mSeqNum + 1))
5996 if (x > 0 and y > 0)
5997 M += B(x - 1, y - 1);
6000 if (M >= Ix1 and M >= Iy1)
6005 Ix(x, y) = M - (x < dimX - 1 ? gapOpen : 0);
6006 Iy(x, y) = M - (y < dimY - 1 ? gapOpen : 0);
6008 else if (Ix1 >= Iy1)
6013 Ix(x, y) = Ix1 - gapExtend;
6014 Iy(x, y) = M - (y < dimY - 1 ? gapOpen : 0);
6015 if (Iy(x, y) < Iy1 - gapExtend)
6016 Iy(x, y) = Iy1 - gapExtend;
6023 Ix(x, y) = M - (x < dimX - 1 ? gapOpen : 0);
6024 if (Ix(x, y) < Ix1 - gapExtend)
6025 Ix(x, y) = Ix1 - gapExtend;
6026 Iy(x, y) = Iy1 - gapExtend;
6043 sr.mSeqNum = kFlagSeqNr;
6052 auto printAlignment = [&tb, highX, highY, &rx, &ry,
this]()
6055 <<
"Alignment for chain " << mDbref.chainID << std::endl
6057 std::vector<std::pair<std::string, std::string>> alignment;
6062 for (x = highX, y = highY; x >= 0 and y >= 0;)
6067 alignment.push_back(make_pair(
"...", ry[y].mMonID));
6072 alignment.push_back(make_pair(rx[x].mMonID,
"..."));
6077 alignment.push_back(make_pair(rx[x].mMonID, ry[y].mMonID));
6086 alignment.push_back(make_pair(rx[x].mMonID,
"..."));
6092 alignment.push_back(make_pair(
"...", ry[y].mMonID));
6096 reverse(alignment.begin(), alignment.end());
6097 for (
auto a : alignment)
6098 std::cerr <<
" " <<
a.first <<
" -- " <<
a.second << std::endl;
6100 std::cerr << std::endl;
6108 while (x >= 0 and y >= 0)
6113 throw std::runtime_error(
"A residue found in the ATOM records (" + ry[y].mMonID +
6114 " @ " + std::string{ mDbref.chainID } +
":" +
std::to_string(ry[y].mSeqNum) +
6115 ((ry[
y].mIcode ==
' ' or ry[
y].mIcode == 0) ?
"" : std::string{ ry[
y].mIcode }) +
6116 ") was not found in the SEQRES records");
6121 std::cerr <<
"Missing residue in ATOM records: " << rx[
x].mMonID <<
" at " << rx[
x].mSeqNum << std::endl;
6127 if (rx[x].mMonID != ry[y].mMonID)
6129 std::cerr <<
"Warning, unaligned residues at " << x <<
"/" << y <<
"(" << rx[
x].mMonID <<
'/' << ry[
y].mMonID <<
") SEQRES does not agree with ATOM records" << std::endl;
6130 rx[
x].mMonID = ry[
y].mMonID;
6133 rx[
x].mSeqNum = ry[
y].mSeqNum;
6134 rx[
x].mIcode = ry[
y].mIcode;
6141 catch (
const std::exception &ex)
6150 std::stack<int> unnumbered;
6151 for (x = 0; x < dimX; ++
x)
6153 if (rx[x].mSeqNum == kFlagSeqNr)
6155 if (x > 0 and rx[x - 1].mSeqNum != kFlagSeqNr)
6156 rx[
x].mSeqNum = rx[x - 1].mSeqNum + 1;
6162 while (unnumbered.empty() ==
false)
6164 x = unnumbered.top();
6166 throw std::runtime_error(
"Could not assign sequence numbers");
6167 rx[
x].mSeqNum = rx[x + 1].mSeqNum - 1;
6174 bool PDBFileParser::PDBChain::SameSequence(
const PDBChain &rhs)
const 6176 bool result = mSeqres.size() == rhs.mSeqres.size();
6178 for (
size_t i = 0; result and
i < mSeqres.size(); ++
i)
6179 result = mSeqres[
i].mMonID == rhs.mSeqres[
i].mMonID;
6190 cifFile.load_dictionary(
"mmcif_pdbx.dic");
6192 p.
Parse(pdbFile, cifFile);
6195 std::cerr <<
"Resulting mmCIF file is not valid!" << std::endl;
6204 auto *
buffer = is.rdbuf();
6207 char ch = std::char_traits<char>::to_char_type(
buffer->sgetc());
6213 if (ch ==
'h' or ch ==
'H')
6221 catch (
const std::exception &ex)
6223 std::throw_with_nested(std::runtime_error(
"Since the file did not start with a valid PDB HEADER line mmCIF was assumed, but that failed."));
6229 if (result.get_validator() ==
nullptr)
6230 result.load_dictionary(
"mmcif_pdbx.dic");
6235 file
read(
const std::filesystem::path &file)
6239 gzio::ifstream
in(file);
6240 if (not in.is_open())
6241 throw std::runtime_error(
"Could not open file " + file.string() +
" for input");
6245 catch (
const std::exception &ex)
6247 throw_with_nested(std::runtime_error(
"Error reading file " + file.string()));
uint32_t get_terminal_width()
void to_lower(std::string &s)
void min(Image< double > &op1, const Image< double > &op2)
std::string trim_copy(std::string_view s)
void ReadPDBFile(std::istream &pdbFile, cif::file &cifFile)
void replace_all(std::string &s, std::string_view what, std::string_view with)
const std::set< std::string > kSupportedRecords
bool operator==(faketype, faketype)
void sqrt(Image< double > &op)
std::vector< SelLine >::iterator find(std::vector< SelLine > &text, const std::string &img_name)
void trim(std::string &s)
file read(const std::filesystem::path &file)
const std::map< std::string, int > kMonths
bool icontains(std::string_view s, std::string_view q)
bool operator!=(faketype, faketype)
void compare(Image< double > &op1, const Image< double > &op2)
matrix(uint32_t m, uint32_t n, T v=T())
std::ostream & operator<<(std::ostream &os, const Message &sb)
bool operator==(const AtomRes &rhs) const
bool iequals(std::string_view a, std::string_view b)
ql0001_ & k(htemp+1),(cvec+1),(atemp+1),(bj+1),(bl+1),(bu+1),(x+1),(clamda+1), &iout, infoqp, &zero,(w+1), &lenw,(iw+1), &leniw, &glob_grd.epsmac
bool operator!=(const AtomRes &rhs) const
std::string to_lower_copy(std::string_view s)
if(fabs(c[*nmax+ *nmax *c_dim1])==0.e0)
std::error_code make_error_code(pdbErrors e)
void max(Image< double > &op1, const Image< double > &op2)
basic_istream< char, std::char_traits< char > > istream
std::string trim_right_copy(std::string_view s)
std::string message(int value) const
void sort(struct DCEL_T *dcel)
value_type operator()(uint32_t i, uint32_t j) const
void Parse(std::istream &is, cif::file &result)
std::string cif_id_for_number(int number)
TYPE distance(struct Point_T *p, struct Point_T *q)
void trim_right(std::string &s)
double psi(const double x)
value_type & operator()(uint32_t i, uint32_t j)
std::string to_string(bond_type bondType)
SpecificationListParser(const std::string &text)
const char * name() const noexcept
bool isWater(const std::string &resname)
std::error_category & pdbCategory()
int get_space_group_number(std::string spacegroup)
bool operator<(const SparseElement &_x, const SparseElement &_y)
std::tuple< std::string, std::string > GetNextSpecification()
check(nparam, nf, nfsr, &Linfty, nineq, nineqn, neq, neqn, ncsrl, ncsrn, mode, &modem, eps, bgbnd, param)