27 #include "cif++/utilities.hpp" 28 #include "cif++/forward_decl.hpp" 29 #include "cif++/parser.hpp" 30 #include "cif++/file.hpp" 44 : m_source(*is.rdbuf())
46 if (is.rdbuf() ==
nullptr)
47 throw std::runtime_error(
"Attempt to read from uninitialised stream");
54 m_lookahead = get_next_token();
60 int sac_parser::get_next_char()
62 int result = std::char_traits<char>::eof();
65 result = m_source.sbumpc();
68 result = m_buffer.back();
75 int lookahead = m_source.sbumpc();
76 if (lookahead !=
'\n')
77 m_buffer.push_back(lookahead);
81 if (result == std::char_traits<char>::eof())
82 m_token_value.push_back(0);
84 m_token_value.push_back(std::char_traits<char>::to_char_type(result));
91 std::cerr <<
"get_next_char => ";
92 if (iscntrl(result)
or not isprint(result))
93 std::cerr << int(result) << std::endl;
95 std::cerr << char(result) << std::endl;
101 void sac_parser::retract()
103 assert(not m_token_value.empty());
105 char ch = m_token_value.back();
109 m_buffer.push_back(ch == 0 ? std::char_traits<char>::eof() : std::char_traits<char>::to_int_type(ch));
110 m_token_value.pop_back();
113 int sac_parser::restart(
int start)
117 while (not m_token_value.empty())
131 result = State::Value;
135 error(
"Invalid state in SacParser");
143 sac_parser::CIFToken sac_parser::get_next_token()
145 const auto kEOF = std::char_traits<char>::eof();
147 CIFToken result = CIFToken::Unknown;
149 int state = State::Start, start = State::Start;
152 m_token_value.clear();
153 mTokenType = CIFValue::Unknown;
155 while (result == CIFToken::Unknown)
157 auto ch = get_next_char();
163 result = CIFToken::Eof;
167 state = State::White;
169 else if (ch ==
' ' or ch ==
'\t')
170 state = State::White;
172 state = State::Comment;
175 else if (ch ==
';' and m_bol)
176 state = State::TextField;
177 else if (ch ==
'\'' or ch ==
'"')
180 state = State::QuotedString;
183 state = start = restart(start);
188 result = CIFToken::Eof;
189 else if (not isspace(ch))
191 state = State::Start;
193 m_token_value.clear();
196 m_bol = (ch ==
'\n');
202 state = State::Start;
204 m_token_value.clear();
207 result = CIFToken::Eof;
208 else if (not is_any_print(ch))
209 error(
"invalid character in comment");
212 case State::TextField:
214 state = State::TextField + 1;
216 error(
"unterminated textfield");
220 warning(
"invalid character in text field '" + std::string({
static_cast<char>(ch)}) +
"' (" +
std::to_string((
int)ch) +
")");
228 case State::TextField + 1:
229 if (is_text_lead(ch)
or ch ==
' ' or ch ==
'\t')
230 state = State::TextField;
233 assert(m_token_value.length() >= 2);
234 m_token_value = m_token_value.substr(1, m_token_value.length() - 3);
235 mTokenType = CIFValue::TextField;
236 result = CIFToken::Value;
239 error(
"unterminated textfield");
241 error(
"invalid character in text field");
244 case State::QuotedString:
246 error(
"unterminated quoted string");
247 else if (ch == quoteChar)
248 state = State::QuotedStringQuote;
250 warning(
"invalid character in quoted string: '" + std::string({
static_cast<char>(ch)}) +
"' (" +
std::to_string((
int)ch) +
")");
253 case State::QuotedStringQuote:
257 result = CIFToken::Value;
260 if (m_token_value.length() < 2)
261 error(
"Invalid quoted string token");
263 m_token_value = m_token_value.substr(1, m_token_value.length() - 2);
265 else if (ch == quoteChar)
267 else if (is_any_print(ch))
268 state = State::QuotedString;
270 error(
"unterminated quoted string");
272 error(
"invalid character in quoted string");
276 if (not is_non_blank(ch))
279 result = CIFToken::Tag;
284 if (ch ==
'+' or ch ==
'-')
288 else if (isdigit(ch))
291 state = start = restart(start);
300 else if (tolower(ch) ==
'e')
302 else if (is_white(ch)
or ch == kEOF)
305 result = CIFToken::Value;
306 mTokenType = CIFValue::Int;
309 state = start = restart(start);
314 if (tolower(ch) ==
'e')
316 else if (is_white(ch)
or ch == kEOF)
319 result = CIFToken::Value;
323 state = start = restart(start);
328 if (ch ==
'-' or ch ==
'+')
330 else if (isdigit(ch))
333 state = start = restart(start);
340 state = start = restart(start);
344 if (is_white(ch)
or ch == kEOF)
347 result = CIFToken::Value;
351 state = start = restart(start);
355 if (isdigit(ch)
or ch ==
'+' or ch ==
'-')
356 state = State::Int + 1;
358 state = start = restart(start);
362 if (is_white(ch)
or ch == kEOF)
365 result = CIFToken::Value;
366 mTokenType = CIFValue::Int;
369 state = start = restart(start);
390 if (result == CIFToken::Unknown and not is_non_blank(ch))
393 result = CIFToken::Value;
395 if (m_token_value ==
".")
396 mTokenType = CIFValue::Inapplicable;
397 else if (
iequals(m_token_value,
"global_"))
398 result = CIFToken::GLOBAL;
399 else if (
iequals(m_token_value,
"stop_"))
400 result = CIFToken::STOP;
401 else if (
iequals(m_token_value,
"loop_"))
402 result = CIFToken::LOOP;
403 else if (m_token_value ==
"?")
405 mTokenType = CIFValue::Unknown;
406 m_token_value.clear();
413 if (not is_non_blank(ch))
420 result = CIFToken::SAVE;
422 m_token_value.erase(m_token_value.begin(), m_token_value.begin() + 5);
428 error(
"Invalid state in get_next_token");
435 std::cerr << get_token_name(result);
436 if (mTokenType != CIFValue::Unknown)
437 std::cerr <<
' ' << get_value_name(mTokenType);
438 if (result != CIFToken::Eof)
439 std::cerr <<
" " << std::quoted(m_token_value);
440 std::cerr << std::endl;
446 void sac_parser::match(CIFToken token)
448 if (m_lookahead != token)
449 error(std::string(
"Unexpected token, expected ") + get_token_name(token) +
" but found " + get_token_name(m_lookahead));
451 m_lookahead = get_next_token();
454 bool sac_parser::parse_single_datablock(
const std::string &datablock)
469 std::string dblk =
"data_" + datablock;
470 std::string::size_type si = 0;
473 for (
auto ch = m_source.sbumpc(); not found and ch != std::streambuf::traits_type::eof(); ch = m_source.sbumpc())
480 case '#': state = comment;
break;
505 state = string_quote;
509 if (std::isspace(ch))
516 if (ch ==
';' and bol)
521 if (isspace(ch) and dblk[si] == 0)
523 else if (dblk[si++] != ch)
533 produce_datablock(datablock);
534 m_lookahead = get_next_token();
541 sac_parser::datablock_index sac_parser::index_datablocks()
543 datablock_index
index;
559 const char dblk[] =
"data_";
560 std::string::size_type si = 0;
561 std::string datablock;
563 for (
auto ch = m_source.sbumpc(); ch != std::streambuf::traits_type::eof(); ch = m_source.sbumpc())
570 case '#': state = comment;
break;
595 state = string_quote;
599 if (std::isspace(ch))
606 if (ch ==
';' and bol)
611 if (dblk[si] == 0 and is_non_blank(ch))
613 datablock = {
static_cast<char>(ch)};
616 else if (dblk[si++] != ch)
621 if (is_non_blank(ch))
622 datablock.insert(datablock.end(), char(ch));
623 else if (isspace(ch))
625 if (not datablock.empty())
626 index[datablock] = m_source.pubseekoff(0, std::ios_base::cur,
std::ios_base::in);
641 bool sac_parser::parse_single_datablock(
const std::string &datablock,
const datablock_index &index)
645 auto i = index.find(datablock);
646 if (
i != index.end())
650 produce_datablock(datablock);
651 m_lookahead = get_next_token();
660 void sac_parser::parse_file()
662 while (m_lookahead != CIFToken::Eof)
666 case CIFToken::GLOBAL:
671 produce_datablock(m_token_value);
678 error(
"This file does not seem to be an mmCIF file");
684 void sac_parser::parse_global()
686 match(CIFToken::GLOBAL);
687 while (m_lookahead == CIFToken::Tag)
689 match(CIFToken::Tag);
690 match(CIFToken::Value);
694 void sac_parser::parse_datablock()
696 static const std::string kUnitializedCategory(
"<invalid>");
697 std::string cat = kUnitializedCategory;
699 while (m_lookahead == CIFToken::LOOP
or m_lookahead == CIFToken::Tag
or m_lookahead == CIFToken::SAVE)
705 cat = kUnitializedCategory;
707 match(CIFToken::LOOP);
709 std::vector<std::string> tags;
711 while (m_lookahead == CIFToken::Tag)
713 std::string catName, itemName;
716 if (cat == kUnitializedCategory)
718 produce_category(catName);
721 else if (not
iequals(cat, catName))
722 error(
"inconsistent categories in loop_");
724 tags.push_back(itemName);
726 match(CIFToken::Tag);
729 while (m_lookahead == CIFToken::Value)
733 for (
auto tag : tags)
735 produce_item(cat, tag, m_token_value);
736 match(CIFToken::Value);
746 std::string catName, itemName;
751 produce_category(catName);
756 match(CIFToken::Tag);
758 produce_item(cat, itemName, m_token_value);
760 match(CIFToken::Value);
775 void sac_parser::parse_save_frame()
777 error(
"A regular CIF file should not contain a save frame");
782 void parser::produce_datablock(
const std::string &name)
785 std::cerr <<
"producing data_" << name << std::endl;
787 const auto &[
iter, ignore] = m_file.emplace(name);
788 m_datablock = &(*iter);
791 void parser::produce_category(
const std::string &name)
794 std::cerr <<
"producing category " << name << std::endl;
796 const auto &[cat, ignore] = m_datablock->emplace(name);
800 void parser::produce_row()
802 if (
VERBOSE >= 4 and m_category !=
nullptr)
803 std::cerr <<
"producing row for category " << m_category->name() << std::endl;
805 if (m_category ==
nullptr)
806 error(
"inconsistent categories in loop_");
808 m_category->emplace({});
809 m_row = m_category->back();
813 void parser::produce_item(
const std::string &category,
const std::string &item,
const std::string &value)
816 std::cerr <<
"producing _" << category <<
'.' << item <<
" -> " << value << std::endl;
818 if (m_category ==
nullptr or not
iequals(category, m_category->name()))
819 error(
"inconsistent categories in loop_");
821 m_row[item] = m_token_value;
bool iequals(std::string_view a, std::string_view b)
std::string to_lower_copy(std::string_view s)
FloatingPoint< float > Float
std::tuple< std::string, std::string > split_tag_name(std::string_view tag)
basic_istream< char, std::char_traits< char > > istream
std::string to_string(bond_type bondType)