Xmipp  v3.23.11-Nereus
parser.cpp
Go to the documentation of this file.
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2020 NKI/AVL, Netherlands Cancer Institute
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  *
9  * 1. Redistributions of source code must retain the above copyright notice, this
10  * list of conditions and the following disclaimer
11  * 2. Redistributions in binary form must reproduce the above copyright notice,
12  * this list of conditions and the following disclaimer in the documentation
13  * and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  */
26 
27 #include "cif++/utilities.hpp"
28 #include "cif++/forward_decl.hpp"
29 #include "cif++/parser.hpp"
30 #include "cif++/file.hpp"
31 
32 #include <cassert>
33 #include <iostream>
34 #include <map>
35 #include <regex>
36 #include <stack>
37 
38 namespace cif
39 {
40 
41 // --------------------------------------------------------------------
42 
43 sac_parser::sac_parser(std::istream &is, bool init)
44  : m_source(*is.rdbuf())
45 {
46  if (is.rdbuf() == nullptr)
47  throw std::runtime_error("Attempt to read from uninitialised stream");
48 
49  m_validate = true;
50  m_line_nr = 1;
51  m_bol = true;
52 
53  if (init)
54  m_lookahead = get_next_token();
55 }
56 
57 // get_next_char takes a char from the buffer, or if it is empty
58 // from the istream. This function also does carriage/linefeed
59 // translation.
60 int sac_parser::get_next_char()
61 {
62  int result = std::char_traits<char>::eof();
63 
64  if (m_buffer.empty())
65  result = m_source.sbumpc();
66  else
67  {
68  result = m_buffer.back();
69  m_buffer.pop_back();
70  }
71 
72  // very simple CR/LF translation into LF
73  if (result == '\r')
74  {
75  int lookahead = m_source.sbumpc();
76  if (lookahead != '\n')
77  m_buffer.push_back(lookahead);
78  result = '\n';
79  }
80 
81  if (result == std::char_traits<char>::eof())
82  m_token_value.push_back(0);
83  else
84  m_token_value.push_back(std::char_traits<char>::to_char_type(result));
85 
86  if (result == '\n')
87  ++m_line_nr;
88 
89  if (VERBOSE >= 6)
90  {
91  std::cerr << "get_next_char => ";
92  if (iscntrl(result) or not isprint(result))
93  std::cerr << int(result) << std::endl;
94  else
95  std::cerr << char(result) << std::endl;
96  }
97 
98  return result;
99 }
100 
101 void sac_parser::retract()
102 {
103  assert(not m_token_value.empty());
104 
105  char ch = m_token_value.back();
106  if (ch == '\n')
107  --m_line_nr;
108 
109  m_buffer.push_back(ch == 0 ? std::char_traits<char>::eof() : std::char_traits<char>::to_int_type(ch));
110  m_token_value.pop_back();
111 }
112 
113 int sac_parser::restart(int start)
114 {
115  int result = 0;
116 
117  while (not m_token_value.empty())
118  retract();
119 
120  switch (start)
121  {
122  case State::Start:
123  result = State::Float;
124  break;
125 
126  case State::Float:
127  result = State::Int;
128  break;
129 
130  case State::Int:
131  result = State::Value;
132  break;
133 
134  default:
135  error("Invalid state in SacParser");
136  }
137 
138  m_bol = false;
139 
140  return result;
141 }
142 
143 sac_parser::CIFToken sac_parser::get_next_token()
144 {
145  const auto kEOF = std::char_traits<char>::eof();
146 
147  CIFToken result = CIFToken::Unknown;
148  int quoteChar = 0;
149  int state = State::Start, start = State::Start;
150  m_bol = false;
151 
152  m_token_value.clear();
153  mTokenType = CIFValue::Unknown;
154 
155  while (result == CIFToken::Unknown)
156  {
157  auto ch = get_next_char();
158 
159  switch (state)
160  {
161  case State::Start:
162  if (ch == kEOF)
163  result = CIFToken::Eof;
164  else if (ch == '\n')
165  {
166  m_bol = true;
167  state = State::White;
168  }
169  else if (ch == ' ' or ch == '\t')
170  state = State::White;
171  else if (ch == '#')
172  state = State::Comment;
173  else if (ch == '_')
174  state = State::Tag;
175  else if (ch == ';' and m_bol)
176  state = State::TextField;
177  else if (ch == '\'' or ch == '"')
178  {
179  quoteChar = ch;
180  state = State::QuotedString;
181  }
182  else
183  state = start = restart(start);
184  break;
185 
186  case State::White:
187  if (ch == kEOF)
188  result = CIFToken::Eof;
189  else if (not isspace(ch))
190  {
191  state = State::Start;
192  retract();
193  m_token_value.clear();
194  }
195  else
196  m_bol = (ch == '\n');
197  break;
198 
199  case State::Comment:
200  if (ch == '\n')
201  {
202  state = State::Start;
203  m_bol = true;
204  m_token_value.clear();
205  }
206  else if (ch == kEOF)
207  result = CIFToken::Eof;
208  else if (not is_any_print(ch))
209  error("invalid character in comment");
210  break;
211 
212  case State::TextField:
213  if (ch == '\n')
214  state = State::TextField + 1;
215  else if (ch == kEOF)
216  error("unterminated textfield");
217  // else if (ch == '\\')
218  // state = State::Esc;
219  else if (not is_any_print(ch) and cif::VERBOSE > 2)
220  warning("invalid character in text field '" + std::string({static_cast<char>(ch)}) + "' (" + std::to_string((int)ch) + ")");
221  break;
222 
223  // case State::Esc:
224  // if (ch == '\n')
225 
226  // break;
227 
228  case State::TextField + 1:
229  if (is_text_lead(ch) or ch == ' ' or ch == '\t')
230  state = State::TextField;
231  else if (ch == ';')
232  {
233  assert(m_token_value.length() >= 2);
234  m_token_value = m_token_value.substr(1, m_token_value.length() - 3);
235  mTokenType = CIFValue::TextField;
236  result = CIFToken::Value;
237  }
238  else if (ch == kEOF)
239  error("unterminated textfield");
240  else if (ch != '\n')
241  error("invalid character in text field");
242  break;
243 
244  case State::QuotedString:
245  if (ch == kEOF)
246  error("unterminated quoted string");
247  else if (ch == quoteChar)
248  state = State::QuotedStringQuote;
249  else if (not is_any_print(ch) and cif::VERBOSE > 2)
250  warning("invalid character in quoted string: '" + std::string({static_cast<char>(ch)}) + "' (" + std::to_string((int)ch) + ")");
251  break;
252 
253  case State::QuotedStringQuote:
254  if (is_white(ch))
255  {
256  retract();
257  result = CIFToken::Value;
258  mTokenType = CIFValue::String;
259 
260  if (m_token_value.length() < 2)
261  error("Invalid quoted string token");
262 
263  m_token_value = m_token_value.substr(1, m_token_value.length() - 2);
264  }
265  else if (ch == quoteChar)
266  ;
267  else if (is_any_print(ch))
268  state = State::QuotedString;
269  else if (ch == kEOF)
270  error("unterminated quoted string");
271  else
272  error("invalid character in quoted string");
273  break;
274 
275  case State::Tag:
276  if (not is_non_blank(ch))
277  {
278  retract();
279  result = CIFToken::Tag;
280  }
281  break;
282 
283  case State::Float:
284  if (ch == '+' or ch == '-')
285  {
286  state = State::Float + 1;
287  }
288  else if (isdigit(ch))
289  state = State::Float + 1;
290  else
291  state = start = restart(start);
292  break;
293 
294  case State::Float + 1:
295  // if (ch == '(') // numeric???
296  // mState = State::NumericSuffix;
297  // else
298  if (ch == '.')
299  state = State::Float + 2;
300  else if (tolower(ch) == 'e')
301  state = State::Float + 3;
302  else if (is_white(ch) or ch == kEOF)
303  {
304  retract();
305  result = CIFToken::Value;
306  mTokenType = CIFValue::Int;
307  }
308  else
309  state = start = restart(start);
310  break;
311 
312  // parsed '.'
313  case State::Float + 2:
314  if (tolower(ch) == 'e')
315  state = State::Float + 3;
316  else if (is_white(ch) or ch == kEOF)
317  {
318  retract();
319  result = CIFToken::Value;
320  mTokenType = CIFValue::Float;
321  }
322  else
323  state = start = restart(start);
324  break;
325 
326  // parsed 'e'
327  case State::Float + 3:
328  if (ch == '-' or ch == '+')
329  state = State::Float + 4;
330  else if (isdigit(ch))
331  state = State::Float + 5;
332  else
333  state = start = restart(start);
334  break;
335 
336  case State::Float + 4:
337  if (isdigit(ch))
338  state = State::Float + 5;
339  else
340  state = start = restart(start);
341  break;
342 
343  case State::Float + 5:
344  if (is_white(ch) or ch == kEOF)
345  {
346  retract();
347  result = CIFToken::Value;
348  mTokenType = CIFValue::Float;
349  }
350  else
351  state = start = restart(start);
352  break;
353 
354  case State::Int:
355  if (isdigit(ch) or ch == '+' or ch == '-')
356  state = State::Int + 1;
357  else
358  state = start = restart(start);
359  break;
360 
361  case State::Int + 1:
362  if (is_white(ch) or ch == kEOF)
363  {
364  retract();
365  result = CIFToken::Value;
366  mTokenType = CIFValue::Int;
367  }
368  else
369  state = start = restart(start);
370  break;
371 
372  case State::Value:
373  if (ch == '_')
374  {
375  std::string s = to_lower_copy(m_token_value);
376 
377  if (s == "data_")
378  {
379  state = State::DATA;
380  continue;
381  }
382 
383  if (s == "save_")
384  {
385  state = State::SAVE;
386  continue;
387  }
388  }
389 
390  if (result == CIFToken::Unknown and not is_non_blank(ch))
391  {
392  retract();
393  result = CIFToken::Value;
394 
395  if (m_token_value == ".")
396  mTokenType = CIFValue::Inapplicable;
397  else if (iequals(m_token_value, "global_"))
398  result = CIFToken::GLOBAL;
399  else if (iequals(m_token_value, "stop_"))
400  result = CIFToken::STOP;
401  else if (iequals(m_token_value, "loop_"))
402  result = CIFToken::LOOP;
403  else if (m_token_value == "?")
404  {
405  mTokenType = CIFValue::Unknown;
406  m_token_value.clear();
407  }
408  }
409  break;
410 
411  case State::DATA:
412  case State::SAVE:
413  if (not is_non_blank(ch))
414  {
415  retract();
416 
417  if (state == State::DATA)
418  result = CIFToken::DATA;
419  else
420  result = CIFToken::SAVE;
421 
422  m_token_value.erase(m_token_value.begin(), m_token_value.begin() + 5);
423  }
424  break;
425 
426  default:
427  assert(false);
428  error("Invalid state in get_next_token");
429  break;
430  }
431  }
432 
433  if (VERBOSE >= 5)
434  {
435  std::cerr << get_token_name(result);
436  if (mTokenType != CIFValue::Unknown)
437  std::cerr << ' ' << get_value_name(mTokenType);
438  if (result != CIFToken::Eof)
439  std::cerr << " " << std::quoted(m_token_value);
440  std::cerr << std::endl;
441  }
442 
443  return result;
444 }
445 
446 void sac_parser::match(CIFToken token)
447 {
448  if (m_lookahead != token)
449  error(std::string("Unexpected token, expected ") + get_token_name(token) + " but found " + get_token_name(m_lookahead));
450 
451  m_lookahead = get_next_token();
452 }
453 
454 bool sac_parser::parse_single_datablock(const std::string &datablock)
455 {
456  // first locate the start, as fast as we can
457  enum
458  {
459  start,
460  comment,
461  string,
462  string_quote,
463  qstring,
464  data
465  } state = start;
466 
467  int quote = 0;
468  bool bol = true;
469  std::string dblk = "data_" + datablock;
470  std::string::size_type si = 0;
471  bool found = false;
472 
473  for (auto ch = m_source.sbumpc(); not found and ch != std::streambuf::traits_type::eof(); ch = m_source.sbumpc())
474  {
475  switch (state)
476  {
477  case start:
478  switch (ch)
479  {
480  case '#': state = comment; break;
481  case 'd':
482  case 'D':
483  state = data;
484  si = 1;
485  break;
486  case '\'':
487  case '"':
488  state = string;
489  quote = ch;
490  break;
491  case ';':
492  if (bol)
493  state = qstring;
494  break;
495  }
496  break;
497 
498  case comment:
499  if (ch == '\n')
500  state = start;
501  break;
502 
503  case string:
504  if (ch == quote)
505  state = string_quote;
506  break;
507 
508  case string_quote:
509  if (std::isspace(ch))
510  state = start;
511  else
512  state = string;
513  break;
514 
515  case qstring:
516  if (ch == ';' and bol)
517  state = start;
518  break;
519 
520  case data:
521  if (isspace(ch) and dblk[si] == 0)
522  found = true;
523  else if (dblk[si++] != ch)
524  state = start;
525  break;
526  }
527 
528  bol = (ch == '\n');
529  }
530 
531  if (found)
532  {
533  produce_datablock(datablock);
534  m_lookahead = get_next_token();
535  parse_datablock();
536  }
537 
538  return found;
539 }
540 
541 sac_parser::datablock_index sac_parser::index_datablocks()
542 {
543  datablock_index index;
544 
545  // first locate the start, as fast as we can
546  enum
547  {
548  start,
549  comment,
550  string,
551  string_quote,
552  qstring,
553  data,
554  data_name
555  } state = start;
556 
557  int quote = 0;
558  bool bol = true;
559  const char dblk[] = "data_";
560  std::string::size_type si = 0;
561  std::string datablock;
562 
563  for (auto ch = m_source.sbumpc(); ch != std::streambuf::traits_type::eof(); ch = m_source.sbumpc())
564  {
565  switch (state)
566  {
567  case start:
568  switch (ch)
569  {
570  case '#': state = comment; break;
571  case 'd':
572  case 'D':
573  state = data;
574  si = 1;
575  break;
576  case '\'':
577  case '"':
578  state = string;
579  quote = ch;
580  break;
581  case ';':
582  if (bol)
583  state = qstring;
584  break;
585  }
586  break;
587 
588  case comment:
589  if (ch == '\n')
590  state = start;
591  break;
592 
593  case string:
594  if (ch == quote)
595  state = string_quote;
596  break;
597 
598  case string_quote:
599  if (std::isspace(ch))
600  state = start;
601  else
602  state = string;
603  break;
604 
605  case qstring:
606  if (ch == ';' and bol)
607  state = start;
608  break;
609 
610  case data:
611  if (dblk[si] == 0 and is_non_blank(ch))
612  {
613  datablock = {static_cast<char>(ch)};
614  state = data_name;
615  }
616  else if (dblk[si++] != ch)
617  state = start;
618  break;
619 
620  case data_name:
621  if (is_non_blank(ch))
622  datablock.insert(datablock.end(), char(ch));
623  else if (isspace(ch))
624  {
625  if (not datablock.empty())
626  index[datablock] = m_source.pubseekoff(0, std::ios_base::cur, std::ios_base::in);
627 
628  state = start;
629  }
630  else
631  state = start;
632  break;
633  }
634 
635  bol = (ch == '\n');
636  }
637 
638  return index;
639 }
640 
641 bool sac_parser::parse_single_datablock(const std::string &datablock, const datablock_index &index)
642 {
643  bool result = false;
644 
645  auto i = index.find(datablock);
646  if (i != index.end())
647  {
648  m_source.pubseekpos(i->second, std::ios_base::in);
649 
650  produce_datablock(datablock);
651  m_lookahead = get_next_token();
652  parse_datablock();
653 
654  result = true;
655  }
656 
657  return result;
658 }
659 
660 void sac_parser::parse_file()
661 {
662  while (m_lookahead != CIFToken::Eof)
663  {
664  switch (m_lookahead)
665  {
666  case CIFToken::GLOBAL:
667  parse_global();
668  break;
669 
670  case CIFToken::DATA:
671  produce_datablock(m_token_value);
672 
673  match(CIFToken::DATA);
674  parse_datablock();
675  break;
676 
677  default:
678  error("This file does not seem to be an mmCIF file");
679  break;
680  }
681  }
682 }
683 
684 void sac_parser::parse_global()
685 {
686  match(CIFToken::GLOBAL);
687  while (m_lookahead == CIFToken::Tag)
688  {
689  match(CIFToken::Tag);
690  match(CIFToken::Value);
691  }
692 }
693 
694 void sac_parser::parse_datablock()
695 {
696  static const std::string kUnitializedCategory("<invalid>");
697  std::string cat = kUnitializedCategory; // intial value acts as a guard for empty category names
698 
699  while (m_lookahead == CIFToken::LOOP or m_lookahead == CIFToken::Tag or m_lookahead == CIFToken::SAVE)
700  {
701  switch (m_lookahead)
702  {
703  case CIFToken::LOOP:
704  {
705  cat = kUnitializedCategory; // should start a new category
706 
707  match(CIFToken::LOOP);
708 
709  std::vector<std::string> tags;
710 
711  while (m_lookahead == CIFToken::Tag)
712  {
713  std::string catName, itemName;
714  std::tie(catName, itemName) = split_tag_name(m_token_value);
715 
716  if (cat == kUnitializedCategory)
717  {
718  produce_category(catName);
719  cat = catName;
720  }
721  else if (not iequals(cat, catName))
722  error("inconsistent categories in loop_");
723 
724  tags.push_back(itemName);
725 
726  match(CIFToken::Tag);
727  }
728 
729  while (m_lookahead == CIFToken::Value)
730  {
731  produce_row();
732 
733  for (auto tag : tags)
734  {
735  produce_item(cat, tag, m_token_value);
736  match(CIFToken::Value);
737  }
738  }
739 
740  cat.clear();
741  break;
742  }
743 
744  case CIFToken::Tag:
745  {
746  std::string catName, itemName;
747  std::tie(catName, itemName) = split_tag_name(m_token_value);
748 
749  if (not iequals(cat, catName))
750  {
751  produce_category(catName);
752  cat = catName;
753  produce_row();
754  }
755 
756  match(CIFToken::Tag);
757 
758  produce_item(cat, itemName, m_token_value);
759 
760  match(CIFToken::Value);
761  break;
762  }
763 
764  case CIFToken::SAVE:
765  parse_save_frame();
766  break;
767 
768  default:
769  assert(false);
770  break;
771  }
772  }
773 }
774 
775 void sac_parser::parse_save_frame()
776 {
777  error("A regular CIF file should not contain a save frame");
778 }
779 
780 // --------------------------------------------------------------------
781 
782 void parser::produce_datablock(const std::string &name)
783 {
784  if (VERBOSE >= 4)
785  std::cerr << "producing data_" << name << std::endl;
786 
787  const auto &[iter, ignore] = m_file.emplace(name);
788  m_datablock = &(*iter);
789 }
790 
791 void parser::produce_category(const std::string &name)
792 {
793  if (VERBOSE >= 4)
794  std::cerr << "producing category " << name << std::endl;
795 
796  const auto &[cat, ignore] = m_datablock->emplace(name);
797  m_category = &*cat;
798 }
799 
800 void parser::produce_row()
801 {
802  if (VERBOSE >= 4 and m_category != nullptr)
803  std::cerr << "producing row for category " << m_category->name() << std::endl;
804 
805  if (m_category == nullptr)
806  error("inconsistent categories in loop_");
807 
808  m_category->emplace({});
809  m_row = m_category->back();
810  // m_row.lineNr(m_line_nr);
811 }
812 
813 void parser::produce_item(const std::string &category, const std::string &item, const std::string &value)
814 {
815  if (VERBOSE >= 4)
816  std::cerr << "producing _" << category << '.' << item << " -> " << value << std::endl;
817 
818  if (m_category == nullptr or not iequals(category, m_category->name()))
819  error("inconsistent categories in loop_");
820 
821  m_row[item] = m_token_value;
822 }
823 
824 } // namespace cif
glob_prnt iter
bool iequals(std::string_view a, std::string_view b)
Definition: text.cpp:59
#define i
std::string to_lower_copy(std::string_view s)
Definition: text.cpp:120
FloatingPoint< float > Float
viol index
int in
std::tuple< std::string, std::string > split_tag_name(std::string_view tag)
Definition: text.cpp:218
int VERBOSE
Definition: utilities.cpp:58
basic_istream< char, std::char_traits< char > > istream
Definition: utilities.cpp:815
void error(char *s)
Definition: tools.cpp:107
std::string String
Definition: xmipp_strings.h:34
std::string to_string(bond_type bondType)
Definition: compound.cpp:43