Xmipp  v3.23.11-Nereus
text.cpp
Go to the documentation of this file.
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2020 NKI/AVL, Netherlands Cancer Institute
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  *
9  * 1. Redistributions of source code must retain the above copyright notice, this
10  * list of conditions and the following disclaimer
11  * 2. Redistributions in binary form must reproduce the above copyright notice,
12  * this list of conditions and the following disclaimer in the documentation
13  * and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  */
26 
27 #include "cif++/text.hpp"
28 
29 #include <algorithm>
30 #include <cassert>
31 
32 namespace cif
33 {
34 
35 // --------------------------------------------------------------------
36 // This really makes a difference, having our own tolower routines
37 
// Byte-wise lower-casing table: entry i holds the lower-case form of byte i.
// Only the ASCII capitals 'A'..'Z' (0x41..0x5a) are remapped, to 'a'..'z'
// (0x61..0x7a); every other byte value maps to itself.
const uint8_t kCharToLowerMap[256] = {
	/* 0x00 */ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
	/* 0x10 */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
	/* 0x20 */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
	/* 0x30 */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
	/* 0x40 */ 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, // '@', 'A'..'O' -> 'a'..'o'
	/* 0x50 */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, // 'P'..'Z' -> 'p'..'z', then '[' .. '_'
	/* 0x60 */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
	/* 0x70 */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
	/* 0x80 */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
	/* 0x90 */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
	/* 0xa0 */ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
	/* 0xb0 */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
	/* 0xc0 */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
	/* 0xd0 */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
	/* 0xe0 */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
	/* 0xf0 */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff};
56 
57 // --------------------------------------------------------------------
58 
59 bool iequals(std::string_view a, std::string_view b)
60 {
61  bool result = a.length() == b.length();
62  for (auto ai = a.begin(), bi = b.begin(); result and ai != a.end(); ++ai, ++bi)
63  result = kCharToLowerMap[uint8_t(*ai)] == kCharToLowerMap[uint8_t(*bi)];
64  // result = tolower(*ai) == tolower(*bi);
65  return result;
66 }
67 
/// Case-insensitive equality test for two NUL-terminated C strings.
///
/// @param a first string, must point to a NUL-terminated array
/// @param b second string, must point to a NUL-terminated array
/// @return true when both strings have the same length and all corresponding
///         characters are equal after lower-casing
bool iequals(const char *a, const char *b)
{
	while (*a != 0 and *b != 0)
	{
		// Convert through unsigned char first: handing a negative char value
		// to the C library tolower is undefined behaviour.
		if (tolower(static_cast<unsigned char>(*a)) != tolower(static_cast<unsigned char>(*b)))
			return false;

		++a;
		++b;
	}

	// Equal only when both strings ended at the same position.
	return *a == *b;
}
76 
/// Case-insensitive three-way comparison of two string views.
///
/// Characters are compared pairwise after being passed through tolower.
///
/// @param a first string
/// @param b second string
/// @return the difference of the first pair of lower-cased characters that
///         differ; otherwise 1 when @a a is longer, -1 when @a b is longer,
///         and 0 when both are equal
int icompare(std::string_view a, std::string_view b)
{
	auto pa = a.begin();
	auto pb = b.begin();

	while (pa != a.end() and pb != b.end())
	{
		const int diff = tolower(*pa) - tolower(*pb);
		if (diff != 0)
			return diff;

		++pa;
		++pb;
	}

	// Common prefix is identical: the longer string sorts last.
	if (pa != a.end())
		return 1;

	if (pb != b.end())
		return -1;

	return 0;
}
95 
/// Case-insensitive three-way comparison of two NUL-terminated C strings.
///
/// Characters are compared pairwise after being passed through tolower.
///
/// @param a first string, must point to a NUL-terminated array
/// @param b second string, must point to a NUL-terminated array
/// @return the difference of the first pair of lower-cased characters that
///         differ; otherwise 1 when @a a is longer, -1 when @a b is longer,
///         and 0 when both are equal
int icompare(const char *a, const char *b)
{
	while (*a != 0 and *b != 0)
	{
		const int diff = tolower(*a) - tolower(*b);
		if (diff != 0)
			return diff;

		++a;
		++b;
	}

	// Common prefix is identical: whichever string still has characters
	// left is the greater one.
	if (*a != 0)
		return 1;

	return *b != 0 ? -1 : 0;
}
113 
/// Convert all characters of @a s to lower case, in place.
///
/// @param s the string to modify
void to_lower(std::string &s)
{
	// Go through unsigned char: a negative char value passed to the C
	// library tolower is undefined behaviour.
	for (auto &c : s)
		c = static_cast<char>(tolower(static_cast<unsigned char>(c)));
}
119 
/// Return a lower-cased copy of @a s.
///
/// @param s the string to convert
/// @return a new string with every character of @a s passed through tolower
std::string to_lower_copy(std::string_view s)
{
	std::string result(s);

	// Go through unsigned char: a negative char value passed to the C
	// library tolower is undefined behaviour.
	for (auto &c : result)
		c = static_cast<char>(tolower(static_cast<unsigned char>(c)));

	return result;
}
127 
/// Convert all characters of @a s to upper case, in place.
///
/// @param s the string to modify
void to_upper(std::string &s)
{
	// toupper requires a value representable as unsigned char (or EOF);
	// passing a negative char directly is undefined behaviour.
	for (auto &c : s)
		c = static_cast<char>(toupper(static_cast<unsigned char>(c)));
}
133 
/// Replace every non-overlapping occurrence of @a what in @a s with @a with.
///
/// The search continues after each inserted replacement, so occurrences of
/// @a what inside @a with are not replaced again.
///
/// @param s    the string to modify
/// @param what the text to search for; when empty, @a s is left untouched
/// @param with the replacement text
void replace_all(std::string &s, std::string_view what, std::string_view with)
{
	// An empty pattern matches at every position and would never terminate.
	if (what.empty())
		return;

	for (auto p = s.find(what); p != std::string::npos; p = s.find(what, p + with.length()))
		s.replace(p, what.length(), with);
}
142 
143 bool icontains(std::string_view s, std::string_view q)
144 {
145  return contains(to_lower_copy(s), to_lower_copy(q));
146 }
147 
/// Remove trailing white space from @a s, in place.
///
/// White space is whatever std::isspace reports for the current C locale.
///
/// @param s the string to modify
void trim_right(std::string &s)
{
	auto e = s.end();

	// std::isspace needs a value representable as unsigned char; a plain
	// (possibly negative) char is undefined behaviour.
	while (e != s.begin() and std::isspace(static_cast<unsigned char>(*std::prev(e))))
		--e;

	s.erase(e, s.end());
}
162 
/// Return a copy of @a s with trailing white space removed.
///
/// White space is whatever std::isspace reports for the current C locale.
///
/// @param s the string to trim
/// @return the trimmed copy
std::string trim_right_copy(std::string_view s)
{
	auto n = s.size();

	// std::isspace needs a value representable as unsigned char; a plain
	// (possibly negative) char is undefined behaviour.
	while (n > 0 and std::isspace(static_cast<unsigned char>(s[n - 1])))
		--n;

	return std::string(s.substr(0, n));
}
176 
/// Return a copy of @a s with leading white space removed.
///
/// White space is whatever std::isspace reports for the current C locale.
///
/// @param s the string to trim
/// @return the trimmed copy
std::string trim_left_copy(std::string_view s)
{
	std::string_view::size_type n = 0;

	// std::isspace needs a value representable as unsigned char; a plain
	// (possibly negative) char is undefined behaviour.
	while (n < s.size() and std::isspace(static_cast<unsigned char>(s[n])))
		++n;

	return std::string(s.substr(n));
}
190 
/// Remove leading white space from @a s, in place.
///
/// White space is whatever std::isspace reports for the current C locale.
///
/// @param s the string to modify
void trim_left(std::string &s)
{
	auto b = s.begin();

	// std::isspace needs a value representable as unsigned char; a plain
	// (possibly negative) char is undefined behaviour.
	while (b != s.end() and std::isspace(static_cast<unsigned char>(*b)))
		++b;

	s.erase(s.begin(), b);
}
204 
205 void trim(std::string &s)
206 {
207  trim_right(s);
208  trim_left(s);
209 }
210 
211 std::string trim_copy(std::string_view s)
212 {
213  return trim_left_copy(trim_right_copy(s));
214 }
215 
216 // --------------------------------------------------------------------
217 
/// Split a tag of the form `_category.item` into its two parts.
///
/// The leading underscore is dropped. When the tag contains no dot the
/// first element of the result is empty and the second element holds
/// everything after the underscore.
///
/// @param tag the tag to split, must start with an underscore
/// @return a tuple of (text between the underscore and the first dot,
///         text after the first dot)
/// @throws std::runtime_error when @a tag is empty or does not start with '_'
std::tuple<std::string, std::string> split_tag_name(std::string_view tag)
{
	if (tag.empty())
		throw std::runtime_error("empty tag");

	if (tag.front() != '_')
		throw std::runtime_error("tag '" + std::string { tag } + "' does not start with underscore");

	// Everything after the leading underscore; the first character is known
	// to be '_', so searching here finds the same dot as searching the tag.
	const std::string_view body = tag.substr(1);
	const auto dot = body.find('.');

	if (dot == std::string_view::npos)
		return std::tuple<std::string, std::string>{ std::string{}, std::string(body) };

	return std::tuple<std::string, std::string>{ std::string(body.substr(0, dot)), std::string(body.substr(dot + 1)) };
}
232 
233 // --------------------------------------------------------------------
234 
/// Generate a letter-based identifier for a zero-based sequence number.
///
/// Numbers are mapped like spreadsheet column names: 0 -> "A", 25 -> "Z",
/// 26 -> "AA", 701 -> "ZZ", 702 -> "AAA", and so on. Numbers of
/// 26 * 26 * 26 and above are rendered as 'L' followed by the decimal number.
///
/// The previous implementation dropped a letter whenever an intermediate
/// remainder was below 26 (e.g. 676 produced "AA", the same as 26), which
/// resulted in duplicate identifiers. The mapping used here is one-to-one
/// and yields the same result as before for every input that did not
/// collide.
///
/// @param number the sequence number, expected to be non-negative
/// @return the identifier for @a number
std::string cif_id_for_number(int number)
{
	std::string result;

	if (number >= 26 * 26 * 26)
		result = 'L' + std::to_string(number);
	else
	{
		// Bijective base-26: emit the least significant letter first, then
		// reverse. The "- 1" accounts for there being no zero digit.
		do
		{
			const int r = number % 26;
			result += char('A' + r);
			number = (number - r) / 26 - 1;
		} while (number >= 0);

		std::reverse(result.begin(), result.end());
	}

	assert(not result.empty());
	return result;
}
263 
264 // --------------------------------------------------------------------
265 // Simplified line breaking code taken from a decent text editor.
266 // In this case, simplified means it only supports ASCII.
267 
269 {
297 
308 };
309 
311  {
318 
319  // comma treated differently here, it is not a numeric separator in PDB
320  kLBC_SymbolAllowingBreakAfter /* kLBC_InfixNumericSeparator */,
321 
333 
334 std::string::const_iterator nextLineBreak(std::string::const_iterator text, std::string::const_iterator end)
335 {
336  if (text == end)
337  return text;
338 
339  enum breakAction
340  {
341  DBK = 0, // direct break (blank in table)
342  IBK, // indirect break (% in table)
343  PBK, // prohibited break (^ in table)
344  CIB, // combining indirect break
345  CPB // combining prohibited break
346  };
347 
348  const breakAction brkTable[27][27] = {
349  // OP CL CP QU GL NS EX SY IS PR PO NU AL ID IN HY BA BB B2 ZW CM WJ H2 H3 JL JV JT
350  /* OP */ {PBK, PBK, PBK, PBK, PBK, PBK, PBK, PBK, PBK, PBK, PBK, PBK, PBK, PBK, PBK, PBK, PBK, PBK, PBK, PBK, CPB, PBK, PBK, PBK, PBK, PBK, PBK},
351  /* CL */ {DBK, PBK, PBK, IBK, IBK, PBK, PBK, PBK, PBK, IBK, IBK, DBK, DBK, DBK, DBK, IBK, IBK, DBK, DBK, PBK, CIB, PBK, DBK, DBK, DBK, DBK, DBK},
352  /* CP */ {DBK, PBK, PBK, IBK, IBK, PBK, PBK, PBK, PBK, IBK, IBK, IBK, IBK, DBK, DBK, IBK, IBK, DBK, DBK, PBK, CIB, PBK, DBK, DBK, DBK, DBK, DBK},
353  /* QU */ {PBK, PBK, PBK, IBK, IBK, IBK, PBK, PBK, PBK, IBK, IBK, IBK, IBK, IBK, IBK, IBK, IBK, IBK, IBK, PBK, CIB, PBK, IBK, IBK, IBK, IBK, IBK},
354  /* GL */ {IBK, PBK, PBK, IBK, IBK, IBK, PBK, PBK, PBK, IBK, IBK, IBK, IBK, IBK, IBK, IBK, IBK, IBK, IBK, PBK, CIB, PBK, IBK, IBK, IBK, IBK, IBK},
355  /* NS */ {DBK, PBK, PBK, IBK, IBK, IBK, PBK, PBK, PBK, DBK, DBK, DBK, DBK, DBK, DBK, IBK, IBK, DBK, DBK, PBK, CIB, PBK, DBK, DBK, DBK, DBK, DBK},
356  /* EX */ {DBK, PBK, PBK, IBK, IBK, IBK, PBK, PBK, PBK, DBK, DBK, DBK, DBK, DBK, DBK, IBK, IBK, DBK, DBK, PBK, CIB, PBK, DBK, DBK, DBK, DBK, DBK},
357  /* SY */ {DBK, PBK, PBK, IBK, IBK, IBK, PBK, PBK, PBK, DBK, DBK, IBK, DBK, DBK, DBK, IBK, IBK, DBK, DBK, PBK, CIB, PBK, DBK, DBK, DBK, DBK, DBK},
358  /* IS */ {DBK, PBK, PBK, IBK, IBK, IBK, PBK, PBK, PBK, DBK, DBK, IBK, IBK, DBK, DBK, IBK, IBK, DBK, DBK, PBK, CIB, PBK, DBK, DBK, DBK, DBK, DBK},
359  /* PR */ {IBK, PBK, PBK, IBK, IBK, IBK, PBK, PBK, PBK, DBK, DBK, IBK, IBK, IBK, DBK, IBK, IBK, DBK, DBK, PBK, CIB, PBK, IBK, IBK, IBK, IBK, IBK},
360  /* PO */ {IBK, PBK, PBK, IBK, IBK, IBK, PBK, PBK, PBK, DBK, DBK, IBK, IBK, DBK, DBK, IBK, IBK, DBK, DBK, PBK, CIB, PBK, DBK, DBK, DBK, DBK, DBK},
361  /* NU */ {DBK, PBK, PBK, IBK, IBK, IBK, PBK, PBK, PBK, IBK, IBK, IBK, IBK, DBK, IBK, IBK, IBK, DBK, DBK, PBK, CIB, PBK, DBK, DBK, DBK, DBK, DBK},
362  /* AL */ {DBK, PBK, PBK, IBK, IBK, IBK, PBK, PBK, PBK, DBK, DBK, IBK, IBK, DBK, IBK, IBK, IBK, DBK, DBK, PBK, CIB, PBK, DBK, DBK, DBK, DBK, DBK},
363  /* ID */ {DBK, PBK, PBK, IBK, IBK, IBK, PBK, PBK, PBK, DBK, IBK, DBK, DBK, DBK, IBK, IBK, IBK, DBK, DBK, PBK, CIB, PBK, DBK, DBK, DBK, DBK, DBK},
364  /* IN */ {DBK, PBK, PBK, IBK, IBK, IBK, PBK, PBK, PBK, DBK, DBK, DBK, DBK, DBK, IBK, IBK, IBK, DBK, DBK, PBK, CIB, PBK, DBK, DBK, DBK, DBK, DBK},
365  /* HY */ {DBK, PBK, PBK, IBK, DBK, IBK, PBK, PBK, PBK, DBK, DBK, IBK, DBK, DBK, DBK, IBK, IBK, DBK, DBK, PBK, CIB, PBK, DBK, DBK, DBK, DBK, DBK},
366  /* BA */ {DBK, PBK, PBK, IBK, DBK, IBK, PBK, PBK, PBK, DBK, DBK, DBK, DBK, DBK, DBK, IBK, IBK, DBK, DBK, PBK, CIB, PBK, DBK, DBK, DBK, DBK, DBK},
367  /* BB */ {IBK, PBK, PBK, IBK, IBK, IBK, PBK, PBK, PBK, IBK, IBK, IBK, IBK, IBK, IBK, IBK, IBK, IBK, IBK, PBK, CIB, PBK, IBK, IBK, IBK, IBK, IBK},
368  /* B2 */ {DBK, PBK, PBK, IBK, IBK, IBK, PBK, PBK, PBK, DBK, DBK, DBK, DBK, DBK, DBK, IBK, IBK, DBK, PBK, PBK, CIB, PBK, DBK, DBK, DBK, DBK, DBK},
369  /* ZW */ {DBK, DBK, DBK, DBK, DBK, DBK, DBK, DBK, DBK, DBK, DBK, DBK, DBK, DBK, DBK, DBK, DBK, DBK, DBK, PBK, DBK, DBK, DBK, DBK, DBK, DBK, DBK},
370  /* CM */ {DBK, PBK, PBK, IBK, IBK, IBK, PBK, PBK, PBK, DBK, DBK, IBK, IBK, DBK, IBK, IBK, IBK, DBK, DBK, PBK, CIB, PBK, DBK, DBK, DBK, DBK, DBK},
371  /* WJ */ {IBK, PBK, PBK, IBK, IBK, IBK, PBK, PBK, PBK, IBK, IBK, IBK, IBK, IBK, IBK, IBK, IBK, IBK, IBK, PBK, CIB, PBK, IBK, IBK, IBK, IBK, IBK},
372  /* H2 */ {DBK, PBK, PBK, IBK, IBK, IBK, PBK, PBK, PBK, DBK, IBK, DBK, DBK, DBK, IBK, IBK, IBK, DBK, DBK, PBK, CIB, PBK, DBK, DBK, DBK, IBK, IBK},
373  /* H3 */ {DBK, PBK, PBK, IBK, IBK, IBK, PBK, PBK, PBK, DBK, IBK, DBK, DBK, DBK, IBK, IBK, IBK, DBK, DBK, PBK, CIB, PBK, DBK, DBK, DBK, DBK, IBK},
374  /* JL */ {DBK, PBK, PBK, IBK, IBK, IBK, PBK, PBK, PBK, DBK, IBK, DBK, DBK, DBK, IBK, IBK, IBK, DBK, DBK, PBK, CIB, PBK, IBK, IBK, IBK, IBK, DBK},
375  /* JV */ {DBK, PBK, PBK, IBK, IBK, IBK, PBK, PBK, PBK, DBK, IBK, DBK, DBK, DBK, IBK, IBK, IBK, DBK, DBK, PBK, CIB, PBK, DBK, DBK, DBK, IBK, IBK},
376  /* JT */ {DBK, PBK, PBK, IBK, IBK, IBK, PBK, PBK, PBK, DBK, IBK, DBK, DBK, DBK, IBK, IBK, IBK, DBK, DBK, PBK, CIB, PBK, DBK, DBK, DBK, DBK, IBK},
377  };
378 
379  uint8_t ch = static_cast<uint8_t>(*text);
380 
381  LineBreakClass cls;
382 
383  if (ch == '\n')
384  cls = kLBC_MandatoryBreak;
385  else if (ch < 128)
386  {
387  cls = kASCII_LBTable[ch];
388  if (cls > kLBC_MandatoryBreak and cls != kLBC_Space) // duh...
389  cls = kLBC_Alphabetic;
390  }
391  else
392  cls = kLBC_Unknown;
393 
394  if (cls == kLBC_Space)
395  cls = kLBC_WordJoiner;
396 
397  LineBreakClass ncls = cls;
398 
399  while (++text != end and cls != kLBC_MandatoryBreak)
400  {
401  ch = *text;
402 
403  LineBreakClass lcls = ncls;
404 
405  if (ch == '\n')
406  {
407  ++text;
408  break;
409  }
410 
411  ncls = kASCII_LBTable[ch];
412 
413  if (ncls == kLBC_Space)
414  continue;
415 
416  breakAction brk = brkTable[cls][ncls];
417 
418  if (brk == DBK or (brk == IBK and lcls == kLBC_Space))
419  break;
420 
421  cls = ncls;
422  }
423 
424  return text;
425 }
426 
427 std::vector<std::string> wrapLine(const std::string &text, size_t width)
428 {
429  std::vector<std::string> result;
430  std::vector<size_t> offsets = {0};
431 
432  auto b = text.begin();
433  while (b != text.end())
434  {
435  auto e = nextLineBreak(b, text.end());
436 
437  offsets.push_back(e - text.begin());
438 
439  b = e;
440  }
441 
442  size_t count = offsets.size() - 1;
443 
444  std::vector<size_t> minima(count + 1, 1000000);
445  minima[0] = 0;
446  std::vector<size_t> breaks(count + 1, 0);
447 
448  for (size_t i = 0; i < count; ++i)
449  {
450  size_t j = i + 1;
451  while (j <= count)
452  {
453  size_t w = offsets[j] - offsets[i];
454 
455  if (w > width)
456  break;
457 
458  while (w > 0 and isspace(text[offsets[i] + w - 1]))
459  --w;
460 
461  size_t cost = minima[i];
462  if (j < count) // last line may be shorter
463  cost += (width - w) * (width - w);
464 
465  if (cost < minima[j])
466  {
467  minima[j] = cost;
468  breaks[j] = i;
469  }
470 
471  ++j;
472  }
473  }
474 
475  size_t j = count;
476  while (j > 0)
477  {
478  size_t i = breaks[j];
479  result.push_back(text.substr(offsets[i], offsets[j] - offsets[i]));
480  j = i;
481  }
482 
483  reverse(result.begin(), result.end());
484 
485  return result;
486 }
487 
488 std::vector<std::string> word_wrap(const std::string &text, size_t width)
489 {
490  std::vector<std::string> result;
491  for (auto p : cif::split<std::string>(text, "\n"))
492  {
493  if (p.empty())
494  {
495  result.push_back("");
496  continue;
497  }
498 
499  auto lines = wrapLine(p, width);
500  result.insert(result.end(), lines.begin(), lines.end());
501  }
502 
503  return result;
504 }
505 
506 } // namespace cif
void to_upper(std::string &s)
Definition: text.cpp:128
void to_lower(std::string &s)
Definition: text.cpp:114
std::string trim_copy(std::string_view s)
Definition: text.cpp:211
void replace_all(std::string &s, std::string_view what, std::string_view with)
Definition: text.cpp:134
doublereal * c
void trim(std::string &s)
Definition: text.cpp:205
const LineBreakClass kASCII_LBTable[128]
Definition: text.cpp:310
bool icontains(std::string_view s, std::string_view q)
Definition: text.cpp:143
void trim_left(std::string &s)
Definition: text.cpp:191
doublereal * w
bool iequals(std::string_view a, std::string_view b)
Definition: text.cpp:59
#define i
doublereal * d
LineBreakClass
Definition: text.cpp:268
doublereal * b
std::string to_lower_copy(std::string_view s)
Definition: text.cpp:120
std::string trim_left_copy(std::string_view s)
Definition: text.cpp:177
std::tuple< std::string, std::string > split_tag_name(std::string_view tag)
Definition: text.cpp:218
std::vector< std::string > word_wrap(const std::string &text, size_t width)
Definition: text.cpp:488
std::string trim_right_copy(std::string_view s)
Definition: text.cpp:163
#define j
std::vector< std::string > wrapLine(const std::string &text, size_t width)
Definition: text.cpp:427
const uint8_t kCharToLowerMap[256]
Definition: text.cpp:38
std::string cif_id_for_number(int number)
Definition: text.cpp:235
void trim_right(std::string &s)
Definition: text.cpp:148
std::string::const_iterator nextLineBreak(std::string::const_iterator text, std::string::const_iterator end)
Definition: text.cpp:334
std::string to_string(bond_type bondType)
Definition: compound.cpp:43
doublereal * a
int icompare(std::string_view a, std::string_view b)
Definition: text.cpp:77