All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
ParsingToolkit.cxx
Go to the documentation of this file.
1 /**
2  * @file icaruscode/PMT/Algorithms/ParsingToolkit.cxx
3  * @brief Simple text parsing utilities.
4  * @author Gianluca Petrillo (petrillo@slac.stanford.edu)
5  * @date May 13, 2022
6  * @see icaruscode/PMT/Algorithms/ParsingToolkit.h
7  */
8 
9 // library header
11 
12 // C/C++ standard libraries
13 #include <algorithm> // std::sort(), std::unique()
14 #include <string>
15 #include <string_view>
16 
17 
18 // -----------------------------------------------------------------------------
19 // --- icarus::ParsingToolkit
20 // -----------------------------------------------------------------------------
23 
24 
25 // -----------------------------------------------------------------------------
26 std::pair<std::string, unsigned int> icarus::ParsingToolkit::readMultiline
27  (std::istream& in) const
28 {
29 
30  std::string fullLine;
31  std::string openQuoteLine;
32  unsigned int nLines = 0U;
33  while (in) {
34 
35  std::string line;
36  std::getline(in, line, in.widen(fParams.EOL));
37  bool const isEOF = in.eof();
38  if (!isEOF || !line.empty()) ++nLines;
39  openQuoteLine.append(line);
40 
41  if (isQuotationUnclosed(make_view(openQuoteLine))) {
42  if (isCharacterEscaped(line.begin(), line.end())) {
43  fullLine.append(openQuoteLine);
44  throw Error{ "Parser error: escaped end-of-line inside a quotation:\n"
45  + fullLine + "\n" };
46  }
47  // if the newline is quoted, it's preserved
48  if (!isEOF) openQuoteLine += fParams.EOL;
49  continue;
50  }
51  fullLine.append(openQuoteLine);
52  openQuoteLine.clear();
53 
54  if (!isCharacterEscaped(fullLine.begin(), fullLine.end())) break;
55 
56  fullLine.pop_back(); // remove the escape character
57 
58  } // while
59  fullLine.append(openQuoteLine); // usually empty
60 
61  return { std::move(fullLine), nLines };
62 } // icarus::ParsingToolkit::readMultiline()
63 
64 
65 // -----------------------------------------------------------------------------
66 auto icarus::ParsingToolkit::findQuotationStart(std::string_view sv) const
67  -> std::pair<std::string_view, QuotSpec_t const*>
68 {
69 
70  while (!sv.empty()) {
71 
72  // look for a character that could start a quotation opening
73  std::size_t const startPos = sv.find_first_of(fQuoteStarts);
74 
75  // no such character found:
76  if (startPos == std::string_view::npos) break;
77 
78  // if the character is escaped, this is not a quotation opening:
79  if (isCharacterEscaped(sv.begin(), sv.begin() + startPos)) {
80  sv.remove_prefix(std::min(startPos + 1, sv.length()));
81  continue;
82  }
83 
84  sv.remove_prefix(std::min(startPos, sv.length()));
85 
86  // try all the opening quotes
87  // (may be optimized by grouping them by first character)
88  for (auto const& qSpec: fParams.quotes) {
89 
90 // if (sv.starts_with(qSpec.first)) return { sv, &qSpec }; // C++20
91  if (sv.compare(0, qSpec.first.length(), qSpec.first) == 0)
92  return { sv, &qSpec };
93 
94  } // for quotes
95 
96  // nope, just a character; remove it and keep looking
97  sv.remove_prefix(1);
98 
99  } // while sv
100 
101  return { make_view(sv.end(), sv.end()), nullptr };
102 } // icarus::ParsingToolkit::findQuotationStart()
103 
104 
105 // -----------------------------------------------------------------------------
107  (std::string_view sv, std::string const& quotEnd) const
108 {
109  while (!sv.empty()) {
110 
111  std::size_t const pos = sv.find(quotEnd);
112  if (pos == std::string_view::npos) break;
113 
114  if (!isCharacterEscaped(sv.begin(), sv.begin() + pos)) {
115  sv.remove_prefix(pos);
116  return sv;
117  }
118 
119  sv.remove_prefix(pos + 1);
120 
121  } // while
122 
123  return make_view(sv.end(), sv.end());
124 } // icarus::ParsingToolkit::findQuotationEnd()
125 
126 
127 // -----------------------------------------------------------------------------
128 bool icarus::ParsingToolkit::isQuotationUnclosed(std::string_view sv) const {
129 
130  while (!sv.empty()) {
131 
132  auto [ qsv, qptr ] = findQuotationStart(sv);
133  if (!qptr) return false;
134 
135  qsv.remove_prefix(qptr->first.length()); // remove the opening quote
136 
137  qsv = findQuotationEnd(qsv, qptr->second);
138  if (qsv.empty()) return true;
139 
140  qsv.remove_prefix(qptr->second.length());
141  sv = qsv;
142  } // while
143 
144  return false;
145 } // icarus::ParsingToolkit::isQuotationUnclosed()
146 
147 
148 // -----------------------------------------------------------------------------
149 auto icarus::ParsingToolkit::splitOn(std::string_view sv, std::string_view sep)
150  -> SplitView_t
151 {
152  return
153  { make_view(sv.begin(), sep.begin()), sep, make_view(sep.end(), sv.end()) };
154 } // icarus::ParsingToolkit::splitOn()
155 
156 
157 // -----------------------------------------------------------------------------
158 std::string icarus::ParsingToolkit::removeWordEscapes(std::string&& s) const {
159 
160  // replace in place
161  std::string::const_iterator iSrc = s.begin(), send = s.end();
162  std::string::iterator iDest = s.begin();
163 
164  // if the last character is an escape, it's kept
165  while (iSrc != send) {
166  char const ch = *iSrc++;
167  *iDest++ = (isEscape(ch) && (iSrc != send))? *iSrc++: ch;
168  } // while
169 
170  s.erase(iDest, send);
171  return std::move(s);
172 } // icarus::ParsingToolkit::removeWordEscapes()
173 
174 
175 // -----------------------------------------------------------------------------
176 std::string icarus::ParsingToolkit::removeWordQuotations(std::string&& s) const
177 {
178  std::string_view sv = make_view(s);
179  std::string::iterator iDest = s.begin();
180 
181  while (!sv.empty()) {
182 
183  // find the next quotation
184  auto const [ fromQ, qptr ] = findQuotationStart(sv);
185 
186  // copy the material until the next quotation
187  iDest = std::copy(sv.begin(), fromQ.begin(), iDest);
188  sv = fromQ;
189 
190  if (!qptr) break; // if there is no quotation, we are done
191 
192  sv.remove_prefix(qptr->first.length()); // skip the quotation start
193 
194  // find the end of quotation
195  std::string_view const afterQ = findQuotationEnd(sv, qptr->second);
196 
197  if (afterQ.empty()) { // begin of quotation, but no end: no good
198  // leave the "begin of quotation" as is
199  iDest = std::copy(fromQ.begin(), fromQ.end(), iDest);
200  sv.remove_prefix(sv.length()); // note: quote start was already removed
201  break;
202  }
203 
204  // copy the quoted material
205  iDest = std::copy(sv.begin(), afterQ.begin(), iDest);
206  sv = afterQ;
207 
208  sv.remove_prefix(qptr->second.length()); // skip the quotation end
209 
210  } // while
211 
212  assert(sv.empty());
213 
214  s.erase(iDest, s.end());
215  return std::move(s);
216 } // icarus::ParsingToolkit::removeWordQuotations()
217 
218 
219 // -----------------------------------------------------------------------------
221 
222  fParams = std::move(params);
223 
224  // sort the quotations by length
225  auto const byOpeningLength = [](QuotSpec_t const& a, QuotSpec_t const& b)
226  {
227  std::size_t const al = a.first.length(), bl = b.first.length();
228  return (al != bl)? (al > bl): (a < b);
229  };
230  std::sort(fParams.quotes.begin(), fParams.quotes.end(), byOpeningLength);
231 
232  // collect the first character of each of the opening quotes
233  // (sorted and with no duplicates)
234  for (QuotSpec_t const& quotSpec: fParams.quotes)
235  fQuoteStarts += quotSpec.first.front();
236  std::sort(fQuoteStarts.begin(), fQuoteStarts.end());
237  fQuoteStarts.erase
238  (std::unique(fQuoteStarts.begin(), fQuoteStarts.end()), fQuoteStarts.end());
239 
240 } // icarus::ParsingToolkit::adoptParams()
241 
242 
243 // -----------------------------------------------------------------------------
std::pair< std::string, unsigned int > readMultiline(std::istream &in) const
Returns a single line of text from the input stream.
std::pair< std::string_view, QuotSpec_t const * > findQuotationStart(std::string_view sv) const
Finds the start of the next quotation in sv.
std::string removeWordEscapes(std::string &&w) const
Returns a copy of w with all escape characters removed.
bool isQuotationUnclosed(std::string_view sv) const
Returns whether the sequence sv has an unclosed quotation at its end.
static SplitView_t splitOn(std::string_view sv, std::string_view sep)
Splits the view sv in three: before sep, sep and after sep.
All parsing parameters.
process_name gaushit a
std::pair< std::string, std::string > QuotSpec_t
Specification of quotation: opening and closing.
void adoptParams(Params_t params)
Initializes the parameters and caches.
Record of a split token: pre-separator, separator and post-separator.
static Params_t const DefaultParameters
if &&[-z"$BASH_VERSION"] then echo Attempting to switch to bash bash shellSwitch exit fi &&["$1"= 'shellSwitch'] shift declare a IncludeDirectives for Dir in
std::string_view findQuotationEnd(std::string_view sv, std::string const &quotEnd) const
Finds the quotation end in sv.
Simple text parsing utilities.
then echo File list $list not found else cat $list while read file do echo $file sed s
Definition: file_to_url.sh:60
T copy(T const &v)
std::string removeWordQuotations(std::string &&w) const
Returns a copy of w with no quotation starts and ends.