libstdc++
regex.tcc
1 // class template regex -*- C++ -*-
2 
3 // Copyright (C) 2013-2014 Free Software Foundation, Inc.
4 //
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the
7 // terms of the GNU General Public License as published by the
8 // Free Software Foundation; either version 3, or (at your option)
9 // any later version.
10 
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
15 
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
19 
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 // <http://www.gnu.org/licenses/>.
24 
25 /**
26  * @file bits/regex.tcc
27  * This is an internal header file, included by other library headers.
28  * Do not attempt to use it directly. @headername{regex}
29  */
30 
31 // See below __regex_algo_impl to get what this is talking about. The default
32 // value 1 indicated a conservative optimization without giving up worst case
33 // performance.
34 #ifndef _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT
35 #define _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT 1
36 #endif
37 
38 namespace std _GLIBCXX_VISIBILITY(default)
39 {
40 namespace __detail
41 {
42 _GLIBCXX_BEGIN_NAMESPACE_VERSION
43 
44  // Result of merging regex_match and regex_search.
45  //
46  // __policy now can be _S_auto (auto dispatch) and _S_alternate (use
47  // the other one if possible, for test purpose).
48  //
49  // That __match_mode is true means regex_match, else regex_search.
50  template<typename _BiIter, typename _Alloc,
51  typename _CharT, typename _TraitsT,
52  _RegexExecutorPolicy __policy,
53  bool __match_mode>
54  bool
55  __regex_algo_impl(_BiIter __s,
56  _BiIter __e,
57  match_results<_BiIter, _Alloc>& __m,
58  const basic_regex<_CharT, _TraitsT>& __re,
59  regex_constants::match_flag_type __flags)
60  {
61  if (__re._M_automaton == nullptr)
62  return false;
63 
64  typename match_results<_BiIter, _Alloc>::_Base_type& __res = __m;
65  __m._M_begin = __s;
66  __res.resize(__re._M_automaton->_M_sub_count() + 2);
67  for (auto& __it : __res)
68  __it.matched = false;
69 
70  // This function decide which executor to use under given circumstances.
71  // The _S_auto policy now is the following: if a NFA has no
72  // back-references and has more than _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT
73  // quantifiers (*, +, ?), the BFS executor will be used, other wise
74  // DFS executor. This is because DFS executor has a exponential upper
75  // bound, but better best-case performace. Meanwhile, BFS executor can
76  // effectively prevent from exponential-long time matching (which must
77  // contains many quantifiers), but it's slower in average.
78  //
79  // For simple regex, BFS executor could be 2 or more times slower than
80  // DFS executor.
81  //
82  // Of course, BFS executor cannot handle back-references.
83  bool __ret;
84  if (!__re._M_automaton->_M_has_backref
85  && (__policy == _RegexExecutorPolicy::_S_alternate
86  || __re._M_automaton->_M_quant_count
87  > _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT))
88  {
89  _Executor<_BiIter, _Alloc, _TraitsT, false>
90  __executor(__s, __e, __m, __re, __flags);
91  if (__match_mode)
92  __ret = __executor._M_match();
93  else
94  __ret = __executor._M_search();
95  }
96  else
97  {
98  _Executor<_BiIter, _Alloc, _TraitsT, true>
99  __executor(__s, __e, __m, __re, __flags);
100  if (__match_mode)
101  __ret = __executor._M_match();
102  else
103  __ret = __executor._M_search();
104  }
105  if (__ret)
106  {
107  for (auto __it : __res)
108  if (!__it.matched)
109  __it.first = __it.second = __e;
110  auto& __pre = __res[__res.size()-2];
111  auto& __suf = __res[__res.size()-1];
112  if (__match_mode)
113  {
114  __pre.matched = false;
115  __pre.first = __s;
116  __pre.second = __s;
117  __suf.matched = false;
118  __suf.first = __e;
119  __suf.second = __e;
120  }
121  else
122  {
123  __pre.first = __s;
124  __pre.second = __res[0].first;
125  __pre.matched = (__pre.first != __pre.second);
126  __suf.first = __res[0].second;
127  __suf.second = __e;
128  __suf.matched = (__suf.first != __suf.second);
129  }
130  }
131  return __ret;
132  }
133 
134 _GLIBCXX_END_NAMESPACE_VERSION
135 }
136 
137 _GLIBCXX_BEGIN_NAMESPACE_VERSION
138 
139  template<typename _Ch_type>
140  template<typename _Fwd_iter>
141  typename regex_traits<_Ch_type>::string_type
142  regex_traits<_Ch_type>::
143  lookup_collatename(_Fwd_iter __first, _Fwd_iter __last) const
144  {
145  typedef std::ctype<char_type> __ctype_type;
146  const __ctype_type& __fctyp(use_facet<__ctype_type>(_M_locale));
147 
148  static const char* __collatenames[] =
149  {
150  "NUL",
151  "SOH",
152  "STX",
153  "ETX",
154  "EOT",
155  "ENQ",
156  "ACK",
157  "alert",
158  "backspace",
159  "tab",
160  "newline",
161  "vertical-tab",
162  "form-feed",
163  "carriage-return",
164  "SO",
165  "SI",
166  "DLE",
167  "DC1",
168  "DC2",
169  "DC3",
170  "DC4",
171  "NAK",
172  "SYN",
173  "ETB",
174  "CAN",
175  "EM",
176  "SUB",
177  "ESC",
178  "IS4",
179  "IS3",
180  "IS2",
181  "IS1",
182  "space",
183  "exclamation-mark",
184  "quotation-mark",
185  "number-sign",
186  "dollar-sign",
187  "percent-sign",
188  "ampersand",
189  "apostrophe",
190  "left-parenthesis",
191  "right-parenthesis",
192  "asterisk",
193  "plus-sign",
194  "comma",
195  "hyphen",
196  "period",
197  "slash",
198  "zero",
199  "one",
200  "two",
201  "three",
202  "four",
203  "five",
204  "six",
205  "seven",
206  "eight",
207  "nine",
208  "colon",
209  "semicolon",
210  "less-than-sign",
211  "equals-sign",
212  "greater-than-sign",
213  "question-mark",
214  "commercial-at",
215  "A",
216  "B",
217  "C",
218  "D",
219  "E",
220  "F",
221  "G",
222  "H",
223  "I",
224  "J",
225  "K",
226  "L",
227  "M",
228  "N",
229  "O",
230  "P",
231  "Q",
232  "R",
233  "S",
234  "T",
235  "U",
236  "V",
237  "W",
238  "X",
239  "Y",
240  "Z",
241  "left-square-bracket",
242  "backslash",
243  "right-square-bracket",
244  "circumflex",
245  "underscore",
246  "grave-accent",
247  "a",
248  "b",
249  "c",
250  "d",
251  "e",
252  "f",
253  "g",
254  "h",
255  "i",
256  "j",
257  "k",
258  "l",
259  "m",
260  "n",
261  "o",
262  "p",
263  "q",
264  "r",
265  "s",
266  "t",
267  "u",
268  "v",
269  "w",
270  "x",
271  "y",
272  "z",
273  "left-curly-bracket",
274  "vertical-line",
275  "right-curly-bracket",
276  "tilde",
277  "DEL",
278  };
279 
280  string __s(__first, __last);
281  for (const auto& __it : __collatenames)
282  if (__s == __it)
283  return string_type(1, __fctyp.widen(
284  static_cast<char>(&__it - __collatenames)));
285 
286  // TODO Add digraph support:
287  // http://boost.sourceforge.net/libs/regex/doc/collating_names.html
288 
289  return string_type();
290  }
291 
292  template<typename _Ch_type>
293  template<typename _Fwd_iter>
294  typename regex_traits<_Ch_type>::char_class_type
295  regex_traits<_Ch_type>::
296  lookup_classname(_Fwd_iter __first, _Fwd_iter __last, bool __icase) const
297  {
298  typedef std::ctype<char_type> __ctype_type;
299  const __ctype_type& __fctyp(use_facet<__ctype_type>(_M_locale));
300 
301  // Mappings from class name to class mask.
302  static const pair<const char*, char_class_type> __classnames[] =
303  {
304  {"d", ctype_base::digit},
305  {"w", {ctype_base::alnum, _RegexMask::_S_under}},
306  {"s", ctype_base::space},
307  {"alnum", ctype_base::alnum},
308  {"alpha", ctype_base::alpha},
309  {"blank", {0, _RegexMask::_S_blank}},
310  {"cntrl", ctype_base::cntrl},
311  {"digit", ctype_base::digit},
312  {"graph", ctype_base::graph},
313  {"lower", ctype_base::lower},
314  {"print", ctype_base::print},
315  {"punct", ctype_base::punct},
316  {"space", ctype_base::space},
317  {"upper", ctype_base::upper},
318  {"xdigit", ctype_base::xdigit},
319  };
320 
321  string __s;
322  for (auto __cur = __first; __cur != __last; ++__cur)
323  __s += __fctyp.narrow(__fctyp.tolower(*__cur), '?');
324 
325  for (const auto& __it : __classnames)
326  if (__s == __it.first)
327  {
328  if (__icase
329  && ((__it.second
330  & (ctype_base::lower | ctype_base::upper)) != 0))
331  return ctype_base::alpha;
332  return __it.second;
333  }
334  return 0;
335  }
336 
337  template<typename _Ch_type>
338  bool
339  regex_traits<_Ch_type>::
340  isctype(_Ch_type __c, char_class_type __f) const
341  {
342  typedef std::ctype<char_type> __ctype_type;
343  const __ctype_type& __fctyp(use_facet<__ctype_type>(_M_locale));
344 
345  return __fctyp.is(__f._M_base, __c)
346  // [[:w:]]
347  || ((__f._M_extended & _RegexMask::_S_under)
348  && __c == __fctyp.widen('_'))
349  // [[:blank:]]
350  || ((__f._M_extended & _RegexMask::_S_blank)
351  && (__c == __fctyp.widen(' ')
352  || __c == __fctyp.widen('\t')));
353  }
354 
355  template<typename _Ch_type>
356  int
357  regex_traits<_Ch_type>::
358  value(_Ch_type __ch, int __radix) const
359  {
360  std::basic_istringstream<char_type> __is(string_type(1, __ch));
361  long __v;
362  if (__radix == 8)
363  __is >> std::oct;
364  else if (__radix == 16)
365  __is >> std::hex;
366  __is >> __v;
367  return __is.fail() ? -1 : __v;
368  }
369 
370  template<typename _Bi_iter, typename _Alloc>
371  template<typename _Out_iter>
372  _Out_iter match_results<_Bi_iter, _Alloc>::
373  format(_Out_iter __out,
374  const match_results<_Bi_iter, _Alloc>::char_type* __fmt_first,
375  const match_results<_Bi_iter, _Alloc>::char_type* __fmt_last,
376  match_flag_type __flags) const
377  {
378  _GLIBCXX_DEBUG_ASSERT( ready() );
379  regex_traits<char_type> __traits;
380  typedef std::ctype<char_type> __ctype_type;
381  const __ctype_type&
382  __fctyp(use_facet<__ctype_type>(__traits.getloc()));
383 
384  auto __output = [&](size_t __idx)
385  {
386  auto& __sub = _Base_type::operator[](__idx);
387  if (__sub.matched)
388  __out = std::copy(__sub.first, __sub.second, __out);
389  };
390 
391  if (__flags & regex_constants::format_sed)
392  {
393  for (; __fmt_first != __fmt_last;)
394  if (*__fmt_first == '&')
395  {
396  __output(0);
397  ++__fmt_first;
398  }
399  else if (*__fmt_first == '\\')
400  {
401  if (++__fmt_first != __fmt_last
402  && __fctyp.is(__ctype_type::digit, *__fmt_first))
403  __output(__traits.value(*__fmt_first++, 10));
404  else
405  *__out++ = '\\';
406  }
407  else
408  *__out++ = *__fmt_first++;
409  }
410  else
411  {
412  while (1)
413  {
414  auto __next = std::find(__fmt_first, __fmt_last, '$');
415  if (__next == __fmt_last)
416  break;
417 
418  __out = std::copy(__fmt_first, __next, __out);
419 
420  auto __eat = [&](char __ch) -> bool
421  {
422  if (*__next == __ch)
423  {
424  ++__next;
425  return true;
426  }
427  return false;
428  };
429 
430  if (++__next == __fmt_last)
431  *__out++ = '$';
432  else if (__eat('$'))
433  *__out++ = '$';
434  else if (__eat('&'))
435  __output(0);
436  else if (__eat('`'))
437  __output(_Base_type::size()-2);
438  else if (__eat('\''))
439  __output(_Base_type::size()-1);
440  else if (__fctyp.is(__ctype_type::digit, *__next))
441  {
442  long __num = __traits.value(*__next, 10);
443  if (++__next != __fmt_last
444  && __fctyp.is(__ctype_type::digit, *__next))
445  {
446  __num *= 10;
447  __num += __traits.value(*__next++, 10);
448  }
449  if (0 <= __num && __num < this->size())
450  __output(__num);
451  }
452  else
453  *__out++ = '$';
454  __fmt_first = __next;
455  }
456  __out = std::copy(__fmt_first, __fmt_last, __out);
457  }
458  return __out;
459  }
460 
461  template<typename _Out_iter, typename _Bi_iter,
462  typename _Rx_traits, typename _Ch_type>
463  _Out_iter
464  regex_replace(_Out_iter __out, _Bi_iter __first, _Bi_iter __last,
465  const basic_regex<_Ch_type, _Rx_traits>& __e,
466  const _Ch_type* __fmt,
467  regex_constants::match_flag_type __flags)
468  {
469  typedef regex_iterator<_Bi_iter, _Ch_type, _Rx_traits> _IterT;
470  _IterT __i(__first, __last, __e, __flags);
471  _IterT __end;
472  if (__i == __end)
473  {
474  if (!(__flags & regex_constants::format_no_copy))
475  __out = std::copy(__first, __last, __out);
476  }
477  else
478  {
479  sub_match<_Bi_iter> __last;
480  auto __len = char_traits<_Ch_type>::length(__fmt);
481  for (; __i != __end; ++__i)
482  {
483  if (!(__flags & regex_constants::format_no_copy))
484  __out = std::copy(__i->prefix().first, __i->prefix().second,
485  __out);
486  __out = __i->format(__out, __fmt, __fmt + __len, __flags);
487  __last = __i->suffix();
488  if (__flags & regex_constants::format_first_only)
489  break;
490  }
491  if (!(__flags & regex_constants::format_no_copy))
492  __out = std::copy(__last.first, __last.second, __out);
493  }
494  return __out;
495  }
496 
497  template<typename _Bi_iter,
498  typename _Ch_type,
499  typename _Rx_traits>
500  bool
501  regex_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
502  operator==(const regex_iterator& __rhs) const
503  {
504  return (_M_match.empty() && __rhs._M_match.empty())
505  || (_M_begin == __rhs._M_begin
506  && _M_end == __rhs._M_end
507  && _M_pregex == __rhs._M_pregex
508  && _M_flags == __rhs._M_flags
509  && _M_match[0] == __rhs._M_match[0]);
510  }
511 
512  template<typename _Bi_iter,
513  typename _Ch_type,
514  typename _Rx_traits>
515  regex_iterator<_Bi_iter, _Ch_type, _Rx_traits>&
516  regex_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
517  operator++()
518  {
519  // In all cases in which the call to regex_search returns true,
520  // match.prefix().first shall be equal to the previous value of
521  // match[0].second, and for each index i in the half-open range
522  // [0, match.size()) for which match[i].matched is true,
523  // match[i].position() shall return distance(begin, match[i].first).
524  // [28.12.1.4.5]
525  if (_M_match[0].matched)
526  {
527  auto __start = _M_match[0].second;
528  auto __prefix_first = _M_match[0].second;
529  if (_M_match[0].first == _M_match[0].second)
530  {
531  if (__start == _M_end)
532  {
533  _M_match = value_type();
534  return *this;
535  }
536  else
537  {
538  if (regex_search(__start, _M_end, _M_match, *_M_pregex,
539  _M_flags
540  | regex_constants::match_not_null
541  | regex_constants::match_continuous))
542  {
543  _GLIBCXX_DEBUG_ASSERT(_M_match[0].matched);
544  auto& __prefix = _M_match.at(_M_match.size());
545  __prefix.first = __prefix_first;
546  __prefix.matched = __prefix.first != __prefix.second;
547  // [28.12.1.4.5]
548  _M_match._M_begin = _M_begin;
549  return *this;
550  }
551  else
552  ++__start;
553  }
554  }
555  _M_flags |= regex_constants::match_prev_avail;
556  if (regex_search(__start, _M_end, _M_match, *_M_pregex, _M_flags))
557  {
558  _GLIBCXX_DEBUG_ASSERT(_M_match[0].matched);
559  auto& __prefix = _M_match.at(_M_match.size());
560  __prefix.first = __prefix_first;
561  __prefix.matched = __prefix.first != __prefix.second;
562  // [28.12.1.4.5]
563  _M_match._M_begin = _M_begin;
564  }
565  else
566  _M_match = value_type();
567  }
568  return *this;
569  }
570 
571  template<typename _Bi_iter,
572  typename _Ch_type,
573  typename _Rx_traits>
574  regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>&
575  regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
576  operator=(const regex_token_iterator& __rhs)
577  {
578  _M_position = __rhs._M_position;
579  _M_subs = __rhs._M_subs;
580  _M_n = __rhs._M_n;
581  _M_suffix = __rhs._M_suffix;
582  _M_has_m1 = __rhs._M_has_m1;
583  _M_normalize_result();
584  return *this;
585  }
586 
587  template<typename _Bi_iter,
588  typename _Ch_type,
589  typename _Rx_traits>
590  bool
591  regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
592  operator==(const regex_token_iterator& __rhs) const
593  {
594  if (_M_end_of_seq() && __rhs._M_end_of_seq())
595  return true;
596  if (_M_suffix.matched && __rhs._M_suffix.matched
597  && _M_suffix == __rhs._M_suffix)
598  return true;
599  if (_M_end_of_seq() || _M_suffix.matched
600  || __rhs._M_end_of_seq() || __rhs._M_suffix.matched)
601  return false;
602  return _M_position == __rhs._M_position
603  && _M_n == __rhs._M_n
604  && _M_subs == __rhs._M_subs;
605  }
606 
607  template<typename _Bi_iter,
608  typename _Ch_type,
609  typename _Rx_traits>
610  regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>&
611  regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
612  operator++()
613  {
614  _Position __prev = _M_position;
615  if (_M_suffix.matched)
616  *this = regex_token_iterator();
617  else if (_M_n + 1 < _M_subs.size())
618  {
619  _M_n++;
620  _M_result = &_M_current_match();
621  }
622  else
623  {
624  _M_n = 0;
625  ++_M_position;
626  if (_M_position != _Position())
627  _M_result = &_M_current_match();
628  else if (_M_has_m1 && __prev->suffix().length() != 0)
629  {
630  _M_suffix.matched = true;
631  _M_suffix.first = __prev->suffix().first;
632  _M_suffix.second = __prev->suffix().second;
633  _M_result = &_M_suffix;
634  }
635  else
636  *this = regex_token_iterator();
637  }
638  return *this;
639  }
640 
641  template<typename _Bi_iter,
642  typename _Ch_type,
643  typename _Rx_traits>
644  void
645  regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
646  _M_init(_Bi_iter __a, _Bi_iter __b)
647  {
648  _M_has_m1 = false;
649  for (auto __it : _M_subs)
650  if (__it == -1)
651  {
652  _M_has_m1 = true;
653  break;
654  }
655  if (_M_position != _Position())
656  _M_result = &_M_current_match();
657  else if (_M_has_m1)
658  {
659  _M_suffix.matched = true;
660  _M_suffix.first = __a;
661  _M_suffix.second = __b;
662  _M_result = &_M_suffix;
663  }
664  else
665  _M_result = nullptr;
666  }
667 
668 _GLIBCXX_END_NAMESPACE_VERSION
669 } // namespace
670