libstdc++
regex.tcc
1 // class template regex -*- C++ -*-
2 
3 // Copyright (C) 2013-2014 Free Software Foundation, Inc.
4 //
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the
7 // terms of the GNU General Public License as published by the
8 // Free Software Foundation; either version 3, or (at your option)
9 // any later version.
10 
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
15 
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
19 
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 // <http://www.gnu.org/licenses/>.
24 
25 /**
26  * @file bits/regex.tcc
27  * This is an internal header file, included by other library headers.
28  * Do not attempt to use it directly. @headername{regex}
29  */
30 
31 // See below __regex_algo_impl to get what this is talking about. The default
32 // value 1 indicated a conservative optimization without giving up worst case
33 // performance.
34 #ifndef _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT
35 #define _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT 1
36 #endif
37 
38 namespace std _GLIBCXX_VISIBILITY(default)
39 {
40 namespace __detail
41 {
42 _GLIBCXX_BEGIN_NAMESPACE_VERSION
43 
44  // Result of merging regex_match and regex_search.
45  //
46  // __policy now can be _S_auto (auto dispatch) and _S_alternate (use
47  // the other one if possible, for test purpose).
48  //
49  // That __match_mode is true means regex_match, else regex_search.
50  template<typename _BiIter, typename _Alloc,
51  typename _CharT, typename _TraitsT,
52  _RegexExecutorPolicy __policy,
53  bool __match_mode>
54  bool
55  __regex_algo_impl(_BiIter __s,
56  _BiIter __e,
57  match_results<_BiIter, _Alloc>& __m,
58  const basic_regex<_CharT, _TraitsT>& __re,
59  regex_constants::match_flag_type __flags)
60  {
61  if (__re._M_automaton == nullptr)
62  return false;
63 
64  typename match_results<_BiIter, _Alloc>::_Base_type& __res = __m;
65  __res.resize(__re._M_automaton->_M_sub_count() + 2);
66  for (auto& __it : __res)
67  __it.matched = false;
68 
69  // This function decide which executor to use under given circumstances.
70  // The _S_auto policy now is the following: if a NFA has no
71  // back-references and has more than _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT
72  // quantifiers (*, +, ?), the BFS executor will be used, other wise
73  // DFS executor. This is because DFS executor has a exponential upper
74  // bound, but better best-case performace. Meanwhile, BFS executor can
75  // effectively prevent from exponential-long time matching (which must
76  // contains many quantifiers), but it's slower in average.
77  //
78  // For simple regex, BFS executor could be 2 or more times slower than
79  // DFS executor.
80  //
81  // Of course, BFS executor cannot handle back-references.
82  bool __ret;
83  if (!__re._M_automaton->_M_has_backref
84  && (__policy == _RegexExecutorPolicy::_S_alternate
85  || __re._M_automaton->_M_quant_count
86  > _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT))
87  {
88  _Executor<_BiIter, _Alloc, _TraitsT, false>
89  __executor(__s, __e, __m, __re, __flags);
90  if (__match_mode)
91  __ret = __executor._M_match();
92  else
93  __ret = __executor._M_search();
94  }
95  else
96  {
97  _Executor<_BiIter, _Alloc, _TraitsT, true>
98  __executor(__s, __e, __m, __re, __flags);
99  if (__match_mode)
100  __ret = __executor._M_match();
101  else
102  __ret = __executor._M_search();
103  }
104  if (__ret)
105  {
106  for (auto __it : __res)
107  if (!__it.matched)
108  __it.first = __it.second = __e;
109  auto& __pre = __res[__res.size()-2];
110  auto& __suf = __res[__res.size()-1];
111  if (__match_mode)
112  {
113  __pre.matched = false;
114  __pre.first = __s;
115  __pre.second = __s;
116  __suf.matched = false;
117  __suf.first = __e;
118  __suf.second = __e;
119  }
120  else
121  {
122  __pre.first = __s;
123  __pre.second = __res[0].first;
124  __pre.matched = (__pre.first != __pre.second);
125  __suf.first = __res[0].second;
126  __suf.second = __e;
127  __suf.matched = (__suf.first != __suf.second);
128  }
129  }
130  return __ret;
131  }
132 
133 _GLIBCXX_END_NAMESPACE_VERSION
134 }
135 
136 _GLIBCXX_BEGIN_NAMESPACE_VERSION
137 
138  template<typename _Ch_type>
139  template<typename _Fwd_iter>
140  typename regex_traits<_Ch_type>::string_type
141  regex_traits<_Ch_type>::
142  lookup_collatename(_Fwd_iter __first, _Fwd_iter __last) const
143  {
144  typedef std::ctype<char_type> __ctype_type;
145  const __ctype_type& __fctyp(use_facet<__ctype_type>(_M_locale));
146 
147  static const char* __collatenames[] =
148  {
149  "NUL",
150  "SOH",
151  "STX",
152  "ETX",
153  "EOT",
154  "ENQ",
155  "ACK",
156  "alert",
157  "backspace",
158  "tab",
159  "newline",
160  "vertical-tab",
161  "form-feed",
162  "carriage-return",
163  "SO",
164  "SI",
165  "DLE",
166  "DC1",
167  "DC2",
168  "DC3",
169  "DC4",
170  "NAK",
171  "SYN",
172  "ETB",
173  "CAN",
174  "EM",
175  "SUB",
176  "ESC",
177  "IS4",
178  "IS3",
179  "IS2",
180  "IS1",
181  "space",
182  "exclamation-mark",
183  "quotation-mark",
184  "number-sign",
185  "dollar-sign",
186  "percent-sign",
187  "ampersand",
188  "apostrophe",
189  "left-parenthesis",
190  "right-parenthesis",
191  "asterisk",
192  "plus-sign",
193  "comma",
194  "hyphen",
195  "period",
196  "slash",
197  "zero",
198  "one",
199  "two",
200  "three",
201  "four",
202  "five",
203  "six",
204  "seven",
205  "eight",
206  "nine",
207  "colon",
208  "semicolon",
209  "less-than-sign",
210  "equals-sign",
211  "greater-than-sign",
212  "question-mark",
213  "commercial-at",
214  "A",
215  "B",
216  "C",
217  "D",
218  "E",
219  "F",
220  "G",
221  "H",
222  "I",
223  "J",
224  "K",
225  "L",
226  "M",
227  "N",
228  "O",
229  "P",
230  "Q",
231  "R",
232  "S",
233  "T",
234  "U",
235  "V",
236  "W",
237  "X",
238  "Y",
239  "Z",
240  "left-square-bracket",
241  "backslash",
242  "right-square-bracket",
243  "circumflex",
244  "underscore",
245  "grave-accent",
246  "a",
247  "b",
248  "c",
249  "d",
250  "e",
251  "f",
252  "g",
253  "h",
254  "i",
255  "j",
256  "k",
257  "l",
258  "m",
259  "n",
260  "o",
261  "p",
262  "q",
263  "r",
264  "s",
265  "t",
266  "u",
267  "v",
268  "w",
269  "x",
270  "y",
271  "z",
272  "left-curly-bracket",
273  "vertical-line",
274  "right-curly-bracket",
275  "tilde",
276  "DEL",
277  ""
278  };
279 
280  // same as boost
281  //static const char* __digraphs[] =
282  // {
283  // "ae",
284  // "Ae",
285  // "AE",
286  // "ch",
287  // "Ch",
288  // "CH",
289  // "ll",
290  // "Ll",
291  // "LL",
292  // "ss",
293  // "Ss",
294  // "SS",
295  // "nj",
296  // "Nj",
297  // "NJ",
298  // "dz",
299  // "Dz",
300  // "DZ",
301  // "lj",
302  // "Lj",
303  // "LJ",
304  // ""
305  // };
306 
307  std::string __s(__last - __first, '?');
308  __fctyp.narrow(__first, __last, '?', &*__s.begin());
309 
310  for (unsigned int __i = 0; *__collatenames[__i]; __i++)
311  if (__s == __collatenames[__i])
312  return string_type(1, __fctyp.widen(static_cast<char>(__i)));
313 
314  //for (unsigned int __i = 0; *__digraphs[__i]; __i++)
315  // {
316  // const char* __now = __digraphs[__i];
317  // if (__s == __now)
318  // {
319  // string_type ret(__s.size(), __fctyp.widen('?'));
320  // __fctyp.widen(__now, __now + 2/* ouch */, &*ret.begin());
321  // return ret;
322  // }
323  // }
324  return string_type();
325  }
326 
327  template<typename _Ch_type>
328  template<typename _Fwd_iter>
329  typename regex_traits<_Ch_type>::char_class_type
330  regex_traits<_Ch_type>::
331  lookup_classname(_Fwd_iter __first, _Fwd_iter __last, bool __icase) const
332  {
333  typedef std::ctype<char_type> __ctype_type;
334  typedef std::ctype<char> __cctype_type;
335  typedef const pair<const char*, char_class_type> _ClassnameEntry;
336  const __ctype_type& __fctyp(use_facet<__ctype_type>(_M_locale));
337  const __cctype_type& __cctyp(use_facet<__cctype_type>(_M_locale));
338 
339  static _ClassnameEntry __classnames[] =
340  {
341  {"d", ctype_base::digit},
342  {"w", {ctype_base::alnum, _RegexMask::_S_under}},
343  {"s", ctype_base::space},
344  {"alnum", ctype_base::alnum},
345  {"alpha", ctype_base::alpha},
346  {"blank", {0, _RegexMask::_S_blank}},
347  {"cntrl", ctype_base::cntrl},
348  {"digit", ctype_base::digit},
349  {"graph", ctype_base::graph},
350  {"lower", ctype_base::lower},
351  {"print", ctype_base::print},
352  {"punct", ctype_base::punct},
353  {"space", ctype_base::space},
354  {"upper", ctype_base::upper},
355  {"xdigit", ctype_base::xdigit},
356  };
357 
358  std::string __s(__last - __first, '?');
359  __fctyp.narrow(__first, __last, '?', &__s[0]);
360  __cctyp.tolower(&*__s.begin(), &*__s.begin() + __s.size());
361  for (_ClassnameEntry* __it = __classnames;
362  __it < *(&__classnames + 1);
363  ++__it)
364  {
365  if (__s == __it->first)
366  {
367  if (__icase
368  && ((__it->second
369  & (ctype_base::lower | ctype_base::upper)) != 0))
370  return ctype_base::alpha;
371  return __it->second;
372  }
373  }
374  return 0;
375  }
376 
377  template<typename _Ch_type>
378  bool
379  regex_traits<_Ch_type>::
380  isctype(_Ch_type __c, char_class_type __f) const
381  {
382  typedef std::ctype<char_type> __ctype_type;
383  const __ctype_type& __fctyp(use_facet<__ctype_type>(_M_locale));
384 
385  return __fctyp.is(__f._M_base, __c)
386  // [[:w:]]
387  || ((__f._M_extended & _RegexMask::_S_under)
388  && __c == __fctyp.widen('_'))
389  // [[:blank:]]
390  || ((__f._M_extended & _RegexMask::_S_blank)
391  && (__c == __fctyp.widen(' ')
392  || __c == __fctyp.widen('\t')));
393  }
394 
395  template<typename _Ch_type>
396  int
397  regex_traits<_Ch_type>::
398  value(_Ch_type __ch, int __radix) const
399  {
400  std::basic_istringstream<char_type> __is(string_type(1, __ch));
401  long __v;
402  if (__radix == 8)
403  __is >> std::oct;
404  else if (__radix == 16)
405  __is >> std::hex;
406  __is >> __v;
407  return __is.fail() ? -1 : __v;
408  }
409 
410  template<typename _Bi_iter, typename _Alloc>
411  template<typename _Out_iter>
412  _Out_iter match_results<_Bi_iter, _Alloc>::
413  format(_Out_iter __out,
414  const match_results<_Bi_iter, _Alloc>::char_type* __fmt_first,
415  const match_results<_Bi_iter, _Alloc>::char_type* __fmt_last,
416  match_flag_type __flags) const
417  {
418  _GLIBCXX_DEBUG_ASSERT( ready() );
419  regex_traits<char_type> __traits;
420  typedef std::ctype<char_type> __ctype_type;
421  const __ctype_type&
422  __fctyp(use_facet<__ctype_type>(__traits.getloc()));
423 
424  auto __output = [&](size_t __idx)
425  {
426  auto& __sub = _Base_type::operator[](__idx);
427  if (__sub.matched)
428  __out = std::copy(__sub.first, __sub.second, __out);
429  };
430 
431  if (__flags & regex_constants::format_sed)
432  {
433  for (; __fmt_first != __fmt_last;)
434  if (*__fmt_first == '&')
435  {
436  __output(0);
437  ++__fmt_first;
438  }
439  else if (*__fmt_first == '\\')
440  {
441  if (++__fmt_first != __fmt_last
442  && __fctyp.is(__ctype_type::digit, *__fmt_first))
443  __output(__traits.value(*__fmt_first++, 10));
444  else
445  *__out++ = '\\';
446  }
447  else
448  *__out++ = *__fmt_first++;
449  }
450  else
451  {
452  while (1)
453  {
454  auto __next = std::find(__fmt_first, __fmt_last, '$');
455  if (__next == __fmt_last)
456  break;
457 
458  __out = std::copy(__fmt_first, __next, __out);
459 
460  auto __eat = [&](char __ch) -> bool
461  {
462  if (*__next == __ch)
463  {
464  ++__next;
465  return true;
466  }
467  return false;
468  };
469 
470  if (++__next == __fmt_last)
471  *__out++ = '$';
472  else if (__eat('$'))
473  *__out++ = '$';
474  else if (__eat('&'))
475  __output(0);
476  else if (__eat('`'))
477  __output(_Base_type::size()-2);
478  else if (__eat('\''))
479  __output(_Base_type::size()-1);
480  else if (__fctyp.is(__ctype_type::digit, *__next))
481  {
482  long __num = __traits.value(*__next, 10);
483  if (++__next != __fmt_last
484  && __fctyp.is(__ctype_type::digit, *__next))
485  {
486  __num *= 10;
487  __num += __traits.value(*__next++, 10);
488  }
489  if (0 <= __num && __num < this->size())
490  __output(__num);
491  }
492  else
493  *__out++ = '$';
494  __fmt_first = __next;
495  }
496  __out = std::copy(__fmt_first, __fmt_last, __out);
497  }
498  return __out;
499  }
500 
501  template<typename _Out_iter, typename _Bi_iter,
502  typename _Rx_traits, typename _Ch_type>
503  _Out_iter
504  regex_replace(_Out_iter __out, _Bi_iter __first, _Bi_iter __last,
505  const basic_regex<_Ch_type, _Rx_traits>& __e,
506  const _Ch_type* __fmt,
507  regex_constants::match_flag_type __flags)
508  {
509  typedef regex_iterator<_Bi_iter, _Ch_type, _Rx_traits> _IterT;
510  _IterT __i(__first, __last, __e, __flags);
511  _IterT __end;
512  if (__i == __end)
513  {
514  if (!(__flags & regex_constants::format_no_copy))
515  __out = std::copy(__first, __last, __out);
516  }
517  else
518  {
519  sub_match<_Bi_iter> __last;
520  auto __len = char_traits<_Ch_type>::length(__fmt);
521  for (; __i != __end; ++__i)
522  {
523  if (!(__flags & regex_constants::format_no_copy))
524  __out = std::copy(__i->prefix().first, __i->prefix().second,
525  __out);
526  __out = __i->format(__out, __fmt, __fmt + __len, __flags);
527  __last = __i->suffix();
528  if (__flags & regex_constants::format_first_only)
529  break;
530  }
531  if (!(__flags & regex_constants::format_no_copy))
532  __out = std::copy(__last.first, __last.second, __out);
533  }
534  return __out;
535  }
536 
537  template<typename _Bi_iter,
538  typename _Ch_type,
539  typename _Rx_traits>
540  bool
541  regex_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
542  operator==(const regex_iterator& __rhs) const
543  {
544  return (_M_match.empty() && __rhs._M_match.empty())
545  || (_M_begin == __rhs._M_begin
546  && _M_end == __rhs._M_end
547  && _M_pregex == __rhs._M_pregex
548  && _M_flags == __rhs._M_flags
549  && _M_match[0] == __rhs._M_match[0]);
550  }
551 
552  template<typename _Bi_iter,
553  typename _Ch_type,
554  typename _Rx_traits>
555  regex_iterator<_Bi_iter, _Ch_type, _Rx_traits>&
556  regex_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
557  operator++()
558  {
559  // In all cases in which the call to regex_search returns true,
560  // match.prefix().first shall be equal to the previous value of
561  // match[0].second, and for each index i in the half-open range
562  // [0, match.size()) for which match[i].matched is true,
563  // match[i].position() shall return distance(begin, match[i].first).
564  // [28.12.1.4.5]
565  if (_M_match[0].matched)
566  {
567  auto __start = _M_match[0].second;
568  auto __prefix_first = _M_match[0].second;
569  if (_M_match[0].first == _M_match[0].second)
570  {
571  if (__start == _M_end)
572  {
573  _M_match = value_type();
574  return *this;
575  }
576  else
577  {
578  if (regex_search(__start, _M_end, _M_match, *_M_pregex,
579  _M_flags
580  | regex_constants::match_not_null
581  | regex_constants::match_continuous))
582  {
583  _GLIBCXX_DEBUG_ASSERT(_M_match[0].matched);
584  auto& __prefix = _M_match.at(_M_match.size());
585  __prefix.first = __prefix_first;
586  __prefix.matched = __prefix.first != __prefix.second;
587  _M_match._M_in_iterator = true;
588  _M_match._M_begin = _M_begin;
589  return *this;
590  }
591  else
592  ++__start;
593  }
594  }
595  _M_flags |= regex_constants::match_prev_avail;
596  if (regex_search(__start, _M_end, _M_match, *_M_pregex, _M_flags))
597  {
598  _GLIBCXX_DEBUG_ASSERT(_M_match[0].matched);
599  auto& __prefix = _M_match.at(_M_match.size());
600  __prefix.first = __prefix_first;
601  __prefix.matched = __prefix.first != __prefix.second;
602  _M_match._M_in_iterator = true;
603  _M_match._M_begin = _M_begin;
604  }
605  else
606  _M_match = value_type();
607  }
608  return *this;
609  }
610 
611  template<typename _Bi_iter,
612  typename _Ch_type,
613  typename _Rx_traits>
614  regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>&
615  regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
616  operator=(const regex_token_iterator& __rhs)
617  {
618  _M_position = __rhs._M_position;
619  _M_subs = __rhs._M_subs;
620  _M_n = __rhs._M_n;
621  _M_result = __rhs._M_result;
622  _M_suffix = __rhs._M_suffix;
623  _M_has_m1 = __rhs._M_has_m1;
624  if (__rhs._M_result == &__rhs._M_suffix)
625  _M_result = &_M_suffix;
626  return *this;
627  }
628 
629  template<typename _Bi_iter,
630  typename _Ch_type,
631  typename _Rx_traits>
632  bool
633  regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
634  operator==(const regex_token_iterator& __rhs) const
635  {
636  if (_M_end_of_seq() && __rhs._M_end_of_seq())
637  return true;
638  if (_M_suffix.matched && __rhs._M_suffix.matched
639  && _M_suffix == __rhs._M_suffix)
640  return true;
641  if (_M_end_of_seq() || _M_suffix.matched
642  || __rhs._M_end_of_seq() || __rhs._M_suffix.matched)
643  return false;
644  return _M_position == __rhs._M_position
645  && _M_n == __rhs._M_n
646  && _M_subs == __rhs._M_subs;
647  }
648 
649  template<typename _Bi_iter,
650  typename _Ch_type,
651  typename _Rx_traits>
652  regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>&
653  regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
654  operator++()
655  {
656  _Position __prev = _M_position;
657  if (_M_suffix.matched)
658  *this = regex_token_iterator();
659  else if (_M_n + 1 < _M_subs.size())
660  {
661  _M_n++;
662  _M_result = &_M_current_match();
663  }
664  else
665  {
666  _M_n = 0;
667  ++_M_position;
668  if (_M_position != _Position())
669  _M_result = &_M_current_match();
670  else if (_M_has_m1 && __prev->suffix().length() != 0)
671  {
672  _M_suffix.matched = true;
673  _M_suffix.first = __prev->suffix().first;
674  _M_suffix.second = __prev->suffix().second;
675  _M_result = &_M_suffix;
676  }
677  else
678  *this = regex_token_iterator();
679  }
680  return *this;
681  }
682 
683  template<typename _Bi_iter,
684  typename _Ch_type,
685  typename _Rx_traits>
686  void
687  regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
688  _M_init(_Bi_iter __a, _Bi_iter __b)
689  {
690  _M_has_m1 = false;
691  for (auto __it : _M_subs)
692  if (__it == -1)
693  {
694  _M_has_m1 = true;
695  break;
696  }
697  if (_M_position != _Position())
698  _M_result = &_M_current_match();
699  else if (_M_has_m1)
700  {
701  _M_suffix.matched = true;
702  _M_suffix.first = __a;
703  _M_suffix.second = __b;
704  _M_result = &_M_suffix;
705  }
706  else
707  _M_result = nullptr;
708  }
709 
710 _GLIBCXX_END_NAMESPACE_VERSION
711 } // namespace
712