You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

402 lines
10KB

  1. // class template regex -*- C++ -*-
  2. // Copyright (C) 2013-2020 Free Software Foundation, Inc.
  3. //
  4. // This file is part of the GNU ISO C++ Library. This library is free
  5. // software; you can redistribute it and/or modify it under the
  6. // terms of the GNU General Public License as published by the
  7. // Free Software Foundation; either version 3, or (at your option)
  8. // any later version.
  9. // This library is distributed in the hope that it will be useful,
  10. // but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. // GNU General Public License for more details.
  13. // Under Section 7 of GPL version 3, you are granted additional
  14. // permissions described in the GCC Runtime Library Exception, version
  15. // 3.1, as published by the Free Software Foundation.
  16. // You should have received a copy of the GNU General Public License and
  17. // a copy of the GCC Runtime Library Exception along with this program;
  18. // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
  19. // <http://www.gnu.org/licenses/>.
  20. /**
  21. * @file bits/regex_automaton.h
  22. * This is an internal header file, included by other library headers.
  23. * Do not attempt to use it directly. @headername{regex}
  24. */
  25. // This macro defines the maximal state number a NFA can have.
  26. #ifndef _GLIBCXX_REGEX_STATE_LIMIT
  27. #define _GLIBCXX_REGEX_STATE_LIMIT 100000
  28. #endif
  29. namespace std _GLIBCXX_VISIBILITY(default)
  30. {
  31. _GLIBCXX_BEGIN_NAMESPACE_VERSION
  32. namespace __detail
  33. {
  34. /**
  35. * @defgroup regex-detail Base and Implementation Classes
  36. * @ingroup regex
  37. * @{
  38. */
  39. typedef long _StateIdT;
  40. static const _StateIdT _S_invalid_state_id = -1;
  41. template<typename _CharT>
  42. using _Matcher = std::function<bool (_CharT)>;
  43. /// Operation codes that define the type of transitions within the base NFA
  44. /// that represents the regular expression.
  45. enum _Opcode : int
  46. {
  47. _S_opcode_unknown,
  48. _S_opcode_alternative,
  49. _S_opcode_repeat,
  50. _S_opcode_backref,
  51. _S_opcode_line_begin_assertion,
  52. _S_opcode_line_end_assertion,
  53. _S_opcode_word_boundary,
  54. _S_opcode_subexpr_lookahead,
  55. _S_opcode_subexpr_begin,
  56. _S_opcode_subexpr_end,
  57. _S_opcode_dummy,
  58. _S_opcode_match,
  59. _S_opcode_accept,
  60. };
  61. struct _State_base
  62. {
  63. protected:
  64. _Opcode _M_opcode; // type of outgoing transition
  65. public:
  66. _StateIdT _M_next; // outgoing transition
  67. union // Since they are mutually exclusive.
  68. {
  69. size_t _M_subexpr; // for _S_opcode_subexpr_*
  70. size_t _M_backref_index; // for _S_opcode_backref
  71. struct
  72. {
  73. // for _S_opcode_alternative, _S_opcode_repeat and
  74. // _S_opcode_subexpr_lookahead
  75. _StateIdT _M_alt;
  76. // for _S_opcode_word_boundary or _S_opcode_subexpr_lookahead or
  77. // quantifiers (ungreedy if set true)
  78. bool _M_neg;
  79. };
  80. // For _S_opcode_match
  81. __gnu_cxx::__aligned_membuf<_Matcher<char>> _M_matcher_storage;
  82. };
  83. protected:
  84. explicit _State_base(_Opcode __opcode)
  85. : _M_opcode(__opcode), _M_next(_S_invalid_state_id)
  86. { }
  87. public:
  88. bool
  89. _M_has_alt()
  90. {
  91. return _M_opcode == _S_opcode_alternative
  92. || _M_opcode == _S_opcode_repeat
  93. || _M_opcode == _S_opcode_subexpr_lookahead;
  94. }
  95. #ifdef _GLIBCXX_DEBUG
  96. std::ostream&
  97. _M_print(std::ostream& ostr) const;
  98. // Prints graphviz dot commands for state.
  99. std::ostream&
  100. _M_dot(std::ostream& __ostr, _StateIdT __id) const;
  101. #endif
  102. };
  103. template<typename _Char_type>
  104. struct _State : _State_base
  105. {
  106. typedef _Matcher<_Char_type> _MatcherT;
  107. static_assert(sizeof(_MatcherT) == sizeof(_Matcher<char>),
  108. "std::function<bool(T)> has the same size as "
  109. "std::function<bool(char)>");
  110. static_assert(alignof(_MatcherT) == alignof(_Matcher<char>),
  111. "std::function<bool(T)> has the same alignment as "
  112. "std::function<bool(char)>");
  113. explicit
  114. _State(_Opcode __opcode) : _State_base(__opcode)
  115. {
  116. if (_M_opcode() == _S_opcode_match)
  117. new (this->_M_matcher_storage._M_addr()) _MatcherT();
  118. }
  119. _State(const _State& __rhs) : _State_base(__rhs)
  120. {
  121. if (__rhs._M_opcode() == _S_opcode_match)
  122. new (this->_M_matcher_storage._M_addr())
  123. _MatcherT(__rhs._M_get_matcher());
  124. }
  125. _State(_State&& __rhs) : _State_base(__rhs)
  126. {
  127. if (__rhs._M_opcode() == _S_opcode_match)
  128. new (this->_M_matcher_storage._M_addr())
  129. _MatcherT(std::move(__rhs._M_get_matcher()));
  130. }
  131. _State&
  132. operator=(const _State&) = delete;
  133. ~_State()
  134. {
  135. if (_M_opcode() == _S_opcode_match)
  136. _M_get_matcher().~_MatcherT();
  137. }
  138. // Since correct ctor and dtor rely on _M_opcode, it's better not to
  139. // change it over time.
  140. _Opcode
  141. _M_opcode() const
  142. { return _State_base::_M_opcode; }
  143. bool
  144. _M_matches(_Char_type __char) const
  145. { return _M_get_matcher()(__char); }
  146. _MatcherT&
  147. _M_get_matcher()
  148. { return *static_cast<_MatcherT*>(this->_M_matcher_storage._M_addr()); }
  149. const _MatcherT&
  150. _M_get_matcher() const
  151. {
  152. return *static_cast<const _MatcherT*>(
  153. this->_M_matcher_storage._M_addr());
  154. }
  155. };
  156. struct _NFA_base
  157. {
  158. typedef size_t _SizeT;
  159. typedef regex_constants::syntax_option_type _FlagT;
  160. explicit
  161. _NFA_base(_FlagT __f)
  162. : _M_flags(__f), _M_start_state(0), _M_subexpr_count(0),
  163. _M_has_backref(false)
  164. { }
  165. _NFA_base(_NFA_base&&) = default;
  166. protected:
  167. ~_NFA_base() = default;
  168. public:
  169. _FlagT
  170. _M_options() const
  171. { return _M_flags; }
  172. _StateIdT
  173. _M_start() const
  174. { return _M_start_state; }
  175. _SizeT
  176. _M_sub_count() const
  177. { return _M_subexpr_count; }
  178. _GLIBCXX_STD_C::vector<size_t> _M_paren_stack;
  179. _FlagT _M_flags;
  180. _StateIdT _M_start_state;
  181. _SizeT _M_subexpr_count;
  182. bool _M_has_backref;
  183. };
  184. template<typename _TraitsT>
  185. struct _NFA
  186. : _NFA_base, _GLIBCXX_STD_C::vector<_State<typename _TraitsT::char_type>>
  187. {
  188. typedef typename _TraitsT::char_type _Char_type;
  189. typedef _State<_Char_type> _StateT;
  190. typedef _Matcher<_Char_type> _MatcherT;
  191. _NFA(const typename _TraitsT::locale_type& __loc, _FlagT __flags)
  192. : _NFA_base(__flags)
  193. { _M_traits.imbue(__loc); }
  194. // for performance reasons _NFA objects should only be moved not copied
  195. _NFA(const _NFA&) = delete;
  196. _NFA(_NFA&&) = default;
  197. _StateIdT
  198. _M_insert_accept()
  199. {
  200. auto __ret = _M_insert_state(_StateT(_S_opcode_accept));
  201. return __ret;
  202. }
  203. _StateIdT
  204. _M_insert_alt(_StateIdT __next, _StateIdT __alt,
  205. bool __neg __attribute__((__unused__)))
  206. {
  207. _StateT __tmp(_S_opcode_alternative);
  208. // It labels every quantifier to make greedy comparison easier in BFS
  209. // approach.
  210. __tmp._M_next = __next;
  211. __tmp._M_alt = __alt;
  212. return _M_insert_state(std::move(__tmp));
  213. }
  214. _StateIdT
  215. _M_insert_repeat(_StateIdT __next, _StateIdT __alt, bool __neg)
  216. {
  217. _StateT __tmp(_S_opcode_repeat);
  218. // It labels every quantifier to make greedy comparison easier in BFS
  219. // approach.
  220. __tmp._M_next = __next;
  221. __tmp._M_alt = __alt;
  222. __tmp._M_neg = __neg;
  223. return _M_insert_state(std::move(__tmp));
  224. }
  225. _StateIdT
  226. _M_insert_matcher(_MatcherT __m)
  227. {
  228. _StateT __tmp(_S_opcode_match);
  229. __tmp._M_get_matcher() = std::move(__m);
  230. return _M_insert_state(std::move(__tmp));
  231. }
  232. _StateIdT
  233. _M_insert_subexpr_begin()
  234. {
  235. auto __id = this->_M_subexpr_count++;
  236. this->_M_paren_stack.push_back(__id);
  237. _StateT __tmp(_S_opcode_subexpr_begin);
  238. __tmp._M_subexpr = __id;
  239. return _M_insert_state(std::move(__tmp));
  240. }
  241. _StateIdT
  242. _M_insert_subexpr_end()
  243. {
  244. _StateT __tmp(_S_opcode_subexpr_end);
  245. __tmp._M_subexpr = this->_M_paren_stack.back();
  246. this->_M_paren_stack.pop_back();
  247. return _M_insert_state(std::move(__tmp));
  248. }
  249. _StateIdT
  250. _M_insert_backref(size_t __index);
  251. _StateIdT
  252. _M_insert_line_begin()
  253. { return _M_insert_state(_StateT(_S_opcode_line_begin_assertion)); }
  254. _StateIdT
  255. _M_insert_line_end()
  256. { return _M_insert_state(_StateT(_S_opcode_line_end_assertion)); }
  257. _StateIdT
  258. _M_insert_word_bound(bool __neg)
  259. {
  260. _StateT __tmp(_S_opcode_word_boundary);
  261. __tmp._M_neg = __neg;
  262. return _M_insert_state(std::move(__tmp));
  263. }
  264. _StateIdT
  265. _M_insert_lookahead(_StateIdT __alt, bool __neg)
  266. {
  267. _StateT __tmp(_S_opcode_subexpr_lookahead);
  268. __tmp._M_alt = __alt;
  269. __tmp._M_neg = __neg;
  270. return _M_insert_state(std::move(__tmp));
  271. }
  272. _StateIdT
  273. _M_insert_dummy()
  274. { return _M_insert_state(_StateT(_S_opcode_dummy)); }
  275. _StateIdT
  276. _M_insert_state(_StateT __s)
  277. {
  278. this->push_back(std::move(__s));
  279. if (this->size() > _GLIBCXX_REGEX_STATE_LIMIT)
  280. __throw_regex_error(
  281. regex_constants::error_space,
  282. "Number of NFA states exceeds limit. Please use shorter regex "
  283. "string, or use smaller brace expression, or make "
  284. "_GLIBCXX_REGEX_STATE_LIMIT larger.");
  285. return this->size() - 1;
  286. }
  287. // Eliminate dummy node in this NFA to make it compact.
  288. void
  289. _M_eliminate_dummy();
  290. #ifdef _GLIBCXX_DEBUG
  291. std::ostream&
  292. _M_dot(std::ostream& __ostr) const;
  293. #endif
  294. public:
  295. _TraitsT _M_traits;
  296. };
  297. /// Describes a sequence of one or more %_State, its current start
  298. /// and end(s). This structure contains fragments of an NFA during
  299. /// construction.
  300. template<typename _TraitsT>
  301. class _StateSeq
  302. {
  303. public:
  304. typedef _NFA<_TraitsT> _RegexT;
  305. public:
  306. _StateSeq(_RegexT& __nfa, _StateIdT __s)
  307. : _M_nfa(__nfa), _M_start(__s), _M_end(__s)
  308. { }
  309. _StateSeq(_RegexT& __nfa, _StateIdT __s, _StateIdT __end)
  310. : _M_nfa(__nfa), _M_start(__s), _M_end(__end)
  311. { }
  312. // Append a state on *this and change *this to the new sequence.
  313. void
  314. _M_append(_StateIdT __id)
  315. {
  316. _M_nfa[_M_end]._M_next = __id;
  317. _M_end = __id;
  318. }
  319. // Append a sequence on *this and change *this to the new sequence.
  320. void
  321. _M_append(const _StateSeq& __s)
  322. {
  323. _M_nfa[_M_end]._M_next = __s._M_start;
  324. _M_end = __s._M_end;
  325. }
  326. // Clones an entire sequence.
  327. _StateSeq
  328. _M_clone();
  329. public:
  330. _RegexT& _M_nfa;
  331. _StateIdT _M_start;
  332. _StateIdT _M_end;
  333. };
  334. //@} regex-detail
  335. } // namespace __detail
  336. _GLIBCXX_END_NAMESPACE_VERSION
  337. } // namespace std
  338. #include <bits/regex_automaton.tcc>