You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

590 lines
15KB

  1. // class template regex -*- C++ -*-
  2. // Copyright (C) 2013-2020 Free Software Foundation, Inc.
  3. //
  4. // This file is part of the GNU ISO C++ Library. This library is free
  5. // software; you can redistribute it and/or modify it under the
  6. // terms of the GNU General Public License as published by the
  7. // Free Software Foundation; either version 3, or (at your option)
  8. // any later version.
  9. // This library is distributed in the hope that it will be useful,
  10. // but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. // GNU General Public License for more details.
  13. // Under Section 7 of GPL version 3, you are granted additional
  14. // permissions described in the GCC Runtime Library Exception, version
  15. // 3.1, as published by the Free Software Foundation.
  16. // You should have received a copy of the GNU General Public License and
  17. // a copy of the GCC Runtime Library Exception along with this program;
  18. // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
  19. // <http://www.gnu.org/licenses/>.
  20. /**
  21. * @file bits/regex_scanner.tcc
  22. * This is an internal header file, included by other library headers.
  23. * Do not attempt to use it directly. @headername{regex}
  24. */
  25. // FIXME make comments doxygen format.
  26. // N3376 specified 6 regex styles: ECMAScript, basic, extended, grep, egrep
  27. // and awk
  28. // 1) grep is basic except '\n' is treated as '|'
  29. // 2) egrep is extended except '\n' is treated as '|'
  30. // 3) awk is extended except special escaping rules, and there's no
  31. // back-reference.
  32. //
  33. // References:
  34. //
  35. // ECMAScript: ECMA-262 15.10
  36. //
  37. // basic, extended:
  38. // http://pubs.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap09.html
  39. //
  40. // awk: http://pubs.opengroup.org/onlinepubs/000095399/utilities/awk.html
  41. namespace std _GLIBCXX_VISIBILITY(default)
  42. {
  43. _GLIBCXX_BEGIN_NAMESPACE_VERSION
  44. namespace __detail
  45. {
  46. template<typename _CharT>
  47. _Scanner<_CharT>::
  48. _Scanner(typename _Scanner::_IterT __begin,
  49. typename _Scanner::_IterT __end,
  50. _FlagT __flags, std::locale __loc)
  51. : _ScannerBase(__flags),
  52. _M_current(__begin), _M_end(__end),
  53. _M_ctype(std::use_facet<_CtypeT>(__loc)),
  54. _M_eat_escape(_M_is_ecma()
  55. ? &_Scanner::_M_eat_escape_ecma
  56. : &_Scanner::_M_eat_escape_posix)
  57. { _M_advance(); }
  58. template<typename _CharT>
  59. void
  60. _Scanner<_CharT>::
  61. _M_advance()
  62. {
  63. if (_M_current == _M_end)
  64. {
  65. _M_token = _S_token_eof;
  66. return;
  67. }
  68. if (_M_state == _S_state_normal)
  69. _M_scan_normal();
  70. else if (_M_state == _S_state_in_bracket)
  71. _M_scan_in_bracket();
  72. else if (_M_state == _S_state_in_brace)
  73. _M_scan_in_brace();
  74. else
  75. {
  76. __glibcxx_assert(false);
  77. }
  78. }
  79. // Differences between styles:
  80. // 1) "\(", "\)", "\{" in basic. It's not escaping.
  81. // 2) "(?:", "(?=", "(?!" in ECMAScript.
  82. template<typename _CharT>
  83. void
  84. _Scanner<_CharT>::
  85. _M_scan_normal()
  86. {
  87. auto __c = *_M_current++;
  88. if (std::strchr(_M_spec_char, _M_ctype.narrow(__c, ' ')) == nullptr)
  89. {
  90. _M_token = _S_token_ord_char;
  91. _M_value.assign(1, __c);
  92. return;
  93. }
  94. if (__c == '\\')
  95. {
  96. if (_M_current == _M_end)
  97. __throw_regex_error(
  98. regex_constants::error_escape,
  99. "Unexpected end of regex when escaping.");
  100. if (!_M_is_basic()
  101. || (*_M_current != '('
  102. && *_M_current != ')'
  103. && *_M_current != '{'))
  104. {
  105. (this->*_M_eat_escape)();
  106. return;
  107. }
  108. __c = *_M_current++;
  109. }
  110. if (__c == '(')
  111. {
  112. if (_M_is_ecma() && *_M_current == '?')
  113. {
  114. if (++_M_current == _M_end)
  115. __throw_regex_error(
  116. regex_constants::error_paren,
  117. "Unexpected end of regex when in an open parenthesis.");
  118. if (*_M_current == ':')
  119. {
  120. ++_M_current;
  121. _M_token = _S_token_subexpr_no_group_begin;
  122. }
  123. else if (*_M_current == '=')
  124. {
  125. ++_M_current;
  126. _M_token = _S_token_subexpr_lookahead_begin;
  127. _M_value.assign(1, 'p');
  128. }
  129. else if (*_M_current == '!')
  130. {
  131. ++_M_current;
  132. _M_token = _S_token_subexpr_lookahead_begin;
  133. _M_value.assign(1, 'n');
  134. }
  135. else
  136. __throw_regex_error(
  137. regex_constants::error_paren,
  138. "Invalid special open parenthesis.");
  139. }
  140. else if (_M_flags & regex_constants::nosubs)
  141. _M_token = _S_token_subexpr_no_group_begin;
  142. else
  143. _M_token = _S_token_subexpr_begin;
  144. }
  145. else if (__c == ')')
  146. _M_token = _S_token_subexpr_end;
  147. else if (__c == '[')
  148. {
  149. _M_state = _S_state_in_bracket;
  150. _M_at_bracket_start = true;
  151. if (_M_current != _M_end && *_M_current == '^')
  152. {
  153. _M_token = _S_token_bracket_neg_begin;
  154. ++_M_current;
  155. }
  156. else
  157. _M_token = _S_token_bracket_begin;
  158. }
  159. else if (__c == '{')
  160. {
  161. _M_state = _S_state_in_brace;
  162. _M_token = _S_token_interval_begin;
  163. }
  164. else if (__c != ']' && __c != '}')
  165. {
  166. auto __it = _M_token_tbl;
  167. auto __narrowc = _M_ctype.narrow(__c, '\0');
  168. for (; __it->first != '\0'; ++__it)
  169. if (__it->first == __narrowc)
  170. {
  171. _M_token = __it->second;
  172. return;
  173. }
  174. __glibcxx_assert(false);
  175. }
  176. else
  177. {
  178. _M_token = _S_token_ord_char;
  179. _M_value.assign(1, __c);
  180. }
  181. }
  182. // Differences between styles:
  183. // 1) different semantics of "[]" and "[^]".
  184. // 2) Escaping in bracket expr.
  185. template<typename _CharT>
  186. void
  187. _Scanner<_CharT>::
  188. _M_scan_in_bracket()
  189. {
  190. if (_M_current == _M_end)
  191. __throw_regex_error(
  192. regex_constants::error_brack,
  193. "Unexpected end of regex when in bracket expression.");
  194. auto __c = *_M_current++;
  195. if (__c == '-')
  196. _M_token = _S_token_bracket_dash;
  197. else if (__c == '[')
  198. {
  199. if (_M_current == _M_end)
  200. __throw_regex_error(regex_constants::error_brack,
  201. "Unexpected character class open bracket.");
  202. if (*_M_current == '.')
  203. {
  204. _M_token = _S_token_collsymbol;
  205. _M_eat_class(*_M_current++);
  206. }
  207. else if (*_M_current == ':')
  208. {
  209. _M_token = _S_token_char_class_name;
  210. _M_eat_class(*_M_current++);
  211. }
  212. else if (*_M_current == '=')
  213. {
  214. _M_token = _S_token_equiv_class_name;
  215. _M_eat_class(*_M_current++);
  216. }
  217. else
  218. {
  219. _M_token = _S_token_ord_char;
  220. _M_value.assign(1, __c);
  221. }
  222. }
  223. // In POSIX, when encountering "[]" or "[^]", the ']' is interpreted
  224. // literally. So "[]]" and "[^]]" are valid regexes. See the testcases
  225. // `*/empty_range.cc`.
  226. else if (__c == ']' && (_M_is_ecma() || !_M_at_bracket_start))
  227. {
  228. _M_token = _S_token_bracket_end;
  229. _M_state = _S_state_normal;
  230. }
  231. // ECMAScript and awk permits escaping in bracket.
  232. else if (__c == '\\' && (_M_is_ecma() || _M_is_awk()))
  233. (this->*_M_eat_escape)();
  234. else
  235. {
  236. _M_token = _S_token_ord_char;
  237. _M_value.assign(1, __c);
  238. }
  239. _M_at_bracket_start = false;
  240. }
  241. // Differences between styles:
  242. // 1) "\}" in basic style.
  243. template<typename _CharT>
  244. void
  245. _Scanner<_CharT>::
  246. _M_scan_in_brace()
  247. {
  248. if (_M_current == _M_end)
  249. __throw_regex_error(
  250. regex_constants::error_brace,
  251. "Unexpected end of regex when in brace expression.");
  252. auto __c = *_M_current++;
  253. if (_M_ctype.is(_CtypeT::digit, __c))
  254. {
  255. _M_token = _S_token_dup_count;
  256. _M_value.assign(1, __c);
  257. while (_M_current != _M_end
  258. && _M_ctype.is(_CtypeT::digit, *_M_current))
  259. _M_value += *_M_current++;
  260. }
  261. else if (__c == ',')
  262. _M_token = _S_token_comma;
  263. // basic use \}.
  264. else if (_M_is_basic())
  265. {
  266. if (__c == '\\' && _M_current != _M_end && *_M_current == '}')
  267. {
  268. _M_state = _S_state_normal;
  269. _M_token = _S_token_interval_end;
  270. ++_M_current;
  271. }
  272. else
  273. __throw_regex_error(regex_constants::error_badbrace,
  274. "Unexpected character in brace expression.");
  275. }
  276. else if (__c == '}')
  277. {
  278. _M_state = _S_state_normal;
  279. _M_token = _S_token_interval_end;
  280. }
  281. else
  282. __throw_regex_error(regex_constants::error_badbrace,
  283. "Unexpected character in brace expression.");
  284. }
  285. template<typename _CharT>
  286. void
  287. _Scanner<_CharT>::
  288. _M_eat_escape_ecma()
  289. {
  290. if (_M_current == _M_end)
  291. __throw_regex_error(regex_constants::error_escape,
  292. "Unexpected end of regex when escaping.");
  293. auto __c = *_M_current++;
  294. auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0'));
  295. if (__pos != nullptr && (__c != 'b' || _M_state == _S_state_in_bracket))
  296. {
  297. _M_token = _S_token_ord_char;
  298. _M_value.assign(1, *__pos);
  299. }
  300. else if (__c == 'b')
  301. {
  302. _M_token = _S_token_word_bound;
  303. _M_value.assign(1, 'p');
  304. }
  305. else if (__c == 'B')
  306. {
  307. _M_token = _S_token_word_bound;
  308. _M_value.assign(1, 'n');
  309. }
  310. // N3376 28.13
  311. else if (__c == 'd'
  312. || __c == 'D'
  313. || __c == 's'
  314. || __c == 'S'
  315. || __c == 'w'
  316. || __c == 'W')
  317. {
  318. _M_token = _S_token_quoted_class;
  319. _M_value.assign(1, __c);
  320. }
  321. else if (__c == 'c')
  322. {
  323. if (_M_current == _M_end)
  324. __throw_regex_error(
  325. regex_constants::error_escape,
  326. "Unexpected end of regex when reading control code.");
  327. _M_token = _S_token_ord_char;
  328. _M_value.assign(1, *_M_current++);
  329. }
  330. else if (__c == 'x' || __c == 'u')
  331. {
  332. _M_value.erase();
  333. for (int __i = 0; __i < (__c == 'x' ? 2 : 4); __i++)
  334. {
  335. if (_M_current == _M_end
  336. || !_M_ctype.is(_CtypeT::xdigit, *_M_current))
  337. __throw_regex_error(
  338. regex_constants::error_escape,
  339. "Unexpected end of regex when ascii character.");
  340. _M_value += *_M_current++;
  341. }
  342. _M_token = _S_token_hex_num;
  343. }
  344. // ECMAScript recognizes multi-digit back-references.
  345. else if (_M_ctype.is(_CtypeT::digit, __c))
  346. {
  347. _M_value.assign(1, __c);
  348. while (_M_current != _M_end
  349. && _M_ctype.is(_CtypeT::digit, *_M_current))
  350. _M_value += *_M_current++;
  351. _M_token = _S_token_backref;
  352. }
  353. else
  354. {
  355. _M_token = _S_token_ord_char;
  356. _M_value.assign(1, __c);
  357. }
  358. }
  359. // Differences between styles:
  360. // 1) Extended doesn't support backref, but basic does.
  361. template<typename _CharT>
  362. void
  363. _Scanner<_CharT>::
  364. _M_eat_escape_posix()
  365. {
  366. if (_M_current == _M_end)
  367. __throw_regex_error(regex_constants::error_escape,
  368. "Unexpected end of regex when escaping.");
  369. auto __c = *_M_current;
  370. auto __pos = std::strchr(_M_spec_char, _M_ctype.narrow(__c, '\0'));
  371. if (__pos != nullptr && *__pos != '\0')
  372. {
  373. _M_token = _S_token_ord_char;
  374. _M_value.assign(1, __c);
  375. }
  376. // We MUST judge awk before handling backrefs. There's no backref in awk.
  377. else if (_M_is_awk())
  378. {
  379. _M_eat_escape_awk();
  380. return;
  381. }
  382. else if (_M_is_basic() && _M_ctype.is(_CtypeT::digit, __c) && __c != '0')
  383. {
  384. _M_token = _S_token_backref;
  385. _M_value.assign(1, __c);
  386. }
  387. else
  388. {
  389. #ifdef __STRICT_ANSI__
  390. // POSIX says it is undefined to escape ordinary characters
  391. __throw_regex_error(regex_constants::error_escape,
  392. "Unexpected escape character.");
  393. #else
  394. _M_token = _S_token_ord_char;
  395. _M_value.assign(1, __c);
  396. #endif
  397. }
  398. ++_M_current;
  399. }
  400. template<typename _CharT>
  401. void
  402. _Scanner<_CharT>::
  403. _M_eat_escape_awk()
  404. {
  405. auto __c = *_M_current++;
  406. auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0'));
  407. if (__pos != nullptr)
  408. {
  409. _M_token = _S_token_ord_char;
  410. _M_value.assign(1, *__pos);
  411. }
  412. // \ddd for oct representation
  413. else if (_M_ctype.is(_CtypeT::digit, __c)
  414. && __c != '8'
  415. && __c != '9')
  416. {
  417. _M_value.assign(1, __c);
  418. for (int __i = 0;
  419. __i < 2
  420. && _M_current != _M_end
  421. && _M_ctype.is(_CtypeT::digit, *_M_current)
  422. && *_M_current != '8'
  423. && *_M_current != '9';
  424. __i++)
  425. _M_value += *_M_current++;
  426. _M_token = _S_token_oct_num;
  427. return;
  428. }
  429. else
  430. __throw_regex_error(regex_constants::error_escape,
  431. "Unexpected escape character.");
  432. }
  433. // Eats a character class or throws an exception.
  434. // __ch could be ':', '.' or '=', _M_current is the char after ']' when
  435. // returning.
  436. template<typename _CharT>
  437. void
  438. _Scanner<_CharT>::
  439. _M_eat_class(char __ch)
  440. {
  441. for (_M_value.clear(); _M_current != _M_end && *_M_current != __ch;)
  442. _M_value += *_M_current++;
  443. if (_M_current == _M_end
  444. || *_M_current++ != __ch
  445. || _M_current == _M_end // skip __ch
  446. || *_M_current++ != ']') // skip ']'
  447. {
  448. if (__ch == ':')
  449. __throw_regex_error(regex_constants::error_ctype,
  450. "Unexpected end of character class.");
  451. else
  452. __throw_regex_error(regex_constants::error_collate,
  453. "Unexpected end of character class.");
  454. }
  455. }
  456. #ifdef _GLIBCXX_DEBUG
  457. template<typename _CharT>
  458. std::ostream&
  459. _Scanner<_CharT>::
  460. _M_print(std::ostream& ostr)
  461. {
  462. switch (_M_token)
  463. {
  464. case _S_token_anychar:
  465. ostr << "any-character\n";
  466. break;
  467. case _S_token_backref:
  468. ostr << "backref\n";
  469. break;
  470. case _S_token_bracket_begin:
  471. ostr << "bracket-begin\n";
  472. break;
  473. case _S_token_bracket_neg_begin:
  474. ostr << "bracket-neg-begin\n";
  475. break;
  476. case _S_token_bracket_end:
  477. ostr << "bracket-end\n";
  478. break;
  479. case _S_token_char_class_name:
  480. ostr << "char-class-name \"" << _M_value << "\"\n";
  481. break;
  482. case _S_token_closure0:
  483. ostr << "closure0\n";
  484. break;
  485. case _S_token_closure1:
  486. ostr << "closure1\n";
  487. break;
  488. case _S_token_collsymbol:
  489. ostr << "collsymbol \"" << _M_value << "\"\n";
  490. break;
  491. case _S_token_comma:
  492. ostr << "comma\n";
  493. break;
  494. case _S_token_dup_count:
  495. ostr << "dup count: " << _M_value << "\n";
  496. break;
  497. case _S_token_eof:
  498. ostr << "EOF\n";
  499. break;
  500. case _S_token_equiv_class_name:
  501. ostr << "equiv-class-name \"" << _M_value << "\"\n";
  502. break;
  503. case _S_token_interval_begin:
  504. ostr << "interval begin\n";
  505. break;
  506. case _S_token_interval_end:
  507. ostr << "interval end\n";
  508. break;
  509. case _S_token_line_begin:
  510. ostr << "line begin\n";
  511. break;
  512. case _S_token_line_end:
  513. ostr << "line end\n";
  514. break;
  515. case _S_token_opt:
  516. ostr << "opt\n";
  517. break;
  518. case _S_token_or:
  519. ostr << "or\n";
  520. break;
  521. case _S_token_ord_char:
  522. ostr << "ordinary character: \"" << _M_value << "\"\n";
  523. break;
  524. case _S_token_subexpr_begin:
  525. ostr << "subexpr begin\n";
  526. break;
  527. case _S_token_subexpr_no_group_begin:
  528. ostr << "no grouping subexpr begin\n";
  529. break;
  530. case _S_token_subexpr_lookahead_begin:
  531. ostr << "lookahead subexpr begin\n";
  532. break;
  533. case _S_token_subexpr_end:
  534. ostr << "subexpr end\n";
  535. break;
  536. case _S_token_unknown:
  537. ostr << "-- unknown token --\n";
  538. break;
  539. case _S_token_oct_num:
  540. ostr << "oct number " << _M_value << "\n";
  541. break;
  542. case _S_token_hex_num:
  543. ostr << "hex number " << _M_value << "\n";
  544. break;
  545. case _S_token_quoted_class:
  546. ostr << "quoted class " << "\\" << _M_value << "\n";
  547. break;
  548. default:
  549. _GLIBCXX_DEBUG_ASSERT(false);
  550. }
  551. return ostr;
  552. }
  553. #endif
  554. } // namespace __detail
  555. _GLIBCXX_END_NAMESPACE_VERSION
  556. } // namespace