You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

516 line
16KB

  1. // Locale support (codecvt) -*- C++ -*-
  2. // Copyright (C) 2000-2020 Free Software Foundation, Inc.
  3. //
  4. // This file is part of the GNU ISO C++ Library. This library is free
  5. // software; you can redistribute it and/or modify it under the
  6. // terms of the GNU General Public License as published by the
  7. // Free Software Foundation; either version 3, or (at your option)
  8. // any later version.
  9. // This library is distributed in the hope that it will be useful,
  10. // but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. // GNU General Public License for more details.
  13. // Under Section 7 of GPL version 3, you are granted additional
  14. // permissions described in the GCC Runtime Library Exception, version
  15. // 3.1, as published by the Free Software Foundation.
  16. // You should have received a copy of the GNU General Public License and
  17. // a copy of the GCC Runtime Library Exception along with this program;
  18. // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
  19. // <http://www.gnu.org/licenses/>.
  20. //
  21. // ISO C++ 14882: 22.2.1.5 Template class codecvt
  22. //
  23. // Written by Benjamin Kosnik <bkoz@redhat.com>
  24. /** @file ext/codecvt_specializations.h
  25. * This file is a GNU extension to the Standard C++ Library.
  26. */
  27. #ifndef _EXT_CODECVT_SPECIALIZATIONS_H
  28. #define _EXT_CODECVT_SPECIALIZATIONS_H 1
  29. #include <bits/c++config.h>
  30. #include <locale>
  31. #include <iconv.h>
  32. namespace __gnu_cxx _GLIBCXX_VISIBILITY(default)
  33. {
  34. _GLIBCXX_BEGIN_NAMESPACE_VERSION
  35. _GLIBCXX_BEGIN_NAMESPACE_CXX11
  /// Extension to use iconv for dealing with character encodings.
  // This includes conversions and comparisons between various character
  // sets.  This object encapsulates data that may need to be shared between
  // char_traits, codecvt and ctype.
  //
  // An encoding_state owns up to two iconv conversion descriptors (one per
  // direction) and closes them in its destructor.  Copies duplicate the
  // encoding names, byte-order markers and byte ratio, and open fresh
  // descriptors of their own; they never share a descriptor with the source.
  class encoding_state
  {
  public:
    // Types:
    // NB: A conversion descriptor subsumes and enhances the
    // functionality of a simple state type such as mbstate_t.
    typedef iconv_t descriptor_type;

  protected:
    // Name of internal character set encoding.
    std::string _M_int_enc;

    // Name of external character set encoding.
    std::string _M_ext_enc;

    // Conversion descriptor between external encoding to internal encoding.
    descriptor_type _M_in_desc;

    // Conversion descriptor between internal encoding to external encoding.
    descriptor_type _M_out_desc;

    // The byte-order marker for the external encoding, if necessary.
    int _M_ext_bom;

    // The byte-order marker for the internal encoding, if necessary.
    int _M_int_bom;

    // Number of external bytes needed to construct one complete
    // character in the internal encoding.
    // NB: -1 indicates variable, or stateful, encodings.
    int _M_bytes;

  public:
    // Creates a state with no encoding names and no descriptors.
    // good() returns false for such an object.
    explicit
    encoding_state()
    : _M_in_desc(0), _M_out_desc(0), _M_ext_bom(0), _M_int_bom(0), _M_bytes(0)
    { }

    // Creates a state converting between the internal encoding __int and
    // the external encoding __ext, and opens both iconv descriptors.
    // __ibom / __ebom are the internal / external byte-order markers
    // (0 means none) and __bytes is the value reported by
    // character_ratio().  Throws std::runtime_error (via
    // __throw_runtime_error) if either descriptor cannot be opened.
    explicit
    encoding_state(const char* __int, const char* __ext,
                   int __ibom = 0, int __ebom = 0, int __bytes = 1)
    : _M_int_enc(__int), _M_ext_enc(__ext), _M_in_desc(0), _M_out_desc(0),
      _M_ext_bom(__ebom), _M_int_bom(__ibom), _M_bytes(__bytes)
    { init(); }

    // 21.1.2 traits typedefs
    // p4
    // typedef STATE_T state_type
    // requires: state_type shall meet the requirements of
    // CopyConstructible types (20.1.3)
    // NB: This does not preserve the actual state of the conversion
    // descriptor member, but it does duplicate the encoding
    // information.
    encoding_state(const encoding_state& __obj)
    : _M_int_enc(__obj._M_int_enc), _M_ext_enc(__obj._M_ext_enc),
      _M_in_desc(0), _M_out_desc(0), _M_ext_bom(__obj._M_ext_bom),
      _M_int_bom(__obj._M_int_bom), _M_bytes(__obj._M_bytes)
    { init(); }

    // Need assignment operator as well.
    // Self-assignment is a no-op: closing and reopening our own
    // descriptors would discard their conversion state for nothing and
    // could throw.
    encoding_state&
    operator=(const encoding_state& __obj)
    {
      if (this != &__obj)
        construct(__obj);
      return *this;
    }

    ~encoding_state()
    { destroy(); }

    // True when both descriptors are open, i.e. neither is null nor the
    // iconv error value (iconv_t)(-1).
    bool
    good() const throw()
    {
      const descriptor_type __err = (iconv_t)(-1);
      bool __test = _M_in_desc && _M_in_desc != __err;
      __test &= _M_out_desc && _M_out_desc != __err;
      return __test;
    }

    // Returns the byte ratio passed at construction (see _M_bytes).
    int
    character_ratio() const
    { return _M_bytes; }

    // Returns a copy of the internal encoding name.
    const std::string
    internal_encoding() const
    { return _M_int_enc; }

    // Returns the internal byte-order marker (0 when unset).
    int
    internal_bom() const
    { return _M_int_bom; }

    // Returns a copy of the external encoding name.
    const std::string
    external_encoding() const
    { return _M_ext_enc; }

    // Returns the external byte-order marker (0 when unset).
    int
    external_bom() const
    { return _M_ext_bom; }

    // Descriptor opened as iconv_open(internal, external).
    const descriptor_type&
    in_descriptor() const
    { return _M_in_desc; }

    // Descriptor opened as iconv_open(external, internal).
    const descriptor_type&
    out_descriptor() const
    { return _M_out_desc; }

  protected:
    // Opens whichever descriptor is still null, provided both encoding
    // names are non-empty.  On failure every descriptor this object holds
    // is released *before* the exception is raised: when init() runs from
    // a constructor the destructor will not be called, so anything left
    // open here would leak.
    void
    init()
    {
      const descriptor_type __err = (iconv_t)(-1);
      const bool __have_encodings = _M_int_enc.size() && _M_ext_enc.size();
      if (!__have_encodings)
        return;

      if (!_M_in_desc)
        {
          _M_in_desc = iconv_open(_M_int_enc.c_str(), _M_ext_enc.c_str());
          if (_M_in_desc == __err)
            {
              destroy();
              std::__throw_runtime_error(__N("encoding_state::_M_init "
                                  "creating iconv input descriptor failed"));
            }
        }
      if (!_M_out_desc)
        {
          _M_out_desc = iconv_open(_M_ext_enc.c_str(), _M_int_enc.c_str());
          if (_M_out_desc == __err)
            {
              destroy();
              std::__throw_runtime_error(__N("encoding_state::_M_init "
                                  "creating iconv output descriptor failed"));
            }
        }
    }

    // Replaces this object's contents with a copy of __obj's encoding
    // information and opens new descriptors for it.
    void
    construct(const encoding_state& __obj)
    {
      destroy();
      _M_int_enc = __obj._M_int_enc;
      _M_ext_enc = __obj._M_ext_enc;
      _M_ext_bom = __obj._M_ext_bom;
      _M_int_bom = __obj._M_int_bom;
      _M_bytes = __obj._M_bytes;
      init();
    }

    // Closes any open descriptor and resets both to null.  The reset is
    // unconditional so that a descriptor left at the iconv error value by
    // a failed open does not stop a later init() from retrying.
    void
    destroy() throw()
    {
      const descriptor_type __err = (iconv_t)(-1);
      if (_M_in_desc && _M_in_desc != __err)
        iconv_close(_M_in_desc);
      if (_M_out_desc && _M_out_desc != __err)
        iconv_close(_M_out_desc);
      _M_in_desc = 0;
      _M_out_desc = 0;
    }
  };
  171. /// encoding_char_traits
  172. // Custom traits type with encoding_state for the state type, and the
  173. // associated fpos<encoding_state> for the position type, all other
  174. // bits equivalent to the required char_traits instantiations.
  175. template<typename _CharT>
  176. struct encoding_char_traits
  177. : public std::char_traits<_CharT>
  178. {
  179. typedef encoding_state state_type;
  180. typedef typename std::fpos<state_type> pos_type;
  181. };
  182. _GLIBCXX_END_NAMESPACE_CXX11
  183. _GLIBCXX_END_NAMESPACE_VERSION
  184. } // namespace
  185. namespace std _GLIBCXX_VISIBILITY(default)
  186. {
  187. _GLIBCXX_BEGIN_NAMESPACE_VERSION
  188. using __gnu_cxx::encoding_state;
  189. /// codecvt<InternT, _ExternT, encoding_state> specialization.
  190. // This partial specialization takes advantage of iconv to provide
  191. // code conversions between a large number of character encodings.
  192. template<typename _InternT, typename _ExternT>
  193. class codecvt<_InternT, _ExternT, encoding_state>
  194. : public __codecvt_abstract_base<_InternT, _ExternT, encoding_state>
  195. {
  196. public:
  197. // Types:
  198. typedef codecvt_base::result result;
  199. typedef _InternT intern_type;
  200. typedef _ExternT extern_type;
  201. typedef __gnu_cxx::encoding_state state_type;
  202. typedef state_type::descriptor_type descriptor_type;
  203. // Data Members:
  204. static locale::id id;
  205. explicit
  206. codecvt(size_t __refs = 0)
  207. : __codecvt_abstract_base<intern_type, extern_type, state_type>(__refs)
  208. { }
  209. explicit
  210. codecvt(state_type& __enc, size_t __refs = 0)
  211. : __codecvt_abstract_base<intern_type, extern_type, state_type>(__refs)
  212. { }
  213. protected:
  214. virtual
  215. ~codecvt() { }
  216. virtual result
  217. do_out(state_type& __state, const intern_type* __from,
  218. const intern_type* __from_end, const intern_type*& __from_next,
  219. extern_type* __to, extern_type* __to_end,
  220. extern_type*& __to_next) const;
  221. virtual result
  222. do_unshift(state_type& __state, extern_type* __to,
  223. extern_type* __to_end, extern_type*& __to_next) const;
  224. virtual result
  225. do_in(state_type& __state, const extern_type* __from,
  226. const extern_type* __from_end, const extern_type*& __from_next,
  227. intern_type* __to, intern_type* __to_end,
  228. intern_type*& __to_next) const;
  229. virtual int
  230. do_encoding() const throw();
  231. virtual bool
  232. do_always_noconv() const throw();
  233. virtual int
  234. do_length(state_type&, const extern_type* __from,
  235. const extern_type* __end, size_t __max) const;
  236. virtual int
  237. do_max_length() const throw();
  238. };
  239. template<typename _InternT, typename _ExternT>
  240. locale::id
  241. codecvt<_InternT, _ExternT, encoding_state>::id;
  // Bridges the two iconv() prototypes found in the wild: SUSv2 and
  // others declare the second argument as 'const char**', whereas glibc
  // 2.2 uses 'char**', matching POSIX 1003.1-2001.  _Tp is deduced from
  // whichever prototype __func actually has, and the input-buffer pointer
  // is cast to that type, so g++ does the work for us.
  template<typename _Tp>
    inline size_t
    __iconv_adaptor(size_t(*__func)(iconv_t, _Tp, size_t*, char**, size_t*),
                    iconv_t __cd, char** __inbuf, size_t* __inbytes,
                    char** __outbuf, size_t* __outbytes)
    {
      // C-style cast on purpose: it must add or keep constness depending
      // on the deduced _Tp.
      _Tp __src = (_Tp)__inbuf;
      return __func(__cd, __src, __inbytes, __outbuf, __outbytes);
    }
  252. template<typename _InternT, typename _ExternT>
  253. codecvt_base::result
  254. codecvt<_InternT, _ExternT, encoding_state>::
  255. do_out(state_type& __state, const intern_type* __from,
  256. const intern_type* __from_end, const intern_type*& __from_next,
  257. extern_type* __to, extern_type* __to_end,
  258. extern_type*& __to_next) const
  259. {
  260. result __ret = codecvt_base::error;
  261. if (__state.good())
  262. {
  263. const descriptor_type& __desc = __state.out_descriptor();
  264. const size_t __fmultiple = sizeof(intern_type);
  265. size_t __fbytes = __fmultiple * (__from_end - __from);
  266. const size_t __tmultiple = sizeof(extern_type);
  267. size_t __tbytes = __tmultiple * (__to_end - __to);
  268. // Argument list for iconv specifies a byte sequence. Thus,
  269. // all to/from arrays must be brutally casted to char*.
  270. char* __cto = reinterpret_cast<char*>(__to);
  271. char* __cfrom;
  272. size_t __conv;
  273. // Some encodings need a byte order marker as the first item
  274. // in the byte stream, to designate endian-ness. The default
  275. // value for the byte order marker is NULL, so if this is
  276. // the case, it's not necessary and we can just go on our
  277. // merry way.
  278. int __int_bom = __state.internal_bom();
  279. if (__int_bom)
  280. {
  281. size_t __size = __from_end - __from;
  282. intern_type* __cfixed = static_cast<intern_type*>
  283. (__builtin_alloca(sizeof(intern_type) * (__size + 1)));
  284. __cfixed[0] = static_cast<intern_type>(__int_bom);
  285. char_traits<intern_type>::copy(__cfixed + 1, __from, __size);
  286. __cfrom = reinterpret_cast<char*>(__cfixed);
  287. __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
  288. &__fbytes, &__cto, &__tbytes);
  289. }
  290. else
  291. {
  292. intern_type* __cfixed = const_cast<intern_type*>(__from);
  293. __cfrom = reinterpret_cast<char*>(__cfixed);
  294. __conv = __iconv_adaptor(iconv, __desc, &__cfrom, &__fbytes,
  295. &__cto, &__tbytes);
  296. }
  297. if (__conv != size_t(-1))
  298. {
  299. __from_next = reinterpret_cast<const intern_type*>(__cfrom);
  300. __to_next = reinterpret_cast<extern_type*>(__cto);
  301. __ret = codecvt_base::ok;
  302. }
  303. else
  304. {
  305. if (__fbytes < __fmultiple * (__from_end - __from))
  306. {
  307. __from_next = reinterpret_cast<const intern_type*>(__cfrom);
  308. __to_next = reinterpret_cast<extern_type*>(__cto);
  309. __ret = codecvt_base::partial;
  310. }
  311. else
  312. __ret = codecvt_base::error;
  313. }
  314. }
  315. return __ret;
  316. }
  317. template<typename _InternT, typename _ExternT>
  318. codecvt_base::result
  319. codecvt<_InternT, _ExternT, encoding_state>::
  320. do_unshift(state_type& __state, extern_type* __to,
  321. extern_type* __to_end, extern_type*& __to_next) const
  322. {
  323. result __ret = codecvt_base::error;
  324. if (__state.good())
  325. {
  326. const descriptor_type& __desc = __state.in_descriptor();
  327. const size_t __tmultiple = sizeof(intern_type);
  328. size_t __tlen = __tmultiple * (__to_end - __to);
  329. // Argument list for iconv specifies a byte sequence. Thus,
  330. // all to/from arrays must be brutally casted to char*.
  331. char* __cto = reinterpret_cast<char*>(__to);
  332. size_t __conv = __iconv_adaptor(iconv,__desc, 0, 0,
  333. &__cto, &__tlen);
  334. if (__conv != size_t(-1))
  335. {
  336. __to_next = reinterpret_cast<extern_type*>(__cto);
  337. if (__tlen == __tmultiple * (__to_end - __to))
  338. __ret = codecvt_base::noconv;
  339. else if (__tlen == 0)
  340. __ret = codecvt_base::ok;
  341. else
  342. __ret = codecvt_base::partial;
  343. }
  344. else
  345. __ret = codecvt_base::error;
  346. }
  347. return __ret;
  348. }
  349. template<typename _InternT, typename _ExternT>
  350. codecvt_base::result
  351. codecvt<_InternT, _ExternT, encoding_state>::
  352. do_in(state_type& __state, const extern_type* __from,
  353. const extern_type* __from_end, const extern_type*& __from_next,
  354. intern_type* __to, intern_type* __to_end,
  355. intern_type*& __to_next) const
  356. {
  357. result __ret = codecvt_base::error;
  358. if (__state.good())
  359. {
  360. const descriptor_type& __desc = __state.in_descriptor();
  361. const size_t __fmultiple = sizeof(extern_type);
  362. size_t __flen = __fmultiple * (__from_end - __from);
  363. const size_t __tmultiple = sizeof(intern_type);
  364. size_t __tlen = __tmultiple * (__to_end - __to);
  365. // Argument list for iconv specifies a byte sequence. Thus,
  366. // all to/from arrays must be brutally casted to char*.
  367. char* __cto = reinterpret_cast<char*>(__to);
  368. char* __cfrom;
  369. size_t __conv;
  370. // Some encodings need a byte order marker as the first item
  371. // in the byte stream, to designate endian-ness. The default
  372. // value for the byte order marker is NULL, so if this is
  373. // the case, it's not necessary and we can just go on our
  374. // merry way.
  375. int __ext_bom = __state.external_bom();
  376. if (__ext_bom)
  377. {
  378. size_t __size = __from_end - __from;
  379. extern_type* __cfixed = static_cast<extern_type*>
  380. (__builtin_alloca(sizeof(extern_type) * (__size + 1)));
  381. __cfixed[0] = static_cast<extern_type>(__ext_bom);
  382. char_traits<extern_type>::copy(__cfixed + 1, __from, __size);
  383. __cfrom = reinterpret_cast<char*>(__cfixed);
  384. __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
  385. &__flen, &__cto, &__tlen);
  386. }
  387. else
  388. {
  389. extern_type* __cfixed = const_cast<extern_type*>(__from);
  390. __cfrom = reinterpret_cast<char*>(__cfixed);
  391. __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
  392. &__flen, &__cto, &__tlen);
  393. }
  394. if (__conv != size_t(-1))
  395. {
  396. __from_next = reinterpret_cast<const extern_type*>(__cfrom);
  397. __to_next = reinterpret_cast<intern_type*>(__cto);
  398. __ret = codecvt_base::ok;
  399. }
  400. else
  401. {
  402. if (__flen < static_cast<size_t>(__from_end - __from))
  403. {
  404. __from_next = reinterpret_cast<const extern_type*>(__cfrom);
  405. __to_next = reinterpret_cast<intern_type*>(__cto);
  406. __ret = codecvt_base::partial;
  407. }
  408. else
  409. __ret = codecvt_base::error;
  410. }
  411. }
  412. return __ret;
  413. }
  414. template<typename _InternT, typename _ExternT>
  415. int
  416. codecvt<_InternT, _ExternT, encoding_state>::
  417. do_encoding() const throw()
  418. {
  419. int __ret = 0;
  420. if (sizeof(_ExternT) <= sizeof(_InternT))
  421. __ret = sizeof(_InternT) / sizeof(_ExternT);
  422. return __ret;
  423. }
  424. template<typename _InternT, typename _ExternT>
  425. bool
  426. codecvt<_InternT, _ExternT, encoding_state>::
  427. do_always_noconv() const throw()
  428. { return false; }
  429. template<typename _InternT, typename _ExternT>
  430. int
  431. codecvt<_InternT, _ExternT, encoding_state>::
  432. do_length(state_type&, const extern_type* __from,
  433. const extern_type* __end, size_t __max) const
  434. { return std::min(__max, static_cast<size_t>(__end - __from)); }
  435. // _GLIBCXX_RESOLVE_LIB_DEFECTS
  436. // 74. Garbled text for codecvt::do_max_length
  437. template<typename _InternT, typename _ExternT>
  438. int
  439. codecvt<_InternT, _ExternT, encoding_state>::
  440. do_max_length() const throw()
  441. { return 1; }
  442. _GLIBCXX_END_NAMESPACE_VERSION
  443. } // namespace
  444. #endif