wchar.c 27 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070
  1. /* Copyright (C) 2002 Manuel Novoa III
  2. *
  3. * This library is free software; you can redistribute it and/or
  4. * modify it under the terms of the GNU Library General Public
  5. * License as published by the Free Software Foundation; either
  6. * version 2 of the License, or (at your option) any later version.
  7. *
  8. * This library is distributed in the hope that it will be useful,
  9. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  11. * Library General Public License for more details.
  12. *
  13. * You should have received a copy of the GNU Library General Public
  14. * License along with this library; if not, write to the Free
  15. * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  16. */
  17. /* ATTENTION! ATTENTION! ATTENTION! ATTENTION! ATTENTION!
  18. *
  19. * Besides uClibc, I'm using this code in my libc for elks, which is
  20. * a 16-bit environment with a fairly limited compiler. It would make
  21. * things much easier for me if this file isn't modified unnecessarily.
  22. * In particular, please put any new or replacement functions somewhere
  23. * else, and modify the makefile to use your version instead.
  24. * Thanks. Manuel
  25. *
  26. * ATTENTION! ATTENTION! ATTENTION! ATTENTION! ATTENTION! */
  27. /* May 23, 2002 Initial Notes:
  28. *
  29. * I'm still tweaking this stuff, but it passes the tests I've thrown
  30. * at it, and Erik needs it for the gcc port. The glibc extension
  31. * __wcsnrtombs() hasn't been tested, as I didn't find a test for it
  32. * in the glibc source. I also need to fix the behavior of
  33. * _wchar_utf8sntowcs() if the max number of wchars to convert is 0.
  34. *
  35. * UTF-8 -> wchar -> UTF-8 conversion tests on Markus Kuhn's UTF-8-demo.txt
  36. * file on my platform (x86) show about 5-10% faster conversion speed than
  37. * glibc with mbsrtowcs()/wcsrtombs() and almost twice as fast as glibc with
  38. * individual mbrtowc()/wcrtomb() calls.
  39. *
  40. * If 'DECODER' is defined, then _wchar_utf8sntowcs() will be compiled
  41. * as a fail-safe UTF-8 decoder appropriate for a terminal, etc. which
  42. * needs to deal gracefully with whatever is sent to it. In that mode,
  43. * it passes Markus Kuhn's UTF-8-test.txt stress test. I plan to add
  44. * an arg to force that behavior, so the interface will be changing.
  45. *
  46. * I need to fix the error checking for 16-bit wide chars. This isn't
  47. * an issue for uClibc, but may be for ELKS. I'm currently not sure
  48. * if I'll use 16-bit, 32-bit, or configurable wchars in ELKS.
  49. *
  50. * July 1, 2002
  51. *
  52. * Fixed _wchar_utf8sntowcs() for the max number of wchars == 0 case.
  53. * Fixed nul-char bug in btowc(), and another in __mbsnrtowcs() for 8-bit
  54. * locales.
  55. * Enabled building of a C/POSIX-locale-only version, so full locale support
  56. * no longer needs to be enabled.
  57. *
  58. * Nov 4, 2002
  59. *
  60. * Fixed a bug in _wchar_wcsntoutf8s(). Don't store wcs position if dst is NULL.
  61. * Also, introduce an awful hack into _wchar_wcsntoutf8s() and wcsrtombs() in
  62. * order to support %ls in printf. See comments below for details.
  63. * Change behaviour of wc<->mb functions when in the C locale. Now they do
  64. * a 1-1 map for the range 0x80-UCHAR_MAX. This is for backwards compatibility
  65. * and consistency with the standard's requirement that a printf format string be
  66. * a valid multibyte string beginning and ending in its initial shift state.
  67. *
  68. * Nov 5, 2002
  69. *
  70. * Forgot to change btowc and wctob when I changed the wc<->mb functions yesterday.
  71. *
  72. * Nov 7, 2002
  73. *
  74. * Add wcwidth and wcswidth, based on Markus Kuhn's wcwidth of 2002-05-08.
  75. * Added some size/speed optimizations and integrated it into my locale
  76. * framework. Minimally tested at the moment, but the stub C-locale
  77. * version (which most people would probably be using) should be fine.
  78. *
  79. * Manuel
  80. */
  81. #define _GNU_SOURCE
  82. #define _ISOC99_SOURCE
  83. #include <errno.h>
  84. #include <stddef.h>
  85. #include <limits.h>
  86. #include <stdint.h>
  87. #include <inttypes.h>
  88. #include <stdlib.h>
  89. #include <stdio.h>
  90. #include <assert.h>
  91. #include <locale.h>
  92. #include <wchar.h>
  /* Build configuration.
   * With locale support, ENCODING reads the encoding field of the global
   * locale and UTF-8 locale support is switched on.  Without it, the 8-bit
   * and UTF-8 locale code paths are disabled and the build selectors for the
   * two UTF-8 worker functions are undefined, so those functions are not
   * compiled. */
  93. #ifdef __UCLIBC_HAS_LOCALE__
  94. #define ENCODING (__global_locale.encoding)
  95. #ifdef __UCLIBC_MJN3_ONLY__
  96. #warning implement __CTYPE_HAS_UTF_8_LOCALES!
  97. #endif
  98. #define __CTYPE_HAS_UTF_8_LOCALES
  99. #else
  100. #undef __CTYPE_HAS_8_BIT_LOCALES
  101. #undef __CTYPE_HAS_UTF_8_LOCALES
  102. #undef L__wchar_utf8sntowcs
  103. #undef L__wchar_wcsntoutf8s
  104. #endif
  /* UTF_8_MAX_LEN is 6 when wchar_t can hold values above 0xffff, else 3.
   * NOTE(review): presumably the longest UTF-8 sequence (in bytes) that is
   * handled; in this file it is only tested in #if checks. */
  105. #if WCHAR_MAX > 0xffffUL
  106. #define UTF_8_MAX_LEN 6
  107. #else
  108. #define UTF_8_MAX_LEN 3
  109. #endif
  /* Defining KUHN enables the extra validity checks guarded by #ifdef KUHN
   * in the UTF-8 converters (surrogate range, 0xfffe/0xffff). */
  110. /* #define KUHN */
  111. /* Implementation-specific work functions. */
  /* UTF-8 -> wide: reads at most n bytes starting at *src and produces at
   * most wn wide characters into pwc (pwc == NULL means count only).
   * Returns the number of wide characters, (size_t)-1 on an invalid
   * sequence, or (size_t)-2 for an incomplete character when
   * allow_continuation is nonzero. */
  112. extern size_t _wchar_utf8sntowcs(wchar_t *__restrict pwc, size_t wn,
  113. const char **__restrict src, size_t n,
  114. mbstate_t *ps, int allow_continuation);
  /* Wide -> UTF-8: reads at most wn wide characters starting at *src and
   * produces at most n bytes into s.  Returns the number of bytes, or
   * (size_t)-1 for a value that cannot be encoded. */
  115. extern size_t _wchar_wcsntoutf8s(char *__restrict s, size_t n,
  116. const wchar_t **__restrict src, size_t wn);
  117. /* glibc extensions. */
  /* NMC bounds the number of input bytes, len the number of wide characters
   * written to dst. */
  118. extern size_t __mbsnrtowcs(wchar_t *__restrict dst,
  119. const char **__restrict src,
  120. size_t NMC, size_t len, mbstate_t *__restrict ps);
  /* NWC bounds the number of input wide characters, len the number of bytes
   * written to dst. */
  121. extern size_t __wcsnrtombs(char *__restrict dst,
  122. const wchar_t **__restrict src,
  123. size_t NWC, size_t len, mbstate_t *__restrict ps);
  124. /**********************************************************************/
  125. #ifdef L_btowc
  126. wint_t btowc(int c)
  127. {
  128. #ifdef __CTYPE_HAS_8_BIT_LOCALES
  129. wchar_t wc;
  130. unsigned char buf[1];
  131. mbstate_t mbstate;
  132. if (c != EOF) {
  133. *buf = (unsigned char) c;
  134. mbstate.mask = 0; /* Initialize the mbstate. */
  135. if (mbrtowc(&wc, buf, 1, &mbstate) <= 1) {
  136. return wc;
  137. }
  138. }
  139. return WEOF;
  140. #else /* __CTYPE_HAS_8_BIT_LOCALES */
  141. #ifdef __CTYPE_HAS_UTF_8_LOCALES
  142. if (ENCODING == __ctype_encoding_utf8) {
  143. return (((unsigned int)c) < 0x80) ? c : WEOF;
  144. }
  145. #endif /* __CTYPE_HAS_UTF_8_LOCALES */
  146. #ifdef __UCLIBC_HAS_LOCALE__
  147. assert(ENCODING == __ctype_encoding_7_bit);
  148. #endif
  149. return (((unsigned int)c) <= UCHAR_MAX) ? c : WEOF;
  150. #endif /* __CTYPE_HAS_8_BIT_LOCALES */
  151. }
  152. #endif
  153. /**********************************************************************/
  154. #ifdef L_wctob
  155. /* Note: We completely ignore ps in all currently supported conversions. */
  156. int wctob(wint_t c)
  157. {
  158. #ifdef __CTYPE_HAS_8_BIT_LOCALES
  159. unsigned char buf[MB_LEN_MAX];
  160. return (wcrtomb(buf, c, NULL) == 1) ? *buf : EOF;
  161. #else /* __CTYPE_HAS_8_BIT_LOCALES */
  162. #ifdef __CTYPE_HAS_UTF_8_LOCALES
  163. if (ENCODING == __ctype_encoding_utf8) {
  164. return ((c >= 0) && (c < 0x80)) ? c : EOF;
  165. }
  166. #endif /* __CTYPE_HAS_UTF_8_LOCALES */
  167. #ifdef __UCLIBC_HAS_LOCALE__
  168. assert(ENCODING == __ctype_encoding_7_bit);
  169. #endif
  170. return ((c >= 0) && (c <= UCHAR_MAX)) ? c : EOF;
  171. #endif /* __CTYPE_HAS_8_BIT_LOCALES */
  172. }
  173. #endif
  174. /**********************************************************************/
  175. #ifdef L_mbsinit
  176. int mbsinit(const mbstate_t *ps)
  177. {
  178. return !ps || !ps->mask;
  179. }
  180. #endif
  181. /**********************************************************************/
  182. #ifdef L_mbrlen
  183. size_t mbrlen(const char *__restrict s, size_t n, mbstate_t *__restrict ps)
  184. __attribute__ ((__weak__, __alias__("__mbrlen")));
  185. size_t __mbrlen(const char *__restrict s, size_t n, mbstate_t *__restrict ps)
  186. {
  187. static mbstate_t mbstate; /* Rely on bss 0-init. */
  188. return mbrtowc(NULL, s, n, (ps != NULL) ? ps : &mbstate);
  189. }
  190. #endif
  191. /**********************************************************************/
  192. #ifdef L_mbrtowc
  193. size_t mbrtowc(wchar_t *__restrict pwc, const char *__restrict s,
  194. size_t n, mbstate_t *__restrict ps)
  195. {
  196. static mbstate_t mbstate; /* Rely on bss 0-init. */
  197. wchar_t wcbuf[1];
  198. const char *p;
  199. size_t r;
  200. char empty_string[1]; /* Avoid static to be fPIC friendly. */
  201. if (!ps) {
  202. ps = &mbstate;
  203. }
  204. if (!s) {
  205. pwc = (wchar_t *) s; /* NULL */
  206. empty_string[0] = 0; /* Init the empty string when necessary. */
  207. s = empty_string;
  208. n = 1;
  209. } else if (!n) {
  210. return (ps->mask && (ps->wc == 0xffffU)) /* TODO: change error code? */
  211. ? ((size_t) -1) : ((size_t) -2);
  212. }
  213. p = s;
  214. #ifdef __CTYPE_HAS_UTF_8_LOCALES
  215. /* Need to do this here since mbsrtowcs doesn't allow incompletes. */
  216. if (ENCODING == __ctype_encoding_utf8) {
  217. r = _wchar_utf8sntowcs(pwc, 1, &p, n, ps, 1);
  218. return (r == 1) ? (p-s) : r; /* Need to return 0 if nul char. */
  219. }
  220. #endif
  221. r = __mbsnrtowcs(wcbuf, &p, SIZE_MAX, 1, ps);
  222. if (((ssize_t) r) >= 0) {
  223. if (pwc) {
  224. *pwc = *wcbuf;
  225. }
  226. }
  227. return (size_t) r;
  228. }
  229. #endif
  230. /**********************************************************************/
  231. #ifdef L_wcrtomb
  232. /* Note: We completely ignore ps in all currently supported conversions. */
  233. /* TODO: Check for valid state anyway? */
  234. size_t wcrtomb(register char *__restrict s, wchar_t wc,
  235. mbstate_t *__restrict ps)
  236. {
  237. wchar_t wcbuf[2];
  238. const wchar_t *pwc;
  239. size_t r;
  240. char buf[MB_LEN_MAX];
  241. if (!s) {
  242. s = buf;
  243. wc = 0;
  244. }
  245. pwc = wcbuf;
  246. wcbuf[0] = wc;
  247. wcbuf[1] = 0;
  248. r = __wcsnrtombs(s, &pwc, SIZE_MAX, MB_LEN_MAX, ps);
  249. return (r != 0) ? r : 1;
  250. }
  251. #endif
  252. /**********************************************************************/
  253. #ifdef L_mbsrtowcs
  254. size_t mbsrtowcs(wchar_t *__restrict dst, const char **__restrict src,
  255. size_t len, mbstate_t *__restrict ps)
  256. {
  257. static mbstate_t mbstate; /* Rely on bss 0-init. */
  258. return __mbsnrtowcs(dst, src, SIZE_MAX, len,
  259. ((ps != NULL) ? ps : &mbstate));
  260. }
  261. #endif
  262. /**********************************************************************/
  263. #ifdef L_wcsrtombs
  264. /* Note: We completely ignore ps in all currently supported conversions.
  265. * TODO: Check for valid state anyway? */
  266. size_t wcsrtombs(char *__restrict dst, const wchar_t **__restrict src,
  267. size_t len, mbstate_t *__restrict ps)
  268. {
  269. return __wcsnrtombs(dst, src, SIZE_MAX, len, ps);
  270. }
  271. #endif
  272. /**********************************************************************/
  273. #ifdef L__wchar_utf8sntowcs
  274. /* Define DECODER to generate a UTF-8 decoder which passes Markus Kuhn's
  275. * UTF-8-test.txt strss test.
  276. */
  277. /* #define DECODER */
  278. #ifdef DECODER
  279. #ifndef KUHN
  280. #define KUHN
  281. #endif
  282. #endif
  283. size_t _wchar_utf8sntowcs(wchar_t *__restrict pwc, size_t wn,
  284. const char **__restrict src, size_t n,
  285. mbstate_t *ps, int allow_continuation)
  286. {
  287. register const char *s;
  288. __uwchar_t mask;
  289. __uwchar_t wc;
  290. wchar_t wcbuf[1];
  291. size_t count;
  292. int incr;
  293. s = *src;
  294. assert(s != NULL);
  295. assert(ps != NULL);
  296. incr = 1;
  297. if (!pwc) {
  298. pwc = wcbuf;
  299. wn = SIZE_MAX;
  300. incr = 0;
  301. }
  302. /* This is really here only to support the glibc extension function
  303. * __mbsnrtowcs which apparently returns 0 if wn == 0 without any
  304. * check on the validity of the mbstate. */
  305. if (!(count = wn)) {
  306. return 0;
  307. }
  308. if ((mask = (__uwchar_t) ps->mask) != 0) { /* A continuation... */
  309. #ifdef DECODER
  310. wc = (__uwchar_t) ps->wc;
  311. if (n) {
  312. goto CONTINUE;
  313. }
  314. goto DONE;
  315. #else
  316. if ((wc = (__uwchar_t) ps->wc) != 0xffffU) {
  317. /* TODO: change error code here and below? */
  318. if (n) {
  319. goto CONTINUE;
  320. }
  321. goto DONE;
  322. }
  323. return (size_t) -1; /* We're in an error state. */
  324. #endif
  325. }
  326. do {
  327. if (!n) {
  328. goto DONE;
  329. }
  330. --n;
  331. if ((wc = ((unsigned char) *s++)) >= 0x80) { /* Not ASCII... */
  332. mask = 0x40;
  333. #ifdef __UCLIBC_MJN3_ONLY__
  334. #warning fix range for 16 bit wides
  335. #endif
  336. if ( ((unsigned char)(s[-1] - 0xc0)) < (0xfe - 0xc0) ) {
  337. goto START;
  338. }
  339. BAD:
  340. #ifdef DECODER
  341. wc = 0xfffd;
  342. goto COMPLETE;
  343. #else
  344. ps->mask = mask;
  345. ps->wc = 0xffffU;
  346. return (size_t) -1; /* Illegal start byte! */
  347. #endif
  348. CONTINUE:
  349. while (n) {
  350. --n;
  351. if ((*s & 0xc0) != 0x80) {
  352. goto BAD;
  353. }
  354. mask <<= 5;
  355. wc <<= 6;
  356. wc += (*s & 0x3f); /* keep seperate for bcc (smaller code) */
  357. ++s;
  358. START:
  359. wc &= ~(mask << 1);
  360. if ((wc & mask) == 0) { /* Character completed. */
  361. if ((mask >>= 5) == 0x40) {
  362. mask += mask;
  363. }
  364. /* Check for invalid sequences (longer than necessary)
  365. * and invalid chars. */
  366. if ( (wc < mask) /* Sequence not minimal length. */
  367. #ifdef KUHN
  368. #if UTF_8_MAX_LEN == 3
  369. #error broken since mask can overflow!!
  370. /* For plane 0, these are the only defined values.*/
  371. || (wc > 0xfffdU)
  372. #else
  373. /* Note that we don't need to worry about exceeding */
  374. /* 31 bits as that is the most that UTF-8 provides. */
  375. || ( ((__uwchar_t)(wc - 0xfffeU)) < 2)
  376. #endif
  377. || ( ((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U) )
  378. #endif /* KUHN */
  379. ) {
  380. goto BAD;
  381. }
  382. goto COMPLETE;
  383. }
  384. }
  385. /* Character potentially valid but incomplete. */
  386. if (!allow_continuation) {
  387. if (count != wn) {
  388. return 0;
  389. }
  390. /* NOTE: The following can fail if you allow and then disallow
  391. * continuation!!! */
  392. #if UTF_8_MAX_LEN == 3
  393. #error broken since mask can overflow!!
  394. #endif
  395. /* Need to back up... */
  396. do {
  397. --s;
  398. } while ((mask >>= 5) >= 0x40);
  399. goto DONE;
  400. }
  401. ps->mask = (wchar_t) mask;
  402. ps->wc = (wchar_t) wc;
  403. *src = s;
  404. return (size_t) -2;
  405. }
  406. COMPLETE:
  407. *pwc = wc;
  408. pwc += incr;
  409. }
  410. #ifdef DECODER
  411. while (--count);
  412. #else
  413. while (wc && --count);
  414. if (!wc) {
  415. s = NULL;
  416. }
  417. #endif
  418. DONE:
  419. /* ps->wc is irrelavent here. */
  420. ps->mask = 0;
  421. if (pwc != wcbuf) {
  422. *src = s;
  423. }
  424. return wn - count;
  425. }
  426. #endif
  427. /**********************************************************************/
  428. #ifdef L__wchar_wcsntoutf8s
  429. size_t _wchar_wcsntoutf8s(char *__restrict s, size_t n,
  430. const wchar_t **__restrict src, size_t wn)
  431. {
  432. register char *p;
  433. size_t len, t;
  434. __uwchar_t wc;
  435. const __uwchar_t *swc;
  436. int store;
  437. char buf[MB_LEN_MAX];
  438. char m;
  439. store = 1;
  440. /* NOTE: The following is an AWFUL HACK! In order to support %ls in
  441. * printf, we need to be able to compute the number of bytes needed
  442. * for the mbs conversion, not to exceed the precision specified.
  443. * But if dst is NULL, the return value is the length assuming a
  444. * sufficiently sized buffer. So, we allow passing of (char *) src
  445. * as dst in order to flag that we really want the length, subject
  446. * to the restricted buffer size and no partial conversions.
  447. * See wcsnrtombs() as well. */
  448. if (!s || (s == ((char *) src))) {
  449. if (!s) {
  450. n = SIZE_MAX;
  451. }
  452. s = buf;
  453. store = 0;
  454. }
  455. t = n;
  456. swc = (const __uwchar_t *) *src;
  457. assert(swc != NULL);
  458. while (wn && t) {
  459. wc = *swc;
  460. *s = wc;
  461. len = 1;
  462. if (wc >= 0x80) {
  463. #ifdef KUHN
  464. if (
  465. #if UTF_8_MAX_LEN == 3
  466. /* For plane 0, these are the only defined values.*/
  467. /* Note that we don't need to worry about exceeding */
  468. /* 31 bits as that is the most that UTF-8 provides. */
  469. (wc > 0xfffdU)
  470. #else
  471. /* UTF_8_MAX_LEN == 6 */
  472. (wc > 0x7fffffffUL)
  473. || ( ((__uwchar_t)(wc - 0xfffeU)) < 2)
  474. #endif
  475. || ( ((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U) )
  476. ) {
  477. return (size_t) -1;
  478. }
  479. #else /* KUHN */
  480. #if UTF_8_MAX_LEN != 3
  481. if (wc > 0x7fffffffUL) { /* Value too large. */
  482. return (size_t) -1;
  483. }
  484. #endif
  485. #endif /* KUHN */
  486. wc >>= 1;
  487. p = s;
  488. do {
  489. ++p;
  490. } while (wc >>= 5);
  491. wc = *swc;
  492. if ((len = p - s) > t) { /* Not enough space. */
  493. break;
  494. }
  495. m = 0x80;
  496. while( p>s ) {
  497. m = (m >> 1) | 0x80;
  498. *--p = (wc & 0x3f) | 0x80;
  499. wc >>= 6;
  500. }
  501. *s |= (m << 1);
  502. } else if (wc == 0) { /* End of string. */
  503. swc = NULL;
  504. break;
  505. }
  506. ++swc;
  507. --wn;
  508. t -= len;
  509. if (store) {
  510. s += len;
  511. }
  512. }
  513. if (store) {
  514. *src = (const wchar_t *) swc;
  515. }
  516. return n - t;
  517. }
  518. #endif
  519. /**********************************************************************/
  520. #ifdef L___mbsnrtowcs
  521. /* WARNING: We treat len as SIZE_MAX when dst is NULL! */
  522. size_t mbsnrtowcs(wchar_t *__restrict dst, const char **__restrict src,
  523. size_t NMC, size_t len, mbstate_t *__restrict ps)
  524. __attribute__ ((__weak__, __alias__("__mbsnrtowcs")));
  525. size_t __mbsnrtowcs(wchar_t *__restrict dst, const char **__restrict src,
  526. size_t NMC, size_t len, mbstate_t *__restrict ps)
  527. {
  528. static mbstate_t mbstate; /* Rely on bss 0-init. */
  529. wchar_t wcbuf[1];
  530. const char *s;
  531. size_t count;
  532. int incr;
  533. if (!ps) {
  534. ps = &mbstate;
  535. }
  536. #ifdef __CTYPE_HAS_UTF_8_LOCALES
  537. if (ENCODING == __ctype_encoding_utf8) {
  538. size_t r;
  539. return ((r = _wchar_utf8sntowcs(dst, len, src, NMC, ps, 1))
  540. != (size_t) -2) ? r : 0;
  541. }
  542. #endif
  543. incr = 1;
  544. if (!dst) {
  545. dst = wcbuf;
  546. len = SIZE_MAX;
  547. incr = 0;
  548. }
  549. /* Since all the following encodings are single-byte encodings... */
  550. if (len > NMC) {
  551. len = NMC;
  552. }
  553. count = len;
  554. s = *src;
  555. #ifdef __CTYPE_HAS_8_BIT_LOCALES
  556. if (ENCODING == __ctype_encoding_8_bit) {
  557. wchar_t wc;
  558. while (count) {
  559. if ((wc = ((unsigned char)(*s))) >= 0x80) { /* Non-ASCII... */
  560. wc -= 0x80;
  561. wc = __global_locale.tbl8c2wc[
  562. (__global_locale.idx8c2wc[wc >> Cc2wc_IDX_SHIFT]
  563. << Cc2wc_IDX_SHIFT) + (wc & (Cc2wc_ROW_LEN - 1))];
  564. if (!wc) {
  565. __set_errno(EILSEQ);
  566. return (size_t) -1;
  567. }
  568. }
  569. if (!(*dst = wc)) {
  570. s = NULL;
  571. break;
  572. }
  573. dst += incr;
  574. ++s;
  575. --count;
  576. }
  577. if (dst != wcbuf) {
  578. *src = s;
  579. }
  580. return len - count;
  581. }
  582. #endif
  583. #ifdef __UCLIBC_HAS_LOCALE__
  584. assert(ENCODING == __ctype_encoding_7_bit);
  585. #endif
  586. while (count) {
  587. if ((*dst = (unsigned char) *s) == 0) {
  588. s = NULL;
  589. break;
  590. }
  591. ++s;
  592. dst += incr;
  593. --count;
  594. }
  595. if (dst != wcbuf) {
  596. *src = s;
  597. }
  598. return len - count;
  599. }
  600. #endif
  601. /**********************************************************************/
  602. #ifdef L___wcsnrtombs
  603. /* WARNING: We treat len as SIZE_MAX when dst is NULL! */
  604. /* Note: We completely ignore ps in all currently supported conversions.
  605. * TODO: Check for valid state anyway? */
  606. size_t wcsnrtombs(char *__restrict dst, const wchar_t **__restrict src,
  607. size_t NWC, size_t len, mbstate_t *__restrict ps)
  608. __attribute__ ((__weak__, __alias__("__wcsnrtombs")));
  609. size_t __wcsnrtombs(char *__restrict dst, const wchar_t **__restrict src,
  610. size_t NWC, size_t len, mbstate_t *__restrict ps)
  611. {
  612. const __uwchar_t *s;
  613. size_t count;
  614. int incr;
  615. char buf[MB_LEN_MAX];
  616. #ifdef __CTYPE_HAS_UTF_8_LOCALES
  617. if (ENCODING == __ctype_encoding_utf8) {
  618. return _wchar_wcsntoutf8s(dst, len, src, NWC);
  619. }
  620. #endif /* __CTYPE_HAS_UTF_8_LOCALES */
  621. incr = 1;
  622. /* NOTE: The following is an AWFUL HACK! In order to support %ls in
  623. * printf, we need to be able to compute the number of bytes needed
  624. * for the mbs conversion, not to exceed the precision specified.
  625. * But if dst is NULL, the return value is the length assuming a
  626. * sufficiently sized buffer. So, we allow passing of (char *) src
  627. * as dst in order to flag that we really want the length, subject
  628. * to the restricted buffer size and no partial conversions.
  629. * See _wchar_wcsntoutf8s() as well. */
  630. if (!dst || (dst == ((char *) src))) {
  631. if (!dst) {
  632. len = SIZE_MAX;
  633. }
  634. dst = buf;
  635. incr = 0;
  636. }
  637. /* Since all the following encodings are single-byte encodings... */
  638. if (len > NWC) {
  639. len = NWC;
  640. }
  641. count = len;
  642. s = (const __uwchar_t *) *src;
  643. #ifdef __CTYPE_HAS_8_BIT_LOCALES
  644. if (ENCODING == __ctype_encoding_8_bit) {
  645. __uwchar_t wc;
  646. __uwchar_t u;
  647. while (count) {
  648. if ((wc = *s) <= 0x7f) {
  649. if (!(*dst = (unsigned char) wc)) {
  650. s = NULL;
  651. break;
  652. }
  653. } else {
  654. u = 0;
  655. if (wc <= Cwc2c_DOMAIN_MAX) {
  656. u = __global_locale.idx8wc2c[wc >> (Cwc2c_TI_SHIFT
  657. + Cwc2c_TT_SHIFT)];
  658. u = __global_locale.tbl8wc2c[(u << Cwc2c_TI_SHIFT)
  659. + ((wc >> Cwc2c_TT_SHIFT)
  660. & ((1 << Cwc2c_TI_SHIFT)-1))];
  661. u = __global_locale.tbl8wc2c[Cwc2c_TI_LEN
  662. + (u << Cwc2c_TT_SHIFT)
  663. + (wc & ((1 << Cwc2c_TT_SHIFT)-1))];
  664. }
  665. /* #define __WCHAR_REPLACEMENT_CHAR '?' */
  666. #ifdef __WCHAR_REPLACEMENT_CHAR
  667. *dst = (unsigned char) ( u ? u : __WCHAR_REPLACEMENT_CHAR );
  668. #else /* __WCHAR_REPLACEMENT_CHAR */
  669. if (!u) {
  670. goto BAD;
  671. }
  672. *dst = (unsigned char) u;
  673. #endif /* __WCHAR_REPLACEMENT_CHAR */
  674. }
  675. ++s;
  676. dst += incr;
  677. --count;
  678. }
  679. if (dst != buf) {
  680. *src = (const wchar_t *) s;
  681. }
  682. return len - count;
  683. }
  684. #endif /* __CTYPE_HAS_8_BIT_LOCALES */
  685. #ifdef __UCLIBC_HAS_LOCALE__
  686. assert(ENCODING == __ctype_encoding_7_bit);
  687. #endif
  688. while (count) {
  689. if (*s > UCHAR_MAX) {
  690. #if defined(__CTYPE_HAS_8_BIT_LOCALES) && !defined(__WCHAR_REPLACEMENT_CHAR)
  691. BAD:
  692. #endif
  693. __set_errno(EILSEQ);
  694. return (size_t) -1;
  695. }
  696. if ((*dst = (unsigned char) *s) == 0) {
  697. s = NULL;
  698. break;
  699. }
  700. ++s;
  701. dst += incr;
  702. --count;
  703. }
  704. if (dst != buf) {
  705. *src = (const wchar_t *) s;
  706. }
  707. return len - count;
  708. }
  709. #endif
  710. /**********************************************************************/
  711. #ifdef L_wcswidth
  712. #ifdef __UCLIBC_MJN3_ONLY__
  713. #warning if we start doing translit, wcwidth and wcswidth will need updating.
  714. #endif
  715. #if defined(__UCLIBC_HAS_LOCALE__) && \
  716. ( defined(__CTYPE_HAS_8_BIT_LOCALES) || defined(__CTYPE_HAS_UTF_8_LOCALES) )
  /* Column-width lookup data used by wcswidth() for wide characters in the
   * range 0x100..0xffff.
   *
   * new_idx is indexed by the high byte of the character (wc >> 8).
   * Entries [h] and [h+1] are the bounds of that high byte's slice of
   * new_tbl / new_wtbl, which is why there is one trailing entry. */
  717. static const unsigned char new_idx[] = {
  718. 0, 5, 5, 6, 10, 15, 28, 39,
  719. 48, 48, 71, 94, 113, 128, 139, 154,
  720. 175, 186, 188, 188, 188, 188, 188, 188,
  721. 203, 208, 208, 208, 208, 208, 208, 208,
  722. 208, 219, 219, 219, 222, 222, 222, 222,
  723. 222, 222, 222, 222, 222, 222, 222, 224,
  724. 224, 231, 231, 231, 231, 231, 231, 231,
  725. 231, 231, 231, 231, 231, 231, 231, 231,
  726. 231, 231, 231, 231, 231, 231, 231, 231,
  727. 231, 231, 231, 231, 231, 231, 231, 231,
  728. 231, 231, 231, 231, 231, 231, 231, 231,
  729. 231, 231, 231, 231, 231, 231, 231, 231,
  730. 231, 231, 231, 231, 231, 231, 231, 231,
  731. 231, 231, 231, 231, 231, 231, 231, 231,
  732. 231, 231, 231, 231, 231, 231, 231, 231,
  733. 231, 231, 231, 231, 231, 231, 231, 231,
  734. 231, 231, 231, 231, 231, 231, 231, 231,
  735. 231, 231, 231, 231, 231, 231, 231, 231,
  736. 231, 231, 231, 231, 231, 231, 231, 231,
  737. 231, 231, 231, 231, 231, 231, 231, 231,
  738. 231, 231, 231, 231, 231, 233, 233, 233,
  739. 233, 233, 233, 233, 234, 234, 234, 234,
  740. 234, 234, 234, 234, 234, 234, 234, 234,
  741. 234, 234, 234, 234, 234, 234, 234, 234,
  742. 234, 234, 234, 234, 234, 234, 234, 234,
  743. 234, 234, 234, 234, 234, 234, 234, 234,
  744. 234, 234, 234, 234, 234, 234, 234, 234,
  745. 236, 236, 236, 236, 236, 236, 236, 236,
  746. 236, 236, 236, 236, 236, 236, 236, 236,
  747. 236, 236, 236, 236, 236, 236, 236, 236,
  748. 236, 236, 236, 236, 236, 236, 236, 236,
  749. 236, 237, 237, 238, 241, 241, 242, 249,
  750. 255,
  751. };
  /* new_tbl holds low-byte boundary values.  wcswidth() binary-searches the
   * slice selected through new_idx, comparing against (wc & 0xff). */
  752. static const unsigned char new_tbl[] = {
  753. 0x00, 0x01, 0x20, 0x7f, 0xa0, 0x00, 0x00, 0x50,
  754. 0x60, 0x70, 0x00, 0x83, 0x87, 0x88, 0x8a, 0x00,
  755. 0x91, 0xa2, 0xa3, 0xba, 0xbb, 0xbe, 0xbf, 0xc0,
  756. 0xc1, 0xc3, 0xc4, 0xc5, 0x00, 0x4b, 0x56, 0x70,
  757. 0x71, 0xd6, 0xe5, 0xe7, 0xe9, 0xea, 0xee, 0x00,
  758. 0x0f, 0x10, 0x11, 0x12, 0x30, 0x4b, 0xa6, 0xb1,
  759. 0x00, 0x01, 0x03, 0x3c, 0x3d, 0x41, 0x49, 0x4d,
  760. 0x4e, 0x51, 0x55, 0x62, 0x64, 0x81, 0x82, 0xbc,
  761. 0xbd, 0xc1, 0xc5, 0xcd, 0xce, 0xe2, 0xe4, 0x00,
  762. 0x02, 0x03, 0x3c, 0x3d, 0x41, 0x43, 0x47, 0x49,
  763. 0x4b, 0x4e, 0x70, 0x72, 0x81, 0x83, 0xbc, 0xbd,
  764. 0xc1, 0xc6, 0xc7, 0xc9, 0xcd, 0xce, 0x00, 0x01,
  765. 0x02, 0x3c, 0x3d, 0x3f, 0x40, 0x41, 0x44, 0x4d,
  766. 0x4e, 0x56, 0x57, 0x82, 0x83, 0xc0, 0xc1, 0xcd,
  767. 0xce, 0x00, 0x3e, 0x41, 0x46, 0x49, 0x4a, 0x4e,
  768. 0x55, 0x57, 0xbf, 0xc0, 0xc6, 0xc7, 0xcc, 0xce,
  769. 0x00, 0x41, 0x44, 0x4d, 0x4e, 0xca, 0xcb, 0xd2,
  770. 0xd5, 0xd6, 0xd7, 0x00, 0x31, 0x32, 0x34, 0x3b,
  771. 0x47, 0x4f, 0xb1, 0xb2, 0xb4, 0xba, 0xbb, 0xbd,
  772. 0xc8, 0xce, 0x00, 0x18, 0x1a, 0x35, 0x36, 0x37,
  773. 0x38, 0x39, 0x3a, 0x71, 0x7f, 0x80, 0x85, 0x86,
  774. 0x88, 0x90, 0x98, 0x99, 0xbd, 0xc6, 0xc7, 0x00,
  775. 0x2d, 0x31, 0x32, 0x33, 0x36, 0x38, 0x39, 0x3a,
  776. 0x58, 0x5a, 0x00, 0x60, 0x00, 0x12, 0x15, 0x32,
  777. 0x35, 0x52, 0x54, 0x72, 0x74, 0xb7, 0xbe, 0xc6,
  778. 0xc7, 0xc9, 0xd4, 0x00, 0x0b, 0x0f, 0xa9, 0xaa,
  779. 0x00, 0x0b, 0x10, 0x2a, 0x2f, 0x60, 0x64, 0x6a,
  780. 0x70, 0xd0, 0xeb, 0x00, 0x29, 0x2b, 0x00, 0x80,
  781. 0x00, 0x2a, 0x30, 0x3f, 0x40, 0x99, 0x9b, 0x00,
  782. 0xd0, 0x00, 0x00, 0xa4, 0x00, 0x00, 0x00, 0x1e,
  783. 0x1f, 0x00, 0x00, 0x10, 0x20, 0x24, 0x30, 0x70,
  784. 0xff, 0x00, 0x61, 0xe0, 0xe7, 0xf9, 0xfc,
  785. };
  /* new_wtbl is parallel to new_tbl: the entry at the index found by the
   * search is added to the running column count in wcswidth(). */
  786. static const signed char new_wtbl[] = {
  787. 0, -1, 1, -1, 1, 1, 0, 1,
  788. 0, 1, 1, 0, 1, 0, 1, 1,
  789. 0, 1, 0, 1, 0, 1, 0, 1,
  790. 0, 1, 0, 1, 1, 0, 1, 0,
  791. 1, 0, 1, 0, 1, 0, 1, 1,
  792. 0, 1, 0, 1, 0, 1, 0, 1,
  793. 1, 0, 1, 0, 1, 0, 1, 0,
  794. 1, 0, 1, 0, 1, 0, 1, 0,
  795. 1, 0, 1, 0, 1, 0, 1, 1,
  796. 0, 1, 0, 1, 0, 1, 0, 1,
  797. 0, 1, 0, 1, 0, 1, 0, 1,
  798. 0, 1, 0, 1, 0, 1, 1, 0,
  799. 1, 0, 1, 0, 1, 0, 1, 0,
  800. 1, 0, 1, 0, 1, 0, 1, 0,
  801. 1, 1, 0, 1, 0, 1, 0, 1,
  802. 0, 1, 0, 1, 0, 1, 0, 1,
  803. 1, 0, 1, 0, 1, 0, 1, 0,
  804. 1, 0, 1, 1, 0, 1, 0, 1,
  805. 0, 1, 0, 1, 0, 1, 0, 1,
  806. 0, 1, 1, 0, 1, 0, 1, 0,
  807. 1, 0, 1, 0, 1, 0, 1, 0,
  808. 1, 0, 1, 0, 1, 0, 1, 1,
  809. 0, 1, 0, 1, 0, 1, 0, 1,
  810. 0, 1, 2, 0, 1, 0, 1, 0,
  811. 1, 0, 1, 0, 1, 0, 1, 0,
  812. 1, 0, 1, 1, 0, 1, 0, 1,
  813. 1, 0, 1, 0, 1, 0, 1, 0,
  814. 1, 0, 1, 1, 2, 1, 1, 2,
  815. 2, 0, 2, 1, 2, 0, 2, 2,
  816. 1, 1, 2, 1, 1, 2, 1, 0,
  817. 1, 1, 0, 1, 0, 1, 2, 1,
  818. 0, 2, 1, 2, 1, 0, 1,
  819. };
  820. int wcswidth(const wchar_t *pwcs, size_t n)
  821. {
  822. int h, l, m, count;
  823. wchar_t wc;
  824. unsigned char b;
  825. if (ENCODING == __ctype_encoding_7_bit) {
  826. size_t i;
  827. for (i = 0 ; (i < n) && pwcs[i] ; i++) {
  828. if (pwcs[i] != ((unsigned char)(pwcs[i]))) {
  829. return -1;
  830. }
  831. }
  832. }
  833. #ifdef __CTYPE_HAS_8_BIT_LOCALES
  834. else if (ENCODING == __ctype_encoding_8_bit) {
  835. mbstate_t mbstate;
  836. mbstate.mask = 0; /* Initialize the mbstate. */
  837. if (__wcsnrtombs(NULL, &pwcs, n, SIZE_MAX, &mbstate) == ((size_t) - 1)) {
  838. return -1;
  839. }
  840. }
  841. #endif /* __CTYPE_HAS_8_BIT_LOCALES */
  842. #if defined(__CTYPE_HAS_UTF_8_LOCALES) && defined(KUHN)
  843. /* For stricter handling of allowed unicode values... see comments above. */
  844. else if (ENCODING == __ctype_encoding_utf8) {
  845. size_t i;
  846. for (i = 0 ; (i < n) && pwcs[i] ; i++) {
  847. if ( (((__uwchar_t)((pwcs[i]) - 0xfffeU)) < 2)
  848. || (((__uwchar_t)((pwcs[i]) - 0xd800U)) < (0xe000U - 0xd800U))
  849. ) {
  850. return -1;
  851. }
  852. }
  853. }
  854. #endif /* __CTYPE_HAS_UTF_8_LOCALES */
  855. for (count = 0 ; n && (wc = *pwcs++) ; n--) {
  856. if (wc <= 0xff) {
  857. /* If we're here, wc != 0. */
  858. if ((wc < 32) || ((wc >= 0x7f) && (wc < 0xa0))) {
  859. return -1;
  860. }
  861. ++count;
  862. continue;
  863. }
  864. if (((unsigned int) wc) <= 0xffff) {
  865. b = wc & 0xff;
  866. h = (wc >> 8);
  867. l = new_idx[h];
  868. h = new_idx[h+1];
  869. while ((m = (l+h) >> 1) != l) {
  870. if (b >= new_tbl[m]) {
  871. l = m;
  872. } else { /* wc < tbl[m] */
  873. h = m;
  874. }
  875. }
  876. count += new_wtbl[l]; /* none should be -1. */
  877. continue;
  878. }
  879. /* Redo this to minimize average number of compares?*/
  880. if (wc >= 0x1d167) {
  881. if (wc <= 0x1d1ad) {
  882. if ((wc <= 0x1d169
  883. || (wc >= 0x1d173
  884. && (wc <= 0x1d182
  885. || (wc >= 0x1d185
  886. && (wc <= 0x1d18b
  887. || (wc >= 0x1d1aa))))))
  888. ) {
  889. continue;
  890. }
  891. } else if (((wc >= 0xe0020) && (wc <= 0xe007f)) || (wc == 0xe0001)) {
  892. continue;
  893. } else if ((wc >= 0x20000) && (wc <= 0x2ffff)) {
  894. ++count; /* need 2.. add one here */
  895. }
  896. #if (WCHAR_MAX > 0x7fffffffL)
  897. else if (wc > 0x7fffffffL) {
  898. return -1;
  899. }
  900. #endif /* (WCHAR_MAX > 0x7fffffffL) */
  901. }
  902. ++count;
  903. }
  904. return count;
  905. }
  906. #else /* __UCLIBC_HAS_LOCALE__ */
  907. int wcswidth(const wchar_t *pwcs, size_t n)
  908. {
  909. int count;
  910. wchar_t wc;
  911. for (count = 0 ; n && (wc = *pwcs++) ; n--) {
  912. if (wc <= 0xff) {
  913. /* If we're here, wc != 0. */
  914. if ((wc < 32) || ((wc >= 0x7f) && (wc < 0xa0))) {
  915. return -1;
  916. }
  917. ++count;
  918. continue;
  919. } else {
  920. return -1;
  921. }
  922. }
  923. return count;
  924. }
  925. #endif /* __UCLIBC_HAS_LOCALE__ */
  926. #endif
  927. /**********************************************************************/
  928. #ifdef L_wcwidth
  929. int wcwidth(wchar_t wc)
  930. {
  931. return wcswidth(&wc, 1);
  932. }
  933. #endif
  934. /**********************************************************************/