wchar.c 44 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711
  1. /* Copyright (C) 2002, 2003, 2004 Manuel Novoa III
  2. *
  3. * This library is free software; you can redistribute it and/or
  4. * modify it under the terms of the GNU Library General Public
  5. * License as published by the Free Software Foundation; either
  6. * version 2 of the License, or (at your option) any later version.
  7. *
  8. * This library is distributed in the hope that it will be useful,
  9. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  11. * Library General Public License for more details.
  12. *
  13. * You should have received a copy of the GNU Library General Public
  14. * License along with this library; if not, write to the Free
  15. * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  16. */
  17. /* ATTENTION! ATTENTION! ATTENTION! ATTENTION! ATTENTION!
  18. *
  19. * Besides uClibc, I'm using this code in my libc for elks, which is
  20. * a 16-bit environment with a fairly limited compiler. It would make
  21. * things much easier for me if this file isn't modified unnecessarily.
  22. * In particular, please put any new or replacement functions somewhere
  23. * else, and modify the makefile to use your version instead.
  24. * Thanks. Manuel
  25. *
  26. * ATTENTION! ATTENTION! ATTENTION! ATTENTION! ATTENTION! */
  27. /* May 23, 2002 Initial Notes:
  28. *
  29. * I'm still tweaking this stuff, but it passes the tests I've thrown
  30. * at it, and Erik needs it for the gcc port. The glibc extension
  31. * __wcsnrtombs() hasn't been tested, as I didn't find a test for it
  32. * in the glibc source. I also need to fix the behavior of
  33. * _wchar_utf8sntowcs() if the max number of wchars to convert is 0.
  34. *
  35. * UTF-8 -> wchar -> UTF-8 conversion tests on Markus Kuhn's UTF-8-demo.txt
  36. * file on my platform (x86) show about 5-10% faster conversion speed than
  37. * glibc with mbsrtowcs()/wcsrtombs() and almost twice as fast as glibc with
  38. * individual mbrtowc()/wcrtomb() calls.
  39. *
  40. * If 'DECODER' is defined, then _wchar_utf8sntowcs() will be compiled
  41. * as a fail-safe UTF-8 decoder appropriate for a terminal, etc. which
  42. * needs to deal gracefully with whatever is sent to it. In that mode,
  43. * it passes Markus Kuhn's UTF-8-test.txt stress test. I plan to add
  44. * an arg to force that behavior, so the interface will be changing.
  45. *
  46. * I need to fix the error checking for 16-bit wide chars. This isn't
  47. * an issue for uClibc, but may be for ELKS. I'm currently not sure
  48. * if I'll use 16-bit, 32-bit, or configurable wchars in ELKS.
  49. *
  50. * July 1, 2002
  51. *
  52. * Fixed _wchar_utf8sntowcs() for the max number of wchars == 0 case.
  53. * Fixed nul-char bug in btowc(), and another in __mbsnrtowcs() for 8-bit
  54. * locales.
  55. * Enabled building of a C/POSIX-locale-only version, so full locale support
  56. * no longer needs to be enabled.
  57. *
  58. * Nov 4, 2002
  59. *
  60. * Fixed a bug in _wchar_wcsntoutf8s(). Don't store wcs position if dst is NULL.
  61. * Also, introduce an awful hack into _wchar_wcsntoutf8s() and wcsrtombs() in
  62. * order to support %ls in printf. See comments below for details.
  63. * Change behaviour of wc<->mb functions when in the C locale. Now they do
  64. * a 1-1 map for the range 0x80-UCHAR_MAX. This is for backwards compatibility
  65. * and consistency with the standard's requirement that a printf format string be
  66. * a valid multibyte string beginning and ending in its initial shift state.
  67. *
  68. * Nov 5, 2002
  69. *
  70. * Forgot to change btowc and wctob when I changed the wc<->mb functions yesterday.
  71. *
  72. * Nov 7, 2002
  73. *
  74. * Add wcwidth and wcswidth, based on Markus Kuhn's wcwidth of 2002-05-08.
  75. * Added some size/speed optimizations and integrated it into my locale
  76. * framework. Minimally tested at the moment, but the stub C-locale
  77. * version (which most people would probably be using) should be fine.
  78. *
  79. * Nov 21, 2002
  80. *
  81. * Revert the wc<->mb changes from earlier this month involving the C-locale.
  82. * Add a couple of ugly hacks to support *wprintf.
  83. * Add a mini iconv() and iconv implementation (requires locale support).
  84. *
  85. * Aug 1, 2003
  86. * Bug fix for mbrtowc.
  87. *
  88. * Aug 18, 2003
  89. * Bug fix: _wchar_utf8sntowcs and _wchar_wcsntoutf8s now set errno if EILSEQ.
  90. *
  91. * Feb 11, 2004
  92. * Bug fix: Fix size check for remaining output space in iconv().
  93. *
  94. * Manuel
  95. */
  96. #define _GNU_SOURCE
  97. #define _ISOC99_SOURCE
  98. #include <errno.h>
  99. #include <stddef.h>
  100. #include <limits.h>
  101. #include <stdint.h>
  102. #include <inttypes.h>
  103. #include <stdlib.h>
  104. #include <stdio.h>
  105. #include <assert.h>
  106. #include <locale.h>
  107. #include <wchar.h>
  108. #include <bits/uClibc_uwchar.h>
  109. /**********************************************************************/
  110. #ifdef __UCLIBC_HAS_LOCALE__
  111. #ifdef __UCLIBC_MJN3_ONLY__
  112. #ifdef L_iswspace
  113. /* generates one warning */
  114. #warning TODO: Fix Cc2wc* and Cwc2c* defines!
  115. #endif
  116. #endif /* __UCLIBC_MJN3_ONLY__ */
  117. #define ENCODING ((__UCLIBC_CURLOCALE_DATA).encoding)
  118. #define Cc2wc_IDX_SHIFT __LOCALE_DATA_Cc2wc_IDX_SHIFT
  119. #define Cc2wc_ROW_LEN __LOCALE_DATA_Cc2wc_ROW_LEN
  120. #define Cwc2c_DOMAIN_MAX __LOCALE_DATA_Cwc2c_DOMAIN_MAX
  121. #define Cwc2c_TI_SHIFT __LOCALE_DATA_Cwc2c_TI_SHIFT
  122. #define Cwc2c_TT_SHIFT __LOCALE_DATA_Cwc2c_TT_SHIFT
  123. #define Cwc2c_TI_LEN __LOCALE_DATA_Cwc2c_TI_LEN
  124. #ifndef __CTYPE_HAS_UTF_8_LOCALES
  125. #warning __CTYPE_HAS_UTF_8_LOCALES not set!
  126. #endif
  127. #else /* __UCLIBC_HAS_LOCALE__ */
  128. #ifdef __UCLIBC_MJN3_ONLY__
  129. #ifdef L_btowc
  130. /* emit only once */
  131. #warning fix preprocessor logic testing locale settings
  132. #endif
  133. #endif
  134. #define ENCODING (__ctype_encoding_7_bit)
  135. #ifdef __CTYPE_HAS_8_BIT_LOCALES
  136. #error __CTYPE_HAS_8_BIT_LOCALES is defined!
  137. #endif
  138. #ifdef __CTYPE_HAS_UTF_8_LOCALES
  139. #error __CTYPE_HAS_UTF_8_LOCALES is defined!
  140. #endif
  141. #undef L__wchar_utf8sntowcs
  142. #undef L__wchar_wcsntoutf8s
  143. #endif /* __UCLIBC_HAS_LOCALE__ */
  144. /**********************************************************************/
  /* Longest UTF-8 sequence the converters in this file will handle:
   * 6 bytes when wchar_t can hold values above 0xffff, otherwise 3. */
  145. #if WCHAR_MAX > 0xffffUL
  146. #define UTF_8_MAX_LEN 6
  147. #else
  148. #define UTF_8_MAX_LEN 3
  149. #endif
  /* When KUHN is defined, the UTF-8 converters below additionally reject
   * 0xfffe, 0xffff and the surrogate range 0xd800-0xdfff. */
  150. #define KUHN 1
  151. /* Implementation-specific work functions. */
  /* UTF-8 multibyte string -> wide string.  Reads at most n bytes from *src
   * and produces at most wn wide chars; ps carries partial-sequence state. */
  152. extern size_t _wchar_utf8sntowcs(wchar_t *__restrict pwc, size_t wn,
  153. const char **__restrict src, size_t n,
  154. mbstate_t *ps, int allow_continuation);
  /* Wide string -> UTF-8 multibyte string.  Reads at most wn wide chars from
   * *src and produces at most n bytes. */
  155. extern size_t _wchar_wcsntoutf8s(char *__restrict s, size_t n,
  156. const wchar_t **__restrict src, size_t wn);
  157. /* glibc extensions. */
  /* This file also exports these as weak aliases named mbsnrtowcs and
   * wcsnrtombs. */
  158. extern size_t __mbsnrtowcs(wchar_t *__restrict dst,
  159. const char **__restrict src,
  160. size_t NMC, size_t len, mbstate_t *__restrict ps);
  161. extern size_t __wcsnrtombs(char *__restrict dst,
  162. const wchar_t **__restrict src,
  163. size_t NWC, size_t len, mbstate_t *__restrict ps);
  164. /**********************************************************************/
  165. #ifdef L_btowc
  /*
   * btowc - convert a single byte to its wide-character equivalent.
   *
   * c: the byte to convert, or EOF.
   *
   * Returns the wide character for c, or WEOF if c is EOF or does not form
   * a valid one-byte character.
   *
   * With 8-bit locale support the conversion is delegated to mbrtowc()
   * using a freshly initialised state.  Without it, only 0..0x7f is legal.
   */
  wint_t btowc(int c)
  {
  #ifdef __CTYPE_HAS_8_BIT_LOCALES
      wchar_t wc;
      unsigned char buf[1];
      mbstate_t mbstate;

      if (c != EOF) {
          *buf = (unsigned char) c;
          mbstate.__mask = 0;    /* Initialize the mbstate. */
          /* mbrtowc() takes a const char *; buf is unsigned char so that the
           * byte value is stored unchanged, hence the explicit cast.
           * Error returns ((size_t)-1, (size_t)-2) are > 1 and fall through. */
          if (mbrtowc(&wc, (const char *) buf, 1, &mbstate) <= 1) {
              return wc;
          }
      }
      return WEOF;
  #else  /* __CTYPE_HAS_8_BIT_LOCALES */
  #ifdef __UCLIBC_HAS_LOCALE__
      assert((ENCODING == __ctype_encoding_7_bit)
             || (ENCODING == __ctype_encoding_utf8));
  #endif /* __UCLIBC_HAS_LOCALE__ */
      /* If we don't have 8-bit locale support, then this is trivial since
       * anything outside of 0-0x7f is illegal in C/POSIX and UTF-8 locales.
       * The unsigned compare also rejects EOF and other negative values. */
      return (((unsigned int) c) < 0x80) ? c : WEOF;
  #endif /* __CTYPE_HAS_8_BIT_LOCALES */
  }
  190. #endif
  191. /**********************************************************************/
  192. #ifdef L_wctob
  193. /* Note: We completely ignore ps in all currently supported conversions. */
  /*
   * wctob - convert a wide character to its single-byte representation.
   *
   * c: the wide character to convert.
   *
   * Returns the byte value (as an int) when c maps to exactly one byte,
   * otherwise EOF.
   *
   * With 8-bit locale support the conversion is delegated to wcrtomb().
   * Without it, only 0..0x7f is representable.
   */
  int wctob(wint_t c)
  {
  #ifdef __CTYPE_HAS_8_BIT_LOCALES
      unsigned char buf[MB_LEN_MAX];

      /* wcrtomb() takes a char *; buf is unsigned char so that the byte is
       * returned as a non-negative value, hence the explicit cast. */
      return (wcrtomb((char *) buf, c, NULL) == 1) ? *buf : EOF;
  #else  /* __CTYPE_HAS_8_BIT_LOCALES */
  #ifdef __UCLIBC_HAS_LOCALE__
      assert((ENCODING == __ctype_encoding_7_bit)
             || (ENCODING == __ctype_encoding_utf8));
  #endif /* __UCLIBC_HAS_LOCALE__ */
      /* If we don't have 8-bit locale support, then this is trivial since
       * anything outside of 0-0x7f is illegal in C/POSIX and UTF-8 locales. */
      /* TODO: need unsigned version of wint_t... */
      /* The (c >= 0) test matters only where wint_t is a signed type. */
      return ((c >= 0) && (c < 0x80)) ? (int) c : EOF;
  #endif /* __CTYPE_HAS_8_BIT_LOCALES */
  }
  211. #endif
  212. /**********************************************************************/
  213. #ifdef L_mbsinit
  214. int mbsinit(const mbstate_t *ps)
  215. {
  216. return !ps || !ps->__mask;
  217. }
  218. #endif
  219. /**********************************************************************/
  220. #ifdef L_mbrlen
  /* Length in bytes of the next multibyte character in s (at most n bytes
   * examined).  This is mbrtowc() with the wide-char result discarded; when
   * ps is NULL a private static state object is used instead. */
  size_t __mbrlen(const char *__restrict s, size_t n, mbstate_t *__restrict ps)
  {
      static mbstate_t private_state;    /* Zero-initialised static storage. */
      mbstate_t *state;

      state = ps;
      if (state == NULL) {
          state = &private_state;
      }
      return mbrtowc(NULL, s, n, state);
  }
  size_t mbrlen(const char *__restrict s, size_t n, mbstate_t *__restrict ps)
      __attribute__ ((__weak__, __alias__("__mbrlen")));
  228. #endif
  229. /**********************************************************************/
  230. #ifdef L_mbrtowc
  231. size_t mbrtowc(wchar_t *__restrict pwc, const char *__restrict s,
  232. size_t n, mbstate_t *__restrict ps)
  233. {
  234. static mbstate_t mbstate; /* Rely on bss 0-init. */
  235. wchar_t wcbuf[1];
  236. const char *p;
  237. size_t r;
  238. char empty_string[1]; /* Avoid static to be fPIC friendly. */
  239. if (!ps) {
  240. ps = &mbstate;
  241. }
  242. if (!s) {
  243. pwc = (wchar_t *) s; /* NULL */
  244. empty_string[0] = 0; /* Init the empty string when necessary. */
  245. s = empty_string;
  246. n = 1;
  247. } else if (!n) {
  248. /* TODO: change error code? */
  249. return (ps->__mask && (ps->__wc == 0xffffU))
  250. ? ((size_t) -1) : ((size_t) -2);
  251. }
  252. p = s;
  253. #ifdef __CTYPE_HAS_UTF_8_LOCALES
  254. /* Need to do this here since mbsrtowcs doesn't allow incompletes. */
  255. if (ENCODING == __ctype_encoding_utf8) {
  256. if (!pwc) {
  257. pwc = wcbuf;
  258. }
  259. r = _wchar_utf8sntowcs(pwc, 1, &p, n, ps, 1);
  260. return (r == 1) ? (p-s) : r; /* Need to return 0 if nul char. */
  261. }
  262. #endif
  263. #ifdef __UCLIBC_MJN3_ONLY__
  264. #warning TODO: This adds a trailing nul!
  265. #endif /* __UCLIBC_MJN3_ONLY__ */
  266. r = __mbsnrtowcs(wcbuf, &p, SIZE_MAX, 1, ps);
  267. if (((ssize_t) r) >= 0) {
  268. if (pwc) {
  269. *pwc = *wcbuf;
  270. }
  271. }
  272. return (size_t) r;
  273. }
  274. #endif
  275. /**********************************************************************/
  276. #ifdef L_wcrtomb
  277. /* Note: We completely ignore ps in all currently supported conversions. */
  278. /* TODO: Check for valid state anyway? */
  /*
   * wcrtomb - convert one wide character to its multibyte form.
   *
   * s:  output buffer; when NULL a local buffer is used and wc is treated
   *     as the nul wide character.
   * wc: wide character to convert.
   * ps: conversion state, handed through to __wcsnrtombs().
   *
   * Returns whatever __wcsnrtombs() returns for the single character,
   * except that a result of 0 (the nul character) is reported as 1.
   */
  size_t wcrtomb(register char *__restrict s, wchar_t wc,
                 mbstate_t *__restrict ps)
  {
  #ifdef __UCLIBC_MJN3_ONLY__
  #warning TODO: Should wcsnrtombs nul-terminate unconditionally? Check glibc.
  #endif /* __UCLIBC_MJN3_ONLY__ */
      char scratch[MB_LEN_MAX];
      wchar_t single[1];
      const wchar_t *input;
      size_t written;

      if (s == NULL) {
          s = scratch;
          wc = 0;
      }

      single[0] = wc;
      input = single;

      written = __wcsnrtombs(s, &input, 1, MB_LEN_MAX, ps);
      if (written == 0) {
          return 1;
      }
      return written;
  }
  298. #endif
  299. /**********************************************************************/
  300. #ifdef L_mbsrtowcs
  /* Convert a multibyte string to a wide string.  This is __mbsnrtowcs()
   * with no limit on the number of input bytes (SIZE_MAX); a NULL ps is
   * replaced by a private static state object. */
  size_t mbsrtowcs(wchar_t *__restrict dst, const char **__restrict src,
                   size_t len, mbstate_t *__restrict ps)
  {
      static mbstate_t private_state;    /* Zero-initialised static storage. */
      mbstate_t *state;

      state = ps;
      if (state == NULL) {
          state = &private_state;
      }
      return __mbsnrtowcs(dst, src, SIZE_MAX, len, state);
  }
  308. #endif
  309. /**********************************************************************/
  310. #ifdef L_wcsrtombs
  311. /* Note: We completely ignore ps in all currently supported conversions.
  312. * TODO: Check for valid state anyway? */
  /* Convert a wide string to a multibyte string.  This is __wcsnrtombs()
   * with no limit on the number of input wide chars (SIZE_MAX); ps is
   * passed through untouched. */
  size_t wcsrtombs(char *__restrict dst, const wchar_t **__restrict src,
                   size_t len, mbstate_t *__restrict ps)
  {
      size_t produced;

      produced = __wcsnrtombs(dst, src, SIZE_MAX, len, ps);
      return produced;
  }
  318. #endif
  319. /**********************************************************************/
  320. #ifdef L__wchar_utf8sntowcs
  321. /* Define DECODER to generate a UTF-8 decoder which passes Markus Kuhn's
  322. * UTF-8-test.txt stress test.
  323. */
  324. /* #define DECODER */
  325. #ifdef DECODER
  326. #ifndef KUHN
  327. #define KUHN
  328. #endif
  329. #endif
  330. size_t _wchar_utf8sntowcs(wchar_t *__restrict pwc, size_t wn,
  331. const char **__restrict src, size_t n,
  332. mbstate_t *ps, int allow_continuation)
  333. {
  334. register const char *s;
  335. __uwchar_t mask;
  336. __uwchar_t wc;
  337. wchar_t wcbuf[1];
  338. size_t count;
  339. int incr;
  340. s = *src;
  341. assert(s != NULL);
  342. assert(ps != NULL);
  343. incr = 1;
  344. /* NOTE: The following is an AWFUL HACK! In order to support %s in
  345. * wprintf, we need to be able to compute the number of wchars needed
  346. * for the mbs conversion, not to exceed the precision specified.
  347. * But if dst is NULL, the return value is the length assuming a
  348. * sufficiently sized buffer. So, we allow passing of (wchar_t *) ps
  349. * as pwc in order to flag that we really want the length, subject
  350. * to the restricted buffer size and no partial conversions.
  351. * See mbsnrtowcs() as well. */
  352. if (!pwc || (pwc == ((wchar_t *)ps))) {
  353. if (!pwc) {
  354. wn = SIZE_MAX;
  355. }
  356. pwc = wcbuf;
  357. incr = 0;
  358. }
  359. /* This is really here only to support the glibc extension function
  360. * __mbsnrtowcs which apparently returns 0 if wn == 0 without any
  361. * check on the validity of the mbstate. */
  362. if (!(count = wn)) {
  363. return 0;
  364. }
  365. if ((mask = (__uwchar_t) ps->__mask) != 0) { /* A continuation... */
  366. #ifdef DECODER
  367. wc = (__uwchar_t) ps->__wc;
  368. if (n) {
  369. goto CONTINUE;
  370. }
  371. goto DONE;
  372. #else
  373. if ((wc = (__uwchar_t) ps->__wc) != 0xffffU) {
  374. /* TODO: change error code here and below? */
  375. if (n) {
  376. goto CONTINUE;
  377. }
  378. goto DONE;
  379. }
  380. __set_errno(EILSEQ);
  381. return (size_t) -1; /* We're in an error state. */
  382. #endif
  383. }
  384. do {
  385. if (!n) {
  386. goto DONE;
  387. }
  388. --n;
  389. if ((wc = ((unsigned char) *s++)) >= 0x80) { /* Not ASCII... */
  390. mask = 0x40;
  391. #ifdef __UCLIBC_MJN3_ONLY__
  392. #warning TODO: Fix range for 16 bit wchar_t case.
  393. #endif
  394. if ( ((unsigned char)(s[-1] - 0xc0)) < (0xfe - 0xc0) ) {
  395. goto START;
  396. }
  397. BAD:
  398. #ifdef DECODER
  399. wc = 0xfffdU;
  400. goto COMPLETE;
  401. #else
  402. ps->__mask = mask;
  403. ps->__wc = 0xffffU;
  404. __set_errno(EILSEQ);
  405. return (size_t) -1; /* Illegal start byte! */
  406. #endif
  407. CONTINUE:
  408. while (n) {
  409. --n;
  410. if ((*s & 0xc0) != 0x80) {
  411. goto BAD;
  412. }
  413. mask <<= 5;
  414. wc <<= 6;
  415. wc += (*s & 0x3f); /* keep seperate for bcc (smaller code) */
  416. ++s;
  417. START:
  418. wc &= ~(mask << 1);
  419. if ((wc & mask) == 0) { /* Character completed. */
  420. if ((mask >>= 5) == 0x40) {
  421. mask += mask;
  422. }
  423. /* Check for invalid sequences (longer than necessary)
  424. * and invalid chars. */
  425. if ( (wc < mask) /* Sequence not minimal length. */
  426. #ifdef KUHN
  427. #if UTF_8_MAX_LEN == 3
  428. #error broken since mask can overflow!!
  429. /* For plane 0, these are the only defined values.*/
  430. || (wc > 0xfffdU)
  431. #else
  432. /* Note that we don't need to worry about exceeding */
  433. /* 31 bits as that is the most that UTF-8 provides. */
  434. || ( ((__uwchar_t)(wc - 0xfffeU)) < 2)
  435. #endif
  436. || ( ((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U) )
  437. #endif /* KUHN */
  438. ) {
  439. goto BAD;
  440. }
  441. goto COMPLETE;
  442. }
  443. }
  444. /* Character potentially valid but incomplete. */
  445. if (!allow_continuation) {
  446. if (count != wn) {
  447. return 0;
  448. }
  449. /* NOTE: The following can fail if you allow and then disallow
  450. * continuation!!! */
  451. #if UTF_8_MAX_LEN == 3
  452. #error broken since mask can overflow!!
  453. #endif
  454. /* Need to back up... */
  455. do {
  456. --s;
  457. } while ((mask >>= 5) >= 0x40);
  458. goto DONE;
  459. }
  460. ps->__mask = (wchar_t) mask;
  461. ps->__wc = (wchar_t) wc;
  462. *src = s;
  463. return (size_t) -2;
  464. }
  465. COMPLETE:
  466. *pwc = wc;
  467. pwc += incr;
  468. }
  469. #ifdef DECODER
  470. while (--count);
  471. #else
  472. while (wc && --count);
  473. if (!wc) {
  474. s = NULL;
  475. }
  476. #endif
  477. DONE:
  478. /* ps->__wc is irrelavent here. */
  479. ps->__mask = 0;
  480. if (pwc != wcbuf) {
  481. *src = s;
  482. }
  483. return wn - count;
  484. }
  485. #endif
  486. /**********************************************************************/
  487. #ifdef L__wchar_wcsntoutf8s
  488. size_t _wchar_wcsntoutf8s(char *__restrict s, size_t n,
  489. const wchar_t **__restrict src, size_t wn)
  490. {
  491. register char *p;
  492. size_t len, t;
  493. __uwchar_t wc;
  494. const __uwchar_t *swc;
  495. int store;
  496. char buf[MB_LEN_MAX];
  497. char m;
  498. store = 1;
  499. /* NOTE: The following is an AWFUL HACK! In order to support %ls in
  500. * printf, we need to be able to compute the number of bytes needed
  501. * for the mbs conversion, not to exceed the precision specified.
  502. * But if dst is NULL, the return value is the length assuming a
  503. * sufficiently sized buffer. So, we allow passing of (char *) src
  504. * as dst in order to flag that we really want the length, subject
  505. * to the restricted buffer size and no partial conversions.
  506. * See wcsnrtombs() as well. */
  507. if (!s || (s == ((char *) src))) {
  508. if (!s) {
  509. n = SIZE_MAX;
  510. }
  511. s = buf;
  512. store = 0;
  513. }
  514. t = n;
  515. swc = (const __uwchar_t *) *src;
  516. assert(swc != NULL);
  517. while (wn && t) {
  518. wc = *swc;
  519. *s = wc;
  520. len = 1;
  521. if (wc >= 0x80) {
  522. #ifdef KUHN
  523. if (
  524. #if UTF_8_MAX_LEN == 3
  525. /* For plane 0, these are the only defined values.*/
  526. /* Note that we don't need to worry about exceeding */
  527. /* 31 bits as that is the most that UTF-8 provides. */
  528. (wc > 0xfffdU)
  529. #else
  530. /* UTF_8_MAX_LEN == 6 */
  531. (wc > 0x7fffffffUL)
  532. || ( ((__uwchar_t)(wc - 0xfffeU)) < 2)
  533. #endif
  534. || ( ((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U) )
  535. ) {
  536. __set_errno(EILSEQ);
  537. return (size_t) -1;
  538. }
  539. #else /* KUHN */
  540. #if UTF_8_MAX_LEN != 3
  541. if (wc > 0x7fffffffUL) { /* Value too large. */
  542. __set_errno(EILSEQ);
  543. return (size_t) -1;
  544. }
  545. #endif
  546. #endif /* KUHN */
  547. wc >>= 1;
  548. p = s;
  549. do {
  550. ++p;
  551. } while (wc >>= 5);
  552. wc = *swc;
  553. if ((len = p - s) > t) { /* Not enough space. */
  554. break;
  555. }
  556. m = 0x80;
  557. while( p>s ) {
  558. m = (m >> 1) | 0x80;
  559. *--p = (wc & 0x3f) | 0x80;
  560. wc >>= 6;
  561. }
  562. *s |= (m << 1);
  563. } else if (wc == 0) { /* End of string. */
  564. swc = NULL;
  565. break;
  566. }
  567. ++swc;
  568. --wn;
  569. t -= len;
  570. if (store) {
  571. s += len;
  572. }
  573. }
  574. if (store) {
  575. *src = (const wchar_t *) swc;
  576. }
  577. return n - t;
  578. }
  579. #endif
  580. /**********************************************************************/
  581. #ifdef L___mbsnrtowcs
  582. /* WARNING: We treat len as SIZE_MAX when dst is NULL! */
  /*
   * __mbsnrtowcs - convert at most NMC bytes of a multibyte string into at
   * most len wide characters.
   *
   * dst: output buffer.  NULL means "count only" with len forced to
   *      SIZE_MAX; passing (wchar_t *) ps means "count only" but still
   *      honour len.  In both counting modes *src is not updated.
   * src: in/out pointer to the input bytes.
   * NMC: maximum number of input bytes to consume.
   * len: maximum number of wide chars to produce.
   * ps:  conversion state; NULL selects a private static state.
   *
   * UTF-8 locales are delegated to _wchar_utf8sntowcs(), with a result of
   * (size_t)-2 reported as 0.  All other encodings are single-byte.
   * Returns the number of wide chars converted, not counting a terminating
   * nul (after which the source pointer becomes NULL), or (size_t)-1 with
   * errno set to EILSEQ for an unconvertible byte.
   */
  size_t __mbsnrtowcs(wchar_t *__restrict dst, const char **__restrict src,
                      size_t NMC, size_t len, mbstate_t *__restrict ps)
  {
      static mbstate_t private_state;    /* Zero-initialised static storage. */
      wchar_t discard[1];
      const char *in;
      size_t left;
      int step;

      if (ps == NULL) {
          ps = &private_state;
      }

  #ifdef __CTYPE_HAS_UTF_8_LOCALES
      if (ENCODING == __ctype_encoding_utf8) {
          size_t r;

          r = _wchar_utf8sntowcs(dst, len, src, NMC, ps, 1);
          if (r == (size_t) -2) {
              return 0;
          }
          return r;
      }
  #endif

      step = 1;
      /* NOTE: AWFUL HACK!  To support %s in wprintf the caller must be able
       * to ask for the wchar count limited by a precision.  A NULL dst
       * already means "length with unlimited buffer", so (wchar_t *) ps is
       * accepted as a second flag value meaning "length only, but respect
       * len and do no partial conversions".
       * See _wchar_utf8sntowcs() as well. */
      if ((dst == NULL) || (dst == ((wchar_t *) ps))) {
          if (dst == NULL) {
              len = SIZE_MAX;
          }
          dst = discard;
          step = 0;
      }

      /* Every remaining encoding is single-byte, so one input byte gives
       * at most one wide char. */
      if (NMC < len) {
          len = NMC;
      }

      left = len;
      in = *src;

  #ifdef __CTYPE_HAS_8_BIT_LOCALES
      if (ENCODING == __ctype_encoding_8_bit) {
          wchar_t wc;

          for ( ; left; dst += step, ++in, --left) {
              wc = (unsigned char) *in;
              if (wc >= 0x80) {    /* Non-ASCII: look it up in the locale. */
                  wc -= 0x80;
                  wc = __UCLIBC_CURLOCALE_DATA.tbl8c2wc[
                          (__UCLIBC_CURLOCALE_DATA.idx8c2wc[wc >> Cc2wc_IDX_SHIFT]
                           << Cc2wc_IDX_SHIFT) + (wc & (Cc2wc_ROW_LEN - 1))];
                  if (wc == 0) {
                      goto BAD;
                  }
              }
              *dst = wc;
              if (wc == 0) {    /* Terminating nul reached. */
                  in = NULL;
                  break;
              }
          }
          if (dst != discard) {
              *src = in;
          }
          return len - left;
      }
  #endif

  #ifdef __UCLIBC_HAS_LOCALE__
      assert(ENCODING == __ctype_encoding_7_bit);
  #endif

      while (left) {
          *dst = (unsigned char) *in;
          if (*dst == 0) {    /* Terminating nul reached. */
              in = NULL;
              break;
          }
          if (*dst >= 0x80) {
  #ifdef __CTYPE_HAS_8_BIT_LOCALES
          BAD:
  #endif
              __set_errno(EILSEQ);
              return (size_t) -1;
          }
          ++in;
          dst += step;
          --left;
      }
      if (dst != discard) {
          *src = in;
      }
      return len - left;
  }
  size_t mbsnrtowcs(wchar_t *__restrict dst, const char **__restrict src,
                    size_t NMC, size_t len, mbstate_t *__restrict ps)
      __attribute__ ((__weak__, __alias__("__mbsnrtowcs")));
  677. #endif
  678. /**********************************************************************/
  679. #ifdef L___wcsnrtombs
  680. /* WARNING: We treat len as SIZE_MAX when dst is NULL! */
  681. /* Note: We completely ignore ps in all currently supported conversions.
  682. * TODO: Check for valid state anyway? */
  683. size_t __wcsnrtombs(char *__restrict dst, const wchar_t **__restrict src,
  684. size_t NWC, size_t len, mbstate_t *__restrict ps)
  685. {
  686. const __uwchar_t *s;
  687. size_t count;
  688. int incr;
  689. char buf[MB_LEN_MAX];
  690. #ifdef __CTYPE_HAS_UTF_8_LOCALES
  691. if (ENCODING == __ctype_encoding_utf8) {
  692. return _wchar_wcsntoutf8s(dst, len, src, NWC);
  693. }
  694. #endif /* __CTYPE_HAS_UTF_8_LOCALES */
  695. incr = 1;
  696. /* NOTE: The following is an AWFUL HACK! In order to support %ls in
  697. * printf, we need to be able to compute the number of bytes needed
  698. * for the mbs conversion, not to exceed the precision specified.
  699. * But if dst is NULL, the return value is the length assuming a
  700. * sufficiently sized buffer. So, we allow passing of (char *) src
  701. * as dst in order to flag that we really want the length, subject
  702. * to the restricted buffer size and no partial conversions.
  703. * See _wchar_wcsntoutf8s() as well. */
  704. if (!dst || (dst == ((char *) src))) {
  705. if (!dst) {
  706. len = SIZE_MAX;
  707. }
  708. dst = buf;
  709. incr = 0;
  710. }
  711. /* Since all the following encodings are single-byte encodings... */
  712. if (len > NWC) {
  713. len = NWC;
  714. }
  715. count = len;
  716. s = (const __uwchar_t *) *src;
  717. #ifdef __CTYPE_HAS_8_BIT_LOCALES
  718. if (ENCODING == __ctype_encoding_8_bit) {
  719. __uwchar_t wc;
  720. __uwchar_t u;
  721. while (count) {
  722. if ((wc = *s) <= 0x7f) {
  723. if (!(*dst = (unsigned char) wc)) {
  724. s = NULL;
  725. break;
  726. }
  727. } else {
  728. u = 0;
  729. if (wc <= Cwc2c_DOMAIN_MAX) {
  730. u = __UCLIBC_CURLOCALE_DATA.idx8wc2c[wc >> (Cwc2c_TI_SHIFT
  731. + Cwc2c_TT_SHIFT)];
  732. u = __UCLIBC_CURLOCALE_DATA.tbl8wc2c[(u << Cwc2c_TI_SHIFT)
  733. + ((wc >> Cwc2c_TT_SHIFT)
  734. & ((1 << Cwc2c_TI_SHIFT)-1))];
  735. u = __UCLIBC_CURLOCALE_DATA.tbl8wc2c[Cwc2c_TI_LEN
  736. + (u << Cwc2c_TT_SHIFT)
  737. + (wc & ((1 << Cwc2c_TT_SHIFT)-1))];
  738. }
  739. #define __WCHAR_REPLACEMENT_CHAR '?'
  740. #ifdef __WCHAR_REPLACEMENT_CHAR
  741. *dst = (unsigned char) ( u ? u : __WCHAR_REPLACEMENT_CHAR );
  742. #else /* __WCHAR_REPLACEMENT_CHAR */
  743. if (!u) {
  744. goto BAD;
  745. }
  746. *dst = (unsigned char) u;
  747. #endif /* __WCHAR_REPLACEMENT_CHAR */
  748. }
  749. ++s;
  750. dst += incr;
  751. --count;
  752. }
  753. if (dst != buf) {
  754. *src = (const wchar_t *) s;
  755. }
  756. return len - count;
  757. }
  758. #endif /* __CTYPE_HAS_8_BIT_LOCALES */
  759. #ifdef __UCLIBC_HAS_LOCALE__
  760. assert(ENCODING == __ctype_encoding_7_bit);
  761. #endif
  762. while (count) {
  763. if (*s >= 0x80) {
  764. #if defined(__CTYPE_HAS_8_BIT_LOCALES) && !defined(__WCHAR_REPLACEMENT_CHAR)
  765. BAD:
  766. #endif
  767. __set_errno(EILSEQ);
  768. return (size_t) -1;
  769. }
  770. if ((*dst = (unsigned char) *s) == 0) {
  771. s = NULL;
  772. break;
  773. }
  774. ++s;
  775. dst += incr;
  776. --count;
  777. }
  778. if (dst != buf) {
  779. *src = (const wchar_t *) s;
  780. }
  781. return len - count;
  782. }
  /* Public wcsnrtombs() symbol.  It is not a separate implementation: the
   * attributes below declare it as a weak alias of __wcsnrtombs, so both
   * names resolve to the same function body. */
  size_t wcsnrtombs(char *__restrict dst, const wchar_t **__restrict src,
  size_t NWC, size_t len, mbstate_t *__restrict ps)
  __attribute__ ((__weak__, __alias__("__wcsnrtombs")));
  786. #endif
  787. /**********************************************************************/
  788. #ifdef L_wcswidth
  789. #ifdef __UCLIBC_MJN3_ONLY__
  790. #warning REMINDER: If we start doing translit, wcwidth and wcswidth will need updating.
  791. #warning TODO: Update wcwidth to match latest by Kuhn.
  792. #endif
  793. #if defined(__UCLIBC_HAS_LOCALE__) && \
  794. ( defined(__CTYPE_HAS_8_BIT_LOCALES) || defined(__CTYPE_HAS_UTF_8_LOCALES) )
  /* Page index for the wcswidth() width lookup of wide characters in the
   * range 0x0100..0xffff.  It is indexed by the high byte of the character
   * (wc >> 8): entries [h] and [h+1] are the lower and upper bounds of the
   * slice of new_tbl[] / new_wtbl[] that wcswidth() binary-searches for that
   * 256-character page.  There are 257 entries so that [h+1] is valid for
   * h == 0xff. */
  static const unsigned char new_idx[] = {
  0, 5, 5, 6, 10, 15, 28, 39,
  48, 48, 71, 94, 113, 128, 139, 154,
  175, 186, 188, 188, 188, 188, 188, 188,
  203, 208, 208, 208, 208, 208, 208, 208,
  208, 219, 219, 219, 222, 222, 222, 222,
  222, 222, 222, 222, 222, 222, 222, 224,
  224, 231, 231, 231, 231, 231, 231, 231,
  231, 231, 231, 231, 231, 231, 231, 231,
  231, 231, 231, 231, 231, 231, 231, 231,
  231, 231, 231, 231, 231, 231, 231, 231,
  231, 231, 231, 231, 231, 231, 231, 231,
  231, 231, 231, 231, 231, 231, 231, 231,
  231, 231, 231, 231, 231, 231, 231, 231,
  231, 231, 231, 231, 231, 231, 231, 231,
  231, 231, 231, 231, 231, 231, 231, 231,
  231, 231, 231, 231, 231, 231, 231, 231,
  231, 231, 231, 231, 231, 231, 231, 231,
  231, 231, 231, 231, 231, 231, 231, 231,
  231, 231, 231, 231, 231, 231, 231, 231,
  231, 231, 231, 231, 231, 231, 231, 231,
  231, 231, 231, 231, 231, 233, 233, 233,
  233, 233, 233, 233, 234, 234, 234, 234,
  234, 234, 234, 234, 234, 234, 234, 234,
  234, 234, 234, 234, 234, 234, 234, 234,
  234, 234, 234, 234, 234, 234, 234, 234,
  234, 234, 234, 234, 234, 234, 234, 234,
  234, 234, 234, 234, 234, 234, 234, 234,
  236, 236, 236, 236, 236, 236, 236, 236,
  236, 236, 236, 236, 236, 236, 236, 236,
  236, 236, 236, 236, 236, 236, 236, 236,
  236, 236, 236, 236, 236, 236, 236, 236,
  236, 237, 237, 238, 241, 241, 242, 249,
  255,
  };
  /* Low-byte breakpoints for the wcswidth() width lookup.  For a page
   * selected through new_idx[], wcswidth() binary-searches the matching
   * slice of this table for the last entry that is <= (wc & 0xff); the
   * position found is then used as the index into new_wtbl[]. */
  static const unsigned char new_tbl[] = {
  0x00, 0x01, 0x20, 0x7f, 0xa0, 0x00, 0x00, 0x50,
  0x60, 0x70, 0x00, 0x83, 0x87, 0x88, 0x8a, 0x00,
  0x91, 0xa2, 0xa3, 0xba, 0xbb, 0xbe, 0xbf, 0xc0,
  0xc1, 0xc3, 0xc4, 0xc5, 0x00, 0x4b, 0x56, 0x70,
  0x71, 0xd6, 0xe5, 0xe7, 0xe9, 0xea, 0xee, 0x00,
  0x0f, 0x10, 0x11, 0x12, 0x30, 0x4b, 0xa6, 0xb1,
  0x00, 0x01, 0x03, 0x3c, 0x3d, 0x41, 0x49, 0x4d,
  0x4e, 0x51, 0x55, 0x62, 0x64, 0x81, 0x82, 0xbc,
  0xbd, 0xc1, 0xc5, 0xcd, 0xce, 0xe2, 0xe4, 0x00,
  0x02, 0x03, 0x3c, 0x3d, 0x41, 0x43, 0x47, 0x49,
  0x4b, 0x4e, 0x70, 0x72, 0x81, 0x83, 0xbc, 0xbd,
  0xc1, 0xc6, 0xc7, 0xc9, 0xcd, 0xce, 0x00, 0x01,
  0x02, 0x3c, 0x3d, 0x3f, 0x40, 0x41, 0x44, 0x4d,
  0x4e, 0x56, 0x57, 0x82, 0x83, 0xc0, 0xc1, 0xcd,
  0xce, 0x00, 0x3e, 0x41, 0x46, 0x49, 0x4a, 0x4e,
  0x55, 0x57, 0xbf, 0xc0, 0xc6, 0xc7, 0xcc, 0xce,
  0x00, 0x41, 0x44, 0x4d, 0x4e, 0xca, 0xcb, 0xd2,
  0xd5, 0xd6, 0xd7, 0x00, 0x31, 0x32, 0x34, 0x3b,
  0x47, 0x4f, 0xb1, 0xb2, 0xb4, 0xba, 0xbb, 0xbd,
  0xc8, 0xce, 0x00, 0x18, 0x1a, 0x35, 0x36, 0x37,
  0x38, 0x39, 0x3a, 0x71, 0x7f, 0x80, 0x85, 0x86,
  0x88, 0x90, 0x98, 0x99, 0xbd, 0xc6, 0xc7, 0x00,
  0x2d, 0x31, 0x32, 0x33, 0x36, 0x38, 0x39, 0x3a,
  0x58, 0x5a, 0x00, 0x60, 0x00, 0x12, 0x15, 0x32,
  0x35, 0x52, 0x54, 0x72, 0x74, 0xb7, 0xbe, 0xc6,
  0xc7, 0xc9, 0xd4, 0x00, 0x0b, 0x0f, 0xa9, 0xaa,
  0x00, 0x0b, 0x10, 0x2a, 0x2f, 0x60, 0x64, 0x6a,
  0x70, 0xd0, 0xeb, 0x00, 0x29, 0x2b, 0x00, 0x80,
  0x00, 0x2a, 0x30, 0x3f, 0x40, 0x99, 0x9b, 0x00,
  0xd0, 0x00, 0x00, 0xa4, 0x00, 0x00, 0x00, 0x1e,
  0x1f, 0x00, 0x00, 0x10, 0x20, 0x24, 0x30, 0x70,
  0xff, 0x00, 0x61, 0xe0, 0xe7, 0xf9, 0xfc,
  };
  /* Column widths, parallel to new_tbl[]: once wcswidth() has located an
   * interval in new_tbl[], the entry at the same position here is added to
   * the running column count.
   * NOTE(review): the leading entries (including the -1 values) appear to
   * describe page 0, which wcswidth() never looks up because it handles
   * wc <= 0xff before consulting these tables -- confirm against the table
   * generator. */
  static const signed char new_wtbl[] = {
  0, -1, 1, -1, 1, 1, 0, 1,
  0, 1, 1, 0, 1, 0, 1, 1,
  0, 1, 0, 1, 0, 1, 0, 1,
  0, 1, 0, 1, 1, 0, 1, 0,
  1, 0, 1, 0, 1, 0, 1, 1,
  0, 1, 0, 1, 0, 1, 0, 1,
  1, 0, 1, 0, 1, 0, 1, 0,
  1, 0, 1, 0, 1, 0, 1, 0,
  1, 0, 1, 0, 1, 0, 1, 1,
  0, 1, 0, 1, 0, 1, 0, 1,
  0, 1, 0, 1, 0, 1, 0, 1,
  0, 1, 0, 1, 0, 1, 1, 0,
  1, 0, 1, 0, 1, 0, 1, 0,
  1, 0, 1, 0, 1, 0, 1, 0,
  1, 1, 0, 1, 0, 1, 0, 1,
  0, 1, 0, 1, 0, 1, 0, 1,
  1, 0, 1, 0, 1, 0, 1, 0,
  1, 0, 1, 1, 0, 1, 0, 1,
  0, 1, 0, 1, 0, 1, 0, 1,
  0, 1, 1, 0, 1, 0, 1, 0,
  1, 0, 1, 0, 1, 0, 1, 0,
  1, 0, 1, 0, 1, 0, 1, 1,
  0, 1, 0, 1, 0, 1, 0, 1,
  0, 1, 2, 0, 1, 0, 1, 0,
  1, 0, 1, 0, 1, 0, 1, 0,
  1, 0, 1, 1, 0, 1, 0, 1,
  1, 0, 1, 0, 1, 0, 1, 0,
  1, 0, 1, 1, 2, 1, 1, 2,
  2, 0, 2, 1, 2, 0, 2, 2,
  1, 1, 2, 1, 1, 2, 1, 0,
  1, 1, 0, 1, 0, 1, 2, 1,
  0, 2, 1, 2, 1, 0, 1,
  };
  898. int wcswidth(const wchar_t *pwcs, size_t n)
  899. {
  900. int h, l, m, count;
  901. wchar_t wc;
  902. unsigned char b;
  903. if (ENCODING == __ctype_encoding_7_bit) {
  904. size_t i;
  905. for (i = 0 ; (i < n) && pwcs[i] ; i++) {
  906. if (pwcs[i] != ((unsigned char)(pwcs[i]))) {
  907. return -1;
  908. }
  909. }
  910. }
  911. #ifdef __CTYPE_HAS_8_BIT_LOCALES
  912. else if (ENCODING == __ctype_encoding_8_bit) {
  913. mbstate_t mbstate;
  914. mbstate.__mask = 0; /* Initialize the mbstate. */
  915. if (__wcsnrtombs(NULL, &pwcs, n, SIZE_MAX, &mbstate) == ((size_t) - 1)) {
  916. return -1;
  917. }
  918. }
  919. #endif /* __CTYPE_HAS_8_BIT_LOCALES */
  920. #if defined(__CTYPE_HAS_UTF_8_LOCALES) && defined(KUHN)
  921. /* For stricter handling of allowed unicode values... see comments above. */
  922. else if (ENCODING == __ctype_encoding_utf8) {
  923. size_t i;
  924. for (i = 0 ; (i < n) && pwcs[i] ; i++) {
  925. if ( (((__uwchar_t)((pwcs[i]) - 0xfffeU)) < 2)
  926. || (((__uwchar_t)((pwcs[i]) - 0xd800U)) < (0xe000U - 0xd800U))
  927. ) {
  928. return -1;
  929. }
  930. }
  931. }
  932. #endif /* __CTYPE_HAS_UTF_8_LOCALES */
  933. for (count = 0 ; n && (wc = *pwcs++) ; n--) {
  934. if (wc <= 0xff) {
  935. /* If we're here, wc != 0. */
  936. if ((wc < 32) || ((wc >= 0x7f) && (wc < 0xa0))) {
  937. return -1;
  938. }
  939. ++count;
  940. continue;
  941. }
  942. if (((unsigned int) wc) <= 0xffff) {
  943. b = wc & 0xff;
  944. h = (wc >> 8);
  945. l = new_idx[h];
  946. h = new_idx[h+1];
  947. while ((m = (l+h) >> 1) != l) {
  948. if (b >= new_tbl[m]) {
  949. l = m;
  950. } else { /* wc < tbl[m] */
  951. h = m;
  952. }
  953. }
  954. count += new_wtbl[l]; /* none should be -1. */
  955. continue;
  956. }
  957. /* Redo this to minimize average number of compares?*/
  958. if (wc >= 0x1d167) {
  959. if (wc <= 0x1d1ad) {
  960. if ((wc <= 0x1d169
  961. || (wc >= 0x1d173
  962. && (wc <= 0x1d182
  963. || (wc >= 0x1d185
  964. && (wc <= 0x1d18b
  965. || (wc >= 0x1d1aa))))))
  966. ) {
  967. continue;
  968. }
  969. } else if (((wc >= 0xe0020) && (wc <= 0xe007f)) || (wc == 0xe0001)) {
  970. continue;
  971. } else if ((wc >= 0x20000) && (wc <= 0x2ffff)) {
  972. ++count; /* need 2.. add one here */
  973. }
  974. #if (WCHAR_MAX > 0x7fffffffL)
  975. else if (wc > 0x7fffffffL) {
  976. return -1;
  977. }
  978. #endif /* (WCHAR_MAX > 0x7fffffffL) */
  979. }
  980. ++count;
  981. }
  982. return count;
  983. }
  984. #else /* __UCLIBC_HAS_LOCALE__ */
  /*
   * wcswidth -- column count of a wide string when no locale support is
   * built in.
   *
   * Looks at no more than n characters of pwcs and stops at the first
   * L'\0'.  Every printable character in 0x20..0x7e and 0xa0..0xff counts
   * as one column.  Control characters, DEL, the 0x80..0x9f range and
   * anything above 0xff make the whole call return -1.
   */
  int wcswidth(const wchar_t *pwcs, size_t n)
  {
      int columns = 0;
      size_t pos;

      for (pos = 0; pos < n; ++pos) {
          const wchar_t ch = pwcs[pos];

          if (ch == 0) {
              break;
          }
          /* Only the Latin-1 range is understood here. */
          if (ch > 0xff) {
              return -1;
          }
          /* C0 controls (and negative values), DEL and C1 controls. */
          if ((ch < 32) || ((ch >= 0x7f) && (ch < 0xa0))) {
              return -1;
          }
          ++columns;
      }

      return columns;
  }
  1003. #endif /* __UCLIBC_HAS_LOCALE__ */
  1004. #endif
  1005. /**********************************************************************/
  1006. #ifdef L_wcwidth
  /* wcwidth -- column width of a single wide character.  Implemented by
   * measuring a one-character span with wcswidth(), so the result (including
   * -1 for rejected characters and 0 for L'\0') is whatever wcswidth()
   * reports for that character. */
  int wcwidth(wchar_t wc)
  {
      const wchar_t *single = &wc;

      return wcswidth(single, 1);
  }
  1011. #endif
  1012. /**********************************************************************/
  /* Conversion descriptor behind an iconv_t handle.  iconv_open() allocates
   * and fills it, iconv() updates it while converting, and iconv_close()
   * frees it.  The "...0" members keep the values chosen at open time so
   * that iconv() can restore them when asked to reset the conversion. */
  typedef struct {
  mbstate_t tostate; /* output-side shift state; __mask is zeroed on open/reset */
  mbstate_t fromstate; /* input-side shift state; handed to the UTF-8 decoder */
  int tocodeset; /* target codeset id as returned by find_codeset() */
  int fromcodeset; /* current source codeset id; bit 0 is toggled when a byte-swapped BOM is seen */
  int frombom; /* nonzero while a leading BOM may still be consumed from the input */
  int tobom; /* nonzero while a BOM still has to be written to the output */
  int fromcodeset0; /* source codeset id at open time (restored on reset) */
  int frombom0; /* initial frombom value (restored on reset) */
  int tobom0; /* initial tobom value (restored on reset) */
  int skip_invalid_input; /* To support iconv -c option: 0 = off, 1 = skip illegal input, 2 = something was skipped. */
  } _UC_iconv_t;
  1025. #ifdef L_iconv
  1026. #include <iconv.h>
  1027. #include <string.h>
  1028. #include <endian.h>
  1029. #include <byteswap.h>
  1030. #if (__BYTE_ORDER != __BIG_ENDIAN) && (__BYTE_ORDER != __LITTLE_ENDIAN)
  1031. #error unsupported endianness for iconv
  1032. #endif
  1033. #ifndef __CTYPE_HAS_8_BIT_LOCALES
  1034. #error currently iconv requires 8 bit locales
  1035. #endif
  1036. #ifndef __CTYPE_HAS_UTF_8_LOCALES
  1037. #error currently iconv requires UTF-8 locales
  1038. #endif
  /* Codeset ids used by iconv_open() and iconv().
   * Ids >= IC_MULTIBYTE denote fixed-width 2- or 4-byte encodings; iconv()
   * tests the low bits of such an id for byte order (bit 0) and unit size
   * (ids & 6), and iconv_open() reads bit 0x10 as the "BOM" flag.
   * The UCS/UTF constants depend on the byte order of the build target.
   * Small ids are the byte-oriented encodings: 1 is ASCII, 2 is UTF-8, and
   * find_codeset() hands out ids >= 3 for the 8-bit locale codesets. */
  enum {
  IC_WCHAR_T = 0xe0, /* native wchar_t; iconv() treats it as 4 bytes wide */
  IC_MULTIBYTE = 0xe0, /* threshold: ids at or above this are multibyte */
  #if __BYTE_ORDER == __BIG_ENDIAN
  IC_UCS_4 = 0xec,
  IC_UTF_32 = 0xe4,
  IC_UCS_2 = 0xe2,
  IC_UTF_16 = 0xea,
  #else
  IC_UCS_4 = 0xed,
  IC_UTF_32 = 0xe5,
  IC_UCS_2 = 0xe3,
  IC_UTF_16 = 0xeb,
  #endif
  IC_UTF_8 = 2,
  IC_ASCII = 1
  };
  /* For the multibyte codesets (ids >= 0xe0), the low bits of the id mean:
   *   bit 0 means swap endian
   *   bit 1 means 2 byte
   *   bit 2 means 4 byte
   *   bit 4 (0x10) means the codeset uses a BOM (tested by iconv_open)
   */
  /* Table of the built-in iconv codeset names.
   *
   * The table is a sequence of variable-length records packed into one
   * string.  Each record is:
   *     byte 0   total record length (header + name + terminating NUL),
   *              used by readers as the stride to the next record;
   *     byte 1   codeset id (see the IC_* enum);
   *     byte 2.. NUL-terminated codeset name.
   * Readers walk it with "s += *s" until they hit a zero length byte, so
   * every length byte must match its record exactly.  The final record
   * omits its own NUL and relies on the string literal's implicit
   * terminator, which also serves as the end-of-table marker.
   *
   * Each hex escape is closed by splitting the literal ("\x0a\xe0""WCHAR_T")
   * so that following letters are not absorbed into the escape. */
  const unsigned char __iconv_codesets[] =
      "\x0a\xe0""WCHAR_T\x00"     /* superset of UCS-4 but platform-endian */
  #if __BYTE_ORDER == __BIG_ENDIAN
      "\x08\xec""UCS-4\x00"       /* always BE */
      "\x0a\xec""UCS-4BE\x00"
      "\x0a\xed""UCS-4LE\x00"
      "\x09\xe4""UTF-32\x00"      /* platform endian with BOM */
      "\x0b\xe4""UTF-32BE\x00"
      "\x0b\xe5""UTF-32LE\x00"
      "\x08\xe2""UCS-2\x00"       /* always BE */
      "\x0a\xe2""UCS-2BE\x00"
      "\x0a\xe3""UCS-2LE\x00"
      "\x09\xea""UTF-16\x00"      /* platform endian with BOM */
      "\x0b\xea""UTF-16BE\x00"
      "\x0b\xeb""UTF-16LE\x00"
  #elif __BYTE_ORDER == __LITTLE_ENDIAN
      "\x08\xed""UCS-4\x00"       /* always BE */
      "\x0a\xed""UCS-4BE\x00"
      "\x0a\xec""UCS-4LE\x00"
      "\x09\xf4""UTF-32\x00"      /* platform endian with BOM */
      "\x0b\xe5""UTF-32BE\x00"
      "\x0b\xe4""UTF-32LE\x00"
      "\x08\xe3""UCS-2\x00"       /* always BE */
      "\x0a\xe3""UCS-2BE\x00"
      "\x0a\xe2""UCS-2LE\x00"
      "\x09\xfa""UTF-16\x00"      /* platform endian with BOM */
      "\x0b\xeb""UTF-16BE\x00"
      "\x0b\xea""UTF-16LE\x00"
  #endif
      "\x08\x02""UTF-8\x00"
      "\x0b\x01""US-ASCII\x00"
      "\x07\x01""ASCII";          /* Must be last! (special case to save a nul) */
  1094. static int find_codeset(const char *name)
  1095. {
  1096. const unsigned char *s;
  1097. int codeset;
  1098. for (s = __iconv_codesets ; *s ; s += *s) {
  1099. if (!strcasecmp(s+2, name)) {
  1100. return s[1];
  1101. }
  1102. }
  1103. /* The following is ripped from find_locale in locale.c. */
  1104. /* TODO: maybe CODESET_LIST + *s ??? */
  1105. /* 7bit is 1, UTF-8 is 2, 8-bit is >= 3 */
  1106. codeset = 2;
  1107. s = __LOCALE_DATA_CODESET_LIST;
  1108. do {
  1109. ++codeset; /* Increment codeset first. */
  1110. if (!strcasecmp(__LOCALE_DATA_CODESET_LIST+*s, name)) {
  1111. return codeset;
  1112. }
  1113. } while (*++s);
  1114. return 0; /* No matching codeset! */
  1115. }
  1116. iconv_t weak_function iconv_open(const char *tocode, const char *fromcode)
  1117. {
  1118. register _UC_iconv_t *px;
  1119. int tocodeset, fromcodeset;
  1120. if (((tocodeset = find_codeset(tocode)) != 0)
  1121. && ((fromcodeset = find_codeset(fromcode)) != 0)) {
  1122. if ((px = malloc(sizeof(_UC_iconv_t))) != NULL) {
  1123. px->tocodeset = tocodeset;
  1124. px->tobom0 = px->tobom = (tocodeset & 0x10) >> 4;
  1125. px->fromcodeset0 = px->fromcodeset = fromcodeset;
  1126. px->frombom0 = px->frombom = (fromcodeset & 0x10) >> 4;
  1127. px->skip_invalid_input = px->tostate.__mask
  1128. = px->fromstate.__mask = 0;
  1129. return (iconv_t) px;
  1130. }
  1131. } else {
  1132. __set_errno(EINVAL);
  1133. }
  1134. return (iconv_t)(-1);
  1135. }
  1136. int weak_function iconv_close(iconv_t cd)
  1137. {
  1138. free(cd);
  1139. return 0;
  1140. }
  1141. size_t weak_function iconv(iconv_t cd, char **__restrict inbuf,
  1142. size_t *__restrict inbytesleft,
  1143. char **__restrict outbuf,
  1144. size_t *__restrict outbytesleft)
  1145. {
  1146. _UC_iconv_t *px = (_UC_iconv_t *) cd;
  1147. size_t nrcount, r;
  1148. wchar_t wc, wc2;
  1149. int inci, inco;
  1150. assert(px != (_UC_iconv_t *)(-1));
  1151. assert(sizeof(wchar_t) == 4);
  1152. if (!inbuf || !*inbuf) { /* Need to reinitialze conversion state. */
  1153. /* Note: For shift-state encodings we possibly need to output the
  1154. * shift sequence to return to initial state! */
  1155. if ((px->fromcodeset & 0xf0) == 0xe0) {
  1156. }
  1157. px->tostate.__mask = px->fromstate.__mask = 0;
  1158. px->fromcodeset = px->fromcodeset0;
  1159. px->tobom = px->tobom0;
  1160. px->frombom = px->frombom0;
  1161. return 0;
  1162. }
  1163. nrcount = 0;
  1164. while (*inbytesleft) {
  1165. if (!*outbytesleft) {
  1166. TOO_BIG:
  1167. __set_errno(E2BIG);
  1168. return (size_t) -1;
  1169. }
  1170. inci = inco = 1;
  1171. if (px->fromcodeset >= IC_MULTIBYTE) {
  1172. inci = (px->fromcodeset == IC_WCHAR_T) ? 4: (px->fromcodeset & 6);
  1173. if (*inbytesleft < inci) goto INVALID;
  1174. wc = (((unsigned int)((unsigned char)((*inbuf)[0]))) << 8)
  1175. + ((unsigned char)((*inbuf)[1]));
  1176. if (inci == 4) {
  1177. wc = (((unsigned int)((unsigned char)((*inbuf)[2]))) << 8)
  1178. + ((unsigned char)((*inbuf)[3])) + (wc << 16);
  1179. if (!(px->fromcodeset & 1)) wc = bswap_32(wc);
  1180. } else {
  1181. if (!(px->fromcodeset & 1)) wc = bswap_16(wc);
  1182. if (((px->fromcodeset & IC_UTF_16) == IC_UTF_16)
  1183. && (((__uwchar_t)(wc - 0xd800U)) < (0xdc00U - 0xd800U))
  1184. ) { /* surrogate */
  1185. wc =- 0xd800U;
  1186. if (*inbytesleft < 4) goto INVALID;
  1187. wc2 = (((unsigned int)((unsigned char)((*inbuf)[2]))) << 8)
  1188. + ((unsigned char)((*inbuf)[3]));
  1189. if (!(px->fromcodeset & 1)) wc = bswap_16(wc2);
  1190. if (((__uwchar_t)(wc2 -= 0xdc00U)) < (0xe0000U - 0xdc00U)) {
  1191. goto ILLEGAL;
  1192. }
  1193. inci = 4; /* Change inci here in case skipping illegals. */
  1194. wc = 0x10000UL + (wc << 10) + wc2;
  1195. }
  1196. }
  1197. if (px->frombom) {
  1198. px->frombom = 0;
  1199. if ((wc == 0xfeffU)
  1200. || (wc == ((inci == 4)
  1201. ? (((wchar_t) 0xfffe0000UL))
  1202. : ((wchar_t)(0xfffeUL))))
  1203. ) {
  1204. if (wc != 0xfeffU) {
  1205. px->fromcodeset ^= 1; /* toggle endianness */
  1206. wc = 0xfeffU;
  1207. }
  1208. if (!px->frombom) {
  1209. goto BOM_SKIP_OUTPUT;
  1210. }
  1211. goto GOT_BOM;
  1212. }
  1213. }
  1214. if (px->fromcodeset != IC_WCHAR_T) {
  1215. if (((__uwchar_t) wc) > (((px->fromcodeset & IC_UCS_4) == IC_UCS_4)
  1216. ? 0x7fffffffUL : 0x10ffffUL)
  1217. #ifdef KUHN
  1218. || (((__uwchar_t)(wc - 0xfffeU)) < 2)
  1219. || (((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U))
  1220. #endif
  1221. ) {
  1222. goto ILLEGAL;
  1223. }
  1224. }
  1225. } else if (px->fromcodeset == IC_UTF_8) {
  1226. const char *p = *inbuf;
  1227. r = _wchar_utf8sntowcs(&wc, 1, &p, *inbytesleft, &px->fromstate, 0);
  1228. if (((ssize_t) r) <= 0) { /* either EILSEQ or incomplete or nul */
  1229. if (((ssize_t) r) < 0) { /* either EILSEQ or incomplete or nul */
  1230. assert((r == (size_t)(-1)) || (r == (size_t)(-2)));
  1231. if (r == (size_t)(-2)) {
  1232. INVALID:
  1233. __set_errno(EINVAL);
  1234. } else {
  1235. px->fromstate.__mask = 0;
  1236. inci = 1;
  1237. ILLEGAL:
  1238. if (px->skip_invalid_input) {
  1239. px->skip_invalid_input = 2; /* flag for iconv utility */
  1240. goto BOM_SKIP_OUTPUT;
  1241. }
  1242. __set_errno(EILSEQ);
  1243. }
  1244. return (size_t)(-1);
  1245. }
  1246. #ifdef __UCLIBC_MJN3_ONLY__
  1247. #warning TODO: optimize this.
  1248. #endif
  1249. if (p != NULL) { /* incomplete char case */
  1250. goto INVALID;
  1251. }
  1252. p = *inbuf + 1; /* nul */
  1253. }
  1254. inci = p - *inbuf;
  1255. } else if ((wc = ((unsigned char)(**inbuf))) >= 0x80) { /* Non-ASCII... */
  1256. if (px->fromcodeset == IC_ASCII) { /* US-ASCII codeset */
  1257. goto ILLEGAL;
  1258. } else { /* some other 8-bit ascii-extension codeset */
  1259. const __codeset_8_bit_t *c8b
  1260. = __locale_mmap->codeset_8_bit + px->fromcodeset - 3;
  1261. wc -= 0x80;
  1262. wc = __UCLIBC_CURLOCALE_DATA.tbl8c2wc[
  1263. (c8b->idx8c2wc[wc >> Cc2wc_IDX_SHIFT]
  1264. << Cc2wc_IDX_SHIFT) + (wc & (Cc2wc_ROW_LEN - 1))];
  1265. if (!wc) {
  1266. goto ILLEGAL;
  1267. }
  1268. }
  1269. }
  1270. if (px->tobom) {
  1271. inci = 0;
  1272. wc = 0xfeffU;
  1273. GOT_BOM:
  1274. px->tobom = 0;
  1275. }
  1276. if (px->tocodeset >= IC_MULTIBYTE) {
  1277. inco = (px->tocodeset == IC_WCHAR_T) ? 4: (px->tocodeset & 6);
  1278. if (*outbytesleft < inco) goto TOO_BIG;
  1279. if (px->tocodeset != IC_WCHAR_T) {
  1280. if (((__uwchar_t) wc) > (((px->tocodeset & IC_UCS_4) == IC_UCS_4)
  1281. ? 0x7fffffffUL : 0x10ffffUL)
  1282. #ifdef KUHN
  1283. || (((__uwchar_t)(wc - 0xfffeU)) < 2)
  1284. || (((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U))
  1285. #endif
  1286. ) {
  1287. REPLACE_32:
  1288. wc = 0xfffd;
  1289. ++nrcount;
  1290. }
  1291. }
  1292. if (inco == 4) {
  1293. if (px->tocodeset & 1) wc = bswap_32(wc);
  1294. } else {
  1295. if (((__uwchar_t)wc ) > 0xffffU) {
  1296. if ((px->tocodeset & IC_UTF_16) != IC_UTF_16) {
  1297. goto REPLACE_32;
  1298. }
  1299. if (*outbytesleft < (inco = 4)) goto TOO_BIG;
  1300. wc2 = 0xdc00U + (wc & 0x3ff);
  1301. wc = 0xd800U + ((wc >> 10) & 0x3ff);
  1302. if (px->tocodeset & 1) {
  1303. wc = bswap_16(wc);
  1304. wc2 = bswap_16(wc2);
  1305. }
  1306. wc += (wc2 << 16);
  1307. } else if (px->tocodeset & 1) wc = bswap_16(wc);
  1308. }
  1309. (*outbuf)[0] = (char)((unsigned char)(wc));
  1310. (*outbuf)[1] = (char)((unsigned char)(wc >> 8));
  1311. if (inco == 4) {
  1312. (*outbuf)[2] = (char)((unsigned char)(wc >> 16));
  1313. (*outbuf)[3] = (char)((unsigned char)(wc >> 24));
  1314. }
  1315. } else if (px->tocodeset == IC_UTF_8) {
  1316. const wchar_t *pw = &wc;
  1317. do {
  1318. r = _wchar_wcsntoutf8s(*outbuf, *outbytesleft, &pw, 1);
  1319. if (r != (size_t)(-1)) {
  1320. #ifdef __UCLIBC_MJN3_ONLY__
  1321. #warning TODO: What happens for a nul?
  1322. #endif
  1323. if (r == 0) {
  1324. if (wc != 0) {
  1325. goto TOO_BIG;
  1326. }
  1327. ++r;
  1328. }
  1329. break;
  1330. }
  1331. wc = 0xfffdU;
  1332. ++nrcount;
  1333. } while (1);
  1334. inco = r;
  1335. } else if (((__uwchar_t)(wc)) < 0x80) {
  1336. CHAR_GOOD:
  1337. **outbuf = wc;
  1338. } else {
  1339. if ((px->tocodeset != 0x01) && (wc <= Cwc2c_DOMAIN_MAX)) {
  1340. const __codeset_8_bit_t *c8b
  1341. = __locale_mmap->codeset_8_bit + px->tocodeset - 3;
  1342. __uwchar_t u;
  1343. u = c8b->idx8wc2c[wc >> (Cwc2c_TI_SHIFT + Cwc2c_TT_SHIFT)];
  1344. u = __UCLIBC_CURLOCALE_DATA.tbl8wc2c[(u << Cwc2c_TI_SHIFT)
  1345. + ((wc >> Cwc2c_TT_SHIFT)
  1346. & ((1 << Cwc2c_TI_SHIFT)-1))];
  1347. wc = __UCLIBC_CURLOCALE_DATA.tbl8wc2c[Cwc2c_TI_LEN
  1348. + (u << Cwc2c_TT_SHIFT)
  1349. + (wc & ((1 << Cwc2c_TT_SHIFT)-1))];
  1350. if (wc) {
  1351. goto CHAR_GOOD;
  1352. }
  1353. }
  1354. **outbuf = '?';
  1355. ++nrcount;
  1356. }
  1357. *outbuf += inco;
  1358. *outbytesleft -= inco;
  1359. BOM_SKIP_OUTPUT:
  1360. *inbuf += inci;
  1361. *inbytesleft -= inci;
  1362. }
  1363. return nrcount;
  1364. }
  1365. #endif
  1366. /**********************************************************************/
  1367. #ifdef L_iconv_main
  1368. #include <stdio.h>
  1369. #include <stdlib.h>
  1370. #include <string.h>
  1371. #include <wchar.h>
  1372. #include <iconv.h>
  1373. #include <stdarg.h>
  1374. #include <libgen.h>
  1375. extern const unsigned char __iconv_codesets[];
  1376. #define IBUF BUFSIZ
  1377. #define OBUF BUFSIZ
  1378. char *progname;
  1379. int hide_errors;
  1380. static void error_msg(const char *fmt, ...)
  1381. __attribute__ ((noreturn, format (printf, 1, 2)));
  1382. static void error_msg(const char *fmt, ...)
  1383. {
  1384. va_list arg;
  1385. if (!hide_errors) {
  1386. fprintf(stderr, "%s: ", progname);
  1387. va_start(arg, fmt);
  1388. vfprintf(stderr, fmt, arg);
  1389. va_end(arg);
  1390. }
  1391. exit(EXIT_FAILURE);
  1392. }
  1393. int main(int argc, char **argv)
  1394. {
  1395. FILE *ifile;
  1396. FILE *ofile = stdout;
  1397. const char *p;
  1398. const char *s;
  1399. static const char opt_chars[] = "tfocsl";
  1400. /* 012345 */
  1401. const char *opts[sizeof(opt_chars)]; /* last is infile name */
  1402. iconv_t ic;
  1403. char ibuf[IBUF];
  1404. char obuf[OBUF];
  1405. char *pi;
  1406. char *po;
  1407. size_t ni, no, r, pos;
  1408. hide_errors = 0;
  1409. for (s = opt_chars ; *s ; s++) {
  1410. opts[ s - opt_chars ] = NULL;
  1411. }
  1412. progname = *argv;
  1413. while (--argc) {
  1414. p = *++argv;
  1415. if ((*p != '-') || (*++p == 0)) {
  1416. break;
  1417. }
  1418. do {
  1419. if ((s = strchr(opt_chars,*p)) == NULL) {
  1420. USAGE:
  1421. s = basename(progname);
  1422. fprintf(stderr,
  1423. "%s [-cs] -f fromcode -t tocode [-o outputfile] [inputfile ...]\n"
  1424. " or\n%s -l\n", s, s);
  1425. return EXIT_FAILURE;
  1426. }
  1427. if ((s - opt_chars) < 3) {
  1428. if ((--argc == 0) || opts[s - opt_chars]) {
  1429. goto USAGE;
  1430. }
  1431. opts[s - opt_chars] = *++argv;
  1432. } else {
  1433. opts[s - opt_chars] = p;
  1434. }
  1435. } while (*++p);
  1436. }
  1437. if (opts[5]) { /* -l */
  1438. fprintf(stderr, "Recognized codesets:\n");
  1439. for (s = __iconv_codesets ; *s ; s += *s) {
  1440. fprintf(stderr," %s\n", s+2);
  1441. }
  1442. s = __LOCALE_DATA_CODESET_LIST;
  1443. do {
  1444. fprintf(stderr," %s\n", __LOCALE_DATA_CODESET_LIST+ (unsigned char)(*s));
  1445. } while (*++s);
  1446. return EXIT_SUCCESS;
  1447. }
  1448. if (opts[4]) {
  1449. hide_errors = 1;
  1450. }
  1451. if (!opts[0] || !opts[1]) {
  1452. goto USAGE;
  1453. }
  1454. if ((ic = iconv_open(opts[0],opts[1])) == ((iconv_t)(-1))) {
  1455. error_msg( "unsupported codeset in %s -> %s conversion\n", opts[0], opts[1]);
  1456. }
  1457. if (opts[3]) { /* -c */
  1458. ((_UC_iconv_t *) ic)->skip_invalid_input = 1;
  1459. }
  1460. if ((s = opts[2]) != NULL) {
  1461. if (!(ofile = fopen(s, "w"))) {
  1462. error_msg( "couldn't open %s for writing\n", s);
  1463. }
  1464. }
  1465. pos = ni = 0;
  1466. do {
  1467. if (!argc || ((**argv == '-') && !((*argv)[1]))) {
  1468. ifile = stdin; /* we don't check for duplicates */
  1469. } else if (!(ifile = fopen(*argv, "r"))) {
  1470. error_msg( "couldn't open %s for reading\n", *argv);
  1471. }
  1472. while ((r = fread(ibuf + ni, 1, IBUF - ni, ifile)) > 0) {
  1473. pos += r;
  1474. ni += r;
  1475. no = OBUF;
  1476. pi = ibuf;
  1477. po = obuf;
  1478. if ((r = iconv(ic, &pi, &ni, &po, &no)) == ((size_t)(-1))) {
  1479. if ((errno != EINVAL) && (errno != E2BIG)) {
  1480. error_msg( "iconv failed at pos %lu : %m\n", (unsigned long) (pos - ni));
  1481. }
  1482. }
  1483. if ((r = OBUF - no) > 0) {
  1484. if (fwrite(obuf, 1, OBUF - no, ofile) < r) {
  1485. error_msg( "write error\n");
  1486. }
  1487. }
  1488. if (ni) { /* still bytes in buffer! */
  1489. memmove(ibuf, pi, ni);
  1490. }
  1491. }
  1492. if (ferror(ifile)) {
  1493. error_msg( "read error\n");
  1494. }
  1495. ++argv;
  1496. if (ifile != stdin) {
  1497. fclose(ifile);
  1498. }
  1499. } while (--argc > 0);
  1500. iconv_close(ic);
  1501. if (ni) {
  1502. error_msg( "incomplete sequence\n");
  1503. }
  1504. return (((_UC_iconv_t *) ic)->skip_invalid_input < 2)
  1505. ? EXIT_SUCCESS : EXIT_FAILURE;
  1506. }
  1507. #endif
  1508. /**********************************************************************/