wchar.c 44 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706
  1. /* Copyright (C) 2002 Manuel Novoa III
  2. *
  3. * This library is free software; you can redistribute it and/or
  4. * modify it under the terms of the GNU Library General Public
  5. * License as published by the Free Software Foundation; either
  6. * version 2 of the License, or (at your option) any later version.
  7. *
  8. * This library is distributed in the hope that it will be useful,
  9. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  11. * Library General Public License for more details.
  12. *
  13. * You should have received a copy of the GNU Library General Public
  14. * License along with this library; if not, write to the Free
  15. * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  16. */
  17. /* ATTENTION! ATTENTION! ATTENTION! ATTENTION! ATTENTION!
  18. *
  19. * Besides uClibc, I'm using this code in my libc for elks, which is
  20. * a 16-bit environment with a fairly limited compiler. It would make
  21. * things much easier for me if this file isn't modified unnecessarily.
  22. * In particular, please put any new or replacement functions somewhere
  23. * else, and modify the makefile to use your version instead.
  24. * Thanks. Manuel
  25. *
  26. * ATTENTION! ATTENTION! ATTENTION! ATTENTION! ATTENTION! */
  27. /* May 23, 2002 Initial Notes:
  28. *
  29. * I'm still tweaking this stuff, but it passes the tests I've thrown
  30. * at it, and Erik needs it for the gcc port. The glibc extension
  31. * __wcsnrtombs() hasn't been tested, as I didn't find a test for it
  32. * in the glibc source. I also need to fix the behavior of
  33. * _wchar_utf8sntowcs() if the max number of wchars to convert is 0.
  34. *
  35. * UTF-8 -> wchar -> UTF-8 conversion tests on Markus Kuhn's UTF-8-demo.txt
  36. * file on my platform (x86) show about 5-10% faster conversion speed than
  37. * glibc with mbsrtowcs()/wcsrtombs() and almost twice as fast as glibc with
  38. * individual mbrtowc()/wcrtomb() calls.
  39. *
  40. * If 'DECODER' is defined, then _wchar_utf8sntowcs() will be compiled
  41. * as a fail-safe UTF-8 decoder appropriate for a terminal, etc. which
  42. * needs to deal gracefully with whatever is sent to it. In that mode,
  43. * it passes Markus Kuhn's UTF-8-test.txt stress test. I plan to add
  44. * an arg to force that behavior, so the interface will be changing.
  45. *
  46. * I need to fix the error checking for 16-bit wide chars. This isn't
  47. * an issue for uClibc, but may be for ELKS. I'm currently not sure
  48. * if I'll use 16-bit, 32-bit, or configurable wchars in ELKS.
  49. *
  50. * July 1, 2002
  51. *
  52. * Fixed _wchar_utf8sntowcs() for the max number of wchars == 0 case.
  53. * Fixed nul-char bug in btowc(), and another in __mbsnrtowcs() for 8-bit
  54. * locales.
  55. * Enabled building of a C/POSIX-locale-only version, so full locale support
  56. * no longer needs to be enabled.
  57. *
  58. * Nov 4, 2002
  59. *
  60. * Fixed a bug in _wchar_wcsntoutf8s(). Don't store wcs position if dst is NULL.
  61. * Also, introduce an awful hack into _wchar_wcsntoutf8s() and wcsrtombs() in
  62. * order to support %ls in printf. See comments below for details.
  63. * Change behaviour of wc<->mb functions when in the C locale. Now they do
  64. * a 1-1 map for the range 0x80-UCHAR_MAX. This is for backwards compatibility
  65. * and consistency with the stds requirements that a printf format string be
  66. * a valid multibyte string beginning and ending in its initial shift state.
  67. *
  68. * Nov 5, 2002
  69. *
  70. * Forgot to change btowc and wctob when I changed the wc<->mb functions yesterday.
  71. *
  72. * Nov 7, 2002
  73. *
  74. * Add wcwidth and wcswidth, based on Markus Kuhn's wcwidth of 2002-05-08.
  75. * Added some size/speed optimizations and integrated it into my locale
  76. * framework. Minimally tested at the moment, but the stub C-locale
  77. * version (which most people would probably be using) should be fine.
  78. *
  79. * Nov 21, 2002
  80. *
  81. * Revert the wc<->mb changes from earlier this month involving the C-locale.
  82. * Add a couple of ugly hacks to support *wprintf.
  83. * Add a mini iconv() and iconv implementation (requires locale support).
  84. *
  85. * Aug 1, 2003
  86. * Bug fix for mbrtowc.
  87. *
  88. * Aug 18, 2003
  89. * Bug fix: _wchar_utf8sntowcs and _wchar_wcsntoutf8s now set errno if EILSEQ.
  90. *
  91. * Manuel
  92. */
  93. #define _GNU_SOURCE
  94. #define _ISOC99_SOURCE
  95. #include <errno.h>
  96. #include <stddef.h>
  97. #include <limits.h>
  98. #include <stdint.h>
  99. #include <inttypes.h>
  100. #include <stdlib.h>
  101. #include <stdio.h>
  102. #include <assert.h>
  103. #include <locale.h>
  104. #include <wchar.h>
  105. #include <bits/uClibc_uwchar.h>
  106. /**********************************************************************/
  /* Locale-dependent configuration.
   *
   * With locale support: ENCODING reads the `encoding` field of the current
   * locale data, and the Cc2wc_ / Cwc2c_ names are short aliases for the
   * matching __LOCALE_DATA_ constants used by the 8-bit table lookups in
   * __mbsnrtowcs() and __wcsnrtombs().
   *
   * Without locale support: ENCODING is the constant __ctype_encoding_7_bit,
   * defining either __CTYPE_HAS_8_BIT_LOCALES or __CTYPE_HAS_UTF_8_LOCALES is
   * a hard error, and the L__wchar_utf8sntowcs / L__wchar_wcsntoutf8s build
   * selectors are undefined so the two UTF-8 work functions are not compiled. */
  107. #ifdef __UCLIBC_HAS_LOCALE__
  108. #ifdef __UCLIBC_MJN3_ONLY__
  109. #ifdef L_iswspace
  110. /* Keyed on L_iswspace so the reminder below is emitted only once. */
  111. #warning TODO: Fix Cc2wc* and Cwc2c* defines!
  112. #endif
  113. #endif /* __UCLIBC_MJN3_ONLY__ */
  114. #define ENCODING ((__UCLIBC_CURLOCALE_DATA).encoding)
  115. #define Cc2wc_IDX_SHIFT __LOCALE_DATA_Cc2wc_IDX_SHIFT
  116. #define Cc2wc_ROW_LEN __LOCALE_DATA_Cc2wc_ROW_LEN
  117. #define Cwc2c_DOMAIN_MAX __LOCALE_DATA_Cwc2c_DOMAIN_MAX
  118. #define Cwc2c_TI_SHIFT __LOCALE_DATA_Cwc2c_TI_SHIFT
  119. #define Cwc2c_TT_SHIFT __LOCALE_DATA_Cwc2c_TT_SHIFT
  120. #define Cwc2c_TI_LEN __LOCALE_DATA_Cwc2c_TI_LEN
  121. #ifndef __CTYPE_HAS_UTF_8_LOCALES
  122. #warning __CTYPE_HAS_UTF_8_LOCALES not set!
  123. #endif
  124. #else /* __UCLIBC_HAS_LOCALE__ */
  125. #ifdef __UCLIBC_MJN3_ONLY__
  126. #ifdef L_btowc
  127. /* Keyed on L_btowc so the reminder below is emitted only once. */
  128. #warning fix preprocessor logic testing locale settings
  129. #endif
  130. #endif
  131. #define ENCODING (__ctype_encoding_7_bit)
  132. #ifdef __CTYPE_HAS_8_BIT_LOCALES
  133. #error __CTYPE_HAS_8_BIT_LOCALES is defined!
  134. #endif
  135. #ifdef __CTYPE_HAS_UTF_8_LOCALES
  136. #error __CTYPE_HAS_UTF_8_LOCALES is defined!
  137. #endif
  /* No locale support: drop the UTF-8 work functions from the build. */
  138. #undef L__wchar_utf8sntowcs
  139. #undef L__wchar_wcsntoutf8s
  140. #endif /* __UCLIBC_HAS_LOCALE__ */
  141. /**********************************************************************/
  /* Longest UTF-8 sequence handled: 6 bytes when wchar_t can hold values
   * above 0xffff, otherwise 3 bytes. */
  142. #if WCHAR_MAX > 0xffffUL
  143. #define UTF_8_MAX_LEN 6
  144. #else
  145. #define UTF_8_MAX_LEN 3
  146. #endif
  /* KUHN enables the extra validity checks in the UTF-8 routines below
   * (rejecting the 0xd800-0xdfff range and the values 0xfffe/0xffff). */
  147. #define KUHN 1
  148. /* Implementation-specific work functions (UTF-8 <-> wchar_t). */
  149. extern size_t _wchar_utf8sntowcs(wchar_t *__restrict pwc, size_t wn,
  150. const char **__restrict src, size_t n,
  151. mbstate_t *ps, int allow_continuation);
  152. extern size_t _wchar_wcsntoutf8s(char *__restrict s, size_t n,
  153. const wchar_t **__restrict src, size_t wn);
  154. /* glibc extensions: bounded-input variants that the standard
  * mbsrtowcs()/wcsrtombs()/mbrtowc()/wcrtomb() in this file are built on. */
  155. extern size_t __mbsnrtowcs(wchar_t *__restrict dst,
  156. const char **__restrict src,
  157. size_t NMC, size_t len, mbstate_t *__restrict ps);
  158. extern size_t __wcsnrtombs(char *__restrict dst,
  159. const wchar_t **__restrict src,
  160. size_t NWC, size_t len, mbstate_t *__restrict ps);
  161. /**********************************************************************/
  162. #ifdef L_btowc
  /*
   * btowc: widen a single byte.
   *
   * c is either EOF or a value that is converted to unsigned char.
   * Returns the wide character for that byte, or WEOF when c is EOF or the
   * byte is not a complete, valid character on its own.
   */
  wint_t btowc(int c)
  {
  #ifdef __CTYPE_HAS_8_BIT_LOCALES

      mbstate_t state;
      unsigned char byte[1];
      wchar_t widened;

      if (c == EOF) {
          return WEOF;
      }

      byte[0] = (unsigned char) c;
      state.mask = 0;             /* Start from the initial conversion state. */

      /* Anything other than a result of 0 or 1 means the byte did not
       * convert (error and incomplete codes are huge size_t values). */
      if (mbrtowc(&widened, byte, 1, &state) > 1) {
          return WEOF;
      }
      return widened;

  #else  /* __CTYPE_HAS_8_BIT_LOCALES */

  #ifdef __UCLIBC_HAS_LOCALE__
      assert((ENCODING == __ctype_encoding_7_bit)
             || (ENCODING == __ctype_encoding_utf8));
  #endif /* __UCLIBC_HAS_LOCALE__ */

      /* Without 8-bit locales only 0..0x7f are valid single bytes in the
       * C/POSIX and UTF-8 locales.  The unsigned cast also rejects every
       * negative value, EOF included. */
      if (((unsigned int) c) >= 0x80) {
          return WEOF;
      }
      return c;

  #endif /* __CTYPE_HAS_8_BIT_LOCALES */
  }
  187. #endif
  188. /**********************************************************************/
  189. #ifdef L_wctob
  /*
   * wctob: narrow a wide character to a single byte.
   *
   * Returns the byte value when c is representable as exactly one byte,
   * otherwise EOF.  The 8-bit-locale build delegates to wcrtomb() with a
   * NULL state pointer.
   */
  int wctob(wint_t c)
  {
  #ifdef __CTYPE_HAS_8_BIT_LOCALES

      unsigned char out[MB_LEN_MAX];

      if (wcrtomb(out, c, NULL) != 1) {
          return EOF;
      }
      return out[0];

  #else  /* __CTYPE_HAS_8_BIT_LOCALES */

  #ifdef __UCLIBC_HAS_LOCALE__
      assert((ENCODING == __ctype_encoding_7_bit)
             || (ENCODING == __ctype_encoding_utf8));
  #endif /* __UCLIBC_HAS_LOCALE__ */

      /* Without 8-bit locales only 0..0x7f map to a single byte in the
       * C/POSIX and UTF-8 locales.  The explicit lower bound is kept,
       * presumably for platforms where wint_t is a signed type. */
      if ((c >= 0) && (c < 0x80)) {
          return c;
      }
      return EOF;

  #endif /* __CTYPE_HAS_8_BIT_LOCALES */
  }
  208. #endif
  209. /**********************************************************************/
  210. #ifdef L_mbsinit
  211. int mbsinit(const mbstate_t *ps)
  212. {
  213. return !ps || !ps->mask;
  214. }
  215. #endif
  216. /**********************************************************************/
  217. #ifdef L_mbrlen
  /* Public name: a weak alias for the internal implementation below. */
  size_t mbrlen(const char *__restrict s, size_t n, mbstate_t *__restrict ps)
      __attribute__ ((__weak__, __alias__("__mbrlen")));

  /*
   * __mbrlen: length in bytes of the next multibyte character.
   *
   * Forwards to mbrtowc() with a NULL destination, so the return value is
   * whatever mbrtowc() reports for (s, n).  When ps is NULL a private state
   * object owned by this function is used instead.
   */
  size_t __mbrlen(const char *__restrict s, size_t n, mbstate_t *__restrict ps)
  {
      static mbstate_t private_state;     /* Static storage: starts zeroed. */

      if (ps == NULL) {
          ps = &private_state;
      }
      return mbrtowc(NULL, s, n, ps);
  }
  225. #endif
  226. /**********************************************************************/
  227. #ifdef L_mbrtowc
  228. size_t mbrtowc(wchar_t *__restrict pwc, const char *__restrict s,
  229. size_t n, mbstate_t *__restrict ps)
  230. {
  231. static mbstate_t mbstate; /* Rely on bss 0-init. */
  232. wchar_t wcbuf[1];
  233. const char *p;
  234. size_t r;
  235. char empty_string[1]; /* Avoid static to be fPIC friendly. */
  236. if (!ps) {
  237. ps = &mbstate;
  238. }
  239. if (!s) {
  240. pwc = (wchar_t *) s; /* NULL */
  241. empty_string[0] = 0; /* Init the empty string when necessary. */
  242. s = empty_string;
  243. n = 1;
  244. } else if (!n) {
  245. return (ps->mask && (ps->wc == 0xffffU)) /* TODO: change error code? */
  246. ? ((size_t) -1) : ((size_t) -2);
  247. }
  248. p = s;
  249. #ifdef __CTYPE_HAS_UTF_8_LOCALES
  250. /* Need to do this here since mbsrtowcs doesn't allow incompletes. */
  251. if (ENCODING == __ctype_encoding_utf8) {
  252. if (!pwc) {
  253. pwc = wcbuf;
  254. }
  255. r = _wchar_utf8sntowcs(pwc, 1, &p, n, ps, 1);
  256. return (r == 1) ? (p-s) : r; /* Need to return 0 if nul char. */
  257. }
  258. #endif
  259. #ifdef __UCLIBC_MJN3_ONLY__
  260. #warning TODO: This adds a trailing nul!
  261. #endif /* __UCLIBC_MJN3_ONLY__ */
  262. r = __mbsnrtowcs(wcbuf, &p, SIZE_MAX, 1, ps);
  263. if (((ssize_t) r) >= 0) {
  264. if (pwc) {
  265. *pwc = *wcbuf;
  266. }
  267. }
  268. return (size_t) r;
  269. }
  270. #endif
  271. /**********************************************************************/
  272. #ifdef L_wcrtomb
  /*
   * wcrtomb: convert one wide character to its multibyte form.
   *
   * s   destination buffer; NULL means "convert L'\0' into a throwaway
   *     internal buffer".
   * wc  the wide character to convert.
   * ps  passed straight through to __wcsnrtombs(); the conversions
   *     currently supported do not use it.
   *
   * Returns the result of __wcsnrtombs() for the one-character input,
   * except that a result of 0 (the nul character) is reported as 1.
   *
   * TODO: Check for valid state anyway?
   */
  size_t wcrtomb(register char *__restrict s, wchar_t wc,
                 mbstate_t *__restrict ps)
  {
  #ifdef __UCLIBC_MJN3_ONLY__
  #warning TODO: Should wcsnrtombs nul-terminate unconditionally? Check glibc.
  #endif /* __UCLIBC_MJN3_ONLY__ */
      char discard[MB_LEN_MAX];
      wchar_t single[1];
      const wchar_t *input;
      size_t produced;

      if (s == NULL) {
          s = discard;
          wc = 0;
      }

      single[0] = wc;
      input = single;

      produced = __wcsnrtombs(s, &input, 1, MB_LEN_MAX, ps);

      return (produced == 0) ? 1 : produced;
  }
  294. #endif
  295. /**********************************************************************/
  296. #ifdef L_mbsrtowcs
  /*
   * mbsrtowcs: convert a multibyte string to wide characters.
   *
   * Thin wrapper over __mbsnrtowcs() with no limit on the number of input
   * bytes (SIZE_MAX).  A NULL ps selects a private static state object.
   */
  size_t mbsrtowcs(wchar_t *__restrict dst, const char **__restrict src,
                   size_t len, mbstate_t *__restrict ps)
  {
      static mbstate_t private_state;     /* Static storage: starts zeroed. */

      if (ps == NULL) {
          ps = &private_state;
      }
      return __mbsnrtowcs(dst, src, SIZE_MAX, len, ps);
  }
  304. #endif
  305. /**********************************************************************/
  306. #ifdef L_wcsrtombs
  /*
   * wcsrtombs: convert a wide-character string to multibyte form.
   *
   * Thin wrapper over __wcsnrtombs() with no limit on the number of input
   * wide characters (SIZE_MAX).  ps is forwarded unchanged; the conversions
   * currently supported do not use it.
   * TODO: Check for valid state anyway?
   */
  size_t wcsrtombs(char *__restrict dst, const wchar_t **__restrict src,
                   size_t len, mbstate_t *__restrict ps)
  {
      const size_t unlimited_input = SIZE_MAX;

      return __wcsnrtombs(dst, src, unlimited_input, len, ps);
  }
  314. #endif
  315. /**********************************************************************/
  316. #ifdef L__wchar_utf8sntowcs
  317. /* Define DECODER to generate a UTF-8 decoder which passes Markus Kuhn's
  318. * UTF-8-test.txt stress test.
  319. */
  320. /* #define DECODER */
  321. #ifdef DECODER
  322. #ifndef KUHN
  323. #define KUHN
  324. #endif
  325. #endif
  /* _wchar_utf8sntowcs: convert UTF-8 bytes to wide characters.
   *
   * pwc   destination. NULL or (wchar_t *) ps selects length-only mode:
   *       nothing is stored in the caller's buffer and, on normal
   *       completion, *src is left untouched. NULL additionally lifts
   *       the wn limit (wn becomes SIZE_MAX).
   * wn    maximum number of wide characters to produce; 0 returns 0 at once.
   * src   in/out pointer to the input bytes (*src must not be NULL).
   * n     maximum number of input bytes to read.
   * ps    conversion state (must not be NULL). A nonzero mask means a
   *       partial character is pending; in the non-DECODER build
   *       wc == 0xffff together with a nonzero mask marks the error state.
   * allow_continuation
   *       nonzero: an incomplete trailing sequence is saved in ps, *src is
   *       advanced and (size_t) -2 is returned.
   *       zero: returns 0 if at least one character was already converted,
   *       otherwise the input pointer is backed up over the partial
   *       sequence before finishing normally.
   *
   * Returns the number of wide characters converted; in the non-DECODER
   * build a terminating nul is stored but not counted and makes *src NULL.
   * Returns (size_t) -1 with errno set to EILSEQ on an invalid sequence
   * (non-DECODER build; the DECODER build substitutes 0xfffd instead). */
  326. size_t _wchar_utf8sntowcs(wchar_t *__restrict pwc, size_t wn,
  327. const char **__restrict src, size_t n,
  328. mbstate_t *ps, int allow_continuation)
  329. {
  330. register const char *s;
  331. __uwchar_t mask;
  332. __uwchar_t wc;
  333. wchar_t wcbuf[1];
  334. size_t count;
  335. int incr;
  336. s = *src;
  337. assert(s != NULL);
  338. assert(ps != NULL);
  339. incr = 1;
  340. /* NOTE: The following is an AWFUL HACK! In order to support %s in
  341. * wprintf, we need to be able to compute the number of wchars needed
  342. * for the mbs conversion, not to exceed the precision specified.
  343. * But if dst is NULL, the return value is the length assuming a
  344. * sufficiently sized buffer. So, we allow passing of (wchar_t *) ps
  345. * as pwc in order to flag that we really want the length, subject
  346. * to the restricted buffer size and no partial conversions.
  347. * See mbsnrtowcs() as well. */
  348. if (!pwc || (pwc == ((wchar_t *)ps))) {
  349. if (!pwc) {
  350. wn = SIZE_MAX;
  351. }
  /* Length-only mode: every result lands in the one-element scratch buffer
   * and the output pointer never advances (incr == 0). */
  352. pwc = wcbuf;
  353. incr = 0;
  354. }
  355. /* This is really here only to support the glibc extension function
  356. * __mbsnrtowcs which apparently returns 0 if wn == 0 without any
  357. * check on the validity of the mbstate. */
  358. if (!(count = wn)) {
  359. return 0;
  360. }
  361. if ((mask = (__uwchar_t) ps->mask) != 0) { /* A continuation... */
  362. #ifdef DECODER
  363. wc = (__uwchar_t) ps->wc;
  364. if (n) {
  365. goto CONTINUE;
  366. }
  367. goto DONE;
  368. #else
  369. if ((wc = (__uwchar_t) ps->wc) != 0xffffU) {
  370. /* TODO: change error code here and below? */
  /* Resume the pending character if there is input, else just finish. */
  371. if (n) {
  372. goto CONTINUE;
  373. }
  374. goto DONE;
  375. }
  376. __set_errno(EILSEQ);
  377. return (size_t) -1; /* We're in an error state. */
  378. #endif
  379. }
  /* Main loop: one wide character per iteration. */
  380. do {
  381. if (!n) {
  382. goto DONE;
  383. }
  384. --n;
  385. if ((wc = ((unsigned char) *s++)) >= 0x80) { /* Not ASCII... */
  386. mask = 0x40;
  387. #ifdef __UCLIBC_MJN3_ONLY__
  388. #warning TODO: Fix range for 16 bit wchar_t case.
  389. #endif
  /* Valid lead bytes are 0xc0..0xfd; anything else falls into BAD. */
  390. if ( ((unsigned char)(s[-1] - 0xc0)) < (0xfe - 0xc0) ) {
  391. goto START;
  392. }
  393. BAD:
  394. #ifdef DECODER
  395. wc = 0xfffdU;
  396. goto COMPLETE;
  397. #else
  /* Record the error state in ps so later calls fail as well. */
  398. ps->mask = mask;
  399. ps->wc = 0xffffU;
  400. __set_errno(EILSEQ);
  401. return (size_t) -1; /* Illegal start byte! */
  402. #endif
  /* Entry point when resuming a character saved in ps by an earlier call. */
  403. CONTINUE:
  404. while (n) {
  405. --n;
  /* Every byte after the lead byte must look like 10xxxxxx. */
  406. if ((*s & 0xc0) != 0x80) {
  407. goto BAD;
  408. }
  409. mask <<= 5;
  410. wc <<= 6;
  411. wc += (*s & 0x3f); /* keep separate for bcc (smaller code) */
  412. ++s;
  /* mask selects the next length-marker bit of the lead byte as it moves
   * up inside wc; the bit above it is cleared, and a clear marker bit
   * means no further continuation bytes are expected. */
  413. START:
  414. wc &= ~(mask << 1);
  415. if ((wc & mask) == 0) { /* Character completed. */
  /* Turn mask into the smallest value that needs a sequence this long. */
  416. if ((mask >>= 5) == 0x40) {
  417. mask += mask;
  418. }
  419. /* Check for invalid sequences (longer than necessary)
  420. * and invalid chars. */
  421. if ( (wc < mask) /* Sequence not minimal length. */
  422. #ifdef KUHN
  423. #if UTF_8_MAX_LEN == 3
  424. #error broken since mask can overflow!!
  425. /* For plane 0, these are the only defined values.*/
  426. || (wc > 0xfffdU)
  427. #else
  428. /* Note that we don't need to worry about exceeding */
  429. /* 31 bits as that is the most that UTF-8 provides. */
  430. || ( ((__uwchar_t)(wc - 0xfffeU)) < 2)
  431. #endif
  432. || ( ((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U) )
  433. #endif /* KUHN */
  434. ) {
  435. goto BAD;
  436. }
  437. goto COMPLETE;
  438. }
  439. }
  440. /* Character potentially valid but incomplete. */
  441. if (!allow_continuation) {
  /* Some characters were already converted: report 0. */
  442. if (count != wn) {
  443. return 0;
  444. }
  445. /* NOTE: The following can fail if you allow and then disallow
  446. * continuation!!! */
  447. #if UTF_8_MAX_LEN == 3
  448. #error broken since mask can overflow!!
  449. #endif
  450. /* Need to back up... */
  451. do {
  452. --s;
  453. } while ((mask >>= 5) >= 0x40);
  454. goto DONE;
  455. }
  /* Save the partial character so a later call can resume at CONTINUE. */
  456. ps->mask = (wchar_t) mask;
  457. ps->wc = (wchar_t) wc;
  458. *src = s;
  459. return (size_t) -2;
  460. }
  461. COMPLETE:
  462. *pwc = wc;
  463. pwc += incr;
  464. }
  465. #ifdef DECODER
  466. while (--count);
  467. #else
  /* Stop after a nul (which is not counted) or once wn characters are done. */
  468. while (wc && --count);
  469. if (!wc) {
  470. s = NULL;
  471. }
  472. #endif
  473. DONE:
  474. /* ps->wc is irrelevant here. */
  475. ps->mask = 0;
  /* In length-only mode pwc still equals wcbuf, so *src is not updated. */
  476. if (pwc != wcbuf) {
  477. *src = s;
  478. }
  479. return wn - count;
  480. }
  481. #endif
  482. /**********************************************************************/
  483. #ifdef L__wchar_wcsntoutf8s
  484. size_t _wchar_wcsntoutf8s(char *__restrict s, size_t n,
  485. const wchar_t **__restrict src, size_t wn)
  486. {
  487. register char *p;
  488. size_t len, t;
  489. __uwchar_t wc;
  490. const __uwchar_t *swc;
  491. int store;
  492. char buf[MB_LEN_MAX];
  493. char m;
  494. store = 1;
  495. /* NOTE: The following is an AWFUL HACK! In order to support %ls in
  496. * printf, we need to be able to compute the number of bytes needed
  497. * for the mbs conversion, not to exceed the precision specified.
  498. * But if dst is NULL, the return value is the length assuming a
  499. * sufficiently sized buffer. So, we allow passing of (char *) src
  500. * as dst in order to flag that we really want the length, subject
  501. * to the restricted buffer size and no partial conversions.
  502. * See wcsnrtombs() as well. */
  503. if (!s || (s == ((char *) src))) {
  504. if (!s) {
  505. n = SIZE_MAX;
  506. }
  507. s = buf;
  508. store = 0;
  509. }
  510. t = n;
  511. swc = (const __uwchar_t *) *src;
  512. assert(swc != NULL);
  513. while (wn && t) {
  514. wc = *swc;
  515. *s = wc;
  516. len = 1;
  517. if (wc >= 0x80) {
  518. #ifdef KUHN
  519. if (
  520. #if UTF_8_MAX_LEN == 3
  521. /* For plane 0, these are the only defined values.*/
  522. /* Note that we don't need to worry about exceeding */
  523. /* 31 bits as that is the most that UTF-8 provides. */
  524. (wc > 0xfffdU)
  525. #else
  526. /* UTF_8_MAX_LEN == 6 */
  527. (wc > 0x7fffffffUL)
  528. || ( ((__uwchar_t)(wc - 0xfffeU)) < 2)
  529. #endif
  530. || ( ((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U) )
  531. ) {
  532. __set_errno(EILSEQ);
  533. return (size_t) -1;
  534. }
  535. #else /* KUHN */
  536. #if UTF_8_MAX_LEN != 3
  537. if (wc > 0x7fffffffUL) { /* Value too large. */
  538. __set_errno(EILSEQ);
  539. return (size_t) -1;
  540. }
  541. #endif
  542. #endif /* KUHN */
  543. wc >>= 1;
  544. p = s;
  545. do {
  546. ++p;
  547. } while (wc >>= 5);
  548. wc = *swc;
  549. if ((len = p - s) > t) { /* Not enough space. */
  550. break;
  551. }
  552. m = 0x80;
  553. while( p>s ) {
  554. m = (m >> 1) | 0x80;
  555. *--p = (wc & 0x3f) | 0x80;
  556. wc >>= 6;
  557. }
  558. *s |= (m << 1);
  559. } else if (wc == 0) { /* End of string. */
  560. swc = NULL;
  561. break;
  562. }
  563. ++swc;
  564. --wn;
  565. t -= len;
  566. if (store) {
  567. s += len;
  568. }
  569. }
  570. if (store) {
  571. *src = (const wchar_t *) swc;
  572. }
  573. return n - t;
  574. }
  575. #endif
  576. /**********************************************************************/
  577. #ifdef L___mbsnrtowcs
  /* __mbsnrtowcs (public weak alias: mbsnrtowcs): convert at most NMC input
   * bytes from *src into at most len wide characters at dst.
   *
   * dst  destination. NULL or (wchar_t *) ps selects length-only mode:
   *      nothing is stored for the caller and *src is not updated by the
   *      single-byte paths below. NULL additionally lifts the len limit.
   * src  in/out pointer to the input bytes; set to NULL when a nul
   *      character was converted.
   * ps   conversion state; NULL selects a private static state.
   *
   * UTF-8 locales are delegated to _wchar_utf8sntowcs() with continuation
   * allowed, and its (size_t) -2 result is reported as 0. Every other
   * encoding handled here is single-byte. Returns the number of wide
   * characters converted (a terminating nul is not counted), or (size_t) -1
   * with errno set to EILSEQ for a byte that has no mapping. */
  578. /* WARNING: We treat len as SIZE_MAX when dst is NULL! */
  579. size_t mbsnrtowcs(wchar_t *__restrict dst, const char **__restrict src,
  580. size_t NMC, size_t len, mbstate_t *__restrict ps)
  581. __attribute__ ((__weak__, __alias__("__mbsnrtowcs")));
  582. size_t __mbsnrtowcs(wchar_t *__restrict dst, const char **__restrict src,
  583. size_t NMC, size_t len, mbstate_t *__restrict ps)
  584. {
  585. static mbstate_t mbstate; /* Rely on bss 0-init. */
  586. wchar_t wcbuf[1];
  587. const char *s;
  588. size_t count;
  589. int incr;
  590. if (!ps) {
  591. ps = &mbstate;
  592. }
  593. #ifdef __CTYPE_HAS_UTF_8_LOCALES
  594. if (ENCODING == __ctype_encoding_utf8) {
  595. size_t r;
  /* An incomplete trailing sequence (-2) is reported as 0 here. */
  596. return ((r = _wchar_utf8sntowcs(dst, len, src, NMC, ps, 1))
  597. != (size_t) -2) ? r : 0;
  598. }
  599. #endif
  600. incr = 1;
  601. /* NOTE: The following is an AWFUL HACK! In order to support %s in
  602. * wprintf, we need to be able to compute the number of wchars needed
  603. * for the mbs conversion, not to exceed the precision specified.
  604. * But if dst is NULL, the return value is the length assuming a
  605. * sufficiently sized buffer. So, we allow passing of ((wchar_t *)ps)
  606. * as dst in order to flag that we really want the length, subject
  607. * to the restricted buffer size and no partial conversions.
  608. * See _wchar_utf8sntowcs() as well. */
  609. if (!dst || (dst == ((wchar_t *)ps))) {
  610. if (!dst) {
  611. len = SIZE_MAX;
  612. }
  /* Length-only mode: write into the scratch element and never advance. */
  613. dst = wcbuf;
  614. incr = 0;
  615. }
  616. /* Since all the following encodings are single-byte encodings... */
  /* ...one input byte yields one wide char, so NMC also bounds the output. */
  617. if (len > NMC) {
  618. len = NMC;
  619. }
  620. count = len;
  621. s = *src;
  622. #ifdef __CTYPE_HAS_8_BIT_LOCALES
  623. if (ENCODING == __ctype_encoding_8_bit) {
  624. wchar_t wc;
  625. while (count) {
  626. if ((wc = ((unsigned char)(*s))) >= 0x80) { /* Non-ASCII... */
  /* Two-level lookup: idx8c2wc picks the row, the low bits pick the entry
   * in tbl8c2wc. A zero entry means the byte has no mapping. */
  627. wc -= 0x80;
  628. wc = __UCLIBC_CURLOCALE_DATA.tbl8c2wc[
  629. (__UCLIBC_CURLOCALE_DATA.idx8c2wc[wc >> Cc2wc_IDX_SHIFT]
  630. << Cc2wc_IDX_SHIFT) + (wc & (Cc2wc_ROW_LEN - 1))];
  631. if (!wc) {
  /* BAD lives inside the 7-bit loop further down. */
  632. goto BAD;
  633. }
  634. }
  /* A nul is stored, ends the conversion, and is not counted. */
  635. if (!(*dst = wc)) {
  636. s = NULL;
  637. break;
  638. }
  639. dst += incr;
  640. ++s;
  641. --count;
  642. }
  /* In length-only mode dst still equals wcbuf, so *src stays untouched. */
  643. if (dst != wcbuf) {
  644. *src = s;
  645. }
  646. return len - count;
  647. }
  648. #endif
  649. #ifdef __UCLIBC_HAS_LOCALE__
  650. assert(ENCODING == __ctype_encoding_7_bit);
  651. #endif
  /* 7-bit (C/POSIX) conversion: bytes map 1:1, anything >= 0x80 is invalid. */
  652. while (count) {
  653. if ((*dst = (unsigned char) *s) == 0) {
  654. s = NULL;
  655. break;
  656. }
  657. if (*dst >= 0x80) {
  658. #ifdef __CTYPE_HAS_8_BIT_LOCALES
  659. BAD:
  660. #endif
  661. __set_errno(EILSEQ);
  662. return (size_t) -1;
  663. }
  664. ++s;
  665. dst += incr;
  666. --count;
  667. }
  668. if (dst != wcbuf) {
  669. *src = s;
  670. }
  671. return len - count;
  672. }
  673. #endif
  674. /**********************************************************************/
  675. #ifdef L___wcsnrtombs
  /* __wcsnrtombs (public weak alias: wcsnrtombs): convert at most NWC wide
   * characters from *src into at most len bytes at dst.
   *
   * dst  destination. NULL or (char *) src selects length-only mode:
   *      nothing is stored for the caller and *src is not updated by the
   *      single-byte paths below. NULL additionally lifts the len limit.
   * src  in/out pointer to the wide-character input; set to NULL when a
   *      nul wide character was converted.
   * ps   not used by the single-byte paths in this function.
   *
   * UTF-8 locales are delegated to _wchar_wcsntoutf8s(). Every other
   * encoding handled here is single-byte. Returns the number of bytes
   * produced (a terminating nul is not counted), or (size_t) -1 with errno
   * set to EILSEQ for a wide character that cannot be converted. */
  676. /* WARNING: We treat len as SIZE_MAX when dst is NULL! */
  677. /* Note: We completely ignore ps in all currently supported conversions.
  678. * TODO: Check for valid state anyway? */
  679. size_t wcsnrtombs(char *__restrict dst, const wchar_t **__restrict src,
  680. size_t NWC, size_t len, mbstate_t *__restrict ps)
  681. __attribute__ ((__weak__, __alias__("__wcsnrtombs")));
  682. size_t __wcsnrtombs(char *__restrict dst, const wchar_t **__restrict src,
  683. size_t NWC, size_t len, mbstate_t *__restrict ps)
  684. {
  685. const __uwchar_t *s;
  686. size_t count;
  687. int incr;
  688. char buf[MB_LEN_MAX];
  689. #ifdef __CTYPE_HAS_UTF_8_LOCALES
  690. if (ENCODING == __ctype_encoding_utf8) {
  691. return _wchar_wcsntoutf8s(dst, len, src, NWC);
  692. }
  693. #endif /* __CTYPE_HAS_UTF_8_LOCALES */
  694. incr = 1;
  695. /* NOTE: The following is an AWFUL HACK! In order to support %ls in
  696. * printf, we need to be able to compute the number of bytes needed
  697. * for the mbs conversion, not to exceed the precision specified.
  698. * But if dst is NULL, the return value is the length assuming a
  699. * sufficiently sized buffer. So, we allow passing of (char *) src
  700. * as dst in order to flag that we really want the length, subject
  701. * to the restricted buffer size and no partial conversions.
  702. * See _wchar_wcsntoutf8s() as well. */
  703. if (!dst || (dst == ((char *) src))) {
  704. if (!dst) {
  705. len = SIZE_MAX;
  706. }
  /* Length-only mode: write into the scratch buffer and never advance. */
  707. dst = buf;
  708. incr = 0;
  709. }
  710. /* Since all the following encodings are single-byte encodings... */
  /* ...one wide char yields one byte, so NWC also bounds the output. */
  711. if (len > NWC) {
  712. len = NWC;
  713. }
  714. count = len;
  715. s = (const __uwchar_t *) *src;
  716. #ifdef __CTYPE_HAS_8_BIT_LOCALES
  717. if (ENCODING == __ctype_encoding_8_bit) {
  718. __uwchar_t wc;
  719. __uwchar_t u;
  720. while (count) {
  721. if ((wc = *s) <= 0x7f) {
  /* ASCII maps to itself; a nul is stored, ends the loop, is not counted. */
  722. if (!(*dst = (unsigned char) wc)) {
  723. s = NULL;
  724. break;
  725. }
  726. } else {
  /* Three-step table lookup through idx8wc2c and tbl8wc2c; u stays 0 when
   * wc is outside the table domain. */
  727. u = 0;
  728. if (wc <= Cwc2c_DOMAIN_MAX) {
  729. u = __UCLIBC_CURLOCALE_DATA.idx8wc2c[wc >> (Cwc2c_TI_SHIFT
  730. + Cwc2c_TT_SHIFT)];
  731. u = __UCLIBC_CURLOCALE_DATA.tbl8wc2c[(u << Cwc2c_TI_SHIFT)
  732. + ((wc >> Cwc2c_TT_SHIFT)
  733. & ((1 << Cwc2c_TI_SHIFT)-1))];
  734. u = __UCLIBC_CURLOCALE_DATA.tbl8wc2c[Cwc2c_TI_LEN
  735. + (u << Cwc2c_TT_SHIFT)
  736. + (wc & ((1 << Cwc2c_TT_SHIFT)-1))];
  737. }
  /* With a replacement char defined, a zero lookup result is emitted as
   * '?' instead of failing; the BAD path is then not compiled in. */
  738. #define __WCHAR_REPLACEMENT_CHAR '?'
  739. #ifdef __WCHAR_REPLACEMENT_CHAR
  740. *dst = (unsigned char) ( u ? u : __WCHAR_REPLACEMENT_CHAR );
  741. #else /* __WCHAR_REPLACEMENT_CHAR */
  742. if (!u) {
  743. goto BAD;
  744. }
  745. *dst = (unsigned char) u;
  746. #endif /* __WCHAR_REPLACEMENT_CHAR */
  747. }
  748. ++s;
  749. dst += incr;
  750. --count;
  751. }
  /* In length-only mode dst still equals buf, so *src stays untouched. */
  752. if (dst != buf) {
  753. *src = (const wchar_t *) s;
  754. }
  755. return len - count;
  756. }
  757. #endif /* __CTYPE_HAS_8_BIT_LOCALES */
  758. #ifdef __UCLIBC_HAS_LOCALE__
  759. assert(ENCODING == __ctype_encoding_7_bit);
  760. #endif
  /* 7-bit (C/POSIX) conversion: values map 1:1, anything >= 0x80 is invalid. */
  761. while (count) {
  762. if (*s >= 0x80) {
  763. #if defined(__CTYPE_HAS_8_BIT_LOCALES) && !defined(__WCHAR_REPLACEMENT_CHAR)
  764. BAD:
  765. #endif
  766. __set_errno(EILSEQ);
  767. return (size_t) -1;
  768. }
  769. if ((*dst = (unsigned char) *s) == 0) {
  770. s = NULL;
  771. break;
  772. }
  773. ++s;
  774. dst += incr;
  775. --count;
  776. }
  777. if (dst != buf) {
  778. *src = (const wchar_t *) s;
  779. }
  780. return len - count;
  781. }
  782. #endif
  783. /**********************************************************************/
  784. #ifdef L_wcswidth
  785. #ifdef __UCLIBC_MJN3_ONLY__
  786. #warning REMINDER: If we start doing translit, wcwidth and wcswidth will need updating.
  787. #warning TODO: Update wcwidth to match latest by Kuhn.
  788. #endif
  789. #if defined(__UCLIBC_HAS_LOCALE__) && \
  790. ( defined(__CTYPE_HAS_8_BIT_LOCALES) || defined(__CTYPE_HAS_UTF_8_LOCALES) )
/* First-level index for the BMP width lookup done in wcswidth().
 *
 * Indexed by the high byte (wc >> 8) of a character in 0x0100..0xffff.
 * wcswidth() reads new_idx[h] and new_idx[h+1] and uses them as the lower
 * and upper bound of the slice of new_tbl[] / new_wtbl[] that it binary
 * searches for that 256-character page.  The table has one entry more than
 * there are pages (257 in total) so that new_idx[h+1] is valid for h == 0xff.
 * Pages whose two bounds are equal add no range of their own; the search
 * then stays on entry new_idx[h]. */
static const unsigned char new_idx[] = {
0, 5, 5, 6, 10, 15, 28, 39,
48, 48, 71, 94, 113, 128, 139, 154,
175, 186, 188, 188, 188, 188, 188, 188,
203, 208, 208, 208, 208, 208, 208, 208,
208, 219, 219, 219, 222, 222, 222, 222,
222, 222, 222, 222, 222, 222, 222, 224,
224, 231, 231, 231, 231, 231, 231, 231,
231, 231, 231, 231, 231, 231, 231, 231,
231, 231, 231, 231, 231, 231, 231, 231,
231, 231, 231, 231, 231, 231, 231, 231,
231, 231, 231, 231, 231, 231, 231, 231,
231, 231, 231, 231, 231, 231, 231, 231,
231, 231, 231, 231, 231, 231, 231, 231,
231, 231, 231, 231, 231, 231, 231, 231,
231, 231, 231, 231, 231, 231, 231, 231,
231, 231, 231, 231, 231, 231, 231, 231,
231, 231, 231, 231, 231, 231, 231, 231,
231, 231, 231, 231, 231, 231, 231, 231,
231, 231, 231, 231, 231, 231, 231, 231,
231, 231, 231, 231, 231, 231, 231, 231,
231, 231, 231, 231, 231, 233, 233, 233,
233, 233, 233, 233, 234, 234, 234, 234,
234, 234, 234, 234, 234, 234, 234, 234,
234, 234, 234, 234, 234, 234, 234, 234,
234, 234, 234, 234, 234, 234, 234, 234,
234, 234, 234, 234, 234, 234, 234, 234,
234, 234, 234, 234, 234, 234, 234, 234,
236, 236, 236, 236, 236, 236, 236, 236,
236, 236, 236, 236, 236, 236, 236, 236,
236, 236, 236, 236, 236, 236, 236, 236,
236, 236, 236, 236, 236, 236, 236, 236,
236, 237, 237, 238, 241, 241, 242, 249,
255,
};
/* Second-level table for the BMP width lookup done in wcswidth().
 *
 * Each entry is a low-byte value (wc & 0xff).  Within the slice selected
 * through new_idx[], wcswidth() binary-searches for the last entry that is
 * <= the character's low byte; the position found is then used as the index
 * into new_wtbl[].  Because it is binary-searched, each slice is expected to
 * be in ascending order. */
static const unsigned char new_tbl[] = {
0x00, 0x01, 0x20, 0x7f, 0xa0, 0x00, 0x00, 0x50,
0x60, 0x70, 0x00, 0x83, 0x87, 0x88, 0x8a, 0x00,
0x91, 0xa2, 0xa3, 0xba, 0xbb, 0xbe, 0xbf, 0xc0,
0xc1, 0xc3, 0xc4, 0xc5, 0x00, 0x4b, 0x56, 0x70,
0x71, 0xd6, 0xe5, 0xe7, 0xe9, 0xea, 0xee, 0x00,
0x0f, 0x10, 0x11, 0x12, 0x30, 0x4b, 0xa6, 0xb1,
0x00, 0x01, 0x03, 0x3c, 0x3d, 0x41, 0x49, 0x4d,
0x4e, 0x51, 0x55, 0x62, 0x64, 0x81, 0x82, 0xbc,
0xbd, 0xc1, 0xc5, 0xcd, 0xce, 0xe2, 0xe4, 0x00,
0x02, 0x03, 0x3c, 0x3d, 0x41, 0x43, 0x47, 0x49,
0x4b, 0x4e, 0x70, 0x72, 0x81, 0x83, 0xbc, 0xbd,
0xc1, 0xc6, 0xc7, 0xc9, 0xcd, 0xce, 0x00, 0x01,
0x02, 0x3c, 0x3d, 0x3f, 0x40, 0x41, 0x44, 0x4d,
0x4e, 0x56, 0x57, 0x82, 0x83, 0xc0, 0xc1, 0xcd,
0xce, 0x00, 0x3e, 0x41, 0x46, 0x49, 0x4a, 0x4e,
0x55, 0x57, 0xbf, 0xc0, 0xc6, 0xc7, 0xcc, 0xce,
0x00, 0x41, 0x44, 0x4d, 0x4e, 0xca, 0xcb, 0xd2,
0xd5, 0xd6, 0xd7, 0x00, 0x31, 0x32, 0x34, 0x3b,
0x47, 0x4f, 0xb1, 0xb2, 0xb4, 0xba, 0xbb, 0xbd,
0xc8, 0xce, 0x00, 0x18, 0x1a, 0x35, 0x36, 0x37,
0x38, 0x39, 0x3a, 0x71, 0x7f, 0x80, 0x85, 0x86,
0x88, 0x90, 0x98, 0x99, 0xbd, 0xc6, 0xc7, 0x00,
0x2d, 0x31, 0x32, 0x33, 0x36, 0x38, 0x39, 0x3a,
0x58, 0x5a, 0x00, 0x60, 0x00, 0x12, 0x15, 0x32,
0x35, 0x52, 0x54, 0x72, 0x74, 0xb7, 0xbe, 0xc6,
0xc7, 0xc9, 0xd4, 0x00, 0x0b, 0x0f, 0xa9, 0xaa,
0x00, 0x0b, 0x10, 0x2a, 0x2f, 0x60, 0x64, 0x6a,
0x70, 0xd0, 0xeb, 0x00, 0x29, 0x2b, 0x00, 0x80,
0x00, 0x2a, 0x30, 0x3f, 0x40, 0x99, 0x9b, 0x00,
0xd0, 0x00, 0x00, 0xa4, 0x00, 0x00, 0x00, 0x1e,
0x1f, 0x00, 0x00, 0x10, 0x20, 0x24, 0x30, 0x70,
0xff, 0x00, 0x61, 0xe0, 0xe7, 0xf9, 0xfc,
};
/* Width values for the BMP width lookup done in wcswidth().
 *
 * Parallel to new_tbl[]: wcswidth() adds new_wtbl[i] to its running column
 * count, where i is the position its binary search settled on in new_tbl[].
 * The values present are -1, 0, 1 and 2.  The lookup code adds the value
 * without testing it and notes that none of the entries it can reach should
 * be -1 (characters <= 0xff are handled before the tables are consulted). */
static const signed char new_wtbl[] = {
0, -1, 1, -1, 1, 1, 0, 1,
0, 1, 1, 0, 1, 0, 1, 1,
0, 1, 0, 1, 0, 1, 0, 1,
0, 1, 0, 1, 1, 0, 1, 0,
1, 0, 1, 0, 1, 0, 1, 1,
0, 1, 0, 1, 0, 1, 0, 1,
1, 0, 1, 0, 1, 0, 1, 0,
1, 0, 1, 0, 1, 0, 1, 0,
1, 0, 1, 0, 1, 0, 1, 1,
0, 1, 0, 1, 0, 1, 0, 1,
0, 1, 0, 1, 0, 1, 0, 1,
0, 1, 0, 1, 0, 1, 1, 0,
1, 0, 1, 0, 1, 0, 1, 0,
1, 0, 1, 0, 1, 0, 1, 0,
1, 1, 0, 1, 0, 1, 0, 1,
0, 1, 0, 1, 0, 1, 0, 1,
1, 0, 1, 0, 1, 0, 1, 0,
1, 0, 1, 1, 0, 1, 0, 1,
0, 1, 0, 1, 0, 1, 0, 1,
0, 1, 1, 0, 1, 0, 1, 0,
1, 0, 1, 0, 1, 0, 1, 0,
1, 0, 1, 0, 1, 0, 1, 1,
0, 1, 0, 1, 0, 1, 0, 1,
0, 1, 2, 0, 1, 0, 1, 0,
1, 0, 1, 0, 1, 0, 1, 0,
1, 0, 1, 1, 0, 1, 0, 1,
1, 0, 1, 0, 1, 0, 1, 0,
1, 0, 1, 1, 2, 1, 1, 2,
2, 0, 2, 1, 2, 0, 2, 2,
1, 1, 2, 1, 1, 2, 1, 0,
1, 1, 0, 1, 0, 1, 2, 1,
0, 2, 1, 2, 1, 0, 1,
};
  894. int wcswidth(const wchar_t *pwcs, size_t n)
  895. {
  896. int h, l, m, count;
  897. wchar_t wc;
  898. unsigned char b;
  899. if (ENCODING == __ctype_encoding_7_bit) {
  900. size_t i;
  901. for (i = 0 ; (i < n) && pwcs[i] ; i++) {
  902. if (pwcs[i] != ((unsigned char)(pwcs[i]))) {
  903. return -1;
  904. }
  905. }
  906. }
  907. #ifdef __CTYPE_HAS_8_BIT_LOCALES
  908. else if (ENCODING == __ctype_encoding_8_bit) {
  909. mbstate_t mbstate;
  910. mbstate.mask = 0; /* Initialize the mbstate. */
  911. if (__wcsnrtombs(NULL, &pwcs, n, SIZE_MAX, &mbstate) == ((size_t) - 1)) {
  912. return -1;
  913. }
  914. }
  915. #endif /* __CTYPE_HAS_8_BIT_LOCALES */
  916. #if defined(__CTYPE_HAS_UTF_8_LOCALES) && defined(KUHN)
  917. /* For stricter handling of allowed unicode values... see comments above. */
  918. else if (ENCODING == __ctype_encoding_utf8) {
  919. size_t i;
  920. for (i = 0 ; (i < n) && pwcs[i] ; i++) {
  921. if ( (((__uwchar_t)((pwcs[i]) - 0xfffeU)) < 2)
  922. || (((__uwchar_t)((pwcs[i]) - 0xd800U)) < (0xe000U - 0xd800U))
  923. ) {
  924. return -1;
  925. }
  926. }
  927. }
  928. #endif /* __CTYPE_HAS_UTF_8_LOCALES */
  929. for (count = 0 ; n && (wc = *pwcs++) ; n--) {
  930. if (wc <= 0xff) {
  931. /* If we're here, wc != 0. */
  932. if ((wc < 32) || ((wc >= 0x7f) && (wc < 0xa0))) {
  933. return -1;
  934. }
  935. ++count;
  936. continue;
  937. }
  938. if (((unsigned int) wc) <= 0xffff) {
  939. b = wc & 0xff;
  940. h = (wc >> 8);
  941. l = new_idx[h];
  942. h = new_idx[h+1];
  943. while ((m = (l+h) >> 1) != l) {
  944. if (b >= new_tbl[m]) {
  945. l = m;
  946. } else { /* wc < tbl[m] */
  947. h = m;
  948. }
  949. }
  950. count += new_wtbl[l]; /* none should be -1. */
  951. continue;
  952. }
  953. /* Redo this to minimize average number of compares?*/
  954. if (wc >= 0x1d167) {
  955. if (wc <= 0x1d1ad) {
  956. if ((wc <= 0x1d169
  957. || (wc >= 0x1d173
  958. && (wc <= 0x1d182
  959. || (wc >= 0x1d185
  960. && (wc <= 0x1d18b
  961. || (wc >= 0x1d1aa))))))
  962. ) {
  963. continue;
  964. }
  965. } else if (((wc >= 0xe0020) && (wc <= 0xe007f)) || (wc == 0xe0001)) {
  966. continue;
  967. } else if ((wc >= 0x20000) && (wc <= 0x2ffff)) {
  968. ++count; /* need 2.. add one here */
  969. }
  970. #if (WCHAR_MAX > 0x7fffffffL)
  971. else if (wc > 0x7fffffffL) {
  972. return -1;
  973. }
  974. #endif /* (WCHAR_MAX > 0x7fffffffL) */
  975. }
  976. ++count;
  977. }
  978. return count;
  979. }
  980. #else /* __UCLIBC_HAS_LOCALE__ */
  981. int wcswidth(const wchar_t *pwcs, size_t n)
  982. {
  983. int count;
  984. wchar_t wc;
  985. for (count = 0 ; n && (wc = *pwcs++) ; n--) {
  986. if (wc <= 0xff) {
  987. /* If we're here, wc != 0. */
  988. if ((wc < 32) || ((wc >= 0x7f) && (wc < 0xa0))) {
  989. return -1;
  990. }
  991. ++count;
  992. continue;
  993. } else {
  994. return -1;
  995. }
  996. }
  997. return count;
  998. }
  999. #endif /* __UCLIBC_HAS_LOCALE__ */
  1000. #endif
  1001. /**********************************************************************/
  1002. #ifdef L_wcwidth
  1003. int wcwidth(wchar_t wc)
  1004. {
  1005. return wcswidth(&wc, 1);
  1006. }
  1007. #endif
  1008. /**********************************************************************/
/* Conversion descriptor behind an iconv_t.
 * iconv_open() mallocs and fills one of these and returns it cast to
 * iconv_t; iconv() casts it back; iconv_close() frees it. */
typedef struct {
/* Output-side shift state; only ever reset to 0 in the code visible here. */
mbstate_t tostate;
/* Input-side shift state; handed to the UTF-8 decoder and reset to 0 on
 * open, on an explicit reset, and after an illegal UTF-8 sequence. */
mbstate_t fromstate;
/* Codeset id of the output encoding, as returned by find_codeset(). */
int tocodeset;
/* Codeset id of the input encoding; iconv() flips bit 0 of it when it
 * sees a byte-swapped BOM. */
int fromcodeset;
/* Nonzero until the first input character has been checked for a BOM. */
int frombom;
/* Nonzero until a BOM has been queued for output. */
int tobom;
/* Values of fromcodeset / frombom / tobom as set by iconv_open();
 * restored when iconv() is called with a NULL input buffer. */
int fromcodeset0;
int frombom0;
int tobom0;
/* 0: illegal input is an error.  Nonzero: iconv() skips illegal input and
 * sets this to 2 once it has actually skipped something. */
int skip_invalid_input; /* To support iconv -c option. */
} _UC_iconv_t;
  1021. #ifdef L_iconv
  1022. #include <iconv.h>
  1023. #include <string.h>
  1024. #include <endian.h>
  1025. #include <byteswap.h>
  1026. #if (__BYTE_ORDER != __BIG_ENDIAN) && (__BYTE_ORDER != __LITTLE_ENDIAN)
  1027. #error unsupported endianness for iconv
  1028. #endif
  1029. #ifndef __CTYPE_HAS_8_BIT_LOCALES
  1030. #error currently iconv requires 8 bit locales
  1031. #endif
  1032. #ifndef __CTYPE_HAS_UTF_8_LOCALES
  1033. #error currently iconv requires UTF-8 locales
  1034. #endif
  1035. enum {
  1036. IC_WCHAR_T = 0xe0,
  1037. IC_MULTIBYTE = 0xe0,
  1038. #if __BYTE_ORDER == __BIG_ENDIAN
  1039. IC_UCS_4 = 0xec,
  1040. IC_UTF_32 = 0xe4,
  1041. IC_UCS_2 = 0xe2,
  1042. IC_UTF_16 = 0xea,
  1043. #else
  1044. IC_UCS_4 = 0xed,
  1045. IC_UTF_32 = 0xe5,
  1046. IC_UCS_2 = 0xe3,
  1047. IC_UTF_16 = 0xeb,
  1048. #endif
  1049. IC_UTF_8 = 2,
  1050. IC_ASCII = 1
  1051. };
  1052. /* For the multibyte
  1053. * bit 0 means swap endian
  1054. * bit 1 means 2 byte
  1055. * bit 2 means 4 byte
  1056. *
  1057. */
  1058. const unsigned char __iconv_codesets[] =
  1059. "\x0a\xe0""WCHAR_T\x00" /* superset of UCS-4 but platform-endian */
  1060. #if __BYTE_ORDER == __BIG_ENDIAN
  1061. "\x08\xec""UCS-4\x00" /* always BE */
  1062. "\x0a\xec""UCS-4BE\x00"
  1063. "\x0a\xed""UCS-4LE\x00"
  1064. "\x09\fe4""UTF-32\x00" /* platform endian with BOM */
  1065. "\x0b\xe4""UTF-32BE\x00"
  1066. "\x0b\xe5""UTF-32LE\x00"
  1067. "\x08\xe2""UCS-2\x00" /* always BE */
  1068. "\x0a\xe2""UCS-2BE\x00"
  1069. "\x0a\xe3""UCS-2LE\x00"
  1070. "\x09\xea""UTF-16\x00" /* platform endian with BOM */
  1071. "\x0b\xea""UTF-16BE\x00"
  1072. "\x0b\xeb""UTF-16LE\x00"
  1073. #elif __BYTE_ORDER == __LITTLE_ENDIAN
  1074. "\x08\xed""UCS-4\x00" /* always BE */
  1075. "\x0a\xed""UCS-4BE\x00"
  1076. "\x0a\xec""UCS-4LE\x00"
  1077. "\x09\xf4""UTF-32\x00" /* platform endian with BOM */
  1078. "\x0b\xe5""UTF-32BE\x00"
  1079. "\x0b\xe4""UTF-32LE\x00"
  1080. "\x08\xe3""UCS-2\x00" /* always BE */
  1081. "\x0a\xe3""UCS-2BE\x00"
  1082. "\x0a\xe2""UCS-2LE\x00"
  1083. "\x09\xfa""UTF-16\x00" /* platform endian with BOM */
  1084. "\x0b\xeb""UTF-16BE\x00"
  1085. "\x0b\xea""UTF-16LE\x00"
  1086. #endif
  1087. "\x08\x02""UTF-8\x00"
  1088. "\x0b\x01""US-ASCII\x00"
  1089. "\x07\x01""ASCII"; /* Must be last! (special case to save a nul) */
  1090. static int find_codeset(const char *name)
  1091. {
  1092. const unsigned char *s;
  1093. int codeset;
  1094. for (s = __iconv_codesets ; *s ; s += *s) {
  1095. if (!strcasecmp(s+2, name)) {
  1096. return s[1];
  1097. }
  1098. }
  1099. /* The following is ripped from find_locale in locale.c. */
  1100. /* TODO: maybe CODESET_LIST + *s ??? */
  1101. /* 7bit is 1, UTF-8 is 2, 8-bit is >= 3 */
  1102. codeset = 2;
  1103. s = __LOCALE_DATA_CODESET_LIST;
  1104. do {
  1105. ++codeset; /* Increment codeset first. */
  1106. if (!strcasecmp(__LOCALE_DATA_CODESET_LIST+*s, name)) {
  1107. return codeset;
  1108. }
  1109. } while (*++s);
  1110. return 0; /* No matching codeset! */
  1111. }
  1112. iconv_t weak_function iconv_open(const char *tocode, const char *fromcode)
  1113. {
  1114. register _UC_iconv_t *px;
  1115. int tocodeset, fromcodeset;
  1116. if (((tocodeset = find_codeset(tocode)) != 0)
  1117. && ((fromcodeset = find_codeset(fromcode)) != 0)) {
  1118. if ((px = malloc(sizeof(_UC_iconv_t))) != NULL) {
  1119. px->tocodeset = tocodeset;
  1120. px->tobom0 = px->tobom = (tocodeset & 0x10) >> 4;
  1121. px->fromcodeset0 = px->fromcodeset = fromcodeset;
  1122. px->frombom0 = px->frombom = (fromcodeset & 0x10) >> 4;
  1123. px->skip_invalid_input = px->tostate.mask = px->fromstate.mask = 0;
  1124. return (iconv_t) px;
  1125. }
  1126. } else {
  1127. __set_errno(EINVAL);
  1128. }
  1129. return (iconv_t)(-1);
  1130. }
  1131. int weak_function iconv_close(iconv_t cd)
  1132. {
  1133. free(cd);
  1134. return 0;
  1135. }
  1136. size_t weak_function iconv(iconv_t cd, char **__restrict inbuf,
  1137. size_t *__restrict inbytesleft,
  1138. char **__restrict outbuf,
  1139. size_t *__restrict outbytesleft)
  1140. {
  1141. _UC_iconv_t *px = (_UC_iconv_t *) cd;
  1142. size_t nrcount, r;
  1143. wchar_t wc, wc2;
  1144. int inci, inco;
  1145. assert(px != (_UC_iconv_t *)(-1));
  1146. assert(sizeof(wchar_t) == 4);
  1147. if (!inbuf || !*inbuf) { /* Need to reinitialze conversion state. */
  1148. /* Note: For shift-state encodings we possibly need to output the
  1149. * shift sequence to return to initial state! */
  1150. if ((px->fromcodeset & 0xf0) == 0xe0) {
  1151. }
  1152. px->tostate.mask = px->fromstate.mask = 0;
  1153. px->fromcodeset = px->fromcodeset0;
  1154. px->tobom = px->tobom0;
  1155. px->frombom = px->frombom0;
  1156. return 0;
  1157. }
  1158. nrcount = 0;
  1159. while (*inbytesleft) {
  1160. if (!*outbytesleft) {
  1161. TOO_BIG:
  1162. __set_errno(E2BIG);
  1163. return (size_t) -1;
  1164. }
  1165. inci = inco = 1;
  1166. if (px->fromcodeset >= IC_MULTIBYTE) {
  1167. inci = (px->fromcodeset == IC_WCHAR_T) ? 4: (px->fromcodeset & 6);
  1168. if (*inbytesleft < inci) goto INVALID;
  1169. wc = (((unsigned int)((unsigned char)((*inbuf)[0]))) << 8)
  1170. + ((unsigned char)((*inbuf)[1]));
  1171. if (inci == 4) {
  1172. wc = (((unsigned int)((unsigned char)((*inbuf)[2]))) << 8)
  1173. + ((unsigned char)((*inbuf)[3])) + (wc << 16);
  1174. if (!(px->fromcodeset & 1)) wc = bswap_32(wc);
  1175. } else {
  1176. if (!(px->fromcodeset & 1)) wc = bswap_16(wc);
  1177. if (((px->fromcodeset & IC_UTF_16) == IC_UTF_16)
  1178. && (((__uwchar_t)(wc - 0xd800U)) < (0xdc00U - 0xd800U))
  1179. ) { /* surrogate */
  1180. wc =- 0xd800U;
  1181. if (*inbytesleft < 4) goto INVALID;
  1182. wc2 = (((unsigned int)((unsigned char)((*inbuf)[2]))) << 8)
  1183. + ((unsigned char)((*inbuf)[3]));
  1184. if (!(px->fromcodeset & 1)) wc = bswap_16(wc2);
  1185. if (((__uwchar_t)(wc2 -= 0xdc00U)) < (0xe0000U - 0xdc00U)) {
  1186. goto ILLEGAL;
  1187. }
  1188. inci = 4; /* Change inci here in case skipping illegals. */
  1189. wc = 0x10000UL + (wc << 10) + wc2;
  1190. }
  1191. }
  1192. if (px->frombom) {
  1193. px->frombom = 0;
  1194. if ((wc == 0xfeffU)
  1195. || (wc == ((inci == 4)
  1196. ? (((wchar_t) 0xfffe0000UL))
  1197. : ((wchar_t)(0xfffeUL))))
  1198. ) {
  1199. if (wc != 0xfeffU) {
  1200. px->fromcodeset ^= 1; /* toggle endianness */
  1201. wc = 0xfeffU;
  1202. }
  1203. if (!px->frombom) {
  1204. goto BOM_SKIP_OUTPUT;
  1205. }
  1206. goto GOT_BOM;
  1207. }
  1208. }
  1209. if (px->fromcodeset != IC_WCHAR_T) {
  1210. if (((__uwchar_t) wc) > (((px->fromcodeset & IC_UCS_4) == IC_UCS_4)
  1211. ? 0x7fffffffUL : 0x10ffffUL)
  1212. #ifdef KUHN
  1213. || (((__uwchar_t)(wc - 0xfffeU)) < 2)
  1214. || (((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U))
  1215. #endif
  1216. ) {
  1217. goto ILLEGAL;
  1218. }
  1219. }
  1220. } else if (px->fromcodeset == IC_UTF_8) {
  1221. const char *p = *inbuf;
  1222. r = _wchar_utf8sntowcs(&wc, 1, &p, *inbytesleft, &px->fromstate, 0);
  1223. if (((ssize_t) r) <= 0) { /* either EILSEQ or incomplete or nul */
  1224. if (((ssize_t) r) < 0) { /* either EILSEQ or incomplete or nul */
  1225. assert((r == (size_t)(-1)) || (r == (size_t)(-2)));
  1226. if (r == (size_t)(-2)) {
  1227. INVALID:
  1228. __set_errno(EINVAL);
  1229. } else {
  1230. px->fromstate.mask = 0;
  1231. inci = 1;
  1232. ILLEGAL:
  1233. if (px->skip_invalid_input) {
  1234. px->skip_invalid_input = 2; /* flag for iconv utility */
  1235. goto BOM_SKIP_OUTPUT;
  1236. }
  1237. __set_errno(EILSEQ);
  1238. }
  1239. return (size_t)(-1);
  1240. }
  1241. #ifdef __UCLIBC_MJN3_ONLY__
  1242. #warning TODO: optimize this.
  1243. #endif
  1244. if (p != NULL) { /* incomplete char case */
  1245. goto INVALID;
  1246. }
  1247. p = *inbuf + 1; /* nul */
  1248. }
  1249. inci = p - *inbuf;
  1250. } else if ((wc = ((unsigned char)(**inbuf))) >= 0x80) { /* Non-ASCII... */
  1251. if (px->fromcodeset == IC_ASCII) { /* US-ASCII codeset */
  1252. goto ILLEGAL;
  1253. } else { /* some other 8-bit ascii-extension codeset */
  1254. const __codeset_8_bit_t *c8b
  1255. = __locale_mmap->codeset_8_bit + px->fromcodeset - 3;
  1256. wc -= 0x80;
  1257. wc = __UCLIBC_CURLOCALE_DATA.tbl8c2wc[
  1258. (c8b->idx8c2wc[wc >> Cc2wc_IDX_SHIFT]
  1259. << Cc2wc_IDX_SHIFT) + (wc & (Cc2wc_ROW_LEN - 1))];
  1260. if (!wc) {
  1261. goto ILLEGAL;
  1262. }
  1263. }
  1264. }
  1265. if (px->tobom) {
  1266. inci = 0;
  1267. wc = 0xfeffU;
  1268. GOT_BOM:
  1269. px->tobom = 0;
  1270. }
  1271. if (px->tocodeset >= IC_MULTIBYTE) {
  1272. inco = (px->tocodeset == IC_WCHAR_T) ? 4: (px->tocodeset & 6);
  1273. if (*outbytesleft < inci) goto TOO_BIG;
  1274. if (px->tocodeset != IC_WCHAR_T) {
  1275. if (((__uwchar_t) wc) > (((px->tocodeset & IC_UCS_4) == IC_UCS_4)
  1276. ? 0x7fffffffUL : 0x10ffffUL)
  1277. #ifdef KUHN
  1278. || (((__uwchar_t)(wc - 0xfffeU)) < 2)
  1279. || (((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U))
  1280. #endif
  1281. ) {
  1282. REPLACE_32:
  1283. wc = 0xfffd;
  1284. ++nrcount;
  1285. }
  1286. }
  1287. if (inco == 4) {
  1288. if (px->tocodeset & 1) wc = bswap_32(wc);
  1289. } else {
  1290. if (((__uwchar_t)wc ) > 0xffffU) {
  1291. if ((px->tocodeset & IC_UTF_16) != IC_UTF_16) {
  1292. goto REPLACE_32;
  1293. }
  1294. if (*outbytesleft < (inco = 4)) goto TOO_BIG;
  1295. wc2 = 0xdc00U + (wc & 0x3ff);
  1296. wc = 0xd800U + ((wc >> 10) & 0x3ff);
  1297. if (px->tocodeset & 1) {
  1298. wc = bswap_16(wc);
  1299. wc2 = bswap_16(wc2);
  1300. }
  1301. wc += (wc2 << 16);
  1302. } else if (px->tocodeset & 1) wc = bswap_16(wc);
  1303. }
  1304. (*outbuf)[0] = (char)((unsigned char)(wc));
  1305. (*outbuf)[1] = (char)((unsigned char)(wc >> 8));
  1306. if (inco == 4) {
  1307. (*outbuf)[2] = (char)((unsigned char)(wc >> 16));
  1308. (*outbuf)[3] = (char)((unsigned char)(wc >> 24));
  1309. }
  1310. } else if (px->tocodeset == IC_UTF_8) {
  1311. const wchar_t *pw = &wc;
  1312. do {
  1313. r = _wchar_wcsntoutf8s(*outbuf, *outbytesleft, &pw, 1);
  1314. if (r != (size_t)(-1)) {
  1315. #ifdef __UCLIBC_MJN3_ONLY__
  1316. #warning TODO: What happens for a nul?
  1317. #endif
  1318. if (r == 0) {
  1319. if (wc != 0) {
  1320. goto TOO_BIG;
  1321. }
  1322. ++r;
  1323. }
  1324. break;
  1325. }
  1326. wc = 0xfffdU;
  1327. ++nrcount;
  1328. } while (1);
  1329. inco = r;
  1330. } else if (((__uwchar_t)(wc)) < 0x80) {
  1331. CHAR_GOOD:
  1332. **outbuf = wc;
  1333. } else {
  1334. if ((px->tocodeset != 0x01) && (wc <= Cwc2c_DOMAIN_MAX)) {
  1335. const __codeset_8_bit_t *c8b
  1336. = __locale_mmap->codeset_8_bit + px->tocodeset - 3;
  1337. __uwchar_t u;
  1338. u = c8b->idx8wc2c[wc >> (Cwc2c_TI_SHIFT + Cwc2c_TT_SHIFT)];
  1339. u = __UCLIBC_CURLOCALE_DATA.tbl8wc2c[(u << Cwc2c_TI_SHIFT)
  1340. + ((wc >> Cwc2c_TT_SHIFT)
  1341. & ((1 << Cwc2c_TI_SHIFT)-1))];
  1342. wc = __UCLIBC_CURLOCALE_DATA.tbl8wc2c[Cwc2c_TI_LEN
  1343. + (u << Cwc2c_TT_SHIFT)
  1344. + (wc & ((1 << Cwc2c_TT_SHIFT)-1))];
  1345. if (wc) {
  1346. goto CHAR_GOOD;
  1347. }
  1348. }
  1349. **outbuf = '?';
  1350. ++nrcount;
  1351. }
  1352. *outbuf += inco;
  1353. *outbytesleft -= inco;
  1354. BOM_SKIP_OUTPUT:
  1355. *inbuf += inci;
  1356. *inbytesleft -= inci;
  1357. }
  1358. return nrcount;
  1359. }
  1360. #endif
  1361. /**********************************************************************/
  1362. #ifdef L_iconv_main
  1363. #include <stdio.h>
  1364. #include <stdlib.h>
  1365. #include <string.h>
  1366. #include <wchar.h>
  1367. #include <iconv.h>
  1368. #include <stdarg.h>
  1369. #include <libgen.h>
/* Built-in codeset table of the iconv implementation; main() walks it to
 * print the names for the -l option. */
extern const unsigned char __iconv_codesets[];
/* Sizes of the input and output staging buffers used by main(). */
#define IBUF BUFSIZ
#define OBUF BUFSIZ
/* Program name as invoked (argv[0]); prefixed to messages by error_msg(). */
char *progname;
/* Nonzero once -s was given: error_msg() then exits without printing. */
int hide_errors;
  1375. static void error_msg(const char *fmt, ...)
  1376. __attribute__ ((noreturn, format (printf, 1, 2)));
  1377. static void error_msg(const char *fmt, ...)
  1378. {
  1379. va_list arg;
  1380. if (!hide_errors) {
  1381. fprintf(stderr, "%s: ", progname);
  1382. va_start(arg, fmt);
  1383. vfprintf(stderr, fmt, arg);
  1384. va_end(arg);
  1385. }
  1386. exit(EXIT_FAILURE);
  1387. }
  1388. int main(int argc, char **argv)
  1389. {
  1390. FILE *ifile;
  1391. FILE *ofile = stdout;
  1392. const char *p;
  1393. const char *s;
  1394. static const char opt_chars[] = "tfocsl";
  1395. /* 012345 */
  1396. const char *opts[sizeof(opt_chars)]; /* last is infile name */
  1397. iconv_t ic;
  1398. char ibuf[IBUF];
  1399. char obuf[OBUF];
  1400. char *pi;
  1401. char *po;
  1402. size_t ni, no, r, pos;
  1403. hide_errors = 0;
  1404. for (s = opt_chars ; *s ; s++) {
  1405. opts[ s - opt_chars ] = NULL;
  1406. }
  1407. progname = *argv;
  1408. while (--argc) {
  1409. p = *++argv;
  1410. if ((*p != '-') || (*++p == 0)) {
  1411. break;
  1412. }
  1413. do {
  1414. if ((s = strchr(opt_chars,*p)) == NULL) {
  1415. USAGE:
  1416. s = basename(progname);
  1417. fprintf(stderr,
  1418. "%s [-cs] -f fromcode -t tocode [-o outputfile] [inputfile ...]\n"
  1419. " or\n%s -l\n", s, s);
  1420. return EXIT_FAILURE;
  1421. }
  1422. if ((s - opt_chars) < 3) {
  1423. if ((--argc == 0) || opts[s - opt_chars]) {
  1424. goto USAGE;
  1425. }
  1426. opts[s - opt_chars] = *++argv;
  1427. } else {
  1428. opts[s - opt_chars] = p;
  1429. }
  1430. } while (*++p);
  1431. }
  1432. if (opts[5]) { /* -l */
  1433. fprintf(stderr, "Recognized codesets:\n");
  1434. for (s = __iconv_codesets ; *s ; s += *s) {
  1435. fprintf(stderr," %s\n", s+2);
  1436. }
  1437. s = __LOCALE_DATA_CODESET_LIST;
  1438. do {
  1439. fprintf(stderr," %s\n", __LOCALE_DATA_CODESET_LIST+ (unsigned char)(*s));
  1440. } while (*++s);
  1441. return EXIT_SUCCESS;
  1442. }
  1443. if (opts[4]) {
  1444. hide_errors = 1;
  1445. }
  1446. if (!opts[0] || !opts[1]) {
  1447. goto USAGE;
  1448. }
  1449. if ((ic = iconv_open(opts[0],opts[1])) == ((iconv_t)(-1))) {
  1450. error_msg( "unsupported codeset in %s -> %s conversion\n", opts[0], opts[1]);
  1451. }
  1452. if (opts[3]) { /* -c */
  1453. ((_UC_iconv_t *) ic)->skip_invalid_input = 1;
  1454. }
  1455. if ((s = opts[2]) != NULL) {
  1456. if (!(ofile = fopen(s, "w"))) {
  1457. error_msg( "couldn't open %s for writing\n", s);
  1458. }
  1459. }
  1460. pos = ni = 0;
  1461. do {
  1462. if (!argc || ((**argv == '-') && !((*argv)[1]))) {
  1463. ifile = stdin; /* we don't check for duplicates */
  1464. } else if (!(ifile = fopen(*argv, "r"))) {
  1465. error_msg( "couldn't open %s for reading\n", *argv);
  1466. }
  1467. while ((r = fread(ibuf + ni, 1, IBUF - ni, ifile)) > 0) {
  1468. pos += r;
  1469. ni += r;
  1470. no = OBUF;
  1471. pi = ibuf;
  1472. po = obuf;
  1473. if ((r = iconv(ic, &pi, &ni, &po, &no)) == ((size_t)(-1))) {
  1474. if ((errno != EINVAL) && (errno != E2BIG)) {
  1475. error_msg( "iconv failed at pos %lu : %m\n", (unsigned long) (pos - ni));
  1476. }
  1477. }
  1478. if ((r = OBUF - no) > 0) {
  1479. if (fwrite(obuf, 1, OBUF - no, ofile) < r) {
  1480. error_msg( "write error\n");
  1481. }
  1482. }
  1483. if (ni) { /* still bytes in buffer! */
  1484. memmove(ibuf, pi, ni);
  1485. }
  1486. }
  1487. if (ferror(ifile)) {
  1488. error_msg( "read error\n");
  1489. }
  1490. ++argv;
  1491. if (ifile != stdin) {
  1492. fclose(ifile);
  1493. }
  1494. } while (--argc > 0);
  1495. iconv_close(ic);
  1496. if (ni) {
  1497. error_msg( "incomplete sequence\n");
  1498. }
  1499. return (((_UC_iconv_t *) ic)->skip_invalid_input < 2)
  1500. ? EXIT_SUCCESS : EXIT_FAILURE;
  1501. }
  1502. #endif
  1503. /**********************************************************************/