12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746 |
- /* Copyright (C) 2002, 2003, 2004 Manuel Novoa III
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Library General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Library General Public License for more details.
- *
- * You should have received a copy of the GNU Library General Public
- * License along with this library; if not, write to the Free
- * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
- /* ATTENTION! ATTENTION! ATTENTION! ATTENTION! ATTENTION!
- *
- * Besides uClibc, I'm using this code in my libc for elks, which is
- * a 16-bit environment with a fairly limited compiler. It would make
- * things much easier for me if this file isn't modified unnecessarily.
- * In particular, please put any new or replacement functions somewhere
- * else, and modify the makefile to use your version instead.
- * Thanks. Manuel
- *
- * ATTENTION! ATTENTION! ATTENTION! ATTENTION! ATTENTION! */
- /* May 23, 2002 Initial Notes:
- *
- * I'm still tweaking this stuff, but it passes the tests I've thrown
- * at it, and Erik needs it for the gcc port. The glibc extension
- * __wcsnrtombs() hasn't been tested, as I didn't find a test for it
- * in the glibc source. I also need to fix the behavior of
- * _wchar_utf8sntowcs() if the max number of wchars to convert is 0.
- *
- * UTF-8 -> wchar -> UTF-8 conversion tests on Markus Kuhn's UTF-8-demo.txt
- * file on my platform (x86) show about 5-10% faster conversion speed than
- * glibc with mbsrtowcs()/wcsrtombs() and almost twice as fast as glibc with
- * individual mbrtowc()/wcrtomb() calls.
- *
- * If 'DECODER' is defined, then _wchar_utf8sntowcs() will be compiled
- * as a fail-safe UTF-8 decoder appropriate for a terminal, etc. which
- * needs to deal gracefully with whatever is sent to it. In that mode,
- * it passes Markus Kuhn's UTF-8-test.txt stress test. I plan to add
- * an arg to force that behavior, so the interface will be changing.
- *
- * I need to fix the error checking for 16-bit wide chars. This isn't
- * an issue for uClibc, but may be for ELKS. I'm currently not sure
- * if I'll use 16-bit, 32-bit, or configurable wchars in ELKS.
- *
- * July 1, 2002
- *
- * Fixed _wchar_utf8sntowcs() for the max number of wchars == 0 case.
- * Fixed nul-char bug in btowc(), and another in __mbsnrtowcs() for 8-bit
- * locales.
- * Enabled building of a C/POSIX-locale-only version, so full locale support
- * no longer needs to be enabled.
- *
- * Nov 4, 2002
- *
- * Fixed a bug in _wchar_wcsntoutf8s(). Don't store wcs position if dst is NULL.
- * Also, introduce an awful hack into _wchar_wcsntoutf8s() and wcsrtombs() in
- * order to support %ls in printf. See comments below for details.
- * Change behaviour of wc<->mb functions when in the C locale. Now they do
- * a 1-1 map for the range 0x80-UCHAR_MAX. This is for backwards compatibility
- * and consistency with the standard's requirement that a printf format string be
- * a valid multibyte string beginning and ending in its initial shift state.
- *
- * Nov 5, 2002
- *
- * Forgot to change btowc and wctob when I changed the wc<->mb functions yesterday.
- *
- * Nov 7, 2002
- *
- * Add wcwidth and wcswidth, based on Markus Kuhn's wcwidth of 2002-05-08.
- * Added some size/speed optimizations and integrated it into my locale
- * framework. Minimally tested at the moment, but the stub C-locale
- * version (which most people would probably be using) should be fine.
- *
- * Nov 21, 2002
- *
- * Revert the wc<->mb changes from earlier this month involving the C-locale.
- * Add a couple of ugly hacks to support *wprintf.
- * Add a mini iconv() and iconv implementation (requires locale support).
- *
- * Aug 1, 2003
- * Bug fix for mbrtowc.
- *
- * Aug 18, 2003
- * Bug fix: _wchar_utf8sntowcs and _wchar_wcsntoutf8s now set errno if EILSEQ.
- *
- * Feb 11, 2004
- * Bug fix: Fix size check for remaining output space in iconv().
- *
- * Manuel
- */
- #include <errno.h>
- #include <stddef.h>
- #include <limits.h>
- #include <stdint.h>
- #include <inttypes.h>
- #include <stdlib.h>
- #include <stdio.h>
- #include <assert.h>
- #include <locale.h>
- #include <wchar.h>
- #include <bits/uClibc_uwchar.h>
- /**********************************************************************/
- #ifdef __UCLIBC_HAS_LOCALE__
- #ifdef __UCLIBC_MJN3_ONLY__
- #ifdef L_iswspace
- /* generates one warning */
- #warning TODO: Fix Cc2wc* and Cwc2c* defines!
- #endif
- #endif /* __UCLIBC_MJN3_ONLY__ */
- #define ENCODING (__UCLIBC_CURLOCALE->encoding)
- #define Cc2wc_IDX_SHIFT __LOCALE_DATA_Cc2wc_IDX_SHIFT
- #define Cc2wc_ROW_LEN __LOCALE_DATA_Cc2wc_ROW_LEN
- #define Cwc2c_DOMAIN_MAX __LOCALE_DATA_Cwc2c_DOMAIN_MAX
- #define Cwc2c_TI_SHIFT __LOCALE_DATA_Cwc2c_TI_SHIFT
- #define Cwc2c_TT_SHIFT __LOCALE_DATA_Cwc2c_TT_SHIFT
- #define Cwc2c_TI_LEN __LOCALE_DATA_Cwc2c_TI_LEN
- #ifndef __CTYPE_HAS_UTF_8_LOCALES
- #warning __CTYPE_HAS_UTF_8_LOCALES not set!
- #endif
- #else /* __UCLIBC_HAS_LOCALE__ */
- #ifdef __UCLIBC_MJN3_ONLY__
- #ifdef L_btowc
- /* emit only once */
- #warning fix preprocessor logic testing locale settings
- #endif
- #endif
- #define ENCODING (__ctype_encoding_7_bit)
- #ifdef __CTYPE_HAS_8_BIT_LOCALES
- #error __CTYPE_HAS_8_BIT_LOCALES is defined!
- #endif
- #ifdef __CTYPE_HAS_UTF_8_LOCALES
- #error __CTYPE_HAS_UTF_8_LOCALES is defined!
- #endif
- #undef L__wchar_utf8sntowcs
- #undef L__wchar_wcsntoutf8s
- #endif /* __UCLIBC_HAS_LOCALE__ */
- /**********************************************************************/
- #if WCHAR_MAX > 0xffffUL
- #define UTF_8_MAX_LEN 6
- #else
- #define UTF_8_MAX_LEN 3
- #endif
- #define KUHN 1
- /* Implementation-specific work functions. */
- extern size_t _wchar_utf8sntowcs(wchar_t *__restrict pwc, size_t wn,
- const char **__restrict src, size_t n,
- mbstate_t *ps, int allow_continuation) attribute_hidden;
- extern size_t _wchar_wcsntoutf8s(char *__restrict s, size_t n,
- const wchar_t **__restrict src, size_t wn) attribute_hidden;
- /**********************************************************************/
- #ifdef L_btowc
- /* libc_hidden_proto(mbrtowc) */
- /* libc_hidden_proto(btowc) */
/*
 * btowc: convert a single byte to its wide-character equivalent.
 *
 *   c - the byte to convert (an unsigned char value passed as int), or EOF.
 *
 * Returns the corresponding wide character, or WEOF when c is EOF or is not
 * a valid one-byte character.
 */
wint_t btowc(int c)
{
#ifdef __CTYPE_HAS_8_BIT_LOCALES
	/* Start from nul: mbrtowc() may report a nul character (return 0)
	 * without writing through its first argument, and wc is returned for
	 * both the 0 and the 1 result.  Without this, btowc(0) could hand back
	 * an indeterminate value. */
	wchar_t wc = L'\0';
	unsigned char buf[1];
	mbstate_t mbstate;

	if (c != EOF) {
		*buf = (unsigned char) c;
		mbstate.__mask = 0;		/* Initialize the mbstate. */
		/* 0 (nul) and 1 (one byte consumed) are the only successes;
		 * the error returns are huge when viewed as size_t. */
		if (mbrtowc(&wc, (char*) buf, 1, &mbstate) <= 1) {
			return wc;
		}
	}
	return WEOF;
#else  /* !__CTYPE_HAS_8_BIT_LOCALES */
#ifdef __UCLIBC_HAS_LOCALE__
	assert((ENCODING == __ctype_encoding_7_bit)
		   || (ENCODING == __ctype_encoding_utf8));
#endif
	/* If we don't have 8-bit locale support, then this is trivial since
	 * anything outside of 0-0x7f is illegal in C/POSIX and UTF-8 locales.
	 * The unsigned cast also rejects EOF and any other negative value. */
	return (((unsigned int)c) < 0x80) ? c : WEOF;
#endif /* !__CTYPE_HAS_8_BIT_LOCALES */
}
- libc_hidden_def(btowc)
- #endif
- /**********************************************************************/
- #ifdef L_wctob
- /* Note: We completely ignore ps in all currently supported conversions. */
- /* libc_hidden_proto(wcrtomb) */
/*
 * wctob: convert a wide character to its single-byte representation.
 *
 * Returns the byte value when c maps to exactly one byte, otherwise EOF.
 */
int wctob(wint_t c)
{
#ifdef __CTYPE_HAS_8_BIT_LOCALES
	unsigned char mb[MB_LEN_MAX];

	/* Only a conversion that yields exactly one byte qualifies. */
	if (wcrtomb((char *) mb, c, NULL) != 1) {
		return EOF;
	}
	return mb[0];
#else  /* __CTYPE_HAS_8_BIT_LOCALES */
#ifdef __UCLIBC_HAS_LOCALE__
	assert((ENCODING == __ctype_encoding_7_bit)
		   || (ENCODING == __ctype_encoding_utf8));
#endif /* __UCLIBC_HAS_LOCALE__ */
	/* Without 8-bit locale support only 0-0x7f is representable in the
	 * C/POSIX and UTF-8 locales.
	 * TODO: need unsigned version of wint_t (the lower-bound test only
	 * matters where wint_t is signed). */
	if ((c >= 0) && (c < 0x80)) {
		return c;
	}
	return EOF;
#endif /* __CTYPE_HAS_8_BIT_LOCALES */
}
- #endif
- /**********************************************************************/
- #ifdef L_mbsinit
- /* libc_hidden_proto(mbsinit) */
- int mbsinit(const mbstate_t *ps)
- {
- return !ps || !ps->__mask;
- }
- libc_hidden_def(mbsinit)
- #endif
- /**********************************************************************/
- #ifdef L_mbrlen
- /* libc_hidden_proto(mbrtowc) */
- /* libc_hidden_proto(mbrlen) */
/*
 * mbrlen: length in bytes of the next multibyte character in s.
 *
 * Thin wrapper around mbrtowc() that discards the converted character.
 * When ps is NULL a private state object owned by this function is used.
 */
size_t mbrlen(const char *__restrict s, size_t n, mbstate_t *__restrict ps)
{
	/* Static storage is zero-initialized, i.e. the initial state. */
	static mbstate_t internal_state;

	if (ps == NULL) {
		ps = &internal_state;
	}
	return mbrtowc(NULL, s, n, ps);
}
- libc_hidden_def(mbrlen)
- #endif
- /**********************************************************************/
- #ifdef L_mbrtowc
- /* libc_hidden_proto(mbsnrtowcs) */
- /* libc_hidden_proto(mbrtowc) */
/*
 * mbrtowc: convert the next multibyte character in s to a wide character.
 *
 *   pwc - where to store the result; may be NULL to only measure.
 *   s   - input bytes; NULL means "convert an empty string" (state query).
 *   n   - maximum number of bytes of s to examine.
 *   ps  - conversion state; NULL selects a private static state object.
 *
 * Returns 0 for a nul character, the number of bytes consumed for any other
 * character, or the (size_t) -1 / (size_t) -2 results produced by the
 * underlying conversion routines.
 */
size_t mbrtowc(wchar_t *__restrict pwc, const char *__restrict s,
			   size_t n, mbstate_t *__restrict ps)
{
	static mbstate_t mbstate;	/* Rely on bss 0-init. */
	wchar_t wcbuf[1];
	const char *p;
	size_t r;
	char empty_string[1];	/* Avoid static to be fPIC friendly. */

	if (!ps) {
		ps = &mbstate;
	}

	if (!s) {
		pwc = NULL;
		empty_string[0] = 0;	/* Init the empty string when necessary. */
		s = empty_string;
		n = 1;
	} else if (!n) {
		/* Checked before touching *s: with n == 0 no byte of s may be
		 * examined.
		 * TODO: change error code?  The historical result of 0 is kept.
		 * NOTE(review): ISO C describes (size_t) -2 for input that is
		 * incomplete after n bytes -- confirm before changing. */
		return 0;
	} else if (*s == '\0') {
		/* A nul byte converts to the null wide character, which must be
		 * stored for the caller before reporting 0. */
		if (pwc) {
			*pwc = L'\0';
		}
		return 0;
	}

	p = s;

#ifdef __CTYPE_HAS_UTF_8_LOCALES
	/* Need to do this here since mbsrtowcs doesn't allow incompletes. */
	if (ENCODING == __ctype_encoding_utf8) {
		if (!pwc) {
			pwc = wcbuf;
		}
		r = _wchar_utf8sntowcs(pwc, 1, &p, n, ps, 1);
		/* One wide char converted: report bytes consumed.  Anything else
		 * (0 for nul, or an error code) is passed through unchanged. */
		return (r == 1) ? (size_t) (p - s) : r;
	}
#endif

#ifdef __UCLIBC_MJN3_ONLY__
#warning TODO: This adds a trailing nul!
#endif /* __UCLIBC_MJN3_ONLY__ */

	/* Single-byte encodings: convert exactly one character into wcbuf. */
	r = mbsnrtowcs(wcbuf, &p, SIZE_MAX, 1, ps);

	if (((ssize_t) r) >= 0) {
		if (pwc) {
			*pwc = *wcbuf;
		}
	}
	return (size_t) r;
}
- libc_hidden_def(mbrtowc)
- #endif
- /**********************************************************************/
- #ifdef L_wcrtomb
- /* libc_hidden_proto(wcsnrtombs) */
- /* Note: We completely ignore ps in all currently supported conversions. */
- /* TODO: Check for valid state anyway? */
- /* libc_hidden_proto(wcrtomb) */
/*
 * wcrtomb: convert one wide character to its multibyte representation.
 *
 *   s  - output buffer; when NULL a scratch buffer is used and the nul
 *        wide character is converted instead of wc.
 *   wc - the wide character to convert.
 *   ps - conversion state, handed on to wcsnrtombs().
 *
 * Returns the number of bytes produced; a nul wide character counts as one
 * byte.  Error results from wcsnrtombs() are returned unchanged.
 */
size_t wcrtomb(register char *__restrict s, wchar_t wc,
			   mbstate_t *__restrict ps)
{
#ifdef __UCLIBC_MJN3_ONLY__
#warning TODO: Should wcsnrtombs nul-terminate unconditionally?  Check glibc.
#endif /* __UCLIBC_MJN3_ONLY__ */
	char scratch[MB_LEN_MAX];
	wchar_t one_wc[1];
	const wchar_t *in;
	size_t nbytes;

	if (s == NULL) {
		s = scratch;
		wc = 0;
	}

	one_wc[0] = wc;
	in = one_wc;

	nbytes = wcsnrtombs(s, &in, 1, MB_LEN_MAX, ps);
	if (nbytes == 0) {
		/* wcsnrtombs() does not count the terminating nul; we do. */
		return 1;
	}
	return nbytes;
}
- libc_hidden_def(wcrtomb)
- #endif
- /**********************************************************************/
- #ifdef L_mbsrtowcs
- /* libc_hidden_proto(mbsnrtowcs) */
- /* libc_hidden_proto(mbsrtowcs) */
/*
 * mbsrtowcs: convert a nul-terminated multibyte string to wide characters.
 *
 * Forwards to mbsnrtowcs() with no limit on the number of input bytes.
 * When ps is NULL a private state object owned by this function is used.
 */
size_t mbsrtowcs(wchar_t *__restrict dst, const char **__restrict src,
				 size_t len, mbstate_t *__restrict ps)
{
	/* Static storage is zero-initialized, i.e. the initial state. */
	static mbstate_t internal_state;

	if (ps == NULL) {
		ps = &internal_state;
	}
	return mbsnrtowcs(dst, src, SIZE_MAX, len, ps);
}
- libc_hidden_def(mbsrtowcs)
- #endif
- /**********************************************************************/
- #ifdef L_wcsrtombs
- /* Note: We completely ignore ps in all currently supported conversions.
- * TODO: Check for valid state anyway? */
- /* libc_hidden_proto(wcsnrtombs) */
- /* libc_hidden_proto(wcsrtombs) */
/*
 * wcsrtombs: convert a nul-terminated wide string to a multibyte string.
 *
 * Forwards to wcsnrtombs() with no limit on the number of input wide
 * characters; ps is passed through untouched.
 */
size_t wcsrtombs(char *__restrict dst, const wchar_t **__restrict src,
				 size_t len, mbstate_t *__restrict ps)
{
	const size_t unlimited_input = SIZE_MAX;

	return wcsnrtombs(dst, src, unlimited_input, len, ps);
}
- libc_hidden_def(wcsrtombs)
- #endif
- /**********************************************************************/
- #ifdef L__wchar_utf8sntowcs
- /* Define DECODER to generate a UTF-8 decoder which passes Markus Kuhn's
- * UTF-8-test.txt stress test.
- */
- /* #define DECODER */
- #ifdef DECODER
- #ifndef KUHN
- #define KUHN
- #endif
- #endif
/*
 * _wchar_utf8sntowcs: convert UTF-8 bytes to wide characters.
 *
 *   pwc  - destination buffer.  If NULL, or equal to (wchar_t *) ps, a
 *          one-element scratch buffer is used instead (length-only mode);
 *          with NULL, wn is additionally treated as SIZE_MAX.
 *   wn   - maximum number of wide characters to produce; 0 returns 0 at once.
 *   src  - in/out pointer to the input bytes; *src must not be NULL.
 *   n    - maximum number of input bytes to consume.
 *   ps   - conversion state, must not be NULL.  A nonzero __mask means a
 *          partial character is pending in __wc; in non-DECODER builds
 *          __wc == 0xffffU marks a previously recorded error.
 *   allow_continuation - if nonzero, input that ends inside a character
 *          saves the partial state in *ps, stores the input position in
 *          *src and returns (size_t) -2.
 *
 * Returns the number of wide characters converted (wn - count).  In
 * non-DECODER builds conversion stops after a nul wide character, which is
 * stored but not counted, and the input position becomes NULL; an invalid
 * sequence sets errno to EILSEQ and returns (size_t) -1.  In DECODER builds
 * invalid input is replaced by 0xfffd instead.  On the normal exit path
 * *src is written back only when a real destination buffer was supplied.
 */
- size_t attribute_hidden _wchar_utf8sntowcs(wchar_t *__restrict pwc, size_t wn,
- const char **__restrict src, size_t n,
- mbstate_t *ps, int allow_continuation)
- {
- register const char *s;
- __uwchar_t mask;
- __uwchar_t wc;
- wchar_t wcbuf[1]; /* Scratch slot used when no real output buffer is given. */
- size_t count; /* Wide characters that may still be produced. */
- int incr; /* 1 to advance pwc after each store, 0 in length-only mode. */
- s = *src;
- assert(s != NULL);
- assert(ps != NULL);
- incr = 1;
- /* NOTE: The following is an AWFUL HACK! In order to support %s in
- * wprintf, we need to be able to compute the number of wchars needed
- * for the mbs conversion, not to exceed the precision specified.
- * But if dst is NULL, the return value is the length assuming a
- * sufficiently sized buffer. So, we allow passing of (wchar_t *) ps
- * as pwc in order to flag that we really want the length, subject
- * to the restricted buffer size and no partial conversions.
- * See mbsnrtowcs() as well. */
- if (!pwc || (pwc == ((wchar_t *)ps))) {
- if (!pwc) {
- wn = SIZE_MAX;
- }
- pwc = wcbuf;
- incr = 0;
- }
- /* This is really here only to support the glibc extension function
- * __mbsnrtowcs which apparently returns 0 if wn == 0 without any
- * check on the validity of the mbstate. */
- if (!(count = wn)) {
- return 0;
- }
- if ((mask = (__uwchar_t) ps->__mask) != 0) { /* A continuation... */
- #ifdef DECODER
- wc = (__uwchar_t) ps->__wc;
- if (n) {
- goto CONTINUE;
- }
- goto DONE;
- #else
- if ((wc = (__uwchar_t) ps->__wc) != 0xffffU) {
- /* TODO: change error code here and below? */
- if (n) {
- goto CONTINUE;
- }
- goto DONE;
- }
- __set_errno(EILSEQ);
- return (size_t) -1; /* We're in an error state. */
- #endif
- }
- do {
- if (!n) {
- goto DONE;
- }
- --n;
- if ((wc = ((unsigned char) *s++)) >= 0x80) { /* Not ASCII... */
- mask = 0x40;
- #ifdef __UCLIBC_MJN3_ONLY__
- #warning TODO: Fix range for 16 bit wchar_t case.
- #endif
/* Accept lead bytes 0xc2..0xfd only: 0x80..0xbf, 0xc0, 0xc1, 0xfe and
 * 0xff all fall through to BAD. */
- if (( ((unsigned char)(s[-1] - 0xc0)) < (0xfe - 0xc0) ) &&
- (((unsigned char)s[-1] != 0xc0 ) && ((unsigned char)s[-1] != 0xc1 ))) {
- goto START;
- }
- BAD:
- #ifdef DECODER
- wc = 0xfffdU;
- goto COMPLETE;
- #else
- ps->__mask = mask;
- ps->__wc = 0xffffU; /* Error marker tested on the next call. */
- __set_errno(EILSEQ);
- return (size_t) -1; /* Illegal start byte! */
- #endif
- CONTINUE:
- while (n) {
- --n;
- if ((*s & 0xc0) != 0x80) { /* Not a 10xxxxxx continuation byte. */
- goto BAD;
- }
- mask <<= 5;
- wc <<= 6;
- wc += (*s & 0x3f); /* keep separate for bcc (smaller code) */
- ++s;
- START:
- wc &= ~(mask << 1); /* Drop the length-marker bit just above mask. */
- if ((wc & mask) == 0) { /* Character completed. */
- if ((mask >>= 5) == 0x40) {
- mask += mask;
- }
/* mask now holds the smallest value that needs a sequence of this length
 * (0x80 for two bytes, 0x800 for three, ...). */
- /* Check for invalid sequences (longer than necessary)
- * and invalid chars. */
- if ( (wc < mask) /* Sequence not minimal length. */
- #ifdef KUHN
- #if UTF_8_MAX_LEN == 3
- #error broken since mask can overflow!!
- /* For plane 0, these are the only defined values.*/
- || (wc > 0xfffdU)
- #else
- /* Note that we don't need to worry about exceeding */
- /* 31 bits as that is the most that UTF-8 provides. */
- || ( ((__uwchar_t)(wc - 0xfffeU)) < 2)
- #endif
- || ( ((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U) )
- #endif /* KUHN */
- ) {
- goto BAD;
- }
- goto COMPLETE;
- }
- }
- /* Character potentially valid but incomplete. */
- if (!allow_continuation) {
/* NOTE(review): when earlier characters were already converted in this
 * call, 0 is returned here without updating *src or *ps -- confirm this
 * is intended. */
- if (count != wn) {
- return 0;
- }
- /* NOTE: The following can fail if you allow and then disallow
- * continuation!!! */
- #if UTF_8_MAX_LEN == 3
- #error broken since mask can overflow!!
- #endif
- /* Need to back up... */
- do {
- --s;
- } while ((mask >>= 5) >= 0x40);
- goto DONE;
- }
- ps->__mask = (wchar_t) mask;
- ps->__wc = (wchar_t) wc;
- *src = s;
- return (size_t) -2;
- }
- COMPLETE:
- *pwc = wc;
- pwc += incr;
- }
- #ifdef DECODER
- while (--count);
- #else
- while (wc && --count); /* A nul ends the loop without being counted. */
- if (!wc) {
- s = NULL;
- }
- #endif
- DONE:
- /* ps->__wc is irrelevant here. */
- ps->__mask = 0;
- if (pwc != wcbuf) { /* Length-only mode leaves *src untouched. */
- *src = s;
- }
- return wn - count;
- }
- #endif
- /**********************************************************************/
- #ifdef L__wchar_wcsntoutf8s
- size_t attribute_hidden _wchar_wcsntoutf8s(char *__restrict s, size_t n,
- const wchar_t **__restrict src, size_t wn)
- {
- register char *p;
- size_t len, t;
- __uwchar_t wc;
- const __uwchar_t *swc;
- int store;
- char buf[MB_LEN_MAX];
- char m;
- store = 1;
- /* NOTE: The following is an AWFUL HACK! In order to support %ls in
- * printf, we need to be able to compute the number of bytes needed
- * for the mbs conversion, not to exceed the precision specified.
- * But if dst is NULL, the return value is the length assuming a
- * sufficiently sized buffer. So, we allow passing of (char *) src
- * as dst in order to flag that we really want the length, subject
- * to the restricted buffer size and no partial conversions.
- * See wcsnrtombs() as well. */
- if (!s || (s == ((char *) src))) {
- if (!s) {
- n = SIZE_MAX;
- }
- s = buf;
- store = 0;
- }
- t = n;
- swc = (const __uwchar_t *) *src;
- assert(swc != NULL);
- while (wn && t) {
- wc = *swc;
- *s = wc;
- len = 1;
- if (wc >= 0x80) {
- #ifdef KUHN
- if (
- #if UTF_8_MAX_LEN == 3
- /* For plane 0, these are the only defined values.*/
- /* Note that we don't need to worry about exceeding */
- /* 31 bits as that is the most that UTF-8 provides. */
- (wc > 0xfffdU)
- #else
- /* UTF_8_MAX_LEN == 6 */
- (wc > 0x7fffffffUL)
- || ( ((__uwchar_t)(wc - 0xfffeU)) < 2)
- #endif
- || ( ((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U) )
- ) {
- __set_errno(EILSEQ);
- return (size_t) -1;
- }
- #else /* KUHN */
- #if UTF_8_MAX_LEN != 3
- if (wc > 0x7fffffffUL) { /* Value too large. */
- __set_errno(EILSEQ);
- return (size_t) -1;
- }
- #endif
- #endif /* KUHN */
- wc >>= 1;
- p = s;
- do {
- ++p;
- } while (wc >>= 5);
- wc = *swc;
- if ((len = p - s) > t) { /* Not enough space. */
- break;
- }
- m = 0x80;
- while( p>s ) {
- m = (m >> 1) | 0x80;
- *--p = (wc & 0x3f) | 0x80;
- wc >>= 6;
- }
- *s |= (m << 1);
- } else if (wc == 0) { /* End of string. */
- swc = NULL;
- break;
- }
- ++swc;
- --wn;
- t -= len;
- if (store) {
- s += len;
- }
- }
- if (store) {
- *src = (const wchar_t *) swc;
- }
- return n - t;
- }
- #endif
- /**********************************************************************/
- #ifdef L_mbsnrtowcs
- /* WARNING: We treat len as SIZE_MAX when dst is NULL! */
- /* libc_hidden_proto(mbsnrtowcs) */
/*
 * mbsnrtowcs: convert at most NMC bytes of a multibyte string into at most
 * len wide characters.
 *
 *   dst - output buffer.  If NULL, or equal to (wchar_t *) ps, a scratch
 *         slot is used instead (length-only mode); with NULL, len is
 *         additionally treated as SIZE_MAX.
 *   src - in/out pointer to the input.  For the single-byte encodings it
 *         is written back only when a real output buffer was given: NULL
 *         after a terminating nul, otherwise the next unconverted byte.
 *   ps  - conversion state; NULL selects a private static state object.
 *
 * Returns the number of wide characters converted, not counting a
 * terminating nul, or (size_t) -1 with errno set to EILSEQ on an invalid
 * byte.  In a UTF-8 locale the work is delegated to _wchar_utf8sntowcs()
 * and its (size_t) -2 "incomplete" result is reported as 0.
 */
size_t mbsnrtowcs(wchar_t *__restrict dst, const char **__restrict src,
				  size_t NMC, size_t len, mbstate_t *__restrict ps)
{
	static mbstate_t mbstate;	/* Static storage: starts out all zero. */
	wchar_t scratch[1];
	const char *in;
	size_t remaining;
	int step;

	if (ps == NULL) {
		ps = &mbstate;
	}

#ifdef __CTYPE_HAS_UTF_8_LOCALES
	if (ENCODING == __ctype_encoding_utf8) {
		size_t r = _wchar_utf8sntowcs(dst, len, src, NMC, ps, 1);

		if (r == (size_t) -2) {
			return 0;
		}
		return r;
	}
#endif

	/* AWFUL HACK, shared with _wchar_utf8sntowcs(): to support %s in
	 * wprintf the caller must be able to ask for the converted length
	 * bounded by a precision.  A NULL dst means "length with unlimited
	 * space"; passing (wchar_t *) ps as dst means "length only, but
	 * honour len". */
	step = 1;
	if ((dst == NULL) || (dst == ((wchar_t *) ps))) {
		if (dst == NULL) {
			len = SIZE_MAX;
		}
		dst = scratch;
		step = 0;
	}

	/* Every remaining encoding is single-byte, so the byte limit also
	 * bounds the number of wide characters. */
	if (NMC < len) {
		len = NMC;
	}

	remaining = len;
	in = *src;

#ifdef __CTYPE_HAS_8_BIT_LOCALES
	if (ENCODING == __ctype_encoding_8_bit) {
		wchar_t wc;

		for ( ; remaining; --remaining) {
			wc = (unsigned char) (*in);
			if (wc >= 0x80) {	/* Non-ASCII: map through the locale tables. */
				wc -= 0x80;
				wc = __UCLIBC_CURLOCALE->tbl8c2wc[
						  (__UCLIBC_CURLOCALE->idx8c2wc[wc >> Cc2wc_IDX_SHIFT]
						   << Cc2wc_IDX_SHIFT) + (wc & (Cc2wc_ROW_LEN - 1))];
				if (!wc) {
					/* No mapping for this byte. */
					__set_errno(EILSEQ);
					return (size_t) -1;
				}
			}
			*dst = wc;
			if (!wc) {	/* Terminating nul: stored but not counted. */
				in = NULL;
				break;
			}
			dst += step;
			++in;
		}
		if (dst != scratch) {
			*src = in;
		}
		return len - remaining;
	}
#endif

#ifdef __UCLIBC_HAS_LOCALE__
	assert(ENCODING == __ctype_encoding_7_bit);
#endif

	for ( ; remaining; --remaining) {
		*dst = (unsigned char) *in;
		if (*dst == 0) {	/* Terminating nul: stored but not counted. */
			in = NULL;
			break;
		}
		if (*dst >= 0x80) {	/* Not valid in a 7-bit encoding. */
			__set_errno(EILSEQ);
			return (size_t) -1;
		}
		++in;
		dst += step;
	}
	if (dst != scratch) {
		*src = in;
	}
	return len - remaining;
}
- libc_hidden_def(mbsnrtowcs)
- #endif
- /**********************************************************************/
- #ifdef L_wcsnrtombs
- /* WARNING: We treat len as SIZE_MAX when dst is NULL! */
- /* Note: We completely ignore ps in all currently supported conversions.
- * TODO: Check for valid state anyway? */
- /* libc_hidden_proto(wcsnrtombs) */
- size_t wcsnrtombs(char *__restrict dst, const wchar_t **__restrict src,
- size_t NWC, size_t len, mbstate_t *__restrict ps)
- {
- const __uwchar_t *s;
- size_t count;
- int incr;
- char buf[MB_LEN_MAX];
- #ifdef __CTYPE_HAS_UTF_8_LOCALES
- if (ENCODING == __ctype_encoding_utf8) {
- return _wchar_wcsntoutf8s(dst, len, src, NWC);
- }
- #endif /* __CTYPE_HAS_UTF_8_LOCALES */
- incr = 1;
- /* NOTE: The following is an AWFUL HACK! In order to support %ls in
- * printf, we need to be able to compute the number of bytes needed
- * for the mbs conversion, not to exceed the precision specified.
- * But if dst is NULL, the return value is the length assuming a
- * sufficiently sized buffer. So, we allow passing of (char *) src
- * as dst in order to flag that we really want the length, subject
- * to the restricted buffer size and no partial conversions.
- * See _wchar_wcsntoutf8s() as well. */
- if (!dst || (dst == ((char *) src))) {
- if (!dst) {
- len = SIZE_MAX;
- }
- dst = buf;
- incr = 0;
- }
- /* Since all the following encodings are single-byte encodings... */
- if (len > NWC) {
- len = NWC;
- }
- count = len;
- s = (const __uwchar_t *) *src;
- #ifdef __CTYPE_HAS_8_BIT_LOCALES
- if (ENCODING == __ctype_encoding_8_bit) {
- __uwchar_t wc;
- __uwchar_t u;
- while (count) {
- if ((wc = *s) <= 0x7f) {
- if (!(*dst = (unsigned char) wc)) {
- s = NULL;
- break;
- }
- } else {
- u = 0;
- if (wc <= Cwc2c_DOMAIN_MAX) {
- u = __UCLIBC_CURLOCALE->idx8wc2c[wc >> (Cwc2c_TI_SHIFT
- + Cwc2c_TT_SHIFT)];
- u = __UCLIBC_CURLOCALE->tbl8wc2c[(u << Cwc2c_TI_SHIFT)
- + ((wc >> Cwc2c_TT_SHIFT)
- & ((1 << Cwc2c_TI_SHIFT)-1))];
- u = __UCLIBC_CURLOCALE->tbl8wc2c[Cwc2c_TI_LEN
- + (u << Cwc2c_TT_SHIFT)
- + (wc & ((1 << Cwc2c_TT_SHIFT)-1))];
- }
- #ifdef __WCHAR_REPLACEMENT_CHAR
- *dst = (unsigned char) ( u ? u : __WCHAR_REPLACEMENT_CHAR );
- #else /* __WCHAR_REPLACEMENT_CHAR */
- if (!u) {
- goto BAD;
- }
- *dst = (unsigned char) u;
- #endif /* __WCHAR_REPLACEMENT_CHAR */
- }
- ++s;
- dst += incr;
- --count;
- }
- if (dst != buf) {
- *src = (const wchar_t *) s;
- }
- return len - count;
- }
- #endif /* __CTYPE_HAS_8_BIT_LOCALES */
- #ifdef __UCLIBC_HAS_LOCALE__
- assert(ENCODING == __ctype_encoding_7_bit);
- #endif
- while (count) {
- if (*s >= 0x80) {
- #if defined(__CTYPE_HAS_8_BIT_LOCALES) && !defined(__WCHAR_REPLACEMENT_CHAR)
- BAD:
- #endif
- __set_errno(EILSEQ);
- return (size_t) -1;
- }
- if ((*dst = (unsigned char) *s) == 0) {
- s = NULL;
- break;
- }
- ++s;
- dst += incr;
- --count;
- }
- if (dst != buf) {
- *src = (const wchar_t *) s;
- }
- return len - count;
- }
- libc_hidden_def(wcsnrtombs)
- #endif
- /**********************************************************************/
- #ifdef L_wcswidth
- /* libc_hidden_proto(wcswidth) */
- #ifdef __UCLIBC_MJN3_ONLY__
- #warning REMINDER: If we start doing translit, wcwidth and wcswidth will need updating.
- #warning TODO: Update wcwidth to match latest by Kuhn.
- #endif
- #if defined(__UCLIBC_HAS_LOCALE__) && \
- ( defined(__CTYPE_HAS_8_BIT_LOCALES) || defined(__CTYPE_HAS_UTF_8_LOCALES) )
/* Index table for the character-width lookup data: 257 non-decreasing
 * entries running from 0 to 255, where 255 is the number of entries in
 * new_tbl[] and new_wtbl[].
 * NOTE(review): presumably new_idx[i] .. new_idx[i+1] delimits the slice of
 * new_tbl[]/new_wtbl[] belonging to wide characters whose high byte is i;
 * the code that reads these tables is not visible here -- TODO confirm. */
- static const unsigned char new_idx[] = {
- 0, 5, 5, 6, 10, 15, 28, 39,
- 48, 48, 71, 94, 113, 128, 139, 154,
- 175, 186, 188, 188, 188, 188, 188, 188,
- 203, 208, 208, 208, 208, 208, 208, 208,
- 208, 219, 219, 219, 222, 222, 222, 222,
- 222, 222, 222, 222, 222, 222, 222, 224,
- 224, 231, 231, 231, 231, 231, 231, 231,
- 231, 231, 231, 231, 231, 231, 231, 231,
- 231, 231, 231, 231, 231, 231, 231, 231,
- 231, 231, 231, 231, 231, 231, 231, 231,
- 231, 231, 231, 231, 231, 231, 231, 231,
- 231, 231, 231, 231, 231, 231, 231, 231,
- 231, 231, 231, 231, 231, 231, 231, 231,
- 231, 231, 231, 231, 231, 231, 231, 231,
- 231, 231, 231, 231, 231, 231, 231, 231,
- 231, 231, 231, 231, 231, 231, 231, 231,
- 231, 231, 231, 231, 231, 231, 231, 231,
- 231, 231, 231, 231, 231, 231, 231, 231,
- 231, 231, 231, 231, 231, 231, 231, 231,
- 231, 231, 231, 231, 231, 231, 231, 231,
- 231, 231, 231, 231, 231, 233, 233, 233,
- 233, 233, 233, 233, 234, 234, 234, 234,
- 234, 234, 234, 234, 234, 234, 234, 234,
- 234, 234, 234, 234, 234, 234, 234, 234,
- 234, 234, 234, 234, 234, 234, 234, 234,
- 234, 234, 234, 234, 234, 234, 234, 234,
- 234, 234, 234, 234, 234, 234, 234, 234,
- 236, 236, 236, 236, 236, 236, 236, 236,
- 236, 236, 236, 236, 236, 236, 236, 236,
- 236, 236, 236, 236, 236, 236, 236, 236,
- 236, 236, 236, 236, 236, 236, 236, 236,
- 236, 237, 237, 238, 241, 241, 242, 249,
- 255,
- };
/* 255 byte-sized entries, the same count as new_wtbl[] and the final value
 * stored in new_idx[].
 * NOTE(review): presumably each entry is a low-byte breakpoint at which the
 * display width changes within the slice selected through new_idx[]; the
 * code that reads this table is not visible here -- TODO confirm. */
- static const unsigned char new_tbl[] = {
- 0x00, 0x01, 0x20, 0x7f, 0xa0, 0x00, 0x00, 0x50,
- 0x60, 0x70, 0x00, 0x83, 0x87, 0x88, 0x8a, 0x00,
- 0x91, 0xa2, 0xa3, 0xba, 0xbb, 0xbe, 0xbf, 0xc0,
- 0xc1, 0xc3, 0xc4, 0xc5, 0x00, 0x4b, 0x56, 0x70,
- 0x71, 0xd6, 0xe5, 0xe7, 0xe9, 0xea, 0xee, 0x00,
- 0x0f, 0x10, 0x11, 0x12, 0x30, 0x4b, 0xa6, 0xb1,
- 0x00, 0x01, 0x03, 0x3c, 0x3d, 0x41, 0x49, 0x4d,
- 0x4e, 0x51, 0x55, 0x62, 0x64, 0x81, 0x82, 0xbc,
- 0xbd, 0xc1, 0xc5, 0xcd, 0xce, 0xe2, 0xe4, 0x00,
- 0x02, 0x03, 0x3c, 0x3d, 0x41, 0x43, 0x47, 0x49,
- 0x4b, 0x4e, 0x70, 0x72, 0x81, 0x83, 0xbc, 0xbd,
- 0xc1, 0xc6, 0xc7, 0xc9, 0xcd, 0xce, 0x00, 0x01,
- 0x02, 0x3c, 0x3d, 0x3f, 0x40, 0x41, 0x44, 0x4d,
- 0x4e, 0x56, 0x57, 0x82, 0x83, 0xc0, 0xc1, 0xcd,
- 0xce, 0x00, 0x3e, 0x41, 0x46, 0x49, 0x4a, 0x4e,
- 0x55, 0x57, 0xbf, 0xc0, 0xc6, 0xc7, 0xcc, 0xce,
- 0x00, 0x41, 0x44, 0x4d, 0x4e, 0xca, 0xcb, 0xd2,
- 0xd5, 0xd6, 0xd7, 0x00, 0x31, 0x32, 0x34, 0x3b,
- 0x47, 0x4f, 0xb1, 0xb2, 0xb4, 0xba, 0xbb, 0xbd,
- 0xc8, 0xce, 0x00, 0x18, 0x1a, 0x35, 0x36, 0x37,
- 0x38, 0x39, 0x3a, 0x71, 0x7f, 0x80, 0x85, 0x86,
- 0x88, 0x90, 0x98, 0x99, 0xbd, 0xc6, 0xc7, 0x00,
- 0x2d, 0x31, 0x32, 0x33, 0x36, 0x38, 0x39, 0x3a,
- 0x58, 0x5a, 0x00, 0x60, 0x00, 0x12, 0x15, 0x32,
- 0x35, 0x52, 0x54, 0x72, 0x74, 0xb7, 0xbe, 0xc6,
- 0xc7, 0xc9, 0xd4, 0x00, 0x0b, 0x0f, 0xa9, 0xaa,
- 0x00, 0x0b, 0x10, 0x2a, 0x2f, 0x60, 0x64, 0x6a,
- 0x70, 0xd0, 0xeb, 0x00, 0x29, 0x2b, 0x00, 0x80,
- 0x00, 0x2a, 0x30, 0x3f, 0x40, 0x99, 0x9b, 0x00,
- 0xd0, 0x00, 0x00, 0xa4, 0x00, 0x00, 0x00, 0x1e,
- 0x1f, 0x00, 0x00, 0x10, 0x20, 0x24, 0x30, 0x70,
- 0xff, 0x00, 0x61, 0xe0, 0xe7, 0xf9, 0xfc,
- };
- static const signed char new_wtbl[] = { /* Width values parallel to new_tbl
-  * (same number of entries).  wcswidth() adds new_wtbl[l] to its running
-  * column count, where l is the index its binary search over new_tbl ended
-  * on.  The -1 entries sit at indexes 1 and 3, i.e. in the slice for high
-  * byte 0; wcswidth() handles wc <= 0xff before consulting the tables. */
- 0, -1, 1, -1, 1, 1, 0, 1,
- 0, 1, 1, 0, 1, 0, 1, 1,
- 0, 1, 0, 1, 0, 1, 0, 1,
- 0, 1, 0, 1, 1, 0, 1, 0,
- 1, 0, 1, 0, 1, 0, 1, 1,
- 0, 1, 0, 1, 0, 1, 0, 1,
- 1, 0, 1, 0, 1, 0, 1, 0,
- 1, 0, 1, 0, 1, 0, 1, 0,
- 1, 0, 1, 0, 1, 0, 1, 1,
- 0, 1, 0, 1, 0, 1, 0, 1,
- 0, 1, 0, 1, 0, 1, 0, 1,
- 0, 1, 0, 1, 0, 1, 1, 0,
- 1, 0, 1, 0, 1, 0, 1, 0,
- 1, 0, 1, 0, 1, 0, 1, 0,
- 1, 1, 0, 1, 0, 1, 0, 1,
- 0, 1, 0, 1, 0, 1, 0, 1,
- 1, 0, 1, 0, 1, 0, 1, 0,
- 1, 0, 1, 1, 0, 1, 0, 1,
- 0, 1, 0, 1, 0, 1, 0, 1,
- 0, 1, 1, 0, 1, 0, 1, 0,
- 1, 0, 1, 0, 1, 0, 1, 0,
- 1, 0, 1, 0, 1, 0, 1, 1,
- 0, 1, 0, 1, 0, 1, 0, 1,
- 0, 1, 2, 0, 1, 0, 1, 0,
- 1, 0, 1, 0, 1, 0, 1, 0,
- 1, 0, 1, 1, 0, 1, 0, 1,
- 1, 0, 1, 0, 1, 0, 1, 0,
- 1, 0, 1, 1, 2, 1, 1, 2,
- 2, 0, 2, 1, 2, 0, 2, 2,
- 1, 1, 2, 1, 1, 2, 1, 0,
- 1, 1, 0, 1, 0, 1, 2, 1,
- 0, 2, 1, 2, 1, 0, 1,
- };
- /* libc_hidden_proto(wcsnrtombs) */
- int wcswidth(const wchar_t *pwcs, size_t n)
- {
- int h, l, m, count;
- wchar_t wc;
- unsigned char b;
- if (ENCODING == __ctype_encoding_7_bit) {
- size_t i;
- for (i = 0 ; (i < n) && pwcs[i] ; i++) {
- if (pwcs[i] != (pwcs[i] & 0x7f)) {
- return -1;
- }
- }
- }
- #ifdef __CTYPE_HAS_8_BIT_LOCALES
- else if (ENCODING == __ctype_encoding_8_bit) {
- mbstate_t mbstate;
- mbstate.__mask = 0; /* Initialize the mbstate. */
- if (wcsnrtombs(NULL, &pwcs, n, SIZE_MAX, &mbstate) == ((size_t) - 1)) {
- return -1;
- }
- }
- #endif /* __CTYPE_HAS_8_BIT_LOCALES */
- #if defined(__CTYPE_HAS_UTF_8_LOCALES) && defined(KUHN)
- /* For stricter handling of allowed unicode values... see comments above. */
- else if (ENCODING == __ctype_encoding_utf8) {
- size_t i;
- for (i = 0 ; (i < n) && pwcs[i] ; i++) {
- if ( (((__uwchar_t)((pwcs[i]) - 0xfffeU)) < 2)
- || (((__uwchar_t)((pwcs[i]) - 0xd800U)) < (0xe000U - 0xd800U))
- ) {
- return -1;
- }
- }
- }
- #endif /* __CTYPE_HAS_UTF_8_LOCALES */
- for (count = 0 ; n && (wc = *pwcs++) ; n--) {
- if (wc <= 0xff) {
- /* If we're here, wc != 0. */
- if ((wc < 32) || ((wc >= 0x7f) && (wc < 0xa0))) {
- return -1;
- }
- ++count;
- continue;
- }
- if (((unsigned int) wc) <= 0xffff) {
- b = wc & 0xff;
- h = (wc >> 8);
- l = new_idx[h];
- h = new_idx[h+1];
- while ((m = (l+h) >> 1) != l) {
- if (b >= new_tbl[m]) {
- l = m;
- } else { /* wc < tbl[m] */
- h = m;
- }
- }
- count += new_wtbl[l]; /* none should be -1. */
- continue;
- }
- /* Redo this to minimize average number of compares?*/
- if (wc >= 0x1d167) {
- if (wc <= 0x1d1ad) {
- if ((wc <= 0x1d169
- || (wc >= 0x1d173
- && (wc <= 0x1d182
- || (wc >= 0x1d185
- && (wc <= 0x1d18b
- || (wc >= 0x1d1aa))))))
- ) {
- continue;
- }
- } else if (((wc >= 0xe0020) && (wc <= 0xe007f)) || (wc == 0xe0001)) {
- continue;
- } else if ((wc >= 0x20000) && (wc <= 0x2ffff)) {
- ++count; /* need 2.. add one here */
- }
- #if (WCHAR_MAX > 0x7fffffffL)
- else if (wc > 0x7fffffffL) {
- return -1;
- }
- #endif /* (WCHAR_MAX > 0x7fffffffL) */
- }
- ++count;
- }
- return count;
- }
- #else /* __UCLIBC_HAS_LOCALE__ */
/*
 * Column width of a wide-character string (build without locale support).
 *
 * Examines at most n characters of pwcs, stopping early at a L'\0'.
 * Only 0x20..0x7e are accepted, each counting one column; any other
 * character in the examined span makes the function return -1.
 */
int wcswidth(const wchar_t *pwcs, size_t n)
{
	int columns = 0;
	size_t pos;

	for (pos = 0; (pos < n) && (pwcs[pos] != 0); pos++) {
		const wchar_t ch = pwcs[pos];

		/* Rejects non-ASCII as well as the ASCII controls and DEL. */
		if ((ch < 0x20) || (ch > 0x7e)) {
			return -1;
		}
		++columns;
	}

	return columns;
}
- #endif /* __UCLIBC_HAS_LOCALE__ */
- libc_hidden_def(wcswidth)
- #endif
- /**********************************************************************/
- #ifdef L_wcwidth
- /* libc_hidden_proto(wcswidth) */
/*
 * Column width of a single wide character: the result of wcswidth() applied
 * to a one-character span holding wc.
 */
int wcwidth(wchar_t wc)
{
	const wchar_t *single = &wc;

	return wcswidth(single, 1);
}
- #endif
- /**********************************************************************/
- typedef struct { /* Conversion descriptor state: iconv_open() allocates one and returns it cast to iconv_t. */
- mbstate_t tostate; /* output-side state; its __mask is zeroed on open and on reset */
- mbstate_t fromstate; /* input-side state; handed to the UTF-8 decoder by iconv() */
- int tocodeset; /* target codeset id as returned by find_codeset() */
- int fromcodeset; /* source codeset id; iconv() may flip its bit 0 after reading a byte-swapped BOM */
- int frombom; /* nonzero until iconv() has examined the first input character for a BOM */
- int tobom; /* nonzero until iconv() has emitted the output BOM */
- int fromcodeset0; /* initial fromcodeset, restored when iconv() is called with a NULL input buffer */
- int frombom0; /* initial frombom, restored on the same reset */
- int tobom0; /* initial tobom, restored on the same reset */
- int skip_invalid_input; /* To support iconv -c option: 0 = off, 1 = skip illegal input, 2 = set by iconv() once something was skipped. */
- } _UC_iconv_t;
- #ifdef L_iconv
- #include <iconv.h>
- #include <string.h>
- #include <endian.h>
- #include <byteswap.h>
- #if (__BYTE_ORDER != __BIG_ENDIAN) && (__BYTE_ORDER != __LITTLE_ENDIAN)
- #error unsupported endianness for iconv
- #endif
- #ifndef __CTYPE_HAS_8_BIT_LOCALES
- #error currently iconv requires 8 bit locales
- #endif
- #ifndef __CTYPE_HAS_UTF_8_LOCALES
- #error currently iconv requires UTF-8 locales
- #endif
- enum { /* Codeset ids tested by iconv().  The endian-dependent values equal the id bytes of the corresponding "...BE" entries in __iconv_codesets. */
- IC_WCHAR_T = 0xe0, /* iconv() uses 4 bytes per character for this id */
- IC_MULTIBYTE = 0xe0, /* ids >= this value take iconv()'s fixed-width (2- or 4-byte) path */
- #if __BYTE_ORDER == __BIG_ENDIAN
- IC_UCS_4 = 0xec,
- IC_UTF_32 = 0xe4,
- IC_UCS_2 = 0xe2,
- IC_UTF_16 = 0xea,
- #else
- IC_UCS_4 = 0xed,
- IC_UTF_32 = 0xe5,
- IC_UCS_2 = 0xe3,
- IC_UTF_16 = 0xeb,
- #endif
- IC_UTF_8 = 2, /* same id as the "UTF-8" entry of __iconv_codesets */
- IC_ASCII = 1 /* same id as the "US-ASCII"/"ASCII" entries; find_codeset() numbers the locale codesets from 3 */
- };
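- /* For the multibyte (fixed-width) codeset ids, i.e. those >= IC_MULTIBYTE:
- * bit 0 means swap endian
- * bit 1 means 2 byte
- * bit 2 means 4 byte
- * bit 4 (0x10) requests BOM handling (see iconv_open)
- */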
- /* Used externally only by iconv utility */
- extern const unsigned char __iconv_codesets[]; /* Table of built-in codeset names.
-  * Each record is: one length byte (the size of the whole record, used by
-  * readers as the stride to the next record), one codeset-id byte, then the
-  * NUL-terminated name.  A zero length byte ends the table; for the final
-  * record that byte is the string literal's own terminating NUL.
-  * NOTE(review): in the little-endian branch the bare "UTF-32"/"UTF-16"
-  * ids (0xf4, 0xfa) carry the 0x10 bit that iconv_open() turns into the BOM
-  * flags, while the big-endian branch uses 0xe4/0xea without it -- confirm
-  * whether that difference is intended. */
- libc_hidden_proto(__iconv_codesets)
- const unsigned char __iconv_codesets[] =
- "\x0a\xe0""WCHAR_T\x00" /* superset of UCS-4 but platform-endian */
- #if __BYTE_ORDER == __BIG_ENDIAN
- "\x08\xec""UCS-4\x00" /* always BE */
- "\x0a\xec""UCS-4BE\x00"
- "\x0a\xed""UCS-4LE\x00"
- "\x09\xe4""UTF-32\x00" /* platform endian with BOM */
- "\x0b\xe4""UTF-32BE\x00"
- "\x0b\xe5""UTF-32LE\x00"
- "\x08\xe2""UCS-2\x00" /* always BE */
- "\x0a\xe2""UCS-2BE\x00"
- "\x0a\xe3""UCS-2LE\x00"
- "\x09\xea""UTF-16\x00" /* platform endian with BOM */
- "\x0b\xea""UTF-16BE\x00"
- "\x0b\xeb""UTF-16LE\x00"
- #elif __BYTE_ORDER == __LITTLE_ENDIAN
- "\x08\xed""UCS-4\x00" /* always BE */
- "\x0a\xed""UCS-4BE\x00"
- "\x0a\xec""UCS-4LE\x00"
- "\x09\xf4""UTF-32\x00" /* platform endian with BOM */
- "\x0b\xe5""UTF-32BE\x00"
- "\x0b\xe4""UTF-32LE\x00"
- "\x08\xe3""UCS-2\x00" /* always BE */
- "\x0a\xe3""UCS-2BE\x00"
- "\x0a\xe2""UCS-2LE\x00"
- "\x09\xfa""UTF-16\x00" /* platform endian with BOM */
- "\x0b\xeb""UTF-16BE\x00"
- "\x0b\xea""UTF-16LE\x00"
- #endif
- "\x08\x02""UTF-8\x00"
- "\x0b\x01""US-ASCII\x00"
- "\x07\x01""ASCII"; /* Must be last! (special case to save a nul) */
- libc_hidden_data_def(__iconv_codesets)
- /* Experimentally off - libc_hidden_proto(strcasecmp) */
- static int find_codeset(const char *name)
- {
- const unsigned char *s;
- int codeset;
- for (s = __iconv_codesets; *s; s += *s) {
- if (!strcasecmp((char*) (s + 2), name)) {
- return s[1];
- }
- }
- /* The following is ripped from find_locale in locale.c. */
- /* TODO: maybe CODESET_LIST + *s ??? */
- /* 7bit is 1, UTF-8 is 2, 8-bit is >= 3 */
- codeset = 2;
- s = (const unsigned char *) __LOCALE_DATA_CODESET_LIST;
- do {
- ++codeset; /* Increment codeset first. */
- if (!strcasecmp(__LOCALE_DATA_CODESET_LIST+*s, name)) {
- return codeset;
- }
- } while (*++s);
- return 0; /* No matching codeset! */
- }
- iconv_t weak_function iconv_open(const char *tocode, const char *fromcode)
- {
- register _UC_iconv_t *px;
- int tocodeset, fromcodeset;
- if (((tocodeset = find_codeset(tocode)) != 0)
- && ((fromcodeset = find_codeset(fromcode)) != 0)) {
- if ((px = malloc(sizeof(_UC_iconv_t))) != NULL) {
- px->tocodeset = tocodeset;
- px->tobom0 = px->tobom = (tocodeset & 0x10) >> 4;
- px->fromcodeset0 = px->fromcodeset = fromcodeset;
- px->frombom0 = px->frombom = (fromcodeset & 0x10) >> 4;
- px->skip_invalid_input = px->tostate.__mask
- = px->fromstate.__mask = 0;
- return (iconv_t) px;
- }
- } else {
- __set_errno(EINVAL);
- }
- return (iconv_t)(-1);
- }
- int weak_function iconv_close(iconv_t cd)
- {
- free(cd);
- return 0;
- }
- size_t weak_function iconv(iconv_t cd, char **__restrict inbuf,
- size_t *__restrict inbytesleft,
- char **__restrict outbuf,
- size_t *__restrict outbytesleft)
- {
- _UC_iconv_t *px = (_UC_iconv_t *) cd;
- size_t nrcount, r;
- wchar_t wc, wc2;
- int inci, inco;
- assert(px != (_UC_iconv_t *)(-1));
- assert(sizeof(wchar_t) == 4);
- if (!inbuf || !*inbuf) { /* Need to reinitialze conversion state. */
- /* Note: For shift-state encodings we possibly need to output the
- * shift sequence to return to initial state! */
- if ((px->fromcodeset & 0xf0) == 0xe0) {
- }
- px->tostate.__mask = px->fromstate.__mask = 0;
- px->fromcodeset = px->fromcodeset0;
- px->tobom = px->tobom0;
- px->frombom = px->frombom0;
- return 0;
- }
- nrcount = 0;
- while (*inbytesleft) {
- if (!*outbytesleft) {
- TOO_BIG:
- __set_errno(E2BIG);
- return (size_t) -1;
- }
- inci = inco = 1;
- if (px->fromcodeset >= IC_MULTIBYTE) {
- inci = (px->fromcodeset == IC_WCHAR_T) ? 4: (px->fromcodeset & 6);
- if (*inbytesleft < inci) goto INVALID;
- wc = (((unsigned int)((unsigned char)((*inbuf)[0]))) << 8)
- + ((unsigned char)((*inbuf)[1]));
- if (inci == 4) {
- wc = (((unsigned int)((unsigned char)((*inbuf)[2]))) << 8)
- + ((unsigned char)((*inbuf)[3])) + (wc << 16);
- if (!(px->fromcodeset & 1)) wc = bswap_32(wc);
- } else {
- if (!(px->fromcodeset & 1)) wc = bswap_16(wc);
- if (((px->fromcodeset & IC_UTF_16) == IC_UTF_16)
- && (((__uwchar_t)(wc - 0xd800U)) < (0xdc00U - 0xd800U))
- ) { /* surrogate */
- wc =- 0xd800U;
- if (*inbytesleft < 4) goto INVALID;
- wc2 = (((unsigned int)((unsigned char)((*inbuf)[2]))) << 8)
- + ((unsigned char)((*inbuf)[3]));
- if (!(px->fromcodeset & 1)) wc = bswap_16(wc2);
- if (((__uwchar_t)(wc2 -= 0xdc00U)) < (0xe0000U - 0xdc00U)) {
- goto ILLEGAL;
- }
- inci = 4; /* Change inci here in case skipping illegals. */
- wc = 0x10000UL + (wc << 10) + wc2;
- }
- }
- if (px->frombom) {
- px->frombom = 0;
- if ((wc == 0xfeffU)
- || (wc == ((inci == 4)
- ? (((wchar_t) 0xfffe0000UL))
- : ((wchar_t)(0xfffeUL))))
- ) {
- if (wc != 0xfeffU) {
- px->fromcodeset ^= 1; /* toggle endianness */
- wc = 0xfeffU;
- }
- if (!px->frombom) {
- goto BOM_SKIP_OUTPUT;
- }
- goto GOT_BOM;
- }
- }
- if (px->fromcodeset != IC_WCHAR_T) {
- if (((__uwchar_t) wc) > (((px->fromcodeset & IC_UCS_4) == IC_UCS_4)
- ? 0x7fffffffUL : 0x10ffffUL)
- #ifdef KUHN
- || (((__uwchar_t)(wc - 0xfffeU)) < 2)
- || (((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U))
- #endif
- ) {
- goto ILLEGAL;
- }
- }
- } else if (px->fromcodeset == IC_UTF_8) {
- const char *p = *inbuf;
- r = _wchar_utf8sntowcs(&wc, 1, &p, *inbytesleft, &px->fromstate, 0);
- if (((ssize_t) r) <= 0) { /* either EILSEQ or incomplete or nul */
- if (((ssize_t) r) < 0) { /* either EILSEQ or incomplete or nul */
- assert((r == (size_t)(-1)) || (r == (size_t)(-2)));
- if (r == (size_t)(-2)) {
- INVALID:
- __set_errno(EINVAL);
- } else {
- px->fromstate.__mask = 0;
- inci = 1;
- ILLEGAL:
- if (px->skip_invalid_input) {
- px->skip_invalid_input = 2; /* flag for iconv utility */
- goto BOM_SKIP_OUTPUT;
- }
- __set_errno(EILSEQ);
- }
- return (size_t)(-1);
- }
- #ifdef __UCLIBC_MJN3_ONLY__
- #warning TODO: optimize this.
- #endif
- if (p != NULL) { /* incomplete char case */
- goto INVALID;
- }
- p = *inbuf + 1; /* nul */
- }
- inci = p - *inbuf;
- } else if ((wc = ((unsigned char)(**inbuf))) >= 0x80) { /* Non-ASCII... */
- if (px->fromcodeset == IC_ASCII) { /* US-ASCII codeset */
- goto ILLEGAL;
- } else { /* some other 8-bit ascii-extension codeset */
- const __codeset_8_bit_t *c8b
- = __locale_mmap->codeset_8_bit + px->fromcodeset - 3;
- wc -= 0x80;
- wc = __UCLIBC_CURLOCALE->tbl8c2wc[
- (c8b->idx8c2wc[wc >> Cc2wc_IDX_SHIFT]
- << Cc2wc_IDX_SHIFT) + (wc & (Cc2wc_ROW_LEN - 1))];
- if (!wc) {
- goto ILLEGAL;
- }
- }
- }
- if (px->tobom) {
- inci = 0;
- wc = 0xfeffU;
- GOT_BOM:
- px->tobom = 0;
- }
- if (px->tocodeset >= IC_MULTIBYTE) {
- inco = (px->tocodeset == IC_WCHAR_T) ? 4: (px->tocodeset & 6);
- if (*outbytesleft < inco) goto TOO_BIG;
- if (px->tocodeset != IC_WCHAR_T) {
- if (((__uwchar_t) wc) > (((px->tocodeset & IC_UCS_4) == IC_UCS_4)
- ? 0x7fffffffUL : 0x10ffffUL)
- #ifdef KUHN
- || (((__uwchar_t)(wc - 0xfffeU)) < 2)
- || (((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U))
- #endif
- ) {
- REPLACE_32:
- wc = 0xfffd;
- ++nrcount;
- }
- }
- if (inco == 4) {
- if (px->tocodeset & 1) wc = bswap_32(wc);
- } else {
- if (((__uwchar_t)wc ) > 0xffffU) {
- if ((px->tocodeset & IC_UTF_16) != IC_UTF_16) {
- goto REPLACE_32;
- }
- if (*outbytesleft < (inco = 4)) goto TOO_BIG;
- wc2 = 0xdc00U + (wc & 0x3ff);
- wc = 0xd800U + ((wc >> 10) & 0x3ff);
- if (px->tocodeset & 1) {
- wc = bswap_16(wc);
- wc2 = bswap_16(wc2);
- }
- wc += (wc2 << 16);
- } else if (px->tocodeset & 1) wc = bswap_16(wc);
- }
- (*outbuf)[0] = (char)((unsigned char)(wc));
- (*outbuf)[1] = (char)((unsigned char)(wc >> 8));
- if (inco == 4) {
- (*outbuf)[2] = (char)((unsigned char)(wc >> 16));
- (*outbuf)[3] = (char)((unsigned char)(wc >> 24));
- }
- } else if (px->tocodeset == IC_UTF_8) {
- const wchar_t *pw = &wc;
- do {
- r = _wchar_wcsntoutf8s(*outbuf, *outbytesleft, &pw, 1);
- if (r != (size_t)(-1)) {
- #ifdef __UCLIBC_MJN3_ONLY__
- #warning TODO: What happens for a nul?
- #endif
- if (r == 0) {
- if (wc != 0) {
- goto TOO_BIG;
- }
- ++r;
- }
- break;
- }
- wc = 0xfffdU;
- ++nrcount;
- } while (1);
- inco = r;
- } else if (((__uwchar_t)(wc)) < 0x80) {
- CHAR_GOOD:
- **outbuf = wc;
- } else {
- if ((px->tocodeset != 0x01) && (wc <= Cwc2c_DOMAIN_MAX)) {
- const __codeset_8_bit_t *c8b
- = __locale_mmap->codeset_8_bit + px->tocodeset - 3;
- __uwchar_t u;
- u = c8b->idx8wc2c[wc >> (Cwc2c_TI_SHIFT + Cwc2c_TT_SHIFT)];
- u = __UCLIBC_CURLOCALE->tbl8wc2c[(u << Cwc2c_TI_SHIFT)
- + ((wc >> Cwc2c_TT_SHIFT)
- & ((1 << Cwc2c_TI_SHIFT)-1))];
- wc = __UCLIBC_CURLOCALE->tbl8wc2c[Cwc2c_TI_LEN
- + (u << Cwc2c_TT_SHIFT)
- + (wc & ((1 << Cwc2c_TT_SHIFT)-1))];
- if (wc) {
- goto CHAR_GOOD;
- }
- }
- **outbuf = '?';
- ++nrcount;
- }
- *outbuf += inco;
- *outbytesleft -= inco;
- BOM_SKIP_OUTPUT:
- *inbuf += inci;
- *inbytesleft -= inci;
- }
- return nrcount;
- }
- #endif
- /**********************************************************************/
- #ifdef L_iconv_main
- #include <string.h>
- #include <iconv.h>
- #include <stdarg.h>
- #include <libgen.h>
- extern const unsigned char __iconv_codesets[];
- #define IBUF BUFSIZ
- #define OBUF BUFSIZ
static char *progname;		/* argv[0]; prefix of every error message */
static int hide_errors;		/* nonzero: error_msg() exits without printing */

static void error_msg(const char *fmt, ...)
	 __attribute__ ((noreturn, format (printf, 1, 2)));

/*
 * Report a fatal error and terminate with EXIT_FAILURE.
 *
 * Unless hide_errors is set, "<progname>: " followed by the printf-style
 * message is written to stderr first.  Never returns.
 */
static void error_msg(const char *fmt, ...)
{
	va_list ap;

	if (hide_errors) {
		exit(EXIT_FAILURE);
	}

	fprintf(stderr, "%s: ", progname);
	va_start(ap, fmt);
	vfprintf(stderr, fmt, ap);
	va_end(ap);

	exit(EXIT_FAILURE);
}
- int main(int argc, char **argv)
- {
- FILE *ifile;
- FILE *ofile = stdout;
- const char *p;
- const char *s;
- static const char opt_chars[] = "tfocsl";
- /* 012345 */
- const char *opts[sizeof(opt_chars)]; /* last is infile name */
- iconv_t ic;
- char ibuf[IBUF];
- char obuf[OBUF];
- char *pi;
- char *po;
- size_t ni, no, r, pos;
- hide_errors = 0;
- for (s = opt_chars ; *s ; s++) {
- opts[ s - opt_chars ] = NULL;
- }
- progname = *argv;
- while (--argc) {
- p = *++argv;
- if ((*p != '-') || (*++p == 0)) {
- break;
- }
- do {
- if ((s = strchr(opt_chars,*p)) == NULL) {
- USAGE:
- s = basename(progname);
- fprintf(stderr,
- "%s [-cs] -f fromcode -t tocode [-o outputfile] [inputfile ...]\n"
- " or\n%s -l\n", s, s);
- return EXIT_FAILURE;
- }
- if ((s - opt_chars) < 3) {
- if ((--argc == 0) || opts[s - opt_chars]) {
- goto USAGE;
- }
- opts[s - opt_chars] = *++argv;
- } else {
- opts[s - opt_chars] = p;
- }
- } while (*++p);
- }
- if (opts[5]) { /* -l */
- fprintf(stderr, "Recognized codesets:\n");
- for (s = (char *)__iconv_codesets ; *s ; s += *s) {
- fprintf(stderr," %s\n", s+2);
- }
- s = __LOCALE_DATA_CODESET_LIST;
- do {
- fprintf(stderr," %s\n", __LOCALE_DATA_CODESET_LIST+ (unsigned char)(*s));
- } while (*++s);
- return EXIT_SUCCESS;
- }
- if (opts[4]) {
- hide_errors = 1;
- }
- if (!opts[0] || !opts[1]) {
- goto USAGE;
- }
- if ((ic = iconv_open(opts[0],opts[1])) == ((iconv_t)(-1))) {
- error_msg( "unsupported codeset in %s -> %s conversion\n", opts[0], opts[1]);
- }
- if (opts[3]) { /* -c */
- ((_UC_iconv_t *) ic)->skip_invalid_input = 1;
- }
- if ((s = opts[2]) != NULL) {
- if (!(ofile = fopen(s, "w"))) {
- error_msg( "couldn't open %s for writing\n", s);
- }
- }
- pos = ni = 0;
- do {
- if (!argc || ((**argv == '-') && !((*argv)[1]))) {
- ifile = stdin; /* we don't check for duplicates */
- } else if (!(ifile = fopen(*argv, "r"))) {
- error_msg( "couldn't open %s for reading\n", *argv);
- }
- while ((r = fread(ibuf + ni, 1, IBUF - ni, ifile)) > 0) {
- pos += r;
- ni += r;
- no = OBUF;
- pi = ibuf;
- po = obuf;
- if ((r = iconv(ic, &pi, &ni, &po, &no)) == ((size_t)(-1))) {
- if ((errno != EINVAL) && (errno != E2BIG)) {
- error_msg( "iconv failed at pos %lu : %m\n", (unsigned long) (pos - ni));
- }
- }
- if ((r = OBUF - no) > 0) {
- if (fwrite(obuf, 1, OBUF - no, ofile) < r) {
- error_msg( "write error\n");
- }
- }
- if (ni) { /* still bytes in buffer! */
- memmove(ibuf, pi, ni);
- }
- }
- if (ferror(ifile)) {
- error_msg( "read error\n");
- }
- ++argv;
- if (ifile != stdin) {
- fclose(ifile);
- }
- } while (--argc > 0);
- iconv_close(ic);
- if (ni) {
- error_msg( "incomplete sequence\n");
- }
- return (((_UC_iconv_t *) ic)->skip_invalid_input < 2)
- ? EXIT_SUCCESS : EXIT_FAILURE;
- }
- #endif
- /**********************************************************************/
|