12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723 |
- /* Copyright (C) 2002 Manuel Novoa III
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Library General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Library General Public License for more details.
- *
- * You should have received a copy of the GNU Library General Public
- * License along with this library; if not, write to the Free
- * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
- /* ATTENTION! ATTENTION! ATTENTION! ATTENTION! ATTENTION!
- *
- * Besides uClibc, I'm using this code in my libc for elks, which is
- * a 16-bit environment with a fairly limited compiler. It would make
- * things much easier for me if this file isn't modified unnecessarily.
- * In particular, please put any new or replacement functions somewhere
- * else, and modify the makefile to use your version instead.
- * Thanks. Manuel
- *
- * ATTENTION! ATTENTION! ATTENTION! ATTENTION! ATTENTION! */
- /* May 23, 2002 Initial Notes:
- *
- * I'm still tweaking this stuff, but it passes the tests I've thrown
- * at it, and Erik needs it for the gcc port. The glibc extension
- * __wcsnrtombs() hasn't been tested, as I didn't find a test for it
- * in the glibc source. I also need to fix the behavior of
- * _wchar_utf8sntowcs() if the max number of wchars to convert is 0.
- *
- * UTF-8 -> wchar -> UTF-8 conversion tests on Markus Kuhn's UTF-8-demo.txt
- * file on my platform (x86) show about 5-10% faster conversion speed than
- * glibc with mbsrtowcs()/wcsrtombs() and almost twice as fast as glibc with
- * individual mbrtowc()/wcrtomb() calls.
- *
- * If 'DECODER' is defined, then _wchar_utf8sntowcs() will be compiled
- * as a fail-safe UTF-8 decoder appropriate for a terminal, etc. which
- * needs to deal gracefully with whatever is sent to it. In that mode,
- * it passes Markus Kuhn's UTF-8-test.txt stress test. I plan to add
- * an arg to force that behavior, so the interface will be changing.
- *
- * I need to fix the error checking for 16-bit wide chars. This isn't
- * an issue for uClibc, but may be for ELKS. I'm currently not sure
- * if I'll use 16-bit, 32-bit, or configurable wchars in ELKS.
- *
- * Manuel
- */
- #define _GNU_SOURCE
- #define _ISOC99_SOURCE
- #include <errno.h>
- #include <stddef.h>
- #include <limits.h>
- #include <stdint.h>
- #include <inttypes.h>
- #include <stdlib.h>
- #include <stdio.h>
- #include <assert.h>
- #include <locale.h>
- #include <wchar.h>
- #define ENCODING (__global_locale.encoding)
- #if WCHAR_MAX > 0xffffU
- #define UTF_8_MAX_LEN 6
- #else
- #define UTF_8_MAX_LEN 3
- #endif
- /* #define KUHN */
- #warning implement __CTYPE_HAS_UTF_8_LOCALES!
- #define __CTYPE_HAS_UTF_8_LOCALES
/*
 * Implementation-specific work functions: the UTF-8 <-> wide character
 * converters that the generic entry points in this file dispatch to.
 */
extern size_t _wchar_utf8sntowcs(wchar_t *__restrict pwc,
                                 size_t wn,
                                 const char **__restrict src,
                                 size_t n,
                                 mbstate_t *ps,
                                 int allow_continuation);

extern size_t _wchar_wcsntoutf8s(char *__restrict s,
                                 size_t n,
                                 const wchar_t **__restrict src,
                                 size_t wn);

/*
 * glibc extensions: restartable string conversions that additionally
 * take a limit (NMC / NWC) on the number of source units examined.
 */
extern size_t __mbsnrtowcs(wchar_t *__restrict dst,
                           const char **__restrict src,
                           size_t NMC,
                           size_t len,
                           mbstate_t *__restrict ps);

extern size_t __wcsnrtombs(char *__restrict dst,
                           const wchar_t **__restrict src,
                           size_t NWC,
                           size_t len,
                           mbstate_t *__restrict ps);
- /**********************************************************************/
- #ifdef L_btowc
/*
 * btowc - convert a single byte to its wide-character equivalent.
 *
 * c is either EOF or a value that is converted to unsigned char.  The
 * byte is run through mbrtowc() starting from the initial conversion
 * state.
 *
 * Returns the resulting wide character when that one byte forms a
 * complete, valid character on its own; otherwise (including c == EOF)
 * returns WEOF.
 */
wint_t btowc(int c)
{
    wchar_t wc;
    unsigned char buf[1];
    mbstate_t mbstate = { 0 };  /* Initial conversion state. */

    if (c == EOF) {
        return WEOF;
    }

    buf[0] = (unsigned char) c;

    /* mbrtowc() yields 0, not 1, when the converted character is the
     * nul character, so both 0 and 1 mean "one character produced".
     * The error results (size_t) -1 and (size_t) -2 are far above 1. */
    if (mbrtowc(&wc, (const char *) buf, 1, &mbstate) <= 1) {
        return wc;
    }
    return WEOF;
}
- #endif
- /**********************************************************************/
- #ifdef L_wctob
- /* Note: We completely ignore ps in all currently supported conversions. */
/*
 * wctob - convert a wide character to its single-byte representation.
 *
 * Returns the byte, as an unsigned char converted to int, when wcrtomb()
 * encodes c in exactly one byte; otherwise returns EOF.  No conversion
 * state is supplied (NULL is passed to wcrtomb()).
 */
int wctob(wint_t c)
{
    /* wcrtomb() takes a plain char buffer; convert to unsigned char only
     * when handing the byte back to the caller. */
    char buf[MB_LEN_MAX];

    if (wcrtomb(buf, c, NULL) != 1) {
        return EOF;
    }
    return (unsigned char) buf[0];
}
- #endif
- /**********************************************************************/
- #ifdef L_mbsinit
- int mbsinit(const mbstate_t *ps)
- {
- return !ps || !ps->mask;
- }
- #endif
- /**********************************************************************/
- #ifdef L_mbrlen
/*
 * mbrlen - length in bytes of the next multibyte character.
 *
 * Thin wrapper around mbrtowc() that discards the converted character.
 * When ps is NULL a private static state object owned by this function
 * is used instead.  The return value is whatever mbrtowc() returns.
 */
size_t mbrlen(const char *__restrict s, size_t n, mbstate_t *__restrict ps)
{
    static mbstate_t internal_state;    /* Zero-initialized static. */

    if (ps == NULL) {
        ps = &internal_state;
    }
    return mbrtowc(NULL, s, n, ps);
}
- #endif
- /**********************************************************************/
- #ifdef L_mbrtowc
- size_t mbrtowc(wchar_t *__restrict pwc, const char *__restrict s,
- size_t n, mbstate_t *__restrict ps)
- {
- static mbstate_t mbstate; /* Rely on bss 0-init. */
- wchar_t wcbuf[1];
- const char *p;
- size_t r;
- char empty_string[1]; /* Avoid static to be fPIC friendly. */
- if (!ps) {
- ps = &mbstate;
- }
- if (!s) {
- pwc = (wchar_t *) s; /* NULL */
- empty_string[0] = 0; /* Init the empty string when necessary. */
- s = empty_string;
- n = 1;
- } else if (!n) {
- return (ps->mask && (ps->wc == 0xffffU)) /* TODO: change error code? */
- ? ((size_t) -1) : ((size_t) -2);
- }
- p = s;
- #ifdef __CTYPE_HAS_UTF_8_LOCALES
- /* Need to do this here since mbsrtowcs doesn't allow incompletes. */
- if (ENCODING == __ctype_encoding_utf8) {
- r = _wchar_utf8sntowcs(pwc, 1, &p, n, ps, 1);
- return (r == 1) ? (p-s) : r;
- }
- #endif
- r = __mbsnrtowcs(wcbuf, &p, SIZE_MAX, 1, ps);
- if (((ssize_t) r) >= 0) {
- if (pwc) {
- *pwc = *wcbuf;
- }
- }
- return (size_t) r;
- }
- #endif
- /**********************************************************************/
- #ifdef L_wcrtomb
- /* Note: We completely ignore ps in all currently supported conversions. */
- /* TODO: Check for valid state anyway? */
/*
 * wcrtomb - convert one wide character to its multibyte representation.
 *
 *   s    destination buffer, or NULL to behave as wcrtomb(buf, L'\0', ps)
 *        with an internal buffer.
 *   wc   the wide character to convert.
 *   ps   conversion state, passed through to __wcsnrtombs().
 *
 * Returns the number of bytes written, or whatever error value
 * __wcsnrtombs() reports.  The worker reports 0 for the nul character,
 * which is mapped to 1 here because the nul byte itself is stored.
 */
size_t wcrtomb(register char *__restrict s, wchar_t wc,
               mbstate_t *__restrict ps)
{
    wchar_t wcbuf[1];
    const wchar_t *pwc;
    size_t r;
    char buf[MB_LEN_MAX];

    if (!s) {
        s = buf;
        wc = 0;
    }

    wcbuf[0] = wc;
    pwc = wcbuf;

    /* Limit the source to exactly one wide character.  Letting the
     * worker run on to a nul sentinel would make it store that nul after
     * the converted character, i.e. one byte more than this call is
     * supposed to write. */
    r = __wcsnrtombs(s, &pwc, 1, MB_LEN_MAX, ps);

    return (r != 0) ? r : 1;
}
- #endif
- /**********************************************************************/
- #ifdef L_mbsrtowcs
/*
 * mbsrtowcs - convert a multibyte string to a wide-character string.
 *
 * Forwards to __mbsnrtowcs() with the source byte limit set to SIZE_MAX.
 * When ps is NULL a private static state object owned by this function
 * is used instead.
 */
size_t mbsrtowcs(wchar_t *__restrict dst, const char **__restrict src,
                 size_t len, mbstate_t *__restrict ps)
{
    static mbstate_t internal_state;    /* Zero-initialized static. */

    if (ps == NULL) {
        ps = &internal_state;
    }
    return __mbsnrtowcs(dst, src, SIZE_MAX, len, ps);
}
- #endif
- /**********************************************************************/
- #ifdef L_wcsrtombs
- /* Note: We completely ignore ps in all currently supported conversions.
- * TODO: Check for valid state anyway? */
/*
 * wcsrtombs - convert a wide-character string to a multibyte string.
 *
 * Forwards to __wcsnrtombs() with the limit on source wide characters
 * set to SIZE_MAX; ps is handed through unchanged.
 */
size_t wcsrtombs(char *__restrict dst, const wchar_t **__restrict src,
                 size_t len, mbstate_t *__restrict ps)
{
    const size_t no_source_limit = SIZE_MAX;

    return __wcsnrtombs(dst, src, no_source_limit, len, ps);
}
- #endif
- /**********************************************************************/
- #ifdef L__wchar_utf8sntowcs
- /* Define DECODER to generate a UTF-8 decoder which passes Markus Kuhn's
- * UTF-8-test.txt stress test.
- */
- /* #define DECODER */
- #ifdef DECODER
- #ifndef KUHN
- #define KUHN
- #endif
- #endif
- size_t _wchar_utf8sntowcs(wchar_t *__restrict pwc, size_t wn,
- const char **__restrict src, size_t n,
- mbstate_t *ps, int allow_continuation)
- {
- register const char *s;
- __uwchar_t mask;
- __uwchar_t wc;
- wchar_t wcbuf[1];
- size_t count;
- int incr;
- s = *src;
- assert(s != NULL);
- assert(ps != NULL);
- incr = 1;
- if (!pwc) {
- pwc = wcbuf;
- wn = SIZE_MAX;
- incr = 0;
- }
- #warning fix _wchar_utf8sntowcs to allow wn == 0!
- assert(wn > 0); /* TODO: fix this!! */
- count = wn;
- if ((mask = (__uwchar_t) ps->mask) != 0) { /* A continuation... */
- #ifdef DECODER
- wc = (__uwchar_t) ps->wc;
- if (n) {
- goto CONTINUE;
- }
- goto DONE;
- #else
- if ((wc = (__uwchar_t) ps->wc) != 0xffffU) {
- /* TODO: change error code here and below? */
- if (n) {
- goto CONTINUE;
- }
- goto DONE;
- }
- return (size_t) -1; /* We're in an error state. */
- #endif
- }
- do {
- if (!n) {
- goto DONE;
- }
- --n;
- if ((wc = ((unsigned char) *s++)) >= 0x80) { /* Not ASCII... */
- mask = 0x40;
- #warning fix range for 16 bit wides
- if ( ((unsigned char)(s[-1] - 0xc0)) < (0xfe - 0xc0) ) {
- goto START;
- }
- BAD:
- #ifdef DECODER
- wc = 0xfffd;
- goto COMPLETE;
- #else
- ps->mask = mask;
- ps->wc = 0xffffU;
- return (size_t) -1; /* Illegal start byte! */
- #endif
- CONTINUE:
- while (n) {
- --n;
- if ((*s & 0xc0) != 0x80) {
- goto BAD;
- }
- mask <<= 5;
- wc <<= 6;
- wc += (*s & 0x3f); /* keep seperate for bcc (smaller code) */
- ++s;
- START:
- wc &= ~(mask << 1);
- if ((wc & mask) == 0) { /* Character completed. */
- if ((mask >>= 5) == 0x40) {
- mask += mask;
- }
- /* Check for invalid sequences (longer than necessary)
- * and invalid chars. */
- if ( (wc < mask) /* Sequence not minimal length. */
- #ifdef KUHN
- #if UTF_8_MAX_LEN == 3
- #error broken since mask can overflow!!
- /* For plane 0, these are the only defined values.*/
- || (wc > 0xfffdU)
- #else
- /* Note that we don't need to worry about exceeding */
- /* 31 bits as that is the most that UTF-8 provides. */
- || ( ((__uwchar_t)(wc - 0xfffeU)) < 2)
- #endif
- || ( ((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U) )
- #endif /* KUHN */
- ) {
- goto BAD;
- }
- goto COMPLETE;
- }
- }
- /* Character potentially valid but incomplete. */
- if (!allow_continuation) {
- if (count != wn) {
- return 0;
- }
- /* NOTE: The following can fail if you allow and then disallow
- * continuation!!! */
- #if UTF_8_MAX_LEN == 3
- #error broken since mask can overflow!!
- #endif
- /* Need to back up... */
- do {
- --s;
- } while ((mask >>= 5) >= 0x40);
- goto DONE;
- }
- ps->mask = (wchar_t) mask;
- ps->wc = (wchar_t) wc;
- *src = s;
- return (size_t) -2;
- }
- COMPLETE:
- *pwc = wc;
- pwc += incr;
- }
- #ifdef DECODER
- while (--count);
- #else
- while (wc && --count);
- if (!wc) {
- s = NULL;
- }
- #endif
- DONE:
- /* ps->wc is irrelavent here. */
- ps->mask = 0;
- if (pwc != wcbuf) {
- *src = s;
- }
- return wn - count;
- }
- #endif
- /**********************************************************************/
- #ifdef L__wchar_wcstoutf8s
- size_t _wchar_wcsntoutf8s(char *__restrict s, size_t n,
- const wchar_t **__restrict src, size_t wn)
- {
- register char *p;
- size_t len, t;
- __uwchar_t wc;
- const __uwchar_t *swc;
- int store;
- char buf[MB_LEN_MAX];
- char m;
- store = 1;
- if (!s) {
- s = buf;
- n = SIZE_MAX;
- store = 0;
- }
- t = n;
- swc = (const __uwchar_t *) *src;
- assert(swc != NULL);
- while (wn && t) {
- wc = *swc;
- *s = wc;
- len = 1;
- if (wc >= 0x80) {
- #ifdef KUHN
- if (
- #if UTF_8_MAX_LEN == 3
- /* For plane 0, these are the only defined values.*/
- /* Note that we don't need to worry about exceeding */
- /* 31 bits as that is the most that UTF-8 provides. */
- (wc > 0xfffdU)
- #else
- /* UTF_8_MAX_LEN == 6 */
- (wc > 0x7fffffffUL)
- || ( ((__uwchar_t)(wc - 0xfffeU)) < 2)
- #endif
- || ( ((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U) )
- ) {
- return (size_t) -1;
- }
- #else /* KUHN */
- #if UTF_8_MAX_LEN != 3
- if (wc > 0x7fffffffUL) { /* Value too large. */
- return (size_t) -1;
- }
- #endif
- #endif /* KUHN */
- wc >>= 1;
- p = s;
- do {
- ++p;
- } while (wc >>= 5);
- wc = *swc;
- if ((len = p - s) > t) { /* Not enough space. */
- break;
- }
- m = 0x80;
- while( p>s ) {
- m = (m >> 1) | 0x80;
- *--p = (wc & 0x3f) | 0x80;
- wc >>= 6;
- }
- *s |= (m << 1);
- } else if (wc == 0) { /* End of string. */
- swc = NULL;
- break;
- }
- ++swc;
- --wn;
- t -= len;
- if (store) {
- s += len;
- }
- }
- *src = (const wchar_t *) swc;
- return n - t;
- }
- #endif
- /**********************************************************************/
- #ifdef L___mbsnrtowcs
- /* WARNING: We treat len as SIZE_MAX when dst is NULL! */
- size_t mbsnrtowcs(wchar_t *__restrict dst, const char **__restrict src,
- size_t NMC, size_t len, mbstate_t *__restrict ps)
- __attribute__ ((__weak__, __alias__("__mbsnrtowcs")));
- size_t __mbsnrtowcs(wchar_t *__restrict dst, const char **__restrict src,
- size_t NMC, size_t len, mbstate_t *__restrict ps)
- {
- static mbstate_t mbstate; /* Rely on bss 0-init. */
- wchar_t wcbuf[1];
- const char *s;
- size_t count, r;
- int incr;
- if (!ps) {
- ps = &mbstate;
- }
- #ifdef __CTYPE_HAS_UTF_8_LOCALES
- if (ENCODING == __ctype_encoding_utf8) {
- return ((r = _wchar_utf8sntowcs(dst, len, src, NMC, ps, 1))
- != (size_t) -2) ? r : 0;
- }
- #endif
- incr = 1;
- if (!dst) {
- dst = wcbuf;
- len = SIZE_MAX;
- incr = 0;
- }
- /* Since all the following encodings are single-byte encodings... */
- if (len > NMC) {
- len = NMC;
- }
- count = len;
- s = *src;
- #ifdef __CTYPE_HAS_8_BIT_LOCALES
- if (ENCODING == __ctype_encoding_8_bit) {
- wchar_t wc;
- while (count) {
- if ((wc = ((unsigned char)(*s))) >= 0x80) { /* Non-ASCII... */
- wc -= 0x80;
- wc = __global_locale.tbl8c2wc[
- (__global_locale.idx8c2wc[wc >> Cc2wc_IDX_SHIFT]
- << Cc2wc_IDX_SHIFT) + (wc & (Cc2wc_ROW_LEN - 1))];
- if (!wc) {
- goto BAD;
- }
- } else if (!wc) {
- s = NULL;
- break;
- }
- ++s;
- *dst = wc;
- dst += incr;
- --count;
- }
- if (dst != wcbuf) {
- *src = s;
- }
- return len - count;
- }
- #endif
- assert(ENCODING == __ctype_encoding_7_bit);
- while (count) {
- if ((*dst = (unsigned char) *s) == 0) {
- s = NULL;
- break;
- }
- if (*dst >= 0x80) {
- #ifdef __CTYPE_HAS_8_BIT_LOCALES
- BAD:
- #endif
- __set_errno(EILSEQ);
- return (size_t) -1;
- }
- ++s;
- dst += incr;
- --count;
- }
- if (dst != wcbuf) {
- *src = s;
- }
- return len - count;
- }
- #endif
- /**********************************************************************/
- #ifdef L___wcsnrtombs
- /* WARNING: We treat len as SIZE_MAX when dst is NULL! */
- /* Note: We completely ignore ps in all currently supported conversions.
- * TODO: Check for valid state anyway? */
- size_t wcsnrtombs(char *__restrict dst, const wchar_t **__restrict src,
- size_t NWC, size_t len, mbstate_t *__restrict ps)
- __attribute__ ((__weak__, __alias__("__wcsnrtombs")));
- size_t __wcsnrtombs(char *__restrict dst, const wchar_t **__restrict src,
- size_t NWC, size_t len, mbstate_t *__restrict ps)
- {
- const __uwchar_t *s;
- size_t count;
- int incr;
- char buf[MB_LEN_MAX];
- #ifdef __CTYPE_HAS_UTF_8_LOCALES
- if (ENCODING == __ctype_encoding_utf8) {
- return _wchar_wcsntoutf8s(dst, len, src, NWC);
- }
- #endif
- incr = 1;
- if (!dst) {
- dst = buf;
- len = SIZE_MAX;
- incr = 0;
- }
- /* Since all the following encodings are single-byte encodings... */
- if (len > NWC) {
- len = NWC;
- }
- count = len;
- s = (const __uwchar_t *) *src;
- #ifdef __CTYPE_HAS_8_BIT_LOCALES
- if (ENCODING == __ctype_encoding_8_bit) {
- __uwchar_t wc;
- __uwchar_t u;
- while (count) {
- if ((wc = *s) <= 0x7f) {
- if (!(*dst = (unsigned char) wc)) {
- s = NULL;
- break;
- }
- } else {
- u = 0;
- if (wc <= Cwc2c_DOMAIN_MAX) {
- u = __global_locale.idx8wc2c[wc >> (Cwc2c_TI_SHIFT
- + Cwc2c_TT_SHIFT)];
- u = __global_locale.tbl8wc2c[(u << Cwc2c_TI_SHIFT)
- + ((wc >> Cwc2c_TT_SHIFT)
- & ((1 << Cwc2c_TI_SHIFT)-1))];
- u = __global_locale.tbl8wc2c[Cwc2c_TI_LEN
- + (u << Cwc2c_TT_SHIFT)
- + (wc & ((1 << Cwc2c_TT_SHIFT)-1))];
- }
- /* #define __WCHAR_REPLACEMENT_CHAR '?' */
- #ifdef __WCHAR_REPLACEMENT_CHAR
- *dst = (unsigned char) ( u ? u : __WCHAR_REPLACEMENT_CHAR );
- #else
- if (!u) {
- goto BAD;
- }
- *dst = (unsigned char) u;
- #endif
- }
- ++s;
- dst += incr;
- --count;
- }
- if (dst != buf) {
- *src = (const wchar_t *) s;
- }
- return len - count;
- }
- #endif
- assert(ENCODING == __ctype_encoding_7_bit);
- while (count) {
- if (*s >= 0x80) {
- #if defined(__CTYPE_HAS_8_BIT_LOCALES) && !defined(__WCHAR_REPLACEMENT_CHAR)
- BAD:
- #endif
- __set_errno(EILSEQ);
- return (size_t) -1;
- }
- if ((*dst = (unsigned char) *s) == 0) {
- s = NULL;
- break;
- }
- ++s;
- dst += incr;
- --count;
- }
- if (dst != buf) {
- *src = (const wchar_t *) s;
- }
- return len - count;
- }
- #endif
- /**********************************************************************/
|