/* iconv.c */
  1. #include <iconv.h>
  2. #include <errno.h>
  3. #include <wchar.h>
  4. #include <string.h>
  5. #include <strings.h>
  6. #include <stdlib.h>
  7. #include <limits.h>
  8. #include <dirent.h>
  9. #include <fcntl.h>
  10. #include <sys/mman.h>
  11. #include <sys/stat.h>
  12. #include <unistd.h>
  13. #include <stdint.h>
  14. /* builtin charmaps */
  15. #include "charmaps.h"
  16. /* only 0-7 are valid as dest charset */
  17. #define UTF_16BE 000
  18. #define UTF_16LE 001
  19. #define UTF_32BE 002
  20. #define UTF_32LE 003
  21. #define WCHAR_T 004
  22. #define UTF_8 005
  23. #define US_ASCII 006
  24. #define LATIN_1 007
  25. /* additional charsets with algorithmic conversion */
  26. #define LATIN_9 010
  27. #define TIS_620 011
  28. #define JIS_0201 012
/*
 * Exported version variable; some programs like php need this.
 * NOTE(review): _LIBICONV_VERSION is not defined in this file; presumably
 * it is provided by <iconv.h> -- confirm.
 */
int _libiconv_version = _LIBICONV_VERSION;
  31. /* these must match the constants above */
  32. static const unsigned char charsets[] =
  33. "\005" "UTF-8" "\0"
  34. "\004" "WCHAR_T" "\0"
  35. "\000" "UTF-16BE" "\0"
  36. "\001" "UTF-16LE" "\0"
  37. "\002" "UTF-32BE" "\0"
  38. "\003" "UTF-32LE" "\0"
  39. "\006" "ASCII" "\0"
  40. "\006" "US-ASCII" "\0"
  41. "\007" "ISO-8859-1" "\0"
  42. "\007" "LATIN1" "\0"
  43. "\010" "ISO-8859-15""\0"
  44. "\010" "LATIN9" "\0"
  45. "\011" "ISO-8859-11""\0"
  46. "\011" "TIS-620" "\0"
  47. "\012" "JIS-0201" "\0"
  48. "\377";
  49. /* separate identifiers for sbcs/dbcs/etc map type */
  50. #define UCS2_8BIT 000
  51. #define UCS3_8BIT 001
  52. #define EUC 002
  53. #define EUC_TW 003
  54. #define SHIFT_JIS 004
  55. #define BIG5 005
  56. #define GBK 006
  57. /* FIXME: these are not implemented yet
  58. // EUC: A1-FE A1-FE
  59. // GBK: 81-FE 40-7E,80-FE
  60. // Big5: A1-FE 40-7E,A1-FE
  61. */
  62. static const unsigned short maplen[] = {
  63. [UCS2_8BIT] = 4+ 2* 128,
  64. [UCS3_8BIT] = 4+ 3* 128,
  65. [EUC] = 4+ 2* 94*94,
  66. [SHIFT_JIS] = 4+ 2* 94*94,
  67. [BIG5] = 4+ 2* 94*157,
  68. [GBK] = 4+ 2* 126*190,
  69. [EUC_TW] = 4+ 2* 2*94*94,
  70. };
  71. static int find_charmap(const char *name)
  72. {
  73. int i;
  74. for (i = 0; i < (sizeof(charmaps) / sizeof(charmaps[0])); i++)
  75. if (!strcasecmp(charmaps[i].name, name))
  76. return i;
  77. return -1;
  78. }
  79. static int find_charset(const char *name)
  80. {
  81. const unsigned char *s;
  82. for (s=charsets; *s<0xff && strcasecmp(s+1, name); s+=strlen(s)+1);
  83. return *s;
  84. }
  85. iconv_t iconv_open(const char *to, const char *from)
  86. {
  87. unsigned f, t;
  88. int m;
  89. if ((t = find_charset(to)) > 8)
  90. return -1;
  91. if ((f = find_charset(from)) < 255)
  92. return 0 | (t<<1) | (f<<8);
  93. if ((m = find_charmap(from)) > -1)
  94. return 1 | (t<<1) | (m<<8);
  95. return -1;
  96. }
  97. int iconv_close(iconv_t cd)
  98. {
  99. return 0;
  100. }
  101. static inline wchar_t get_16(const unsigned char *s, int endian)
  102. {
  103. endian &= 1;
  104. return s[endian]<<8 | s[endian^1];
  105. }
  106. static inline void put_16(unsigned char *s, wchar_t c, int endian)
  107. {
  108. endian &= 1;
  109. s[endian] = c>>8;
  110. s[endian^1] = c;
  111. }
  112. static inline int utf8enc_wchar(char *outb, wchar_t c)
  113. {
  114. if (c <= 0x7F) {
  115. *outb = c;
  116. return 1;
  117. }
  118. else if (c <= 0x7FF) {
  119. *outb++ = ((c >> 6) & 0x1F) | 0xC0;
  120. *outb++ = ( c & 0x3F) | 0x80;
  121. return 2;
  122. }
  123. else if (c <= 0xFFFF) {
  124. *outb++ = ((c >> 12) & 0x0F) | 0xE0;
  125. *outb++ = ((c >> 6) & 0x3F) | 0x80;
  126. *outb++ = ( c & 0x3F) | 0x80;
  127. return 3;
  128. }
  129. else if (c <= 0x10FFFF) {
  130. *outb++ = ((c >> 18) & 0x07) | 0xF0;
  131. *outb++ = ((c >> 12) & 0x3F) | 0x80;
  132. *outb++ = ((c >> 6) & 0x3F) | 0x80;
  133. *outb++ = ( c & 0x3F) | 0x80;
  134. return 4;
  135. }
  136. else {
  137. *outb++ = '?';
  138. return 1;
  139. }
  140. }
  141. static inline int utf8seq_is_overlong(char *s, int n)
  142. {
  143. switch (n)
  144. {
  145. case 2:
  146. /* 1100000x (10xxxxxx) */
  147. return (((*s >> 1) == 0x60) &&
  148. ((*(s+1) >> 6) == 0x02));
  149. case 3:
  150. /* 11100000 100xxxxx (10xxxxxx) */
  151. return ((*s == 0xE0) &&
  152. ((*(s+1) >> 5) == 0x04) &&
  153. ((*(s+2) >> 6) == 0x02));
  154. case 4:
  155. /* 11110000 1000xxxx (10xxxxxx 10xxxxxx) */
  156. return ((*s == 0xF0) &&
  157. ((*(s+1) >> 4) == 0x08) &&
  158. ((*(s+2) >> 6) == 0x02) &&
  159. ((*(s+3) >> 6) == 0x02));
  160. }
  161. return 0;
  162. }
  163. static inline int utf8seq_is_surrogate(char *s, int n)
  164. {
  165. return ((n == 3) && (*s == 0xED) && (*(s+1) >= 0xA0) && (*(s+1) <= 0xBF));
  166. }
  167. static inline int utf8seq_is_illegal(char *s, int n)
  168. {
  169. return ((n == 3) && (*s == 0xEF) && (*(s+1) == 0xBF) &&
  170. (*(s+2) >= 0xBE) && (*(s+2) <= 0xBF));
  171. }
  172. static inline int utf8dec_wchar(wchar_t *c, unsigned char *in, size_t inb)
  173. {
  174. int i;
  175. int n = -1;
  176. /* trivial char */
  177. if (*in <= 0x7F) {
  178. *c = *in;
  179. return 1;
  180. }
  181. /* find utf8 sequence length */
  182. if ((*in & 0xE0) == 0xC0) n = 2;
  183. else if ((*in & 0xF0) == 0xE0) n = 3;
  184. else if ((*in & 0xF8) == 0xF0) n = 4;
  185. else if ((*in & 0xFC) == 0xF8) n = 5;
  186. else if ((*in & 0xFE) == 0xFC) n = 6;
  187. /* starved? */
  188. if (n > inb)
  189. return -2;
  190. /* decode ... */
  191. if (n > 1 && n < 5) {
  192. /* reject invalid sequences */
  193. if (utf8seq_is_overlong(in, n) ||
  194. utf8seq_is_surrogate(in, n) ||
  195. utf8seq_is_illegal(in, n))
  196. return -1;
  197. /* decode ... */
  198. *c = (char)(*in++ & (0x7F >> n));
  199. for (i = 1; i < n; i++) {
  200. /* illegal continuation byte */
  201. if (*in < 0x80 || *in > 0xBF)
  202. return -1;
  203. *c = (*c << 6) | (*in++ & 0x3F);
  204. }
  205. return n;
  206. }
  207. /* unmapped sequence (> 4) */
  208. return -1;
  209. }
  210. static inline char latin9_translit(wchar_t c)
  211. {
  212. /* a number of trivial iso-8859-15 <> utf-8 transliterations */
  213. switch (c) {
  214. case 0x20AC: return 0xA4; /* Euro */
  215. case 0x0160: return 0xA6; /* S caron */
  216. case 0x0161: return 0xA8; /* s caron */
  217. case 0x017D: return 0xB4; /* Z caron */
  218. case 0x017E: return 0xB8; /* z caron */
  219. case 0x0152: return 0xBC; /* OE */
  220. case 0x0153: return 0xBD; /* oe */
  221. case 0x0178: return 0xBE; /* Y diaeresis */
  222. default: return '?';
  223. }
  224. }
  225. size_t iconv(iconv_t cd, char **in, size_t *inb, char **out, size_t *outb)
  226. {
  227. size_t x=0;
  228. unsigned char to = (cd>>1)&127;
  229. unsigned char from = 255;
  230. const unsigned char *map = 0;
  231. char tmp[MB_LEN_MAX];
  232. wchar_t c, d;
  233. size_t k, l;
  234. int err;
  235. if (!in || !*in || !*inb) return 0;
  236. if (cd & 1)
  237. map = charmaps[cd>>8].map;
  238. else
  239. from = cd>>8;
  240. for (; *inb; *in+=l, *inb-=l) {
  241. c = *(unsigned char *)*in;
  242. l = 1;
  243. if (from >= UTF_8 && c < 0x80) goto charok;
  244. switch (from) {
  245. case WCHAR_T:
  246. l = sizeof(wchar_t);
  247. if (*inb < l) goto starved;
  248. c = *(wchar_t *)*in;
  249. break;
  250. case UTF_8:
  251. l = utf8dec_wchar(&c, *in, *inb);
  252. if (!l) l++;
  253. else if (l == (size_t)-1) goto ilseq;
  254. else if (l == (size_t)-2) goto starved;
  255. break;
  256. case US_ASCII:
  257. goto ilseq;
  258. case LATIN_9:
  259. if ((unsigned)c - 0xa4 <= 0xbe - 0xa4) {
  260. static const unsigned char map[] = {
  261. 0, 0x60, 0, 0x61, 0, 0, 0, 0, 0, 0, 0,
  262. 0, 0, 0, 0, 0x7d, 0, 0, 0, 0x7e, 0, 0, 0,
  263. 0x52, 0x53, 0x78
  264. };
  265. if (c == 0xa4) c = 0x20ac;
  266. else if (map[c-0xa5]) c = 0x100 | map[c-0xa5];
  267. }
  268. case LATIN_1:
  269. goto charok;
  270. case TIS_620:
  271. if (c >= 0xa1) c += 0x0e01-0xa1;
  272. goto charok;
  273. case JIS_0201:
  274. if (c >= 0xa1) {
  275. if (c <= 0xdf) c += 0xff61-0xa1;
  276. else goto ilseq;
  277. }
  278. goto charok;
  279. case UTF_16BE:
  280. case UTF_16LE:
  281. l = 2;
  282. if (*inb < 2) goto starved;
  283. c = get_16(*in, from);
  284. if ((unsigned)(c-0xdc00) < 0x400) goto ilseq;
  285. if ((unsigned)(c-0xd800) < 0x400) {
  286. l = 4;
  287. if (*inb < 4) goto starved;
  288. d = get_16(*in + 2, from);
  289. if ((unsigned)(c-0xdc00) >= 0x400) goto ilseq;
  290. c = ((c-0xd800)<<10) | (d-0xdc00);
  291. }
  292. break;
  293. case UTF_32BE:
  294. case UTF_32LE:
  295. l = 4;
  296. if (*inb < 4) goto starved;
  297. // FIXME
  298. // c = get_32(*in, from);
  299. break;
  300. default:
  301. /* only support ascii supersets */
  302. if (c < 0x80) break;
  303. switch (map[0]) {
  304. case UCS2_8BIT:
  305. c -= 0x80;
  306. break;
  307. case EUC:
  308. if ((unsigned)c - 0xa1 >= 94) goto ilseq;
  309. if ((unsigned)in[0][1] - 0xa1 >= 94) goto ilseq;
  310. c = (c-0xa1)*94 + (in[0][1]-0xa1);
  311. l = 2;
  312. break;
  313. case SHIFT_JIS:
  314. if ((unsigned)c - 0xa1 <= 0xdf-0xa1) {
  315. c += 0xff61-0xa1;
  316. goto charok;
  317. }
  318. // FIXME...
  319. l = 2;
  320. break;
  321. default:
  322. goto badf;
  323. }
  324. c = get_16(map + 4 + 2*c, 0);
  325. if (c == 0xffff) goto ilseq;
  326. goto charok;
  327. }
  328. if ((unsigned)c - 0xd800 < 0x800 || (unsigned)c >= 0x110000)
  329. goto ilseq;
  330. charok:
  331. switch (to) {
  332. case WCHAR_T:
  333. if (*outb < sizeof(wchar_t)) goto toobig;
  334. *(wchar_t *)*out = c;
  335. *out += sizeof(wchar_t);
  336. *outb -= sizeof(wchar_t);
  337. break;
  338. case UTF_8:
  339. if (*outb < 4) {
  340. k = utf8enc_wchar(tmp, c);
  341. if (*outb < k) goto toobig;
  342. memcpy(*out, tmp, k);
  343. } else k = utf8enc_wchar(*out, c);
  344. *out += k;
  345. *outb -= k;
  346. break;
  347. case US_ASCII:
  348. if (c > 0x7f) c = 0xfffd;
  349. /* fall thru and count replacement in latin1 case */
  350. case LATIN_9:
  351. if (c >= 0x100 && c != 0xfffd)
  352. c = latin9_translit(c);
  353. /* fall through */
  354. case LATIN_1:
  355. if (!*outb) goto toobig;
  356. if (c < 0x100) **out = c;
  357. else x++, **out = '*'; //FIXME: translit?
  358. ++*out;
  359. --*outb;
  360. break;
  361. case UTF_16BE:
  362. case UTF_16LE:
  363. if (c < 0x10000) {
  364. if (*outb < 2) goto toobig;
  365. put_16(*out, c, to);
  366. *out += 2;
  367. *outb -= 2;
  368. break;
  369. }
  370. if (*outb < 4) goto toobig;
  371. put_16(*out, (c>>10)|0xd800, to);
  372. put_16(*out + 2, (c&0x3ff)|0xdc00, to);
  373. *out += 4;
  374. *outb -= 4;
  375. break;
  376. default:
  377. goto badf;
  378. }
  379. }
  380. return x;
  381. ilseq:
  382. err = EILSEQ;
  383. x = -1;
  384. goto end;
  385. badf:
  386. err = EBADF;
  387. x = -1;
  388. goto end;
  389. toobig:
  390. err = E2BIG;
  391. x = -1;
  392. goto end;
  393. starved:
  394. err = EINVAL;
  395. end:
  396. errno = err;
  397. return x;
  398. }