/* iconv.c */
  1. #include <iconv.h>
  2. #include <errno.h>
  3. #include <wchar.h>
  4. #include <string.h>
  5. #include <strings.h>
  6. #include <stdlib.h>
  7. #include <limits.h>
  8. #include <dirent.h>
  9. #include <fcntl.h>
  10. #include <sys/mman.h>
  11. #include <sys/stat.h>
  12. #include <unistd.h>
  13. #include <stdint.h>
  14. /* builtin charmaps */
  15. #include "charmaps.h"
  /*
   * Charset identifiers.
   *
   * Each value is the tag byte stored in front of the matching name(s) in
   * charsets[]; octal is used so the constants read exactly like the "\0nn"
   * escapes in that table.
   *
   * Ids 0-7 are the ones meant to be usable as a destination charset.
   * NOTE(review): iconv_open() currently also lets id 010 (LATIN_9) through
   * as a destination -- confirm which of the two is intended.
   */
  #define UTF_16BE  000
  #define UTF_16LE  001
  #define UTF_32BE  002
  #define UTF_32LE  003
  #define WCHAR_T   004
  #define UTF_8     005
  #define US_ASCII  006
  #define LATIN_1   007

  /* Further charsets that are converted by arithmetic rather than by table. */
  #define LATIN_9   010
  #define TIS_620   011
  #define JIS_0201  012
  #define EUC_JP    013
  /*
   * Exported version symbol; some programs (php, for one) need it to be
   * present.  Its value is whatever the included headers define as
   * _LIBICONV_VERSION.
   */
  int _libiconv_version = _LIBICONV_VERSION;
  /*
   * Name table searched by find_charset().
   *
   * Layout: a sequence of records, each made of one tag byte (the charset id,
   * which must match the octal constants defined above), the charset name and
   * a terminating NUL.  Several names may share one tag (aliases).  A single
   * 0xff byte ends the table.
   */
  #define CHARSET_ENTRY(tag, name) tag name "\0"
  static const unsigned char charsets[] =
      CHARSET_ENTRY("\005", "UTF-8")
      CHARSET_ENTRY("\004", "WCHAR_T")
      CHARSET_ENTRY("\000", "UTF-16BE")
      CHARSET_ENTRY("\001", "UTF-16LE")
      CHARSET_ENTRY("\002", "UTF-32BE")
      CHARSET_ENTRY("\003", "UTF-32LE")
      CHARSET_ENTRY("\006", "ASCII")
      CHARSET_ENTRY("\006", "US-ASCII")
      CHARSET_ENTRY("\007", "ISO-8859-1")
      CHARSET_ENTRY("\007", "LATIN1")
      CHARSET_ENTRY("\010", "ISO-8859-15")
      CHARSET_ENTRY("\010", "LATIN9")
      CHARSET_ENTRY("\011", "ISO-8859-11")
      CHARSET_ENTRY("\011", "TIS-620")
      CHARSET_ENTRY("\012", "JIS-0201")
      CHARSET_ENTRY("\013", "EUC-JP")
      "\377";
  #undef CHARSET_ENTRY
  /*
   * Map-type identifiers for table-driven charsets (sbcs/dbcs/etc).
   *
   * iconv() reads the map type from the first byte of a charmap blob
   * (map[0]) and maplen[] is indexed by these values.  They are a separate
   * namespace from the charset ids defined earlier in this file.
   */
  #define UCS2_8BIT 000
  #define UCS3_8BIT 001
  /*
   * EUC_JP is already defined earlier in this file as charset id 013.
   * Redefining an object-like macro with a different replacement list is a
   * constraint violation (C11 6.10.3p2), so drop the old definition first.
   * From here on EUC_JP means the map type, which is how maplen[] and the
   * charmap branch of iconv() use it.
   */
  #undef  EUC_JP
  #define EUC_JP    002
  #define EUC_TW    003
  #define SHIFT_JIS 004
  #define BIG5      005
  #define GBK       006

  /*
   * FIXME: these are not implemented yet
   *   EUC:  A1-FE  A1-FE
   *   GBK:  81-FE  40-7E,80-FE
   *   Big5: A1-FE  40-7E,A1-FE
   */
  64. static const unsigned short maplen[] = {
  65. [UCS2_8BIT] = 4+ 2* 128,
  66. [UCS3_8BIT] = 4+ 3* 128,
  67. [EUC_JP] = 4+ 2* 94*94,
  68. [SHIFT_JIS] = 4+ 2* 94*94,
  69. [BIG5] = 4+ 2* 94*157,
  70. [GBK] = 4+ 2* 126*190,
  71. [EUC_TW] = 4+ 2* 2*94*94,
  72. };
  73. static int find_charmap(const char *name)
  74. {
  75. int i;
  76. for (i = 0; i < (sizeof(charmaps) / sizeof(charmaps[0])); i++)
  77. if (!strcasecmp(charmaps[i].name, name))
  78. return i;
  79. return -1;
  80. }
  81. static int find_charset(const char *name)
  82. {
  83. const unsigned char *s;
  84. for (s=charsets; *s<0xff && strcasecmp(s+1, name); s+=strlen(s)+1);
  85. return *s;
  86. }
  /*
   * Create a conversion descriptor for converting `from` -> `to`.
   *
   * Nothing is allocated; the descriptor is a packed value:
   *   bit 0     0 = source is a built-in charset, 1 = source is a charmap
   *   bits 1-7  destination charset id
   *   bits 8+   source charset id, or index into charmaps[]
   *
   * The destination must be a built-in charset with id <= 8.  The source may
   * be any built-in charset or any charmap name.
   *
   * Returns the descriptor, or (iconv_t)-1 with errno set to EINVAL when the
   * requested conversion is not supported.
   */
  iconv_t iconv_open(const char *to, const char *from)
  {
      unsigned f, t;
      int m;

      t = find_charset(to);
      if (t > 8)
          goto unsupported;

      f = find_charset(from);
      if (f < 255)
          return (iconv_t)(uintptr_t)(0 | (t << 1) | (f << 8));

      m = find_charmap(from);
      if (m > -1)
          return (iconv_t)(uintptr_t)(1 | (t << 1) | ((unsigned)m << 8));

  unsupported:
      /* POSIX: an unsupported conversion is reported through EINVAL. */
      errno = EINVAL;
      return (iconv_t)-1;
  }
  /*
   * Release a conversion descriptor.
   *
   * Descriptors produced by iconv_open() are plain packed values with no
   * resources behind them, so there is nothing to do.  Always returns 0.
   */
  int iconv_close(iconv_t cd)
  {
      (void)cd;
      return 0;
  }
  /*
   * Read a 16-bit unit from s[0..1].
   * Only bit 0 of `endian` is looked at: 0 = big endian, 1 = little endian.
   */
  static inline wchar_t get_16(const unsigned char *s, int endian)
  {
      const int hi = endian & 1;   /* index of the most significant byte */
      const int lo = hi ^ 1;       /* index of the least significant byte */

      return (s[hi] << 8) | s[lo];
  }
  /*
   * Store the low 16 bits of c into s[0..1].
   * Only bit 0 of `endian` is looked at: 0 = big endian, 1 = little endian.
   */
  static inline void put_16(unsigned char *s, wchar_t c, int endian)
  {
      unsigned char *hi = s + (endian & 1);
      unsigned char *lo = s + ((endian & 1) ^ 1);

      *hi = c >> 8;
      *lo = c;
  }
  /*
   * Encode code point c as UTF-8 into outb.
   *
   * Writes 1 to 4 bytes and returns how many were written.  Values above
   * 0x10FFFF cannot be encoded and produce a single '?'.  The caller must
   * provide room for up to 4 bytes.
   */
  static inline int utf8enc_wchar(char *outb, wchar_t c)
  {
      if (c <= 0x7F) {
          outb[0] = c;
          return 1;
      }
      if (c <= 0x7FF) {
          outb[0] = 0xC0 | ((c >> 6) & 0x1F);
          outb[1] = 0x80 | (c & 0x3F);
          return 2;
      }
      if (c <= 0xFFFF) {
          outb[0] = 0xE0 | ((c >> 12) & 0x0F);
          outb[1] = 0x80 | ((c >> 6) & 0x3F);
          outb[2] = 0x80 | (c & 0x3F);
          return 3;
      }
      if (c <= 0x10FFFF) {
          outb[0] = 0xF0 | ((c >> 18) & 0x07);
          outb[1] = 0x80 | ((c >> 12) & 0x3F);
          outb[2] = 0x80 | ((c >> 6) & 0x3F);
          outb[3] = 0x80 | (c & 0x3F);
          return 4;
      }
      /* out of Unicode range */
      outb[0] = '?';
      return 1;
  }
  /*
   * Tell whether the n-byte UTF-8 sequence at s is an overlong encoding,
   * i.e. uses more bytes than the code point needs.
   *
   * s must point to at least n readable bytes.  Returns 1 for an overlong
   * 2-, 3- or 4-byte sequence, 0 otherwise (including any other n).
   *
   * The bytes are examined as unsigned char: where plain char is signed,
   * comparing it against 0xE0/0xF0 or shifting it can never match, which
   * would let every overlong sequence through.
   */
  static inline int utf8seq_is_overlong(char *s, int n)
  {
      const unsigned char *b = (const unsigned char *)s;

      switch (n) {
      case 2:
          /* 1100000x (10xxxxxx) */
          return (b[0] >> 1) == 0x60 &&
                 (b[1] >> 6) == 0x02;
      case 3:
          /* 11100000 100xxxxx (10xxxxxx) */
          return b[0] == 0xE0 &&
                 (b[1] >> 5) == 0x04 &&
                 (b[2] >> 6) == 0x02;
      case 4:
          /* 11110000 1000xxxx (10xxxxxx 10xxxxxx) */
          return b[0] == 0xF0 &&
                 (b[1] >> 4) == 0x08 &&
                 (b[2] >> 6) == 0x02 &&
                 (b[3] >> 6) == 0x02;
      default:
          return 0;
      }
  }
  /*
   * Tell whether the n-byte UTF-8 sequence at s encodes a UTF-16 surrogate
   * (U+D800..U+DFFF): a 3-byte sequence starting ED A0..BF.
   *
   * Returns 1 if so, 0 otherwise.  Bytes are read as unsigned char so the
   * comparisons also work where plain char is signed.
   */
  static inline int utf8seq_is_surrogate(char *s, int n)
  {
      const unsigned char *b = (const unsigned char *)s;

      if (n != 3)
          return 0;
      return b[0] == 0xED && b[1] >= 0xA0 && b[1] <= 0xBF;
  }
  /*
   * Tell whether the n-byte UTF-8 sequence at s is EF BF BE or EF BF BF,
   * the encodings of U+FFFE and U+FFFF, which the decoder rejects.
   *
   * Returns 1 if so, 0 otherwise.  Bytes are read as unsigned char so the
   * comparisons also work where plain char is signed.
   */
  static inline int utf8seq_is_illegal(char *s, int n)
  {
      const unsigned char *b = (const unsigned char *)s;

      if (n != 3)
          return 0;
      return b[0] == 0xEF && b[1] == 0xBF &&
             b[2] >= 0xBE && b[2] <= 0xBF;
  }
  /*
   * Decode one UTF-8 sequence.
   *
   * c    receives the decoded code point (written only on success)
   * in   start of the sequence; at least one byte must be readable
   * inb  number of readable bytes at `in`
   *
   * Returns the number of bytes consumed (1-4) on success, -1 when the
   * sequence is invalid, or -2 when the lead byte is valid but the sequence
   * runs past the end of the available input.
   */
  static inline int utf8dec_wchar(wchar_t *c, unsigned char *in, size_t inb)
  {
      wchar_t acc;
      int i, n;

      /* trivial char */
      if (*in <= 0x7F) {
          *c = *in;
          return 1;
      }

      /* sequence length from the lead byte */
      if ((*in & 0xE0) == 0xC0)
          n = 2;
      else if ((*in & 0xF0) == 0xE0)
          n = 3;
      else if ((*in & 0xF8) == 0xF0)
          n = 4;
      else
          /*
           * A stray continuation byte, a 5/6-byte lead or 0xFE/0xFF can never
           * start a valid sequence.  Reject it here: more input will not make
           * it valid, so it must not be reported as truncated.
           */
          return -1;

      /* starved? */
      if ((size_t)n > inb)
          return -2;

      /* reject sequences that are well-formed but not allowed */
      if (utf8seq_is_overlong((char *)in, n) ||
          utf8seq_is_surrogate((char *)in, n) ||
          utf8seq_is_illegal((char *)in, n))
          return -1;

      /* payload bits of the lead byte, then 6 bits per continuation byte */
      acc = in[0] & (0x7F >> n);
      for (i = 1; i < n; i++) {
          if ((in[i] & 0xC0) != 0x80)
              return -1;   /* not a continuation byte */
          acc = (acc << 6) | (in[i] & 0x3F);
      }

      *c = acc;
      return n;
  }
  /*
   * Map a code point to the ISO-8859-15 byte of one of the eight characters
   * handled here (Euro sign, S/s/Z/z caron, OE/oe, Y diaeresis).
   *
   * Returns that byte, or '?' for every other code point.
   */
  static inline char latin9_translit(wchar_t c)
  {
      static const struct {
          wchar_t       ucs;
          unsigned char byte;
      } tab[] = {
          { 0x20AC, 0xA4 },   /* Euro */
          { 0x0160, 0xA6 },   /* S caron */
          { 0x0161, 0xA8 },   /* s caron */
          { 0x017D, 0xB4 },   /* Z caron */
          { 0x017E, 0xB8 },   /* z caron */
          { 0x0152, 0xBC },   /* OE */
          { 0x0153, 0xBD },   /* oe */
          { 0x0178, 0xBE },   /* Y diaeresis */
      };
      size_t i;

      for (i = 0; i < sizeof tab / sizeof tab[0]; i++) {
          if (tab[i].ucs == c)
              return tab[i].byte;
      }
      return '?';
  }
  227. size_t iconv(iconv_t cd, char **in, size_t *inb, char **out, size_t *outb)
  228. {
  229. size_t x=0;
  230. unsigned char to = (cd>>1)&127;
  231. unsigned char from = 255;
  232. const unsigned char *map = 0;
  233. char tmp[MB_LEN_MAX];
  234. wchar_t c, d;
  235. size_t k, l;
  236. int err;
  237. if (!in || !*in || !*inb) return 0;
  238. if (cd & 1)
  239. map = charmaps[cd>>8].map;
  240. else
  241. from = cd>>8;
  242. for (; *inb; *in+=l, *inb-=l) {
  243. c = *(unsigned char *)*in;
  244. l = 1;
  245. if (from >= UTF_8 && c < 0x80) goto charok;
  246. switch (from) {
  247. case WCHAR_T:
  248. l = sizeof(wchar_t);
  249. if (*inb < l) goto starved;
  250. c = *(wchar_t *)*in;
  251. break;
  252. case UTF_8:
  253. l = utf8dec_wchar(&c, *in, *inb);
  254. if (!l) l++;
  255. else if (l == (size_t)-1) goto ilseq;
  256. else if (l == (size_t)-2) goto starved;
  257. break;
  258. case US_ASCII:
  259. goto ilseq;
  260. case LATIN_9:
  261. if ((unsigned)c - 0xa4 <= 0xbe - 0xa4) {
  262. static const unsigned char map[] = {
  263. 0, 0x60, 0, 0x61, 0, 0, 0, 0, 0, 0, 0,
  264. 0, 0, 0, 0, 0x7d, 0, 0, 0, 0x7e, 0, 0, 0,
  265. 0x52, 0x53, 0x78
  266. };
  267. if (c == 0xa4) c = 0x20ac;
  268. else if (map[c-0xa5]) c = 0x100 | map[c-0xa5];
  269. }
  270. case LATIN_1:
  271. goto charok;
  272. case TIS_620:
  273. if (c >= 0xa1) c += 0x0e01-0xa1;
  274. goto charok;
  275. case JIS_0201:
  276. if (c >= 0xa1) {
  277. if (c <= 0xdf) c += 0xff61-0xa1;
  278. else goto ilseq;
  279. }
  280. goto charok;
  281. case UTF_16BE:
  282. case UTF_16LE:
  283. l = 2;
  284. if (*inb < 2) goto starved;
  285. c = get_16(*in, from);
  286. if ((unsigned)(c-0xdc00) < 0x400) goto ilseq;
  287. if ((unsigned)(c-0xd800) < 0x400) {
  288. l = 4;
  289. if (*inb < 4) goto starved;
  290. d = get_16(*in + 2, from);
  291. if ((unsigned)(c-0xdc00) >= 0x400) goto ilseq;
  292. c = ((c-0xd800)<<10) | (d-0xdc00);
  293. }
  294. break;
  295. case UTF_32BE:
  296. case UTF_32LE:
  297. l = 4;
  298. if (*inb < 4) goto starved;
  299. // FIXME
  300. // c = get_32(*in, from);
  301. break;
  302. default:
  303. /* only support ascii supersets */
  304. if (c < 0x80) break;
  305. switch (map[0]) {
  306. case UCS2_8BIT:
  307. c -= 0x80;
  308. break;
  309. case EUC_JP:
  310. if ((unsigned)c - 0xa1 >= 94) goto ilseq;
  311. if ((unsigned)in[0][1] - 0xa1 >= 94) goto ilseq;
  312. c = (c-0xa1)*94 + (in[0][1]-0xa1);
  313. l = 2;
  314. break;
  315. case SHIFT_JIS:
  316. if ((unsigned)c - 0xa1 <= 0xdf-0xa1) {
  317. c += 0xff61-0xa1;
  318. goto charok;
  319. }
  320. // FIXME...
  321. l = 2;
  322. break;
  323. default:
  324. goto badf;
  325. }
  326. c = get_16(map + 4 + 2*c, 0);
  327. if (c == 0xffff) goto ilseq;
  328. goto charok;
  329. }
  330. if ((unsigned)c - 0xd800 < 0x800 || (unsigned)c >= 0x110000)
  331. goto ilseq;
  332. charok:
  333. switch (to) {
  334. case WCHAR_T:
  335. if (*outb < sizeof(wchar_t)) goto toobig;
  336. *(wchar_t *)*out = c;
  337. *out += sizeof(wchar_t);
  338. *outb -= sizeof(wchar_t);
  339. break;
  340. case UTF_8:
  341. if (*outb < 4) {
  342. k = utf8enc_wchar(tmp, c);
  343. if (*outb < k) goto toobig;
  344. memcpy(*out, tmp, k);
  345. } else k = utf8enc_wchar(*out, c);
  346. *out += k;
  347. *outb -= k;
  348. break;
  349. case US_ASCII:
  350. if (c > 0x7f) c = 0xfffd;
  351. /* fall thru and count replacement in latin1 case */
  352. case LATIN_9:
  353. if (c >= 0x100 && c != 0xfffd)
  354. c = latin9_translit(c);
  355. /* fall through */
  356. case LATIN_1:
  357. if (!*outb) goto toobig;
  358. if (c < 0x100) **out = c;
  359. else x++, **out = '*'; //FIXME: translit?
  360. ++*out;
  361. --*outb;
  362. break;
  363. case UTF_16BE:
  364. case UTF_16LE:
  365. if (c < 0x10000) {
  366. if (*outb < 2) goto toobig;
  367. put_16(*out, c, to);
  368. *out += 2;
  369. *outb -= 2;
  370. break;
  371. }
  372. if (*outb < 4) goto toobig;
  373. put_16(*out, (c>>10)|0xd800, to);
  374. put_16(*out + 2, (c&0x3ff)|0xdc00, to);
  375. *out += 4;
  376. *outb -= 4;
  377. break;
  378. default:
  379. goto badf;
  380. }
  381. }
  382. return x;
  383. ilseq:
  384. err = EILSEQ;
  385. x = -1;
  386. goto end;
  387. badf:
  388. err = EBADF;
  389. x = -1;
  390. goto end;
  391. toobig:
  392. err = E2BIG;
  393. x = -1;
  394. goto end;
  395. starved:
  396. err = EINVAL;
  397. end:
  398. errno = err;
  399. return x;
  400. }