/* gen_wctype.c */
  1. /*
  2. * Copyright (C) 2000-2006 Erik Andersen <andersen@uclibc.org>
  3. *
  4. * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball.
  5. */
  6. #define _GNU_SOURCE
  7. #include <stdio.h>
  8. #include <stdlib.h>
  9. #include <string.h>
  10. #include <locale.h>
  11. #include <wctype.h>
  12. #include <limits.h>
  13. #include <stdint.h>
  14. #include <wchar.h>
  15. #include <ctype.h>
  16. #ifndef _CTYPE_H
  17. #define _CTYPE_H
  18. #endif
  19. #ifndef _WCTYPE_H
  20. #define _WCTYPE_H
  21. #endif
  22. #include UCLIBC_CTYPE_HEADER
  23. /* 0x9 : space blank */
  24. /* 0xa : space */
  25. /* 0xb : space */
  26. /* 0xc : space */
  27. /* 0xd : space */
  28. /* 0x20 : space blank */
  29. /* 0x1680 : space blank */
  30. /* 0x2000 : space blank */
  31. /* 0x2001 : space blank */
  32. /* 0x2002 : space blank */
  33. /* 0x2003 : space blank */
  34. /* 0x2004 : space blank */
  35. /* 0x2005 : space blank */
  36. /* 0x2006 : space blank */
  37. /* 0x2008 : space blank */
  38. /* 0x2009 : space blank */
  39. /* 0x200a : space blank */
  40. /* 0x200b : space blank */
  41. /* 0x2028 : space */
  42. /* 0x2029 : space */
  43. /* 0x3000 : space blank */
  44. /* typecount[ 0] = 88670 C_alpha_nonupper_nonlower */
  45. /* typecount[ 1] = 742 C_alpha_lower */
  46. /* typecount[ 2] = 4 C_alpha_upper_lower */
  47. /* typecount[ 3] = 731 C_alpha_upper */
  48. /* typecount[ 4] = 10 C_digit */
  49. /* typecount[ 5] = 10270 C_punct */
  50. /* typecount[ 6] = 0 C_graph */
  51. /* typecount[ 7] = 0 C_print_space_nonblank */
  52. /* typecount[ 8] = 14 C_print_space_blank */
  53. /* typecount[ 9] = 0 C_space_nonblank_noncntrl */
  54. /* typecount[10] = 0 C_space_blank_noncntrl */
  55. /* typecount[11] = 6 C_cntrl_space_nonblank */
  56. /* typecount[12] = 1 C_cntrl_space_blank */
  57. /* typecount[13] = 60 C_cntrl_nonspace */
  58. /* typecount[14] = 96100 C_unclassified */
  59. /* typecount[15] = 0 empty_slot */
  /*
   * RANGE is the highest wide-character value covered by the generated
   * tables.  Exactly one branch below is active; change "#if 1" to
   * "#if 0" to fall through to the 16-bit limit.
   */
  #if 1
  # define RANGE 0x2ffffUL
  #elif 0
  # define RANGE 0x1ffffUL
  #else
  # define RANGE 0xffffUL /* Restrict for 16-bit wchar_t... */
  #endif
  68. #if 0
  69. /* Classification codes. */
  70. static const char *typename[] = {
  71. "C_unclassified",
  72. "C_alpha_nonupper_nonlower",
  73. "C_alpha_lower",
  74. "C_alpha_upper_lower",
  75. "C_alpha_upper",
  76. "C_digit",
  77. "C_punct",
  78. "C_graph",
  79. "C_print_space_nonblank",
  80. "C_print_space_blank",
  81. "C_space_nonblank_noncntrl",
  82. "C_space_blank_noncntrl",
  83. "C_cntrl_space_nonblank",
  84. "C_cntrl_space_blank",
  85. "C_cntrl_nonspace",
  86. "empty_slot"
  87. };
  88. #endif
  89. #if 0
  /* Taking advantage of the C99 mutual-exclusion guarantees for the various
   * (w)ctype classes, including the descriptions of printing and control
   * (w)chars, we can place each in one of the following mutually-exclusive
   * subsets. Since there are fewer than 16, we can store the data for
   * each (w)char in a nibble. In contrast, glibc uses an unsigned int
   * per (w)char, with one bit flag for each is* type. While this allows
   * a simple '&' operation to determine the type vs. a range test and a
   * little special handling for the "blank" and "xdigit" types in my
   * approach, it also uses 8 times the space for the tables on the typical
   * 32-bit archs we support. */
  100. enum {
  101. __CTYPE_unclassified = 0,
  102. __CTYPE_alpha_nonupper_nonlower,
  103. __CTYPE_alpha_lower,
  104. __CTYPE_alpha_upper_lower,
  105. __CTYPE_alpha_upper,
  106. __CTYPE_digit,
  107. __CTYPE_punct,
  108. __CTYPE_graph,
  109. __CTYPE_print_space_nonblank,
  110. __CTYPE_print_space_blank,
  111. __CTYPE_space_nonblank_noncntrl,
  112. __CTYPE_space_blank_noncntrl,
  113. __CTYPE_cntrl_space_nonblank,
  114. __CTYPE_cntrl_space_blank,
  115. __CTYPE_cntrl_nonspace,
  116. };
  117. #endif
  /*
   * Hex-digit test: true when class code D is a digit, or when X folded
   * with 0x20 lies within six positions starting at 'a'.  The unsigned
   * cast makes values below 'a' wrap to a large number and fail the test.
   */
  #define __CTYPE_isxdigit(D,X) \
      (__CTYPE_isdigit(D) || (((unsigned int)(((X) | 0x20) - 'a')) < 6))

  /*
   * "mine" counterparts of the isw*() predicates.  Apart from mywxdigit,
   * each one ignores its argument and classifies the variable `d` that
   * must be in scope at the point of use.
   */
  #define mywalnum(wc)  __CTYPE_isalnum(d)
  #define mywalpha(wc)  __CTYPE_isalpha(d)
  #define mywblank(wc)  __CTYPE_isblank(d)
  #define mywcntrl(wc)  __CTYPE_iscntrl(d)
  #define mywdigit(wc)  __CTYPE_isdigit(d)
  #define mywgraph(wc)  __CTYPE_isgraph(d)
  #define mywlower(wc)  __CTYPE_islower(d)
  #define mywprint(wc)  __CTYPE_isprint(d)
  #define mywpunct(wc)  __CTYPE_ispunct(d)
  #define mywspace(wc)  __CTYPE_isspace(d)
  #define mywupper(wc)  __CTYPE_isupper(d)
  #define mywxdigit(wc) __CTYPE_isxdigit(d,wc)
  /*
   * One distinct (tolower, toupper) offset pair.  main() stores
   * towlower(c) - c in `l` and towupper(c) - c in `u`.
   */
  typedef struct {
      short l, u;
  } uldiff_entry;
  /*
   * Description of one compressed lookup table.  newopt() fills it in:
   * `ii`, `ti` and `ut` are three consecutive regions of a single buffer
   * that starts at `ii`.
   */
  typedef struct {
      uint16_t ii_len, ti_len, ut_len;    /* byte length of each region */
      unsigned char ii_shift, ti_shift;   /* bit widths used to split an index */
      unsigned char *ii, *ti, *ut;        /* start of each region */
  } table_data;
  /* Diagnostic level; main() increments it once for every "-v" argument. */
  static unsigned verbose;

  /*
   * printf-style diagnostic written to stderr only when `verbose` is
   * non-zero.  The do/while(0) wrapper makes the expansion a single
   * statement, so an `else` following `if (x) verbose_msg(...);` binds to
   * the caller's `if` rather than to the hidden `if (verbose)`.
   */
  #define verbose_msg(...) \
      do { if (verbose) fprintf(stderr, __VA_ARGS__); } while (0)
  148. void output_table(FILE *fp, const char *name, table_data *tbl)
  149. {
  150. size_t i;
  151. fprintf(fp, "#define __LOCALE_DATA_WC%s_II_LEN %7u\n", name, tbl->ii_len);
  152. fprintf(fp, "#define __LOCALE_DATA_WC%s_TI_LEN %7u\n", name, tbl->ti_len);
  153. fprintf(fp, "#define __LOCALE_DATA_WC%s_UT_LEN %7u\n", name, tbl->ut_len);
  154. fprintf(fp, "#define __LOCALE_DATA_WC%s_II_SHIFT %7u\n", name, tbl->ii_shift);
  155. fprintf(fp, "#define __LOCALE_DATA_WC%s_TI_SHIFT %7u\n", name, tbl->ti_shift);
  156. fprintf(fp, "\n#ifdef WANT_WC%s_data\n", name);
  157. i = tbl->ii_len + tbl->ti_len + tbl->ut_len;
  158. fprintf(fp, "\nstatic const unsigned char __LOCALE_DATA_WC%s_data[%zu] = {", name, i);
  159. for (i=0 ; i < tbl->ii_len ; i++) {
  160. if (i % 12 == 0) {
  161. fprintf(fp, "\n");
  162. }
  163. fprintf(fp, " %#04x,", tbl->ii[i]);
  164. }
  165. for (i=0 ; i < tbl->ti_len ; i++) {
  166. if (i % 12 == 0) {
  167. fprintf(fp, "\n");
  168. }
  169. fprintf(fp, " %#04x,", tbl->ti[i]);
  170. }
  171. for (i=0 ; i < tbl->ut_len ; i++) {
  172. if (i % 12 == 0) {
  173. fprintf(fp, "\n");
  174. }
  175. fprintf(fp, " %#04x,", tbl->ut[i]);
  176. }
  177. fprintf(fp, "\n};\n\n");
  178. fprintf(fp, "#endif /* WANT_WC%s_data */\n\n", name);
  179. }
  180. static void dump_table_data(table_data *tbl)
  181. {
  182. verbose_msg("ii_shift = %d ti_shift = %d\n"
  183. "ii_len = %d ti_len = %d ut_len = %d\n"
  184. "total = %d\n",
  185. tbl->ii_shift, tbl->ti_shift,
  186. tbl->ii_len, tbl->ti_len, tbl->ut_len,
  187. (int) tbl->ii_len + (int) tbl->ti_len + (int) tbl->ut_len);
  188. }
  /* Number of bytes nu_memcmp() compares; set by the caller before qsort(). */
  static size_t nu_val;

  /*
   * qsort() comparator over an array of `unsigned char *`: orders the
   * elements by the first nu_val bytes each one points to.
   */
  int nu_memcmp(const void *a, const void *b)
  {
      const unsigned char *lhs = *(unsigned char **)a;
      const unsigned char *rhs = *(unsigned char **)b;

      return memcmp(lhs, rhs, nu_val);
  }
  195. static size_t newopt(unsigned char *ut, size_t usize, int shift, table_data *tbl);
  196. #define MAXTO 255 /* Restrict to minimal unsigned char max. */
  197. int main(int argc, char **argv)
  198. {
  199. long int u, l, tt;
  200. size_t smallest, t;
  201. unsigned int c;
  202. unsigned int d;
  203. int i, n;
  204. int ul_count = 0;
  205. uldiff_entry uldiff[MAXTO];
  206. table_data cttable;
  207. table_data ultable;
  208. #if 0
  209. table_data combtable;
  210. table_data widthtable;
  211. long int last_comb = 0;
  212. #endif
  213. unsigned char wct[(RANGE/2)+1]; /* wctype table (nibble per wchar) */
  214. unsigned char ult[RANGE+1]; /* upper/lower table */
  215. unsigned char combt[(RANGE/4)+1]; /* combining */
  216. unsigned char widtht[(RANGE/4)+1]; /* width */
  217. wctrans_t totitle;
  218. wctype_t is_comb, is_comb3;
  219. long int typecount[16];
  220. const char *typename[16];
  221. static const char empty_slot[] = "empty_slot";
  222. int built = 0;
  223. #define INIT_TYPENAME(X) typename[__CTYPE_##X] = "C_" #X
  224. for (i=0 ; i < 16 ; i++) {
  225. typename[i] = empty_slot;
  226. }
  227. INIT_TYPENAME(unclassified);
  228. INIT_TYPENAME(alpha_nonupper_nonlower);
  229. INIT_TYPENAME(alpha_lower);
  230. INIT_TYPENAME(alpha_upper_lower);
  231. INIT_TYPENAME(alpha_upper);
  232. INIT_TYPENAME(digit);
  233. INIT_TYPENAME(punct);
  234. INIT_TYPENAME(graph);
  235. INIT_TYPENAME(print_space_nonblank);
  236. INIT_TYPENAME(print_space_blank);
  237. INIT_TYPENAME(space_nonblank_noncntrl);
  238. INIT_TYPENAME(space_blank_noncntrl);
  239. INIT_TYPENAME(cntrl_space_nonblank);
  240. INIT_TYPENAME(cntrl_space_blank);
  241. INIT_TYPENAME(cntrl_nonspace);
  242. memset(&cttable, 0, sizeof(table_data));
  243. memset(&ultable, 0, sizeof(table_data));
  244. #if 0
  245. memset(combtable, 0, sizeof table_data);
  246. memset(widthtable, 0, sizeof table_data);
  247. #endif
  248. setvbuf(stdout, NULL, _IONBF, 0);
  249. while (--argc) {
  250. ++argv;
  251. if (!strcmp(*argv, "-v")) {
  252. ++verbose;
  253. continue;
  254. } else if (!setlocale(LC_CTYPE, *argv)) {
  255. verbose_msg("setlocale(LC_CTYPE,%s) failed! Skipping this locale...\n", *argv);
  256. continue;
  257. }
  258. if (!(totitle = wctrans("totitle"))) {
  259. verbose_msg("no totitle transformation.\n");
  260. }
  261. if (!(is_comb = wctype("combining"))) {
  262. verbose_msg("no combining wctype.\n");
  263. }
  264. if (!(is_comb3 = wctype("combining_level3"))) {
  265. verbose_msg("no combining_level3 wctype.\n");
  266. }
  267. if (!built) {
  268. built = 1;
  269. ul_count = 1;
  270. uldiff[0].u = uldiff[0].l = 0;
  271. memset(wct, 0, sizeof(wct));
  272. memset(combt, 0, sizeof(combt));
  273. memset(widtht, 0, sizeof(widtht));
  274. for (i = 0 ; i < 16 ; i++) {
  275. typecount[i] = 0;
  276. }
  277. for (c=0 ; c <= RANGE ; c++) {
  278. if (iswdigit(c)) {
  279. d = __CTYPE_digit;
  280. } else if (iswalpha(c)) {
  281. d = __CTYPE_alpha_nonupper_nonlower;
  282. if (iswlower(c)) {
  283. d = __CTYPE_alpha_lower;
  284. if (iswupper(c)) {
  285. d = __CTYPE_alpha_upper_lower;
  286. }
  287. } else if (iswupper(c)) {
  288. d = __CTYPE_alpha_upper;
  289. }
  290. } else if (iswpunct(c)) {
  291. d = __CTYPE_punct;
  292. } else if (iswgraph(c)) {
  293. d = __CTYPE_graph;
  294. } else if (iswprint(c)) {
  295. d = __CTYPE_print_space_nonblank;
  296. if (iswblank(c)) {
  297. d = __CTYPE_print_space_blank;
  298. }
  299. } else if (iswspace(c) && !iswcntrl(c)) {
  300. d = __CTYPE_space_nonblank_noncntrl;
  301. if (iswblank(c)) {
  302. d = __CTYPE_space_blank_noncntrl;
  303. }
  304. } else if (iswcntrl(c)) {
  305. d = __CTYPE_cntrl_nonspace;
  306. if (iswspace(c)) {
  307. d = __CTYPE_cntrl_space_nonblank;
  308. if (iswblank(c)) {
  309. d = __CTYPE_cntrl_space_blank;
  310. }
  311. }
  312. } else {
  313. d = __CTYPE_unclassified;
  314. }
  315. ++typecount[d];
  316. #if 0
  317. if (iswspace(c)) {
  318. if (iswblank(c)) {
  319. verbose_msg("%#8x : space blank\n", c);
  320. } else {
  321. verbose_msg("%#8x : space\n", c);
  322. }
  323. }
  324. #endif
  325. #if 0
  326. if (c < 256) {
  327. unsigned int glibc;
  328. glibc = 0;
  329. if (isalnum(c)) ++glibc; glibc <<= 1;
  330. if (isalpha(c)) ++glibc; glibc <<= 1;
  331. if (isblank(c)) ++glibc; glibc <<= 1;
  332. if (iscntrl(c)) ++glibc; glibc <<= 1;
  333. if (isdigit(c)) ++glibc; glibc <<= 1;
  334. if (isgraph(c)) ++glibc; glibc <<= 1;
  335. if (islower(c)) ++glibc; glibc <<= 1;
  336. if (isprint(c)) ++glibc; glibc <<= 1;
  337. if (ispunct(c)) ++glibc; glibc <<= 1;
  338. if (isspace(c)) ++glibc; glibc <<= 1;
  339. if (isupper(c)) ++glibc; glibc <<= 1;
  340. if (isxdigit(c)) ++glibc;
  341. verbose_msg("%#8x : ctype %#4x\n", c, glibc);
  342. }
  343. #endif
  344. #if 1
  345. /* Paranoid checking... */
  346. {
  347. unsigned int glibc;
  348. unsigned int mine;
  349. glibc = 0;
  350. if (iswalnum(c)) ++glibc; glibc <<= 1;
  351. if (iswalpha(c)) ++glibc; glibc <<= 1;
  352. if (iswblank(c)) ++glibc; glibc <<= 1;
  353. if (iswcntrl(c)) ++glibc; glibc <<= 1;
  354. if (iswdigit(c)) ++glibc; glibc <<= 1;
  355. if (iswgraph(c)) ++glibc; glibc <<= 1;
  356. if (iswlower(c)) ++glibc; glibc <<= 1;
  357. if (iswprint(c)) ++glibc; glibc <<= 1;
  358. if (iswpunct(c)) ++glibc; glibc <<= 1;
  359. if (iswspace(c)) ++glibc; glibc <<= 1;
  360. if (iswupper(c)) ++glibc; glibc <<= 1;
  361. if (iswxdigit(c)) ++glibc;
  362. mine = 0;
  363. if (mywalnum(c)) ++mine; mine <<= 1;
  364. if (mywalpha(c)) ++mine; mine <<= 1;
  365. if (mywblank(c)) ++mine; mine <<= 1;
  366. if (mywcntrl(c)) ++mine; mine <<= 1;
  367. if (mywdigit(c)) ++mine; mine <<= 1;
  368. if (mywgraph(c)) ++mine; mine <<= 1;
  369. if (mywlower(c)) ++mine; mine <<= 1;
  370. if (mywprint(c)) ++mine; mine <<= 1;
  371. if (mywpunct(c)) ++mine; mine <<= 1;
  372. if (mywspace(c)) ++mine; mine <<= 1;
  373. if (mywupper(c)) ++mine; mine <<= 1;
  374. if (mywxdigit(c)) ++mine;
  375. if (glibc != mine) {
  376. verbose_msg("%#8x : glibc %#4x != %#4x mine %u\n", c, glibc, mine, d);
  377. return EXIT_FAILURE;
  378. }
  379. #if 0
  380. if (iswctype(c,is_comb) || iswctype(c,is_comb3)) {
  381. /* if (!iswpunct(c)) { */
  382. verbose_msg("%#8x : %d %d %#4x\n",
  383. c, iswctype(c,is_comb),iswctype(c,is_comb3), glibc);
  384. /* } */
  385. }
  386. #endif
  387. #if 0
  388. if (iswctype(c,is_comb) || iswctype(c,is_comb3)) {
  389. if (!last_comb) {
  390. verbose_msg("%#8x - ", c);
  391. last_comb = c;
  392. } else if (last_comb + 1 < c) {
  393. verbose_msg("%#8x\n%#8x - ", last_comb, c);
  394. last_comb = c;
  395. } else {
  396. last_comb = c;
  397. }
  398. }
  399. #endif
  400. }
  401. #endif
  402. combt[c/4] |= ((((!!iswctype(c,is_comb)) << 1) | !!iswctype(c,is_comb3))
  403. << ((c & 3) << 1));
  404. /* comb3t[c/8] |= ((!!iswctype(c,is_comb3)) << (c & 7)); */
  405. /* widtht[c/4] |= (wcwidth(c) << ((c & 3) << 1)); */
  406. if (c & 1) { /* Use the high nibble for odd numbered wchars. */
  407. d <<= 4;
  408. }
  409. wct[c/2] |= d;
  410. l = (long)(int) towlower(c) - c;
  411. u = (long)(int) towupper(c) - c;
  412. ult[c] = 0;
  413. if (l || u) {
  414. if ((l != (short)l) || (u != (short)u)) {
  415. verbose_msg("range assumption error! %x %ld %ld\n", c, l, u);
  416. return EXIT_FAILURE;
  417. }
  418. for (i=0 ; i < ul_count ; i++) {
  419. if ((l == uldiff[i].l) && (u == uldiff[i].u)) {
  420. goto found;
  421. }
  422. }
  423. uldiff[ul_count].l = l;
  424. uldiff[ul_count].u = u;
  425. ++ul_count;
  426. if (ul_count > MAXTO) {
  427. verbose_msg("too many touppers/tolowers!\n");
  428. return EXIT_FAILURE;
  429. }
  430. found:
  431. ult[c] = i;
  432. }
  433. }
  434. for (i = 0 ; i < 16 ; i++) {
  435. verbose_msg("typecount[%2d] = %8ld %s\n", i, typecount[i], typename[i]);
  436. }
  437. verbose_msg("optimizing is* table..\n");
  438. n = -1;
  439. smallest = SIZE_MAX;
  440. cttable.ii = NULL;
  441. for (i=0 ; i < 14 ; i++) {
  442. t = newopt(wct, (RANGE/2)+1, i, &cttable);
  443. if (smallest >= t) {
  444. n = i;
  445. smallest = t;
  446. /* } else { */
  447. /* break; */
  448. }
  449. }
  450. verbose_msg("smallest = %zu\n", smallest);
  451. if (!(cttable.ii = malloc(smallest))) {
  452. verbose_msg("couldn't allocate space!\n");
  453. return EXIT_FAILURE;
  454. }
  455. smallest = SIZE_MAX;
  456. newopt(wct, (RANGE/2)+1, n, &cttable);
  457. ++cttable.ti_shift; /* correct for nibble mode */
  458. verbose_msg("optimizing u/l-to table..\n");
  459. smallest = SIZE_MAX;
  460. ultable.ii = NULL;
  461. for (i=0 ; i < 14 ; i++) {
  462. t = newopt(ult, RANGE+1, i, &ultable);
  463. if (smallest >= t) {
  464. n = i;
  465. smallest = t;
  466. /* } else { */
  467. /* break; */
  468. }
  469. }
  470. verbose_msg("%zu (smallest) + %zu (u/l diffs) = %zu\n",
  471. smallest, 4 * ul_count, smallest + 4 * ul_count);
  472. verbose_msg("smallest = %zu\n", smallest);
  473. if (!(ultable.ii = malloc(smallest))) {
  474. verbose_msg("couldn't allocate space!\n");
  475. return EXIT_FAILURE;
  476. }
  477. smallest = SIZE_MAX;
  478. newopt(ult, RANGE+1, n, &ultable);
  479. #if 0
  480. verbose_msg("optimizing comb table..\n");
  481. smallest = SIZE_MAX;
  482. combtable.ii = NULL;
  483. for (i=0 ; i < 14 ; i++) {
  484. t = newopt(combt, sizeof(combt), i, &combtable);
  485. if (smallest >= t) {
  486. n = i;
  487. smallest = t;
  488. /* } else { */
  489. /* break; */
  490. }
  491. }
  492. verbose_msg("smallest = %zu\n", smallest);
  493. if (!(combtable.ii = malloc(smallest))) {
  494. verbose_msg("couldn't allocate space!\n");
  495. return EXIT_FAILURE;
  496. }
  497. smallest = SIZE_MAX;
  498. newopt(combt, sizeof(combt), n, &combtable);
  499. combtable.ti_shift += 4; /* correct for 4 entries per */
  500. #endif
  501. #if 0
  502. verbose_msg("optimizing width table..\n");
  503. smallest = SIZE_MAX;
  504. widthtable.ii = NULL;
  505. for (i=0 ; i < 14 ; i++) {
  506. t = newopt(widtht, sizeof(widtht), i, &widthtable);
  507. if (smallest >= t) {
  508. n = i;
  509. smallest = t;
  510. /* } else { */
  511. /* break; */
  512. }
  513. }
  514. verbose_msg("smallest = %zu\n", smallest);
  515. if (!(widthtable.ii = malloc(smallest))) {
  516. verbose_msg("couldn't allocate space!\n");
  517. return EXIT_FAILURE;
  518. }
  519. smallest = SIZE_MAX;
  520. newopt(widtht, sizeof(widtht), n, &widthtable);
  521. widthtable.ti_shift += 4; /* correct for 4 entries per */
  522. #endif
  523. #if 0
  524. verbose_msg("optimizing comb3 table..\n");
  525. smallest = SIZE_MAX;
  526. comb3table.ii = NULL;
  527. for (i=0 ; i < 14 ; i++) {
  528. t = newopt(comb3t, sizeof(comb3t), i, &comb3table);
  529. if (smallest >= t) {
  530. n = i;
  531. smallest = t;
  532. /* } else { */
  533. /* break; */
  534. }
  535. }
  536. verbose_msg("smallest = %zu\n", smallest);
  537. if (!(comb3table.ii = malloc(smallest))) {
  538. verbose_msg("couldn't allocate space!\n");
  539. return EXIT_FAILURE;
  540. }
  541. smallest = SIZE_MAX;
  542. newopt(comb3t, sizeof(comb3t), n, &comb3table);
  543. comb3table.ti_shift += 8; /* correct for 4 entries per */
  544. #endif
  545. dump_table_data(&cttable);
  546. dump_table_data(&ultable);
  547. #if 0
  548. dump_table_data(&combtable);
  549. #endif
  550. }
  551. verbose_msg("verifying for %s...\n", *argv);
  552. #if RANGE == 0xffffU
  553. for (c=0 ; c <= 0xffffUL ; c++)
  554. #else
  555. for (c=0 ; c <= 0x10ffffUL ; c++)
  556. #endif
  557. {
  558. unsigned int glibc;
  559. unsigned int mine;
  560. unsigned int upper, lower;
  561. #if 0
  562. #if RANGE < 0x10000UL
  563. if (c == 0x10000UL) {
  564. c = 0x30000UL; /* skip 1st and 2nd sup planes */
  565. }
  566. #elif RANGE < 0x20000UL
  567. if (c == 0x20000UL) {
  568. c = 0x30000UL; /* skip 2nd sup planes */
  569. }
  570. #endif
  571. #endif
  572. glibc = 0;
  573. if (iswalnum(c)) ++glibc; glibc <<= 1;
  574. if (iswalpha(c)) ++glibc; glibc <<= 1;
  575. if (iswblank(c)) ++glibc; glibc <<= 1;
  576. if (iswcntrl(c)) ++glibc; glibc <<= 1;
  577. if (iswdigit(c)) ++glibc; glibc <<= 1;
  578. if (iswgraph(c)) ++glibc; glibc <<= 1;
  579. if (iswlower(c)) ++glibc; glibc <<= 1;
  580. if (iswprint(c)) ++glibc; glibc <<= 1;
  581. if (iswpunct(c)) ++glibc; glibc <<= 1;
  582. if (iswspace(c)) ++glibc; glibc <<= 1;
  583. if (iswupper(c)) ++glibc; glibc <<= 1;
  584. if (iswxdigit(c)) ++glibc;
  585. {
  586. unsigned int u;
  587. int n, sc;
  588. int i0, i1;
  589. u = c;
  590. if (u <= RANGE) {
  591. sc = u & ((1 << cttable.ti_shift) - 1);
  592. u >>= cttable.ti_shift;
  593. n = u & ((1 << cttable.ii_shift) - 1);
  594. u >>= cttable.ii_shift;
  595. i0 = cttable.ii[u];
  596. i0 <<= cttable.ii_shift;
  597. i1 = cttable.ti[i0 + n];
  598. i1 <<= (cttable.ti_shift-1);
  599. d = cttable.ut[i1 + (sc >> 1)];
  600. if (sc & 1) {
  601. d >>= 4;
  602. }
  603. d &= 0x0f;
  604. } else if ((((unsigned int)(c - 0xe0020UL)) <= 0x5f) || (c == 0xe0001UL)){
  605. d = __CTYPE_punct;
  606. } else if (((unsigned int)(c - 0xf0000UL)) < 0x20000UL) {
  607. if ((c & 0xffffU) <= 0xfffdU) {
  608. d = __CTYPE_punct;
  609. } else {
  610. d = __CTYPE_unclassified;
  611. }
  612. } else {
  613. d = __CTYPE_unclassified;
  614. }
  615. mine = 0;
  616. if (mywalnum(c)) ++mine; mine <<= 1;
  617. if (mywalpha(c)) ++mine; mine <<= 1;
  618. if (mywblank(c)) ++mine; mine <<= 1;
  619. if (mywcntrl(c)) ++mine; mine <<= 1;
  620. if (mywdigit(c)) ++mine; mine <<= 1;
  621. if (mywgraph(c)) ++mine; mine <<= 1;
  622. if (mywlower(c)) ++mine; mine <<= 1;
  623. if (mywprint(c)) ++mine; mine <<= 1;
  624. if (mywpunct(c)) ++mine; mine <<= 1;
  625. if (mywspace(c)) ++mine; mine <<= 1;
  626. if (mywupper(c)) ++mine; mine <<= 1;
  627. if (mywxdigit(c)) ++mine;
  628. if (glibc != mine) {
  629. verbose_msg("%#8x : glibc %#4x != %#4x mine %d\n", c, glibc, mine, d);
  630. if (c < 0x30000UL) {
  631. verbose_msg("sc=%#x u=%#x n=%#x i0=%#x i1=%#x\n", sc, u, n, i0, i1);
  632. }
  633. }
  634. upper = lower = u = c;
  635. if (u <= RANGE) {
  636. sc = u & ((1 << ultable.ti_shift) - 1);
  637. u >>= ultable.ti_shift;
  638. n = u & ((1 << ultable.ii_shift) - 1);
  639. u >>= ultable.ii_shift;
  640. i0 = ultable.ii[u];
  641. i0 <<= ultable.ii_shift;
  642. i1 = ultable.ti[i0 + n];
  643. i1 <<= (ultable.ti_shift);
  644. i1 += sc;
  645. i0 = ultable.ut[i1];
  646. upper = c + uldiff[i0].u;
  647. lower = c + uldiff[i0].l;
  648. }
  649. if (towupper(c) != upper) {
  650. verbose_msg("%#8x : towupper glibc %#4x != %#4x mine\n",
  651. c, towupper(c), upper);
  652. }
  653. if (towlower(c) != lower) {
  654. verbose_msg("%#8x : towlower glibc %#4x != %#4x mine i0 = %d\n",
  655. c, towlower(c), lower, i0);
  656. }
  657. if (totitle && ((tt = towctrans(c, totitle)) != upper)) {
  658. verbose_msg("%#8x : totitle glibc %#4lx != %#4x mine i0 = %d\n",
  659. c, tt, upper, i0);
  660. }
  661. }
  662. if ((c & 0xfff) == 0xfff) verbose_msg(".");
  663. }
  664. verbose_msg("done\n");
  665. }
  666. if (built) {
  667. FILE *fp;
  668. if (!(fp = fopen("wctables.h", "w"))) {
  669. verbose_msg("cannot open output file 'wctables.h'!\n");
  670. return EXIT_FAILURE;
  671. }
  672. fprintf(fp, "#define __LOCALE_DATA_WC_TABLE_DOMAIN_MAX %#8lx\n\n",
  673. (unsigned long) RANGE);
  674. output_table(fp, "ctype", &cttable);
  675. output_table(fp, "uplow", &ultable);
  676. #warning fix the upper bound on the upper/lower tables... save 200 bytes or so
  677. fprintf(fp, "#define __LOCALE_DATA_WCuplow_diffs %7u\n", ul_count);
  678. fprintf(fp, "\n#ifdef WANT_WCuplow_diff_data\n\n");
  679. fprintf(fp, "\nstatic const short __LOCALE_DATA_WCuplow_diff_data[%zu] = {",
  680. 2 * (size_t) ul_count);
  681. for (i=0 ; i < ul_count ; i++) {
  682. if (i % 4 == 0) {
  683. fprintf(fp, "\n");
  684. }
  685. fprintf(fp, " %6d, %6d,", uldiff[i].u, uldiff[i].l);
  686. }
  687. fprintf(fp, "\n};\n\n");
  688. fprintf(fp, "#endif /* WANT_WCuplow_diff_data */\n\n");
  689. /* output_table(fp, "comb", &combtable); */
  690. /* output_table(fp, "width", &widthtable); */
  691. fclose(fp);
  692. }
  693. return !built;
  694. }
  695. size_t newopt(unsigned char *ut, size_t usize, int shift, table_data *tbl)
  696. {
  697. static int recurse;
  698. unsigned char *ti[RANGE+1]; /* table index */
  699. size_t numblocks;
  700. size_t blocksize;
  701. size_t uniq;
  702. size_t i, j;
  703. size_t smallest, t;
  704. unsigned char *ii_save;
  705. int uniqblock[256];
  706. unsigned char uit[RANGE+1];
  707. int shift2;
  708. memset(uniqblock, 0x00, sizeof(uniqblock));
  709. ii_save = NULL;
  710. blocksize = 1 << shift;
  711. numblocks = usize >> shift;
  712. /* init table index */
  713. for (i=j=0 ; i < numblocks ; i++) {
  714. ti[i] = ut + j;
  715. j += blocksize;
  716. }
  717. /* sort */
  718. nu_val = blocksize;
  719. qsort(ti, numblocks, sizeof(unsigned char *), nu_memcmp);
  720. uniq = 1;
  721. uit[(ti[0]-ut)/blocksize] = 0;
  722. for (i=1 ; i < numblocks ; i++) {
  723. if (memcmp(ti[i-1], ti[i], blocksize) < 0) {
  724. if (++uniq > 255) {
  725. break;
  726. }
  727. uniqblock[uniq - 1] = i;
  728. }
  729. #if 1
  730. else if (memcmp(ti[i-1], ti[i], blocksize) > 0) {
  731. verbose_msg("bad sort %i!\n", i);
  732. abort();
  733. }
  734. #endif
  735. uit[(ti[i]-ut)/blocksize] = uniq - 1;
  736. }
  737. smallest = SIZE_MAX;
  738. shift2 = -1;
  739. if (uniq <= 255) {
  740. smallest = numblocks + uniq * blocksize;
  741. if (!recurse) {
  742. ++recurse;
  743. for (j=1 ; j < 14 ; j++) {
  744. if ((numblocks >> j) < 2) break;
  745. if (tbl) {
  746. ii_save = tbl->ii;
  747. tbl->ii = NULL;
  748. }
  749. if ((t = newopt(uit, numblocks, j, tbl)) < SIZE_MAX) {
  750. t += uniq * blocksize;
  751. }
  752. if (tbl) {
  753. tbl->ii = ii_save;
  754. }
  755. if (smallest >= t) {
  756. shift2 = j;
  757. smallest = t;
  758. if (!tbl->ii) {
  759. verbose_msg("ishift %zu tshift %zu size %zu\n",
  760. shift2, shift, t);
  761. }
  762. /* } else { */
  763. /* break; */
  764. }
  765. }
  766. --recurse;
  767. }
  768. } else {
  769. return SIZE_MAX;
  770. }
  771. if (tbl->ii) {
  772. if (recurse) {
  773. tbl->ii_shift = shift;
  774. tbl->ii_len = numblocks;
  775. memcpy(tbl->ii, uit, numblocks);
  776. tbl->ti = tbl->ii + tbl->ii_len;
  777. tbl->ti_len = uniq * blocksize;
  778. for (i=0 ; i < uniq ; i++) {
  779. memcpy(tbl->ti + i * blocksize, ti[uniqblock[i]], blocksize);
  780. }
  781. } else {
  782. ++recurse;
  783. verbose_msg("setting ishift %zu tshift %zu\n",
  784. shift2, shift);
  785. newopt(uit, numblocks, shift2, tbl);
  786. --recurse;
  787. tbl->ti_shift = shift;
  788. tbl->ut_len = uniq * blocksize;
  789. tbl->ut = tbl->ti + tbl->ti_len;
  790. for (i=0 ; i < uniq ; i++) {
  791. memcpy(tbl->ut + i * blocksize, ti[uniqblock[i]], blocksize);
  792. }
  793. }
  794. }
  795. return smallest;
  796. }
  797. /* vi: set sw=4 ts=4: */