/* gen_wctype.c */
  1. /*
  2. * Copyright (C) 2000-2006 Erik Andersen <andersen@uclibc.org>
  3. *
  4. * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball.
  5. */
  6. #define _GNU_SOURCE
  7. #include <stdio.h>
  8. #include <stdlib.h>
  9. #include <string.h>
  10. #include <locale.h>
  11. #include <wctype.h>
  12. #include <limits.h>
  13. #include <stdint.h>
  14. #include <wchar.h>
  15. #include <ctype.h>
  16. #include "include/bits/uClibc_charclass.h"
  17. /* 0x9 : space blank */
  18. /* 0xa : space */
  19. /* 0xb : space */
  20. /* 0xc : space */
  21. /* 0xd : space */
  22. /* 0x20 : space blank */
  23. /* 0x1680 : space blank */
  24. /* 0x2000 : space blank */
  25. /* 0x2001 : space blank */
  26. /* 0x2002 : space blank */
  27. /* 0x2003 : space blank */
  28. /* 0x2004 : space blank */
  29. /* 0x2005 : space blank */
  30. /* 0x2006 : space blank */
  31. /* 0x2008 : space blank */
  32. /* 0x2009 : space blank */
  33. /* 0x200a : space blank */
  34. /* 0x200b : space blank */
  35. /* 0x2028 : space */
  36. /* 0x2029 : space */
  37. /* 0x3000 : space blank */
  38. /* typecount[ 0] = 88670 C_alpha_nonupper_nonlower */
  39. /* typecount[ 1] = 742 C_alpha_lower */
  40. /* typecount[ 2] = 4 C_alpha_upper_lower */
  41. /* typecount[ 3] = 731 C_alpha_upper */
  42. /* typecount[ 4] = 10 C_digit */
  43. /* typecount[ 5] = 10270 C_punct */
  44. /* typecount[ 6] = 0 C_graph */
  45. /* typecount[ 7] = 0 C_print_space_nonblank */
  46. /* typecount[ 8] = 14 C_print_space_blank */
  47. /* typecount[ 9] = 0 C_space_nonblank_noncntrl */
  48. /* typecount[10] = 0 C_space_blank_noncntrl */
  49. /* typecount[11] = 6 C_cntrl_space_nonblank */
  50. /* typecount[12] = 1 C_cntrl_space_blank */
  51. /* typecount[13] = 60 C_cntrl_nonspace */
  52. /* typecount[14] = 96100 C_unclassified */
  53. /* typecount[15] = 0 empty_slot */
  /*
   * RANGE is the highest wide-character value covered by the generated tables.
   * As written the first branch is taken (0x2ffff).  Changing "#if 1" to
   * "#if 0" falls through the disabled "#elif 0" to the final branch and
   * restricts the tables to 16-bit wchars (0xffff); enabling the "#elif"
   * instead selects 0x1ffff.
   */
  #if 1
  #define RANGE 0x2ffffUL
  #elif 0
  #define RANGE 0x1ffffUL
  #else
  #define RANGE 0xffffUL /* Restrict for 16-bit wchar_t... */
  #endif
  /*
   * Macros that test for the various (w)ctype classes when passed a class
   * designator D (one of the __CTYPE_* constants, which are not defined in
   * this file) and the character C itself.
   *
   * Every test is an equality or an unsigned range check on D, so the macros
   * depend on the numeric ordering of the __CTYPE_* constants.  The cast to
   * unsigned makes values below the start of a range wrap to a huge number and
   * fail the "<=" comparison.  D and C are fully parenthesized so that
   * expression arguments (e.g. "a | b" or "x ? y : z") are evaluated as a unit.
   */
  #define mywalnum(D,C) ((unsigned)((D) - 1) <= (__CTYPE_digit - 1))
  #define mywalpha(D,C) ((unsigned)((D) - 1) <= (__CTYPE_alpha_upper - 1))
  #define mywblank(D,C) ((unsigned)((D) - __CTYPE_print_space_nonblank) <= 5 && ((D) & 1))
  #define mywcntrl(D,C) ((unsigned)((D) - __CTYPE_cntrl_space_nonblank) <= 2)
  #define mywdigit(D,C) ((D) == __CTYPE_digit)
  #define mywgraph(D,C) ((unsigned)((D) - 1) <= (__CTYPE_graph - 1))
  #define mywlower(D,C) ((unsigned)((D) - __CTYPE_alpha_lower) <= 1)
  #define mywprint(D,C) ((unsigned)((D) - 1) <= (__CTYPE_print_space_blank - 1))
  #define mywpunct(D,C) ((D) == __CTYPE_punct)
  #define mywspace(D,C) ((unsigned)((D) - __CTYPE_print_space_nonblank) <= 5)
  #define mywupper(D,C) ((unsigned)((D) - __CTYPE_alpha_upper_lower) <= 1)
  /* isxdigit cannot be derived from the designator alone, so the character
   * value is tested directly.  That's ok as isxdigit() (and isdigit() too)
   * are locale-invariant. */
  #define mywxdigit(D,C) (mywdigit(D,C) || (unsigned)(((C) | 0x20) - 'a') <= 5)
  /* One distinct (tolower, toupper) offset pair.  Each field is the signed
   * difference between the mapped character and the character itself. */
  typedef struct {
      short l; /* towlower(c) - c */
      short u; /* towupper(c) - c */
  } uldiff_entry;
  /*
   * Description of one compressed three-level lookup table as produced by
   * newopt().  The three arrays live back to back in a single buffer that
   * starts at ii: ti points just past ii, and ut just past ti.
   */
  typedef struct {
      uint16_t ii_len;        /* number of bytes in ii */
      uint16_t ti_len;        /* number of bytes in ti */
      uint16_t ut_len;        /* number of bytes in ut */
      unsigned char ii_shift; /* log2 of the block size used to index ti */
      unsigned char ti_shift; /* log2 of the block size used to index ut */
      unsigned char *ii;      /* top-level index (start of the buffer) */
      unsigned char *ti;      /* second-level index blocks */
      unsigned char *ut;      /* unique data blocks */
  } table_data;
  /* Verbosity level; non-zero enables diagnostic output. */
  static unsigned verbose;
  /* printf-style diagnostic to stderr, emitted only when verbose is non-zero.
   * Wrapped in do { } while (0) so it behaves as a single statement and cannot
   * capture an "else" that follows the call site. */
  #define verbose_msg(...) do { if (verbose) fprintf(stderr, __VA_ARGS__); } while (0)
  94. void output_table(FILE *fp, const char *name, table_data *tbl)
  95. {
  96. size_t i;
  97. fprintf(fp, "#define __LOCALE_DATA_WC%s_II_LEN %7u\n", name, tbl->ii_len);
  98. fprintf(fp, "#define __LOCALE_DATA_WC%s_TI_LEN %7u\n", name, tbl->ti_len);
  99. fprintf(fp, "#define __LOCALE_DATA_WC%s_UT_LEN %7u\n", name, tbl->ut_len);
  100. fprintf(fp, "#define __LOCALE_DATA_WC%s_II_SHIFT %7u\n", name, tbl->ii_shift);
  101. fprintf(fp, "#define __LOCALE_DATA_WC%s_TI_SHIFT %7u\n", name, tbl->ti_shift);
  102. fprintf(fp, "\n#ifdef WANT_WC%s_data\n", name);
  103. i = tbl->ii_len + tbl->ti_len + tbl->ut_len;
  104. fprintf(fp, "\nstatic const unsigned char __LOCALE_DATA_WC%s_data[%zu] = {", name, i);
  105. for (i = 0; i < tbl->ii_len; i++) {
  106. if (i % 12 == 0) {
  107. fprintf(fp, "\n");
  108. }
  109. fprintf(fp, " %#04x,", tbl->ii[i]);
  110. }
  111. for (i = 0; i < tbl->ti_len; i++) {
  112. if (i % 12 == 0) {
  113. fprintf(fp, "\n");
  114. }
  115. fprintf(fp, " %#04x,", tbl->ti[i]);
  116. }
  117. for (i = 0; i < tbl->ut_len; i++) {
  118. if (i % 12 == 0) {
  119. fprintf(fp, "\n");
  120. }
  121. fprintf(fp, " %#04x,", tbl->ut[i]);
  122. }
  123. fprintf(fp, "\n};\n\n");
  124. fprintf(fp, "#endif /* WANT_WC%s_data */\n\n", name);
  125. }
  126. static void dump_table_data(table_data *tbl)
  127. {
  128. verbose_msg("ii_shift = %d ti_shift = %d\n"
  129. "ii_len = %d ti_len = %d ut_len = %d\n"
  130. "total = %d\n",
  131. tbl->ii_shift, tbl->ti_shift,
  132. tbl->ii_len, tbl->ti_len, tbl->ut_len,
  133. (int) tbl->ii_len + (int) tbl->ti_len + (int) tbl->ut_len);
  134. }
  /* For sorting the blocks of unsigned chars: nu_val is the block length in
   * bytes and must be set before nu_memcmp() is handed to qsort(). */
  static size_t nu_val;

  /* qsort() comparator.  a and b point at array elements of type
   * "unsigned char *"; the blocks those elements point to are compared
   * bytewise over nu_val bytes.  Returns the memcmp() result. */
  int nu_memcmp(const void *a, const void *b)
  {
      unsigned char *const *lhs = a;
      unsigned char *const *rhs = b;

      return memcmp(*lhs, *rhs, nu_val);
  }
  141. static size_t newopt(unsigned char *ut, size_t usize, int shift, table_data *tbl);
  /* Maximum number of distinct upper/lower offset pairs.  Restricted to the
   * minimal unsigned char max so that a pair index fits in one table byte. */
  #define MAXTO 255
  143. int main(int argc, char **argv)
  144. {
  145. long int u, l, tt;
  146. size_t smallest, t;
  147. unsigned int c;
  148. unsigned int d;
  149. int i, n;
  150. int ul_count = 0;
  151. uldiff_entry uldiff[MAXTO];
  152. table_data cttable;
  153. table_data ultable;
  154. #if 0
  155. table_data combtable;
  156. table_data widthtable;
  157. long int last_comb = 0;
  158. #endif
  159. unsigned char wct[(RANGE/2)+1]; /* wctype table (nibble per wchar) */
  160. unsigned char ult[RANGE+1]; /* upper/lower table */
  161. unsigned char combt[(RANGE/4)+1]; /* combining */
  162. unsigned char widtht[(RANGE/4)+1]; /* width */
  163. wctrans_t totitle;
  164. wctype_t is_comb, is_comb3;
  165. long int typecount[16];
  166. const char *typename[16];
  167. static const char empty_slot[] = "empty_slot";
  168. int built = 0;
  169. #define INIT_TYPENAME(X) typename[__CTYPE_##X] = "C_" #X
  170. for (i = 0; i < 16; i++) {
  171. typename[i] = empty_slot;
  172. }
  173. INIT_TYPENAME(unclassified);
  174. INIT_TYPENAME(alpha_nonupper_nonlower);
  175. INIT_TYPENAME(alpha_lower);
  176. INIT_TYPENAME(alpha_upper_lower);
  177. INIT_TYPENAME(alpha_upper);
  178. INIT_TYPENAME(digit);
  179. INIT_TYPENAME(punct);
  180. INIT_TYPENAME(graph);
  181. INIT_TYPENAME(print_space_nonblank);
  182. INIT_TYPENAME(print_space_blank);
  183. INIT_TYPENAME(space_nonblank_noncntrl);
  184. INIT_TYPENAME(space_blank_noncntrl);
  185. INIT_TYPENAME(cntrl_space_nonblank);
  186. INIT_TYPENAME(cntrl_space_blank);
  187. INIT_TYPENAME(cntrl_nonspace);
  188. memset(&cttable, 0, sizeof(table_data));
  189. memset(&ultable, 0, sizeof(table_data));
  190. #if 0
  191. memset(combtable, 0, sizeof(table_data));
  192. memset(widthtable, 0, sizeof(table_data));
  193. #endif
  194. setvbuf(stdout, NULL, _IONBF, 0);
  195. while (--argc) {
  196. ++argv;
  197. if (!strcmp(*argv, "-v")) {
  198. ++verbose;
  199. continue;
  200. }
  201. if (!setlocale(LC_CTYPE, *argv)) {
  202. verbose_msg("setlocale(LC_CTYPE,%s) failed! Skipping this locale...\n", *argv);
  203. continue;
  204. }
  205. if (!(totitle = wctrans("totitle"))) {
  206. verbose_msg("no totitle transformation.\n");
  207. }
  208. if (!(is_comb = wctype("combining"))) {
  209. verbose_msg("no combining wctype.\n");
  210. }
  211. if (!(is_comb3 = wctype("combining_level3"))) {
  212. verbose_msg("no combining_level3 wctype.\n");
  213. }
  214. if (!built) {
  215. built = 1;
  216. ul_count = 1;
  217. uldiff[0].u = uldiff[0].l = 0;
  218. memset(wct, 0, sizeof(wct));
  219. memset(combt, 0, sizeof(combt));
  220. memset(widtht, 0, sizeof(widtht));
  221. for (i = 0; i < 16; i++) {
  222. typecount[i] = 0;
  223. }
  224. for (c = 0; c <= RANGE; c++) {
  225. if (iswdigit(c)) {
  226. d = __CTYPE_digit;
  227. } else if (iswalpha(c)) {
  228. d = __CTYPE_alpha_nonupper_nonlower;
  229. if (iswlower(c)) {
  230. d = __CTYPE_alpha_lower;
  231. if (iswupper(c)) {
  232. d = __CTYPE_alpha_upper_lower;
  233. }
  234. } else if (iswupper(c)) {
  235. d = __CTYPE_alpha_upper;
  236. }
  237. } else if (iswpunct(c)) {
  238. d = __CTYPE_punct;
  239. } else if (iswgraph(c)) {
  240. d = __CTYPE_graph;
  241. } else if (iswprint(c)) {
  242. d = __CTYPE_print_space_nonblank;
  243. if (iswblank(c)) {
  244. d = __CTYPE_print_space_blank;
  245. }
  246. } else if (iswspace(c) && !iswcntrl(c)) {
  247. d = __CTYPE_space_nonblank_noncntrl;
  248. if (iswblank(c)) {
  249. d = __CTYPE_space_blank_noncntrl;
  250. }
  251. } else if (iswcntrl(c)) {
  252. d = __CTYPE_cntrl_nonspace;
  253. if (iswspace(c)) {
  254. d = __CTYPE_cntrl_space_nonblank;
  255. if (iswblank(c)) {
  256. d = __CTYPE_cntrl_space_blank;
  257. }
  258. }
  259. } else {
  260. d = __CTYPE_unclassified;
  261. }
  262. ++typecount[d];
  263. #if 0
  264. if (iswspace(c)) {
  265. if (iswblank(c)) {
  266. verbose_msg("%#8x : space blank\n", c);
  267. } else {
  268. verbose_msg("%#8x : space\n", c);
  269. }
  270. }
  271. #endif
  272. #if 0
  273. if (c < 256) {
  274. unsigned int glibc;
  275. glibc = 0;
  276. if (isalnum(c)) ++glibc; glibc <<= 1;
  277. if (isalpha(c)) ++glibc; glibc <<= 1;
  278. if (isblank(c)) ++glibc; glibc <<= 1;
  279. if (iscntrl(c)) ++glibc; glibc <<= 1;
  280. if (isdigit(c)) ++glibc; glibc <<= 1;
  281. if (isgraph(c)) ++glibc; glibc <<= 1;
  282. if (islower(c)) ++glibc; glibc <<= 1;
  283. if (isprint(c)) ++glibc; glibc <<= 1;
  284. if (ispunct(c)) ++glibc; glibc <<= 1;
  285. if (isspace(c)) ++glibc; glibc <<= 1;
  286. if (isupper(c)) ++glibc; glibc <<= 1;
  287. if (isxdigit(c)) ++glibc;
  288. verbose_msg("%#8x : ctype %#4x\n", c, glibc);
  289. }
  290. #endif
  291. #if 1
  292. /* Paranoid checking... */
  293. {
  294. unsigned int glibc;
  295. unsigned int mine;
  296. glibc = 0;
  297. if (iswalnum(c)) ++glibc; glibc <<= 1;
  298. if (iswalpha(c)) ++glibc; glibc <<= 1;
  299. if (iswblank(c)) ++glibc; glibc <<= 1;
  300. if (iswcntrl(c)) ++glibc; glibc <<= 1;
  301. if (iswdigit(c)) ++glibc; glibc <<= 1;
  302. if (iswgraph(c)) ++glibc; glibc <<= 1;
  303. if (iswlower(c)) ++glibc; glibc <<= 1;
  304. if (iswprint(c)) ++glibc; glibc <<= 1;
  305. if (iswpunct(c)) ++glibc; glibc <<= 1;
  306. if (iswspace(c)) ++glibc; glibc <<= 1;
  307. if (iswupper(c)) ++glibc; glibc <<= 1;
  308. if (iswxdigit(c)) ++glibc;
  309. mine = 0;
  310. if (mywalnum(d,c)) ++mine; mine <<= 1;
  311. if (mywalpha(d,c)) ++mine; mine <<= 1;
  312. if (mywblank(d,c)) ++mine; mine <<= 1;
  313. if (mywcntrl(d,c)) ++mine; mine <<= 1;
  314. if (mywdigit(d,c)) ++mine; mine <<= 1;
  315. if (mywgraph(d,c)) ++mine; mine <<= 1;
  316. if (mywlower(d,c)) ++mine; mine <<= 1;
  317. if (mywprint(d,c)) ++mine; mine <<= 1;
  318. if (mywpunct(d,c)) ++mine; mine <<= 1;
  319. if (mywspace(d,c)) ++mine; mine <<= 1;
  320. if (mywupper(d,c)) ++mine; mine <<= 1;
  321. if (mywxdigit(d,c)) ++mine;
  322. if (glibc != mine) {
  323. verbose_msg("%#8x : glibc %#4x != %#4x mine %u\n", c, glibc, mine, d);
  324. return EXIT_FAILURE;
  325. }
  326. #if 0
  327. if (iswctype(c,is_comb) || iswctype(c,is_comb3)) {
  328. /* if (!iswpunct(c)) { */
  329. verbose_msg("%#8x : %d %d %#4x\n",
  330. c, iswctype(c,is_comb),iswctype(c,is_comb3), glibc);
  331. /* } */
  332. }
  333. #endif
  334. #if 0
  335. if (iswctype(c,is_comb) || iswctype(c,is_comb3)) {
  336. if (!last_comb) {
  337. verbose_msg("%#8x - ", c);
  338. last_comb = c;
  339. } else if (last_comb + 1 < c) {
  340. verbose_msg("%#8x\n%#8x - ", last_comb, c);
  341. last_comb = c;
  342. } else {
  343. last_comb = c;
  344. }
  345. }
  346. #endif
  347. }
  348. #endif
  349. combt[c/4] |= ((((!!iswctype(c,is_comb)) << 1) | !!iswctype(c,is_comb3))
  350. << ((c & 3) << 1));
  351. /* comb3t[c/8] |= ((!!iswctype(c,is_comb3)) << (c & 7)); */
  352. /* widtht[c/4] |= (wcwidth(c) << ((c & 3) << 1)); */
  353. if (c & 1) { /* Use the high nibble for odd numbered wchars. */
  354. d <<= 4;
  355. }
  356. wct[c/2] |= d;
  357. l = (long)(int) towlower(c) - c;
  358. u = (long)(int) towupper(c) - c;
  359. ult[c] = 0;
  360. if (l || u) {
  361. if ((l != (short)l) || (u != (short)u)) {
  362. verbose_msg("range assumption error! %x %ld %ld\n", c, l, u);
  363. return EXIT_FAILURE;
  364. }
  365. for (i = 0; i < ul_count; i++) {
  366. if ((l == uldiff[i].l) && (u == uldiff[i].u)) {
  367. goto found;
  368. }
  369. }
  370. uldiff[ul_count].l = l;
  371. uldiff[ul_count].u = u;
  372. ++ul_count;
  373. if (ul_count > MAXTO) {
  374. verbose_msg("too many touppers/tolowers!\n");
  375. return EXIT_FAILURE;
  376. }
  377. found:
  378. ult[c] = i;
  379. }
  380. }
  381. for (i = 0; i < 16; i++) {
  382. verbose_msg("typecount[%2d] = %8ld %s\n", i, typecount[i], typename[i]);
  383. }
  384. verbose_msg("optimizing is* table..\n");
  385. n = -1;
  386. smallest = SIZE_MAX;
  387. cttable.ii = NULL;
  388. for (i = 0; i < 14; i++) {
  389. t = newopt(wct, (RANGE/2)+1, i, &cttable);
  390. if (smallest >= t) {
  391. n = i;
  392. smallest = t;
  393. /* } else { */
  394. /* break; */
  395. }
  396. }
  397. verbose_msg("smallest = %zu\n", smallest);
  398. if (!(cttable.ii = malloc(smallest))) {
  399. verbose_msg("couldn't allocate space!\n");
  400. return EXIT_FAILURE;
  401. }
  402. smallest = SIZE_MAX;
  403. newopt(wct, (RANGE/2)+1, n, &cttable);
  404. ++cttable.ti_shift; /* correct for nibble mode */
  405. verbose_msg("optimizing u/l-to table..\n");
  406. smallest = SIZE_MAX;
  407. ultable.ii = NULL;
  408. for (i = 0; i < 14; i++) {
  409. t = newopt(ult, RANGE+1, i, &ultable);
  410. if (smallest >= t) {
  411. n = i;
  412. smallest = t;
  413. /* } else { */
  414. /* break; */
  415. }
  416. }
  417. verbose_msg("%lu (smallest) + %lu (u/l diffs) = %lu\n",
  418. (unsigned long) smallest,
  419. (unsigned long) (4 * ul_count),
  420. (unsigned long) (smallest + 4 * ul_count)
  421. );
  422. verbose_msg("smallest = %zu\n", smallest);
  423. if (!(ultable.ii = malloc(smallest))) {
  424. verbose_msg("couldn't allocate space!\n");
  425. return EXIT_FAILURE;
  426. }
  427. smallest = SIZE_MAX;
  428. newopt(ult, RANGE+1, n, &ultable);
  429. #if 0
  430. verbose_msg("optimizing comb table..\n");
  431. smallest = SIZE_MAX;
  432. combtable.ii = NULL;
  433. for (i = 0; i < 14; i++) {
  434. t = newopt(combt, sizeof(combt), i, &combtable);
  435. if (smallest >= t) {
  436. n = i;
  437. smallest = t;
  438. /* } else { */
  439. /* break; */
  440. }
  441. }
  442. verbose_msg("smallest = %zu\n", smallest);
  443. if (!(combtable.ii = malloc(smallest))) {
  444. verbose_msg("couldn't allocate space!\n");
  445. return EXIT_FAILURE;
  446. }
  447. smallest = SIZE_MAX;
  448. newopt(combt, sizeof(combt), n, &combtable);
  449. combtable.ti_shift += 4; /* correct for 4 entries per */
  450. #endif
  451. #if 0
  452. verbose_msg("optimizing width table..\n");
  453. smallest = SIZE_MAX;
  454. widthtable.ii = NULL;
  455. for (i = 0; i < 14; i++) {
  456. t = newopt(widtht, sizeof(widtht), i, &widthtable);
  457. if (smallest >= t) {
  458. n = i;
  459. smallest = t;
  460. /* } else { */
  461. /* break; */
  462. }
  463. }
  464. verbose_msg("smallest = %zu\n", smallest);
  465. if (!(widthtable.ii = malloc(smallest))) {
  466. verbose_msg("couldn't allocate space!\n");
  467. return EXIT_FAILURE;
  468. }
  469. smallest = SIZE_MAX;
  470. newopt(widtht, sizeof(widtht), n, &widthtable);
  471. widthtable.ti_shift += 4; /* correct for 4 entries per */
  472. #endif
  473. #if 0
  474. verbose_msg("optimizing comb3 table..\n");
  475. smallest = SIZE_MAX;
  476. comb3table.ii = NULL;
  477. for (i = 0; i < 14; i++) {
  478. t = newopt(comb3t, sizeof(comb3t), i, &comb3table);
  479. if (smallest >= t) {
  480. n = i;
  481. smallest = t;
  482. /* } else { */
  483. /* break; */
  484. }
  485. }
  486. verbose_msg("smallest = %zu\n", smallest);
  487. if (!(comb3table.ii = malloc(smallest))) {
  488. verbose_msg("couldn't allocate space!\n");
  489. return EXIT_FAILURE;
  490. }
  491. smallest = SIZE_MAX;
  492. newopt(comb3t, sizeof(comb3t), n, &comb3table);
  493. comb3table.ti_shift += 8; /* correct for 4 entries per */
  494. #endif
  495. dump_table_data(&cttable);
  496. dump_table_data(&ultable);
  497. #if 0
  498. dump_table_data(&combtable);
  499. #endif
  500. }
  501. verbose_msg("verifying for %s...\n", *argv);
  502. #if RANGE == 0xffffU
  503. for (c = 0; c <= 0xffffUL; c++)
  504. #else
  505. for (c = 0; c <= 0x10ffffUL; c++)
  506. #endif
  507. {
  508. unsigned int glibc;
  509. unsigned int mine;
  510. unsigned int upper, lower;
  511. #if 0
  512. #if RANGE < 0x10000UL
  513. if (c == 0x10000UL) {
  514. c = 0x30000UL; /* skip 1st and 2nd sup planes */
  515. }
  516. #elif RANGE < 0x20000UL
  517. if (c == 0x20000UL) {
  518. c = 0x30000UL; /* skip 2nd sup planes */
  519. }
  520. #endif
  521. #endif
  522. glibc = 0;
  523. if (iswalnum(c)) ++glibc; glibc <<= 1;
  524. if (iswalpha(c)) ++glibc; glibc <<= 1;
  525. if (iswblank(c)) ++glibc; glibc <<= 1;
  526. if (iswcntrl(c)) ++glibc; glibc <<= 1;
  527. if (iswdigit(c)) ++glibc; glibc <<= 1;
  528. if (iswgraph(c)) ++glibc; glibc <<= 1;
  529. if (iswlower(c)) ++glibc; glibc <<= 1;
  530. if (iswprint(c)) ++glibc; glibc <<= 1;
  531. if (iswpunct(c)) ++glibc; glibc <<= 1;
  532. if (iswspace(c)) ++glibc; glibc <<= 1;
  533. if (iswupper(c)) ++glibc; glibc <<= 1;
  534. if (iswxdigit(c)) ++glibc;
  535. {
  536. unsigned int u;
  537. int n = 0, sc = 0; /* = 0 for verbose_msg only */
  538. int i0 = 0, i1 = 0;
  539. u = c;
  540. if (u <= RANGE) {
  541. sc = u & ((1 << cttable.ti_shift) - 1);
  542. u >>= cttable.ti_shift;
  543. n = u & ((1 << cttable.ii_shift) - 1);
  544. u >>= cttable.ii_shift;
  545. i0 = cttable.ii[u];
  546. i0 <<= cttable.ii_shift;
  547. i1 = cttable.ti[i0 + n];
  548. i1 <<= (cttable.ti_shift - 1);
  549. d = cttable.ut[i1 + (sc >> 1)];
  550. if (sc & 1) {
  551. d >>= 4;
  552. }
  553. d &= 0x0f;
  554. } else if (((unsigned)(c - 0xe0020UL) <= 0x5f) || (c == 0xe0001UL)) {
  555. d = __CTYPE_punct;
  556. } else if ((unsigned)(c - 0xf0000UL) < 0x20000UL) {
  557. if ((c & 0xffffU) <= 0xfffdU) {
  558. d = __CTYPE_punct;
  559. } else {
  560. d = __CTYPE_unclassified;
  561. }
  562. } else {
  563. d = __CTYPE_unclassified;
  564. }
  565. mine = 0;
  566. if (mywalnum(d,c)) ++mine; mine <<= 1;
  567. if (mywalpha(d,c)) ++mine; mine <<= 1;
  568. if (mywblank(d,c)) ++mine; mine <<= 1;
  569. if (mywcntrl(d,c)) ++mine; mine <<= 1;
  570. if (mywdigit(d,c)) ++mine; mine <<= 1;
  571. if (mywgraph(d,c)) ++mine; mine <<= 1;
  572. if (mywlower(d,c)) ++mine; mine <<= 1;
  573. if (mywprint(d,c)) ++mine; mine <<= 1;
  574. if (mywpunct(d,c)) ++mine; mine <<= 1;
  575. if (mywspace(d,c)) ++mine; mine <<= 1;
  576. if (mywupper(d,c)) ++mine; mine <<= 1;
  577. if (mywxdigit(d,c)) ++mine;
  578. if (glibc != mine) {
  579. verbose_msg("%#8x : glibc %#4x != %#4x mine %d\n", c, glibc, mine, d);
  580. if (c < 0x30000UL) {
  581. verbose_msg("sc=%#x u=%#x n=%#x i0=%#x i1=%#x\n", sc, u, n, i0, i1);
  582. }
  583. }
  584. upper = lower = u = c;
  585. if (u <= RANGE) {
  586. sc = u & ((1 << ultable.ti_shift) - 1);
  587. u >>= ultable.ti_shift;
  588. n = u & ((1 << ultable.ii_shift) - 1);
  589. u >>= ultable.ii_shift;
  590. i0 = ultable.ii[u];
  591. i0 <<= ultable.ii_shift;
  592. i1 = ultable.ti[i0 + n];
  593. i1 <<= (ultable.ti_shift);
  594. i1 += sc;
  595. i0 = ultable.ut[i1];
  596. upper = c + uldiff[i0].u;
  597. lower = c + uldiff[i0].l;
  598. }
  599. if (towupper(c) != upper) {
  600. verbose_msg("%#8x : towupper glibc %#4x != %#4x mine\n",
  601. c, towupper(c), upper);
  602. }
  603. if (towlower(c) != lower) {
  604. verbose_msg("%#8x : towlower glibc %#4x != %#4x mine i0 = %d\n",
  605. c, towlower(c), lower, i0);
  606. }
  607. if (totitle && ((tt = towctrans(c, totitle)) != upper)) {
  608. verbose_msg("%#8x : totitle glibc %#4lx != %#4x mine i0 = %d\n",
  609. c, tt, upper, i0);
  610. }
  611. }
  612. if ((c & 0xfff) == 0xfff) verbose_msg(".");
  613. }
  614. verbose_msg("done\n");
  615. }
  616. if (built) {
  617. FILE *fp;
  618. if (!(fp = fopen("wctables.h", "w"))) {
  619. verbose_msg("cannot open output file 'wctables.h'!\n");
  620. return EXIT_FAILURE;
  621. }
  622. fprintf(fp, "#define __LOCALE_DATA_WC_TABLE_DOMAIN_MAX %#8lx\n\n",
  623. (unsigned long) RANGE);
  624. output_table(fp, "ctype", &cttable);
  625. output_table(fp, "uplow", &ultable);
  626. #warning fix the upper bound on the upper/lower tables... save 200 bytes or so
  627. fprintf(fp, "#define __LOCALE_DATA_WCuplow_diffs %7u\n", ul_count);
  628. fprintf(fp, "\n#ifdef WANT_WCuplow_diff_data\n\n");
  629. fprintf(fp, "\nstatic const short __LOCALE_DATA_WCuplow_diff_data[%zu] = {",
  630. 2 * (size_t) ul_count);
  631. for (i = 0; i < ul_count; i++) {
  632. if (i % 4 == 0) {
  633. fprintf(fp, "\n");
  634. }
  635. fprintf(fp, " %6d, %6d,", uldiff[i].u, uldiff[i].l);
  636. }
  637. fprintf(fp, "\n};\n\n");
  638. fprintf(fp, "#endif /* WANT_WCuplow_diff_data */\n\n");
  639. /* output_table(fp, "comb", &combtable); */
  640. /* output_table(fp, "width", &widthtable); */
  641. fclose(fp);
  642. }
  643. return !built;
  644. }
  645. size_t newopt(unsigned char *ut, size_t usize, int shift, table_data *tbl)
  646. {
  647. static int recurse;
  648. unsigned char *ti[RANGE+1]; /* table index */
  649. size_t numblocks;
  650. size_t blocksize;
  651. size_t uniq;
  652. size_t i, j;
  653. size_t smallest, t;
  654. unsigned char *ii_save;
  655. int uniqblock[256];
  656. unsigned char uit[RANGE+1];
  657. int shift2;
  658. memset(uniqblock, 0x00, sizeof(uniqblock));
  659. ii_save = NULL;
  660. blocksize = 1 << shift;
  661. numblocks = usize >> shift;
  662. /* init table index */
  663. for (i=j = 0; i < numblocks; i++) {
  664. ti[i] = ut + j;
  665. j += blocksize;
  666. }
  667. /* sort */
  668. nu_val = blocksize;
  669. qsort(ti, numblocks, sizeof(unsigned char *), nu_memcmp);
  670. uniq = 1;
  671. uit[(ti[0]-ut)/blocksize] = 0;
  672. for (i=1; i < numblocks; i++) {
  673. if (memcmp(ti[i-1], ti[i], blocksize) < 0) {
  674. if (++uniq > 255) {
  675. break;
  676. }
  677. uniqblock[uniq - 1] = i;
  678. }
  679. #if 1
  680. else if (memcmp(ti[i-1], ti[i], blocksize) > 0) {
  681. verbose_msg("bad sort %li!\n", (long) i);
  682. abort();
  683. }
  684. #endif
  685. uit[(ti[i]-ut)/blocksize] = uniq - 1;
  686. }
  687. smallest = SIZE_MAX;
  688. shift2 = -1;
  689. if (uniq > 255)
  690. return SIZE_MAX;
  691. smallest = numblocks + uniq * blocksize;
  692. if (!recurse) {
  693. ++recurse;
  694. for (j=1; j < 14; j++) {
  695. if ((numblocks >> j) < 2) break;
  696. if (tbl) {
  697. ii_save = tbl->ii;
  698. tbl->ii = NULL;
  699. }
  700. if ((t = newopt(uit, numblocks, j, tbl)) < SIZE_MAX) {
  701. t += uniq * blocksize;
  702. }
  703. if (tbl) {
  704. tbl->ii = ii_save;
  705. }
  706. if (smallest >= t) {
  707. shift2 = j;
  708. smallest = t;
  709. if (!tbl->ii) {
  710. verbose_msg("ishift %u tshift %u size %lu\n",
  711. shift2, shift, (unsigned long) t);
  712. }
  713. /* } else { */
  714. /* break; */
  715. }
  716. }
  717. --recurse;
  718. }
  719. if (tbl->ii) {
  720. if (recurse) {
  721. tbl->ii_shift = shift;
  722. tbl->ii_len = numblocks;
  723. memcpy(tbl->ii, uit, numblocks);
  724. tbl->ti = tbl->ii + tbl->ii_len;
  725. tbl->ti_len = uniq * blocksize;
  726. for (i = 0; i < uniq; i++) {
  727. memcpy(tbl->ti + i * blocksize, ti[uniqblock[i]], blocksize);
  728. }
  729. } else {
  730. ++recurse;
  731. verbose_msg("setting ishift %u tshift %u\n",
  732. shift2, shift);
  733. newopt(uit, numblocks, shift2, tbl);
  734. --recurse;
  735. tbl->ti_shift = shift;
  736. tbl->ut_len = uniq * blocksize;
  737. tbl->ut = tbl->ti + tbl->ti_len;
  738. for (i = 0; i < uniq; i++) {
  739. memcpy(tbl->ut + i * blocksize, ti[uniqblock[i]], blocksize);
  740. }
  741. }
  742. }
  743. return smallest;
  744. }
  745. /* vi: set sw=4 ts=4: */