/* gen_wctype.c */
  1. #define _GNU_SOURCE
  2. #include <stdio.h>
  3. #include <stdlib.h>
  4. #include <string.h>
  5. #include <locale.h>
  6. #include <wctype.h>
  7. #include <limits.h>
  8. #include <stdint.h>
  9. #include <wchar.h>
  10. #include <ctype.h>
  11. /* 0x9 : space blank */
  12. /* 0xa : space */
  13. /* 0xb : space */
  14. /* 0xc : space */
  15. /* 0xd : space */
  16. /* 0x20 : space blank */
  17. /* 0x1680 : space blank */
  18. /* 0x2000 : space blank */
  19. /* 0x2001 : space blank */
  20. /* 0x2002 : space blank */
  21. /* 0x2003 : space blank */
  22. /* 0x2004 : space blank */
  23. /* 0x2005 : space blank */
  24. /* 0x2006 : space blank */
  25. /* 0x2008 : space blank */
  26. /* 0x2009 : space blank */
  27. /* 0x200a : space blank */
  28. /* 0x200b : space blank */
  29. /* 0x2028 : space */
  30. /* 0x2029 : space */
  31. /* 0x3000 : space blank */
  32. /* typecount[ 0] = 88670 C_alpha_nonupper_nonlower */
  33. /* typecount[ 1] = 742 C_alpha_lower */
  34. /* typecount[ 2] = 4 C_alpha_upper_lower */
  35. /* typecount[ 3] = 731 C_alpha_upper */
  36. /* typecount[ 4] = 10 C_digit */
  37. /* typecount[ 5] = 10270 C_punct */
  38. /* typecount[ 6] = 0 C_graph */
  39. /* typecount[ 7] = 0 C_print_space_nonblank */
  40. /* typecount[ 8] = 14 C_print_space_blank */
  41. /* typecount[ 9] = 0 C_space_nonblank_noncntrl */
  42. /* typecount[10] = 0 C_space_blank_noncntrl */
  43. /* typecount[11] = 6 C_cntrl_space_nonblank */
  44. /* typecount[12] = 1 C_cntrl_space_blank */
  45. /* typecount[13] = 60 C_cntrl_nonspace */
  46. /* typecount[14] = 96100 C_unclassified */
  47. /* typecount[15] = 0 empty_slot */
  48. /* Set to #if 0 to restrict wchars to 16 bits. */
  49. #if 1
  50. #define RANGE 0x2ffffUL
  51. #elif 0
  52. #define RANGE 0x1ffffUL
  53. #else
  54. #define RANGE 0xffffUL /* Restrict for 16-bit wchar_t... */
  55. #endif
  56. /* Classification codes. */
  57. static const char *typename[] = {
  58. "C_unclassified",
  59. "C_alpha_nonupper_nonlower",
  60. "C_alpha_lower",
  61. "C_alpha_upper_lower",
  62. "C_alpha_upper",
  63. "C_digit",
  64. "C_punct",
  65. "C_graph",
  66. "C_print_space_nonblank",
  67. "C_print_space_blank",
  68. "C_space_nonblank_noncntrl",
  69. "C_space_blank_noncntrl",
  70. "C_cntrl_space_nonblank",
  71. "C_cntrl_space_blank",
  72. "C_cntrl_nonspace",
  73. "empty_slot"
  74. };
/* Taking advantage of the C99 mutual-exclusion guarantees for the various
 * (w)ctype classes, including the descriptions of printing and control
 * (w)chars, we can place each in one of the following mutually-exclusive
 * subsets.  Since there are fewer than 16, we can store the data for
 * each (w)char in a nibble.  In contrast, glibc uses an unsigned int
 * per (w)char, with one bit flag for each is* type.  While this allows
 * a simple '&' operation to determine the type vs. a range test and a
 * little special handling for the "blank" and "xdigit" types in my
 * approach, it also uses 8 times the space for the tables on the typical
 * 32-bit archs we support. */
  85. enum {
  86. __CTYPE_unclassified = 0,
  87. __CTYPE_alpha_nonupper_nonlower,
  88. __CTYPE_alpha_lower,
  89. __CTYPE_alpha_upper_lower,
  90. __CTYPE_alpha_upper,
  91. __CTYPE_digit,
  92. __CTYPE_punct,
  93. __CTYPE_graph,
  94. __CTYPE_print_space_nonblank,
  95. __CTYPE_print_space_blank,
  96. __CTYPE_space_nonblank_noncntrl,
  97. __CTYPE_space_blank_noncntrl,
  98. __CTYPE_cntrl_space_nonblank,
  99. __CTYPE_cntrl_space_blank,
  100. __CTYPE_cntrl_nonspace,
  101. };
  102. /* Some macros that test for various (w)ctype classes when passed one of the
  103. * designator values enumerated above. */
  104. #define __CTYPE_isalnum(D) ((unsigned int)(D-1) <= (__CTYPE_digit-1))
  105. #define __CTYPE_isalpha(D) ((unsigned int)(D-1) <= (__CTYPE_alpha_upper-1))
  106. #define __CTYPE_isblank(D) \
  107. ((((unsigned int)(D - __CTYPE_print_space_nonblank)) <= 5) && (D & 1))
  108. #define __CTYPE_iscntrl(D) (((unsigned int)(D - __CTYPE_cntrl_space_nonblank)) <= 2)
  109. #define __CTYPE_isdigit(D) (D == __CTYPE_digit)
  110. #define __CTYPE_isgraph(D) ((unsigned int)(D-1) <= (__CTYPE_graph-1))
  111. #define __CTYPE_islower(D) (((unsigned int)(D - __CTYPE_alpha_lower)) <= 1)
  112. #define __CTYPE_isprint(D) ((unsigned int)(D-1) <= (__CTYPE_print_space_blank-1))
  113. #define __CTYPE_ispunct(D) (D == __CTYPE_punct)
  114. #define __CTYPE_isspace(D) (((unsigned int)(D - __CTYPE_print_space_nonblank)) <= 5)
  115. #define __CTYPE_isupper(D) (((unsigned int)(D - __CTYPE_alpha_upper_lower)) <= 1)
  116. #define __CTYPE_isxdigit(D,X) \
  117. (__CTYPE_isdigit(D) || (((unsigned int)(((X)|0x20) - 'a')) <= 5))
  118. #define mywalnum(x) __CTYPE_isalnum(d)
  119. #define mywalpha(x) __CTYPE_isalpha(d)
  120. #define mywblank(x) __CTYPE_isblank(d)
  121. #define mywcntrl(x) __CTYPE_iscntrl(d)
  122. #define mywdigit(x) __CTYPE_isdigit(d)
  123. #define mywgraph(x) __CTYPE_isgraph(d)
  124. #define mywlower(x) __CTYPE_islower(d)
  125. #define mywprint(x) __CTYPE_isprint(d)
  126. #define mywpunct(x) __CTYPE_ispunct(d)
  127. #define mywspace(x) __CTYPE_isspace(d)
  128. #define mywupper(x) __CTYPE_isupper(d)
  129. #define mywxdigit(x) __CTYPE_isxdigit(d,x)
  130. typedef struct {
  131. short l;
  132. short u;
  133. } uldiff_entry;
  134. typedef struct {
  135. uint16_t ii_len;
  136. uint16_t ti_len;
  137. uint16_t ut_len;
  138. unsigned char ii_shift;
  139. unsigned char ti_shift;
  140. unsigned char *ii;
  141. unsigned char *ti;
  142. unsigned char *ut;
  143. } table_data;
  144. void output_table(FILE *fp, const char *name, table_data *tbl)
  145. {
  146. size_t i;
  147. fprintf(fp, "#define WC%s_II_LEN %7u\n", name, tbl->ii_len);
  148. fprintf(fp, "#define WC%s_TI_LEN %7u\n", name, tbl->ti_len);
  149. fprintf(fp, "#define WC%s_UT_LEN %7u\n", name, tbl->ut_len);
  150. fprintf(fp, "#define WC%s_II_SHIFT %7u\n", name, tbl->ii_shift);
  151. fprintf(fp, "#define WC%s_TI_SHIFT %7u\n", name, tbl->ti_shift);
  152. fprintf(fp, "\n#ifdef WANT_WC%s_data\n", name);
  153. i = tbl->ii_len + tbl->ti_len + tbl->ut_len;
  154. fprintf(fp, "\nstatic const unsigned char WC%s_data[%zu] = {", name, i);
  155. for (i=0 ; i < tbl->ii_len ; i++) {
  156. if (i % 12 == 0) {
  157. fprintf(fp, "\n");
  158. }
  159. fprintf(fp, " %#04x,", tbl->ii[i]);
  160. }
  161. for (i=0 ; i < tbl->ti_len ; i++) {
  162. if (i % 12 == 0) {
  163. fprintf(fp, "\n");
  164. }
  165. fprintf(fp, " %#04x,", tbl->ti[i]);
  166. }
  167. for (i=0 ; i < tbl->ut_len ; i++) {
  168. if (i % 12 == 0) {
  169. fprintf(fp, "\n");
  170. }
  171. fprintf(fp, " %#04x,", tbl->ut[i]);
  172. }
  173. fprintf(fp, "\n};\n\n");
  174. fprintf(fp, "#endif /* WANT_WC%s_data */\n\n", name);
  175. }
  176. static void dump_table_data(table_data *tbl)
  177. {
  178. printf("ii_shift = %d ti_shift = %d\n"
  179. "ii_len = %d ti_len = %d ut_len = %d\n"
  180. "total = %d\n",
  181. tbl->ii_shift, tbl->ti_shift,
  182. tbl->ii_len, tbl->ti_len, tbl->ut_len,
  183. (int) tbl->ii_len + (int) tbl->ti_len + (int) tbl->ut_len);
  184. }
  185. /* For sorting the blocks of unsigned chars. */
  186. static size_t nu_val;
  187. int nu_memcmp(const void *a, const void *b)
  188. {
  189. return memcmp(*(unsigned char**)a, *(unsigned char**)b, nu_val);
  190. }
  191. static size_t newopt(unsigned char *ut, size_t usize, int shift, table_data *tbl);
  192. #define MAXTO 255 /* Restrict to minimal unsigned char max. */
  193. int main(int argc, char **argv)
  194. {
  195. long int u, l, tt;
  196. size_t smallest, t;
  197. unsigned int c;
  198. unsigned int d;
  199. int i, n;
  200. int ul_count = 0;
  201. uldiff_entry uldiff[MAXTO];
  202. table_data cttable;
  203. table_data ultable;
  204. table_data combtable;
  205. table_data widthtable;
  206. unsigned char wct[(RANGE/2)+1]; /* wctype table (nibble per wchar) */
  207. unsigned char ult[RANGE+1]; /* upper/lower table */
  208. unsigned char combt[(RANGE/4)+1]; /* combining */
  209. unsigned char widtht[(RANGE/4)+1]; /* width */
  210. wctrans_t totitle;
  211. wctype_t is_comb, is_comb3;
  212. long int typecount[16];
  213. int built = 0;
  214. setvbuf(stdout, NULL, _IONBF, 0);
  215. while (--argc) {
  216. if (!setlocale(LC_CTYPE, *++argv)) {
  217. printf("setlocale(LC_CTYPE,%s) failed!\n", *argv);
  218. continue;
  219. }
  220. if (!(totitle = wctrans("totitle"))) {
  221. printf("no totitle transformation.\n");
  222. }
  223. if (!(is_comb = wctype("combining"))) {
  224. printf("no combining wctype.\n");
  225. }
  226. if (!(is_comb3 = wctype("combining_level3"))) {
  227. printf("no combining_level3 wctype.\n");
  228. }
  229. if (!built) {
  230. built = 1;
  231. ul_count = 1;
  232. uldiff[0].u = uldiff[0].l = 0;
  233. memset(wct, 0, sizeof(wct));
  234. memset(combt, 0, sizeof(combt));
  235. memset(widtht, 0, sizeof(widtht));
  236. for (i = 0 ; i < 16 ; i++) {
  237. typecount[i] = 0;
  238. }
  239. for (c=0 ; c <= RANGE ; c++) {
  240. if (iswdigit(c)) {
  241. d = __CTYPE_digit;
  242. } else if (iswalpha(c)) {
  243. d = __CTYPE_alpha_nonupper_nonlower;
  244. if (iswlower(c)) {
  245. d = __CTYPE_alpha_lower;
  246. if (iswupper(c)) {
  247. d = __CTYPE_alpha_upper_lower;
  248. }
  249. } else if (iswupper(c)) {
  250. d = __CTYPE_alpha_upper;
  251. }
  252. } else if (iswpunct(c)) {
  253. d = __CTYPE_punct;
  254. } else if (iswgraph(c)) {
  255. d = __CTYPE_graph;
  256. } else if (iswprint(c)) {
  257. d = __CTYPE_print_space_nonblank;
  258. if (iswblank(c)) {
  259. d = __CTYPE_print_space_blank;
  260. }
  261. } else if (iswspace(c) && !iswcntrl(c)) {
  262. d = __CTYPE_space_nonblank_noncntrl;
  263. if (iswblank(c)) {
  264. d = __CTYPE_space_blank_noncntrl;
  265. }
  266. } else if (iswcntrl(c)) {
  267. d = __CTYPE_cntrl_nonspace;
  268. if (iswspace(c)) {
  269. d = __CTYPE_cntrl_space_nonblank;
  270. if (iswblank(c)) {
  271. d = __CTYPE_cntrl_space_blank;
  272. }
  273. }
  274. } else {
  275. d = __CTYPE_unclassified;
  276. }
  277. ++typecount[d];
  278. #if 0
  279. if (iswspace(c)) {
  280. if (iswblank(c)) {
  281. printf("%#8x : space blank\n", c);
  282. } else {
  283. printf("%#8x : space\n", c);
  284. }
  285. }
  286. #endif
  287. #if 0
  288. if (c < 256) {
  289. unsigned int glibc;
  290. glibc = 0;
  291. if (isalnum(c)) ++glibc; glibc <<= 1;
  292. if (isalpha(c)) ++glibc; glibc <<= 1;
  293. if (isblank(c)) ++glibc; glibc <<= 1;
  294. if (iscntrl(c)) ++glibc; glibc <<= 1;
  295. if (isdigit(c)) ++glibc; glibc <<= 1;
  296. if (isgraph(c)) ++glibc; glibc <<= 1;
  297. if (islower(c)) ++glibc; glibc <<= 1;
  298. if (isprint(c)) ++glibc; glibc <<= 1;
  299. if (ispunct(c)) ++glibc; glibc <<= 1;
  300. if (isspace(c)) ++glibc; glibc <<= 1;
  301. if (isupper(c)) ++glibc; glibc <<= 1;
  302. if (isxdigit(c)) ++glibc;
  303. printf("%#8x : ctype %#4x\n", c, glibc);
  304. }
  305. #endif
  306. #if 1
  307. /* Paranoid checking... */
  308. {
  309. unsigned int glibc;
  310. unsigned int mine;
  311. glibc = 0;
  312. if (iswalnum(c)) ++glibc; glibc <<= 1;
  313. if (iswalpha(c)) ++glibc; glibc <<= 1;
  314. if (iswblank(c)) ++glibc; glibc <<= 1;
  315. if (iswcntrl(c)) ++glibc; glibc <<= 1;
  316. if (iswdigit(c)) ++glibc; glibc <<= 1;
  317. if (iswgraph(c)) ++glibc; glibc <<= 1;
  318. if (iswlower(c)) ++glibc; glibc <<= 1;
  319. if (iswprint(c)) ++glibc; glibc <<= 1;
  320. if (iswpunct(c)) ++glibc; glibc <<= 1;
  321. if (iswspace(c)) ++glibc; glibc <<= 1;
  322. if (iswupper(c)) ++glibc; glibc <<= 1;
  323. if (iswxdigit(c)) ++glibc;
  324. mine = 0;
  325. if (mywalnum(c)) ++mine; mine <<= 1;
  326. if (mywalpha(c)) ++mine; mine <<= 1;
  327. if (mywblank(c)) ++mine; mine <<= 1;
  328. if (mywcntrl(c)) ++mine; mine <<= 1;
  329. if (mywdigit(c)) ++mine; mine <<= 1;
  330. if (mywgraph(c)) ++mine; mine <<= 1;
  331. if (mywlower(c)) ++mine; mine <<= 1;
  332. if (mywprint(c)) ++mine; mine <<= 1;
  333. if (mywpunct(c)) ++mine; mine <<= 1;
  334. if (mywspace(c)) ++mine; mine <<= 1;
  335. if (mywupper(c)) ++mine; mine <<= 1;
  336. if (mywxdigit(c)) ++mine;
  337. if (glibc != mine) {
  338. printf("%#8x : glibc %#4x != %#4x mine %u\n", c, glibc, mine, d);
  339. return EXIT_FAILURE;
  340. }
  341. #if 0
  342. if (iswctype(c,is_comb) || iswctype(c,is_comb3)) {
  343. /* if (!iswpunct(c)) { */
  344. printf("%#8x : %d %d %#4x\n",
  345. c, iswctype(c,is_comb),iswctype(c,is_comb3), glibc);
  346. /* } */
  347. }
  348. #endif
  349. }
  350. #endif
  351. combt[c/4] |= ((((!!iswctype(c,is_comb)) << 1) | !!iswctype(c,is_comb3))
  352. << ((c & 3) << 1));
  353. /* comb3t[c/8] |= ((!!iswctype(c,is_comb3)) << (c & 7)); */
  354. widtht[c/4] |= (wcwidth(c) << ((c & 3) << 1));
  355. if (c & 1) { /* Use the high nibble for odd numbered wchars. */
  356. d <<= 4;
  357. }
  358. wct[c/2] |= d;
  359. l = towlower(c) - c;
  360. u = towupper(c) - c;
  361. ult[c] = 0;
  362. if (l || u) {
  363. if ((l != (short)l) || (u != (short)u)) {
  364. printf("range assumption error! %x %ld %ld\n", c, l, u);
  365. return EXIT_FAILURE;
  366. }
  367. for (i=0 ; i < ul_count ; i++) {
  368. if ((l == uldiff[i].l) && (u == uldiff[i].u)) {
  369. goto found;
  370. }
  371. }
  372. uldiff[ul_count].l = l;
  373. uldiff[ul_count].u = u;
  374. ++ul_count;
  375. if (ul_count > MAXTO) {
  376. printf("too many touppers/tolowers!\n");
  377. return EXIT_FAILURE;
  378. }
  379. found:
  380. ult[c] = i;
  381. }
  382. }
  383. for (i = 0 ; i < 16 ; i++) {
  384. printf("typecount[%2d] = %8ld %s\n", i, typecount[i], typename[i]);
  385. }
  386. printf("optimizing is* table..\n");
  387. n = -1;
  388. smallest = SIZE_MAX;
  389. cttable.ii = NULL;
  390. for (i=0 ; i < 14 ; i++) {
  391. t = newopt(wct, (RANGE/2)+1, i, &cttable);
  392. if (smallest >= t) {
  393. n = i;
  394. smallest = t;
  395. /* } else { */
  396. /* break; */
  397. }
  398. }
  399. printf("smallest = %zu\n", smallest);
  400. if (!(cttable.ii = malloc(smallest))) {
  401. printf("couldn't allocate space!\n");
  402. return EXIT_FAILURE;
  403. }
  404. smallest = SIZE_MAX;
  405. newopt(wct, (RANGE/2)+1, n, &cttable);
  406. ++cttable.ti_shift; /* correct for nibble mode */
  407. printf("optimizing u/l-to table..\n");
  408. smallest = SIZE_MAX;
  409. ultable.ii = NULL;
  410. for (i=0 ; i < 14 ; i++) {
  411. t = newopt(ult, RANGE+1, i, &ultable);
  412. if (smallest >= t) {
  413. n = i;
  414. smallest = t;
  415. /* } else { */
  416. /* break; */
  417. }
  418. }
  419. printf("%zu (smallest) + %zu (u/l diffs) = %zu\n",
  420. smallest, 4 * ul_count, smallest + 4 * ul_count);
  421. printf("smallest = %zu\n", smallest);
  422. if (!(ultable.ii = malloc(smallest))) {
  423. printf("couldn't allocate space!\n");
  424. return EXIT_FAILURE;
  425. }
  426. smallest = SIZE_MAX;
  427. newopt(ult, RANGE+1, n, &ultable);
  428. printf("optimizing comb table..\n");
  429. smallest = SIZE_MAX;
  430. combtable.ii = NULL;
  431. for (i=0 ; i < 14 ; i++) {
  432. t = newopt(combt, sizeof(combt), i, &combtable);
  433. if (smallest >= t) {
  434. n = i;
  435. smallest = t;
  436. /* } else { */
  437. /* break; */
  438. }
  439. }
  440. printf("smallest = %zu\n", smallest);
  441. if (!(combtable.ii = malloc(smallest))) {
  442. printf("couldn't allocate space!\n");
  443. return EXIT_FAILURE;
  444. }
  445. smallest = SIZE_MAX;
  446. newopt(combt, sizeof(combt), n, &combtable);
  447. combtable.ti_shift += 4; /* correct for 4 entries per */
  448. printf("optimizing width table..\n");
  449. smallest = SIZE_MAX;
  450. widthtable.ii = NULL;
  451. for (i=0 ; i < 14 ; i++) {
  452. t = newopt(widtht, sizeof(widtht), i, &widthtable);
  453. if (smallest >= t) {
  454. n = i;
  455. smallest = t;
  456. /* } else { */
  457. /* break; */
  458. }
  459. }
  460. printf("smallest = %zu\n", smallest);
  461. if (!(widthtable.ii = malloc(smallest))) {
  462. printf("couldn't allocate space!\n");
  463. return EXIT_FAILURE;
  464. }
  465. smallest = SIZE_MAX;
  466. newopt(widtht, sizeof(widtht), n, &widthtable);
  467. widthtable.ti_shift += 4; /* correct for 4 entries per */
  468. #if 0
  469. printf("optimizing comb3 table..\n");
  470. smallest = SIZE_MAX;
  471. comb3table.ii = NULL;
  472. for (i=0 ; i < 14 ; i++) {
  473. t = newopt(comb3t, sizeof(comb3t), i, &comb3table);
  474. if (smallest >= t) {
  475. n = i;
  476. smallest = t;
  477. /* } else { */
  478. /* break; */
  479. }
  480. }
  481. printf("smallest = %zu\n", smallest);
  482. if (!(comb3table.ii = malloc(smallest))) {
  483. printf("couldn't allocate space!\n");
  484. return EXIT_FAILURE;
  485. }
  486. smallest = SIZE_MAX;
  487. newopt(comb3t, sizeof(comb3t), n, &comb3table);
  488. comb3table.ti_shift += 8; /* correct for 4 entries per */
  489. #endif
  490. dump_table_data(&cttable);
  491. dump_table_data(&ultable);
  492. dump_table_data(&combtable);
  493. dump_table_data(&widthtable);
  494. }
  495. printf("verifying for %s...\n", *argv);
  496. #if RANGE == 0xffffU
  497. for (c=0 ; c <= 0xffffUL ; c++)
  498. #else
  499. for (c=0 ; c <= 0x10ffffUL ; c++)
  500. #endif
  501. {
  502. unsigned int glibc;
  503. unsigned int mine;
  504. unsigned int upper, lower;
  505. #if 0
  506. #if RANGE < 0x10000UL
  507. if (c == 0x10000UL) {
  508. c = 0x30000UL; /* skip 1st and 2nd sup planes */
  509. }
  510. #elif RANGE < 0x20000UL
  511. if (c == 0x20000UL) {
  512. c = 0x30000UL; /* skip 2nd sup planes */
  513. }
  514. #endif
  515. #endif
  516. glibc = 0;
  517. if (iswalnum(c)) ++glibc; glibc <<= 1;
  518. if (iswalpha(c)) ++glibc; glibc <<= 1;
  519. if (iswblank(c)) ++glibc; glibc <<= 1;
  520. if (iswcntrl(c)) ++glibc; glibc <<= 1;
  521. if (iswdigit(c)) ++glibc; glibc <<= 1;
  522. if (iswgraph(c)) ++glibc; glibc <<= 1;
  523. if (iswlower(c)) ++glibc; glibc <<= 1;
  524. if (iswprint(c)) ++glibc; glibc <<= 1;
  525. if (iswpunct(c)) ++glibc; glibc <<= 1;
  526. if (iswspace(c)) ++glibc; glibc <<= 1;
  527. if (iswupper(c)) ++glibc; glibc <<= 1;
  528. if (iswxdigit(c)) ++glibc;
  529. {
  530. unsigned int u;
  531. int n, sc;
  532. int i0, i1;
  533. u = c;
  534. if (u <= RANGE) {
  535. sc = u & ((1 << cttable.ti_shift) - 1);
  536. u >>= cttable.ti_shift;
  537. n = u & ((1 << cttable.ii_shift) - 1);
  538. u >>= cttable.ii_shift;
  539. i0 = cttable.ii[u];
  540. i0 <<= cttable.ii_shift;
  541. i1 = cttable.ti[i0 + n];
  542. i1 <<= (cttable.ti_shift-1);
  543. d = cttable.ut[i1 + (sc >> 1)];
  544. if (sc & 1) {
  545. d >>= 4;
  546. }
  547. d &= 0x0f;
  548. } else if ((((unsigned int)(c - 0xe0020UL)) <= 0x5f) || (c == 0xe0001UL)){
  549. d = __CTYPE_punct;
  550. } else if (((unsigned int)(c - 0xf0000UL)) < 0x20000UL) {
  551. if ((c & 0xffffU) <= 0xfffdU) {
  552. d = __CTYPE_punct;
  553. } else {
  554. d = __CTYPE_unclassified;
  555. }
  556. } else {
  557. d = __CTYPE_unclassified;
  558. }
  559. mine = 0;
  560. if (mywalnum(c)) ++mine; mine <<= 1;
  561. if (mywalpha(c)) ++mine; mine <<= 1;
  562. if (mywblank(c)) ++mine; mine <<= 1;
  563. if (mywcntrl(c)) ++mine; mine <<= 1;
  564. if (mywdigit(c)) ++mine; mine <<= 1;
  565. if (mywgraph(c)) ++mine; mine <<= 1;
  566. if (mywlower(c)) ++mine; mine <<= 1;
  567. if (mywprint(c)) ++mine; mine <<= 1;
  568. if (mywpunct(c)) ++mine; mine <<= 1;
  569. if (mywspace(c)) ++mine; mine <<= 1;
  570. if (mywupper(c)) ++mine; mine <<= 1;
  571. if (mywxdigit(c)) ++mine;
  572. if (glibc != mine) {
  573. printf("%#8x : glibc %#4x != %#4x mine %d\n", c, glibc, mine, d);
  574. if (c < 0x30000UL) {
  575. printf("sc=%#x u=%#x n=%#x i0=%#x i1=%#x\n", sc, u, n, i0, i1);
  576. }
  577. }
  578. upper = lower = u = c;
  579. if (u <= RANGE) {
  580. sc = u & ((1 << ultable.ti_shift) - 1);
  581. u >>= ultable.ti_shift;
  582. n = u & ((1 << ultable.ii_shift) - 1);
  583. u >>= ultable.ii_shift;
  584. i0 = ultable.ii[u];
  585. i0 <<= ultable.ii_shift;
  586. i1 = ultable.ti[i0 + n];
  587. i1 <<= (ultable.ti_shift);
  588. i1 += sc;
  589. i0 = ultable.ut[i1];
  590. upper = c + uldiff[i0].u;
  591. lower = c + uldiff[i0].l;
  592. }
  593. if (towupper(c) != upper) {
  594. printf("%#8x : towupper glibc %#4x != %#4x mine\n",
  595. c, towupper(c), upper);
  596. }
  597. if (towlower(c) != lower) {
  598. printf("%#8x : towlower glibc %#4x != %#4x mine i0 = %d\n",
  599. c, towlower(c), lower, i0);
  600. }
  601. if (totitle && ((tt = towctrans(c, totitle)) != upper)) {
  602. printf("%#8x : totitle glibc %#4lx != %#4x mine i0 = %d\n",
  603. c, tt, upper, i0);
  604. }
  605. }
  606. if ((c & 0xfff) == 0xfff) printf(".");
  607. }
  608. printf("done\n");
  609. }
  610. if (1) {
  611. FILE *fp;
  612. if (!(fp = fopen("wctables.h", "w"))) {
  613. printf("couldn't open wctables.h!\n");
  614. return EXIT_FAILURE;
  615. }
  616. fprintf(fp, "#define WC_TABLE_DOMAIN_MAX %#8lx\n\n",
  617. (unsigned long) RANGE);
  618. output_table(fp, "ctype", &cttable);
  619. output_table(fp, "uplow", &ultable);
  620. #warning fix the upper bound on the upper/lower tables... save 200 bytes or so
  621. fprintf(fp, "#define WCuplow_diffs %7u\n", ul_count);
  622. fprintf(fp, "\n#ifdef WANT_WCuplow_diff_data\n\n");
  623. fprintf(fp, "\nstatic const short WCuplow_diff_data[%zu] = {",
  624. 2 * (size_t) ul_count);
  625. for (i=0 ; i < ul_count ; i++) {
  626. if (i % 4 == 0) {
  627. fprintf(fp, "\n");
  628. }
  629. fprintf(fp, " %6d, %6d,", uldiff[i].u, uldiff[i].l);
  630. }
  631. fprintf(fp, "\n};\n\n");
  632. fprintf(fp, "#endif /* WANT_WCuplow_diff_data */\n\n");
  633. output_table(fp, "comb", &combtable);
  634. output_table(fp, "width", &widthtable);
  635. fclose(fp);
  636. }
  637. return EXIT_SUCCESS;
  638. }
  639. size_t newopt(unsigned char *ut, size_t usize, int shift, table_data *tbl)
  640. {
  641. static int recurse = 0;
  642. unsigned char *ti[RANGE+1]; /* table index */
  643. size_t numblocks;
  644. size_t blocksize;
  645. size_t uniq;
  646. size_t i, j;
  647. size_t smallest, t;
  648. unsigned char *ii_save;
  649. int uniqblock[256];
  650. unsigned char uit[RANGE+1];
  651. int shift2;
  652. ii_save = NULL;
  653. blocksize = 1 << shift;
  654. numblocks = usize >> shift;
  655. /* init table index */
  656. for (i=j=0 ; i < numblocks ; i++) {
  657. ti[i] = ut + j;
  658. j += blocksize;
  659. }
  660. /* sort */
  661. nu_val = blocksize;
  662. qsort(ti, numblocks, sizeof(unsigned char *), nu_memcmp);
  663. uniq = 1;
  664. uit[(ti[0]-ut)/blocksize] = 0;
  665. for (i=1 ; i < numblocks ; i++) {
  666. if (memcmp(ti[i-1], ti[i], blocksize) < 0) {
  667. if (++uniq > 255) {
  668. break;
  669. }
  670. uniqblock[uniq - 1] = i;
  671. }
  672. #if 1
  673. else if (memcmp(ti[i-1], ti[i], blocksize) > 0) {
  674. printf("bad sort %i!\n", i);
  675. abort();
  676. }
  677. #endif
  678. uit[(ti[i]-ut)/blocksize] = uniq - 1;
  679. }
  680. smallest = SIZE_MAX;
  681. shift2 = -1;
  682. if (uniq <= 255) {
  683. smallest = numblocks + uniq * blocksize;
  684. if (!recurse) {
  685. ++recurse;
  686. for (j=1 ; j < 14 ; j++) {
  687. if ((numblocks >> j) < 2) break;
  688. if (tbl) {
  689. ii_save = tbl->ii;
  690. tbl->ii = NULL;
  691. }
  692. if ((t = newopt(uit, numblocks, j, tbl)) < SIZE_MAX) {
  693. t += uniq * blocksize;
  694. }
  695. if (tbl) {
  696. tbl->ii = ii_save;
  697. }
  698. if (smallest >= t) {
  699. shift2 = j;
  700. smallest = t;
  701. if (!tbl->ii) {
  702. printf("ishift %zu tshift %zu size %zu\n",
  703. shift2, shift, t);
  704. }
  705. /* } else { */
  706. /* break; */
  707. }
  708. }
  709. --recurse;
  710. }
  711. } else {
  712. return SIZE_MAX;
  713. }
  714. if (tbl->ii) {
  715. if (recurse) {
  716. tbl->ii_shift = shift;
  717. tbl->ii_len = numblocks;
  718. memcpy(tbl->ii, uit, numblocks);
  719. tbl->ti = tbl->ii + tbl->ii_len;
  720. tbl->ti_len = uniq * blocksize;
  721. for (i=0 ; i < uniq ; i++) {
  722. memcpy(tbl->ti + i * blocksize, ti[uniqblock[i]], blocksize);
  723. }
  724. } else {
  725. ++recurse;
  726. printf("setting ishift %zu tshift %zu\n",
  727. shift2, shift);
  728. newopt(uit, numblocks, shift2, tbl);
  729. --recurse;
  730. tbl->ti_shift = shift;
  731. tbl->ut_len = uniq * blocksize;
  732. tbl->ut = tbl->ti + tbl->ti_len;
  733. for (i=0 ; i < uniq ; i++) {
  734. memcpy(tbl->ut + i * blocksize, ti[uniqblock[i]], blocksize);
  735. }
  736. }
  737. }
  738. return smallest;
  739. }