gen_collate.c 97 KB


  1. /* TODO:
  2. *
  3. * add UNDEFINED at end if not specified
  4. * convert POSITION -> FORWARD,POSITION
  5. *
  6. *
  7. * deal with lowercase in <Uhhhh>
  8. *
  9. * what about reorders that keep the same rule?
  10. *
  11. * remove "unused" collation elements? (probably doesn't save much)
  12. *
  13. * add_rule function ... returns index into rule table after possibly adding custom-indexed rule
  14. * but don't forget about multichar weights... replace with strings of indexes
  15. *
  16. */
  17. #include <stddef.h>
  18. #include <stdio.h>
  19. #include <stdlib.h>
  20. #include <string.h>
  21. #include <stdint.h>
  22. #include <stdarg.h>
  23. #include <limits.h>
  24. #include <ctype.h>
  25. #include <assert.h>
  26. #include <search.h>
  27. typedef struct {
  28. char *name; /* */
  29. int num_weights; /* */
  30. int ii_shift; /* */
  31. int ti_shift; /* */
  32. int ii_len; /* */
  33. int ti_len; /* */
  34. int max_weight; /* */
  35. int num_col_base; /* */
  36. int max_col_index; /* */
  37. int undefined_idx; /* */
  38. int range_low; /* */
  39. int range_count; /* high - low */
  40. int range_base_weight; /* */
  41. int num_starters; /* */
  42. int range_rule_offset; /* */
  43. int wcs2colidt_offset; /* */
  44. int index2weight_offset; /* */
  45. int index2ruleidx_offset; /* */
  46. int multistart_offset; /* */
  47. } base_locale_t;
  48. #define BASE_LOCALE_LEN 20
  49. static base_locale_t base_locale_array[BASE_LOCALE_LEN];
  50. static size_t base_locale_len;
  51. typedef struct {
  52. char *name; /* */
  53. int base_idx; /* */
  54. int undefined_idx; /* */
  55. int overrides_offset; /* */
  56. int multistart_offset; /* */
  57. } der_locale_t;
  58. #define DER_LOCALE_LEN 300
  59. static der_locale_t der_locale_array[DER_LOCALE_LEN];
  60. static size_t der_locale_len;
  61. #define OVERRIDE_LEN 50000
  62. static uint16_t override_buffer[OVERRIDE_LEN];
  63. static size_t override_len;
  64. #define MULTISTART_LEN 10000
  65. static uint16_t multistart_buffer[MULTISTART_LEN];
  66. static size_t multistart_len;
  67. #define WCS2COLIDT_LEN 200000
  68. static uint16_t wcs2colidt_buffer[WCS2COLIDT_LEN];
  69. static size_t wcs2colidt_len;
  70. #define INDEX2WEIGHT_LEN 200000
  71. static uint16_t index2weight_buffer[INDEX2WEIGHT_LEN];
  72. static size_t index2weight_len;
  73. static uint16_t index2ruleidx_buffer[INDEX2WEIGHT_LEN];
  74. static size_t index2ruleidx_len;
  75. #define WEIGHTSTR_LEN 10000
  76. static uint16_t weightstr_buffer[WEIGHTSTR_LEN];
  77. static size_t weightstr_len;
  78. #define RULETABLE_LEN (1L<<16)
  79. static uint16_t ruletable_buffer[RULETABLE_LEN];
  80. static size_t ruletable_len;
  81. #define RANGE (0x10000UL)
  82. typedef uint16_t tbl_item;
  83. static uint16_t u16_buf[10000];
  84. static int u16_buf_len;
  85. static int u16_starter;
  86. typedef struct {
  87. uint16_t ii_len;
  88. uint16_t ti_len;
  89. uint16_t ut_len;
  90. unsigned char ii_shift;
  91. unsigned char ti_shift;
  92. tbl_item *ii;
  93. tbl_item *ti;
  94. tbl_item *ut;
  95. } table_data;
  96. static size_t newopt(tbl_item *ut, size_t usize, int shift, table_data *tbl);
  97. #define MAX_COLLATION_WEIGHTS 4
  98. #define MAX_FNO 1
  99. #define MAX_FILES (MAX_FNO + 1)
  100. static FILE *fstack[MAX_FILES];
  101. static char *fname[MAX_FILES];
  102. static int lineno[MAX_FILES];
  103. static int fno = -1;
  104. static tbl_item wcs2index[RANGE];
  105. static char linebuf[1024];
  106. static char *pos;
  107. static char *pos_e = NULL;
  108. static char end_of_token = 0; /* slot to save */
  109. #define IN_ORDER 0x01
  110. #define IN_REORDER 0x02
  111. #define IN_REORDER_SECTIONS 0x04
  112. static int order_state;
  113. static int cur_num_weights; /* number of weights in current use */
  114. static char cur_rule[MAX_COLLATION_WEIGHTS];
  115. static int anonsection = 0;
  116. typedef struct ll_item_struct ll_item_t;
  117. struct ll_item_struct {
  118. ll_item_t *next;
  119. ll_item_t *prev;
  120. void *data;
  121. int data_type;
  122. int idx;
  123. };
  124. static ll_item_t *reorder_section_ptr = NULL;
  125. static int superset;
  126. static int superset_order_start_cnt; /* only support one order for now */
  127. static int superset_in_sync;
  128. static ll_item_t *comm_cur_ptr;
  129. static ll_item_t *comm_prev_ptr;
  130. enum {
  131. R_FORWARD = 0x01,
  132. R_POSITION = 0x02,
  133. R_BACKWARD = 0x04 /* must be largest in value */
  134. };
  135. typedef struct {
  136. size_t num_weights;
  137. char rule[MAX_COLLATION_WEIGHTS];
  138. const char *colitem[MAX_COLLATION_WEIGHTS];
  139. } weight_t;
  140. static void *root_weight = NULL;
  141. size_t unique_weights = 0;
  142. typedef struct {
  143. const char *symbol;
  144. weight_t *weight;
  145. } weighted_item_t;
  146. typedef struct {
  147. const char *symbol1;
  148. const char *symbol2;
  149. int length;
  150. weight_t *weight;
  151. } range_item_t;
  152. typedef struct {
  153. const char *name;
  154. ll_item_t *itm_list; /* weighted_item_t list .. circular!!! */
  155. size_t num_items;
  156. size_t num_rules;
  157. char rules[MAX_COLLATION_WEIGHTS];
  158. } section_t;
  159. static section_t *cur_section = NULL;
  160. typedef struct {
  161. const char *symbol;
  162. ll_item_t *node;
  163. } wi_index_t;
  164. typedef struct col_locale_struct col_locale_t;
  165. struct col_locale_struct {
  166. char *name;
  167. void *root_colitem; /* all base and derived, or just derived */
  168. void *root_element;
  169. void *root_scripts;
  170. void *root_wi_index;
  171. void *root_wi_index_reordered;
  172. ll_item_t *section_list;
  173. col_locale_t *base_locale; /* null if this is a base */
  174. void *root_derived_wi;
  175. ll_item_t *derived_list;
  176. void *root_starter_char;
  177. void *root_starter_all;
  178. ll_item_t *undefined_idx;
  179. };
  180. typedef struct {
  181. const char *symbol;
  182. int idx;
  183. } col_index_t;
  184. static void *root_col_locale = NULL;
  185. typedef struct {
  186. const char *keyword;
  187. void (*handler)(void);
  188. } keyword_table_t;
  189. typedef struct {
  190. const char *string;
  191. const char *element; /* NULL if collating symbol */
  192. } colitem_t;
  193. static col_locale_t *cur_base = NULL;
  194. static col_locale_t *cur_derived = NULL;
  195. static col_locale_t *cur_col = NULL;
  196. static void *root_sym = NULL;
  197. static size_t num_sym = 0;
  198. static size_t mem_sym = 0;
  199. static void error_msg(const char *fmt, ...) __attribute__ ((noreturn, format (printf, 1, 2)));
  200. static void *xmalloc(size_t n);
  201. static char *xsymdup(const char *s); /* only allocate once... store in a tree */
  202. static void pushfile(char *filename);
  203. static void popfile(void);
  204. static void processfile(void);
  205. static int iscommentchar(int);
  206. static void eatwhitespace(void);
  207. static int next_line(void);
  208. static char *next_token(void);
  209. static void do_unrecognized(void);
  210. static col_locale_t *new_col_locale(char *name);
  211. static ll_item_t *new_ll_item(int data_type, void *data);
  212. static weight_t *register_weight(weight_t *w);
  213. static size_t ll_len(ll_item_t *l);
  214. static size_t ll_count(ll_item_t *l, int mask);
  215. static void add_wi_index(ll_item_t *l);
  216. static size_t tnumnodes(const void *root);
  217. static ll_item_t *find_wi_index(const char *sym, col_locale_t *cl);
  218. static void mark_reordered(const char *sym);
  219. static ll_item_t *find_wi_index_reordered(const char *sym);
  220. static ll_item_t *next_comm_ptr(void);
  221. static ll_item_t *init_comm_ptr(void);
  222. static ll_item_t *find_ll_last(ll_item_t *p);
  223. static void dump_weights(const char *name);
  224. static void finalize_base(void);
  225. static int is_ucode(const char *s);
  226. static int sym_cmp(const void *n1, const void *n2);
  227. static void do_starter_lists(col_locale_t *cl);
  228. static void dump_base_locale(int n);
  229. static void dump_der_locale(int n);
  230. static void dump_collate(FILE *fp);
  231. enum {
  232. DT_SECTION = 0x01,
  233. DT_WEIGHTED = 0x02,
  234. DT_REORDER = 0x04, /* a section to support reorder_after */
  235. DT_COL_LOCALE = 0x08,
  236. DT_RANGE = 0x10,
  237. };
  238. static section_t *new_section(const char *name)
  239. {
  240. section_t *p;
  241. char buf[128];
  242. p = xmalloc(sizeof(section_t));
  243. if (!name) { /* anonymous section */
  244. name = buf;
  245. snprintf(buf, sizeof(buf), "anon%05d", anonsection);
  246. ++anonsection;
  247. } else if (*name != '<') { /* reorder */
  248. name = buf;
  249. snprintf(buf, sizeof(buf), "%s %05d", cur_col->name, anonsection);
  250. ++anonsection;
  251. }
  252. #warning devel code
  253. /* fprintf(stderr, "section %s\n", name); */
  254. p->name = xsymdup(name);
  255. p->itm_list = NULL;
  256. p->num_items = 0;
  257. p->num_rules = 0;
  258. memset(p->rules, 0, MAX_COLLATION_WEIGHTS);
  259. /* cur_num_weights = p->num_rules = 0; */
  260. /* memset(p->rules, 0, MAX_COLLATION_WEIGHTS); */
  261. /* memset(cur_rule, R_FORWARD, 4); */
  262. #warning devel code
  263. if (*p->name == 'a') {
  264. cur_num_weights = p->num_rules = 4;
  265. memset(p->rules, R_FORWARD, 4);
  266. memset(cur_rule, R_FORWARD, 4);
  267. p->rules[3] |= R_POSITION;
  268. cur_rule[3] |= R_POSITION;
  269. }
  270. /* fprintf(stderr, "new section %s -- cur_num_weights = %d\n", p->name, cur_num_weights); */
  271. return p;
  272. }
  273. static void do_order_start(void);
  274. static void do_order_end(void);
  275. static void do_reorder_after(void);
  276. static void do_reorder_end(void);
  277. static void do_reorder_sections_after(void);
  278. static void do_reorder_sections_end(void);
  279. static void do_copy(void);
  280. static void do_colsym(void);
  281. static void do_colele(void);
  282. static void do_script(void);
  283. static void do_range(void);
  284. static col_locale_t *new_col_locale(char *name);
  285. static int colitem_cmp(const void *n1, const void *n2);
  286. static int colelement_cmp(const void *n1, const void *n2);
  287. static void del_colitem(colitem_t *p);
  288. static colitem_t *new_colitem(char *item, char *def);
  289. static void add_colitem(char *item, char *def);
  290. static void add_script(const char *s);
  291. static unsigned int add_rule(weighted_item_t *wi);
  292. static unsigned int add_range_rule(range_item_t *ri);
  293. static const keyword_table_t keyword_table[] = {
  294. { "collating-symbol", do_colsym },
  295. { "collating-element", do_colele },
  296. { "script", do_script },
  297. { "copy", do_copy },
  298. { "order_start", do_order_start },
  299. { "order_end", do_order_end },
  300. { "order-end", do_order_end },
  301. { "reorder-after", do_reorder_after },
  302. { "reorder-end", do_reorder_end },
  303. { "reorder-sections-after", do_reorder_sections_after },
  304. { "reorder-sections-end", do_reorder_sections_end },
  305. { "UCLIBC_RANGE", do_range },
  306. { NULL, do_unrecognized }
  307. };
  308. static void do_unrecognized(void)
  309. {
  310. #if 1
  311. error_msg("warning: unrecognized: %s", pos);
  312. #else
  313. /* fprintf(stderr, "warning: unrecognized initial keyword \"%s\"\n", pos); */
  314. fprintf(stderr, "warning: unrecognized: %s", pos);
  315. if (end_of_token) {
  316. fprintf(stderr, "%c%s", end_of_token, pos_e+1);
  317. }
  318. fprintf(stderr, "\n");
  319. #endif
  320. }
  321. /* typedef struct { */
  322. /* const char *symbol1; */
  323. /* const char *symbol2; */
  324. /* int length; */
  325. /* weight_t *weight; */
  326. /* } range_item_t; */
  327. static void do_range(void)
  328. {
  329. range_item_t *ri;
  330. weight_t w;
  331. int i;
  332. char *s;
  333. char *s1;
  334. char *s2;
  335. const char **ci;
  336. ll_item_t *lli;
  337. assert(!superset);
  338. assert(order_state == IN_ORDER);
  339. s1 = next_token();
  340. if (!s1) {
  341. error_msg("missing start of range");
  342. }
  343. if (!is_ucode(s1)) {
  344. error_msg("start of range is not a ucode: %s", s1);
  345. }
  346. s1 = xsymdup(s1);
  347. s2 = next_token();
  348. if (!s2) {
  349. error_msg("missing end of range");
  350. }
  351. if (!is_ucode(s2)) {
  352. error_msg("end of range is not a ucode: %s", s2);
  353. }
  354. s2 = xsymdup(s2);
  355. ri = (range_item_t *) xmalloc(sizeof(range_item_t));
  356. ri->symbol1 = s1;
  357. ri->symbol2 = s2;
  358. ri->length = strtoul(s2+2, NULL, 16) - strtoul(s1+2, NULL, 16);
  359. if (ri->length <= 0) {
  360. error_msg("illegal range length %d", ri->length);
  361. }
  362. s = next_token();
  363. w.num_weights = cur_num_weights;
  364. for (i=0 ; i < cur_num_weights ; i++) {
  365. w.rule[i] = cur_rule[i];
  366. }
  367. ci = w.colitem + (i-1);
  368. /* now i == cur_num_weights */
  369. #define STR_DITTO "."
  370. while (s && *s && i) {
  371. --i;
  372. if (*s == ';') {
  373. ci[-i] = xsymdup(STR_DITTO);
  374. if (*++s) {
  375. continue;
  376. }
  377. }
  378. if (*s) {
  379. ci[-i] = xsymdup(s);
  380. }
  381. s = next_token();
  382. if (s) {
  383. if (*s == ';') {
  384. ++s;
  385. } else if (i) {
  386. error_msg("missing seperator");
  387. }
  388. }
  389. }
  390. if (s) {
  391. error_msg("too many weights: %d %d |%s| %d", cur_num_weights, i, s, (int)*s);
  392. }
  393. while (i) { /* missing weights are not an error */
  394. --i;
  395. ci[-i] = xsymdup(STR_DITTO);
  396. }
  397. ri->weight = register_weight(&w);
  398. /* if ((i = is_ucode(t)) != 0) { */
  399. /* assert(!t[i]); */
  400. /* add_colitem(t, NULL); */
  401. /* } */
  402. lli = new_ll_item(DT_RANGE, ri);
  403. if (!cur_section->itm_list) {
  404. /* printf("creating new item list: %s\n", wi->symbol); */
  405. cur_section->itm_list = lli;
  406. lli->prev = lli->next = lli;
  407. ++cur_section->num_items;
  408. } else {
  409. insque(lli, cur_section->itm_list->prev);
  410. /* printf("adding item to list: %d - %s\n", ll_len(cur_section->itm_list), wi->symbol); */
  411. ++cur_section->num_items;
  412. }
  413. /* add_wi_index(lli); */
  414. }
  415. static weighted_item_t *add_weight(char *t)
  416. {
  417. weighted_item_t *wi;
  418. weight_t w;
  419. int i;
  420. char *s;
  421. const char **ci;
  422. t = xsymdup(t);
  423. s = next_token();
  424. w.num_weights = cur_num_weights;
  425. for (i=0 ; i < cur_num_weights ; i++) {
  426. w.rule[i] = cur_rule[i];
  427. }
  428. ci = w.colitem + (i-1);
  429. /* now i == cur_num_weights */
  430. while (s && *s && i) {
  431. --i;
  432. if (*s == ';') {
  433. ci[-i] = xsymdup(STR_DITTO);
  434. if (*++s) {
  435. continue;
  436. }
  437. }
  438. if (*s) {
  439. if (!strcmp(s,t)) {
  440. s = STR_DITTO;
  441. }
  442. ci[-i] = xsymdup(s);
  443. }
  444. s = next_token();
  445. if (s) {
  446. if (*s == ';') {
  447. ++s;
  448. } else if (i) {
  449. error_msg("missing seperator");
  450. }
  451. }
  452. }
  453. if (s) {
  454. error_msg("too many weights: %d %d |%s| %d", cur_num_weights, i, s, (int)*s);
  455. }
  456. while (i) { /* missing weights are not an error */
  457. --i;
  458. ci[-i] = xsymdup(STR_DITTO);
  459. }
  460. wi = xmalloc(sizeof(weighted_item_t));
  461. wi->symbol = t;
  462. wi->weight = register_weight(&w);
  463. if ((i = is_ucode(t)) != 0) {
  464. assert(!t[i]);
  465. add_colitem(t, NULL);
  466. }
  467. return wi;
  468. }
  469. static void add_superset_weight(char *t)
  470. {
  471. ll_item_t *lli;
  472. weighted_item_t *wi;
  473. if (!comm_cur_ptr
  474. || (strcmp(t, ((weighted_item_t *)(comm_cur_ptr->data))->symbol) != 0)
  475. ) { /* now out of sync */
  476. if (superset_in_sync) { /* need a new section */
  477. superset_in_sync = 0;
  478. cur_section = new_section("R");
  479. cur_num_weights = cur_section->num_rules
  480. = ((section_t *)(cur_base->section_list->data))->num_rules;
  481. memcpy(cur_rule,
  482. ((section_t *)(cur_base->section_list->data))->rules,
  483. MAX_COLLATION_WEIGHTS);
  484. memcpy(cur_section->rules,
  485. ((section_t *)(cur_base->section_list->data))->rules,
  486. MAX_COLLATION_WEIGHTS);
  487. insque(new_ll_item(DT_REORDER, cur_section), find_ll_last(cur_col->section_list));
  488. assert(comm_prev_ptr);
  489. lli = new_ll_item(DT_REORDER, cur_section);
  490. lli->prev = lli->next = lli;
  491. insque(lli, comm_prev_ptr);
  492. /* fprintf(stderr, " subsection -----------------------\n"); */
  493. }
  494. /* fprintf(stderr, " %s %s\n", t, ((weighted_item_t *)(comm_cur_ptr->data))->symbol); */
  495. wi = add_weight(t);
  496. lli = new_ll_item(DT_WEIGHTED, wi);
  497. mark_reordered(wi->symbol);
  498. /* printf("reorder: %s\n", t); */
  499. if (!cur_section->itm_list) {
  500. cur_section->itm_list = lli;
  501. lli->prev = lli->next = lli;
  502. ++cur_section->num_items;
  503. } else {
  504. insque(lli, cur_section->itm_list->prev);
  505. ++cur_section->num_items;
  506. }
  507. add_wi_index(lli);
  508. } else { /* in sync */
  509. superset_in_sync = 1;
  510. next_comm_ptr();
  511. }
  512. }
  513. static void do_weight(char *t)
  514. {
  515. weighted_item_t *wi;
  516. ll_item_t *lli;
  517. if (superset) {
  518. add_superset_weight(t);
  519. return;
  520. }
  521. switch(order_state) {
  522. case 0:
  523. /* fprintf(stdout, "no-order weight: %s\n", t); */
  524. /* break; */
  525. case IN_ORDER:
  526. /* in a section */
  527. /* fprintf(stdout, "weight: %s\n", t); */
  528. wi = add_weight(t);
  529. lli = new_ll_item(DT_WEIGHTED, wi);
  530. if (!cur_section->itm_list) {
  531. /* fprintf(stdout, "creating new item list: %s %s %p\n", wi->symbol, cur_section->name, lli); */
  532. cur_section->itm_list = lli;
  533. lli->prev = lli->next = lli;
  534. ++cur_section->num_items;
  535. } else {
  536. insque(lli, cur_section->itm_list->prev);
  537. /* fprintf(stdout, "adding item to list: %d - %s %p\n", ll_len(cur_section->itm_list), wi->symbol, lli); */
  538. ++cur_section->num_items;
  539. }
  540. add_wi_index(lli);
  541. break;
  542. case IN_REORDER:
  543. /* std rule - but in a block with an insert-after pt */
  544. wi = add_weight(t);
  545. lli = new_ll_item(DT_WEIGHTED, wi);
  546. mark_reordered(wi->symbol);
  547. /* fprintf(stdout, "reorder: %s %s %p\n", t, cur_section->name, lli); */
  548. if (!cur_section->itm_list) {
  549. cur_section->itm_list = lli;
  550. lli->prev = lli->next = lli;
  551. ++cur_section->num_items;
  552. } else {
  553. insque(lli, cur_section->itm_list->prev);
  554. ++cur_section->num_items;
  555. }
  556. add_wi_index(lli);
  557. break;
  558. case IN_REORDER_SECTIONS:
  559. t = xsymdup(t);
  560. if (next_token() != NULL) {
  561. error_msg("trailing text in reorder section item: %s", pos);
  562. }
  563. lli = cur_col->section_list;
  564. do {
  565. if (lli->data_type & DT_SECTION) {
  566. if (!strcmp(((section_t *)(lli->data))->name, t)) {
  567. lli->data_type = DT_REORDER;
  568. lli = new_ll_item(DT_REORDER, (section_t *)(lli->data));
  569. insque(lli, reorder_section_ptr);
  570. reorder_section_ptr = lli;
  571. return;
  572. }
  573. }
  574. lli = lli->next;
  575. } while (lli);
  576. error_msg("reorder_sections_after for non-base item currently not supported: %s", t);
  577. /* fprintf(stdout, "reorder_secitons: %s\n", t); */
  578. break;
  579. default:
  580. error_msg("invalid order_state %d", order_state);
  581. }
  582. }
  583. static int col_locale_cmp(const void *n1, const void *n2)
  584. {
  585. return strcmp(((const col_locale_t *) n1)->name, ((const col_locale_t *) n2)->name);
  586. }
  587. static void processfile(void)
  588. {
  589. char *t;
  590. const keyword_table_t *k;
  591. order_state = 0;
  592. #warning devel code
  593. /* cur_num_weights = 0; */
  594. /* cur_num_weights = 4; */
  595. /* memset(cur_rule, R_FORWARD, 4); */
  596. if (cur_col != cur_base) {
  597. cur_col->base_locale = cur_base;
  598. cur_col->undefined_idx = cur_base->undefined_idx;
  599. if (!cur_base->derived_list) {
  600. cur_base->derived_list = new_ll_item(DT_COL_LOCALE, cur_col);
  601. } else {
  602. insque(new_ll_item(DT_COL_LOCALE, cur_col), find_ll_last(cur_base->derived_list));
  603. }
  604. }
  605. if (tfind(cur_col, &root_col_locale, col_locale_cmp)) {
  606. error_msg("attempt to read locale: %s", cur_col->name);
  607. }
  608. if (!tsearch(cur_col, &root_col_locale, col_locale_cmp)) {
  609. error_msg("OUT OF MEMORY!");
  610. }
  611. if (superset) {
  612. superset_order_start_cnt = 0;
  613. superset_in_sync = 0;
  614. init_comm_ptr();
  615. }
  616. while (next_line()) {
  617. /* printf("%5d:", lineno[fno]); */
  618. /* while ((t = next_token()) != NULL) { */
  619. /* printf(" |%s|", t); */
  620. /* printf("\n"); */
  621. /* } */
  622. t = next_token();
  623. assert(t);
  624. assert(t == pos);
  625. if ((*t == '<') || (!strcmp(t, "UNDEFINED"))) {
  626. do_weight(t);
  627. } else {
  628. for (k = keyword_table ; k->keyword ; k++) {
  629. if (!strcmp(k->keyword, t)) {
  630. break;
  631. }
  632. }
  633. k->handler();
  634. }
  635. }
  636. if (cur_base == cur_col) {
  637. fprintf(stderr, "Base: %15s", cur_col->name);
  638. } else {
  639. #if 1
  640. if (!cur_col->undefined_idx) {
  641. #if 0
  642. if (superset) {
  643. if (superset_order_start_cnt == 1) {
  644. --superset_order_start_cnt; /* ugh.. hack this */
  645. }
  646. }
  647. #endif
  648. /* This is an awful hack to get around the problem of unspecified UNDEFINED
  649. * definitions in the supported locales derived from iso14651_t1. */
  650. if (!strcmp(cur_base->name, "iso14651_t1")) {
  651. fprintf(stderr, "Warning: adding UNDEFINED entry for %s\n", cur_col->name);
  652. strcpy(linebuf, "script <UNDEFINED_SECTION>\n");
  653. pos_e = NULL;
  654. pos = linebuf;
  655. t = next_token();
  656. assert(t);
  657. assert(t == pos);
  658. do_script();
  659. strcpy(linebuf, "order_start <UNDEFINED_SECTION>;forward;backward;forward;forward,position\n");
  660. pos_e = NULL;
  661. pos = linebuf;
  662. t = next_token();
  663. assert(t);
  664. assert(t == pos);
  665. do_order_start();
  666. strcpy(linebuf, "UNDEFINED IGNORE;IGNORE;IGNORE\n");
  667. pos_e = NULL;
  668. pos = linebuf;
  669. t = next_token();
  670. assert(t);
  671. assert(t == pos);
  672. do_weight(t);
  673. strcpy(linebuf, "order_end\n");
  674. pos_e = NULL;
  675. pos = linebuf;
  676. t = next_token();
  677. assert(t);
  678. assert(t == pos);
  679. do_order_end();
  680. } else {
  681. error_msg("no definition of UNDEFINED for %s", cur_col->name);
  682. }
  683. }
  684. #endif
  685. fprintf(stderr, " Der: %15s", cur_col->name);
  686. }
  687. {
  688. ll_item_t *p = cur_col->section_list;
  689. fprintf(stderr, "%6u weights", tnumnodes(cur_col->root_wi_index));
  690. if (cur_base) {
  691. fprintf(stderr, " %6u der %6u reor %6u starter - %u new stubs",
  692. tnumnodes(cur_base->root_derived_wi),
  693. tnumnodes(cur_base->root_wi_index_reordered),
  694. tnumnodes(cur_base->root_starter_char),
  695. ll_count(cur_col->section_list, DT_REORDER));
  696. }
  697. fprintf(stderr, "\n");
  698. #if 0
  699. while (p) {
  700. assert(((section_t *)(p->data))->num_items ==
  701. ll_len(((section_t *)(p->data))->itm_list));
  702. if (!p->next &&
  703. ((*((section_t *)(p->data))->name == 'a')
  704. && (((section_t *)(p->data))->num_items == 0))
  705. ) {
  706. break;
  707. }
  708. if (!(p->data_type & DT_REORDER)) {
  709. if ((*((section_t *)(p->data))->name != 'a')
  710. || (((section_t *)(p->data))->num_items > 0)
  711. ) {
  712. fprintf(stderr,
  713. /* "\t%-15s %zu\n", */
  714. "\t%-15s %6u\n",
  715. ((section_t *)(p->data))->name,
  716. ((section_t *)(p->data))->num_items);
  717. }
  718. }
  719. p = p->next;
  720. }
  721. #endif
  722. }
  723. }
  724. static void print_colnode(const void *ptr, VISIT order, int level)
  725. {
  726. const colitem_t *p = *(const colitem_t **) ptr;
  727. if (order == postorder || order == leaf) {
  728. printf("collating item = \"%s\"", p->string);
  729. if (p->element) {
  730. printf(" is %s", p->element);
  731. }
  732. printf("\n");
  733. }
  734. }
  735. static void print_weight_node(const void *ptr, VISIT order, int level)
  736. {
  737. const weight_t *p = *(const weight_t **) ptr;
  738. int i;
  739. if (order == postorder || order == leaf) {
  740. printf("weight: (%d) ", p->num_weights);
  741. for (i = 0 ; i < p->num_weights ; i++) {
  742. if (p->rule[i] & R_FORWARD) {
  743. printf("F");
  744. }
  745. if (p->rule[i] & R_BACKWARD) {
  746. printf("B");
  747. }
  748. if (p->rule[i] & R_POSITION) {
  749. printf("P");
  750. }
  751. printf(",");
  752. }
  753. for (i = 0 ; i < p->num_weights ; i++) {
  754. printf(" %s", p->colitem[i]);
  755. }
  756. printf("\n");
  757. }
  758. }
  759. typedef struct {
  760. const char *der_name;
  761. int base_locale;
  762. } deps_t;
  763. enum {
  764. BASE_iso14651_t1,
  765. BASE_comm,
  766. BASE_cs_CZ,
  767. BASE_ar_SA,
  768. BASE_th_TH,
  769. BASE_ja_JP,
  770. BASE_ko_KR,
  771. BASE_MAX
  772. };
  773. static const char *base_name[] = {
  774. "iso14651_t1",
  775. "comm",
  776. "cs_CZ",
  777. "ar_SA",
  778. "th_TH",
  779. "ja_JP",
  780. "ko_KR"
  781. };
  782. static ll_item_t *locale_list[BASE_MAX];
  783. static void init_locale_list(void)
  784. {
  785. int i;
  786. for (i=0 ; i < BASE_MAX ; i++) {
  787. locale_list[i] = (ll_item_t *) xmalloc(sizeof(ll_item_t));
  788. locale_list[i]->prev = locale_list[i]->next = locale_list[i];
  789. locale_list[i]->data = (void *) base_name[i];
  790. }
  791. }
  792. deps_t deps[] = {
  793. { "af_ZA", BASE_iso14651_t1 },
  794. { "am_ET", BASE_iso14651_t1 },
  795. { "ar_AE", BASE_iso14651_t1 },
  796. { "ar_BH", BASE_iso14651_t1 },
  797. { "ar_DZ", BASE_iso14651_t1 },
  798. { "ar_EG", BASE_iso14651_t1 },
  799. { "ar_IN", BASE_iso14651_t1 },
  800. { "ar_IQ", BASE_iso14651_t1 },
  801. { "ar_JO", BASE_iso14651_t1 },
  802. { "ar_KW", BASE_iso14651_t1 },
  803. { "ar_LB", BASE_iso14651_t1 },
  804. { "ar_LY", BASE_iso14651_t1 },
  805. { "ar_MA", BASE_iso14651_t1 },
  806. { "ar_OM", BASE_iso14651_t1 },
  807. { "ar_QA", BASE_iso14651_t1 },
  808. { "ar_SA", BASE_ar_SA },
  809. { "ar_SD", BASE_iso14651_t1 },
  810. { "ar_SY", BASE_iso14651_t1 },
  811. { "ar_TN", BASE_iso14651_t1 },
  812. { "ar_YE", BASE_iso14651_t1 },
  813. { "az_AZ", BASE_iso14651_t1 },
  814. { "be_BY", BASE_iso14651_t1 },
  815. { "bg_BG", BASE_iso14651_t1 },
  816. { "bn_BD", BASE_iso14651_t1 },
  817. { "bn_IN", BASE_iso14651_t1 },
  818. { "br_FR", BASE_iso14651_t1 },
  819. { "bs_BA", BASE_iso14651_t1 },
  820. { "ca_ES", BASE_comm },
  821. { "cs_CZ", BASE_cs_CZ },
  822. { "cy_GB", BASE_iso14651_t1 },
  823. { "da_DK", BASE_comm },
  824. { "de_AT", BASE_iso14651_t1 },
  825. { "de_BE", BASE_iso14651_t1 },
  826. { "de_CH", BASE_iso14651_t1 },
  827. { "de_DE", BASE_iso14651_t1 },
  828. { "de_LU", BASE_iso14651_t1 },
  829. { "el_GR", BASE_iso14651_t1 },
  830. { "en_AU", BASE_iso14651_t1 },
  831. { "en_BW", BASE_iso14651_t1 },
  832. { "en_CA", BASE_comm },
  833. { "en_DK", BASE_iso14651_t1 },
  834. { "en_GB", BASE_iso14651_t1 },
  835. { "en_HK", BASE_iso14651_t1 },
  836. { "en_IE", BASE_iso14651_t1 },
  837. { "en_IN", BASE_iso14651_t1 },
  838. { "en_NZ", BASE_iso14651_t1 },
  839. { "en_PH", BASE_iso14651_t1 },
  840. { "en_SG", BASE_iso14651_t1 },
  841. { "en_US", BASE_iso14651_t1 },
  842. { "en_ZA", BASE_iso14651_t1 },
  843. { "en_ZW", BASE_iso14651_t1 },
  844. { "eo_EO", BASE_iso14651_t1 },
  845. { "es_AR", BASE_comm },
  846. { "es_BO", BASE_comm },
  847. { "es_CL", BASE_comm },
  848. { "es_CO", BASE_comm },
  849. { "es_CR", BASE_comm },
  850. { "es_DO", BASE_comm },
  851. { "es_EC", BASE_comm },
  852. { "es_ES", BASE_comm },
  853. { "es_GT", BASE_comm },
  854. { "es_HN", BASE_comm },
  855. { "es_MX", BASE_comm },
  856. { "es_NI", BASE_comm },
  857. { "es_PA", BASE_comm },
  858. { "es_PE", BASE_comm },
  859. { "es_PR", BASE_comm },
  860. { "es_PY", BASE_comm },
  861. { "es_SV", BASE_comm },
  862. { "es_US", BASE_comm },
  863. { "es_UY", BASE_comm },
  864. { "es_VE", BASE_comm },
  865. { "et_EE", BASE_comm },
  866. { "eu_ES", BASE_iso14651_t1 },
  867. { "fa_IR", BASE_iso14651_t1 },
  868. { "fi_FI", BASE_comm },
  869. { "fo_FO", BASE_comm },
  870. { "fr_BE", BASE_iso14651_t1 },
  871. { "fr_CA", BASE_comm },
  872. { "fr_CH", BASE_iso14651_t1 },
  873. { "fr_FR", BASE_iso14651_t1 },
  874. { "fr_LU", BASE_iso14651_t1 },
  875. { "ga_IE", BASE_iso14651_t1 },
  876. { "gd_GB", BASE_iso14651_t1 },
  877. { "gl_ES", BASE_comm },
  878. { "gv_GB", BASE_iso14651_t1 },
  879. { "he_IL", BASE_iso14651_t1 },
  880. { "hi_IN", BASE_iso14651_t1 },
  881. { "hr_HR", BASE_comm },
  882. { "hu_HU", BASE_iso14651_t1 },
  883. { "hy_AM", BASE_iso14651_t1 },
  884. { "id_ID", BASE_iso14651_t1 },
  885. { "is_IS", BASE_comm },
  886. { "it_CH", BASE_iso14651_t1 },
  887. { "it_IT", BASE_iso14651_t1 },
  888. { "iw_IL", BASE_iso14651_t1 },
  889. { "ja_JP", BASE_ja_JP },
  890. { "ka_GE", BASE_iso14651_t1 },
  891. { "kl_GL", BASE_comm },
  892. { "ko_KR", BASE_ko_KR },
  893. { "kw_GB", BASE_iso14651_t1 },
  894. { "lt_LT", BASE_comm },
  895. { "lv_LV", BASE_comm },
  896. { "mi_NZ", BASE_iso14651_t1 },
  897. { "mk_MK", BASE_iso14651_t1 },
  898. { "mr_IN", BASE_iso14651_t1 },
  899. { "ms_MY", BASE_iso14651_t1 },
  900. { "mt_MT", BASE_iso14651_t1 },
  901. { "nl_BE", BASE_iso14651_t1 },
  902. { "nl_NL", BASE_iso14651_t1 },
  903. { "nn_NO", BASE_iso14651_t1 },
  904. { "no_NO", BASE_comm },
  905. { "oc_FR", BASE_iso14651_t1 },
  906. { "pl_PL", BASE_comm },
  907. { "pt_BR", BASE_iso14651_t1 },
  908. { "pt_PT", BASE_iso14651_t1 },
  909. { "ro_RO", BASE_iso14651_t1 },
  910. { "ru_RU", BASE_iso14651_t1 },
  911. { "ru_UA", BASE_iso14651_t1 },
  912. { "se_NO", BASE_iso14651_t1 },
  913. { "sk_SK", BASE_cs_CZ },
  914. { "sl_SI", BASE_comm },
  915. { "sq_AL", BASE_iso14651_t1 },
  916. { "sr_YU", BASE_iso14651_t1 },
  917. { "sv_FI", BASE_comm },
  918. { "sv_SE", BASE_iso14651_t1 },
  919. { "ta_IN", BASE_iso14651_t1 },
  920. { "te_IN", BASE_iso14651_t1 },
  921. { "tg_TJ", BASE_iso14651_t1 },
  922. { "th_TH", BASE_th_TH },
  923. { "ti_ER", BASE_iso14651_t1 },
  924. { "ti_ET", BASE_iso14651_t1 },
  925. { "tl_PH", BASE_iso14651_t1 },
  926. { "tr_TR", BASE_comm },
  927. { "tt_RU", BASE_iso14651_t1 },
  928. { "uk_UA", BASE_iso14651_t1 },
  929. { "ur_PK", BASE_iso14651_t1 },
  930. { "uz_UZ", BASE_iso14651_t1 },
  931. { "vi_VN", BASE_iso14651_t1 },
  932. { "wa_BE", BASE_iso14651_t1 },
  933. { "yi_US", BASE_iso14651_t1 },
  934. { "zh_CN", BASE_iso14651_t1 },
  935. { "zh_HK", BASE_iso14651_t1 },
  936. { "zh_SG", BASE_iso14651_t1 },
  937. { "zh_TW", BASE_iso14651_t1 },
  938. };
  939. static int der_count[BASE_MAX];
  940. static const char *new_args[500];
  941. static int new_arg_count;
  942. static int dep_cmp(const void *s1, const void *s2)
  943. {
  944. return strcmp( (const char *) s1, ((const deps_t *) s2)->der_name);
  945. }
  946. static int old_main(int argc, char **argv);
  947. int main(int argc, char **argv)
  948. {
  949. const deps_t *p;
  950. ll_item_t *lli;
  951. int i;
  952. int total;
  953. if (argc < 2) {
  954. return EXIT_FAILURE;
  955. }
  956. init_locale_list();
  957. while (--argc) {
  958. p = (const deps_t *) bsearch(*++argv, deps, sizeof(deps)/sizeof(deps[0]), sizeof(deps[0]), dep_cmp);
  959. if (!p) {
  960. if (!strcmp("C", *argv)) {
  961. printf("ignoring C locale\n");
  962. continue;
  963. } else {
  964. printf("%s not found\n", *argv);
  965. return EXIT_FAILURE;
  966. }
  967. }
  968. i = p->base_locale;
  969. ++der_count[i];
  970. if (!strcmp(base_name[i], *argv)) {
  971. /* same name as base, so skip after count incremented */
  972. continue;
  973. }
  974. /* add it to the list. the main body will catch duplicates */
  975. lli = (ll_item_t *) xmalloc(sizeof(ll_item_t));
  976. lli->prev = lli->next = NULL;
  977. lli->data = (void *) *argv;
  978. insque(lli, locale_list[i]);
  979. }
  980. total = 0;
  981. for (i=0 ; i < BASE_MAX ; i++) {
  982. /* printf("der_count[%2d] = %3d\n", i, der_count[i]); */
  983. total += der_count[i];
  984. }
  985. /* printf("total = %d\n", total); */
  986. new_args[new_arg_count++] = "dummyprogramname";
  987. for (i=0 ; i < BASE_MAX ; i++) {
  988. if (!der_count[i]) {
  989. continue;
  990. }
  991. new_args[new_arg_count++] = (i == BASE_comm) ? "-c" : "-b";
  992. lli = locale_list[i];
  993. do {
  994. new_args[new_arg_count++] = (const char *) (lli->data);
  995. lli = lli->next;
  996. } while (lli != locale_list[i]);
  997. new_args[new_arg_count++] = "-f";
  998. }
  999. /* for (i=0 ; i < new_arg_count ; i++) { */
  1000. /* printf("%3d: %s\n", i, new_args[i]); */
  1001. /* } */
  1002. return old_main(new_arg_count, (char **) new_args);
  1003. }
  1004. /* usage... prog -b basefile derived {derived} -s single {single} */
  1005. static int old_main(int argc, char **argv)
  1006. {
  1007. int next_is_base = 0;
  1008. int next_is_subset = 0;
  1009. superset = 0;
  1010. while (--argc) {
  1011. ++argv;
  1012. if (**argv == '-') {
  1013. if ((*argv)[1] == 'd') {
  1014. dump_weights((*argv) + 2);
  1015. } else if ((*argv)[1] == 'f') { /* dump all weight rules */
  1016. finalize_base();
  1017. } else if ((*argv)[1] == 'R') { /* dump all weight rules */
  1018. twalk(root_weight, print_weight_node);
  1019. } else if (((*argv)[1] == 'c') && !(*argv)[2]) { /* new common subset */
  1020. cur_base = cur_derived = NULL;
  1021. next_is_subset = 1;
  1022. next_is_base = 1;
  1023. superset = 0;
  1024. } else if (((*argv)[1] == 'b') && !(*argv)[2]) { /* new base locale */
  1025. cur_base = cur_derived = NULL;
  1026. next_is_subset = 0;
  1027. next_is_base = 1;
  1028. superset = 0;
  1029. } else if (((*argv)[1] == 's') && !(*argv)[2]) { /* single locales follow */
  1030. cur_base = cur_derived = NULL;
  1031. next_is_subset = 0;
  1032. next_is_base = 2;
  1033. superset = 0;
  1034. } else {
  1035. error_msg("unrecognized option %s", *argv);
  1036. }
  1037. continue;
  1038. }
  1039. /* new file */
  1040. new_col_locale(*argv); /* automaticly sets cur_col */
  1041. if (next_is_base) {
  1042. cur_base = cur_col;
  1043. } else {
  1044. cur_derived = cur_col;
  1045. }
  1046. pushfile(*argv);
  1047. /* fprintf(stderr, "processing file %s\n", *argv); */
  1048. processfile(); /* this does a popfile */
  1049. /* twalk(cur_col->root_colitem, print_colnode); */
  1050. if (next_is_base == 1) {
  1051. next_is_base = 0;
  1052. }
  1053. if (next_is_subset) {
  1054. next_is_subset = 0;
  1055. superset = 1;
  1056. }
  1057. }
  1058. fprintf(stderr, "success!\n");
  1059. fprintf(stderr,
  1060. /* "num_sym=%zu mem_sym=%zu unique_weights=%zu\n", */
  1061. "num_sym=%u mem_sym=%u unique_weights=%u\n",
  1062. num_sym, mem_sym, unique_weights);
  1063. /* twalk(root_weight, print_weight_node); */
  1064. fprintf(stderr, "num base locales = %d num derived locales = %d\n",
  1065. base_locale_len, der_locale_len);
  1066. fprintf(stderr,
  1067. "override_len = %d multistart_len = %d weightstr_len = %d\n"
  1068. "wcs2colidt_len = %d index2weight_len = %d index2ruleidx_len = %d\n"
  1069. "ruletable_len = %d\n"
  1070. "total size is %d bytes or %d kB\n",
  1071. override_len, multistart_len, weightstr_len,
  1072. wcs2colidt_len, index2weight_len, index2ruleidx_len,
  1073. ruletable_len,
  1074. #warning mult by 2 for rule indecies
  1075. (override_len + multistart_len + weightstr_len
  1076. + wcs2colidt_len + index2weight_len + index2ruleidx_len + ruletable_len) * 2,
  1077. (override_len + multistart_len + weightstr_len
  1078. + wcs2colidt_len + index2weight_len + index2ruleidx_len + ruletable_len + 511) / 512);
  1079. #if 0
  1080. {
  1081. int i;
  1082. for (i=0 ; i < base_locale_len ; i++) {
  1083. dump_base_locale(i);
  1084. }
  1085. for (i=0 ; i < der_locale_len ; i++) {
  1086. dump_der_locale(i);
  1087. }
  1088. }
  1089. #endif
  1090. {
  1091. FILE *fp = fopen("locale_collate.h", "w");
  1092. if (!fp) {
  1093. error_msg("couldn't open output file!");
  1094. }
  1095. dump_collate(fp);
  1096. if (ferror(fp) || fclose(fp)) {
  1097. error_msg("write error or close error for output file!\n");
  1098. }
  1099. }
  1100. return EXIT_SUCCESS;
  1101. }
  1102. static void error_msg(const char *fmt, ...)
  1103. {
  1104. va_list arg;
  1105. fprintf(stderr, "Error: ");
  1106. if (fno >= 0) {
  1107. fprintf(stderr, "file %s (%d): ", fname[fno], lineno[fno]);
  1108. }
  1109. va_start(arg, fmt);
  1110. vfprintf(stderr, fmt, arg);
  1111. va_end(arg);
  1112. fprintf(stderr, "\n");
  1113. exit(EXIT_FAILURE);
  1114. }
  1115. static void pushfile(char *filename)
  1116. {
  1117. static char fbuf[PATH_MAX];
  1118. snprintf(fbuf, PATH_MAX, "collation/%s", filename);
  1119. if (fno >= MAX_FNO) {
  1120. error_msg("file stack size exceeded");
  1121. }
  1122. if (!(fstack[++fno] = fopen(fbuf, "r"))) {
  1123. --fno; /* oops */
  1124. error_msg("cannot open file %s", fbuf);
  1125. }
  1126. fname[fno] = xsymdup(filename);
  1127. lineno[fno] = 0;
  1128. }
  1129. static void popfile(void)
  1130. {
  1131. if (fno < 0) {
  1132. error_msg("pop on empty file stack");
  1133. }
  1134. /* free(fname[fno]); */
  1135. fclose(fstack[fno]);
  1136. --fno;
  1137. }
  1138. static void eatwhitespace(void)
  1139. {
  1140. while (isspace(*pos)) {
  1141. ++pos;
  1142. }
  1143. }
  /* Return 1 if c starts a comment ('#' or '%'), 0 otherwise. */
  static int iscommentchar(int c)
  {
      switch (c) {
      case '#':
      case '%':
          return 1;
      default:
          return 0;
      }
  }
  1148. static int next_line(void)
  1149. {
  1150. size_t n;
  1151. char *s = linebuf;
  1152. assert(fno >= 0);
  1153. pos_e = NULL;
  1154. do {
  1155. if (fgets(s, sizeof(linebuf), fstack[fno]) != NULL) {
  1156. ++lineno[fno];
  1157. n = strlen(linebuf);
  1158. if ((n == sizeof(linebuf) - 1) && (linebuf[n-1] != '\n')) {
  1159. /* Either line is too long or last line is very long with
  1160. * no trailing newline. But we'll always treat it as an
  1161. * errro. */
  1162. error_msg("line too long?");
  1163. }
  1164. --n;
  1165. /* Be careful... last line doesn't need a newline. */
  1166. if (linebuf[n] == '\n') {
  1167. linebuf[n--] = 0; /* trim trailing newline */
  1168. }
  1169. pos = linebuf;
  1170. eatwhitespace();
  1171. if (*pos && !iscommentchar(*pos)) { /* not empty or comment line */
  1172. return 1; /* got a line */
  1173. }
  1174. } else { /* eof */
  1175. popfile();
  1176. }
  1177. } while (fno >= 0);
  1178. return 0;
  1179. }
  1180. static char *next_token(void)
  1181. {
  1182. char *p;
  1183. #if 0
  1184. if (pos_e == NULL) {
  1185. return NULL
  1186. pos = pos_e;
  1187. *pos = end_of_token;
  1188. end_of_token = 0;
  1189. }
  1190. #else
  1191. if (pos_e != NULL) {
  1192. pos = pos_e;
  1193. *pos = end_of_token;
  1194. end_of_token = 0;
  1195. }
  1196. #endif
  1197. eatwhitespace();
  1198. p = pos;
  1199. if (!*p || iscommentchar(*p)) { /* end of line or start of comment */
  1200. pos = pos_e = NULL;
  1201. *p = 0; /* treat comment as end of line */
  1202. /* fprintf(stdout, "returning NUL token |%s|\n", pos); */
  1203. return NULL;
  1204. #if 1
  1205. } else if (*p == '<') { /* collating symbol, element, or value */
  1206. while (*++p) {
  1207. if ((*p == '/') && p[1]) {
  1208. ++p;
  1209. continue;
  1210. }
  1211. if (*p == '>') {
  1212. pos_e = ++p;
  1213. end_of_token = *p;
  1214. *p = 0;
  1215. /* fprintf(stdout, "returning col token |%s|\n", pos); */
  1216. return pos;
  1217. }
  1218. }
  1219. } else if (*p == '"') { /* collating element value? */
  1220. while (*++p) {
  1221. if (*p == '"') { /* found the end of the quoted string */
  1222. pos_e = ++p;
  1223. end_of_token = *p;
  1224. *p = 0;
  1225. /* fprintf(stdout, "returning quote token |%s|\n", pos); */
  1226. return pos;
  1227. }
  1228. }
  1229. #endif
  1230. } else { /* some kind of keyword */
  1231. while (*++p) {
  1232. if (isspace(*p) || (*p == ';')) {
  1233. break;
  1234. }
  1235. }
  1236. pos_e = p;
  1237. end_of_token = *p;
  1238. *p = 0;
  1239. /* fprintf(stdout, "returning key token |%s|\n", pos); */
  1240. return pos;
  1241. }
  1242. error_msg("illegal token |%s|", pos);
  1243. }
  /* malloc() wrapper that never returns NULL: when malloc() yields NULL the
   * program is terminated through error_msg(). */
  static void *xmalloc(size_t n)
  {
      void *mem = malloc(n);

      if (mem == NULL) {
          error_msg("OUT OF MEMORY");
      }
      return mem;
  }
  1252. static void do_copy(void)
  1253. {
  1254. char *s;
  1255. char *e;
  1256. if ((s = next_token()) != NULL) {
  1257. e = strchr(s + 1, '"');
  1258. if ((*s == '"') && e && (*e == '"') && !e[1]) {
  1259. if (next_token() != NULL) {
  1260. error_msg("illegal trailing text: %s", pos);
  1261. }
  1262. *e = 0;
  1263. ++s;
  1264. if (cur_base && !strcmp(cur_base->name,s)) {
  1265. /* fprintf(stderr, "skipping copy of base file %s\n", s); */
  1266. #warning need to update last in order and position or check
  1267. return;
  1268. }
  1269. /* fprintf(stderr, "full copy of %s\n", s); */
  1270. pushfile(s);
  1271. return;
  1272. }
  1273. }
  1274. error_msg("illegal or missing arg for copy: %s", s);
  1275. }
  1276. static void do_colsym(void)
  1277. {
  1278. char *s;
  1279. char *e;
  1280. if ((s = next_token()) != NULL) {
  1281. e = strrchr(s,'>');
  1282. if ((*s == '<') && e && (*e == '>') && !e[1]) {
  1283. if (next_token() != NULL) {
  1284. error_msg("illegal trailing text: %s", pos);
  1285. }
  1286. e[1] = 0; /* cleanup in case next_token stored something */
  1287. add_colitem(s,NULL);
  1288. return;
  1289. }
  1290. }
  1291. error_msg("illegal or missing arg for collating-symbol: %s", s);
  1292. }
  1293. static void do_colele(void)
  1294. {
  1295. char *s;
  1296. char *e;
  1297. char *s1;
  1298. char *e1;
  1299. int n;
  1300. if ((s = next_token()) != NULL) {
  1301. e = strrchr(s,'>');
  1302. if ((*s == '<') && e && (*e == '>') && !e[1]) {
  1303. if (((s1 = next_token()) == NULL)
  1304. || (strcmp(s1,"from") != 0)
  1305. || ((s1 = next_token()) == NULL)
  1306. || (*s1 != '\"')
  1307. ) {
  1308. error_msg("illegal format for collating-element spec");
  1309. }
  1310. e1 = strchr(s1 + 1, '"');
  1311. if ((*s1 != '"') || !e1 || (*e1 != '"') || (e1[1] != 0)) {
  1312. error_msg("illegal definition for collating-element: %s", s1);
  1313. }
  1314. if (next_token() != NULL) {
  1315. error_msg("illegal trailing text: %s", pos);
  1316. }
  1317. e[1] = 0; /* cleanup in case next_token stored something */
  1318. e1[1] = 0;
  1319. add_colitem(s,s1);
  1320. ++s1;
  1321. if (!(n = is_ucode(s1))) {
  1322. error_msg("starting char must be a <U####> code: %s", s1);
  1323. }
  1324. assert(s1[n] == '<');
  1325. s1[n] = 0;
  1326. s = xsymdup(s1);
  1327. if (!(tsearch(s, &cur_base->root_starter_char, sym_cmp))) {
  1328. error_msg("OUT OF MEMORY");
  1329. }
  1330. return;
  1331. }
  1332. }
  1333. error_msg("illegal or missing arg for collating-element: %s", s);
  1334. }
  1335. static ll_item_t *find_section_list_item(const char *name, col_locale_t *loc)
  1336. {
  1337. ll_item_t *p;
  1338. if (!loc) {
  1339. return NULL;
  1340. }
  1341. p = loc->section_list;
  1342. while (p) {
  1343. #warning devel code
  1344. /* if (!((p->data_type == DT_SECTION) || (p->data_type == DT_REORDER))) { */
  1345. /* fprintf(stderr, "fsli = %d\n", p->data_type); */
  1346. /* } */
  1347. assert((p->data_type == DT_SECTION) || (p->data_type == DT_REORDER));
  1348. if (!strcmp(name, ((section_t *)(p->data))->name)) {
  1349. break;
  1350. }
  1351. p = p->next;
  1352. }
  1353. return p;
  1354. }
  1355. static ll_item_t *find_ll_last(ll_item_t *p)
  1356. {
  1357. assert(p);
  1358. while (p->next) {
  1359. p = p->next;
  1360. }
  1361. return p;
  1362. }
  1363. static void do_script(void)
  1364. {
  1365. char *s;
  1366. char *e;
  1367. if ((s = next_token()) != NULL) {
  1368. e = strrchr(s,'>');
  1369. if ((*s == '<') && e && (*e == '>') && !e[1]) {
  1370. if (next_token() != NULL) {
  1371. error_msg("illegal trailing text: %s", pos);
  1372. }
  1373. e[1] = 0; /* cleanup in case next_token stored something */
  1374. add_script(s);
  1375. return;
  1376. }
  1377. }
  1378. error_msg("illegal or missing arg for script: %s", s);
  1379. }
  1380. static col_locale_t *new_col_locale(char *name)
  1381. {
  1382. ll_item_t *lli;
  1383. ll_item_t *lli2;
  1384. cur_col = (col_locale_t *) xmalloc(sizeof(col_locale_t));
  1385. cur_col->name = name;
  1386. cur_col->root_colitem = NULL;
  1387. cur_col->root_element = NULL;
  1388. cur_col->root_scripts = NULL;
  1389. cur_col->base_locale = NULL;
  1390. if (!superset) {
  1391. /* start with an anonymous section */
  1392. cur_section = new_section(NULL);
  1393. cur_col->section_list = new_ll_item(DT_SECTION, cur_section);
  1394. } else {
  1395. /* start with a reorder section */
  1396. cur_section = new_section("R");
  1397. cur_num_weights = cur_section->num_rules
  1398. = ((section_t *)(cur_base->section_list->data))->num_rules;
  1399. memcpy(cur_rule,
  1400. ((section_t *)(cur_base->section_list->data))->rules,
  1401. MAX_COLLATION_WEIGHTS);
  1402. memcpy(cur_section->rules,
  1403. ((section_t *)(cur_base->section_list->data))->rules,
  1404. MAX_COLLATION_WEIGHTS);
  1405. cur_col->section_list = new_ll_item(DT_REORDER, cur_section);
  1406. assert(cur_base->section_list->next == NULL); /* currently only one section allowed */
  1407. lli = ((section_t *)(cur_base->section_list->data))->itm_list;
  1408. assert(lli);
  1409. lli2 = new_ll_item(DT_REORDER, cur_section);
  1410. lli2->prev = lli2->next = lli2;
  1411. insque(lli2, lli->prev);
  1412. ((section_t *)(cur_base->section_list->data))->itm_list = lli2;
  1413. }
  1414. /* cur_col->section_list = NULL; */
  1415. /* add_script(((section_t *)(cur_col->section_list->data))->name); */
  1416. cur_col->root_wi_index = NULL;
  1417. cur_col->root_wi_index_reordered = NULL;
  1418. cur_col->root_derived_wi = NULL;
  1419. cur_col->derived_list = NULL;
  1420. cur_col->root_starter_char = NULL;
  1421. cur_col->root_starter_all = NULL;
  1422. cur_col->undefined_idx = NULL;
  1423. return cur_col;
  1424. }
  1425. static int colitem_cmp(const void *n1, const void *n2)
  1426. {
  1427. return strcmp(((colitem_t *)n1)->string, ((colitem_t *)n2)->string);
  1428. }
  1429. static int colelement_cmp(const void *n1, const void *n2)
  1430. {
  1431. int r;
  1432. r = strcmp(((colitem_t *)n1)->string, ((colitem_t *)n2)->string);
  1433. if (!r) {
  1434. if (((colitem_t *)n1)->element && ((colitem_t *)n2)->element) {
  1435. r = strcmp(((colitem_t *)n1)->element, ((colitem_t *)n2)->element);
  1436. } else if (((colitem_t *)n1)->element == ((colitem_t *)n2)->element) {
  1437. r = 0; /* both null */
  1438. } else {
  1439. r = (((colitem_t *)n1)->element == NULL) ? -1 : 1;
  1440. }
  1441. }
  1442. return r;
  1443. }
  /* Release a colitem_t.  Only the structure itself is freed; the element
   * and string members are left alone (new_colitem() obtains them from
   * xsymdup(), and the frees below were already disabled). */
  static void del_colitem(colitem_t *p)
  {
  /* free((void *) p->element); */
  /* free((void *) p->string); */
  free(p);
  }
  1450. static colitem_t *new_colitem(char *item, char *def)
  1451. {
  1452. colitem_t *p;
  1453. p = xmalloc(sizeof(colitem_t));
  1454. p->string = xsymdup(item);
  1455. p->element = (!def) ? def : xsymdup(def);
  1456. return p;
  1457. }
  1458. static void add_colitem(char *item, char *def)
  1459. {
  1460. colitem_t *p;
  1461. #if 0
  1462. printf("adding collation item %s", item);
  1463. if (def) {
  1464. printf(" with definition %s", def);
  1465. }
  1466. printf("\n");
  1467. #endif
  1468. p = new_colitem(item, def);
  1469. #warning devel code
  1470. if (superset) {
  1471. if (tfind(p, &cur_base->root_colitem, colitem_cmp)) {
  1472. /* fprintf(stderr, "skipping superset duplicate collating item \"%s\"\n", p->string); */
  1473. del_colitem(p);
  1474. return;
  1475. /* } else { */
  1476. /* fprintf(stderr, "superset: new collating item \"%s\" = %s\n", p->string, p->element); */
  1477. }
  1478. }
  1479. if (cur_col == cur_derived) {
  1480. if (!tfind(p, &cur_base->root_colitem, colitem_cmp)) {
  1481. /* not in current but could be in base */
  1482. if (!tsearch(p, &cur_base->root_colitem, colitem_cmp)) {
  1483. error_msg("OUT OF MEMORY!");
  1484. }
  1485. } else if (!tfind(p, &cur_base->root_colitem, colelement_cmp)) {
  1486. error_msg("collating element/symbol mismatch: item=%s def=%s", item, def);
  1487. }
  1488. }
  1489. if (!tfind(p, &cur_col->root_colitem, colitem_cmp)) {
  1490. /* not in current but could be in base */
  1491. if (!tsearch(p, &cur_col->root_colitem, colitem_cmp)) {
  1492. error_msg("OUT OF MEMORY!");
  1493. }
  1494. } else if (!tfind(p, &cur_col->root_colitem, colelement_cmp)) {
  1495. error_msg("collating element/symbol mismatch");
  1496. } else { /* already there */
  1497. fprintf(stderr, "duplicate collating item \"%s\"\n", p->string);
  1498. del_colitem(p);
  1499. }
  1500. }
  1501. /* add a script (section) to the current locale */
  1502. static void add_script(const char *s)
  1503. {
  1504. ll_item_t *l;
  1505. /* make sure it isn't in base if working with derived */
  1506. if (cur_base != cur_col) {
  1507. if (find_section_list_item(s, cur_base)) {
  1508. error_msg("attempt to add script %s for derived when already in base", s);
  1509. }
  1510. }
  1511. if (find_section_list_item(s, cur_col)) {
  1512. error_msg("attempt to readd script %s", s);
  1513. }
  1514. l = find_ll_last(cur_col->section_list);
  1515. insque(new_ll_item(DT_SECTION, new_section(s)), l);
  1516. }
  /* Keywords accepted in the rule list of an order_start directive. */
  static const char str_forward[] = "forward";
  static const char str_backward[] = "backward";
  static const char str_position[] = "position";
  1520. static void do_order_start(void)
  1521. {
  1522. const char *s;
  1523. char *e;
  1524. ll_item_t *l;
  1525. section_t *sect;
  1526. int rule;
  1527. if (order_state & ~IN_ORDER) {
  1528. error_msg("order_start following reorder{_sections}_after");
  1529. }
  1530. order_state |= IN_ORDER;
  1531. if (superset) {
  1532. if (++superset_order_start_cnt > 1) {
  1533. error_msg("currently only a common order_start is supported in superset");
  1534. }
  1535. return;
  1536. }
  1537. if (!(s = next_token())) {
  1538. s = str_forward; /* if no args */
  1539. }
  1540. if (*s == '<') { /* section (script) */
  1541. e = strrchr(s,'>');
  1542. if ((*s == '<') && e && (*e == '>') && !e[1]) {
  1543. e[1] = 0; /* cleanup in case next_token stored something */
  1544. if (!(l = find_section_list_item(s, cur_col))) {
  1545. error_msg("ref of undefined sections: %s", s);
  1546. }
  1547. sect = (section_t *)(l->data);
  1548. if (sect->num_rules) {
  1549. error_msg("sections already defined: %s", s);
  1550. }
  1551. } else {
  1552. error_msg("illegal section ref: %s", s);
  1553. }
  1554. if (!(s = next_token())) {
  1555. s = str_forward; /* if no args */
  1556. } else if (*s != ';') {
  1557. error_msg("missing seperator!");
  1558. }
  1559. } else { /* need an anonymous section */
  1560. if ((*cur_section->name != '<') && (cur_section->num_items == 0)) { /* already in an empty anonymous section */
  1561. sect = cur_section;
  1562. /* fprintf(stdout, "using empty anon section %s\n", sect->name); */
  1563. } else {
  1564. sect = new_section(NULL);
  1565. l = find_ll_last(cur_col->section_list);
  1566. insque(new_ll_item(DT_SECTION, sect), l);
  1567. /* fprintf(stdout, "adding order section after section %s\n", ((section_t *)(l->data))->name); */
  1568. /* fprintf(stdout, " last section is %s\n", ((section_t *)(l->next->data))->name); */
  1569. }
  1570. sect->num_rules = 0; /* setting this below so nix default */
  1571. }
  1572. cur_section = sect;
  1573. /* fprintf(stdout, "cur_section now %s\n", cur_section->name); */
  1574. #warning need to add section to weight list?
  1575. /* now do rules */
  1576. do {
  1577. rule = 0;
  1578. if (*s == ';') {
  1579. ++s;
  1580. }
  1581. while (*s) {
  1582. if (!strncmp(str_forward, s, 7)) {
  1583. rule |= R_FORWARD;
  1584. s += 7;
  1585. } else if (!strncmp(str_backward, s, 8)) {
  1586. rule |= R_BACKWARD;
  1587. s += 8;
  1588. } else if (!strncmp(str_position, s, 8)) {
  1589. rule |= R_POSITION;
  1590. s += 8;
  1591. }
  1592. if (*s == ',') {
  1593. ++s;
  1594. continue;
  1595. }
  1596. if (!*s || (*s == ';')) {
  1597. if (sect->num_rules >= MAX_COLLATION_WEIGHTS) {
  1598. error_msg("more than %d weight rules!", MAX_COLLATION_WEIGHTS);
  1599. }
  1600. if (!rule) {
  1601. error_msg("missing weight rule!");
  1602. }
  1603. if ((rule & (R_FORWARD|R_BACKWARD|R_POSITION)) > R_BACKWARD) {
  1604. error_msg("backward paired with forward and/or position!");
  1605. }
  1606. sect->rules[sect->num_rules++] = rule;
  1607. rule = 0;
  1608. continue;
  1609. }
  1610. error_msg("illegal weight rule: %s", s);
  1611. }
  1612. } while ((s = next_token()) != NULL);
  1613. cur_section = sect;
  1614. /* fprintf(stderr, "setting cur_num_weights to %d for %s\n", sect->num_rules, sect->name); */
  1615. cur_num_weights = sect->num_rules;
  1616. memcpy(cur_rule, sect->rules, MAX_COLLATION_WEIGHTS);
  1617. }
  1618. static void do_order_end(void)
  1619. {
  1620. if (!(order_state & IN_ORDER)) {
  1621. error_msg("order_end with no matching order_start");
  1622. }
  1623. order_state &= ~IN_ORDER;
  1624. cur_section = new_section(NULL);
  1625. }
  1626. static void do_reorder_after(void)
  1627. {
  1628. char *t;
  1629. ll_item_t *lli;
  1630. const weight_t *w;
  1631. int save_cur_num_weights;
  1632. char save_cur_rule[MAX_COLLATION_WEIGHTS];
  1633. if (order_state & ~IN_REORDER) {
  1634. error_msg("reorder_after following order_start or reorder_sections_after");
  1635. }
  1636. order_state |= IN_REORDER;
  1637. if (superset) {
  1638. error_msg("currently reorder_after is not supported in supersets");
  1639. }
  1640. #warning have to use rule for current section!!!
  1641. if (!(t = next_token())) {
  1642. error_msg("missing arg for reorder_after");
  1643. }
  1644. t = xsymdup(t);
  1645. if (next_token() != NULL) {
  1646. error_msg("trailing text reorder_after: %s", pos);
  1647. }
  1648. if (cur_col == cur_base) {
  1649. error_msg("sorry.. reorder_after in base locale is not currently supported");
  1650. }
  1651. if (!(lli = find_wi_index(t, cur_base))) {
  1652. error_msg("reorder_after for non-base item currently not supported: %s", t);
  1653. }
  1654. w = ((weighted_item_t *)(lli->data))->weight;
  1655. save_cur_num_weights = cur_num_weights;
  1656. memcpy(save_cur_rule, cur_rule, MAX_COLLATION_WEIGHTS);
  1657. cur_section = new_section("R");
  1658. insque(new_ll_item(DT_REORDER, cur_section), lli);
  1659. #if 0
  1660. {
  1661. ll_item_t *l1;
  1662. ll_item_t *l2;
  1663. ll_item_t *l3;
  1664. l1 = new_ll_item(DT_REORDER, cur_section);
  1665. l2 = find_ll_last(cur_col->section_list);
  1666. insque(l1, l2);
  1667. l3 = find_ll_last(cur_col->section_list);
  1668. fprintf(stderr, "reorder_after %p %p %p %s\n", l1, l2, l3, cur_section->name);
  1669. }
  1670. #else
  1671. insque(new_ll_item(DT_REORDER, cur_section), find_ll_last(cur_col->section_list));
  1672. #endif
  1673. cur_num_weights = cur_section->num_rules = save_cur_num_weights;
  1674. memcpy(cur_rule, save_cur_rule, MAX_COLLATION_WEIGHTS);
  1675. memcpy(cur_section->rules, save_cur_rule, MAX_COLLATION_WEIGHTS);
  1676. #warning devel code
  1677. /* fprintf(stderr, "reorder -- %s %d\n", ((weighted_item_t *)(lli->data))->symbol, w->num_weights); */
  1678. #warning hack to get around hu_HU reorder-after problem
  1679. /* if (!w->num_weights) { */
  1680. /* } else { */
  1681. /* cur_num_weights = w->num_weights; */
  1682. /* memcpy(cur_rule, w->rule, MAX_COLLATION_WEIGHTS); */
  1683. /* } */
  1684. /* fprintf(stderr, "reorder_after succeeded for %s\n", t); */
  1685. }
  1686. static void do_reorder_end(void)
  1687. {
  1688. if (!(order_state & IN_REORDER)) {
  1689. error_msg("reorder_end with no matching reorder_after");
  1690. }
  1691. order_state &= ~IN_REORDER;
  1692. }
  1693. static void do_reorder_sections_after(void)
  1694. {
  1695. const char *t;
  1696. ll_item_t *lli;
  1697. if (order_state & ~IN_REORDER_SECTIONS) {
  1698. error_msg("reorder_sections_after following order_start or reorder_after");
  1699. }
  1700. order_state |= IN_REORDER_SECTIONS;
  1701. if (superset) {
  1702. error_msg("currently reorder_sections_after is not supported in supersets");
  1703. }
  1704. if (!(t = next_token())) {
  1705. error_msg("missing arg for reorder_sections_after");
  1706. }
  1707. t = xsymdup(t);
  1708. if (next_token() != NULL) {
  1709. error_msg("trailing text reorder_sections_after: %s", pos);
  1710. }
  1711. if (cur_col == cur_base) {
  1712. error_msg("sorry.. reorder_sections_after in base locale is not currently supported");
  1713. }
  1714. lli = cur_base->section_list;
  1715. do {
  1716. /* fprintf(stderr, "hmm -- |%s|%d|\n", ((section_t *)(lli->data))->name, lli->data_type); */
  1717. if (lli->data_type & DT_SECTION) {
  1718. /* fprintf(stderr, "checking |%s|%s|\n", ((section_t *)(lli->data))->name, t); */
  1719. if (!strcmp(((section_t *)(lli->data))->name, t)) {
  1720. reorder_section_ptr = lli;
  1721. return;
  1722. }
  1723. }
  1724. lli = lli->next;
  1725. } while (lli);
  1726. error_msg("reorder_sections_after for non-base item currently not supported: %s", t);
  1727. }
  1728. static void do_reorder_sections_end(void)
  1729. {
  1730. if (!(order_state & IN_REORDER_SECTIONS)) {
  1731. error_msg("reorder_sections_end with no matching reorder_sections_after");
  1732. }
  1733. order_state &= ~IN_REORDER_SECTIONS;
  1734. reorder_section_ptr = NULL;
  1735. }
  1736. static ll_item_t *new_ll_item(int data_type, void *data)
  1737. {
  1738. ll_item_t *p;
  1739. p = xmalloc(sizeof(ll_item_t));
  1740. p->next = p->prev = NULL;
  1741. p->data_type = data_type;
  1742. p->data = data;
  1743. p->idx = INT_MIN;
  1744. return p;
  1745. }
  /* Tree comparator for plain C strings: strcmp() ordering of n1 and n2. */
  static int sym_cmp(const void *n1, const void *n2)
  {
      const char *a = (const char *) n1;
      const char *b = (const char *) n2;

      return strcmp(a, b);
  }
  1751. static char *xsymdup(const char *s)
  1752. {
  1753. void *p;
  1754. if (!(p = tfind(s, &root_sym, sym_cmp))) { /* not a currently known symbol */
  1755. if (!(s = strdup(s)) || !(p = tsearch(s, &root_sym, sym_cmp))) {
  1756. error_msg("OUT OF MEMORY!");
  1757. }
  1758. ++num_sym;
  1759. mem_sym += strlen(s) + 1;
  1760. /* fprintf(stderr, "xsymdup: alloc |%s| %p |%s| %p\n", *(char **)p, p, s, s); */
  1761. /* } else { */
  1762. /* fprintf(stderr, "xsymdup: found |%s| %p\n", *(char **)p, p); */
  1763. }
  1764. return *(char **) p;
  1765. }
  1766. static int weight_cmp(const void *n1, const void *n2)
  1767. {
  1768. const weight_t *w1 = (const weight_t *) n1;
  1769. const weight_t *w2 = (const weight_t *) n2;
  1770. int i, r;
  1771. if (w1->num_weights != w2->num_weights) {
  1772. return w1->num_weights - w2->num_weights;
  1773. }
  1774. for (i=0 ; i < w1->num_weights ; i++) {
  1775. if (w1->rule[i] != w2->rule[i]) {
  1776. return w1->rule[i] - w2->rule[i];
  1777. }
  1778. if ((r = strcmp(w1->colitem[i], w2->colitem[i])) != 0) {
  1779. return r;
  1780. }
  1781. }
  1782. return 0;
  1783. }
  1784. static weight_t *register_weight(weight_t *w)
  1785. {
  1786. void *p;
  1787. if (!(p = tfind(w, &root_weight, weight_cmp))) { /* new weight */
  1788. p = xmalloc(sizeof(weight_t));
  1789. memcpy(p, w, sizeof(weight_t));
  1790. if (!(p = tsearch(p, &root_weight, weight_cmp))) {
  1791. error_msg("OUT OF MEMORY!");
  1792. }
  1793. ++unique_weights;
  1794. /* } else { */
  1795. /* fprintf(stderr, "rw: found\n"); */
  1796. }
  1797. return *(weight_t **)p;
  1798. }
  1799. static size_t ll_len(ll_item_t *l)
  1800. {
  1801. size_t n = 0;
  1802. ll_item_t *p = l;
  1803. while (p) {
  1804. ++n;
  1805. p = p->next;
  1806. if (p == l) { /* work for circular too */
  1807. break;
  1808. }
  1809. }
  1810. return n;
  1811. }
  1812. static size_t ll_count(ll_item_t *l, int mask)
  1813. {
  1814. size_t n = 0;
  1815. ll_item_t *p = l;
  1816. while (p) {
  1817. if (p->data_type & mask) {
  1818. ++n;
  1819. }
  1820. p = p->next;
  1821. if (p == l) { /* work for circular too */
  1822. break;
  1823. }
  1824. }
  1825. return n;
  1826. }
  1827. static int wi_index_cmp(const void *n1, const void *n2)
  1828. {
  1829. const char *s1 = ((weighted_item_t *)(((ll_item_t *) n1)->data))->symbol;
  1830. const char *s2 = ((weighted_item_t *)(((ll_item_t *) n2)->data))->symbol;
  1831. return strcmp(s1, s2);
  1832. }
  1833. static void add_wi_index(ll_item_t *l)
  1834. {
  1835. assert(l->data_type == DT_WEIGHTED);
  1836. if (!strcmp(((weighted_item_t *)(l->data))->symbol, "UNDEFINED")) {
  1837. cur_col->undefined_idx = l;
  1838. }
  1839. if (!tfind(l, &cur_col->root_wi_index, wi_index_cmp)) { /* new wi_index */
  1840. if (!tsearch(l, &cur_col->root_wi_index, wi_index_cmp)) {
  1841. error_msg("OUT OF MEMORY!");
  1842. }
  1843. }
  1844. if (cur_base != cur_col) {
  1845. if (!tfind(l, &cur_base->root_wi_index, wi_index_cmp)) {/* not a base val */
  1846. /* printf("derived: %s\n", ((weighted_item_t *)(l->data))->symbol); */
  1847. if (!tfind(l, &cur_base->root_derived_wi, wi_index_cmp)) { /* new derived */
  1848. if (!tsearch(l, &cur_base->root_derived_wi, wi_index_cmp)) {
  1849. error_msg("OUT OF MEMORY!");
  1850. }
  1851. }
  1852. }
  1853. }
  1854. }
  /* Value stored with the next new key by add_final_col_index(), which
   * increments it after each insertion. */
  static int final_index;
  /* Test whether s starts with a code of the form <Uhhhh>, where each h is
   * a hexadecimal digit.
   * Returns 7 (the length of the code) if it does, 0 otherwise.  Scanning
   * stops at the first character that does not fit, so s may be shorter
   * than seven characters. */
  static int is_ucode(const char *s)
  {
      int i;

      if ((s[0] != '<') || (s[1] != 'U')) {
          return 0;
      }
      for (i = 2; i < 6; i++) {
          /* unsigned char: isxdigit() is undefined for negative values */
          if (!isxdigit((unsigned char) s[i])) {
              return 0;
          }
      }
      return (s[6] == '>') ? 7 : 0;
  }
  1871. static void add_final_col_index(const char *s)
  1872. {
  1873. ENTRY e;
  1874. e.key = (char *) s;
  1875. e.data = (void *)(final_index);
  1876. if (!hsearch(e, FIND)) { /* not in the table */
  1877. if (!hsearch(e, ENTER)) {
  1878. error_msg("OUT OF MEMORY! (hsearch)");
  1879. }
  1880. #if 0
  1881. {
  1882. int n;
  1883. void *v;
  1884. colitem_t ci;
  1885. colitem_t *p;
  1886. const char *t;
  1887. if (!strcmp(s, "UNDEFINED")) {
  1888. printf("%6d: %s\n", final_index, s);
  1889. } else {
  1890. assert(*s == '<');
  1891. if ((n = is_ucode(s)) != 0) {
  1892. assert(!s[n]);
  1893. printf("%6d: %s\n", final_index, s);
  1894. } else {
  1895. ci.string = (char *) s;
  1896. ci.element = NULL; /* don't care */
  1897. v = tfind(&ci, &cur_base->root_colitem, colitem_cmp);
  1898. if (!v) {
  1899. fprintf(stderr, "%s NOT DEFINED!!!\n", s);
  1900. } else {
  1901. p = *((colitem_t **) v);
  1902. if (p->element != NULL) {
  1903. t = p->element;
  1904. assert(*t == '"');
  1905. ++t;
  1906. n = is_ucode(t);
  1907. assert(n);
  1908. printf("%6d: %.*s | ", final_index, n, t);
  1909. do {
  1910. t += n;
  1911. assert(*t);
  1912. if (*t == '"') {
  1913. assert(!t[1]);
  1914. break;
  1915. }
  1916. n = is_ucode(t);
  1917. assert(n);
  1918. printf("%.*s", n, t);
  1919. } while (1);
  1920. printf(" collating-element %s\n", s);
  1921. } else {
  1922. printf("%6d: %s (collating-symbol)\n", final_index, s);
  1923. }
  1924. }
  1925. }
  1926. }
  1927. }
  1928. #endif
  1929. ++final_index;
  1930. }
  1931. }
  /*
   * Look up the final collation index recorded for symbol s.
   *
   * Returns the stored index, or 0 when s is not in the hash table.
   */
  static int final_index_val0(const char *s)
  {
      ENTRY *p;
      ENTRY e;

      e.key = (char *) s;
      p = hsearch(e, FIND);
      if (!p) {                   /* not in the table */
          return 0;
      }
      /* The index lives in the data pointer; convert back via intptr_t
       * rather than casting a pointer straight to a narrower int. */
      return (int)(intptr_t)(p->data);
  }
  /*
   * Look up the final collation index recorded for symbol s, treating a
   * missing symbol as a fatal error.
   *
   * Returns the stored index.
   * NOTE(review): this relies on error_msg() not returning; if it did, the
   * NULL entry would be dereferenced below -- confirm against error_msg().
   */
  static int final_index_val(const char *s)
  {
      ENTRY *p;
      ENTRY e;

      e.key = (char *) s;
      p = hsearch(e, FIND);
      if (!p) {                   /* not in the table */
          error_msg("can't find final index: %s", s);
      }
      /* Convert the data pointer back to the index via intptr_t instead of
       * casting a pointer straight to a narrower int. */
      return (int)(intptr_t)(p->data);
  }
  /* Node tally filled in by count_nodes() during a twalk(). */
  static size_t num_tree_nodes;

  /*
   * twalk() action: bump num_tree_nodes once per tree node.
   *
   * twalk() visits an internal node three times (preorder, postorder,
   * endorder) and a leaf once, so only the postorder and leaf visits are
   * counted.  The node pointer and depth are not needed.
   */
  static void count_nodes(const void *ptr, VISIT order, int level)
  {
      switch (order) {
      case postorder:
      case leaf:
          ++num_tree_nodes;
          break;
      default:
          break;
      }
  }
  1959. static size_t tnumnodes(const void *root)
  1960. {
  1961. num_tree_nodes = 0;
  1962. twalk(root, count_nodes);
  1963. return num_tree_nodes;
  1964. }
  1965. static ll_item_t *find_wi_index(const char *sym, col_locale_t *cl)
  1966. {
  1967. weighted_item_t w;
  1968. ll_item_t l;
  1969. void *p;
  1970. w.symbol = sym;
  1971. l.data = &w;
  1972. l.data_type = DT_WEIGHTED;
  1973. p = tfind(&l, &cl->root_wi_index, wi_index_cmp);
  1974. if (p) {
  1975. p = *(ll_item_t **)p;
  1976. }
  1977. return (ll_item_t *) p;
  1978. }
  1979. static void mark_reordered(const char *sym)
  1980. {
  1981. ll_item_t *lli;
  1982. lli = find_wi_index(sym, cur_base);
  1983. if (lli) {
  1984. if (!tsearch(lli, &cur_base->root_wi_index_reordered, wi_index_cmp)) {
  1985. error_msg("OUT OF MEMORY!");
  1986. }
  1987. }
  1988. }
  1989. static ll_item_t *find_wi_index_reordered(const char *sym)
  1990. {
  1991. weighted_item_t w;
  1992. ll_item_t l;
  1993. void *p;
  1994. w.symbol = sym;
  1995. l.data = &w;
  1996. l.data_type = DT_WEIGHTED;
  1997. p = tfind(&l, &cur_base->root_wi_index_reordered, wi_index_cmp);
  1998. if (p) {
  1999. p = *(ll_item_t **)p;
  2000. }
  2001. return (ll_item_t *) p;
  2002. }
  2003. static ll_item_t *init_comm_ptr(void)
  2004. {
  2005. assert(cur_base);
  2006. assert(cur_base->section_list);
  2007. /* at the moment, only support one section in comm */
  2008. assert(cur_base->section_list->next == NULL);
  2009. comm_cur_ptr = ((section_t *)(cur_base->section_list->data))->itm_list;
  2010. while (comm_cur_ptr && (comm_cur_ptr->data_type & DT_REORDER)) {
  2011. comm_cur_ptr = comm_cur_ptr->next;
  2012. }
  2013. #warning devel code
  2014. /* { */
  2015. /* ll_item_t *p = comm_cur_ptr; */
  2016. /* fprintf(stderr, "init_comm_ptr\n"); */
  2017. /* while (p != comm_cur_ptr) { */
  2018. /* if (p->data_type & DT_WEIGHTED) { */
  2019. /* fprintf(stderr, "%s", ((weighted_item_t *)p)->symbol); */
  2020. /* } */
  2021. /* p = p->next; */
  2022. /* } */
  2023. /* } */
  2024. assert(comm_cur_ptr);
  2025. /* fprintf(stderr, "init_comm_ptr -- %s %p %p %p %d\n", */
  2026. /* ((weighted_item_t *)(comm_cur_ptr->data))->symbol, */
  2027. /* comm_cur_ptr, comm_cur_ptr->prev, comm_cur_ptr->next, */
  2028. /* ll_len(comm_cur_ptr)); */
  2029. comm_prev_ptr = NULL;
  2030. return comm_cur_ptr;
  2031. }
  2032. static ll_item_t *next_comm_ptr(void)
  2033. {
  2034. /* at the moment, only support one section in comm */
  2035. assert(cur_base->section_list->next == NULL);
  2036. comm_prev_ptr = comm_cur_ptr;
  2037. while (comm_cur_ptr && ((comm_cur_ptr = comm_cur_ptr->next) != NULL)) {
  2038. if (!(comm_cur_ptr->data_type & DT_REORDER)) {
  2039. break;
  2040. }
  2041. }
  2042. return comm_cur_ptr;
  2043. }
  2044. static int dump_count;
  2045. #if 0
  2046. static void dump_section(section_t *s, int mask, col_locale_t *der)
  2047. {
  2048. ll_item_t *lli;
  2049. ll_item_t *lli0;
  2050. weighted_item_t *w;
  2051. weight_t *p;
  2052. int i;
  2053. lli0 = lli = s->itm_list;
  2054. if (!lli0) {
  2055. return;
  2056. }
  2057. do {
  2058. if (!(lli->data_type & mask)) {
  2059. lli = lli->next;
  2060. continue;
  2061. }
  2062. if (lli->data_type & DT_WEIGHTED) {
  2063. ++dump_count;
  2064. w = (weighted_item_t *)(lli->data);
  2065. p = w->weight;
  2066. printf("%6d: %s (%d) ", dump_count, w->symbol, p->num_weights);
  2067. for (i = 0 ; i < p->num_weights ; i++) {
  2068. if (p->rule[i] & R_FORWARD) {
  2069. printf("F");
  2070. }
  2071. if (p->rule[i] & R_BACKWARD) {
  2072. printf("B");
  2073. }
  2074. if (p->rule[i] & R_POSITION) {
  2075. printf("P");
  2076. }
  2077. printf(",");
  2078. }
  2079. for (i = 0 ; i < p->num_weights ; i++) {
  2080. printf(" %s", p->colitem[i]);
  2081. }
  2082. printf("\n");
  2083. } else if (lli->data_type & (DT_SECTION|DT_REORDER)) {
  2084. if (lli->data_type == DT_REORDER) {
  2085. assert(der);
  2086. if (strncmp(((section_t *)(lli->data))->name, der->name, strlen(der->name))) {
  2087. lli = lli->next;
  2088. continue;
  2089. }
  2090. }
  2091. if (lli->data_type & DT_SECTION) {
  2092. printf("SECTION -----------------\n");
  2093. } else {
  2094. printf("REORDER -----------------\n");
  2095. }
  2096. dump_section((section_t *)(lli->data), mask, der);
  2097. printf("DONE --------------------\n");
  2098. }
  2099. lli = lli->next;
  2100. } while (lli != lli0);
  2101. }
  2102. #else
  2103. static int in_reorder_section = 0;
  2104. static void dump_section(section_t *s, int mask, col_locale_t *der)
  2105. {
  2106. ll_item_t *lli;
  2107. ll_item_t *lli0;
  2108. weighted_item_t *w;
  2109. weight_t *p;
  2110. int i;
  2111. lli0 = lli = s->itm_list;
  2112. if (!lli0) {
  2113. return;
  2114. }
  2115. do {
  2116. if (!(lli->data_type & mask)) {
  2117. lli = lli->next;
  2118. continue;
  2119. }
  2120. if (lli->data_type & DT_WEIGHTED) {
  2121. ++dump_count;
  2122. w = (weighted_item_t *)(lli->data);
  2123. p = w->weight;
  2124. #if 1
  2125. if (in_reorder_section) {
  2126. printf(" %p", w);
  2127. }
  2128. #else
  2129. printf("%6d: %s (%d) ", dump_count, w->symbol, p->num_weights);
  2130. for (i = 0 ; i < p->num_weights ; i++) {
  2131. if (p->rule[i] & R_FORWARD) {
  2132. printf("F");
  2133. }
  2134. if (p->rule[i] & R_BACKWARD) {
  2135. printf("B");
  2136. }
  2137. if (p->rule[i] & R_POSITION) {
  2138. printf("P");
  2139. }
  2140. printf(",");
  2141. }
  2142. for (i = 0 ; i < p->num_weights ; i++) {
  2143. printf(" %s", p->colitem[i]);
  2144. }
  2145. printf("\n");
  2146. #endif
  2147. } else if (lli->data_type & (DT_SECTION|DT_REORDER)) {
  2148. if (lli->data_type == DT_REORDER) {
  2149. assert(der);
  2150. if (strncmp(((section_t *)(lli->data))->name, der->name, strlen(der->name))) {
  2151. lli = lli->next;
  2152. continue;
  2153. }
  2154. }
  2155. if (lli->data_type & DT_SECTION) {
  2156. /* printf("SECTION -----------------\n"); */
  2157. assert(0);
  2158. } else {
  2159. /* printf("REORDER -----------------\n"); */
  2160. in_reorder_section = 1;
  2161. }
  2162. dump_section((section_t *)(lli->data), mask, der);
  2163. /* printf("DONE --------------------\n"); */
  2164. printf("\n");
  2165. in_reorder_section = 0;
  2166. }
  2167. lli = lli->next;
  2168. } while (lli != lli0);
  2169. }
  2170. #endif
  2171. static void dump_weights(const char *name)
  2172. {
  2173. ll_item_t *lli;
  2174. col_locale_t *base;
  2175. col_locale_t *der;
  2176. col_locale_t cl;
  2177. void *p;
  2178. assert(name);
  2179. if (!*name) { /* use last */
  2180. base = cur_base;
  2181. der = cur_derived;
  2182. } else {
  2183. cl.name = (char *) name;
  2184. if (!(p = tfind(&cl, &root_col_locale, col_locale_cmp))) {
  2185. error_msg("unknown locale: %s", name);
  2186. }
  2187. base = *((col_locale_t **) p);
  2188. der = NULL;
  2189. if (base->base_locale) { /* oops... really derived */
  2190. der = base;
  2191. base = der->base_locale;
  2192. }
  2193. }
  2194. dump_count = 0;
  2195. if (base) {
  2196. /* printf("BASE - %s\n", base->name); */
  2197. for (lli = base->section_list ; lli ; lli = lli->next) {
  2198. /* printf("SECTION %s\n", ((section_t *)(lli->data))->name); */
  2199. dump_section((section_t *)(lli->data), ~0, der);
  2200. }
  2201. }
  2202. assert(der != base);
  2203. if (der) {
  2204. /* printf("DERIVED - %s\n", der->name); */
  2205. for (lli = der->section_list ; lli ; lli = lli->next) {
  2206. if (lli->data_type == DT_SECTION) {
  2207. dump_section((section_t *)(lli->data), DT_WEIGHTED, der);
  2208. }
  2209. }
  2210. }
  2211. /* printf("DONE\n"); */
  2212. }
  /*
   * twalk() action: write the string stored at a tree node to stderr.
   *
   * Each node is reported exactly once, on its postorder or leaf visit.
   */
  static void print_starter_node(const void *ptr, VISIT order, int level)
  {
      if ((order != postorder) && (order != leaf)) {
          return;
      }
      fprintf(stderr, " %s\n", *(const char **) ptr);
  }
  2219. static void finalize_base(void)
  2220. {
  2221. ll_item_t *s;
  2222. ll_item_t *h;
  2223. ll_item_t *lli;
  2224. ll_item_t *h2;
  2225. ll_item_t *l2;
  2226. ll_item_t *cli;
  2227. ll_item_t *rli = NULL;
  2228. weighted_item_t *w;
  2229. weight_t *p;
  2230. int i, n, mr, r, mi;
  2231. col_locale_t *cl;
  2232. void *mm;
  2233. int num_invariant = 0;
  2234. int num_varying = 0;
  2235. int max_weight;
  2236. int index2weight_len_inc = 1;
  2237. assert(cur_base);
  2238. assert(base_locale_len+1 < BASE_LOCALE_LEN);
  2239. base_locale_array[base_locale_len].name = cur_base->name;
  2240. base_locale_array[base_locale_len].num_weights = 1;
  2241. base_locale_array[base_locale_len].index2weight_offset = index2weight_len;
  2242. base_locale_array[base_locale_len].index2ruleidx_offset = index2ruleidx_len;
  2243. if (!strcmp(cur_base->name,"ja_JP") || !strcmp(cur_base->name,"ko_KR")) {
  2244. #warning fix the index2weight check!!
  2245. index2weight_len_inc = 0;
  2246. }
  2247. /* printf("%s -- index2weight_len = %d\n", cur_base->name, index2weight_len); */
  2248. if (!hcreate(30000)) {
  2249. error_msg("OUT OF MEMORY!");
  2250. }
  2251. /* first pass ... set the fixed indexes */
  2252. final_index = i = 1;
  2253. mr = 0;
  2254. for (s = cur_base->section_list ; s ; s = s->next) {
  2255. #if 1
  2256. if (s->data_type & DT_REORDER) { /* a reordered section */
  2257. fprintf(stderr, "pass1: reordered section %s - xxx\n", ((section_t *)(s->data))->name);
  2258. lli = ((section_t *)(s->data))->itm_list;
  2259. r = 0;
  2260. if (lli) {
  2261. /* r = ll_len( ((section_t *)(lli->data))->itm_list ); */
  2262. r = ll_len(lli) + 1;
  2263. }
  2264. if (r > mr) {
  2265. mr = r;
  2266. }
  2267. fprintf(stderr, "pass1: reordered section %s - %d\n", ((section_t *)(s->data))->name, r);
  2268. continue;
  2269. }
  2270. #endif
  2271. h = lli = ((section_t *)(s->data))->itm_list;
  2272. if (!lli) {
  2273. continue;
  2274. }
  2275. do {
  2276. if (lli->data_type & DT_RANGE) {
  2277. i += mr;
  2278. mr = 0;
  2279. #warning check ko_kR and 9
  2280. /* ++i; */
  2281. lli->idx = i;
  2282. assert(!rli);
  2283. rli = lli;
  2284. fprintf(stderr, "range pre = %d after = ", i);
  2285. i += ((range_item_t *)(lli->data))->length + 1;
  2286. #warning check ko_kR and 9
  2287. /* ++i; */
  2288. fprintf(stderr, "%d\n", i);
  2289. if (!index2weight_len_inc) { /* ko_KR hack */
  2290. final_index += ((range_item_t *)(lli->data))->length + 1;
  2291. }
  2292. /* add_final_col_index("RANGE"); */
  2293. } else if (lli->data_type & DT_WEIGHTED) {
  2294. i += mr;
  2295. mr = 0;
  2296. w = (weighted_item_t *)(lli->data);
  2297. if (find_wi_index_reordered(w->symbol)) { /* reordered symbol so skip on first pass */
  2298. ++num_varying;
  2299. ++i;
  2300. continue;
  2301. }
  2302. ++num_invariant;
  2303. index2weight_buffer[index2weight_len] = lli->idx = i++;
  2304. index2weight_len += index2weight_len_inc;
  2305. add_final_col_index(w->symbol);
  2306. } else {
  2307. assert(lli->data_type & DT_REORDER);
  2308. r = ll_len( ((section_t *)(lli->data))->itm_list );
  2309. #warning check ko_kR and 9
  2310. if (r > mr) {
  2311. mr = r;
  2312. }
  2313. /* r = 0; */
  2314. }
  2315. } while ((lli = lli->next) != h);
  2316. }
  2317. /* second pass ... set the reordered indexes */
  2318. mi = i + mr;
  2319. mr = i = 0;
  2320. for (s = cur_base->section_list ; s ; s = s->next) {
  2321. h = lli = ((section_t *)(s->data))->itm_list;
  2322. if (!lli) {
  2323. continue;
  2324. }
  2325. do {
  2326. if (lli->data_type & DT_RANGE) {
  2327. i += mr;
  2328. mr = 0;
  2329. i = lli->idx + ((range_item_t *)(lli->data))->length + 1;
  2330. #warning check
  2331. } else if ((lli->data_type & DT_WEIGHTED) && !(s->data_type & DT_REORDER)) {
  2332. i += mr;
  2333. mr = 0;
  2334. w = (weighted_item_t *)(lli->data);
  2335. if (find_wi_index_reordered(w->symbol) /* reordered symbol skipped on first pass */
  2336. #if 0
  2337. || (s->data_type & DT_REORDER) /* or in a reordered section */
  2338. #endif
  2339. ) {
  2340. assert(!(s->data_type & DT_REORDER));
  2341. index2weight_buffer[index2weight_len] = lli->idx = ++i;
  2342. index2weight_len += index2weight_len_inc;
  2343. add_final_col_index(w->symbol);
  2344. /* fprintf(stdout, "%11s: r %6d %6d %s\n", */
  2345. /* cur_base->name, lli->idx, final_index_val(w->symbol), w->symbol); */
  2346. continue;
  2347. }
  2348. i = lli->idx;
  2349. /* fprintf(stdout, "%11s: w %6d %6d %s\n", */
  2350. /* cur_base->name, lli->idx, final_index_val(w->symbol), w->symbol); */
  2351. } else {
  2352. /* fprintf(stderr, "section: %s %d %d\n", ((section_t *)(s->data))->name, */
  2353. /* s->data_type, lli->data_type); */
  2354. /* assert(!(s->data_type & DT_REORDER)); */
  2355. /* assert(lli->data_type & DT_REORDER); */
  2356. #if 1
  2357. if (s->data_type & DT_REORDER) {
  2358. h2 = l2 = lli;
  2359. if (!h2) {
  2360. continue;
  2361. }
  2362. } else {
  2363. assert(s->data_type & DT_SECTION);
  2364. h2 = l2 = ((section_t *)(lli->data))->itm_list;
  2365. if (!h2) {
  2366. continue;
  2367. }
  2368. }
  2369. #else
  2370. h2 = l2 = ((section_t *)(lli->data))->itm_list;
  2371. if (!h2) {
  2372. continue;
  2373. }
  2374. #endif
  2375. r = 0;
  2376. do {
  2377. assert(l2->data_type & DT_WEIGHTED);
  2378. ++r;
  2379. l2->idx = i + r;
  2380. /* fprintf(stdout, "%s: R %6d %s\n", */
  2381. /* ((section_t *)(lli->data))->name, l2->idx, ((weighted_item_t *)(l2->data))->symbol); */
  2382. } while ((l2 = l2->next) != h2);
  2383. if (r > mr) {
  2384. mr = r;
  2385. }
  2386. }
  2387. } while ((lli = lli->next) != h);
  2388. }
  2389. /* finally, walk through all derived locales and set non-reordered section items */
  2390. mr = mi;
  2391. for (cli = cur_base->derived_list ; cli ; cli = cli->next) {
  2392. cl = (col_locale_t *)(cli->data);
  2393. /* fprintf(stderr, "pass3: %d %s\n", cli->data_type, cl->name); */
  2394. /* fprintf(stdout, "pass3: %d %s\n", cli->data_type, cl->name); */
  2395. assert(cli->data_type == DT_COL_LOCALE);
  2396. i = mi;
  2397. for (s = cl->section_list ; s ; s = s->next) {
  2398. /* if (s->data_type & DT_REORDER) { */
  2399. /* continue; */
  2400. /* } */
  2401. h = lli = ((section_t *)(s->data))->itm_list;
  2402. if (!lli) {
  2403. continue;
  2404. }
  2405. do {
  2406. assert(!(lli->data_type & DT_RANGE));
  2407. if (lli->data_type & DT_WEIGHTED) {
  2408. /* fprintf(stderr, " %d %d %s\n", lli->data_type, lli->idx, ((weighted_item_t *)(lli->data))->symbol); */
  2409. add_final_col_index(((weighted_item_t *)(lli->data))->symbol);
  2410. if (s->data_type & DT_REORDER) {
  2411. continue;
  2412. }
  2413. assert(lli->idx == INT_MIN);
  2414. lli->idx = ++i;
  2415. /* fprintf(stdout, "%11s: S %6d %6d %s\n", */
  2416. /* cl->name, lli->idx, */
  2417. /* final_index_val(((weighted_item_t *)(lli->data))->symbol), */
  2418. /* ((weighted_item_t *)(lli->data))->symbol); */
  2419. } else {
  2420. assert(0);
  2421. assert(lli->data_type & DT_SECTION);
  2422. h2 = l2 = ((section_t *)(lli->data))->itm_list;
  2423. if (!h2) {
  2424. continue;
  2425. }
  2426. do {
  2427. assert(l2->data_type & DT_WEIGHTED);
  2428. assert(l2->idx == INT_MIN);
  2429. l2->idx = ++i;
  2430. add_final_col_index(((weighted_item_t *)(l2->data))->symbol);
  2431. } while ((l2 = l2->next) != h2);
  2432. }
  2433. } while ((lli = lli->next) != h);
  2434. }
  2435. if (i > mr) {
  2436. mr = i;
  2437. }
  2438. }
  2439. max_weight = mr;
  2440. assert(num_varying == tnumnodes(cur_base->root_wi_index_reordered));
  2441. /* we can now initialize the wcs2index array */
  2442. {
  2443. ENTRY *p;
  2444. ENTRY e;
  2445. char buf[8];
  2446. static const char xd[] = "0123456789ABCDEF";
  2447. int starter_index = final_index;
  2448. int wcs2index_count = 0;
  2449. strcpy(buf, "<U....>");
  2450. memset(wcs2index, 0, sizeof(wcs2index));
  2451. e.key = (char *) buf;
  2452. for (i=1 ; i <= 0xffff ; i++) {
  2453. buf[5] = xd[ i & 0xf ];
  2454. buf[4] = xd[ (i >> 4) & 0xf ];
  2455. buf[3] = xd[ (i >> 8) & 0xf ];
  2456. buf[2] = xd[ (i >> 12) & 0xf ];
  2457. if ((p = hsearch(e, FIND)) != NULL) {
  2458. ++wcs2index_count;
  2459. if ((tfind(buf, &cur_base->root_starter_char, sym_cmp)) != NULL) {
  2460. wcs2index[i] = ++starter_index;
  2461. /* fprintf(stderr, "wcs2index[ %#06x ] = %d (starter)\n", i, wcs2index[i]); */
  2462. } else {
  2463. wcs2index[i] = (int)(p->data);
  2464. /* fprintf(stderr, "wcs2index[ %#06x ] = %d\n", i, wcs2index[i]); */
  2465. }
  2466. } else {
  2467. if ((tfind(buf, &cur_base->root_starter_char, sym_cmp)) != NULL) {
  2468. error_msg("marked starter but not in hash: %s", buf);
  2469. }
  2470. }
  2471. }
  2472. /* ---------------------------------------------------------------------- */
  2473. {
  2474. int i, n;
  2475. table_data table;
  2476. size_t t, smallest;
  2477. n = 0;
  2478. smallest = SIZE_MAX;
  2479. table.ii = NULL;
  2480. for (i=0 ; i < 14 ; i++) {
  2481. if ((RANGE >> i) < 4) {
  2482. break;
  2483. }
  2484. t = newopt(wcs2index, RANGE, i, &table);
  2485. if (smallest >= t) {
  2486. n = i;
  2487. smallest = t;
  2488. /* } else { */
  2489. /* break; */
  2490. }
  2491. }
  2492. /* printf("smallest = %u for range %#x (%u)\n", smallest, RANGE, RANGE); */
  2493. assert(smallest != SIZE_MAX);
  2494. if (smallest + wcs2colidt_len >= WCS2COLIDT_LEN) {
  2495. error_msg("WCS2COLIDT_LEN too small");
  2496. }
  2497. base_locale_array[base_locale_len].wcs2colidt_offset = wcs2colidt_len;
  2498. table.ii = wcs2colidt_buffer + wcs2colidt_len;
  2499. t = smallest;
  2500. smallest = SIZE_MAX;
  2501. smallest = newopt(wcs2index, RANGE, n, &table);
  2502. assert(t == smallest);
  2503. wcs2colidt_len += smallest;
  2504. /* fprintf(stderr, "smallest = %d wcs2colidt_len = %d\n", smallest, wcs2colidt_len); */
  2505. #if 0
  2506. {
  2507. unsigned int sc, n, i0, i1;
  2508. unsigned int u = 0xe40;
  2509. table_data *tbl = &table;
  2510. #define __LOCALE_DATA_WCctype_TI_MASK ((1 << tbl->ti_shift)-1)
  2511. #define __LOCALE_DATA_WCctype_TI_SHIFT (tbl->ti_shift)
  2512. #define __LOCALE_DATA_WCctype_TI_LEN (tbl->ti_len)
  2513. #define __LOCALE_DATA_WCctype_II_MASK ((1 << tbl->ii_shift)-1)
  2514. #define __LOCALE_DATA_WCctype_II_SHIFT (tbl->ii_shift)
  2515. #define __LOCALE_DATA_WCctype_II_LEN (tbl->ii_len)
  2516. sc = u & __LOCALE_DATA_WCctype_TI_MASK;
  2517. u >>= __LOCALE_DATA_WCctype_TI_SHIFT;
  2518. n = u & __LOCALE_DATA_WCctype_II_MASK;
  2519. u >>= __LOCALE_DATA_WCctype_II_SHIFT;
  2520. i0 = tbl->ii[u];
  2521. fprintf(stderr, "i0 = %d\n", i0);
  2522. i0 <<= __LOCALE_DATA_WCctype_II_SHIFT;
  2523. i1 = tbl->ii[__LOCALE_DATA_WCctype_II_LEN + i0 + n];
  2524. /* i1 = tbl->ti[i0 + n]; */
  2525. fprintf(stderr, "i1 = %d\n", i1);
  2526. i1 <<= __LOCALE_DATA_WCctype_TI_SHIFT;
  2527. /* return *(uint16_t *)(&(tbl->ii[__LOCALE_DATA_WCctype_II_LEN + __LOCALE_DATA_WCctype_TI_LEN + i1 + sc])); */
  2528. fprintf(stderr, "i2 = %d\n", __LOCALE_DATA_WCctype_II_LEN + __LOCALE_DATA_WCctype_TI_LEN + i1 + sc);
  2529. fprintf(stderr, "val = %d\n", tbl->ii[__LOCALE_DATA_WCctype_II_LEN + __LOCALE_DATA_WCctype_TI_LEN + i1 + sc]);
  2530. /* return tbl->ut[i1 + sc]; */
  2531. }
  2532. #endif
  2533. base_locale_array[base_locale_len].ii_shift = table.ii_shift;
  2534. base_locale_array[base_locale_len].ti_shift = table.ti_shift;
  2535. base_locale_array[base_locale_len].ii_len = table.ii_len;
  2536. base_locale_array[base_locale_len].ti_len = table.ti_len;
  2537. }
  2538. /* ---------------------------------------------------------------------- */
  2539. base_locale_array[base_locale_len].num_col_base = num_invariant + num_varying;
  2540. base_locale_array[base_locale_len].max_col_index = final_index;
  2541. base_locale_array[base_locale_len].max_weight = max_weight;
  2542. fprintf(stderr, "%s: %6u invariant %6u varying %6u derived %6u total %6u max weight %6u wcs2\n",
  2543. cur_base->name, num_invariant, num_varying,
  2544. tnumnodes(cur_base->root_derived_wi), final_index, max_weight,
  2545. wcs2index_count);
  2546. }
  2547. #if 1
  2548. /* ok, now we need to dump out the base and derived tables... */
  2549. /* don't forget to break up collating elements!!! */
  2550. /* fprintf(stdout, "**************************************************\n"); */
  2551. /* first pass ... set the invariants */
  2552. for (s = cur_base->section_list ; s ; s = s->next) {
  2553. #if 1
  2554. if (s->data_type & DT_REORDER) {
  2555. fprintf(stderr, "1: skipping reordered section %s\n", ((section_t *)(s->data))->name);
  2556. continue;
  2557. }
  2558. #endif
  2559. h = lli = ((section_t *)(s->data))->itm_list;
  2560. if (!lli) {
  2561. continue;
  2562. }
  2563. do {
  2564. if (lli->data_type & DT_WEIGHTED) {
  2565. w = (weighted_item_t *)(lli->data);
  2566. if (find_wi_index_reordered(w->symbol)) { /* reordered symbol so skip on first pass */
  2567. continue;
  2568. }
  2569. if (index2weight_len_inc) {
  2570. index2ruleidx_buffer[index2ruleidx_len++] =
  2571. add_rule((weighted_item_t *)(lli->data));
  2572. }
  2573. /* fprintf(stdout, "%11s: w %6d %6d %s\n", */
  2574. /* cur_base->name, lli->idx, final_index_val(w->symbol), w->symbol); */
  2575. }
  2576. } while ((lli = lli->next) != h);
  2577. }
  2578. /* second pass ... set varying */
  2579. for (s = cur_base->section_list ; s ; s = s->next) {
  2580. #if 1
  2581. if (s->data_type & DT_REORDER) {
  2582. fprintf(stderr, "2: skipping reordered section %s\n", ((section_t *)(s->data))->name);
  2583. continue;
  2584. }
  2585. #endif
  2586. h = lli = ((section_t *)(s->data))->itm_list;
  2587. if (!lli) {
  2588. continue;
  2589. }
  2590. do {
  2591. if (lli->data_type & DT_WEIGHTED) {
  2592. w = (weighted_item_t *)(lli->data);
  2593. if (find_wi_index_reordered(w->symbol)) { /* reordered symbol so skip on first pass */
  2594. if (index2weight_len_inc) {
  2595. index2ruleidx_buffer[index2ruleidx_len++] =
  2596. add_rule((weighted_item_t *)(lli->data));
  2597. }
  2598. /* fprintf(stdout, "%11s: r %6d %6d %s\n", */
  2599. /* cur_base->name, lli->idx, final_index_val(w->symbol), w->symbol); */
  2600. continue;
  2601. }
  2602. }
  2603. } while ((lli = lli->next) != h);
  2604. }
  2605. do_starter_lists(cur_base);
  2606. /* fprintf(stderr,"updated final_index = %d\n", final_index); */
  2607. if (rli) {
  2608. base_locale_array[base_locale_len].range_low
  2609. = strtoul(((range_item_t *)(rli->data))->symbol1 + 2, NULL, 16);
  2610. base_locale_array[base_locale_len].range_count
  2611. = ((range_item_t *)(rli->data))->length;
  2612. base_locale_array[base_locale_len].range_base_weight = rli->idx;
  2613. base_locale_array[base_locale_len].range_rule_offset = add_range_rule((range_item_t *)(rli->data));
  2614. /* fprintf(stdout, "%11s: %6d %6d %s %s (%d)\n", */
  2615. /* "RANGE", rli->idx, -1, */
  2616. /* ((range_item_t *)(rli->data))->symbol1, */
  2617. /* ((range_item_t *)(rli->data))->symbol2, */
  2618. /* ((range_item_t *)(rli->data))->length); */
  2619. }
  2620. /* fprintf(stdout,"\nDerived\n\n"); */
  2621. /* first, if base name is of the form ll_CC, add a derived locale for it */
  2622. if ((strlen(cur_base->name) == 5)
  2623. && islower(cur_base->name[0])
  2624. && islower(cur_base->name[1])
  2625. && (cur_base->name[2] == '_')
  2626. && isupper(cur_base->name[3])
  2627. && isupper(cur_base->name[4])
  2628. ) {
  2629. fprintf(stderr, "adding special derived for %s\n", cur_base->name);
  2630. /* fprintf(stderr,"updated final_index = %d\n", final_index); */
  2631. assert(der_locale_len+1 < DER_LOCALE_LEN);
  2632. der_locale_array[der_locale_len].name = cur_base->name;
  2633. der_locale_array[der_locale_len].base_idx = base_locale_len;
  2634. u16_buf[0] = 1;
  2635. u16_buf[1] = 0;
  2636. u16_buf_len = 2;
  2637. mm = NULL;
  2638. if ((u16_buf_len > override_len) ||
  2639. !(mm = memmem(override_buffer, override_len*sizeof(override_buffer[0]),
  2640. u16_buf, u16_buf_len*sizeof(u16_buf[0])))
  2641. ) {
  2642. assert(override_len + u16_buf_len < OVERRIDE_LEN);
  2643. memcpy(override_buffer + override_len, u16_buf, u16_buf_len*sizeof(u16_buf[0]));
  2644. der_locale_array[der_locale_len].overrides_offset = override_len;
  2645. override_len += u16_buf_len;
  2646. /* printf("%s: override_len = %d u16_buf_len = %d\n", cl->name, override_len, u16_buf_len); */
  2647. } else if (!(u16_buf_len > override_len)) {
  2648. assert(mm);
  2649. der_locale_array[der_locale_len].overrides_offset = ((uint16_t *)(mm)) - override_buffer;
  2650. /* printf("%s: memmem found a match with u16_buf_len = %d\n", cl->name, u16_buf_len); */
  2651. }
  2652. der_locale_array[der_locale_len].multistart_offset
  2653. = base_locale_array[base_locale_len].multistart_offset;
  2654. der_locale_array[der_locale_len].undefined_idx = final_index_val0("UNDEFINED");
  2655. if (!der_locale_array[der_locale_len].undefined_idx) {
  2656. error_msg("no UNDEFINED definition for %s", cur_base->name);
  2657. }
  2658. ++der_locale_len;
  2659. } else {
  2660. fprintf(stderr, "NOT adding special derived for %s\n", cur_base->name);
  2661. }
  2662. /* now all the derived... */
  2663. for (cli = cur_base->derived_list ; cli ; cli = cli->next) {
  2664. cl = (col_locale_t *)(cli->data);
  2665. assert(cli->data_type == DT_COL_LOCALE);
  2666. assert(der_locale_len+1 < DER_LOCALE_LEN);
  2667. der_locale_array[der_locale_len].name = cl->name;
  2668. der_locale_array[der_locale_len].base_idx = base_locale_len;
  2669. u16_buf_len = 0;
  2670. for (i = 0 ; i < 2 ; i++) {
  2671. if (i) {
  2672. /* fprintf(stdout, " section --- (singles)\n"); */
  2673. u16_buf[u16_buf_len++] = 1; /* single */
  2674. }
  2675. /* we do this in two passes... first all sequences, then all single reorders */
  2676. for (s = cl->section_list ; s ; s = s->next) {
  2677. /* fprintf(stderr, "doing section %s\n", ((section_t *)(s->data))->name); */
  2678. h = lli = ((section_t *)(s->data))->itm_list;
  2679. if (!lli) {
  2680. /* fprintf(stdout, "EMPTY ITEM LIST IN SECTION %s\n", ((section_t *)(s->data))->name ); */
  2681. continue;
  2682. }
  2683. assert(u16_buf_len +4 < sizeof(u16_buf)/sizeof(u16_buf[0]));
  2684. if ((!i && (ll_len(h) > 1) ) || (ll_len(h) == i)) {
  2685. if (!i) {
  2686. /* fprintf(stdout, " section ----------------- %d %d\n", i, ll_len(h)); */
  2687. u16_buf[u16_buf_len++] = ll_len(h); /* multi */
  2688. assert(lli->data_type & DT_WEIGHTED);
  2689. #if 0
  2690. u16_buf[u16_buf_len++] = final_index_val(((weighted_item_t *)(lli->data))->symbol); /* start index */
  2691. #endif
  2692. u16_buf[u16_buf_len++] = lli->idx; /* start weight */
  2693. }
  2694. do {
  2695. assert(lli->data_type & DT_WEIGHTED);
  2696. if (lli->data_type & DT_WEIGHTED) {
  2697. /* fprintf(stdout, "%11s: S %6d %6d %s\n", */
  2698. /* cl->name, lli->idx, */
  2699. /* final_index_val(((weighted_item_t *)(lli->data))->symbol), */
  2700. /* ((weighted_item_t *)(lli->data))->symbol); */
  2701. #if 0
  2702. if (i) {
  2703. assert(u16_buf_len +4 < sizeof(u16_buf)/sizeof(u16_buf[0]));
  2704. u16_buf[u16_buf_len++] = final_index_val(((weighted_item_t *)(lli->data))->symbol);
  2705. assert(u16_buf[u16_buf_len-1]);
  2706. u16_buf[u16_buf_len++] = lli->idx; /* weight */
  2707. }
  2708. #else
  2709. assert(u16_buf_len +4 < sizeof(u16_buf)/sizeof(u16_buf[0]));
  2710. u16_buf[u16_buf_len++] = final_index_val(((weighted_item_t *)(lli->data))->symbol);
  2711. assert(u16_buf[u16_buf_len-1]);
  2712. if (i) {
  2713. u16_buf[u16_buf_len++] = lli->idx; /* weight */
  2714. }
  2715. #endif
  2716. u16_buf[u16_buf_len++] = add_rule((weighted_item_t *)(lli->data));
  2717. }
  2718. } while ((lli = lli->next) != h);
  2719. }
  2720. }
  2721. }
  2722. u16_buf[u16_buf_len++] = 0;
  2723. mm = NULL;
  2724. if ((u16_buf_len > override_len) ||
  2725. !(mm = memmem(override_buffer, override_len*sizeof(override_buffer[0]),
  2726. u16_buf, u16_buf_len*sizeof(u16_buf[0])))
  2727. ) {
  2728. assert(override_len + u16_buf_len < OVERRIDE_LEN);
  2729. memcpy(override_buffer + override_len, u16_buf, u16_buf_len*sizeof(u16_buf[0]));
  2730. der_locale_array[der_locale_len].overrides_offset = override_len;
  2731. override_len += u16_buf_len;
  2732. /* printf("%s: override_len = %d u16_buf_len = %d\n", cl->name, override_len, u16_buf_len); */
  2733. } else if (!(u16_buf_len > override_len)) {
  2734. assert(mm);
  2735. der_locale_array[der_locale_len].overrides_offset = ((uint16_t *)(mm)) - override_buffer;
  2736. /* printf("%s: memmem found a match with u16_buf_len = %d\n", cl->name, u16_buf_len); */
  2737. }
  2738. do_starter_lists(cl);
  2739. der_locale_array[der_locale_len].undefined_idx = final_index_val0("UNDEFINED");
  2740. #if 0
  2741. assert(der_locale_array[der_locale_len].undefined_idx);
  2742. if (!der_locale_array[der_locale_len].undefined_idx) {
  2743. der_locale_array[der_locale_len].undefined_idx = base_locale_array[base_locale_len].undefined_idx;
  2744. }
  2745. #endif
  2746. if (!der_locale_array[der_locale_len].undefined_idx) {
  2747. error_msg("no UNDEFINED definition for %s", cl->name);
  2748. }
  2749. ++der_locale_len;
  2750. }
  2751. #endif
  2752. #warning handle UNDEFINED idx specially? what if in only some of derived?
  2753. /* base_locale_array[base_locale_len].undefined_idx = final_index_val0("UNDEFINED"); */
  2754. base_locale_array[base_locale_len].undefined_idx = 0;
  2755. hdestroy();
  2756. ++base_locale_len;
  2757. /* if (tnumnodes(cur_base->root_starter_char)) { */
  2758. /* fprintf(stderr, "starter nodes\n"); */
  2759. /* twalk(cur_base->root_starter_char, print_starter_node); */
  2760. /* } */
  2761. }
  2762. static int starter_all_cmp(const void *n1, const void *n2)
  2763. {
  2764. const char *s1 = ((weighted_item_t *) n1)->symbol;
  2765. const char *s2 = ((weighted_item_t *) n2)->symbol;
  2766. colitem_t x;
  2767. colitem_t *p;
  2768. int n;
  2769. /* sort by 1st char ... then inverse for string */
  2770. x.element = NULL;
  2771. if (!is_ucode(s1)) {
  2772. x.string = s1;
  2773. p = tfind(&x, &cur_base->root_colitem, colitem_cmp);
  2774. s1 = (*((colitem_t **) p))->element + 1;
  2775. }
  2776. if (!is_ucode(s2)) {
  2777. x.string = s2;
  2778. p = tfind(&x, &cur_base->root_colitem, colitem_cmp);
  2779. s2 = (*((colitem_t **) p))->element + 1;
  2780. }
  2781. /* <U####>< */
  2782. /* 01234567 */
  2783. assert(is_ucode(s1));
  2784. assert(is_ucode(s2));
  2785. n = strncmp(s1+2, s2+2, 4);
  2786. if (n) {
  2787. return n;
  2788. }
  2789. s1 += 7;
  2790. s2 += 7;
  2791. return strcmp(s2, s1);
  2792. }
  2793. static void print_starter_all_node(const void *ptr, VISIT order, int level)
  2794. {
  2795. const weighted_item_t *w = *(const weighted_item_t **) ptr;
  2796. colitem_t *ci;
  2797. void *p;
  2798. int n;
  2799. colitem_t x;
  2800. if (order == postorder || order == leaf) {
  2801. #if 0
  2802. if ((n = is_ucode(w->symbol)) != 0) {
  2803. printf(" %s\n", w->symbol);
  2804. } else {
  2805. x.string = w->symbol;
  2806. x.element = NULL;
  2807. p = tfind(&x, &cur_base->root_colitem, colitem_cmp);
  2808. assert(p);
  2809. ci = *((colitem_t **) p);
  2810. printf("%s = %s\n", ci->element, w->symbol);
  2811. }
  2812. #else
  2813. printf("%s|", w->symbol);
  2814. /* if ((n = is_ucode(w->symbol)) != 0) { */
  2815. /* printf("\n"); */
  2816. /* } */
  2817. #endif
  2818. }
  2819. }
  2820. static void process_starter_node(const void *ptr, VISIT order, int level)
  2821. {
  2822. const weighted_item_t *w = *(const weighted_item_t **) ptr;
  2823. colitem_t *ci;
  2824. void *p;
  2825. int n;
  2826. colitem_t x;
  2827. const char *s;
  2828. char buf[32];
  2829. /* store index of collation item followed by (unprefixed) nul-terminated string */
  2830. if (order == postorder || order == leaf) {
  2831. if ((n = is_ucode(w->symbol)) != 0) {
  2832. u16_buf[u16_buf_len++] = final_index_val(w->symbol);
  2833. assert(u16_buf[u16_buf_len-1]);
  2834. u16_buf[u16_buf_len++] = 0;
  2835. if (++u16_starter < base_locale_array[base_locale_len].num_starters) {
  2836. u16_buf[u16_starter] = u16_buf_len;
  2837. }
  2838. /* fprintf(stderr, "ucode - %d %d\n", u16_buf[u16_starter-1], u16_buf_len); */
  2839. } else {
  2840. x.string = w->symbol;
  2841. x.element = NULL;
  2842. p = tfind(&x, &cur_base->root_colitem, colitem_cmp);
  2843. assert(p);
  2844. ci = *((colitem_t **) p);
  2845. s = ci->element;
  2846. u16_buf[u16_buf_len++] = final_index_val(w->symbol);
  2847. assert(u16_buf[u16_buf_len-1]);
  2848. assert(*s == '"');
  2849. n = is_ucode(++s);
  2850. /* fprintf(stderr, "s is |%s| with len %d (%d)\n", s, strlen(s), n); */
  2851. assert(n);
  2852. s += n;
  2853. while (*s != '"') {
  2854. n = is_ucode(s);
  2855. assert(n);
  2856. strncpy(buf, s, n+1);
  2857. buf[n] = 0;
  2858. /* fprintf(stderr, "buf is |%s| with len %d (%d)\n", buf, strlen(buf), n); */
  2859. u16_buf[u16_buf_len++] = final_index_val(buf);
  2860. assert(u16_buf[u16_buf_len-1]);
  2861. s += n;
  2862. }
  2863. u16_buf[u16_buf_len++] = 0;
  2864. }
  2865. }
  2866. }
  2867. static void **p_cl_root_starter_all;
  2868. static void complete_starter_node(const void *ptr, VISIT order, int level)
  2869. {
  2870. weighted_item_t w;
  2871. weighted_item_t *p;
  2872. if (order == postorder || order == leaf) {
  2873. w.symbol = *(const char **) ptr;
  2874. w.weight = NULL;
  2875. if (!tfind(&w, p_cl_root_starter_all, starter_all_cmp)) {
  2876. p = xmalloc(sizeof(weighted_item_t));
  2877. p->symbol = w.symbol;
  2878. p->weight = NULL;
  2879. /* fprintf(stderr, "complete_starter_node: %s\n", *(const char **) ptr); */
  2880. if (!tsearch(p, p_cl_root_starter_all, starter_all_cmp)) {
  2881. error_msg("OUT OF MEMORY");
  2882. }
  2883. }
  2884. }
  2885. }
  2886. static void do_starter_lists(col_locale_t *cl)
  2887. {
  2888. ll_item_t *s;
  2889. ll_item_t *h;
  2890. ll_item_t *lli;
  2891. col_locale_t *c;
  2892. colitem_t *ci;
  2893. weighted_item_t *w;
  2894. void *p;
  2895. char buf[32];
  2896. int n;
  2897. colitem_t x;
  2898. void *mm;
  2899. c = cl;
  2900. if (c != cur_base) {
  2901. c = cur_base;
  2902. }
  2903. /* printf("STARTERS %s --------------------\n", cl->name); */
  2904. LOOP:
  2905. for (s = c->section_list ; s ; s = s->next) {
  2906. h = lli = ((section_t *)(s->data))->itm_list;
  2907. if (!lli) {
  2908. continue;
  2909. }
  2910. do {
  2911. if (lli->data_type & DT_WEIGHTED) {
  2912. w = (weighted_item_t *)(lli->data);
  2913. ci = NULL;
  2914. if ((n = is_ucode(w->symbol)) != 0) {
  2915. strcpy(buf, w->symbol);
  2916. } else {
  2917. /* fprintf(stdout, "looking for |%s|\n", w->symbol); */
  2918. x.string = w->symbol;
  2919. x.element = NULL;
  2920. p = tfind(&x, &cur_base->root_colitem, colitem_cmp);
  2921. if (!p) {
  2922. /* fprintf(stderr, "Whoa... processing starters for %s and couldn't find %s\n", */
  2923. /* cl->name, w->symbol); */
  2924. continue;
  2925. }
  2926. ci = *((colitem_t **) p);
  2927. if (!ci->element) { /* just a collating symbol */
  2928. continue;
  2929. }
  2930. assert(ci->element[0] == '"');
  2931. n = is_ucode(ci->element + 1);
  2932. assert(n);
  2933. strncpy(buf, ci->element + 1, n);
  2934. }
  2935. if ((tfind(buf, &cur_base->root_starter_char, sym_cmp)) != NULL) {
  2936. /* fprintf(stdout, "adding from %s: %s", c->name, w->symbol); */
  2937. /* if (ci) { */
  2938. /* fprintf(stdout, " = %s", ci->element); */
  2939. /* } */
  2940. /* fprintf(stdout, "\n"); */
  2941. if (!tsearch(w, &cl->root_starter_all, starter_all_cmp)) {
  2942. error_msg("OUT OF MEMORY");
  2943. }
  2944. }
  2945. }
  2946. } while ((lli = lli->next) != h);
  2947. }
  2948. if (c != cl) {
  2949. c = cl;
  2950. goto LOOP;
  2951. }
  2952. p_cl_root_starter_all = &cl->root_starter_all;
  2953. twalk(cur_base->root_starter_char, complete_starter_node);
  2954. if (cl == cur_base) {
  2955. base_locale_array[base_locale_len].num_starters = tnumnodes(cur_base->root_starter_char);
  2956. }
  2957. #if 0
  2958. printf("\nNow walking tree...\n\n");
  2959. twalk(cl->root_starter_all, print_starter_all_node);
  2960. printf("\n\n");
  2961. #endif
  2962. u16_starter = 0;
  2963. u16_buf[0] = u16_buf_len = base_locale_array[base_locale_len].num_starters;
  2964. twalk(cl->root_starter_all, process_starter_node);
  2965. /* fprintf(stderr, "s=%d n=%d\n", u16_starter, base_locale_array[base_locale_len].num_starters); */
  2966. assert(u16_starter == base_locale_array[base_locale_len].num_starters);
  2967. #if 0
  2968. { int i;
  2969. for (i=0 ; i < u16_buf_len ; i++) {
  2970. fprintf(stderr, "starter %2d: %d - %#06x\n", i, u16_buf[i], u16_buf[i]);
  2971. }}
  2972. #endif
  2973. mm = NULL;
  2974. if (u16_buf_len) {
  2975. /* assert(base_locale_array[base_locale_len].num_starters); */
  2976. if ((u16_buf_len > multistart_len) ||
  2977. !(mm = memmem(multistart_buffer, multistart_len*sizeof(multistart_buffer[0]),
  2978. u16_buf, u16_buf_len*sizeof(u16_buf[0])))
  2979. ) {
  2980. assert(multistart_len + u16_buf_len < MULTISTART_LEN);
  2981. memcpy(multistart_buffer + multistart_len, u16_buf, u16_buf_len*sizeof(u16_buf[0]));
  2982. if (cl == cur_base) {
  2983. base_locale_array[base_locale_len].multistart_offset = multistart_len;
  2984. } else {
  2985. der_locale_array[der_locale_len].multistart_offset = multistart_len;
  2986. }
  2987. multistart_len += u16_buf_len;
  2988. /* fprintf(stderr, "%s: multistart_len = %d u16_buf_len = %d\n", cl->name, multistart_len, u16_buf_len); */
  2989. } else if (!(u16_buf_len > multistart_len)) {
  2990. assert(mm);
  2991. if (cl == cur_base) {
  2992. base_locale_array[base_locale_len].multistart_offset = ((uint16_t *)(mm)) - multistart_buffer;
  2993. } else {
  2994. der_locale_array[der_locale_len].multistart_offset = ((uint16_t *)(mm)) - multistart_buffer;
  2995. }
  2996. /* fprintf(stderr, "%s: memmem found a match with u16_buf_len = %d\n", cl->name, u16_buf_len); */
  2997. }
  2998. } else {
  2999. assert(!base_locale_array[base_locale_len].num_starters);
  3000. }
  3001. /* printf("u16_buf_len = %d\n", u16_buf_len); */
  3002. /* printf("STARTERS %s DONE ---------------\n", cl->name); */
  3003. }
  3004. /* For sorting the blocks of unsigned chars. */
  3005. static size_t nu_val;
  3006. int nu_memcmp(const void *a, const void *b)
  3007. {
  3008. return memcmp(*(unsigned char**)a, *(unsigned char**)b, nu_val * sizeof(tbl_item));
  3009. }
  3010. size_t newopt(tbl_item *ut, size_t usize, int shift, table_data *tbl)
  3011. {
  3012. static int recurse = 0;
  3013. tbl_item *ti[RANGE]; /* table index */
  3014. size_t numblocks;
  3015. size_t blocksize;
  3016. size_t uniq;
  3017. size_t i, j;
  3018. size_t smallest, t;
  3019. tbl_item *ii_save;
  3020. int uniqblock[1 << (8*sizeof(tbl_item) - 1)];
  3021. tbl_item uit[RANGE];
  3022. int shift2;
  3023. if (shift > 15) {
  3024. return SIZE_MAX;
  3025. }
  3026. ii_save = NULL;
  3027. blocksize = 1 << shift;
  3028. numblocks = usize >> shift;
  3029. /* init table index */
  3030. for (i=j=0 ; i < numblocks ; i++) {
  3031. ti[i] = ut + j;
  3032. j += blocksize;
  3033. }
  3034. /* sort */
  3035. nu_val = blocksize;
  3036. qsort(ti, numblocks, sizeof(unsigned char *), nu_memcmp);
  3037. uniq = 1;
  3038. uit[(ti[0]-ut)/blocksize] = 0;
  3039. for (i=1 ; i < numblocks ; i++) {
  3040. if (memcmp(ti[i-1], ti[i], blocksize*sizeof(tbl_item)) < 0) {
  3041. if (++uniq > (1 << (8*sizeof(tbl_item) - 1))) {
  3042. break;
  3043. }
  3044. uniqblock[uniq - 1] = i;
  3045. }
  3046. #if 1
  3047. else if (memcmp(ti[i-1], ti[i], blocksize*sizeof(tbl_item)) > 0) {
  3048. printf("bad sort %i!\n", i);
  3049. abort();
  3050. }
  3051. #endif
  3052. uit[(ti[i]-ut)/blocksize] = uniq - 1;
  3053. }
  3054. smallest = SIZE_MAX;
  3055. shift2 = -1;
  3056. if (uniq <= (1 << (8*sizeof(tbl_item) - 1))) {
  3057. smallest = numblocks + uniq * blocksize;
  3058. if (!recurse) {
  3059. ++recurse;
  3060. for (j=1 ; j < 14 ; j++) {
  3061. if ((numblocks >> j) < 2) break;
  3062. if (tbl) {
  3063. ii_save = tbl->ii;
  3064. tbl->ii = NULL;
  3065. }
  3066. if ((t = newopt(uit, numblocks, j, tbl)) < SIZE_MAX) {
  3067. t += uniq * blocksize;
  3068. }
  3069. if (tbl) {
  3070. tbl->ii = ii_save;
  3071. }
  3072. if (smallest >= t) {
  3073. shift2 = j;
  3074. smallest = t;
  3075. /* if (!tbl->ii) { */
  3076. /* printf("ishift %u tshift %u size %u\n", */
  3077. /* shift2, shift, t); */
  3078. /* } */
  3079. /* } else { */
  3080. /* break; */
  3081. }
  3082. }
  3083. --recurse;
  3084. }
  3085. } else {
  3086. return SIZE_MAX;
  3087. }
  3088. if (tbl->ii) {
  3089. if (recurse) {
  3090. tbl->ii_shift = shift;
  3091. tbl->ii_len = numblocks;
  3092. memcpy(tbl->ii, uit, numblocks*sizeof(tbl_item));
  3093. tbl->ti = tbl->ii + tbl->ii_len;
  3094. tbl->ti_len = uniq * blocksize;
  3095. for (i=0 ; i < uniq ; i++) {
  3096. memcpy(tbl->ti + i * blocksize, ti[uniqblock[i]], blocksize*sizeof(tbl_item));
  3097. }
  3098. } else {
  3099. ++recurse;
  3100. /* printf("setting ishift %u tshift %u\n", shift2, shift); */
  3101. newopt(uit, numblocks, shift2, tbl);
  3102. --recurse;
  3103. tbl->ti_shift = shift;
  3104. tbl->ut_len = uniq * blocksize;
  3105. tbl->ut = tbl->ti + tbl->ti_len;
  3106. for (i=0 ; i < uniq ; i++) {
  3107. memcpy(tbl->ut + i * blocksize, ti[uniqblock[i]], blocksize*sizeof(tbl_item));
  3108. }
  3109. }
  3110. }
  3111. return smallest;
  3112. }
  /*
   * Value OR-ed into a rule table entry for each rule code, placed above
   * the low 14 bits.  -1 marks a rule code that has no encoding.
   */
  static const int rule2val[8] = {
      [0] = -1,
      [1] = 1 << 14,      /* forward */
      [2] = 2 << 14,      /* position */
      [3] = 3 << 14,      /* forward,position */
      [4] = 0,            /* backward */
      [5] = -1,
      [6] = -1,
      [7] = -1,
  };
  /*
   * Resolve the collation item name s to the value stored in a rule.
   *
   * A non-zero result of final_index_val0(s) is returned unchanged.
   * Otherwise:
   *   "IGNORE"                        -> 0
   *   ".." or a lookup for "RANGE"    -> 0x3fff (if sym starts with '.',
   *                                      final_index_val(sym) is called
   *                                      first to make sure it is known)
   *   "."                             -> 0x3ffe
   * Anything else is reported through error_msg().
   */
  static int final_index_val_x(const char *s, const char *sym)
  {
      int idx = final_index_val0(s);

      if (idx) {
          return idx;
      }
      if (!strcmp(s, "IGNORE")) {
          return 0;
      }
      if (!strcmp(s, "..") || !strcmp(sym, "RANGE")) {
          if (*sym == '.') {
              final_index_val(sym); /* make sure it's known */
          }
          return 0x3fff;
      }
      if (!strcmp(s, ".")) {
          return 0x3ffe;
      }

      error_msg("can't find final index: %s", s);
      return 0;
  }
  3142. /* store rule2val in 2 high bits and collation index in lower.
  3143. * for sort strings, store (offset from base) + max colindex as index.
  3144. */
  3145. static unsigned int add_rule(weighted_item_t *wi)
  3146. {
  3147. weight_t *w = wi->weight;
  3148. int i, j, r, n;
  3149. uint16_t rbuf[MAX_COLLATION_WEIGHTS];
  3150. uint16_t ws_buf[32];
  3151. void *mm;
  3152. char buf[32];
  3153. const char *s;
  3154. const char *e;
  3155. for (i=0 ; i < MAX_COLLATION_WEIGHTS ; i++) {
  3156. rbuf[i] = rule2val[R_FORWARD]; /* set a default to forward-ignore */
  3157. }
  3158. if (base_locale_array[base_locale_len].num_weights < w->num_weights) {
  3159. base_locale_array[base_locale_len].num_weights = w->num_weights;
  3160. }
  3161. for (i=0 ; i < w->num_weights ; i++) {
  3162. assert(rule2val[(int)(w->rule[i])] >= 0);
  3163. assert(w->colitem[i] && *w->colitem[i]);
  3164. if (*w->colitem[i] == '"') { /* string... */
  3165. s = w->colitem[i] + 1;
  3166. assert(*s == '<');
  3167. n = 0;
  3168. do {
  3169. e = s;
  3170. do {
  3171. if (*e == '/') {
  3172. e += 2;
  3173. continue;
  3174. }
  3175. } while (*e++ != '>');
  3176. assert(((size_t)(e-s) < sizeof(buf)));
  3177. memcpy(buf, s, (size_t)(e-s));
  3178. buf[(size_t)(e-s)] = 0;
  3179. r = final_index_val_x(buf, wi->symbol);
  3180. assert(n + 1 < sizeof(ws_buf)/sizeof(ws_buf[0]));
  3181. ws_buf[n++] = r | rule2val[(int)(w->rule[i])];
  3182. s = e;
  3183. } while (*s != '"');
  3184. ws_buf[n++] = 0; /* terminator */
  3185. mm = memmem(weightstr_buffer, weightstr_len*sizeof(weightstr_buffer[0]),
  3186. ws_buf, n*sizeof(ws_buf[0]));
  3187. if (!mm) {
  3188. assert(weightstr_len + n < WEIGHTSTR_LEN);
  3189. memcpy(weightstr_buffer + weightstr_len, ws_buf, n*sizeof(ws_buf[0]));
  3190. mm = weightstr_buffer + weightstr_len;
  3191. weightstr_len += n;
  3192. }
  3193. r = (((uint16_t *)(mm)) - weightstr_buffer)
  3194. + base_locale_array[base_locale_len].max_col_index + 2;
  3195. assert(r < (1 << 14));
  3196. rbuf[i] = r | rule2val[(int)(w->rule[i])];
  3197. } else { /* item */
  3198. r = final_index_val_x(w->colitem[i], wi->symbol);
  3199. rbuf[i] = r | rule2val[(int)(w->rule[i])];
  3200. }
  3201. }
  3202. for (i=0 ; i < ruletable_len ; i += MAX_COLLATION_WEIGHTS) {
  3203. if (!memcmp(ruletable_buffer + i, rbuf, MAX_COLLATION_WEIGHTS*sizeof(ruletable_buffer[0]))) {
  3204. return i/MAX_COLLATION_WEIGHTS;
  3205. }
  3206. }
  3207. memcpy(ruletable_buffer + ruletable_len, rbuf, MAX_COLLATION_WEIGHTS*sizeof(ruletable_buffer[0]));
  3208. ruletable_len += MAX_COLLATION_WEIGHTS;
  3209. return (ruletable_len / MAX_COLLATION_WEIGHTS)-1;
  3210. }
  3211. static unsigned int add_range_rule(range_item_t *ri)
  3212. {
  3213. weight_t *w = ri->weight;
  3214. int i, j, r, n;
  3215. uint16_t rbuf[MAX_COLLATION_WEIGHTS];
  3216. uint16_t ws_buf[32];
  3217. void *mm;
  3218. char buf[32];
  3219. const char *s;
  3220. const char *e;
  3221. for (i=0 ; i < MAX_COLLATION_WEIGHTS ; i++) {
  3222. rbuf[i] = rule2val[R_FORWARD]; /* set a default to forward-ignore */
  3223. }
  3224. if (base_locale_array[base_locale_len].num_weights < w->num_weights) {
  3225. base_locale_array[base_locale_len].num_weights = w->num_weights;
  3226. }
  3227. for (i=0 ; i < w->num_weights ; i++) {
  3228. assert(rule2val[(int)(w->rule[i])] >= 0);
  3229. assert(w->colitem[i] && *w->colitem[i]);
  3230. if (*w->colitem[i] == '"') { /* string... */
  3231. s = w->colitem[i] + 1;
  3232. assert(*s == '<');
  3233. n = 0;
  3234. do {
  3235. e = s;
  3236. do {
  3237. if (*e == '/') {
  3238. e += 2;
  3239. continue;
  3240. }
  3241. } while (*e++ != '>');
  3242. assert(((size_t)(e-s) < sizeof(buf)));
  3243. memcpy(buf, s, (size_t)(e-s));
  3244. buf[(size_t)(e-s)] = 0;
  3245. r = final_index_val_x(buf, "RANGE");
  3246. assert(n + 1 < sizeof(ws_buf)/sizeof(ws_buf[0]));
  3247. ws_buf[n++] = r | rule2val[(int)(w->rule[i])];
  3248. s = e;
  3249. } while (*s != '"');
  3250. ws_buf[n++] = 0; /* terminator */
  3251. mm = memmem(weightstr_buffer, weightstr_len*sizeof(weightstr_buffer[0]),
  3252. ws_buf, n*sizeof(ws_buf[0]));
  3253. if (!mm) {
  3254. assert(weightstr_len + n < WEIGHTSTR_LEN);
  3255. memcpy(weightstr_buffer + weightstr_len, ws_buf, n*sizeof(ws_buf[0]));
  3256. mm = weightstr_buffer + weightstr_len;
  3257. weightstr_len += n;
  3258. }
  3259. r = (((uint16_t *)(mm)) - weightstr_buffer)
  3260. + base_locale_array[base_locale_len].max_col_index + 2;
  3261. assert(r < (1 << 14));
  3262. rbuf[i] = r | rule2val[(int)(w->rule[i])];
  3263. } else { /* item */
  3264. r = final_index_val_x(w->colitem[i], "RANGE");
  3265. rbuf[i] = r | rule2val[(int)(w->rule[i])];
  3266. }
  3267. }
  3268. for (i=0 ; i < ruletable_len ; i += MAX_COLLATION_WEIGHTS) {
  3269. if (!memcmp(ruletable_buffer + i, rbuf, MAX_COLLATION_WEIGHTS*sizeof(ruletable_buffer[0]))) {
  3270. return i/MAX_COLLATION_WEIGHTS;
  3271. }
  3272. }
  3273. memcpy(ruletable_buffer + ruletable_len, rbuf, MAX_COLLATION_WEIGHTS*sizeof(ruletable_buffer[0]));
  3274. ruletable_len += MAX_COLLATION_WEIGHTS;
  3275. return (ruletable_len / MAX_COLLATION_WEIGHTS)-1;
  3276. }
  3277. #define DUMPn(X) fprintf(stderr, "%10d-%-.20s", base_locale_array[n]. X, #X);
  3278. static void dump_base_locale(int n)
  3279. {
  3280. assert(n < base_locale_len);
  3281. fprintf(stderr, "Base Locale: %s\n", base_locale_array[n].name);
  3282. DUMPn(num_weights);
  3283. DUMPn(ii_shift);
  3284. DUMPn(ti_shift);
  3285. DUMPn(ii_len);
  3286. DUMPn(ti_len);
  3287. DUMPn(max_weight);
  3288. fprintf(stderr, "\n");
  3289. DUMPn(num_col_base);
  3290. DUMPn(max_col_index);
  3291. DUMPn(undefined_idx);
  3292. DUMPn(range_low);
  3293. DUMPn(range_count);
  3294. fprintf(stderr, "\n");
  3295. DUMPn(range_base_weight);
  3296. DUMPn(num_starters);
  3297. fprintf(stderr, "\n");
  3298. DUMPn(range_rule_offset);
  3299. DUMPn(wcs2colidt_offset);
  3300. DUMPn(index2weight_offset);
  3301. fprintf(stderr, "\n");
  3302. DUMPn(index2ruleidx_offset);
  3303. DUMPn(multistart_offset);
  3304. fprintf(stderr, "\n");
  3305. }
  3306. #undef DUMPn
  3307. #define DUMPn(X) fprintf(stderr, "%10d-%s", der_locale_array[n]. X, #X);
  3308. static void dump_der_locale(int n)
  3309. {
  3310. assert(n < der_locale_len);
  3311. fprintf(stderr, "Derived Locale: %s (%.12s)",
  3312. der_locale_array[n].name,
  3313. base_locale_array[der_locale_array[n].base_idx].name);
  3314. DUMPn(base_idx);
  3315. DUMPn(undefined_idx);
  3316. DUMPn(overrides_offset);
  3317. DUMPn(multistart_offset);
  3318. fprintf(stderr, "\n");
  3319. }
  /* Running count of table elements written so far. */
  static unsigned long collate_pos;

  /*
   * Write len elements of u to fp as a comma-separated list of hexadecimal
   * initialisers, eight per line.  The list is preceded by a comment line
   * holding the current collate_pos and name.  collate_pos is advanced by
   * len afterwards.
   */
  static void dump_u16_array(FILE *fp, uint16_t *u, int len, const char *name)
  {
      int idx = 0;

      fprintf(fp, "\t/* %8lu %s */\n", collate_pos, name);

      while (idx < len) {
          if ((idx & 7) == 0) {
              fputs("\n\t", fp);      /* start a new row of eight */
          }
          fprintf(fp, " %#06x,", (unsigned int) u[idx]);
          ++idx;
      }
      fputs("\n", fp);

      collate_pos += len;
  }
  3334. #define OUT_U16C(X,N) fprintf(fp,"\t%10d, /* %8lu %s */\n", X, collate_pos++, N);
  3335. static void dump_collate(FILE *fp)
  3336. {
  3337. int n;
  3338. fprintf(fp, "const uint16_t __locale_collate_tbl[] = {\n");
  3339. OUT_U16C(base_locale_len, "numbef of base locales");
  3340. OUT_U16C(der_locale_len, "number of derived locales");
  3341. OUT_U16C(MAX_COLLATION_WEIGHTS, "max collation weights");
  3342. OUT_U16C(index2weight_len, "number of index2{weight|ruleidx} elements");
  3343. OUT_U16C(weightstr_len, "number of weightstr elements");
  3344. OUT_U16C(multistart_len, "number of multistart elements");
  3345. OUT_U16C(override_len, "number of override elements");
  3346. OUT_U16C(ruletable_len, "number of ruletable elements");
  3347. #undef DUMPn
  3348. #define DUMPn(X) fprintf(fp, "\t%10d, /* %8lu %s */\n", base_locale_array[n]. X, collate_pos++, #X);
  3349. for (n=0 ; n < base_locale_len ; n++) {
  3350. unsigned wcs2colidt_offset_low = base_locale_array[n].wcs2colidt_offset & 0xffffU;
  3351. unsigned wcs2colidt_offset_hi = base_locale_array[n].wcs2colidt_offset >> 16;
  3352. fprintf(fp, "\t/* Base Locale %2d: %s */\n", n, base_locale_array[n].name);
  3353. DUMPn(num_weights);
  3354. DUMPn(num_starters);
  3355. DUMPn(ii_shift);
  3356. DUMPn(ti_shift);
  3357. DUMPn(ii_len);
  3358. DUMPn(ti_len);
  3359. DUMPn(max_weight);
  3360. DUMPn(num_col_base);
  3361. DUMPn(max_col_index);
  3362. DUMPn(undefined_idx);
  3363. DUMPn(range_low);
  3364. DUMPn(range_count);
  3365. DUMPn(range_base_weight);
  3366. DUMPn(range_rule_offset);
  3367. DUMPn(index2weight_offset);
  3368. DUMPn(index2ruleidx_offset);
  3369. DUMPn(multistart_offset);
  3370. #undef DUMPn
  3371. #define DUMPn(X) fprintf(fp, "\t%10d, /* %8lu %s */\n", X, collate_pos++, #X);
  3372. DUMPn(wcs2colidt_offset_low);
  3373. DUMPn(wcs2colidt_offset_hi);
  3374. }
  3375. #undef DUMPn
  3376. fprintf(fp, "#define COL_IDX_C %5d\n", 0);
  3377. #define DUMPn(X) fprintf(fp, "\t%10d, /* %8lu %s */\n", der_locale_array[n]. X, collate_pos++, #X);
  3378. for (n=0 ; n < der_locale_len ; n++) {
  3379. fprintf(fp, "#define COL_IDX_%s %5d\n", der_locale_array[n].name, n+1);
  3380. fprintf(fp, "\t/* Derived Locale %4d: %s (%.12s) */\n",
  3381. n, der_locale_array[n].name,
  3382. base_locale_array[der_locale_array[n].base_idx].name);
  3383. DUMPn(base_idx);
  3384. DUMPn(undefined_idx);
  3385. DUMPn(overrides_offset);
  3386. DUMPn(multistart_offset);
  3387. }
  3388. #undef DUMPn
  3389. fprintf(fp, "\n");
  3390. dump_u16_array(fp, index2weight_buffer, index2weight_len, "index2weight");
  3391. dump_u16_array(fp, index2ruleidx_buffer, index2ruleidx_len, "index2ruleidx");
  3392. dump_u16_array(fp, multistart_buffer, multistart_len, "multistart");
  3393. dump_u16_array(fp, override_buffer, override_len, "override");
  3394. dump_u16_array(fp, ruletable_buffer, ruletable_len, "ruletable");
  3395. dump_u16_array(fp, weightstr_buffer, weightstr_len, "weightstr");
  3396. dump_u16_array(fp, wcs2colidt_buffer, wcs2colidt_len, "wcs2colidt");
  3397. fprintf(fp,"}; /* %8lu */\n", collate_pos);
  3398. fprintf(fp,"#define __lc_collate_data_LEN %d\n\n", collate_pos);
  3399. }