gen_collate.c 99 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997
  1. /*
  2. * Usage:
  3. * gen_collate <INPUTDIR> [-o OUTPUTFILE] LOCALE ...
  4. *
  5. * Generate collation data from locales LOCALE.
  6. * Reads all LOCALE from INPUTDIR and writes collation data to OUTPUTFILE.
  7. *
  8. * The output file defaults to "locales_collate.h".
  9. */
  10. /* TODO:
  11. *
  12. * add UNDEFINED at end if not specified
  13. * convert POSITION -> FORWARD,POSITION
  14. *
  15. *
  16. * deal with lowercase in <Uhhhh>
  17. *
  18. * what about reorders that keep the same rule?
  19. *
  20. * remove "unused" collation elements? (probably doesn't save much)
  21. *
  22. * add_rule function ... returns index into rule table after possibly adding custom-indexed rule
  23. * but don't forget about multichar weights... replace with strings of indexes
  24. *
  25. */
  26. #include <stddef.h>
  27. #include <stdio.h>
  28. #include <stdlib.h>
  29. #include <string.h>
  30. #include <stdint.h>
  31. #include <stdarg.h>
  32. #include <limits.h>
  33. #include <ctype.h>
  34. #include <assert.h>
  35. #include <errno.h>
  36. #include <search.h>
  /*
   * One record per base locale.
   * NOTE(review): apart from range_count, the original left every field
   * undocumented; the *_offset members presumably index into the shared
   * output buffers declared in this file -- confirm against the code that
   * fills and dumps these records.
   */
  typedef struct {
      char *name;
      int num_weights;
      int ii_shift;
      int ti_shift;
      int ii_len;
      int ti_len;
      int max_weight;
      int num_col_base;
      int max_col_index;
      int undefined_idx;
      int range_low;
      int range_count;            /* high - low */
      int range_base_weight;
      int num_starters;
      int range_rule_offset;
      int wcs2colidt_offset;
      int index2weight_offset;
      int index2ruleidx_offset;
      int multistart_offset;
  } base_locale_t;

  /* Fixed-capacity table of base locale records and its length counter. */
  #define BASE_LOCALE_LEN 20
  static base_locale_t base_locale_array[BASE_LOCALE_LEN];
  static size_t base_locale_len;
  /*
   * One record per derived locale.
   * NOTE(review): field meanings were not documented in the original;
   * base_idx presumably selects an entry of base_locale_array -- confirm.
   */
  typedef struct {
      char *name;
      int base_idx;
      int undefined_idx;
      int overrides_offset;
      int multistart_offset;
  } der_locale_t;

  /* Fixed-capacity table of derived locale records and its length counter. */
  #define DER_LOCALE_LEN 300
  static der_locale_t der_locale_array[DER_LOCALE_LEN];
  static size_t der_locale_len;
  /*
   * Statically sized uint16_t buffers, each paired with a length counter.
   * All of them have static storage duration and therefore start zeroed.
   */
  #define OVERRIDE_LEN 50000
  static uint16_t override_buffer[OVERRIDE_LEN];
  static size_t override_len;

  #define MULTISTART_LEN 10000
  static uint16_t multistart_buffer[MULTISTART_LEN];
  static size_t multistart_len;

  #define WCS2COLIDT_LEN 200000
  static uint16_t wcs2colidt_buffer[WCS2COLIDT_LEN];
  static size_t wcs2colidt_len;

  /* index2weight and index2ruleidx share the same capacity. */
  #define INDEX2WEIGHT_LEN 200000
  static uint16_t index2weight_buffer[INDEX2WEIGHT_LEN];
  static size_t index2weight_len;
  static uint16_t index2ruleidx_buffer[INDEX2WEIGHT_LEN];
  static size_t index2ruleidx_len;

  #define WEIGHTSTR_LEN 10000
  static uint16_t weightstr_buffer[WEIGHTSTR_LEN];
  static size_t weightstr_len;

  #define RULETABLE_LEN (1L<<16)
  static uint16_t ruletable_buffer[RULETABLE_LEN];
  static size_t ruletable_len;

  /* Size of the per-character lookup table (one slot per 16-bit value). */
  #define RANGE (0x10000UL)

  /* Element type of the lookup tables. */
  typedef uint16_t tbl_item;

  /* Scratch buffer of 16-bit values with its own bookkeeping. */
  static uint16_t u16_buf[10000];
  static int u16_buf_len;
  static int u16_starter;
  96. typedef struct {
  97. uint16_t ii_len;
  98. uint16_t ti_len;
  99. uint16_t ut_len;
  100. unsigned char ii_shift;
  101. unsigned char ti_shift;
  102. tbl_item *ii;
  103. tbl_item *ti;
  104. tbl_item *ut;
  105. } table_data;
  106. static size_t newopt(tbl_item *ut, size_t usize, int shift, table_data *tbl);
  107. #define MAX_COLLATION_WEIGHTS 4
  108. #define MAX_FNO 1
  109. #define MAX_FILES (MAX_FNO + 1)
  110. static FILE *fstack[MAX_FILES];
  111. static char *fname[MAX_FILES];
  112. static int lineno[MAX_FILES];
  113. static int fno = -1;
  114. static tbl_item wcs2index[RANGE];
  115. static char linebuf[1024];
  116. static char *pos;
  117. static char *pos_e = NULL;
  118. static char end_of_token = 0; /* slot to save */
  119. #define IN_ORDER 0x01
  120. #define IN_REORDER 0x02
  121. #define IN_REORDER_SECTIONS 0x04
  122. static int order_state;
  123. static int cur_num_weights; /* number of weights in current use */
  124. static char cur_rule[MAX_COLLATION_WEIGHTS];
  125. static int anonsection = 0;
  /*
   * Doubly linked list node.  next/prev come first so the node can be
   * handed to insque(); data_type holds one of the DT_* constants passed
   * to new_ll_item() and says what `data` points at.
   */
  typedef struct ll_item_struct ll_item_t;

  struct ll_item_struct {
      ll_item_t *next;
      ll_item_t *prev;
      void *data;
      int data_type;
      int idx;
  };

  static ll_item_t *reorder_section_ptr = NULL;

  /* State used while a locale is processed as a superset of its base. */
  static int superset;
  static int superset_order_start_cnt;    /* only support one order for now */
  static int superset_in_sync;
  static ll_item_t *comm_cur_ptr;
  static ll_item_t *comm_prev_ptr;
  /* Per-weight rule flags, stored one byte per weight in rule[] arrays. */
  enum {
      R_FORWARD  = 1 << 0,
      R_POSITION = 1 << 1,
      R_BACKWARD = 1 << 2     /* must be largest in value */
  };
  145. typedef struct {
  146. size_t num_weights;
  147. char rule[MAX_COLLATION_WEIGHTS];
  148. const char *colitem[MAX_COLLATION_WEIGHTS];
  149. } weight_t;
  150. static void *root_weight = NULL;
  151. size_t unique_weights = 0;
  152. typedef struct {
  153. const char *symbol;
  154. weight_t *weight;
  155. } weighted_item_t;
  156. typedef struct {
  157. const char *symbol1;
  158. const char *symbol2;
  159. int length;
  160. weight_t *weight;
  161. } range_item_t;
  162. typedef struct {
  163. const char *name;
  164. ll_item_t *itm_list; /* weighted_item_t list .. circular!!! */
  165. size_t num_items;
  166. size_t num_rules;
  167. char rules[MAX_COLLATION_WEIGHTS];
  168. } section_t;
  169. static section_t *cur_section = NULL;
  170. typedef struct {
  171. const char *symbol;
  172. ll_item_t *node;
  173. } wi_index_t;
  174. typedef struct col_locale_struct col_locale_t;
  175. struct col_locale_struct {
  176. char *name;
  177. void *root_colitem; /* all base and derived, or just derived */
  178. void *root_element;
  179. void *root_scripts;
  180. void *root_wi_index;
  181. void *root_wi_index_reordered;
  182. ll_item_t *section_list;
  183. col_locale_t *base_locale; /* null if this is a base */
  184. void *root_derived_wi;
  185. ll_item_t *derived_list;
  186. void *root_starter_char;
  187. void *root_starter_all;
  188. ll_item_t *undefined_idx;
  189. };
  190. typedef struct {
  191. const char *symbol;
  192. int idx;
  193. } col_index_t;
  194. static void *root_col_locale = NULL;
  195. typedef struct {
  196. const char *keyword;
  197. void (*handler)(void);
  198. } keyword_table_t;
  199. typedef struct {
  200. const char *string;
  201. const char *element; /* NULL if collating symbol */
  202. } colitem_t;
  203. static col_locale_t *cur_base = NULL;
  204. static col_locale_t *cur_derived = NULL;
  205. static col_locale_t *cur_col = NULL;
  206. static void *root_sym = NULL;
  207. static size_t num_sym = 0;
  208. static size_t mem_sym = 0;
  209. static const char *inputdir;
  210. static size_t inputdir_len;
  211. static unsigned verbose = 0;
  212. enum {
  213. VINFO = (1<<0),
  214. VDETAIL = (1<<1),
  215. };
  216. static void error_msg(const char *fmt, ...) __attribute__ ((noreturn, format (printf, 1, 2)));
  217. static void *xmalloc(size_t n);
  218. static char *xsymdup(const char *s); /* only allocate once... store in a tree */
  219. static void pushfile(char *filename);
  220. static void popfile(void);
  221. static void processfile(void);
  222. static int iscommentchar(int);
  223. static void eatwhitespace(void);
  224. static int next_line(void);
  225. static char *next_token(void);
  226. static void do_unrecognized(void);
  227. static col_locale_t *new_col_locale(char *name);
  228. static ll_item_t *new_ll_item(int data_type, void *data);
  229. static weight_t *register_weight(weight_t *w);
  230. static size_t ll_len(ll_item_t *l);
  231. static size_t ll_count(ll_item_t *l, int mask);
  232. static void add_wi_index(ll_item_t *l);
  233. static size_t tnumnodes(const void *root);
  234. static ll_item_t *find_wi_index(const char *sym, col_locale_t *cl);
  235. static void mark_reordered(const char *sym);
  236. static ll_item_t *find_wi_index_reordered(const char *sym);
  237. static ll_item_t *next_comm_ptr(void);
  238. static ll_item_t *init_comm_ptr(void);
  239. static ll_item_t *find_ll_last(ll_item_t *p);
  240. static void dump_weights(const char *name);
  241. static void finalize_base(void);
  242. static int is_ucode(const char *s);
  243. static int sym_cmp(const void *n1, const void *n2);
  244. static void do_starter_lists(col_locale_t *cl);
  245. static void dump_base_locale(int n);
  246. static void dump_der_locale(int n);
  247. static void dump_collate(FILE *fp);
  248. enum {
  249. DT_SECTION = 0x01,
  250. DT_WEIGHTED = 0x02,
  251. DT_REORDER = 0x04, /* a section to support reorder_after */
  252. DT_COL_LOCALE = 0x08,
  253. DT_RANGE = 0x10,
  254. };
  255. static int verbose_msg(const unsigned lvl, const char *fmt, ...)
  256. {
  257. va_list arg;
  258. int ret = 0;
  259. if (verbose & lvl) {
  260. va_start(arg, fmt);
  261. ret = vfprintf(stderr, fmt, arg);
  262. va_end(arg);
  263. }
  264. return ret;
  265. }
  266. static section_t *new_section(const char *name)
  267. {
  268. section_t *p;
  269. char buf[128];
  270. p = xmalloc(sizeof(section_t));
  271. if (!name) { /* anonymous section */
  272. name = buf;
  273. snprintf(buf, sizeof(buf), "anon%05d", anonsection);
  274. ++anonsection;
  275. } else if (*name != '<') { /* reorder */
  276. name = buf;
  277. snprintf(buf, sizeof(buf), "%s %05d", cur_col->name, anonsection);
  278. ++anonsection;
  279. }
  280. #warning devel code
  281. /* verbose_msg(VDETAIL, "section %s\n", name); */
  282. p->name = xsymdup(name);
  283. p->itm_list = NULL;
  284. p->num_items = 0;
  285. p->num_rules = 0;
  286. memset(p->rules, 0, MAX_COLLATION_WEIGHTS);
  287. /* cur_num_weights = p->num_rules = 0; */
  288. /* memset(p->rules, 0, MAX_COLLATION_WEIGHTS); */
  289. /* memset(cur_rule, R_FORWARD, 4); */
  290. #warning devel code
  291. if (*p->name == 'a') {
  292. cur_num_weights = p->num_rules = 4;
  293. memset(p->rules, R_FORWARD, 4);
  294. memset(cur_rule, R_FORWARD, 4);
  295. p->rules[3] |= R_POSITION;
  296. cur_rule[3] |= R_POSITION;
  297. }
  298. /* verbose_msg(VDETAIL, "new section %s -- cur_num_weights = %d\n", p->name, cur_num_weights); */
  299. return p;
  300. }
  301. static void do_order_start(void);
  302. static void do_order_end(void);
  303. static void do_reorder_after(void);
  304. static void do_reorder_end(void);
  305. static void do_reorder_sections_after(void);
  306. static void do_reorder_sections_end(void);
  307. static void do_copy(void);
  308. static void do_colsym(void);
  309. static void do_colele(void);
  310. static void do_script(void);
  311. static void do_range(void);
  312. static col_locale_t *new_col_locale(char *name);
  313. static int colitem_cmp(const void *n1, const void *n2);
  314. static int colelement_cmp(const void *n1, const void *n2);
  315. static void del_colitem(colitem_t *p);
  316. static colitem_t *new_colitem(char *item, char *def);
  317. static void add_colitem(char *item, char *def);
  318. static void add_script(const char *s);
  319. static unsigned int add_rule(weighted_item_t *wi);
  320. static unsigned int add_range_rule(range_item_t *ri);
  321. static const keyword_table_t keyword_table[] = {
  322. { "collating-symbol", do_colsym },
  323. { "collating-element", do_colele },
  324. { "script", do_script },
  325. { "copy", do_copy },
  326. { "order_start", do_order_start },
  327. { "order_end", do_order_end },
  328. { "order-end", do_order_end },
  329. { "reorder-after", do_reorder_after },
  330. { "reorder-end", do_reorder_end },
  331. { "reorder-sections-after", do_reorder_sections_after },
  332. { "reorder-sections-end", do_reorder_sections_end },
  333. { "UCLIBC_RANGE", do_range },
  334. { NULL, do_unrecognized }
  335. };
  336. static void do_unrecognized(void)
  337. {
  338. #if 1
  339. error_msg("warning: unrecognized: %s", pos);
  340. #else
  341. /* verbose_msg(VDETAIL, "warning: unrecognized initial keyword \"%s\"\n", pos); */
  342. fprintf(stderr, "warning: unrecognized: %s", pos);
  343. if (end_of_token) {
  344. fprintf(stderr, "%c%s", end_of_token, pos_e+1);
  345. }
  346. fprintf(stderr, "\n");
  347. #endif
  348. }
  349. /* typedef struct { */
  350. /* const char *symbol1; */
  351. /* const char *symbol2; */
  352. /* int length; */
  353. /* weight_t *weight; */
  354. /* } range_item_t; */
  355. static void do_range(void)
  356. {
  357. range_item_t *ri;
  358. weight_t w;
  359. int i;
  360. char *s;
  361. char *s1;
  362. char *s2;
  363. const char **ci;
  364. ll_item_t *lli;
  365. assert(!superset);
  366. assert(order_state == IN_ORDER);
  367. s1 = next_token();
  368. if (!s1) {
  369. error_msg("missing start of range");
  370. }
  371. if (!is_ucode(s1)) {
  372. error_msg("start of range is not a ucode: %s", s1);
  373. }
  374. s1 = xsymdup(s1);
  375. s2 = next_token();
  376. if (!s2) {
  377. error_msg("missing end of range");
  378. }
  379. if (!is_ucode(s2)) {
  380. error_msg("end of range is not a ucode: %s", s2);
  381. }
  382. s2 = xsymdup(s2);
  383. ri = (range_item_t *) xmalloc(sizeof(range_item_t));
  384. ri->symbol1 = s1;
  385. ri->symbol2 = s2;
  386. ri->length = strtoul(s2+2, NULL, 16) - strtoul(s1+2, NULL, 16);
  387. if (ri->length <= 0) {
  388. error_msg("illegal range length %d", ri->length);
  389. }
  390. s = next_token();
  391. w.num_weights = cur_num_weights;
  392. for (i=0 ; i < cur_num_weights ; i++) {
  393. w.rule[i] = cur_rule[i];
  394. }
  395. ci = w.colitem + (i-1);
  396. /* now i == cur_num_weights */
  397. #define STR_DITTO "."
  398. while (s && *s && i) {
  399. --i;
  400. if (*s == ';') {
  401. ci[-i] = xsymdup(STR_DITTO);
  402. if (*++s) {
  403. continue;
  404. }
  405. }
  406. if (*s) {
  407. ci[-i] = xsymdup(s);
  408. }
  409. s = next_token();
  410. if (s) {
  411. if (*s == ';') {
  412. ++s;
  413. } else if (i) {
  414. error_msg("missing seperator");
  415. }
  416. }
  417. }
  418. if (s) {
  419. error_msg("too many weights: %d %d |%s| %d", cur_num_weights, i, s, (int)*s);
  420. }
  421. while (i) { /* missing weights are not an error */
  422. --i;
  423. ci[-i] = xsymdup(STR_DITTO);
  424. }
  425. ri->weight = register_weight(&w);
  426. /* if ((i = is_ucode(t)) != 0) { */
  427. /* assert(!t[i]); */
  428. /* add_colitem(t, NULL); */
  429. /* } */
  430. lli = new_ll_item(DT_RANGE, ri);
  431. if (!cur_section->itm_list) {
  432. /* printf("creating new item list: %s\n", wi->symbol); */
  433. cur_section->itm_list = lli;
  434. lli->prev = lli->next = lli;
  435. ++cur_section->num_items;
  436. } else {
  437. insque(lli, cur_section->itm_list->prev);
  438. /* printf("adding item to list: %d - %s\n", ll_len(cur_section->itm_list), wi->symbol); */
  439. ++cur_section->num_items;
  440. }
  441. /* add_wi_index(lli); */
  442. }
  443. static weighted_item_t *add_weight(char *t)
  444. {
  445. weighted_item_t *wi;
  446. weight_t w;
  447. int i;
  448. char *s;
  449. const char **ci;
  450. t = xsymdup(t);
  451. s = next_token();
  452. w.num_weights = cur_num_weights;
  453. for (i=0 ; i < cur_num_weights ; i++) {
  454. w.rule[i] = cur_rule[i];
  455. }
  456. ci = w.colitem + (i-1);
  457. /* now i == cur_num_weights */
  458. while (s && *s && i) {
  459. --i;
  460. if (*s == ';') {
  461. ci[-i] = xsymdup(STR_DITTO);
  462. if (*++s) {
  463. continue;
  464. }
  465. }
  466. if (*s) {
  467. if (!strcmp(s,t)) {
  468. s = STR_DITTO;
  469. }
  470. ci[-i] = xsymdup(s);
  471. }
  472. s = next_token();
  473. if (s) {
  474. if (*s == ';') {
  475. ++s;
  476. } else if (i) {
  477. error_msg("missing seperator");
  478. }
  479. }
  480. }
  481. if (s) {
  482. error_msg("too many weights: %d %d |%s| %d", cur_num_weights, i, s, (int)*s);
  483. }
  484. while (i) { /* missing weights are not an error */
  485. --i;
  486. ci[-i] = xsymdup(STR_DITTO);
  487. }
  488. wi = xmalloc(sizeof(weighted_item_t));
  489. wi->symbol = t;
  490. wi->weight = register_weight(&w);
  491. if ((i = is_ucode(t)) != 0) {
  492. assert(!t[i]);
  493. add_colitem(t, NULL);
  494. }
  495. return wi;
  496. }
  497. static void add_superset_weight(char *t)
  498. {
  499. ll_item_t *lli;
  500. weighted_item_t *wi;
  501. if (!comm_cur_ptr
  502. || (strcmp(t, ((weighted_item_t *)(comm_cur_ptr->data))->symbol) != 0)
  503. ) { /* now out of sync */
  504. if (superset_in_sync) { /* need a new section */
  505. superset_in_sync = 0;
  506. cur_section = new_section("R");
  507. cur_num_weights = cur_section->num_rules
  508. = ((section_t *)(cur_base->section_list->data))->num_rules;
  509. memcpy(cur_rule,
  510. ((section_t *)(cur_base->section_list->data))->rules,
  511. MAX_COLLATION_WEIGHTS);
  512. memcpy(cur_section->rules,
  513. ((section_t *)(cur_base->section_list->data))->rules,
  514. MAX_COLLATION_WEIGHTS);
  515. insque(new_ll_item(DT_REORDER, cur_section), find_ll_last(cur_col->section_list));
  516. assert(comm_prev_ptr);
  517. lli = new_ll_item(DT_REORDER, cur_section);
  518. lli->prev = lli->next = lli;
  519. insque(lli, comm_prev_ptr);
  520. /* verbose_msg(VDETAIL, " subsection -----------------------\n"); */
  521. }
  522. /* verbose_msg(VDETAIL, " %s %s\n", t, ((weighted_item_t *)(comm_cur_ptr->data))->symbol); */
  523. wi = add_weight(t);
  524. lli = new_ll_item(DT_WEIGHTED, wi);
  525. mark_reordered(wi->symbol);
  526. /* printf("reorder: %s\n", t); */
  527. if (!cur_section->itm_list) {
  528. cur_section->itm_list = lli;
  529. lli->prev = lli->next = lli;
  530. ++cur_section->num_items;
  531. } else {
  532. insque(lli, cur_section->itm_list->prev);
  533. ++cur_section->num_items;
  534. }
  535. add_wi_index(lli);
  536. } else { /* in sync */
  537. superset_in_sync = 1;
  538. next_comm_ptr();
  539. }
  540. }
  541. static void do_weight(char *t)
  542. {
  543. weighted_item_t *wi;
  544. ll_item_t *lli;
  545. if (superset) {
  546. add_superset_weight(t);
  547. return;
  548. }
  549. switch(order_state) {
  550. case 0:
  551. /* fprintf(stdout, "no-order weight: %s\n", t); */
  552. /* break; */
  553. case IN_ORDER:
  554. /* in a section */
  555. /* fprintf(stdout, "weight: %s\n", t); */
  556. wi = add_weight(t);
  557. lli = new_ll_item(DT_WEIGHTED, wi);
  558. if (!cur_section->itm_list) {
  559. /* fprintf(stdout, "creating new item list: %s %s %p\n", wi->symbol, cur_section->name, lli); */
  560. cur_section->itm_list = lli;
  561. lli->prev = lli->next = lli;
  562. ++cur_section->num_items;
  563. } else {
  564. insque(lli, cur_section->itm_list->prev);
  565. /* fprintf(stdout, "adding item to list: %d - %s %p\n", ll_len(cur_section->itm_list), wi->symbol, lli); */
  566. ++cur_section->num_items;
  567. }
  568. add_wi_index(lli);
  569. break;
  570. case IN_REORDER:
  571. /* std rule - but in a block with an insert-after pt */
  572. wi = add_weight(t);
  573. lli = new_ll_item(DT_WEIGHTED, wi);
  574. mark_reordered(wi->symbol);
  575. /* fprintf(stdout, "reorder: %s %s %p\n", t, cur_section->name, lli); */
  576. if (!cur_section->itm_list) {
  577. cur_section->itm_list = lli;
  578. lli->prev = lli->next = lli;
  579. ++cur_section->num_items;
  580. } else {
  581. insque(lli, cur_section->itm_list->prev);
  582. ++cur_section->num_items;
  583. }
  584. add_wi_index(lli);
  585. break;
  586. case IN_REORDER_SECTIONS:
  587. t = xsymdup(t);
  588. if (next_token() != NULL) {
  589. error_msg("trailing text in reorder section item: %s", pos);
  590. }
  591. lli = cur_col->section_list;
  592. do {
  593. if (lli->data_type & DT_SECTION) {
  594. if (!strcmp(((section_t *)(lli->data))->name, t)) {
  595. lli->data_type = DT_REORDER;
  596. lli = new_ll_item(DT_REORDER, (section_t *)(lli->data));
  597. insque(lli, reorder_section_ptr);
  598. reorder_section_ptr = lli;
  599. return;
  600. }
  601. }
  602. lli = lli->next;
  603. } while (lli);
  604. error_msg("reorder_sections_after for non-base item currently not supported: %s", t);
  605. /* fprintf(stdout, "reorder_secitons: %s\n", t); */
  606. break;
  607. default:
  608. error_msg("invalid order_state %d", order_state);
  609. }
  610. }
  611. static int col_locale_cmp(const void *n1, const void *n2)
  612. {
  613. return strcmp(((const col_locale_t *) n1)->name, ((const col_locale_t *) n2)->name);
  614. }
  615. static void processfile(void)
  616. {
  617. char *t;
  618. const keyword_table_t *k;
  619. order_state = 0;
  620. #warning devel code
  621. /* cur_num_weights = 0; */
  622. /* cur_num_weights = 4; */
  623. /* memset(cur_rule, R_FORWARD, 4); */
  624. if (cur_col != cur_base) {
  625. cur_col->base_locale = cur_base;
  626. cur_col->undefined_idx = cur_base->undefined_idx;
  627. if (!cur_base->derived_list) {
  628. cur_base->derived_list = new_ll_item(DT_COL_LOCALE, cur_col);
  629. } else {
  630. insque(new_ll_item(DT_COL_LOCALE, cur_col), find_ll_last(cur_base->derived_list));
  631. }
  632. }
  633. if (tfind(cur_col, &root_col_locale, col_locale_cmp)) {
  634. error_msg("attempt to read locale: %s", cur_col->name);
  635. }
  636. if (!tsearch(cur_col, &root_col_locale, col_locale_cmp)) {
  637. error_msg("OUT OF MEMORY!");
  638. }
  639. if (superset) {
  640. superset_order_start_cnt = 0;
  641. superset_in_sync = 0;
  642. init_comm_ptr();
  643. }
  644. while (next_line()) {
  645. /* printf("%5d:", lineno[fno]); */
  646. /* while ((t = next_token()) != NULL) { */
  647. /* printf(" |%s|", t); */
  648. /* printf("\n"); */
  649. /* } */
  650. t = next_token();
  651. assert(t);
  652. assert(t == pos);
  653. if ((*t == '<') || (!strcmp(t, "UNDEFINED"))) {
  654. do_weight(t);
  655. } else {
  656. for (k = keyword_table ; k->keyword ; k++) {
  657. if (!strcmp(k->keyword, t)) {
  658. break;
  659. }
  660. }
  661. k->handler();
  662. }
  663. }
  664. if (cur_base == cur_col) {
  665. verbose_msg(VDETAIL, "Base: %15s", cur_col->name);
  666. } else {
  667. #if 1
  668. if (!cur_col->undefined_idx) {
  669. #if 0
  670. if (superset) {
  671. if (superset_order_start_cnt == 1) {
  672. --superset_order_start_cnt; /* ugh.. hack this */
  673. }
  674. }
  675. #endif
  676. /* This is an awful hack to get around the problem of unspecified UNDEFINED
  677. * definitions in the supported locales derived from iso14651_t1. */
  678. if (!strcmp(cur_base->name, "iso14651_t1")) {
  679. fprintf(stderr, "Warning: adding UNDEFINED entry for %s\n", cur_col->name);
  680. strcpy(linebuf, "script <UNDEFINED_SECTION>\n");
  681. pos_e = NULL;
  682. pos = linebuf;
  683. t = next_token();
  684. assert(t);
  685. assert(t == pos);
  686. do_script();
  687. strcpy(linebuf, "order_start <UNDEFINED_SECTION>;forward;backward;forward;forward,position\n");
  688. pos_e = NULL;
  689. pos = linebuf;
  690. t = next_token();
  691. assert(t);
  692. assert(t == pos);
  693. do_order_start();
  694. strcpy(linebuf, "UNDEFINED IGNORE;IGNORE;IGNORE\n");
  695. pos_e = NULL;
  696. pos = linebuf;
  697. t = next_token();
  698. assert(t);
  699. assert(t == pos);
  700. do_weight(t);
  701. strcpy(linebuf, "order_end\n");
  702. pos_e = NULL;
  703. pos = linebuf;
  704. t = next_token();
  705. assert(t);
  706. assert(t == pos);
  707. do_order_end();
  708. } else {
  709. error_msg("no definition of UNDEFINED for %s", cur_col->name);
  710. }
  711. }
  712. #endif
  713. verbose_msg(VDETAIL, " Der: %15s", cur_col->name);
  714. }
  715. {
  716. #if 0
  717. ll_item_t *p = cur_col->section_list;
  718. #endif
  719. verbose_msg(VDETAIL, "%6u weights", tnumnodes(cur_col->root_wi_index));
  720. if (cur_base) {
  721. verbose_msg(VDETAIL, " %6u der %6u reor %6u starter - %u new stubs",
  722. tnumnodes(cur_base->root_derived_wi),
  723. tnumnodes(cur_base->root_wi_index_reordered),
  724. tnumnodes(cur_base->root_starter_char),
  725. ll_count(cur_col->section_list, DT_REORDER));
  726. }
  727. verbose_msg(VDETAIL, "\n");
  728. #if 0
  729. while (p) {
  730. assert(((section_t *)(p->data))->num_items ==
  731. ll_len(((section_t *)(p->data))->itm_list));
  732. if (!p->next &&
  733. ((*((section_t *)(p->data))->name == 'a')
  734. && (((section_t *)(p->data))->num_items == 0))
  735. ) {
  736. break;
  737. }
  738. if (!(p->data_type & DT_REORDER)) {
  739. if ((*((section_t *)(p->data))->name != 'a')
  740. || (((section_t *)(p->data))->num_items > 0)
  741. ) {
  742. verbose_msg(VDETAIL,
  743. /* "\t%-15s %zu\n", */
  744. "\t%-15s %6u\n",
  745. ((section_t *)(p->data))->name,
  746. ((section_t *)(p->data))->num_items);
  747. }
  748. }
  749. p = p->next;
  750. }
  751. #endif
  752. }
  753. }
  754. static void print_colnode(const void *ptr, VISIT order, int level)
  755. {
  756. const colitem_t *p = *(const colitem_t **) ptr;
  757. if (order == postorder || order == leaf) {
  758. printf("collating item = \"%s\"", p->string);
  759. if (p->element) {
  760. printf(" is %s", p->element);
  761. }
  762. printf("\n");
  763. }
  764. }
  765. static void print_weight_node(const void *ptr, VISIT order, int level)
  766. {
  767. const weight_t *p = *(const weight_t **) ptr;
  768. int i;
  769. if (order == postorder || order == leaf) {
  770. printf("weight: (%d) ", p->num_weights);
  771. for (i = 0 ; i < p->num_weights ; i++) {
  772. if (p->rule[i] & R_FORWARD) {
  773. printf("F");
  774. }
  775. if (p->rule[i] & R_BACKWARD) {
  776. printf("B");
  777. }
  778. if (p->rule[i] & R_POSITION) {
  779. printf("P");
  780. }
  781. printf(",");
  782. }
  783. for (i = 0 ; i < p->num_weights ; i++) {
  784. printf(" %s", p->colitem[i]);
  785. }
  786. printf("\n");
  787. }
  788. }
  /* One entry of the locale dependency table: a locale name and the index
   * (a BASE_* value) of the base collation definition it is built on. */
  typedef struct {
      const char *der_name;       /* locale name as given on the command line */
      int base_locale;            /* BASE_* index into base_name[] / locale_list[] */
  } deps_t;
  /* Indices of the known base collation definitions.  The order must match
   * base_name[] below; BASE_MAX is the number of bases. */
  enum {
      BASE_iso14651_t1,
      BASE_comm,
      BASE_cs_CZ,
      BASE_ar_SA,
      BASE_th_TH,
      BASE_ja_JP,
      BASE_ko_KR,
      BASE_MAX
  };
  /* File name of each base, indexed by the BASE_* value. */
  static const char *base_name[] = {
      "iso14651_t1",
      "comm",
      "cs_CZ",
      "ar_SA",
      "th_TH",
      "ja_JP",
      "ko_KR"
  };
  /* Per-base circular list heads, set up by init_locale_list(); main() links
   * the requested derived locales of each base in after the head. */
  static ll_item_t *locale_list[BASE_MAX];
  813. static void init_locale_list(void)
  814. {
  815. int i;
  816. for (i=0 ; i < BASE_MAX ; i++) {
  817. locale_list[i] = (ll_item_t *) xmalloc(sizeof(ll_item_t));
  818. locale_list[i]->prev = locale_list[i]->next = locale_list[i];
  819. locale_list[i]->data = (void *) base_name[i];
  820. }
  821. }
  /*
   * Supported locales and the base each one derives from.
   *
   * main() looks names up with bsearch() using dep_cmp() (a strcmp() on
   * der_name), so the entries MUST stay sorted by name in strcmp() order.
   */
  deps_t deps[] = {
      { "af_ZA", BASE_iso14651_t1 },
      { "am_ET", BASE_iso14651_t1 },
      { "ar_AE", BASE_iso14651_t1 },
      { "ar_BH", BASE_iso14651_t1 },
      { "ar_DZ", BASE_iso14651_t1 },
      { "ar_EG", BASE_iso14651_t1 },
      { "ar_IN", BASE_iso14651_t1 },
      { "ar_IQ", BASE_iso14651_t1 },
      { "ar_JO", BASE_iso14651_t1 },
      { "ar_KW", BASE_iso14651_t1 },
      { "ar_LB", BASE_iso14651_t1 },
      { "ar_LY", BASE_iso14651_t1 },
      { "ar_MA", BASE_iso14651_t1 },
      { "ar_OM", BASE_iso14651_t1 },
      { "ar_QA", BASE_iso14651_t1 },
      { "ar_SA", BASE_ar_SA },
      { "ar_SD", BASE_iso14651_t1 },
      { "ar_SY", BASE_iso14651_t1 },
      { "ar_TN", BASE_iso14651_t1 },
      { "ar_YE", BASE_iso14651_t1 },
      { "az_AZ", BASE_iso14651_t1 },
      { "be_BY", BASE_iso14651_t1 },
      { "bg_BG", BASE_iso14651_t1 },
      { "bn_BD", BASE_iso14651_t1 },
      { "bn_IN", BASE_iso14651_t1 },
      { "br_FR", BASE_iso14651_t1 },
      { "bs_BA", BASE_iso14651_t1 },
      { "ca_ES", BASE_comm },
      { "cs_CZ", BASE_cs_CZ },
      { "cy_GB", BASE_iso14651_t1 },
      { "da_DK", BASE_comm },
      { "de_AT", BASE_iso14651_t1 },
      { "de_BE", BASE_iso14651_t1 },
      { "de_CH", BASE_iso14651_t1 },
      { "de_DE", BASE_iso14651_t1 },
      { "de_LU", BASE_iso14651_t1 },
      { "el_GR", BASE_iso14651_t1 },
      { "en_AU", BASE_iso14651_t1 },
      { "en_BW", BASE_iso14651_t1 },
      { "en_CA", BASE_comm },
      { "en_DK", BASE_iso14651_t1 },
      { "en_GB", BASE_iso14651_t1 },
      { "en_HK", BASE_iso14651_t1 },
      { "en_IE", BASE_iso14651_t1 },
      { "en_IN", BASE_iso14651_t1 },
      { "en_NZ", BASE_iso14651_t1 },
      { "en_PH", BASE_iso14651_t1 },
      { "en_SG", BASE_iso14651_t1 },
      { "en_US", BASE_iso14651_t1 },
      { "en_ZA", BASE_iso14651_t1 },
      { "en_ZW", BASE_iso14651_t1 },
      { "eo_EO", BASE_iso14651_t1 },
      { "es_AR", BASE_comm },
      { "es_BO", BASE_comm },
      { "es_CL", BASE_comm },
      { "es_CO", BASE_comm },
      { "es_CR", BASE_comm },
      { "es_DO", BASE_comm },
      { "es_EC", BASE_comm },
      { "es_ES", BASE_comm },
      { "es_GT", BASE_comm },
      { "es_HN", BASE_comm },
      { "es_MX", BASE_comm },
      { "es_NI", BASE_comm },
      { "es_PA", BASE_comm },
      { "es_PE", BASE_comm },
      { "es_PR", BASE_comm },
      { "es_PY", BASE_comm },
      { "es_SV", BASE_comm },
      { "es_US", BASE_comm },
      { "es_UY", BASE_comm },
      { "es_VE", BASE_comm },
      { "et_EE", BASE_comm },
      { "eu_ES", BASE_iso14651_t1 },
      { "fa_IR", BASE_iso14651_t1 },
      { "fi_FI", BASE_comm },
      { "fo_FO", BASE_comm },
      { "fr_BE", BASE_iso14651_t1 },
      { "fr_CA", BASE_comm },
      { "fr_CH", BASE_iso14651_t1 },
      { "fr_FR", BASE_iso14651_t1 },
      { "fr_LU", BASE_iso14651_t1 },
      { "ga_IE", BASE_iso14651_t1 },
      { "gd_GB", BASE_iso14651_t1 },
      { "gl_ES", BASE_comm },
      { "gv_GB", BASE_iso14651_t1 },
      { "he_IL", BASE_iso14651_t1 },
      { "hi_IN", BASE_iso14651_t1 },
      { "hr_HR", BASE_comm },
      { "hu_HU", BASE_iso14651_t1 },
      { "hy_AM", BASE_iso14651_t1 },
      { "id_ID", BASE_iso14651_t1 },
      { "is_IS", BASE_comm },
      { "it_CH", BASE_iso14651_t1 },
      { "it_IT", BASE_iso14651_t1 },
      { "iw_IL", BASE_iso14651_t1 },
      { "ja_JP", BASE_ja_JP },
      { "ka_GE", BASE_iso14651_t1 },
      { "kl_GL", BASE_comm },
      { "ko_KR", BASE_ko_KR },
      { "kw_GB", BASE_iso14651_t1 },
      { "lt_LT", BASE_comm },
      { "lv_LV", BASE_comm },
      { "mi_NZ", BASE_iso14651_t1 },
      { "mk_MK", BASE_iso14651_t1 },
      { "mr_IN", BASE_iso14651_t1 },
      { "ms_MY", BASE_iso14651_t1 },
      { "mt_MT", BASE_iso14651_t1 },
      { "nl_BE", BASE_iso14651_t1 },
      { "nl_NL", BASE_iso14651_t1 },
      { "nn_NO", BASE_iso14651_t1 },
      { "no_NO", BASE_comm },
      { "oc_FR", BASE_iso14651_t1 },
      { "pl_PL", BASE_comm },
      { "pt_BR", BASE_iso14651_t1 },
      { "pt_PT", BASE_iso14651_t1 },
      { "ro_RO", BASE_iso14651_t1 },
      { "ru_RU", BASE_iso14651_t1 },
      { "ru_UA", BASE_iso14651_t1 },
      { "se_NO", BASE_iso14651_t1 },
      { "sk_SK", BASE_cs_CZ },
      { "sl_SI", BASE_comm },
      { "sq_AL", BASE_iso14651_t1 },
      { "sr_YU", BASE_iso14651_t1 },
      { "sv_FI", BASE_comm },
      { "sv_SE", BASE_iso14651_t1 },
      { "ta_IN", BASE_iso14651_t1 },
      { "te_IN", BASE_iso14651_t1 },
      { "tg_TJ", BASE_iso14651_t1 },
      { "th_TH", BASE_th_TH },
      { "ti_ER", BASE_iso14651_t1 },
      { "ti_ET", BASE_iso14651_t1 },
      { "tl_PH", BASE_iso14651_t1 },
      { "tr_TR", BASE_comm },
      { "tt_RU", BASE_iso14651_t1 },
      { "uk_UA", BASE_iso14651_t1 },
      { "ur_PK", BASE_iso14651_t1 },
      { "uz_UZ", BASE_iso14651_t1 },
      { "vi_VN", BASE_iso14651_t1 },
      { "wa_BE", BASE_iso14651_t1 },
      { "yi_US", BASE_iso14651_t1 },
      { "zh_CN", BASE_iso14651_t1 },
      { "zh_HK", BASE_iso14651_t1 },
      { "zh_SG", BASE_iso14651_t1 },
      { "zh_TW", BASE_iso14651_t1 },
  };
  /* Number of command-line locales that resolved to each base (BASE_* index). */
  static int der_count[BASE_MAX];
  /* Argument vector that main() synthesizes and hands to old_main(). */
  static const char *new_args[500];
  /* Number of entries of new_args[] currently in use. */
  static int new_arg_count;
  972. static int dep_cmp(const void *s1, const void *s2)
  973. {
  974. return strcmp( (const char *) s1, ((const deps_t *) s2)->der_name);
  975. }
  976. static int old_main(int argc, char **argv);
  977. int main(int argc, char **argv)
  978. {
  979. const deps_t *p;
  980. ll_item_t *lli;
  981. int i;
  982. int total;
  983. char *output_file = "locale_collate.h";
  984. unsigned verbosity = 0;
  985. if (argc < 3) {
  986. return EXIT_FAILURE;
  987. }
  988. --argc;
  989. inputdir = strdup(*++argv);
  990. inputdir_len = strlen(inputdir);
  991. init_locale_list();
  992. while (--argc) {
  993. ++argv;
  994. if (!strcmp(*argv, "-o")) {
  995. --argc;
  996. if (*++argv == NULL) {
  997. printf("-o <outfile> requires an argument\n");
  998. return EXIT_FAILURE;
  999. }
  1000. output_file = strdup(*argv);
  1001. continue;
  1002. } else if (!strcmp(*argv, "-v")) {
  1003. verbosity++;
  1004. continue;
  1005. }
  1006. p = (const deps_t *) bsearch(*argv, deps, sizeof(deps)/sizeof(deps[0]), sizeof(deps[0]), dep_cmp);
  1007. if (!p) {
  1008. if (!strcmp("C", *argv)) {
  1009. printf("ignoring %s locale\n", *argv);
  1010. continue;
  1011. } else {
  1012. printf("%s not found\n", *argv);
  1013. return EXIT_FAILURE;
  1014. }
  1015. }
  1016. i = p->base_locale;
  1017. ++der_count[i];
  1018. if (!strcmp(base_name[i], *argv)) {
  1019. /* same name as base, so skip after count incremented */
  1020. continue;
  1021. }
  1022. /* add it to the list. the main body will catch duplicates */
  1023. lli = (ll_item_t *) xmalloc(sizeof(ll_item_t));
  1024. lli->prev = lli->next = NULL;
  1025. lli->data = (void *) *argv;
  1026. insque(lli, locale_list[i]);
  1027. }
  1028. total = 0;
  1029. for (i=0 ; i < BASE_MAX ; i++) {
  1030. /* printf("der_count[%2d] = %3d\n", i, der_count[i]); */
  1031. total += der_count[i];
  1032. }
  1033. /* printf("total = %d\n", total); */
  1034. new_args[new_arg_count++] = "dummyprogramname";
  1035. for (i=0 ; i < BASE_MAX ; i++) {
  1036. if (!der_count[i]) {
  1037. continue;
  1038. }
  1039. new_args[new_arg_count++] = (i == BASE_comm) ? "-c" : "-b";
  1040. lli = locale_list[i];
  1041. do {
  1042. new_args[new_arg_count++] = (const char *) (lli->data);
  1043. lli = lli->next;
  1044. } while (lli != locale_list[i]);
  1045. new_args[new_arg_count++] = "-f";
  1046. }
  1047. for (i=0; i < verbosity; i++)
  1048. new_args[new_arg_count++] = "-v";
  1049. new_args[new_arg_count++] = "-o";
  1050. new_args[new_arg_count++] = output_file;
  1051. /*
  1052. for (i=0 ; i < new_arg_count ; i++) {
  1053. printf("%3d: %s\n", i, new_args[i]);
  1054. }
  1055. */
  1056. return old_main(new_arg_count, (char **) new_args);
  1057. }
  1058. /* usage... prog -b basefile derived {derived} -s single {single} */
  1059. static int old_main(int argc, char **argv)
  1060. {
  1061. int next_is_base = 0;
  1062. int next_is_subset = 0;
  1063. char *output_file = NULL;
  1064. superset = 0;
  1065. while (--argc) {
  1066. ++argv;
  1067. if (**argv == '-') {
  1068. if ((*argv)[1] == 'd') {
  1069. dump_weights((*argv) + 2);
  1070. } else if ((*argv)[1] == 'f') { /* dump all weight rules */
  1071. finalize_base();
  1072. } else if ((*argv)[1] == 'R') { /* dump all weight rules */
  1073. twalk(root_weight, print_weight_node);
  1074. } else if (((*argv)[1] == 'c') && !(*argv)[2]) { /* new common subset */
  1075. cur_base = cur_derived = NULL;
  1076. next_is_subset = 1;
  1077. next_is_base = 1;
  1078. superset = 0;
  1079. } else if (((*argv)[1] == 'b') && !(*argv)[2]) { /* new base locale */
  1080. cur_base = cur_derived = NULL;
  1081. next_is_subset = 0;
  1082. next_is_base = 1;
  1083. superset = 0;
  1084. } else if (((*argv)[1] == 's') && !(*argv)[2]) { /* single locales follow */
  1085. cur_base = cur_derived = NULL;
  1086. next_is_subset = 0;
  1087. next_is_base = 2;
  1088. superset = 0;
  1089. } else if (((*argv)[1] == 'o') && !(*argv)[2]) { /* output file */
  1090. --argc;
  1091. output_file = *++argv;
  1092. } else if (((*argv)[1] == 'v') && !(*argv)[2]) { /* verbose */
  1093. ++verbose;
  1094. } else {
  1095. error_msg("unrecognized option %s", *argv);
  1096. }
  1097. continue;
  1098. }
  1099. /* new file */
  1100. new_col_locale(*argv); /* automaticly sets cur_col */
  1101. if (next_is_base) {
  1102. cur_base = cur_col;
  1103. } else {
  1104. cur_derived = cur_col;
  1105. }
  1106. pushfile(*argv);
  1107. /* verbose_msg(VDETAIL, "processing file %s\n", *argv); */
  1108. processfile(); /* this does a popfile */
  1109. /* twalk(cur_col->root_colitem, print_colnode); */
  1110. if (next_is_base == 1) {
  1111. next_is_base = 0;
  1112. }
  1113. if (next_is_subset) {
  1114. next_is_subset = 0;
  1115. superset = 1;
  1116. }
  1117. }
  1118. verbose_msg(VINFO, "success!\n");
  1119. verbose_msg(VINFO,
  1120. /* "num_sym=%zu mem_sym=%zu unique_weights=%zu\n", */
  1121. "num_sym=%u mem_sym=%u unique_weights=%u\n",
  1122. num_sym, mem_sym, unique_weights);
  1123. /* twalk(root_weight, print_weight_node); */
  1124. verbose_msg(VINFO, "num base locales = %d num derived locales = %d\n",
  1125. base_locale_len, der_locale_len);
  1126. verbose_msg(VINFO,
  1127. "override_len = %d multistart_len = %d weightstr_len = %d\n"
  1128. "wcs2colidt_len = %d index2weight_len = %d index2ruleidx_len = %d\n"
  1129. "ruletable_len = %d\n"
  1130. "total size is %d bytes or %d kB\n",
  1131. override_len, multistart_len, weightstr_len,
  1132. wcs2colidt_len, index2weight_len, index2ruleidx_len,
  1133. ruletable_len,
  1134. #warning mult by 2 for rule indecies
  1135. (override_len + multistart_len + weightstr_len
  1136. + wcs2colidt_len + index2weight_len + index2ruleidx_len + ruletable_len) * 2,
  1137. (override_len + multistart_len + weightstr_len
  1138. + wcs2colidt_len + index2weight_len + index2ruleidx_len + ruletable_len + 511) / 512);
  1139. #if 0
  1140. {
  1141. int i;
  1142. for (i=0 ; i < base_locale_len ; i++) {
  1143. dump_base_locale(i);
  1144. }
  1145. for (i=0 ; i < der_locale_len ; i++) {
  1146. dump_der_locale(i);
  1147. }
  1148. }
  1149. #endif
  1150. {
  1151. FILE *fp = fopen(output_file, "w");
  1152. if (!fp) {
  1153. error_msg("cannot open output file '%s'!", output_file);
  1154. }
  1155. dump_collate(fp);
  1156. if (ferror(fp) || fclose(fp)) {
  1157. error_msg("write error or close error for output file!\n");
  1158. }
  1159. }
  1160. return EXIT_SUCCESS;
  1161. }
  1162. static void error_msg(const char *fmt, ...)
  1163. {
  1164. va_list arg;
  1165. fprintf(stderr, "Error: ");
  1166. if (fno >= 0) {
  1167. fprintf(stderr, "file %s (%d): ", fname[fno], lineno[fno]);
  1168. }
  1169. va_start(arg, fmt);
  1170. vfprintf(stderr, fmt, arg);
  1171. va_end(arg);
  1172. fprintf(stderr, "\n");
  1173. exit(EXIT_FAILURE);
  1174. }
  1175. static void pushfile(char *filename)
  1176. {
  1177. char *inputfile;
  1178. size_t inputfile_len;
  1179. if (fno >= MAX_FNO) {
  1180. error_msg("file stack size exceeded");
  1181. }
  1182. inputfile_len = inputdir_len + strlen(filename) + 2;
  1183. inputfile = xmalloc(inputfile_len);
  1184. memset(inputfile, 0, inputfile_len);
  1185. sprintf(inputfile, "%s/%s", inputdir, filename);
  1186. if (!(fstack[++fno] = fopen(inputfile, "r"))) {
  1187. --fno; /* oops */
  1188. error_msg("cannot open file %s: %s", inputfile, strerror(errno));
  1189. }
  1190. fname[fno] = xsymdup(inputfile);
  1191. lineno[fno] = 0;
  1192. }
  1193. static void popfile(void)
  1194. {
  1195. if (fno < 0) {
  1196. error_msg("pop on empty file stack");
  1197. }
  1198. /* free(fname[fno]); */
  1199. fclose(fstack[fno]);
  1200. --fno;
  1201. }
  1202. static void eatwhitespace(void)
  1203. {
  1204. while (isspace(*pos)) {
  1205. ++pos;
  1206. }
  1207. }
  /* Return 1 if C starts a comment ('#' or '%'), 0 otherwise. */
  static int iscommentchar(int c)
  {
      switch (c) {
      case '#':
      case '%':
          return 1;
      default:
          return 0;
      }
  }
  1212. static int next_line(void)
  1213. {
  1214. size_t n;
  1215. char *s = linebuf;
  1216. assert(fno >= 0);
  1217. pos_e = NULL;
  1218. do {
  1219. if (fgets(s, sizeof(linebuf), fstack[fno]) != NULL) {
  1220. ++lineno[fno];
  1221. n = strlen(linebuf);
  1222. if ((n == sizeof(linebuf) - 1) && (linebuf[n-1] != '\n')) {
  1223. /* Either line is too long or last line is very long with
  1224. * no trailing newline. But we'll always treat it as an
  1225. * errro. */
  1226. error_msg("line too long?");
  1227. }
  1228. --n;
  1229. /* Be careful... last line doesn't need a newline. */
  1230. if (linebuf[n] == '\n') {
  1231. linebuf[n--] = 0; /* trim trailing newline */
  1232. }
  1233. pos = linebuf;
  1234. eatwhitespace();
  1235. if (*pos && !iscommentchar(*pos)) { /* not empty or comment line */
  1236. return 1; /* got a line */
  1237. }
  1238. } else { /* eof */
  1239. popfile();
  1240. }
  1241. } while (fno >= 0);
  1242. return 0;
  1243. }
  1244. static char *next_token(void)
  1245. {
  1246. char *p;
  1247. #if 0
  1248. if (pos_e == NULL) {
  1249. return NULL
  1250. pos = pos_e;
  1251. *pos = end_of_token;
  1252. end_of_token = 0;
  1253. }
  1254. #else
  1255. if (pos_e != NULL) {
  1256. pos = pos_e;
  1257. *pos = end_of_token;
  1258. end_of_token = 0;
  1259. }
  1260. #endif
  1261. eatwhitespace();
  1262. p = pos;
  1263. if (!*p || iscommentchar(*p)) { /* end of line or start of comment */
  1264. pos = pos_e = NULL;
  1265. *p = 0; /* treat comment as end of line */
  1266. /* fprintf(stdout, "returning NUL token |%s|\n", pos); */
  1267. return NULL;
  1268. #if 1
  1269. } else if (*p == '<') { /* collating symbol, element, or value */
  1270. while (*++p) {
  1271. if ((*p == '/') && p[1]) {
  1272. ++p;
  1273. continue;
  1274. }
  1275. if (*p == '>') {
  1276. pos_e = ++p;
  1277. end_of_token = *p;
  1278. *p = 0;
  1279. /* fprintf(stdout, "returning col token |%s|\n", pos); */
  1280. return pos;
  1281. }
  1282. }
  1283. } else if (*p == '"') { /* collating element value? */
  1284. while (*++p) {
  1285. if (*p == '"') { /* found the end of the quoted string */
  1286. pos_e = ++p;
  1287. end_of_token = *p;
  1288. *p = 0;
  1289. /* fprintf(stdout, "returning quote token |%s|\n", pos); */
  1290. return pos;
  1291. }
  1292. }
  1293. #endif
  1294. } else { /* some kind of keyword */
  1295. while (*++p) {
  1296. if (isspace(*p) || (*p == ';')) {
  1297. break;
  1298. }
  1299. }
  1300. pos_e = p;
  1301. end_of_token = *p;
  1302. *p = 0;
  1303. /* fprintf(stdout, "returning key token |%s|\n", pos); */
  1304. return pos;
  1305. }
  1306. error_msg("illegal token |%s|", pos);
  1307. }
  /* malloc() wrapper that never returns NULL: an allocation failure is
   * reported through error_msg(), which terminates the program. */
  static void *xmalloc(size_t n)
  {
      void *mem = malloc(n);

      if (mem == NULL) {
          error_msg("OUT OF MEMORY");
      }
      return mem;
  }
  1316. static void do_copy(void)
  1317. {
  1318. char *s;
  1319. char *e;
  1320. if ((s = next_token()) != NULL) {
  1321. e = strchr(s + 1, '"');
  1322. if ((*s == '"') && e && (*e == '"') && !e[1]) {
  1323. if (next_token() != NULL) {
  1324. error_msg("illegal trailing text: %s", pos);
  1325. }
  1326. *e = 0;
  1327. ++s;
  1328. if (cur_base && !strcmp(cur_base->name,s)) {
  1329. /* verbose_msg(VDETAIL, "skipping copy of base file %s\n", s); */
  1330. #warning need to update last in order and position or check
  1331. return;
  1332. }
  1333. /* verbose_msg(VDETAIL, "full copy of %s\n", s); */
  1334. pushfile(s);
  1335. return;
  1336. }
  1337. }
  1338. error_msg("illegal or missing arg for copy: %s", s);
  1339. }
  1340. static void do_colsym(void)
  1341. {
  1342. char *s;
  1343. char *e;
  1344. if ((s = next_token()) != NULL) {
  1345. e = strrchr(s,'>');
  1346. if ((*s == '<') && e && (*e == '>') && !e[1]) {
  1347. if (next_token() != NULL) {
  1348. error_msg("illegal trailing text: %s", pos);
  1349. }
  1350. e[1] = 0; /* cleanup in case next_token stored something */
  1351. add_colitem(s,NULL);
  1352. return;
  1353. }
  1354. }
  1355. error_msg("illegal or missing arg for collating-symbol: %s", s);
  1356. }
  1357. static void do_colele(void)
  1358. {
  1359. char *s;
  1360. char *e;
  1361. char *s1;
  1362. char *e1;
  1363. int n;
  1364. if ((s = next_token()) != NULL) {
  1365. e = strrchr(s,'>');
  1366. if ((*s == '<') && e && (*e == '>') && !e[1]) {
  1367. if (((s1 = next_token()) == NULL)
  1368. || (strcmp(s1,"from") != 0)
  1369. || ((s1 = next_token()) == NULL)
  1370. || (*s1 != '\"')
  1371. ) {
  1372. error_msg("illegal format for collating-element spec");
  1373. }
  1374. e1 = strchr(s1 + 1, '"');
  1375. if ((*s1 != '"') || !e1 || (*e1 != '"') || (e1[1] != 0)) {
  1376. error_msg("illegal definition for collating-element: %s", s1);
  1377. }
  1378. if (next_token() != NULL) {
  1379. error_msg("illegal trailing text: %s", pos);
  1380. }
  1381. e[1] = 0; /* cleanup in case next_token stored something */
  1382. e1[1] = 0;
  1383. add_colitem(s,s1);
  1384. ++s1;
  1385. if (!(n = is_ucode(s1))) {
  1386. error_msg("starting char must be a <U####> code: %s", s1);
  1387. }
  1388. assert(s1[n] == '<');
  1389. s1[n] = 0;
  1390. s = xsymdup(s1);
  1391. if (!(tsearch(s, &cur_base->root_starter_char, sym_cmp))) {
  1392. error_msg("OUT OF MEMORY");
  1393. }
  1394. return;
  1395. }
  1396. }
  1397. error_msg("illegal or missing arg for collating-element: %s", s);
  1398. }
  1399. static ll_item_t *find_section_list_item(const char *name, col_locale_t *loc)
  1400. {
  1401. ll_item_t *p;
  1402. if (!loc) {
  1403. return NULL;
  1404. }
  1405. p = loc->section_list;
  1406. while (p) {
  1407. #warning devel code
  1408. /* if (!((p->data_type == DT_SECTION) || (p->data_type == DT_REORDER))) { */
  1409. /* verbose_msg(VDETAIL, "fsli = %d\n", p->data_type); */
  1410. /* } */
  1411. assert((p->data_type == DT_SECTION) || (p->data_type == DT_REORDER));
  1412. if (!strcmp(name, ((section_t *)(p->data))->name)) {
  1413. break;
  1414. }
  1415. p = p->next;
  1416. }
  1417. return p;
  1418. }
  1419. static ll_item_t *find_ll_last(ll_item_t *p)
  1420. {
  1421. assert(p);
  1422. while (p->next) {
  1423. p = p->next;
  1424. }
  1425. return p;
  1426. }
  1427. static void do_script(void)
  1428. {
  1429. char *s;
  1430. char *e;
  1431. if ((s = next_token()) != NULL) {
  1432. e = strrchr(s,'>');
  1433. if ((*s == '<') && e && (*e == '>') && !e[1]) {
  1434. if (next_token() != NULL) {
  1435. error_msg("illegal trailing text: %s", pos);
  1436. }
  1437. e[1] = 0; /* cleanup in case next_token stored something */
  1438. add_script(s);
  1439. return;
  1440. }
  1441. }
  1442. error_msg("illegal or missing arg for script: %s", s);
  1443. }
  1444. static col_locale_t *new_col_locale(char *name)
  1445. {
  1446. ll_item_t *lli;
  1447. ll_item_t *lli2;
  1448. cur_col = (col_locale_t *) xmalloc(sizeof(col_locale_t));
  1449. cur_col->name = name;
  1450. cur_col->root_colitem = NULL;
  1451. cur_col->root_element = NULL;
  1452. cur_col->root_scripts = NULL;
  1453. cur_col->base_locale = NULL;
  1454. if (!superset) {
  1455. /* start with an anonymous section */
  1456. cur_section = new_section(NULL);
  1457. cur_col->section_list = new_ll_item(DT_SECTION, cur_section);
  1458. } else {
  1459. /* start with a reorder section */
  1460. cur_section = new_section("R");
  1461. cur_num_weights = cur_section->num_rules
  1462. = ((section_t *)(cur_base->section_list->data))->num_rules;
  1463. memcpy(cur_rule,
  1464. ((section_t *)(cur_base->section_list->data))->rules,
  1465. MAX_COLLATION_WEIGHTS);
  1466. memcpy(cur_section->rules,
  1467. ((section_t *)(cur_base->section_list->data))->rules,
  1468. MAX_COLLATION_WEIGHTS);
  1469. cur_col->section_list = new_ll_item(DT_REORDER, cur_section);
  1470. assert(cur_base->section_list->next == NULL); /* currently only one section allowed */
  1471. lli = ((section_t *)(cur_base->section_list->data))->itm_list;
  1472. assert(lli);
  1473. lli2 = new_ll_item(DT_REORDER, cur_section);
  1474. lli2->prev = lli2->next = lli2;
  1475. insque(lli2, lli->prev);
  1476. ((section_t *)(cur_base->section_list->data))->itm_list = lli2;
  1477. }
  1478. /* cur_col->section_list = NULL; */
  1479. /* add_script(((section_t *)(cur_col->section_list->data))->name); */
  1480. cur_col->root_wi_index = NULL;
  1481. cur_col->root_wi_index_reordered = NULL;
  1482. cur_col->root_derived_wi = NULL;
  1483. cur_col->derived_list = NULL;
  1484. cur_col->root_starter_char = NULL;
  1485. cur_col->root_starter_all = NULL;
  1486. cur_col->undefined_idx = NULL;
  1487. return cur_col;
  1488. }
  1489. static int colitem_cmp(const void *n1, const void *n2)
  1490. {
  1491. return strcmp(((colitem_t *)n1)->string, ((colitem_t *)n2)->string);
  1492. }
  1493. static int colelement_cmp(const void *n1, const void *n2)
  1494. {
  1495. int r;
  1496. r = strcmp(((colitem_t *)n1)->string, ((colitem_t *)n2)->string);
  1497. if (!r) {
  1498. if (((colitem_t *)n1)->element && ((colitem_t *)n2)->element) {
  1499. r = strcmp(((colitem_t *)n1)->element, ((colitem_t *)n2)->element);
  1500. } else if (((colitem_t *)n1)->element == ((colitem_t *)n2)->element) {
  1501. r = 0; /* both null */
  1502. } else {
  1503. r = (((colitem_t *)n1)->element == NULL) ? -1 : 1;
  1504. }
  1505. }
  1506. return r;
  1507. }
  1508. static void del_colitem(colitem_t *p)
  1509. {
  1510. /* free((void *) p->element); */
  1511. /* free((void *) p->string); */
  1512. free(p);
  1513. }
  1514. static colitem_t *new_colitem(char *item, char *def)
  1515. {
  1516. colitem_t *p;
  1517. p = xmalloc(sizeof(colitem_t));
  1518. p->string = xsymdup(item);
  1519. p->element = (!def) ? def : xsymdup(def);
  1520. return p;
  1521. }
  1522. static void add_colitem(char *item, char *def)
  1523. {
  1524. colitem_t *p;
  1525. #if 0
  1526. printf("adding collation item %s", item);
  1527. if (def) {
  1528. printf(" with definition %s", def);
  1529. }
  1530. printf("\n");
  1531. #endif
  1532. p = new_colitem(item, def);
  1533. #warning devel code
  1534. if (superset) {
  1535. if (tfind(p, &cur_base->root_colitem, colitem_cmp)) {
  1536. /* verbose_msg(VDETAIL, "skipping superset duplicate collating item \"%s\"\n", p->string); */
  1537. del_colitem(p);
  1538. return;
  1539. /* } else { */
  1540. /* verbose_msg(VDETAIL, "superset: new collating item \"%s\" = %s\n", p->string, p->element); */
  1541. }
  1542. }
  1543. if (cur_col == cur_derived) {
  1544. if (!tfind(p, &cur_base->root_colitem, colitem_cmp)) {
  1545. /* not in current but could be in base */
  1546. if (!tsearch(p, &cur_base->root_colitem, colitem_cmp)) {
  1547. error_msg("OUT OF MEMORY!");
  1548. }
  1549. } else if (!tfind(p, &cur_base->root_colitem, colelement_cmp)) {
  1550. error_msg("collating element/symbol mismatch: item=%s def=%s", item, def);
  1551. }
  1552. }
  1553. if (!tfind(p, &cur_col->root_colitem, colitem_cmp)) {
  1554. /* not in current but could be in base */
  1555. if (!tsearch(p, &cur_col->root_colitem, colitem_cmp)) {
  1556. error_msg("OUT OF MEMORY!");
  1557. }
  1558. } else if (!tfind(p, &cur_col->root_colitem, colelement_cmp)) {
  1559. error_msg("collating element/symbol mismatch");
  1560. } else { /* already there */
  1561. fprintf(stderr, "duplicate collating item \"%s\"\n", p->string);
  1562. del_colitem(p);
  1563. }
  1564. }
  1565. /* add a script (section) to the current locale */
  1566. static void add_script(const char *s)
  1567. {
  1568. ll_item_t *l;
  1569. /* make sure it isn't in base if working with derived */
  1570. if (cur_base != cur_col) {
  1571. if (find_section_list_item(s, cur_base)) {
  1572. error_msg("attempt to add script %s for derived when already in base", s);
  1573. }
  1574. }
  1575. if (find_section_list_item(s, cur_col)) {
  1576. error_msg("attempt to readd script %s", s);
  1577. }
  1578. l = find_ll_last(cur_col->section_list);
  1579. insque(new_ll_item(DT_SECTION, new_section(s)), l);
  1580. }
  /* Rule keywords recognised by do_order_start(). */
  static const char str_forward[] = "forward";
  static const char str_backward[] = "backward";
  static const char str_position[] = "position";

  /*
   * Handle an order_start directive.
   *
   * Marks the parser as being inside an order block, selects the section the
   * following entries belong to (a named "<...>" section that must already
   * exist in the current locale and have no rules yet, or an anonymous
   * section), and then parses the weight rules from the remaining tokens
   * into sect->rules / sect->num_rules.  On exit cur_section,
   * cur_num_weights and cur_rule describe that section.
   *
   * In superset mode only the occurrence counter is bumped (more than one
   * order_start is an error) and nothing else is done.
   *
   * NOTE(review): several paths dereference pointers right after calling
   * error_msg(), so error_msg() is presumably non-returning -- confirm.
   */
  static void do_order_start(void)
  {
      const char *s;
      char *e;
      ll_item_t *l;
      section_t *sect;
      int rule;

      /* any state bit other than IN_ORDER means a reorder block is open */
      if (order_state & ~IN_ORDER) {
          error_msg("order_start following reorder{_sections}_after");
      }
      order_state |= IN_ORDER;

      if (superset) {
          if (++superset_order_start_cnt > 1) {
              error_msg("currently only a common order_start is supported in superset");
          }
          return;
      }

      if (!(s = next_token())) {
          s = str_forward;        /* if no args */
      }

      if (*s == '<') {            /* section (script) */
          /* the token must be exactly "<...>" with nothing after the '>' */
          e = strrchr(s,'>');
          if ((*s == '<') && e && (*e == '>') && !e[1]) {
              e[1] = 0;           /* cleanup in case next_token stored something */

              if (!(l = find_section_list_item(s, cur_col))) {
                  error_msg("ref of undefined sections: %s", s);
              }
              sect = (section_t *)(l->data);
              if (sect->num_rules) {
                  error_msg("sections already defined: %s", s);
              }
          } else {
              error_msg("illegal section ref: %s", s);
          }

          /* whatever follows the section name must start with ';' */
          if (!(s = next_token())) {
              s = str_forward;    /* if no args */
          } else if (*s != ';') {
              error_msg("missing seperator!");
          }
      } else {                    /* need an anonymous section */
          if ((*cur_section->name != '<') && (cur_section->num_items == 0)) { /* already in an empty anonymous section */
              sect = cur_section;
              /* fprintf(stdout, "using empty anon section %s\n", sect->name); */
          } else {
              /* append a fresh anonymous section to the current locale */
              sect = new_section(NULL);
              l = find_ll_last(cur_col->section_list);
              insque(new_ll_item(DT_SECTION, sect), l);
              /* fprintf(stdout, "adding order section after section %s\n", ((section_t *)(l->data))->name); */
              /* fprintf(stdout, " last section is %s\n", ((section_t *)(l->next->data))->name); */
          }
          sect->num_rules = 0;    /* setting this below so nix default */
      }
      cur_section = sect;
      /* fprintf(stdout, "cur_section now %s\n", cur_section->name); */

  #warning need to add section to weight list?

      /* now do rules */
      /*
       * Each token is scanned for the keywords forward/backward/position,
       * which may be combined with ','.  The accumulated flags are stored
       * as one rule when the token ends or a ';' is seen.
       *
       * NOTE(review): when a ';' is met inside a token, s is not advanced
       * before the 'continue', so the next iteration sees the same ';' with
       * rule == 0 and reports "missing weight rule!".  Presumably
       * next_token() never returns a token with an embedded ';' (a leading
       * one is skipped just below) -- verify against next_token().
       */
      do {
          rule = 0;
          if (*s == ';') {
              ++s;
          }
          while (*s) {
              if (!strncmp(str_forward, s, 7)) {
                  rule |= R_FORWARD;
                  s += 7;
              } else if (!strncmp(str_backward, s, 8)) {
                  rule |= R_BACKWARD;
                  s += 8;
              } else if (!strncmp(str_position, s, 8)) {
                  rule |= R_POSITION;
                  s += 8;
              }

              if (*s == ',') {
                  ++s;
                  continue;
              }

              if (!*s || (*s == ';')) {
                  if (sect->num_rules >= MAX_COLLATION_WEIGHTS) {
                      error_msg("more than %d weight rules!", MAX_COLLATION_WEIGHTS);
                  }
                  if (!rule) {
                      error_msg("missing weight rule!");
                  }
                  /* a flag combination whose value exceeds R_BACKWARD is rejected */
                  if ((rule & (R_FORWARD|R_BACKWARD|R_POSITION)) > R_BACKWARD) {
                      error_msg("backward paired with forward and/or position!");
                  }
                  sect->rules[sect->num_rules++] = rule;
                  rule = 0;
                  continue;
              }

              /* neither a keyword, a ',', a ';' nor the end of the token */
              error_msg("illegal weight rule: %s", s);
          }
      } while ((s = next_token()) != NULL);

      /* publish the section and its rules as the current parsing context */
      cur_section = sect;
      /* verbose_msg(VDETAIL, "setting cur_num_weights to %d for %s\n", sect->num_rules, sect->name); */
      cur_num_weights = sect->num_rules;
      memcpy(cur_rule, sect->rules, MAX_COLLATION_WEIGHTS);
  }
  1682. static void do_order_end(void)
  1683. {
  1684. if (!(order_state & IN_ORDER)) {
  1685. error_msg("order_end with no matching order_start");
  1686. }
  1687. order_state &= ~IN_ORDER;
  1688. cur_section = new_section(NULL);
  1689. }
  1690. static void do_reorder_after(void)
  1691. {
  1692. char *t;
  1693. ll_item_t *lli;
  1694. const weight_t *w;
  1695. int save_cur_num_weights;
  1696. char save_cur_rule[MAX_COLLATION_WEIGHTS];
  1697. if (order_state & ~IN_REORDER) {
  1698. error_msg("reorder_after following order_start or reorder_sections_after");
  1699. }
  1700. order_state |= IN_REORDER;
  1701. if (superset) {
  1702. error_msg("currently reorder_after is not supported in supersets");
  1703. }
  1704. #warning have to use rule for current section!!!
  1705. if (!(t = next_token())) {
  1706. error_msg("missing arg for reorder_after");
  1707. }
  1708. t = xsymdup(t);
  1709. if (next_token() != NULL) {
  1710. error_msg("trailing text reorder_after: %s", pos);
  1711. }
  1712. if (cur_col == cur_base) {
  1713. error_msg("sorry.. reorder_after in base locale is not currently supported");
  1714. }
  1715. if (!(lli = find_wi_index(t, cur_base))) {
  1716. error_msg("reorder_after for non-base item currently not supported: %s", t);
  1717. }
  1718. w = ((weighted_item_t *)(lli->data))->weight;
  1719. save_cur_num_weights = cur_num_weights;
  1720. memcpy(save_cur_rule, cur_rule, MAX_COLLATION_WEIGHTS);
  1721. cur_section = new_section("R");
  1722. insque(new_ll_item(DT_REORDER, cur_section), lli);
  1723. #if 0
  1724. {
  1725. ll_item_t *l1;
  1726. ll_item_t *l2;
  1727. ll_item_t *l3;
  1728. l1 = new_ll_item(DT_REORDER, cur_section);
  1729. l2 = find_ll_last(cur_col->section_list);
  1730. insque(l1, l2);
  1731. l3 = find_ll_last(cur_col->section_list);
  1732. verbose_msg(VDETAIL, "reorder_after %p %p %p %s\n", l1, l2, l3, cur_section->name);
  1733. }
  1734. #else
  1735. insque(new_ll_item(DT_REORDER, cur_section), find_ll_last(cur_col->section_list));
  1736. #endif
  1737. cur_num_weights = cur_section->num_rules = save_cur_num_weights;
  1738. memcpy(cur_rule, save_cur_rule, MAX_COLLATION_WEIGHTS);
  1739. memcpy(cur_section->rules, save_cur_rule, MAX_COLLATION_WEIGHTS);
  1740. #warning devel code
  1741. /* verbose_msg(VDETAIL, "reorder -- %s %d\n", ((weighted_item_t *)(lli->data))->symbol, w->num_weights); */
  1742. #warning hack to get around hu_HU reorder-after problem
  1743. /* if (!w->num_weights) { */
  1744. /* } else { */
  1745. /* cur_num_weights = w->num_weights; */
  1746. /* memcpy(cur_rule, w->rule, MAX_COLLATION_WEIGHTS); */
  1747. /* } */
  1748. /* verbose_msg(VDETAIL, "reorder_after succeeded for %s\n", t); */
  1749. }
  1750. static void do_reorder_end(void)
  1751. {
  1752. if (!(order_state & IN_REORDER)) {
  1753. error_msg("reorder_end with no matching reorder_after");
  1754. }
  1755. order_state &= ~IN_REORDER;
  1756. }
  1757. static void do_reorder_sections_after(void)
  1758. {
  1759. const char *t;
  1760. ll_item_t *lli;
  1761. if (order_state & ~IN_REORDER_SECTIONS) {
  1762. error_msg("reorder_sections_after following order_start or reorder_after");
  1763. }
  1764. order_state |= IN_REORDER_SECTIONS;
  1765. if (superset) {
  1766. error_msg("currently reorder_sections_after is not supported in supersets");
  1767. }
  1768. if (!(t = next_token())) {
  1769. error_msg("missing arg for reorder_sections_after");
  1770. }
  1771. t = xsymdup(t);
  1772. if (next_token() != NULL) {
  1773. error_msg("trailing text reorder_sections_after: %s", pos);
  1774. }
  1775. if (cur_col == cur_base) {
  1776. error_msg("sorry.. reorder_sections_after in base locale is not currently supported");
  1777. }
  1778. lli = cur_base->section_list;
  1779. do {
  1780. /* verbose_msg(VDETAIL, "hmm -- |%s|%d|\n", ((section_t *)(lli->data))->name, lli->data_type); */
  1781. if (lli->data_type & DT_SECTION) {
  1782. /* verbose_msg(VDETAIL, "checking |%s|%s|\n", ((section_t *)(lli->data))->name, t); */
  1783. if (!strcmp(((section_t *)(lli->data))->name, t)) {
  1784. reorder_section_ptr = lli;
  1785. return;
  1786. }
  1787. }
  1788. lli = lli->next;
  1789. } while (lli);
  1790. error_msg("reorder_sections_after for non-base item currently not supported: %s", t);
  1791. }
  1792. static void do_reorder_sections_end(void)
  1793. {
  1794. if (!(order_state & IN_REORDER_SECTIONS)) {
  1795. error_msg("reorder_sections_end with no matching reorder_sections_after");
  1796. }
  1797. order_state &= ~IN_REORDER_SECTIONS;
  1798. reorder_section_ptr = NULL;
  1799. }
  1800. static ll_item_t *new_ll_item(int data_type, void *data)
  1801. {
  1802. ll_item_t *p;
  1803. p = xmalloc(sizeof(ll_item_t));
  1804. p->next = p->prev = NULL;
  1805. p->data_type = data_type;
  1806. p->data = data;
  1807. p->idx = INT_MIN;
  1808. return p;
  1809. }
  /*
   * Comparison callback for the symbol tree: both arguments are C strings
   * and are compared with strcmp().
   */
  static int sym_cmp(const void *n1, const void *n2)
  {
      const char *lhs = (const char *) n1;
      const char *rhs = (const char *) n2;

      return strcmp(lhs, rhs);
  }
  1815. static char *xsymdup(const char *s)
  1816. {
  1817. void *p;
  1818. if (!(p = tfind(s, &root_sym, sym_cmp))) { /* not a currently known symbol */
  1819. if (!(s = strdup(s)) || !(p = tsearch(s, &root_sym, sym_cmp))) {
  1820. error_msg("OUT OF MEMORY!");
  1821. }
  1822. ++num_sym;
  1823. mem_sym += strlen(s) + 1;
  1824. /* verbose_msg(VDETAIL, "xsymdup: alloc |%s| %p |%s| %p\n", *(char **)p, p, s, s); */
  1825. /* } else { */
  1826. /* verbose_msg(VDETAIL, "xsymdup: found |%s| %p\n", *(char **)p, p); */
  1827. }
  1828. return *(char **) p;
  1829. }
  1830. static int weight_cmp(const void *n1, const void *n2)
  1831. {
  1832. const weight_t *w1 = (const weight_t *) n1;
  1833. const weight_t *w2 = (const weight_t *) n2;
  1834. int i, r;
  1835. if (w1->num_weights != w2->num_weights) {
  1836. return w1->num_weights - w2->num_weights;
  1837. }
  1838. for (i=0 ; i < w1->num_weights ; i++) {
  1839. if (w1->rule[i] != w2->rule[i]) {
  1840. return w1->rule[i] - w2->rule[i];
  1841. }
  1842. if ((r = strcmp(w1->colitem[i], w2->colitem[i])) != 0) {
  1843. return r;
  1844. }
  1845. }
  1846. return 0;
  1847. }
  1848. static weight_t *register_weight(weight_t *w)
  1849. {
  1850. void *p;
  1851. if (!(p = tfind(w, &root_weight, weight_cmp))) { /* new weight */
  1852. p = xmalloc(sizeof(weight_t));
  1853. memcpy(p, w, sizeof(weight_t));
  1854. if (!(p = tsearch(p, &root_weight, weight_cmp))) {
  1855. error_msg("OUT OF MEMORY!");
  1856. }
  1857. ++unique_weights;
  1858. /* } else { */
  1859. /* verbose_msg(VDETAIL, "rw: found\n"); */
  1860. }
  1861. return *(weight_t **)p;
  1862. }
  1863. static size_t ll_len(ll_item_t *l)
  1864. {
  1865. size_t n = 0;
  1866. ll_item_t *p = l;
  1867. while (p) {
  1868. ++n;
  1869. p = p->next;
  1870. if (p == l) { /* work for circular too */
  1871. break;
  1872. }
  1873. }
  1874. return n;
  1875. }
  1876. static size_t ll_count(ll_item_t *l, int mask)
  1877. {
  1878. size_t n = 0;
  1879. ll_item_t *p = l;
  1880. while (p) {
  1881. if (p->data_type & mask) {
  1882. ++n;
  1883. }
  1884. p = p->next;
  1885. if (p == l) { /* work for circular too */
  1886. break;
  1887. }
  1888. }
  1889. return n;
  1890. }
  1891. static int wi_index_cmp(const void *n1, const void *n2)
  1892. {
  1893. const char *s1 = ((weighted_item_t *)(((ll_item_t *) n1)->data))->symbol;
  1894. const char *s2 = ((weighted_item_t *)(((ll_item_t *) n2)->data))->symbol;
  1895. return strcmp(s1, s2);
  1896. }
  1897. static void add_wi_index(ll_item_t *l)
  1898. {
  1899. assert(l->data_type == DT_WEIGHTED);
  1900. if (!strcmp(((weighted_item_t *)(l->data))->symbol, "UNDEFINED")) {
  1901. cur_col->undefined_idx = l;
  1902. }
  1903. if (!tfind(l, &cur_col->root_wi_index, wi_index_cmp)) { /* new wi_index */
  1904. if (!tsearch(l, &cur_col->root_wi_index, wi_index_cmp)) {
  1905. error_msg("OUT OF MEMORY!");
  1906. }
  1907. }
  1908. if (cur_base != cur_col) {
  1909. if (!tfind(l, &cur_base->root_wi_index, wi_index_cmp)) {/* not a base val */
  1910. /* printf("derived: %s\n", ((weighted_item_t *)(l->data))->symbol); */
  1911. if (!tfind(l, &cur_base->root_derived_wi, wi_index_cmp)) { /* new derived */
  1912. if (!tsearch(l, &cur_base->root_derived_wi, wi_index_cmp)) {
  1913. error_msg("OUT OF MEMORY!");
  1914. }
  1915. }
  1916. }
  1917. }
  1918. }
  /*
   * Next final collation index to hand out.  add_final_col_index() stores
   * the current value with each newly entered symbol and then increments
   * it; finalize_base() resets it to 1 before its first pass.
   */
  static int final_index;
  /*
   * Test whether 's' begins with a "<Uxxxx>" token, where each x is a
   * hexadecimal digit (either case) and the 'U' is upper case.
   *
   * Returns 7, the length of such a token, on a match and 0 otherwise.
   * The checks short-circuit on the first mismatch, so a string shorter than
   * seven characters is never read past its terminating NUL.
   */
  static int is_ucode(const char *s)
  {
      int i;

      if ((s[0] != '<') || (s[1] != 'U')) {
          return 0;
      }

      for (i = 2 ; i <= 5 ; i++) {
          /* <ctype.h> functions require an unsigned char value (or EOF);
           * a plain char with the high bit set would otherwise be passed
           * as a negative int. */
          if (!isxdigit((unsigned char) s[i])) {
              return 0;
          }
      }

      return (s[6] == '>') ? 7 : 0;
  }
  1935. static void add_final_col_index(const char *s)
  1936. {
  1937. ENTRY e;
  1938. e.key = (char *) s;
  1939. e.data = (void *)(final_index);
  1940. if (!hsearch(e, FIND)) { /* not in the table */
  1941. if (!hsearch(e, ENTER)) {
  1942. error_msg("OUT OF MEMORY! (hsearch)");
  1943. }
  1944. #if 0
  1945. {
  1946. int n;
  1947. void *v;
  1948. colitem_t ci;
  1949. colitem_t *p;
  1950. const char *t;
  1951. if (!strcmp(s, "UNDEFINED")) {
  1952. printf("%6d: %s\n", final_index, s);
  1953. } else {
  1954. assert(*s == '<');
  1955. if ((n = is_ucode(s)) != 0) {
  1956. assert(!s[n]);
  1957. printf("%6d: %s\n", final_index, s);
  1958. } else {
  1959. ci.string = (char *) s;
  1960. ci.element = NULL; /* don't care */
  1961. v = tfind(&ci, &cur_base->root_colitem, colitem_cmp);
  1962. if (!v) {
  1963. verbose_msg(VDETAIL, "%s NOT DEFINED!!!\n", s);
  1964. } else {
  1965. p = *((colitem_t **) v);
  1966. if (p->element != NULL) {
  1967. t = p->element;
  1968. assert(*t == '"');
  1969. ++t;
  1970. n = is_ucode(t);
  1971. assert(n);
  1972. printf("%6d: %.*s | ", final_index, n, t);
  1973. do {
  1974. t += n;
  1975. assert(*t);
  1976. if (*t == '"') {
  1977. assert(!t[1]);
  1978. break;
  1979. }
  1980. n = is_ucode(t);
  1981. assert(n);
  1982. printf("%.*s", n, t);
  1983. } while (1);
  1984. printf(" collating-element %s\n", s);
  1985. } else {
  1986. printf("%6d: %s (collating-symbol)\n", final_index, s);
  1987. }
  1988. }
  1989. }
  1990. }
  1991. }
  1992. #endif
  1993. ++final_index;
  1994. }
  1995. }
  /*
   * Look up the final index stored for symbol 's' in the hsearch() table.
   *
   * Returns the stored index, or 0 if 's' is not in the table.
   *
   * The index lives in the ENTRY's data pointer; it is converted back via
   * size_t so that the pointer is not cast directly to a narrower int.
   */
  static int final_index_val0(const char *s)
  {
      ENTRY key;
      ENTRY *found;

      key.key = (char *) s;
      found = hsearch(key, FIND);
      if (found == NULL) {        /* not in the table */
          return 0;
      }

      return (int)(size_t)(found->data);
  }
  /*
   * Look up the final index stored for symbol 's' in the hsearch() table.
   *
   * Unlike final_index_val0(), a missing symbol is reported via error_msg().
   * NOTE(review): the lookup result is dereferenced after that call, so
   * error_msg() is presumably non-returning -- confirm.
   *
   * The index lives in the ENTRY's data pointer; it is converted back via
   * size_t so that the pointer is not cast directly to a narrower int.
   */
  static int final_index_val(const char *s)
  {
      ENTRY key;
      ENTRY *found;

      key.key = (char *) s;
      found = hsearch(key, FIND);
      if (found == NULL) {        /* not in the table */
          error_msg("can't find final index: %s", s);
      }

      return (int)(size_t)(found->data);
  }
  /* Node counter filled in by count_nodes() during a twalk() traversal. */
  static size_t num_tree_nodes;

  /*
   * twalk() action: bump num_tree_nodes once per tree node.  Only the
   * 'postorder' and 'leaf' visits are counted, so a node is not counted
   * again on its other visits.
   */
  static void count_nodes(const void *ptr, VISIT order, int level)
  {
      switch (order) {
      case postorder:
      case leaf:
          ++num_tree_nodes;
          break;
      default:
          break;
      }
  }

  /*
   * Return the number of nodes in the tsearch() tree whose root is 'root'.
   * Resets and uses the file-level counter num_tree_nodes.
   */
  static size_t tnumnodes(const void *root)
  {
      num_tree_nodes = 0;
      twalk(root, count_nodes);

      return num_tree_nodes;
  }
  2029. static ll_item_t *find_wi_index(const char *sym, col_locale_t *cl)
  2030. {
  2031. weighted_item_t w;
  2032. ll_item_t l;
  2033. void *p;
  2034. w.symbol = sym;
  2035. l.data = &w;
  2036. l.data_type = DT_WEIGHTED;
  2037. p = tfind(&l, &cl->root_wi_index, wi_index_cmp);
  2038. if (p) {
  2039. p = *(ll_item_t **)p;
  2040. }
  2041. return (ll_item_t *) p;
  2042. }
  2043. static void mark_reordered(const char *sym)
  2044. {
  2045. ll_item_t *lli;
  2046. lli = find_wi_index(sym, cur_base);
  2047. if (lli) {
  2048. if (!tsearch(lli, &cur_base->root_wi_index_reordered, wi_index_cmp)) {
  2049. error_msg("OUT OF MEMORY!");
  2050. }
  2051. }
  2052. }
  2053. static ll_item_t *find_wi_index_reordered(const char *sym)
  2054. {
  2055. weighted_item_t w;
  2056. ll_item_t l;
  2057. void *p;
  2058. w.symbol = sym;
  2059. l.data = &w;
  2060. l.data_type = DT_WEIGHTED;
  2061. p = tfind(&l, &cur_base->root_wi_index_reordered, wi_index_cmp);
  2062. if (p) {
  2063. p = *(ll_item_t **)p;
  2064. }
  2065. return (ll_item_t *) p;
  2066. }
  2067. static ll_item_t *init_comm_ptr(void)
  2068. {
  2069. assert(cur_base);
  2070. assert(cur_base->section_list);
  2071. /* at the moment, only support one section in comm */
  2072. assert(cur_base->section_list->next == NULL);
  2073. comm_cur_ptr = ((section_t *)(cur_base->section_list->data))->itm_list;
  2074. while (comm_cur_ptr && (comm_cur_ptr->data_type & DT_REORDER)) {
  2075. comm_cur_ptr = comm_cur_ptr->next;
  2076. }
  2077. #warning devel code
  2078. /* { */
  2079. /* ll_item_t *p = comm_cur_ptr; */
  2080. /* verbose_msg(VDETAIL, "init_comm_ptr\n"); */
  2081. /* while (p != comm_cur_ptr) { */
  2082. /* if (p->data_type & DT_WEIGHTED) { */
  2083. /* verbose_msg(VDETAIL, "%s", ((weighted_item_t *)p)->symbol); */
  2084. /* } */
  2085. /* p = p->next; */
  2086. /* } */
  2087. /* } */
  2088. assert(comm_cur_ptr);
  2089. /* verbose_msg(VDETAIL, "init_comm_ptr -- %s %p %p %p %d\n", */
  2090. /* ((weighted_item_t *)(comm_cur_ptr->data))->symbol, */
  2091. /* comm_cur_ptr, comm_cur_ptr->prev, comm_cur_ptr->next, */
  2092. /* ll_len(comm_cur_ptr)); */
  2093. comm_prev_ptr = NULL;
  2094. return comm_cur_ptr;
  2095. }
  2096. static ll_item_t *next_comm_ptr(void)
  2097. {
  2098. /* at the moment, only support one section in comm */
  2099. assert(cur_base->section_list->next == NULL);
  2100. comm_prev_ptr = comm_cur_ptr;
  2101. while (comm_cur_ptr && ((comm_cur_ptr = comm_cur_ptr->next) != NULL)) {
  2102. if (!(comm_cur_ptr->data_type & DT_REORDER)) {
  2103. break;
  2104. }
  2105. }
  2106. return comm_cur_ptr;
  2107. }
  /*
   * Running count of weighted items visited by dump_section(); reset to 0
   * by dump_weights() before each dump.
   */
  static int dump_count;

  #if 0
  /*
   * Disabled variant of dump_section(): prints one line per weighted item
   * (running count, symbol, number of weights, one F/B/P flag group per
   * rule, then the weight names) and brackets nested sections with
   * SECTION/REORDER ... DONE marker lines.
   */
  static void dump_section(section_t *s, int mask, col_locale_t *der)
  {
      ll_item_t *lli;
      ll_item_t *lli0;
      weighted_item_t *w;
      weight_t *p;
      int i;

      lli0 = lli = s->itm_list;
      if (!lli0) {
          return;
      }

      do {
          /* 'continue' in a do/while jumps to the loop test, so the cursor
           * has to be advanced by hand before skipping an item */
          if (!(lli->data_type & mask)) {
              lli = lli->next;
              continue;
          }
          if (lli->data_type & DT_WEIGHTED) {
              ++dump_count;
              w = (weighted_item_t *)(lli->data);
              p = w->weight;
              printf("%6d: %s (%d) ", dump_count, w->symbol, p->num_weights);
              for (i = 0 ; i < p->num_weights ; i++) {
                  if (p->rule[i] & R_FORWARD) {
                      printf("F");
                  }
                  if (p->rule[i] & R_BACKWARD) {
                      printf("B");
                  }
                  if (p->rule[i] & R_POSITION) {
                      printf("P");
                  }
                  printf(",");
              }
              for (i = 0 ; i < p->num_weights ; i++) {
                  printf(" %s", p->colitem[i]);
              }
              printf("\n");
          } else if (lli->data_type & (DT_SECTION|DT_REORDER)) {
              if (lli->data_type == DT_REORDER) {
                  assert(der);
                  /* skip reorder sections whose name does not start with
                   * the derived locale's name */
                  if (strncmp(((section_t *)(lli->data))->name, der->name, strlen(der->name))) {
                      lli = lli->next;
                      continue;
                  }
              }
              if (lli->data_type & DT_SECTION) {
                  printf("SECTION -----------------\n");
              } else {
                  printf("REORDER -----------------\n");
              }
              dump_section((section_t *)(lli->data), mask, der);
              printf("DONE --------------------\n");
          }
          lli = lli->next;
      } while (lli != lli0);
  }
  #else
  /* Nonzero while dump_section() is recursing into a reorder section. */
  static int in_reorder_section = 0;

  /*
   * Walk the circular item list of section 's' and dump it.
   *
   *   s    - section whose itm_list is walked (an empty list is a no-op)
   *   mask - only items whose data_type shares a bit with mask are visited
   *   der  - derived locale; must be non-NULL when a DT_REORDER item is met
   *
   * In this variant a weighted item only bumps dump_count and, while inside
   * a reorder section, prints the address of its weighted_item_t.  A
   * DT_REORDER item is followed recursively when its section name starts
   * with der->name, and a newline is printed after the recursion.  Meeting
   * an item with the DT_SECTION bit set trips assert(0).
   *
   * The locals 'p' and 'i' are only needed by the compiled-out verbose
   * output.
   */
  static void dump_section(section_t *s, int mask, col_locale_t *der)
  {
      ll_item_t *lli;
      ll_item_t *lli0;
      weighted_item_t *w;
      weight_t *p;
      int i;

      lli0 = lli = s->itm_list;
      if (!lli0) {
          return;
      }

      do {
          /* 'continue' in a do/while jumps to the loop test, so the cursor
           * has to be advanced by hand before skipping an item */
          if (!(lli->data_type & mask)) {
              lli = lli->next;
              continue;
          }
          if (lli->data_type & DT_WEIGHTED) {
              ++dump_count;
              w = (weighted_item_t *)(lli->data);
              p = w->weight;
  #if 1
              if (in_reorder_section) {
                  printf(" %p", w);
              }
  #else
              printf("%6d: %s (%d) ", dump_count, w->symbol, p->num_weights);
              for (i = 0 ; i < p->num_weights ; i++) {
                  if (p->rule[i] & R_FORWARD) {
                      printf("F");
                  }
                  if (p->rule[i] & R_BACKWARD) {
                      printf("B");
                  }
                  if (p->rule[i] & R_POSITION) {
                      printf("P");
                  }
                  printf(",");
              }
              for (i = 0 ; i < p->num_weights ; i++) {
                  printf(" %s", p->colitem[i]);
              }
              printf("\n");
  #endif
          } else if (lli->data_type & (DT_SECTION|DT_REORDER)) {
              if (lli->data_type == DT_REORDER) {
                  assert(der);
                  /* skip reorder sections whose name does not start with
                   * the derived locale's name */
                  if (strncmp(((section_t *)(lli->data))->name, der->name, strlen(der->name))) {
                      lli = lli->next;
                      continue;
                  }
              }
              if (lli->data_type & DT_SECTION) {
                  /* printf("SECTION -----------------\n"); */
                  assert(0);
              } else {
                  /* printf("REORDER -----------------\n"); */
                  in_reorder_section = 1;
              }
              dump_section((section_t *)(lli->data), mask, der);
              /* printf("DONE --------------------\n"); */
              printf("\n");
              in_reorder_section = 0;
          }
          lli = lli->next;
      } while (lli != lli0);
  }
  #endif
  2235. static void dump_weights(const char *name)
  2236. {
  2237. ll_item_t *lli;
  2238. col_locale_t *base;
  2239. col_locale_t *der;
  2240. col_locale_t cl;
  2241. void *p;
  2242. assert(name);
  2243. if (!*name) { /* use last */
  2244. base = cur_base;
  2245. der = cur_derived;
  2246. } else {
  2247. cl.name = (char *) name;
  2248. if (!(p = tfind(&cl, &root_col_locale, col_locale_cmp))) {
  2249. error_msg("unknown locale: %s", name);
  2250. }
  2251. base = *((col_locale_t **) p);
  2252. der = NULL;
  2253. if (base->base_locale) { /* oops... really derived */
  2254. der = base;
  2255. base = der->base_locale;
  2256. }
  2257. }
  2258. dump_count = 0;
  2259. if (base) {
  2260. /* printf("BASE - %s\n", base->name); */
  2261. for (lli = base->section_list ; lli ; lli = lli->next) {
  2262. /* printf("SECTION %s\n", ((section_t *)(lli->data))->name); */
  2263. dump_section((section_t *)(lli->data), ~0, der);
  2264. }
  2265. }
  2266. assert(der != base);
  2267. if (der) {
  2268. /* printf("DERIVED - %s\n", der->name); */
  2269. for (lli = der->section_list ; lli ; lli = lli->next) {
  2270. if (lli->data_type == DT_SECTION) {
  2271. dump_section((section_t *)(lli->data), DT_WEIGHTED, der);
  2272. }
  2273. }
  2274. }
  2275. /* printf("DONE\n"); */
  2276. }
  /*
   * twalk() action: print the string key of each tree node to stderr, once
   * per node (on its 'postorder' or 'leaf' visit).
   */
  static void print_starter_node(const void *ptr, VISIT order, int level)
  {
      if ((order != postorder) && (order != leaf)) {
          return;
      }

      fprintf(stderr, " %s\n", *(const char **) ptr);
  }
  2283. static void finalize_base(void)
  2284. {
  2285. ll_item_t *s;
  2286. ll_item_t *h;
  2287. ll_item_t *lli;
  2288. ll_item_t *h2;
  2289. ll_item_t *l2;
  2290. ll_item_t *cli;
  2291. ll_item_t *rli = NULL;
  2292. weighted_item_t *w;
  2293. weight_t *p;
  2294. int i, n, mr, r, mi;
  2295. col_locale_t *cl;
  2296. void *mm;
  2297. int num_invariant = 0;
  2298. int num_varying = 0;
  2299. int max_weight;
  2300. int index2weight_len_inc = 1;
  2301. assert(cur_base);
  2302. assert(base_locale_len+1 < BASE_LOCALE_LEN);
  2303. base_locale_array[base_locale_len].name = cur_base->name;
  2304. base_locale_array[base_locale_len].num_weights = 1;
  2305. base_locale_array[base_locale_len].index2weight_offset = index2weight_len;
  2306. base_locale_array[base_locale_len].index2ruleidx_offset = index2ruleidx_len;
  2307. if (!strcmp(cur_base->name,"ja_JP") || !strcmp(cur_base->name,"ko_KR")) {
  2308. #warning fix the index2weight check!!
  2309. index2weight_len_inc = 0;
  2310. }
  2311. /* printf("%s -- index2weight_len = %d\n", cur_base->name, index2weight_len); */
  2312. if (!hcreate(30000)) {
  2313. error_msg("OUT OF MEMORY!");
  2314. }
  2315. /* first pass ... set the fixed indexes */
  2316. final_index = i = 1;
  2317. mr = 0;
  2318. for (s = cur_base->section_list ; s ; s = s->next) {
  2319. #if 1
  2320. if (s->data_type & DT_REORDER) { /* a reordered section */
  2321. verbose_msg(VDETAIL, "pass1: reordered section %s - xxx\n", ((section_t *)(s->data))->name);
  2322. lli = ((section_t *)(s->data))->itm_list;
  2323. r = 0;
  2324. if (lli) {
  2325. /* r = ll_len( ((section_t *)(lli->data))->itm_list ); */
  2326. r = ll_len(lli) + 1;
  2327. }
  2328. if (r > mr) {
  2329. mr = r;
  2330. }
  2331. verbose_msg(VDETAIL, "pass1: reordered section %s - %d\n", ((section_t *)(s->data))->name, r);
  2332. continue;
  2333. }
  2334. #endif
  2335. h = lli = ((section_t *)(s->data))->itm_list;
  2336. if (!lli) {
  2337. continue;
  2338. }
  2339. do {
  2340. if (lli->data_type & DT_RANGE) {
  2341. i += mr;
  2342. mr = 0;
  2343. #warning check ko_kR and 9
  2344. /* ++i; */
  2345. lli->idx = i;
  2346. assert(!rli);
  2347. rli = lli;
  2348. verbose_msg(VDETAIL, "range pre = %d after = ", i);
  2349. i += ((range_item_t *)(lli->data))->length + 1;
  2350. #warning check ko_kR and 9
  2351. /* ++i; */
  2352. verbose_msg(VDETAIL, "%d\n", i);
  2353. if (!index2weight_len_inc) { /* ko_KR hack */
  2354. final_index += ((range_item_t *)(lli->data))->length + 1;
  2355. }
  2356. /* add_final_col_index("RANGE"); */
  2357. } else if (lli->data_type & DT_WEIGHTED) {
  2358. i += mr;
  2359. mr = 0;
  2360. w = (weighted_item_t *)(lli->data);
  2361. if (find_wi_index_reordered(w->symbol)) { /* reordered symbol so skip on first pass */
  2362. ++num_varying;
  2363. ++i;
  2364. continue;
  2365. }
  2366. ++num_invariant;
  2367. index2weight_buffer[index2weight_len] = lli->idx = i++;
  2368. index2weight_len += index2weight_len_inc;
  2369. add_final_col_index(w->symbol);
  2370. } else {
  2371. assert(lli->data_type & DT_REORDER);
  2372. r = ll_len( ((section_t *)(lli->data))->itm_list );
  2373. #warning check ko_kR and 9
  2374. if (r > mr) {
  2375. mr = r;
  2376. }
  2377. /* r = 0; */
  2378. }
  2379. } while ((lli = lli->next) != h);
  2380. }
  2381. /* second pass ... set the reordered indexes */
  2382. mi = i + mr;
  2383. mr = i = 0;
  2384. for (s = cur_base->section_list ; s ; s = s->next) {
  2385. h = lli = ((section_t *)(s->data))->itm_list;
  2386. if (!lli) {
  2387. continue;
  2388. }
  2389. do {
  2390. if (lli->data_type & DT_RANGE) {
  2391. i += mr;
  2392. mr = 0;
  2393. i = lli->idx + ((range_item_t *)(lli->data))->length + 1;
  2394. #warning check
  2395. } else if ((lli->data_type & DT_WEIGHTED) && !(s->data_type & DT_REORDER)) {
  2396. i += mr;
  2397. mr = 0;
  2398. w = (weighted_item_t *)(lli->data);
  2399. if (find_wi_index_reordered(w->symbol) /* reordered symbol skipped on first pass */
  2400. #if 0
  2401. || (s->data_type & DT_REORDER) /* or in a reordered section */
  2402. #endif
  2403. ) {
  2404. assert(!(s->data_type & DT_REORDER));
  2405. index2weight_buffer[index2weight_len] = lli->idx = ++i;
  2406. index2weight_len += index2weight_len_inc;
  2407. add_final_col_index(w->symbol);
  2408. /* fprintf(stdout, "%11s: r %6d %6d %s\n", */
  2409. /* cur_base->name, lli->idx, final_index_val(w->symbol), w->symbol); */
  2410. continue;
  2411. }
  2412. i = lli->idx;
  2413. /* fprintf(stdout, "%11s: w %6d %6d %s\n", */
  2414. /* cur_base->name, lli->idx, final_index_val(w->symbol), w->symbol); */
  2415. } else {
  2416. /* verbose_msg(VDETAIL, "section: %s %d %d\n", ((section_t *)(s->data))->name, */
  2417. /* s->data_type, lli->data_type); */
  2418. /* assert(!(s->data_type & DT_REORDER)); */
  2419. /* assert(lli->data_type & DT_REORDER); */
  2420. #if 1
  2421. if (s->data_type & DT_REORDER) {
  2422. h2 = l2 = lli;
  2423. if (!h2) {
  2424. continue;
  2425. }
  2426. } else {
  2427. assert(s->data_type & DT_SECTION);
  2428. h2 = l2 = ((section_t *)(lli->data))->itm_list;
  2429. if (!h2) {
  2430. continue;
  2431. }
  2432. }
  2433. #else
  2434. h2 = l2 = ((section_t *)(lli->data))->itm_list;
  2435. if (!h2) {
  2436. continue;
  2437. }
  2438. #endif
  2439. r = 0;
  2440. do {
  2441. assert(l2->data_type & DT_WEIGHTED);
  2442. ++r;
  2443. l2->idx = i + r;
  2444. /* fprintf(stdout, "%s: R %6d %s\n", */
  2445. /* ((section_t *)(lli->data))->name, l2->idx, ((weighted_item_t *)(l2->data))->symbol); */
  2446. } while ((l2 = l2->next) != h2);
  2447. if (r > mr) {
  2448. mr = r;
  2449. }
  2450. }
  2451. } while ((lli = lli->next) != h);
  2452. }
  2453. /* finally, walk through all derived locales and set non-reordered section items */
  2454. mr = mi;
  2455. for (cli = cur_base->derived_list ; cli ; cli = cli->next) {
  2456. cl = (col_locale_t *)(cli->data);
  2457. /* verbose_msg(VDETAIL, "pass3: %d %s\n", cli->data_type, cl->name); */
  2458. /* fprintf(stdout, "pass3: %d %s\n", cli->data_type, cl->name); */
  2459. assert(cli->data_type == DT_COL_LOCALE);
  2460. i = mi;
  2461. for (s = cl->section_list ; s ; s = s->next) {
  2462. /* if (s->data_type & DT_REORDER) { */
  2463. /* continue; */
  2464. /* } */
  2465. h = lli = ((section_t *)(s->data))->itm_list;
  2466. if (!lli) {
  2467. continue;
  2468. }
  2469. do {
  2470. assert(!(lli->data_type & DT_RANGE));
  2471. if (lli->data_type & DT_WEIGHTED) {
  2472. /* verbose_msg(VDETAIL, " %d %d %s\n", lli->data_type, lli->idx, ((weighted_item_t *)(lli->data))->symbol); */
  2473. add_final_col_index(((weighted_item_t *)(lli->data))->symbol);
  2474. if (s->data_type & DT_REORDER) {
  2475. continue;
  2476. }
  2477. assert(lli->idx == INT_MIN);
  2478. lli->idx = ++i;
  2479. /* fprintf(stdout, "%11s: S %6d %6d %s\n", */
  2480. /* cl->name, lli->idx, */
  2481. /* final_index_val(((weighted_item_t *)(lli->data))->symbol), */
  2482. /* ((weighted_item_t *)(lli->data))->symbol); */
  2483. } else {
  2484. assert(0);
  2485. assert(lli->data_type & DT_SECTION);
  2486. h2 = l2 = ((section_t *)(lli->data))->itm_list;
  2487. if (!h2) {
  2488. continue;
  2489. }
  2490. do {
  2491. assert(l2->data_type & DT_WEIGHTED);
  2492. assert(l2->idx == INT_MIN);
  2493. l2->idx = ++i;
  2494. add_final_col_index(((weighted_item_t *)(l2->data))->symbol);
  2495. } while ((l2 = l2->next) != h2);
  2496. }
  2497. } while ((lli = lli->next) != h);
  2498. }
  2499. if (i > mr) {
  2500. mr = i;
  2501. }
  2502. }
  2503. max_weight = mr;
  2504. assert(num_varying == tnumnodes(cur_base->root_wi_index_reordered));
  2505. /* we can now initialize the wcs2index array */
  2506. {
  2507. ENTRY *p;
  2508. ENTRY e;
  2509. char buf[8];
  2510. static const char xd[] = "0123456789ABCDEF";
  2511. int starter_index = final_index;
  2512. int wcs2index_count = 0;
  2513. strcpy(buf, "<U....>");
  2514. memset(wcs2index, 0, sizeof(wcs2index));
  2515. e.key = (char *) buf;
  2516. for (i=1 ; i <= 0xffff ; i++) {
  2517. buf[5] = xd[ i & 0xf ];
  2518. buf[4] = xd[ (i >> 4) & 0xf ];
  2519. buf[3] = xd[ (i >> 8) & 0xf ];
  2520. buf[2] = xd[ (i >> 12) & 0xf ];
  2521. if ((p = hsearch(e, FIND)) != NULL) {
  2522. ++wcs2index_count;
  2523. if ((tfind(buf, &cur_base->root_starter_char, sym_cmp)) != NULL) {
  2524. wcs2index[i] = ++starter_index;
  2525. /* verbose_msg(VDETAIL, "wcs2index[ %#06x ] = %d (starter)\n", i, wcs2index[i]); */
  2526. } else {
  2527. wcs2index[i] = (int)(p->data);
  2528. /* verbose_msg(VDETAIL, "wcs2index[ %#06x ] = %d\n", i, wcs2index[i]); */
  2529. }
  2530. } else {
  2531. if ((tfind(buf, &cur_base->root_starter_char, sym_cmp)) != NULL) {
  2532. error_msg("marked starter but not in hash: %s", buf);
  2533. }
  2534. }
  2535. }
  2536. /* ---------------------------------------------------------------------- */
  2537. {
  2538. int i, n;
  2539. table_data table;
  2540. size_t t, smallest;
  2541. n = 0;
  2542. smallest = SIZE_MAX;
  2543. table.ii = NULL;
  2544. for (i=0 ; i < 14 ; i++) {
  2545. if ((RANGE >> i) < 4) {
  2546. break;
  2547. }
  2548. t = newopt(wcs2index, RANGE, i, &table);
  2549. if (smallest >= t) {
  2550. n = i;
  2551. smallest = t;
  2552. /* } else { */
  2553. /* break; */
  2554. }
  2555. }
  2556. /* printf("smallest = %u for range %#x (%u)\n", smallest, RANGE, RANGE); */
  2557. assert(smallest != SIZE_MAX);
  2558. if (smallest + wcs2colidt_len >= WCS2COLIDT_LEN) {
  2559. error_msg("WCS2COLIDT_LEN too small");
  2560. }
  2561. base_locale_array[base_locale_len].wcs2colidt_offset = wcs2colidt_len;
  2562. table.ii = wcs2colidt_buffer + wcs2colidt_len;
  2563. t = smallest;
  2564. smallest = SIZE_MAX;
  2565. smallest = newopt(wcs2index, RANGE, n, &table);
  2566. assert(t == smallest);
  2567. wcs2colidt_len += smallest;
  2568. /* verbose_msg(VDETAIL, "smallest = %d wcs2colidt_len = %d\n", smallest, wcs2colidt_len); */
  2569. #if 0
  2570. {
  2571. unsigned int sc, n, i0, i1;
  2572. unsigned int u = 0xe40;
  2573. table_data *tbl = &table;
  2574. #define __LOCALE_DATA_WCctype_TI_MASK ((1 << tbl->ti_shift)-1)
  2575. #define __LOCALE_DATA_WCctype_TI_SHIFT (tbl->ti_shift)
  2576. #define __LOCALE_DATA_WCctype_TI_LEN (tbl->ti_len)
  2577. #define __LOCALE_DATA_WCctype_II_MASK ((1 << tbl->ii_shift)-1)
  2578. #define __LOCALE_DATA_WCctype_II_SHIFT (tbl->ii_shift)
  2579. #define __LOCALE_DATA_WCctype_II_LEN (tbl->ii_len)
  2580. sc = u & __LOCALE_DATA_WCctype_TI_MASK;
  2581. u >>= __LOCALE_DATA_WCctype_TI_SHIFT;
  2582. n = u & __LOCALE_DATA_WCctype_II_MASK;
  2583. u >>= __LOCALE_DATA_WCctype_II_SHIFT;
  2584. i0 = tbl->ii[u];
  2585. verbose_msg(VDETAIL, "i0 = %d\n", i0);
  2586. i0 <<= __LOCALE_DATA_WCctype_II_SHIFT;
  2587. i1 = tbl->ii[__LOCALE_DATA_WCctype_II_LEN + i0 + n];
  2588. /* i1 = tbl->ti[i0 + n]; */
  2589. verbose_msg(VDETAIL, "i1 = %d\n", i1);
  2590. i1 <<= __LOCALE_DATA_WCctype_TI_SHIFT;
  2591. /* return *(uint16_t *)(&(tbl->ii[__LOCALE_DATA_WCctype_II_LEN + __LOCALE_DATA_WCctype_TI_LEN + i1 + sc])); */
  2592. verbose_msg(VDETAIL, "i2 = %d\n", __LOCALE_DATA_WCctype_II_LEN + __LOCALE_DATA_WCctype_TI_LEN + i1 + sc);
  2593. verbose_msg(VDETAIL, "val = %d\n", tbl->ii[__LOCALE_DATA_WCctype_II_LEN + __LOCALE_DATA_WCctype_TI_LEN + i1 + sc]);
  2594. /* return tbl->ut[i1 + sc]; */
  2595. }
  2596. #endif
  2597. base_locale_array[base_locale_len].ii_shift = table.ii_shift;
  2598. base_locale_array[base_locale_len].ti_shift = table.ti_shift;
  2599. base_locale_array[base_locale_len].ii_len = table.ii_len;
  2600. base_locale_array[base_locale_len].ti_len = table.ti_len;
  2601. }
  2602. /* ---------------------------------------------------------------------- */
  2603. base_locale_array[base_locale_len].num_col_base = num_invariant + num_varying;
  2604. base_locale_array[base_locale_len].max_col_index = final_index;
  2605. base_locale_array[base_locale_len].max_weight = max_weight;
  2606. verbose_msg(VDETAIL, "%s: %6u invariant %6u varying %6u derived %6u total %6u max weight %6u wcs2\n",
  2607. cur_base->name, num_invariant, num_varying,
  2608. tnumnodes(cur_base->root_derived_wi), final_index, max_weight,
  2609. wcs2index_count);
  2610. }
  2611. #if 1
  2612. /* ok, now we need to dump out the base and derived tables... */
  2613. /* don't forget to break up collating elements!!! */
  2614. /* fprintf(stdout, "**************************************************\n"); */
  2615. /* first pass ... set the invariants */
  2616. for (s = cur_base->section_list ; s ; s = s->next) {
  2617. #if 1
  2618. if (s->data_type & DT_REORDER) {
  2619. verbose_msg(VDETAIL, "1: skipping reordered section %s\n", ((section_t *)(s->data))->name);
  2620. continue;
  2621. }
  2622. #endif
  2623. h = lli = ((section_t *)(s->data))->itm_list;
  2624. if (!lli) {
  2625. continue;
  2626. }
  2627. do {
  2628. if (lli->data_type & DT_WEIGHTED) {
  2629. w = (weighted_item_t *)(lli->data);
  2630. if (find_wi_index_reordered(w->symbol)) { /* reordered symbol so skip on first pass */
  2631. continue;
  2632. }
  2633. if (index2weight_len_inc) {
  2634. index2ruleidx_buffer[index2ruleidx_len++] =
  2635. add_rule((weighted_item_t *)(lli->data));
  2636. }
  2637. /* fprintf(stdout, "%11s: w %6d %6d %s\n", */
  2638. /* cur_base->name, lli->idx, final_index_val(w->symbol), w->symbol); */
  2639. }
  2640. } while ((lli = lli->next) != h);
  2641. }
  2642. /* second pass ... set varying */
  2643. for (s = cur_base->section_list ; s ; s = s->next) {
  2644. #if 1
  2645. if (s->data_type & DT_REORDER) {
  2646. verbose_msg(VDETAIL, "2: skipping reordered section %s\n", ((section_t *)(s->data))->name);
  2647. continue;
  2648. }
  2649. #endif
  2650. h = lli = ((section_t *)(s->data))->itm_list;
  2651. if (!lli) {
  2652. continue;
  2653. }
  2654. do {
  2655. if (lli->data_type & DT_WEIGHTED) {
  2656. w = (weighted_item_t *)(lli->data);
  2657. if (find_wi_index_reordered(w->symbol)) { /* reordered symbol so skip on first pass */
  2658. if (index2weight_len_inc) {
  2659. index2ruleidx_buffer[index2ruleidx_len++] =
  2660. add_rule((weighted_item_t *)(lli->data));
  2661. }
  2662. /* fprintf(stdout, "%11s: r %6d %6d %s\n", */
  2663. /* cur_base->name, lli->idx, final_index_val(w->symbol), w->symbol); */
  2664. continue;
  2665. }
  2666. }
  2667. } while ((lli = lli->next) != h);
  2668. }
  2669. do_starter_lists(cur_base);
  2670. /* verbose_msg(VDETAIL,"updated final_index = %d\n", final_index); */
  2671. if (rli) {
  2672. base_locale_array[base_locale_len].range_low
  2673. = strtoul(((range_item_t *)(rli->data))->symbol1 + 2, NULL, 16);
  2674. base_locale_array[base_locale_len].range_count
  2675. = ((range_item_t *)(rli->data))->length;
  2676. base_locale_array[base_locale_len].range_base_weight = rli->idx;
  2677. base_locale_array[base_locale_len].range_rule_offset = add_range_rule((range_item_t *)(rli->data));
  2678. /* fprintf(stdout, "%11s: %6d %6d %s %s (%d)\n", */
  2679. /* "RANGE", rli->idx, -1, */
  2680. /* ((range_item_t *)(rli->data))->symbol1, */
  2681. /* ((range_item_t *)(rli->data))->symbol2, */
  2682. /* ((range_item_t *)(rli->data))->length); */
  2683. }
  2684. /* fprintf(stdout,"\nDerived\n\n"); */
  2685. /* first, if base name is of the form ll_CC, add a derived locale for it */
  2686. if ((strlen(cur_base->name) == 5)
  2687. && islower(cur_base->name[0])
  2688. && islower(cur_base->name[1])
  2689. && (cur_base->name[2] == '_')
  2690. && isupper(cur_base->name[3])
  2691. && isupper(cur_base->name[4])
  2692. ) {
  2693. verbose_msg(VDETAIL, "adding special derived for %s\n", cur_base->name);
  2694. /* verbose_msg(VDETAIL,"updated final_index = %d\n", final_index); */
  2695. assert(der_locale_len+1 < DER_LOCALE_LEN);
  2696. der_locale_array[der_locale_len].name = cur_base->name;
  2697. der_locale_array[der_locale_len].base_idx = base_locale_len;
  2698. u16_buf[0] = 1;
  2699. u16_buf[1] = 0;
  2700. u16_buf_len = 2;
  2701. mm = NULL;
  2702. if ((u16_buf_len > override_len) ||
  2703. !(mm = memmem(override_buffer, override_len*sizeof(override_buffer[0]),
  2704. u16_buf, u16_buf_len*sizeof(u16_buf[0])))
  2705. ) {
  2706. assert(override_len + u16_buf_len < OVERRIDE_LEN);
  2707. memcpy(override_buffer + override_len, u16_buf, u16_buf_len*sizeof(u16_buf[0]));
  2708. der_locale_array[der_locale_len].overrides_offset = override_len;
  2709. override_len += u16_buf_len;
  2710. /* printf("%s: override_len = %d u16_buf_len = %d\n", cl->name, override_len, u16_buf_len); */
  2711. } else if (!(u16_buf_len > override_len)) {
  2712. assert(mm);
  2713. der_locale_array[der_locale_len].overrides_offset = ((uint16_t *)(mm)) - override_buffer;
  2714. /* printf("%s: memmem found a match with u16_buf_len = %d\n", cl->name, u16_buf_len); */
  2715. }
  2716. der_locale_array[der_locale_len].multistart_offset
  2717. = base_locale_array[base_locale_len].multistart_offset;
  2718. der_locale_array[der_locale_len].undefined_idx = final_index_val0("UNDEFINED");
  2719. if (!der_locale_array[der_locale_len].undefined_idx) {
  2720. error_msg("no UNDEFINED definition for %s", cur_base->name);
  2721. }
  2722. ++der_locale_len;
  2723. } else {
  2724. verbose_msg(VDETAIL, "NOT adding special derived for %s\n", cur_base->name);
  2725. }
  2726. /* now all the derived... */
  2727. for (cli = cur_base->derived_list ; cli ; cli = cli->next) {
  2728. cl = (col_locale_t *)(cli->data);
  2729. assert(cli->data_type == DT_COL_LOCALE);
  2730. assert(der_locale_len+1 < DER_LOCALE_LEN);
  2731. der_locale_array[der_locale_len].name = cl->name;
  2732. der_locale_array[der_locale_len].base_idx = base_locale_len;
  2733. u16_buf_len = 0;
  2734. for (i = 0 ; i < 2 ; i++) {
  2735. if (i) {
  2736. /* fprintf(stdout, " section --- (singles)\n"); */
  2737. u16_buf[u16_buf_len++] = 1; /* single */
  2738. }
  2739. /* we do this in two passes... first all sequences, then all single reorders */
  2740. for (s = cl->section_list ; s ; s = s->next) {
  2741. /* verbose_msg(VDETAIL, "doing section %s\n", ((section_t *)(s->data))->name); */
  2742. h = lli = ((section_t *)(s->data))->itm_list;
  2743. if (!lli) {
  2744. /* fprintf(stdout, "EMPTY ITEM LIST IN SECTION %s\n", ((section_t *)(s->data))->name ); */
  2745. continue;
  2746. }
  2747. assert(u16_buf_len +4 < sizeof(u16_buf)/sizeof(u16_buf[0]));
  2748. if ((!i && (ll_len(h) > 1) ) || (ll_len(h) == i)) {
  2749. if (!i) {
  2750. /* fprintf(stdout, " section ----------------- %d %d\n", i, ll_len(h)); */
  2751. u16_buf[u16_buf_len++] = ll_len(h); /* multi */
  2752. assert(lli->data_type & DT_WEIGHTED);
  2753. #if 0
  2754. u16_buf[u16_buf_len++] = final_index_val(((weighted_item_t *)(lli->data))->symbol); /* start index */
  2755. #endif
  2756. u16_buf[u16_buf_len++] = lli->idx; /* start weight */
  2757. }
  2758. do {
  2759. assert(lli->data_type & DT_WEIGHTED);
  2760. if (lli->data_type & DT_WEIGHTED) {
  2761. /* fprintf(stdout, "%11s: S %6d %6d %s\n", */
  2762. /* cl->name, lli->idx, */
  2763. /* final_index_val(((weighted_item_t *)(lli->data))->symbol), */
  2764. /* ((weighted_item_t *)(lli->data))->symbol); */
  2765. #if 0
  2766. if (i) {
  2767. assert(u16_buf_len +4 < sizeof(u16_buf)/sizeof(u16_buf[0]));
  2768. u16_buf[u16_buf_len++] = final_index_val(((weighted_item_t *)(lli->data))->symbol);
  2769. assert(u16_buf[u16_buf_len-1]);
  2770. u16_buf[u16_buf_len++] = lli->idx; /* weight */
  2771. }
  2772. #else
  2773. assert(u16_buf_len +4 < sizeof(u16_buf)/sizeof(u16_buf[0]));
  2774. u16_buf[u16_buf_len++] = final_index_val(((weighted_item_t *)(lli->data))->symbol);
  2775. assert(u16_buf[u16_buf_len-1]);
  2776. if (i) {
  2777. u16_buf[u16_buf_len++] = lli->idx; /* weight */
  2778. }
  2779. #endif
  2780. u16_buf[u16_buf_len++] = add_rule((weighted_item_t *)(lli->data));
  2781. }
  2782. } while ((lli = lli->next) != h);
  2783. }
  2784. }
  2785. }
  2786. u16_buf[u16_buf_len++] = 0;
  2787. mm = NULL;
  2788. if ((u16_buf_len > override_len) ||
  2789. !(mm = memmem(override_buffer, override_len*sizeof(override_buffer[0]),
  2790. u16_buf, u16_buf_len*sizeof(u16_buf[0])))
  2791. ) {
  2792. assert(override_len + u16_buf_len < OVERRIDE_LEN);
  2793. memcpy(override_buffer + override_len, u16_buf, u16_buf_len*sizeof(u16_buf[0]));
  2794. der_locale_array[der_locale_len].overrides_offset = override_len;
  2795. override_len += u16_buf_len;
  2796. /* printf("%s: override_len = %d u16_buf_len = %d\n", cl->name, override_len, u16_buf_len); */
  2797. } else if (!(u16_buf_len > override_len)) {
  2798. assert(mm);
  2799. der_locale_array[der_locale_len].overrides_offset = ((uint16_t *)(mm)) - override_buffer;
  2800. /* printf("%s: memmem found a match with u16_buf_len = %d\n", cl->name, u16_buf_len); */
  2801. }
  2802. do_starter_lists(cl);
  2803. der_locale_array[der_locale_len].undefined_idx = final_index_val0("UNDEFINED");
  2804. #if 0
  2805. assert(der_locale_array[der_locale_len].undefined_idx);
  2806. if (!der_locale_array[der_locale_len].undefined_idx) {
  2807. der_locale_array[der_locale_len].undefined_idx = base_locale_array[base_locale_len].undefined_idx;
  2808. }
  2809. #endif
  2810. if (!der_locale_array[der_locale_len].undefined_idx) {
  2811. error_msg("no UNDEFINED definition for %s", cl->name);
  2812. }
  2813. ++der_locale_len;
  2814. }
  2815. #endif
  2816. #warning handle UNDEFINED idx specially? what if in only some of derived?
  2817. /* base_locale_array[base_locale_len].undefined_idx = final_index_val0("UNDEFINED"); */
  2818. base_locale_array[base_locale_len].undefined_idx = 0;
  2819. hdestroy();
  2820. ++base_locale_len;
  2821. /* if (tnumnodes(cur_base->root_starter_char)) { */
  2822. /* verbose_msg(VDETAIL, "starter nodes\n"); */
  2823. /* twalk(cur_base->root_starter_char, print_starter_node); */
  2824. /* } */
  2825. }
  2826. static int starter_all_cmp(const void *n1, const void *n2)
  2827. {
  2828. const char *s1 = ((weighted_item_t *) n1)->symbol;
  2829. const char *s2 = ((weighted_item_t *) n2)->symbol;
  2830. colitem_t x;
  2831. colitem_t *p;
  2832. int n;
  2833. /* sort by 1st char ... then inverse for string */
  2834. x.element = NULL;
  2835. if (!is_ucode(s1)) {
  2836. x.string = s1;
  2837. p = tfind(&x, &cur_base->root_colitem, colitem_cmp);
  2838. s1 = (*((colitem_t **) p))->element + 1;
  2839. }
  2840. if (!is_ucode(s2)) {
  2841. x.string = s2;
  2842. p = tfind(&x, &cur_base->root_colitem, colitem_cmp);
  2843. s2 = (*((colitem_t **) p))->element + 1;
  2844. }
  2845. /* <U####>< */
  2846. /* 01234567 */
  2847. assert(is_ucode(s1));
  2848. assert(is_ucode(s2));
  2849. n = strncmp(s1+2, s2+2, 4);
  2850. if (n) {
  2851. return n;
  2852. }
  2853. s1 += 7;
  2854. s2 += 7;
  2855. return strcmp(s2, s1);
  2856. }
  2857. static void print_starter_all_node(const void *ptr, VISIT order, int level)
  2858. {
  2859. const weighted_item_t *w = *(const weighted_item_t **) ptr;
  2860. colitem_t *ci;
  2861. void *p;
  2862. int n;
  2863. colitem_t x;
  2864. if (order == postorder || order == leaf) {
  2865. #if 0
  2866. if ((n = is_ucode(w->symbol)) != 0) {
  2867. printf(" %s\n", w->symbol);
  2868. } else {
  2869. x.string = w->symbol;
  2870. x.element = NULL;
  2871. p = tfind(&x, &cur_base->root_colitem, colitem_cmp);
  2872. assert(p);
  2873. ci = *((colitem_t **) p);
  2874. printf("%s = %s\n", ci->element, w->symbol);
  2875. }
  2876. #else
  2877. printf("%s|", w->symbol);
  2878. /* if ((n = is_ucode(w->symbol)) != 0) { */
  2879. /* printf("\n"); */
  2880. /* } */
  2881. #endif
  2882. }
  2883. }
  2884. static void process_starter_node(const void *ptr, VISIT order, int level)
  2885. {
  2886. const weighted_item_t *w = *(const weighted_item_t **) ptr;
  2887. colitem_t *ci;
  2888. void *p;
  2889. int n;
  2890. colitem_t x;
  2891. const char *s;
  2892. char buf[32];
  2893. /* store index of collation item followed by (unprefixed) nul-terminated string */
  2894. if (order == postorder || order == leaf) {
  2895. if ((n = is_ucode(w->symbol)) != 0) {
  2896. u16_buf[u16_buf_len++] = final_index_val(w->symbol);
  2897. assert(u16_buf[u16_buf_len-1]);
  2898. u16_buf[u16_buf_len++] = 0;
  2899. if (++u16_starter < base_locale_array[base_locale_len].num_starters) {
  2900. u16_buf[u16_starter] = u16_buf_len;
  2901. }
  2902. /* verbose_msg(VDETAIL, "ucode - %d %d\n", u16_buf[u16_starter-1], u16_buf_len); */
  2903. } else {
  2904. x.string = w->symbol;
  2905. x.element = NULL;
  2906. p = tfind(&x, &cur_base->root_colitem, colitem_cmp);
  2907. assert(p);
  2908. ci = *((colitem_t **) p);
  2909. s = ci->element;
  2910. u16_buf[u16_buf_len++] = final_index_val(w->symbol);
  2911. assert(u16_buf[u16_buf_len-1]);
  2912. assert(*s == '"');
  2913. n = is_ucode(++s);
  2914. /* verbose_msg(VDETAIL, "s is |%s| with len %d (%d)\n", s, strlen(s), n); */
  2915. assert(n);
  2916. s += n;
  2917. while (*s != '"') {
  2918. n = is_ucode(s);
  2919. assert(n);
  2920. strncpy(buf, s, n+1);
  2921. buf[n] = 0;
  2922. /* verbose_msg(VDETAIL, "buf is |%s| with len %d (%d)\n", buf, strlen(buf), n); */
  2923. u16_buf[u16_buf_len++] = final_index_val(buf);
  2924. assert(u16_buf[u16_buf_len-1]);
  2925. s += n;
  2926. }
  2927. u16_buf[u16_buf_len++] = 0;
  2928. }
  2929. }
  2930. }
  2931. static void **p_cl_root_starter_all;
  2932. static void complete_starter_node(const void *ptr, VISIT order, int level)
  2933. {
  2934. weighted_item_t w;
  2935. weighted_item_t *p;
  2936. if (order == postorder || order == leaf) {
  2937. w.symbol = *(const char **) ptr;
  2938. w.weight = NULL;
  2939. if (!tfind(&w, p_cl_root_starter_all, starter_all_cmp)) {
  2940. p = xmalloc(sizeof(weighted_item_t));
  2941. p->symbol = w.symbol;
  2942. p->weight = NULL;
  2943. /* verbose_msg(VDETAIL, "complete_starter_node: %s\n", *(const char **) ptr); */
  2944. if (!tsearch(p, p_cl_root_starter_all, starter_all_cmp)) {
  2945. error_msg("OUT OF MEMORY");
  2946. }
  2947. }
  2948. }
  2949. }
  2950. static void do_starter_lists(col_locale_t *cl)
  2951. {
  2952. ll_item_t *s;
  2953. ll_item_t *h;
  2954. ll_item_t *lli;
  2955. col_locale_t *c;
  2956. colitem_t *ci;
  2957. weighted_item_t *w;
  2958. void *p;
  2959. char buf[32];
  2960. int n;
  2961. colitem_t x;
  2962. void *mm;
  2963. c = cl;
  2964. if (c != cur_base) {
  2965. c = cur_base;
  2966. }
  2967. /* printf("STARTERS %s --------------------\n", cl->name); */
  2968. LOOP:
  2969. for (s = c->section_list ; s ; s = s->next) {
  2970. h = lli = ((section_t *)(s->data))->itm_list;
  2971. if (!lli) {
  2972. continue;
  2973. }
  2974. do {
  2975. if (lli->data_type & DT_WEIGHTED) {
  2976. w = (weighted_item_t *)(lli->data);
  2977. ci = NULL;
  2978. if ((n = is_ucode(w->symbol)) != 0) {
  2979. strcpy(buf, w->symbol);
  2980. } else {
  2981. /* fprintf(stdout, "looking for |%s|\n", w->symbol); */
  2982. x.string = w->symbol;
  2983. x.element = NULL;
  2984. p = tfind(&x, &cur_base->root_colitem, colitem_cmp);
  2985. if (!p) {
  2986. /* verbose_msg(VDETAIL, "Whoa... processing starters for %s and couldn't find %s\n", */
  2987. /* cl->name, w->symbol); */
  2988. continue;
  2989. }
  2990. ci = *((colitem_t **) p);
  2991. if (!ci->element) { /* just a collating symbol */
  2992. continue;
  2993. }
  2994. assert(ci->element[0] == '"');
  2995. n = is_ucode(ci->element + 1);
  2996. assert(n);
  2997. strncpy(buf, ci->element + 1, n);
  2998. }
  2999. if ((tfind(buf, &cur_base->root_starter_char, sym_cmp)) != NULL) {
  3000. /* fprintf(stdout, "adding from %s: %s", c->name, w->symbol); */
  3001. /* if (ci) { */
  3002. /* fprintf(stdout, " = %s", ci->element); */
  3003. /* } */
  3004. /* fprintf(stdout, "\n"); */
  3005. if (!tsearch(w, &cl->root_starter_all, starter_all_cmp)) {
  3006. error_msg("OUT OF MEMORY");
  3007. }
  3008. }
  3009. }
  3010. } while ((lli = lli->next) != h);
  3011. }
  3012. if (c != cl) {
  3013. c = cl;
  3014. goto LOOP;
  3015. }
  3016. p_cl_root_starter_all = &cl->root_starter_all;
  3017. twalk(cur_base->root_starter_char, complete_starter_node);
  3018. if (cl == cur_base) {
  3019. base_locale_array[base_locale_len].num_starters = tnumnodes(cur_base->root_starter_char);
  3020. }
  3021. #if 0
  3022. printf("\nNow walking tree...\n\n");
  3023. twalk(cl->root_starter_all, print_starter_all_node);
  3024. printf("\n\n");
  3025. #endif
  3026. u16_starter = 0;
  3027. u16_buf[0] = u16_buf_len = base_locale_array[base_locale_len].num_starters;
  3028. twalk(cl->root_starter_all, process_starter_node);
  3029. /* verbose_msg(VDETAIL, "s=%d n=%d\n", u16_starter, base_locale_array[base_locale_len].num_starters); */
  3030. assert(u16_starter == base_locale_array[base_locale_len].num_starters);
  3031. #if 0
  3032. { int i;
  3033. for (i=0 ; i < u16_buf_len ; i++) {
  3034. verbose_msg(VDETAIL, "starter %2d: %d - %#06x\n", i, u16_buf[i], u16_buf[i]);
  3035. }}
  3036. #endif
  3037. mm = NULL;
  3038. if (u16_buf_len) {
  3039. /* assert(base_locale_array[base_locale_len].num_starters); */
  3040. if ((u16_buf_len > multistart_len) ||
  3041. !(mm = memmem(multistart_buffer, multistart_len*sizeof(multistart_buffer[0]),
  3042. u16_buf, u16_buf_len*sizeof(u16_buf[0])))
  3043. ) {
  3044. assert(multistart_len + u16_buf_len < MULTISTART_LEN);
  3045. memcpy(multistart_buffer + multistart_len, u16_buf, u16_buf_len*sizeof(u16_buf[0]));
  3046. if (cl == cur_base) {
  3047. base_locale_array[base_locale_len].multistart_offset = multistart_len;
  3048. } else {
  3049. der_locale_array[der_locale_len].multistart_offset = multistart_len;
  3050. }
  3051. multistart_len += u16_buf_len;
  3052. /* verbose_msg(VDETAIL, "%s: multistart_len = %d u16_buf_len = %d\n", cl->name, multistart_len, u16_buf_len); */
  3053. } else if (!(u16_buf_len > multistart_len)) {
  3054. assert(mm);
  3055. if (cl == cur_base) {
  3056. base_locale_array[base_locale_len].multistart_offset = ((uint16_t *)(mm)) - multistart_buffer;
  3057. } else {
  3058. der_locale_array[der_locale_len].multistart_offset = ((uint16_t *)(mm)) - multistart_buffer;
  3059. }
  3060. /* verbose_msg(VDETAIL, "%s: memmem found a match with u16_buf_len = %d\n", cl->name, u16_buf_len); */
  3061. }
  3062. } else {
  3063. assert(!base_locale_array[base_locale_len].num_starters);
  3064. }
  3065. /* printf("u16_buf_len = %d\n", u16_buf_len); */
  3066. /* printf("STARTERS %s DONE ---------------\n", cl->name); */
  3067. }
  3068. /* For sorting the blocks of unsigned chars. */
  3069. static size_t nu_val;
  3070. int nu_memcmp(const void *a, const void *b)
  3071. {
  3072. return memcmp(*(unsigned char**)a, *(unsigned char**)b, nu_val * sizeof(tbl_item));
  3073. }
  3074. size_t newopt(tbl_item *ut, size_t usize, int shift, table_data *tbl)
  3075. {
  3076. static int recurse;
  3077. tbl_item *ti[RANGE]; /* table index */
  3078. size_t numblocks;
  3079. size_t blocksize;
  3080. size_t uniq;
  3081. size_t i, j;
  3082. size_t smallest, t;
  3083. tbl_item *ii_save;
  3084. int uniqblock[1 << (8*sizeof(tbl_item) - 1)];
  3085. tbl_item uit[RANGE];
  3086. int shift2;
  3087. if (shift > 15) {
  3088. return SIZE_MAX;
  3089. }
  3090. ii_save = NULL;
  3091. blocksize = 1 << shift;
  3092. numblocks = usize >> shift;
  3093. /* init table index */
  3094. for (i=j=0 ; i < numblocks ; i++) {
  3095. ti[i] = ut + j;
  3096. j += blocksize;
  3097. }
  3098. /* sort */
  3099. nu_val = blocksize;
  3100. qsort(ti, numblocks, sizeof(unsigned char *), nu_memcmp);
  3101. uniq = 1;
  3102. uit[(ti[0]-ut)/blocksize] = 0;
  3103. for (i=1 ; i < numblocks ; i++) {
  3104. if (memcmp(ti[i-1], ti[i], blocksize*sizeof(tbl_item)) < 0) {
  3105. if (++uniq > (1 << (8*sizeof(tbl_item) - 1))) {
  3106. break;
  3107. }
  3108. uniqblock[uniq - 1] = i;
  3109. }
  3110. #if 1
  3111. else if (memcmp(ti[i-1], ti[i], blocksize*sizeof(tbl_item)) > 0) {
  3112. printf("bad sort %i!\n", i);
  3113. abort();
  3114. }
  3115. #endif
  3116. uit[(ti[i]-ut)/blocksize] = uniq - 1;
  3117. }
  3118. smallest = SIZE_MAX;
  3119. shift2 = -1;
  3120. if (uniq <= (1 << (8*sizeof(tbl_item) - 1))) {
  3121. smallest = numblocks + uniq * blocksize;
  3122. if (!recurse) {
  3123. ++recurse;
  3124. for (j=1 ; j < 14 ; j++) {
  3125. if ((numblocks >> j) < 2) break;
  3126. if (tbl) {
  3127. ii_save = tbl->ii;
  3128. tbl->ii = NULL;
  3129. }
  3130. if ((t = newopt(uit, numblocks, j, tbl)) < SIZE_MAX) {
  3131. t += uniq * blocksize;
  3132. }
  3133. if (tbl) {
  3134. tbl->ii = ii_save;
  3135. }
  3136. if (smallest >= t) {
  3137. shift2 = j;
  3138. smallest = t;
  3139. /* if (!tbl->ii) { */
  3140. /* printf("ishift %u tshift %u size %u\n", */
  3141. /* shift2, shift, t); */
  3142. /* } */
  3143. /* } else { */
  3144. /* break; */
  3145. }
  3146. }
  3147. --recurse;
  3148. }
  3149. } else {
  3150. return SIZE_MAX;
  3151. }
  3152. if (tbl->ii) {
  3153. if (recurse) {
  3154. tbl->ii_shift = shift;
  3155. tbl->ii_len = numblocks;
  3156. memcpy(tbl->ii, uit, numblocks*sizeof(tbl_item));
  3157. tbl->ti = tbl->ii + tbl->ii_len;
  3158. tbl->ti_len = uniq * blocksize;
  3159. for (i=0 ; i < uniq ; i++) {
  3160. memcpy(tbl->ti + i * blocksize, ti[uniqblock[i]], blocksize*sizeof(tbl_item));
  3161. }
  3162. } else {
  3163. ++recurse;
  3164. /* printf("setting ishift %u tshift %u\n", shift2, shift); */
  3165. newopt(uit, numblocks, shift2, tbl);
  3166. --recurse;
  3167. tbl->ti_shift = shift;
  3168. tbl->ut_len = uniq * blocksize;
  3169. tbl->ut = tbl->ti + tbl->ti_len;
  3170. for (i=0 ; i < uniq ; i++) {
  3171. memcpy(tbl->ut + i * blocksize, ti[uniqblock[i]], blocksize*sizeof(tbl_item));
  3172. }
  3173. }
  3174. }
  3175. return smallest;
  3176. }
  /*
   * Maps a rule value (0..7) to the bits OR-ed into a rule-table entry above
   * the 14-bit index.  -1 marks combinations that have no encoding; callers
   * assert that the looked-up value is >= 0.
   */
  static const int rule2val[8] = {
      -1,
      0x4000,     /* forward */
      0x8000,     /* position */
      0xC000,     /* forward,position */
      0x0000,     /* backward */
      -1,
      -1,
      -1,
  };
  /*
   * Resolve weight token s to its final collation index on behalf of the
   * item named sym.  When final_index_val0(s) yields a non-zero index that
   * is the answer; otherwise:
   *   "IGNORE"                      -> 0
   *   ".." or sym equal to "RANGE"  -> 0x3fff (sym is additionally passed to
   *                                    final_index_val() when it starts
   *                                    with '.')
   *   "."                           -> 0x3ffe
   *   anything else                 -> reported through error_msg()
   */
  static int final_index_val_x(const char *s, const char *sym)
  {
      int idx = final_index_val0(s);

      if (idx) {
          return idx;
      }
      if (strcmp(s, "IGNORE") == 0) {
          return 0;
      }
      if (strcmp(s, "..") == 0 || strcmp(sym, "RANGE") == 0) {
          if (sym[0] == '.') {
              final_index_val(sym); /* make sure it's known */
          }
          return 0x3fff;
      }
      if (strcmp(s, ".") == 0) {
          return 0x3ffe;
      }
      error_msg("can't find final index: %s", s);
      return idx;
  }
  3206. /* store rule2val in 2 high bits and collation index in lower.
  3207. * for sort strings, store (offset from base) + max colindex as index.
  3208. */
  3209. static unsigned int add_rule(weighted_item_t *wi)
  3210. {
  3211. weight_t *w = wi->weight;
  3212. int i, j, r, n;
  3213. uint16_t rbuf[MAX_COLLATION_WEIGHTS];
  3214. uint16_t ws_buf[32];
  3215. void *mm;
  3216. char buf[32];
  3217. const char *s;
  3218. const char *e;
  3219. for (i=0 ; i < MAX_COLLATION_WEIGHTS ; i++) {
  3220. rbuf[i] = rule2val[R_FORWARD]; /* set a default to forward-ignore */
  3221. }
  3222. if (base_locale_array[base_locale_len].num_weights < w->num_weights) {
  3223. base_locale_array[base_locale_len].num_weights = w->num_weights;
  3224. }
  3225. for (i=0 ; i < w->num_weights ; i++) {
  3226. assert(rule2val[(int)(w->rule[i])] >= 0);
  3227. assert(w->colitem[i] && *w->colitem[i]);
  3228. if (*w->colitem[i] == '"') { /* string... */
  3229. s = w->colitem[i] + 1;
  3230. assert(*s == '<');
  3231. n = 0;
  3232. do {
  3233. e = s;
  3234. do {
  3235. if (*e == '/') {
  3236. e += 2;
  3237. continue;
  3238. }
  3239. } while (*e++ != '>');
  3240. assert(((size_t)(e-s) < sizeof(buf)));
  3241. memcpy(buf, s, (size_t)(e-s));
  3242. buf[(size_t)(e-s)] = 0;
  3243. r = final_index_val_x(buf, wi->symbol);
  3244. assert(n + 1 < sizeof(ws_buf)/sizeof(ws_buf[0]));
  3245. ws_buf[n++] = r | rule2val[(int)(w->rule[i])];
  3246. s = e;
  3247. } while (*s != '"');
  3248. ws_buf[n++] = 0; /* terminator */
  3249. mm = memmem(weightstr_buffer, weightstr_len*sizeof(weightstr_buffer[0]),
  3250. ws_buf, n*sizeof(ws_buf[0]));
  3251. if (!mm) {
  3252. assert(weightstr_len + n < WEIGHTSTR_LEN);
  3253. memcpy(weightstr_buffer + weightstr_len, ws_buf, n*sizeof(ws_buf[0]));
  3254. mm = weightstr_buffer + weightstr_len;
  3255. weightstr_len += n;
  3256. }
  3257. r = (((uint16_t *)(mm)) - weightstr_buffer)
  3258. + base_locale_array[base_locale_len].max_col_index + 2;
  3259. assert(r < (1 << 14));
  3260. rbuf[i] = r | rule2val[(int)(w->rule[i])];
  3261. } else { /* item */
  3262. r = final_index_val_x(w->colitem[i], wi->symbol);
  3263. rbuf[i] = r | rule2val[(int)(w->rule[i])];
  3264. }
  3265. }
  3266. for (i=0 ; i < ruletable_len ; i += MAX_COLLATION_WEIGHTS) {
  3267. if (!memcmp(ruletable_buffer + i, rbuf, MAX_COLLATION_WEIGHTS*sizeof(ruletable_buffer[0]))) {
  3268. return i/MAX_COLLATION_WEIGHTS;
  3269. }
  3270. }
  3271. memcpy(ruletable_buffer + ruletable_len, rbuf, MAX_COLLATION_WEIGHTS*sizeof(ruletable_buffer[0]));
  3272. ruletable_len += MAX_COLLATION_WEIGHTS;
  3273. return (ruletable_len / MAX_COLLATION_WEIGHTS)-1;
  3274. }
  3275. static unsigned int add_range_rule(range_item_t *ri)
  3276. {
  3277. weight_t *w = ri->weight;
  3278. int i, j, r, n;
  3279. uint16_t rbuf[MAX_COLLATION_WEIGHTS];
  3280. uint16_t ws_buf[32];
  3281. void *mm;
  3282. char buf[32];
  3283. const char *s;
  3284. const char *e;
  3285. for (i=0 ; i < MAX_COLLATION_WEIGHTS ; i++) {
  3286. rbuf[i] = rule2val[R_FORWARD]; /* set a default to forward-ignore */
  3287. }
  3288. if (base_locale_array[base_locale_len].num_weights < w->num_weights) {
  3289. base_locale_array[base_locale_len].num_weights = w->num_weights;
  3290. }
  3291. for (i=0 ; i < w->num_weights ; i++) {
  3292. assert(rule2val[(int)(w->rule[i])] >= 0);
  3293. assert(w->colitem[i] && *w->colitem[i]);
  3294. if (*w->colitem[i] == '"') { /* string... */
  3295. s = w->colitem[i] + 1;
  3296. assert(*s == '<');
  3297. n = 0;
  3298. do {
  3299. e = s;
  3300. do {
  3301. if (*e == '/') {
  3302. e += 2;
  3303. continue;
  3304. }
  3305. } while (*e++ != '>');
  3306. assert(((size_t)(e-s) < sizeof(buf)));
  3307. memcpy(buf, s, (size_t)(e-s));
  3308. buf[(size_t)(e-s)] = 0;
  3309. r = final_index_val_x(buf, "RANGE");
  3310. assert(n + 1 < sizeof(ws_buf)/sizeof(ws_buf[0]));
  3311. ws_buf[n++] = r | rule2val[(int)(w->rule[i])];
  3312. s = e;
  3313. } while (*s != '"');
  3314. ws_buf[n++] = 0; /* terminator */
  3315. mm = memmem(weightstr_buffer, weightstr_len*sizeof(weightstr_buffer[0]),
  3316. ws_buf, n*sizeof(ws_buf[0]));
  3317. if (!mm) {
  3318. assert(weightstr_len + n < WEIGHTSTR_LEN);
  3319. memcpy(weightstr_buffer + weightstr_len, ws_buf, n*sizeof(ws_buf[0]));
  3320. mm = weightstr_buffer + weightstr_len;
  3321. weightstr_len += n;
  3322. }
  3323. r = (((uint16_t *)(mm)) - weightstr_buffer)
  3324. + base_locale_array[base_locale_len].max_col_index + 2;
  3325. assert(r < (1 << 14));
  3326. rbuf[i] = r | rule2val[(int)(w->rule[i])];
  3327. } else { /* item */
  3328. r = final_index_val_x(w->colitem[i], "RANGE");
  3329. rbuf[i] = r | rule2val[(int)(w->rule[i])];
  3330. }
  3331. }
  3332. for (i=0 ; i < ruletable_len ; i += MAX_COLLATION_WEIGHTS) {
  3333. if (!memcmp(ruletable_buffer + i, rbuf, MAX_COLLATION_WEIGHTS*sizeof(ruletable_buffer[0]))) {
  3334. return i/MAX_COLLATION_WEIGHTS;
  3335. }
  3336. }
  3337. memcpy(ruletable_buffer + ruletable_len, rbuf, MAX_COLLATION_WEIGHTS*sizeof(ruletable_buffer[0]));
  3338. ruletable_len += MAX_COLLATION_WEIGHTS;
  3339. return (ruletable_len / MAX_COLLATION_WEIGHTS)-1;
  3340. }
  3341. #define DUMPn(X) fprintf(stderr, "%10d-%-.20s", base_locale_array[n]. X, #X);
  3342. static void dump_base_locale(int n)
  3343. {
  3344. assert(n < base_locale_len);
  3345. fprintf(stderr, "Base Locale: %s\n", base_locale_array[n].name);
  3346. DUMPn(num_weights);
  3347. DUMPn(ii_shift);
  3348. DUMPn(ti_shift);
  3349. DUMPn(ii_len);
  3350. DUMPn(ti_len);
  3351. DUMPn(max_weight);
  3352. fprintf(stderr, "\n");
  3353. DUMPn(num_col_base);
  3354. DUMPn(max_col_index);
  3355. DUMPn(undefined_idx);
  3356. DUMPn(range_low);
  3357. DUMPn(range_count);
  3358. fprintf(stderr, "\n");
  3359. DUMPn(range_base_weight);
  3360. DUMPn(num_starters);
  3361. fprintf(stderr, "\n");
  3362. DUMPn(range_rule_offset);
  3363. DUMPn(wcs2colidt_offset);
  3364. DUMPn(index2weight_offset);
  3365. fprintf(stderr, "\n");
  3366. DUMPn(index2ruleidx_offset);
  3367. DUMPn(multistart_offset);
  3368. fprintf(stderr, "\n");
  3369. }
  3370. #undef DUMPn
  3371. #define DUMPn(X) fprintf(stderr, "%10d-%s", der_locale_array[n]. X, #X);
  3372. static void dump_der_locale(int n)
  3373. {
  3374. assert(n < der_locale_len);
  3375. fprintf(stderr, "Derived Locale: %s (%.12s)",
  3376. der_locale_array[n].name,
  3377. base_locale_array[der_locale_array[n].base_idx].name);
  3378. DUMPn(base_idx);
  3379. DUMPn(undefined_idx);
  3380. DUMPn(overrides_offset);
  3381. DUMPn(multistart_offset);
  3382. fprintf(stderr, "\n");
  3383. }
  3384. static unsigned long collate_pos;
  3385. static void dump_u16_array(FILE *fp, uint16_t *u, int len, const char *name)
  3386. {
  3387. int i;
  3388. fprintf(fp, "\t/* %8lu %s */\n", collate_pos, name);
  3389. for (i=0 ; i < len ; i++) {
  3390. if (!(i & 7)) {
  3391. fprintf(fp, "\n\t");
  3392. }
  3393. fprintf(fp," %#06x,", (unsigned int)(u[i]));
  3394. }
  3395. fprintf(fp,"\n");
  3396. collate_pos += len;
  3397. }
  3398. #define OUT_U16C(X,N) fprintf(fp,"\t%10d, /* %8lu %s */\n", X, collate_pos++, N);
  3399. static void dump_collate(FILE *fp)
  3400. {
  3401. int n;
  3402. fprintf(fp, "const uint16_t __locale_collate_tbl[] = {\n");
  3403. OUT_U16C(base_locale_len, "numbef of base locales");
  3404. OUT_U16C(der_locale_len, "number of derived locales");
  3405. OUT_U16C(MAX_COLLATION_WEIGHTS, "max collation weights");
  3406. OUT_U16C(index2weight_len, "number of index2{weight|ruleidx} elements");
  3407. OUT_U16C(weightstr_len, "number of weightstr elements");
  3408. OUT_U16C(multistart_len, "number of multistart elements");
  3409. OUT_U16C(override_len, "number of override elements");
  3410. OUT_U16C(ruletable_len, "number of ruletable elements");
  3411. #undef DUMPn
  3412. #define DUMPn(X) fprintf(fp, "\t%10d, /* %8lu %s */\n", base_locale_array[n]. X, collate_pos++, #X);
  3413. for (n=0 ; n < base_locale_len ; n++) {
  3414. unsigned wcs2colidt_offset_low = base_locale_array[n].wcs2colidt_offset & 0xffffU;
  3415. unsigned wcs2colidt_offset_hi = base_locale_array[n].wcs2colidt_offset >> 16;
  3416. fprintf(fp, "\t/* Base Locale %2d: %s */\n", n, base_locale_array[n].name);
  3417. DUMPn(num_weights);
  3418. DUMPn(num_starters);
  3419. DUMPn(ii_shift);
  3420. DUMPn(ti_shift);
  3421. DUMPn(ii_len);
  3422. DUMPn(ti_len);
  3423. DUMPn(max_weight);
  3424. DUMPn(num_col_base);
  3425. DUMPn(max_col_index);
  3426. DUMPn(undefined_idx);
  3427. DUMPn(range_low);
  3428. DUMPn(range_count);
  3429. DUMPn(range_base_weight);
  3430. DUMPn(range_rule_offset);
  3431. DUMPn(index2weight_offset);
  3432. DUMPn(index2ruleidx_offset);
  3433. DUMPn(multistart_offset);
  3434. #undef DUMPn
  3435. #define DUMPn(X) fprintf(fp, "\t%10d, /* %8lu %s */\n", X, collate_pos++, #X);
  3436. DUMPn(wcs2colidt_offset_low);
  3437. DUMPn(wcs2colidt_offset_hi);
  3438. }
  3439. #undef DUMPn
  3440. fprintf(fp, "#define COL_IDX_C %5d\n", 0);
  3441. #define DUMPn(X) fprintf(fp, "\t%10d, /* %8lu %s */\n", der_locale_array[n]. X, collate_pos++, #X);
  3442. for (n=0 ; n < der_locale_len ; n++) {
  3443. fprintf(fp, "#define COL_IDX_%s %5d\n", der_locale_array[n].name, n+1);
  3444. fprintf(fp, "\t/* Derived Locale %4d: %s (%.12s) */\n",
  3445. n, der_locale_array[n].name,
  3446. base_locale_array[der_locale_array[n].base_idx].name);
  3447. DUMPn(base_idx);
  3448. DUMPn(undefined_idx);
  3449. DUMPn(overrides_offset);
  3450. DUMPn(multistart_offset);
  3451. }
  3452. #undef DUMPn
  3453. fprintf(fp, "\n");
  3454. dump_u16_array(fp, index2weight_buffer, index2weight_len, "index2weight");
  3455. dump_u16_array(fp, index2ruleidx_buffer, index2ruleidx_len, "index2ruleidx");
  3456. dump_u16_array(fp, multistart_buffer, multistart_len, "multistart");
  3457. dump_u16_array(fp, override_buffer, override_len, "override");
  3458. dump_u16_array(fp, ruletable_buffer, ruletable_len, "ruletable");
  3459. dump_u16_array(fp, weightstr_buffer, weightstr_len, "weightstr");
  3460. dump_u16_array(fp, wcs2colidt_buffer, wcs2colidt_len, "wcs2colidt");
  3461. fprintf(fp,"}; /* %8lu */\n", collate_pos);
  3462. fprintf(fp,"#define __lc_collate_data_LEN %lu\n\n", collate_pos);
  3463. }