12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997 |
/*
 * Usage:
 *   gen_collate <INPUTDIR> [-o OUTPUTFILE] LOCALE ...
 *
 * Generate collation data for the locales named LOCALE.
 * Each LOCALE is read from INPUTDIR and the collation data is written
 * to OUTPUTFILE.
 *
 * The output file defaults to "locales_collate.h".
 */
/* TODO:
 *
 * add UNDEFINED at end if not specified
 * convert POSITION -> FORWARD,POSITION
 *
 *
 * deal with lowercase in <Uhhhh>
 *
 * what about reorders that keep the same rule?
 *
 * remove "unused" collation elements? (probably doesn't save much)
 *
 * add_rule function ... returns index into rule table after possibly
 * adding a custom-indexed rule, but don't forget about multichar
 * weights... replace with strings of indexes
 *
 */
- #include <stddef.h>
- #include <stdio.h>
- #include <stdlib.h>
- #include <string.h>
- #include <stdint.h>
- #include <stdarg.h>
- #include <limits.h>
- #include <ctype.h>
- #include <assert.h>
- #include <errno.h>
- #include <search.h>
/*
 * Record describing one base locale.
 *
 * Apart from the name, every member is a plain int.  The members ending
 * in _offset share their names with the file-level tables (wcs2colidt,
 * index2weight, index2ruleidx, multistart); presumably they are offsets
 * into those tables -- confirm against the code that fills them in.
 */
typedef struct {
	char *name;
	int num_weights;
	int ii_shift;
	int ti_shift;
	int ii_len;
	int ti_len;
	int max_weight;
	int num_col_base;
	int max_col_index;
	int undefined_idx;
	int range_low;
	int range_count;		/* high - low */
	int range_base_weight;
	int num_starters;
	int range_rule_offset;
	int wcs2colidt_offset;
	int index2weight_offset;
	int index2ruleidx_offset;
	int multistart_offset;
} base_locale_t;

/* Fixed-capacity table of base locale records and its companion counter. */
#define BASE_LOCALE_LEN 20
static base_locale_t base_locale_array[BASE_LOCALE_LEN];
static size_t base_locale_len;
/*
 * Record describing one derived locale.  base_idx is presumably an index
 * into the base locale table -- confirm against the code that sets it.
 */
typedef struct {
	char *name;
	int base_idx;
	int undefined_idx;
	int overrides_offset;
	int multistart_offset;
} der_locale_t;

/* Fixed-capacity table of derived locale records and its companion counter. */
#define DER_LOCALE_LEN 300
static der_locale_t der_locale_array[DER_LOCALE_LEN];
static size_t der_locale_len;
/*
 * Statically sized uint16_t work buffers.  Each buffer is paired with a
 * size_t counter of the same stem.
 */
#define OVERRIDE_LEN 50000
static uint16_t override_buffer[OVERRIDE_LEN];
static size_t override_len;

#define MULTISTART_LEN 10000
static uint16_t multistart_buffer[MULTISTART_LEN];
static size_t multistart_len;

#define WCS2COLIDT_LEN 200000
static uint16_t wcs2colidt_buffer[WCS2COLIDT_LEN];
static size_t wcs2colidt_len;

/* index2weight and index2ruleidx deliberately share one capacity. */
#define INDEX2WEIGHT_LEN 200000
static uint16_t index2weight_buffer[INDEX2WEIGHT_LEN];
static size_t index2weight_len;
static uint16_t index2ruleidx_buffer[INDEX2WEIGHT_LEN];
static size_t index2ruleidx_len;

#define WEIGHTSTR_LEN 10000
static uint16_t weightstr_buffer[WEIGHTSTR_LEN];
static size_t weightstr_len;

#define RULETABLE_LEN (1L<<16)
static uint16_t ruletable_buffer[RULETABLE_LEN];
static size_t ruletable_len;

/* Number of entries in a table indexed by a 16-bit value. */
#define RANGE (0x10000UL)

/* Element type of the lookup tables. */
typedef uint16_t tbl_item;

/* Scratch buffer of 16-bit values with its length and a "starter" value. */
static uint16_t u16_buf[10000];
static int u16_buf_len;
static int u16_starter;
- typedef struct {
- uint16_t ii_len;
- uint16_t ti_len;
- uint16_t ut_len;
- unsigned char ii_shift;
- unsigned char ti_shift;
- tbl_item *ii;
- tbl_item *ti;
- tbl_item *ut;
- } table_data;
- static size_t newopt(tbl_item *ut, size_t usize, int shift, table_data *tbl);
- #define MAX_COLLATION_WEIGHTS 4
- #define MAX_FNO 1
- #define MAX_FILES (MAX_FNO + 1)
- static FILE *fstack[MAX_FILES];
- static char *fname[MAX_FILES];
- static int lineno[MAX_FILES];
- static int fno = -1;
- static tbl_item wcs2index[RANGE];
- static char linebuf[1024];
- static char *pos;
- static char *pos_e = NULL;
- static char end_of_token = 0; /* slot to save */
- #define IN_ORDER 0x01
- #define IN_REORDER 0x02
- #define IN_REORDER_SECTIONS 0x04
- static int order_state;
- static int cur_num_weights; /* number of weights in current use */
- static char cur_rule[MAX_COLLATION_WEIGHTS];
- static int anonsection = 0;
/*
 * Generic doubly linked list node.  next and prev must remain the first
 * two members, in this order: nodes are handed to insque(), which expects
 * exactly that layout.
 */
typedef struct ll_item_struct ll_item_t;

struct ll_item_struct {
	ll_item_t *next;
	ll_item_t *prev;
	void *data;		/* payload; interpreted according to data_type */
	int data_type;		/* one of the DT_* values */
	int idx;
};

static ll_item_t *reorder_section_ptr = NULL;

/* State used while processing a "superset" locale. */
static int superset;
static int superset_order_start_cnt;	/* only support one order for now */
static int superset_in_sync;
static ll_item_t *comm_cur_ptr;
static ll_item_t *comm_prev_ptr;
/* Bit flags stored in the per-weight rule bytes. */
enum {
	R_FORWARD  = 1 << 0,
	R_POSITION = 1 << 1,
	R_BACKWARD = 1 << 2	/* must be largest in value */
};
- typedef struct {
- size_t num_weights;
- char rule[MAX_COLLATION_WEIGHTS];
- const char *colitem[MAX_COLLATION_WEIGHTS];
- } weight_t;
- static void *root_weight = NULL;
- size_t unique_weights = 0;
- typedef struct {
- const char *symbol;
- weight_t *weight;
- } weighted_item_t;
- typedef struct {
- const char *symbol1;
- const char *symbol2;
- int length;
- weight_t *weight;
- } range_item_t;
- typedef struct {
- const char *name;
- ll_item_t *itm_list; /* weighted_item_t list .. circular!!! */
- size_t num_items;
- size_t num_rules;
- char rules[MAX_COLLATION_WEIGHTS];
- } section_t;
- static section_t *cur_section = NULL;
- typedef struct {
- const char *symbol;
- ll_item_t *node;
- } wi_index_t;
- typedef struct col_locale_struct col_locale_t;
- struct col_locale_struct {
- char *name;
- void *root_colitem; /* all base and derived, or just derived */
- void *root_element;
- void *root_scripts;
- void *root_wi_index;
- void *root_wi_index_reordered;
- ll_item_t *section_list;
- col_locale_t *base_locale; /* null if this is a base */
- void *root_derived_wi;
- ll_item_t *derived_list;
- void *root_starter_char;
- void *root_starter_all;
- ll_item_t *undefined_idx;
- };
/* Pairs a symbol with an integer index. */
typedef struct {
	const char *symbol;
	int idx;
} col_index_t;

/* Root of the tree of locales already processed. */
static void *root_col_locale = NULL;

/* Maps a leading keyword of an input line to the function handling it. */
typedef struct {
	const char *keyword;
	void (*handler)(void);
} keyword_table_t;

/* A collating item: symbol or element. */
typedef struct {
	const char *string;
	const char *element;	/* NULL if collating symbol */
} colitem_t;
- static col_locale_t *cur_base = NULL;
- static col_locale_t *cur_derived = NULL;
- static col_locale_t *cur_col = NULL;
- static void *root_sym = NULL;
- static size_t num_sym = 0;
- static size_t mem_sym = 0;
- static const char *inputdir;
- static size_t inputdir_len;
- static unsigned verbose = 0;
- enum {
- VINFO = (1<<0),
- VDETAIL = (1<<1),
- };
- static void error_msg(const char *fmt, ...) __attribute__ ((noreturn, format (printf, 1, 2)));
- static void *xmalloc(size_t n);
- static char *xsymdup(const char *s); /* only allocate once... store in a tree */
- static void pushfile(char *filename);
- static void popfile(void);
- static void processfile(void);
- static int iscommentchar(int);
- static void eatwhitespace(void);
- static int next_line(void);
- static char *next_token(void);
- static void do_unrecognized(void);
- static col_locale_t *new_col_locale(char *name);
- static ll_item_t *new_ll_item(int data_type, void *data);
- static weight_t *register_weight(weight_t *w);
- static size_t ll_len(ll_item_t *l);
- static size_t ll_count(ll_item_t *l, int mask);
- static void add_wi_index(ll_item_t *l);
- static size_t tnumnodes(const void *root);
- static ll_item_t *find_wi_index(const char *sym, col_locale_t *cl);
- static void mark_reordered(const char *sym);
- static ll_item_t *find_wi_index_reordered(const char *sym);
- static ll_item_t *next_comm_ptr(void);
- static ll_item_t *init_comm_ptr(void);
- static ll_item_t *find_ll_last(ll_item_t *p);
- static void dump_weights(const char *name);
- static void finalize_base(void);
- static int is_ucode(const char *s);
- static int sym_cmp(const void *n1, const void *n2);
- static void do_starter_lists(col_locale_t *cl);
- static void dump_base_locale(int n);
- static void dump_der_locale(int n);
- static void dump_collate(FILE *fp);
/* Bit values for ll_item_t.data_type, naming what data points at. */
enum {
	DT_SECTION    = 1 << 0,
	DT_WEIGHTED   = 1 << 1,
	DT_REORDER    = 1 << 2,	/* a section to support reorder_after */
	DT_COL_LOCALE = 1 << 3,
	DT_RANGE      = 1 << 4,
};
- static int verbose_msg(const unsigned lvl, const char *fmt, ...)
- {
- va_list arg;
- int ret = 0;
- if (verbose & lvl) {
- va_start(arg, fmt);
- ret = vfprintf(stderr, fmt, arg);
- va_end(arg);
- }
- return ret;
- }
- static section_t *new_section(const char *name)
- {
- section_t *p;
- char buf[128];
- p = xmalloc(sizeof(section_t));
- if (!name) { /* anonymous section */
- name = buf;
- snprintf(buf, sizeof(buf), "anon%05d", anonsection);
- ++anonsection;
- } else if (*name != '<') { /* reorder */
- name = buf;
- snprintf(buf, sizeof(buf), "%s %05d", cur_col->name, anonsection);
- ++anonsection;
- }
- #warning devel code
- /* verbose_msg(VDETAIL, "section %s\n", name); */
- p->name = xsymdup(name);
- p->itm_list = NULL;
- p->num_items = 0;
- p->num_rules = 0;
- memset(p->rules, 0, MAX_COLLATION_WEIGHTS);
- /* cur_num_weights = p->num_rules = 0; */
- /* memset(p->rules, 0, MAX_COLLATION_WEIGHTS); */
- /* memset(cur_rule, R_FORWARD, 4); */
- #warning devel code
- if (*p->name == 'a') {
- cur_num_weights = p->num_rules = 4;
- memset(p->rules, R_FORWARD, 4);
- memset(cur_rule, R_FORWARD, 4);
- p->rules[3] |= R_POSITION;
- cur_rule[3] |= R_POSITION;
- }
- /* verbose_msg(VDETAIL, "new section %s -- cur_num_weights = %d\n", p->name, cur_num_weights); */
- return p;
- }
- static void do_order_start(void);
- static void do_order_end(void);
- static void do_reorder_after(void);
- static void do_reorder_end(void);
- static void do_reorder_sections_after(void);
- static void do_reorder_sections_end(void);
- static void do_copy(void);
- static void do_colsym(void);
- static void do_colele(void);
- static void do_script(void);
- static void do_range(void);
- static col_locale_t *new_col_locale(char *name);
- static int colitem_cmp(const void *n1, const void *n2);
- static int colelement_cmp(const void *n1, const void *n2);
- static void del_colitem(colitem_t *p);
- static colitem_t *new_colitem(char *item, char *def);
- static void add_colitem(char *item, char *def);
- static void add_script(const char *s);
- static unsigned int add_rule(weighted_item_t *wi);
- static unsigned int add_range_rule(range_item_t *ri);
- static const keyword_table_t keyword_table[] = {
- { "collating-symbol", do_colsym },
- { "collating-element", do_colele },
- { "script", do_script },
- { "copy", do_copy },
- { "order_start", do_order_start },
- { "order_end", do_order_end },
- { "order-end", do_order_end },
- { "reorder-after", do_reorder_after },
- { "reorder-end", do_reorder_end },
- { "reorder-sections-after", do_reorder_sections_after },
- { "reorder-sections-end", do_reorder_sections_end },
- { "UCLIBC_RANGE", do_range },
- { NULL, do_unrecognized }
- };
- static void do_unrecognized(void)
- {
- #if 1
- error_msg("warning: unrecognized: %s", pos);
- #else
- /* verbose_msg(VDETAIL, "warning: unrecognized initial keyword \"%s\"\n", pos); */
- fprintf(stderr, "warning: unrecognized: %s", pos);
- if (end_of_token) {
- fprintf(stderr, "%c%s", end_of_token, pos_e+1);
- }
- fprintf(stderr, "\n");
- #endif
- }
- /* typedef struct { */
- /* const char *symbol1; */
- /* const char *symbol2; */
- /* int length; */
- /* weight_t *weight; */
- /* } range_item_t; */
- static void do_range(void)
- {
- range_item_t *ri;
- weight_t w;
- int i;
- char *s;
- char *s1;
- char *s2;
- const char **ci;
- ll_item_t *lli;
- assert(!superset);
- assert(order_state == IN_ORDER);
- s1 = next_token();
- if (!s1) {
- error_msg("missing start of range");
- }
- if (!is_ucode(s1)) {
- error_msg("start of range is not a ucode: %s", s1);
- }
- s1 = xsymdup(s1);
- s2 = next_token();
- if (!s2) {
- error_msg("missing end of range");
- }
- if (!is_ucode(s2)) {
- error_msg("end of range is not a ucode: %s", s2);
- }
- s2 = xsymdup(s2);
- ri = (range_item_t *) xmalloc(sizeof(range_item_t));
- ri->symbol1 = s1;
- ri->symbol2 = s2;
- ri->length = strtoul(s2+2, NULL, 16) - strtoul(s1+2, NULL, 16);
- if (ri->length <= 0) {
- error_msg("illegal range length %d", ri->length);
- }
- s = next_token();
- w.num_weights = cur_num_weights;
- for (i=0 ; i < cur_num_weights ; i++) {
- w.rule[i] = cur_rule[i];
- }
- ci = w.colitem + (i-1);
- /* now i == cur_num_weights */
- #define STR_DITTO "."
- while (s && *s && i) {
- --i;
- if (*s == ';') {
- ci[-i] = xsymdup(STR_DITTO);
- if (*++s) {
- continue;
- }
- }
- if (*s) {
- ci[-i] = xsymdup(s);
- }
- s = next_token();
- if (s) {
- if (*s == ';') {
- ++s;
- } else if (i) {
- error_msg("missing seperator");
- }
- }
- }
- if (s) {
- error_msg("too many weights: %d %d |%s| %d", cur_num_weights, i, s, (int)*s);
- }
- while (i) { /* missing weights are not an error */
- --i;
- ci[-i] = xsymdup(STR_DITTO);
- }
- ri->weight = register_weight(&w);
- /* if ((i = is_ucode(t)) != 0) { */
- /* assert(!t[i]); */
- /* add_colitem(t, NULL); */
- /* } */
- lli = new_ll_item(DT_RANGE, ri);
- if (!cur_section->itm_list) {
- /* printf("creating new item list: %s\n", wi->symbol); */
- cur_section->itm_list = lli;
- lli->prev = lli->next = lli;
- ++cur_section->num_items;
- } else {
- insque(lli, cur_section->itm_list->prev);
- /* printf("adding item to list: %d - %s\n", ll_len(cur_section->itm_list), wi->symbol); */
- ++cur_section->num_items;
- }
- /* add_wi_index(lli); */
- }
- static weighted_item_t *add_weight(char *t)
- {
- weighted_item_t *wi;
- weight_t w;
- int i;
- char *s;
- const char **ci;
- t = xsymdup(t);
- s = next_token();
- w.num_weights = cur_num_weights;
- for (i=0 ; i < cur_num_weights ; i++) {
- w.rule[i] = cur_rule[i];
- }
- ci = w.colitem + (i-1);
- /* now i == cur_num_weights */
- while (s && *s && i) {
- --i;
- if (*s == ';') {
- ci[-i] = xsymdup(STR_DITTO);
- if (*++s) {
- continue;
- }
- }
- if (*s) {
- if (!strcmp(s,t)) {
- s = STR_DITTO;
- }
- ci[-i] = xsymdup(s);
- }
- s = next_token();
- if (s) {
- if (*s == ';') {
- ++s;
- } else if (i) {
- error_msg("missing seperator");
- }
- }
- }
- if (s) {
- error_msg("too many weights: %d %d |%s| %d", cur_num_weights, i, s, (int)*s);
- }
- while (i) { /* missing weights are not an error */
- --i;
- ci[-i] = xsymdup(STR_DITTO);
- }
- wi = xmalloc(sizeof(weighted_item_t));
- wi->symbol = t;
- wi->weight = register_weight(&w);
- if ((i = is_ucode(t)) != 0) {
- assert(!t[i]);
- add_colitem(t, NULL);
- }
- return wi;
- }
- static void add_superset_weight(char *t)
- {
- ll_item_t *lli;
- weighted_item_t *wi;
- if (!comm_cur_ptr
- || (strcmp(t, ((weighted_item_t *)(comm_cur_ptr->data))->symbol) != 0)
- ) { /* now out of sync */
- if (superset_in_sync) { /* need a new section */
- superset_in_sync = 0;
- cur_section = new_section("R");
- cur_num_weights = cur_section->num_rules
- = ((section_t *)(cur_base->section_list->data))->num_rules;
- memcpy(cur_rule,
- ((section_t *)(cur_base->section_list->data))->rules,
- MAX_COLLATION_WEIGHTS);
- memcpy(cur_section->rules,
- ((section_t *)(cur_base->section_list->data))->rules,
- MAX_COLLATION_WEIGHTS);
- insque(new_ll_item(DT_REORDER, cur_section), find_ll_last(cur_col->section_list));
- assert(comm_prev_ptr);
- lli = new_ll_item(DT_REORDER, cur_section);
- lli->prev = lli->next = lli;
- insque(lli, comm_prev_ptr);
- /* verbose_msg(VDETAIL, " subsection -----------------------\n"); */
- }
- /* verbose_msg(VDETAIL, " %s %s\n", t, ((weighted_item_t *)(comm_cur_ptr->data))->symbol); */
- wi = add_weight(t);
- lli = new_ll_item(DT_WEIGHTED, wi);
- mark_reordered(wi->symbol);
- /* printf("reorder: %s\n", t); */
- if (!cur_section->itm_list) {
- cur_section->itm_list = lli;
- lli->prev = lli->next = lli;
- ++cur_section->num_items;
- } else {
- insque(lli, cur_section->itm_list->prev);
- ++cur_section->num_items;
- }
- add_wi_index(lli);
- } else { /* in sync */
- superset_in_sync = 1;
- next_comm_ptr();
- }
- }
- static void do_weight(char *t)
- {
- weighted_item_t *wi;
- ll_item_t *lli;
- if (superset) {
- add_superset_weight(t);
- return;
- }
- switch(order_state) {
- case 0:
- /* fprintf(stdout, "no-order weight: %s\n", t); */
- /* break; */
- case IN_ORDER:
- /* in a section */
- /* fprintf(stdout, "weight: %s\n", t); */
- wi = add_weight(t);
- lli = new_ll_item(DT_WEIGHTED, wi);
- if (!cur_section->itm_list) {
- /* fprintf(stdout, "creating new item list: %s %s %p\n", wi->symbol, cur_section->name, lli); */
- cur_section->itm_list = lli;
- lli->prev = lli->next = lli;
- ++cur_section->num_items;
- } else {
- insque(lli, cur_section->itm_list->prev);
- /* fprintf(stdout, "adding item to list: %d - %s %p\n", ll_len(cur_section->itm_list), wi->symbol, lli); */
- ++cur_section->num_items;
- }
- add_wi_index(lli);
- break;
- case IN_REORDER:
- /* std rule - but in a block with an insert-after pt */
- wi = add_weight(t);
- lli = new_ll_item(DT_WEIGHTED, wi);
- mark_reordered(wi->symbol);
- /* fprintf(stdout, "reorder: %s %s %p\n", t, cur_section->name, lli); */
- if (!cur_section->itm_list) {
- cur_section->itm_list = lli;
- lli->prev = lli->next = lli;
- ++cur_section->num_items;
- } else {
- insque(lli, cur_section->itm_list->prev);
- ++cur_section->num_items;
- }
- add_wi_index(lli);
- break;
- case IN_REORDER_SECTIONS:
- t = xsymdup(t);
- if (next_token() != NULL) {
- error_msg("trailing text in reorder section item: %s", pos);
- }
- lli = cur_col->section_list;
- do {
- if (lli->data_type & DT_SECTION) {
- if (!strcmp(((section_t *)(lli->data))->name, t)) {
- lli->data_type = DT_REORDER;
- lli = new_ll_item(DT_REORDER, (section_t *)(lli->data));
- insque(lli, reorder_section_ptr);
- reorder_section_ptr = lli;
- return;
- }
- }
- lli = lli->next;
- } while (lli);
- error_msg("reorder_sections_after for non-base item currently not supported: %s", t);
- /* fprintf(stdout, "reorder_secitons: %s\n", t); */
- break;
- default:
- error_msg("invalid order_state %d", order_state);
- }
- }
- static int col_locale_cmp(const void *n1, const void *n2)
- {
- return strcmp(((const col_locale_t *) n1)->name, ((const col_locale_t *) n2)->name);
- }
- static void processfile(void)
- {
- char *t;
- const keyword_table_t *k;
- order_state = 0;
- #warning devel code
- /* cur_num_weights = 0; */
- /* cur_num_weights = 4; */
- /* memset(cur_rule, R_FORWARD, 4); */
- if (cur_col != cur_base) {
- cur_col->base_locale = cur_base;
- cur_col->undefined_idx = cur_base->undefined_idx;
- if (!cur_base->derived_list) {
- cur_base->derived_list = new_ll_item(DT_COL_LOCALE, cur_col);
- } else {
- insque(new_ll_item(DT_COL_LOCALE, cur_col), find_ll_last(cur_base->derived_list));
- }
- }
- if (tfind(cur_col, &root_col_locale, col_locale_cmp)) {
- error_msg("attempt to read locale: %s", cur_col->name);
- }
- if (!tsearch(cur_col, &root_col_locale, col_locale_cmp)) {
- error_msg("OUT OF MEMORY!");
- }
- if (superset) {
- superset_order_start_cnt = 0;
- superset_in_sync = 0;
- init_comm_ptr();
- }
- while (next_line()) {
- /* printf("%5d:", lineno[fno]); */
- /* while ((t = next_token()) != NULL) { */
- /* printf(" |%s|", t); */
- /* printf("\n"); */
- /* } */
- t = next_token();
- assert(t);
- assert(t == pos);
- if ((*t == '<') || (!strcmp(t, "UNDEFINED"))) {
- do_weight(t);
- } else {
- for (k = keyword_table ; k->keyword ; k++) {
- if (!strcmp(k->keyword, t)) {
- break;
- }
- }
- k->handler();
- }
- }
- if (cur_base == cur_col) {
- verbose_msg(VDETAIL, "Base: %15s", cur_col->name);
- } else {
- #if 1
- if (!cur_col->undefined_idx) {
- #if 0
- if (superset) {
- if (superset_order_start_cnt == 1) {
- --superset_order_start_cnt; /* ugh.. hack this */
- }
- }
- #endif
- /* This is an awful hack to get around the problem of unspecified UNDEFINED
- * definitions in the supported locales derived from iso14651_t1. */
- if (!strcmp(cur_base->name, "iso14651_t1")) {
- fprintf(stderr, "Warning: adding UNDEFINED entry for %s\n", cur_col->name);
- strcpy(linebuf, "script <UNDEFINED_SECTION>\n");
- pos_e = NULL;
- pos = linebuf;
- t = next_token();
- assert(t);
- assert(t == pos);
- do_script();
- strcpy(linebuf, "order_start <UNDEFINED_SECTION>;forward;backward;forward;forward,position\n");
- pos_e = NULL;
- pos = linebuf;
- t = next_token();
- assert(t);
- assert(t == pos);
- do_order_start();
- strcpy(linebuf, "UNDEFINED IGNORE;IGNORE;IGNORE\n");
- pos_e = NULL;
- pos = linebuf;
- t = next_token();
- assert(t);
- assert(t == pos);
- do_weight(t);
- strcpy(linebuf, "order_end\n");
- pos_e = NULL;
- pos = linebuf;
- t = next_token();
- assert(t);
- assert(t == pos);
- do_order_end();
- } else {
- error_msg("no definition of UNDEFINED for %s", cur_col->name);
- }
- }
- #endif
- verbose_msg(VDETAIL, " Der: %15s", cur_col->name);
- }
- {
- #if 0
- ll_item_t *p = cur_col->section_list;
- #endif
- verbose_msg(VDETAIL, "%6u weights", tnumnodes(cur_col->root_wi_index));
- if (cur_base) {
- verbose_msg(VDETAIL, " %6u der %6u reor %6u starter - %u new stubs",
- tnumnodes(cur_base->root_derived_wi),
- tnumnodes(cur_base->root_wi_index_reordered),
- tnumnodes(cur_base->root_starter_char),
- ll_count(cur_col->section_list, DT_REORDER));
- }
- verbose_msg(VDETAIL, "\n");
- #if 0
- while (p) {
- assert(((section_t *)(p->data))->num_items ==
- ll_len(((section_t *)(p->data))->itm_list));
- if (!p->next &&
- ((*((section_t *)(p->data))->name == 'a')
- && (((section_t *)(p->data))->num_items == 0))
- ) {
- break;
- }
- if (!(p->data_type & DT_REORDER)) {
- if ((*((section_t *)(p->data))->name != 'a')
- || (((section_t *)(p->data))->num_items > 0)
- ) {
- verbose_msg(VDETAIL,
- /* "\t%-15s %zu\n", */
- "\t%-15s %6u\n",
- ((section_t *)(p->data))->name,
- ((section_t *)(p->data))->num_items);
- }
- }
- p = p->next;
- }
- #endif
- }
- }
- static void print_colnode(const void *ptr, VISIT order, int level)
- {
- const colitem_t *p = *(const colitem_t **) ptr;
- if (order == postorder || order == leaf) {
- printf("collating item = \"%s\"", p->string);
- if (p->element) {
- printf(" is %s", p->element);
- }
- printf("\n");
- }
- }
- static void print_weight_node(const void *ptr, VISIT order, int level)
- {
- const weight_t *p = *(const weight_t **) ptr;
- int i;
- if (order == postorder || order == leaf) {
- printf("weight: (%d) ", p->num_weights);
- for (i = 0 ; i < p->num_weights ; i++) {
- if (p->rule[i] & R_FORWARD) {
- printf("F");
- }
- if (p->rule[i] & R_BACKWARD) {
- printf("B");
- }
- if (p->rule[i] & R_POSITION) {
- printf("P");
- }
- printf(",");
- }
- for (i = 0 ; i < p->num_weights ; i++) {
- printf(" %s", p->colitem[i]);
- }
- printf("\n");
- }
- }
/* One row of the locale dependency table: a locale name and the index
 * (one of the BASE_* constants) of the base it is built on. */
typedef struct {
    const char *der_name;   /* locale name, e.g. "en_US" */
    int base_locale;        /* BASE_* index of its base */
} deps_t;
/* Indices of the known base collation definitions.  BASE_MAX is the count
 * and must stay last. */
enum {
    BASE_iso14651_t1,
    BASE_comm,
    BASE_cs_CZ,
    BASE_ar_SA,
    BASE_th_TH,
    BASE_ja_JP,
    BASE_ko_KR,
    BASE_MAX
};

/* Name of each base, indexed by the BASE_* constants above. */
static const char *base_name[] = {
    "iso14651_t1",  /* BASE_iso14651_t1 */
    "comm",         /* BASE_comm */
    "cs_CZ",        /* BASE_cs_CZ */
    "ar_SA",        /* BASE_ar_SA */
    "th_TH",        /* BASE_th_TH */
    "ja_JP",        /* BASE_ja_JP */
    "ko_KR"         /* BASE_ko_KR */
};
- static ll_item_t *locale_list[BASE_MAX];
- static void init_locale_list(void)
- {
- int i;
- for (i=0 ; i < BASE_MAX ; i++) {
- locale_list[i] = (ll_item_t *) xmalloc(sizeof(ll_item_t));
- locale_list[i]->prev = locale_list[i]->next = locale_list[i];
- locale_list[i]->data = (void *) base_name[i];
- }
- }
- deps_t deps[] = {
- { "af_ZA", BASE_iso14651_t1 },
- { "am_ET", BASE_iso14651_t1 },
- { "ar_AE", BASE_iso14651_t1 },
- { "ar_BH", BASE_iso14651_t1 },
- { "ar_DZ", BASE_iso14651_t1 },
- { "ar_EG", BASE_iso14651_t1 },
- { "ar_IN", BASE_iso14651_t1 },
- { "ar_IQ", BASE_iso14651_t1 },
- { "ar_JO", BASE_iso14651_t1 },
- { "ar_KW", BASE_iso14651_t1 },
- { "ar_LB", BASE_iso14651_t1 },
- { "ar_LY", BASE_iso14651_t1 },
- { "ar_MA", BASE_iso14651_t1 },
- { "ar_OM", BASE_iso14651_t1 },
- { "ar_QA", BASE_iso14651_t1 },
- { "ar_SA", BASE_ar_SA },
- { "ar_SD", BASE_iso14651_t1 },
- { "ar_SY", BASE_iso14651_t1 },
- { "ar_TN", BASE_iso14651_t1 },
- { "ar_YE", BASE_iso14651_t1 },
- { "az_AZ", BASE_iso14651_t1 },
- { "be_BY", BASE_iso14651_t1 },
- { "bg_BG", BASE_iso14651_t1 },
- { "bn_BD", BASE_iso14651_t1 },
- { "bn_IN", BASE_iso14651_t1 },
- { "br_FR", BASE_iso14651_t1 },
- { "bs_BA", BASE_iso14651_t1 },
- { "ca_ES", BASE_comm },
- { "cs_CZ", BASE_cs_CZ },
- { "cy_GB", BASE_iso14651_t1 },
- { "da_DK", BASE_comm },
- { "de_AT", BASE_iso14651_t1 },
- { "de_BE", BASE_iso14651_t1 },
- { "de_CH", BASE_iso14651_t1 },
- { "de_DE", BASE_iso14651_t1 },
- { "de_LU", BASE_iso14651_t1 },
- { "el_GR", BASE_iso14651_t1 },
- { "en_AU", BASE_iso14651_t1 },
- { "en_BW", BASE_iso14651_t1 },
- { "en_CA", BASE_comm },
- { "en_DK", BASE_iso14651_t1 },
- { "en_GB", BASE_iso14651_t1 },
- { "en_HK", BASE_iso14651_t1 },
- { "en_IE", BASE_iso14651_t1 },
- { "en_IN", BASE_iso14651_t1 },
- { "en_NZ", BASE_iso14651_t1 },
- { "en_PH", BASE_iso14651_t1 },
- { "en_SG", BASE_iso14651_t1 },
- { "en_US", BASE_iso14651_t1 },
- { "en_ZA", BASE_iso14651_t1 },
- { "en_ZW", BASE_iso14651_t1 },
- { "eo_EO", BASE_iso14651_t1 },
- { "es_AR", BASE_comm },
- { "es_BO", BASE_comm },
- { "es_CL", BASE_comm },
- { "es_CO", BASE_comm },
- { "es_CR", BASE_comm },
- { "es_DO", BASE_comm },
- { "es_EC", BASE_comm },
- { "es_ES", BASE_comm },
- { "es_GT", BASE_comm },
- { "es_HN", BASE_comm },
- { "es_MX", BASE_comm },
- { "es_NI", BASE_comm },
- { "es_PA", BASE_comm },
- { "es_PE", BASE_comm },
- { "es_PR", BASE_comm },
- { "es_PY", BASE_comm },
- { "es_SV", BASE_comm },
- { "es_US", BASE_comm },
- { "es_UY", BASE_comm },
- { "es_VE", BASE_comm },
- { "et_EE", BASE_comm },
- { "eu_ES", BASE_iso14651_t1 },
- { "fa_IR", BASE_iso14651_t1 },
- { "fi_FI", BASE_comm },
- { "fo_FO", BASE_comm },
- { "fr_BE", BASE_iso14651_t1 },
- { "fr_CA", BASE_comm },
- { "fr_CH", BASE_iso14651_t1 },
- { "fr_FR", BASE_iso14651_t1 },
- { "fr_LU", BASE_iso14651_t1 },
- { "ga_IE", BASE_iso14651_t1 },
- { "gd_GB", BASE_iso14651_t1 },
- { "gl_ES", BASE_comm },
- { "gv_GB", BASE_iso14651_t1 },
- { "he_IL", BASE_iso14651_t1 },
- { "hi_IN", BASE_iso14651_t1 },
- { "hr_HR", BASE_comm },
- { "hu_HU", BASE_iso14651_t1 },
- { "hy_AM", BASE_iso14651_t1 },
- { "id_ID", BASE_iso14651_t1 },
- { "is_IS", BASE_comm },
- { "it_CH", BASE_iso14651_t1 },
- { "it_IT", BASE_iso14651_t1 },
- { "iw_IL", BASE_iso14651_t1 },
- { "ja_JP", BASE_ja_JP },
- { "ka_GE", BASE_iso14651_t1 },
- { "kl_GL", BASE_comm },
- { "ko_KR", BASE_ko_KR },
- { "kw_GB", BASE_iso14651_t1 },
- { "lt_LT", BASE_comm },
- { "lv_LV", BASE_comm },
- { "mi_NZ", BASE_iso14651_t1 },
- { "mk_MK", BASE_iso14651_t1 },
- { "mr_IN", BASE_iso14651_t1 },
- { "ms_MY", BASE_iso14651_t1 },
- { "mt_MT", BASE_iso14651_t1 },
- { "nl_BE", BASE_iso14651_t1 },
- { "nl_NL", BASE_iso14651_t1 },
- { "nn_NO", BASE_iso14651_t1 },
- { "no_NO", BASE_comm },
- { "oc_FR", BASE_iso14651_t1 },
- { "pl_PL", BASE_comm },
- { "pt_BR", BASE_iso14651_t1 },
- { "pt_PT", BASE_iso14651_t1 },
- { "ro_RO", BASE_iso14651_t1 },
- { "ru_RU", BASE_iso14651_t1 },
- { "ru_UA", BASE_iso14651_t1 },
- { "se_NO", BASE_iso14651_t1 },
- { "sk_SK", BASE_cs_CZ },
- { "sl_SI", BASE_comm },
- { "sq_AL", BASE_iso14651_t1 },
- { "sr_YU", BASE_iso14651_t1 },
- { "sv_FI", BASE_comm },
- { "sv_SE", BASE_iso14651_t1 },
- { "ta_IN", BASE_iso14651_t1 },
- { "te_IN", BASE_iso14651_t1 },
- { "tg_TJ", BASE_iso14651_t1 },
- { "th_TH", BASE_th_TH },
- { "ti_ER", BASE_iso14651_t1 },
- { "ti_ET", BASE_iso14651_t1 },
- { "tl_PH", BASE_iso14651_t1 },
- { "tr_TR", BASE_comm },
- { "tt_RU", BASE_iso14651_t1 },
- { "uk_UA", BASE_iso14651_t1 },
- { "ur_PK", BASE_iso14651_t1 },
- { "uz_UZ", BASE_iso14651_t1 },
- { "vi_VN", BASE_iso14651_t1 },
- { "wa_BE", BASE_iso14651_t1 },
- { "yi_US", BASE_iso14651_t1 },
- { "zh_CN", BASE_iso14651_t1 },
- { "zh_HK", BASE_iso14651_t1 },
- { "zh_SG", BASE_iso14651_t1 },
- { "zh_TW", BASE_iso14651_t1 },
- };
- static int der_count[BASE_MAX];
- static const char *new_args[500];
- static int new_arg_count;
- static int dep_cmp(const void *s1, const void *s2)
- {
- return strcmp( (const char *) s1, ((const deps_t *) s2)->der_name);
- }
- static int old_main(int argc, char **argv);
- int main(int argc, char **argv)
- {
- const deps_t *p;
- ll_item_t *lli;
- int i;
- int total;
- char *output_file = "locale_collate.h";
- unsigned verbosity = 0;
- if (argc < 3) {
- return EXIT_FAILURE;
- }
- --argc;
- inputdir = strdup(*++argv);
- inputdir_len = strlen(inputdir);
- init_locale_list();
- while (--argc) {
- ++argv;
- if (!strcmp(*argv, "-o")) {
- --argc;
- if (*++argv == NULL) {
- printf("-o <outfile> requires an argument\n");
- return EXIT_FAILURE;
- }
- output_file = strdup(*argv);
- continue;
- } else if (!strcmp(*argv, "-v")) {
- verbosity++;
- continue;
- }
- p = (const deps_t *) bsearch(*argv, deps, sizeof(deps)/sizeof(deps[0]), sizeof(deps[0]), dep_cmp);
- if (!p) {
- if (!strcmp("C", *argv)) {
- printf("ignoring %s locale\n", *argv);
- continue;
- } else {
- printf("%s not found\n", *argv);
- return EXIT_FAILURE;
- }
- }
- i = p->base_locale;
- ++der_count[i];
- if (!strcmp(base_name[i], *argv)) {
- /* same name as base, so skip after count incremented */
- continue;
- }
- /* add it to the list. the main body will catch duplicates */
- lli = (ll_item_t *) xmalloc(sizeof(ll_item_t));
- lli->prev = lli->next = NULL;
- lli->data = (void *) *argv;
- insque(lli, locale_list[i]);
- }
- total = 0;
- for (i=0 ; i < BASE_MAX ; i++) {
- /* printf("der_count[%2d] = %3d\n", i, der_count[i]); */
- total += der_count[i];
- }
- /* printf("total = %d\n", total); */
- new_args[new_arg_count++] = "dummyprogramname";
- for (i=0 ; i < BASE_MAX ; i++) {
- if (!der_count[i]) {
- continue;
- }
- new_args[new_arg_count++] = (i == BASE_comm) ? "-c" : "-b";
- lli = locale_list[i];
- do {
- new_args[new_arg_count++] = (const char *) (lli->data);
- lli = lli->next;
- } while (lli != locale_list[i]);
- new_args[new_arg_count++] = "-f";
- }
- for (i=0; i < verbosity; i++)
- new_args[new_arg_count++] = "-v";
- new_args[new_arg_count++] = "-o";
- new_args[new_arg_count++] = output_file;
- /*
- for (i=0 ; i < new_arg_count ; i++) {
- printf("%3d: %s\n", i, new_args[i]);
- }
- */
- return old_main(new_arg_count, (char **) new_args);
- }
- /* usage... prog -b basefile derived {derived} -s single {single} */
- static int old_main(int argc, char **argv)
- {
- int next_is_base = 0;
- int next_is_subset = 0;
- char *output_file = NULL;
- superset = 0;
- while (--argc) {
- ++argv;
- if (**argv == '-') {
- if ((*argv)[1] == 'd') {
- dump_weights((*argv) + 2);
- } else if ((*argv)[1] == 'f') { /* dump all weight rules */
- finalize_base();
- } else if ((*argv)[1] == 'R') { /* dump all weight rules */
- twalk(root_weight, print_weight_node);
- } else if (((*argv)[1] == 'c') && !(*argv)[2]) { /* new common subset */
- cur_base = cur_derived = NULL;
- next_is_subset = 1;
- next_is_base = 1;
- superset = 0;
- } else if (((*argv)[1] == 'b') && !(*argv)[2]) { /* new base locale */
- cur_base = cur_derived = NULL;
- next_is_subset = 0;
- next_is_base = 1;
- superset = 0;
- } else if (((*argv)[1] == 's') && !(*argv)[2]) { /* single locales follow */
- cur_base = cur_derived = NULL;
- next_is_subset = 0;
- next_is_base = 2;
- superset = 0;
- } else if (((*argv)[1] == 'o') && !(*argv)[2]) { /* output file */
- --argc;
- output_file = *++argv;
- } else if (((*argv)[1] == 'v') && !(*argv)[2]) { /* verbose */
- ++verbose;
- } else {
- error_msg("unrecognized option %s", *argv);
- }
- continue;
- }
- /* new file */
- new_col_locale(*argv); /* automaticly sets cur_col */
- if (next_is_base) {
- cur_base = cur_col;
- } else {
- cur_derived = cur_col;
- }
- pushfile(*argv);
- /* verbose_msg(VDETAIL, "processing file %s\n", *argv); */
- processfile(); /* this does a popfile */
- /* twalk(cur_col->root_colitem, print_colnode); */
- if (next_is_base == 1) {
- next_is_base = 0;
- }
- if (next_is_subset) {
- next_is_subset = 0;
- superset = 1;
- }
- }
- verbose_msg(VINFO, "success!\n");
- verbose_msg(VINFO,
- /* "num_sym=%zu mem_sym=%zu unique_weights=%zu\n", */
- "num_sym=%u mem_sym=%u unique_weights=%u\n",
- num_sym, mem_sym, unique_weights);
- /* twalk(root_weight, print_weight_node); */
- verbose_msg(VINFO, "num base locales = %d num derived locales = %d\n",
- base_locale_len, der_locale_len);
- verbose_msg(VINFO,
- "override_len = %d multistart_len = %d weightstr_len = %d\n"
- "wcs2colidt_len = %d index2weight_len = %d index2ruleidx_len = %d\n"
- "ruletable_len = %d\n"
- "total size is %d bytes or %d kB\n",
- override_len, multistart_len, weightstr_len,
- wcs2colidt_len, index2weight_len, index2ruleidx_len,
- ruletable_len,
- #warning mult by 2 for rule indecies
- (override_len + multistart_len + weightstr_len
- + wcs2colidt_len + index2weight_len + index2ruleidx_len + ruletable_len) * 2,
- (override_len + multistart_len + weightstr_len
- + wcs2colidt_len + index2weight_len + index2ruleidx_len + ruletable_len + 511) / 512);
- #if 0
- {
- int i;
- for (i=0 ; i < base_locale_len ; i++) {
- dump_base_locale(i);
- }
- for (i=0 ; i < der_locale_len ; i++) {
- dump_der_locale(i);
- }
- }
- #endif
- {
- FILE *fp = fopen(output_file, "w");
- if (!fp) {
- error_msg("cannot open output file '%s'!", output_file);
- }
- dump_collate(fp);
- if (ferror(fp) || fclose(fp)) {
- error_msg("write error or close error for output file!\n");
- }
- }
- return EXIT_SUCCESS;
- }
- static void error_msg(const char *fmt, ...)
- {
- va_list arg;
- fprintf(stderr, "Error: ");
- if (fno >= 0) {
- fprintf(stderr, "file %s (%d): ", fname[fno], lineno[fno]);
- }
- va_start(arg, fmt);
- vfprintf(stderr, fmt, arg);
- va_end(arg);
- fprintf(stderr, "\n");
- exit(EXIT_FAILURE);
- }
- static void pushfile(char *filename)
- {
- char *inputfile;
- size_t inputfile_len;
- if (fno >= MAX_FNO) {
- error_msg("file stack size exceeded");
- }
- inputfile_len = inputdir_len + strlen(filename) + 2;
- inputfile = xmalloc(inputfile_len);
- memset(inputfile, 0, inputfile_len);
- sprintf(inputfile, "%s/%s", inputdir, filename);
- if (!(fstack[++fno] = fopen(inputfile, "r"))) {
- --fno; /* oops */
- error_msg("cannot open file %s: %s", inputfile, strerror(errno));
- }
- fname[fno] = xsymdup(inputfile);
- lineno[fno] = 0;
- }
- static void popfile(void)
- {
- if (fno < 0) {
- error_msg("pop on empty file stack");
- }
- /* free(fname[fno]); */
- fclose(fstack[fno]);
- --fno;
- }
- static void eatwhitespace(void)
- {
- while (isspace(*pos)) {
- ++pos;
- }
- }
/* Return 1 when C starts a comment ('#' or '%'), 0 otherwise. */
static int iscommentchar(int c)
{
    switch (c) {
    case '#':
    case '%':
        return 1;
    default:
        return 0;
    }
}
- static int next_line(void)
- {
- size_t n;
- char *s = linebuf;
- assert(fno >= 0);
- pos_e = NULL;
- do {
- if (fgets(s, sizeof(linebuf), fstack[fno]) != NULL) {
- ++lineno[fno];
- n = strlen(linebuf);
- if ((n == sizeof(linebuf) - 1) && (linebuf[n-1] != '\n')) {
- /* Either line is too long or last line is very long with
- * no trailing newline. But we'll always treat it as an
- * errro. */
- error_msg("line too long?");
- }
- --n;
- /* Be careful... last line doesn't need a newline. */
- if (linebuf[n] == '\n') {
- linebuf[n--] = 0; /* trim trailing newline */
- }
- pos = linebuf;
- eatwhitespace();
- if (*pos && !iscommentchar(*pos)) { /* not empty or comment line */
- return 1; /* got a line */
- }
- } else { /* eof */
- popfile();
- }
- } while (fno >= 0);
- return 0;
- }
- static char *next_token(void)
- {
- char *p;
- #if 0
- if (pos_e == NULL) {
- return NULL
- pos = pos_e;
- *pos = end_of_token;
- end_of_token = 0;
- }
- #else
- if (pos_e != NULL) {
- pos = pos_e;
- *pos = end_of_token;
- end_of_token = 0;
- }
- #endif
- eatwhitespace();
- p = pos;
- if (!*p || iscommentchar(*p)) { /* end of line or start of comment */
- pos = pos_e = NULL;
- *p = 0; /* treat comment as end of line */
- /* fprintf(stdout, "returning NUL token |%s|\n", pos); */
- return NULL;
- #if 1
- } else if (*p == '<') { /* collating symbol, element, or value */
- while (*++p) {
- if ((*p == '/') && p[1]) {
- ++p;
- continue;
- }
- if (*p == '>') {
- pos_e = ++p;
- end_of_token = *p;
- *p = 0;
- /* fprintf(stdout, "returning col token |%s|\n", pos); */
- return pos;
- }
- }
- } else if (*p == '"') { /* collating element value? */
- while (*++p) {
- if (*p == '"') { /* found the end of the quoted string */
- pos_e = ++p;
- end_of_token = *p;
- *p = 0;
- /* fprintf(stdout, "returning quote token |%s|\n", pos); */
- return pos;
- }
- }
- #endif
- } else { /* some kind of keyword */
- while (*++p) {
- if (isspace(*p) || (*p == ';')) {
- break;
- }
- }
- pos_e = p;
- end_of_token = *p;
- *p = 0;
- /* fprintf(stdout, "returning key token |%s|\n", pos); */
- return pos;
- }
- error_msg("illegal token |%s|", pos);
- }
/* malloc() wrapper: returns N bytes of storage, or reports "OUT OF MEMORY"
 * through error_msg() when the allocation fails. */
static void *xmalloc(size_t n)
{
    void *mem = malloc(n);

    if (mem == NULL) {
        error_msg("OUT OF MEMORY");
    }
    return mem;
}
- static void do_copy(void)
- {
- char *s;
- char *e;
- if ((s = next_token()) != NULL) {
- e = strchr(s + 1, '"');
- if ((*s == '"') && e && (*e == '"') && !e[1]) {
- if (next_token() != NULL) {
- error_msg("illegal trailing text: %s", pos);
- }
- *e = 0;
- ++s;
- if (cur_base && !strcmp(cur_base->name,s)) {
- /* verbose_msg(VDETAIL, "skipping copy of base file %s\n", s); */
- #warning need to update last in order and position or check
- return;
- }
- /* verbose_msg(VDETAIL, "full copy of %s\n", s); */
- pushfile(s);
- return;
- }
- }
- error_msg("illegal or missing arg for copy: %s", s);
- }
- static void do_colsym(void)
- {
- char *s;
- char *e;
- if ((s = next_token()) != NULL) {
- e = strrchr(s,'>');
- if ((*s == '<') && e && (*e == '>') && !e[1]) {
- if (next_token() != NULL) {
- error_msg("illegal trailing text: %s", pos);
- }
- e[1] = 0; /* cleanup in case next_token stored something */
- add_colitem(s,NULL);
- return;
- }
- }
- error_msg("illegal or missing arg for collating-symbol: %s", s);
- }
- static void do_colele(void)
- {
- char *s;
- char *e;
- char *s1;
- char *e1;
- int n;
- if ((s = next_token()) != NULL) {
- e = strrchr(s,'>');
- if ((*s == '<') && e && (*e == '>') && !e[1]) {
- if (((s1 = next_token()) == NULL)
- || (strcmp(s1,"from") != 0)
- || ((s1 = next_token()) == NULL)
- || (*s1 != '\"')
- ) {
- error_msg("illegal format for collating-element spec");
- }
- e1 = strchr(s1 + 1, '"');
- if ((*s1 != '"') || !e1 || (*e1 != '"') || (e1[1] != 0)) {
- error_msg("illegal definition for collating-element: %s", s1);
- }
- if (next_token() != NULL) {
- error_msg("illegal trailing text: %s", pos);
- }
- e[1] = 0; /* cleanup in case next_token stored something */
- e1[1] = 0;
- add_colitem(s,s1);
- ++s1;
- if (!(n = is_ucode(s1))) {
- error_msg("starting char must be a <U####> code: %s", s1);
- }
- assert(s1[n] == '<');
- s1[n] = 0;
- s = xsymdup(s1);
- if (!(tsearch(s, &cur_base->root_starter_char, sym_cmp))) {
- error_msg("OUT OF MEMORY");
- }
- return;
- }
- }
- error_msg("illegal or missing arg for collating-element: %s", s);
- }
- static ll_item_t *find_section_list_item(const char *name, col_locale_t *loc)
- {
- ll_item_t *p;
- if (!loc) {
- return NULL;
- }
- p = loc->section_list;
- while (p) {
- #warning devel code
- /* if (!((p->data_type == DT_SECTION) || (p->data_type == DT_REORDER))) { */
- /* verbose_msg(VDETAIL, "fsli = %d\n", p->data_type); */
- /* } */
- assert((p->data_type == DT_SECTION) || (p->data_type == DT_REORDER));
- if (!strcmp(name, ((section_t *)(p->data))->name)) {
- break;
- }
- p = p->next;
- }
- return p;
- }
- static ll_item_t *find_ll_last(ll_item_t *p)
- {
- assert(p);
- while (p->next) {
- p = p->next;
- }
- return p;
- }
- static void do_script(void)
- {
- char *s;
- char *e;
- if ((s = next_token()) != NULL) {
- e = strrchr(s,'>');
- if ((*s == '<') && e && (*e == '>') && !e[1]) {
- if (next_token() != NULL) {
- error_msg("illegal trailing text: %s", pos);
- }
- e[1] = 0; /* cleanup in case next_token stored something */
- add_script(s);
- return;
- }
- }
- error_msg("illegal or missing arg for script: %s", s);
- }
- static col_locale_t *new_col_locale(char *name)
- {
- ll_item_t *lli;
- ll_item_t *lli2;
- cur_col = (col_locale_t *) xmalloc(sizeof(col_locale_t));
- cur_col->name = name;
- cur_col->root_colitem = NULL;
- cur_col->root_element = NULL;
- cur_col->root_scripts = NULL;
- cur_col->base_locale = NULL;
- if (!superset) {
- /* start with an anonymous section */
- cur_section = new_section(NULL);
- cur_col->section_list = new_ll_item(DT_SECTION, cur_section);
- } else {
- /* start with a reorder section */
- cur_section = new_section("R");
- cur_num_weights = cur_section->num_rules
- = ((section_t *)(cur_base->section_list->data))->num_rules;
- memcpy(cur_rule,
- ((section_t *)(cur_base->section_list->data))->rules,
- MAX_COLLATION_WEIGHTS);
- memcpy(cur_section->rules,
- ((section_t *)(cur_base->section_list->data))->rules,
- MAX_COLLATION_WEIGHTS);
- cur_col->section_list = new_ll_item(DT_REORDER, cur_section);
- assert(cur_base->section_list->next == NULL); /* currently only one section allowed */
- lli = ((section_t *)(cur_base->section_list->data))->itm_list;
- assert(lli);
- lli2 = new_ll_item(DT_REORDER, cur_section);
- lli2->prev = lli2->next = lli2;
- insque(lli2, lli->prev);
- ((section_t *)(cur_base->section_list->data))->itm_list = lli2;
- }
- /* cur_col->section_list = NULL; */
- /* add_script(((section_t *)(cur_col->section_list->data))->name); */
- cur_col->root_wi_index = NULL;
- cur_col->root_wi_index_reordered = NULL;
- cur_col->root_derived_wi = NULL;
- cur_col->derived_list = NULL;
- cur_col->root_starter_char = NULL;
- cur_col->root_starter_all = NULL;
- cur_col->undefined_idx = NULL;
- return cur_col;
- }
- static int colitem_cmp(const void *n1, const void *n2)
- {
- return strcmp(((colitem_t *)n1)->string, ((colitem_t *)n2)->string);
- }
- static int colelement_cmp(const void *n1, const void *n2)
- {
- int r;
- r = strcmp(((colitem_t *)n1)->string, ((colitem_t *)n2)->string);
- if (!r) {
- if (((colitem_t *)n1)->element && ((colitem_t *)n2)->element) {
- r = strcmp(((colitem_t *)n1)->element, ((colitem_t *)n2)->element);
- } else if (((colitem_t *)n1)->element == ((colitem_t *)n2)->element) {
- r = 0; /* both null */
- } else {
- r = (((colitem_t *)n1)->element == NULL) ? -1 : 1;
- }
- }
- return r;
- }
- static void del_colitem(colitem_t *p)
- {
- /* free((void *) p->element); */
- /* free((void *) p->string); */
- free(p);
- }
- static colitem_t *new_colitem(char *item, char *def)
- {
- colitem_t *p;
- p = xmalloc(sizeof(colitem_t));
- p->string = xsymdup(item);
- p->element = (!def) ? def : xsymdup(def);
- return p;
- }
- static void add_colitem(char *item, char *def)
- {
- colitem_t *p;
- #if 0
- printf("adding collation item %s", item);
- if (def) {
- printf(" with definition %s", def);
- }
- printf("\n");
- #endif
- p = new_colitem(item, def);
- #warning devel code
- if (superset) {
- if (tfind(p, &cur_base->root_colitem, colitem_cmp)) {
- /* verbose_msg(VDETAIL, "skipping superset duplicate collating item \"%s\"\n", p->string); */
- del_colitem(p);
- return;
- /* } else { */
- /* verbose_msg(VDETAIL, "superset: new collating item \"%s\" = %s\n", p->string, p->element); */
- }
- }
- if (cur_col == cur_derived) {
- if (!tfind(p, &cur_base->root_colitem, colitem_cmp)) {
- /* not in current but could be in base */
- if (!tsearch(p, &cur_base->root_colitem, colitem_cmp)) {
- error_msg("OUT OF MEMORY!");
- }
- } else if (!tfind(p, &cur_base->root_colitem, colelement_cmp)) {
- error_msg("collating element/symbol mismatch: item=%s def=%s", item, def);
- }
- }
- if (!tfind(p, &cur_col->root_colitem, colitem_cmp)) {
- /* not in current but could be in base */
- if (!tsearch(p, &cur_col->root_colitem, colitem_cmp)) {
- error_msg("OUT OF MEMORY!");
- }
- } else if (!tfind(p, &cur_col->root_colitem, colelement_cmp)) {
- error_msg("collating element/symbol mismatch");
- } else { /* already there */
- fprintf(stderr, "duplicate collating item \"%s\"\n", p->string);
- del_colitem(p);
- }
- }
- /* add a script (section) to the current locale */
- static void add_script(const char *s)
- {
- ll_item_t *l;
- /* make sure it isn't in base if working with derived */
- if (cur_base != cur_col) {
- if (find_section_list_item(s, cur_base)) {
- error_msg("attempt to add script %s for derived when already in base", s);
- }
- }
- if (find_section_list_item(s, cur_col)) {
- error_msg("attempt to readd script %s", s);
- }
- l = find_ll_last(cur_col->section_list);
- insque(new_ll_item(DT_SECTION, new_section(s)), l);
- }
- static const char str_forward[] = "forward";
- static const char str_backward[] = "backward";
- static const char str_position[] = "position";
- static void do_order_start(void)
- {
- const char *s;
- char *e;
- ll_item_t *l;
- section_t *sect;
- int rule;
- if (order_state & ~IN_ORDER) {
- error_msg("order_start following reorder{_sections}_after");
- }
- order_state |= IN_ORDER;
- if (superset) {
- if (++superset_order_start_cnt > 1) {
- error_msg("currently only a common order_start is supported in superset");
- }
- return;
- }
- if (!(s = next_token())) {
- s = str_forward; /* if no args */
- }
- if (*s == '<') { /* section (script) */
- e = strrchr(s,'>');
- if ((*s == '<') && e && (*e == '>') && !e[1]) {
- e[1] = 0; /* cleanup in case next_token stored something */
- if (!(l = find_section_list_item(s, cur_col))) {
- error_msg("ref of undefined sections: %s", s);
- }
- sect = (section_t *)(l->data);
- if (sect->num_rules) {
- error_msg("sections already defined: %s", s);
- }
- } else {
- error_msg("illegal section ref: %s", s);
- }
- if (!(s = next_token())) {
- s = str_forward; /* if no args */
- } else if (*s != ';') {
- error_msg("missing seperator!");
- }
- } else { /* need an anonymous section */
- if ((*cur_section->name != '<') && (cur_section->num_items == 0)) { /* already in an empty anonymous section */
- sect = cur_section;
- /* fprintf(stdout, "using empty anon section %s\n", sect->name); */
- } else {
- sect = new_section(NULL);
- l = find_ll_last(cur_col->section_list);
- insque(new_ll_item(DT_SECTION, sect), l);
- /* fprintf(stdout, "adding order section after section %s\n", ((section_t *)(l->data))->name); */
- /* fprintf(stdout, " last section is %s\n", ((section_t *)(l->next->data))->name); */
- }
- sect->num_rules = 0; /* setting this below so nix default */
- }
- cur_section = sect;
- /* fprintf(stdout, "cur_section now %s\n", cur_section->name); */
- #warning need to add section to weight list?
- /* now do rules */
- do {
- rule = 0;
- if (*s == ';') {
- ++s;
- }
- while (*s) {
- if (!strncmp(str_forward, s, 7)) {
- rule |= R_FORWARD;
- s += 7;
- } else if (!strncmp(str_backward, s, 8)) {
- rule |= R_BACKWARD;
- s += 8;
- } else if (!strncmp(str_position, s, 8)) {
- rule |= R_POSITION;
- s += 8;
- }
- if (*s == ',') {
- ++s;
- continue;
- }
- if (!*s || (*s == ';')) {
- if (sect->num_rules >= MAX_COLLATION_WEIGHTS) {
- error_msg("more than %d weight rules!", MAX_COLLATION_WEIGHTS);
- }
- if (!rule) {
- error_msg("missing weight rule!");
- }
- if ((rule & (R_FORWARD|R_BACKWARD|R_POSITION)) > R_BACKWARD) {
- error_msg("backward paired with forward and/or position!");
- }
- sect->rules[sect->num_rules++] = rule;
- rule = 0;
- continue;
- }
- error_msg("illegal weight rule: %s", s);
- }
- } while ((s = next_token()) != NULL);
- cur_section = sect;
- /* verbose_msg(VDETAIL, "setting cur_num_weights to %d for %s\n", sect->num_rules, sect->name); */
- cur_num_weights = sect->num_rules;
- memcpy(cur_rule, sect->rules, MAX_COLLATION_WEIGHTS);
- }
- static void do_order_end(void)
- {
- if (!(order_state & IN_ORDER)) {
- error_msg("order_end with no matching order_start");
- }
- order_state &= ~IN_ORDER;
- cur_section = new_section(NULL);
- }
- static void do_reorder_after(void)
- {
- char *t;
- ll_item_t *lli;
- const weight_t *w;
- int save_cur_num_weights;
- char save_cur_rule[MAX_COLLATION_WEIGHTS];
- if (order_state & ~IN_REORDER) {
- error_msg("reorder_after following order_start or reorder_sections_after");
- }
- order_state |= IN_REORDER;
- if (superset) {
- error_msg("currently reorder_after is not supported in supersets");
- }
- #warning have to use rule for current section!!!
- if (!(t = next_token())) {
- error_msg("missing arg for reorder_after");
- }
- t = xsymdup(t);
- if (next_token() != NULL) {
- error_msg("trailing text reorder_after: %s", pos);
- }
- if (cur_col == cur_base) {
- error_msg("sorry.. reorder_after in base locale is not currently supported");
- }
- if (!(lli = find_wi_index(t, cur_base))) {
- error_msg("reorder_after for non-base item currently not supported: %s", t);
- }
- w = ((weighted_item_t *)(lli->data))->weight;
- save_cur_num_weights = cur_num_weights;
- memcpy(save_cur_rule, cur_rule, MAX_COLLATION_WEIGHTS);
- cur_section = new_section("R");
- insque(new_ll_item(DT_REORDER, cur_section), lli);
- #if 0
- {
- ll_item_t *l1;
- ll_item_t *l2;
- ll_item_t *l3;
- l1 = new_ll_item(DT_REORDER, cur_section);
- l2 = find_ll_last(cur_col->section_list);
- insque(l1, l2);
- l3 = find_ll_last(cur_col->section_list);
- verbose_msg(VDETAIL, "reorder_after %p %p %p %s\n", l1, l2, l3, cur_section->name);
- }
- #else
- insque(new_ll_item(DT_REORDER, cur_section), find_ll_last(cur_col->section_list));
- #endif
- cur_num_weights = cur_section->num_rules = save_cur_num_weights;
- memcpy(cur_rule, save_cur_rule, MAX_COLLATION_WEIGHTS);
- memcpy(cur_section->rules, save_cur_rule, MAX_COLLATION_WEIGHTS);
- #warning devel code
- /* verbose_msg(VDETAIL, "reorder -- %s %d\n", ((weighted_item_t *)(lli->data))->symbol, w->num_weights); */
- #warning hack to get around hu_HU reorder-after problem
- /* if (!w->num_weights) { */
- /* } else { */
- /* cur_num_weights = w->num_weights; */
- /* memcpy(cur_rule, w->rule, MAX_COLLATION_WEIGHTS); */
- /* } */
- /* verbose_msg(VDETAIL, "reorder_after succeeded for %s\n", t); */
- }
- static void do_reorder_end(void)
- {
- if (!(order_state & IN_REORDER)) {
- error_msg("reorder_end with no matching reorder_after");
- }
- order_state &= ~IN_REORDER;
- }
- static void do_reorder_sections_after(void)
- {
- const char *t;
- ll_item_t *lli;
- if (order_state & ~IN_REORDER_SECTIONS) {
- error_msg("reorder_sections_after following order_start or reorder_after");
- }
- order_state |= IN_REORDER_SECTIONS;
- if (superset) {
- error_msg("currently reorder_sections_after is not supported in supersets");
- }
- if (!(t = next_token())) {
- error_msg("missing arg for reorder_sections_after");
- }
- t = xsymdup(t);
- if (next_token() != NULL) {
- error_msg("trailing text reorder_sections_after: %s", pos);
- }
- if (cur_col == cur_base) {
- error_msg("sorry.. reorder_sections_after in base locale is not currently supported");
- }
- lli = cur_base->section_list;
- do {
- /* verbose_msg(VDETAIL, "hmm -- |%s|%d|\n", ((section_t *)(lli->data))->name, lli->data_type); */
- if (lli->data_type & DT_SECTION) {
- /* verbose_msg(VDETAIL, "checking |%s|%s|\n", ((section_t *)(lli->data))->name, t); */
- if (!strcmp(((section_t *)(lli->data))->name, t)) {
- reorder_section_ptr = lli;
- return;
- }
- }
- lli = lli->next;
- } while (lli);
- error_msg("reorder_sections_after for non-base item currently not supported: %s", t);
- }
- static void do_reorder_sections_end(void)
- {
- if (!(order_state & IN_REORDER_SECTIONS)) {
- error_msg("reorder_sections_end with no matching reorder_sections_after");
- }
- order_state &= ~IN_REORDER_SECTIONS;
- reorder_section_ptr = NULL;
- }
- static ll_item_t *new_ll_item(int data_type, void *data)
- {
- ll_item_t *p;
- p = xmalloc(sizeof(ll_item_t));
- p->next = p->prev = NULL;
- p->data_type = data_type;
- p->data = data;
- p->idx = INT_MIN;
- return p;
- }
/*
 * Comparison callback for the symbol trees: both arguments are treated as
 * NUL-terminated strings and ordered with strcmp().
 */
static int sym_cmp(const void *n1, const void *n2)
{
	const char *lhs = n1;
	const char *rhs = n2;

	return strcmp(lhs, rhs);
}
- static char *xsymdup(const char *s)
- {
- void *p;
- if (!(p = tfind(s, &root_sym, sym_cmp))) { /* not a currently known symbol */
- if (!(s = strdup(s)) || !(p = tsearch(s, &root_sym, sym_cmp))) {
- error_msg("OUT OF MEMORY!");
- }
- ++num_sym;
- mem_sym += strlen(s) + 1;
- /* verbose_msg(VDETAIL, "xsymdup: alloc |%s| %p |%s| %p\n", *(char **)p, p, s, s); */
- /* } else { */
- /* verbose_msg(VDETAIL, "xsymdup: found |%s| %p\n", *(char **)p, p); */
- }
- return *(char **) p;
- }
- static int weight_cmp(const void *n1, const void *n2)
- {
- const weight_t *w1 = (const weight_t *) n1;
- const weight_t *w2 = (const weight_t *) n2;
- int i, r;
- if (w1->num_weights != w2->num_weights) {
- return w1->num_weights - w2->num_weights;
- }
- for (i=0 ; i < w1->num_weights ; i++) {
- if (w1->rule[i] != w2->rule[i]) {
- return w1->rule[i] - w2->rule[i];
- }
- if ((r = strcmp(w1->colitem[i], w2->colitem[i])) != 0) {
- return r;
- }
- }
- return 0;
- }
- static weight_t *register_weight(weight_t *w)
- {
- void *p;
- if (!(p = tfind(w, &root_weight, weight_cmp))) { /* new weight */
- p = xmalloc(sizeof(weight_t));
- memcpy(p, w, sizeof(weight_t));
- if (!(p = tsearch(p, &root_weight, weight_cmp))) {
- error_msg("OUT OF MEMORY!");
- }
- ++unique_weights;
- /* } else { */
- /* verbose_msg(VDETAIL, "rw: found\n"); */
- }
- return *(weight_t **)p;
- }
- static size_t ll_len(ll_item_t *l)
- {
- size_t n = 0;
- ll_item_t *p = l;
- while (p) {
- ++n;
- p = p->next;
- if (p == l) { /* work for circular too */
- break;
- }
- }
- return n;
- }
- static size_t ll_count(ll_item_t *l, int mask)
- {
- size_t n = 0;
- ll_item_t *p = l;
- while (p) {
- if (p->data_type & mask) {
- ++n;
- }
- p = p->next;
- if (p == l) { /* work for circular too */
- break;
- }
- }
- return n;
- }
- static int wi_index_cmp(const void *n1, const void *n2)
- {
- const char *s1 = ((weighted_item_t *)(((ll_item_t *) n1)->data))->symbol;
- const char *s2 = ((weighted_item_t *)(((ll_item_t *) n2)->data))->symbol;
- return strcmp(s1, s2);
- }
- static void add_wi_index(ll_item_t *l)
- {
- assert(l->data_type == DT_WEIGHTED);
- if (!strcmp(((weighted_item_t *)(l->data))->symbol, "UNDEFINED")) {
- cur_col->undefined_idx = l;
- }
- if (!tfind(l, &cur_col->root_wi_index, wi_index_cmp)) { /* new wi_index */
- if (!tsearch(l, &cur_col->root_wi_index, wi_index_cmp)) {
- error_msg("OUT OF MEMORY!");
- }
- }
- if (cur_base != cur_col) {
- if (!tfind(l, &cur_base->root_wi_index, wi_index_cmp)) {/* not a base val */
- /* printf("derived: %s\n", ((weighted_item_t *)(l->data))->symbol); */
- if (!tfind(l, &cur_base->root_derived_wi, wi_index_cmp)) { /* new derived */
- if (!tsearch(l, &cur_base->root_derived_wi, wi_index_cmp)) {
- error_msg("OUT OF MEMORY!");
- }
- }
- }
- }
- }
- static int final_index; /* next collation index handed out by add_final_col_index(); finalize_base() restarts it at 1 */
/*
 * Test whether s begins with a "<Uxxxx>" symbolic name: '<', 'U', exactly
 * four hexadecimal digits, then '>'.
 *
 * Returns 7 (the length of the matched prefix) on a match and 0 otherwise.
 * Characters are examined left to right and the scan stops at the first
 * mismatch, so a string shorter than 7 characters is never read past its
 * terminating NUL.
 */
static int is_ucode(const char *s)
{
	int i;

	if ((s[0] != '<') || (s[1] != 'U')) {
		return 0;
	}

	for (i = 2 ; i < 6 ; i++) {
		/*
		 * The <ctype.h> classifiers require a value representable as
		 * unsigned char (or EOF); plain char may be negative, so cast.
		 */
		if (!isxdigit((unsigned char) s[i])) {
			return 0;
		}
	}

	return (s[6] == '>') ? 7 : 0;
}
- static void add_final_col_index(const char *s)
- {
- ENTRY e;
- e.key = (char *) s;
- e.data = (void *)(final_index);
- if (!hsearch(e, FIND)) { /* not in the table */
- if (!hsearch(e, ENTER)) {
- error_msg("OUT OF MEMORY! (hsearch)");
- }
- #if 0
- {
- int n;
- void *v;
- colitem_t ci;
- colitem_t *p;
- const char *t;
- if (!strcmp(s, "UNDEFINED")) {
- printf("%6d: %s\n", final_index, s);
- } else {
- assert(*s == '<');
- if ((n = is_ucode(s)) != 0) {
- assert(!s[n]);
- printf("%6d: %s\n", final_index, s);
- } else {
- ci.string = (char *) s;
- ci.element = NULL; /* don't care */
- v = tfind(&ci, &cur_base->root_colitem, colitem_cmp);
- if (!v) {
- verbose_msg(VDETAIL, "%s NOT DEFINED!!!\n", s);
- } else {
- p = *((colitem_t **) v);
- if (p->element != NULL) {
- t = p->element;
- assert(*t == '"');
- ++t;
- n = is_ucode(t);
- assert(n);
- printf("%6d: %.*s | ", final_index, n, t);
- do {
- t += n;
- assert(*t);
- if (*t == '"') {
- assert(!t[1]);
- break;
- }
- n = is_ucode(t);
- assert(n);
- printf("%.*s", n, t);
- } while (1);
- printf(" collating-element %s\n", s);
- } else {
- printf("%6d: %s (collating-symbol)\n", final_index, s);
- }
- }
- }
- }
- }
- #endif
- ++final_index;
- }
- }
/*
 * Look up the collation index stored for symbol s in the hsearch() table.
 *
 * Returns the stored index, or 0 when s has no entry.
 */
static int final_index_val0(const char *s)
{
	ENTRY *p;
	ENTRY e;

	e.key = (char *) s;
	e.data = NULL;	/* not used by FIND; set so the struct is fully initialised */

	if (!(p = hsearch(e, FIND))) { /* not in the table */
		return 0;
	}

	/*
	 * The index is stored in the void * data slot; narrow it through
	 * intptr_t rather than casting a pointer directly to int.
	 */
	return (int)(intptr_t)(p->data);
}
/*
 * Look up the collation index stored for symbol s in the hsearch() table.
 *
 * Unlike final_index_val0(), a missing entry is reported through
 * error_msg().  Returns the stored index.
 */
static int final_index_val(const char *s)
{
	ENTRY *p;
	ENTRY e;

	e.key = (char *) s;
	e.data = NULL;	/* not used by FIND; set so the struct is fully initialised */

	if (!(p = hsearch(e, FIND))) { /* not in the table */
		error_msg("can't find final index: %s", s);
	}

	/*
	 * NOTE(review): p is NULL here if error_msg() ever returns --
	 * presumably it terminates the program; confirm.
	 *
	 * The index is stored in the void * data slot; narrow it through
	 * intptr_t rather than casting a pointer directly to int.
	 */
	return (int)(intptr_t)(p->data);
}
/* Running total for the tree walk started by tnumnodes(). */
static size_t num_tree_nodes;

/*
 * twalk() action used by tnumnodes(): bump num_tree_nodes on the
 * postorder visit and on the leaf visit of a node; the other visit
 * kinds are ignored.  ptr and level are unused.
 */
static void count_nodes(const void *ptr, VISIT order, int level)
{
	switch (order) {
	case postorder:
	case leaf:
		num_tree_nodes++;
		break;
	default:
		break;
	}
}

/*
 * Return the number of nodes in the tsearch() tree rooted at root by
 * walking it with count_nodes().
 */
static size_t tnumnodes(const void *root)
{
	num_tree_nodes = 0;
	twalk(root, count_nodes);

	return num_tree_nodes;
}
- static ll_item_t *find_wi_index(const char *sym, col_locale_t *cl)
- {
- weighted_item_t w;
- ll_item_t l;
- void *p;
- w.symbol = sym;
- l.data = &w;
- l.data_type = DT_WEIGHTED;
- p = tfind(&l, &cl->root_wi_index, wi_index_cmp);
- if (p) {
- p = *(ll_item_t **)p;
- }
- return (ll_item_t *) p;
- }
- static void mark_reordered(const char *sym)
- {
- ll_item_t *lli;
- lli = find_wi_index(sym, cur_base);
- if (lli) {
- if (!tsearch(lli, &cur_base->root_wi_index_reordered, wi_index_cmp)) {
- error_msg("OUT OF MEMORY!");
- }
- }
- }
- static ll_item_t *find_wi_index_reordered(const char *sym)
- {
- weighted_item_t w;
- ll_item_t l;
- void *p;
- w.symbol = sym;
- l.data = &w;
- l.data_type = DT_WEIGHTED;
- p = tfind(&l, &cur_base->root_wi_index_reordered, wi_index_cmp);
- if (p) {
- p = *(ll_item_t **)p;
- }
- return (ll_item_t *) p;
- }
- static ll_item_t *init_comm_ptr(void)
- {
- assert(cur_base);
- assert(cur_base->section_list);
- /* at the moment, only support one section in comm */
- assert(cur_base->section_list->next == NULL);
- comm_cur_ptr = ((section_t *)(cur_base->section_list->data))->itm_list;
- while (comm_cur_ptr && (comm_cur_ptr->data_type & DT_REORDER)) {
- comm_cur_ptr = comm_cur_ptr->next;
- }
- #warning devel code
- /* { */
- /* ll_item_t *p = comm_cur_ptr; */
- /* verbose_msg(VDETAIL, "init_comm_ptr\n"); */
- /* while (p != comm_cur_ptr) { */
- /* if (p->data_type & DT_WEIGHTED) { */
- /* verbose_msg(VDETAIL, "%s", ((weighted_item_t *)p)->symbol); */
- /* } */
- /* p = p->next; */
- /* } */
- /* } */
- assert(comm_cur_ptr);
- /* verbose_msg(VDETAIL, "init_comm_ptr -- %s %p %p %p %d\n", */
- /* ((weighted_item_t *)(comm_cur_ptr->data))->symbol, */
- /* comm_cur_ptr, comm_cur_ptr->prev, comm_cur_ptr->next, */
- /* ll_len(comm_cur_ptr)); */
- comm_prev_ptr = NULL;
- return comm_cur_ptr;
- }
- static ll_item_t *next_comm_ptr(void)
- {
- /* at the moment, only support one section in comm */
- assert(cur_base->section_list->next == NULL);
- comm_prev_ptr = comm_cur_ptr;
- while (comm_cur_ptr && ((comm_cur_ptr = comm_cur_ptr->next) != NULL)) {
- if (!(comm_cur_ptr->data_type & DT_REORDER)) {
- break;
- }
- }
- return comm_cur_ptr;
- }
- static int dump_count; /* weighted items visited by dump_section(); zeroed by dump_weights() before each dump */
- #if 0
- static void dump_section(section_t *s, int mask, col_locale_t *der)
- { /* Disabled (#if 0) verbose variant of dump_section().
-    *
-    * Walks the item list of section s, skipping items whose data_type has
-    * no bit in common with mask.  For each weighted item it prints the
-    * running dump_count, the symbol, the number of weights, one F/B/P
-    * letter group per rule and the colitem strings.  DT_SECTION and
-    * DT_REORDER items are dumped recursively between marker lines; a
-    * DT_REORDER item is skipped unless its section name starts with
-    * der->name.  The walk ends when it returns to the first item.
-    */
- ll_item_t *lli;
- ll_item_t *lli0;
- weighted_item_t *w;
- weight_t *p;
- int i;
- lli0 = lli = s->itm_list;
- if (!lli0) {
- return;
- }
- do {
- if (!(lli->data_type & mask)) {
- lli = lli->next;
- continue;
- }
- if (lli->data_type & DT_WEIGHTED) {
- ++dump_count;
- w = (weighted_item_t *)(lli->data);
- p = w->weight;
- printf("%6d: %s (%d) ", dump_count, w->symbol, p->num_weights);
- for (i = 0 ; i < p->num_weights ; i++) {
- if (p->rule[i] & R_FORWARD) {
- printf("F");
- }
- if (p->rule[i] & R_BACKWARD) {
- printf("B");
- }
- if (p->rule[i] & R_POSITION) {
- printf("P");
- }
- printf(",");
- }
- for (i = 0 ; i < p->num_weights ; i++) {
- printf(" %s", p->colitem[i]);
- }
- printf("\n");
- } else if (lli->data_type & (DT_SECTION|DT_REORDER)) {
- if (lli->data_type == DT_REORDER) {
- assert(der);
- if (strncmp(((section_t *)(lli->data))->name, der->name, strlen(der->name))) { /* reorder section of another derived locale */
- lli = lli->next;
- continue;
- }
- }
- if (lli->data_type & DT_SECTION) {
- printf("SECTION -----------------\n");
- } else {
- printf("REORDER -----------------\n");
- }
- dump_section((section_t *)(lli->data), mask, der);
- printf("DONE --------------------\n");
- }
- lli = lli->next;
- } while (lli != lli0);
- }
- #else
- static int in_reorder_section = 0;
- static void dump_section(section_t *s, int mask, col_locale_t *der)
- {
- ll_item_t *lli;
- ll_item_t *lli0;
- weighted_item_t *w;
- weight_t *p;
- int i;
- lli0 = lli = s->itm_list;
- if (!lli0) {
- return;
- }
- do {
- if (!(lli->data_type & mask)) {
- lli = lli->next;
- continue;
- }
- if (lli->data_type & DT_WEIGHTED) {
- ++dump_count;
- w = (weighted_item_t *)(lli->data);
- p = w->weight;
- #if 1
- if (in_reorder_section) {
- printf(" %p", w);
- }
- #else
- printf("%6d: %s (%d) ", dump_count, w->symbol, p->num_weights);
- for (i = 0 ; i < p->num_weights ; i++) {
- if (p->rule[i] & R_FORWARD) {
- printf("F");
- }
- if (p->rule[i] & R_BACKWARD) {
- printf("B");
- }
- if (p->rule[i] & R_POSITION) {
- printf("P");
- }
- printf(",");
- }
- for (i = 0 ; i < p->num_weights ; i++) {
- printf(" %s", p->colitem[i]);
- }
- printf("\n");
- #endif
- } else if (lli->data_type & (DT_SECTION|DT_REORDER)) {
- if (lli->data_type == DT_REORDER) {
- assert(der);
- if (strncmp(((section_t *)(lli->data))->name, der->name, strlen(der->name))) {
- lli = lli->next;
- continue;
- }
- }
- if (lli->data_type & DT_SECTION) {
- /* printf("SECTION -----------------\n"); */
- assert(0);
- } else {
- /* printf("REORDER -----------------\n"); */
- in_reorder_section = 1;
- }
- dump_section((section_t *)(lli->data), mask, der);
- /* printf("DONE --------------------\n"); */
- printf("\n");
- in_reorder_section = 0;
- }
- lli = lli->next;
- } while (lli != lli0);
- }
- #endif
- static void dump_weights(const char *name)
- {
- ll_item_t *lli;
- col_locale_t *base;
- col_locale_t *der;
- col_locale_t cl;
- void *p;
- assert(name);
- if (!*name) { /* use last */
- base = cur_base;
- der = cur_derived;
- } else {
- cl.name = (char *) name;
- if (!(p = tfind(&cl, &root_col_locale, col_locale_cmp))) {
- error_msg("unknown locale: %s", name);
- }
- base = *((col_locale_t **) p);
- der = NULL;
- if (base->base_locale) { /* oops... really derived */
- der = base;
- base = der->base_locale;
- }
- }
- dump_count = 0;
- if (base) {
- /* printf("BASE - %s\n", base->name); */
- for (lli = base->section_list ; lli ; lli = lli->next) {
- /* printf("SECTION %s\n", ((section_t *)(lli->data))->name); */
- dump_section((section_t *)(lli->data), ~0, der);
- }
- }
- assert(der != base);
- if (der) {
- /* printf("DERIVED - %s\n", der->name); */
- for (lli = der->section_list ; lli ; lli = lli->next) {
- if (lli->data_type == DT_SECTION) {
- dump_section((section_t *)(lli->data), DT_WEIGHTED, der);
- }
- }
- }
- /* printf("DONE\n"); */
- }
/*
 * twalk() action: print the string key of a tree node to stderr, once per
 * node (on its postorder or leaf visit).  level is unused.
 */
static void print_starter_node(const void *ptr, VISIT order, int level)
{
	const char *key;

	if ((order != postorder) && (order != leaf)) {
		return;
	}

	key = *(const char **) ptr;
	fprintf(stderr, " %s\n", key);
}
- static void finalize_base(void)
- { /* Finalize the base locale currently held in cur_base.
-    *
-    * Fills in base_locale_array[base_locale_len] and increments
-    * base_locale_len at the end.  In order, the function:
-    *   - creates the hsearch() table used by add_final_col_index() and
-    *     final_index_val*(), and destroys it again before returning;
-    *   - pass 1: assigns lli->idx to the range item and to the weighted
-    *     base items that are not marked reordered, recording each idx in
-    *     index2weight_buffer;
-    *   - pass 2: does the same for the weighted items skipped in pass 1
-    *     and numbers the items reached through reorder sections;
-    *   - pass 3: numbers the weighted items of every derived locale's
-    *     non-reorder sections, starting after the base indexes (mi);
-    *   - fills wcs2index[] for <U0001>..<UFFFF> from the hash table and
-    *     hands it to newopt() with wcs2colidt_buffer as destination;
-    *   - appends rule indexes (add_rule) to index2ruleidx_buffer;
-    *   - emits one der_locale_array entry per derived locale, plus one
-    *     for the base itself when its name has the form ll_CC.
-    * Failures are reported through error_msg() and assert().
-    */
- ll_item_t *s;
- ll_item_t *h;
- ll_item_t *lli;
- ll_item_t *h2;
- ll_item_t *l2;
- ll_item_t *cli;
- ll_item_t *rli = NULL; /* the DT_RANGE item found in pass 1, if any (at most one, see assert(!rli)) */
- weighted_item_t *w;
- weight_t *p;
- int i, n, mr, r, mi;
- col_locale_t *cl;
- void *mm;
- int num_invariant = 0; /* weighted base items indexed in pass 1 */
- int num_varying = 0; /* weighted base items marked reordered and deferred to pass 2 */
- int max_weight;
- int index2weight_len_inc = 1; /* set to 0 for ja_JP / ko_KR: index2weight_len is then not advanced */
- assert(cur_base);
- assert(base_locale_len+1 < BASE_LOCALE_LEN);
- base_locale_array[base_locale_len].name = cur_base->name;
- base_locale_array[base_locale_len].num_weights = 1;
- base_locale_array[base_locale_len].index2weight_offset = index2weight_len;
- base_locale_array[base_locale_len].index2ruleidx_offset = index2ruleidx_len;
- if (!strcmp(cur_base->name,"ja_JP") || !strcmp(cur_base->name,"ko_KR")) {
- #warning fix the index2weight check!!
- index2weight_len_inc = 0;
- }
- /* printf("%s -- index2weight_len = %d\n", cur_base->name, index2weight_len); */
- if (!hcreate(30000)) { /* symbol -> collation index table; torn down by hdestroy() at the end */
- error_msg("OUT OF MEMORY!");
- }
- /* first pass ... set the fixed indexes */
- final_index = i = 1; /* i: running idx (weight) counter; final_index: running collation index */
- mr = 0; /* largest reorder-list length seen since the last indexed item; added to i before the next one */
- for (s = cur_base->section_list ; s ; s = s->next) {
- #if 1
- if (s->data_type & DT_REORDER) { /* a reordered section */
- verbose_msg(VDETAIL, "pass1: reordered section %s - xxx\n", ((section_t *)(s->data))->name);
- lli = ((section_t *)(s->data))->itm_list;
- r = 0;
- if (lli) {
- /* r = ll_len( ((section_t *)(lli->data))->itm_list ); */
- r = ll_len(lli) + 1;
- }
- if (r > mr) {
- mr = r;
- }
- verbose_msg(VDETAIL, "pass1: reordered section %s - %d\n", ((section_t *)(s->data))->name, r);
- continue;
- }
- #endif
- h = lli = ((section_t *)(s->data))->itm_list;
- if (!lli) {
- continue;
- }
- do {
- if (lli->data_type & DT_RANGE) {
- i += mr;
- mr = 0;
- #warning check ko_kR and 9
- /* ++i; */
- lli->idx = i;
- assert(!rli);
- rli = lli;
- verbose_msg(VDETAIL, "range pre = %d after = ", i);
- i += ((range_item_t *)(lli->data))->length + 1; /* step i past the whole range */
- #warning check ko_kR and 9
- /* ++i; */
- verbose_msg(VDETAIL, "%d\n", i);
- if (!index2weight_len_inc) { /* ko_KR hack */
- final_index += ((range_item_t *)(lli->data))->length + 1;
- }
- /* add_final_col_index("RANGE"); */
- } else if (lli->data_type & DT_WEIGHTED) {
- i += mr;
- mr = 0;
- w = (weighted_item_t *)(lli->data);
- if (find_wi_index_reordered(w->symbol)) { /* reordered symbol so skip on first pass */
- ++num_varying;
- ++i; /* still consumes one idx slot */
- continue;
- }
- ++num_invariant;
- index2weight_buffer[index2weight_len] = lli->idx = i++;
- index2weight_len += index2weight_len_inc;
- add_final_col_index(w->symbol);
- } else {
- assert(lli->data_type & DT_REORDER);
- r = ll_len( ((section_t *)(lli->data))->itm_list );
- #warning check ko_kR and 9
- if (r > mr) {
- mr = r;
- }
- /* r = 0; */
- }
- } while ((lli = lli->next) != h);
- }
- /* second pass ... set the reordered indexes */
- mi = i + mr; /* first idx value past everything reserved in pass 1; pass 3 numbers derived items from here */
- mr = i = 0;
- for (s = cur_base->section_list ; s ; s = s->next) {
- h = lli = ((section_t *)(s->data))->itm_list;
- if (!lli) {
- continue;
- }
- do {
- if (lli->data_type & DT_RANGE) {
- i += mr;
- mr = 0;
- i = lli->idx + ((range_item_t *)(lli->data))->length + 1; /* resynchronise i from the idx stored in pass 1 */
- #warning check
- } else if ((lli->data_type & DT_WEIGHTED) && !(s->data_type & DT_REORDER)) {
- i += mr;
- mr = 0;
- w = (weighted_item_t *)(lli->data);
- if (find_wi_index_reordered(w->symbol) /* reordered symbol skipped on first pass */
- #if 0
- || (s->data_type & DT_REORDER) /* or in a reordered section */
- #endif
- ) {
- assert(!(s->data_type & DT_REORDER));
- index2weight_buffer[index2weight_len] = lli->idx = ++i;
- index2weight_len += index2weight_len_inc;
- add_final_col_index(w->symbol);
- /* fprintf(stdout, "%11s: r %6d %6d %s\n", */
- /* cur_base->name, lli->idx, final_index_val(w->symbol), w->symbol); */
- continue;
- }
- i = lli->idx; /* invariant item: pick up the idx assigned in pass 1 */
- /* fprintf(stdout, "%11s: w %6d %6d %s\n", */
- /* cur_base->name, lli->idx, final_index_val(w->symbol), w->symbol); */
- } else {
- /* verbose_msg(VDETAIL, "section: %s %d %d\n", ((section_t *)(s->data))->name, */
- /* s->data_type, lli->data_type); */
- /* assert(!(s->data_type & DT_REORDER)); */
- /* assert(lli->data_type & DT_REORDER); */
- #if 1
- if (s->data_type & DT_REORDER) {
- h2 = l2 = lli; /* s itself is a reorder section: number its items starting at lli */
- if (!h2) {
- continue;
- }
- } else {
- assert(s->data_type & DT_SECTION);
- h2 = l2 = ((section_t *)(lli->data))->itm_list; /* lli is a nested section entry: number that section's items */
- if (!h2) {
- continue;
- }
- }
- #else
- h2 = l2 = ((section_t *)(lli->data))->itm_list;
- if (!h2) {
- continue;
- }
- #endif
- r = 0;
- do {
- assert(l2->data_type & DT_WEIGHTED);
- ++r;
- l2->idx = i + r; /* offsets 1..r after the current position i */
- /* fprintf(stdout, "%s: R %6d %s\n", */
- /* ((section_t *)(lli->data))->name, l2->idx, ((weighted_item_t *)(l2->data))->symbol); */
- } while ((l2 = l2->next) != h2);
- if (r > mr) {
- mr = r;
- }
- }
- } while ((lli = lli->next) != h);
- }
- /* finally, walk through all derived locales and set non-reordered section items */
- mr = mi; /* mr now tracks the highest idx handed out over all derived locales */
- for (cli = cur_base->derived_list ; cli ; cli = cli->next) {
- cl = (col_locale_t *)(cli->data);
- /* verbose_msg(VDETAIL, "pass3: %d %s\n", cli->data_type, cl->name); */
- /* fprintf(stdout, "pass3: %d %s\n", cli->data_type, cl->name); */
- assert(cli->data_type == DT_COL_LOCALE);
- i = mi; /* every derived locale starts numbering at the same point */
- for (s = cl->section_list ; s ; s = s->next) {
- /* if (s->data_type & DT_REORDER) { */
- /* continue; */
- /* } */
- h = lli = ((section_t *)(s->data))->itm_list;
- if (!lli) {
- continue;
- }
- do {
- assert(!(lli->data_type & DT_RANGE));
- if (lli->data_type & DT_WEIGHTED) {
- /* verbose_msg(VDETAIL, " %d %d %s\n", lli->data_type, lli->idx, ((weighted_item_t *)(lli->data))->symbol); */
- add_final_col_index(((weighted_item_t *)(lli->data))->symbol);
- if (s->data_type & DT_REORDER) {
- continue;
- }
- assert(lli->idx == INT_MIN); /* INT_MIN is the value new_ll_item() stores: no idx assigned yet */
- lli->idx = ++i;
- /* fprintf(stdout, "%11s: S %6d %6d %s\n", */
- /* cl->name, lli->idx, */
- /* final_index_val(((weighted_item_t *)(lli->data))->symbol), */
- /* ((weighted_item_t *)(lli->data))->symbol); */
- } else {
- assert(0); /* non-weighted items are not expected here; the code below is unreachable when asserts are enabled */
- assert(lli->data_type & DT_SECTION);
- h2 = l2 = ((section_t *)(lli->data))->itm_list;
- if (!h2) {
- continue;
- }
- do {
- assert(l2->data_type & DT_WEIGHTED);
- assert(l2->idx == INT_MIN);
- l2->idx = ++i;
- add_final_col_index(((weighted_item_t *)(l2->data))->symbol);
- } while ((l2 = l2->next) != h2);
- }
- } while ((lli = lli->next) != h);
- }
- if (i > mr) {
- mr = i;
- }
- }
- max_weight = mr;
- assert(num_varying == tnumnodes(cur_base->root_wi_index_reordered));
- /* we can now initialize the wcs2index array */
- {
- ENTRY *p;
- ENTRY e;
- char buf[8];
- static const char xd[] = "0123456789ABCDEF";
- int starter_index = final_index; /* starter characters get indexes above every ordinary collation index */
- int wcs2index_count = 0;
- strcpy(buf, "<U....>"); /* template: the four dots are overwritten with hex digits below */
- memset(wcs2index, 0, sizeof(wcs2index));
- e.key = (char *) buf;
- for (i=1 ; i <= 0xffff ; i++) {
- buf[5] = xd[ i & 0xf ];
- buf[4] = xd[ (i >> 4) & 0xf ];
- buf[3] = xd[ (i >> 8) & 0xf ];
- buf[2] = xd[ (i >> 12) & 0xf ];
- if ((p = hsearch(e, FIND)) != NULL) {
- ++wcs2index_count;
- if ((tfind(buf, &cur_base->root_starter_char, sym_cmp)) != NULL) {
- wcs2index[i] = ++starter_index;
- /* verbose_msg(VDETAIL, "wcs2index[ %#06x ] = %d (starter)\n", i, wcs2index[i]); */
- } else {
- wcs2index[i] = (int)(p->data);
- /* verbose_msg(VDETAIL, "wcs2index[ %#06x ] = %d\n", i, wcs2index[i]); */
- }
- } else {
- if ((tfind(buf, &cur_base->root_starter_char, sym_cmp)) != NULL) {
- error_msg("marked starter but not in hash: %s", buf);
- }
- }
- }
- /* ---------------------------------------------------------------------- */
- {
- int i, n;
- table_data table;
- size_t t, smallest;
- n = 0;
- smallest = SIZE_MAX;
- table.ii = NULL; /* NOTE(review): presumably a NULL destination makes newopt() only compute the size -- confirm */
- for (i=0 ; i < 14 ; i++) { /* try each shift value and remember the one giving the smallest size */
- if ((RANGE >> i) < 4) {
- break;
- }
- t = newopt(wcs2index, RANGE, i, &table);
- if (smallest >= t) { /* >= : on a tie the later (larger) shift wins */
- n = i;
- smallest = t;
- /* } else { */
- /* break; */
- }
- }
- /* printf("smallest = %u for range %#x (%u)\n", smallest, RANGE, RANGE); */
- assert(smallest != SIZE_MAX);
- if (smallest + wcs2colidt_len >= WCS2COLIDT_LEN) {
- error_msg("WCS2COLIDT_LEN too small");
- }
- base_locale_array[base_locale_len].wcs2colidt_offset = wcs2colidt_len;
- table.ii = wcs2colidt_buffer + wcs2colidt_len; /* rerun the winning shift with a real destination */
- t = smallest;
- smallest = SIZE_MAX;
- smallest = newopt(wcs2index, RANGE, n, &table);
- assert(t == smallest);
- wcs2colidt_len += smallest;
- /* verbose_msg(VDETAIL, "smallest = %d wcs2colidt_len = %d\n", smallest, wcs2colidt_len); */
- #if 0
- {
- unsigned int sc, n, i0, i1;
- unsigned int u = 0xe40;
- table_data *tbl = &table;
- #define __LOCALE_DATA_WCctype_TI_MASK ((1 << tbl->ti_shift)-1)
- #define __LOCALE_DATA_WCctype_TI_SHIFT (tbl->ti_shift)
- #define __LOCALE_DATA_WCctype_TI_LEN (tbl->ti_len)
- #define __LOCALE_DATA_WCctype_II_MASK ((1 << tbl->ii_shift)-1)
- #define __LOCALE_DATA_WCctype_II_SHIFT (tbl->ii_shift)
- #define __LOCALE_DATA_WCctype_II_LEN (tbl->ii_len)
- sc = u & __LOCALE_DATA_WCctype_TI_MASK;
- u >>= __LOCALE_DATA_WCctype_TI_SHIFT;
- n = u & __LOCALE_DATA_WCctype_II_MASK;
- u >>= __LOCALE_DATA_WCctype_II_SHIFT;
- i0 = tbl->ii[u];
- verbose_msg(VDETAIL, "i0 = %d\n", i0);
- i0 <<= __LOCALE_DATA_WCctype_II_SHIFT;
- i1 = tbl->ii[__LOCALE_DATA_WCctype_II_LEN + i0 + n];
- /* i1 = tbl->ti[i0 + n]; */
- verbose_msg(VDETAIL, "i1 = %d\n", i1);
- i1 <<= __LOCALE_DATA_WCctype_TI_SHIFT;
- /* return *(uint16_t *)(&(tbl->ii[__LOCALE_DATA_WCctype_II_LEN + __LOCALE_DATA_WCctype_TI_LEN + i1 + sc])); */
- verbose_msg(VDETAIL, "i2 = %d\n", __LOCALE_DATA_WCctype_II_LEN + __LOCALE_DATA_WCctype_TI_LEN + i1 + sc);
- verbose_msg(VDETAIL, "val = %d\n", tbl->ii[__LOCALE_DATA_WCctype_II_LEN + __LOCALE_DATA_WCctype_TI_LEN + i1 + sc]);
- /* return tbl->ut[i1 + sc]; */
- }
- #endif
- base_locale_array[base_locale_len].ii_shift = table.ii_shift;
- base_locale_array[base_locale_len].ti_shift = table.ti_shift;
- base_locale_array[base_locale_len].ii_len = table.ii_len;
- base_locale_array[base_locale_len].ti_len = table.ti_len;
- }
- /* ---------------------------------------------------------------------- */
- base_locale_array[base_locale_len].num_col_base = num_invariant + num_varying;
- base_locale_array[base_locale_len].max_col_index = final_index;
- base_locale_array[base_locale_len].max_weight = max_weight;
- verbose_msg(VDETAIL, "%s: %6u invariant %6u varying %6u derived %6u total %6u max weight %6u wcs2\n",
- cur_base->name, num_invariant, num_varying,
- tnumnodes(cur_base->root_derived_wi), final_index, max_weight,
- wcs2index_count);
- }
- #if 1
- /* ok, now we need to dump out the base and derived tables... */
- /* don't forget to break up collating elements!!! */
- /* fprintf(stdout, "**************************************************\n"); */
- /* first pass ... set the invariants */
- for (s = cur_base->section_list ; s ; s = s->next) {
- #if 1
- if (s->data_type & DT_REORDER) {
- verbose_msg(VDETAIL, "1: skipping reordered section %s\n", ((section_t *)(s->data))->name);
- continue;
- }
- #endif
- h = lli = ((section_t *)(s->data))->itm_list;
- if (!lli) {
- continue;
- }
- do {
- if (lli->data_type & DT_WEIGHTED) {
- w = (weighted_item_t *)(lli->data);
- if (find_wi_index_reordered(w->symbol)) { /* reordered symbol so skip on first pass */
- continue;
- }
- if (index2weight_len_inc) {
- index2ruleidx_buffer[index2ruleidx_len++] =
- add_rule((weighted_item_t *)(lli->data));
- }
- /* fprintf(stdout, "%11s: w %6d %6d %s\n", */
- /* cur_base->name, lli->idx, final_index_val(w->symbol), w->symbol); */
- }
- } while ((lli = lli->next) != h);
- }
- /* second pass ... set varying */
- for (s = cur_base->section_list ; s ; s = s->next) {
- #if 1
- if (s->data_type & DT_REORDER) {
- verbose_msg(VDETAIL, "2: skipping reordered section %s\n", ((section_t *)(s->data))->name);
- continue;
- }
- #endif
- h = lli = ((section_t *)(s->data))->itm_list;
- if (!lli) {
- continue;
- }
- do {
- if (lli->data_type & DT_WEIGHTED) {
- w = (weighted_item_t *)(lli->data);
- if (find_wi_index_reordered(w->symbol)) { /* reordered symbol: this is the pass that handles it */
- if (index2weight_len_inc) {
- index2ruleidx_buffer[index2ruleidx_len++] =
- add_rule((weighted_item_t *)(lli->data));
- }
- /* fprintf(stdout, "%11s: r %6d %6d %s\n", */
- /* cur_base->name, lli->idx, final_index_val(w->symbol), w->symbol); */
- continue;
- }
- }
- } while ((lli = lli->next) != h);
- }
- do_starter_lists(cur_base);
- /* verbose_msg(VDETAIL,"updated final_index = %d\n", final_index); */
- if (rli) {
- base_locale_array[base_locale_len].range_low
- = strtoul(((range_item_t *)(rli->data))->symbol1 + 2, NULL, 16); /* +2 skips the leading "<U" of the symbol */
- base_locale_array[base_locale_len].range_count
- = ((range_item_t *)(rli->data))->length;
- base_locale_array[base_locale_len].range_base_weight = rli->idx;
- base_locale_array[base_locale_len].range_rule_offset = add_range_rule((range_item_t *)(rli->data));
- /* fprintf(stdout, "%11s: %6d %6d %s %s (%d)\n", */
- /* "RANGE", rli->idx, -1, */
- /* ((range_item_t *)(rli->data))->symbol1, */
- /* ((range_item_t *)(rli->data))->symbol2, */
- /* ((range_item_t *)(rli->data))->length); */
- }
- /* fprintf(stdout,"\nDerived\n\n"); */
- /* first, if base name is of the form ll_CC, add a derived locale for it */
- if ((strlen(cur_base->name) == 5)
- && islower(cur_base->name[0])
- && islower(cur_base->name[1])
- && (cur_base->name[2] == '_')
- && isupper(cur_base->name[3])
- && isupper(cur_base->name[4])
- ) {
- verbose_msg(VDETAIL, "adding special derived for %s\n", cur_base->name);
- /* verbose_msg(VDETAIL,"updated final_index = %d\n", final_index); */
- assert(der_locale_len+1 < DER_LOCALE_LEN);
- der_locale_array[der_locale_len].name = cur_base->name;
- der_locale_array[der_locale_len].base_idx = base_locale_len;
- u16_buf[0] = 1; /* override list with no entries: the "single" marker ... */
- u16_buf[1] = 0; /* ... followed directly by the terminator */
- u16_buf_len = 2;
- mm = NULL;
- if ((u16_buf_len > override_len) ||
- !(mm = memmem(override_buffer, override_len*sizeof(override_buffer[0]),
- u16_buf, u16_buf_len*sizeof(u16_buf[0])))
- ) { /* no identical run stored yet: append this list to override_buffer */
- assert(override_len + u16_buf_len < OVERRIDE_LEN);
- memcpy(override_buffer + override_len, u16_buf, u16_buf_len*sizeof(u16_buf[0]));
- der_locale_array[der_locale_len].overrides_offset = override_len;
- override_len += u16_buf_len;
- /* printf("%s: override_len = %d u16_buf_len = %d\n", cl->name, override_len, u16_buf_len); */
- } else if (!(u16_buf_len > override_len)) { /* identical run found: share it */
- assert(mm);
- der_locale_array[der_locale_len].overrides_offset = ((uint16_t *)(mm)) - override_buffer;
- /* printf("%s: memmem found a match with u16_buf_len = %d\n", cl->name, u16_buf_len); */
- }
- der_locale_array[der_locale_len].multistart_offset
- = base_locale_array[base_locale_len].multistart_offset;
- der_locale_array[der_locale_len].undefined_idx = final_index_val0("UNDEFINED");
- if (!der_locale_array[der_locale_len].undefined_idx) {
- error_msg("no UNDEFINED definition for %s", cur_base->name);
- }
- ++der_locale_len;
- } else {
- verbose_msg(VDETAIL, "NOT adding special derived for %s\n", cur_base->name);
- }
- /* now all the derived... */
- for (cli = cur_base->derived_list ; cli ; cli = cli->next) {
- cl = (col_locale_t *)(cli->data);
- assert(cli->data_type == DT_COL_LOCALE);
- assert(der_locale_len+1 < DER_LOCALE_LEN);
- der_locale_array[der_locale_len].name = cl->name;
- der_locale_array[der_locale_len].base_idx = base_locale_len;
- u16_buf_len = 0;
- for (i = 0 ; i < 2 ; i++) { /* i == 0: sections with more than one item; i == 1: sections with exactly one */
- if (i) {
- /* fprintf(stdout, " section --- (singles)\n"); */
- u16_buf[u16_buf_len++] = 1; /* single */
- }
- /* we do this in two passes... first all sequences, then all single reorders */
- for (s = cl->section_list ; s ; s = s->next) {
- /* verbose_msg(VDETAIL, "doing section %s\n", ((section_t *)(s->data))->name); */
- h = lli = ((section_t *)(s->data))->itm_list;
- if (!lli) {
- /* fprintf(stdout, "EMPTY ITEM LIST IN SECTION %s\n", ((section_t *)(s->data))->name ); */
- continue;
- }
- assert(u16_buf_len +4 < sizeof(u16_buf)/sizeof(u16_buf[0]));
- if ((!i && (ll_len(h) > 1) ) || (ll_len(h) == i)) {
- if (!i) {
- /* fprintf(stdout, " section ----------------- %d %d\n", i, ll_len(h)); */
- u16_buf[u16_buf_len++] = ll_len(h); /* multi */
- assert(lli->data_type & DT_WEIGHTED);
- #if 0
- u16_buf[u16_buf_len++] = final_index_val(((weighted_item_t *)(lli->data))->symbol); /* start index */
- #endif
- u16_buf[u16_buf_len++] = lli->idx; /* start weight */
- }
- do {
- assert(lli->data_type & DT_WEIGHTED);
- if (lli->data_type & DT_WEIGHTED) {
- /* fprintf(stdout, "%11s: S %6d %6d %s\n", */
- /* cl->name, lli->idx, */
- /* final_index_val(((weighted_item_t *)(lli->data))->symbol), */
- /* ((weighted_item_t *)(lli->data))->symbol); */
- #if 0
- if (i) {
- assert(u16_buf_len +4 < sizeof(u16_buf)/sizeof(u16_buf[0]));
- u16_buf[u16_buf_len++] = final_index_val(((weighted_item_t *)(lli->data))->symbol);
- assert(u16_buf[u16_buf_len-1]);
- u16_buf[u16_buf_len++] = lli->idx; /* weight */
- }
- #else
- assert(u16_buf_len +4 < sizeof(u16_buf)/sizeof(u16_buf[0]));
- u16_buf[u16_buf_len++] = final_index_val(((weighted_item_t *)(lli->data))->symbol);
- assert(u16_buf[u16_buf_len-1]);
- if (i) {
- u16_buf[u16_buf_len++] = lli->idx; /* weight */
- }
- #endif
- u16_buf[u16_buf_len++] = add_rule((weighted_item_t *)(lli->data));
- }
- } while ((lli = lli->next) != h);
- }
- }
- }
- u16_buf[u16_buf_len++] = 0; /* list terminator */
- mm = NULL;
- if ((u16_buf_len > override_len) ||
- !(mm = memmem(override_buffer, override_len*sizeof(override_buffer[0]),
- u16_buf, u16_buf_len*sizeof(u16_buf[0])))
- ) { /* no identical run stored yet: append this list to override_buffer */
- assert(override_len + u16_buf_len < OVERRIDE_LEN);
- memcpy(override_buffer + override_len, u16_buf, u16_buf_len*sizeof(u16_buf[0]));
- der_locale_array[der_locale_len].overrides_offset = override_len;
- override_len += u16_buf_len;
- /* printf("%s: override_len = %d u16_buf_len = %d\n", cl->name, override_len, u16_buf_len); */
- } else if (!(u16_buf_len > override_len)) { /* identical run found: share it */
- assert(mm);
- der_locale_array[der_locale_len].overrides_offset = ((uint16_t *)(mm)) - override_buffer;
- /* printf("%s: memmem found a match with u16_buf_len = %d\n", cl->name, u16_buf_len); */
- }
- do_starter_lists(cl);
- der_locale_array[der_locale_len].undefined_idx = final_index_val0("UNDEFINED");
- #if 0
- assert(der_locale_array[der_locale_len].undefined_idx);
- if (!der_locale_array[der_locale_len].undefined_idx) {
- der_locale_array[der_locale_len].undefined_idx = base_locale_array[base_locale_len].undefined_idx;
- }
- #endif
- if (!der_locale_array[der_locale_len].undefined_idx) {
- error_msg("no UNDEFINED definition for %s", cl->name);
- }
- ++der_locale_len;
- }
- #endif
- #warning handle UNDEFINED idx specially? what if in only some of derived?
- /* base_locale_array[base_locale_len].undefined_idx = final_index_val0("UNDEFINED"); */
- base_locale_array[base_locale_len].undefined_idx = 0;
- hdestroy();
- ++base_locale_len;
- /* if (tnumnodes(cur_base->root_starter_char)) { */
- /* verbose_msg(VDETAIL, "starter nodes\n"); */
- /* twalk(cur_base->root_starter_char, print_starter_node); */
- /* } */
- }
- static int starter_all_cmp(const void *n1, const void *n2)
- {
- const char *s1 = ((weighted_item_t *) n1)->symbol;
- const char *s2 = ((weighted_item_t *) n2)->symbol;
- colitem_t x;
- colitem_t *p;
- int n;
- /* sort by 1st char ... then inverse for string */
- x.element = NULL;
- if (!is_ucode(s1)) {
- x.string = s1;
- p = tfind(&x, &cur_base->root_colitem, colitem_cmp);
- s1 = (*((colitem_t **) p))->element + 1;
- }
- if (!is_ucode(s2)) {
- x.string = s2;
- p = tfind(&x, &cur_base->root_colitem, colitem_cmp);
- s2 = (*((colitem_t **) p))->element + 1;
- }
- /* <U####>< */
- /* 01234567 */
- assert(is_ucode(s1));
- assert(is_ucode(s2));
- n = strncmp(s1+2, s2+2, 4);
- if (n) {
- return n;
- }
- s1 += 7;
- s2 += 7;
- return strcmp(s2, s1);
- }
- static void print_starter_all_node(const void *ptr, VISIT order, int level)
- {
- const weighted_item_t *w = *(const weighted_item_t **) ptr;
- colitem_t *ci;
- void *p;
- int n;
- colitem_t x;
- if (order == postorder || order == leaf) {
- #if 0
- if ((n = is_ucode(w->symbol)) != 0) {
- printf(" %s\n", w->symbol);
- } else {
- x.string = w->symbol;
- x.element = NULL;
- p = tfind(&x, &cur_base->root_colitem, colitem_cmp);
- assert(p);
- ci = *((colitem_t **) p);
- printf("%s = %s\n", ci->element, w->symbol);
- }
- #else
- printf("%s|", w->symbol);
- /* if ((n = is_ucode(w->symbol)) != 0) { */
- /* printf("\n"); */
- /* } */
- #endif
- }
- }
- static void process_starter_node(const void *ptr, VISIT order, int level)
- {
- const weighted_item_t *w = *(const weighted_item_t **) ptr;
- colitem_t *ci;
- void *p;
- int n;
- colitem_t x;
- const char *s;
- char buf[32];
- /* store index of collation item followed by (unprefixed) nul-terminated string */
- if (order == postorder || order == leaf) {
- if ((n = is_ucode(w->symbol)) != 0) {
- u16_buf[u16_buf_len++] = final_index_val(w->symbol);
- assert(u16_buf[u16_buf_len-1]);
- u16_buf[u16_buf_len++] = 0;
- if (++u16_starter < base_locale_array[base_locale_len].num_starters) {
- u16_buf[u16_starter] = u16_buf_len;
- }
- /* verbose_msg(VDETAIL, "ucode - %d %d\n", u16_buf[u16_starter-1], u16_buf_len); */
- } else {
- x.string = w->symbol;
- x.element = NULL;
- p = tfind(&x, &cur_base->root_colitem, colitem_cmp);
- assert(p);
- ci = *((colitem_t **) p);
- s = ci->element;
- u16_buf[u16_buf_len++] = final_index_val(w->symbol);
- assert(u16_buf[u16_buf_len-1]);
- assert(*s == '"');
- n = is_ucode(++s);
- /* verbose_msg(VDETAIL, "s is |%s| with len %d (%d)\n", s, strlen(s), n); */
- assert(n);
- s += n;
- while (*s != '"') {
- n = is_ucode(s);
- assert(n);
- strncpy(buf, s, n+1);
- buf[n] = 0;
- /* verbose_msg(VDETAIL, "buf is |%s| with len %d (%d)\n", buf, strlen(buf), n); */
- u16_buf[u16_buf_len++] = final_index_val(buf);
- assert(u16_buf[u16_buf_len-1]);
- s += n;
- }
- u16_buf[u16_buf_len++] = 0;
- }
- }
- }
- static void **p_cl_root_starter_all;
- static void complete_starter_node(const void *ptr, VISIT order, int level)
- {
- weighted_item_t w;
- weighted_item_t *p;
- if (order == postorder || order == leaf) {
- w.symbol = *(const char **) ptr;
- w.weight = NULL;
- if (!tfind(&w, p_cl_root_starter_all, starter_all_cmp)) {
- p = xmalloc(sizeof(weighted_item_t));
- p->symbol = w.symbol;
- p->weight = NULL;
- /* verbose_msg(VDETAIL, "complete_starter_node: %s\n", *(const char **) ptr); */
- if (!tsearch(p, p_cl_root_starter_all, starter_all_cmp)) {
- error_msg("OUT OF MEMORY");
- }
- }
- }
- }
- static void do_starter_lists(col_locale_t *cl)
- {
- ll_item_t *s;
- ll_item_t *h;
- ll_item_t *lli;
- col_locale_t *c;
- colitem_t *ci;
- weighted_item_t *w;
- void *p;
- char buf[32];
- int n;
- colitem_t x;
- void *mm;
- c = cl;
- if (c != cur_base) {
- c = cur_base;
- }
- /* printf("STARTERS %s --------------------\n", cl->name); */
- LOOP:
- for (s = c->section_list ; s ; s = s->next) {
- h = lli = ((section_t *)(s->data))->itm_list;
- if (!lli) {
- continue;
- }
- do {
- if (lli->data_type & DT_WEIGHTED) {
- w = (weighted_item_t *)(lli->data);
- ci = NULL;
- if ((n = is_ucode(w->symbol)) != 0) {
- strcpy(buf, w->symbol);
- } else {
- /* fprintf(stdout, "looking for |%s|\n", w->symbol); */
- x.string = w->symbol;
- x.element = NULL;
- p = tfind(&x, &cur_base->root_colitem, colitem_cmp);
- if (!p) {
- /* verbose_msg(VDETAIL, "Whoa... processing starters for %s and couldn't find %s\n", */
- /* cl->name, w->symbol); */
- continue;
- }
- ci = *((colitem_t **) p);
- if (!ci->element) { /* just a collating symbol */
- continue;
- }
- assert(ci->element[0] == '"');
- n = is_ucode(ci->element + 1);
- assert(n);
- strncpy(buf, ci->element + 1, n);
- }
- if ((tfind(buf, &cur_base->root_starter_char, sym_cmp)) != NULL) {
- /* fprintf(stdout, "adding from %s: %s", c->name, w->symbol); */
- /* if (ci) { */
- /* fprintf(stdout, " = %s", ci->element); */
- /* } */
- /* fprintf(stdout, "\n"); */
- if (!tsearch(w, &cl->root_starter_all, starter_all_cmp)) {
- error_msg("OUT OF MEMORY");
- }
- }
- }
- } while ((lli = lli->next) != h);
- }
- if (c != cl) {
- c = cl;
- goto LOOP;
- }
- p_cl_root_starter_all = &cl->root_starter_all;
- twalk(cur_base->root_starter_char, complete_starter_node);
- if (cl == cur_base) {
- base_locale_array[base_locale_len].num_starters = tnumnodes(cur_base->root_starter_char);
- }
- #if 0
- printf("\nNow walking tree...\n\n");
- twalk(cl->root_starter_all, print_starter_all_node);
- printf("\n\n");
- #endif
- u16_starter = 0;
- u16_buf[0] = u16_buf_len = base_locale_array[base_locale_len].num_starters;
- twalk(cl->root_starter_all, process_starter_node);
- /* verbose_msg(VDETAIL, "s=%d n=%d\n", u16_starter, base_locale_array[base_locale_len].num_starters); */
- assert(u16_starter == base_locale_array[base_locale_len].num_starters);
- #if 0
- { int i;
- for (i=0 ; i < u16_buf_len ; i++) {
- verbose_msg(VDETAIL, "starter %2d: %d - %#06x\n", i, u16_buf[i], u16_buf[i]);
- }}
- #endif
- mm = NULL;
- if (u16_buf_len) {
- /* assert(base_locale_array[base_locale_len].num_starters); */
- if ((u16_buf_len > multistart_len) ||
- !(mm = memmem(multistart_buffer, multistart_len*sizeof(multistart_buffer[0]),
- u16_buf, u16_buf_len*sizeof(u16_buf[0])))
- ) {
- assert(multistart_len + u16_buf_len < MULTISTART_LEN);
- memcpy(multistart_buffer + multistart_len, u16_buf, u16_buf_len*sizeof(u16_buf[0]));
- if (cl == cur_base) {
- base_locale_array[base_locale_len].multistart_offset = multistart_len;
- } else {
- der_locale_array[der_locale_len].multistart_offset = multistart_len;
- }
- multistart_len += u16_buf_len;
- /* verbose_msg(VDETAIL, "%s: multistart_len = %d u16_buf_len = %d\n", cl->name, multistart_len, u16_buf_len); */
- } else if (!(u16_buf_len > multistart_len)) {
- assert(mm);
- if (cl == cur_base) {
- base_locale_array[base_locale_len].multistart_offset = ((uint16_t *)(mm)) - multistart_buffer;
- } else {
- der_locale_array[der_locale_len].multistart_offset = ((uint16_t *)(mm)) - multistart_buffer;
- }
- /* verbose_msg(VDETAIL, "%s: memmem found a match with u16_buf_len = %d\n", cl->name, u16_buf_len); */
- }
- } else {
- assert(!base_locale_array[base_locale_len].num_starters);
- }
- /* printf("u16_buf_len = %d\n", u16_buf_len); */
- /* printf("STARTERS %s DONE ---------------\n", cl->name); */
- }
- /* For sorting the blocks of unsigned chars. */
- static size_t nu_val;
- int nu_memcmp(const void *a, const void *b)
- {
- return memcmp(*(unsigned char**)a, *(unsigned char**)b, nu_val * sizeof(tbl_item));
- }
- size_t newopt(tbl_item *ut, size_t usize, int shift, table_data *tbl)
- {
- static int recurse;
- tbl_item *ti[RANGE]; /* table index */
- size_t numblocks;
- size_t blocksize;
- size_t uniq;
- size_t i, j;
- size_t smallest, t;
- tbl_item *ii_save;
- int uniqblock[1 << (8*sizeof(tbl_item) - 1)];
- tbl_item uit[RANGE];
- int shift2;
- if (shift > 15) {
- return SIZE_MAX;
- }
- ii_save = NULL;
- blocksize = 1 << shift;
- numblocks = usize >> shift;
- /* init table index */
- for (i=j=0 ; i < numblocks ; i++) {
- ti[i] = ut + j;
- j += blocksize;
- }
- /* sort */
- nu_val = blocksize;
- qsort(ti, numblocks, sizeof(unsigned char *), nu_memcmp);
- uniq = 1;
- uit[(ti[0]-ut)/blocksize] = 0;
- for (i=1 ; i < numblocks ; i++) {
- if (memcmp(ti[i-1], ti[i], blocksize*sizeof(tbl_item)) < 0) {
- if (++uniq > (1 << (8*sizeof(tbl_item) - 1))) {
- break;
- }
- uniqblock[uniq - 1] = i;
- }
- #if 1
- else if (memcmp(ti[i-1], ti[i], blocksize*sizeof(tbl_item)) > 0) {
- printf("bad sort %i!\n", i);
- abort();
- }
- #endif
- uit[(ti[i]-ut)/blocksize] = uniq - 1;
- }
- smallest = SIZE_MAX;
- shift2 = -1;
- if (uniq <= (1 << (8*sizeof(tbl_item) - 1))) {
- smallest = numblocks + uniq * blocksize;
- if (!recurse) {
- ++recurse;
- for (j=1 ; j < 14 ; j++) {
- if ((numblocks >> j) < 2) break;
- if (tbl) {
- ii_save = tbl->ii;
- tbl->ii = NULL;
- }
- if ((t = newopt(uit, numblocks, j, tbl)) < SIZE_MAX) {
- t += uniq * blocksize;
- }
- if (tbl) {
- tbl->ii = ii_save;
- }
- if (smallest >= t) {
- shift2 = j;
- smallest = t;
- /* if (!tbl->ii) { */
- /* printf("ishift %u tshift %u size %u\n", */
- /* shift2, shift, t); */
- /* } */
- /* } else { */
- /* break; */
- }
- }
- --recurse;
- }
- } else {
- return SIZE_MAX;
- }
- if (tbl->ii) {
- if (recurse) {
- tbl->ii_shift = shift;
- tbl->ii_len = numblocks;
- memcpy(tbl->ii, uit, numblocks*sizeof(tbl_item));
- tbl->ti = tbl->ii + tbl->ii_len;
- tbl->ti_len = uniq * blocksize;
- for (i=0 ; i < uniq ; i++) {
- memcpy(tbl->ti + i * blocksize, ti[uniqblock[i]], blocksize*sizeof(tbl_item));
- }
- } else {
- ++recurse;
- /* printf("setting ishift %u tshift %u\n", shift2, shift); */
- newopt(uit, numblocks, shift2, tbl);
- --recurse;
- tbl->ti_shift = shift;
- tbl->ut_len = uniq * blocksize;
- tbl->ut = tbl->ti + tbl->ti_len;
- for (i=0 ; i < uniq ; i++) {
- memcpy(tbl->ut + i * blocksize, ti[uniqblock[i]], blocksize*sizeof(tbl_item));
- }
- }
- }
- return smallest;
- }
/*
 * Maps a rule value (0..7) to the bits stored in the two high bits of a
 * 16-bit rule-table entry.  A negative entry marks a rule value that
 * add_rule()/add_range_rule() reject with an assert.
 */
static const int rule2val[8] = {
    -1,
    0x4000,                     /* forward */
    0x8000,                     /* position */
    0xc000,                     /* forward,position */
    0x0000,                     /* backward */
    -1,
    -1,
    -1,
};
/*
 * Resolve collating item s to its final index, with special cases.
 *
 * When final_index_val0(s) yields a non-zero index, that index is returned.
 * Otherwise:
 *   "IGNORE"                 -> 0
 *   ".."  or sym == "RANGE"  -> 0x3fff (sym is first registered through
 *                               final_index_val() when it starts with '.')
 *   "."                      -> 0x3ffe
 *   anything else            -> error_msg(); 0 if that returns.
 */
static int final_index_val_x(const char *s, const char *sym)
{
    int idx = final_index_val0(s);

    if (idx) {
        return idx;
    }

    if (strcmp(s, "IGNORE") == 0) {
        return 0;
    }

    if (strcmp(s, "..") == 0 || strcmp(sym, "RANGE") == 0) {
        if (sym[0] == '.') {
            final_index_val(sym);   /* make sure it's known */
        }
        return 0x3fff;
    }

    if (strcmp(s, ".") == 0) {
        return 0x3ffe;
    }

    error_msg("can't find final index: %s", s);
    return idx;
}
- /* store rule2val in 2 high bits and collation index in lower.
- * for sort strings, store (offset from base) + max colindex as index.
- */
- static unsigned int add_rule(weighted_item_t *wi)
- {
- weight_t *w = wi->weight;
- int i, j, r, n;
- uint16_t rbuf[MAX_COLLATION_WEIGHTS];
- uint16_t ws_buf[32];
- void *mm;
- char buf[32];
- const char *s;
- const char *e;
- for (i=0 ; i < MAX_COLLATION_WEIGHTS ; i++) {
- rbuf[i] = rule2val[R_FORWARD]; /* set a default to forward-ignore */
- }
- if (base_locale_array[base_locale_len].num_weights < w->num_weights) {
- base_locale_array[base_locale_len].num_weights = w->num_weights;
- }
- for (i=0 ; i < w->num_weights ; i++) {
- assert(rule2val[(int)(w->rule[i])] >= 0);
- assert(w->colitem[i] && *w->colitem[i]);
- if (*w->colitem[i] == '"') { /* string... */
- s = w->colitem[i] + 1;
- assert(*s == '<');
- n = 0;
- do {
- e = s;
- do {
- if (*e == '/') {
- e += 2;
- continue;
- }
- } while (*e++ != '>');
- assert(((size_t)(e-s) < sizeof(buf)));
- memcpy(buf, s, (size_t)(e-s));
- buf[(size_t)(e-s)] = 0;
- r = final_index_val_x(buf, wi->symbol);
- assert(n + 1 < sizeof(ws_buf)/sizeof(ws_buf[0]));
- ws_buf[n++] = r | rule2val[(int)(w->rule[i])];
- s = e;
- } while (*s != '"');
- ws_buf[n++] = 0; /* terminator */
- mm = memmem(weightstr_buffer, weightstr_len*sizeof(weightstr_buffer[0]),
- ws_buf, n*sizeof(ws_buf[0]));
- if (!mm) {
- assert(weightstr_len + n < WEIGHTSTR_LEN);
- memcpy(weightstr_buffer + weightstr_len, ws_buf, n*sizeof(ws_buf[0]));
- mm = weightstr_buffer + weightstr_len;
- weightstr_len += n;
- }
- r = (((uint16_t *)(mm)) - weightstr_buffer)
- + base_locale_array[base_locale_len].max_col_index + 2;
- assert(r < (1 << 14));
- rbuf[i] = r | rule2val[(int)(w->rule[i])];
- } else { /* item */
- r = final_index_val_x(w->colitem[i], wi->symbol);
- rbuf[i] = r | rule2val[(int)(w->rule[i])];
- }
- }
- for (i=0 ; i < ruletable_len ; i += MAX_COLLATION_WEIGHTS) {
- if (!memcmp(ruletable_buffer + i, rbuf, MAX_COLLATION_WEIGHTS*sizeof(ruletable_buffer[0]))) {
- return i/MAX_COLLATION_WEIGHTS;
- }
- }
- memcpy(ruletable_buffer + ruletable_len, rbuf, MAX_COLLATION_WEIGHTS*sizeof(ruletable_buffer[0]));
- ruletable_len += MAX_COLLATION_WEIGHTS;
- return (ruletable_len / MAX_COLLATION_WEIGHTS)-1;
- }
- static unsigned int add_range_rule(range_item_t *ri)
- {
- weight_t *w = ri->weight;
- int i, j, r, n;
- uint16_t rbuf[MAX_COLLATION_WEIGHTS];
- uint16_t ws_buf[32];
- void *mm;
- char buf[32];
- const char *s;
- const char *e;
- for (i=0 ; i < MAX_COLLATION_WEIGHTS ; i++) {
- rbuf[i] = rule2val[R_FORWARD]; /* set a default to forward-ignore */
- }
- if (base_locale_array[base_locale_len].num_weights < w->num_weights) {
- base_locale_array[base_locale_len].num_weights = w->num_weights;
- }
- for (i=0 ; i < w->num_weights ; i++) {
- assert(rule2val[(int)(w->rule[i])] >= 0);
- assert(w->colitem[i] && *w->colitem[i]);
- if (*w->colitem[i] == '"') { /* string... */
- s = w->colitem[i] + 1;
- assert(*s == '<');
- n = 0;
- do {
- e = s;
- do {
- if (*e == '/') {
- e += 2;
- continue;
- }
- } while (*e++ != '>');
- assert(((size_t)(e-s) < sizeof(buf)));
- memcpy(buf, s, (size_t)(e-s));
- buf[(size_t)(e-s)] = 0;
- r = final_index_val_x(buf, "RANGE");
- assert(n + 1 < sizeof(ws_buf)/sizeof(ws_buf[0]));
- ws_buf[n++] = r | rule2val[(int)(w->rule[i])];
- s = e;
- } while (*s != '"');
- ws_buf[n++] = 0; /* terminator */
- mm = memmem(weightstr_buffer, weightstr_len*sizeof(weightstr_buffer[0]),
- ws_buf, n*sizeof(ws_buf[0]));
- if (!mm) {
- assert(weightstr_len + n < WEIGHTSTR_LEN);
- memcpy(weightstr_buffer + weightstr_len, ws_buf, n*sizeof(ws_buf[0]));
- mm = weightstr_buffer + weightstr_len;
- weightstr_len += n;
- }
- r = (((uint16_t *)(mm)) - weightstr_buffer)
- + base_locale_array[base_locale_len].max_col_index + 2;
- assert(r < (1 << 14));
- rbuf[i] = r | rule2val[(int)(w->rule[i])];
- } else { /* item */
- r = final_index_val_x(w->colitem[i], "RANGE");
- rbuf[i] = r | rule2val[(int)(w->rule[i])];
- }
- }
- for (i=0 ; i < ruletable_len ; i += MAX_COLLATION_WEIGHTS) {
- if (!memcmp(ruletable_buffer + i, rbuf, MAX_COLLATION_WEIGHTS*sizeof(ruletable_buffer[0]))) {
- return i/MAX_COLLATION_WEIGHTS;
- }
- }
- memcpy(ruletable_buffer + ruletable_len, rbuf, MAX_COLLATION_WEIGHTS*sizeof(ruletable_buffer[0]));
- ruletable_len += MAX_COLLATION_WEIGHTS;
- return (ruletable_len / MAX_COLLATION_WEIGHTS)-1;
- }
- #define DUMPn(X) fprintf(stderr, "%10d-%-.20s", base_locale_array[n]. X, #X);
- static void dump_base_locale(int n)
- {
- assert(n < base_locale_len);
- fprintf(stderr, "Base Locale: %s\n", base_locale_array[n].name);
- DUMPn(num_weights);
- DUMPn(ii_shift);
- DUMPn(ti_shift);
- DUMPn(ii_len);
- DUMPn(ti_len);
- DUMPn(max_weight);
- fprintf(stderr, "\n");
- DUMPn(num_col_base);
- DUMPn(max_col_index);
- DUMPn(undefined_idx);
- DUMPn(range_low);
- DUMPn(range_count);
- fprintf(stderr, "\n");
- DUMPn(range_base_weight);
- DUMPn(num_starters);
- fprintf(stderr, "\n");
- DUMPn(range_rule_offset);
- DUMPn(wcs2colidt_offset);
- DUMPn(index2weight_offset);
- fprintf(stderr, "\n");
- DUMPn(index2ruleidx_offset);
- DUMPn(multistart_offset);
- fprintf(stderr, "\n");
- }
- #undef DUMPn
- #define DUMPn(X) fprintf(stderr, "%10d-%s", der_locale_array[n]. X, #X);
- static void dump_der_locale(int n)
- {
- assert(n < der_locale_len);
- fprintf(stderr, "Derived Locale: %s (%.12s)",
- der_locale_array[n].name,
- base_locale_array[der_locale_array[n].base_idx].name);
- DUMPn(base_idx);
- DUMPn(undefined_idx);
- DUMPn(overrides_offset);
- DUMPn(multistart_offset);
- fprintf(stderr, "\n");
- }
/* Running count of uint16_t entries written to the generated table so far. */
static unsigned long collate_pos;

/*
 * Write len entries of u to fp as C initialiser values, eight per line,
 * preceded by a comment giving the current table position and name.
 * Advances collate_pos by len.
 */
static void dump_u16_array(FILE *fp, uint16_t *u, int len, const char *name)
{
    int k = 0;

    fprintf(fp, "\t/* %8lu %s */\n", collate_pos, name);

    while (k < len) {
        if ((k % 8) == 0) {
            fputs("\n\t", fp);  /* start a new row of eight values */
        }
        fprintf(fp, " %#06x,", (unsigned int) u[k]);
        ++k;
    }
    fputc('\n', fp);

    collate_pos += len;
}
- #define OUT_U16C(X,N) fprintf(fp,"\t%10d, /* %8lu %s */\n", X, collate_pos++, N);
- static void dump_collate(FILE *fp)
- {
- int n;
- fprintf(fp, "const uint16_t __locale_collate_tbl[] = {\n");
- OUT_U16C(base_locale_len, "numbef of base locales");
- OUT_U16C(der_locale_len, "number of derived locales");
- OUT_U16C(MAX_COLLATION_WEIGHTS, "max collation weights");
- OUT_U16C(index2weight_len, "number of index2{weight|ruleidx} elements");
- OUT_U16C(weightstr_len, "number of weightstr elements");
- OUT_U16C(multistart_len, "number of multistart elements");
- OUT_U16C(override_len, "number of override elements");
- OUT_U16C(ruletable_len, "number of ruletable elements");
- #undef DUMPn
- #define DUMPn(X) fprintf(fp, "\t%10d, /* %8lu %s */\n", base_locale_array[n]. X, collate_pos++, #X);
- for (n=0 ; n < base_locale_len ; n++) {
- unsigned wcs2colidt_offset_low = base_locale_array[n].wcs2colidt_offset & 0xffffU;
- unsigned wcs2colidt_offset_hi = base_locale_array[n].wcs2colidt_offset >> 16;
- fprintf(fp, "\t/* Base Locale %2d: %s */\n", n, base_locale_array[n].name);
- DUMPn(num_weights);
- DUMPn(num_starters);
- DUMPn(ii_shift);
- DUMPn(ti_shift);
- DUMPn(ii_len);
- DUMPn(ti_len);
- DUMPn(max_weight);
- DUMPn(num_col_base);
- DUMPn(max_col_index);
- DUMPn(undefined_idx);
- DUMPn(range_low);
- DUMPn(range_count);
- DUMPn(range_base_weight);
- DUMPn(range_rule_offset);
- DUMPn(index2weight_offset);
- DUMPn(index2ruleidx_offset);
- DUMPn(multistart_offset);
- #undef DUMPn
- #define DUMPn(X) fprintf(fp, "\t%10d, /* %8lu %s */\n", X, collate_pos++, #X);
- DUMPn(wcs2colidt_offset_low);
- DUMPn(wcs2colidt_offset_hi);
- }
- #undef DUMPn
- fprintf(fp, "#define COL_IDX_C %5d\n", 0);
- #define DUMPn(X) fprintf(fp, "\t%10d, /* %8lu %s */\n", der_locale_array[n]. X, collate_pos++, #X);
- for (n=0 ; n < der_locale_len ; n++) {
- fprintf(fp, "#define COL_IDX_%s %5d\n", der_locale_array[n].name, n+1);
- fprintf(fp, "\t/* Derived Locale %4d: %s (%.12s) */\n",
- n, der_locale_array[n].name,
- base_locale_array[der_locale_array[n].base_idx].name);
- DUMPn(base_idx);
- DUMPn(undefined_idx);
- DUMPn(overrides_offset);
- DUMPn(multistart_offset);
- }
- #undef DUMPn
- fprintf(fp, "\n");
- dump_u16_array(fp, index2weight_buffer, index2weight_len, "index2weight");
- dump_u16_array(fp, index2ruleidx_buffer, index2ruleidx_len, "index2ruleidx");
- dump_u16_array(fp, multistart_buffer, multistart_len, "multistart");
- dump_u16_array(fp, override_buffer, override_len, "override");
- dump_u16_array(fp, ruletable_buffer, ruletable_len, "ruletable");
- dump_u16_array(fp, weightstr_buffer, weightstr_len, "weightstr");
- dump_u16_array(fp, wcs2colidt_buffer, wcs2colidt_len, "wcs2colidt");
- fprintf(fp,"}; /* %8lu */\n", collate_pos);
- fprintf(fp,"#define __lc_collate_data_LEN %d\n\n", collate_pos);
- }
|