| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670 | /* * Copyright (C) 2002     Manuel Novoa III * Copyright (C) 2000-2005 Erik Andersen <andersen@uclibc.org> * * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball. *//*  Dec 20, 2002 *  Initial test implementation of strcoll, strxfrm, wcscoll, and wcsxfrm. *  The code needs to be cleaned up a good bit, but I'd like to see people *  test it out. * */#include "_string.h"#include <ctype.h>#include <locale.h>#include <stdlib.h>#include <errno.h>#include <assert.h>#ifdef __UCLIBC_HAS_LOCALE__#if defined(L_strxfrm) || defined(L_strxfrm_l) || defined(L_wcsxfrm) || defined(L_wcsxfrm_l)#ifdef L_strxfrm#ifndef WANT_WIDE#error WANT_WIDE should be defined for L_strxfrm#endif#ifdef L_wcsxfrm#error L_wcsxfrm already defined for L_strxfrm#endif#endif /* L_strxfrm */#if defined(L_strxfrm) || defined(L_strxfrm_l)#define wcscoll   strcoll#define wcscoll_l strcoll_l#define wcsxfrm   strxfrm#define wcsxfrm_l strxfrm_l#undef WANT_WIDE#undef Wvoid#undef Wchar#undef Wuchar#undef Wint#define Wchar char#endif /* defined(L_strxfrm) || defined(L_strxfrm_l) */#if defined(__UCLIBC_HAS_XLOCALE__) && !defined(__UCLIBC_DO_XLOCALE)int wcscoll (const Wchar *s0, const Wchar *s1){	return wcscoll_l(s0, s1, __UCLIBC_CURLOCALE );}libc_hidden_def(wcscoll)size_t wcsxfrm(Wchar *__restrict ws1, const Wchar *__restrict ws2, size_t n){	return wcsxfrm_l(ws1, ws2, n, __UCLIBC_CURLOCALE );}#else  /* defined(__UCLIBC_HAS_XLOCALE__) && !defined(__UCLIBC_DO_XLOCALE) */#if 0#define CUR_COLLATE (&__UCLIBC_CURLOCALE->collate)#else#define CUR_COLLATE (& __LOCALE_PTR->collate)#endif#define MAX_PENDING 8typedef struct {	const Wchar *s;	const Wchar *eob;			/* end of backward */	__uwchar_t weight;	__uwchar_t ui_weight;		/* undefined or invalid */	int colitem;	int weightidx;	int rule;	size_t position;	/* should be wchar_t.  if wchar < 0 do EILSEQ? */	__uwchar_t *cip;	__uwchar_t ci_pending[MAX_PENDING];	/* nul-terminated */	char *back_buf;	char *bbe;					/* end of back_buf (actual last... not 1 past end) */	char *bp;					/* ptr into backbuf, NULL if not in backward mode */	char ibb[128];	size_t bb_size;	int ru_pushed;} col_state_t;#define WEIGHT_MASK	0x3fffU#define RULE_MASK	0xc000U#define RULE_FORWARD  (1 << 14)#define RULE_POSITION (1 << 15)#define UI_IDX		(WEIGHT_MASK-6)#define POSIT_IDX	(WEIGHT_MASK-5)#define RANGE_IDX	(WEIGHT_MASK-4)#define UNDEF_IDX	(WEIGHT_MASK-3)#define INVAL_IDX	(WEIGHT_MASK-2)#define DITTO_IDX   (WEIGHT_MASK-1)#undef TRACE#if 0#define TRACE(X)	printf X#else#define TRACE(X)	((void)0)#endifstatic int lookup(wchar_t wc   __LOCALE_PARAM ){	unsigned int sc, n, i0, i1;	if (((__uwchar_t) wc) > 0xffffU) {		return 0;	}	sc = wc & CUR_COLLATE->ti_mask;	wc >>= CUR_COLLATE->ti_shift;	n = wc & CUR_COLLATE->ii_mask;	wc >>= CUR_COLLATE->ii_shift;	i0 = CUR_COLLATE->wcs2colidt_tbl[wc];	i0 <<= CUR_COLLATE->ii_shift;	i1 = CUR_COLLATE->wcs2colidt_tbl[CUR_COLLATE->ii_len + i0 + n];	i1 <<= CUR_COLLATE->ti_shift;	return CUR_COLLATE->wcs2colidt_tbl[CUR_COLLATE->ii_len + CUR_COLLATE->ti_len + i1 + sc];}static void init_col_state(col_state_t *cs, const Wchar *wcs){	memset(cs, 0, sizeof(col_state_t));	cs->s = wcs;	cs->bp = cs->back_buf = cs->ibb;	cs->bb_size = 128;	cs->bbe = cs->back_buf + (cs->bb_size -1);}static void next_weight(col_state_t *cs, int pass   __LOCALE_PARAM ){	int r, w, ru, ri, popping_backup_stack;	ssize_t n;	const uint16_t *p;#ifdef WANT_WIDE#define WC (*cs->s)#define N (1)#else  /* WANT_WIDE */	wchar_t WC;	size_t n0, nx = 0;#define N n0#endif /* WANT_WIDE */	do {		if (cs->ru_pushed) {			ru = cs->ru_pushed;			TRACE(("ru_pushed = %d\n", ru));			cs->ru_pushed = 0;			goto POSITION_SKIP;		}#ifdef __UCLIBC_MJN3_ONLY__#warning should we walk pendings backwards?#endif		if (cs->cip) {			/* possible pending weight */			if ((r = *(cs->cip++)) == 0) {				cs->cip = NULL;				continue;			}			cs->weightidx = r & WEIGHT_MASK;			assert(cs->weightidx);/* 			assert(cs->weightidx != WEIGHT_MASK); */		} else {				/* get the next collation item from the string */			TRACE(("clearing popping flag\n"));			popping_backup_stack = 0;		IGNORE_LOOP:			/* keep first pos as 0 for a sentinal */			if (*cs->bp) {				/* pending backward chars */			POP_BACKUP:				popping_backup_stack = 1;				TRACE(("setting popping flag\n"));				n = 0;				if (*cs->bp > 0) {		/* singles pending */					cs->s -= 1;					if ((*cs->bp -= 1) == 0) {						cs->bp -= 1;					}				} else {				/* last was a multi */					cs->s += *cs->bp;					cs->bp -= 1;				}			} else if (!*cs->s) { /* not in backward mode and end of string */				cs->weight = 0;				return;			} else {				cs->position += 1;			}		BACK_LOOP:#ifdef WANT_WIDE			n = 1;			cs->colitem = r = lookup(*cs->s   __LOCALE_ARG );#else  /* WANT_WIDE */			n = n0 = __locale_mbrtowc_l(&WC, cs->s, __LOCALE_PTR);			if (n < 0) {				__set_errno(EILSEQ);				cs->weight = 0;				return;			}			cs->colitem = r = lookup(WC   __LOCALE_ARG );#endif /* WANT_WIDE */			TRACE((" r=%d WC=%#lx\n", r, (unsigned long)(WC)));			if (r > CUR_COLLATE->max_col_index) { /* starting char for one or more sequences */				p = CUR_COLLATE->multistart_tbl;				p += p[r-CUR_COLLATE->max_col_index -1];				do {					n = N;					r = *p++;					do {						if (!*p) {		/* found it */							cs->colitem = r;							TRACE(("    found multi %d\n", n));							goto FOUND;						}#ifdef WANT_WIDE						/* the lookup check here is safe since we're assured that *p is a valid colidx */						if (!cs->s[n] || (lookup(cs->s[n]   __LOCALE_ARG ) != *p)) {							do {} while (*p++);							break;						}						++p;						++n;#else  /* WANT_WIDE */						if (cs->s[n]) {							nx = __locale_mbrtowc_l(&WC, cs->s + n, __LOCALE_PTR);							if (nx < 0) {								__set_errno(EILSEQ);								cs->weight = 0;								return;							}						}						if (!cs->s[n] || (lookup(WC   __LOCALE_ARG ) != *p)) {							do {} while (*p++);							break;						}						++p;						n += nx; /* Only gets here if cs->s[n] != 0, so nx is set. */#endif /* WANT_WIDE */					} while (1);				} while (1);			} else if (r == 0) {		/* illegal, undefined, or part of a range */				if ((CUR_COLLATE->range_count)#ifdef __UCLIBC_MJN3_ONLY__#warning .. need to introduce range as a collating item?#endif					&& (((__uwchar_t)(WC - CUR_COLLATE->range_low)) <= CUR_COLLATE->range_count)					) {					/* part of a range */					/* Note: cs->colitem = 0 already. */					TRACE(("    found range\n"));					ru = CUR_COLLATE->ruletable[CUR_COLLATE->range_rule_offset*CUR_COLLATE->MAX_WEIGHTS + pass];					assert((ru & WEIGHT_MASK) != DITTO_IDX);					if ((ru & WEIGHT_MASK) == WEIGHT_MASK) {						ru = (ru & RULE_MASK) | RANGE_IDX;						cs->weight = CUR_COLLATE->range_base_weight + (WC - CUR_COLLATE->range_low);					}					goto RANGE_SKIP_TO;				} else if (((__uwchar_t)(WC)) <= 0x7fffffffUL) { /* legal but undefined */				UNDEFINED:					/* Note: cs->colitem = 0 already. */					ri = CUR_COLLATE->undefined_idx;					assert(ri != 0); /* implicit undefined isn't supported */					TRACE(("    found explicit UNDEFINED\n"));#ifdef __UCLIBC_MJN3_ONLY__#warning right now single weight locales do not support ..#endif					if (CUR_COLLATE->num_weights == 1) {						TRACE(("    single weight UNDEFINED\n"));						cs->weightidx = RANGE_IDX;						cs->weight = ri;						cs->s += n;						goto PROCESS_WEIGHT;					}					ri = CUR_COLLATE->index2ruleidx[ri - 1];					ru = CUR_COLLATE->ruletable[ri * CUR_COLLATE->MAX_WEIGHTS + pass];					assert((ru & WEIGHT_MASK) != WEIGHT_MASK); /* TODO: handle ".." */					if ((ru & WEIGHT_MASK) == DITTO_IDX) {						cs->colitem = CUR_COLLATE->undefined_idx;					}					goto RANGE_SKIP_TO;				} else {		/* illegal */					TRACE(("    found illegal\n"));					__set_errno(EINVAL);					/* We put all illegals in the same equiv class with maximal weight,					 * and ignore them after the first pass. */					if (pass > 0) {						cs->s += n;						goto IGNORE_LOOP;					}					ru = (RULE_FORWARD | RANGE_IDX);					cs->weight = 0xffffU;					goto RANGE_SKIP_TO;				}			} else if (CUR_COLLATE->num_weights == 1) {				TRACE(("    single weight\n"));				cs->weightidx = RANGE_IDX;				cs->weight = cs->colitem;				cs->s += n;				goto PROCESS_WEIGHT;			} else {				TRACE(("    normal\n"));			}			/* if we get here, it is a normal char either singlely weighted, undefined, or in a range */		FOUND:			ri = CUR_COLLATE->index2ruleidx[cs->colitem - 1];			TRACE((" ri=%d ", ri));#ifdef __UCLIBC_MJN3_ONLY__#warning make sure this is correct#endif			if (!ri) {				TRACE(("NOT IN THIS LOCALE\n"));				goto UNDEFINED;			}			ru = CUR_COLLATE->ruletable[ri * CUR_COLLATE->MAX_WEIGHTS + pass];		RANGE_SKIP_TO:#ifdef __UCLIBC_MJN3_ONLY__#warning ignoreables probably should not interrupt backwards processing, but this is wrong#endif/* 			if (!(ru & WEIGHT_MASK)) { *//* 				TRACE(("IGNORE\n")); *//* 				cs->s += n; *//* 				continue; *//* 			} */			TRACE((" rule = %#x  weight = %#x  popping = %d  s = %p  eob = %p\n",				   ru & RULE_MASK, ru & WEIGHT_MASK, popping_backup_stack,				   cs->s, cs->eob));			/* now we need to check if we're going backwards... */			if (!popping_backup_stack) {				if (!(ru & RULE_MASK)) { /* backward */					TRACE(("backwards\n"));					assert(cs->bp <= cs->bbe);					if (cs->bp == cs->bbe) {						if (cs->back_buf == cs->ibb) { /* was using internal buffer */							cs->bp = malloc(cs->bb_size + 128);							if (!cs->bp) {								/* __set_errno(ENOMEM); */#ifdef __UCLIBC_MJN3_ONLY__#warning what to do here?#endif								cs->weight = 0;								return;							}							memcpy(cs->bp, cs->back_buf, cs->bb_size);						} else {							cs->bp = realloc(cs->back_buf, cs->bb_size + 128);							if (!cs->bp) {								/* __set_errno(ENOMEM); */#ifdef __UCLIBC_MJN3_ONLY__#warning what to do here?#endif								cs->weight = 0;								return;							}						}						cs->bb_size += 128;						cs->bbe = cs->bp + (cs->bbe - cs->back_buf);						cs->back_buf = cs->bp;						cs->bp = cs->bbe;					}					if (n==1) {			/* single char */						if (*cs->bp && (((unsigned char)(*cs->bp)) < CHAR_MAX)) {							*cs->bp += 1; /* increment last single's count */						} else {	  /* last was a multi, or just starting */							if (!cs->bp) {								cs->bp = cs->back_buf;							} else {								assert(cs->bp < cs->bbe);								++cs->bp;							}							*cs->bp = 1;						}					} else {			/* multichar */						assert(n>1);						assert(cs->bp < cs->bbe);						*++cs->bp = -n;					}					cs->s += n;					if (*cs->s) {						goto BACK_LOOP;					}					/* end-of-string so start popping */					cs->eob = cs->s;					TRACE(("popping\n"));					goto POP_BACKUP;				} else if (*cs->bp) { /* was going backward but this element isn't */					/* discard current and use previous backward element */					assert(!cs->cip);					cs->eob = cs->s;					TRACE(("popping\n"));					goto POP_BACKUP;				} else {				/* was and still going forward */					TRACE(("forwards\n"));					if ((ru & (RULE_POSITION|WEIGHT_MASK)) > RULE_POSITION) {						assert(ru & WEIGHT_MASK);						cs->ru_pushed = ru;						cs->weight = cs->position;#ifdef __UCLIBC_MJN3_ONLY__#warning devel code#endif						cs->position = 0;	/* reset to reduce size for strcoll? */						cs->s += n;						cs->weightidx = RANGE_IDX;						goto PROCESS_WEIGHT;					}				}			} else {					/* popping backwards stack */				TRACE(("popping (continued)\n"));				if (!*cs->bp) {					cs->s = cs->eob;				}				cs->s -= n;			}			cs->s += n;		POSITION_SKIP:			cs->weightidx = ru & WEIGHT_MASK;			cs->rule = ru & RULE_MASK;		}#ifdef __UCLIBC_MJN3_ONLY__#warning for pending we only want the weight... _not_ the rule#endif		if (!cs->weightidx) {	/* ignore */			continue;		}	PROCESS_WEIGHT:		assert(cs->weightidx);		if (((unsigned int)(cs->weightidx - UI_IDX)) <= (INVAL_IDX-UI_IDX)) {			if (cs->weightidx == UI_IDX) {				cs->weight = cs->ui_weight;			}			return;		}		assert(cs->weightidx != WEIGHT_MASK);		if (cs->weightidx == DITTO_IDX) { /* want the weight of the current collating item */			TRACE(("doing ditto\n"));			w = CUR_COLLATE->index2weight[cs->colitem -1];		} else if (cs->weightidx <= CUR_COLLATE->max_col_index) { /* normal */			TRACE(("doing normal\n"));			w = CUR_COLLATE->index2weight[cs->weightidx -1];		} else {				/* a string */			TRACE(("doing string\n"));			assert(!(cs->weightidx & RULE_MASK));			/* note: iso14561 allows null string here */			p = CUR_COLLATE->weightstr + (cs->weightidx - (CUR_COLLATE->max_col_index + 2));			if (*p & WEIGHT_MASK) {				r = 0;				do {					assert(r < MAX_PENDING);					cs->ci_pending[r++] = *p++;				} while (*p & WEIGHT_MASK);				cs->cip = cs->ci_pending;			}			continue;		}		cs->weight = w;		return;	} while (1);}int __XL_NPP(wcscoll) (const Wchar *s0, const Wchar *s1   __LOCALE_PARAM ){	col_state_t ws[2];	int pass;	if (!CUR_COLLATE->num_weights) { /* C locale */#ifdef WANT_WIDE		return wcscmp(s0, s1);#else		return strcmp(s0, s1);#endif	}	pass = 0;	do {						/* loop through the weights levels */		init_col_state(ws, s0);		init_col_state(ws+1, s1);		do {					/* loop through the strings */			/* for each string, get the next weight */			next_weight(ws, pass   __LOCALE_ARG );			next_weight(ws+1, pass   __LOCALE_ARG );			TRACE(("w0=%lu  w1=%lu\n",				   (unsigned long) ws[0].weight,				   (unsigned long) ws[1].weight));			if (ws[0].weight != ws[1].weight) {				return ws[0].weight - ws[1].weight;			}		} while (ws[0].weight);	} while (++pass < CUR_COLLATE->num_weights);	return 0;}libc_hidden_def(__XL_NPP(wcscoll))#ifdef WANT_WIDEsize_t __XL_NPP(wcsxfrm)(wchar_t *__restrict ws1, const wchar_t *__restrict ws2,					 size_t n   __LOCALE_PARAM ){	col_state_t cs;	size_t count;	int pass;	if (!CUR_COLLATE->num_weights) { /* C locale */		return __wcslcpy(ws1, ws2, n);	}#ifdef __UCLIBC_MJN3_ONLY__#warning handle empty string as a special case#endif	count = pass = 0;	do {						/* loop through the weights levels */		init_col_state(&cs, ws2);		do {					/* loop through the string */			next_weight(&cs, pass   __LOCALE_ARG );			TRACE(("weight=%lu (%#lx)\n", (unsigned long) cs.weight, (unsigned long) cs.weight));			if (count < n) {				ws1[count] = cs.weight +1;			}			++count;			TRACE(("--------------------------------------------\n"));		} while (cs.weight);		if (count <= n) {		/* overwrite the trailing 0 end-of-pass marker */			ws1[count-1] = 1;		}		TRACE(("--------------------  pass %d  --------------------\n", pass));	} while (++pass < CUR_COLLATE->num_weights);	if (count <= n) {			/* oops... change it back */		ws1[count-1] = 0;	}	return count-1;}#if defined L_strxfrm_l || defined L_wcsxfrm_llibc_hidden_def(__XL_NPP(wcsxfrm))#endif#else  /* WANT_WIDE */static const unsigned long bound[] = {	1UL << 7,	1UL << 11,	1UL << 16,	1UL << 21,	1UL << 26,};static unsigned char first[] = {	0x0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};/* Use an extension of UTF-8 to store a 32 bit val in max 6 bytes. */static size_t store(unsigned char *s, size_t count, size_t n, __uwchar_t weight){	int i, r;	i = 0;	do {		if (weight < bound[i]) {			break;		}	} while (++i < sizeof(bound)/sizeof(bound[0]));	r = i+1;	if (i + count < n) {		s += count;		s[0] = first[i];		while (i) {			s[i] = 0x80 | (weight & 0x3f);			weight >>= 6;			--i;		}		s[0] |= weight;	}	return r;}size_t __XL_NPP(strxfrm)(char *__restrict ws1, const char *__restrict ws2, size_t n					 __LOCALE_PARAM ){	col_state_t cs;	size_t count, inc;	int pass;	if (!CUR_COLLATE->num_weights) { /* C locale */		return strlcpy(ws1, ws2, n);	}#ifdef __UCLIBC_MJN3_ONLY__#warning handle empty string as a special case#endif	inc = count = pass = 0;	do {						/* loop through the weights levels */		init_col_state(&cs, ws2);		do {					/* loop through the string */			next_weight(&cs, pass   __LOCALE_ARG );			TRACE(("weight=%lu (%#lx)\n", (unsigned long) cs.weight, (unsigned long) cs.weight));			inc = store((unsigned char *)ws1, count, n, cs.weight + 1);			count += inc;			TRACE(("--------------------------------------------\n"));		} while (cs.weight);		/* overwrite the trailing 0 end-of-pass marker */		assert(inc == 1);		if (count <= n) {			ws1[count-1] = 1;		}		TRACE(("--------------------  pass %d  --------------------\n", pass));	} while (++pass < CUR_COLLATE->num_weights);	if (count <= n) {			/* oops... change it back */		ws1[count-1] = 0;	}	return count-1;}#ifdef L_strxfrm_llibc_hidden_def(__XL_NPP(strxfrm))#endif#endif /* WANT_WIDE */#endif /* defined(__UCLIBC_HAS_XLOCALE__) && !defined(__UCLIBC_DO_XLOCALE) */#endif /* defined(L_strxfrm) || defined(L_strxfrm_l) || defined(L_wcsxfrm) || defined(L_wcsxfrm_l) */#endif /* __UCLIBC_HAS_LOCALE__ *//**********************************************************************/
 |