regex_internal.c 44 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646
  1. /* Extended regular expression matching and search library.
  2. Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
  3. This file is part of the GNU C Library.
  4. Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
  5. The GNU C Library is free software; you can redistribute it and/or
  6. modify it under the terms of the GNU Lesser General Public
  7. License as published by the Free Software Foundation; either
  8. version 2.1 of the License, or (at your option) any later version.
  9. The GNU C Library is distributed in the hope that it will be useful,
  10. but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  12. Lesser General Public License for more details.
  13. You should have received a copy of the GNU Lesser General Public
  14. License along with the GNU C Library; if not, write to the Free
  15. Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  16. 02111-1307 USA. */
  17. static void re_string_construct_common (const char *str, int len,
  18. re_string_t *pstr,
  19. RE_TRANSLATE_TYPE trans, int icase,
  20. const re_dfa_t *dfa) internal_function;
  21. static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
  22. const re_node_set *nodes,
  23. unsigned int hash) internal_function;
  24. static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
  25. const re_node_set *nodes,
  26. unsigned int context,
  27. unsigned int hash) internal_function;
  28. /* Functions for string operation. */
  29. /* This function allocate the buffers. It is necessary to call
  30. re_string_reconstruct before using the object. */
  31. static reg_errcode_t
  32. internal_function
  33. re_string_allocate (re_string_t *pstr, const char *str, int len, int init_len,
  34. RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
  35. {
  36. reg_errcode_t ret;
  37. int init_buf_len;
  38. /* Ensure at least one character fits into the buffers. */
  39. if (init_len < dfa->mb_cur_max)
  40. init_len = dfa->mb_cur_max;
  41. init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
  42. re_string_construct_common (str, len, pstr, trans, icase, dfa);
  43. ret = re_string_realloc_buffers (pstr, init_buf_len);
  44. if (BE (ret != REG_NOERROR, 0))
  45. return ret;
  46. pstr->word_char = dfa->word_char;
  47. pstr->word_ops_used = dfa->word_ops_used;
  48. pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  49. pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
  50. pstr->valid_raw_len = pstr->valid_len;
  51. return REG_NOERROR;
  52. }
  53. /* This function allocate the buffers, and initialize them. */
  54. static reg_errcode_t
  55. internal_function
  56. re_string_construct (re_string_t *pstr, const char *str, int len,
  57. RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
  58. {
  59. reg_errcode_t ret;
  60. memset (pstr, '\0', sizeof (re_string_t));
  61. re_string_construct_common (str, len, pstr, trans, icase, dfa);
  62. if (len > 0)
  63. {
  64. ret = re_string_realloc_buffers (pstr, len + 1);
  65. if (BE (ret != REG_NOERROR, 0))
  66. return ret;
  67. }
  68. pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  69. if (icase)
  70. {
  71. #ifdef RE_ENABLE_I18N
  72. if (dfa->mb_cur_max > 1)
  73. {
  74. while (1)
  75. {
  76. ret = build_wcs_upper_buffer (pstr);
  77. if (BE (ret != REG_NOERROR, 0))
  78. return ret;
  79. if (pstr->valid_raw_len >= len)
  80. break;
  81. if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
  82. break;
  83. ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
  84. if (BE (ret != REG_NOERROR, 0))
  85. return ret;
  86. }
  87. }
  88. else
  89. #endif /* RE_ENABLE_I18N */
  90. build_upper_buffer (pstr);
  91. }
  92. else
  93. {
  94. #ifdef RE_ENABLE_I18N
  95. if (dfa->mb_cur_max > 1)
  96. build_wcs_buffer (pstr);
  97. else
  98. #endif /* RE_ENABLE_I18N */
  99. {
  100. if (trans != NULL)
  101. re_string_translate_buffer (pstr);
  102. else
  103. {
  104. pstr->valid_len = pstr->bufs_len;
  105. pstr->valid_raw_len = pstr->bufs_len;
  106. }
  107. }
  108. }
  109. return REG_NOERROR;
  110. }
  111. /* Helper functions for re_string_allocate, and re_string_construct. */
  112. static reg_errcode_t
  113. internal_function
  114. re_string_realloc_buffers (re_string_t *pstr, int new_buf_len)
  115. {
  116. #ifdef RE_ENABLE_I18N
  117. if (pstr->mb_cur_max > 1)
  118. {
  119. wint_t *new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
  120. if (BE (new_wcs == NULL, 0))
  121. return REG_ESPACE;
  122. pstr->wcs = new_wcs;
  123. if (pstr->offsets != NULL)
  124. {
  125. int *new_offsets = re_realloc (pstr->offsets, int, new_buf_len);
  126. if (BE (new_offsets == NULL, 0))
  127. return REG_ESPACE;
  128. pstr->offsets = new_offsets;
  129. }
  130. }
  131. #endif /* RE_ENABLE_I18N */
  132. if (pstr->mbs_allocated)
  133. {
  134. unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
  135. new_buf_len);
  136. if (BE (new_mbs == NULL, 0))
  137. return REG_ESPACE;
  138. pstr->mbs = new_mbs;
  139. }
  140. pstr->bufs_len = new_buf_len;
  141. return REG_NOERROR;
  142. }
  143. static void
  144. internal_function
  145. re_string_construct_common (const char *str, int len, re_string_t *pstr,
  146. RE_TRANSLATE_TYPE trans, int icase,
  147. const re_dfa_t *dfa)
  148. {
  149. pstr->raw_mbs = (const unsigned char *) str;
  150. pstr->len = len;
  151. pstr->raw_len = len;
  152. pstr->trans = trans;
  153. pstr->icase = icase ? 1 : 0;
  154. pstr->mbs_allocated = (trans != NULL || icase);
  155. pstr->mb_cur_max = dfa->mb_cur_max;
  156. pstr->is_utf8 = dfa->is_utf8;
  157. pstr->map_notascii = dfa->map_notascii;
  158. pstr->stop = pstr->len;
  159. pstr->raw_stop = pstr->stop;
  160. }
  161. #ifdef RE_ENABLE_I18N
  162. /* Build wide character buffer PSTR->WCS.
  163. If the byte sequence of the string are:
  164. <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
  165. Then wide character buffer will be:
  166. <wc1> , WEOF , <wc2> , WEOF , <wc3>
  167. We use WEOF for padding, they indicate that the position isn't
  168. a first byte of a multibyte character.
  169. Note that this function assumes PSTR->VALID_LEN elements are already
  170. built and starts from PSTR->VALID_LEN. */
  171. static void
  172. internal_function
  173. build_wcs_buffer (re_string_t *pstr)
  174. {
  175. #if defined _LIBC || defined __UCLIBC__
  176. unsigned char buf[MB_LEN_MAX];
  177. assert (MB_LEN_MAX >= pstr->mb_cur_max);
  178. #else
  179. unsigned char buf[64];
  180. #endif
  181. mbstate_t prev_st;
  182. int byte_idx, end_idx, remain_len;
  183. size_t mbclen;
  184. /* Build the buffers from pstr->valid_len to either pstr->len or
  185. pstr->bufs_len. */
  186. end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
  187. for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
  188. {
  189. wchar_t wc;
  190. const char *p;
  191. remain_len = end_idx - byte_idx;
  192. prev_st = pstr->cur_state;
  193. /* Apply the translation if we need. */
  194. if (BE (pstr->trans != NULL, 0))
  195. {
  196. int i, ch;
  197. for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
  198. {
  199. ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
  200. buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
  201. }
  202. p = (const char *) buf;
  203. }
  204. else
  205. p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
  206. mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
  207. if (BE (mbclen == (size_t) -2, 0))
  208. {
  209. /* The buffer doesn't have enough space, finish to build. */
  210. pstr->cur_state = prev_st;
  211. break;
  212. }
  213. else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
  214. {
  215. /* We treat these cases as a singlebyte character. */
  216. mbclen = 1;
  217. wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
  218. if (BE (pstr->trans != NULL, 0))
  219. wc = pstr->trans[wc];
  220. pstr->cur_state = prev_st;
  221. }
  222. /* Write wide character and padding. */
  223. pstr->wcs[byte_idx++] = wc;
  224. /* Write paddings. */
  225. for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
  226. pstr->wcs[byte_idx++] = WEOF;
  227. }
  228. pstr->valid_len = byte_idx;
  229. pstr->valid_raw_len = byte_idx;
  230. }
  231. /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
  232. but for REG_ICASE. */
/* Starting at PSTR->VALID_LEN, convert raw input into PSTR->MBS
   (bytes, lower-case characters replaced by their upper-case form)
   and PSTR->WCS (wide characters, with WEOF in every slot that is not
   the first byte of a character), up to the smaller of PSTR->LEN and
   PSTR->BUFS_LEN.  When an upper-case form occupies a different
   number of bytes than the original, PSTR->OFFSETS is created and
   maps buffer positions back to raw positions, and PSTR->LEN and
   PSTR->STOP are adjusted by the difference.
   On return PSTR->VALID_LEN and PSTR->VALID_RAW_LEN are updated.
   Returns REG_NOERROR, or REG_ESPACE if the offsets table cannot be
   allocated.  */
  233. static int
  234. internal_function
  235. build_wcs_upper_buffer (re_string_t *pstr)
  236. {
  237. mbstate_t prev_st;
  238. int src_idx, byte_idx, end_idx, remain_len;
  239. size_t mbclen;
/* Scratch array: receives the output of wcrtomb and, when a
   translation table is in use, the translated input bytes.  */
  240. #if defined _LIBC || defined __UCLIBC__
  241. char buf[MB_LEN_MAX];
  242. assert (MB_LEN_MAX >= pstr->mb_cur_max);
  243. #else
  244. char buf[64];
  245. #endif
  246. byte_idx = pstr->valid_len;
  247. end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
  248. /* The following optimization assumes that ASCII characters can be
  249. mapped to wide characters with a simple cast. */
/* Fast path: no translation and no offsets table so far, hence the
   buffer index and the raw index are identical and a single index
   (byte_idx) serves for both.  */
  250. if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
  251. {
  252. while (byte_idx < end_idx)
  253. {
  254. wchar_t wc;
  255. if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
  256. && mbsinit (&pstr->cur_state))
  257. {
  258. /* In case of a singlebyte character. */
  259. pstr->mbs[byte_idx]
  260. = __toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
  261. /* The next step uses the assumption that wchar_t is encoded
  262. ASCII-safe: all ASCII values can be converted like this. */
  263. pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
  264. ++byte_idx;
  265. continue;
  266. }
  267. remain_len = end_idx - byte_idx;
  268. prev_st = pstr->cur_state;
  269. mbclen = mbrtowc (&wc,
  270. ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
  271. + byte_idx), remain_len, &pstr->cur_state);
/* In size_t arithmetic mbclen + 2 > 2 is false exactly for
   mbclen == 0, (size_t) -1 and (size_t) -2, so this branch handles
   a successfully converted, non-NUL character.  */
  272. if (BE (mbclen + 2 > 2, 1))
  273. {
  274. wchar_t wcu = wc;
  275. if (iswlower (wc))
  276. {
  277. size_t mbcdlen;
  278. wcu = towupper (wc);
  279. mbcdlen = wcrtomb (buf, wcu, &prev_st);
  280. if (BE (mbclen == mbcdlen, 1))
  281. memcpy (pstr->mbs + byte_idx, buf, mbclen);
  282. else
  283. {
/* The upper-case form does not have the same byte length as the
   original (this includes a wcrtomb failure): leave the fast path
   and enter the body of the general loop below at the
   offsets_needed label, with the source index equal to the current
   buffer index.  */
  284. src_idx = byte_idx;
  285. goto offsets_needed;
  286. }
  287. }
  288. else
  289. memcpy (pstr->mbs + byte_idx,
  290. pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
  291. pstr->wcs[byte_idx++] = wcu;
  292. /* Write paddings. */
  293. for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
  294. pstr->wcs[byte_idx++] = WEOF;
  295. }
  296. else if (mbclen == (size_t) -1 || mbclen == 0)
  297. {
  298. /* It is an invalid character or '\0'. Just use the byte. */
  299. int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
  300. pstr->mbs[byte_idx] = ch;
  301. /* And also cast it to wide char. */
  302. pstr->wcs[byte_idx++] = (wchar_t) ch;
/* Only the failed conversion needs the shift state restored.  */
  303. if (BE (mbclen == (size_t) -1, 0))
  304. pstr->cur_state = prev_st;
  305. }
  306. else
  307. {
  308. /* The buffer doesn't have enough space, finish to build. */
/* Reached when mbclen == (size_t) -2, i.e. the remaining bytes in
   range form only an incomplete character.  */
  309. pstr->cur_state = prev_st;
  310. break;
  311. }
  312. }
  313. pstr->valid_len = byte_idx;
  314. pstr->valid_raw_len = byte_idx;
  315. return REG_NOERROR;
  316. }
  317. else
/* General path: src_idx walks the raw input while byte_idx walks the
   buffers; the two differ once a case conversion has changed the
   byte length of a character.  */
  318. for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
  319. {
  320. wchar_t wc;
  321. const char *p;
/* Entry point for the fast path above when it finds a character
   whose upper-case form has a different byte length.  */
  322. offsets_needed:
  323. remain_len = end_idx - byte_idx;
  324. prev_st = pstr->cur_state;
  325. if (BE (pstr->trans != NULL, 0))
  326. {
/* Translate up to one character's worth of raw bytes into the
   scratch array and convert from there.  */
  327. int i, ch;
  328. for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
  329. {
  330. ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
  331. buf[i] = pstr->trans[ch];
  332. }
  333. p = (const char *) buf;
  334. }
  335. else
  336. p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
  337. mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
/* As above: true unless mbclen is 0, (size_t) -1 or (size_t) -2.  */
  338. if (BE (mbclen + 2 > 2, 1))
  339. {
  340. wchar_t wcu = wc;
  341. if (iswlower (wc))
  342. {
  343. size_t mbcdlen;
  344. wcu = towupper (wc);
  345. mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
  346. if (BE (mbclen == mbcdlen, 1))
  347. memcpy (pstr->mbs + byte_idx, buf, mbclen);
  348. else if (mbcdlen != (size_t) -1)
  349. {
/* The upper-case form has a different byte length than the
   original character.  */
  350. size_t i;
/* Stop here if the converted character does not fit into the
   buffers as currently sized.  */
  351. if (byte_idx + mbcdlen > pstr->bufs_len)
  352. {
  353. pstr->cur_state = prev_st;
  354. break;
  355. }
/* Create the offsets table on first use.  */
  356. if (pstr->offsets == NULL)
  357. {
  358. pstr->offsets = re_malloc (int, pstr->bufs_len);
  359. if (pstr->offsets == NULL)
  360. return REG_ESPACE;
  361. }
/* Up to this point buffer and raw positions coincided: record the
   identity mapping for everything built so far.  */
  362. if (!pstr->offsets_needed)
  363. {
  364. for (i = 0; i < (size_t) byte_idx; ++i)
  365. pstr->offsets[i] = i;
  366. pstr->offsets_needed = 1;
  367. }
  368. memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
  369. pstr->wcs[byte_idx] = wcu;
  370. pstr->offsets[byte_idx] = src_idx;
/* Map each further output byte to the corresponding source byte,
   clamped to the last byte of the source character, and mark its
   wide-character slot as padding.  */
  371. for (i = 1; i < mbcdlen; ++i)
  372. {
  373. pstr->offsets[byte_idx + i]
  374. = src_idx + (i < mbclen ? i : mbclen - 1);
  375. pstr->wcs[byte_idx + i] = WEOF;
  376. }
/* The converted string changes length by the difference; adjust
   the length, the stop position (when it lies beyond this
   character in the raw input) and the end of this pass.  */
  377. pstr->len += mbcdlen - mbclen;
  378. if (pstr->raw_stop > src_idx)
  379. pstr->stop += mbcdlen - mbclen;
  380. end_idx = (pstr->bufs_len > pstr->len)
  381. ? pstr->len : pstr->bufs_len;
  382. byte_idx += mbcdlen;
  383. src_idx += mbclen;
  384. continue;
  385. }
  386. else
/* wcrtomb failed: keep the original (possibly translated) bytes.  */
  387. memcpy (pstr->mbs + byte_idx, p, mbclen);
  388. }
  389. else
  390. memcpy (pstr->mbs + byte_idx, p, mbclen);
/* Same byte length in source and buffer: when an offsets table is
   in use, record the one-to-one mapping for this character.  */
  391. if (BE (pstr->offsets_needed != 0, 0))
  392. {
  393. size_t i;
  394. for (i = 0; i < mbclen; ++i)
  395. pstr->offsets[byte_idx + i] = src_idx + i;
  396. }
  397. src_idx += mbclen;
  398. pstr->wcs[byte_idx++] = wcu;
  399. /* Write paddings. */
  400. for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
  401. pstr->wcs[byte_idx++] = WEOF;
  402. }
  403. else if (mbclen == (size_t) -1 || mbclen == 0)
  404. {
  405. /* It is an invalid character or '\0'. Just use the byte. */
  406. int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
  407. if (BE (pstr->trans != NULL, 0))
  408. ch = pstr->trans [ch];
  409. pstr->mbs[byte_idx] = ch;
  410. if (BE (pstr->offsets_needed != 0, 0))
  411. pstr->offsets[byte_idx] = src_idx;
  412. ++src_idx;
  413. /* And also cast it to wide char. */
  414. pstr->wcs[byte_idx++] = (wchar_t) ch;
  415. if (BE (mbclen == (size_t) -1, 0))
  416. pstr->cur_state = prev_st;
  417. }
  418. else
  419. {
  420. /* The buffer doesn't have enough space, finish to build. */
  421. pstr->cur_state = prev_st;
  422. break;
  423. }
  424. }
/* Here the buffer length and the raw length consumed may differ.  */
  425. pstr->valid_len = byte_idx;
  426. pstr->valid_raw_len = src_idx;
  427. return REG_NOERROR;
  428. }
  429. /* Skip characters until the index becomes greater than NEW_RAW_IDX.
  430. Return the index. */
  431. static int
  432. internal_function
  433. re_string_skip_chars (re_string_t *pstr, int new_raw_idx, wint_t *last_wc)
  434. {
  435. mbstate_t prev_st;
  436. int rawbuf_idx;
  437. size_t mbclen;
  438. wchar_t wc = 0;
  439. /* Skip the characters which are not necessary to check. */
  440. for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
  441. rawbuf_idx < new_raw_idx;)
  442. {
  443. int remain_len;
  444. remain_len = pstr->len - rawbuf_idx;
  445. prev_st = pstr->cur_state;
  446. mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
  447. remain_len, &pstr->cur_state);
  448. if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
  449. {
  450. /* We treat these cases as a singlebyte character. */
  451. mbclen = 1;
  452. pstr->cur_state = prev_st;
  453. }
  454. /* Then proceed the next character. */
  455. rawbuf_idx += mbclen;
  456. }
  457. *last_wc = (wint_t) wc;
  458. return rawbuf_idx;
  459. }
  460. #endif /* RE_ENABLE_I18N */
  461. /* Build the buffer PSTR->MBS, and apply the translation if we need.
  462. This function is used in case of REG_ICASE. */
  463. static void
  464. internal_function
  465. build_upper_buffer (re_string_t *pstr)
  466. {
  467. int char_idx, end_idx;
  468. end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
  469. for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
  470. {
  471. int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
  472. if (BE (pstr->trans != NULL, 0))
  473. ch = pstr->trans[ch];
  474. if (islower (ch))
  475. pstr->mbs[char_idx] = __toupper (ch);
  476. else
  477. pstr->mbs[char_idx] = ch;
  478. }
  479. pstr->valid_len = char_idx;
  480. pstr->valid_raw_len = char_idx;
  481. }
  482. /* Apply TRANS to the buffer in PSTR. */
  483. static void
  484. internal_function
  485. re_string_translate_buffer (re_string_t *pstr)
  486. {
  487. int buf_idx, end_idx;
  488. end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
  489. for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
  490. {
  491. int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
  492. pstr->mbs[buf_idx] = pstr->trans[ch];
  493. }
  494. pstr->valid_len = buf_idx;
  495. pstr->valid_raw_len = buf_idx;
  496. }
  497. /* This function re-construct the buffers.
  498. Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
  499. convert to upper case in case of REG_ICASE, apply translation. */
  500. static reg_errcode_t
  501. internal_function
  502. re_string_reconstruct (re_string_t *pstr, int idx, int eflags)
  503. {
  504. int offset = idx - pstr->raw_mbs_idx;
  505. if (BE (offset < 0, 0))
  506. {
  507. /* Reset buffer. */
  508. #ifdef RE_ENABLE_I18N
  509. if (pstr->mb_cur_max > 1)
  510. memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
  511. #endif /* RE_ENABLE_I18N */
  512. pstr->len = pstr->raw_len;
  513. pstr->stop = pstr->raw_stop;
  514. pstr->valid_len = 0;
  515. pstr->raw_mbs_idx = 0;
  516. pstr->valid_raw_len = 0;
  517. pstr->offsets_needed = 0;
  518. pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
  519. : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
  520. if (!pstr->mbs_allocated)
  521. pstr->mbs = (unsigned char *) pstr->raw_mbs;
  522. offset = idx;
  523. }
  524. if (BE (offset != 0, 1))
  525. {
  526. /* Are the characters which are already checked remain? */
  527. if (BE (offset < pstr->valid_raw_len, 1)
  528. #ifdef RE_ENABLE_I18N
  529. /* Handling this would enlarge the code too much.
  530. Accept a slowdown in that case. */
  531. && pstr->offsets_needed == 0
  532. #endif
  533. )
  534. {
  535. /* Yes, move them to the front of the buffer. */
  536. pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
  537. #ifdef RE_ENABLE_I18N
  538. if (pstr->mb_cur_max > 1)
  539. memmove (pstr->wcs, pstr->wcs + offset,
  540. (pstr->valid_len - offset) * sizeof (wint_t));
  541. #endif /* RE_ENABLE_I18N */
  542. if (BE (pstr->mbs_allocated, 0))
  543. memmove (pstr->mbs, pstr->mbs + offset,
  544. pstr->valid_len - offset);
  545. pstr->valid_len -= offset;
  546. pstr->valid_raw_len -= offset;
  547. #if DEBUG
  548. assert (pstr->valid_len > 0);
  549. #endif
  550. }
  551. else
  552. {
  553. /* No, skip all characters until IDX. */
  554. #ifdef RE_ENABLE_I18N
  555. if (BE (pstr->offsets_needed, 0))
  556. {
  557. pstr->len = pstr->raw_len - idx + offset;
  558. pstr->stop = pstr->raw_stop - idx + offset;
  559. pstr->offsets_needed = 0;
  560. }
  561. #endif
  562. pstr->valid_len = 0;
  563. pstr->valid_raw_len = 0;
  564. #ifdef RE_ENABLE_I18N
  565. if (pstr->mb_cur_max > 1)
  566. {
  567. int wcs_idx;
  568. wint_t wc = WEOF;
  569. if (pstr->is_utf8)
  570. {
  571. const unsigned char *raw, *p, *q, *end;
  572. /* Special case UTF-8. Multi-byte chars start with any
  573. byte other than 0x80 - 0xbf. */
  574. raw = pstr->raw_mbs + pstr->raw_mbs_idx;
  575. end = raw + (offset - pstr->mb_cur_max);
  576. p = raw + offset - 1;
  577. #ifdef _LIBC
  578. /* We know the wchar_t encoding is UCS4, so for the simple
  579. case, ASCII characters, skip the conversion step. */
  580. if (isascii (*p) && BE (pstr->trans == NULL, 1))
  581. {
  582. memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
  583. pstr->valid_len = 0;
  584. wc = (wchar_t) *p;
  585. }
  586. else
  587. #endif
  588. for (; p >= end; --p)
  589. if ((*p & 0xc0) != 0x80)
  590. {
  591. mbstate_t cur_state;
  592. wchar_t wc2;
  593. int mlen = raw + pstr->len - p;
  594. unsigned char buf[6];
  595. size_t mbclen;
  596. q = p;
  597. if (BE (pstr->trans != NULL, 0))
  598. {
  599. int i = mlen < 6 ? mlen : 6;
  600. while (--i >= 0)
  601. buf[i] = pstr->trans[p[i]];
  602. q = buf;
  603. }
  604. /* XXX Don't use mbrtowc, we know which conversion
  605. to use (UTF-8 -> UCS4). */
  606. memset (&cur_state, 0, sizeof (cur_state));
  607. mbclen = mbrtowc (&wc2, (const char *) p, mlen,
  608. &cur_state);
  609. if (raw + offset - p <= mbclen
  610. && mbclen < (size_t) -2)
  611. {
  612. memset (&pstr->cur_state, '\0',
  613. sizeof (mbstate_t));
  614. pstr->valid_len = mbclen - (raw + offset - p);
  615. wc = wc2;
  616. }
  617. break;
  618. }
  619. }
  620. if (wc == WEOF)
  621. pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
  622. if (BE (pstr->valid_len, 0))
  623. {
  624. for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
  625. pstr->wcs[wcs_idx] = WEOF;
  626. if (pstr->mbs_allocated)
  627. memset (pstr->mbs, 255, pstr->valid_len);
  628. }
  629. pstr->valid_raw_len = pstr->valid_len;
  630. pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
  631. && IS_WIDE_WORD_CHAR (wc))
  632. ? CONTEXT_WORD
  633. : ((IS_WIDE_NEWLINE (wc)
  634. && pstr->newline_anchor)
  635. ? CONTEXT_NEWLINE : 0));
  636. }
  637. else
  638. #endif /* RE_ENABLE_I18N */
  639. {
  640. int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
  641. if (pstr->trans)
  642. c = pstr->trans[c];
  643. pstr->tip_context = (bitset_contain (pstr->word_char, c)
  644. ? CONTEXT_WORD
  645. : ((IS_NEWLINE (c) && pstr->newline_anchor)
  646. ? CONTEXT_NEWLINE : 0));
  647. }
  648. }
  649. if (!BE (pstr->mbs_allocated, 0))
  650. pstr->mbs += offset;
  651. }
  652. pstr->raw_mbs_idx = idx;
  653. pstr->len -= offset;
  654. pstr->stop -= offset;
  655. /* Then build the buffers. */
  656. #ifdef RE_ENABLE_I18N
  657. if (pstr->mb_cur_max > 1)
  658. {
  659. if (pstr->icase)
  660. {
  661. int ret = build_wcs_upper_buffer (pstr);
  662. if (BE (ret != REG_NOERROR, 0))
  663. return ret;
  664. }
  665. else
  666. build_wcs_buffer (pstr);
  667. }
  668. else
  669. #endif /* RE_ENABLE_I18N */
  670. if (BE (pstr->mbs_allocated, 0))
  671. {
  672. if (pstr->icase)
  673. build_upper_buffer (pstr);
  674. else if (pstr->trans != NULL)
  675. re_string_translate_buffer (pstr);
  676. }
  677. else
  678. pstr->valid_len = pstr->len;
  679. pstr->cur_idx = 0;
  680. return REG_NOERROR;
  681. }
  682. static unsigned char
  683. internal_function __attribute ((pure))
  684. re_string_peek_byte_case (const re_string_t *pstr, int idx)
  685. {
  686. int ch, off;
  687. /* Handle the common (easiest) cases first. */
  688. if (BE (!pstr->mbs_allocated, 1))
  689. return re_string_peek_byte (pstr, idx);
  690. #ifdef RE_ENABLE_I18N
  691. if (pstr->mb_cur_max > 1
  692. && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
  693. return re_string_peek_byte (pstr, idx);
  694. #endif
  695. off = pstr->cur_idx + idx;
  696. #ifdef RE_ENABLE_I18N
  697. if (pstr->offsets_needed)
  698. off = pstr->offsets[off];
  699. #endif
  700. ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
  701. #ifdef RE_ENABLE_I18N
  702. /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
  703. this function returns CAPITAL LETTER I instead of first byte of
  704. DOTLESS SMALL LETTER I. The latter would confuse the parser,
  705. since peek_byte_case doesn't advance cur_idx in any way. */
  706. if (pstr->offsets_needed && !isascii (ch))
  707. return re_string_peek_byte (pstr, idx);
  708. #endif
  709. return ch;
  710. }
  711. static unsigned char
  712. internal_function __attribute ((pure))
  713. re_string_fetch_byte_case (re_string_t *pstr)
  714. {
  715. if (BE (!pstr->mbs_allocated, 1))
  716. return re_string_fetch_byte (pstr);
  717. #ifdef RE_ENABLE_I18N
  718. if (pstr->offsets_needed)
  719. {
  720. int off, ch;
  721. /* For tr_TR.UTF-8 [[:islower:]] there is
  722. [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
  723. in that case the whole multi-byte character and return
  724. the original letter. On the other side, with
  725. [[: DOTLESS SMALL LETTER I return [[:I, as doing
  726. anything else would complicate things too much. */
  727. if (!re_string_first_byte (pstr, pstr->cur_idx))
  728. return re_string_fetch_byte (pstr);
  729. off = pstr->offsets[pstr->cur_idx];
  730. ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
  731. if (! isascii (ch))
  732. return re_string_fetch_byte (pstr);
  733. re_string_skip_bytes (pstr,
  734. re_string_char_size_at (pstr, pstr->cur_idx));
  735. return ch;
  736. }
  737. #endif
  738. return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
  739. }
  740. static void
  741. internal_function
  742. re_string_destruct (re_string_t *pstr)
  743. {
  744. #ifdef RE_ENABLE_I18N
  745. re_free (pstr->wcs);
  746. re_free (pstr->offsets);
  747. #endif /* RE_ENABLE_I18N */
  748. if (pstr->mbs_allocated)
  749. re_free (pstr->mbs);
  750. }
  751. /* Return the context at IDX in INPUT. */
  752. static unsigned int
  753. internal_function
  754. re_string_context_at (const re_string_t *input, int idx, int eflags)
  755. {
  756. int c;
  757. if (BE (idx < 0, 0))
  758. /* In this case, we use the value stored in input->tip_context,
  759. since we can't know the character in input->mbs[-1] here. */
  760. return input->tip_context;
  761. if (BE (idx == input->len, 0))
  762. return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
  763. : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
  764. #ifdef RE_ENABLE_I18N
  765. if (input->mb_cur_max > 1)
  766. {
  767. wint_t wc;
  768. int wc_idx = idx;
  769. while(input->wcs[wc_idx] == WEOF)
  770. {
  771. #ifdef DEBUG
  772. /* It must not happen. */
  773. assert (wc_idx >= 0);
  774. #endif
  775. --wc_idx;
  776. if (wc_idx < 0)
  777. return input->tip_context;
  778. }
  779. wc = input->wcs[wc_idx];
  780. if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
  781. return CONTEXT_WORD;
  782. return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
  783. ? CONTEXT_NEWLINE : 0);
  784. }
  785. else
  786. #endif
  787. {
  788. c = re_string_byte_at (input, idx);
  789. if (bitset_contain (input->word_char, c))
  790. return CONTEXT_WORD;
  791. return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
  792. }
  793. }
  /* Functions for set operations.  */
  795. static reg_errcode_t
  796. internal_function
  797. re_node_set_alloc (re_node_set *set, int size)
  798. {
  799. set->alloc = size;
  800. set->nelem = 0;
  801. set->elems = re_malloc (int, size); /* can be NULL if size == 0
  802. (see re_node_set_init_empty(set)) */
  803. if (BE (set->elems == NULL && size != 0, 0))
  804. return REG_ESPACE;
  805. return REG_NOERROR;
  806. }
  807. static reg_errcode_t
  808. internal_function
  809. re_node_set_init_1 (re_node_set *set, int elem)
  810. {
  811. set->alloc = 1;
  812. set->nelem = 1;
  813. set->elems = re_malloc (int, 1);
  814. if (BE (set->elems == NULL, 0))
  815. {
  816. set->alloc = set->nelem = 0;
  817. return REG_ESPACE;
  818. }
  819. set->elems[0] = elem;
  820. return REG_NOERROR;
  821. }
  822. static reg_errcode_t
  823. internal_function
  824. re_node_set_init_2 (re_node_set *set, int elem1, int elem2)
  825. {
  826. set->alloc = 2;
  827. set->elems = re_malloc (int, 2);
  828. if (BE (set->elems == NULL, 0))
  829. return REG_ESPACE;
  830. if (elem1 == elem2)
  831. {
  832. set->nelem = 1;
  833. set->elems[0] = elem1;
  834. }
  835. else
  836. {
  837. set->nelem = 2;
  838. if (elem1 < elem2)
  839. {
  840. set->elems[0] = elem1;
  841. set->elems[1] = elem2;
  842. }
  843. else
  844. {
  845. set->elems[0] = elem2;
  846. set->elems[1] = elem1;
  847. }
  848. }
  849. return REG_NOERROR;
  850. }
  851. static reg_errcode_t
  852. internal_function
  853. re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
  854. {
  855. dest->nelem = src->nelem;
  856. if (src->nelem > 0)
  857. {
  858. dest->alloc = dest->nelem;
  859. dest->elems = re_malloc (int, dest->alloc);
  860. if (BE (dest->elems == NULL, 0))
  861. {
  862. dest->alloc = dest->nelem = 0;
  863. return REG_ESPACE;
  864. }
  865. memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
  866. }
  867. else
  868. re_node_set_init_empty (dest);
  869. return REG_NOERROR;
  870. }
  /* Calculate the intersection of the sets SRC1 and SRC2, and merge it
     into DEST.  The return value is REG_ESPACE if DEST could not be
     enlarged and REG_NOERROR otherwise.
     All three arrays are scanned from their last element downwards and
     neighbouring values are compared, so the routine only works when each
     elems array is sorted in ascending order.
     Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
  static reg_errcode_t
  internal_function
  re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
                             const re_node_set *src2)
  {
    int i1, i2, is, id, delta, sbase;
    /* An empty operand gives an empty intersection: DEST stays as is.  */
    if (src1->nelem == 0 || src2->nelem == 0)
      return REG_NOERROR;

    /* We need dest->nelem + 2 * elems_in_intersection; this is a
       conservative estimate.  */
    if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
      {
        int new_alloc = src1->nelem + src2->nelem + dest->alloc;
        int *new_elems = re_realloc (dest->elems, int, new_alloc);
        if (BE (new_elems == NULL, 0))
          return REG_ESPACE;
        dest->elems = new_elems;
        dest->alloc = new_alloc;
      }

    /* Find the items in the intersection of SRC1 and SRC2, and copy
       into the top of DEST those that are not already in DEST itself.
       SBASE starts one past the top of the staging area and is lowered
       for every value that gets staged.  */
    sbase = dest->nelem + src1->nelem + src2->nelem;
    i1 = src1->nelem - 1;
    i2 = src2->nelem - 1;
    id = dest->nelem - 1;
    for (;;)
      {
        if (src1->elems[i1] == src2->elems[i2])
          {
            /* Try to find the item in DEST.  Maybe we could binary search?  */
            while (id >= 0 && dest->elems[id] > src1->elems[i1])
              --id;

            /* Stage the common value only if DEST does not hold it yet.  */
            if (id < 0 || dest->elems[id] != src1->elems[i1])
              dest->elems[--sbase] = src1->elems[i1];

            if (--i1 < 0 || --i2 < 0)
              break;
          }

        /* Lower the highest of the two items.  */
        else if (src1->elems[i1] < src2->elems[i2])
          {
            if (--i2 < 0)
              break;
          }
        else
          {
            if (--i1 < 0)
              break;
          }
      }

    /* ID is the last original DEST element, IS the last staged value and
       DELTA the number of staged values.  */
    id = dest->nelem - 1;
    is = dest->nelem + src1->nelem + src2->nelem - 1;
    delta = is - sbase + 1;

    /* Now copy.  When DELTA becomes zero, the remaining
       DEST elements are already in place; this is more or
       less the same loop that is in re_node_set_merge.  */
    dest->nelem += delta;
    if (delta > 0 && id >= 0)
      for (;;)
        {
          if (dest->elems[is] > dest->elems[id])
            {
              /* Copy from the top.  */
              dest->elems[id + delta--] = dest->elems[is--];
              if (delta == 0)
                break;
            }
          else
            {
              /* Slide from the bottom.  */
              dest->elems[id + delta] = dest->elems[id];
              if (--id < 0)
                break;
            }
        }

    /* Copy remaining SRC elements: the DELTA staged values that are still
       unplaced go to the front of DEST.  Nothing is copied when DELTA
       is 0.  */
    memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));

    return REG_NOERROR;
  }
  952. /* Calculate the union set of the sets SRC1 and SRC2. And store it to
  953. DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
  954. static reg_errcode_t
  955. internal_function
  956. re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
  957. const re_node_set *src2)
  958. {
  959. int i1, i2, id;
  960. if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
  961. {
  962. dest->alloc = src1->nelem + src2->nelem;
  963. dest->elems = re_malloc (int, dest->alloc);
  964. if (BE (dest->elems == NULL, 0))
  965. return REG_ESPACE;
  966. }
  967. else
  968. {
  969. if (src1 != NULL && src1->nelem > 0)
  970. return re_node_set_init_copy (dest, src1);
  971. else if (src2 != NULL && src2->nelem > 0)
  972. return re_node_set_init_copy (dest, src2);
  973. else
  974. re_node_set_init_empty (dest);
  975. return REG_NOERROR;
  976. }
  977. for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
  978. {
  979. if (src1->elems[i1] > src2->elems[i2])
  980. {
  981. dest->elems[id++] = src2->elems[i2++];
  982. continue;
  983. }
  984. if (src1->elems[i1] == src2->elems[i2])
  985. ++i2;
  986. dest->elems[id++] = src1->elems[i1++];
  987. }
  988. if (i1 < src1->nelem)
  989. {
  990. memcpy (dest->elems + id, src1->elems + i1,
  991. (src1->nelem - i1) * sizeof (int));
  992. id += src1->nelem - i1;
  993. }
  994. else if (i2 < src2->nelem)
  995. {
  996. memcpy (dest->elems + id, src2->elems + i2,
  997. (src2->nelem - i2) * sizeof (int));
  998. id += src2->nelem - i2;
  999. }
  1000. dest->nelem = id;
  1001. return REG_NOERROR;
  1002. }
  /* Calculate the union of the sets DEST and SRC, and store it in DEST.
     The return value is REG_ESPACE if DEST could not be enlarged and
     REG_NOERROR otherwise.  A NULL or empty SRC leaves DEST untouched.
     Both arrays are scanned from their last element downwards and
     compared by value, so the routine only works when each elems array
     is sorted in ascending order.  */
  static reg_errcode_t
  internal_function
  re_node_set_merge (re_node_set *dest, const re_node_set *src)
  {
    int is, id, sbase, delta;
    if (src == NULL || src->nelem == 0)
      return REG_NOERROR;

    /* The top of DEST is used as a staging area, hence the request for
       room for two copies of SRC on top of the current elements.  */
    if (dest->alloc < 2 * src->nelem + dest->nelem)
      {
        int new_alloc = 2 * (src->nelem + dest->alloc);
        int *new_buffer = re_realloc (dest->elems, int, new_alloc);
        if (BE (new_buffer == NULL, 0))
          return REG_ESPACE;
        dest->elems = new_buffer;
        dest->alloc = new_alloc;
      }

    /* With an empty DEST the union is simply a copy of SRC.  */
    if (BE (dest->nelem == 0, 0))
      {
        dest->nelem = src->nelem;
        memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
        return REG_NOERROR;
      }

    /* Copy into the top of DEST the items of SRC that are not
       found in DEST.  Maybe we could binary search in DEST?
       SBASE starts one past the top of the staging area and is lowered
       for every value that gets staged.  */
    for (sbase = dest->nelem + 2 * src->nelem,
         is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
      {
        if (dest->elems[id] == src->elems[is])
          is--, id--;
        else if (dest->elems[id] < src->elems[is])
          dest->elems[--sbase] = src->elems[is--];
        else /* if (dest->elems[id] > src->elems[is]) */
          --id;
      }

    if (is >= 0)
      {
        /* If DEST is exhausted, the remaining items of SRC must be unique.  */
        sbase -= is + 1;
        memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
      }

    /* ID is the last original DEST element, IS the last staged value and
       DELTA the number of staged values.  */
    id = dest->nelem - 1;
    is = dest->nelem + 2 * src->nelem - 1;
    delta = is - sbase + 1;
    /* Nothing was staged: every element of SRC is already in DEST.  */
    if (delta == 0)
      return REG_NOERROR;

    /* Now copy.  When DELTA becomes zero, the remaining
       DEST elements are already in place.  */
    dest->nelem += delta;
    for (;;)
      {
        if (dest->elems[is] > dest->elems[id])
          {
            /* Copy from the top.  */
            dest->elems[id + delta--] = dest->elems[is--];
            if (delta == 0)
              break;
          }
        else
          {
            /* Slide from the bottom.  */
            dest->elems[id + delta] = dest->elems[id];
            if (--id < 0)
              {
                /* Copy remaining SRC elements.  */
                memcpy (dest->elems, dest->elems + sbase,
                        delta * sizeof (int));
                break;
              }
          }
      }

    return REG_NOERROR;
  }
  1077. /* Insert the new element ELEM to the re_node_set* SET.
  1078. SET should not already have ELEM.
  1079. return -1 if an error is occured, return 1 otherwise. */
  1080. static int
  1081. internal_function
  1082. re_node_set_insert (re_node_set *set, int elem)
  1083. {
  1084. int idx;
  1085. /* In case the set is empty. */
  1086. if (set->alloc == 0)
  1087. {
  1088. if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
  1089. return 1;
  1090. else
  1091. return -1;
  1092. }
  1093. if (BE (set->nelem, 0) == 0)
  1094. {
  1095. /* We already guaranteed above that set->alloc != 0. */
  1096. set->elems[0] = elem;
  1097. ++set->nelem;
  1098. return 1;
  1099. }
  1100. /* Realloc if we need. */
  1101. if (set->alloc == set->nelem)
  1102. {
  1103. int *new_elems;
  1104. set->alloc = set->alloc * 2;
  1105. new_elems = re_realloc (set->elems, int, set->alloc);
  1106. if (BE (new_elems == NULL, 0))
  1107. return -1;
  1108. set->elems = new_elems;
  1109. }
  1110. /* Move the elements which follows the new element. Test the
  1111. first element separately to skip a check in the inner loop. */
  1112. if (elem < set->elems[0])
  1113. {
  1114. idx = 0;
  1115. for (idx = set->nelem; idx > 0; idx--)
  1116. set->elems[idx] = set->elems[idx - 1];
  1117. }
  1118. else
  1119. {
  1120. for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
  1121. set->elems[idx] = set->elems[idx - 1];
  1122. }
  1123. /* Insert the new element. */
  1124. set->elems[idx] = elem;
  1125. ++set->nelem;
  1126. return 1;
  1127. }
  1128. /* Insert the new element ELEM to the re_node_set* SET.
  1129. SET should not already have any element greater than or equal to ELEM.
  1130. Return -1 if an error is occured, return 1 otherwise. */
  1131. static int
  1132. internal_function
  1133. re_node_set_insert_last (re_node_set *set, int elem)
  1134. {
  1135. /* Realloc if we need. */
  1136. if (set->alloc == set->nelem)
  1137. {
  1138. int *new_elems;
  1139. set->alloc = (set->alloc + 1) * 2;
  1140. new_elems = re_realloc (set->elems, int, set->alloc);
  1141. if (BE (new_elems == NULL, 0))
  1142. return -1;
  1143. set->elems = new_elems;
  1144. }
  1145. /* Insert the new element. */
  1146. set->elems[set->nelem++] = elem;
  1147. return 1;
  1148. }
  1149. /* Compare two node sets SET1 and SET2.
  1150. return 1 if SET1 and SET2 are equivalent, return 0 otherwise. */
  1151. static int
  1152. internal_function __attribute ((pure))
  1153. re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
  1154. {
  1155. int i;
  1156. if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
  1157. return 0;
  1158. for (i = set1->nelem ; --i >= 0 ; )
  1159. if (set1->elems[i] != set2->elems[i])
  1160. return 0;
  1161. return 1;
  1162. }
  1163. /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
  1164. static int
  1165. internal_function __attribute ((pure))
  1166. re_node_set_contains (const re_node_set *set, int elem)
  1167. {
  1168. unsigned int idx, right, mid;
  1169. if (set->nelem <= 0)
  1170. return 0;
  1171. /* Binary search the element. */
  1172. idx = 0;
  1173. right = set->nelem - 1;
  1174. while (idx < right)
  1175. {
  1176. mid = (idx + right) / 2;
  1177. if (set->elems[mid] < elem)
  1178. idx = mid + 1;
  1179. else
  1180. right = mid;
  1181. }
  1182. return set->elems[idx] == elem ? idx + 1 : 0;
  1183. }
  1184. static void
  1185. internal_function
  1186. re_node_set_remove_at (re_node_set *set, int idx)
  1187. {
  1188. if (idx < 0 || idx >= set->nelem)
  1189. return;
  1190. --set->nelem;
  1191. for (; idx < set->nelem; idx++)
  1192. set->elems[idx] = set->elems[idx + 1];
  1193. }
  1194. /* Add the token TOKEN to dfa->nodes, and return the index of the token.
  1195. Or return -1, if an error will be occured. */
  1196. static int
  1197. internal_function
  1198. re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
  1199. {
  1200. #ifdef RE_ENABLE_I18N
  1201. int type = token.type;
  1202. #endif
  1203. if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
  1204. {
  1205. size_t new_nodes_alloc = dfa->nodes_alloc * 2;
  1206. int *new_nexts, *new_indices;
  1207. re_node_set *new_edests, *new_eclosures;
  1208. re_token_t *new_nodes;
  1209. /* Avoid overflows. */
  1210. if (BE (new_nodes_alloc < dfa->nodes_alloc, 0))
  1211. return -1;
  1212. new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
  1213. if (BE (new_nodes == NULL, 0))
  1214. return -1;
  1215. dfa->nodes = new_nodes;
  1216. new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
  1217. new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
  1218. new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
  1219. new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
  1220. if (BE (new_nexts == NULL || new_indices == NULL
  1221. || new_edests == NULL || new_eclosures == NULL, 0))
  1222. return -1;
  1223. dfa->nexts = new_nexts;
  1224. dfa->org_indices = new_indices;
  1225. dfa->edests = new_edests;
  1226. dfa->eclosures = new_eclosures;
  1227. dfa->nodes_alloc = new_nodes_alloc;
  1228. }
  1229. dfa->nodes[dfa->nodes_len] = token;
  1230. dfa->nodes[dfa->nodes_len].constraint = 0;
  1231. #ifdef RE_ENABLE_I18N
  1232. dfa->nodes[dfa->nodes_len].accept_mb =
  1233. (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
  1234. #endif
  1235. dfa->nexts[dfa->nodes_len] = -1;
  1236. re_node_set_init_empty (dfa->edests + dfa->nodes_len);
  1237. re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
  1238. return dfa->nodes_len++;
  1239. }
  1240. static inline unsigned int
  1241. internal_function
  1242. calc_state_hash (const re_node_set *nodes, unsigned int context)
  1243. {
  1244. unsigned int hash = nodes->nelem + context;
  1245. int i;
  1246. for (i = 0 ; i < nodes->nelem ; i++)
  1247. hash += nodes->elems[i];
  1248. return hash;
  1249. }
  1250. /* Search for the state whose node_set is equivalent to NODES.
  1251. Return the pointer to the state, if we found it in the DFA.
  1252. Otherwise create the new one and return it. In case of an error
  1253. return NULL and set the error code in ERR.
  1254. Note: - We assume NULL as the invalid state, then it is possible that
  1255. return value is NULL and ERR is REG_NOERROR.
  1256. - We never return non-NULL value in case of any errors, it is for
  1257. optimization. */
  1258. static re_dfastate_t *
  1259. internal_function
  1260. re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
  1261. const re_node_set *nodes)
  1262. {
  1263. unsigned int hash;
  1264. re_dfastate_t *new_state;
  1265. struct re_state_table_entry *spot;
  1266. int i;
  1267. if (BE (nodes->nelem == 0, 0))
  1268. {
  1269. *err = REG_NOERROR;
  1270. return NULL;
  1271. }
  1272. hash = calc_state_hash (nodes, 0);
  1273. spot = dfa->state_table + (hash & dfa->state_hash_mask);
  1274. for (i = 0 ; i < spot->num ; i++)
  1275. {
  1276. re_dfastate_t *state = spot->array[i];
  1277. if (hash != state->hash)
  1278. continue;
  1279. if (re_node_set_compare (&state->nodes, nodes))
  1280. return state;
  1281. }
  1282. /* There are no appropriate state in the dfa, create the new one. */
  1283. new_state = create_ci_newstate (dfa, nodes, hash);
  1284. if (BE (new_state == NULL, 0))
  1285. *err = REG_ESPACE;
  1286. return new_state;
  1287. }
  1288. /* Search for the state whose node_set is equivalent to NODES and
  1289. whose context is equivalent to CONTEXT.
  1290. Return the pointer to the state, if we found it in the DFA.
  1291. Otherwise create the new one and return it. In case of an error
  1292. return NULL and set the error code in ERR.
  1293. Note: - We assume NULL as the invalid state, then it is possible that
  1294. return value is NULL and ERR is REG_NOERROR.
  1295. - We never return non-NULL value in case of any errors, it is for
  1296. optimization. */
  1297. static re_dfastate_t *
  1298. internal_function
  1299. re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
  1300. const re_node_set *nodes, unsigned int context)
  1301. {
  1302. unsigned int hash;
  1303. re_dfastate_t *new_state;
  1304. struct re_state_table_entry *spot;
  1305. int i;
  1306. if (nodes->nelem == 0)
  1307. {
  1308. *err = REG_NOERROR;
  1309. return NULL;
  1310. }
  1311. hash = calc_state_hash (nodes, context);
  1312. spot = dfa->state_table + (hash & dfa->state_hash_mask);
  1313. for (i = 0 ; i < spot->num ; i++)
  1314. {
  1315. re_dfastate_t *state = spot->array[i];
  1316. if (state->hash == hash
  1317. && state->context == context
  1318. && re_node_set_compare (state->entrance_nodes, nodes))
  1319. return state;
  1320. }
  1321. /* There are no appropriate state in `dfa', create the new one. */
  1322. new_state = create_cd_newstate (dfa, nodes, context, hash);
  1323. if (BE (new_state == NULL, 0))
  1324. *err = REG_ESPACE;
  1325. return new_state;
  1326. }
  1327. /* Finish initialization of the new state NEWSTATE, and using its hash value
  1328. HASH put in the appropriate bucket of DFA's state table. Return value
  1329. indicates the error code if failed. */
  1330. static reg_errcode_t
  1331. register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
  1332. unsigned int hash)
  1333. {
  1334. struct re_state_table_entry *spot;
  1335. reg_errcode_t err;
  1336. int i;
  1337. newstate->hash = hash;
  1338. err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
  1339. if (BE (err != REG_NOERROR, 0))
  1340. return REG_ESPACE;
  1341. for (i = 0; i < newstate->nodes.nelem; i++)
  1342. {
  1343. int elem = newstate->nodes.elems[i];
  1344. if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
  1345. re_node_set_insert_last (&newstate->non_eps_nodes, elem);
  1346. }
  1347. spot = dfa->state_table + (hash & dfa->state_hash_mask);
  1348. if (BE (spot->alloc <= spot->num, 0))
  1349. {
  1350. int new_alloc = 2 * spot->num + 2;
  1351. re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
  1352. new_alloc);
  1353. if (BE (new_array == NULL, 0))
  1354. return REG_ESPACE;
  1355. spot->array = new_array;
  1356. spot->alloc = new_alloc;
  1357. }
  1358. spot->array[spot->num++] = newstate;
  1359. return REG_NOERROR;
  1360. }
  1361. static void
  1362. free_state (re_dfastate_t *state)
  1363. {
  1364. re_node_set_free (&state->non_eps_nodes);
  1365. re_node_set_free (&state->inveclosure);
  1366. if (state->entrance_nodes != &state->nodes)
  1367. {
  1368. re_node_set_free (state->entrance_nodes);
  1369. re_free (state->entrance_nodes);
  1370. }
  1371. re_node_set_free (&state->nodes);
  1372. re_free (state->word_trtable);
  1373. re_free (state->trtable);
  1374. re_free (state);
  1375. }
  1376. /* Create the new state which is independ of contexts.
  1377. Return the new state if succeeded, otherwise return NULL. */
  1378. static re_dfastate_t *
  1379. internal_function
  1380. create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
  1381. unsigned int hash)
  1382. {
  1383. int i;
  1384. reg_errcode_t err;
  1385. re_dfastate_t *newstate;
  1386. newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
  1387. if (BE (newstate == NULL, 0))
  1388. return NULL;
  1389. err = re_node_set_init_copy (&newstate->nodes, nodes);
  1390. if (BE (err != REG_NOERROR, 0))
  1391. {
  1392. re_free (newstate);
  1393. return NULL;
  1394. }
  1395. newstate->entrance_nodes = &newstate->nodes;
  1396. for (i = 0 ; i < nodes->nelem ; i++)
  1397. {
  1398. re_token_t *node = dfa->nodes + nodes->elems[i];
  1399. re_token_type_t type = node->type;
  1400. if (type == CHARACTER && !node->constraint)
  1401. continue;
  1402. #ifdef RE_ENABLE_I18N
  1403. newstate->accept_mb |= node->accept_mb;
  1404. #endif /* RE_ENABLE_I18N */
  1405. /* If the state has the halt node, the state is a halt state. */
  1406. if (type == END_OF_RE)
  1407. newstate->halt = 1;
  1408. else if (type == OP_BACK_REF)
  1409. newstate->has_backref = 1;
  1410. else if (type == ANCHOR || node->constraint)
  1411. newstate->has_constraint = 1;
  1412. }
  1413. err = register_state (dfa, newstate, hash);
  1414. if (BE (err != REG_NOERROR, 0))
  1415. {
  1416. free_state (newstate);
  1417. newstate = NULL;
  1418. }
  1419. return newstate;
  1420. }
  1421. /* Create the new state which is depend on the context CONTEXT.
  1422. Return the new state if succeeded, otherwise return NULL. */
  1423. static re_dfastate_t *
  1424. internal_function
  1425. create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
  1426. unsigned int context, unsigned int hash)
  1427. {
  1428. int i, nctx_nodes = 0;
  1429. reg_errcode_t err;
  1430. re_dfastate_t *newstate;
  1431. newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
  1432. if (BE (newstate == NULL, 0))
  1433. return NULL;
  1434. err = re_node_set_init_copy (&newstate->nodes, nodes);
  1435. if (BE (err != REG_NOERROR, 0))
  1436. {
  1437. re_free (newstate);
  1438. return NULL;
  1439. }
  1440. newstate->context = context;
  1441. newstate->entrance_nodes = &newstate->nodes;
  1442. for (i = 0 ; i < nodes->nelem ; i++)
  1443. {
  1444. unsigned int constraint = 0;
  1445. re_token_t *node = dfa->nodes + nodes->elems[i];
  1446. re_token_type_t type = node->type;
  1447. if (node->constraint)
  1448. constraint = node->constraint;
  1449. if (type == CHARACTER && !constraint)
  1450. continue;
  1451. #ifdef RE_ENABLE_I18N
  1452. newstate->accept_mb |= node->accept_mb;
  1453. #endif /* RE_ENABLE_I18N */
  1454. /* If the state has the halt node, the state is a halt state. */
  1455. if (type == END_OF_RE)
  1456. newstate->halt = 1;
  1457. else if (type == OP_BACK_REF)
  1458. newstate->has_backref = 1;
  1459. else if (type == ANCHOR)
  1460. constraint = node->opr.ctx_type;
  1461. if (constraint)
  1462. {
  1463. if (newstate->entrance_nodes == &newstate->nodes)
  1464. {
  1465. newstate->entrance_nodes = re_malloc (re_node_set, 1);
  1466. if (BE (newstate->entrance_nodes == NULL, 0))
  1467. {
  1468. free_state (newstate);
  1469. return NULL;
  1470. }
  1471. re_node_set_init_copy (newstate->entrance_nodes, nodes);
  1472. nctx_nodes = 0;
  1473. newstate->has_constraint = 1;
  1474. }
  1475. if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
  1476. {
  1477. re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
  1478. ++nctx_nodes;
  1479. }
  1480. }
  1481. }
  1482. err = register_state (dfa, newstate, hash);
  1483. if (BE (err != REG_NOERROR, 0))
  1484. {
  1485. free_state (newstate);
  1486. newstate = NULL;
  1487. }
  1488. return newstate;
  1489. }