fastgrep.c 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336
  1. /* $OpenBSD: util.c,v 1.36 2007/10/02 17:59:18 otto Exp $ */
  2. /* $FreeBSD: head/usr.bin/grep/fastgrep.c 211496 2010-08-19 09:28:59Z des $ */
  3. /*-
  4. * Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav
  5. * Copyright (C) 2008 Gabor Kovesdan <gabor@FreeBSD.org>
  6. * All rights reserved.
  7. *
  8. * Redistribution and use in source and binary forms, with or without
  9. * modification, are permitted provided that the following conditions
  10. * are met:
  11. * 1. Redistributions of source code must retain the above copyright
  12. * notice, this list of conditions and the following disclaimer.
  13. * 2. Redistributions in binary form must reproduce the above copyright
  14. * notice, this list of conditions and the following disclaimer in the
  15. * documentation and/or other materials provided with the distribution.
  16. *
  17. * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  18. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20. * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  21. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  22. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  23. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  24. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  25. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  26. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  27. * SUCH DAMAGE.
  28. */
  29. /*
  30. * XXX: This file is a speed up for grep to cover the defects of the
  31. * regex library. These optimizations should practically be implemented
  32. * there keeping this code clean. This is a future TODO, but for the
  33. * meantime, we need to use this workaround.
  34. */
  35. #if HAVE_NBTOOL_CONFIG_H
  36. #include "nbtool_config.h"
  37. #endif
  38. #include <sys/cdefs.h>
  39. __RCSID("$NetBSD: fastgrep.c,v 1.5 2011/04/18 03:27:40 joerg Exp $");
  40. #include <limits.h>
  41. #include <stdbool.h>
  42. #include <stdlib.h>
  43. #include <string.h>
  44. #include <wchar.h>
  45. #include <wctype.h>
  46. #include "grep.h"
  47. static inline int grep_cmp(const unsigned char *, const unsigned char *, size_t);
  48. static inline void grep_revstr(unsigned char *, int);
  49. void
  50. fgrepcomp(fastgrep_t *fg, const char *pat)
  51. {
  52. unsigned int i;
  53. /* Initialize. */
  54. fg->len = strlen(pat);
  55. fg->bol = false;
  56. fg->eol = false;
  57. fg->reversed = false;
  58. fg->pattern = (unsigned char *)grep_strdup(pat);
  59. /* Preprocess pattern. */
  60. for (i = 0; i <= UCHAR_MAX; i++)
  61. fg->qsBc[i] = fg->len;
  62. for (i = 1; i < fg->len; i++)
  63. fg->qsBc[fg->pattern[i]] = fg->len - i;
  64. }
  65. /*
  66. * Returns: -1 on failure, 0 on success
  67. */
  68. int
  69. fastcomp(fastgrep_t *fg, const char *pat)
  70. {
  71. unsigned int i;
  72. int firstHalfDot = -1;
  73. int firstLastHalfDot = -1;
  74. int hasDot = 0;
  75. int lastHalfDot = 0;
  76. int shiftPatternLen;
  77. /* Initialize. */
  78. fg->len = strlen(pat);
  79. fg->bol = false;
  80. fg->eol = false;
  81. fg->reversed = false;
  82. fg->word = wflag;
  83. /* Remove end-of-line character ('$'). */
  84. if (fg->len > 0 && pat[fg->len - 1] == '$') {
  85. fg->eol = true;
  86. fg->len--;
  87. }
  88. /* Remove beginning-of-line character ('^'). */
  89. if (pat[0] == '^') {
  90. fg->bol = true;
  91. fg->len--;
  92. pat++;
  93. }
  94. if (fg->len >= 14 &&
  95. memcmp(pat, "[[:<:]]", 7) == 0 &&
  96. memcmp(pat + fg->len - 7, "[[:>:]]", 7) == 0) {
  97. fg->len -= 14;
  98. pat += 7;
  99. /* Word boundary is handled separately in util.c */
  100. fg->word = true;
  101. }
  102. /*
  103. * pat has been adjusted earlier to not include '^', '$' or
  104. * the word match character classes at the beginning and ending
  105. * of the string respectively.
  106. */
  107. fg->pattern = grep_malloc(fg->len + 1);
  108. memcpy(fg->pattern, pat, fg->len);
  109. fg->pattern[fg->len] = '\0';
  110. /* Look for ways to cheat...er...avoid the full regex engine. */
  111. for (i = 0; i < fg->len; i++) {
  112. /* Can still cheat? */
  113. if (fg->pattern[i] == '.') {
  114. hasDot = i;
  115. if (i < fg->len / 2) {
  116. if (firstHalfDot < 0)
  117. /* Closest dot to the beginning */
  118. firstHalfDot = i;
  119. } else {
  120. /* Closest dot to the end of the pattern. */
  121. lastHalfDot = i;
  122. if (firstLastHalfDot < 0)
  123. firstLastHalfDot = i;
  124. }
  125. } else {
  126. /* Free memory and let others know this is empty. */
  127. free(fg->pattern);
  128. fg->pattern = NULL;
  129. return (-1);
  130. }
  131. }
  132. /*
  133. * Determine if a reverse search would be faster based on the placement
  134. * of the dots.
  135. */
  136. if ((!(lflag || cflag)) && ((!(fg->bol || fg->eol)) &&
  137. ((lastHalfDot) && ((firstHalfDot < 0) ||
  138. ((fg->len - (lastHalfDot + 1)) < (size_t)firstHalfDot)))) &&
  139. !oflag && !color) {
  140. fg->reversed = true;
  141. hasDot = fg->len - (firstHalfDot < 0 ?
  142. firstLastHalfDot : firstHalfDot) - 1;
  143. grep_revstr(fg->pattern, fg->len);
  144. }
  145. /*
  146. * Normal Quick Search would require a shift based on the position the
  147. * next character after the comparison is within the pattern. With
  148. * wildcards, the position of the last dot effects the maximum shift
  149. * distance.
  150. * The closer to the end the wild card is the slower the search. A
  151. * reverse version of this algorithm would be useful for wildcards near
  152. * the end of the string.
  153. *
  154. * Examples:
  155. * Pattern Max shift
  156. * ------- ---------
  157. * this 5
  158. * .his 4
  159. * t.is 3
  160. * th.s 2
  161. * thi. 1
  162. */
  163. /* Adjust the shift based on location of the last dot ('.'). */
  164. shiftPatternLen = fg->len - hasDot;
  165. /* Preprocess pattern. */
  166. for (i = 0; i <= (signed)UCHAR_MAX; i++)
  167. fg->qsBc[i] = shiftPatternLen;
  168. for (i = hasDot + 1; i < fg->len; i++) {
  169. fg->qsBc[fg->pattern[i]] = fg->len - i;
  170. }
  171. /*
  172. * Put pattern back to normal after pre-processing to allow for easy
  173. * comparisons later.
  174. */
  175. if (fg->reversed)
  176. grep_revstr(fg->pattern, fg->len);
  177. return (0);
  178. }
  179. int
  180. grep_search(fastgrep_t *fg, const unsigned char *data, size_t len, regmatch_t *pmatch)
  181. {
  182. unsigned int j;
  183. int ret = REG_NOMATCH;
  184. if (pmatch->rm_so == (ssize_t)len)
  185. return (ret);
  186. if (fg->bol && pmatch->rm_so != 0) {
  187. pmatch->rm_so = len;
  188. pmatch->rm_eo = len;
  189. return (ret);
  190. }
  191. /* No point in going farther if we do not have enough data. */
  192. if (len < fg->len)
  193. return (ret);
  194. /* Only try once at the beginning or ending of the line. */
  195. if (fg->bol || fg->eol) {
  196. /* Simple text comparison. */
  197. /* Verify data is >= pattern length before searching on it. */
  198. if (len >= fg->len) {
  199. /* Determine where in data to start search at. */
  200. j = fg->eol ? len - fg->len : 0;
  201. if (!((fg->bol && fg->eol) && (len != fg->len)))
  202. if (grep_cmp(fg->pattern, data + j,
  203. fg->len) == -1) {
  204. pmatch->rm_so = j;
  205. pmatch->rm_eo = j + fg->len;
  206. ret = 0;
  207. }
  208. }
  209. } else if (fg->reversed) {
  210. /* Quick Search algorithm. */
  211. j = len;
  212. do {
  213. if (grep_cmp(fg->pattern, data + j - fg->len,
  214. fg->len) == -1) {
  215. pmatch->rm_so = j - fg->len;
  216. pmatch->rm_eo = j;
  217. ret = 0;
  218. break;
  219. }
  220. /* Shift if within bounds, otherwise, we are done. */
  221. if (j == fg->len)
  222. break;
  223. j -= fg->qsBc[data[j - fg->len - 1]];
  224. } while (j >= fg->len);
  225. } else {
  226. /* Quick Search algorithm. */
  227. j = pmatch->rm_so;
  228. do {
  229. if (grep_cmp(fg->pattern, data + j, fg->len) == -1) {
  230. pmatch->rm_so = j;
  231. pmatch->rm_eo = j + fg->len;
  232. ret = 0;
  233. break;
  234. }
  235. /* Shift if within bounds, otherwise, we are done. */
  236. if (j + fg->len == len)
  237. break;
  238. else
  239. j += fg->qsBc[data[j + fg->len]];
  240. } while (j <= (len - fg->len));
  241. }
  242. return (ret);
  243. }
  244. /*
  245. * Returns: i >= 0 on failure (position that it failed)
  246. * -1 on success
  247. */
  248. static inline int
  249. grep_cmp(const unsigned char *pat, const unsigned char *data, size_t len)
  250. {
  251. size_t size;
  252. wchar_t *wdata, *wpat;
  253. unsigned int i;
  254. if (iflag) {
  255. if ((size = mbstowcs(NULL, (const char *)data, 0)) ==
  256. ((size_t) - 1))
  257. return (-1);
  258. wdata = grep_malloc(size * sizeof(wint_t));
  259. if (mbstowcs(wdata, (const char *)data, size) ==
  260. ((size_t) - 1))
  261. return (-1);
  262. if ((size = mbstowcs(NULL, (const char *)pat, 0)) ==
  263. ((size_t) - 1))
  264. return (-1);
  265. wpat = grep_malloc(size * sizeof(wint_t));
  266. if (mbstowcs(wpat, (const char *)pat, size) == ((size_t) - 1))
  267. return (-1);
  268. for (i = 0; i < len; i++) {
  269. if ((towlower(wpat[i]) == towlower(wdata[i])) ||
  270. ((grepbehave != GREP_FIXED) && wpat[i] == L'.'))
  271. continue;
  272. free(wpat);
  273. free(wdata);
  274. return (i);
  275. }
  276. } else {
  277. for (i = 0; i < len; i++) {
  278. if ((pat[i] == data[i]) || ((grepbehave != GREP_FIXED) &&
  279. pat[i] == '.'))
  280. continue;
  281. return (i);
  282. }
  283. }
  284. return (-1);
  285. }
  286. static inline void
  287. grep_revstr(unsigned char *str, int len)
  288. {
  289. int i;
  290. char c;
  291. for (i = 0; i < len / 2; i++) {
  292. c = str[i];
  293. str[i] = str[len - i - 1];
  294. str[len - i - 1] = c;
  295. }
  296. }