? grep.cat1 Index: grep.c =================================================================== RCS file: /cvsroot/othersrc/usr.bin/grep/grep.c,v retrieving revision 1.43 diff -u -u -r1.43 grep.c --- grep.c 8 Nov 2003 20:10:08 -0000 1.43 +++ grep.c 8 Nov 2003 21:39:19 -0000 @@ -64,6 +64,8 @@ char **pattern; regex_t *r_pattern; +fastgrep_t *fg_pattern; + /* For regex errors */ char re_error[RE_ERROR_BUF + 1]; @@ -80,6 +82,7 @@ int bflag; /* -b: show block numbers for each match */ int cflag; /* -c: only show a count of matching lines */ int hflag; /* -h: Never print filenames. -H overrides */ +int iflag; /* -i: Ignore case */ int lflag; /* -l: only show names of files with matches */ int mflag; /* -m: specify maximum line matches (per file) */ int nflag; /* -n: show line numbers in front of matching lines */ @@ -124,6 +127,8 @@ /* Housekeeping */ int first; /* flag whether or not this is our first match */ int tail; /* lines left to print */ +int boleol; /* At least one pattern has a BOL or EOL */ +size_t maxPatternLen = 0; /* Longest length of all patterns */ static void usage(void) @@ -201,6 +206,9 @@ strncpy(pattern[patterns], pat, len); pattern[patterns][len] = '\0'; ++patterns; + + if (len > maxPatternLen) + maxPatternLen = len; } static void @@ -231,8 +239,26 @@ fclose(f); } +static void +free_patterns(void) +{ + int i; + + for (i = 0; i < patterns; i++) { + if (fg_pattern[i].pattern) + free(fg_pattern[i].pattern); + else + regfree(&r_pattern[i]); + free(pattern[i]); + } + free(fg_pattern); + free(r_pattern); + free(pattern); +} + static int -check_context_arg(char const *str) { +check_context_arg(char const *str) +{ char *ep; long lval; @@ -414,6 +440,7 @@ break; case 'i': case 'y': + iflag = 1; cflags |= REG_ICASE; break; case 'l': @@ -529,11 +556,19 @@ cflags |= REG_EXTENDED; else if (Fflag) cflags |= REG_NOSPEC; + + fg_pattern = grep_malloc(patterns * sizeof(*fg_pattern)); r_pattern = grep_malloc(patterns * sizeof(*r_pattern)); + for (i = 0; i < patterns; ++i) { - if ((c = regcomp(&r_pattern[i], pattern[i], cflags))) { - regerror(c, &r_pattern[i], re_error, RE_ERROR_BUF); - errx(2, "%s", re_error); + /* Check if fast algorithm is allowed */ + if (fastcomp(&fg_pattern[i], pattern[i])) { + /* Use regex library */ + if ((c = regcomp(&r_pattern[i], pattern[i], cflags))) { + regerror(c, &r_pattern[i], re_error, + RE_ERROR_BUF); + errx(2, "%s", re_error); + } } } @@ -552,6 +587,8 @@ else for (c = 0; argc--; ++argv) c += procfile(*argv); + + free_patterns(); exit(!c); } Index: grep.h =================================================================== RCS file: /cvsroot/othersrc/usr.bin/grep/grep.h,v retrieving revision 1.25 diff -u -u -r1.25 grep.h --- grep.h 8 Nov 2003 17:24:07 -0000 1.25 +++ grep.h 8 Nov 2003 21:39:20 -0000 @@ -28,11 +28,12 @@ #include +#include #include #include #include -#define VERSION "20031108" +#define VERSION "20031108a" #define GREP_READ 0 #define GREP_SKIP 1 @@ -56,11 +57,21 @@ char *dat; } str_t; +typedef struct { + unsigned char *pattern; + int patternLen; + int qsBc[UCHAR_MAX + 1]; + /* flags */ + int bol; + int eol; + int reversedSearch; +} fastgrep_t; + /* Flags passed to regcomp() and regexec() */ extern int cflags, eflags; /* Command line flags */ -extern int Aflag, Bflag, Fflag, Lflag, bflag, cflag, lflag, mflag, +extern int Aflag, Bflag, Fflag, Lflag, bflag, cflag, iflag, lflag, mflag, nflag, oflag, qflag, sflag, vflag, wflag, xflag; extern int colours; @@ -77,10 +88,14 @@ extern int binbehave, dirbehave, devbehave; /* extern int linkbehave; */ +extern int boleol; +extern size_t maxPatternLen; extern char *stdin_label; extern int first, matchall, patterns, tail; extern char **pattern; + +extern fastgrep_t *fg_pattern; extern regex_t *r_pattern; /* @@ -96,7 +111,9 @@ void *grep_malloc(size_t size); void *grep_realloc(void *ptr, size_t size); +unsigned char *grep_strdup(const char *); void printline(str_t *line, int sep, regmatch_t *matches, int m); +int fastcomp(fastgrep_t *, const char *); /* queue.c */ void initqueue(void); Index: mmfile.c =================================================================== RCS file: /cvsroot/othersrc/usr.bin/grep/mmfile.c,v retrieving revision 1.5 diff -u -u -r1.5 mmfile.c --- mmfile.c 8 Nov 2003 20:11:19 -0000 1.5 +++ mmfile.c 8 Nov 2003 21:39:20 -0000 @@ -44,6 +44,7 @@ #include "grep.h" #define MAX_MAP_LEN 1048576 +#define BLOCKSIZE 32768 mmf_t * mmopen(char *fn, char *mode) @@ -94,9 +95,26 @@ if (mmf->ptr >= mmf->end) return NULL; - for (p = mmf->ptr; mmf->ptr < mmf->end; ++mmf->ptr) - if (*mmf->ptr == line_endchar) - break; + + if ((lflag || qflag) && !boleol) { + /* Find starting point to search */ + if (mmf->ptr == mmf->base) + p = mmf->ptr; + else + p = mmf->ptr - maxPatternLen; + + /* Move the start pointer ahead for next iteration */ + if (mmf->end - mmf->ptr > BLOCKSIZE) + mmf->ptr += BLOCKSIZE; + else + mmf->ptr = mmf->end; + } else { + + for (p = mmf->ptr; mmf->ptr < mmf->end; ++mmf->ptr) + if (*mmf->ptr == line_endchar) + break; + } + *l = mmf->ptr - p; ++mmf->ptr; return p; Index: util.c =================================================================== RCS file: /cvsroot/othersrc/usr.bin/grep/util.c,v retrieving revision 1.29 diff -u -u -r1.29 util.c --- util.c 8 Nov 2003 17:24:07 -0000 1.29 +++ util.c 8 Nov 2003 21:39:21 -0000 @@ -54,6 +54,9 @@ static int linesqueued, newfile; static int procline(str_t *l, int nottext); +static int grep_search(fastgrep_t *, unsigned char *, int, regmatch_t *); +static int grep_cmp(const unsigned char *, const unsigned char *, size_t); +static void grep_revstr(unsigned char *, int); int grep_tree(char **argv) @@ -223,7 +226,14 @@ pmatch.rm_so = st; pmatch.rm_eo = l->len; for (i = 0; i < patterns; i++) { - r = regexec(&r_pattern[i], l->dat, 1, &pmatch, eflags); + if (fg_pattern[i].pattern) + r = grep_search(&fg_pattern[i], + (unsigned char *)l->dat, + l->len, &pmatch); + else + r = regexec(&r_pattern[i], l->dat, 1, &pmatch, + eflags); + if (r == REG_NOMATCH && t == 0) continue; if (r == 0) { @@ -286,6 +296,212 @@ return c; } +int +fastcomp(fastgrep_t *fg, const char *fast_pattern) +{ + int i; + int bol = 0; + int eol = 0; + int origPatternLen; + int shiftPatternLen; + int hasDot = 0; + int firstHalfDot = -1; + int firstLastHalfDot = -1; + int lastHalfDot = 0; + + if (Fflag) { + fg->pattern = NULL; + return (-1); + } + + /* Initialize. */ + origPatternLen = fg->patternLen = strlen(fast_pattern); + fg->bol = 0; + fg->eol = 0; + fg->reversedSearch = 0; + + /* Remove end-of-line character ('$'). */ + if (fast_pattern[fg->patternLen - 1] == '$') { + eol++; + fg->eol = 1; + fg->patternLen--; + boleol = 1; + } + + /* Remove beginning-of-line character ('^'). */ + if (fast_pattern[0] == '^') { + bol++; + fg->bol = 1; + fg->patternLen--; + boleol = 1; + } + + /* + * Copy pattern minus '^' and '$' characters at the beginning and + * ending of the string respectively. + */ + fg->pattern = grep_strdup(fast_pattern + bol); + + /* Look for ways to cheat...er...avoid the full regex engine. */ + for (i = 0; i < fg->patternLen; i++) + { + /* Can still cheat? */ + if ((isalnum(fg->pattern[i])) || isspace(fg->pattern[i]) || + (fg->pattern[i] == '_') || (fg->pattern[i] == ',') || + (fg->pattern[i] == '^') || (fg->pattern[i] == '$') || + (fg->pattern[i] == '=') || (fg->pattern[i] == '-') || + (fg->pattern[i] == ':') || (fg->pattern[i] == '/')) { + /* As long as it is good, upper case it for later. */ + if (iflag) + fg->pattern[i] = toupper(fg->pattern[i]); + } else if (fg->pattern[i] == '.') { + hasDot = i; + if (i < fg->patternLen / 2) { + if (firstHalfDot < -1) + /* Closest dot to the beginning */ + firstHalfDot = i; + } else { + /* Closest dot to the end of the pattern. */ + lastHalfDot = i; + if (firstLastHalfDot < 0) + firstLastHalfDot = i; + } + } else { + /* Free memory and let others know this is empty. */ + free(fg->pattern); + fg->pattern = NULL; + return (-1); + } + } + + /* + * Determine if a reverse search would be faster based on the placement + * of the dots. + */ + if ((!(lflag || cflag)) && ((!(bol || eol)) && + ((lastHalfDot) && ((firstHalfDot < 0) || + ((fg->patternLen - (lastHalfDot + 1)) < firstHalfDot))))) { + fg->reversedSearch = 1; + hasDot = fg->patternLen - (firstHalfDot < 0 ? + firstLastHalfDot : firstHalfDot) - 1; + grep_revstr(fg->pattern, fg->patternLen); + } + + /* + * Normal Quick Search would require a shift based on the position the + * next character after the comparison is within the pattern. With + * wildcards, the position of the last dot effects the maximum shift + * distance. + * The closer to the end the wild card is the slower the search. A + * reverse version of this algorithm would be useful for wildcards near + * the end of the string. + * + * Examples: + * Pattern Max shift + * ------- --------- + * this 5 + * .his 4 + * t.is 3 + * th.s 2 + * thi. 1 + */ + + /* Adjust the shift based on location of the last dot ('.'). */ + shiftPatternLen = fg->patternLen - hasDot; + + /* Preprocess pattern. */ + for (i = 0; i <= UCHAR_MAX; i++) + fg->qsBc[i] = shiftPatternLen; + for (i = hasDot + 1; i < fg->patternLen; i++) { + fg->qsBc[fg->pattern[i]] = fg->patternLen - i; + /* + * If case is ignored, make the jump apply to both upper and + * lower cased characters. As the pattern is stored in upper + * case, apply the same to the lower case equivalents. + */ + if (iflag) + fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i; + } + + /* + * Put pattern back to normal after pre-processing to allow for easy + * comparisons later. + */ + if (fg->reversedSearch) + grep_revstr(fg->pattern, fg->patternLen); + + return (0); +} + + +static int +grep_search(fastgrep_t *fg, unsigned char *data, int dataLen, regmatch_t *pmatch) +{ + int j; + int rtrnVal = REG_NOMATCH; + + pmatch->rm_so = -1; + pmatch->rm_eo = -1; + + /* No point in going farther if we do not have enough data. */ + if (dataLen < fg->patternLen) + return (rtrnVal); + + /* Only try once at the beginning or ending of the line. */ + if (fg->bol || fg->eol) { + /* Simple text comparison. */ + /* Verify data is >= pattern length before searching on it. */ + if (dataLen >= fg->patternLen) { + /* Determine where in data to start search at. */ + if (fg->eol) + j = dataLen - fg->patternLen; + else + j = 0; + if (!((fg->bol && fg->eol) && (dataLen != fg->patternLen))) + if (grep_cmp(fg->pattern, data + j, fg->patternLen) == -1) { + rtrnVal = 0; + pmatch->rm_so = j; + pmatch->rm_eo = j + fg->patternLen; + } + } + } else if (fg->reversedSearch) { + /* Quick Search algorithm. */ + j = dataLen; + do { + if (grep_cmp(fg->pattern, data + j - fg->patternLen, + fg->patternLen) == -1) { + rtrnVal = 0; + pmatch->rm_so = j - fg->patternLen; + pmatch->rm_eo = j; + break; + } + /* Shift if within bounds, otherwise, we are done. */ + if (j == fg->patternLen) + break; + j -= fg->qsBc[data[j - fg->patternLen - 1]]; + } while (j >= fg->patternLen); + } else { + /* Quick Search algorithm. */ + j = 0; + do { + if (grep_cmp(fg->pattern, data + j, fg->patternLen) == -1) { + rtrnVal = 0; + pmatch->rm_so = j; + pmatch->rm_eo = j + fg->patternLen; + break; + } + + /* Shift if within bounds, otherwise, we are done. */ + if (j + fg->patternLen == dataLen) + break; + else + j += fg->qsBc[data[j + fg->patternLen]]; + } while (j <= (dataLen - fg->patternLen)); + } + + return (rtrnVal); +} + void * grep_malloc(size_t size) { @@ -303,6 +519,48 @@ err(2, "realloc"); return ptr; } + +unsigned char * +grep_strdup(const char *str) +{ + unsigned char *ptr; + + if ((ptr = (unsigned char *)strdup(str)) == NULL) + err(2, "strdup"); + return ptr; +} + +/* + * Returns: i >= 0 on failure (position that it failed) + * -1 on success + */ +static int +grep_cmp(const unsigned char *pat, const unsigned char *data, size_t len) +{ + int i; + + for (i = 0; i < len; i++) { + if (((pat[i] == data[i]) || (pat[i] == '.')) || + (iflag && pat[i] == toupper(data[i]))) + continue; + return (i); + } + + return (-1); +} + +static void +grep_revstr(unsigned char *str, int len) +{ + int i; + char c; + + for (i = 0; i < len / 2; i++) { + c = str[i]; + str[i] = str[len - i - 1]; + str[len - i - 1] = c; + } +} void printline(str_t *line, int sep, regmatch_t *matches, int m)