00001 #include <my_global.h> 00002 #include <m_string.h> 00003 #include <m_ctype.h> 00004 #ifdef __WIN__ 00005 #include <limits.h> 00006 #endif 00007 00008 #include "my_regex.h" 00009 #include "utils.h" 00010 #include "regex2.h" 00011 00012 #include "cclass.h" 00013 #include "cname.h" 00014 00015 /* 00016 * parse structure, passed up and down to avoid global variables and 00017 * other clumsinesses 00018 */ 00019 struct parse { 00020 char *next; /* next character in RE */ 00021 char *end; /* end of string (-> NUL normally) */ 00022 int error; /* has an error been seen? */ 00023 sop *strip; /* malloced strip */ 00024 sopno ssize; /* malloced strip size (allocated) */ 00025 sopno slen; /* malloced strip length (used) */ 00026 int ncsalloc; /* number of csets allocated */ 00027 struct re_guts *g; 00028 # define NPAREN 10 /* we need to remember () 1-9 for back refs */ 00029 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 00030 sopno pend[NPAREN]; /* -> ) ([0] unused) */ 00031 CHARSET_INFO *charset; /* for ctype things */ 00032 }; 00033 00034 #include "regcomp.ih" 00035 00036 static char nuls[10]; /* place to point scanner in event of error */ 00037 00038 struct cclass cclasses[CCLASS_LAST+1]= { 00039 { "alnum", "","", _MY_U | _MY_L | _MY_NMR}, 00040 { "alpha", "","", _MY_U | _MY_L }, 00041 { "blank", "","", _MY_B }, 00042 { "cntrl", "","", _MY_CTR }, 00043 { "digit", "","", _MY_NMR }, 00044 { "graph", "","", _MY_PNT | _MY_U | _MY_L | _MY_NMR}, 00045 { "lower", "","", _MY_L }, 00046 { "print", "","", _MY_PNT | _MY_U | _MY_L | _MY_NMR | _MY_B }, 00047 { "punct", "","", _MY_PNT }, 00048 { "space", "","", _MY_SPC }, 00049 { "upper", "","", _MY_U }, 00050 { "xdigit", "","", _MY_X }, 00051 { NULL,NULL,NULL, 0 } 00052 }; 00053 00054 /* 00055 * macros for use with parse structure 00056 * BEWARE: these know that the parse structure is named `p' !!! 00057 */ 00058 #define PEEK() (*p->next) 00059 #define PEEK2() (*(p->next+1)) 00060 #define MORE() (p->next < p->end) 00061 #define MORE2() (p->next+1 < p->end) 00062 #define SEE(c) (MORE() && PEEK() == (c)) 00063 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 00064 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 00065 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 00066 #define NEXT() (p->next++) 00067 #define NEXT2() (p->next += 2) 00068 #define NEXTn(n) (p->next += (n)) 00069 #define GETNEXT() (*p->next++) 00070 #define SETERROR(e) seterr(p, (e)) 00071 #define REQUIRE(co, e) ((co) || SETERROR(e)) 00072 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 00073 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) 00074 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 00075 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) 00076 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 00077 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 00078 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 00079 #define HERE() (p->slen) 00080 #define THERE() (p->slen - 1) 00081 #define THERETHERE() (p->slen - 2) 00082 #define DROP(n) (p->slen -= (n)) 00083 00084 #ifndef NDEBUG 00085 static int never = 0; /* for use in asserts; shuts lint up */ 00086 #else 00087 #define never 0 /* some <assert.h>s have bugs too */ 00088 #endif 00089 00090 /* 00091 - regcomp - interface for parser and compilation 00092 = extern int regcomp(regex_t *, const char *, int); 00093 = #define REG_BASIC 0000 00094 = #define REG_EXTENDED 0001 00095 = #define REG_ICASE 0002 00096 = #define REG_NOSUB 0004 00097 = #define REG_NEWLINE 0010 00098 = #define REG_NOSPEC 0020 00099 = #define REG_PEND 0040 00100 = #define REG_DUMP 0200 00101 */ 00102 int /* 0 success, otherwise REG_something */ 00103 my_regcomp(preg, pattern, cflags, charset) 00104 my_regex_t *preg; 00105 const char *pattern; 00106 int cflags; 00107 CHARSET_INFO *charset; 00108 { 00109 struct parse pa; 00110 register struct re_guts *g; 00111 register struct parse *p = &pa; 00112 register int i; 00113 register size_t len; 00114 #ifdef REDEBUG 00115 # define GOODFLAGS(f) (f) 00116 #else 00117 # define GOODFLAGS(f) ((f)&~REG_DUMP) 00118 #endif 00119 00120 my_regex_init(charset); /* Init cclass if neaded */ 00121 preg->charset=charset; 00122 cflags = GOODFLAGS(cflags); 00123 if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 00124 return(REG_INVARG); 00125 00126 if (cflags®_PEND) { 00127 if (preg->re_endp < pattern) 00128 return(REG_INVARG); 00129 len = preg->re_endp - pattern; 00130 } else 00131 len = strlen((char *)pattern); 00132 00133 /* do the mallocs early so failure handling is easy */ 00134 g = (struct re_guts *)malloc(sizeof(struct re_guts) + 00135 (NC-1)*sizeof(cat_t)); 00136 if (g == NULL) 00137 return(REG_ESPACE); 00138 p->ssize = (long) (len/(size_t)2*(size_t)3 + (size_t)1); /* ugh */ 00139 p->strip = (sop *)malloc(p->ssize * sizeof(sop)); 00140 p->slen = 0; 00141 if (p->strip == NULL) { 00142 free((char *)g); 00143 return(REG_ESPACE); 00144 } 00145 00146 /* set things up */ 00147 p->g = g; 00148 p->next = (char *)pattern; /* convenience; we do not modify it */ 00149 p->end = p->next + len; 00150 p->error = 0; 00151 p->ncsalloc = 0; 00152 p->charset = preg->charset; 00153 for (i = 0; i < NPAREN; i++) { 00154 p->pbegin[i] = 0; 00155 p->pend[i] = 0; 00156 } 00157 g->csetsize = NC; 00158 g->sets = NULL; 00159 g->setbits = NULL; 00160 g->ncsets = 0; 00161 g->cflags = cflags; 00162 g->iflags = 0; 00163 g->nbol = 0; 00164 g->neol = 0; 00165 g->must = NULL; 00166 g->mlen = 0; 00167 g->nsub = 0; 00168 g->ncategories = 1; /* category 0 is "everything else" */ 00169 g->categories = &g->catspace[-(CHAR_MIN)]; 00170 (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t)); 00171 g->backrefs = 0; 00172 00173 /* do it */ 00174 EMIT(OEND, 0); 00175 g->firststate = THERE(); 00176 if (cflags®_EXTENDED) 00177 p_ere(p, OUT); 00178 else if (cflags®_NOSPEC) 00179 p_str(p); 00180 else 00181 p_bre(p, OUT, OUT); 00182 EMIT(OEND, 0); 00183 g->laststate = THERE(); 00184 00185 /* tidy up loose ends and fill things in */ 00186 categorize(p, g); 00187 stripsnug(p, g); 00188 findmust(p, g); 00189 g->nplus = pluscount(p, g); 00190 g->magic = MAGIC2; 00191 preg->re_nsub = g->nsub; 00192 preg->re_g = g; 00193 preg->re_magic = MAGIC1; 00194 #ifndef REDEBUG 00195 /* not debugging, so can't rely on the assert() in regexec() */ 00196 if (g->iflags&BAD) 00197 SETERROR(REG_ASSERT); 00198 #endif 00199 00200 /* win or lose, we're done */ 00201 if (p->error != 0) /* lose */ 00202 my_regfree(preg); 00203 return(p->error); 00204 } 00205 00206 /* 00207 - p_ere - ERE parser top level, concatenation and alternation 00208 == static void p_ere(register struct parse *p, int stop); 00209 */ 00210 static void 00211 p_ere(p, stop) 00212 register struct parse *p; 00213 int stop; /* character this ERE should end at */ 00214 { 00215 register char c; 00216 register sopno prevback; 00217 register sopno prevfwd; 00218 register sopno conc; 00219 register int first = 1; /* is this the first alternative? */ 00220 LINT_INIT(prevback); LINT_INIT(prevfwd); 00221 for (;;) { 00222 /* do a bunch of concatenated expressions */ 00223 conc = HERE(); 00224 while (MORE() && (c = PEEK()) != '|' && c != stop) 00225 p_ere_exp(p); 00226 if(REQUIRE(HERE() != conc, REG_EMPTY)) {}/* require nonempty */ 00227 00228 if (!EAT('|')) 00229 break; /* NOTE BREAK OUT */ 00230 00231 if (first) { 00232 INSERT(OCH_, conc); /* offset is wrong */ 00233 prevfwd = conc; 00234 prevback = conc; 00235 first = 0; 00236 } 00237 ASTERN(OOR1, prevback); 00238 prevback = THERE(); 00239 AHEAD(prevfwd); /* fix previous offset */ 00240 prevfwd = HERE(); 00241 EMIT(OOR2, 0); /* offset is very wrong */ 00242 } 00243 00244 if (!first) { /* tail-end fixups */ 00245 AHEAD(prevfwd); 00246 ASTERN(O_CH, prevback); 00247 } 00248 00249 assert(!MORE() || SEE(stop)); 00250 } 00251 00252 /* 00253 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 00254 == static void p_ere_exp(register struct parse *p); 00255 */ 00256 static void 00257 p_ere_exp(p) 00258 register struct parse *p; 00259 { 00260 register char c; 00261 register sopno pos; 00262 register int count; 00263 register int count2; 00264 register sopno subno; 00265 int wascaret = 0; 00266 00267 assert(MORE()); /* caller should have ensured this */ 00268 c = GETNEXT(); 00269 00270 pos = HERE(); 00271 switch (c) { 00272 case '(': 00273 if(REQUIRE(MORE(), REG_EPAREN)) {} 00274 p->g->nsub++; 00275 subno = (sopno) p->g->nsub; 00276 if (subno < NPAREN) 00277 p->pbegin[subno] = HERE(); 00278 EMIT(OLPAREN, subno); 00279 if (!SEE(')')) 00280 p_ere(p, ')'); 00281 if (subno < NPAREN) { 00282 p->pend[subno] = HERE(); 00283 assert(p->pend[subno] != 0); 00284 } 00285 EMIT(ORPAREN, subno); 00286 if(MUSTEAT(')', REG_EPAREN)) {} 00287 break; 00288 #ifndef POSIX_MISTAKE 00289 case ')': /* happens only if no current unmatched ( */ 00290 /* 00291 * You may ask, why the ifndef? Because I didn't notice 00292 * this until slightly too late for 1003.2, and none of the 00293 * other 1003.2 regular-expression reviewers noticed it at 00294 * all. So an unmatched ) is legal POSIX, at least until 00295 * we can get it fixed. 00296 */ 00297 SETERROR(REG_EPAREN); 00298 break; 00299 #endif 00300 case '^': 00301 EMIT(OBOL, 0); 00302 p->g->iflags |= USEBOL; 00303 p->g->nbol++; 00304 wascaret = 1; 00305 break; 00306 case '$': 00307 EMIT(OEOL, 0); 00308 p->g->iflags |= USEEOL; 00309 p->g->neol++; 00310 break; 00311 case '|': 00312 SETERROR(REG_EMPTY); 00313 break; 00314 case '*': 00315 case '+': 00316 case '?': 00317 SETERROR(REG_BADRPT); 00318 break; 00319 case '.': 00320 if (p->g->cflags®_NEWLINE) 00321 nonnewline(p); 00322 else 00323 EMIT(OANY, 0); 00324 break; 00325 case '[': 00326 p_bracket(p); 00327 break; 00328 case '\\': 00329 if(REQUIRE(MORE(), REG_EESCAPE)) {} 00330 c = GETNEXT(); 00331 ordinary(p, c); 00332 break; 00333 case '{': /* okay as ordinary except if digit follows */ 00334 if(REQUIRE(!MORE() || !my_isdigit(p->charset,PEEK()), REG_BADRPT)) {} 00335 /* FALLTHROUGH */ 00336 default: 00337 ordinary(p, c); 00338 break; 00339 } 00340 00341 if (!MORE()) 00342 return; 00343 c = PEEK(); 00344 /* we call { a repetition if followed by a digit */ 00345 if (!( c == '*' || c == '+' || c == '?' || 00346 (c == '{' && MORE2() && 00347 my_isdigit(p->charset,PEEK2())) )) 00348 return; /* no repetition, we're done */ 00349 NEXT(); 00350 00351 if(REQUIRE(!wascaret, REG_BADRPT)) {} 00352 switch (c) { 00353 case '*': /* implemented as +? */ 00354 /* this case does not require the (y|) trick, noKLUDGE */ 00355 INSERT(OPLUS_, pos); 00356 ASTERN(O_PLUS, pos); 00357 INSERT(OQUEST_, pos); 00358 ASTERN(O_QUEST, pos); 00359 break; 00360 case '+': 00361 INSERT(OPLUS_, pos); 00362 ASTERN(O_PLUS, pos); 00363 break; 00364 case '?': 00365 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 00366 INSERT(OCH_, pos); /* offset slightly wrong */ 00367 ASTERN(OOR1, pos); /* this one's right */ 00368 AHEAD(pos); /* fix the OCH_ */ 00369 EMIT(OOR2, 0); /* offset very wrong... */ 00370 AHEAD(THERE()); /* ...so fix it */ 00371 ASTERN(O_CH, THERETHERE()); 00372 break; 00373 case '{': 00374 count = p_count(p); 00375 if (EAT(',')) { 00376 if (my_isdigit(p->charset,PEEK())) { 00377 count2 = p_count(p); 00378 if(REQUIRE(count <= count2, REG_BADBR)) {} 00379 } else /* single number with comma */ 00380 count2 = RE_INFINITY; 00381 } else /* just a single number */ 00382 count2 = count; 00383 repeat(p, pos, count, count2); 00384 if (!EAT('}')) { /* error heuristics */ 00385 while (MORE() && PEEK() != '}') 00386 NEXT(); 00387 if(REQUIRE(MORE(), REG_EBRACE)) {} 00388 SETERROR(REG_BADBR); 00389 } 00390 break; 00391 } 00392 00393 if (!MORE()) 00394 return; 00395 c = PEEK(); 00396 if (!( c == '*' || c == '+' || c == '?' || 00397 (c == '{' && MORE2() && 00398 my_isdigit(p->charset,PEEK2())) ) ) 00399 return; 00400 SETERROR(REG_BADRPT); 00401 } 00402 00403 /* 00404 - p_str - string (no metacharacters) "parser" 00405 == static void p_str(register struct parse *p); 00406 */ 00407 static void 00408 p_str(p) 00409 register struct parse *p; 00410 { 00411 if(REQUIRE(MORE(), REG_EMPTY)) {} 00412 while (MORE()) 00413 ordinary(p, GETNEXT()); 00414 } 00415 00416 /* 00417 - p_bre - BRE parser top level, anchoring and concatenation 00418 == static void p_bre(register struct parse *p, register int end1, \ 00419 == register int end2); 00420 * Giving end1 as OUT essentially eliminates the end1/end2 check. 00421 * 00422 * This implementation is a bit of a kludge, in that a trailing $ is first 00423 * taken as an ordinary character and then revised to be an anchor. The 00424 * only undesirable side effect is that '$' gets included as a character 00425 * category in such cases. This is fairly harmless; not worth fixing. 00426 * The amount of lookahead needed to avoid this kludge is excessive. 00427 */ 00428 static void 00429 p_bre(p, end1, end2) 00430 register struct parse *p; 00431 register int end1; /* first terminating character */ 00432 register int end2; /* second terminating character */ 00433 { 00434 register sopno start = HERE(); 00435 register int first = 1; /* first subexpression? */ 00436 register int wasdollar = 0; 00437 00438 if (EAT('^')) { 00439 EMIT(OBOL, 0); 00440 p->g->iflags |= USEBOL; 00441 p->g->nbol++; 00442 } 00443 while (MORE() && !SEETWO(end1, end2)) { 00444 wasdollar = p_simp_re(p, first); 00445 first = 0; 00446 } 00447 if (wasdollar) { /* oops, that was a trailing anchor */ 00448 DROP(1); 00449 EMIT(OEOL, 0); 00450 p->g->iflags |= USEEOL; 00451 p->g->neol++; 00452 } 00453 00454 if(REQUIRE(HERE() != start, REG_EMPTY)) {} /* require nonempty */ 00455 } 00456 00457 /* 00458 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 00459 == static int p_simp_re(register struct parse *p, int starordinary); 00460 */ 00461 static int /* was the simple RE an unbackslashed $? */ 00462 p_simp_re(p, starordinary) 00463 register struct parse *p; 00464 int starordinary; /* is a leading * an ordinary character? */ 00465 { 00466 register int c; 00467 register int count; 00468 register int count2; 00469 register sopno pos; 00470 register int i; 00471 register sopno subno; 00472 # define BACKSL (1<<CHAR_BIT) 00473 00474 pos = HERE(); /* repetion op, if any, covers from here */ 00475 00476 assert(MORE()); /* caller should have ensured this */ 00477 c = GETNEXT(); 00478 if (c == '\\') { 00479 if(REQUIRE(MORE(), REG_EESCAPE)) {} 00480 c = BACKSL | (unsigned char)GETNEXT(); 00481 } 00482 switch (c) { 00483 case '.': 00484 if (p->g->cflags®_NEWLINE) 00485 nonnewline(p); 00486 else 00487 EMIT(OANY, 0); 00488 break; 00489 case '[': 00490 p_bracket(p); 00491 break; 00492 case BACKSL|'{': 00493 SETERROR(REG_BADRPT); 00494 break; 00495 case BACKSL|'(': 00496 p->g->nsub++; 00497 subno = (sopno) p->g->nsub; 00498 if (subno < NPAREN) 00499 p->pbegin[subno] = HERE(); 00500 EMIT(OLPAREN, subno); 00501 /* the MORE here is an error heuristic */ 00502 if (MORE() && !SEETWO('\\', ')')) 00503 p_bre(p, '\\', ')'); 00504 if (subno < NPAREN) { 00505 p->pend[subno] = HERE(); 00506 assert(p->pend[subno] != 0); 00507 } 00508 EMIT(ORPAREN, subno); 00509 if(REQUIRE(EATTWO('\\', ')'), REG_EPAREN)) {} 00510 break; 00511 case BACKSL|')': /* should not get here -- must be user */ 00512 case BACKSL|'}': 00513 SETERROR(REG_EPAREN); 00514 break; 00515 case BACKSL|'1': 00516 case BACKSL|'2': 00517 case BACKSL|'3': 00518 case BACKSL|'4': 00519 case BACKSL|'5': 00520 case BACKSL|'6': 00521 case BACKSL|'7': 00522 case BACKSL|'8': 00523 case BACKSL|'9': 00524 i = (c&~BACKSL) - '0'; 00525 assert(i < NPAREN); 00526 if (p->pend[i] != 0) { 00527 assert((uint) i <= p->g->nsub); 00528 EMIT(OBACK_, i); 00529 assert(p->pbegin[i] != 0); 00530 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 00531 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 00532 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 00533 EMIT(O_BACK, i); 00534 } else 00535 SETERROR(REG_ESUBREG); 00536 p->g->backrefs = 1; 00537 break; 00538 case '*': 00539 if(REQUIRE(starordinary, REG_BADRPT)) {} 00540 /* FALLTHROUGH */ 00541 default: 00542 ordinary(p, c &~ BACKSL); 00543 break; 00544 } 00545 00546 if (EAT('*')) { /* implemented as +? */ 00547 /* this case does not require the (y|) trick, noKLUDGE */ 00548 INSERT(OPLUS_, pos); 00549 ASTERN(O_PLUS, pos); 00550 INSERT(OQUEST_, pos); 00551 ASTERN(O_QUEST, pos); 00552 } else if (EATTWO('\\', '{')) { 00553 count = p_count(p); 00554 if (EAT(',')) { 00555 if (MORE() && my_isdigit(p->charset,PEEK())) { 00556 count2 = p_count(p); 00557 if(REQUIRE(count <= count2, REG_BADBR)) {} 00558 } else /* single number with comma */ 00559 count2 = RE_INFINITY; 00560 } else /* just a single number */ 00561 count2 = count; 00562 repeat(p, pos, count, count2); 00563 if (!EATTWO('\\', '}')) { /* error heuristics */ 00564 while (MORE() && !SEETWO('\\', '}')) 00565 NEXT(); 00566 if(REQUIRE(MORE(), REG_EBRACE)) {} 00567 SETERROR(REG_BADBR); 00568 } 00569 } else if (c == (unsigned char)'$') /* $ (but not \$) ends it */ 00570 return(1); 00571 00572 return(0); 00573 } 00574 00575 /* 00576 - p_count - parse a repetition count 00577 == static int p_count(register struct parse *p); 00578 */ 00579 static int /* the value */ 00580 p_count(p) 00581 register struct parse *p; 00582 { 00583 register int count = 0; 00584 register int ndigits = 0; 00585 00586 while (MORE() && my_isdigit(p->charset,PEEK()) && count <= DUPMAX) { 00587 count = count*10 + (GETNEXT() - '0'); 00588 ndigits++; 00589 } 00590 00591 if(REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR)) {} 00592 return(count); 00593 } 00594 00595 /* 00596 - p_bracket - parse a bracketed character list 00597 == static void p_bracket(register struct parse *p); 00598 * 00599 * Note a significant property of this code: if the allocset() did SETERROR, 00600 * no set operations are done. 00601 */ 00602 static void 00603 p_bracket(p) 00604 register struct parse *p; 00605 { 00606 register cset *cs = allocset(p); 00607 register int invert = 0; 00608 00609 /* Dept of Truly Sickening Special-Case Kludges */ 00610 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { 00611 EMIT(OBOW, 0); 00612 NEXTn(6); 00613 return; 00614 } 00615 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { 00616 EMIT(OEOW, 0); 00617 NEXTn(6); 00618 return; 00619 } 00620 00621 if (EAT('^')) 00622 invert++; /* make note to invert set at end */ 00623 if (EAT(']')) 00624 CHadd(cs, ']'); 00625 else if (EAT('-')) 00626 CHadd(cs, '-'); 00627 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 00628 p_b_term(p, cs); 00629 if (EAT('-')) 00630 CHadd(cs, '-'); 00631 if(MUSTEAT(']', REG_EBRACK)) {} 00632 00633 if (p->error != 0) /* don't mess things up further */ 00634 return; 00635 00636 if (p->g->cflags®_ICASE) { 00637 register int i; 00638 register int ci; 00639 00640 for (i = p->g->csetsize - 1; i >= 0; i--) 00641 if (CHIN(cs, i) && my_isalpha(p->charset,i)) { 00642 ci = othercase(p->charset,i); 00643 if (ci != i) 00644 CHadd(cs, ci); 00645 } 00646 if (cs->multis != NULL) 00647 mccase(p, cs); 00648 } 00649 if (invert) { 00650 register int i; 00651 00652 for (i = p->g->csetsize - 1; i >= 0; i--) 00653 if (CHIN(cs, i)) 00654 CHsub(cs, i); 00655 else 00656 CHadd(cs, i); 00657 if (p->g->cflags®_NEWLINE) 00658 CHsub(cs, '\n'); 00659 if (cs->multis != NULL) 00660 mcinvert(p, cs); 00661 } 00662 00663 assert(cs->multis == NULL); /* xxx */ 00664 00665 if (nch(p, cs) == 1) { /* optimize singleton sets */ 00666 ordinary(p, firstch(p, cs)); 00667 freeset(p, cs); 00668 } else 00669 EMIT(OANYOF, freezeset(p, cs)); 00670 } 00671 00672 /* 00673 - p_b_term - parse one term of a bracketed character list 00674 == static void p_b_term(register struct parse *p, register cset *cs); 00675 */ 00676 static void 00677 p_b_term(p, cs) 00678 register struct parse *p; 00679 register cset *cs; 00680 { 00681 register char c; 00682 register char start, finish; 00683 register int i; 00684 00685 /* classify what we've got */ 00686 switch ((MORE()) ? PEEK() : '\0') { 00687 case '[': 00688 c = (MORE2()) ? PEEK2() : '\0'; 00689 break; 00690 case '-': 00691 SETERROR(REG_ERANGE); 00692 return; /* NOTE RETURN */ 00693 break; 00694 default: 00695 c = '\0'; 00696 break; 00697 } 00698 00699 switch (c) { 00700 case ':': /* character class */ 00701 NEXT2(); 00702 if(REQUIRE(MORE(), REG_EBRACK)) {} 00703 c = PEEK(); 00704 if(REQUIRE(c != '-' && c != ']', REG_ECTYPE)) {} 00705 p_b_cclass(p, cs); 00706 if(REQUIRE(MORE(), REG_EBRACK)) {} 00707 if(REQUIRE(EATTWO(':', ']'), REG_ECTYPE)) {} 00708 break; 00709 case '=': /* equivalence class */ 00710 NEXT2(); 00711 if(REQUIRE(MORE(), REG_EBRACK)) {} 00712 c = PEEK(); 00713 if(REQUIRE(c != '-' && c != ']', REG_ECOLLATE)) {} 00714 p_b_eclass(p, cs); 00715 if(REQUIRE(MORE(), REG_EBRACK)) {} 00716 if(REQUIRE(EATTWO('=', ']'), REG_ECOLLATE)) {} 00717 break; 00718 default: /* symbol, ordinary character, or range */ 00719 /* xxx revision needed for multichar stuff */ 00720 start = p_b_symbol(p); 00721 if (SEE('-') && MORE2() && PEEK2() != ']') { 00722 /* range */ 00723 NEXT(); 00724 if (EAT('-')) 00725 finish = '-'; 00726 else 00727 finish = p_b_symbol(p); 00728 } else 00729 finish = start; 00730 /* xxx what about signed chars here... */ 00731 if(REQUIRE(start <= finish, REG_ERANGE)) {} 00732 for (i = start; i <= finish; i++) 00733 CHadd(cs, i); 00734 break; 00735 } 00736 } 00737 00738 /* 00739 - p_b_cclass - parse a character-class name and deal with it 00740 == static void p_b_cclass(register struct parse *p, register cset *cs); 00741 */ 00742 static void 00743 p_b_cclass(p, cs) 00744 register struct parse *p; 00745 register cset *cs; 00746 { 00747 register char *sp = p->next; 00748 register struct cclass *cp; 00749 register size_t len; 00750 00751 while (MORE() && my_isalpha(p->charset,PEEK())) 00752 NEXT(); 00753 len = p->next - sp; 00754 for (cp = cclasses; cp->name != NULL; cp++) 00755 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 00756 break; 00757 if (cp->name == NULL) { 00758 /* oops, didn't find it */ 00759 SETERROR(REG_ECTYPE); 00760 return; 00761 } 00762 00763 #ifndef USE_ORIG_REGEX_CODE 00764 { 00765 register size_t i; 00766 for (i=1 ; i<256 ; i++) 00767 if (p->charset->ctype[i+1] & cp->mask) 00768 CHadd(cs, i); 00769 } 00770 #else 00771 { 00772 register char *u = (char*) cp->chars; 00773 register char c; 00774 00775 while ((c = *u++) != '\0') 00776 CHadd(cs, c); 00777 00778 for (u = (char*) cp->multis; *u != '\0'; u += strlen(u) + 1) 00779 MCadd(p, cs, u); 00780 } 00781 #endif 00782 00783 } 00784 00785 /* 00786 - p_b_eclass - parse an equivalence-class name and deal with it 00787 == static void p_b_eclass(register struct parse *p, register cset *cs); 00788 * 00789 * This implementation is incomplete. xxx 00790 */ 00791 static void 00792 p_b_eclass(p, cs) 00793 register struct parse *p; 00794 register cset *cs; 00795 { 00796 register char c; 00797 00798 c = p_b_coll_elem(p, '='); 00799 CHadd(cs, c); 00800 } 00801 00802 /* 00803 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 00804 == static char p_b_symbol(register struct parse *p); 00805 */ 00806 static char /* value of symbol */ 00807 p_b_symbol(p) 00808 register struct parse *p; 00809 { 00810 register char value; 00811 00812 if(REQUIRE(MORE(), REG_EBRACK)) {} 00813 if (!EATTWO('[', '.')) 00814 return(GETNEXT()); 00815 00816 /* collating symbol */ 00817 value = p_b_coll_elem(p, '.'); 00818 if(REQUIRE(EATTWO('.', ']'), REG_ECOLLATE)) {} 00819 return(value); 00820 } 00821 00822 /* 00823 - p_b_coll_elem - parse a collating-element name and look it up 00824 == static char p_b_coll_elem(register struct parse *p, int endc); 00825 */ 00826 static char /* value of collating element */ 00827 p_b_coll_elem(p, endc) 00828 register struct parse *p; 00829 int endc; /* name ended by endc,']' */ 00830 { 00831 register char *sp = p->next; 00832 register struct cname *cp; 00833 #ifdef _WIN64 00834 register __int64 len; 00835 #else 00836 register int len; 00837 #endif 00838 while (MORE() && !SEETWO(endc, ']')) 00839 NEXT(); 00840 if (!MORE()) { 00841 SETERROR(REG_EBRACK); 00842 return(0); 00843 } 00844 len = p->next - sp; 00845 for (cp = cnames; cp->name != NULL; cp++) 00846 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 00847 return(cp->code); /* known name */ 00848 if (len == 1) 00849 return(*sp); /* single character */ 00850 SETERROR(REG_ECOLLATE); /* neither */ 00851 return(0); 00852 } 00853 00854 /* 00855 - othercase - return the case counterpart of an alphabetic 00856 == static char othercase(int ch); 00857 */ 00858 static char /* if no counterpart, return ch */ 00859 othercase(charset,ch) 00860 CHARSET_INFO *charset; 00861 int ch; 00862 { 00863 /* 00864 In MySQL some multi-byte character sets 00865 have 'ctype' array but don't have 'to_lower' 00866 and 'to_upper' arrays. In this case we handle 00867 only basic latin letters a..z and A..Z. 00868 00869 If 'to_lower' and 'to_upper' arrays are empty in a character set, 00870 then my_isalpha(cs, ch) should never return TRUE for characters 00871 other than basic latin letters. Otherwise it should be 00872 considered as a mistake in character set definition. 00873 */ 00874 assert(my_isalpha(charset,ch)); 00875 if (my_isupper(charset,ch)) 00876 { 00877 return(charset->to_lower ? my_tolower(charset,ch) : 00878 ch - 'A' + 'a'); 00879 } 00880 else if (my_islower(charset,ch)) 00881 { 00882 return(charset->to_upper ? my_toupper(charset,ch) : 00883 ch - 'a' + 'A'); 00884 } 00885 else /* peculiar, but could happen */ 00886 return(ch); 00887 } 00888 00889 /* 00890 - bothcases - emit a dualcase version of a two-case character 00891 == static void bothcases(register struct parse *p, int ch); 00892 * 00893 * Boy, is this implementation ever a kludge... 00894 */ 00895 static void 00896 bothcases(p, ch) 00897 register struct parse *p; 00898 int ch; 00899 { 00900 register char *oldnext = p->next; 00901 register char *oldend = p->end; 00902 char bracket[3]; 00903 00904 assert(othercase(p->charset, ch) != ch); /* p_bracket() would recurse */ 00905 p->next = bracket; 00906 p->end = bracket+2; 00907 bracket[0] = ch; 00908 bracket[1] = ']'; 00909 bracket[2] = '\0'; 00910 p_bracket(p); 00911 assert(p->next == bracket+2); 00912 p->next = oldnext; 00913 p->end = oldend; 00914 } 00915 00916 /* 00917 - ordinary - emit an ordinary character 00918 == static void ordinary(register struct parse *p, register int ch); 00919 */ 00920 static void 00921 ordinary(p, ch) 00922 register struct parse *p; 00923 register int ch; 00924 { 00925 register cat_t *cap = p->g->categories; 00926 00927 if ((p->g->cflags®_ICASE) && my_isalpha(p->charset,ch) && 00928 othercase(p->charset,ch) != ch) 00929 bothcases(p, ch); 00930 else { 00931 EMIT(OCHAR, (unsigned char)ch); 00932 if (cap[ch] == 0) 00933 cap[ch] = p->g->ncategories++; 00934 } 00935 } 00936 00937 /* 00938 - nonnewline - emit REG_NEWLINE version of OANY 00939 == static void nonnewline(register struct parse *p); 00940 * 00941 * Boy, is this implementation ever a kludge... 00942 */ 00943 static void 00944 nonnewline(p) 00945 register struct parse *p; 00946 { 00947 register char *oldnext = p->next; 00948 register char *oldend = p->end; 00949 char bracket[4]; 00950 00951 p->next = bracket; 00952 p->end = bracket+3; 00953 bracket[0] = '^'; 00954 bracket[1] = '\n'; 00955 bracket[2] = ']'; 00956 bracket[3] = '\0'; 00957 p_bracket(p); 00958 assert(p->next == bracket+3); 00959 p->next = oldnext; 00960 p->end = oldend; 00961 } 00962 00963 /* 00964 - repeat - generate code for a bounded repetition, recursively if needed 00965 == static void repeat(register struct parse *p, sopno start, int from, int to); 00966 */ 00967 static void 00968 repeat(p, start, from, to) 00969 register struct parse *p; 00970 sopno start; /* operand from here to end of strip */ 00971 int from; /* repeated from this number */ 00972 int to; /* to this number of times (maybe RE_INFINITY) */ 00973 { 00974 register sopno finish = HERE(); 00975 # define N 2 00976 # define INF 3 00977 # define REP(f, t) ((f)*8 + (t)) 00978 # define MAP(n) (((n) <= 1) ? (n) : ((n) == RE_INFINITY) ? INF : N) 00979 register sopno copy; 00980 00981 if (p->error != 0) /* head off possible runaway recursion */ 00982 return; 00983 00984 assert(from <= to); 00985 00986 switch (REP(MAP(from), MAP(to))) { 00987 case REP(0, 0): /* must be user doing this */ 00988 DROP(finish-start); /* drop the operand */ 00989 break; 00990 case REP(0, 1): /* as x{1,1}? */ 00991 case REP(0, N): /* as x{1,n}? */ 00992 case REP(0, INF): /* as x{1,}? */ 00993 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 00994 INSERT(OCH_, start); /* offset is wrong... */ 00995 repeat(p, start+1, 1, to); 00996 ASTERN(OOR1, start); 00997 AHEAD(start); /* ... fix it */ 00998 EMIT(OOR2, 0); 00999 AHEAD(THERE()); 01000 ASTERN(O_CH, THERETHERE()); 01001 break; 01002 case REP(1, 1): /* trivial case */ 01003 /* done */ 01004 break; 01005 case REP(1, N): /* as x?x{1,n-1} */ 01006 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 01007 INSERT(OCH_, start); 01008 ASTERN(OOR1, start); 01009 AHEAD(start); 01010 EMIT(OOR2, 0); /* offset very wrong... */ 01011 AHEAD(THERE()); /* ...so fix it */ 01012 ASTERN(O_CH, THERETHERE()); 01013 copy = dupl(p, start+1, finish+1); 01014 assert(copy == finish+4); 01015 repeat(p, copy, 1, to-1); 01016 break; 01017 case REP(1, INF): /* as x+ */ 01018 INSERT(OPLUS_, start); 01019 ASTERN(O_PLUS, start); 01020 break; 01021 case REP(N, N): /* as xx{m-1,n-1} */ 01022 copy = dupl(p, start, finish); 01023 repeat(p, copy, from-1, to-1); 01024 break; 01025 case REP(N, INF): /* as xx{n-1,INF} */ 01026 copy = dupl(p, start, finish); 01027 repeat(p, copy, from-1, to); 01028 break; 01029 default: /* "can't happen" */ 01030 SETERROR(REG_ASSERT); /* just in case */ 01031 break; 01032 } 01033 } 01034 01035 /* 01036 - seterr - set an error condition 01037 == static int seterr(register struct parse *p, int e); 01038 */ 01039 static int /* useless but makes type checking happy */ 01040 seterr(p, e) 01041 register struct parse *p; 01042 int e; 01043 { 01044 if (p->error == 0) /* keep earliest error condition */ 01045 p->error = e; 01046 p->next = nuls; /* try to bring things to a halt */ 01047 p->end = nuls; 01048 return(0); /* make the return value well-defined */ 01049 } 01050 01051 /* 01052 - allocset - allocate a set of characters for [] 01053 == static cset *allocset(register struct parse *p); 01054 */ 01055 static cset * 01056 allocset(p) 01057 register struct parse *p; 01058 { 01059 register int no = p->g->ncsets++; 01060 register size_t nc; 01061 register size_t nbytes; 01062 register cset *cs; 01063 register size_t css = (size_t)p->g->csetsize; 01064 register int i; 01065 01066 if (no >= p->ncsalloc) { /* need another column of space */ 01067 p->ncsalloc += CHAR_BIT; 01068 nc = p->ncsalloc; 01069 assert(nc % CHAR_BIT == 0); 01070 nbytes = nc / CHAR_BIT * css; 01071 if (p->g->sets == NULL) 01072 p->g->sets = (cset *)malloc(nc * sizeof(cset)); 01073 else 01074 p->g->sets = (cset *)realloc((char *)p->g->sets, 01075 nc * sizeof(cset)); 01076 if (p->g->setbits == NULL) 01077 p->g->setbits = (uch *)malloc(nbytes); 01078 else { 01079 p->g->setbits = (uch *)realloc((char *)p->g->setbits, 01080 nbytes); 01081 /* xxx this isn't right if setbits is now NULL */ 01082 for (i = 0; i < no; i++) 01083 p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT); 01084 } 01085 if (p->g->sets != NULL && p->g->setbits != NULL) 01086 (void) memset((char *)p->g->setbits + (nbytes - css), 01087 0, css); 01088 else { 01089 no = 0; 01090 SETERROR(REG_ESPACE); 01091 /* caller's responsibility not to do set ops */ 01092 } 01093 } 01094 01095 assert(p->g->sets != NULL); /* xxx */ 01096 cs = &p->g->sets[no]; 01097 cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); 01098 cs->mask = 1 << ((no) % CHAR_BIT); 01099 cs->hash = 0; 01100 cs->smultis = 0; 01101 cs->multis = NULL; 01102 01103 return(cs); 01104 } 01105 01106 /* 01107 - freeset - free a now-unused set 01108 == static void freeset(register struct parse *p, register cset *cs); 01109 */ 01110 static void 01111 freeset(p, cs) 01112 register struct parse *p; 01113 register cset *cs; 01114 { 01115 register size_t i; 01116 register cset *top = &p->g->sets[p->g->ncsets]; 01117 register size_t css = (size_t)p->g->csetsize; 01118 01119 for (i = 0; i < css; i++) 01120 CHsub(cs, i); 01121 if (cs == top-1) /* recover only the easy case */ 01122 p->g->ncsets--; 01123 } 01124 01125 /* 01126 - freezeset - final processing on a set of characters 01127 == static int freezeset(register struct parse *p, register cset *cs); 01128 * 01129 * The main task here is merging identical sets. This is usually a waste 01130 * of time (although the hash code minimizes the overhead), but can win 01131 * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash 01132 * is done using addition rather than xor -- all ASCII [aA] sets xor to 01133 * the same value! 01134 */ 01135 static int /* set number */ 01136 freezeset(p, cs) 01137 register struct parse *p; 01138 register cset *cs; 01139 { 01140 register uch h = cs->hash; 01141 register size_t i; 01142 register cset *top = &p->g->sets[p->g->ncsets]; 01143 register cset *cs2; 01144 register size_t css = (size_t)p->g->csetsize; 01145 01146 /* look for an earlier one which is the same */ 01147 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) 01148 if (cs2->hash == h && cs2 != cs) { 01149 /* maybe */ 01150 for (i = 0; i < css; i++) 01151 if (!!CHIN(cs2, i) != !!CHIN(cs, i)) 01152 break; /* no */ 01153 if (i == css) 01154 break; /* yes */ 01155 } 01156 01157 if (cs2 < top) { /* found one */ 01158 freeset(p, cs); 01159 cs = cs2; 01160 } 01161 01162 return((int)(cs - p->g->sets)); 01163 } 01164 01165 /* 01166 - firstch - return first character in a set (which must have at least one) 01167 == static int firstch(register struct parse *p, register cset *cs); 01168 */ 01169 static int /* character; there is no "none" value */ 01170 firstch(p, cs) 01171 register struct parse *p; 01172 register cset *cs; 01173 { 01174 register size_t i; 01175 register size_t css = (size_t)p->g->csetsize; 01176 01177 for (i = 0; i < css; i++) 01178 if (CHIN(cs, i)) 01179 return((char)i); 01180 assert(never); 01181 return(0); /* arbitrary */ 01182 } 01183 01184 /* 01185 - nch - number of characters in a set 01186 == static int nch(register struct parse *p, register cset *cs); 01187 */ 01188 static int 01189 nch(p, cs) 01190 register struct parse *p; 01191 register cset *cs; 01192 { 01193 register size_t i; 01194 register size_t css = (size_t)p->g->csetsize; 01195 register int n = 0; 01196 01197 for (i = 0; i < css; i++) 01198 if (CHIN(cs, i)) 01199 n++; 01200 return(n); 01201 } 01202 01203 #ifdef USE_ORIG_REGEX_CODE 01204 /* 01205 - mcadd - add a collating element to a cset 01206 == static void mcadd(register struct parse *p, register cset *cs, \ 01207 == register char *cp); 01208 */ 01209 static void 01210 mcadd(p, cs, cp) 01211 register struct parse *p; 01212 register cset *cs; 01213 register char *cp; 01214 { 01215 register size_t oldend = cs->smultis; 01216 01217 cs->smultis += strlen(cp) + 1; 01218 if (cs->multis == NULL) 01219 cs->multis = malloc(cs->smultis); 01220 else 01221 cs->multis = realloc(cs->multis, cs->smultis); 01222 if (cs->multis == NULL) { 01223 SETERROR(REG_ESPACE); 01224 return; 01225 } 01226 01227 (void) strcpy(cs->multis + oldend - 1, cp); 01228 cs->multis[cs->smultis - 1] = '\0'; 01229 } 01230 #endif 01231 01232 #ifdef NOT_USED 01233 /* 01234 - mcsub - subtract a collating element from a cset 01235 == static void mcsub(register cset *cs, register char *cp); 01236 */ 01237 static void 01238 mcsub(cs, cp) 01239 register cset *cs; 01240 register char *cp; 01241 { 01242 register char *fp = mcfind(cs, cp); 01243 register size_t len = strlen(fp); 01244 01245 assert(fp != NULL); 01246 (void) memmove(fp, fp + len + 1, 01247 cs->smultis - (fp + len + 1 - cs->multis)); 01248 cs->smultis -= len; 01249 01250 if (cs->smultis == 0) { 01251 free(cs->multis); 01252 cs->multis = NULL; 01253 return; 01254 } 01255 01256 cs->multis = realloc(cs->multis, cs->smultis); 01257 assert(cs->multis != NULL); 01258 } 01259 01260 /* 01261 - mcin - is a collating element in a cset? 01262 == static int mcin(register cset *cs, register char *cp); 01263 */ 01264 static int 01265 mcin(cs, cp) 01266 register cset *cs; 01267 register char *cp; 01268 { 01269 return(mcfind(cs, cp) != NULL); 01270 } 01271 01272 /* 01273 - mcfind - find a collating element in a cset 01274 == static char *mcfind(register cset *cs, register char *cp); 01275 */ 01276 static char * 01277 mcfind(cs, cp) 01278 register cset *cs; 01279 register char *cp; 01280 { 01281 register char *p; 01282 01283 if (cs->multis == NULL) 01284 return(NULL); 01285 for (p = cs->multis; *p != '\0'; p += strlen(p) + 1) 01286 if (strcmp(cp, p) == 0) 01287 return(p); 01288 return(NULL); 01289 } 01290 #endif 01291 01292 /* 01293 - mcinvert - invert the list of collating elements in a cset 01294 == static void mcinvert(register struct parse *p, register cset *cs); 01295 * 01296 * This would have to know the set of possibilities. Implementation 01297 * is deferred. 01298 */ 01299 static void 01300 mcinvert(p, cs) 01301 register struct parse *p __attribute__((unused)); 01302 register cset *cs __attribute__((unused)); 01303 { 01304 assert(cs->multis == NULL); /* xxx */ 01305 } 01306 01307 /* 01308 - mccase - add case counterparts of the list of collating elements in a cset 01309 == static void mccase(register struct parse *p, register cset *cs); 01310 * 01311 * This would have to know the set of possibilities. Implementation 01312 * is deferred. 01313 */ 01314 static void 01315 mccase(p, cs) 01316 register struct parse *p __attribute__((unused)); 01317 register cset *cs __attribute__((unused)); 01318 { 01319 assert(cs->multis == NULL); /* xxx */ 01320 } 01321 01322 /* 01323 - isinsets - is this character in any sets? 01324 == static int isinsets(register struct re_guts *g, int c); 01325 */ 01326 static int /* predicate */ 01327 isinsets(g, c) 01328 register struct re_guts *g; 01329 int c; 01330 { 01331 register uch *col; 01332 register int i; 01333 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 01334 register unsigned uc = (unsigned char)c; 01335 01336 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 01337 if (col[uc] != 0) 01338 return(1); 01339 return(0); 01340 } 01341 01342 /* 01343 - samesets - are these two characters in exactly the same sets? 01344 == static int samesets(register struct re_guts *g, int c1, int c2); 01345 */ 01346 static int /* predicate */ 01347 samesets(g, c1, c2) 01348 register struct re_guts *g; 01349 int c1; 01350 int c2; 01351 { 01352 register uch *col; 01353 register int i; 01354 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 01355 register unsigned uc1 = (unsigned char)c1; 01356 register unsigned uc2 = (unsigned char)c2; 01357 01358 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 01359 if (col[uc1] != col[uc2]) 01360 return(0); 01361 return(1); 01362 } 01363 01364 /* 01365 - categorize - sort out character categories 01366 == static void categorize(struct parse *p, register struct re_guts *g); 01367 */ 01368 static void 01369 categorize(p, g) 01370 struct parse *p; 01371 register struct re_guts *g; 01372 { 01373 register cat_t *cats = g->categories; 01374 register int c; 01375 register int c2; 01376 register cat_t cat; 01377 01378 /* avoid making error situations worse */ 01379 if (p->error != 0) 01380 return; 01381 01382 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 01383 if (cats[c] == 0 && isinsets(g, c)) { 01384 cat = g->ncategories++; 01385 cats[c] = cat; 01386 for (c2 = c+1; c2 <= CHAR_MAX; c2++) 01387 if (cats[c2] == 0 && samesets(g, c, c2)) 01388 cats[c2] = cat; 01389 } 01390 } 01391 01392 /* 01393 - dupl - emit a duplicate of a bunch of sops 01394 == static sopno dupl(register struct parse *p, sopno start, sopno finish); 01395 */ 01396 static sopno /* start of duplicate */ 01397 dupl(p, start, finish) 01398 register struct parse *p; 01399 sopno start; /* from here */ 01400 sopno finish; /* to this less one */ 01401 { 01402 register sopno ret = HERE(); 01403 register sopno len = finish - start; 01404 01405 assert(finish >= start); 01406 if (len == 0) 01407 return(ret); 01408 enlarge(p, p->ssize + len); /* this many unexpected additions */ 01409 assert(p->ssize >= p->slen + len); 01410 (void) memcpy((char *)(p->strip + p->slen), 01411 (char *)(p->strip + start), (size_t)len*sizeof(sop)); 01412 p->slen += len; 01413 return(ret); 01414 } 01415 01416 /* 01417 - doemit - emit a strip operator 01418 == static void doemit(register struct parse *p, sop op, size_t opnd); 01419 * 01420 * It might seem better to implement this as a macro with a function as 01421 * hard-case backup, but it's just too big and messy unless there are 01422 * some changes to the data structures. Maybe later. 01423 */ 01424 static void 01425 doemit(p, op, opnd) 01426 register struct parse *p; 01427 sop op; 01428 size_t opnd; 01429 { 01430 /* avoid making error situations worse */ 01431 if (p->error != 0) 01432 return; 01433 01434 /* deal with oversize operands ("can't happen", more or less) */ 01435 assert(opnd < 1<<OPSHIFT); 01436 01437 /* deal with undersized strip */ 01438 if (p->slen >= p->ssize) 01439 enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */ 01440 assert(p->slen < p->ssize); 01441 01442 /* finally, it's all reduced to the easy case */ 01443 p->strip[p->slen++] = SOP(op, opnd); 01444 } 01445 01446 /* 01447 - doinsert - insert a sop into the strip 01448 == static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos); 01449 */ 01450 static void 01451 doinsert(p, op, opnd, pos) 01452 register struct parse *p; 01453 sop op; 01454 size_t opnd; 01455 sopno pos; 01456 { 01457 register sopno sn; 01458 register sop s; 01459 register int i; 01460 01461 /* avoid making error situations worse */ 01462 if (p->error != 0) 01463 return; 01464 01465 sn = HERE(); 01466 EMIT(op, opnd); /* do checks, ensure space */ 01467 assert(HERE() == sn+1); 01468 s = p->strip[sn]; 01469 01470 /* adjust paren pointers */ 01471 assert(pos > 0); 01472 for (i = 1; i < NPAREN; i++) { 01473 if (p->pbegin[i] >= pos) { 01474 p->pbegin[i]++; 01475 } 01476 if (p->pend[i] >= pos) { 01477 p->pend[i]++; 01478 } 01479 } 01480 { 01481 int length=(HERE()-pos-1)*sizeof(sop); 01482 bmove_upp((char *) &p->strip[pos+1]+length, 01483 (char *) &p->strip[pos]+length, 01484 length); 01485 } 01486 #ifdef OLD_CODE 01487 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 01488 (HERE()-pos-1)*sizeof(sop)); 01489 #endif 01490 p->strip[pos] = s; 01491 } 01492 01493 /* 01494 - dofwd - complete a forward reference 01495 == static void dofwd(register struct parse *p, sopno pos, sop value); 01496 */ 01497 static void 01498 dofwd(p, pos, value) 01499 register struct parse *p; 01500 register sopno pos; 01501 sop value; 01502 { 01503 /* avoid making error situations worse */ 01504 if (p->error != 0) 01505 return; 01506 01507 assert(value < 1<<OPSHIFT); 01508 p->strip[pos] = OP(p->strip[pos]) | value; 01509 } 01510 01511 /* 01512 - enlarge - enlarge the strip 01513 == static void enlarge(register struct parse *p, sopno size); 01514 */ 01515 static void 01516 enlarge(p, size) 01517 register struct parse *p; 01518 register sopno size; 01519 { 01520 register sop *sp; 01521 01522 if (p->ssize >= size) 01523 return; 01524 01525 sp = (sop *)realloc(p->strip, size*sizeof(sop)); 01526 if (sp == NULL) { 01527 SETERROR(REG_ESPACE); 01528 return; 01529 } 01530 p->strip = sp; 01531 p->ssize = size; 01532 } 01533 01534 /* 01535 - stripsnug - compact the strip 01536 == static void stripsnug(register struct parse *p, register struct re_guts *g); 01537 */ 01538 static void 01539 stripsnug(p, g) 01540 register struct parse *p; 01541 register struct re_guts *g; 01542 { 01543 g->nstates = p->slen; 01544 g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop)); 01545 if (g->strip == NULL) { 01546 SETERROR(REG_ESPACE); 01547 g->strip = p->strip; 01548 } 01549 } 01550 01551 /* 01552 - findmust - fill in must and mlen with longest mandatory literal string 01553 == static void findmust(register struct parse *p, register struct re_guts *g); 01554 * 01555 * This algorithm could do fancy things like analyzing the operands of | 01556 * for common subsequences. Someday. This code is simple and finds most 01557 * of the interesting cases. 01558 * 01559 * Note that must and mlen got initialized during setup. 01560 */ 01561 static void 01562 findmust(p, g) 01563 struct parse *p; 01564 register struct re_guts *g; 01565 { 01566 register sop *scan; 01567 sop *start; 01568 register sop *newstart; 01569 register sopno newlen; 01570 register sop s; 01571 register char *cp; 01572 register sopno i; 01573 LINT_INIT(start); LINT_INIT(newstart); 01574 /* avoid making error situations worse */ 01575 if (p->error != 0) 01576 return; 01577 01578 /* find the longest OCHAR sequence in strip */ 01579 newlen = 0; 01580 scan = g->strip + 1; 01581 do { 01582 s = *scan++; 01583 switch (OP(s)) { 01584 case OCHAR: /* sequence member */ 01585 if (newlen == 0) /* new sequence */ 01586 newstart = scan - 1; 01587 newlen++; 01588 break; 01589 case OPLUS_: /* things that don't break one */ 01590 case OLPAREN: 01591 case ORPAREN: 01592 break; 01593 case OQUEST_: /* things that must be skipped */ 01594 case OCH_: 01595 scan--; 01596 do { 01597 scan += OPND(s); 01598 s = *scan; 01599 /* assert() interferes w debug printouts */ 01600 if (OP(s) != O_QUEST && OP(s) != O_CH && 01601 OP(s) != OOR2) { 01602 g->iflags |= BAD; 01603 return; 01604 } 01605 } while (OP(s) != O_QUEST && OP(s) != O_CH); 01606 /* fallthrough */ 01607 default: /* things that break a sequence */ 01608 if (newlen > g->mlen) { /* ends one */ 01609 start = newstart; 01610 g->mlen = newlen; 01611 } 01612 newlen = 0; 01613 break; 01614 } 01615 } while (OP(s) != OEND); 01616 01617 if (g->mlen == 0) /* there isn't one */ 01618 return; 01619 01620 /* turn it into a character string */ 01621 g->must = malloc((size_t)g->mlen + 1); 01622 if (g->must == NULL) { /* argh; just forget it */ 01623 g->mlen = 0; 01624 return; 01625 } 01626 cp = g->must; 01627 scan = start; 01628 for (i = g->mlen; i > 0; i--) { 01629 while (OP(s = *scan++) != OCHAR) 01630 continue; 01631 assert(cp < g->must + g->mlen); 01632 *cp++ = (char)OPND(s); 01633 } 01634 assert(cp == g->must + g->mlen); 01635 *cp++ = '\0'; /* just on general principles */ 01636 } 01637 01638 /* 01639 - pluscount - count + nesting 01640 == static sopno pluscount(register struct parse *p, register struct re_guts *g); 01641 */ 01642 static sopno /* nesting depth */ 01643 pluscount(p, g) 01644 struct parse *p; 01645 register struct re_guts *g; 01646 { 01647 register sop *scan; 01648 register sop s; 01649 register sopno plusnest = 0; 01650 register sopno maxnest = 0; 01651 01652 if (p->error != 0) 01653 return(0); /* there may not be an OEND */ 01654 01655 scan = g->strip + 1; 01656 do { 01657 s = *scan++; 01658 switch (OP(s)) { 01659 case OPLUS_: 01660 plusnest++; 01661 break; 01662 case O_PLUS: 01663 if (plusnest > maxnest) 01664 maxnest = plusnest; 01665 plusnest--; 01666 break; 01667 } 01668 } while (OP(s) != OEND); 01669 if (plusnest != 0) 01670 g->iflags |= BAD; 01671 return(maxnest); 01672 }
1.4.7

