Files
mujs/regexp.c
Tor Andersson 7f50591861 Bug 700429: Limit size of input regular expression program.
The size of the parsed syntax list could overflow. Allowing it to be
bigger than the compiled program makes no sense, so use the same size
limit when parsing as when compiling.
2019-01-02 16:05:24 +01:00

1172 lines
24 KiB
C

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <setjmp.h>
#include <limits.h>
#include "regexp.h"
#include "utf.h"
#define emit regemit
#define next regnext
#define accept regaccept
#define nelem(a) (int)(sizeof (a) / sizeof (a)[0])
#define REPINF 255
#define MAXSUB REG_MAXSUB
#define MAXPROG (32 << 10)
typedef struct Reclass Reclass;
typedef struct Renode Renode;
typedef struct Reinst Reinst;
typedef struct Rethread Rethread;
struct Reclass {
Rune *end;
Rune spans[64];
};
struct Reprog {
Reinst *start, *end;
int flags;
int nsub;
Reclass cclass[16];
};
struct cstate {
Reprog *prog;
Renode *pstart, *pend;
const char *source;
int ncclass;
int nsub;
Renode *sub[MAXSUB];
int lookahead;
Rune yychar;
Reclass *yycc;
int yymin, yymax;
const char *error;
jmp_buf kaboom;
};
static void die(struct cstate *g, const char *message)
{
g->error = message;
longjmp(g->kaboom, 1);
}
static int canon(Rune c)
{
Rune u = toupperrune(c);
if (c >= 128 && u < 128)
return c;
return u;
}
/* Scan */
enum {
L_CHAR = 256,
L_CCLASS, /* character class */
L_NCCLASS, /* negative character class */
L_NC, /* "(?:" no capture */
L_PLA, /* "(?=" positive lookahead */
L_NLA, /* "(?!" negative lookahead */
L_WORD, /* "\b" word boundary */
L_NWORD, /* "\B" non-word boundary */
L_REF, /* "\1" back-reference */
L_COUNT, /* {M,N} */
};
static int hex(struct cstate *g, int c)
{
if (c >= '0' && c <= '9') return c - '0';
if (c >= 'a' && c <= 'f') return c - 'a' + 0xA;
if (c >= 'A' && c <= 'F') return c - 'A' + 0xA;
die(g, "invalid escape sequence");
return 0;
}
static int dec(struct cstate *g, int c)
{
if (c >= '0' && c <= '9') return c - '0';
die(g, "invalid quantifier");
return 0;
}
#define ESCAPES "BbDdSsWw^$\\.*+?()[]{}|0123456789"
static int isunicodeletter(int c)
{
return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || isalpharune(c);
}
static int nextrune(struct cstate *g)
{
g->source += chartorune(&g->yychar, g->source);
if (g->yychar == '\\') {
g->source += chartorune(&g->yychar, g->source);
switch (g->yychar) {
case 0: die(g, "unterminated escape sequence"); break;
case 'f': g->yychar = '\f'; return 0;
case 'n': g->yychar = '\n'; return 0;
case 'r': g->yychar = '\r'; return 0;
case 't': g->yychar = '\t'; return 0;
case 'v': g->yychar = '\v'; return 0;
case 'c':
g->yychar = (*g->source++) & 31;
return 0;
case 'x':
g->yychar = hex(g, *g->source++) << 4;
g->yychar += hex(g, *g->source++);
if (g->yychar == 0) {
g->yychar = '0';
return 1;
}
return 0;
case 'u':
g->yychar = hex(g, *g->source++) << 12;
g->yychar += hex(g, *g->source++) << 8;
g->yychar += hex(g, *g->source++) << 4;
g->yychar += hex(g, *g->source++);
if (g->yychar == 0) {
g->yychar = '0';
return 1;
}
return 0;
}
if (strchr(ESCAPES, g->yychar))
return 1;
if (isunicodeletter(g->yychar) || g->yychar == '_') /* check identity escape */
die(g, "invalid escape character");
return 0;
}
return 0;
}
static int lexcount(struct cstate *g)
{
g->yychar = *g->source++;
g->yymin = dec(g, g->yychar);
g->yychar = *g->source++;
while (g->yychar != ',' && g->yychar != '}') {
g->yymin = g->yymin * 10 + dec(g, g->yychar);
g->yychar = *g->source++;
if (g->yymin >= REPINF)
die(g, "numeric overflow");
}
if (g->yychar == ',') {
g->yychar = *g->source++;
if (g->yychar == '}') {
g->yymax = REPINF;
} else {
g->yymax = dec(g, g->yychar);
g->yychar = *g->source++;
while (g->yychar != '}') {
g->yymax = g->yymax * 10 + dec(g, g->yychar);
g->yychar = *g->source++;
if (g->yymax >= REPINF)
die(g, "numeric overflow");
}
}
} else {
g->yymax = g->yymin;
}
return L_COUNT;
}
static void newcclass(struct cstate *g)
{
if (g->ncclass >= nelem(g->prog->cclass))
die(g, "too many character classes");
g->yycc = g->prog->cclass + g->ncclass++;
g->yycc->end = g->yycc->spans;
}
static void addrange(struct cstate *g, Rune a, Rune b)
{
if (a > b)
die(g, "invalid character class range");
if (g->yycc->end + 2 == g->yycc->spans + nelem(g->yycc->spans))
die(g, "too many character class ranges");
*g->yycc->end++ = a;
*g->yycc->end++ = b;
}
static void addranges_d(struct cstate *g)
{
addrange(g, '0', '9');
}
static void addranges_D(struct cstate *g)
{
addrange(g, 0, '0'-1);
addrange(g, '9'+1, 0xFFFF);
}
static void addranges_s(struct cstate *g)
{
addrange(g, 0x9, 0xD);
addrange(g, 0x20, 0x20);
addrange(g, 0xA0, 0xA0);
addrange(g, 0x2028, 0x2029);
addrange(g, 0xFEFF, 0xFEFF);
}
static void addranges_S(struct cstate *g)
{
addrange(g, 0, 0x9-1);
addrange(g, 0xD+1, 0x20-1);
addrange(g, 0x20+1, 0xA0-1);
addrange(g, 0xA0+1, 0x2028-1);
addrange(g, 0x2029+1, 0xFEFF-1);
addrange(g, 0xFEFF+1, 0xFFFF);
}
static void addranges_w(struct cstate *g)
{
addrange(g, '0', '9');
addrange(g, 'A', 'Z');
addrange(g, '_', '_');
addrange(g, 'a', 'z');
}
static void addranges_W(struct cstate *g)
{
addrange(g, 0, '0'-1);
addrange(g, '9'+1, 'A'-1);
addrange(g, 'Z'+1, '_'-1);
addrange(g, '_'+1, 'a'-1);
addrange(g, 'z'+1, 0xFFFF);
}
static int lexclass(struct cstate *g)
{
int type = L_CCLASS;
int quoted, havesave, havedash;
Rune save = 0;
newcclass(g);
quoted = nextrune(g);
if (!quoted && g->yychar == '^') {
type = L_NCCLASS;
quoted = nextrune(g);
}
havesave = havedash = 0;
for (;;) {
if (g->yychar == 0)
die(g, "unterminated character class");
if (!quoted && g->yychar == ']')
break;
if (!quoted && g->yychar == '-') {
if (havesave) {
if (havedash) {
addrange(g, save, '-');
havesave = havedash = 0;
} else {
havedash = 1;
}
} else {
save = '-';
havesave = 1;
}
} else if (quoted && strchr("DSWdsw", g->yychar)) {
if (havesave) {
addrange(g, save, save);
if (havedash)
addrange(g, '-', '-');
}
switch (g->yychar) {
case 'd': addranges_d(g); break;
case 's': addranges_s(g); break;
case 'w': addranges_w(g); break;
case 'D': addranges_D(g); break;
case 'S': addranges_S(g); break;
case 'W': addranges_W(g); break;
}
havesave = havedash = 0;
} else {
if (quoted) {
if (g->yychar == 'b')
g->yychar = '\b';
else if (g->yychar == '0')
g->yychar = 0;
/* else identity escape */
}
if (havesave) {
if (havedash) {
addrange(g, save, g->yychar);
havesave = havedash = 0;
} else {
addrange(g, save, save);
save = g->yychar;
}
} else {
save = g->yychar;
havesave = 1;
}
}
quoted = nextrune(g);
}
if (havesave) {
addrange(g, save, save);
if (havedash)
addrange(g, '-', '-');
}
return type;
}
static int lex(struct cstate *g)
{
int quoted = nextrune(g);
if (quoted) {
switch (g->yychar) {
case 'b': return L_WORD;
case 'B': return L_NWORD;
case 'd': newcclass(g); addranges_d(g); return L_CCLASS;
case 's': newcclass(g); addranges_s(g); return L_CCLASS;
case 'w': newcclass(g); addranges_w(g); return L_CCLASS;
case 'D': newcclass(g); addranges_d(g); return L_NCCLASS;
case 'S': newcclass(g); addranges_s(g); return L_NCCLASS;
case 'W': newcclass(g); addranges_w(g); return L_NCCLASS;
case '0': g->yychar = 0; return L_CHAR;
}
if (g->yychar >= '0' && g->yychar <= '9') {
g->yychar -= '0';
if (*g->source >= '0' && *g->source <= '9')
g->yychar = g->yychar * 10 + *g->source++ - '0';
return L_REF;
}
return L_CHAR;
}
switch (g->yychar) {
case 0:
case '$': case ')': case '*': case '+':
case '.': case '?': case '^': case '|':
return g->yychar;
}
if (g->yychar == '{')
return lexcount(g);
if (g->yychar == '[')
return lexclass(g);
if (g->yychar == '(') {
if (g->source[0] == '?') {
if (g->source[1] == ':') {
g->source += 2;
return L_NC;
}
if (g->source[1] == '=') {
g->source += 2;
return L_PLA;
}
if (g->source[1] == '!') {
g->source += 2;
return L_NLA;
}
}
return '(';
}
return L_CHAR;
}
/* Parse */
enum {
P_CAT, P_ALT, P_REP,
P_BOL, P_EOL, P_WORD, P_NWORD,
P_PAR, P_PLA, P_NLA,
P_ANY, P_CHAR, P_CCLASS, P_NCCLASS,
P_REF,
};
struct Renode {
unsigned char type;
unsigned char ng, m, n;
Rune c;
Reclass *cc;
Renode *x;
Renode *y;
};
static Renode *newnode(struct cstate *g, int type)
{
Renode *node = g->pend++;
node->type = type;
node->cc = NULL;
node->c = 0;
node->ng = 0;
node->m = 0;
node->n = 0;
node->x = node->y = NULL;
return node;
}
static int empty(Renode *node)
{
if (!node) return 1;
switch (node->type) {
default: return 1;
case P_CAT: return empty(node->x) && empty(node->y);
case P_ALT: return empty(node->x) || empty(node->y);
case P_REP: return empty(node->x) || node->m == 0;
case P_PAR: return empty(node->x);
case P_REF: return empty(node->x);
case P_ANY: case P_CHAR: case P_CCLASS: case P_NCCLASS: return 0;
}
}
static Renode *newrep(struct cstate *g, Renode *atom, int ng, int min, int max)
{
Renode *rep = newnode(g, P_REP);
if (max == REPINF && empty(atom))
die(g, "infinite loop matching the empty string");
rep->ng = ng;
rep->m = min;
rep->n = max;
rep->x = atom;
return rep;
}
static void next(struct cstate *g)
{
g->lookahead = lex(g);
}
static int accept(struct cstate *g, int t)
{
if (g->lookahead == t) {
next(g);
return 1;
}
return 0;
}
static Renode *parsealt(struct cstate *g);
static Renode *parseatom(struct cstate *g)
{
Renode *atom;
if (g->lookahead == L_CHAR) {
atom = newnode(g, P_CHAR);
atom->c = g->yychar;
next(g);
return atom;
}
if (g->lookahead == L_CCLASS) {
atom = newnode(g, P_CCLASS);
atom->cc = g->yycc;
next(g);
return atom;
}
if (g->lookahead == L_NCCLASS) {
atom = newnode(g, P_NCCLASS);
atom->cc = g->yycc;
next(g);
return atom;
}
if (g->lookahead == L_REF) {
atom = newnode(g, P_REF);
if (g->yychar == 0 || g->yychar >= g->nsub || !g->sub[g->yychar])
die(g, "invalid back-reference");
atom->n = g->yychar;
atom->x = g->sub[g->yychar];
next(g);
return atom;
}
if (accept(g, '.'))
return newnode(g, P_ANY);
if (accept(g, '(')) {
atom = newnode(g, P_PAR);
if (g->nsub == MAXSUB)
die(g, "too many captures");
atom->n = g->nsub++;
atom->x = parsealt(g);
g->sub[atom->n] = atom;
if (!accept(g, ')'))
die(g, "unmatched '('");
return atom;
}
if (accept(g, L_NC)) {
atom = parsealt(g);
if (!accept(g, ')'))
die(g, "unmatched '('");
return atom;
}
if (accept(g, L_PLA)) {
atom = newnode(g, P_PLA);
atom->x = parsealt(g);
if (!accept(g, ')'))
die(g, "unmatched '('");
return atom;
}
if (accept(g, L_NLA)) {
atom = newnode(g, P_NLA);
atom->x = parsealt(g);
if (!accept(g, ')'))
die(g, "unmatched '('");
return atom;
}
die(g, "syntax error");
return NULL;
}
static Renode *parserep(struct cstate *g)
{
Renode *atom;
if (accept(g, '^')) return newnode(g, P_BOL);
if (accept(g, '$')) return newnode(g, P_EOL);
if (accept(g, L_WORD)) return newnode(g, P_WORD);
if (accept(g, L_NWORD)) return newnode(g, P_NWORD);
atom = parseatom(g);
if (g->lookahead == L_COUNT) {
int min = g->yymin, max = g->yymax;
next(g);
if (max < min)
die(g, "invalid quantifier");
return newrep(g, atom, accept(g, '?'), min, max);
}
if (accept(g, '*')) return newrep(g, atom, accept(g, '?'), 0, REPINF);
if (accept(g, '+')) return newrep(g, atom, accept(g, '?'), 1, REPINF);
if (accept(g, '?')) return newrep(g, atom, accept(g, '?'), 0, 1);
return atom;
}
static Renode *parsecat(struct cstate *g)
{
Renode *cat, *head, **tail;
if (g->lookahead && g->lookahead != '|' && g->lookahead != ')') {
/* Build a right-leaning tree by splicing in new 'cat' at the tail. */
head = parserep(g);
tail = &head;
while (g->lookahead && g->lookahead != '|' && g->lookahead != ')') {
cat = newnode(g, P_CAT);
cat->x = *tail;
cat->y = parserep(g);
*tail = cat;
tail = &cat->y;
}
return head;
}
return NULL;
}
static Renode *parsealt(struct cstate *g)
{
Renode *alt, *x;
alt = parsecat(g);
while (accept(g, '|')) {
x = alt;
alt = newnode(g, P_ALT);
alt->x = x;
alt->y = parsecat(g);
}
return alt;
}
/* Compile */
enum {
I_END, I_JUMP, I_SPLIT, I_PLA, I_NLA,
I_ANYNL, I_ANY, I_CHAR, I_CCLASS, I_NCCLASS, I_REF,
I_BOL, I_EOL, I_WORD, I_NWORD,
I_LPAR, I_RPAR
};
struct Reinst {
unsigned char opcode;
unsigned char n;
Rune c;
Reclass *cc;
Reinst *x;
Reinst *y;
};
static int count(struct cstate *g, Renode *node)
{
int min, max, n;
if (!node) return 0;
switch (node->type) {
default: return 1;
case P_CAT: return count(g, node->x) + count(g, node->y);
case P_ALT: return count(g, node->x) + count(g, node->y) + 2;
case P_REP:
min = node->m;
max = node->n;
if (min == max) n = count(g, node->x) * min;
else if (max < REPINF) n = count(g, node->x) * max + (max - min);
else n = count(g, node->x) * (min + 1) + 2;
if (n < 0 || n > MAXPROG) die(g, "program too large");
return n;
case P_PAR: return count(g, node->x) + 2;
case P_PLA: return count(g, node->x) + 2;
case P_NLA: return count(g, node->x) + 2;
}
}
static Reinst *emit(Reprog *prog, int opcode)
{
Reinst *inst = prog->end++;
inst->opcode = opcode;
inst->n = 0;
inst->c = 0;
inst->cc = NULL;
inst->x = inst->y = NULL;
return inst;
}
static void compile(Reprog *prog, Renode *node)
{
Reinst *inst, *split, *jump;
int i;
if (!node)
return;
loop:
switch (node->type) {
case P_CAT:
compile(prog, node->x);
node = node->y;
goto loop;
case P_ALT:
split = emit(prog, I_SPLIT);
compile(prog, node->x);
jump = emit(prog, I_JUMP);
compile(prog, node->y);
split->x = split + 1;
split->y = jump + 1;
jump->x = prog->end;
break;
case P_REP:
inst = NULL; /* silence compiler warning. assert(node->m > 0). */
for (i = 0; i < node->m; ++i) {
inst = prog->end;
compile(prog, node->x);
}
if (node->m == node->n)
break;
if (node->n < REPINF) {
for (i = node->m; i < node->n; ++i) {
split = emit(prog, I_SPLIT);
compile(prog, node->x);
if (node->ng) {
split->y = split + 1;
split->x = prog->end;
} else {
split->x = split + 1;
split->y = prog->end;
}
}
} else if (node->m == 0) {
split = emit(prog, I_SPLIT);
compile(prog, node->x);
jump = emit(prog, I_JUMP);
if (node->ng) {
split->y = split + 1;
split->x = prog->end;
} else {
split->x = split + 1;
split->y = prog->end;
}
jump->x = split;
} else {
split = emit(prog, I_SPLIT);
if (node->ng) {
split->y = inst;
split->x = prog->end;
} else {
split->x = inst;
split->y = prog->end;
}
}
break;
case P_BOL: emit(prog, I_BOL); break;
case P_EOL: emit(prog, I_EOL); break;
case P_WORD: emit(prog, I_WORD); break;
case P_NWORD: emit(prog, I_NWORD); break;
case P_PAR:
inst = emit(prog, I_LPAR);
inst->n = node->n;
compile(prog, node->x);
inst = emit(prog, I_RPAR);
inst->n = node->n;
break;
case P_PLA:
split = emit(prog, I_PLA);
compile(prog, node->x);
emit(prog, I_END);
split->x = split + 1;
split->y = prog->end;
break;
case P_NLA:
split = emit(prog, I_NLA);
compile(prog, node->x);
emit(prog, I_END);
split->x = split + 1;
split->y = prog->end;
break;
case P_ANY:
emit(prog, I_ANY);
break;
case P_CHAR:
inst = emit(prog, I_CHAR);
inst->c = (prog->flags & REG_ICASE) ? canon(node->c) : node->c;
break;
case P_CCLASS:
inst = emit(prog, I_CCLASS);
inst->cc = node->cc;
break;
case P_NCCLASS:
inst = emit(prog, I_NCCLASS);
inst->cc = node->cc;
break;
case P_REF:
inst = emit(prog, I_REF);
inst->n = node->n;
break;
}
}
#ifdef TEST
static void dumpnode(Renode *node)
{
Rune *p;
if (!node) { printf("Empty"); return; }
switch (node->type) {
case P_CAT: printf("Cat("); dumpnode(node->x); printf(", "); dumpnode(node->y); printf(")"); break;
case P_ALT: printf("Alt("); dumpnode(node->x); printf(", "); dumpnode(node->y); printf(")"); break;
case P_REP:
printf(node->ng ? "NgRep(%d,%d," : "Rep(%d,%d,", node->m, node->n);
dumpnode(node->x);
printf(")");
break;
case P_BOL: printf("Bol"); break;
case P_EOL: printf("Eol"); break;
case P_WORD: printf("Word"); break;
case P_NWORD: printf("NotWord"); break;
case P_PAR: printf("Par(%d,", node->n); dumpnode(node->x); printf(")"); break;
case P_PLA: printf("PLA("); dumpnode(node->x); printf(")"); break;
case P_NLA: printf("NLA("); dumpnode(node->x); printf(")"); break;
case P_ANY: printf("Any"); break;
case P_CHAR: printf("Char(%c)", node->c); break;
case P_CCLASS:
printf("Class(");
for (p = node->cc->spans; p < node->cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]);
printf(")");
break;
case P_NCCLASS:
printf("NotClass(");
for (p = node->cc->spans; p < node->cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]);
printf(")");
break;
case P_REF: printf("Ref(%d)", node->n); break;
}
}
static void dumpprog(Reprog *prog)
{
Reinst *inst;
int i;
for (i = 0, inst = prog->start; inst < prog->end; ++i, ++inst) {
printf("% 5d: ", i);
switch (inst->opcode) {
case I_END: puts("end"); break;
case I_JUMP: printf("jump %d\n", (int)(inst->x - prog->start)); break;
case I_SPLIT: printf("split %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
case I_PLA: printf("pla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
case I_NLA: printf("nla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
case I_ANY: puts("any"); break;
case I_ANYNL: puts("anynl"); break;
case I_CHAR: printf(inst->c >= 32 && inst->c < 127 ? "char '%c'\n" : "char U+%04X\n", inst->c); break;
case I_CCLASS: puts("cclass"); break;
case I_NCCLASS: puts("ncclass"); break;
case I_REF: printf("ref %d\n", inst->n); break;
case I_BOL: puts("bol"); break;
case I_EOL: puts("eol"); break;
case I_WORD: puts("word"); break;
case I_NWORD: puts("nword"); break;
case I_LPAR: printf("lpar %d\n", inst->n); break;
case I_RPAR: printf("rpar %d\n", inst->n); break;
}
}
}
#endif
Reprog *regcompx(void *(*alloc)(void *ctx, void *p, int n), void *ctx,
const char *pattern, int cflags, const char **errorp)
{
struct cstate g;
Renode *node;
Reinst *split, *jump;
int i, n;
g.pstart = NULL;
g.prog = NULL;
if (setjmp(g.kaboom)) {
if (errorp) *errorp = g.error;
alloc(ctx, g.pstart, 0);
alloc(ctx, g.prog, 0);
return NULL;
}
g.prog = alloc(ctx, NULL, sizeof (Reprog));
if (!g.prog)
die(&g, "cannot allocate regular expression");
n = strlen(pattern) * 2;
if (n > MAXPROG)
die(&g, "program too large");
if (n > 0) {
g.pstart = g.pend = alloc(ctx, NULL, sizeof (Renode) * n);
if (!g.pstart)
die(&g, "cannot allocate regular expression parse list");
}
g.source = pattern;
g.ncclass = 0;
g.nsub = 1;
for (i = 0; i < MAXSUB; ++i)
g.sub[i] = 0;
g.prog->flags = cflags;
next(&g);
node = parsealt(&g);
if (g.lookahead == ')')
die(&g, "unmatched ')'");
if (g.lookahead != 0)
die(&g, "syntax error");
#ifdef TEST
dumpnode(node);
putchar('\n');
#endif
n = 6 + count(&g, node);
if (n < 0 || n > MAXPROG)
die(&g, "program too large");
g.prog->nsub = g.nsub;
g.prog->start = g.prog->end = alloc(ctx, NULL, n * sizeof (Reinst));
if (!g.prog->start)
die(&g, "cannot allocate regular expression instruction list");
split = emit(g.prog, I_SPLIT);
split->x = split + 3;
split->y = split + 1;
emit(g.prog, I_ANYNL);
jump = emit(g.prog, I_JUMP);
jump->x = split;
emit(g.prog, I_LPAR);
compile(g.prog, node);
emit(g.prog, I_RPAR);
emit(g.prog, I_END);
#ifdef TEST
dumpprog(g.prog);
#endif
alloc(ctx, g.pstart, 0);
if (errorp) *errorp = NULL;
return g.prog;
}
void regfreex(void *(*alloc)(void *ctx, void *p, int n), void *ctx, Reprog *prog)
{
if (prog) {
alloc(ctx, prog->start, 0);
alloc(ctx, prog, 0);
}
}
static void *default_alloc(void *ctx, void *p, int n)
{
return realloc(p, (size_t)n);
}
Reprog *regcomp(const char *pattern, int cflags, const char **errorp)
{
return regcompx(default_alloc, NULL, pattern, cflags, errorp);
}
void regfree(Reprog *prog)
{
regfreex(default_alloc, NULL, prog);
}
/* Match */
static int isnewline(int c)
{
return c == 0xA || c == 0xD || c == 0x2028 || c == 0x2029;
}
static int iswordchar(int c)
{
return c == '_' ||
(c >= 'a' && c <= 'z') ||
(c >= 'A' && c <= 'Z') ||
(c >= '0' && c <= '9');
}
static int incclass(Reclass *cc, Rune c)
{
Rune *p;
for (p = cc->spans; p < cc->end; p += 2)
if (p[0] <= c && c <= p[1])
return 1;
return 0;
}
static int incclasscanon(Reclass *cc, Rune c)
{
Rune *p, r;
for (p = cc->spans; p < cc->end; p += 2)
for (r = p[0]; r <= p[1]; ++r)
if (c == canon(r))
return 1;
return 0;
}
static int strncmpcanon(const char *a, const char *b, int n)
{
Rune ra, rb;
int c;
while (n--) {
if (!*a) return -1;
if (!*b) return 1;
a += chartorune(&ra, a);
b += chartorune(&rb, b);
c = canon(ra) - canon(rb);
if (c)
return c;
}
return 0;
}
static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub *out)
{
Resub scratch;
int i;
Rune c;
for (;;) {
switch (pc->opcode) {
case I_END:
return 1;
case I_JUMP:
pc = pc->x;
break;
case I_SPLIT:
scratch = *out;
if (match(pc->x, sp, bol, flags, &scratch)) {
*out = scratch;
return 1;
}
pc = pc->y;
break;
case I_PLA:
if (!match(pc->x, sp, bol, flags, out))
return 0;
pc = pc->y;
break;
case I_NLA:
scratch = *out;
if (match(pc->x, sp, bol, flags, &scratch))
return 0;
pc = pc->y;
break;
case I_ANYNL:
sp += chartorune(&c, sp);
if (c == 0)
return 0;
pc = pc + 1;
break;
case I_ANY:
sp += chartorune(&c, sp);
if (c == 0)
return 0;
if (isnewline(c))
return 0;
pc = pc + 1;
break;
case I_CHAR:
sp += chartorune(&c, sp);
if (c == 0)
return 0;
if (flags & REG_ICASE)
c = canon(c);
if (c != pc->c)
return 0;
pc = pc + 1;
break;
case I_CCLASS:
sp += chartorune(&c, sp);
if (c == 0)
return 0;
if (flags & REG_ICASE) {
if (!incclasscanon(pc->cc, canon(c)))
return 0;
} else {
if (!incclass(pc->cc, c))
return 0;
}
pc = pc + 1;
break;
case I_NCCLASS:
sp += chartorune(&c, sp);
if (c == 0)
return 0;
if (flags & REG_ICASE) {
if (incclasscanon(pc->cc, canon(c)))
return 0;
} else {
if (incclass(pc->cc, c))
return 0;
}
pc = pc + 1;
break;
case I_REF:
i = out->sub[pc->n].ep - out->sub[pc->n].sp;
if (flags & REG_ICASE) {
if (strncmpcanon(sp, out->sub[pc->n].sp, i))
return 0;
} else {
if (strncmp(sp, out->sub[pc->n].sp, i))
return 0;
}
if (i > 0)
sp += i;
pc = pc + 1;
break;
case I_BOL:
if (sp == bol && !(flags & REG_NOTBOL)) {
pc = pc + 1;
break;
}
if (flags & REG_NEWLINE) {
if (sp > bol && isnewline(sp[-1])) {
pc = pc + 1;
break;
}
}
return 0;
case I_EOL:
if (*sp == 0) {
pc = pc + 1;
break;
}
if (flags & REG_NEWLINE) {
if (isnewline(*sp)) {
pc = pc + 1;
break;
}
}
return 0;
case I_WORD:
i = sp > bol && iswordchar(sp[-1]);
i ^= iswordchar(sp[0]);
if (!i)
return 0;
pc = pc + 1;
break;
case I_NWORD:
i = sp > bol && iswordchar(sp[-1]);
i ^= iswordchar(sp[0]);
if (i)
return 0;
pc = pc + 1;
break;
case I_LPAR:
out->sub[pc->n].sp = sp;
pc = pc + 1;
break;
case I_RPAR:
out->sub[pc->n].ep = sp;
pc = pc + 1;
break;
default:
return 0;
}
}
}
int regexec(Reprog *prog, const char *sp, Resub *sub, int eflags)
{
Resub scratch;
int i;
if (!sub)
sub = &scratch;
sub->nsub = prog->nsub;
for (i = 0; i < MAXSUB; ++i)
sub->sub[i].sp = sub->sub[i].ep = NULL;
return !match(prog->start, sp, sp, prog->flags | eflags, sub);
}
#ifdef TEST
int main(int argc, char **argv)
{
const char *error;
const char *s;
Reprog *p;
Resub m;
int i;
if (argc > 1) {
p = regcomp(argv[1], 0, &error);
if (!p) {
fprintf(stderr, "regcomp: %s\n", error);
return 1;
}
if (argc > 2) {
s = argv[2];
printf("nsub = %d\n", p->nsub);
if (!regexec(p, s, &m, 0)) {
for (i = 0; i < m.nsub; ++i) {
int n = m.sub[i].ep - m.sub[i].sp;
if (n > 0)
printf("match %d: s=%d e=%d n=%d '%.*s'\n", i, (int)(m.sub[i].sp - s), (int)(m.sub[i].ep - s), n, n, m.sub[i].sp);
else
printf("match %d: n=0 ''\n", i);
}
} else {
printf("no match\n");
}
}
}
return 0;
}
#endif