From bfe569921d63fdbb29fe06c8e19ac402e009b960 Mon Sep 17 00:00:00 2001 From: Tor Andersson Date: Wed, 26 Feb 2014 20:22:53 +0100 Subject: [PATCH] Improve Resub API. Hold the subexpression count and array of matches inside a struct. --- jsregexp.c | 16 +++++----- jsstring.c | 50 +++++++++++++++---------------- regex.c | 88 +++++++++++++++++++++++++++--------------------------- regex.h | 14 +++++---- 4 files changed, 86 insertions(+), 82 deletions(-) diff --git a/jsregexp.c b/jsregexp.c index 6b2972f..164ab61 100644 --- a/jsregexp.c +++ b/jsregexp.c @@ -29,9 +29,9 @@ void js_newregexp(js_State *J, const char *pattern, int flags) void js_RegExp_prototype_exec(js_State *J, js_Regexp *re, const char *text) { - Resub m[REG_MAXSUB]; unsigned int i; int opts; + Resub m; opts = 0; if (re->flags & JS_REGEXP_G) { @@ -46,14 +46,14 @@ void js_RegExp_prototype_exec(js_State *J, js_Regexp *re, const char *text) } } - if (!js_regexec(re->prog, text, nelem(m), m, opts)) { + if (!js_regexec(re->prog, text, &m, opts)) { js_newarray(J); - for (i = 0; i < nelem(m) && m[i].sp; ++i) { - js_pushlstring(J, m[i].sp, m[i].ep - m[i].sp); + for (i = 0; i < m.nsub; ++i) { + js_pushlstring(J, m.sub[i].sp, m.sub[i].ep - m.sub[i].sp); js_setindex(J, -2, i); } if (re->flags & JS_REGEXP_G) - re->last = re->last + (m[0].ep - text); + re->last = re->last + (m.sub[0].ep - text); return; } @@ -67,8 +67,8 @@ static void Rp_test(js_State *J, unsigned int argc) { js_Regexp *re; const char *text; - Resub m[REG_MAXSUB]; int opts; + Resub m; re = js_toregexp(J, 0); text = js_tostring(J, 1); @@ -86,9 +86,9 @@ static void Rp_test(js_State *J, unsigned int argc) } } - if (!js_regexec(re->prog, text, nelem(m), m, opts)) { + if (!js_regexec(re->prog, text, &m, opts)) { if (re->flags & JS_REGEXP_G) - re->last = re->last + (m[0].ep - text); + re->last = re->last + (m.sub[0].ep - text); js_pushboolean(J, 1); return; } diff --git a/jsstring.c b/jsstring.c index f44b1b3..2c63b0c 100644 --- a/jsstring.c +++ b/jsstring.c @@ -307,10 +307,10 @@ static void S_fromCharCode(js_State *J, unsigned int argc) static void Sp_match(js_State *J, unsigned int argc) { js_Regexp *re; - Resub m[REG_MAXSUB]; const char *text; unsigned int len; const char *a, *b, *c, *e; + Resub m; text = js_tostring(J, 0); @@ -335,11 +335,11 @@ static void Sp_match(js_State *J, unsigned int argc) a = text; e = text + strlen(text); while (a <= e) { - if (js_regexec(re->prog, a, nelem(m), m, a > text ? REG_NOTBOL : 0)) + if (js_regexec(re->prog, a, &m, a > text ? REG_NOTBOL : 0)) break; - b = m[0].sp; - c = m[0].ep; + b = m.sub[0].sp; + c = m.sub[0].ep; js_pushlstring(J, b, c - b); js_setindex(J, -2, len++); @@ -353,8 +353,8 @@ static void Sp_match(js_State *J, unsigned int argc) static void Sp_search(js_State *J, unsigned int argc) { js_Regexp *re; - Resub m[REG_MAXSUB]; const char *text; + Resub m; text = js_tostring(J, 0); @@ -367,8 +367,8 @@ static void Sp_search(js_State *J, unsigned int argc) re = js_toregexp(J, -1); - if (!js_regexec(re->prog, text, nelem(m), m, 0)) - js_pushnumber(J, js_utfptrtoidx(text, m[0].sp)); + if (!js_regexec(re->prog, text, &m, 0)) + js_pushnumber(J, js_utfptrtoidx(text, m.sub[0].sp)); else js_pushnumber(J, -1); } @@ -376,15 +376,15 @@ static void Sp_search(js_State *J, unsigned int argc) static void Sp_replace_regexp(js_State *J, unsigned int argc) { js_Regexp *re; - Resub m[REG_MAXSUB]; const char *source, *s, *r; js_Buffer *sb = NULL; - int n, x; + unsigned int n, x; + Resub m; source = js_tostring(J, 0); re = js_toregexp(J, 1); - if (js_regexec(re->prog, source, nelem(m), m, 0)) { + if (js_regexec(re->prog, source, &m, 0)) { js_copy(J, 0); return; } @@ -392,14 +392,14 @@ static void Sp_replace_regexp(js_State *J, unsigned int argc) re->last = 0; loop: - s = m[0].sp; - n = m[0].ep - m[0].sp; + s = m.sub[0].sp; + n = m.sub[0].ep - m.sub[0].sp; if (js_iscallable(J, 2)) { js_copy(J, 2); js_pushglobal(J); - for (x = 0; m[x].sp; ++x) /* arg 0..x: substring and subexps that matched */ - js_pushlstring(J, m[x].sp, m[x].ep - m[x].sp); + for (x = 0; m.sub[x].sp; ++x) /* arg 0..x: substring and subexps that matched */ + js_pushlstring(J, m.sub[x].sp, m.sub[x].ep - m.sub[x].sp); js_pushnumber(J, s - source); /* arg x+2: offset within search string */ js_copy(J, 0); /* arg x+3: search string */ js_call(J, 2 + x); @@ -425,8 +425,8 @@ loop: if (r[1] >= '0' && r[1] <= '9') x = x * 10 + *(++r) - '0'; // TODO: use prog->nsub somehow - if (x > 0 && x < REG_MAXSUB && m[x].sp) { - sb_putm(&sb, m[x].sp, m[x].ep); + if (x > 0 && x < m.nsub) { + sb_putm(&sb, m.sub[x].sp, m.sub[x].ep); } else { sb_putc(&sb, '$'); if (x > 10) { @@ -450,14 +450,14 @@ loop: } if (re->flags & JS_REGEXP_G) { - source = m[0].ep; + source = m.sub[0].ep; if (n == 0) { if (*source) sb_putc(&sb, *source++); else goto end; } - if (!js_regexec(re->prog, source, nelem(m), m, REG_NOTBOL)) + if (!js_regexec(re->prog, source, &m, REG_NOTBOL)) goto loop; } @@ -544,10 +544,10 @@ static void Sp_replace(js_State *J, unsigned int argc) static void Sp_split_regexp(js_State *J, unsigned int argc) { js_Regexp *re; - Resub m[REG_MAXSUB]; const char *text; unsigned int limit, len, k; const char *p, *a, *b, *c, *e; + Resub m; text = js_tostring(J, 0); re = js_toregexp(J, 1); @@ -560,7 +560,7 @@ static void Sp_split_regexp(js_State *J, unsigned int argc) /* splitting the empty string */ if (e == 0) { - if (js_regexec(re->prog, text, nelem(m), m, 0)) { + if (js_regexec(re->prog, text, &m, 0)) { if (len == limit) return; js_pushliteral(J, ""); js_setindex(J, -2, 0); @@ -570,11 +570,11 @@ static void Sp_split_regexp(js_State *J, unsigned int argc) p = a = text; while (a < e) { - if (js_regexec(re->prog, a, nelem(m), m, a > text ? REG_NOTBOL : 0)) + if (js_regexec(re->prog, a, &m, a > text ? REG_NOTBOL : 0)) break; /* no match */ - b = m[0].sp; - c = m[0].ep; + b = m.sub[0].sp; + c = m.sub[0].ep; /* empty string at end of last match */ if (b == p) { @@ -586,9 +586,9 @@ static void Sp_split_regexp(js_State *J, unsigned int argc) js_pushlstring(J, p, b - p); js_setindex(J, -2, len++); - for (k = 1; k < nelem(m) && m[k].sp; ++k) { + for (k = 1; k < m.nsub; ++k) { if (len == limit) return; - js_pushlstring(J, m[k].sp, m[k].ep - m[k].sp); + js_pushlstring(J, m.sub[k].sp, m.sub[k].ep - m.sub[k].sp); js_setindex(J, -2, len++); } diff --git a/regex.c b/regex.c index 6434cb2..faa4899 100644 --- a/regex.c +++ b/regex.c @@ -30,7 +30,7 @@ struct Reclass { struct Reprog { Reinst *start, *end; int flags; - unsigned int ncap; + unsigned int nsub; Reclass cclass[16]; }; @@ -40,8 +40,8 @@ struct cstate { const char *source; unsigned int ncclass; - unsigned int ncap; - Renode *cap[MAXSUB]; + unsigned int nsub; + Renode *sub[MAXSUB]; int lookahead; Rune yychar; @@ -77,7 +77,7 @@ enum { L_NLA, /* "(?!" negative lookahead */ L_WORD, /* "\b" word boundary */ L_NWORD, /* "\B" non-word boundary */ - L_REF, /* "\0" back-reference */ + L_REF, /* "\1" back-reference */ L_COUNT, /* {M,N} */ }; @@ -459,10 +459,10 @@ static Renode *parseatom(struct cstate *g) } if (g->lookahead == L_REF) { atom = newnode(g, P_REF); - if (g->yychar == 0 || g->yychar > g->ncap || !g->cap[g->yychar]) + if (g->yychar == 0 || g->yychar > g->nsub || !g->sub[g->yychar]) die(g, "invalid back-reference"); atom->n = g->yychar; - atom->x = g->cap[g->yychar]; + atom->x = g->sub[g->yychar]; next(g); return atom; } @@ -470,12 +470,11 @@ static Renode *parseatom(struct cstate *g) return newnode(g, P_ANY); if (accept(g, '(')) { atom = newnode(g, P_PAR); - if (++g->ncap == MAXSUB) + if (g->nsub == MAXSUB) die(g, "too many captures"); - atom->n = g->ncap; - g->cap[atom->n] = NULL; + atom->n = g->nsub++; atom->x = parsealt(g); - g->cap[atom->n] = atom; + g->sub[atom->n] = atom; if (!accept(g, ')')) die(g, "unmatched '('"); return atom; @@ -805,9 +804,9 @@ Reprog *regcomp(const char *pattern, int cflags, const char **errorp) g.source = pattern; g.ncclass = 0; - g.ncap = 0; + g.nsub = 1; for (i = 0; i < MAXSUB; ++i) - g.cap[i] = 0; + g.sub[i] = 0; g.prog->flags = cflags; @@ -818,7 +817,7 @@ Reprog *regcomp(const char *pattern, int cflags, const char **errorp) if (g.lookahead != 0) die(&g, "syntax error"); - g.prog->ncap = g.ncap; + g.prog->nsub = g.nsub; g.prog->start = g.prog->end = malloc((count(node) + 6) * sizeof (Reinst)); split = emit(g.prog, I_SPLIT); @@ -905,21 +904,21 @@ static int strncmpcanon(const char *a, const char *b, unsigned int n) struct Rethread { Reinst *pc; const char *sp; - Resub sub[MAXSUB]; + Resub sub; }; static void spawn(Rethread *t, Reinst *pc, const char *sp, Resub *sub) { t->pc = pc; t->sp = sp; - memcpy(t->sub, sub, sizeof t->sub); + memcpy(&t->sub, sub, sizeof t->sub); } static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub *out) { Rethread ready[MAXTHREAD]; - Resub scrap[MAXSUB]; - Resub sub[MAXSUB]; + Resub scratch; + Resub sub; Rune c; unsigned int nready; int i; @@ -933,13 +932,13 @@ static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub * --nready; pc = ready[nready].pc; sp = ready[nready].sp; - memcpy(sub, ready[nready].sub, sizeof sub); + memcpy(&sub, &ready[nready].sub, sizeof sub); for (;;) { switch (pc->opcode) { case I_END: for (i = 0; i < MAXSUB; ++i) { - out[i].sp = sub[i].sp; - out[i].ep = sub[i].ep; + out->sub[i].sp = sub.sub[i].sp; + out->sub[i].ep = sub.sub[i].ep; } return 1; case I_JUMP: @@ -950,18 +949,18 @@ static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub * fprintf(stderr, "regexec: backtrack overflow!\n"); return 0; } - spawn(&ready[nready++], pc->y, sp, sub); + spawn(&ready[nready++], pc->y, sp, &sub); pc = pc->x; continue; case I_PLA: - if (!match(pc->x, sp, bol, flags, sub)) + if (!match(pc->x, sp, bol, flags, &sub)) goto dead; pc = pc->y; continue; case I_NLA: - memcpy(scrap, sub, sizeof scrap); - if (match(pc->x, sp, bol, flags, scrap)) + memcpy(&scratch, &sub, sizeof scratch); + if (match(pc->x, sp, bol, flags, &scratch)) goto dead; pc = pc->y; continue; @@ -1012,12 +1011,12 @@ static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub * } break; case I_REF: - i = sub[pc->n].ep - sub[pc->n].sp; + i = sub.sub[pc->n].ep - sub.sub[pc->n].sp; if (flags & REG_ICASE) { - if (strncmpcanon(sp, sub[pc->n].sp, i)) + if (strncmpcanon(sp, sub.sub[pc->n].sp, i)) goto dead; } else { - if (strncmp(sp, sub[pc->n].sp, i)) + if (strncmp(sp, sub.sub[pc->n].sp, i)) goto dead; } if (i > 0) @@ -1052,10 +1051,10 @@ static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub * goto dead; case I_LPAR: - sub[pc->n].sp = sp; + sub.sub[pc->n].sp = sp; break; case I_RPAR: - sub[pc->n].ep = sp; + sub.sub[pc->n].ep = sp; break; default: goto dead; @@ -1067,17 +1066,19 @@ dead: ; return 0; } -int regexec(Reprog *prog, const char *sp, int n, Resub *m, int eflags) +int regexec(Reprog *prog, const char *sp, Resub *sub, int eflags) { - Resub gm[MAXSUB]; - unsigned int i; + Resub scratch; + int i; - m = m ? m : gm; + if (!sub) + sub = &scratch; + sub->nsub = prog->nsub; for (i = 0; i < MAXSUB; ++i) - m[i].sp = m[i].ep = i <= prog->ncap ? sp : NULL; + sub->sub[i].sp = sub->sub[i].ep = NULL; - return !match(prog->start, sp, sp, prog->flags | eflags, m); + return !match(prog->start, sp, sp, prog->flags | eflags, sub); } #ifdef TEST @@ -1086,8 +1087,8 @@ int main(int argc, char **argv) const char *error; const char *s; Reprog *p; - Resub m[MAXSUB]; - int i; + Resub m; + unsigned int i; if (argc > 1) { p = regcomp(argv[1], 0, &error); @@ -1098,13 +1099,12 @@ int main(int argc, char **argv) if (argc > 2) { s = argv[2]; - printf("ncap = %d\n", p->ncap); - if (!regexec(p, s, MAXSUB, m, 0)) { - for (i = 0; i < MAXSUB; ++i) - if (m[i].sp) { - int n = m[i].ep - m[i].sp; - printf("match %d: s=%d e=%d n=%d '%.*s'\n", i, (int)(m[i].sp - s), (int)(m[i].ep - s), n, n, m[i].sp); - } + printf("nsub = %d\n", p->nsub); + if (!regexec(p, s, &m, 0)) { + for (i = 0; i < m.nsub; ++i) { + int n = m.sub[i].ep - m.sub[i].sp; + printf("match %d: s=%d e=%d n=%d '%.*s'\n", i, (int)(m.sub[i].sp - s), (int)(m.sub[i].ep - s), n, n, m.sub[i].sp); + } } else { printf("no match\n"); } diff --git a/regex.h b/regex.h index dd44143..3164a46 100644 --- a/regex.h +++ b/regex.h @@ -7,13 +7,9 @@ typedef struct Reprog Reprog; typedef struct Resub Resub; -struct Resub { - const char *sp; - const char *ep; -}; Reprog *regcomp(const char *pattern, int cflags, const char **errorp); -int regexec(Reprog *prog, const char *string, int nmatch, Resub *pmatch, int eflags); +int regexec(Reprog *prog, const char *string, Resub *sub, int eflags); void regfree(Reprog *prog); enum { @@ -28,4 +24,12 @@ enum { REG_MAXSUB = 16 }; +struct Resub { + unsigned int nsub; + struct { + const char *sp; + const char *ep; + } sub[REG_MAXSUB]; +}; + #endif