Improve Resub API.

Hold the subexpression count and array of matches inside a struct.
This commit is contained in:
Tor Andersson
2014-02-26 20:22:53 +01:00
parent 01d85a4994
commit bfe569921d
4 changed files with 86 additions and 82 deletions

View File

@@ -29,9 +29,9 @@ void js_newregexp(js_State *J, const char *pattern, int flags)
void js_RegExp_prototype_exec(js_State *J, js_Regexp *re, const char *text)
{
Resub m[REG_MAXSUB];
unsigned int i;
int opts;
Resub m;
opts = 0;
if (re->flags & JS_REGEXP_G) {
@@ -46,14 +46,14 @@ void js_RegExp_prototype_exec(js_State *J, js_Regexp *re, const char *text)
}
}
if (!js_regexec(re->prog, text, nelem(m), m, opts)) {
if (!js_regexec(re->prog, text, &m, opts)) {
js_newarray(J);
for (i = 0; i < nelem(m) && m[i].sp; ++i) {
js_pushlstring(J, m[i].sp, m[i].ep - m[i].sp);
for (i = 0; i < m.nsub; ++i) {
js_pushlstring(J, m.sub[i].sp, m.sub[i].ep - m.sub[i].sp);
js_setindex(J, -2, i);
}
if (re->flags & JS_REGEXP_G)
re->last = re->last + (m[0].ep - text);
re->last = re->last + (m.sub[0].ep - text);
return;
}
@@ -67,8 +67,8 @@ static void Rp_test(js_State *J, unsigned int argc)
{
js_Regexp *re;
const char *text;
Resub m[REG_MAXSUB];
int opts;
Resub m;
re = js_toregexp(J, 0);
text = js_tostring(J, 1);
@@ -86,9 +86,9 @@ static void Rp_test(js_State *J, unsigned int argc)
}
}
if (!js_regexec(re->prog, text, nelem(m), m, opts)) {
if (!js_regexec(re->prog, text, &m, opts)) {
if (re->flags & JS_REGEXP_G)
re->last = re->last + (m[0].ep - text);
re->last = re->last + (m.sub[0].ep - text);
js_pushboolean(J, 1);
return;
}

View File

@@ -307,10 +307,10 @@ static void S_fromCharCode(js_State *J, unsigned int argc)
static void Sp_match(js_State *J, unsigned int argc)
{
js_Regexp *re;
Resub m[REG_MAXSUB];
const char *text;
unsigned int len;
const char *a, *b, *c, *e;
Resub m;
text = js_tostring(J, 0);
@@ -335,11 +335,11 @@ static void Sp_match(js_State *J, unsigned int argc)
a = text;
e = text + strlen(text);
while (a <= e) {
if (js_regexec(re->prog, a, nelem(m), m, a > text ? REG_NOTBOL : 0))
if (js_regexec(re->prog, a, &m, a > text ? REG_NOTBOL : 0))
break;
b = m[0].sp;
c = m[0].ep;
b = m.sub[0].sp;
c = m.sub[0].ep;
js_pushlstring(J, b, c - b);
js_setindex(J, -2, len++);
@@ -353,8 +353,8 @@ static void Sp_match(js_State *J, unsigned int argc)
static void Sp_search(js_State *J, unsigned int argc)
{
js_Regexp *re;
Resub m[REG_MAXSUB];
const char *text;
Resub m;
text = js_tostring(J, 0);
@@ -367,8 +367,8 @@ static void Sp_search(js_State *J, unsigned int argc)
re = js_toregexp(J, -1);
if (!js_regexec(re->prog, text, nelem(m), m, 0))
js_pushnumber(J, js_utfptrtoidx(text, m[0].sp));
if (!js_regexec(re->prog, text, &m, 0))
js_pushnumber(J, js_utfptrtoidx(text, m.sub[0].sp));
else
js_pushnumber(J, -1);
}
@@ -376,15 +376,15 @@ static void Sp_search(js_State *J, unsigned int argc)
static void Sp_replace_regexp(js_State *J, unsigned int argc)
{
js_Regexp *re;
Resub m[REG_MAXSUB];
const char *source, *s, *r;
js_Buffer *sb = NULL;
int n, x;
unsigned int n, x;
Resub m;
source = js_tostring(J, 0);
re = js_toregexp(J, 1);
if (js_regexec(re->prog, source, nelem(m), m, 0)) {
if (js_regexec(re->prog, source, &m, 0)) {
js_copy(J, 0);
return;
}
@@ -392,14 +392,14 @@ static void Sp_replace_regexp(js_State *J, unsigned int argc)
re->last = 0;
loop:
s = m[0].sp;
n = m[0].ep - m[0].sp;
s = m.sub[0].sp;
n = m.sub[0].ep - m.sub[0].sp;
if (js_iscallable(J, 2)) {
js_copy(J, 2);
js_pushglobal(J);
for (x = 0; m[x].sp; ++x) /* arg 0..x: substring and subexps that matched */
js_pushlstring(J, m[x].sp, m[x].ep - m[x].sp);
for (x = 0; m.sub[x].sp; ++x) /* arg 0..x: substring and subexps that matched */
js_pushlstring(J, m.sub[x].sp, m.sub[x].ep - m.sub[x].sp);
js_pushnumber(J, s - source); /* arg x+2: offset within search string */
js_copy(J, 0); /* arg x+3: search string */
js_call(J, 2 + x);
@@ -425,8 +425,8 @@ loop:
if (r[1] >= '0' && r[1] <= '9')
x = x * 10 + *(++r) - '0';
// TODO: use prog->nsub somehow
if (x > 0 && x < REG_MAXSUB && m[x].sp) {
sb_putm(&sb, m[x].sp, m[x].ep);
if (x > 0 && x < m.nsub) {
sb_putm(&sb, m.sub[x].sp, m.sub[x].ep);
} else {
sb_putc(&sb, '$');
if (x > 10) {
@@ -450,14 +450,14 @@ loop:
}
if (re->flags & JS_REGEXP_G) {
source = m[0].ep;
source = m.sub[0].ep;
if (n == 0) {
if (*source)
sb_putc(&sb, *source++);
else
goto end;
}
if (!js_regexec(re->prog, source, nelem(m), m, REG_NOTBOL))
if (!js_regexec(re->prog, source, &m, REG_NOTBOL))
goto loop;
}
@@ -544,10 +544,10 @@ static void Sp_replace(js_State *J, unsigned int argc)
static void Sp_split_regexp(js_State *J, unsigned int argc)
{
js_Regexp *re;
Resub m[REG_MAXSUB];
const char *text;
unsigned int limit, len, k;
const char *p, *a, *b, *c, *e;
Resub m;
text = js_tostring(J, 0);
re = js_toregexp(J, 1);
@@ -560,7 +560,7 @@ static void Sp_split_regexp(js_State *J, unsigned int argc)
/* splitting the empty string */
if (e == 0) {
if (js_regexec(re->prog, text, nelem(m), m, 0)) {
if (js_regexec(re->prog, text, &m, 0)) {
if (len == limit) return;
js_pushliteral(J, "");
js_setindex(J, -2, 0);
@@ -570,11 +570,11 @@ static void Sp_split_regexp(js_State *J, unsigned int argc)
p = a = text;
while (a < e) {
if (js_regexec(re->prog, a, nelem(m), m, a > text ? REG_NOTBOL : 0))
if (js_regexec(re->prog, a, &m, a > text ? REG_NOTBOL : 0))
break; /* no match */
b = m[0].sp;
c = m[0].ep;
b = m.sub[0].sp;
c = m.sub[0].ep;
/* empty string at end of last match */
if (b == p) {
@@ -586,9 +586,9 @@ static void Sp_split_regexp(js_State *J, unsigned int argc)
js_pushlstring(J, p, b - p);
js_setindex(J, -2, len++);
for (k = 1; k < nelem(m) && m[k].sp; ++k) {
for (k = 1; k < m.nsub; ++k) {
if (len == limit) return;
js_pushlstring(J, m[k].sp, m[k].ep - m[k].sp);
js_pushlstring(J, m.sub[k].sp, m.sub[k].ep - m.sub[k].sp);
js_setindex(J, -2, len++);
}

88
regex.c
View File

@@ -30,7 +30,7 @@ struct Reclass {
struct Reprog {
Reinst *start, *end;
int flags;
unsigned int ncap;
unsigned int nsub;
Reclass cclass[16];
};
@@ -40,8 +40,8 @@ struct cstate {
const char *source;
unsigned int ncclass;
unsigned int ncap;
Renode *cap[MAXSUB];
unsigned int nsub;
Renode *sub[MAXSUB];
int lookahead;
Rune yychar;
@@ -77,7 +77,7 @@ enum {
L_NLA, /* "(?!" negative lookahead */
L_WORD, /* "\b" word boundary */
L_NWORD, /* "\B" non-word boundary */
L_REF, /* "\0" back-reference */
L_REF, /* "\1" back-reference */
L_COUNT, /* {M,N} */
};
@@ -459,10 +459,10 @@ static Renode *parseatom(struct cstate *g)
}
if (g->lookahead == L_REF) {
atom = newnode(g, P_REF);
if (g->yychar == 0 || g->yychar > g->ncap || !g->cap[g->yychar])
if (g->yychar == 0 || g->yychar > g->nsub || !g->sub[g->yychar])
die(g, "invalid back-reference");
atom->n = g->yychar;
atom->x = g->cap[g->yychar];
atom->x = g->sub[g->yychar];
next(g);
return atom;
}
@@ -470,12 +470,11 @@ static Renode *parseatom(struct cstate *g)
return newnode(g, P_ANY);
if (accept(g, '(')) {
atom = newnode(g, P_PAR);
if (++g->ncap == MAXSUB)
if (g->nsub == MAXSUB)
die(g, "too many captures");
atom->n = g->ncap;
g->cap[atom->n] = NULL;
atom->n = g->nsub++;
atom->x = parsealt(g);
g->cap[atom->n] = atom;
g->sub[atom->n] = atom;
if (!accept(g, ')'))
die(g, "unmatched '('");
return atom;
@@ -805,9 +804,9 @@ Reprog *regcomp(const char *pattern, int cflags, const char **errorp)
g.source = pattern;
g.ncclass = 0;
g.ncap = 0;
g.nsub = 1;
for (i = 0; i < MAXSUB; ++i)
g.cap[i] = 0;
g.sub[i] = 0;
g.prog->flags = cflags;
@@ -818,7 +817,7 @@ Reprog *regcomp(const char *pattern, int cflags, const char **errorp)
if (g.lookahead != 0)
die(&g, "syntax error");
g.prog->ncap = g.ncap;
g.prog->nsub = g.nsub;
g.prog->start = g.prog->end = malloc((count(node) + 6) * sizeof (Reinst));
split = emit(g.prog, I_SPLIT);
@@ -905,21 +904,21 @@ static int strncmpcanon(const char *a, const char *b, unsigned int n)
struct Rethread {
Reinst *pc;
const char *sp;
Resub sub[MAXSUB];
Resub sub;
};
static void spawn(Rethread *t, Reinst *pc, const char *sp, Resub *sub)
{
t->pc = pc;
t->sp = sp;
memcpy(t->sub, sub, sizeof t->sub);
memcpy(&t->sub, sub, sizeof t->sub);
}
static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub *out)
{
Rethread ready[MAXTHREAD];
Resub scrap[MAXSUB];
Resub sub[MAXSUB];
Resub scratch;
Resub sub;
Rune c;
unsigned int nready;
int i;
@@ -933,13 +932,13 @@ static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub *
--nready;
pc = ready[nready].pc;
sp = ready[nready].sp;
memcpy(sub, ready[nready].sub, sizeof sub);
memcpy(&sub, &ready[nready].sub, sizeof sub);
for (;;) {
switch (pc->opcode) {
case I_END:
for (i = 0; i < MAXSUB; ++i) {
out[i].sp = sub[i].sp;
out[i].ep = sub[i].ep;
out->sub[i].sp = sub.sub[i].sp;
out->sub[i].ep = sub.sub[i].ep;
}
return 1;
case I_JUMP:
@@ -950,18 +949,18 @@ static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub *
fprintf(stderr, "regexec: backtrack overflow!\n");
return 0;
}
spawn(&ready[nready++], pc->y, sp, sub);
spawn(&ready[nready++], pc->y, sp, &sub);
pc = pc->x;
continue;
case I_PLA:
if (!match(pc->x, sp, bol, flags, sub))
if (!match(pc->x, sp, bol, flags, &sub))
goto dead;
pc = pc->y;
continue;
case I_NLA:
memcpy(scrap, sub, sizeof scrap);
if (match(pc->x, sp, bol, flags, scrap))
memcpy(&scratch, &sub, sizeof scratch);
if (match(pc->x, sp, bol, flags, &scratch))
goto dead;
pc = pc->y;
continue;
@@ -1012,12 +1011,12 @@ static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub *
}
break;
case I_REF:
i = sub[pc->n].ep - sub[pc->n].sp;
i = sub.sub[pc->n].ep - sub.sub[pc->n].sp;
if (flags & REG_ICASE) {
if (strncmpcanon(sp, sub[pc->n].sp, i))
if (strncmpcanon(sp, sub.sub[pc->n].sp, i))
goto dead;
} else {
if (strncmp(sp, sub[pc->n].sp, i))
if (strncmp(sp, sub.sub[pc->n].sp, i))
goto dead;
}
if (i > 0)
@@ -1052,10 +1051,10 @@ static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub *
goto dead;
case I_LPAR:
sub[pc->n].sp = sp;
sub.sub[pc->n].sp = sp;
break;
case I_RPAR:
sub[pc->n].ep = sp;
sub.sub[pc->n].ep = sp;
break;
default:
goto dead;
@@ -1067,17 +1066,19 @@ dead: ;
return 0;
}
int regexec(Reprog *prog, const char *sp, int n, Resub *m, int eflags)
int regexec(Reprog *prog, const char *sp, Resub *sub, int eflags)
{
Resub gm[MAXSUB];
unsigned int i;
Resub scratch;
int i;
m = m ? m : gm;
if (!sub)
sub = &scratch;
sub->nsub = prog->nsub;
for (i = 0; i < MAXSUB; ++i)
m[i].sp = m[i].ep = i <= prog->ncap ? sp : NULL;
sub->sub[i].sp = sub->sub[i].ep = NULL;
return !match(prog->start, sp, sp, prog->flags | eflags, m);
return !match(prog->start, sp, sp, prog->flags | eflags, sub);
}
#ifdef TEST
@@ -1086,8 +1087,8 @@ int main(int argc, char **argv)
const char *error;
const char *s;
Reprog *p;
Resub m[MAXSUB];
int i;
Resub m;
unsigned int i;
if (argc > 1) {
p = regcomp(argv[1], 0, &error);
@@ -1098,13 +1099,12 @@ int main(int argc, char **argv)
if (argc > 2) {
s = argv[2];
printf("ncap = %d\n", p->ncap);
if (!regexec(p, s, MAXSUB, m, 0)) {
for (i = 0; i < MAXSUB; ++i)
if (m[i].sp) {
int n = m[i].ep - m[i].sp;
printf("match %d: s=%d e=%d n=%d '%.*s'\n", i, (int)(m[i].sp - s), (int)(m[i].ep - s), n, n, m[i].sp);
}
printf("nsub = %d\n", p->nsub);
if (!regexec(p, s, &m, 0)) {
for (i = 0; i < m.nsub; ++i) {
int n = m.sub[i].ep - m.sub[i].sp;
printf("match %d: s=%d e=%d n=%d '%.*s'\n", i, (int)(m.sub[i].sp - s), (int)(m.sub[i].ep - s), n, n, m.sub[i].sp);
}
} else {
printf("no match\n");
}

14
regex.h
View File

@@ -7,13 +7,9 @@
typedef struct Reprog Reprog;
typedef struct Resub Resub;
struct Resub {
const char *sp;
const char *ep;
};
Reprog *regcomp(const char *pattern, int cflags, const char **errorp);
int regexec(Reprog *prog, const char *string, int nmatch, Resub *pmatch, int eflags);
int regexec(Reprog *prog, const char *string, Resub *sub, int eflags);
void regfree(Reprog *prog);
enum {
@@ -28,4 +24,12 @@ enum {
REG_MAXSUB = 16
};
/*
 * Match result filled in by regexec(): a count of subexpression slots
 * together with the text span recorded for each slot.
 */
struct Resub {
/* Number of slots in sub[] that callers iterate over; regexec() copies it from the compiled program's nsub. */
unsigned int nsub;
/*
 * sub[0] spans the whole match; sub[1..] hold the parenthesised subexpressions.
 * regexec() resets every sp/ep to NULL before matching.
 * NOTE(review): a slot whose sp is still NULL afterwards presumably did not take part in the match - confirm.
 */
struct {
const char *sp; /* start of the captured text */
const char *ep; /* end of the captured text; callers take the length as ep - sp */
} sub[REG_MAXSUB];
};
#endif