mirror of
https://github.com/ccxvii/mujs.git
synced 2026-02-06 09:51:41 +08:00
Improve Resub API.
Hold the subexpression count and array of matches inside a struct.
This commit is contained in:
16
jsregexp.c
16
jsregexp.c
@@ -29,9 +29,9 @@ void js_newregexp(js_State *J, const char *pattern, int flags)
|
||||
|
||||
void js_RegExp_prototype_exec(js_State *J, js_Regexp *re, const char *text)
|
||||
{
|
||||
Resub m[REG_MAXSUB];
|
||||
unsigned int i;
|
||||
int opts;
|
||||
Resub m;
|
||||
|
||||
opts = 0;
|
||||
if (re->flags & JS_REGEXP_G) {
|
||||
@@ -46,14 +46,14 @@ void js_RegExp_prototype_exec(js_State *J, js_Regexp *re, const char *text)
|
||||
}
|
||||
}
|
||||
|
||||
if (!js_regexec(re->prog, text, nelem(m), m, opts)) {
|
||||
if (!js_regexec(re->prog, text, &m, opts)) {
|
||||
js_newarray(J);
|
||||
for (i = 0; i < nelem(m) && m[i].sp; ++i) {
|
||||
js_pushlstring(J, m[i].sp, m[i].ep - m[i].sp);
|
||||
for (i = 0; i < m.nsub; ++i) {
|
||||
js_pushlstring(J, m.sub[i].sp, m.sub[i].ep - m.sub[i].sp);
|
||||
js_setindex(J, -2, i);
|
||||
}
|
||||
if (re->flags & JS_REGEXP_G)
|
||||
re->last = re->last + (m[0].ep - text);
|
||||
re->last = re->last + (m.sub[0].ep - text);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -67,8 +67,8 @@ static void Rp_test(js_State *J, unsigned int argc)
|
||||
{
|
||||
js_Regexp *re;
|
||||
const char *text;
|
||||
Resub m[REG_MAXSUB];
|
||||
int opts;
|
||||
Resub m;
|
||||
|
||||
re = js_toregexp(J, 0);
|
||||
text = js_tostring(J, 1);
|
||||
@@ -86,9 +86,9 @@ static void Rp_test(js_State *J, unsigned int argc)
|
||||
}
|
||||
}
|
||||
|
||||
if (!js_regexec(re->prog, text, nelem(m), m, opts)) {
|
||||
if (!js_regexec(re->prog, text, &m, opts)) {
|
||||
if (re->flags & JS_REGEXP_G)
|
||||
re->last = re->last + (m[0].ep - text);
|
||||
re->last = re->last + (m.sub[0].ep - text);
|
||||
js_pushboolean(J, 1);
|
||||
return;
|
||||
}
|
||||
|
||||
50
jsstring.c
50
jsstring.c
@@ -307,10 +307,10 @@ static void S_fromCharCode(js_State *J, unsigned int argc)
|
||||
static void Sp_match(js_State *J, unsigned int argc)
|
||||
{
|
||||
js_Regexp *re;
|
||||
Resub m[REG_MAXSUB];
|
||||
const char *text;
|
||||
unsigned int len;
|
||||
const char *a, *b, *c, *e;
|
||||
Resub m;
|
||||
|
||||
text = js_tostring(J, 0);
|
||||
|
||||
@@ -335,11 +335,11 @@ static void Sp_match(js_State *J, unsigned int argc)
|
||||
a = text;
|
||||
e = text + strlen(text);
|
||||
while (a <= e) {
|
||||
if (js_regexec(re->prog, a, nelem(m), m, a > text ? REG_NOTBOL : 0))
|
||||
if (js_regexec(re->prog, a, &m, a > text ? REG_NOTBOL : 0))
|
||||
break;
|
||||
|
||||
b = m[0].sp;
|
||||
c = m[0].ep;
|
||||
b = m.sub[0].sp;
|
||||
c = m.sub[0].ep;
|
||||
|
||||
js_pushlstring(J, b, c - b);
|
||||
js_setindex(J, -2, len++);
|
||||
@@ -353,8 +353,8 @@ static void Sp_match(js_State *J, unsigned int argc)
|
||||
static void Sp_search(js_State *J, unsigned int argc)
|
||||
{
|
||||
js_Regexp *re;
|
||||
Resub m[REG_MAXSUB];
|
||||
const char *text;
|
||||
Resub m;
|
||||
|
||||
text = js_tostring(J, 0);
|
||||
|
||||
@@ -367,8 +367,8 @@ static void Sp_search(js_State *J, unsigned int argc)
|
||||
|
||||
re = js_toregexp(J, -1);
|
||||
|
||||
if (!js_regexec(re->prog, text, nelem(m), m, 0))
|
||||
js_pushnumber(J, js_utfptrtoidx(text, m[0].sp));
|
||||
if (!js_regexec(re->prog, text, &m, 0))
|
||||
js_pushnumber(J, js_utfptrtoidx(text, m.sub[0].sp));
|
||||
else
|
||||
js_pushnumber(J, -1);
|
||||
}
|
||||
@@ -376,15 +376,15 @@ static void Sp_search(js_State *J, unsigned int argc)
|
||||
static void Sp_replace_regexp(js_State *J, unsigned int argc)
|
||||
{
|
||||
js_Regexp *re;
|
||||
Resub m[REG_MAXSUB];
|
||||
const char *source, *s, *r;
|
||||
js_Buffer *sb = NULL;
|
||||
int n, x;
|
||||
unsigned int n, x;
|
||||
Resub m;
|
||||
|
||||
source = js_tostring(J, 0);
|
||||
re = js_toregexp(J, 1);
|
||||
|
||||
if (js_regexec(re->prog, source, nelem(m), m, 0)) {
|
||||
if (js_regexec(re->prog, source, &m, 0)) {
|
||||
js_copy(J, 0);
|
||||
return;
|
||||
}
|
||||
@@ -392,14 +392,14 @@ static void Sp_replace_regexp(js_State *J, unsigned int argc)
|
||||
re->last = 0;
|
||||
|
||||
loop:
|
||||
s = m[0].sp;
|
||||
n = m[0].ep - m[0].sp;
|
||||
s = m.sub[0].sp;
|
||||
n = m.sub[0].ep - m.sub[0].sp;
|
||||
|
||||
if (js_iscallable(J, 2)) {
|
||||
js_copy(J, 2);
|
||||
js_pushglobal(J);
|
||||
for (x = 0; m[x].sp; ++x) /* arg 0..x: substring and subexps that matched */
|
||||
js_pushlstring(J, m[x].sp, m[x].ep - m[x].sp);
|
||||
for (x = 0; m.sub[x].sp; ++x) /* arg 0..x: substring and subexps that matched */
|
||||
js_pushlstring(J, m.sub[x].sp, m.sub[x].ep - m.sub[x].sp);
|
||||
js_pushnumber(J, s - source); /* arg x+2: offset within search string */
|
||||
js_copy(J, 0); /* arg x+3: search string */
|
||||
js_call(J, 2 + x);
|
||||
@@ -425,8 +425,8 @@ loop:
|
||||
if (r[1] >= '0' && r[1] <= '9')
|
||||
x = x * 10 + *(++r) - '0';
|
||||
// TODO: use prog->nsub somehow
|
||||
if (x > 0 && x < REG_MAXSUB && m[x].sp) {
|
||||
sb_putm(&sb, m[x].sp, m[x].ep);
|
||||
if (x > 0 && x < m.nsub) {
|
||||
sb_putm(&sb, m.sub[x].sp, m.sub[x].ep);
|
||||
} else {
|
||||
sb_putc(&sb, '$');
|
||||
if (x > 10) {
|
||||
@@ -450,14 +450,14 @@ loop:
|
||||
}
|
||||
|
||||
if (re->flags & JS_REGEXP_G) {
|
||||
source = m[0].ep;
|
||||
source = m.sub[0].ep;
|
||||
if (n == 0) {
|
||||
if (*source)
|
||||
sb_putc(&sb, *source++);
|
||||
else
|
||||
goto end;
|
||||
}
|
||||
if (!js_regexec(re->prog, source, nelem(m), m, REG_NOTBOL))
|
||||
if (!js_regexec(re->prog, source, &m, REG_NOTBOL))
|
||||
goto loop;
|
||||
}
|
||||
|
||||
@@ -544,10 +544,10 @@ static void Sp_replace(js_State *J, unsigned int argc)
|
||||
static void Sp_split_regexp(js_State *J, unsigned int argc)
|
||||
{
|
||||
js_Regexp *re;
|
||||
Resub m[REG_MAXSUB];
|
||||
const char *text;
|
||||
unsigned int limit, len, k;
|
||||
const char *p, *a, *b, *c, *e;
|
||||
Resub m;
|
||||
|
||||
text = js_tostring(J, 0);
|
||||
re = js_toregexp(J, 1);
|
||||
@@ -560,7 +560,7 @@ static void Sp_split_regexp(js_State *J, unsigned int argc)
|
||||
|
||||
/* splitting the empty string */
|
||||
if (e == 0) {
|
||||
if (js_regexec(re->prog, text, nelem(m), m, 0)) {
|
||||
if (js_regexec(re->prog, text, &m, 0)) {
|
||||
if (len == limit) return;
|
||||
js_pushliteral(J, "");
|
||||
js_setindex(J, -2, 0);
|
||||
@@ -570,11 +570,11 @@ static void Sp_split_regexp(js_State *J, unsigned int argc)
|
||||
|
||||
p = a = text;
|
||||
while (a < e) {
|
||||
if (js_regexec(re->prog, a, nelem(m), m, a > text ? REG_NOTBOL : 0))
|
||||
if (js_regexec(re->prog, a, &m, a > text ? REG_NOTBOL : 0))
|
||||
break; /* no match */
|
||||
|
||||
b = m[0].sp;
|
||||
c = m[0].ep;
|
||||
b = m.sub[0].sp;
|
||||
c = m.sub[0].ep;
|
||||
|
||||
/* empty string at end of last match */
|
||||
if (b == p) {
|
||||
@@ -586,9 +586,9 @@ static void Sp_split_regexp(js_State *J, unsigned int argc)
|
||||
js_pushlstring(J, p, b - p);
|
||||
js_setindex(J, -2, len++);
|
||||
|
||||
for (k = 1; k < nelem(m) && m[k].sp; ++k) {
|
||||
for (k = 1; k < m.nsub; ++k) {
|
||||
if (len == limit) return;
|
||||
js_pushlstring(J, m[k].sp, m[k].ep - m[k].sp);
|
||||
js_pushlstring(J, m.sub[k].sp, m.sub[k].ep - m.sub[k].sp);
|
||||
js_setindex(J, -2, len++);
|
||||
}
|
||||
|
||||
|
||||
88
regex.c
88
regex.c
@@ -30,7 +30,7 @@ struct Reclass {
|
||||
struct Reprog {
|
||||
Reinst *start, *end;
|
||||
int flags;
|
||||
unsigned int ncap;
|
||||
unsigned int nsub;
|
||||
Reclass cclass[16];
|
||||
};
|
||||
|
||||
@@ -40,8 +40,8 @@ struct cstate {
|
||||
|
||||
const char *source;
|
||||
unsigned int ncclass;
|
||||
unsigned int ncap;
|
||||
Renode *cap[MAXSUB];
|
||||
unsigned int nsub;
|
||||
Renode *sub[MAXSUB];
|
||||
|
||||
int lookahead;
|
||||
Rune yychar;
|
||||
@@ -77,7 +77,7 @@ enum {
|
||||
L_NLA, /* "(?!" negative lookahead */
|
||||
L_WORD, /* "\b" word boundary */
|
||||
L_NWORD, /* "\B" non-word boundary */
|
||||
L_REF, /* "\0" back-reference */
|
||||
L_REF, /* "\1" back-reference */
|
||||
L_COUNT, /* {M,N} */
|
||||
};
|
||||
|
||||
@@ -459,10 +459,10 @@ static Renode *parseatom(struct cstate *g)
|
||||
}
|
||||
if (g->lookahead == L_REF) {
|
||||
atom = newnode(g, P_REF);
|
||||
if (g->yychar == 0 || g->yychar > g->ncap || !g->cap[g->yychar])
|
||||
if (g->yychar == 0 || g->yychar > g->nsub || !g->sub[g->yychar])
|
||||
die(g, "invalid back-reference");
|
||||
atom->n = g->yychar;
|
||||
atom->x = g->cap[g->yychar];
|
||||
atom->x = g->sub[g->yychar];
|
||||
next(g);
|
||||
return atom;
|
||||
}
|
||||
@@ -470,12 +470,11 @@ static Renode *parseatom(struct cstate *g)
|
||||
return newnode(g, P_ANY);
|
||||
if (accept(g, '(')) {
|
||||
atom = newnode(g, P_PAR);
|
||||
if (++g->ncap == MAXSUB)
|
||||
if (g->nsub == MAXSUB)
|
||||
die(g, "too many captures");
|
||||
atom->n = g->ncap;
|
||||
g->cap[atom->n] = NULL;
|
||||
atom->n = g->nsub++;
|
||||
atom->x = parsealt(g);
|
||||
g->cap[atom->n] = atom;
|
||||
g->sub[atom->n] = atom;
|
||||
if (!accept(g, ')'))
|
||||
die(g, "unmatched '('");
|
||||
return atom;
|
||||
@@ -805,9 +804,9 @@ Reprog *regcomp(const char *pattern, int cflags, const char **errorp)
|
||||
|
||||
g.source = pattern;
|
||||
g.ncclass = 0;
|
||||
g.ncap = 0;
|
||||
g.nsub = 1;
|
||||
for (i = 0; i < MAXSUB; ++i)
|
||||
g.cap[i] = 0;
|
||||
g.sub[i] = 0;
|
||||
|
||||
g.prog->flags = cflags;
|
||||
|
||||
@@ -818,7 +817,7 @@ Reprog *regcomp(const char *pattern, int cflags, const char **errorp)
|
||||
if (g.lookahead != 0)
|
||||
die(&g, "syntax error");
|
||||
|
||||
g.prog->ncap = g.ncap;
|
||||
g.prog->nsub = g.nsub;
|
||||
g.prog->start = g.prog->end = malloc((count(node) + 6) * sizeof (Reinst));
|
||||
|
||||
split = emit(g.prog, I_SPLIT);
|
||||
@@ -905,21 +904,21 @@ static int strncmpcanon(const char *a, const char *b, unsigned int n)
|
||||
struct Rethread {
|
||||
Reinst *pc;
|
||||
const char *sp;
|
||||
Resub sub[MAXSUB];
|
||||
Resub sub;
|
||||
};
|
||||
|
||||
static void spawn(Rethread *t, Reinst *pc, const char *sp, Resub *sub)
|
||||
{
|
||||
t->pc = pc;
|
||||
t->sp = sp;
|
||||
memcpy(t->sub, sub, sizeof t->sub);
|
||||
memcpy(&t->sub, sub, sizeof t->sub);
|
||||
}
|
||||
|
||||
static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub *out)
|
||||
{
|
||||
Rethread ready[MAXTHREAD];
|
||||
Resub scrap[MAXSUB];
|
||||
Resub sub[MAXSUB];
|
||||
Resub scratch;
|
||||
Resub sub;
|
||||
Rune c;
|
||||
unsigned int nready;
|
||||
int i;
|
||||
@@ -933,13 +932,13 @@ static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub *
|
||||
--nready;
|
||||
pc = ready[nready].pc;
|
||||
sp = ready[nready].sp;
|
||||
memcpy(sub, ready[nready].sub, sizeof sub);
|
||||
memcpy(&sub, &ready[nready].sub, sizeof sub);
|
||||
for (;;) {
|
||||
switch (pc->opcode) {
|
||||
case I_END:
|
||||
for (i = 0; i < MAXSUB; ++i) {
|
||||
out[i].sp = sub[i].sp;
|
||||
out[i].ep = sub[i].ep;
|
||||
out->sub[i].sp = sub.sub[i].sp;
|
||||
out->sub[i].ep = sub.sub[i].ep;
|
||||
}
|
||||
return 1;
|
||||
case I_JUMP:
|
||||
@@ -950,18 +949,18 @@ static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub *
|
||||
fprintf(stderr, "regexec: backtrack overflow!\n");
|
||||
return 0;
|
||||
}
|
||||
spawn(&ready[nready++], pc->y, sp, sub);
|
||||
spawn(&ready[nready++], pc->y, sp, &sub);
|
||||
pc = pc->x;
|
||||
continue;
|
||||
|
||||
case I_PLA:
|
||||
if (!match(pc->x, sp, bol, flags, sub))
|
||||
if (!match(pc->x, sp, bol, flags, &sub))
|
||||
goto dead;
|
||||
pc = pc->y;
|
||||
continue;
|
||||
case I_NLA:
|
||||
memcpy(scrap, sub, sizeof scrap);
|
||||
if (match(pc->x, sp, bol, flags, scrap))
|
||||
memcpy(&scratch, &sub, sizeof scratch);
|
||||
if (match(pc->x, sp, bol, flags, &scratch))
|
||||
goto dead;
|
||||
pc = pc->y;
|
||||
continue;
|
||||
@@ -1012,12 +1011,12 @@ static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub *
|
||||
}
|
||||
break;
|
||||
case I_REF:
|
||||
i = sub[pc->n].ep - sub[pc->n].sp;
|
||||
i = sub.sub[pc->n].ep - sub.sub[pc->n].sp;
|
||||
if (flags & REG_ICASE) {
|
||||
if (strncmpcanon(sp, sub[pc->n].sp, i))
|
||||
if (strncmpcanon(sp, sub.sub[pc->n].sp, i))
|
||||
goto dead;
|
||||
} else {
|
||||
if (strncmp(sp, sub[pc->n].sp, i))
|
||||
if (strncmp(sp, sub.sub[pc->n].sp, i))
|
||||
goto dead;
|
||||
}
|
||||
if (i > 0)
|
||||
@@ -1052,10 +1051,10 @@ static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub *
|
||||
goto dead;
|
||||
|
||||
case I_LPAR:
|
||||
sub[pc->n].sp = sp;
|
||||
sub.sub[pc->n].sp = sp;
|
||||
break;
|
||||
case I_RPAR:
|
||||
sub[pc->n].ep = sp;
|
||||
sub.sub[pc->n].ep = sp;
|
||||
break;
|
||||
default:
|
||||
goto dead;
|
||||
@@ -1067,17 +1066,19 @@ dead: ;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int regexec(Reprog *prog, const char *sp, int n, Resub *m, int eflags)
|
||||
int regexec(Reprog *prog, const char *sp, Resub *sub, int eflags)
|
||||
{
|
||||
Resub gm[MAXSUB];
|
||||
unsigned int i;
|
||||
Resub scratch;
|
||||
int i;
|
||||
|
||||
m = m ? m : gm;
|
||||
if (!sub)
|
||||
sub = &scratch;
|
||||
|
||||
sub->nsub = prog->nsub;
|
||||
for (i = 0; i < MAXSUB; ++i)
|
||||
m[i].sp = m[i].ep = i <= prog->ncap ? sp : NULL;
|
||||
sub->sub[i].sp = sub->sub[i].ep = NULL;
|
||||
|
||||
return !match(prog->start, sp, sp, prog->flags | eflags, m);
|
||||
return !match(prog->start, sp, sp, prog->flags | eflags, sub);
|
||||
}
|
||||
|
||||
#ifdef TEST
|
||||
@@ -1086,8 +1087,8 @@ int main(int argc, char **argv)
|
||||
const char *error;
|
||||
const char *s;
|
||||
Reprog *p;
|
||||
Resub m[MAXSUB];
|
||||
int i;
|
||||
Resub m;
|
||||
unsigned int i;
|
||||
|
||||
if (argc > 1) {
|
||||
p = regcomp(argv[1], 0, &error);
|
||||
@@ -1098,13 +1099,12 @@ int main(int argc, char **argv)
|
||||
|
||||
if (argc > 2) {
|
||||
s = argv[2];
|
||||
printf("ncap = %d\n", p->ncap);
|
||||
if (!regexec(p, s, MAXSUB, m, 0)) {
|
||||
for (i = 0; i < MAXSUB; ++i)
|
||||
if (m[i].sp) {
|
||||
int n = m[i].ep - m[i].sp;
|
||||
printf("match %d: s=%d e=%d n=%d '%.*s'\n", i, (int)(m[i].sp - s), (int)(m[i].ep - s), n, n, m[i].sp);
|
||||
}
|
||||
printf("nsub = %d\n", p->nsub);
|
||||
if (!regexec(p, s, &m, 0)) {
|
||||
for (i = 0; i < m.nsub; ++i) {
|
||||
int n = m.sub[i].ep - m.sub[i].sp;
|
||||
printf("match %d: s=%d e=%d n=%d '%.*s'\n", i, (int)(m.sub[i].sp - s), (int)(m.sub[i].ep - s), n, n, m.sub[i].sp);
|
||||
}
|
||||
} else {
|
||||
printf("no match\n");
|
||||
}
|
||||
|
||||
14
regex.h
14
regex.h
@@ -7,13 +7,9 @@
|
||||
|
||||
typedef struct Reprog Reprog;
|
||||
typedef struct Resub Resub;
|
||||
struct Resub {
|
||||
const char *sp;
|
||||
const char *ep;
|
||||
};
|
||||
|
||||
Reprog *regcomp(const char *pattern, int cflags, const char **errorp);
|
||||
int regexec(Reprog *prog, const char *string, int nmatch, Resub *pmatch, int eflags);
|
||||
int regexec(Reprog *prog, const char *string, Resub *sub, int eflags);
|
||||
void regfree(Reprog *prog);
|
||||
|
||||
enum {
|
||||
@@ -28,4 +24,12 @@ enum {
|
||||
REG_MAXSUB = 16
|
||||
};
|
||||
|
||||
struct Resub {
|
||||
unsigned int nsub;
|
||||
struct {
|
||||
const char *sp;
|
||||
const char *ep;
|
||||
} sub[REG_MAXSUB];
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
Reference in New Issue
Block a user