mirror of
https://github.com/ccxvii/mujs.git
synced 2026-02-05 17:29:43 +08:00
Support embedded 0 in strings by using modified UTF-8.
This commit is contained in:
@@ -65,8 +65,11 @@ Strings in the C interface are zero-terminated byte arrays in CESU-8 encoding.
|
||||
CESU-8 is a variant of UTF-8 which encodes supplementary unicode characters as
|
||||
surrogate pairs. This maintains compatibility with the UTF-16 nature of
|
||||
JavaScript, but requires attention when passing strings using supplementary
|
||||
unicode characters to and from the MuJS library. It also means that you cannot
|
||||
have any JavaScript strings with a zero character value in MuJS.
|
||||
unicode characters to and from the MuJS library.
|
||||
|
||||
<p>
|
||||
The U+0000 character is encoded as the overlong two-byte sequence <C0 80>, the same as in
|
||||
modified UTF-8.
|
||||
|
||||
<h3>Environments</h3>
|
||||
|
||||
|
||||
33
jslex.c
33
jslex.c
@@ -158,6 +158,10 @@ int jsY_tohex(int c)
|
||||
static void jsY_next(js_State *J)
|
||||
{
|
||||
Rune c;
|
||||
if (*J->source == 0) {
|
||||
J->lexchar = EOF;
|
||||
return;
|
||||
}
|
||||
J->source += chartorune(&c, J->source);
|
||||
/* consume CR LF as one unit */
|
||||
if (c == '\r' && *J->source == '\n')
|
||||
@@ -201,17 +205,24 @@ static void textinit(js_State *J)
|
||||
|
||||
static void textpush(js_State *J, Rune c)
|
||||
{
|
||||
int n = runelen(c);
|
||||
int n;
|
||||
if (c == EOF)
|
||||
n = 1;
|
||||
else
|
||||
n = runelen(c);
|
||||
if (J->lexbuf.len + n > J->lexbuf.cap) {
|
||||
J->lexbuf.cap = J->lexbuf.cap * 2;
|
||||
J->lexbuf.text = js_realloc(J, J->lexbuf.text, J->lexbuf.cap);
|
||||
}
|
||||
J->lexbuf.len += runetochar(J->lexbuf.text + J->lexbuf.len, &c);
|
||||
if (c == EOF)
|
||||
J->lexbuf.text[J->lexbuf.len++] = 0;
|
||||
else
|
||||
J->lexbuf.len += runetochar(J->lexbuf.text + J->lexbuf.len, &c);
|
||||
}
|
||||
|
||||
static char *textend(js_State *J)
|
||||
{
|
||||
textpush(J, 0);
|
||||
textpush(J, EOF);
|
||||
return J->lexbuf.text;
|
||||
}
|
||||
|
||||
@@ -224,7 +235,7 @@ static void lexlinecomment(js_State *J)
|
||||
static int lexcomment(js_State *J)
|
||||
{
|
||||
/* already consumed initial '/' '*' sequence */
|
||||
while (J->lexchar != 0) {
|
||||
while (J->lexchar != EOF) {
|
||||
if (jsY_accept(J, '*')) {
|
||||
while (J->lexchar == '*')
|
||||
jsY_next(J);
|
||||
@@ -385,7 +396,7 @@ static int lexescape(js_State *J)
|
||||
return 0;
|
||||
|
||||
switch (J->lexchar) {
|
||||
case 0: jsY_error(J, "unterminated escape sequence");
|
||||
case EOF: jsY_error(J, "unterminated escape sequence");
|
||||
case 'u':
|
||||
jsY_next(J);
|
||||
if (!jsY_ishex(J->lexchar)) return 1; else { x |= jsY_tohex(J->lexchar) << 12; jsY_next(J); }
|
||||
@@ -425,7 +436,7 @@ static int lexstring(js_State *J)
|
||||
textinit(J);
|
||||
|
||||
while (J->lexchar != q) {
|
||||
if (J->lexchar == 0 || J->lexchar == '\n')
|
||||
if (J->lexchar == EOF || J->lexchar == '\n')
|
||||
jsY_error(J, "string not terminated");
|
||||
if (jsY_accept(J, '\\')) {
|
||||
if (lexescape(J))
|
||||
@@ -475,14 +486,14 @@ static int lexregexp(js_State *J)
|
||||
|
||||
/* regexp body */
|
||||
while (J->lexchar != '/' || inclass) {
|
||||
if (J->lexchar == 0 || J->lexchar == '\n') {
|
||||
if (J->lexchar == EOF || J->lexchar == '\n') {
|
||||
jsY_error(J, "regular expression not terminated");
|
||||
} else if (jsY_accept(J, '\\')) {
|
||||
if (jsY_accept(J, '/')) {
|
||||
textpush(J, '/');
|
||||
} else {
|
||||
textpush(J, '\\');
|
||||
if (J->lexchar == 0 || J->lexchar == '\n')
|
||||
if (J->lexchar == EOF || J->lexchar == '\n')
|
||||
jsY_error(J, "regular expression not terminated");
|
||||
textpush(J, J->lexchar);
|
||||
jsY_next(J);
|
||||
@@ -688,7 +699,7 @@ static int jsY_lexx(js_State *J)
|
||||
return TK_XOR_ASS;
|
||||
return '^';
|
||||
|
||||
case 0:
|
||||
case EOF:
|
||||
return 0; /* EOF */
|
||||
}
|
||||
|
||||
@@ -803,7 +814,7 @@ static int lexjsonstring(js_State *J)
|
||||
textinit(J);
|
||||
|
||||
while (J->lexchar != '"') {
|
||||
if (J->lexchar == 0)
|
||||
if (J->lexchar == EOF)
|
||||
jsY_error(J, "unterminated string");
|
||||
else if (J->lexchar < 32)
|
||||
jsY_error(J, "invalid control character in string");
|
||||
@@ -857,7 +868,7 @@ int jsY_lexjson(js_State *J)
|
||||
jsY_next(J); jsY_expect(J, 'r'); jsY_expect(J, 'u'); jsY_expect(J, 'e');
|
||||
return TK_TRUE;
|
||||
|
||||
case 0:
|
||||
case EOF:
|
||||
return 0; /* EOF */
|
||||
}
|
||||
|
||||
|
||||
2
jsrun.c
2
jsrun.c
@@ -469,7 +469,7 @@ int js_isarrayindex(js_State *J, const char *p, int *idx)
|
||||
static void js_pushrune(js_State *J, Rune rune)
|
||||
{
|
||||
char buf[UTFmax + 1];
|
||||
if (rune > 0) {
|
||||
if (rune >= 0) {
|
||||
buf[runetochar(buf, &rune)] = 0;
|
||||
js_pushstring(J, buf);
|
||||
} else {
|
||||
|
||||
@@ -21,12 +21,12 @@ static const char *checkstring(js_State *J, int idx)
|
||||
|
||||
int js_runeat(js_State *J, const char *s, int i)
|
||||
{
|
||||
Rune rune = 0;
|
||||
Rune rune = EOF;
|
||||
while (i-- >= 0) {
|
||||
rune = *(unsigned char*)s;
|
||||
if (rune < Runeself) {
|
||||
if (rune == 0)
|
||||
return 0;
|
||||
return EOF;
|
||||
++s;
|
||||
} else
|
||||
s += chartorune(&rune, s);
|
||||
@@ -93,7 +93,7 @@ static void Sp_charAt(js_State *J)
|
||||
const char *s = checkstring(J, 0);
|
||||
int pos = js_tointeger(J, 1);
|
||||
Rune rune = js_runeat(J, s, pos);
|
||||
if (rune > 0) {
|
||||
if (rune >= 0) {
|
||||
buf[runetochar(buf, &rune)] = 0;
|
||||
js_pushstring(J, buf);
|
||||
} else {
|
||||
@@ -106,7 +106,7 @@ static void Sp_charCodeAt(js_State *J)
|
||||
const char *s = checkstring(J, 0);
|
||||
int pos = js_tointeger(J, 1);
|
||||
Rune rune = js_runeat(J, s, pos);
|
||||
if (rune > 0)
|
||||
if (rune >= 0)
|
||||
js_pushnumber(J, rune);
|
||||
else
|
||||
js_pushnumber(J, NAN);
|
||||
|
||||
35
regexp.c
35
regexp.c
@@ -116,11 +116,16 @@ static int isunicodeletter(int c)
|
||||
|
||||
static int nextrune(struct cstate *g)
|
||||
{
|
||||
if (!*g->source) {
|
||||
g->yychar = EOF;
|
||||
return 0;
|
||||
}
|
||||
g->source += chartorune(&g->yychar, g->source);
|
||||
if (g->yychar == '\\') {
|
||||
if (!*g->source)
|
||||
die(g, "unterminated escape sequence");
|
||||
g->source += chartorune(&g->yychar, g->source);
|
||||
switch (g->yychar) {
|
||||
case 0: die(g, "unterminated escape sequence"); break;
|
||||
case 'f': g->yychar = '\f'; return 0;
|
||||
case 'n': g->yychar = '\n'; return 0;
|
||||
case 'r': g->yychar = '\r'; return 0;
|
||||
@@ -147,6 +152,9 @@ static int nextrune(struct cstate *g)
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
case 0:
|
||||
g->yychar = '0';
|
||||
return 1;
|
||||
}
|
||||
if (strchr(ESCAPES, g->yychar))
|
||||
return 1;
|
||||
@@ -272,7 +280,7 @@ static int lexclass(struct cstate *g)
|
||||
|
||||
havesave = havedash = 0;
|
||||
for (;;) {
|
||||
if (g->yychar == 0)
|
||||
if (g->yychar == EOF)
|
||||
die(g, "unterminated character class");
|
||||
if (!quoted && g->yychar == ']')
|
||||
break;
|
||||
@@ -363,7 +371,7 @@ static int lex(struct cstate *g)
|
||||
}
|
||||
|
||||
switch (g->yychar) {
|
||||
case 0:
|
||||
case EOF:
|
||||
case '$': case ')': case '*': case '+':
|
||||
case '.': case '?': case '^': case '|':
|
||||
return g->yychar;
|
||||
@@ -561,11 +569,11 @@ static Renode *parserep(struct cstate *g)
|
||||
static Renode *parsecat(struct cstate *g)
|
||||
{
|
||||
Renode *cat, *head, **tail;
|
||||
if (g->lookahead && g->lookahead != '|' && g->lookahead != ')') {
|
||||
if (g->lookahead != EOF && g->lookahead != '|' && g->lookahead != ')') {
|
||||
/* Build a right-leaning tree by splicing in new 'cat' at the tail. */
|
||||
head = parserep(g);
|
||||
tail = &head;
|
||||
while (g->lookahead && g->lookahead != '|' && g->lookahead != ')') {
|
||||
while (g->lookahead != EOF && g->lookahead != '|' && g->lookahead != ')') {
|
||||
cat = newnode(g, P_CAT);
|
||||
cat->x = *tail;
|
||||
cat->y = parserep(g);
|
||||
@@ -866,7 +874,7 @@ Reprog *regcompx(void *(*alloc)(void *ctx, void *p, int n), void *ctx,
|
||||
node = parsealt(&g);
|
||||
if (g.lookahead == ')')
|
||||
die(&g, "unmatched ')'");
|
||||
if (g.lookahead != 0)
|
||||
if (g.lookahead != EOF)
|
||||
die(&g, "syntax error");
|
||||
|
||||
#ifdef TEST
|
||||
@@ -1026,23 +1034,20 @@ static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub *
|
||||
break;
|
||||
|
||||
case I_ANYNL:
|
||||
if (!*sp) return 1;
|
||||
sp += chartorune(&c, sp);
|
||||
if (c == 0)
|
||||
return 1;
|
||||
pc = pc + 1;
|
||||
break;
|
||||
case I_ANY:
|
||||
if (!*sp) return 1;
|
||||
sp += chartorune(&c, sp);
|
||||
if (c == 0)
|
||||
return 1;
|
||||
if (isnewline(c))
|
||||
return 1;
|
||||
pc = pc + 1;
|
||||
break;
|
||||
case I_CHAR:
|
||||
if (!*sp) return 1;
|
||||
sp += chartorune(&c, sp);
|
||||
if (c == 0)
|
||||
return 1;
|
||||
if (flags & REG_ICASE)
|
||||
c = canon(c);
|
||||
if (c != pc->c)
|
||||
@@ -1050,9 +1055,8 @@ static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub *
|
||||
pc = pc + 1;
|
||||
break;
|
||||
case I_CCLASS:
|
||||
if (!*sp) return 1;
|
||||
sp += chartorune(&c, sp);
|
||||
if (c == 0)
|
||||
return 1;
|
||||
if (flags & REG_ICASE) {
|
||||
if (!incclasscanon(pc->cc, canon(c)))
|
||||
return 1;
|
||||
@@ -1063,9 +1067,8 @@ static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub *
|
||||
pc = pc + 1;
|
||||
break;
|
||||
case I_NCCLASS:
|
||||
if (!*sp) return 1;
|
||||
sp += chartorune(&c, sp);
|
||||
if (c == 0)
|
||||
return 1;
|
||||
if (flags & REG_ICASE) {
|
||||
if (incclasscanon(pc->cc, canon(c)))
|
||||
return 1;
|
||||
|
||||
16
utf.c
16
utf.c
@@ -48,6 +48,12 @@ chartorune(Rune *rune, const char *str)
|
||||
int c, c1, c2;
|
||||
int l;
|
||||
|
||||
/* overlong null character */
|
||||
if((uchar)str[0] == 0xc0 && (uchar)str[1] == 0x80) {
|
||||
*rune = 0;
|
||||
return 2;
|
||||
}
|
||||
|
||||
/*
|
||||
* one character sequence
|
||||
* 00000-0007F => T1
|
||||
@@ -101,13 +107,19 @@ bad:
|
||||
int
|
||||
runetochar(char *str, const Rune *rune)
|
||||
{
|
||||
int c;
|
||||
int c = *rune;
|
||||
|
||||
/* overlong null character */
|
||||
if (c == 0) {
|
||||
str[0] = 0xc0;
|
||||
str[1] = 0x80;
|
||||
return 2;
|
||||
}
|
||||
|
||||
/*
|
||||
* one character sequence
|
||||
* 00000-0007F => 00-7F
|
||||
*/
|
||||
c = *rune;
|
||||
if(c <= Rune1) {
|
||||
str[0] = c;
|
||||
return 1;
|
||||
|
||||
Reference in New Issue
Block a user