Support embedded 0 in strings by using modified UTF-8.

This commit is contained in:
Tor Andersson
2020-05-14 13:59:34 +02:00
parent 331c5ecbac
commit 0261579d78
6 changed files with 65 additions and 36 deletions

View File

@@ -65,8 +65,11 @@ Strings in the C interface are zero-terminated byte arrays in CESU-8 encoding.
CESU-8 is a variant of UTF-8 which encodes supplementary unicode characters as
surrogate pairs. This maintains compatibility with the UTF-16 nature of
JavaScript, but requires attention when passing strings using supplementary
unicode characters to and from the MuJS library. It also means that you cannot
have any JavaScript strings with a zero character value in MuJS.
unicode characters to and from the MuJS library.
<p>
The U+0000 character is encoded as the two-byte sequence &lt;C0 80&gt;, same as in
modified UTF-8.
<h3>Environments</h3>

33
jslex.c
View File

@@ -158,6 +158,10 @@ int jsY_tohex(int c)
static void jsY_next(js_State *J)
{
Rune c;
if (*J->source == 0) {
J->lexchar = EOF;
return;
}
J->source += chartorune(&c, J->source);
/* consume CR LF as one unit */
if (c == '\r' && *J->source == '\n')
@@ -201,17 +205,24 @@ static void textinit(js_State *J)
static void textpush(js_State *J, Rune c)
{
int n = runelen(c);
int n;
if (c == EOF)
n = 1;
else
n = runelen(c);
if (J->lexbuf.len + n > J->lexbuf.cap) {
J->lexbuf.cap = J->lexbuf.cap * 2;
J->lexbuf.text = js_realloc(J, J->lexbuf.text, J->lexbuf.cap);
}
J->lexbuf.len += runetochar(J->lexbuf.text + J->lexbuf.len, &c);
if (c == EOF)
J->lexbuf.text[J->lexbuf.len++] = 0;
else
J->lexbuf.len += runetochar(J->lexbuf.text + J->lexbuf.len, &c);
}
static char *textend(js_State *J)
{
textpush(J, 0);
textpush(J, EOF);
return J->lexbuf.text;
}
@@ -224,7 +235,7 @@ static void lexlinecomment(js_State *J)
static int lexcomment(js_State *J)
{
/* already consumed initial '/' '*' sequence */
while (J->lexchar != 0) {
while (J->lexchar != EOF) {
if (jsY_accept(J, '*')) {
while (J->lexchar == '*')
jsY_next(J);
@@ -385,7 +396,7 @@ static int lexescape(js_State *J)
return 0;
switch (J->lexchar) {
case 0: jsY_error(J, "unterminated escape sequence");
case EOF: jsY_error(J, "unterminated escape sequence");
case 'u':
jsY_next(J);
if (!jsY_ishex(J->lexchar)) return 1; else { x |= jsY_tohex(J->lexchar) << 12; jsY_next(J); }
@@ -425,7 +436,7 @@ static int lexstring(js_State *J)
textinit(J);
while (J->lexchar != q) {
if (J->lexchar == 0 || J->lexchar == '\n')
if (J->lexchar == EOF || J->lexchar == '\n')
jsY_error(J, "string not terminated");
if (jsY_accept(J, '\\')) {
if (lexescape(J))
@@ -475,14 +486,14 @@ static int lexregexp(js_State *J)
/* regexp body */
while (J->lexchar != '/' || inclass) {
if (J->lexchar == 0 || J->lexchar == '\n') {
if (J->lexchar == EOF || J->lexchar == '\n') {
jsY_error(J, "regular expression not terminated");
} else if (jsY_accept(J, '\\')) {
if (jsY_accept(J, '/')) {
textpush(J, '/');
} else {
textpush(J, '\\');
if (J->lexchar == 0 || J->lexchar == '\n')
if (J->lexchar == EOF || J->lexchar == '\n')
jsY_error(J, "regular expression not terminated");
textpush(J, J->lexchar);
jsY_next(J);
@@ -688,7 +699,7 @@ static int jsY_lexx(js_State *J)
return TK_XOR_ASS;
return '^';
case 0:
case EOF:
return 0; /* EOF */
}
@@ -803,7 +814,7 @@ static int lexjsonstring(js_State *J)
textinit(J);
while (J->lexchar != '"') {
if (J->lexchar == 0)
if (J->lexchar == EOF)
jsY_error(J, "unterminated string");
else if (J->lexchar < 32)
jsY_error(J, "invalid control character in string");
@@ -857,7 +868,7 @@ int jsY_lexjson(js_State *J)
jsY_next(J); jsY_expect(J, 'r'); jsY_expect(J, 'u'); jsY_expect(J, 'e');
return TK_TRUE;
case 0:
case EOF:
return 0; /* EOF */
}

View File

@@ -469,7 +469,7 @@ int js_isarrayindex(js_State *J, const char *p, int *idx)
static void js_pushrune(js_State *J, Rune rune)
{
char buf[UTFmax + 1];
if (rune > 0) {
if (rune >= 0) {
buf[runetochar(buf, &rune)] = 0;
js_pushstring(J, buf);
} else {

View File

@@ -21,12 +21,12 @@ static const char *checkstring(js_State *J, int idx)
int js_runeat(js_State *J, const char *s, int i)
{
Rune rune = 0;
Rune rune = EOF;
while (i-- >= 0) {
rune = *(unsigned char*)s;
if (rune < Runeself) {
if (rune == 0)
return 0;
return EOF;
++s;
} else
s += chartorune(&rune, s);
@@ -93,7 +93,7 @@ static void Sp_charAt(js_State *J)
const char *s = checkstring(J, 0);
int pos = js_tointeger(J, 1);
Rune rune = js_runeat(J, s, pos);
if (rune > 0) {
if (rune >= 0) {
buf[runetochar(buf, &rune)] = 0;
js_pushstring(J, buf);
} else {
@@ -106,7 +106,7 @@ static void Sp_charCodeAt(js_State *J)
const char *s = checkstring(J, 0);
int pos = js_tointeger(J, 1);
Rune rune = js_runeat(J, s, pos);
if (rune > 0)
if (rune >= 0)
js_pushnumber(J, rune);
else
js_pushnumber(J, NAN);

View File

@@ -116,11 +116,16 @@ static int isunicodeletter(int c)
static int nextrune(struct cstate *g)
{
if (!*g->source) {
g->yychar = EOF;
return 0;
}
g->source += chartorune(&g->yychar, g->source);
if (g->yychar == '\\') {
if (!*g->source)
die(g, "unterminated escape sequence");
g->source += chartorune(&g->yychar, g->source);
switch (g->yychar) {
case 0: die(g, "unterminated escape sequence"); break;
case 'f': g->yychar = '\f'; return 0;
case 'n': g->yychar = '\n'; return 0;
case 'r': g->yychar = '\r'; return 0;
@@ -147,6 +152,9 @@ static int nextrune(struct cstate *g)
return 1;
}
return 0;
case 0:
g->yychar = '0';
return 1;
}
if (strchr(ESCAPES, g->yychar))
return 1;
@@ -272,7 +280,7 @@ static int lexclass(struct cstate *g)
havesave = havedash = 0;
for (;;) {
if (g->yychar == 0)
if (g->yychar == EOF)
die(g, "unterminated character class");
if (!quoted && g->yychar == ']')
break;
@@ -363,7 +371,7 @@ static int lex(struct cstate *g)
}
switch (g->yychar) {
case 0:
case EOF:
case '$': case ')': case '*': case '+':
case '.': case '?': case '^': case '|':
return g->yychar;
@@ -561,11 +569,11 @@ static Renode *parserep(struct cstate *g)
static Renode *parsecat(struct cstate *g)
{
Renode *cat, *head, **tail;
if (g->lookahead && g->lookahead != '|' && g->lookahead != ')') {
if (g->lookahead != EOF && g->lookahead != '|' && g->lookahead != ')') {
/* Build a right-leaning tree by splicing in new 'cat' at the tail. */
head = parserep(g);
tail = &head;
while (g->lookahead && g->lookahead != '|' && g->lookahead != ')') {
while (g->lookahead != EOF && g->lookahead != '|' && g->lookahead != ')') {
cat = newnode(g, P_CAT);
cat->x = *tail;
cat->y = parserep(g);
@@ -866,7 +874,7 @@ Reprog *regcompx(void *(*alloc)(void *ctx, void *p, int n), void *ctx,
node = parsealt(&g);
if (g.lookahead == ')')
die(&g, "unmatched ')'");
if (g.lookahead != 0)
if (g.lookahead != EOF)
die(&g, "syntax error");
#ifdef TEST
@@ -1026,23 +1034,20 @@ static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub *
break;
case I_ANYNL:
if (!*sp) return 1;
sp += chartorune(&c, sp);
if (c == 0)
return 1;
pc = pc + 1;
break;
case I_ANY:
if (!*sp) return 1;
sp += chartorune(&c, sp);
if (c == 0)
return 1;
if (isnewline(c))
return 1;
pc = pc + 1;
break;
case I_CHAR:
if (!*sp) return 1;
sp += chartorune(&c, sp);
if (c == 0)
return 1;
if (flags & REG_ICASE)
c = canon(c);
if (c != pc->c)
@@ -1050,9 +1055,8 @@ static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub *
pc = pc + 1;
break;
case I_CCLASS:
if (!*sp) return 1;
sp += chartorune(&c, sp);
if (c == 0)
return 1;
if (flags & REG_ICASE) {
if (!incclasscanon(pc->cc, canon(c)))
return 1;
@@ -1063,9 +1067,8 @@ static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub *
pc = pc + 1;
break;
case I_NCCLASS:
if (!*sp) return 1;
sp += chartorune(&c, sp);
if (c == 0)
return 1;
if (flags & REG_ICASE) {
if (incclasscanon(pc->cc, canon(c)))
return 1;

16
utf.c
View File

@@ -48,6 +48,12 @@ chartorune(Rune *rune, const char *str)
int c, c1, c2;
int l;
/* overlong null character */
if((uchar)str[0] == 0xc0 && (uchar)str[1] == 0x80) {
*rune = 0;
return 2;
}
/*
* one character sequence
* 00000-0007F => T1
@@ -101,13 +107,19 @@ bad:
int
runetochar(char *str, const Rune *rune)
{
int c;
int c = *rune;
/* overlong null character */
if (c == 0) {
str[0] = 0xc0;
str[1] = 0x80;
return 2;
}
/*
* one character sequence
* 00000-0007F => 00-7F
*/
c = *rune;
if(c <= Rune1) {
str[0] = c;
return 1;