Support embedded 0 in strings by using modified UTF-8.

2026-02-05 17:29:43 +08:00 · 2020-05-14 13:59:34 +02:00
parent 331c5ecbac
commit 0261579d78
6 changed files with 65 additions and 36 deletions
--- a/docs/reference.html
+++ b/docs/reference.html
@@ -65,8 +65,11 @@ Strings in the C interface are zero-terminated byte arrays in CESU-8 encoding.
 CESU-8 is a variant of UTF-8 which encodes supplementary unicode characters as
 surrogate pairs. This maintains compatibility with the UTF-16 nature of
 JavaScript, but requires attention when passing strings using supplementary
-unicode characters to and from the MuJS library. It also means that you cannot
-have any JavaScript strings with a zero character value in MuJS.
+unicode characters to and from the MuJS library.
+
+<p>
+The U+0000 character is encoded as the two-byte sequence &lt;C0 80&gt;, the same
+as in modified UTF-8.

 <h3>Environments</h3>

--- a/jslex.c
+++ b/jslex.c
@@ -158,6 +158,10 @@ int jsY_tohex(int c)
 static void jsY_next(js_State *J)
 {
 	Rune c;
+	if (*J->source == 0) {
+		J->lexchar = EOF;
+		return;
+	}
 	J->source += chartorune(&c, J->source);
 	/* consume CR LF as one unit */
 	if (c == '\r' && *J->source == '\n')
@@ -201,17 +205,24 @@ static void textinit(js_State *J)

 static void textpush(js_State *J, Rune c)
 {
-	int n = runelen(c);
+	int n;
+	if (c == EOF)
+		n = 1;
+	else
+		n = runelen(c);
 	if (J->lexbuf.len + n > J->lexbuf.cap) {
 		J->lexbuf.cap = J->lexbuf.cap * 2;
 		J->lexbuf.text = js_realloc(J, J->lexbuf.text, J->lexbuf.cap);
 	}
-	J->lexbuf.len += runetochar(J->lexbuf.text + J->lexbuf.len, &c);
+	if (c == EOF)
+		J->lexbuf.text[J->lexbuf.len++] = 0;
+	else
+		J->lexbuf.len += runetochar(J->lexbuf.text + J->lexbuf.len, &c);
 }

 static char *textend(js_State *J)
 {
-	textpush(J, 0);
+	textpush(J, EOF);
 	return J->lexbuf.text;
 }

@@ -224,7 +235,7 @@ static void lexlinecomment(js_State *J)
 static int lexcomment(js_State *J)
 {
 	/* already consumed initial '/' '*' sequence */
-	while (J->lexchar != 0) {
+	while (J->lexchar != EOF) {
 		if (jsY_accept(J, '*')) {
 			while (J->lexchar == '*')
 				jsY_next(J);
@@ -385,7 +396,7 @@ static int lexescape(js_State *J)
 		return 0;

 	switch (J->lexchar) {
-	case 0: jsY_error(J, "unterminated escape sequence");
+	case EOF: jsY_error(J, "unterminated escape sequence");
 	case 'u':
 		jsY_next(J);
 		if (!jsY_ishex(J->lexchar)) return 1; else { x |= jsY_tohex(J->lexchar) << 12; jsY_next(J); }
@@ -425,7 +436,7 @@ static int lexstring(js_State *J)
 	textinit(J);

 	while (J->lexchar != q) {
-		if (J->lexchar == 0 || J->lexchar == '\n')
+		if (J->lexchar == EOF || J->lexchar == '\n')
 			jsY_error(J, "string not terminated");
 		if (jsY_accept(J, '\\')) {
 			if (lexescape(J))
@@ -475,14 +486,14 @@ static int lexregexp(js_State *J)

 	/* regexp body */
 	while (J->lexchar != '/' || inclass) {
-		if (J->lexchar == 0 || J->lexchar == '\n') {
+		if (J->lexchar == EOF || J->lexchar == '\n') {
 			jsY_error(J, "regular expression not terminated");
 		} else if (jsY_accept(J, '\\')) {
 			if (jsY_accept(J, '/')) {
 				textpush(J, '/');
 			} else {
 				textpush(J, '\\');
-				if (J->lexchar == 0 || J->lexchar == '\n')
+				if (J->lexchar == EOF || J->lexchar == '\n')
 					jsY_error(J, "regular expression not terminated");
 				textpush(J, J->lexchar);
 				jsY_next(J);
@@ -688,7 +699,7 @@ static int jsY_lexx(js_State *J)
 				return TK_XOR_ASS;
 			return '^';

-		case 0:
+		case EOF:
 			return 0; /* EOF */
 		}

@@ -803,7 +814,7 @@ static int lexjsonstring(js_State *J)
 	textinit(J);

 	while (J->lexchar != '"') {
-		if (J->lexchar == 0)
+		if (J->lexchar == EOF)
 			jsY_error(J, "unterminated string");
 		else if (J->lexchar < 32)
 			jsY_error(J, "invalid control character in string");
@@ -857,7 +868,7 @@ int jsY_lexjson(js_State *J)
 			jsY_next(J); jsY_expect(J, 'r'); jsY_expect(J, 'u'); jsY_expect(J, 'e');
 			return TK_TRUE;

-		case 0:
+		case EOF:
 			return 0; /* EOF */
 		}

--- a/jsrun.c
+++ b/jsrun.c
@@ -469,7 +469,7 @@ int js_isarrayindex(js_State *J, const char *p, int *idx)
 static void js_pushrune(js_State *J, Rune rune)
 {
 	char buf[UTFmax + 1];
-	if (rune > 0) {
+	if (rune >= 0) {
 		buf[runetochar(buf, &rune)] = 0;
 		js_pushstring(J, buf);
 	} else {
--- a/jsstring.c
+++ b/jsstring.c
@@ -21,12 +21,12 @@ static const char *checkstring(js_State *J, int idx)

 int js_runeat(js_State *J, const char *s, int i)
 {
-	Rune rune = 0;
+	Rune rune = EOF;
 	while (i-- >= 0) {
 		rune = *(unsigned char*)s;
 		if (rune < Runeself) {
 			if (rune == 0)
-				return 0;
+				return EOF;
 			++s;
 		} else
 			s += chartorune(&rune, s);
@@ -93,7 +93,7 @@ static void Sp_charAt(js_State *J)
 	const char *s = checkstring(J, 0);
 	int pos = js_tointeger(J, 1);
 	Rune rune = js_runeat(J, s, pos);
-	if (rune > 0) {
+	if (rune >= 0) {
 		buf[runetochar(buf, &rune)] = 0;
 		js_pushstring(J, buf);
 	} else {
@@ -106,7 +106,7 @@ static void Sp_charCodeAt(js_State *J)
 	const char *s = checkstring(J, 0);
 	int pos = js_tointeger(J, 1);
 	Rune rune = js_runeat(J, s, pos);
-	if (rune > 0)
+	if (rune >= 0)
 		js_pushnumber(J, rune);
 	else
 		js_pushnumber(J, NAN);
--- a/regexp.c
+++ b/regexp.c
@@ -116,11 +116,16 @@ static int isunicodeletter(int c)

 static int nextrune(struct cstate *g)
 {
+	if (!*g->source) {
+		g->yychar = EOF;
+		return 0;
+	}
 	g->source += chartorune(&g->yychar, g->source);
 	if (g->yychar == '\\') {
+		if (!*g->source)
+			die(g, "unterminated escape sequence");
 		g->source += chartorune(&g->yychar, g->source);
 		switch (g->yychar) {
-		case 0: die(g, "unterminated escape sequence"); break;
 		case 'f': g->yychar = '\f'; return 0;
 		case 'n': g->yychar = '\n'; return 0;
 		case 'r': g->yychar = '\r'; return 0;
@@ -147,6 +152,9 @@ static int nextrune(struct cstate *g)
 				return 1;
 			}
 			return 0;
+		case 0:
+			g->yychar = '0';
+			return 1;
 		}
 		if (strchr(ESCAPES, g->yychar))
 			return 1;
@@ -272,7 +280,7 @@ static int lexclass(struct cstate *g)

 	havesave = havedash = 0;
 	for (;;) {
-		if (g->yychar == 0)
+		if (g->yychar == EOF)
 			die(g, "unterminated character class");
 		if (!quoted && g->yychar == ']')
 			break;
@@ -363,7 +371,7 @@ static int lex(struct cstate *g)
 	}

 	switch (g->yychar) {
-	case 0:
+	case EOF:
 	case '$': case ')': case '*': case '+':
 	case '.': case '?': case '^': case '|':
 		return g->yychar;
@@ -561,11 +569,11 @@ static Renode *parserep(struct cstate *g)
 static Renode *parsecat(struct cstate *g)
 {
 	Renode *cat, *head, **tail;
-	if (g->lookahead && g->lookahead != '|' && g->lookahead != ')') {
+	if (g->lookahead != EOF && g->lookahead != '|' && g->lookahead != ')') {
 		/* Build a right-leaning tree by splicing in new 'cat' at the tail. */
 		head = parserep(g);
 		tail = &head;
-		while (g->lookahead && g->lookahead != '|' && g->lookahead != ')') {
+		while (g->lookahead != EOF && g->lookahead != '|' && g->lookahead != ')') {
 			cat = newnode(g, P_CAT);
 			cat->x = *tail;
 			cat->y = parserep(g);
@@ -866,7 +874,7 @@ Reprog *regcompx(void *(*alloc)(void *ctx, void *p, int n), void *ctx,
 	node = parsealt(&g);
 	if (g.lookahead == ')')
 		die(&g, "unmatched ')'");
-	if (g.lookahead != 0)
+	if (g.lookahead != EOF)
 		die(&g, "syntax error");

 #ifdef TEST
@@ -1026,23 +1034,20 @@ static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub *
 			break;

 		case I_ANYNL:
+			if (!*sp) return 1;
 			sp += chartorune(&c, sp);
-			if (c == 0)
-				return 1;
 			pc = pc + 1;
 			break;
 		case I_ANY:
+			if (!*sp) return 1;
 			sp += chartorune(&c, sp);
-			if (c == 0)
-				return 1;
 			if (isnewline(c))
 				return 1;
 			pc = pc + 1;
 			break;
 		case I_CHAR:
+			if (!*sp) return 1;
 			sp += chartorune(&c, sp);
-			if (c == 0)
-				return 1;
 			if (flags & REG_ICASE)
 				c = canon(c);
 			if (c != pc->c)
@@ -1050,9 +1055,8 @@ static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub *
 			pc = pc + 1;
 			break;
 		case I_CCLASS:
+			if (!*sp) return 1;
 			sp += chartorune(&c, sp);
-			if (c == 0)
-				return 1;
 			if (flags & REG_ICASE) {
 				if (!incclasscanon(pc->cc, canon(c)))
 					return 1;
@@ -1063,9 +1067,8 @@ static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub *
 			pc = pc + 1;
 			break;
 		case I_NCCLASS:
+			if (!*sp) return 1;
 			sp += chartorune(&c, sp);
-			if (c == 0)
-				return 1;
 			if (flags & REG_ICASE) {
 				if (incclasscanon(pc->cc, canon(c)))
 					return 1;
--- a/utf.c
+++ b/utf.c
@@ -48,6 +48,12 @@ chartorune(Rune *rune, const char *str)
 	int c, c1, c2;
 	int l;

+	/* overlong null character */
+	if((uchar)str[0] == 0xc0 && (uchar)str[1] == 0x80) {
+		*rune = 0;
+		return 2;
+	}
+
 	/*
 	 * one character sequence
 	 *	00000-0007F => T1
@@ -101,13 +107,19 @@ bad:
 int
 runetochar(char *str, const Rune *rune)
 {
-	int c;
+	int c = *rune;
+
+	/* overlong null character */
+	if (c == 0) {
+		str[0] = 0xc0;
+		str[1] = 0x80;
+		return 2;
+	}

 	/*
 	 * one character sequence
 	 *	00000-0007F => 00-7F
 	 */
-	c = *rune;
 	if(c <= Rune1) {
 		str[0] = c;
 		return 1;