Support 4-byte UTF-8 sequences.

The following functions are no longer restricted to 16-bit integer values:

	String.fromCharCode()
	String.prototype.charCodeAt()

repr() will not escape SMP characters, as doing so would require conversion to
surrogate pairs, but will encode these characters as UTF-8. Unicode characters
in the BMP will still be escaped with \uXXXX as before.

JSON.stringify() only escapes control characters, so will represent all non-ASCII
characters as UTF-8.

We do no automatic conversions to/from surrogate pairs. Code that worked with
surrogate pairs should not be affected by these changes.
This commit is contained in:
Tor Andersson
2020-02-19 10:11:31 +01:00
parent 0261579d78
commit 832e069049
6 changed files with 87 additions and 27 deletions

View File

@@ -61,11 +61,15 @@ Custom properties on userdata objects can be implemented using getter and setter
Numbers are represented using double precision floating point values. Numbers are represented using double precision floating point values.
<p> <p>
Strings in the C interface are zero-terminated byte arrays in CESU-8 encoding. Strings in the C interface are zero-terminated byte arrays in WTF-8 encoding.
CESU-8 is a variant of UTF-8 which encodes supplementary unicode characters as This allows both arbitrary 16-bit values (as required by Javascript) and also
surrogate pairs. This maintains compatibility with the UTF-16 nature of extended code points for the full 21-bit Unicode range.
JavaScript, but requires attention when passing strings using supplementary These extended characters will mostly work as expected in Javascript.
unicode characters to and from the MuJS library.
<p>
If you have Javascript code that expects to work with UTF-16 surrogate pairs,
you will need to manually convert any extended characters to surrogate pairs
and back when passing strings between C and Javascript.
<p> <p>
The U+0000 character is encoded as the two-byte sequence <C0 80>, same as in The U+0000 character is encoded as the two-byte sequence <C0 80>, same as in

15
json.c
View File

@@ -180,10 +180,11 @@ static void fmtnum(js_State *J, js_Buffer **sb, double n)
static void fmtstr(js_State *J, js_Buffer **sb, const char *s) static void fmtstr(js_State *J, js_Buffer **sb, const char *s)
{ {
static const char *HEX = "0123456789ABCDEF"; static const char *HEX = "0123456789ABCDEF";
int i, n;
Rune c; Rune c;
js_putc(J, sb, '"'); js_putc(J, sb, '"');
while (*s) { while (*s) {
s += chartorune(&c, s); n = chartorune(&c, s);
switch (c) { switch (c) {
case '"': js_puts(J, sb, "\\\""); break; case '"': js_puts(J, sb, "\\\""); break;
case '\\': js_puts(J, sb, "\\\\"); break; case '\\': js_puts(J, sb, "\\\\"); break;
@@ -193,16 +194,22 @@ static void fmtstr(js_State *J, js_Buffer **sb, const char *s)
case '\r': js_puts(J, sb, "\\r"); break; case '\r': js_puts(J, sb, "\\r"); break;
case '\t': js_puts(J, sb, "\\t"); break; case '\t': js_puts(J, sb, "\\t"); break;
default: default:
if (c < ' ' || c > 127) { if (c < ' ') {
js_puts(J, sb, "\\u"); js_putc(J, sb, '\\');
js_putc(J, sb, 'u');
js_putc(J, sb, HEX[(c>>12)&15]); js_putc(J, sb, HEX[(c>>12)&15]);
js_putc(J, sb, HEX[(c>>8)&15]); js_putc(J, sb, HEX[(c>>8)&15]);
js_putc(J, sb, HEX[(c>>4)&15]); js_putc(J, sb, HEX[(c>>4)&15]);
js_putc(J, sb, HEX[c&15]); js_putc(J, sb, HEX[c&15]);
} else if (c < 128) {
js_putc(J, sb, c);
} else { } else {
js_putc(J, sb, c); break; for (i = 0; i < n; ++i)
js_putc(J, sb, s[i]);
} }
break;
} }
s += n;
} }
js_putc(J, sb, '"'); js_putc(J, sb, '"');
} }

View File

@@ -19,10 +19,11 @@ static void reprnum(js_State *J, js_Buffer **sb, double n)
static void reprstr(js_State *J, js_Buffer **sb, const char *s) static void reprstr(js_State *J, js_Buffer **sb, const char *s)
{ {
static const char *HEX = "0123456789ABCDEF"; static const char *HEX = "0123456789ABCDEF";
int i, n;
Rune c; Rune c;
js_putc(J, sb, '"'); js_putc(J, sb, '"');
while (*s) { while (*s) {
s += chartorune(&c, s); n = chartorune(&c, s);
switch (c) { switch (c) {
case '"': js_puts(J, sb, "\\\""); break; case '"': js_puts(J, sb, "\\\""); break;
case '\\': js_puts(J, sb, "\\\\"); break; case '\\': js_puts(J, sb, "\\\\"); break;
@@ -32,16 +33,27 @@ static void reprstr(js_State *J, js_Buffer **sb, const char *s)
case '\r': js_puts(J, sb, "\\r"); break; case '\r': js_puts(J, sb, "\\r"); break;
case '\t': js_puts(J, sb, "\\t"); break; case '\t': js_puts(J, sb, "\\t"); break;
default: default:
if (c < ' ' || c > 127) { if (c < ' ') {
js_puts(J, sb, "\\u"); js_putc(J, sb, '\\');
js_putc(J, sb, 'x');
js_putc(J, sb, HEX[(c>>4)&15]);
js_putc(J, sb, HEX[c&15]);
} else if (c < 128) {
js_putc(J, sb, c);
} else if (c < 0x10000) {
js_putc(J, sb, '\\');
js_putc(J, sb, 'u');
js_putc(J, sb, HEX[(c>>12)&15]); js_putc(J, sb, HEX[(c>>12)&15]);
js_putc(J, sb, HEX[(c>>8)&15]); js_putc(J, sb, HEX[(c>>8)&15]);
js_putc(J, sb, HEX[(c>>4)&15]); js_putc(J, sb, HEX[(c>>4)&15]);
js_putc(J, sb, HEX[c&15]); js_putc(J, sb, HEX[c&15]);
} else { } else {
js_putc(J, sb, c); break; for (i = 0; i < n; ++i)
js_putc(J, sb, s[i]);
} }
break;
} }
s += n;
} }
js_putc(J, sb, '"'); js_putc(J, sb, '"');
} }

View File

@@ -310,7 +310,7 @@ static void S_fromCharCode(js_State *J)
} }
for (i = 1; i < top; ++i) { for (i = 1; i < top; ++i) {
c = js_touint16(J, i); c = js_touint32(J, i);
p += runetochar(p, &c); p += runetochar(p, &c);
} }
*p = 0; *p = 0;

58
utf.c
View File

@@ -25,27 +25,30 @@ enum
Bit2 = 5, Bit2 = 5,
Bit3 = 4, Bit3 = 4,
Bit4 = 3, Bit4 = 3,
Bit5 = 2,
T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */
Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */
Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0000 0000 0111 1111 1111 */
Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ Rune3 = (1<<(Bit3+2*Bitx))-1, /* 0000 0000 1111 1111 1111 1111 */
Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0001 1111 1111 1111 1111 1111 */
Maskx = (1<<Bitx)-1, /* 0011 1111 */ Maskx = (1<<Bitx)-1, /* 0011 1111 */
Testx = Maskx ^ 0xFF, /* 1100 0000 */ Testx = Maskx ^ 0xFF, /* 1100 0000 */
Bad = Runeerror, Bad = Runeerror
}; };
int int
chartorune(Rune *rune, const char *str) chartorune(Rune *rune, const char *str)
{ {
int c, c1, c2; int c, c1, c2, c3;
int l; int l;
/* overlong null character */ /* overlong null character */
@@ -96,6 +99,25 @@ chartorune(Rune *rune, const char *str)
return 3; return 3;
} }
/*
* four character sequence
* 10000-10FFFF => T4 Tx Tx Tx
*/
if(UTFmax >= 4) {
c3 = *(uchar*)(str+3) ^ Tx;
if(c3 & Testx)
goto bad;
if(c < T5) {
l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
if(l <= Rune3)
goto bad;
if(l > Runemax)
goto bad;
*rune = l;
return 4;
}
}
/* /*
* bad decoding * bad decoding
*/ */
@@ -127,7 +149,7 @@ runetochar(char *str, const Rune *rune)
/* /*
* two character sequence * two character sequence
* 0080-07FF => T2 Tx * 00080-007FF => T2 Tx
*/ */
if(c <= Rune2) { if(c <= Rune2) {
str[0] = T2 | (c >> 1*Bitx); str[0] = T2 | (c >> 1*Bitx);
@@ -137,12 +159,26 @@ runetochar(char *str, const Rune *rune)
/* /*
* three character sequence * three character sequence
* 0800-FFFF => T3 Tx Tx * 00800-0FFFF => T3 Tx Tx
*/ */
str[0] = T3 | (c >> 2*Bitx); if(c > Runemax)
str[1] = Tx | ((c >> 1*Bitx) & Maskx); c = Runeerror;
str[2] = Tx | (c & Maskx); if(c <= Rune3) {
return 3; str[0] = T3 | (c >> 2*Bitx);
str[1] = Tx | ((c >> 1*Bitx) & Maskx);
str[2] = Tx | (c & Maskx);
return 3;
}
/*
* four character sequence
* 010000-10FFFF => T4 Tx Tx Tx
*/
str[0] = T4 | (c >> 3*Bitx);
str[1] = Tx | ((c >> 2*Bitx) & Maskx);
str[2] = Tx | ((c >> 1*Bitx) & Maskx);
str[3] = Tx | (c & Maskx);
return 4;
} }
int int

5
utf.h
View File

@@ -1,7 +1,7 @@
#ifndef js_utf_h #ifndef js_utf_h
#define js_utf_h #define js_utf_h
typedef unsigned short Rune; /* 16 bits */ typedef int Rune; /* 32 bits */
#define chartorune jsU_chartorune #define chartorune jsU_chartorune
#define runetochar jsU_runetochar #define runetochar jsU_runetochar
@@ -19,10 +19,11 @@ typedef unsigned short Rune; /* 16 bits */
enum enum
{ {
UTFmax = 3, /* maximum bytes per rune */ UTFmax = 4, /* maximum bytes per rune */
Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */ Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */
Runeself = 0x80, /* rune and UTF sequences are the same (<) */ Runeself = 0x80, /* rune and UTF sequences are the same (<) */
Runeerror = 0xFFFD, /* decoding error in UTF */ Runeerror = 0xFFFD, /* decoding error in UTF */
Runemax = 0x10FFFF, /* maximum rune value */
}; };
int chartorune(Rune *rune, const char *str); int chartorune(Rune *rune, const char *str);