mirror of
https://github.com/ccxvii/mujs.git
synced 2026-02-06 01:41:37 +08:00
Support 4-byte UTF-8 sequences.
The following functions are no longer restricted to 16-bit integer values: String.fromCharCode() and String.prototype.charCodeAt(). repr() will not escape SMP characters, as doing so would require conversion to surrogate pairs, but will encode these characters as UTF-8. Unicode characters in the BMP will still be escaped with \uXXXX as before. JSON.stringify() only escapes control characters, so it will represent all non-ASCII characters as UTF-8. We do no automatic conversions to/from surrogate pairs. Code that worked with surrogate pairs should not be affected by these changes.
This commit is contained in:
@@ -61,11 +61,15 @@ Custom properties on userdata objects can be implemented using getter and setter
|
|||||||
Numbers are represented using double precision floating point values.
|
Numbers are represented using double precision floating point values.
|
||||||
|
|
||||||
<p>
|
<p>
|
||||||
Strings in the C interface are zero-terminated byte arrays in CESU-8 encoding.
|
Strings in the C interface are zero-terminated byte arrays in WTF-8 encoding.
|
||||||
CESU-8 is a variant of UTF-8 which encodes supplementary unicode characters as
|
This allows both arbitrary 16-bit values (as required by Javascript) and also
|
||||||
surrogate pairs. This maintains compatibility with the UTF-16 nature of
|
extended code points for the full 21-bit Unicode range.
|
||||||
JavaScript, but requires attention when passing strings using supplementary
|
These extended characters will mostly work as expected in Javascript.
|
||||||
unicode characters to and from the MuJS library.
|
|
||||||
|
<p>
|
||||||
|
If you have Javascript code that expects to work with UTF-16 surrogate pairs,
|
||||||
|
you will need to manually convert any extended characters to surrogate pairs
|
||||||
|
and back when passing strings between C and Javascript.
|
||||||
|
|
||||||
<p>
|
<p>
|
||||||
The U+0000 character is encoded as the two-byte sequence <C0 80>, same as in
|
The U+0000 character is encoded as the two-byte sequence <C0 80>, same as in
|
||||||
|
|||||||
15
json.c
15
json.c
@@ -180,10 +180,11 @@ static void fmtnum(js_State *J, js_Buffer **sb, double n)
|
|||||||
static void fmtstr(js_State *J, js_Buffer **sb, const char *s)
|
static void fmtstr(js_State *J, js_Buffer **sb, const char *s)
|
||||||
{
|
{
|
||||||
static const char *HEX = "0123456789ABCDEF";
|
static const char *HEX = "0123456789ABCDEF";
|
||||||
|
int i, n;
|
||||||
Rune c;
|
Rune c;
|
||||||
js_putc(J, sb, '"');
|
js_putc(J, sb, '"');
|
||||||
while (*s) {
|
while (*s) {
|
||||||
s += chartorune(&c, s);
|
n = chartorune(&c, s);
|
||||||
switch (c) {
|
switch (c) {
|
||||||
case '"': js_puts(J, sb, "\\\""); break;
|
case '"': js_puts(J, sb, "\\\""); break;
|
||||||
case '\\': js_puts(J, sb, "\\\\"); break;
|
case '\\': js_puts(J, sb, "\\\\"); break;
|
||||||
@@ -193,16 +194,22 @@ static void fmtstr(js_State *J, js_Buffer **sb, const char *s)
|
|||||||
case '\r': js_puts(J, sb, "\\r"); break;
|
case '\r': js_puts(J, sb, "\\r"); break;
|
||||||
case '\t': js_puts(J, sb, "\\t"); break;
|
case '\t': js_puts(J, sb, "\\t"); break;
|
||||||
default:
|
default:
|
||||||
if (c < ' ' || c > 127) {
|
if (c < ' ') {
|
||||||
js_puts(J, sb, "\\u");
|
js_putc(J, sb, '\\');
|
||||||
|
js_putc(J, sb, 'u');
|
||||||
js_putc(J, sb, HEX[(c>>12)&15]);
|
js_putc(J, sb, HEX[(c>>12)&15]);
|
||||||
js_putc(J, sb, HEX[(c>>8)&15]);
|
js_putc(J, sb, HEX[(c>>8)&15]);
|
||||||
js_putc(J, sb, HEX[(c>>4)&15]);
|
js_putc(J, sb, HEX[(c>>4)&15]);
|
||||||
js_putc(J, sb, HEX[c&15]);
|
js_putc(J, sb, HEX[c&15]);
|
||||||
|
} else if (c < 128) {
|
||||||
|
js_putc(J, sb, c);
|
||||||
} else {
|
} else {
|
||||||
js_putc(J, sb, c); break;
|
for (i = 0; i < n; ++i)
|
||||||
|
js_putc(J, sb, s[i]);
|
||||||
}
|
}
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
|
s += n;
|
||||||
}
|
}
|
||||||
js_putc(J, sb, '"');
|
js_putc(J, sb, '"');
|
||||||
}
|
}
|
||||||
|
|||||||
20
jsrepr.c
20
jsrepr.c
@@ -19,10 +19,11 @@ static void reprnum(js_State *J, js_Buffer **sb, double n)
|
|||||||
static void reprstr(js_State *J, js_Buffer **sb, const char *s)
|
static void reprstr(js_State *J, js_Buffer **sb, const char *s)
|
||||||
{
|
{
|
||||||
static const char *HEX = "0123456789ABCDEF";
|
static const char *HEX = "0123456789ABCDEF";
|
||||||
|
int i, n;
|
||||||
Rune c;
|
Rune c;
|
||||||
js_putc(J, sb, '"');
|
js_putc(J, sb, '"');
|
||||||
while (*s) {
|
while (*s) {
|
||||||
s += chartorune(&c, s);
|
n = chartorune(&c, s);
|
||||||
switch (c) {
|
switch (c) {
|
||||||
case '"': js_puts(J, sb, "\\\""); break;
|
case '"': js_puts(J, sb, "\\\""); break;
|
||||||
case '\\': js_puts(J, sb, "\\\\"); break;
|
case '\\': js_puts(J, sb, "\\\\"); break;
|
||||||
@@ -32,16 +33,27 @@ static void reprstr(js_State *J, js_Buffer **sb, const char *s)
|
|||||||
case '\r': js_puts(J, sb, "\\r"); break;
|
case '\r': js_puts(J, sb, "\\r"); break;
|
||||||
case '\t': js_puts(J, sb, "\\t"); break;
|
case '\t': js_puts(J, sb, "\\t"); break;
|
||||||
default:
|
default:
|
||||||
if (c < ' ' || c > 127) {
|
if (c < ' ') {
|
||||||
js_puts(J, sb, "\\u");
|
js_putc(J, sb, '\\');
|
||||||
|
js_putc(J, sb, 'x');
|
||||||
|
js_putc(J, sb, HEX[(c>>4)&15]);
|
||||||
|
js_putc(J, sb, HEX[c&15]);
|
||||||
|
} else if (c < 128) {
|
||||||
|
js_putc(J, sb, c);
|
||||||
|
} else if (c < 0x10000) {
|
||||||
|
js_putc(J, sb, '\\');
|
||||||
|
js_putc(J, sb, 'u');
|
||||||
js_putc(J, sb, HEX[(c>>12)&15]);
|
js_putc(J, sb, HEX[(c>>12)&15]);
|
||||||
js_putc(J, sb, HEX[(c>>8)&15]);
|
js_putc(J, sb, HEX[(c>>8)&15]);
|
||||||
js_putc(J, sb, HEX[(c>>4)&15]);
|
js_putc(J, sb, HEX[(c>>4)&15]);
|
||||||
js_putc(J, sb, HEX[c&15]);
|
js_putc(J, sb, HEX[c&15]);
|
||||||
} else {
|
} else {
|
||||||
js_putc(J, sb, c); break;
|
for (i = 0; i < n; ++i)
|
||||||
|
js_putc(J, sb, s[i]);
|
||||||
}
|
}
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
|
s += n;
|
||||||
}
|
}
|
||||||
js_putc(J, sb, '"');
|
js_putc(J, sb, '"');
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -310,7 +310,7 @@ static void S_fromCharCode(js_State *J)
|
|||||||
}
|
}
|
||||||
|
|
||||||
for (i = 1; i < top; ++i) {
|
for (i = 1; i < top; ++i) {
|
||||||
c = js_touint16(J, i);
|
c = js_touint32(J, i);
|
||||||
p += runetochar(p, &c);
|
p += runetochar(p, &c);
|
||||||
}
|
}
|
||||||
*p = 0;
|
*p = 0;
|
||||||
|
|||||||
58
utf.c
58
utf.c
@@ -25,27 +25,30 @@ enum
|
|||||||
Bit2 = 5,
|
Bit2 = 5,
|
||||||
Bit3 = 4,
|
Bit3 = 4,
|
||||||
Bit4 = 3,
|
Bit4 = 3,
|
||||||
|
Bit5 = 2,
|
||||||
|
|
||||||
T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
|
T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
|
||||||
Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
|
Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
|
||||||
T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
|
T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
|
||||||
T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
|
T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
|
||||||
T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
|
T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
|
||||||
|
T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */
|
||||||
|
|
||||||
Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */
|
Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */
|
||||||
Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */
|
Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0000 0000 0111 1111 1111 */
|
||||||
Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */
|
Rune3 = (1<<(Bit3+2*Bitx))-1, /* 0000 0000 1111 1111 1111 1111 */
|
||||||
|
Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0011 1111 1111 1111 1111 1111 */
|
||||||
|
|
||||||
Maskx = (1<<Bitx)-1, /* 0011 1111 */
|
Maskx = (1<<Bitx)-1, /* 0011 1111 */
|
||||||
Testx = Maskx ^ 0xFF, /* 1100 0000 */
|
Testx = Maskx ^ 0xFF, /* 1100 0000 */
|
||||||
|
|
||||||
Bad = Runeerror,
|
Bad = Runeerror
|
||||||
};
|
};
|
||||||
|
|
||||||
int
|
int
|
||||||
chartorune(Rune *rune, const char *str)
|
chartorune(Rune *rune, const char *str)
|
||||||
{
|
{
|
||||||
int c, c1, c2;
|
int c, c1, c2, c3;
|
||||||
int l;
|
int l;
|
||||||
|
|
||||||
/* overlong null character */
|
/* overlong null character */
|
||||||
@@ -96,6 +99,25 @@ chartorune(Rune *rune, const char *str)
|
|||||||
return 3;
|
return 3;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* four character sequence
|
||||||
|
* 10000-10FFFF => T4 Tx Tx Tx
|
||||||
|
*/
|
||||||
|
if(UTFmax >= 4) {
|
||||||
|
c3 = *(uchar*)(str+3) ^ Tx;
|
||||||
|
if(c3 & Testx)
|
||||||
|
goto bad;
|
||||||
|
if(c < T5) {
|
||||||
|
l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
|
||||||
|
if(l <= Rune3)
|
||||||
|
goto bad;
|
||||||
|
if(l > Runemax)
|
||||||
|
goto bad;
|
||||||
|
*rune = l;
|
||||||
|
return 4;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* bad decoding
|
* bad decoding
|
||||||
*/
|
*/
|
||||||
@@ -127,7 +149,7 @@ runetochar(char *str, const Rune *rune)
|
|||||||
|
|
||||||
/*
|
/*
|
||||||
* two character sequence
|
* two character sequence
|
||||||
* 0080-07FF => T2 Tx
|
* 00080-007FF => T2 Tx
|
||||||
*/
|
*/
|
||||||
if(c <= Rune2) {
|
if(c <= Rune2) {
|
||||||
str[0] = T2 | (c >> 1*Bitx);
|
str[0] = T2 | (c >> 1*Bitx);
|
||||||
@@ -137,12 +159,26 @@ runetochar(char *str, const Rune *rune)
|
|||||||
|
|
||||||
/*
|
/*
|
||||||
* three character sequence
|
* three character sequence
|
||||||
* 0800-FFFF => T3 Tx Tx
|
* 00800-0FFFF => T3 Tx Tx
|
||||||
*/
|
*/
|
||||||
str[0] = T3 | (c >> 2*Bitx);
|
if(c > Runemax)
|
||||||
str[1] = Tx | ((c >> 1*Bitx) & Maskx);
|
c = Runeerror;
|
||||||
str[2] = Tx | (c & Maskx);
|
if(c <= Rune3) {
|
||||||
return 3;
|
str[0] = T3 | (c >> 2*Bitx);
|
||||||
|
str[1] = Tx | ((c >> 1*Bitx) & Maskx);
|
||||||
|
str[2] = Tx | (c & Maskx);
|
||||||
|
return 3;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* four character sequence
|
||||||
|
* 010000-10FFFF => T4 Tx Tx Tx
|
||||||
|
*/
|
||||||
|
str[0] = T4 | (c >> 3*Bitx);
|
||||||
|
str[1] = Tx | ((c >> 2*Bitx) & Maskx);
|
||||||
|
str[2] = Tx | ((c >> 1*Bitx) & Maskx);
|
||||||
|
str[3] = Tx | (c & Maskx);
|
||||||
|
return 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
int
|
int
|
||||||
|
|||||||
5
utf.h
5
utf.h
@@ -1,7 +1,7 @@
|
|||||||
#ifndef js_utf_h
|
#ifndef js_utf_h
|
||||||
#define js_utf_h
|
#define js_utf_h
|
||||||
|
|
||||||
typedef unsigned short Rune; /* 16 bits */
|
typedef int Rune; /* 32 bits */
|
||||||
|
|
||||||
#define chartorune jsU_chartorune
|
#define chartorune jsU_chartorune
|
||||||
#define runetochar jsU_runetochar
|
#define runetochar jsU_runetochar
|
||||||
@@ -19,10 +19,11 @@ typedef unsigned short Rune; /* 16 bits */
|
|||||||
|
|
||||||
enum
|
enum
|
||||||
{
|
{
|
||||||
UTFmax = 3, /* maximum bytes per rune */
|
UTFmax = 4, /* maximum bytes per rune */
|
||||||
Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */
|
Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */
|
||||||
Runeself = 0x80, /* rune and UTF sequences are the same (<) */
|
Runeself = 0x80, /* rune and UTF sequences are the same (<) */
|
||||||
Runeerror = 0xFFFD, /* decoding error in UTF */
|
Runeerror = 0xFFFD, /* decoding error in UTF */
|
||||||
|
Runemax = 0x10FFFF, /* maximum rune value */
|
||||||
};
|
};
|
||||||
|
|
||||||
int chartorune(Rune *rune, const char *str);
|
int chartorune(Rune *rune, const char *str);
|
||||||
|
|||||||
Reference in New Issue
Block a user