Update character tables to include SpecialCasing.txt.

Use full mappings for String.prototype.to(Upper|Lower)Case, including
the mappings that change the string length (for example "ß" to "SS").
This commit is contained in:
Tor Andersson
2025-02-10 11:21:38 +01:00
parent cc569c5fa9
commit 94ec2f2d7c
6 changed files with 265 additions and 15 deletions

View File

@@ -29,7 +29,7 @@ ifeq ($(shell uname),FreeBSD)
CFLAGS += -I/usr/local/include -L/usr/local/lib
endif
HDRS = mujs.h jsi.h regexp.h utf.h astnames.h opnames.h
HDRS = mujs.h jsi.h regexp.h utf.h astnames.h opnames.h utfdata.h
ifneq ($(HAVE_READLINE),no)
READLINE_CFLAGS = -DHAVE_READLINE
@@ -73,10 +73,12 @@ opnames.h: jsi.h
grep -E '\<OP_' jsi.h | sed 's/^[^A-Z]*OP_/"/;s/,.*/",/' | tr A-Z a-z > $@
UnicodeData.txt:
curl -s -o $@ https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
curl -s -o $@ https://www.unicode.org/Public/16.0.0/ucd/UnicodeData.txt
SpecialCasing.txt:
curl -s -o $@ https://www.unicode.org/Public/16.0.0/ucd/SpecialCasing.txt
utfdata.h: genucd.py UnicodeData.txt
python3 genucd.py UnicodeData.txt >$@
utfdata.h: genucd.py UnicodeData.txt SpecialCasing.txt
python3 genucd.py UnicodeData.txt SpecialCasing.txt >$@
build/sanitize/mujs: main.c one.c $(SRCS) $(HDRS)
@mkdir -p $(@D)

View File

@@ -1,9 +1,11 @@
# Create utfdata.h from UnicodeData.txt
# Create utfdata.h from UnicodeData.txt and SpecialCasing.txt
import sys
tolower = []
toupper = []
tolower_full = []
toupper_full = []
isalpha = []
for line in open(sys.argv[1]).readlines():
@@ -17,6 +19,32 @@ for line in open(sys.argv[1]).readlines():
if line[13]:
tolower.append((code,int(line[13],16)))
# Collect the unconditional full case mappings from SpecialCasing.txt.
# Each data line has the form:
#   code; lower; title; upper; (condition;)? # comment
# The title mapping (field 2) is not used.
with open(sys.argv[2]) as special_casing:
    for line in special_casing:
        line = line.strip()
        # Skip blank lines and comment-only lines.
        if len(line) == 0 or line[0] == "#":
            continue
        line = line.split(";")
        code = int(line[0],16)
        lower = line[1].strip()
        upper = line[3].strip()
        if len(lower) == 0 or len(upper) == 0:
            continue
        # Field 4 holds either the optional condition or only the trailing
        # "# comment"; mappings that carry a condition are skipped.
        condition = line[4].split("#")[0].strip()
        if len(condition) > 0:
            continue
        lower = [int(x,16) for x in lower.split(" ")]
        upper = [int(x,16) for x in upper.split(" ")]
        # Keep every mapping that is not the identity. The whole sequence is
        # compared (not just its first code point) so that a multi-code-point
        # mapping that happens to start with `code` itself is not dropped.
        if lower != [code]:
            tolower_full.append([code] + lower)
        if upper != [code]:
            toupper_full.append([code] + upper)

# Order rows by source code point (the first element of each row); the C side
# looks rows up by that key with ucd_bsearch.
tolower_full.sort()
toupper_full.sort()
def dumpalpha():
table = []
prev = 0
@@ -73,7 +101,17 @@ def dumpmap(name, input):
print(hex(a)+","+str(n-a)+",")
print("};");
def dumpmultimap(name, table, w):
    """Print `table` as a flat C array of Rune called `name`, `w` values per row.

    Each row of `table` is a list of integers: a source code point followed by
    the code points it maps to. Rows are padded with zeros up to `w` values so
    every row has the same width. The rows in `table` are not modified.

    Raises ValueError if any row already has `w` or more values: such a row
    would be emitted without a trailing 0 (or wider than `w`), and the C code
    that walks a row stops only at the first 0.
    """
    # Validate everything up front so nothing is printed for a bad table.
    for row in table:
        if len(row) >= w:
            raise ValueError("%s: row %r does not fit in width %d with a 0 terminator" % (name, row, w))
    print("")
    print("static const Rune " + name + "[] = {")
    for row in table:
        padded = row + [0] * (w - len(row))
        print(",".join(map(hex, padded)) + ",")
    print("};")
# Emit the generated header: a provenance comment naming both input files
# (UnicodeData.txt and SpecialCasing.txt, as passed on the command line),
# followed by each table.
print("/* This file was automatically created from " + " and ".join(sys.argv[1:3]) + " */")
dumpalpha()
dumpmap("ucd_tolower", tolower)
dumpmap("ucd_toupper", toupper)
# The row widths (4 and 5) must match the strides that utf.c passes to
# ucd_bsearch for these two tables.
dumpmultimap("ucd_tolower_full", tolower_full, 4)
dumpmultimap("ucd_toupper_full", toupper_full, 5)

View File

@@ -310,21 +310,46 @@ static void Sp_substring(js_State *J)
static void Sp_toLowerCase(js_State *J)
{
const char *s = checkstring(J, 0);
const char *s, *s0 = checkstring(J, 0);
char * volatile dst = NULL;
char *d;
Rune rune;
const Rune *full;
int n;
n = 1;
for (s = s0; *s;) {
s += chartorune(&rune, s);
full = tolowerrune_full(rune);
if (full) {
while (*full) {
n += runelen(*full);
++full;
}
} else {
rune = tolowerrune(rune);
n += runelen(rune);
}
}
if (js_try(J)) {
js_free(J, dst);
js_throw(J);
}
d = dst = js_malloc(J, UTFmax * strlen(s) + 1);
while (*s) {
d = dst = js_malloc(J, n);
for (s = s0; *s;) {
s += chartorune(&rune, s);
rune = tolowerrune(rune);
d += runetochar(d, &rune);
full = tolowerrune_full(rune);
if (full) {
while (*full) {
d += runetochar(d, full);
++full;
}
} else {
rune = tolowerrune(rune);
d += runetochar(d, &rune);
}
}
*d = 0;
@@ -335,21 +360,46 @@ static void Sp_toLowerCase(js_State *J)
static void Sp_toUpperCase(js_State *J)
{
const char *s = checkstring(J, 0);
const char *s, *s0 = checkstring(J, 0);
char * volatile dst = NULL;
char *d;
const Rune *full;
Rune rune;
int n;
n = 1;
for (s = s0; *s;) {
s += chartorune(&rune, s);
full = toupperrune_full(rune);
if (full) {
while (*full) {
n += runelen(*full);
++full;
}
} else {
rune = toupperrune(rune);
n += runelen(rune);
}
}
if (js_try(J)) {
js_free(J, dst);
js_throw(J);
}
d = dst = js_malloc(J, UTFmax * strlen(s) + 1);
while (*s) {
d = dst = js_malloc(J, n);
for (s = s0; *s;) {
s += chartorune(&rune, s);
rune = toupperrune(rune);
d += runetochar(d, &rune);
full = toupperrune_full(rune);
if (full) {
while (*full) {
d += runetochar(d, full);
++full;
}
} else {
rune = toupperrune(rune);
d += runetochar(d, &rune);
}
}
*d = 0;

20
utf.c
View File

@@ -283,3 +283,23 @@ isalpharune(Rune c)
return 1;
return 0;
}
/*
 * Look up the full lowercase mapping of c in ucd_tolower_full.
 *
 * Returns a pointer to the mapped runes (the table row just past its key);
 * rows in the table are padded with 0, and callers walk the result until
 * they hit a 0 rune. Returns NULL when the table has no row keyed by c.
 */
const Rune *
tolowerrune_full(Rune c)
{
	enum { WIDTH = 4 };	/* runes per table row, key included */
	const Rune *row = ucd_bsearch(c, ucd_tolower_full, nelem(ucd_tolower_full) / WIDTH, WIDTH);

	/* Accept the search result only if it is a row whose key is exactly c. */
	if(row == NULL || row[0] != c)
		return NULL;
	return row + 1;
}
/*
 * Look up the full uppercase mapping of c in ucd_toupper_full.
 *
 * Returns a pointer to the mapped runes (the table row just past its key);
 * rows in the table are padded with 0, and callers walk the result until
 * they hit a 0 rune. Returns NULL when the table has no row keyed by c.
 */
const Rune *
toupperrune_full(Rune c)
{
	enum { WIDTH = 5 };	/* runes per table row, key included */
	const Rune *row = ucd_bsearch(c, ucd_toupper_full, nelem(ucd_toupper_full) / WIDTH, WIDTH);

	/* Accept the search result only if it is a row whose key is exactly c. */
	if(row == NULL || row[0] != c)
		return NULL;
	return row + 1;
}

4
utf.h
View File

@@ -25,6 +25,8 @@ typedef int Rune; /* 32 bits */
#define isupperrune jsU_isupperrune
#define tolowerrune jsU_tolowerrune
#define toupperrune jsU_toupperrune
#define tolowerrune_full jsU_tolowerrune_full
#define toupperrune_full jsU_toupperrune_full
enum
{
@@ -44,5 +46,7 @@ int islowerrune(Rune c);
int isupperrune(Rune c);
Rune tolowerrune(Rune c);
Rune toupperrune(Rune c);
const Rune* tolowerrune_full(Rune c);
const Rune* toupperrune_full(Rune c);
#endif

136
utfdata.h
View File

@@ -2071,3 +2071,139 @@ static const Rune ucd_toupper1[] = {
0xa7f6,-1,
0xab53,-928,
};
/*
 * Full lowercase mappings, as fixed-width rows of 4 runes:
 * the source code point, then the code points it maps to,
 * padded with 0x0 up to the row width.
 * Rows are in ascending order of the source code point.
 */
static const Rune ucd_tolower_full[] = {
0x130,0x69,0x307,0x0,
0x1f88,0x1f80,0x0,0x0,
0x1f89,0x1f81,0x0,0x0,
0x1f8a,0x1f82,0x0,0x0,
0x1f8b,0x1f83,0x0,0x0,
0x1f8c,0x1f84,0x0,0x0,
0x1f8d,0x1f85,0x0,0x0,
0x1f8e,0x1f86,0x0,0x0,
0x1f8f,0x1f87,0x0,0x0,
0x1f98,0x1f90,0x0,0x0,
0x1f99,0x1f91,0x0,0x0,
0x1f9a,0x1f92,0x0,0x0,
0x1f9b,0x1f93,0x0,0x0,
0x1f9c,0x1f94,0x0,0x0,
0x1f9d,0x1f95,0x0,0x0,
0x1f9e,0x1f96,0x0,0x0,
0x1f9f,0x1f97,0x0,0x0,
0x1fa8,0x1fa0,0x0,0x0,
0x1fa9,0x1fa1,0x0,0x0,
0x1faa,0x1fa2,0x0,0x0,
0x1fab,0x1fa3,0x0,0x0,
0x1fac,0x1fa4,0x0,0x0,
0x1fad,0x1fa5,0x0,0x0,
0x1fae,0x1fa6,0x0,0x0,
0x1faf,0x1fa7,0x0,0x0,
0x1fbc,0x1fb3,0x0,0x0,
0x1fcc,0x1fc3,0x0,0x0,
0x1ffc,0x1ff3,0x0,0x0,
};
/*
 * Full uppercase mappings, as fixed-width rows of 5 runes:
 * the source code point, then the code points it maps to,
 * padded with 0x0 up to the row width.
 * Rows are in ascending order of the source code point.
 */
static const Rune ucd_toupper_full[] = {
0xdf,0x53,0x53,0x0,0x0,
0x149,0x2bc,0x4e,0x0,0x0,
0x1f0,0x4a,0x30c,0x0,0x0,
0x390,0x399,0x308,0x301,0x0,
0x3b0,0x3a5,0x308,0x301,0x0,
0x587,0x535,0x552,0x0,0x0,
0x1e96,0x48,0x331,0x0,0x0,
0x1e97,0x54,0x308,0x0,0x0,
0x1e98,0x57,0x30a,0x0,0x0,
0x1e99,0x59,0x30a,0x0,0x0,
0x1e9a,0x41,0x2be,0x0,0x0,
0x1f50,0x3a5,0x313,0x0,0x0,
0x1f52,0x3a5,0x313,0x300,0x0,
0x1f54,0x3a5,0x313,0x301,0x0,
0x1f56,0x3a5,0x313,0x342,0x0,
0x1f80,0x1f08,0x399,0x0,0x0,
0x1f81,0x1f09,0x399,0x0,0x0,
0x1f82,0x1f0a,0x399,0x0,0x0,
0x1f83,0x1f0b,0x399,0x0,0x0,
0x1f84,0x1f0c,0x399,0x0,0x0,
0x1f85,0x1f0d,0x399,0x0,0x0,
0x1f86,0x1f0e,0x399,0x0,0x0,
0x1f87,0x1f0f,0x399,0x0,0x0,
0x1f88,0x1f08,0x399,0x0,0x0,
0x1f89,0x1f09,0x399,0x0,0x0,
0x1f8a,0x1f0a,0x399,0x0,0x0,
0x1f8b,0x1f0b,0x399,0x0,0x0,
0x1f8c,0x1f0c,0x399,0x0,0x0,
0x1f8d,0x1f0d,0x399,0x0,0x0,
0x1f8e,0x1f0e,0x399,0x0,0x0,
0x1f8f,0x1f0f,0x399,0x0,0x0,
0x1f90,0x1f28,0x399,0x0,0x0,
0x1f91,0x1f29,0x399,0x0,0x0,
0x1f92,0x1f2a,0x399,0x0,0x0,
0x1f93,0x1f2b,0x399,0x0,0x0,
0x1f94,0x1f2c,0x399,0x0,0x0,
0x1f95,0x1f2d,0x399,0x0,0x0,
0x1f96,0x1f2e,0x399,0x0,0x0,
0x1f97,0x1f2f,0x399,0x0,0x0,
0x1f98,0x1f28,0x399,0x0,0x0,
0x1f99,0x1f29,0x399,0x0,0x0,
0x1f9a,0x1f2a,0x399,0x0,0x0,
0x1f9b,0x1f2b,0x399,0x0,0x0,
0x1f9c,0x1f2c,0x399,0x0,0x0,
0x1f9d,0x1f2d,0x399,0x0,0x0,
0x1f9e,0x1f2e,0x399,0x0,0x0,
0x1f9f,0x1f2f,0x399,0x0,0x0,
0x1fa0,0x1f68,0x399,0x0,0x0,
0x1fa1,0x1f69,0x399,0x0,0x0,
0x1fa2,0x1f6a,0x399,0x0,0x0,
0x1fa3,0x1f6b,0x399,0x0,0x0,
0x1fa4,0x1f6c,0x399,0x0,0x0,
0x1fa5,0x1f6d,0x399,0x0,0x0,
0x1fa6,0x1f6e,0x399,0x0,0x0,
0x1fa7,0x1f6f,0x399,0x0,0x0,
0x1fa8,0x1f68,0x399,0x0,0x0,
0x1fa9,0x1f69,0x399,0x0,0x0,
0x1faa,0x1f6a,0x399,0x0,0x0,
0x1fab,0x1f6b,0x399,0x0,0x0,
0x1fac,0x1f6c,0x399,0x0,0x0,
0x1fad,0x1f6d,0x399,0x0,0x0,
0x1fae,0x1f6e,0x399,0x0,0x0,
0x1faf,0x1f6f,0x399,0x0,0x0,
0x1fb2,0x1fba,0x399,0x0,0x0,
0x1fb3,0x391,0x399,0x0,0x0,
0x1fb4,0x386,0x399,0x0,0x0,
0x1fb6,0x391,0x342,0x0,0x0,
0x1fb7,0x391,0x342,0x399,0x0,
0x1fbc,0x391,0x399,0x0,0x0,
0x1fc2,0x1fca,0x399,0x0,0x0,
0x1fc3,0x397,0x399,0x0,0x0,
0x1fc4,0x389,0x399,0x0,0x0,
0x1fc6,0x397,0x342,0x0,0x0,
0x1fc7,0x397,0x342,0x399,0x0,
0x1fcc,0x397,0x399,0x0,0x0,
0x1fd2,0x399,0x308,0x300,0x0,
0x1fd3,0x399,0x308,0x301,0x0,
0x1fd6,0x399,0x342,0x0,0x0,
0x1fd7,0x399,0x308,0x342,0x0,
0x1fe2,0x3a5,0x308,0x300,0x0,
0x1fe3,0x3a5,0x308,0x301,0x0,
0x1fe4,0x3a1,0x313,0x0,0x0,
0x1fe6,0x3a5,0x342,0x0,0x0,
0x1fe7,0x3a5,0x308,0x342,0x0,
0x1ff2,0x1ffa,0x399,0x0,0x0,
0x1ff3,0x3a9,0x399,0x0,0x0,
0x1ff4,0x38f,0x399,0x0,0x0,
0x1ff6,0x3a9,0x342,0x0,0x0,
0x1ff7,0x3a9,0x342,0x399,0x0,
0x1ffc,0x3a9,0x399,0x0,0x0,
0xfb00,0x46,0x46,0x0,0x0,
0xfb01,0x46,0x49,0x0,0x0,
0xfb02,0x46,0x4c,0x0,0x0,
0xfb03,0x46,0x46,0x49,0x0,
0xfb04,0x46,0x46,0x4c,0x0,
0xfb05,0x53,0x54,0x0,0x0,
0xfb06,0x53,0x54,0x0,0x0,
0xfb13,0x544,0x546,0x0,0x0,
0xfb14,0x544,0x535,0x0,0x0,
0xfb15,0x544,0x53b,0x0,0x0,
0xfb16,0x54e,0x546,0x0,0x0,
0xfb17,0x544,0x53d,0x0,0x0,
};