Generate new tables for isalpha/toupper/tolower from UnicodeData.txt

This commit is contained in:
Tor Andersson
2021-07-20 15:01:53 +02:00
parent 1b8aae1d3c
commit 2a1804ea26
6 changed files with 2158 additions and 1150 deletions

77
genucd.py Normal file
View File

@@ -0,0 +1,77 @@
# Create utfdata.h from UnicodeData.txt
#
# Each line of UnicodeData.txt is a ';'-separated record:
#   field 0  = code point (hex)
#   field 1  = character name
#   field 2  = general category (letters start with 'L')
#   field 12 = simple uppercase mapping (hex, may be empty)
#   field 13 = simple lowercase mapping (hex, may be empty)
#
# Large blocks (CJK ideographs, Hangul syllables, ...) are not listed one
# code point at a time.  They appear as a pair of records whose names end
# in ", First>" and ", Last>", and every code point in between shares the
# properties of the two end points.  Those interiors must be expanded here,
# otherwise only the first and last code point of each block would be
# reported as alphabetic.
tolower = []
toupper = []
isalpha = []
range_first = None
with open("UnicodeData.txt") as ucd:
    for line in ucd:
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing newline)
        line = line.split(";")
        code = int(line[0],16)
        # if code > 65535: continue # skip non-BMP codepoints
        if line[1].endswith(", First>"):
            # Start of a compressed block; the record itself is handled
            # like any other single code point.
            range_first = code
            codes = [code]
        elif line[1].endswith(", Last>") and range_first is not None:
            # End of a compressed block: cover everything after the
            # already-recorded first code point, up to and including this one.
            codes = range(range_first + 1, code + 1)
            range_first = None
        else:
            codes = [code]
        if line[2][0] == 'L':
            isalpha.extend(codes)
        if line[12]:
            toupper.append((code,int(line[12],16)))
        if line[13]:
            tolower.append((code,int(line[13],16)))
def dumpalpha():
    """Print the alphabetic code point tables as C source.

    Consecutive code points from the global ``isalpha`` list are merged
    into inclusive (first, last) runs.  Runs longer than one code point
    are printed as pairs in ``ucd_alpha2``; single code points are
    printed in ``ucd_alpha1``.
    """
    runs = []
    first = 0
    last = 0
    for cp in isalpha:
        if cp != last + 1:
            # A gap ends the current run; 0 means no run has been opened yet.
            if first:
                runs.append((first, last))
            first = cp
        last = cp
    # Flush the run that was still open when the input ended.
    runs.append((first, last))

    print("")
    print("static const Rune ucd_alpha2[] = {")
    for lo, hi in runs:
        if hi > lo:
            print("{},{},".format(hex(lo), hex(hi)))
    print("};")

    print("")
    print("static const Rune ucd_alpha1[] = {")
    for lo, hi in runs:
        if hi == lo:
            print("{},".format(hex(lo)))
    print("};")
def dumpmap(name, input):
    """Print a case-mapping table pair as C source.

    ``input`` is a sequence of (source, target) code point pairs.  Pairs
    whose source AND target both advance by exactly one are merged into a
    run (first source, last source, first target).

    Runs covering several code points are printed in ``<name>2`` as
    ``first,last,delta``; runs of a single code point are printed in
    ``<name>1`` as ``code,delta``.  ``delta`` is target minus source, in
    decimal.
    """
    runs = []
    run_src = 0
    run_dst = 0
    last_src = 0
    last_dst = 0
    for src, dst in input:
        contiguous = (src == last_src + 1) and (dst == last_dst + 1)
        if not contiguous:
            # 0 means no run has been opened yet, so there is nothing to flush.
            if run_src:
                runs.append((run_src, last_src, run_dst))
            run_src, run_dst = src, dst
        last_src, last_dst = src, dst
    # Flush the run that was still open when the input ended.
    runs.append((run_src, last_src, run_dst))

    print("")
    print("static const Rune " + name + "2[] = {")
    for first, last, target in runs:
        if last > first:
            print(",".join([hex(first), hex(last), str(target - first)]) + ",")
    print("};")

    print("")
    print("static const Rune " + name + "1[] = {")
    for first, last, target in runs:
        if last == first:
            print(",".join([hex(first), str(target - first)]) + ",")
    print("};")
# Emit the generated header on stdout: banner first, then the alphabetic
# tables, then the two case-mapping tables.
print("/* This file was automatically created from UnicodeData.txt */")
dumpalpha()
for table_name, mapping in (("ucd_tolower", tolower), ("ucd_toupper", toupper)):
    dumpmap(table_name, mapping)

1
one.c
View File

@@ -24,4 +24,3 @@
#include "jsvalue.c"
#include "regexp.c"
#include "utf.c"
#include "utftype.c"

93
utf.c
View File

@@ -15,6 +15,9 @@
#include <string.h>
#include "utf.h"
#include "utfdata.h"
/*
 * Number of elements in the array a, as an int.  Only valid for real
 * arrays, not pointers.  The whole expansion is parenthesized so the
 * macro behaves as a single operand next to any operator (e.g. sizeof).
 */
#define nelem(a) ((int)(sizeof (a) / sizeof (a)[0]))
typedef unsigned char uchar;
@@ -210,3 +213,93 @@ utflen(const char *s)
n++;
}
}
/*
 * Binary search in a table of n records, each ne Runes wide, ordered by
 * the first Rune of every record.
 *
 * Returns a pointer to the last record whose first Rune is <= c, or 0
 * when there is no such record (including when n is 0).  The caller must
 * still check that c really falls inside the returned record.
 */
static const Rune *
ucd_bsearch(Rune c, const Rune *t, int n, int ne)
{
	const Rune *mid;
	int half;

	while(n > 1) {
		half = n / 2;
		mid = t + half * ne;
		if(c < mid[0]) {
			/* c sorts before the middle record: keep the lower half */
			n = half;
		} else {
			/* middle record is a candidate: keep it and the upper half */
			t = mid;
			n -= half;
		}
	}

	if(n == 0 || c < t[0])
		return 0;
	return t;
}
/*
 * Return the lowercase mapping of c, or c itself when the tables hold no
 * mapping for it.
 *
 * ucd_tolower2 holds (first, last, delta) triples for runs of code points;
 * ucd_tolower1 holds (code, delta) pairs for isolated code points.
 */
Rune
tolowerrune(Rune c)
{
	const Rune *hit;

	hit = ucd_bsearch(c, ucd_tolower2, nelem(ucd_tolower2)/3, 3);
	if(hit != 0 && hit[0] <= c && c <= hit[1])
		return c + hit[2];

	hit = ucd_bsearch(c, ucd_tolower1, nelem(ucd_tolower1)/2, 2);
	if(hit != 0 && hit[0] == c)
		return c + hit[1];

	return c;
}
/*
 * Return the uppercase mapping of c, or c itself when the tables hold no
 * mapping for it.
 *
 * ucd_toupper2 holds (first, last, delta) triples for runs of code points;
 * ucd_toupper1 holds (code, delta) pairs for isolated code points.
 */
Rune
toupperrune(Rune c)
{
	const Rune *hit;

	hit = ucd_bsearch(c, ucd_toupper2, nelem(ucd_toupper2)/3, 3);
	if(hit != 0 && hit[0] <= c && c <= hit[1])
		return c + hit[2];

	hit = ucd_bsearch(c, ucd_toupper1, nelem(ucd_toupper1)/2, 2);
	if(hit != 0 && hit[0] == c)
		return c + hit[1];

	return c;
}
/*
 * Return 1 when c has an entry in the to-uppercase tables (i.e. it can be
 * mapped to an uppercase form), 0 otherwise.
 */
int
islowerrune(Rune c)
{
	const Rune *hit;

	/* runs of code points: (first, last, delta) */
	hit = ucd_bsearch(c, ucd_toupper2, nelem(ucd_toupper2)/3, 3);
	if(hit != 0 && hit[0] <= c && c <= hit[1])
		return 1;

	/* isolated code points: (code, delta) */
	hit = ucd_bsearch(c, ucd_toupper1, nelem(ucd_toupper1)/2, 2);
	if(hit != 0 && hit[0] == c)
		return 1;

	return 0;
}
/*
 * Return 1 when c has an entry in the to-lowercase tables (i.e. it can be
 * mapped to a lowercase form), 0 otherwise.
 */
int
isupperrune(Rune c)
{
	const Rune *hit;

	/* runs of code points: (first, last, delta) */
	hit = ucd_bsearch(c, ucd_tolower2, nelem(ucd_tolower2)/3, 3);
	if(hit != 0 && hit[0] <= c && c <= hit[1])
		return 1;

	/* isolated code points: (code, delta) */
	hit = ucd_bsearch(c, ucd_tolower1, nelem(ucd_tolower1)/2, 2);
	if(hit != 0 && hit[0] == c)
		return 1;

	return 0;
}
/*
 * Return 1 when c is listed in the alphabetic tables, 0 otherwise.
 *
 * ucd_alpha2 holds inclusive (first, last) pairs; ucd_alpha1 holds
 * isolated code points, one Rune per record.
 */
int
isalpharune(Rune c)
{
	const Rune *hit;

	hit = ucd_bsearch(c, ucd_alpha2, nelem(ucd_alpha2)/2, 2);
	if(hit != 0 && hit[0] <= c && c <= hit[1])
		return 1;

	hit = ucd_bsearch(c, ucd_alpha1, nelem(ucd_alpha1), 1);
	if(hit != 0 && hit[0] == c)
		return 1;

	return 0;
}

6
utf.h
View File

@@ -23,11 +23,8 @@ typedef int Rune; /* 32 bits */
#define isalpharune jsU_isalpharune
#define islowerrune jsU_islowerrune
#define isspacerune jsU_isspacerune
#define istitlerune jsU_istitlerune
#define isupperrune jsU_isupperrune
#define tolowerrune jsU_tolowerrune
#define totitlerune jsU_totitlerune
#define toupperrune jsU_toupperrune
enum
@@ -46,11 +43,8 @@ int utflen(const char *s);
int isalpharune(Rune c);
int islowerrune(Rune c);
int isspacerune(Rune c);
int istitlerune(Rune c);
int isupperrune(Rune c);
Rune tolowerrune(Rune c);
Rune totitlerune(Rune c);
Rune toupperrune(Rune c);
#endif

1988
utfdata.h Normal file

File diff suppressed because it is too large Load Diff

1143
utftype.c

File diff suppressed because it is too large Load Diff