mirror of
https://github.com/fltk/fltk.git
synced 2026-05-28 20:06:18 +08:00
Fixed UTF-8 nbsp detection before string expansion: as explained, the fl_expand_text() main loop is now UTF-8 multibyte-character aware and should no longer confuse nbsp chars with bytes of a UTF-8 sequence, even when the string is mixed with other CP125x-encoded text.
git-svn-id: file:///fltk/svn/fltk/branches/branch-1.3@6500 ea41ed52-d2ee-0310-a9c1-e6b18d33e121
This commit is contained in:
+52
-49
@@ -58,51 +58,54 @@ static char* underline_at;
|
|||||||
#define C_IN(c,a,b) ((c)>=(a) && (c)<=(b))
|
#define C_IN(c,a,b) ((c)>=(a) && (c)<=(b))
|
||||||
#define C_UTF8(c) C_IN(c,0x80,0xBF)
|
#define C_UTF8(c) C_IN(c,0x80,0xBF)
|
||||||
|
|
||||||
/** fast utf8 string detection routine. \reval 0 if not utf8, 1 otherwise */
|
/**
|
||||||
int fl_is_valid_utf8(int& init_scan, int& scan_ret, const char* s) {
|
utf8 multibyte char sequence detection and pass-through routine.
|
||||||
if (init_scan) return scan_ret; // scan only once the string
|
\retval false if no utf8 sequence was detected (no change made); true if a utf8 sequence was detected and copied from s to d.
|
||||||
init_scan=1;
|
Note that for n bytes copied, d is incremented by n but s only by n-1, so that the caller's loop (see below), which does its own increment, stays compatible.
|
||||||
if ( !s || !(*s) ) return 0;
|
*/
|
||||||
|
static bool handle_utf8_seq(const char * &s,char * &d) {
|
||||||
register const unsigned char* p=(const unsigned char*)s;
|
register const unsigned char* p=(const unsigned char*)s;
|
||||||
while (*p) {
|
if (p[0] < 0xc2 || p[0] > 0xf4)
|
||||||
if ( p[0]==0x09 || p[0]==0x0d || p[0]==0x0a || (p[0]>0x1f && p[0]<0x80) ) {
|
return false; // not adressed in this function
|
||||||
p++;
|
else if ( C_IN(p[0], 0xc2, 0xdf) && C_UTF8(p[1]) ) {
|
||||||
continue; // Ascii
|
d[0]=s[0]; d[1]=s[1];
|
||||||
}
|
d+=2; s++;
|
||||||
if ( C_IN(p[0], 0xc2, 0xdf) && C_UTF8(p[1]) ) {
|
// non-overlong 2-byte
|
||||||
p+=2;
|
}
|
||||||
continue; // non-overlong 2-byte
|
else if ( p[0]==0xe0 && C_IN(p[1], 0xa0, 0xbf) && C_UTF8(p[2]) ) {
|
||||||
}
|
d[0]=s[0]; d[1]=s[1];d[2]=s[2];
|
||||||
if ( p[0]==0xe0 && C_IN(p[1], 0xa0, 0xbf) && C_UTF8(p[2]) ) {
|
d+=3; s+=2;
|
||||||
p+=3;
|
// excluding overlongs
|
||||||
continue; // excluding overlongs
|
}
|
||||||
}
|
else if (p[0]==0xed && C_IN(p[1], 0x80, 0x9f) && C_UTF8(p[2]) ) {
|
||||||
if (p[0]==0xed && C_IN(p[1], 0x80, 0x9f) && C_UTF8(p[2]) ) {
|
d[0]=s[0]; d[1]=s[1];d[2]=s[2];
|
||||||
p+=3;
|
d+=3; s+=2;
|
||||||
continue; // excluding surrogates
|
// excluding surrogates
|
||||||
}
|
}
|
||||||
if (p[0]!=0xed && C_IN(p[0], 0xe1, 0xef) && C_UTF8(p[1]) && C_UTF8(p[2]) ) {
|
else if (p[0]!=0xed && C_IN(p[0], 0xe1, 0xef) && C_UTF8(p[1]) && C_UTF8(p[2]) ) {
|
||||||
p+=3;
|
d[0]=s[0]; d[1]=s[1];d[2]=s[2];
|
||||||
continue; // straight 3-byte
|
d+=3; s+=2;
|
||||||
}
|
// straight 3-byte
|
||||||
if (p[0]==0xf0 && C_IN(p[1], 0x90, 0xbf) && C_UTF8(p[2]) && C_UTF8(p[3]) ) {
|
}
|
||||||
p+=4;
|
else if (p[0]==0xf0 && C_IN(p[1], 0x90, 0xbf) && C_UTF8(p[2]) && C_UTF8(p[3]) ) {
|
||||||
continue; // planes 1-3
|
d[0]=s[0]; d[1]=s[1]; d[2]=s[2]; d[3]=s[3];
|
||||||
}
|
d+=4; s+=3;
|
||||||
if (C_IN(p[0], 0xf1, 0xf3) && C_UTF8(p[1]) && C_UTF8(p[2]) && C_UTF8(p[3]) ) {
|
// planes 1-3
|
||||||
p+=4;
|
}
|
||||||
continue; // planes 4-15
|
else if (C_IN(p[0], 0xf1, 0xf3) && C_UTF8(p[1]) && C_UTF8(p[2]) && C_UTF8(p[3]) ) {
|
||||||
}
|
d[0]=s[0]; d[1]=s[1]; d[2]=s[2]; d[3]=s[3];
|
||||||
if (p[0]==0xf4 && C_IN(p[1], 0x80, 0x8f) && C_UTF8(p[2]) && C_UTF8(p[3]) ) {
|
d+=4; s+=3;
|
||||||
p+=4;
|
// planes 4-15
|
||||||
continue; // planes 16
|
}
|
||||||
}
|
else if (p[0]==0xf4 && C_IN(p[1], 0x80, 0x8f) && C_UTF8(p[2]) && C_UTF8(p[3]) ) {
|
||||||
scan_ret=0;
|
d[0]=s[0]; d[1]=s[1]; d[2]=s[2]; d[3]=s[3];
|
||||||
return scan_ret; // not utf8
|
d+=4; s+=3;
|
||||||
}
|
// planes 16
|
||||||
scan_ret=1;
|
} else { // non utf8 compliant, maybe CP125x or broken utf8 string
|
||||||
return scan_ret;
|
fprintf(stderr, "Not UTF8 char \n");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true; // we did handled and copied the utf8 multibyte char seq.
|
||||||
}
|
}
|
||||||
|
|
||||||
const char*
|
const char*
|
||||||
@@ -115,8 +118,6 @@ fl_expand_text(const char* from, char* buf, int maxbuf, double maxw, int& n,
|
|||||||
const char* word_start = from;
|
const char* word_start = from;
|
||||||
double w = 0;
|
double w = 0;
|
||||||
|
|
||||||
int init_scan=0, scan_ret;
|
|
||||||
|
|
||||||
const char* p = from;
|
const char* p = from;
|
||||||
for (;; p++) {
|
for (;; p++) {
|
||||||
|
|
||||||
@@ -150,12 +151,14 @@ fl_expand_text(const char* from, char* buf, int maxbuf, double maxw, int& n,
|
|||||||
} else if (c < ' ' || c == 127) { // ^X
|
} else if (c < ' ' || c == 127) { // ^X
|
||||||
*o++ = '^';
|
*o++ = '^';
|
||||||
*o++ = c ^ 0x40;
|
*o++ = c ^ 0x40;
|
||||||
|
} else if (handle_utf8_seq(p, o)) { // figure out if we have an utf8 valid sequence before we determine the nbsp test validity:
|
||||||
#ifdef __APPLE__
|
#ifdef __APPLE__
|
||||||
} else if (c == 0xCA && !fl_is_valid_utf8(init_scan, scan_ret,from) ) { // non-breaking space in MacRoman
|
} else if (c == 0xCA) { // non-breaking space in MacRoman
|
||||||
#else
|
#else
|
||||||
} else if (c == 0xA0 && !fl_is_valid_utf8(init_scan, scan_ret,from) ) { // non-breaking space in ISO 8859
|
} else if (c == 0xA0) { // non-breaking space in ISO 8859
|
||||||
#endif
|
#endif
|
||||||
*o++ = ' ';
|
*o++ = ' ';
|
||||||
|
|
||||||
} else if (c == '@' && draw_symbols) { // Symbol???
|
} else if (c == '@' && draw_symbols) { // Symbol???
|
||||||
if (p[1] && p[1] != '@') break;
|
if (p[1] && p[1] != '@') break;
|
||||||
*o++ = c;
|
*o++ = c;
|
||||||
|
|||||||
Reference in New Issue
Block a user