mirror of
https://github.com/fltk/fltk.git
synced 2026-06-06 08:32:07 +08:00
More attempts to clean up WIN32 handling of UTF16 surrogate pairs.
In particular, I have added a new function to src/fl_utf.c called fl_ucs_to_Utf16() which converts a single 32-bit Unicode value into one (or more) UTF16 cells. This is needed in the win32 char-by-char text width() logic, and I suspect may also be useful in the OSX code in some places. git-svn-id: file:///fltk/svn/fltk/branches/branch-1.3@8585 ea41ed52-d2ee-0310-a9c1-e6b18d33e121
This commit is contained in:
+6
-5
@@ -28,8 +28,6 @@
|
||||
* with the functions provided in OksiD's fltk-1.1.6-utf8 port
|
||||
*/
|
||||
|
||||
/*** NOTE : all functions are LIMITED to 24 bits Unicode values !!! ***/
|
||||
|
||||
/**
|
||||
\file fl_utf8.h
|
||||
\brief header for Unicode and UTF8 character handling
|
||||
@@ -99,16 +97,16 @@ FL_EXPORT int fl_utf8bytes(unsigned ucs);
|
||||
|
||||
/* OD: returns the byte length of the first UTF-8 char sequence (returns -1 if not valid) */
|
||||
FL_EXPORT int fl_utf8len(char c);
|
||||
|
||||
|
||||
/* OD: returns the byte length of the first UTF-8 char sequence (returns +1 if not valid) */
|
||||
FL_EXPORT int fl_utf8len1(char c);
|
||||
|
||||
|
||||
/* OD: returns the number of Unicode chars in the UTF-8 string */
|
||||
FL_EXPORT int fl_utf_nb_char(const unsigned char *buf, int len);
|
||||
|
||||
/* F2: Convert the next UTF8 char-sequence into a Unicode value (and say how many bytes were used) */
|
||||
FL_EXPORT unsigned fl_utf8decode(const char* p, const char* end, int* len);
|
||||
|
||||
|
||||
/* F2: Encode a Unicode value into a UTF8 sequence, return the number of bytes used */
|
||||
FL_EXPORT int fl_utf8encode(unsigned ucs, char* buf);
|
||||
|
||||
@@ -118,6 +116,9 @@ FL_EXPORT const char* fl_utf8fwd(const char* p, const char* start, const char* e
|
||||
/* F2: Move backward to the previous valid UTF8 sequence start */
|
||||
FL_EXPORT const char* fl_utf8back(const char* p, const char* start, const char* end);
|
||||
|
||||
/* XX: Convert a single 32-bit Unicode value into UTF16 */
|
||||
FL_EXPORT unsigned fl_ucs_to_Utf16(const unsigned ucs, unsigned short *dst, const unsigned dstlen);
|
||||
|
||||
/* F2: Convert a UTF8 string into UTF16 */
|
||||
FL_EXPORT unsigned fl_utf8toUtf16(const char* src, unsigned srclen, unsigned short* dst, unsigned dstlen);
|
||||
|
||||
|
||||
+7
-10
@@ -185,26 +185,23 @@ double Fl_GDI_Graphics_Driver::width(unsigned int c) {
|
||||
Fl_Font_Descriptor *fl_fontsize = font_descriptor();
|
||||
unsigned int r;
|
||||
SIZE s;
|
||||
// Special Case Handling of Unicode points over U+FFFF
|
||||
// Special Case Handling of Unicode points over U+FFFF.
|
||||
// The logic (below) computes a lookup table for char widths
|
||||
// on-the-fly, but the table only covers codepoints up to
|
||||
// U+FFFF, which covers the basic multilingual plane, but
|
||||
// not any higher plane, or glyphs that require surrogate-pairs
|
||||
// to encode them in WinXX which is UTF16.
|
||||
// to encode them in WinXX, which is UTF16.
|
||||
// This code assumes that these glyphs are rarely used and simply
|
||||
// measures them explicitly if they occur - Which may be slow...
|
||||
// measures them explicitly if they occur - This will be slow...
|
||||
if(c > 0x0000FFFF) { // UTF16 surrogate pair is needed
|
||||
if (!fl_gc) { // We have no valid gc, so nothing to measure - bail out
|
||||
return 0.0;
|
||||
}
|
||||
int cc; // cell count
|
||||
char utf8[8]; // Array for UTF-8 representation of c
|
||||
unsigned short ucs[4]; // Array for UTF16 representation of c
|
||||
// This fl_utf8encode / fl_utf8toUtf16 dance creates a UTF16 string
|
||||
// from a UCS code point.
|
||||
cc = fl_utf8encode(c, utf8);
|
||||
cc = fl_utf8toUtf16(utf8, cc, ucs, 4);
|
||||
GetTextExtentPoint32W(fl_gc, (WCHAR*)ucs, cc, &s);
|
||||
unsigned short u16[4]; // Array for UTF16 representation of c
|
||||
// Creates a UTF16 string from a UCS code point.
|
||||
cc = fl_ucs_to_Utf16(c, u16, 4);
|
||||
GetTextExtentPoint32W(fl_gc, (WCHAR*)u16, cc, &s);
|
||||
return (double)s.cx;
|
||||
}
|
||||
// else - this falls through to the lookup-table for glyph widths
|
||||
|
||||
+84
-17
@@ -37,11 +37,11 @@
|
||||
|
||||
|
||||
#if 0
|
||||
/**
|
||||
/**
|
||||
\defgroup fl_unichar Unicode Character Functions
|
||||
Global Functions Handling Single Unicode Characters
|
||||
@{ */
|
||||
|
||||
|
||||
/**
|
||||
Converts a Unicode character into a utf-8 sequence.
|
||||
\param[in] uc Unicode character
|
||||
@@ -50,24 +50,24 @@
|
||||
\return length of the sequence in bytes
|
||||
*/
|
||||
/* FL_EXPORT int fl_unichar_to_utf8(unsigned int uc, char *text); */
|
||||
|
||||
/** @} */
|
||||
|
||||
/**
|
||||
|
||||
/** @} */
|
||||
|
||||
/**
|
||||
\defgroup fl_utf8 Unicode String Functions
|
||||
Global Functions Handling Unicode Text
|
||||
@{ */
|
||||
|
||||
|
||||
/**
|
||||
Calculate the size of a utf-8 sequence for a Unicode character.
|
||||
\param[in] uc Unicode character
|
||||
\return length of the sequence in bytes
|
||||
*/
|
||||
/* FL_EXPORT int fl_utf8_size(unsigned int uc); */
|
||||
|
||||
/** @} */
|
||||
|
||||
/** @} */
|
||||
#endif /* 0 */
|
||||
|
||||
|
||||
/*!Set to 1 to turn bad UTF8 bytes into ISO-8859-1. If this is zero
|
||||
they are instead turned into the Unicode REPLACEMENT CHARACTER, of
|
||||
value 0xfffd.
|
||||
@@ -337,6 +337,73 @@ int fl_utf8encode(unsigned ucs, char* buf) {
|
||||
}
|
||||
}
|
||||
|
||||
/*! Encode one 32-bit Unicode code point as UTF-16, i.e. as one or two
    16-bit cells. Several system APIs (notably on Windows) expect text in
    this form.

    \p ucs is the code point to encode.

    \p dst is the output array and \p dstlen is its capacity in 16-bit
    cells. No more than \p dstlen cells are ever stored, so the buffer is
    never overrun. When there is room left after the encoded cells, a
    single terminating 0 cell is appended. If \p dst is NULL or \p dstlen
    is zero, nothing is stored in \p dst and only the length is computed.

    \return the number of 16-bit cells the encoding needs, excluding any
    terminating zero. A result larger than \p dstlen means the output was
    truncated; retry with an array of at least return+1 cells.

    Values above 0x10FFFF and values inside the surrogate range
    0xD800..0xDFFF are not valid code points; they are encoded as the
    single cell 0xFFFD (REPLACEMENT CHARACTER).

    Code points 0x10000..0x10FFFF become a "surrogate pair" of two cells.
    If such a code point is requested with \p dstlen smaller than 2, a
    single 0xFFFD cell is stored in place of the pair and 2 is still
    returned.

    A \p dstlen of 2 is enough for any valid code point; 3 or more also
    leaves room for the terminating zero.
*/
unsigned fl_ucs_to_Utf16(const unsigned ucs, unsigned short *dst, const unsigned dstlen)
{
  /* Direct UCS -> UTF-16 mapping:
   *   ucs >  0x10FFFF            : invalid
   *   0xD800 <= ucs <= 0xDFFF    : invalid (reserved for surrogates)
   *   ucs <= 0xFFFF              : one cell, the value itself
   *   otherwise                  : two cells
   *       high = (((ucs - 0x10000) >> 10) & 0x3FF) + 0xD800
   *       low  = (ucs & 0x3FF) + 0xDC00
   */
  unsigned short scratch[4];  /* stand-in target when the caller gave no buffer */
  unsigned short *cells = (dst && dstlen) ? dst : scratch;
  unsigned ncells;            /* cells required for this code point */

  if (ucs >= 0x00010000 && ucs <= 0x0010FFFF) {
    /* Supplementary plane: needs a surrogate pair */
    ncells = 2;
    if (dstlen >= 2) {
      const unsigned offset = ucs - 0x00010000;
      cells[0] = (unsigned short)(0xD800 + ((offset >> 10) & 0x3FF));
      cells[1] = (unsigned short)(0xDC00 + (ucs & 0x3FF));
    } else {
      /* No room for both halves: emit a placeholder, still report 2 */
      cells[0] = 0xFFFD;
    }
  } else {
    /* Either a BMP value or an invalid one - both occupy a single cell */
    ncells = 1;
    if (ucs > 0x0010FFFF || (ucs >= 0xD800 && ucs <= 0xDFFF)) {
      cells[0] = 0xFFFD; /* REPLACEMENT CHARACTER */
    } else {
      cells[0] = (unsigned short)ucs;
    }
  }

  /* Append the terminating zero only when a spare cell remains */
  if (dstlen > ncells) {
    cells[ncells] = 0;
  }
  return ncells;
} /* fl_ucs_to_Utf16 */
|
||||
|
||||
/*! Convert a UTF-8 sequence into an array of 16-bit characters. These
|
||||
are used by some system calls, especially on Windows.
|
||||
|
||||
@@ -363,7 +430,7 @@ int fl_utf8encode(unsigned ucs, char* buf) {
|
||||
|
||||
Unicode characters in the range 0x10000 to 0x10ffff are converted to
|
||||
"surrogate pairs" which take two words each (this is called UTF-16
|
||||
encoding).
|
||||
encoding).
|
||||
*/
|
||||
unsigned fl_utf8toUtf16(const char* src, unsigned srclen,
|
||||
unsigned short* dst, unsigned dstlen)
|
||||
@@ -407,21 +474,21 @@ unsigned fl_utf8toUtf16(const char* src, unsigned srclen,
|
||||
Converts a UTF-8 string into a wide character string.
|
||||
|
||||
This function generates 32-bit wchar_t (e.g. "ucs4" as it were) except
|
||||
on Windows where it is equivalent to fl_utf8toUtf16 and returns
|
||||
on Windows where it is equivalent to fl_utf8toUtf16 and returns
|
||||
UTF-16.
|
||||
|
||||
|
||||
\p src points at the UTF-8, and \p srclen is the number of bytes to
|
||||
convert.
|
||||
|
||||
|
||||
\p dst points at an array to write, and \p dstlen is the number of
|
||||
locations in this array. At most \p dstlen-1 wchar_t will be
|
||||
written there, plus a 0 terminating wchar_t.
|
||||
|
||||
|
||||
The return value is the number of wchar_t that \e would be written
|
||||
to \p dst if it were long enough, not counting the terminating
|
||||
zero. If the return value is greater or equal to \p dstlen it
|
||||
indicates truncation, you can then allocate a new array of size
|
||||
return+1 and call this again.
|
||||
return+1 and call this again.
|
||||
|
||||
Notice that sizeof(wchar_t) is 2 on Windows and is 4 on Linux
|
||||
and most other systems. Where wchar_t is 16 bits, Unicode
|
||||
@@ -429,7 +496,7 @@ unsigned fl_utf8toUtf16(const char* src, unsigned srclen,
|
||||
"surrogate pairs" which take two words each (this is called UTF-16
|
||||
encoding). If wchar_t is 32 bits this rather nasty problem is
|
||||
avoided.
|
||||
|
||||
|
||||
Note that Windows includes Cygwin, i.e. compiled with Cygwin's POSIX
|
||||
layer (cygwin1.dll, --enable-cygwin), either native (GDI) or X11.
|
||||
*/
|
||||
|
||||
Reference in New Issue
Block a user