More attempts to clean up WIN32 handling of UTF16 surrogate pairs.

In particular, I have added a new function to src/fl_utf.c called fl_ucs_to_Utf16() which converts a single 32-bit Unicode value into one or two UTF16 cells. This is needed in the win32 char-by-char text width() logic, and I suspect it may also be useful in the OSX code in some places.

git-svn-id: file:///fltk/svn/fltk/branches/branch-1.3@8585 ea41ed52-d2ee-0310-a9c1-e6b18d33e121
2026-06-06 08:32:07 +08:00 · 2011-04-13 15:43:22 +00:00
parent 639ee4fbab
commit be6df55717
3 changed files with 97 additions and 32 deletions
@@ -28,8 +28,6 @@
 * with the functions provided in OksiD's fltk-1.1.6-utf8 port
 */

-/*** NOTE : all functions are LIMITED to 24 bits Unicode values !!! ***/
-
 /**
  \file fl_utf8.h
  \brief header for Unicode and UTF8 chracter handling
@@ -99,16 +97,16 @@ FL_EXPORT int fl_utf8bytes(unsigned ucs);

 /* OD: returns the byte length of the first UTF-8 char sequence (returns -1 if not valid) */
 FL_EXPORT int fl_utf8len(char c);
-  
+
 /* OD: returns the byte length of the first UTF-8 char sequence (returns +1 if not valid) */
 FL_EXPORT int fl_utf8len1(char c);
-  
+
 /* OD: returns the number of Unicode chars in the UTF-8 string */
 FL_EXPORT int fl_utf_nb_char(const unsigned char *buf, int len);

 /* F2: Convert the next UTF8 char-sequence into a Unicode value (and say how many bytes were used) */
 FL_EXPORT unsigned fl_utf8decode(const char* p, const char* end, int* len);
-  
+
 /* F2: Encode a Unicode value into a UTF8 sequence, return the number of bytes used */
 FL_EXPORT int fl_utf8encode(unsigned ucs, char* buf);

@@ -118,6 +116,9 @@ FL_EXPORT const char* fl_utf8fwd(const char* p, const char* start, const char* e
 /* F2: Move backward to the previous valid UTF8 sequence start */
 FL_EXPORT const char* fl_utf8back(const char* p, const char* start, const char* end);

+/* XX: Convert a single 32-bit Unicode value into UTF16 */
+FL_EXPORT unsigned fl_ucs_to_Utf16(const unsigned ucs, unsigned short *dst, const unsigned dstlen);
+
 /* F2: Convert a UTF8 string into UTF16 */
 FL_EXPORT unsigned fl_utf8toUtf16(const char* src, unsigned srclen, unsigned short* dst, unsigned dstlen);

@@ -185,26 +185,23 @@ double Fl_GDI_Graphics_Driver::width(unsigned int c) {
  Fl_Font_Descriptor *fl_fontsize = font_descriptor();
  unsigned int r;
  SIZE s;
-  // Special Case Handling of Unicode points over U+FFFF
+  // Special Case Handling of Unicode points over U+FFFF.
  // The logic (below) computes a lookup table for char widths
  // on-the-fly, but the table only covers codepoints up to
  // U+FFFF, which covers the basic multilingual plane, but
  // not any higher plane, or glyphs that require surrogate-pairs
-  // to encode them in WinXX which is UTF16.
+  // to encode them in WinXX, which is UTF16.
  // This code assumes that these glyphs are rarely used and simply
-  // measures them explicitly if they occur - Which may be slow...
+  // measures them explicitly if they occur - This will be slow...
  if(c > 0x0000FFFF) { // UTF16 surrogate pair is needed
    if (!fl_gc) { // We have no valid gc, so nothing to measure - bail out
      return 0.0;
    }
    int cc; // cell count
-    char utf8[8];          // Array for UTF-8 representation of c
-    unsigned short ucs[4]; // Array for UTF16 representation of c
-    // This fl_utf8encode / fl_utf8toUtf16 dance creates a UTF16 string
-    // from a UCS code point.
-    cc = fl_utf8encode(c, utf8);
-    cc = fl_utf8toUtf16(utf8, cc, ucs, 4);
-    GetTextExtentPoint32W(fl_gc, (WCHAR*)ucs, cc, &s);
+    unsigned short u16[4]; // Array for UTF16 representation of c
+    // Creates a UTF16 string from a UCS code point.
+    cc = fl_ucs_to_Utf16(c, u16, 4);
+    GetTextExtentPoint32W(fl_gc, (WCHAR*)u16, cc, &s);
    return (double)s.cx;
  }
  // else - this falls through to the lookup-table for glyph widths
@@ -37,11 +37,11 @@


 #if 0
-  /** 
+  /**
   \defgroup fl_unichar Unicode Character Functions
   Global Functions Handling Single Unicode Characters
   @{ */
-  
+
  /**
   Converts a Unicode character into a utf-8 sequence.
   \param[in] uc Unicode character
@@ -50,24 +50,24 @@
   \return length of the sequence in bytes
   */
  /* FL_EXPORT int fl_unichar_to_utf8(unsigned int uc, char *text); */
-  
-  /** @} */  
-  
-  /** 
+
+  /** @} */
+
+  /**
   \defgroup fl_utf8 Unicode String Functions
   Global Functions Handling Unicode Text
   @{ */
-  
+
  /**
   Calculate the size of a utf-8 sequence for a Unicode character.
   \param[in] uc Unicode character
   \return length of the sequence in bytes
   */
  /* FL_EXPORT int fl_utf8_size(unsigned int uc); */
-  
-  /** @} */  
+
+  /** @} */
 #endif /* 0 */
-  
+
 /*!Set to 1 to turn bad UTF8 bytes into ISO-8859-1. If this is to zero
   they are instead turned into the Unicode REPLACEMENT CHARACTER, of
   value 0xfffd.
@@ -337,6 +337,73 @@ int fl_utf8encode(unsigned ucs, char* buf) {
  }
 }

+/*! Convert a single 32-bit Unicode codepoint into an array of 16-bit
+    characters. These are used by some system calls, especially on Windows.
+
+    \p ucs is the value to convert.
+
+    \p dst points at an array to write, and \p dstlen is the number of
+    locations in this array. At most \p dstlen words will be
+    written, and a 0 terminating word will be added if \p dstlen is
+    large enough. Thus this function will never overrun the buffer
+    and will attempt to return a zero-terminated string if space permits.
+    If \p dstlen is zero then \p dst can be set to NULL and no data
+    is written, but the length is returned.
+
+    The return value is the number of 16-bit words that \e would be written
+    to \p dst if it were large enough, not counting any terminating
+    zero.
+
+    If the return value is greater than \p dstlen it indicates truncation;
+    you should then allocate a new array of size return+1 and call this again.
+
+    Unicode characters in the range 0x10000 to 0x10ffff are converted to
+    "surrogate pairs" which take two words each (in UTF-16 encoding).
+    Values above 0x10ffff, or in the surrogate range 0xd800 to 0xdfff, are
+    replaced by 0xfffd. Setting \p dstlen to 2 ensures that any valid value
+    can be converted; 3 or more allows a zero-terminated result.
+*/
+unsigned fl_ucs_to_Utf16(const unsigned ucs, unsigned short *dst, const unsigned dstlen)
+{
+  /* The rule for direct conversion from UCS to UTF16 is:
+   * - if UCS >  0x0010FFFF then UCS is invalid (U+FFFD is emitted)
+   * - if UCS >= 0xD800 && UCS <= 0xDFFF UCS is invalid (U+FFFD is emitted)
+   * - if UCS <= 0x0000FFFF then U16 = UCS, len = 1
+   * - else
+   * -- U16[0] = (((UCS - 0x00010000) >> 10) & 0x3FF) + 0xD800
+   * -- U16[1] = (UCS & 0x3FF) + 0xDC00
+   * -- len = 2;
+   */
+  unsigned count;        /* Number of UTF16 cells the conversion needs */
+  unsigned short u16[4]; /* Scratch buffer used when dst is NULL or dstlen is 0 */
+  unsigned short *out;   /* Points to whichever buffer is being written */
+  /* Write to the caller's buffer only if it exists and has room for a cell */
+  if((!dstlen) || (!dst)) {
+    out = u16;
+  } else {
+    out = dst;
+  }
+  /* Convert from UCS to UTF16 */
+  if((ucs > 0x0010FFFF) || /* UCS is beyond the last Unicode codepoint */
+  ((ucs > 0xD7FF) && (ucs < 0xE000))) { /* UCS is a lone surrogate value */
+    out[0] = 0xFFFD; /* REPLACEMENT CHARACTER */
+    count = 1;
+  } else if(ucs < 0x00010000) {
+    out[0] = (unsigned short)ucs;
+    count = 1;
+  } else if(dstlen < 2) { /* no room for a pair: still report 2 cells needed */
+    out[0] = 0xFFFD; /* REPLACEMENT CHARACTER */
+    count = 2;
+  } else {
+    out[0] = (((ucs - 0x00010000) >> 10) & 0x3FF) + 0xD800; /* high surrogate */
+    out[1] = (ucs & 0x3FF) + 0xDC00; /* low surrogate */
+    count = 2;
+  }
+  /* Zero-terminate the output, if there is space */
+  if(count < dstlen) { out[count] = 0; }
+  return count;
+} /* fl_ucs_to_Utf16 */
+
 /*! Convert a UTF-8 sequence into an array of 16-bit characters. These
    are used by some system calls, especially on Windows.

@@ -363,7 +430,7 @@ int fl_utf8encode(unsigned ucs, char* buf) {

    Unicode characters in the range 0x10000 to 0x10ffff are converted to
    "surrogate pairs" which take two words each (this is called UTF-16
-    encoding). 
+    encoding).
 */
 unsigned fl_utf8toUtf16(const char* src, unsigned srclen,
 		  unsigned short* dst, unsigned dstlen)
@@ -407,21 +474,21 @@ unsigned fl_utf8toUtf16(const char* src, unsigned srclen,
  Converts a UTF-8 string into a wide character string.

  This function generates 32-bit wchar_t (e.g. "ucs4" as it were) except
-  on Windows where it is equivalent to fl_utf8toUtf16 and returns 
+  on Windows where it is equivalent to fl_utf8toUtf16 and returns
  UTF-16.
- 
+
  \p src points at the UTF-8, and \p srclen is the number of bytes to
  convert.
- 
+
  \p dst points at an array to write, and \p dstlen is the number of
  locations in this array. At most \p dstlen-1 wchar_t will be
  written there, plus a 0 terminating wchar_t.
- 
+
  The return value is the number of wchar_t that \e would be written
  to \p dst if it were long enough, not counting the terminating
  zero. If the return value is greater or equal to \p dstlen it
  indicates truncation, you can then allocate a new array of size
-  return+1 and call this again. 
+  return+1 and call this again.

  Notice that sizeof(wchar_t) is 2 on Windows and is 4 on Linux
  and most other systems. Where wchar_t is 16 bits, Unicode
@@ -429,7 +496,7 @@ unsigned fl_utf8toUtf16(const char* src, unsigned srclen,
  "surrogate pairs" which take two words each (this is called UTF-16
  encoding). If wchar_t is 32 bits this rather nasty problem is
  avoided.
- 
+
  Note that Windows includes Cygwin, i.e. compiled with Cygwin's POSIX
  layer (cygwin1.dll, --enable-cygwin), either native (GDI) or X11.
  */