diff --git a/FL/Fl_Text_Buffer.H b/FL/Fl_Text_Buffer.H index d74ddfc79..a9365d5c0 100644 --- a/FL/Fl_Text_Buffer.H +++ b/FL/Fl_Text_Buffer.H @@ -742,19 +742,11 @@ public: */ const Fl_Text_Selection* highlight_selection() const { return &mHighlight; } - /** - Returns the index of the previous character. - \param ix index to the current character - */ - int prev_char(int ix) const; - int prev_char_clipped(int ix) const; + int prev_char(int pos) const; + int prev_char_clipped(int pos) const; - /** - Returns the index of the next character. - \param ix index to the current character - */ - int next_char(int ix) const; - int next_char_clipped(int ix) const; + int next_char(int pos) const; + int next_char_clipped(int pos) const; /** Align an index into the buffer to the current or previous UTF-8 boundary. diff --git a/src/Fl_Text_Buffer.cxx b/src/Fl_Text_Buffer.cxx index 907a5cce6..848cacd9a 100644 --- a/src/Fl_Text_Buffer.cxx +++ b/src/Fl_Text_Buffer.cxx @@ -2080,9 +2080,8 @@ int Fl_Text_Buffer::outputfile(const char *file, } -/* - Return the previous character position. - Unicode safe. +/** + As prev_char() but returns 0 if the beginning of the buffer is reached. */ int Fl_Text_Buffer::prev_char_clipped(int pos) const { @@ -2090,17 +2089,27 @@ int Fl_Text_Buffer::prev_char_clipped(int pos) const return 0; IS_UTF8_ALIGNED2(this, (pos)) - const char *previous = fl_utf8_previous_composed_char(address(0) + pos, address(0)); - pos = previous - address(0); - + const int l_t = 40; + char t[l_t + 1]; t[l_t] = 0; + int l = l_t, p = pos, ll; + for (int i = l_t; i > 0 && p > 0; i--) { + t[--l] = byte_at(--p); + ll = fl_utf8len(t[l]); + if (ll == 1 || ll == 2) break; + } + const char *previous = fl_utf8_previous_composed_char(t + l_t, t + l); + ll = strlen(t + l); + pos = (pos - ll) + (previous - (t+l)); IS_UTF8_ALIGNED2(this, (pos)) return pos; } -/* - Return the previous character position. +/** + Returns the index of the previous character. + This function processes a composed character (e.g., a flag emoji) as a single character. Returns -1 if the beginning of the buffer is reached. + \param pos index to the current character */ int Fl_Text_Buffer::prev_char(int pos) const { @@ -2109,15 +2118,31 @@ int Fl_Text_Buffer::prev_char(int pos) const } -/* - Return the next character position. +/** + Returns the index of the next character. + This function processes a composed character (e.g., a flag emoji) as a single character. Returns length() if the end of the buffer is reached. + \param pos index to the current character */ int Fl_Text_Buffer::next_char(int pos) const { IS_UTF8_ALIGNED2(this, (pos)) - const char *next = fl_utf8_next_composed_char(address(0) + pos, address(0) + mLength); - pos = next - address(0); + int l = fl_utf8len1(byte_at(pos)); + if (l > 2) { // test for composed character only if pos is at long codepoint + int p = pos, ll, b; + char t[40]; // crazyest composed characters I know use 28 bytes in UTF8 (e.g., 🏴󠁧󠁢󠁷󠁬󠁳󠁿) + l = 0; + // extract bytes after pos stopping after short codepoint or 40 bytes at most + while (p < mLength && l < sizeof(t)) { + b = byte_at(p++); + t[l++] = b; + ll = fl_utf8len(b); + for (int i = 1; i < ll && l < sizeof(t); i++) t[l++] = byte_at(p++); + if (ll == 1 || ll == 2) break; // stop after short codepoint (includes '\n') + } + l = fl_utf8_next_composed_char(t, t + l) - t; // length of possibly composed character starting at pos + } + pos += l; if (pos>=mLength) return mLength; IS_UTF8_ALIGNED2(this, (pos)) diff --git a/src/fl_utf8.cxx b/src/fl_utf8.cxx index df4c6d423..b9681108c 100644 --- a/src/fl_utf8.cxx +++ b/src/fl_utf8.cxx @@ -1634,17 +1634,32 @@ unsigned fl_utf8from_mb(char* dst, unsigned dstlen, const char* src, unsigned sr Returns pointer to beginning of next unicode character after potentially composed character. Some unicode characters (example: 👩‍✈️ "woman pilot") are composed of several unicode points. They may pair two successive codepoints with U+200D (zero-width joiner) and may qualify any component with variation selectors or Fitzpatrick emoji modifiers. + Most flag emojis are composed of 2 successive "regional indicator symbols" from range [U+1F1E6 , U+1F1FF]. \param from points to a location within a UTF8 string. If this location is inside the UTF8 encoding of a codepoint or is an invalid byte, this function returns \p from + 1. \param end points past last codepoint of the string. - \return pointer to start of first codepoint after potentially composed character beginning at \p from. + \return pointer to beginning of first codepoint after potentially composed character that begins at \p from. */ const char *fl_utf8_next_composed_char(const char *from, const char *end) { - int skip = fl_utf8len(*from); - if (skip == -1) return from + 1; + int skip = fl_utf8len1(*from); + if (skip <= 2) return from + skip; + unsigned u = fl_utf8decode(from, end, NULL); + if (u >= 0x1F1E6 && u <= 0x1F1FF) { // a 1st regional indicator symbol can be a flag + u = fl_utf8decode(from + skip, end, NULL); + if (u >= 0x1F1E6 && u <= 0x1F1FF) { // a 2nd regional indicator symbol gives a flag + return from + 2 * skip; + } + } else if (u == 0x1F3F4) { // “waving black flag” may start subdivision flags (e.g. 🏴󠁧󠁢󠁷󠁬󠁳󠁿) + const char *next = from + skip; + do { + u = fl_utf8decode(next, end, NULL); + next += fl_utf8len1(*next); + if (u == 0xE007F) return next; // ends with "cancel tag" + } while (u >= 0xE0020 && u <= 0xE007E); // any series of "tag components" + } from += skip; // skip 1st codepoint while (from < end) { - unsigned u = fl_utf8decode(from, end, NULL); + u = fl_utf8decode(from, end, NULL); if (u == 0x200D) { // zero-width joiner from += fl_utf8len(*from); // skip joiner from += fl_utf8len(*from); // skip joined codepoint @@ -1664,14 +1679,31 @@ const char *fl_utf8_next_composed_char(const char *from, const char *end) { \param from points to a location within a UTF8 string. If this location is inside the UTF8 encoding of a codepoint or is an invalid byte, this function returns \p from - 1. \param begin points to start of first codepoint of the string. - \return pointer to start of first potentially composed character before the codepoint beginning at \p from. + \return pointer to beginning of first potentially composed character before the codepoint that begins at \p from. */ const char *fl_utf8_previous_composed_char(const char *from, const char *begin) { - if (from <= begin || fl_utf8len(*from) == -1) return from - 1; - const char *keep = from; + int l = fl_utf8len(*from); + if (from <= begin || l == -1) return from - 1; + const char *keep = from + l; from = fl_utf8back(from - 1, begin, NULL); + unsigned u = fl_utf8decode(from, keep, NULL); + if (u >= 0x1F1E6 && u <= 0x1F1FF) { // a 1st regional indicator symbol can be a flag + const char *previous = fl_utf8back(from - 1, begin, NULL); + u = fl_utf8decode(previous, keep, NULL); + if (u >= 0x1F1E6 && u <= 0x1F1FF) { // a 2nd Regional indicator symbol gives a flag + return previous; + } + } else if (u == 0xE007F) { // ends with "cancel tag" + const char *previous = from; + do { + if (previous <= begin) return begin; + previous = fl_utf8back(previous - 1, begin, NULL); + u = fl_utf8decode(previous, keep, NULL); + if (u == 0x1F3F4) return previous; // “waving black flag” starts subdivision flags + } while (u >= 0xE0020 && u <= 0xE007E); // any series of "tag components" + } while (from >= begin) { - unsigned u = fl_utf8decode(from, keep, NULL); + u = fl_utf8decode(from, keep, NULL); if (u >= 0xFE00 && u <= 0xFE0F) { // a variation selector from = fl_utf8back(from - 1, begin, NULL); } else if (u >= 0x1F3FB && u <= 0x1F3FF) { // EMOJI MODIFIER FITZPATRICK