Let text widgets gracefully handle composed Unicode characters - Cont'd.

This commit adds support for another type of composed character: flags.
It also fixes Fl_Text_Buffer::prev_char() and Fl_Text_Buffer::next_char(),
which must use Fl_Text_Buffer::byte_at() to access the content of the text buffer.
This commit is contained in:
ManoloFLTK
2026-01-26 09:17:18 +01:00
parent 188aa0be0e
commit 521f7b3c5b
3 changed files with 81 additions and 32 deletions

View File

@@ -742,19 +742,11 @@ public:
*/
const Fl_Text_Selection* highlight_selection() const { return &mHighlight; }
/**
Returns the index of the previous character.
\param ix index to the current character
*/
int prev_char(int ix) const;
int prev_char_clipped(int ix) const;
int prev_char(int pos) const;
int prev_char_clipped(int pos) const;
/**
Returns the index of the next character.
\param ix index to the current character
*/
int next_char(int ix) const;
int next_char_clipped(int ix) const;
int next_char(int pos) const;
int next_char_clipped(int pos) const;
/**
Align an index into the buffer to the current or previous UTF-8 boundary.

View File

@@ -2080,9 +2080,8 @@ int Fl_Text_Buffer::outputfile(const char *file,
}
/*
Return the previous character position.
Unicode safe.
/**
As prev_char() but returns 0 if the beginning of the buffer is reached.
*/
int Fl_Text_Buffer::prev_char_clipped(int pos) const
{
@@ -2090,17 +2089,27 @@ int Fl_Text_Buffer::prev_char_clipped(int pos) const
return 0;
IS_UTF8_ALIGNED2(this, (pos))
const char *previous = fl_utf8_previous_composed_char(address(0) + pos, address(0));
pos = previous - address(0);
const int l_t = 40;
char t[l_t + 1]; t[l_t] = 0;
int l = l_t, p = pos, ll;
for (int i = l_t; i > 0 && p > 0; i--) {
t[--l] = byte_at(--p);
ll = fl_utf8len(t[l]);
if (ll == 1 || ll == 2) break;
}
const char *previous = fl_utf8_previous_composed_char(t + l_t, t + l);
ll = strlen(t + l);
pos = (pos - ll) + (previous - (t+l));
IS_UTF8_ALIGNED2(this, (pos))
return pos;
}
/*
Return the previous character position.
/**
Returns the index of the previous character.
This function processes a composed character (e.g., a flag emoji) as a single character.
Returns -1 if the beginning of the buffer is reached.
\param pos index to the current character
*/
int Fl_Text_Buffer::prev_char(int pos) const
{
@@ -2109,15 +2118,31 @@ int Fl_Text_Buffer::prev_char(int pos) const
}
/*
Return the next character position.
/**
Returns the index of the next character.
This function processes a composed character (e.g., a flag emoji) as a single character.
Returns length() if the end of the buffer is reached.
\param pos index to the current character
*/
int Fl_Text_Buffer::next_char(int pos) const
{
IS_UTF8_ALIGNED2(this, (pos))
const char *next = fl_utf8_next_composed_char(address(0) + pos, address(0) + mLength);
pos = next - address(0);
int l = fl_utf8len1(byte_at(pos));
if (l > 2) { // test for composed character only if pos is at long codepoint
int p = pos, ll, b;
char t[40]; // crazyest composed characters I know use 28 bytes in UTF8 (e.g., 🏴󠁧󠁢󠁷󠁬󠁳󠁿)
l = 0;
// extract bytes after pos stopping after short codepoint or 40 bytes at most
while (p < mLength && l < sizeof(t)) {
b = byte_at(p++);
t[l++] = b;
ll = fl_utf8len(b);
for (int i = 1; i < ll && l < sizeof(t); i++) t[l++] = byte_at(p++);
if (ll == 1 || ll == 2) break; // stop after short codepoint (includes '\n')
}
l = fl_utf8_next_composed_char(t, t + l) - t; // length of possibly composed character starting at pos
}
pos += l;
if (pos>=mLength)
return mLength;
IS_UTF8_ALIGNED2(this, (pos))

View File

@@ -1634,17 +1634,32 @@ unsigned fl_utf8from_mb(char* dst, unsigned dstlen, const char* src, unsigned sr
Returns pointer to beginning of next unicode character after potentially composed character.
Some unicode characters (example: 👩‍✈️ "woman pilot") are composed of several unicode points. They may pair two successive
codepoints with U+200D (zero-width joiner) and may qualify any component with variation selectors or Fitzpatrick emoji modifiers.
Most flag emojis are composed of 2 successive "regional indicator symbols" from range [U+1F1E6 , U+1F1FF].
\param from points to a location within a UTF8 string. If this location is inside the UTF8
encoding of a codepoint or is an invalid byte, this function returns \p from + 1.
\param end points past last codepoint of the string.
\return pointer to start of first codepoint after potentially composed character beginning at \p from.
\return pointer to beginning of first codepoint after potentially composed character that begins at \p from.
*/
const char *fl_utf8_next_composed_char(const char *from, const char *end) {
int skip = fl_utf8len(*from);
if (skip == -1) return from + 1;
int skip = fl_utf8len1(*from);
if (skip <= 2) return from + skip;
unsigned u = fl_utf8decode(from, end, NULL);
if (u >= 0x1F1E6 && u <= 0x1F1FF) { // a 1st regional indicator symbol can be a flag
u = fl_utf8decode(from + skip, end, NULL);
if (u >= 0x1F1E6 && u <= 0x1F1FF) { // a 2nd regional indicator symbol gives a flag
return from + 2 * skip;
}
} else if (u == 0x1F3F4) { // “waving black flag” may start subdivision flags (e.g. 🏴󠁧󠁢󠁷󠁬󠁳󠁿)
const char *next = from + skip;
do {
u = fl_utf8decode(next, end, NULL);
next += fl_utf8len1(*next);
if (u == 0xE007F) return next; // ends with "cancel tag"
} while (u >= 0xE0020 && u <= 0xE007E); // any series of "tag components"
}
from += skip; // skip 1st codepoint
while (from < end) {
unsigned u = fl_utf8decode(from, end, NULL);
u = fl_utf8decode(from, end, NULL);
if (u == 0x200D) { // zero-width joiner
from += fl_utf8len(*from); // skip joiner
from += fl_utf8len(*from); // skip joined codepoint
@@ -1664,14 +1679,31 @@ const char *fl_utf8_next_composed_char(const char *from, const char *end) {
\param from points to a location within a UTF8 string. If this location is inside the UTF8
encoding of a codepoint or is an invalid byte, this function returns \p from - 1.
\param begin points to start of first codepoint of the string.
\return pointer to start of first potentially composed character before the codepoint beginning at \p from.
\return pointer to beginning of first potentially composed character before the codepoint that begins at \p from.
*/
const char *fl_utf8_previous_composed_char(const char *from, const char *begin) {
if (from <= begin || fl_utf8len(*from) == -1) return from - 1;
const char *keep = from;
int l = fl_utf8len(*from);
if (from <= begin || l == -1) return from - 1;
const char *keep = from + l;
from = fl_utf8back(from - 1, begin, NULL);
unsigned u = fl_utf8decode(from, keep, NULL);
if (u >= 0x1F1E6 && u <= 0x1F1FF) { // a 1st regional indicator symbol can be a flag
const char *previous = fl_utf8back(from - 1, begin, NULL);
u = fl_utf8decode(previous, keep, NULL);
if (u >= 0x1F1E6 && u <= 0x1F1FF) { // a 2nd Regional indicator symbol gives a flag
return previous;
}
} else if (u == 0xE007F) { // ends with "cancel tag"
const char *previous = from;
do {
if (previous <= begin) return begin;
previous = fl_utf8back(previous - 1, begin, NULL);
u = fl_utf8decode(previous, keep, NULL);
if (u == 0x1F3F4) return previous; // “waving black flag” starts subdivision flags
} while (u >= 0xE0020 && u <= 0xE007E); // any series of "tag components"
}
while (from >= begin) {
unsigned u = fl_utf8decode(from, keep, NULL);
u = fl_utf8decode(from, keep, NULL);
if (u >= 0xFE00 && u <= 0xFE0F) { // a variation selector
from = fl_utf8back(from - 1, begin, NULL);
} else if (u >= 0x1F3FB && u <= 0x1F3FF) { // EMOJI MODIFIER FITZPATRICK