56,22 → 56,22 |
/** Number of data bits in a UTF-8 continuation byte. */ |
#define CONT_BITS 6 |
|
/** Decode a single character from a substring. |
/** Decode a single UTF-8 character from a NULL-terminated string. |
* |
* Decode a single character from a substring of size @a sz. Decoding starts |
* at @a offset and this offset is moved to the beginning of the next |
* character. In case of decoding error, offset generally advances at least |
* by one. However, offset is never moved beyond (str + sz). |
* Decode a single UTF-8 character from a plain char NULL-terminated |
* string. Decoding starts at @a index and this index is moved to the |
* beginning of the next character. In case of decoding error, |
* index advances. However, index is never moved beyond (str+limit). |
* |
* @param str String (not necessarily NULL-terminated). |
* @param str Plain character NULL-terminated string. |
* @param index Index (counted in plain characters) where to start |
* the decoding. |
* @param limit Size of the substring. |
* @param limit Maximal allowed value of index. |
* |
* @return Value of decoded character or '?' on decoding error. |
* @return Decoded character in UTF-32 or '?' if the encoding is wrong. |
* |
*/ |
wchar_t chr_decode(const char *str, size_t *offset, size_t sz) |
wchar_t utf8_decode(const char *str, index_t *index, index_t limit) |
{ |
uint8_t b0, b; /* Bytes read from str. */ |
wchar_t ch; |
79,10 → 79,10 |
int b0_bits; /* Data bits in first byte. */ |
int cbytes; /* Number of continuation bytes. */ |
|
if (*offset + 1 > sz) |
if (*index + 1 > limit) |
return invalch; |
|
b0 = (uint8_t) str[(*offset)++]; |
b0 = (uint8_t) str[(*index)++]; |
|
/* Determine code length. */ |
|
107,7 → 107,7 |
return invalch; |
} |
|
if (*offset + cbytes > sz) { |
if (*index + cbytes > limit) { |
return invalch; |
} |
|
115,7 → 115,7 |
|
/* Decode continuation bytes. */ |
while (cbytes > 0) { |
b = (uint8_t) str[(*offset)++]; |
b = (uint8_t) str[(*index)++]; |
|
/* Must be 10xxxxxx. */ |
if ((b & 0xc0) != 0x80) { |
130,22 → 130,25 |
return ch; |
} |
|
/** Encode a single character to string representation. |
/** Encode a single UTF-32 character as UTF-8 |
* |
* Encode a single character to string representation (i.e. UTF-8) and store |
* it into a buffer at @a offset. Encoding starts at @a offset and this offset |
* is moved to the position where the next character can be written to. |
* Encode a single UTF-32 character as UTF-8 and store it into |
* the given buffer at @a index. Encoding starts at @a index and |
* this index is moved to the position where the next character |
* can be written to. |
* |
* @param ch Input character. |
* @param str Output buffer. |
* @param offset Offset (in bytes) where to start writing. |
* @param sz Size of the output buffer. |
* @param ch Input UTF-32 character. |
* @param str Output buffer. |
* @param index Index (counted in plain characters) where to start |
* the encoding |
* @param limit Maximal allowed value of index. |
* |
* @return True if the character was encoded successfully or false if there |
* was not enough space in the output buffer or the character code |
* was invalid. |
* @return True if the character was encoded or false if there is not |
* enough space in the output buffer or the character is an invalid |
* Unicode code point. |
* |
*/ |
bool chr_encode(const wchar_t ch, char *str, size_t *offset, size_t sz) |
bool utf8_encode(const wchar_t ch, char *str, index_t *index, index_t limit) |
{ |
uint32_t cc; /* Unsigned version of ch. */ |
|
153,7 → 156,7 |
int b0_bits; /* Number of data bits in first byte. */ |
int i; |
|
if (*offset >= sz) |
if (*index >= limit) |
return false; |
|
if (ch < 0) |
181,20 → 184,20 |
} |
|
/* Check for available space in buffer. */ |
if (*offset + cbytes >= sz) |
if (*index + cbytes >= limit) |
return false; |
|
/* Encode continuation bytes. */ |
for (i = cbytes; i > 0; --i) { |
str[*offset + i] = 0x80 | (cc & LO_MASK_32(CONT_BITS)); |
str[*index + i] = 0x80 | (cc & LO_MASK_32(CONT_BITS)); |
cc = cc >> CONT_BITS; |
} |
|
/* Encode first byte. */ |
str[*offset] = (cc & LO_MASK_32(b0_bits)) | HI_MASK_8(8 - b0_bits - 1); |
str[*index] = (cc & LO_MASK_32(b0_bits)) | HI_MASK_8(8 - b0_bits - 1); |
|
/* Advance offset. */ |
*offset += (1 + cbytes); |
/* Advance index. */ |
*index += (1 + cbytes); |
|
return true; |
} |
223,7 → 226,7 |
iprev = index; |
if (size >= count) |
break; |
ch = chr_decode(str, &index, UTF8_NO_LIMIT); |
ch = utf8_decode(str, &index, UTF8_NO_LIMIT); |
if (ch == '\0') break; |
|
size++; |
285,7 → 288,7 |
size_t size = 0; |
index_t index = 0; |
|
while (chr_decode(str, &index, UTF8_NO_LIMIT) != 0) { |
while (utf8_decode(str, &index, UTF8_NO_LIMIT) != 0) { |
size++; |
} |
|