42,22 → 42,271 |
#include <arch.h> |
#include <console/kconsole.h> |
|
/** Character returned by chr_decode() when a byte sequence cannot be decoded. */
char invalch = '?';
|
/** Byte mask consisting of lowest @n bits (out of eight). */
#define LO_MASK_8(n)  ((uint8_t) ((1 << (n)) - 1))

/** Mask consisting of lowest @n bits (out of 32); @n must be less than 32. */
#define LO_MASK_32(n)  ((uint32_t) (((uint32_t) 1 << (n)) - 1))

/** Byte mask consisting of highest @n bits (out of eight).
 *
 * The result is narrowed back to uint8_t: the ~ operator promotes its
 * operand to int, which would otherwise set every bit above the low byte
 * and yield a negative value instead of a byte mask.
 */
#define HI_MASK_8(n)  ((uint8_t) ~LO_MASK_8(8 - (n)))

/** Number of data bits in a UTF-8 continuation byte. */
#define CONT_BITS  6
|
/** Decode a single character from a substring. |
* |
* @param str NULL terminated string. |
* Decode a single character from a substring of size @a sz. Decoding starts |
* at @a offset and this offset is moved to the beginning of the next |
* character. In case of decoding error, offset generally advances at least |
* by one. However, offset is never moved beyond (str + sz). |
* |
* @param str String (not necessarily NULL-terminated). |
* @param index Index (counted in plain characters) where to start |
* the decoding. |
* @param limit Size of the substring. |
* |
* @return Value of decoded character or '?' on decoding error. |
* |
*/ |
wchar_t chr_decode(const char *str, size_t *offset, size_t sz) |
{ |
uint8_t b0, b; /* Bytes read from str. */ |
wchar_t ch; |
|
int b0_bits; /* Data bits in first byte. */ |
int cbytes; /* Number of continuation bytes. */ |
|
if (*offset + 1 > sz) |
return invalch; |
|
b0 = (uint8_t) str[(*offset)++]; |
|
/* Determine code length. */ |
|
if ((b0 & 0x80) == 0) { |
/* 0xxxxxxx (Plain ASCII) */ |
b0_bits = 7; |
cbytes = 0; |
} else if ((b0 & 0xe0) == 0xc0) { |
/* 110xxxxx 10xxxxxx */ |
b0_bits = 5; |
cbytes = 1; |
} else if ((b0 & 0xf0) == 0xe0) { |
/* 1110xxxx 10xxxxxx 10xxxxxx */ |
b0_bits = 4; |
cbytes = 2; |
} else if ((b0 & 0xf8) == 0xf0) { |
/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ |
b0_bits = 3; |
cbytes = 3; |
} else { |
/* 10xxxxxx -- unexpected continuation byte. */ |
return invalch; |
} |
|
if (*offset + cbytes > sz) { |
return invalch; |
} |
|
ch = b0 & LO_MASK_8(b0_bits); |
|
/* Decode continuation bytes. */ |
while (cbytes > 0) { |
b = (uint8_t) str[(*offset)++]; |
|
/* Must be 10xxxxxx. */ |
if ((b & 0xc0) != 0x80) { |
return invalch; |
} |
|
/* Shift data bits to ch. */ |
ch = (ch << CONT_BITS) | (wchar_t) (b & LO_MASK_8(CONT_BITS)); |
--cbytes; |
} |
|
return ch; |
} |
|
/** Encode a single character to string representation. |
* |
* Encode a single character to string representation (i.e. UTF-8) and store |
* it into a buffer at @a offset. Encoding starts at @a offset and this offset |
* is moved to the position where the next character can be written to. |
* |
* @param ch Input character. |
* @param str Output buffer. |
* @param offset Offset (in bytes) where to start writing. |
* @param sz Size of the output buffer. |
* |
* @return True if the character was encoded successfully or false if there |
* was not enough space in the output buffer or the character code |
* was invalid. |
*/ |
bool chr_encode(const wchar_t ch, char *str, size_t *offset, size_t sz) |
{ |
uint32_t cc; /* Unsigned version of ch. */ |
|
int cbytes; /* Number of continuation bytes. */ |
int b0_bits; /* Number of data bits in first byte. */ |
int i; |
|
if (*offset >= sz) |
return false; |
|
if (ch < 0) |
return false; |
|
/* Bit operations should only be done on unsigned numbers. */ |
cc = (uint32_t) ch; |
|
/* Determine how many continuation bytes are needed. */ |
if ((cc & ~LO_MASK_32(7)) == 0) { |
b0_bits = 7; |
cbytes = 0; |
} else if ((cc & ~LO_MASK_32(11)) == 0) { |
b0_bits = 5; |
cbytes = 1; |
} else if ((cc & ~LO_MASK_32(16)) == 0) { |
b0_bits = 4; |
cbytes = 2; |
} else if ((cc & ~LO_MASK_32(21)) == 0) { |
b0_bits = 3; |
cbytes = 3; |
} else { |
/* Codes longer than 21 bits are not supported. */ |
return false; |
} |
|
/* Check for available space in buffer. */ |
if (*offset + cbytes >= sz) |
return false; |
|
/* Encode continuation bytes. */ |
for (i = cbytes; i > 0; --i) { |
str[*offset + i] = 0x80 | (cc & LO_MASK_32(CONT_BITS)); |
cc = cc >> CONT_BITS; |
} |
|
/* Encode first byte. */ |
str[*offset] = (cc & LO_MASK_32(b0_bits)) | HI_MASK_8(8 - b0_bits - 1); |
|
/* Advance offset. */ |
*offset += (1 + cbytes); |
|
return true; |
} |
|
/** Get bytes used by UTF-8 characters. |
* |
* Get the number of bytes (count of plain characters) which |
* are used by a given count of UTF-8 characters in a string. |
* As UTF-8 encoding is multibyte, there is no constant |
* correspondence between number of characters and used bytes. |
* |
* @param str UTF-8 string to consider. |
* @param count Number of UTF-8 characters to count. |
* |
* @return Number of bytes used by the characters. |
* |
*/ |
size_t utf8_count_bytes(const char *str, count_t count) |
{ |
size_t size = 0; |
index_t index = 0; |
index_t iprev; |
wchar_t ch; |
|
while (true) { |
iprev = index; |
if (size >= count) |
break; |
ch = chr_decode(str, &index, UTF8_NO_LIMIT); |
if (ch == '\0') break; |
|
size++; |
} |
|
return iprev; |
} |
|
/** Tell whether a character belongs to the 7-bit ASCII range.
 *
 * @param ch Character to test.
 *
 * @return True if @a ch lies in the range 0 to 127 (inclusive).
 *
 */
bool ascii_check(const wchar_t ch)
{
	return (ch >= 0) && (ch <= 127);
}
|
/** Tell whether a character lies in the Unicode code point range.
 *
 * @param ch Character to test.
 *
 * @return True if @a ch lies in the range 0 to 0x10FFFF (inclusive).
 *
 */
bool unicode_check(const wchar_t ch)
{
	return (ch >= 0) && (ch <= 0x10ffff);
}
|
/** Return number of plain characters (bytes) in a string.
 *
 * @param str NULL-terminated string.
 *
 * @return Number of bytes in str before the terminating zero byte.
 *
 */
size_t strlen(const char *str)
{
	size_t size;

	/* Advance until the terminating zero byte is found. */
	for (size = 0; str[size]; size++)
		;

	return size;
}
|
/** Return number of UTF-8 characters in a string. |
* |
* @param str NULL-terminated UTF-8 string. |
* |
* @return Number of UTF-8 characters in str. |
* |
*/ |
size_t strlen_utf8(const char *str) |
{ |
size_t size = 0; |
index_t index = 0; |
|
return i; |
while (chr_decode(str, &index, UTF8_NO_LIMIT) != 0) { |
size++; |
} |
|
return size; |
} |
|
/** Count the characters of a zero-terminated wide string.
 *
 * @param str NULL-terminated UTF-32 string.
 *
 * @return Number of elements in str before the terminating zero.
 *
 */
size_t strlen_utf32(const wchar_t *str)
{
	const wchar_t *end = str;

	/* Walk forward to the terminating zero element. */
	while (*end != 0)
		end++;

	return (size_t) (end - str);
}
|
/** Compare two NULL terminated strings |
* |
* Do a char-by-char comparison of two NULL terminated strings. |