Rev 4179 | Rev 4198 | Go to most recent revision | Show entire file | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed
| Rev 4179 | Rev 4196 | ||
|---|---|---|---|
| Line 42... | Line 42... | ||
| 42 | #include <arch.h> |
42 | #include <arch.h> |
| 43 | #include <console/kconsole.h> |
43 | #include <console/kconsole.h> |
| 44 | 44 | ||
| 45 | char invalch = '?'; |
45 | char invalch = '?'; |
| 46 | 46 | ||
| - | 47 | /** Byte mask consisting of bits 0 - (@n - 1) */ |
|
| - | 48 | #define LO_MASK_8(n) ((uint8_t)((1 << (n)) - 1)) |
|
| - | 49 | ||
| - | 50 | /** Number of data bits in a UTF-8 continuation byte. */ |
|
| - | 51 | #define CONT_BITS 6 |
|
| - | 52 | ||
| 47 | /** Decode a single UTF-8 character from a NULL-terminated string. |
53 | /** Decode a single UTF-8 character from a NULL-terminated string. |
| 48 | * |
54 | * |
| 49 | * Decode a single UTF-8 character from a plain char NULL-terminated |
55 | * Decode a single UTF-8 character from a plain char NULL-terminated |
| 50 | * string. Decoding starts at @index and this index is incremented |
56 | * string. Decoding starts at @index and this index is incremented |
| 51 | * if the current UTF-8 string is encoded in more than a single byte. |
57 | * if the current UTF-8 string is encoded in more than a single byte. |
| Line 58... | Line 64... | ||
| 58 | * @return Decoded character in UTF-32 or '?' if the encoding is wrong. |
64 | * @return Decoded character in UTF-32 or '?' if the encoding is wrong. |
| 59 | * |
65 | * |
| 60 | */ |
66 | */ |
| 61 | wchar_t utf8_decode(const char *str, index_t *index, index_t limit) |
67 | wchar_t utf8_decode(const char *str, index_t *index, index_t limit) |
| 62 | { |
68 | { |
| 63 | uint8_t c1; /* First plain character from str */ |
69 | uint8_t b0, b; /* Bytes read from str. */ |
| 64 | uint8_t c2; /* Second plain character from str */ |
70 | wchar_t ch; |
| - | 71 | ||
| 65 | uint8_t c3; /* Third plain character from str */ |
72 | int b0_bits; /* Data bits in first byte. */ |
| 66 | uint8_t c4; /* Fourth plain character from str */ |
73 | int cbytes; /* Number of continuation bytes. */ |
| 67 | 74 | ||
| 68 | if (*index > limit) |
75 | if (*index > limit) |
| 69 | return invalch; |
76 | return invalch; |
| 70 | 77 | ||
| 71 | c1 = (uint8_t) str[*index]; |
78 | b0 = (uint8_t) str[*index]; |
| - | 79 | ||
| - | 80 | /* Determine code length. */ |
|
| 72 | 81 | ||
| 73 | if ((c1 & 0x80) == 0) { |
82 | if ((b0 & 0x80) == 0) { |
| 74 | /* Plain ASCII (code points 0 .. 127) */ |
83 | /* 0xxxxxxx (Plain ASCII) */ |
| 75 | return (wchar_t) c1; |
84 | b0_bits = 7; |
| 76 | } |
85 | cbytes = 0; |
| 77 | - | ||
| 78 | if ((c1 & 0xe0) == 0xc0) { |
86 | } else if ((b0 & 0xe0) == 0xc0) { |
| 79 | /* Code points 128 .. 2047 */ |
87 | /* 110xxxxx 10xxxxxx */ |
| 80 | if (*index + 1 > limit) |
88 | b0_bits = 5; |
| 81 | return invalch; |
89 | cbytes = 1; |
| - | 90 | } else if ((b0 & 0xf0) == 0xe0) { |
|
| - | 91 | /* 1110xxxx 10xxxxxx 10xxxxxx */ |
|
| 82 | 92 | b0_bits = 4; |
|
| 83 | c2 = (uint8_t) str[*index + 1]; |
93 | cbytes = 2; |
| 84 | if ((c2 & 0xc0) == 0x80) { |
94 | } else if ((b0 & 0xf8) == 0xf0) { |
| - | 95 | /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ |
|
| 85 | (*index)++; |
96 | b0_bits = 3; |
| 86 | return ((wchar_t) ((c1 & 0x1f) << 6) | (c2 & 0x3f)); |
97 | cbytes = 3; |
| 87 | } else |
98 | } else { |
| - | 99 | /* 10xxxxxx -- unexpected continuation byte. */ |
|
| 88 | return invalch; |
100 | return invalch; |
| 89 | } |
101 | } |
| 90 | 102 | ||
| 91 | if ((c1 & 0xf0) == 0xe0) { |
- | |
| 92 | /* Code points 2048 .. 65535 */ |
- | |
| 93 | if (*index + 2 > limit) |
103 | if (*index + cbytes > limit) { |
| 94 | return invalch; |
- | |
| 95 | - | ||
| 96 | c2 = (uint8_t) str[*index + 1]; |
- | |
| 97 | if ((c2 & 0xc0) == 0x80) { |
- | |
| 98 | (*index)++; |
- | |
| 99 | c3 = (uint8_t) str[*index + 1]; |
- | |
| 100 | if ((c3 & 0xc0) == 0x80) { |
- | |
| 101 | (*index)++; |
- | |
| 102 | return ((wchar_t) ((c1 & 0x0f) << 12) | ((c2 & 0x3f) << 6) | (c3 & 0x3f)); |
- | |
| 103 | } else |
- | |
| 104 | return invalch; |
- | |
| 105 | } else |
- | |
| 106 | return invalch; |
104 | return invalch; |
| 107 | } |
105 | } |
| 108 | 106 | ||
| 109 | if ((c1 & 0xf8) == 0xf0) { |
107 | ch = b0 & LO_MASK_8(b0_bits); |
| 110 | /* Code points 65536 .. 1114111 */ |
- | |
| 111 | if (*index + 3 > limit) |
- | |
| 112 | return invalch; |
- | |
| 113 | 108 | ||
| 114 | c2 = (uint8_t) str[*index + 1]; |
109 | /* Decode continuation bytes. */ |
| 115 | if ((c2 & 0xc0) == 0x80) { |
110 | while (cbytes > 0) { |
| 116 | (*index)++; |
- | |
| 117 | c3 = (uint8_t) str[*index + 1]; |
111 | b = (uint8_t) str[*index + 1]; |
| 118 | if ((c3 & 0xc0) == 0x80) { |
- | |
| 119 | (*index)++; |
112 | ++(*index); |
| - | 113 | ||
| 120 | c4 = (uint8_t) str[*index + 1]; |
114 | /* Must be 10xxxxxx. */ |
| 121 | if ((c4 & 0xc0) == 0x80) { |
115 | if ((b & 0xc0) != 0x80) { |
| 122 | (*index)++; |
- | |
| 123 | return ((wchar_t) ((c1 & 0x07) << 18) | ((c2 & 0x3f) << 12) | ((c3 & 0x3f) << 6) | (c4 & 0x3f)); |
- | |
| 124 | } else |
- | |
| 125 | return invalch; |
- | |
| 126 | } else |
- | |
| 127 | return invalch; |
- | |
| 128 | } else |
- | |
| 129 | return invalch; |
116 | return invalch; |
| - | 117 | } |
|
| - | 118 | ||
| - | 119 | /* Shift data bits to ch. */ |
|
| - | 120 | ch = (ch << CONT_BITS) | (wchar_t) (b & LO_MASK_8(CONT_BITS)); |
|
| - | 121 | --cbytes; |
|
| 130 | } |
122 | } |
| 131 | 123 | ||
| 132 | return invalch; |
124 | return ch; |
| 133 | } |
125 | } |
| 134 | 126 | ||
| 135 | /** Encode a single UTF-32 character as UTF-8 |
127 | /** Encode a single UTF-32 character as UTF-8 |
| 136 | * |
128 | * |
| 137 | * Encode a single UTF-32 character as UTF-8 and store it into |
129 | * Encode a single UTF-32 character as UTF-8 and store it into |