Rev 4179 | Rev 4198 | Go to most recent revision | Show entire file | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed
Rev 4179 | Rev 4196 | ||
---|---|---|---|
Line 42... | Line 42... | ||
42 | #include <arch.h> |
42 | #include <arch.h> |
43 | #include <console/kconsole.h> |
43 | #include <console/kconsole.h> |
44 | 44 | ||
45 | char invalch = '?'; |
45 | char invalch = '?'; |
46 | 46 | ||
- | 47 | /** Byte mask consisting of bits 0 - (@n - 1) */ |
|
- | 48 | #define LO_MASK_8(n) ((uint8_t)((1 << (n)) - 1)) |
|
- | 49 | ||
- | 50 | /** Number of data bits in a UTF-8 continuation byte. */ |
|
- | 51 | #define CONT_BITS 6 |
|
- | 52 | ||
47 | /** Decode a single UTF-8 character from a NULL-terminated string. |
53 | /** Decode a single UTF-8 character from a NULL-terminated string. |
48 | * |
54 | * |
49 | * Decode a single UTF-8 character from a plain char NULL-terminated |
55 | * Decode a single UTF-8 character from a plain char NULL-terminated |
50 | * string. Decoding starts at @index and this index is incremented |
56 | * string. Decoding starts at @index and this index is incremented |
51 | * if the current UTF-8 string is encoded in more than a single byte. |
57 | * if the current UTF-8 string is encoded in more than a single byte. |
Line 58... | Line 64... | ||
58 | * @return Decoded character in UTF-32 or '?' if the encoding is wrong. |
64 | * @return Decoded character in UTF-32 or '?' if the encoding is wrong. |
59 | * |
65 | * |
60 | */ |
66 | */ |
61 | wchar_t utf8_decode(const char *str, index_t *index, index_t limit) |
67 | wchar_t utf8_decode(const char *str, index_t *index, index_t limit) |
62 | { |
68 | { |
63 | uint8_t c1; /* First plain character from str */ |
69 | uint8_t b0, b; /* Bytes read from str. */ |
64 | uint8_t c2; /* Second plain character from str */ |
70 | wchar_t ch; |
- | 71 | ||
65 | uint8_t c3; /* Third plain character from str */ |
72 | int b0_bits; /* Data bits in first byte. */ |
66 | uint8_t c4; /* Fourth plain character from str */ |
73 | int cbytes; /* Number of continuation bytes. */ |
67 | 74 | ||
68 | if (*index > limit) |
75 | if (*index > limit) |
69 | return invalch; |
76 | return invalch; |
70 | 77 | ||
71 | c1 = (uint8_t) str[*index]; |
78 | b0 = (uint8_t) str[*index]; |
- | 79 | ||
- | 80 | /* Determine code length. */ |
|
72 | 81 | ||
73 | if ((c1 & 0x80) == 0) { |
82 | if ((b0 & 0x80) == 0) { |
74 | /* Plain ASCII (code points 0 .. 127) */ |
83 | /* 0xxxxxxx (Plain ASCII) */ |
75 | return (wchar_t) c1; |
84 | b0_bits = 7; |
76 | } |
85 | cbytes = 0; |
77 | - | ||
78 | if ((c1 & 0xe0) == 0xc0) { |
86 | } else if ((b0 & 0xe0) == 0xc0) { |
79 | /* Code points 128 .. 2047 */ |
87 | /* 110xxxxx 10xxxxxx */ |
80 | if (*index + 1 > limit) |
88 | b0_bits = 5; |
81 | return invalch; |
89 | cbytes = 1; |
- | 90 | } else if ((b0 & 0xf0) == 0xe0) { |
|
- | 91 | /* 1110xxxx 10xxxxxx 10xxxxxx */ |
|
82 | 92 | b0_bits = 4; |
|
83 | c2 = (uint8_t) str[*index + 1]; |
93 | cbytes = 2; |
84 | if ((c2 & 0xc0) == 0x80) { |
94 | } else if ((b0 & 0xf8) == 0xf0) { |
- | 95 | /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ |
|
85 | (*index)++; |
96 | b0_bits = 3; |
86 | return ((wchar_t) ((c1 & 0x1f) << 6) | (c2 & 0x3f)); |
97 | cbytes = 3; |
87 | } else |
98 | } else { |
- | 99 | /* 10xxxxxx -- unexpected continuation byte. */ |
|
88 | return invalch; |
100 | return invalch; |
89 | } |
101 | } |
90 | 102 | ||
91 | if ((c1 & 0xf0) == 0xe0) { |
- | |
92 | /* Code points 2048 .. 65535 */ |
- | |
93 | if (*index + 2 > limit) |
103 | if (*index + cbytes > limit) { |
94 | return invalch; |
- | |
95 | - | ||
96 | c2 = (uint8_t) str[*index + 1]; |
- | |
97 | if ((c2 & 0xc0) == 0x80) { |
- | |
98 | (*index)++; |
- | |
99 | c3 = (uint8_t) str[*index + 1]; |
- | |
100 | if ((c3 & 0xc0) == 0x80) { |
- | |
101 | (*index)++; |
- | |
102 | return ((wchar_t) ((c1 & 0x0f) << 12) | ((c2 & 0x3f) << 6) | (c3 & 0x3f)); |
- | |
103 | } else |
- | |
104 | return invalch; |
- | |
105 | } else |
- | |
106 | return invalch; |
104 | return invalch; |
107 | } |
105 | } |
108 | 106 | ||
109 | if ((c1 & 0xf8) == 0xf0) { |
107 | ch = b0 & LO_MASK_8(b0_bits); |
110 | /* Code points 65536 .. 1114111 */ |
- | |
111 | if (*index + 3 > limit) |
- | |
112 | return invalch; |
- | |
113 | 108 | ||
114 | c2 = (uint8_t) str[*index + 1]; |
109 | /* Decode continuation bytes. */ |
115 | if ((c2 & 0xc0) == 0x80) { |
110 | while (cbytes > 0) { |
116 | (*index)++; |
- | |
117 | c3 = (uint8_t) str[*index + 1]; |
111 | b = (uint8_t) str[*index + 1]; |
118 | if ((c3 & 0xc0) == 0x80) { |
- | |
119 | (*index)++; |
112 | ++(*index); |
- | 113 | ||
120 | c4 = (uint8_t) str[*index + 1]; |
114 | /* Must be 10xxxxxx. */ |
121 | if ((c4 & 0xc0) == 0x80) { |
115 | if ((b & 0xc0) != 0x80) { |
122 | (*index)++; |
- | |
123 | return ((wchar_t) ((c1 & 0x07) << 18) | ((c2 & 0x3f) << 12) | ((c3 & 0x3f) << 6) | (c4 & 0x3f)); |
- | |
124 | } else |
- | |
125 | return invalch; |
- | |
126 | } else |
- | |
127 | return invalch; |
- | |
128 | } else |
- | |
129 | return invalch; |
116 | return invalch; |
- | 117 | } |
|
- | 118 | ||
- | 119 | /* Shift data bits to ch. */ |
|
- | 120 | ch = (ch << CONT_BITS) | (wchar_t) (b & LO_MASK_8(CONT_BITS)); |
|
- | 121 | --cbytes; |
|
130 | } |
122 | } |
131 | 123 | ||
132 | return invalch; |
124 | return ch; |
133 | } |
125 | } |
134 | 126 | ||
135 | /** Encode a single UTF-32 character as UTF-8 |
127 | /** Encode a single UTF-32 character as UTF-8 |
136 | * |
128 | * |
137 | * Encode a single UTF-32 character as UTF-8 and store it into |
129 | * Encode a single UTF-32 character as UTF-8 and store it into |