Rev 3731 | Rev 4234 | Go to most recent revision | Show entire file | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed
Rev 3731 | Rev 4226 | ||
---|---|---|---|
Line 36... | Line 36... | ||
36 | #include <string.h> |
36 | #include <string.h> |
37 | #include <stdlib.h> |
37 | #include <stdlib.h> |
38 | #include <limits.h> |
38 | #include <limits.h> |
39 | #include <ctype.h> |
39 | #include <ctype.h> |
40 | #include <malloc.h> |
40 | #include <malloc.h> |
- | 41 | #include <errno.h> |
|
- | 42 | #include <string.h> |
|
- | 43 | ||
/** Byte mask consisting of the lowest @a n bits (out of 8).
 *
 * Valid for @a n in the range 0..8. The shift is done on an unsigned
 * operand so that no signed-shift undefined behaviour can occur.
 */
#define LO_MASK_8(n)  ((uint8_t) ((1U << (n)) - 1))

/** 32-bit mask consisting of the lowest @a n bits (out of 32).
 *
 * Valid for @a n in the range 0..31. The constant is widened to uint32_t
 * before shifting; shifting a plain (signed) int by 31 is undefined.
 */
#define LO_MASK_32(n)  ((uint32_t) ((((uint32_t) 1) << (n)) - 1))

/** Byte mask consisting of the highest @a n bits (out of 8).
 *
 * Valid for @a n in the range 0..8. The result is cast back to uint8_t
 * because the ~ operator works on the promoted int and would otherwise
 * leave all bits above the low byte set.
 */
#define HI_MASK_8(n)  ((uint8_t) ~LO_MASK_8(8 - (n)))

/** Number of data bits in a UTF-8 continuation byte */
#define CONT_BITS  6
|
- | 55 | ||
- | 56 | /** Decode a single character from a string. |
|
- | 57 | * |
|
- | 58 | * Decode a single character from a string of size @a size. Decoding starts |
|
- | 59 | * at @a offset and this offset is moved to the beginning of the next |
|
- | 60 | * character. In case of decoding error, offset generally advances at least |
|
- | 61 | * by one. However, offset is never moved beyond size. |
|
- | 62 | * |
|
- | 63 | * @param str String (not necessarily NULL-terminated). |
|
- | 64 | * @param offset Byte offset in string where to start decoding. |
|
- | 65 | * @param size Size of the string (in bytes). |
|
- | 66 | * |
|
- | 67 | * @return Value of decoded character, U_SPECIAL on decoding error or |
|
- | 68 | * NULL if attempt to decode beyond @a size. |
|
- | 69 | * |
|
- | 70 | */ |
|
- | 71 | wchar_t str_decode(const char *str, size_t *offset, size_t size) |
|
- | 72 | { |
|
- | 73 | if (*offset + 1 > size) |
|
- | 74 | return 0; |
|
- | 75 | ||
- | 76 | /* First byte read from string */ |
|
- | 77 | uint8_t b0 = (uint8_t) str[(*offset)++]; |
|
- | 78 | ||
- | 79 | /* Determine code length */ |
|
- | 80 | ||
- | 81 | unsigned int b0_bits; /* Data bits in first byte */ |
|
- | 82 | unsigned int cbytes; /* Number of continuation bytes */ |
|
- | 83 | ||
- | 84 | if ((b0 & 0x80) == 0) { |
|
- | 85 | /* 0xxxxxxx (Plain ASCII) */ |
|
- | 86 | b0_bits = 7; |
|
- | 87 | cbytes = 0; |
|
- | 88 | } else if ((b0 & 0xe0) == 0xc0) { |
|
- | 89 | /* 110xxxxx 10xxxxxx */ |
|
- | 90 | b0_bits = 5; |
|
- | 91 | cbytes = 1; |
|
- | 92 | } else if ((b0 & 0xf0) == 0xe0) { |
|
- | 93 | /* 1110xxxx 10xxxxxx 10xxxxxx */ |
|
- | 94 | b0_bits = 4; |
|
- | 95 | cbytes = 2; |
|
- | 96 | } else if ((b0 & 0xf8) == 0xf0) { |
|
- | 97 | /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ |
|
- | 98 | b0_bits = 3; |
|
- | 99 | cbytes = 3; |
|
- | 100 | } else { |
|
- | 101 | /* 10xxxxxx -- unexpected continuation byte */ |
|
- | 102 | return U_SPECIAL; |
|
- | 103 | } |
|
- | 104 | ||
- | 105 | if (*offset + cbytes > size) |
|
- | 106 | return U_SPECIAL; |
|
- | 107 | ||
- | 108 | wchar_t ch = b0 & LO_MASK_8(b0_bits); |
|
- | 109 | ||
- | 110 | /* Decode continuation bytes */ |
|
- | 111 | while (cbytes > 0) { |
|
- | 112 | uint8_t b = (uint8_t) str[(*offset)++]; |
|
- | 113 | ||
- | 114 | /* Must be 10xxxxxx */ |
|
- | 115 | if ((b & 0xc0) != 0x80) |
|
- | 116 | return U_SPECIAL; |
|
- | 117 | ||
- | 118 | /* Shift data bits to ch */ |
|
- | 119 | ch = (ch << CONT_BITS) | (wchar_t) (b & LO_MASK_8(CONT_BITS)); |
|
- | 120 | cbytes--; |
|
- | 121 | } |
|
- | 122 | ||
- | 123 | return ch; |
|
- | 124 | } |
|
- | 125 | ||
- | 126 | /** Encode a single character to string representation. |
|
- | 127 | * |
|
- | 128 | * Encode a single character to string representation (i.e. UTF-8) and store |
|
- | 129 | * it into a buffer at @a offset. Encoding starts at @a offset and this offset |
|
- | 130 | * is moved to the position where the next character can be written to. |
|
- | 131 | * |
|
- | 132 | * @param ch Input character. |
|
- | 133 | * @param str Output buffer. |
|
- | 134 | * @param offset Byte offset where to start writing. |
|
- | 135 | * @param size Size of the output buffer (in bytes). |
|
- | 136 | * |
|
- | 137 | * @return EOK if the character was encoded successfully, EOVERFLOW if there |
|
- | 138 | * was not enough space in the output buffer or EINVAL if the character |
|
- | 139 | * code was invalid. |
|
- | 140 | */ |
|
- | 141 | int chr_encode(const wchar_t ch, char *str, size_t *offset, size_t size) |
|
- | 142 | { |
|
- | 143 | if (*offset >= size) |
|
- | 144 | return EOVERFLOW; |
|
- | 145 | ||
- | 146 | if (!chr_check(ch)) |
|
- | 147 | return EINVAL; |
|
- | 148 | ||
- | 149 | /* Unsigned version of ch (bit operations should only be done |
|
- | 150 | on unsigned types). */ |
|
- | 151 | uint32_t cc = (uint32_t) ch; |
|
- | 152 | ||
- | 153 | /* Determine how many continuation bytes are needed */ |
|
- | 154 | ||
- | 155 | unsigned int b0_bits; /* Data bits in first byte */ |
|
- | 156 | unsigned int cbytes; /* Number of continuation bytes */ |
|
- | 157 | ||
- | 158 | if ((cc & ~LO_MASK_32(7)) == 0) { |
|
- | 159 | b0_bits = 7; |
|
- | 160 | cbytes = 0; |
|
- | 161 | } else if ((cc & ~LO_MASK_32(11)) == 0) { |
|
- | 162 | b0_bits = 5; |
|
- | 163 | cbytes = 1; |
|
- | 164 | } else if ((cc & ~LO_MASK_32(16)) == 0) { |
|
- | 165 | b0_bits = 4; |
|
- | 166 | cbytes = 2; |
|
- | 167 | } else if ((cc & ~LO_MASK_32(21)) == 0) { |
|
- | 168 | b0_bits = 3; |
|
- | 169 | cbytes = 3; |
|
- | 170 | } else { |
|
- | 171 | /* Codes longer than 21 bits are not supported */ |
|
- | 172 | return EINVAL; |
|
- | 173 | } |
|
- | 174 | ||
- | 175 | /* Check for available space in buffer */ |
|
- | 176 | if (*offset + cbytes >= size) |
|
- | 177 | return EOVERFLOW; |
|
- | 178 | ||
- | 179 | /* Encode continuation bytes */ |
|
- | 180 | unsigned int i; |
|
- | 181 | for (i = cbytes; i > 0; i--) { |
|
- | 182 | str[*offset + i] = 0x80 | (cc & LO_MASK_32(CONT_BITS)); |
|
- | 183 | cc = cc >> CONT_BITS; |
|
- | 184 | } |
|
- | 185 | ||
- | 186 | /* Encode first byte */ |
|
- | 187 | str[*offset] = (cc & LO_MASK_32(b0_bits)) | HI_MASK_8(8 - b0_bits - 1); |
|
- | 188 | ||
- | 189 | /* Advance offset */ |
|
- | 190 | *offset += cbytes + 1; |
|
- | 191 | ||
- | 192 | return EOK; |
|
- | 193 | } |
|
- | 194 | ||
/** Check whether character is valid
 *
 * A character is accepted when its value lies in the range
 * 0 .. 0x10ffff (1114111), inclusive.
 *
 * @param ch Character to check.
 *
 * @return True if character is a valid Unicode code point.
 *
 */
bool chr_check(const wchar_t ch)
{
	return (ch >= 0) && (ch <= 0x10ffff);
}
|
41 | 207 | ||
42 | /** Count the number of characters in the string, not including terminating 0. |
208 | /** Count the number of characters in the string, not including terminating 0. |
43 | * |
209 | * |
44 | * @param str String. |
210 | * @param str String. |
45 | * @return Number of characters in string. |
211 | * @return Number of characters in string. |