Rev 4208 | Rev 4212 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
4011 | svoboda | 1 | /* |
2 | * Copyright (c) 2001-2004 Jakub Jermar |
||
3 | * All rights reserved. |
||
4 | * |
||
5 | * Redistribution and use in source and binary forms, with or without |
||
6 | * modification, are permitted provided that the following conditions |
||
7 | * are met: |
||
8 | * |
||
9 | * - Redistributions of source code must retain the above copyright |
||
10 | * notice, this list of conditions and the following disclaimer. |
||
11 | * - Redistributions in binary form must reproduce the above copyright |
||
12 | * notice, this list of conditions and the following disclaimer in the |
||
13 | * documentation and/or other materials provided with the distribution. |
||
14 | * - The name of the author may not be used to endorse or promote products |
||
15 | * derived from this software without specific prior written permission. |
||
16 | * |
||
17 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR |
||
18 | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES |
||
19 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. |
||
20 | * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, |
||
21 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT |
||
22 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
||
23 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
||
24 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
||
25 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF |
||
26 | * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
||
27 | */ |
||
28 | |||
4014 | decky | 29 | /** @addtogroup generic |
4011 | svoboda | 30 | * @{ |
31 | */ |
||
32 | |||
33 | /** |
||
34 | * @file |
||
4209 | svoboda | 35 | * @brief String functions. |
36 | * |
||
37 | * Strings and characters use the Universal Character Set (UCS). The standard |
||
38 | * strings, called just strings are encoded in UTF-8. Wide strings (encoded |
||
39 | * in UTF-32) are supported to a limited degree. A single character is |
||
40 | * represented as wchar_t. |
||
41 | * |
||
42 | * Strings have the following metrics: |
||
43 | * |
||
44 | * Metric Abbrev. Meaning |
||
45 | * ------ ------ ------- |
||
46 | * size n Number of bytes the string is encoded into, excluding |
||
47 | * the null terminator. |
||
48 | * length l The number of characters in the string, excluding |
||
49 | * the null terminator. |
||
50 | * width w The number of character cells the string takes up on a |
||
51 | * monospace display. |
||
52 | * |
||
53 | * Naming scheme: |
||
54 | * |
||
55 | * chr_xxx operate on characters |
||
56 | * str_xxx operate on strings |
||
57 | * wstr_xxx operate on wide strings |
||
58 | * |
||
59 | * [w]str_[n|l|w]xxx operate on a prefix limited by size, length |
||
60 | * or width. |
||
4011 | svoboda | 61 | */ |
62 | |||
63 | #include <string.h> |
||
64 | #include <print.h> |
||
65 | #include <cpu.h> |
||
66 | #include <arch/asm.h> |
||
67 | #include <arch.h> |
||
4208 | svoboda | 68 | #include <errno.h> |
4011 | svoboda | 69 | #include <console/kconsole.h> |
70 | |||
/** Replacement character returned by chr_decode() on a decoding error. */
char invalch = '?';

/** Mask consisting of the lowest @a n bits of a byte (0 <= n <= 8).
 *
 * The shift is done on an unsigned operand so that no signed overflow can
 * occur.
 */
#define LO_MASK_8(n)  ((uint8_t) ((1U << (n)) - 1))

/** Mask consisting of the lowest @a n bits of a 32-bit word (0 <= n < 32). */
#define LO_MASK_32(n)  ((uint32_t) (((uint32_t) 1 << (n)) - 1))

/** Mask consisting of the highest @a n bits of a byte (0 <= n <= 8).
 *
 * The complement is computed after integer promotion, therefore the result
 * is narrowed back to eight bits; otherwise the mask would be a negative
 * int with all upper bits set.
 */
#define HI_MASK_8(n)  ((uint8_t) ~LO_MASK_8(8 - (n)))

/** Number of data bits in a UTF-8 continuation byte. */
#define CONT_BITS  6
||
84 | |||
4200 | svoboda | 85 | /** Decode a single character from a substring. |
4175 | decky | 86 | * |
4200 | svoboda | 87 | * Decode a single character from a substring of size @a sz. Decoding starts |
88 | * at @a offset and this offset is moved to the beginning of the next |
||
89 | * character. In case of decoding error, offset generally advances at least |
||
90 | * by one. However, offset is never moved beyond (str + sz). |
||
4175 | decky | 91 | * |
4200 | svoboda | 92 | * @param str String (not necessarily NULL-terminated). |
4175 | decky | 93 | * @param index Index (counted in plain characters) where to start |
94 | * the decoding. |
||
4200 | svoboda | 95 | * @param limit Size of the substring. |
4175 | decky | 96 | * |
4200 | svoboda | 97 | * @return Value of decoded character or '?' on decoding error. |
4175 | decky | 98 | */ |
4200 | svoboda | 99 | wchar_t chr_decode(const char *str, size_t *offset, size_t sz) |
4175 | decky | 100 | { |
4196 | svoboda | 101 | uint8_t b0, b; /* Bytes read from str. */ |
102 | wchar_t ch; |
||
103 | |||
104 | int b0_bits; /* Data bits in first byte. */ |
||
105 | int cbytes; /* Number of continuation bytes. */ |
||
106 | |||
4200 | svoboda | 107 | if (*offset + 1 > sz) |
4179 | decky | 108 | return invalch; |
4196 | svoboda | 109 | |
4200 | svoboda | 110 | b0 = (uint8_t) str[(*offset)++]; |
4196 | svoboda | 111 | |
112 | /* Determine code length. */ |
||
113 | |||
114 | if ((b0 & 0x80) == 0) { |
||
115 | /* 0xxxxxxx (Plain ASCII) */ |
||
116 | b0_bits = 7; |
||
117 | cbytes = 0; |
||
118 | } else if ((b0 & 0xe0) == 0xc0) { |
||
119 | /* 110xxxxx 10xxxxxx */ |
||
120 | b0_bits = 5; |
||
121 | cbytes = 1; |
||
122 | } else if ((b0 & 0xf0) == 0xe0) { |
||
123 | /* 1110xxxx 10xxxxxx 10xxxxxx */ |
||
124 | b0_bits = 4; |
||
125 | cbytes = 2; |
||
126 | } else if ((b0 & 0xf8) == 0xf0) { |
||
127 | /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ |
||
128 | b0_bits = 3; |
||
129 | cbytes = 3; |
||
130 | } else { |
||
131 | /* 10xxxxxx -- unexpected continuation byte. */ |
||
132 | return invalch; |
||
4179 | decky | 133 | } |
4196 | svoboda | 134 | |
4200 | svoboda | 135 | if (*offset + cbytes > sz) { |
4196 | svoboda | 136 | return invalch; |
4179 | decky | 137 | } |
4196 | svoboda | 138 | |
139 | ch = b0 & LO_MASK_8(b0_bits); |
||
140 | |||
141 | /* Decode continuation bytes. */ |
||
142 | while (cbytes > 0) { |
||
4200 | svoboda | 143 | b = (uint8_t) str[(*offset)++]; |
4196 | svoboda | 144 | |
145 | /* Must be 10xxxxxx. */ |
||
146 | if ((b & 0xc0) != 0x80) { |
||
4179 | decky | 147 | return invalch; |
4196 | svoboda | 148 | } |
149 | |||
150 | /* Shift data bits to ch. */ |
||
151 | ch = (ch << CONT_BITS) | (wchar_t) (b & LO_MASK_8(CONT_BITS)); |
||
152 | --cbytes; |
||
4179 | decky | 153 | } |
4196 | svoboda | 154 | |
155 | return ch; |
||
4175 | decky | 156 | } |
157 | |||
4200 | svoboda | 158 | /** Encode a single character to string representation. |
4011 | svoboda | 159 | * |
4200 | svoboda | 160 | * Encode a single character to string representation (i.e. UTF-8) and store |
161 | * it into a buffer at @a offset. Encoding starts at @a offset and this offset |
||
162 | * is moved to the position where the next character can be written to. |
||
4011 | svoboda | 163 | * |
4200 | svoboda | 164 | * @param ch Input character. |
165 | * @param str Output buffer. |
||
166 | * @param offset Offset (in bytes) where to start writing. |
||
167 | * @param sz Size of the output buffer. |
||
4179 | decky | 168 | * |
4208 | svoboda | 169 | * @return EOK if the character was encoded successfully, EOVERFLOW if there |
170 | * was not enough space in the output buffer or EINVAL if the character |
||
171 | * code was invalid. |
||
4179 | decky | 172 | */ |
4209 | svoboda | 173 | int chr_encode(wchar_t ch, char *str, size_t *offset, size_t sz) |
4179 | decky | 174 | { |
4198 | svoboda | 175 | uint32_t cc; /* Unsigned version of ch. */ |
176 | |||
177 | int cbytes; /* Number of continuation bytes. */ |
||
178 | int b0_bits; /* Number of data bits in first byte. */ |
||
179 | int i; |
||
180 | |||
4200 | svoboda | 181 | if (*offset >= sz) |
4208 | svoboda | 182 | return EOVERFLOW; |
4198 | svoboda | 183 | |
184 | if (ch < 0) |
||
4208 | svoboda | 185 | return EINVAL; |
4198 | svoboda | 186 | |
187 | /* Bit operations should only be done on unsigned numbers. */ |
||
188 | cc = (uint32_t) ch; |
||
189 | |||
190 | /* Determine how many continuation bytes are needed. */ |
||
191 | if ((cc & ~LO_MASK_32(7)) == 0) { |
||
192 | b0_bits = 7; |
||
193 | cbytes = 0; |
||
194 | } else if ((cc & ~LO_MASK_32(11)) == 0) { |
||
195 | b0_bits = 5; |
||
196 | cbytes = 1; |
||
197 | } else if ((cc & ~LO_MASK_32(16)) == 0) { |
||
198 | b0_bits = 4; |
||
199 | cbytes = 2; |
||
200 | } else if ((cc & ~LO_MASK_32(21)) == 0) { |
||
201 | b0_bits = 3; |
||
202 | cbytes = 3; |
||
203 | } else { |
||
204 | /* Codes longer than 21 bits are not supported. */ |
||
4208 | svoboda | 205 | return EINVAL; |
4179 | decky | 206 | } |
4198 | svoboda | 207 | |
208 | /* Check for available space in buffer. */ |
||
4200 | svoboda | 209 | if (*offset + cbytes >= sz) |
4208 | svoboda | 210 | return EOVERFLOW; |
4198 | svoboda | 211 | |
212 | /* Encode continuation bytes. */ |
||
213 | for (i = cbytes; i > 0; --i) { |
||
4200 | svoboda | 214 | str[*offset + i] = 0x80 | (cc & LO_MASK_32(CONT_BITS)); |
4198 | svoboda | 215 | cc = cc >> CONT_BITS; |
4179 | decky | 216 | } |
4198 | svoboda | 217 | |
218 | /* Encode first byte. */ |
||
4200 | svoboda | 219 | str[*offset] = (cc & LO_MASK_32(b0_bits)) | HI_MASK_8(8 - b0_bits - 1); |
4198 | svoboda | 220 | |
4200 | svoboda | 221 | /* Advance offset. */ |
222 | *offset += (1 + cbytes); |
||
4179 | decky | 223 | |
4208 | svoboda | 224 | return EOK; |
4179 | decky | 225 | } |
226 | |||
4209 | svoboda | 227 | /** Get display width of character. |
228 | * |
||
229 | * @param ch The character. |
||
230 | * @return Character width in display cells. |
||
231 | */ |
||
232 | count_t chr_width(wchar_t ch) |
||
233 | { |
||
234 | return 1; |
||
235 | } |
||
236 | |||
4205 | svoboda | 237 | /** Get size of string, with length limit. |
4179 | decky | 238 | * |
4205 | svoboda | 239 | * Get the number of bytes which are used by up to @a max_len first |
240 | * characters in the string @a str. If @a max_len is greater than |
||
241 | * the length of @a str, the entire string is measured. |
||
4179 | decky | 242 | * |
4209 | svoboda | 243 | * @param str String to consider. |
244 | * @param count Maximum number of characters to measure. |
||
4179 | decky | 245 | * |
4209 | svoboda | 246 | * @return Number of bytes used by the characters. |
4179 | decky | 247 | */ |
4205 | svoboda | 248 | size_t str_lsize(const char *str, count_t max_len) |
4179 | decky | 249 | { |
4205 | svoboda | 250 | count_t len = 0; |
251 | size_t cur = 0; |
||
252 | size_t prev; |
||
4199 | svoboda | 253 | wchar_t ch; |
4205 | svoboda | 254 | |
4199 | svoboda | 255 | while (true) { |
4205 | svoboda | 256 | prev = cur; |
257 | if (len >= max_len) |
||
4199 | svoboda | 258 | break; |
4205 | svoboda | 259 | ch = chr_decode(str, &cur, UTF8_NO_LIMIT); |
4199 | svoboda | 260 | if (ch == '\0') break; |
261 | |||
4205 | svoboda | 262 | len++; |
4179 | decky | 263 | } |
4205 | svoboda | 264 | |
265 | return prev; |
||
4179 | decky | 266 | } |
267 | |||
4209 | svoboda | 268 | /** Get size of string, with width limit. |
269 | * |
||
270 | * Get the number of bytes which are used by the longest prefix of @a str |
||
271 | * that can fit into @a max_width display cells. |
||
272 | * |
||
273 | * @param str String to consider. |
||
274 | * @param count Maximum number of display cells. |
||
275 | * |
||
276 | * @return Number of bytes used by the characters that fit. |
||
277 | */ |
||
278 | size_t str_wsize(const char *str, count_t max_width) |
||
279 | { |
||
280 | count_t width = 0; |
||
281 | size_t cur = 0; |
||
282 | size_t prev; |
||
283 | wchar_t ch; |
||
284 | |||
285 | while (true) { |
||
286 | prev = cur; |
||
287 | if (width >= max_width) |
||
288 | break; |
||
289 | ch = chr_decode(str, &cur, UTF8_NO_LIMIT); |
||
290 | if (ch == '\0') break; |
||
291 | |||
292 | width += chr_width(ch); |
||
293 | } |
||
294 | |||
295 | return prev; |
||
296 | } |
||
297 | |||
298 | |||
299 | /** Get length of wide string, with width limit. |
||
300 | * |
||
301 | * Get the number of characters in a wide string that can fit into @a max_width |
||
302 | * display cells. |
||
303 | * |
||
304 | * @param wstr Wide string to consider. |
||
305 | * @param count Maximum number of display cells. |
||
306 | * |
||
307 | * @return Number of bytes used by the characters that fit. |
||
308 | */ |
||
309 | count_t wstr_wlength(const wchar_t *wstr, count_t max_width) |
||
310 | { |
||
311 | count_t width = 0; |
||
312 | index_t cur = 0; |
||
313 | |||
314 | while (true) { |
||
315 | if (width >= max_width) |
||
316 | break; |
||
317 | if (wstr[cur] == '\0') break; |
||
318 | |||
319 | width += chr_width(wstr[cur]); |
||
320 | ++cur; |
||
321 | } |
||
322 | |||
323 | return (count_t) cur; |
||
324 | } |
||
325 | |||
/** Check whether character is plain ASCII.
 *
 * @param ch Character to test.
 *
 * @return True if @a ch lies in the range 0..127, false otherwise.
 */
bool ascii_check(const wchar_t ch)
{
	return (ch >= 0) && (ch <= 127);
}
||
338 | |||
/** Check whether character is Unicode.
 *
 * @param ch Character to test.
 *
 * @return True if @a ch lies in the range 0..0x10ffff (1114111), false
 *         otherwise.
 */
bool unicode_check(const wchar_t ch)
{
	return (ch >= 0) && (ch <= 0x10ffff);
}
||
350 | |||
/** Return number of bytes the string occupies.
 *
 * @param str A null-terminated string.
 *
 * @return Number of bytes in @a str excluding the null terminator.
 */
size_t str_size(const char *str)
{
	const char *end = str;

	/* Walk to the terminator; the distance travelled is the size. */
	while (*end != '\0')
		end++;

	return (size_t) (end - str);
}
||
366 | |||
4205 | svoboda | 367 | /** Return number of characters in a string. |
4179 | decky | 368 | * |
4205 | svoboda | 369 | * @param str NULL-terminated string. |
370 | * @return Number of characters in string. |
||
4179 | decky | 371 | */ |
4205 | svoboda | 372 | count_t str_length(const char *str) |
4179 | decky | 373 | { |
4205 | svoboda | 374 | count_t len = 0; |
375 | size_t offset = 0; |
||
376 | |||
377 | while (chr_decode(str, &offset, UTF8_NO_LIMIT) != 0) { |
||
378 | len++; |
||
4179 | decky | 379 | } |
4205 | svoboda | 380 | |
381 | return len; |
||
4011 | svoboda | 382 | } |
383 | |||
4205 | svoboda | 384 | /** Return number of characters in a wide string. |
4179 | decky | 385 | * |
4209 | svoboda | 386 | * @param str NULL-terminated wide string. |
387 | * @return Number of characters in @a str. |
||
4179 | decky | 388 | */ |
4205 | svoboda | 389 | count_t wstr_length(const wchar_t *wstr) |
4179 | decky | 390 | { |
4205 | svoboda | 391 | count_t len; |
392 | |||
393 | len = 0; |
||
394 | while (*wstr++ != '\0') |
||
395 | ++len; |
||
396 | |||
397 | return len; |
||
4179 | decky | 398 | } |
399 | |||
/** Compare two NULL terminated strings
 *
 * Do a byte-by-byte comparison of two NULL terminated strings. The strings
 * are equal only if they have the same size and consist of the same bytes;
 * a string that is a proper prefix of the other one is the smaller one.
 *
 * Bytes are compared as unsigned values, so the ordering does not depend on
 * whether plain char is signed on the target, and UTF-8 strings are ordered
 * by their byte values.
 *
 * @param src First string to compare.
 * @param dst Second string to compare.
 *
 * @return 0 if the strings are equal, -1 if first is smaller, 1 if second smaller.
 *
 */
int strcmp(const char *src, const char *dst)
{
	const unsigned char *a = (const unsigned char *) src;
	const unsigned char *b = (const unsigned char *) dst;

	/* Skip the common prefix; stop on the first difference or at the end. */
	while ((*a != '\0') && (*a == *b)) {
		a++;
		b++;
	}

	/* A terminator is smaller than any other byte, so a prefix sorts first. */
	if (*a < *b)
		return -1;

	if (*a > *b)
		return 1;

	return 0;
}
||
428 | |||
429 | |||
/** Compare two NULL terminated strings
 *
 * Do a byte-by-byte comparison of at most @a len leading bytes of two NULL
 * terminated strings. The strings are considered equal if no difference is
 * found within the first @a len bytes or before both strings end. No byte
 * at index @a len or beyond is ever read.
 *
 * Bytes are compared as unsigned values, so the ordering does not depend on
 * whether plain char is signed on the target.
 *
 * @param src First string to compare.
 * @param dst Second string to compare.
 * @param len Maximal length for comparison.
 *
 * @return 0 if the strings are equal, -1 if first is smaller, 1 if second smaller.
 */
int strncmp(const char *src, const char *dst, size_t len)
{
	const unsigned char *a = (const unsigned char *) src;
	const unsigned char *b = (const unsigned char *) dst;
	size_t i;

	for (i = 0; i < len; i++) {
		/* A terminator is smaller than any other byte. */
		if (a[i] < b[i])
			return -1;

		if (a[i] > b[i])
			return 1;

		/* Bytes are equal here; both strings ended together. */
		if (a[i] == '\0')
			return 0;
	}

	/* No difference within the first len bytes. */
	return 0;
}
||
463 | |||
464 | |||
465 | |||
/** Copy NULL terminated string.
 *
 * Copy string 'src' into the buffer 'dest' of size 'len'. At most
 * 'len' - 1 bytes are copied and the result is always null-terminated,
 * i.e. a source string that does not fit is truncated. If 'len' is zero,
 * nothing is written at all.
 *
 * Unlike the standard C function of the same name, this function returns
 * nothing and does not pad the rest of the buffer with zeros.
 *
 * @param src  Source string.
 * @param dest Destination buffer.
 * @param len  Size of destination buffer.
 */
void strncpy(char *dest, const char *src, size_t len)
{
	size_t i;

	/* A zero-sized buffer cannot hold even the terminator. */
	if (len == 0)
		return;

	/* Copy up to len - 1 bytes, finishing early at the terminator. */
	for (i = 0; i < len - 1; i++) {
		dest[i] = src[i];
		if (src[i] == '\0')
			return;
	}

	/* The source did not fit; terminate the truncated copy. */
	dest[len - 1] = '\0';
}
||
487 | |||
/** Find first occurrence of character in string.
 *
 * The character is converted to char before the comparison, so a byte
 * value of 0x80 or above passed as a positive int is found regardless of
 * whether plain char is signed on the target.
 *
 * The null terminator is not considered part of the string: searching for
 * '\0' yields NULL.
 *
 * @param s String to search.
 * @param i Character to look for.
 *
 * @return Pointer to character in @a s or NULL if not found.
 */
char *strchr(const char *s, int i)
{
	const char wanted = (char) i;

	for (; *s != '\0'; s++) {
		if (*s == wanted)
			return (char *) s;
	}

	return NULL;
}
505 | |||
4011 | svoboda | 506 | /** @} |
507 | */ |