Subversion Repositories HelenOS

Rev

Rev 4153 | Rev 4327 | Go to most recent revision | Show entire file | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 4153 Rev 4263
Line 36... Line 36...
36
#include <string.h>
36
#include <string.h>
37
#include <stdlib.h>
37
#include <stdlib.h>
38
#include <limits.h>
38
#include <limits.h>
39
#include <ctype.h>
39
#include <ctype.h>
40
#include <malloc.h>
40
#include <malloc.h>
-
 
41
#include <errno.h>
-
 
42
#include <align.h>
-
 
43
#include <string.h>
-
 
44
 
-
 
45
/** Byte mask consisting of lowest @n bits (out of 8).
 *
 * The shift is done on an unsigned operand and the result is narrowed
 * to uint8_t, so the value is always in the range 0..255.
 */
#define LO_MASK_8(n)  ((uint8_t) ((1U << (n)) - 1))

/** Mask consisting of lowest @n bits (out of 32).
 *
 * The shifted operand is a uint32_t so that no signed overflow can
 * occur; @n must be less than 32.
 */
#define LO_MASK_32(n)  ((uint32_t) ((((uint32_t) 1) << (n)) - 1))

/** Byte mask consisting of highest @n bits (out of 8).
 *
 * The complement is narrowed back to uint8_t; without the cast the
 * integer promotion performed by '~' would produce a negative int.
 */
#define HI_MASK_8(n)  ((uint8_t) ~LO_MASK_8(8 - (n)))

/** Number of data bits in a UTF-8 continuation byte */
#define CONT_BITS  6
-
 
56
 
-
 
57
/** Decode a single character from a string.
-
 
58
 *
-
 
59
 * Decode a single character from a string of size @a size. Decoding starts
-
 
60
 * at @a offset and this offset is moved to the beginning of the next
-
 
61
 * character. In case of decoding error, offset generally advances at least
-
 
62
 * by one. However, offset is never moved beyond size.
-
 
63
 *
-
 
64
 * @param str    String (not necessarily NULL-terminated).
-
 
65
 * @param offset Byte offset in string where to start decoding.
-
 
66
 * @param size   Size of the string (in bytes).
-
 
67
 *
-
 
68
 * @return Value of decoded character, U_SPECIAL on decoding error or
-
 
69
 *         NULL if attempt to decode beyond @a size.
-
 
70
 *
-
 
71
 */
-
 
72
wchar_t str_decode(const char *str, size_t *offset, size_t size)
-
 
73
{
-
 
74
    if (*offset + 1 > size)
-
 
75
        return 0;
-
 
76
   
-
 
77
    /* First byte read from string */
-
 
78
    uint8_t b0 = (uint8_t) str[(*offset)++];
-
 
79
   
-
 
80
    /* Determine code length */
-
 
81
   
-
 
82
    unsigned int b0_bits;  /* Data bits in first byte */
-
 
83
    unsigned int cbytes;   /* Number of continuation bytes */
-
 
84
   
-
 
85
    if ((b0 & 0x80) == 0) {
-
 
86
        /* 0xxxxxxx (Plain ASCII) */
-
 
87
        b0_bits = 7;
-
 
88
        cbytes = 0;
-
 
89
    } else if ((b0 & 0xe0) == 0xc0) {
-
 
90
        /* 110xxxxx 10xxxxxx */
-
 
91
        b0_bits = 5;
-
 
92
        cbytes = 1;
-
 
93
    } else if ((b0 & 0xf0) == 0xe0) {
-
 
94
        /* 1110xxxx 10xxxxxx 10xxxxxx */
-
 
95
        b0_bits = 4;
-
 
96
        cbytes = 2;
-
 
97
    } else if ((b0 & 0xf8) == 0xf0) {
-
 
98
        /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
-
 
99
        b0_bits = 3;
-
 
100
        cbytes = 3;
-
 
101
    } else {
-
 
102
        /* 10xxxxxx -- unexpected continuation byte */
-
 
103
        return U_SPECIAL;
-
 
104
    }
-
 
105
   
-
 
106
    if (*offset + cbytes > size)
-
 
107
        return U_SPECIAL;
-
 
108
   
-
 
109
    wchar_t ch = b0 & LO_MASK_8(b0_bits);
-
 
110
   
-
 
111
    /* Decode continuation bytes */
-
 
112
    while (cbytes > 0) {
-
 
113
        uint8_t b = (uint8_t) str[(*offset)++];
-
 
114
       
-
 
115
        /* Must be 10xxxxxx */
-
 
116
        if ((b & 0xc0) != 0x80)
-
 
117
            return U_SPECIAL;
-
 
118
       
-
 
119
        /* Shift data bits to ch */
-
 
120
        ch = (ch << CONT_BITS) | (wchar_t) (b & LO_MASK_8(CONT_BITS));
-
 
121
        cbytes--;
-
 
122
    }
-
 
123
   
-
 
124
    return ch;
-
 
125
}
-
 
126
 
-
 
127
/** Encode a single character to string representation.
-
 
128
 *
-
 
129
 * Encode a single character to string representation (i.e. UTF-8) and store
-
 
130
 * it into a buffer at @a offset. Encoding starts at @a offset and this offset
-
 
131
 * is moved to the position where the next character can be written to.
-
 
132
 *
-
 
133
 * @param ch     Input character.
-
 
134
 * @param str    Output buffer.
-
 
135
 * @param offset Byte offset where to start writing.
-
 
136
 * @param size   Size of the output buffer (in bytes).
-
 
137
 *
-
 
138
 * @return EOK if the character was encoded successfully, EOVERFLOW if there
-
 
139
 *     was not enough space in the output buffer or EINVAL if the character
-
 
140
 *     code was invalid.
-
 
141
 */
-
 
142
int chr_encode(const wchar_t ch, char *str, size_t *offset, size_t size)
-
 
143
{
-
 
144
    if (*offset >= size)
-
 
145
        return EOVERFLOW;
-
 
146
   
-
 
147
    if (!chr_check(ch))
-
 
148
        return EINVAL;
-
 
149
   
-
 
150
    /* Unsigned version of ch (bit operations should only be done
-
 
151
       on unsigned types). */
-
 
152
    uint32_t cc = (uint32_t) ch;
-
 
153
   
-
 
154
    /* Determine how many continuation bytes are needed */
-
 
155
   
-
 
156
    unsigned int b0_bits;  /* Data bits in first byte */
-
 
157
    unsigned int cbytes;   /* Number of continuation bytes */
-
 
158
   
-
 
159
    if ((cc & ~LO_MASK_32(7)) == 0) {
-
 
160
        b0_bits = 7;
-
 
161
        cbytes = 0;
-
 
162
    } else if ((cc & ~LO_MASK_32(11)) == 0) {
-
 
163
        b0_bits = 5;
-
 
164
        cbytes = 1;
-
 
165
    } else if ((cc & ~LO_MASK_32(16)) == 0) {
-
 
166
        b0_bits = 4;
-
 
167
        cbytes = 2;
-
 
168
    } else if ((cc & ~LO_MASK_32(21)) == 0) {
-
 
169
        b0_bits = 3;
-
 
170
        cbytes = 3;
-
 
171
    } else {
-
 
172
        /* Codes longer than 21 bits are not supported */
-
 
173
        return EINVAL;
-
 
174
    }
-
 
175
   
-
 
176
    /* Check for available space in buffer */
-
 
177
    if (*offset + cbytes >= size)
-
 
178
        return EOVERFLOW;
-
 
179
   
-
 
180
    /* Encode continuation bytes */
-
 
181
    unsigned int i;
-
 
182
    for (i = cbytes; i > 0; i--) {
-
 
183
        str[*offset + i] = 0x80 | (cc & LO_MASK_32(CONT_BITS));
-
 
184
        cc = cc >> CONT_BITS;
-
 
185
    }
-
 
186
   
-
 
187
    /* Encode first byte */
-
 
188
    str[*offset] = (cc & LO_MASK_32(b0_bits)) | HI_MASK_8(8 - b0_bits - 1);
-
 
189
   
-
 
190
    /* Advance offset */
-
 
191
    *offset += cbytes + 1;
-
 
192
   
-
 
193
    return EOK;
-
 
194
}
-
 
195
 
-
 
196
/** Get size of string.
 *
 * Count the bytes occupied by the string @a str, not counting the
 * terminating zero byte.
 *
 * @param str String to consider.
 *
 * @return Number of bytes used by the string
 *
 */
size_t str_size(const char *str)
{
    const char *end = str;

    /* Walk forward to the terminating zero byte */
    while (*end != 0)
        end++;

    return (size_t) (end - str);
}
-
 
215
 
-
 
216
/** Get size of wide string.
-
 
217
 *
-
 
218
 * Get the number of bytes which are used by the wide string @a str (excluding the
-
 
219
 * NULL-terminator).
-
 
220
 *
-
 
221
 * @param str Wide string to consider.
-
 
222
 *
-
 
223
 * @return Number of bytes used by the wide string
-
 
224
 *
-
 
225
 */
-
 
226
size_t wstr_size(const wchar_t *str)
-
 
227
{
-
 
228
    return (wstr_length(str) * sizeof(wchar_t));
-
 
229
}
-
 
230
 
-
 
231
/** Get size of string with length limit.
-
 
232
 *
-
 
233
 * Get the number of bytes which are used by up to @a max_len first
-
 
234
 * characters in the string @a str. If @a max_len is greater than
-
 
235
 * the length of @a str, the entire string is measured (excluding the
-
 
236
 * NULL-terminator).
-
 
237
 *
-
 
238
 * @param str     String to consider.
-
 
239
 * @param max_len Maximum number of characters to measure.
-
 
240
 *
-
 
241
 * @return Number of bytes used by the characters.
-
 
242
 *
-
 
243
 */
-
 
244
size_t str_lsize(const char *str, count_t max_len)
-
 
245
{
-
 
246
    count_t len = 0;
-
 
247
    size_t offset = 0;
-
 
248
   
-
 
249
    while (len < max_len) {
-
 
250
        if (str_decode(str, &offset, STR_NO_LIMIT) == 0)
-
 
251
            break;
-
 
252
       
-
 
253
        len++;
-
 
254
    }
-
 
255
   
-
 
256
    return offset;
-
 
257
}
-
 
258
 
-
 
259
/** Get size of wide string with length limit.
-
 
260
 *
-
 
261
 * Get the number of bytes which are used by up to @a max_len first
-
 
262
 * wide characters in the wide string @a str. If @a max_len is greater than
-
 
263
 * the length of @a str, the entire wide string is measured (excluding the
-
 
264
 * NULL-terminator).
-
 
265
 *
-
 
266
 * @param str     Wide string to consider.
-
 
267
 * @param max_len Maximum number of wide characters to measure.
-
 
268
 *
-
 
269
 * @return Number of bytes used by the wide characters.
-
 
270
 *
-
 
271
 */
-
 
272
size_t wstr_lsize(const wchar_t *str, count_t max_len)
-
 
273
{
-
 
274
    return (wstr_nlength(str, max_len * sizeof(wchar_t)) * sizeof(wchar_t));
-
 
275
}
-
 
276
 
-
 
277
/** Get number of characters in a string.
-
 
278
 *
-
 
279
 * @param str NULL-terminated string.
-
 
280
 *
-
 
281
 * @return Number of characters in string.
-
 
282
 *
-
 
283
 */
-
 
284
count_t str_length(const char *str)
-
 
285
{
-
 
286
    count_t len = 0;
-
 
287
    size_t offset = 0;
-
 
288
   
-
 
289
    while (str_decode(str, &offset, STR_NO_LIMIT) != 0)
-
 
290
        len++;
-
 
291
   
-
 
292
    return len;
-
 
293
}
-
 
294
 
-
 
295
/** Get number of characters in a wide string.
-
 
296
 *
-
 
297
 * @param str NULL-terminated wide string.
-
 
298
 *
-
 
299
 * @return Number of characters in @a str.
-
 
300
 *
-
 
301
 */
-
 
302
count_t wstr_length(const wchar_t *wstr)
-
 
303
{
-
 
304
    count_t len = 0;
-
 
305
   
-
 
306
    while (*wstr++ != 0)
-
 
307
        len++;
-
 
308
   
-
 
309
    return len;
-
 
310
}
-
 
311
 
-
 
312
/** Get number of characters in a string with size limit.
-
 
313
 *
-
 
314
 * @param str  NULL-terminated string.
-
 
315
 * @param size Maximum number of bytes to consider.
-
 
316
 *
-
 
317
 * @return Number of characters in string.
-
 
318
 *
-
 
319
 */
-
 
320
count_t str_nlength(const char *str, size_t size)
-
 
321
{
-
 
322
    count_t len = 0;
-
 
323
    size_t offset = 0;
-
 
324
   
-
 
325
    while (str_decode(str, &offset, size) != 0)
-
 
326
        len++;
-
 
327
   
-
 
328
    return len;
-
 
329
}
-
 
330
 
-
 
331
/** Get number of characters in a string with size limit.
-
 
332
 *
-
 
333
 * @param str  NULL-terminated string.
-
 
334
 * @param size Maximum number of bytes to consider.
-
 
335
 *
-
 
336
 * @return Number of characters in string.
-
 
337
 *
-
 
338
 */
-
 
339
count_t wstr_nlength(const wchar_t *str, size_t size)
-
 
340
{
-
 
341
    count_t len = 0;
-
 
342
    count_t limit = ALIGN_DOWN(size, sizeof(wchar_t));
-
 
343
    count_t offset = 0;
-
 
344
   
-
 
345
    while ((offset < limit) && (*str++ != 0)) {
-
 
346
        len++;
-
 
347
        offset += sizeof(wchar_t);
-
 
348
    }
-
 
349
   
-
 
350
    return len;
-
 
351
}
-
 
352
 
-
 
353
/** Check whether character is plain ASCII.
 *
 * @param ch Character to check.
 *
 * @return True if character is plain ASCII, i.e. lies in
 *         the range 0..127.
 *
 */
bool ascii_check(wchar_t ch)
{
    return (ch >= 0) && (ch <= 127);
}
-
 
365
 
-
 
366
/** Check whether character is valid
 *
 * @param ch Character to check.
 *
 * @return True if character lies in the Unicode code point
 *         range 0..0x10ffff.
 *
 */
bool chr_check(wchar_t ch)
{
    return (ch >= 0) && (ch <= 0x10ffff);
}
-
 
378
 
-
 
379
/** Compare two NULL terminated strings.
-
 
380
 *
-
 
381
 * Do a char-by-char comparison of two NULL-terminated strings.
-
 
382
 * The strings are considered equal iff they consist of the same
-
 
383
 * characters on the minimum of their lengths.
-
 
384
 *
-
 
385
 * @param s1 First string to compare.
-
 
386
 * @param s2 Second string to compare.
-
 
387
 *
-
 
388
 * @return 0 if the strings are equal, -1 if first is smaller,
-
 
389
 *         1 if second smaller.
-
 
390
 *
-
 
391
 */
-
 
392
int str_cmp(const char *s1, const char *s2)
-
 
393
{
-
 
394
    wchar_t c1 = 0;
-
 
395
    wchar_t c2 = 0;
-
 
396
   
-
 
397
    size_t off1 = 0;
-
 
398
    size_t off2 = 0;
-
 
399
 
-
 
400
    while (true) {
-
 
401
        c1 = str_decode(s1, &off1, STR_NO_LIMIT);
-
 
402
        c2 = str_decode(s2, &off2, STR_NO_LIMIT);
-
 
403
 
-
 
404
        if (c1 < c2)
-
 
405
            return -1;
-
 
406
       
-
 
407
        if (c1 > c2)
-
 
408
            return 1;
-
 
409
 
-
 
410
        if (c1 == 0 || c2 == 0)
-
 
411
            break;     
-
 
412
    }
-
 
413
 
-
 
414
    return 0;
-
 
415
}
-
 
416
 
-
 
417
/** Compare two NULL terminated strings with length limit.
-
 
418
 *
-
 
419
 * Do a char-by-char comparison of two NULL-terminated strings.
-
 
420
 * The strings are considered equal iff they consist of the same
-
 
421
 * characters on the minimum of their lengths and the length limit.
-
 
422
 *
-
 
423
 * @param s1      First string to compare.
-
 
424
 * @param s2      Second string to compare.
-
 
425
 * @param max_len Maximum number of characters to consider.
-
 
426
 *
-
 
427
 * @return 0 if the strings are equal, -1 if first is smaller,
-
 
428
 *         1 if second smaller.
-
 
429
 *
-
 
430
 */
-
 
431
int str_lcmp(const char *s1, const char *s2, count_t max_len)
-
 
432
{
-
 
433
    wchar_t c1 = 0;
-
 
434
    wchar_t c2 = 0;
-
 
435
   
-
 
436
    size_t off1 = 0;
-
 
437
    size_t off2 = 0;
-
 
438
   
-
 
439
    count_t len = 0;
-
 
440
 
-
 
441
    while (true) {
-
 
442
        if (len >= max_len)
-
 
443
            break;
-
 
444
 
-
 
445
        c1 = str_decode(s1, &off1, STR_NO_LIMIT);
-
 
446
        c2 = str_decode(s2, &off2, STR_NO_LIMIT);
-
 
447
 
-
 
448
        if (c1 < c2)
-
 
449
            return -1;
-
 
450
 
-
 
451
        if (c1 > c2)
-
 
452
            return 1;
-
 
453
 
-
 
454
        if (c1 == 0 || c2 == 0)
-
 
455
            break;
-
 
456
 
-
 
457
        ++len; 
-
 
458
    }
-
 
459
 
-
 
460
    return 0;
-
 
461
 
-
 
462
}
-
 
463
 
-
 
464
/** Copy NULL-terminated string.
-
 
465
 *
-
 
466
 * Copy source string @a src to destination buffer @a dst.
-
 
467
 * No more than @a size bytes are written. NULL-terminator is always
-
 
468
 * written after the last succesfully copied character (i.e. if the
-
 
469
 * destination buffer is has at least 1 byte, it will be always
-
 
470
 * NULL-terminated).
-
 
471
 *
-
 
472
 * @param src   Source string.
-
 
473
 * @param dst   Destination buffer.
-
 
474
 * @param count Size of the destination buffer.
-
 
475
 *
-
 
476
 */
-
 
477
void str_ncpy(char *dst, const char *src, size_t size)
-
 
478
{
-
 
479
    /* No space for the NULL-terminator in the buffer */
-
 
480
    if (size == 0)
-
 
481
        return;
-
 
482
   
-
 
483
    wchar_t ch;
-
 
484
    size_t str_off = 0;
-
 
485
    size_t dst_off = 0;
-
 
486
   
-
 
487
    while ((ch = str_decode(src, &str_off, STR_NO_LIMIT)) != 0) {
-
 
488
        if (chr_encode(ch, dst, &dst_off, size) != EOK)
-
 
489
            break;
-
 
490
    }
-
 
491
   
-
 
492
    if (dst_off >= size)
-
 
493
        dst[size - 1] = 0;
-
 
494
    else
-
 
495
        dst[dst_off] = 0;
-
 
496
}
-
 
497
 
-
 
498
/** Copy NULL-terminated wide string to string
-
 
499
 *
-
 
500
 * Copy source wide string @a src to destination buffer @a dst.
-
 
501
 * No more than @a size bytes are written. NULL-terminator is always
-
 
502
 * written after the last succesfully copied character (i.e. if the
-
 
503
 * destination buffer is has at least 1 byte, it will be always
-
 
504
 * NULL-terminated).
-
 
505
 *
-
 
506
 * @param src   Source wide string.
-
 
507
 * @param dst   Destination buffer.
-
 
508
 * @param count Size of the destination buffer.
-
 
509
 *
-
 
510
 */
-
 
511
void wstr_nstr(char *dst, const wchar_t *src, size_t size)
-
 
512
{
-
 
513
    /* No space for the NULL-terminator in the buffer */
-
 
514
    if (size == 0)
-
 
515
        return;
-
 
516
   
-
 
517
    wchar_t ch;
-
 
518
    count_t src_idx = 0;
-
 
519
    size_t dst_off = 0;
-
 
520
   
-
 
521
    while ((ch = src[src_idx++]) != 0) {
-
 
522
        if (chr_encode(ch, dst, &dst_off, size) != EOK)
-
 
523
            break;
-
 
524
    }
-
 
525
   
-
 
526
    if (dst_off >= size)
-
 
527
        dst[size - 1] = 0;
-
 
528
    else
-
 
529
        dst[dst_off] = 0;
-
 
530
}
-
 
531
 
-
 
532
/** Find first occurence of character in string.
-
 
533
 *
-
 
534
 * @param str String to search.
-
 
535
 * @param ch  Character to look for.
-
 
536
 *
-
 
537
 * @return Pointer to character in @a str or NULL if not found.
-
 
538
 *
-
 
539
 */
-
 
540
const char *str_chr(const char *str, wchar_t ch)
-
 
541
{
-
 
542
    wchar_t acc;
-
 
543
    size_t off = 0;
-
 
544
   
-
 
545
    while ((acc = str_decode(str, &off, STR_NO_LIMIT)) != 0) {
-
 
546
        if (acc == ch)
-
 
547
            return (str + off);
-
 
548
    }
-
 
549
   
-
 
550
    return NULL;
-
 
551
}
-
 
552
 
-
 
553
/** Insert a wide character into a wide string.
-
 
554
 *
-
 
555
 * Insert a wide character into a wide string at position
-
 
556
 * @a pos. The characters after the position are shifted.
-
 
557
 *
-
 
558
 * @param str     String to insert to.
-
 
559
 * @param ch      Character to insert to.
-
 
560
 * @param pos     Character index where to insert.
-
 
561
 @ @param max_pos Characters in the buffer.
-
 
562
 *
-
 
563
 * @return True if the insertion was sucessful, false if the position
-
 
564
 *         is out of bounds.
-
 
565
 *
-
 
566
 */
-
 
567
bool wstr_linsert(wchar_t *str, wchar_t ch, count_t pos, count_t max_pos)
-
 
568
{
-
 
569
    count_t len = wstr_length(str);
-
 
570
   
-
 
571
    if ((pos > len) || (pos + 1 > max_pos))
-
 
572
        return false;
-
 
573
   
-
 
574
    count_t i;
-
 
575
    for (i = len; i + 1 > pos; i--)
-
 
576
        str[i + 1] = str[i];
-
 
577
   
-
 
578
    str[pos] = ch;
-
 
579
   
-
 
580
    return true;
-
 
581
}
-
 
582
 
-
 
583
/** Remove a wide character from a wide string.
-
 
584
 *
-
 
585
 * Remove a wide character from a wide string at position
-
 
586
 * @a pos. The characters after the position are shifted.
-
 
587
 *
-
 
588
 * @param str String to remove from.
-
 
589
 * @param pos Character index to remove.
-
 
590
 *
-
 
591
 * @return True if the removal was sucessful, false if the position
-
 
592
 *         is out of bounds.
-
 
593
 *
-
 
594
 */
-
 
595
bool wstr_remove(wchar_t *str, count_t pos)
-
 
596
{
-
 
597
    count_t len = wstr_length(str);
-
 
598
   
-
 
599
    if (pos >= len)
-
 
600
        return false;
-
 
601
   
-
 
602
    count_t i;
-
 
603
    for (i = pos + 1; i <= len; i++)
-
 
604
        str[i - 1] = str[i];
-
 
605
   
-
 
606
    return true;
-
 
607
}
41
 
608
 
42
/** Count the number of characters in the string, not including terminating 0.
609
/** Count the number of characters in the string, not including terminating 0.
43
 *
610
 *
44
 * @param str       String.
611
 * @param str       String.
45
 * @return      Number of characters in string.
612
 * @return      Number of characters in string.