Subversion Repositories HelenOS

Rev

Rev 4179 | Rev 4198 | Go to most recent revision | Show entire file | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 4179 Rev 4196
Line 42... Line 42...
42
#include <arch.h>
42
#include <arch.h>
43
#include <console/kconsole.h>
43
#include <console/kconsole.h>
44
 
44
 
45
char invalch = '?';
45
char invalch = '?';
46
 
46
 
-
 
47
/** Byte mask consisting of bits 0 - (@n - 1) */
-
 
48
#define LO_MASK_8(n) ((uint8_t)((1 << (n)) - 1))
-
 
49
 
-
 
50
/** Number of data bits in a UTF-8 continuation byte. */
-
 
51
#define CONT_BITS 6
-
 
52
 
47
/** Decode a single UTF-8 character from a NULL-terminated string.
53
/** Decode a single UTF-8 character from a NULL-terminated string.
48
 *
54
 *
49
 * Decode a single UTF-8 character from a plain char NULL-terminated
55
 * Decode a single UTF-8 character from a plain char NULL-terminated
50
 * string. Decoding starts at @index and this index is incremented
56
 * string. Decoding starts at @index and this index is incremented
51
 * if the current UTF-8 string is encoded in more than a single byte.
57
 * if the current UTF-8 string is encoded in more than a single byte.
Line 58... Line 64...
58
 * @return Decoded character in UTF-32 or '?' if the encoding is wrong.
64
 * @return Decoded character in UTF-32 or '?' if the encoding is wrong.
59
 *
65
 *
60
 */
66
 */
61
wchar_t utf8_decode(const char *str, index_t *index, index_t limit)
67
wchar_t utf8_decode(const char *str, index_t *index, index_t limit)
62
{
68
{
63
    uint8_t c1;           /* First plain character from str */
69
    uint8_t b0, b;          /* Bytes read from str. */
64
    uint8_t c2;           /* Second plain character from str */
70
    wchar_t ch;
-
 
71
 
65
    uint8_t c3;           /* Third plain character from str */
72
    int b0_bits;        /* Data bits in first byte. */
66
    uint8_t c4;           /* Fourth plain character from str */
73
    int cbytes;     /* Number of continuation bytes. */
67
   
74
 
68
    if (*index > limit)
75
    if (*index > limit)
69
        return invalch;
76
        return invalch;
70
   
77
 
71
    c1 = (uint8_t) str[*index];
78
    b0 = (uint8_t) str[*index];
-
 
79
 
-
 
80
    /* Determine code length. */
72
   
81
 
73
    if ((c1 & 0x80) == 0) {
82
    if ((b0 & 0x80) == 0) {
74
        /* Plain ASCII (code points 0 .. 127) */
83
        /* 0xxxxxxx (Plain ASCII) */
75
        return (wchar_t) c1;
84
        b0_bits = 7;
76
    }
85
        cbytes = 0;
77
   
-
 
78
    if ((c1 & 0xe0) == 0xc0) {
86
    } else if ((b0 & 0xe0) == 0xc0) {
79
        /* Code points 128 .. 2047 */
87
        /* 110xxxxx 10xxxxxx */
80
        if (*index + 1 > limit)
88
        b0_bits = 5;
81
            return invalch;
89
        cbytes = 1;
-
 
90
    } else if ((b0 & 0xf0) == 0xe0) {
-
 
91
        /* 1110xxxx 10xxxxxx 10xxxxxx */
82
       
92
        b0_bits = 4;
83
        c2 = (uint8_t) str[*index + 1];
93
        cbytes = 2;
84
        if ((c2 & 0xc0) == 0x80) {
94
    } else if ((b0 & 0xf8) == 0xf0) {
-
 
95
        /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
85
            (*index)++;
96
        b0_bits = 3;
86
            return ((wchar_t) ((c1 & 0x1f) << 6) | (c2 & 0x3f));
97
        cbytes = 3;
87
        } else
98
    } else {
-
 
99
        /* 10xxxxxx -- unexpected continuation byte. */
88
            return invalch;
100
        return invalch;
89
    }
101
    }
90
   
102
 
91
    if ((c1 & 0xf0) == 0xe0) {
-
 
92
        /* Code points 2048 .. 65535 */
-
 
93
        if (*index + 2 > limit)
103
    if (*index + cbytes > limit) {
94
            return invalch;
-
 
95
       
-
 
96
        c2 = (uint8_t) str[*index + 1];
-
 
97
        if ((c2 & 0xc0) == 0x80) {
-
 
98
            (*index)++;
-
 
99
            c3 = (uint8_t) str[*index + 1];
-
 
100
            if ((c3 & 0xc0) == 0x80) {
-
 
101
                (*index)++;
-
 
102
                return ((wchar_t) ((c1 & 0x0f) << 12) | ((c2 & 0x3f) << 6) | (c3 & 0x3f));
-
 
103
            } else
-
 
104
                return invalch;
-
 
105
        } else
-
 
106
            return invalch;
104
        return invalch;
107
    }
105
    }
108
   
106
 
109
    if ((c1 & 0xf8) == 0xf0) {
107
    ch = b0 & LO_MASK_8(b0_bits);
110
        /* Code points 65536 .. 1114111 */
-
 
111
        if (*index + 3 > limit)
-
 
112
            return invalch;
-
 
113
       
108
 
114
        c2 = (uint8_t) str[*index + 1];
109
    /* Decode continuation bytes. */
115
        if ((c2 & 0xc0) == 0x80) {
110
    while (cbytes > 0) {
116
            (*index)++;
-
 
117
            c3 = (uint8_t) str[*index + 1];
111
        b = (uint8_t) str[*index + 1];
118
            if ((c3 & 0xc0) == 0x80) {
-
 
119
                (*index)++;
112
        ++(*index);
-
 
113
 
120
                c4 = (uint8_t) str[*index + 1];
114
        /* Must be 10xxxxxx. */
121
                if ((c4 & 0xc0) == 0x80) {
115
        if ((b & 0xc0) != 0x80) {
122
                    (*index)++;
-
 
123
                    return ((wchar_t) ((c1 & 0x07) << 18) | ((c2 & 0x3f) << 12) | ((c3 & 0x3f) << 6) | (c4 & 0x3f));
-
 
124
                } else
-
 
125
                    return invalch;
-
 
126
            } else
-
 
127
                return invalch;
-
 
128
        } else
-
 
129
            return invalch;
116
            return invalch;
-
 
117
        }
-
 
118
 
-
 
119
        /* Shift data bits to ch. */
-
 
120
        ch = (ch << CONT_BITS) | (wchar_t) (b & LO_MASK_8(CONT_BITS));
-
 
121
        --cbytes;
130
    }
122
    }
131
   
123
 
132
    return invalch;
124
    return ch;
133
}
125
}
134
 
126
 
135
/** Encode a single UTF-32 character as UTF-8
127
/** Encode a single UTF-32 character as UTF-8
136
 *
128
 *
137
 * Encode a single UTF-32 character as UTF-8 and store it into
129
 * Encode a single UTF-32 character as UTF-8 and store it into