Subversion Repositories HelenOS

Rev

Rev 3731 | Rev 4234 | Go to most recent revision | Show entire file | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 3731 Rev 4226
Line 36... Line 36...
36
#include <string.h>
36
#include <string.h>
37
#include <stdlib.h>
37
#include <stdlib.h>
38
#include <limits.h>
38
#include <limits.h>
39
#include <ctype.h>
39
#include <ctype.h>
40
#include <malloc.h>
40
#include <malloc.h>
-
 
41
#include <errno.h>
-
 
42
#include <string.h>
-
 
43
 
-
 
44
/** Byte mask consisting of lowest @n bits (out of 8).
 *
 * Valid for @n in the range 0 to 8. The result has type uint8_t.
 */
#define LO_MASK_8(n)  ((uint8_t) ((1 << (n)) - 1))

/** 32-bit mask consisting of lowest @n bits (out of 32).
 *
 * Valid for @n in the range 0 to 31. The shift is carried out on an
 * unsigned 32-bit operand so that no signed overflow can occur for
 * large @n. The result has type uint32_t.
 */
#define LO_MASK_32(n)  ((uint32_t) (((uint32_t) 1 << (n)) - 1))

/** Byte mask consisting of highest @n bits (out of 8).
 *
 * Valid for @n in the range 0 to 8. The complement is narrowed back to
 * uint8_t because the ~ operator works on the promoted (int) operand and
 * would otherwise set all bits above the lowest eight as well.
 */
#define HI_MASK_8(n)  ((uint8_t) ~LO_MASK_8(8 - (n)))

/** Number of data bits in a UTF-8 continuation byte */
#define CONT_BITS  6
-
 
55
 
-
 
56
/** Decode a single character from a string.
-
 
57
 *
-
 
58
 * Decode a single character from a string of size @a size. Decoding starts
-
 
59
 * at @a offset and this offset is moved to the beginning of the next
-
 
60
 * character. In case of decoding error, offset generally advances at least
-
 
61
 * by one. However, offset is never moved beyond size.
-
 
62
 *
-
 
63
 * @param str    String (not necessarily NULL-terminated).
-
 
64
 * @param offset Byte offset in string where to start decoding.
-
 
65
 * @param size   Size of the string (in bytes).
-
 
66
 *
-
 
67
 * @return Value of decoded character, U_SPECIAL on decoding error or
-
 
68
 *         NULL if attempt to decode beyond @a size.
-
 
69
 *
-
 
70
 */
-
 
71
wchar_t str_decode(const char *str, size_t *offset, size_t size)
-
 
72
{
-
 
73
    if (*offset + 1 > size)
-
 
74
        return 0;
-
 
75
   
-
 
76
    /* First byte read from string */
-
 
77
    uint8_t b0 = (uint8_t) str[(*offset)++];
-
 
78
   
-
 
79
    /* Determine code length */
-
 
80
   
-
 
81
    unsigned int b0_bits;  /* Data bits in first byte */
-
 
82
    unsigned int cbytes;   /* Number of continuation bytes */
-
 
83
   
-
 
84
    if ((b0 & 0x80) == 0) {
-
 
85
        /* 0xxxxxxx (Plain ASCII) */
-
 
86
        b0_bits = 7;
-
 
87
        cbytes = 0;
-
 
88
    } else if ((b0 & 0xe0) == 0xc0) {
-
 
89
        /* 110xxxxx 10xxxxxx */
-
 
90
        b0_bits = 5;
-
 
91
        cbytes = 1;
-
 
92
    } else if ((b0 & 0xf0) == 0xe0) {
-
 
93
        /* 1110xxxx 10xxxxxx 10xxxxxx */
-
 
94
        b0_bits = 4;
-
 
95
        cbytes = 2;
-
 
96
    } else if ((b0 & 0xf8) == 0xf0) {
-
 
97
        /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
-
 
98
        b0_bits = 3;
-
 
99
        cbytes = 3;
-
 
100
    } else {
-
 
101
        /* 10xxxxxx -- unexpected continuation byte */
-
 
102
        return U_SPECIAL;
-
 
103
    }
-
 
104
   
-
 
105
    if (*offset + cbytes > size)
-
 
106
        return U_SPECIAL;
-
 
107
   
-
 
108
    wchar_t ch = b0 & LO_MASK_8(b0_bits);
-
 
109
   
-
 
110
    /* Decode continuation bytes */
-
 
111
    while (cbytes > 0) {
-
 
112
        uint8_t b = (uint8_t) str[(*offset)++];
-
 
113
       
-
 
114
        /* Must be 10xxxxxx */
-
 
115
        if ((b & 0xc0) != 0x80)
-
 
116
            return U_SPECIAL;
-
 
117
       
-
 
118
        /* Shift data bits to ch */
-
 
119
        ch = (ch << CONT_BITS) | (wchar_t) (b & LO_MASK_8(CONT_BITS));
-
 
120
        cbytes--;
-
 
121
    }
-
 
122
   
-
 
123
    return ch;
-
 
124
}
-
 
125
 
-
 
126
/** Encode a single character to string representation.
-
 
127
 *
-
 
128
 * Encode a single character to string representation (i.e. UTF-8) and store
-
 
129
 * it into a buffer at @a offset. Encoding starts at @a offset and this offset
-
 
130
 * is moved to the position where the next character can be written to.
-
 
131
 *
-
 
132
 * @param ch     Input character.
-
 
133
 * @param str    Output buffer.
-
 
134
 * @param offset Byte offset where to start writing.
-
 
135
 * @param size   Size of the output buffer (in bytes).
-
 
136
 *
-
 
137
 * @return EOK if the character was encoded successfully, EOVERFLOW if there
-
 
138
 *     was not enough space in the output buffer or EINVAL if the character
-
 
139
 *     code was invalid.
-
 
140
 */
-
 
141
int chr_encode(const wchar_t ch, char *str, size_t *offset, size_t size)
-
 
142
{
-
 
143
    if (*offset >= size)
-
 
144
        return EOVERFLOW;
-
 
145
   
-
 
146
    if (!chr_check(ch))
-
 
147
        return EINVAL;
-
 
148
   
-
 
149
    /* Unsigned version of ch (bit operations should only be done
-
 
150
       on unsigned types). */
-
 
151
    uint32_t cc = (uint32_t) ch;
-
 
152
   
-
 
153
    /* Determine how many continuation bytes are needed */
-
 
154
   
-
 
155
    unsigned int b0_bits;  /* Data bits in first byte */
-
 
156
    unsigned int cbytes;   /* Number of continuation bytes */
-
 
157
   
-
 
158
    if ((cc & ~LO_MASK_32(7)) == 0) {
-
 
159
        b0_bits = 7;
-
 
160
        cbytes = 0;
-
 
161
    } else if ((cc & ~LO_MASK_32(11)) == 0) {
-
 
162
        b0_bits = 5;
-
 
163
        cbytes = 1;
-
 
164
    } else if ((cc & ~LO_MASK_32(16)) == 0) {
-
 
165
        b0_bits = 4;
-
 
166
        cbytes = 2;
-
 
167
    } else if ((cc & ~LO_MASK_32(21)) == 0) {
-
 
168
        b0_bits = 3;
-
 
169
        cbytes = 3;
-
 
170
    } else {
-
 
171
        /* Codes longer than 21 bits are not supported */
-
 
172
        return EINVAL;
-
 
173
    }
-
 
174
   
-
 
175
    /* Check for available space in buffer */
-
 
176
    if (*offset + cbytes >= size)
-
 
177
        return EOVERFLOW;
-
 
178
   
-
 
179
    /* Encode continuation bytes */
-
 
180
    unsigned int i;
-
 
181
    for (i = cbytes; i > 0; i--) {
-
 
182
        str[*offset + i] = 0x80 | (cc & LO_MASK_32(CONT_BITS));
-
 
183
        cc = cc >> CONT_BITS;
-
 
184
    }
-
 
185
   
-
 
186
    /* Encode first byte */
-
 
187
    str[*offset] = (cc & LO_MASK_32(b0_bits)) | HI_MASK_8(8 - b0_bits - 1);
-
 
188
   
-
 
189
    /* Advance offset */
-
 
190
    *offset += cbytes + 1;
-
 
191
   
-
 
192
    return EOK;
-
 
193
}
-
 
194
 
-
 
195
/** Check whether character is valid
 *
 * A character is accepted when its code is non-negative and does not
 * exceed 0x10ffff (decimal 1114111).
 *
 * @param ch Character code to examine.
 *
 * @return True if character is a valid Unicode code point.
 *
 */
bool chr_check(const wchar_t ch)
{
    return (ch >= 0) && (ch <= 0x10ffff);
}
41
 
207
 
42
/** Count the number of characters in the string, not including terminating 0.
208
/** Count the number of characters in the string, not including terminating 0.
43
 *
209
 *
44
 * @param str       String.
210
 * @param str       String.
45
 * @return      Number of characters in string.
211
 * @return      Number of characters in string.