WebSVN – HelenOS – Diff – /trunk/kernel/generic/src/lib/string.c

 #include <arch.h>
 #include <console/kconsole.h>
 char invalch = '?';
+/** Byte mask consisting of bits 0 through (@n - 1), i.e. the @n lowest bits. */
+#define LO_MASK_8(n) ((uint8_t)((1 << (n)) - 1))
+/** Number of data bits in a UTF-8 continuation byte. */
+#define CONT_BITS 6
 /** Decode a single UTF-8 character from a NUL-terminated string.
+ *
  * Decode a single UTF-8 character from a plain char NUL-terminated
  * string. Decoding starts at @index and this index is incremented
  * if the current UTF-8 character is encoded in more than a single byte.
  * @return Decoded character in UTF-32 or '?' if the encoding is wrong.
+ *
  */
 wchar_t utf8_decode(const char *str, index_t *index, index_t limit)
+{
-    uint8_t c1;           /* First plain character from str */
+    uint8_t b0, b;          /* Bytes read from str. */
-    uint8_t c2;           /* Second plain character from str */
+    wchar_t ch;
-    uint8_t c3;           /* Third plain character from str */
+    int b0_bits;        /* Data bits in first byte. */
-    uint8_t c4;           /* Fourth plain character from str */
+    int cbytes;     /* Number of continuation bytes. */
     if (*index > limit)
         return invalch;
-    c1 = (uint8_t) str[*index];
+    b0 = (uint8_t) str[*index];
+    /* Determine code length. */
-    if ((c1 & 0x80) == 0) {
+    if ((b0 & 0x80) == 0) {
-        /* Plain ASCII (code points 0 .. 127) */
+        /* 0xxxxxxx (Plain ASCII) */
-        return (wchar_t) c1;
+        b0_bits = 7;
-    }
+        cbytes = 0;
-    if ((c1 & 0xe0) == 0xc0) {
+    } else if ((b0 & 0xe0) == 0xc0) {
-        /* Code points 128 .. 2047 */
+        /* 110xxxxx 10xxxxxx */
-        if (*index + 1 > limit)
+        b0_bits = 5;
-            return invalch;
+        cbytes = 1;
+    } else if ((b0 & 0xf0) == 0xe0) {
+        /* 1110xxxx 10xxxxxx 10xxxxxx */
+        b0_bits = 4;
-        c2 = (uint8_t) str[*index + 1];
+        cbytes = 2;
-        if ((c2 & 0xc0) == 0x80) {
+    } else if ((b0 & 0xf8) == 0xf0) {
+        /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
-            (*index)++;
+        b0_bits = 3;
-            return ((wchar_t) ((c1 & 0x1f) << 6) | (c2 & 0x3f));
+        cbytes = 3;
-        } else
+    } else {
+        /* 10xxxxxx (unexpected continuation byte) or 11111xxx (invalid lead byte). */
-            return invalch;
+        return invalch;
+    }
-    if ((c1 & 0xf0) == 0xe0) {
-        /* Code points 2048 .. 65535 */
-        if (*index + 2 > limit)
+    if (*index + cbytes > limit) {
-            return invalch;
-        c2 = (uint8_t) str[*index + 1];
-        if ((c2 & 0xc0) == 0x80) {
-            (*index)++;
-            c3 = (uint8_t) str[*index + 1];
-            if ((c3 & 0xc0) == 0x80) {
-                (*index)++;
-                return ((wchar_t) ((c1 & 0x0f) << 12) | ((c2 & 0x3f) << 6) | (c3 & 0x3f));
-            } else
-                return invalch;
-        } else
-            return invalch;
+        return invalch;
+    }
-    if ((c1 & 0xf8) == 0xf0) {
+    ch = b0 & LO_MASK_8(b0_bits);
-        /* Code points 65536 .. 1114111 */
-        if (*index + 3 > limit)
-            return invalch;
-        c2 = (uint8_t) str[*index + 1];
+    /* Decode continuation bytes. */
-        if ((c2 & 0xc0) == 0x80) {
+    while (cbytes > 0) {
-            (*index)++;
-            c3 = (uint8_t) str[*index + 1];
+        b = (uint8_t) str[*index + 1];
-            if ((c3 & 0xc0) == 0x80) {
-                (*index)++;
+        ++(*index);
-                c4 = (uint8_t) str[*index + 1];
+        /* Must be 10xxxxxx. */
-                if ((c4 & 0xc0) == 0x80) {
+        if ((b & 0xc0) != 0x80) {
-                    (*index)++;
-                    return ((wchar_t) ((c1 & 0x07) << 18) | ((c2 & 0x3f) << 12) | ((c3 & 0x3f) << 6) | (c4 & 0x3f));
-                } else
-                    return invalch;
-            } else
-                return invalch;
-        } else
             return invalch;
+        }
+        /* Shift data bits to ch. */
+        ch = (ch << CONT_BITS) | (wchar_t) (b & LO_MASK_8(CONT_BITS));
+        --cbytes;
+    }
-    return invalch;
+    return ch;
+}
 /** Encode a single UTF-32 character as UTF-8
+ *
  * Encode a single UTF-32 character as UTF-8 and store it into

Subversion Repositories HelenOS

(root)/trunk/kernel/generic/src/lib/string.c @ 4684 – Rev 4179 → 4196