Subversion Repositories HelenOS

Rev

Rev 4208 | Rev 4223 | Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | Download | RSS feed

  1. /*
  2.  * Copyright (c) 2001-2004 Jakub Jermar
  3.  * All rights reserved.
  4.  *
  5.  * Redistribution and use in source and binary forms, with or without
  6.  * modification, are permitted provided that the following conditions
  7.  * are met:
  8.  *
  9.  * - Redistributions of source code must retain the above copyright
  10.  *   notice, this list of conditions and the following disclaimer.
  11.  * - Redistributions in binary form must reproduce the above copyright
  12.  *   notice, this list of conditions and the following disclaimer in the
  13.  *   documentation and/or other materials provided with the distribution.
  14.  * - The name of the author may not be used to endorse or promote products
  15.  *   derived from this software without specific prior written permission.
  16.  *
  17.  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  18.  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  19.  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  20.  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  21.  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  22.  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  23.  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  24.  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  25.  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  26.  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  27.  */
  28.  
  29. /** @addtogroup generic
  30.  * @{
  31.  */
  32.  
  33. /**
  34.  * @file
  35.  * @brief String functions.
  36.  *
 * Strings and characters use the Universal Character Set (UCS). The standard
 * strings, called just strings, are encoded in UTF-8. Wide strings (encoded
 * in UTF-32) are supported to a limited degree. A single character is
 * represented as wchar_t.
  41.  *
  42.  * Strings have the following metrics:
  43.  *
  44.  *  Metric  Abbrev. Meaning
  45.  *  ------  ------  -------
  46.  *  size    n   Number of bytes the string is encoded into, excluding
  47.  *          the null terminator.
  48.  *  length  l   The number of characters in the string, excluding
  49.  *          the null terminator.
  50.  *  width   w   The number of character cells the string takes up on a
  51.  *          monospace display.
  52.  *
  53.  * Naming scheme:
  54.  *
  55.  *  chr_xxx     operate on characters
  56.  *  str_xxx     operate on strings
  57.  *  wstr_xxx    operate on wide strings
  58.  *
  59.  *  [w]str_[n|l|w]xxx   operate on a prefix limited by size, length
  60.  *              or width.
  61.  */
  62.  
  63. #include <string.h>
  64. #include <print.h>
  65. #include <cpu.h>
  66. #include <arch/asm.h>
  67. #include <arch.h>
  68. #include <errno.h>
  69. #include <console/kconsole.h>
  70.  
  71. char invalch = '?';
  72.  
  73. /** Byte mask consisting of lowest @n bits (out of eight). */
  74. #define LO_MASK_8(n) ((uint8_t)((1 << (n)) - 1))
  75.  
  76. /** Byte mask consisting of lowest @n bits (out of 32). */
  77. #define LO_MASK_32(n) ((uint32_t)((1 << (n)) - 1))
  78.  
  79. /** Byte mask consisting of highest @n bits (out of eight). */
  80. #define HI_MASK_8(n) (~LO_MASK_8(8 - (n)))
  81.  
  82. /** Number of data bits in a UTF-8 continuation byte. */
  83. #define CONT_BITS 6
  84.  
  85. /** Decode a single character from a substring.
  86.  *
  87.  * Decode a single character from a substring of size @a sz. Decoding starts
  88.  * at @a offset and this offset is moved to the beginning of the next
  89.  * character. In case of decoding error, offset generally advances at least
  90.  * by one. However, offset is never moved beyond (str + sz).
  91.  *
  92.  * @param str   String (not necessarily NULL-terminated).
  93.  * @param index Index (counted in plain characters) where to start
  94.  *              the decoding.
  95.  * @param limit Size of the substring.
  96.  *
  97.  * @return  Value of decoded character or '?' on decoding error.
  98.  */
  99. wchar_t chr_decode(const char *str, size_t *offset, size_t sz)
  100. {
  101.     uint8_t b0, b;          /* Bytes read from str. */
  102.     wchar_t ch;
  103.  
  104.     int b0_bits;        /* Data bits in first byte. */
  105.     int cbytes;     /* Number of continuation bytes. */
  106.  
  107.     if (*offset + 1 > sz)
  108.         return invalch;
  109.  
  110.     b0 = (uint8_t) str[(*offset)++];
  111.  
  112.     /* Determine code length. */
  113.  
  114.     if ((b0 & 0x80) == 0) {
  115.         /* 0xxxxxxx (Plain ASCII) */
  116.         b0_bits = 7;
  117.         cbytes = 0;
  118.     } else if ((b0 & 0xe0) == 0xc0) {
  119.         /* 110xxxxx 10xxxxxx */
  120.         b0_bits = 5;
  121.         cbytes = 1;
  122.     } else if ((b0 & 0xf0) == 0xe0) {
  123.         /* 1110xxxx 10xxxxxx 10xxxxxx */
  124.         b0_bits = 4;
  125.         cbytes = 2;
  126.     } else if ((b0 & 0xf8) == 0xf0) {
  127.         /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
  128.         b0_bits = 3;
  129.         cbytes = 3;
  130.     } else {
  131.         /* 10xxxxxx -- unexpected continuation byte. */
  132.         return invalch;
  133.     }
  134.  
  135.     if (*offset + cbytes > sz) {
  136.         return invalch;
  137.     }
  138.  
  139.     ch = b0 & LO_MASK_8(b0_bits);
  140.  
  141.     /* Decode continuation bytes. */
  142.     while (cbytes > 0) {
  143.         b = (uint8_t) str[(*offset)++];
  144.  
  145.         /* Must be 10xxxxxx. */
  146.         if ((b & 0xc0) != 0x80) {
  147.             return invalch;
  148.         }
  149.  
  150.         /* Shift data bits to ch. */
  151.         ch = (ch << CONT_BITS) | (wchar_t) (b & LO_MASK_8(CONT_BITS));
  152.         --cbytes;
  153.     }
  154.  
  155.     return ch;
  156. }
  157.  
  158. /** Encode a single character to string representation.
  159.  *
  160.  * Encode a single character to string representation (i.e. UTF-8) and store
  161.  * it into a buffer at @a offset. Encoding starts at @a offset and this offset
  162.  * is moved to the position where the next character can be written to.
  163.  *
  164.  * @param ch        Input character.
  165.  * @param str       Output buffer.
  166.  * @param offset    Offset (in bytes) where to start writing.
  167.  * @param sz        Size of the output buffer.
  168.  *
  169.  * @return EOK if the character was encoded successfully, EOVERFLOW if there
  170.  *     was not enough space in the output buffer or EINVAL if the character
  171.  *     code was invalid.
  172.  */
  173. int chr_encode(wchar_t ch, char *str, size_t *offset, size_t sz)
  174. {
  175.     uint32_t cc;        /* Unsigned version of ch. */
  176.  
  177.     int cbytes;     /* Number of continuation bytes. */
  178.     int b0_bits;        /* Number of data bits in first byte. */
  179.     int i;
  180.  
  181.     if (*offset >= sz)
  182.         return EOVERFLOW;
  183.  
  184.     if (ch < 0)
  185.         return EINVAL;
  186.  
  187.     /* Bit operations should only be done on unsigned numbers. */
  188.     cc = (uint32_t) ch;
  189.  
  190.     /* Determine how many continuation bytes are needed. */
  191.     if ((cc & ~LO_MASK_32(7)) == 0) {
  192.         b0_bits = 7;
  193.         cbytes = 0;
  194.     } else if ((cc & ~LO_MASK_32(11)) == 0) {
  195.         b0_bits = 5;
  196.         cbytes = 1;
  197.     } else if ((cc & ~LO_MASK_32(16)) == 0) {
  198.         b0_bits = 4;
  199.         cbytes = 2;
  200.     } else if ((cc & ~LO_MASK_32(21)) == 0) {
  201.         b0_bits = 3;
  202.         cbytes = 3;
  203.     } else {
  204.         /* Codes longer than 21 bits are not supported. */
  205.         return EINVAL;
  206.     }
  207.  
  208.     /* Check for available space in buffer. */
  209.     if (*offset + cbytes >= sz)
  210.         return EOVERFLOW;
  211.  
  212.     /* Encode continuation bytes. */
  213.     for (i = cbytes; i > 0; --i) {
  214.         str[*offset + i] = 0x80 | (cc & LO_MASK_32(CONT_BITS));
  215.         cc = cc >> CONT_BITS;
  216.     }
  217.  
  218.     /* Encode first byte. */
  219.     str[*offset] = (cc & LO_MASK_32(b0_bits)) | HI_MASK_8(8 - b0_bits - 1);
  220.  
  221.     /* Advance offset. */
  222.     *offset += (1 + cbytes);
  223.    
  224.     return EOK;
  225. }
  226.  
  227. /** Get display width of character.
  228.  *
  229.  * @param ch    The character.
  230.  * @return  Character width in display cells.
  231.  */
  232. count_t chr_width(wchar_t ch)
  233. {
  234.     return 1;
  235. }
  236.  
  237. /** Get size of string, with length limit.
  238.  *
  239.  * Get the number of bytes which are used by up to @a max_len first
  240.  * characters in the string @a str. If @a max_len is greater than
  241.  * the length of @a str, the entire string is measured.
  242.  *
  243.  * @param str   String to consider.
  244.  * @param count Maximum number of characters to measure.
  245.  *
  246.  * @return  Number of bytes used by the characters.
  247.  */
  248. size_t str_lsize(const char *str, count_t max_len)
  249. {
  250.     count_t len = 0;
  251.     size_t cur = 0;
  252.     size_t prev;
  253.     wchar_t ch;
  254.  
  255.     while (true) {
  256.         prev = cur;
  257.         if (len >= max_len)
  258.             break;
  259.         ch = chr_decode(str, &cur, UTF8_NO_LIMIT);
  260.         if (ch == '\0') break;
  261.  
  262.         len++;
  263.     }
  264.  
  265.     return prev;
  266. }
  267.  
  268. /** Get size of string, with width limit.
  269.  *
  270.  * Get the number of bytes which are used by the longest prefix of @a str
  271.  * that can fit into @a max_width display cells.
  272.  *
  273.  * @param str   String to consider.
  274.  * @param count Maximum number of display cells.
  275.  *
  276.  * @return  Number of bytes used by the characters that fit.
  277.  */
  278. size_t str_wsize(const char *str, count_t max_width)
  279. {
  280.     count_t width = 0;
  281.     size_t cur = 0;
  282.     size_t prev;
  283.     wchar_t ch;
  284.  
  285.     while (true) {
  286.         prev = cur;
  287.         if (width >= max_width)
  288.             break;
  289.         ch = chr_decode(str, &cur, UTF8_NO_LIMIT);
  290.         if (ch == '\0') break;
  291.  
  292.         width += chr_width(ch);
  293.     }
  294.  
  295.     return prev;
  296. }
  297.  
  298.  
  299. /** Get length of wide string, with width limit.
  300.  *
  301.  * Get the number of characters in a wide string that can fit into @a max_width
  302.  * display cells.
  303.  *
  304.  * @param wstr   Wide string to consider.
  305.  * @param count Maximum number of display cells.
  306.  *
  307.  * @return  Number of bytes used by the characters that fit.
  308.  */
  309. count_t wstr_wlength(const wchar_t *wstr, count_t max_width)
  310. {
  311.     count_t width = 0;
  312.     index_t cur = 0;
  313.  
  314.     while (true) {
  315.         if (width >= max_width)
  316.             break;
  317.         if (wstr[cur] == '\0') break;
  318.  
  319.         width += chr_width(wstr[cur]);
  320.         ++cur;
  321.     }
  322.  
  323.     return (count_t) cur;
  324. }
  325.  
  326. /** Check whether character is plain ASCII.
  327.  *
  328.  * @return True if character is plain ASCII.
  329.  *
  330.  */
  331. bool ascii_check(const wchar_t ch)
  332. {
  333.     if ((ch >= 0) && (ch <= 127))
  334.         return true;
  335.    
  336.     return false;
  337. }
  338.  
  339. /** Check whether character is Unicode.
  340.  *
  341.  * @return True if character is valid Unicode code point.
  342.  */
  343. bool unicode_check(const wchar_t ch)
  344. {
  345.     if ((ch >= 0) && (ch <= 1114111))
  346.         return true;
  347.    
  348.     return false;
  349. }
  350.  
  351. /** Return number of bytes the string occupies.
  352.  *
  353.  * @param str A string.
  354.  * @return Number of bytes in @a str excluding the null terminator.
  355.  */
  356. size_t str_size(const char *str)
  357. {
  358.     size_t size;
  359.  
  360.     size = 0;
  361.     while (*str++ != '\0')
  362.         ++size;
  363.  
  364.     return size;
  365. }
  366.  
  367. /** Return number of characters in a string.
  368.  *
  369.  * @param str NULL-terminated string.
  370.  * @return Number of characters in string.
  371.  */
  372. count_t str_length(const char *str)
  373. {
  374.     count_t len = 0;
  375.     size_t offset = 0;
  376.  
  377.     while (chr_decode(str, &offset, UTF8_NO_LIMIT) != 0) {
  378.         len++;
  379.     }
  380.  
  381.     return len;
  382. }
  383.  
  384. /** Return number of characters in a wide string.
  385.  *
  386.  * @param   str NULL-terminated wide string.
  387.  * @return  Number of characters in @a str.
  388.  */
  389. count_t wstr_length(const wchar_t *wstr)
  390. {
  391.     count_t len;
  392.  
  393.     len = 0;
  394.     while (*wstr++ != '\0')
  395.         ++len;
  396.  
  397.     return len;
  398. }
  399.  
  400. /** Compare two NULL terminated strings
  401.  *
  402.  * Do a char-by-char comparison of two NULL terminated strings.
  403.  * The strings are considered equal iff they consist of the same
  404.  * characters on the minimum of their lengths.
  405.  *
  406.  * @param src First string to compare.
  407.  * @param dst Second string to compare.
  408.  *
  409.  * @return 0 if the strings are equal, -1 if first is smaller, 1 if second smaller.
  410.  *
  411.  */
  412. int strcmp(const char *src, const char *dst)
  413. {
  414.     for (; *src && *dst; src++, dst++) {
  415.         if (*src < *dst)
  416.             return -1;
  417.         if (*src > *dst)
  418.             return 1;
  419.     }
  420.     if (*src == *dst)
  421.         return 0;
  422.    
  423.     if (!*src)
  424.         return -1;
  425.    
  426.     return 1;
  427. }
  428.  
  429.  
  430. /** Compare two NULL terminated strings
  431.  *
  432.  * Do a char-by-char comparison of two NULL terminated strings.
  433.  * The strings are considered equal iff they consist of the same
  434.  * characters on the minimum of their lengths and specified maximal
  435.  * length.
  436.  *
  437.  * @param src First string to compare.
  438.  * @param dst Second string to compare.
  439.  * @param len Maximal length for comparison.
  440.  *
  441.  * @return 0 if the strings are equal, -1 if first is smaller, 1 if second smaller.
  442.  */
  443. int strncmp(const char *src, const char *dst, size_t len)
  444. {
  445.     unsigned int i;
  446.    
  447.     for (i = 0; (*src) && (*dst) && (i < len); src++, dst++, i++) {
  448.         if (*src < *dst)
  449.             return -1;
  450.        
  451.         if (*src > *dst)
  452.             return 1;
  453.     }
  454.    
  455.     if (i == len || *src == *dst)
  456.         return 0;
  457.    
  458.     if (!*src)
  459.         return -1;
  460.    
  461.     return 1;
  462. }
  463.  
  464.  
  465.  
  466. /** Copy NULL terminated string.
  467.  *
  468.  * Copy at most 'len' characters from string 'src' to 'dest'.
  469.  * If 'src' is shorter than 'len', '\0' is inserted behind the
  470.  * last copied character.
  471.  *
  472.  * @param src  Source string.
  473.  * @param dest Destination buffer.
  474.  * @param len  Size of destination buffer.
  475.  */
  476. void strncpy(char *dest, const char *src, size_t len)
  477. {
  478.     unsigned int i;
  479.    
  480.     for (i = 0; i < len; i++) {
  481.         if (!(dest[i] = src[i]))
  482.             return;
  483.     }
  484.    
  485.     dest[i - 1] = '\0';
  486. }
  487.  
  488. /** Find first occurence of character in string.
  489.  *
  490.  * @param s String to search.
  491.  * @param i Character to look for.
  492.  *
  493.  * @return Pointer to character in @a s or NULL if not found.
  494.  */
  495. extern char *strchr(const char *s, int i)
  496. {
  497.     while (*s != '\0') {
  498.         if (*s == i)
  499.             return (char *) s;
  500.         ++s;
  501.     }
  502.    
  503.     return NULL;
  504. }
  505.  
  506. /** @}
  507.  */
  508.