Subversion Repositories HelenOS

Rev

Rev 4209 | Rev 4223 | Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | Download | RSS feed

  1. /*
  2.  * Copyright (c) 2001-2004 Jakub Jermar
  3.  * All rights reserved.
  4.  *
  5.  * Redistribution and use in source and binary forms, with or without
  6.  * modification, are permitted provided that the following conditions
  7.  * are met:
  8.  *
  9.  * - Redistributions of source code must retain the above copyright
  10.  *   notice, this list of conditions and the following disclaimer.
  11.  * - Redistributions in binary form must reproduce the above copyright
  12.  *   notice, this list of conditions and the following disclaimer in the
  13.  *   documentation and/or other materials provided with the distribution.
  14.  * - The name of the author may not be used to endorse or promote products
  15.  *   derived from this software without specific prior written permission.
  16.  *
  17.  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  18.  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  19.  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  20.  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  21.  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  22.  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  23.  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  24.  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  25.  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  26.  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  27.  */
  28.  
  29. /** @addtogroup generic
  30.  * @{
  31.  */
  32.  
  33. /**
  34.  * @file
  35.  * @brief String functions.
  36.  *
  37.  * Strings and characters use the Universal Character Set (UCS). The standard
  38.  * strings, called just strings are encoded in UTF-8. Wide strings (encoded
  39.  * in UTF-32) are supported to a limited degree. A single character is
  40.  * represented as wchar_t.@n
  41.  *
  42.  * Overview of the terminology:@n
  43.  *
  44.  *  Term                  Meaning
  45.  *  --------------------  ----------------------------------------------------
  46.  *  byte                  8 bits stored in uint8_t (unsigned 8 bit integer)
  47.  *
  48.  *  character             UTF-32 encoded Unicode character, stored in wchar_t
  49.  *                        (signed 32 bit integer), code points 0 .. 1114111
  50.  *                        are valid
  51.  *
  52.  *  ASCII character       7 bit encoded ASCII character, stored in char
  53.  *                        (usually signed 8 bit integer), code points 0 .. 127
  54.  *                        are valid
  55.  *
  56.  *  string                UTF-8 encoded NULL-terminated Unicode string, char *
  57.  *
  58.  *  wide string           UTF-32 encoded NULL-terminated Unicode string,
  59.  *                        wchar_t *
  60.  *
  61.  *  [wide] string size    number of BYTES in a [wide] string (excluding
  62.  *                        the NULL-terminator), size_t
  63.  *
  64.  *  [wide] string length  number of CHARACTERS in a [wide] string (excluding
  65.  *                        the NULL-terminator), count_t
  66.  *
  67.  *  [wide] string width   number of display cells on a monospace display taken
  68.  *                        by a [wide] string, count_t
  69.  *
  70.  *
  71.  * Overview of string metrics:@n
  72.  *
  73.  *  Metric  Abbrev.  Type     Meaning
  74.  *  ------  ------   ------   -------------------------------------------------
  75.  *  size    n        size_t   number of BYTES in a string (excluding the
  76.  *                            NULL-terminator)
  77.  *
  78.  *  length  l        count_t  number of CHARACTERS in a string (excluding the
  79.  *                            null terminator)
  80.  *
  81.  *  width  w         count_t  number of display cells on a monospace display
  82.  *                            taken by a string
  83.  *
  84.  *
  85.  * Function naming prefixes:@n
  86.  *
  87.  *  chr_    operate on characters
  88.  *  ascii_  operate on ASCII characters
  89.  *  str_    operate on strings
  90.  *  wstr_   operate on wide strings
  91.  *
  92.  *  [w]str_[n|l|w]  operate on a prefix limited by size, length
  93.  *                  or width
  94.  *
  95.  *
  96.  * A specific character inside a [wide] string can be referred to by:@n
  97.  *
  98.  *  pointer (char *, wchar_t *)
  99.  *  byte offset (size_t)
  100.  *  character index (count_t)
  101.  *
  102.  */
  103.  
  104. #include <string.h>
  105. #include <print.h>
  106. #include <cpu.h>
  107. #include <arch/asm.h>
  108. #include <arch.h>
  109. #include <errno.h>
  110. #include <align.h>
  111.  
  112. char invalch = '?';
  113.  
  114. /** Byte mask consisting of lowest @n bits (out of 8) */
  115. #define LO_MASK_8(n)  ((uint8_t) ((1 << (n)) - 1))
  116.  
  117. /** Byte mask consisting of lowest @n bits (out of 32) */
  118. #define LO_MASK_32(n)  ((uint32_t) ((1 << (n)) - 1))
  119.  
  120. /** Byte mask consisting of highest @n bits (out of 8) */
  121. #define HI_MASK_8(n)  (~LO_MASK_8(8 - (n)))
  122.  
  123. /** Number of data bits in a UTF-8 continuation byte */
  124. #define CONT_BITS  6
  125.  
  126. /** Decode a single character from a string.
  127.  *
  128.  * Decode a single character from a string of size @a size. Decoding starts
  129.  * at @a offset and this offset is moved to the beginning of the next
  130.  * character. In case of decoding error, offset generally advances at least
  131.  * by one. However, offset is never moved beyond size.
  132.  *
  133.  * @param str    String (not necessarily NULL-terminated).
  134.  * @param offset Byte offset in string where to start decoding.
  135.  * @param size   Size of the string (in bytes).
  136.  *
  137.  * @return Value of decoded character, invalch on decoding error or
  138.  *         NULL if attempt to decode beyond @a size.
  139.  *
  140.  */
  141. wchar_t str_decode(const char *str, size_t *offset, size_t size)
  142. {
  143.     if (*offset + 1 > size)
  144.         return 0;
  145.    
  146.     /* First byte read from string */
  147.     uint8_t b0 = (uint8_t) str[(*offset)++];
  148.    
  149.     /* Determine code length */
  150.    
  151.     unsigned int b0_bits;  /* Data bits in first byte */
  152.     unsigned int cbytes;   /* Number of continuation bytes */
  153.    
  154.     if ((b0 & 0x80) == 0) {
  155.         /* 0xxxxxxx (Plain ASCII) */
  156.         b0_bits = 7;
  157.         cbytes = 0;
  158.     } else if ((b0 & 0xe0) == 0xc0) {
  159.         /* 110xxxxx 10xxxxxx */
  160.         b0_bits = 5;
  161.         cbytes = 1;
  162.     } else if ((b0 & 0xf0) == 0xe0) {
  163.         /* 1110xxxx 10xxxxxx 10xxxxxx */
  164.         b0_bits = 4;
  165.         cbytes = 2;
  166.     } else if ((b0 & 0xf8) == 0xf0) {
  167.         /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
  168.         b0_bits = 3;
  169.         cbytes = 3;
  170.     } else {
  171.         /* 10xxxxxx -- unexpected continuation byte */
  172.         return invalch;
  173.     }
  174.    
  175.     if (*offset + cbytes > size)
  176.         return invalch;
  177.    
  178.     wchar_t ch = b0 & LO_MASK_8(b0_bits);
  179.    
  180.     /* Decode continuation bytes */
  181.     while (cbytes > 0) {
  182.         uint8_t b = (uint8_t) str[(*offset)++];
  183.        
  184.         /* Must be 10xxxxxx */
  185.         if ((b & 0xc0) != 0x80)
  186.             return invalch;
  187.        
  188.         /* Shift data bits to ch */
  189.         ch = (ch << CONT_BITS) | (wchar_t) (b & LO_MASK_8(CONT_BITS));
  190.         cbytes--;
  191.     }
  192.    
  193.     return ch;
  194. }
  195.  
  196. /** Encode a single character to string representation.
  197.  *
  198.  * Encode a single character to string representation (i.e. UTF-8) and store
  199.  * it into a buffer at @a offset. Encoding starts at @a offset and this offset
  200.  * is moved to the position where the next character can be written to.
  201.  *
  202.  * @param ch     Input character.
  203.  * @param str    Output buffer.
  204.  * @param offset Byte offset where to start writing.
  205.  * @param size   Size of the output buffer (in bytes).
  206.  *
  207.  * @return EOK if the character was encoded successfully, EOVERFLOW if there
  208.  *     was not enough space in the output buffer or EINVAL if the character
  209.  *     code was invalid.
  210.  */
  211. int chr_encode(const wchar_t ch, char *str, size_t *offset, size_t size)
  212. {
  213.     if (*offset >= size)
  214.         return EOVERFLOW;
  215.    
  216.     if (!chr_check(ch))
  217.         return EINVAL;
  218.    
  219.     /* Unsigned version of ch (bit operations should only be done
  220.        on unsigned types). */
  221.     uint32_t cc = (uint32_t) ch;
  222.    
  223.     /* Determine how many continuation bytes are needed */
  224.    
  225.     unsigned int b0_bits;  /* Data bits in first byte */
  226.     unsigned int cbytes;   /* Number of continuation bytes */
  227.    
  228.     if ((cc & ~LO_MASK_32(7)) == 0) {
  229.         b0_bits = 7;
  230.         cbytes = 0;
  231.     } else if ((cc & ~LO_MASK_32(11)) == 0) {
  232.         b0_bits = 5;
  233.         cbytes = 1;
  234.     } else if ((cc & ~LO_MASK_32(16)) == 0) {
  235.         b0_bits = 4;
  236.         cbytes = 2;
  237.     } else if ((cc & ~LO_MASK_32(21)) == 0) {
  238.         b0_bits = 3;
  239.         cbytes = 3;
  240.     } else {
  241.         /* Codes longer than 21 bits are not supported */
  242.         return EINVAL;
  243.     }
  244.    
  245.     /* Check for available space in buffer */
  246.     if (*offset + cbytes >= size)
  247.         return EOVERFLOW;
  248.    
  249.     /* Encode continuation bytes */
  250.     unsigned int i;
  251.     for (i = cbytes; i > 0; i--) {
  252.         str[*offset + i] = 0x80 | (cc & LO_MASK_32(CONT_BITS));
  253.         cc = cc >> CONT_BITS;
  254.     }
  255.    
  256.     /* Encode first byte */
  257.     str[*offset] = (cc & LO_MASK_32(b0_bits)) | HI_MASK_8(8 - b0_bits - 1);
  258.    
  259.     /* Advance offset */
  260.     *offset += cbytes + 1;
  261.    
  262.     return EOK;
  263. }
  264.  
  265. /** Get size of string.
  266.  *
  267.  * Get the number of bytes which are used by the string @a str (excluding the
  268.  * NULL-terminator).
  269.  *
  270.  * @param str String to consider.
  271.  *
  272.  * @return Number of bytes used by the string
  273.  *
  274.  */
  275. size_t str_size(const char *str)
  276. {
  277.     size_t size = 0;
  278.    
  279.     while (*str++ != 0)
  280.         size++;
  281.    
  282.     return size;
  283. }
  284.  
  285. /** Get size of wide string.
  286.  *
  287.  * Get the number of bytes which are used by the wide string @a str (excluding the
  288.  * NULL-terminator).
  289.  *
  290.  * @param str Wide string to consider.
  291.  *
  292.  * @return Number of bytes used by the wide string
  293.  *
  294.  */
  295. size_t wstr_size(const wchar_t *str)
  296. {
  297.     return (wstr_length(str) * sizeof(wchar_t));
  298. }
  299.  
  300. /** Get size of string with length limit.
  301.  *
  302.  * Get the number of bytes which are used by up to @a max_len first
  303.  * characters in the string @a str. If @a max_len is greater than
  304.  * the length of @a str, the entire string is measured (excluding the
  305.  * NULL-terminator).
  306.  *
  307.  * @param str     String to consider.
  308.  * @param max_len Maximum number of characters to measure.
  309.  *
  310.  * @return Number of bytes used by the characters.
  311.  *
  312.  */
  313. size_t str_lsize(const char *str, count_t max_len)
  314. {
  315.     count_t len = 0;
  316.     size_t offset = 0;
  317.    
  318.     while (len < max_len) {
  319.         if (str_decode(str, &offset, STR_NO_LIMIT) == 0)
  320.             break;
  321.        
  322.         len++;
  323.     }
  324.    
  325.     return offset;
  326. }
  327.  
  328. /** Get size of wide string with length limit.
  329.  *
  330.  * Get the number of bytes which are used by up to @a max_len first
  331.  * wide characters in the wide string @a str. If @a max_len is greater than
  332.  * the length of @a str, the entire wide string is measured (excluding the
  333.  * NULL-terminator).
  334.  *
  335.  * @param str     Wide string to consider.
  336.  * @param max_len Maximum number of wide characters to measure.
  337.  *
  338.  * @return Number of bytes used by the wide characters.
  339.  *
  340.  */
  341. size_t wstr_lsize(const wchar_t *str, count_t max_len)
  342. {
  343.     return (wstr_nlength(str, max_len * sizeof(wchar_t)) * sizeof(wchar_t));
  344. }
  345.  
  346. /** Get number of characters in a string.
  347.  *
  348.  * @param str NULL-terminated string.
  349.  *
  350.  * @return Number of characters in string.
  351.  *
  352.  */
  353. count_t str_length(const char *str)
  354. {
  355.     count_t len = 0;
  356.     size_t offset = 0;
  357.    
  358.     while (str_decode(str, &offset, STR_NO_LIMIT) != 0)
  359.         len++;
  360.    
  361.     return len;
  362. }
  363.  
  364. /** Get number of characters in a wide string.
  365.  *
  366.  * @param str NULL-terminated wide string.
  367.  *
  368.  * @return Number of characters in @a str.
  369.  *
  370.  */
  371. count_t wstr_length(const wchar_t *wstr)
  372. {
  373.     count_t len = 0;
  374.    
  375.     while (*wstr++ != 0)
  376.         len++;
  377.    
  378.     return len;
  379. }
  380.  
  381. /** Get number of characters in a string with size limit.
  382.  *
  383.  * @param str  NULL-terminated string.
  384.  * @param size Maximum number of bytes to consider.
  385.  *
  386.  * @return Number of characters in string.
  387.  *
  388.  */
  389. count_t str_nlength(const char *str, size_t size)
  390. {
  391.     count_t len = 0;
  392.     size_t offset = 0;
  393.    
  394.     while (str_decode(str, &offset, size) != 0)
  395.         len++;
  396.    
  397.     return len;
  398. }
  399.  
  400. /** Get number of characters in a string with size limit.
  401.  *
  402.  * @param str  NULL-terminated string.
  403.  * @param size Maximum number of bytes to consider.
  404.  *
  405.  * @return Number of characters in string.
  406.  *
  407.  */
  408. count_t wstr_nlength(const wchar_t *str, size_t size)
  409. {
  410.     count_t len = 0;
  411.     count_t limit = ALIGN_DOWN(size, sizeof(wchar_t));
  412.     count_t offset = 0;
  413.    
  414.     while ((offset < limit) && (*str++ != 0)) {
  415.         len++;
  416.         offset += sizeof(wchar_t);
  417.     }
  418.    
  419.     return len;
  420. }
  421.  
  422. /** Check whether character is plain ASCII.
  423.  *
  424.  * @return True if character is plain ASCII.
  425.  *
  426.  */
  427. bool ascii_check(const wchar_t ch)
  428. {
  429.     if ((ch >= 0) && (ch <= 127))
  430.         return true;
  431.    
  432.     return false;
  433. }
  434.  
  435. /** Check whether character is valid
  436.  *
  437.  * @return True if character is a valid Unicode code point.
  438.  *
  439.  */
  440. bool chr_check(const wchar_t ch)
  441. {
  442.     if ((ch >= 0) && (ch <= 1114111))
  443.         return true;
  444.    
  445.     return false;
  446. }
  447.  
  448. /** Compare two NULL terminated strings.
  449.  *
  450.  * Do a char-by-char comparison of two NULL-terminated strings.
  451.  * The strings are considered equal iff they consist of the same
  452.  * characters on the minimum of their lengths.
  453.  *
  454.  * @param s1 First string to compare.
  455.  * @param s2 Second string to compare.
  456.  *
  457.  * @return 0 if the strings are equal, -1 if first is smaller,
  458.  *         1 if second smaller.
  459.  *
  460.  */
  461. int str_cmp(const char *s1, const char *s2)
  462. {
  463.     wchar_t c1;
  464.     wchar_t c2;
  465.    
  466.     size_t off1 = 0;
  467.     size_t off2 = 0;
  468.    
  469.     while ((c1 = str_decode(s1, &off1, STR_NO_LIMIT) != 0)
  470.         && (c2 = str_decode(s2, &off2, STR_NO_LIMIT) != 0)) {
  471.        
  472.         if (off1 != off2)
  473.             break;
  474.        
  475.         if (c1 < c2)
  476.             return -1;
  477.        
  478.         if (c1 > c2)
  479.             return 1;
  480.     }
  481.    
  482.     if ((off1 == off2) && (c1 == c2))
  483.         return 0;
  484.    
  485.     if ((c1 == 0) || (off1 < off2))
  486.         return -1;
  487.    
  488.     return 1;
  489. }
  490.  
  491. /** Compare two NULL terminated strings with length limit.
  492.  *
  493.  * Do a char-by-char comparison of two NULL-terminated strings.
  494.  * The strings are considered equal iff they consist of the same
  495.  * characters on the minimum of their lengths and the length limit.
  496.  *
  497.  * @param s1      First string to compare.
  498.  * @param s2      Second string to compare.
  499.  * @param max_len Maximum number of characters to consider.
  500.  *
  501.  * @return 0 if the strings are equal, -1 if first is smaller,
  502.  *         1 if second smaller.
  503.  *
  504.  */
  505. int str_lcmp(const char *s1, const char *s2, count_t max_len)
  506. {
  507.     wchar_t c1 = 0;
  508.     wchar_t c2 = 0;
  509.    
  510.     size_t off1 = 0;
  511.     size_t off2 = 0;
  512.    
  513.     count_t len = 0;
  514.    
  515.     while ((len < max_len)
  516.         && ((c1 = str_decode(s1, &off1, STR_NO_LIMIT)) != 0)
  517.         && ((c2 = str_decode(s2, &off2, STR_NO_LIMIT)) != 0)) {
  518.        
  519.         if (off1 != off2)
  520.             break;
  521.        
  522.         if (c1 < c2)
  523.             return -1;
  524.        
  525.         if (c1 > c2)
  526.             return 1;
  527.        
  528.         len++;
  529.     }
  530.    
  531.     if ((off1 == off2) && (len == max_len) && (c1 == c2))
  532.         return 0;
  533.    
  534.     if ((c1 == 0) || (off1 < off2))
  535.         return -1;
  536.    
  537.     return 1;
  538. }
  539.  
  540. /** Copy NULL-terminated string.
  541.  *
  542.  * Copy source string @a src to destination buffer @a dst.
  543.  * No more than @a size bytes are written. NULL-terminator is always
  544.  * written after the last succesfully copied character (i.e. if the
  545.  * destination buffer is has at least 1 byte, it will be always
  546.  * NULL-terminated).
  547.  *
  548.  * @param src   Source string.
  549.  * @param dst   Destination buffer.
  550.  * @param count Size of the destination buffer.
  551.  *
  552.  */
  553. void str_ncpy(char *dst, const char *src, size_t size)
  554. {
  555.     /* No space for the NULL-terminator in the buffer */
  556.     if (size == 0)
  557.         return;
  558.    
  559.     wchar_t ch;
  560.     size_t str_off = 0;
  561.     size_t dst_off = 0;
  562.    
  563.     while ((ch = str_decode(src, &str_off, STR_NO_LIMIT) != 0)) {
  564.         if (chr_encode(ch, dst, &dst_off, size) != EOK)
  565.             break;
  566.     }
  567.    
  568.     if (dst_off >= size)
  569.         dst[size - 1] = 0;
  570.     else
  571.         dst[dst_off] = 0;
  572. }
  573.  
  574. /** Copy NULL-terminated wide string to string
  575.  *
  576.  * Copy source wide string @a src to destination buffer @a dst.
  577.  * No more than @a size bytes are written. NULL-terminator is always
  578.  * written after the last succesfully copied character (i.e. if the
  579.  * destination buffer is has at least 1 byte, it will be always
  580.  * NULL-terminated).
  581.  *
  582.  * @param src   Source wide string.
  583.  * @param dst   Destination buffer.
  584.  * @param count Size of the destination buffer.
  585.  *
  586.  */
  587. void wstr_nstr(char *dst, const wchar_t *src, size_t size)
  588. {
  589.     /* No space for the NULL-terminator in the buffer */
  590.     if (size == 0)
  591.         return;
  592.    
  593.     wchar_t ch;
  594.     count_t src_idx = 0;
  595.     size_t dst_off = 0;
  596.    
  597.     while ((ch = src[src_idx++]) != 0) {
  598.         if (chr_encode(ch, dst, &dst_off, size) != EOK)
  599.             break;
  600.     }
  601.    
  602.     if (dst_off >= size)
  603.         dst[size - 1] = 0;
  604.     else
  605.         dst[dst_off] = 0;
  606. }
  607.  
  608. /** Find first occurence of character in string.
  609.  *
  610.  * @param str String to search.
  611.  * @param ch  Character to look for.
  612.  *
  613.  * @return Pointer to character in @a str or NULL if not found.
  614.  *
  615.  */
  616. const char *str_chr(const char *str, wchar_t ch)
  617. {
  618.     wchar_t acc;
  619.     size_t off = 0;
  620.    
  621.     while ((acc = str_decode(str, &off, STR_NO_LIMIT) != 0)) {
  622.         if (acc == ch)
  623.             return (str + off);
  624.     }
  625.    
  626.     return NULL;
  627. }
  628.  
  629. /** Insert a wide character into a wide string.
  630.  *
  631.  * Insert a wide character into a wide string at position
  632.  * @a pos. The characters after the position are shifted.
  633.  *
  634.  * @param str     String to insert to.
  635.  * @param ch      Character to insert to.
  636.  * @param pos     Character index where to insert.
  637.  @ @param max_pos Characters in the buffer.
  638.  *
  639.  * @return True if the insertion was sucessful, false if the position
  640.  *         is out of bounds.
  641.  *
  642.  */
  643. bool wstr_linsert(wchar_t *str, wchar_t ch, count_t pos, count_t max_pos)
  644. {
  645.     count_t len = wstr_length(str);
  646.    
  647.     if ((pos > len) || (pos + 1 > max_pos))
  648.         return false;
  649.    
  650.     count_t i;
  651.     for (i = len; i + 1 > pos; i--)
  652.         str[i + 1] = str[i];
  653.    
  654.     str[pos] = ch;
  655.    
  656.     return true;
  657. }
  658.  
  659. /** Remove a wide character from a wide string.
  660.  *
  661.  * Remove a wide character from a wide string at position
  662.  * @a pos. The characters after the position are shifted.
  663.  *
  664.  * @param str String to remove from.
  665.  * @param pos Character index to remove.
  666.  *
  667.  * @return True if the removal was sucessful, false if the position
  668.  *         is out of bounds.
  669.  *
  670.  */
  671. bool wstr_remove(wchar_t *str, count_t pos)
  672. {
  673.     count_t len = wstr_length(str);
  674.    
  675.     if (pos >= len)
  676.         return false;
  677.    
  678.     count_t i;
  679.     for (i = pos + 1; i <= len; i++)
  680.         str[i - 1] = str[i];
  681.    
  682.     return true;
  683. }
  684.  
  685. /** @}
  686.  */
  687.