Subversion Repositories HelenOS

Rev

Rev 4153 | Rev 4327 | Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | Download | RSS feed

  1. /*
  2.  * Copyright (c) 2001-2004 Jakub Jermar
  3.  * All rights reserved.
  4.  *
  5.  * Redistribution and use in source and binary forms, with or without
  6.  * modification, are permitted provided that the following conditions
  7.  * are met:
  8.  *
  9.  * - Redistributions of source code must retain the above copyright
  10.  *   notice, this list of conditions and the following disclaimer.
  11.  * - Redistributions in binary form must reproduce the above copyright
  12.  *   notice, this list of conditions and the following disclaimer in the
  13.  *   documentation and/or other materials provided with the distribution.
  14.  * - The name of the author may not be used to endorse or promote products
  15.  *   derived from this software without specific prior written permission.
  16.  *
  17.  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  18.  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  19.  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  20.  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  21.  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  22.  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  23.  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  24.  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  25.  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  26.  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  27.  */
  28.  
  29. /** @addtogroup generic
  30.  * @{
  31.  */
  32.  
  33. /**
  34.  * @file
  35.  * @brief String functions.
  36.  *
  37.  * Strings and characters use the Universal Character Set (UCS). Standard
  38.  * strings (referred to simply as strings) are encoded in UTF-8. Wide strings
  39.  * (encoded in UTF-32) are supported to a limited degree. A single character
  40.  * is represented as wchar_t.@n
  41.  *
  42.  * Overview of the terminology:@n
  43.  *
  44.  *  Term                  Meaning
  45.  *  --------------------  ----------------------------------------------------
  46.  *  byte                  8 bits stored in uint8_t (unsigned 8 bit integer)
  47.  *
  48.  *  character             UTF-32 encoded Unicode character, stored in wchar_t
  49.  *                        (signed 32 bit integer), code points 0 .. 1114111
  50.  *                        are valid
  51.  *
  52.  *  ASCII character       7 bit encoded ASCII character, stored in char
  53.  *                        (usually signed 8 bit integer), code points 0 .. 127
  54.  *                        are valid
  55.  *
  56.  *  string                UTF-8 encoded NULL-terminated Unicode string, char *
  57.  *
  58.  *  wide string           UTF-32 encoded NULL-terminated Unicode string,
  59.  *                        wchar_t *
  60.  *
  61.  *  [wide] string size    number of BYTES in a [wide] string (excluding
  62.  *                        the NULL-terminator), size_t
  63.  *
  64.  *  [wide] string length  number of CHARACTERS in a [wide] string (excluding
  65.  *                        the NULL-terminator), count_t
  66.  *
  67.  *  [wide] string width   number of display cells on a monospace display taken
  68.  *                        by a [wide] string, count_t
  69.  *
  70.  *
  71.  * Overview of string metrics:@n
  72.  *
  73.  *  Metric  Abbrev.  Type     Meaning
  74.  *  ------  ------   ------   -------------------------------------------------
  75.  *  size    n        size_t   number of BYTES in a string (excluding the
  76.  *                            NULL-terminator)
  77.  *
  78.  *  length  l        count_t  number of CHARACTERS in a string (excluding the
  79.  *                            NULL-terminator)
  80.  *
  81.  *  width   w        count_t  number of display cells on a monospace display
  82.  *                            taken by a string
  83.  *
  84.  *
  85.  * Function naming prefixes:@n
  86.  *
  87.  *  chr_    operate on characters
  88.  *  ascii_  operate on ASCII characters
  89.  *  str_    operate on strings
  90.  *  wstr_   operate on wide strings
  91.  *
  92.  *  [w]str_[n|l|w]  operate on a prefix limited by size, length
  93.  *                  or width
  94.  *
  95.  *
  96.  * A specific character inside a [wide] string can be referred to by:@n
  97.  *
  98.  *  pointer (char *, wchar_t *)
  99.  *  byte offset (size_t)
  100.  *  character index (count_t)
  101.  *
  102.  */
  103.  
  104. #include <string.h>
  105. #include <print.h>
  106. #include <cpu.h>
  107. #include <arch/asm.h>
  108. #include <arch.h>
  109. #include <errno.h>
  110. #include <align.h>
  111.  
  /** Byte mask consisting of lowest @n bits (out of 8); @n is 0 .. 8 */
  #define LO_MASK_8(n)  ((uint8_t) ((1U << (n)) - 1))

  /** Bit mask consisting of lowest @n bits (out of 32); @n is 0 .. 31
   *
   * The shift is done on an unsigned 32-bit one so that @n == 31 does not
   * shift into the sign bit of an int.
   */
  #define LO_MASK_32(n)  ((uint32_t) ((((uint32_t) 1) << (n)) - 1))

  /** Byte mask consisting of highest @n bits (out of 8); @n is 0 .. 8
   *
   * The complement is cast back to uint8_t. Without the cast, integer
   * promotion turns the result into a negative int with all bits above
   * bit 7 set, which is not a byte mask.
   */
  #define HI_MASK_8(n)  ((uint8_t) ~LO_MASK_8(8 - (n)))

  /** Number of data bits in a UTF-8 continuation byte */
  #define CONT_BITS  6
  123.  
  124. /** Decode a single character from a string.
  125.  *
  126.  * Decode a single character from a string of size @a size. Decoding starts
  127.  * at @a offset and this offset is moved to the beginning of the next
  128.  * character. In case of decoding error, offset generally advances at least
  129.  * by one. However, offset is never moved beyond size.
  130.  *
  131.  * @param str    String (not necessarily NULL-terminated).
  132.  * @param offset Byte offset in string where to start decoding.
  133.  * @param size   Size of the string (in bytes).
  134.  *
  135.  * @return Value of decoded character, U_SPECIAL on decoding error or
  136.  *         NULL if attempt to decode beyond @a size.
  137.  *
  138.  */
  139. wchar_t str_decode(const char *str, size_t *offset, size_t size)
  140. {
  141.     if (*offset + 1 > size)
  142.         return 0;
  143.    
  144.     /* First byte read from string */
  145.     uint8_t b0 = (uint8_t) str[(*offset)++];
  146.    
  147.     /* Determine code length */
  148.    
  149.     unsigned int b0_bits;  /* Data bits in first byte */
  150.     unsigned int cbytes;   /* Number of continuation bytes */
  151.    
  152.     if ((b0 & 0x80) == 0) {
  153.         /* 0xxxxxxx (Plain ASCII) */
  154.         b0_bits = 7;
  155.         cbytes = 0;
  156.     } else if ((b0 & 0xe0) == 0xc0) {
  157.         /* 110xxxxx 10xxxxxx */
  158.         b0_bits = 5;
  159.         cbytes = 1;
  160.     } else if ((b0 & 0xf0) == 0xe0) {
  161.         /* 1110xxxx 10xxxxxx 10xxxxxx */
  162.         b0_bits = 4;
  163.         cbytes = 2;
  164.     } else if ((b0 & 0xf8) == 0xf0) {
  165.         /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
  166.         b0_bits = 3;
  167.         cbytes = 3;
  168.     } else {
  169.         /* 10xxxxxx -- unexpected continuation byte */
  170.         return U_SPECIAL;
  171.     }
  172.    
  173.     if (*offset + cbytes > size)
  174.         return U_SPECIAL;
  175.    
  176.     wchar_t ch = b0 & LO_MASK_8(b0_bits);
  177.    
  178.     /* Decode continuation bytes */
  179.     while (cbytes > 0) {
  180.         uint8_t b = (uint8_t) str[(*offset)++];
  181.        
  182.         /* Must be 10xxxxxx */
  183.         if ((b & 0xc0) != 0x80)
  184.             return U_SPECIAL;
  185.        
  186.         /* Shift data bits to ch */
  187.         ch = (ch << CONT_BITS) | (wchar_t) (b & LO_MASK_8(CONT_BITS));
  188.         cbytes--;
  189.     }
  190.    
  191.     return ch;
  192. }
  193.  
  194. /** Encode a single character to string representation.
  195.  *
  196.  * Encode a single character to string representation (i.e. UTF-8) and store
  197.  * it into a buffer at @a offset. Encoding starts at @a offset and this offset
  198.  * is moved to the position where the next character can be written to.
  199.  *
  200.  * @param ch     Input character.
  201.  * @param str    Output buffer.
  202.  * @param offset Byte offset where to start writing.
  203.  * @param size   Size of the output buffer (in bytes).
  204.  *
  205.  * @return EOK if the character was encoded successfully, EOVERFLOW if there
  206.  *     was not enough space in the output buffer or EINVAL if the character
  207.  *     code was invalid.
  208.  */
  209. int chr_encode(wchar_t ch, char *str, size_t *offset, size_t size)
  210. {
  211.     if (*offset >= size)
  212.         return EOVERFLOW;
  213.    
  214.     if (!chr_check(ch))
  215.         return EINVAL;
  216.    
  217.     /* Unsigned version of ch (bit operations should only be done
  218.        on unsigned types). */
  219.     uint32_t cc = (uint32_t) ch;
  220.    
  221.     /* Determine how many continuation bytes are needed */
  222.    
  223.     unsigned int b0_bits;  /* Data bits in first byte */
  224.     unsigned int cbytes;   /* Number of continuation bytes */
  225.    
  226.     if ((cc & ~LO_MASK_32(7)) == 0) {
  227.         b0_bits = 7;
  228.         cbytes = 0;
  229.     } else if ((cc & ~LO_MASK_32(11)) == 0) {
  230.         b0_bits = 5;
  231.         cbytes = 1;
  232.     } else if ((cc & ~LO_MASK_32(16)) == 0) {
  233.         b0_bits = 4;
  234.         cbytes = 2;
  235.     } else if ((cc & ~LO_MASK_32(21)) == 0) {
  236.         b0_bits = 3;
  237.         cbytes = 3;
  238.     } else {
  239.         /* Codes longer than 21 bits are not supported */
  240.         return EINVAL;
  241.     }
  242.    
  243.     /* Check for available space in buffer */
  244.     if (*offset + cbytes >= size)
  245.         return EOVERFLOW;
  246.    
  247.     /* Encode continuation bytes */
  248.     unsigned int i;
  249.     for (i = cbytes; i > 0; i--) {
  250.         str[*offset + i] = 0x80 | (cc & LO_MASK_32(CONT_BITS));
  251.         cc = cc >> CONT_BITS;
  252.     }
  253.    
  254.     /* Encode first byte */
  255.     str[*offset] = (cc & LO_MASK_32(b0_bits)) | HI_MASK_8(8 - b0_bits - 1);
  256.    
  257.     /* Advance offset */
  258.     *offset += cbytes + 1;
  259.    
  260.     return EOK;
  261. }
  262.  
  /** Get size of string.
   *
   * Count the bytes occupied by the string @a str, not counting the
   * NULL-terminator.
   *
   * @param str NULL-terminated string to measure.
   *
   * @return Number of bytes used by the string.
   *
   */
  size_t str_size(const char *str)
  {
      const char *end = str;

      /* Walk to the terminating zero byte */
      while (*end != 0)
          end++;

      return (size_t) (end - str);
  }
  282.  
  283. /** Get size of wide string.
  284.  *
  285.  * Get the number of bytes which are used by the wide string @a str (excluding the
  286.  * NULL-terminator).
  287.  *
  288.  * @param str Wide string to consider.
  289.  *
  290.  * @return Number of bytes used by the wide string
  291.  *
  292.  */
  293. size_t wstr_size(const wchar_t *str)
  294. {
  295.     return (wstr_length(str) * sizeof(wchar_t));
  296. }
  297.  
  298. /** Get size of string with length limit.
  299.  *
  300.  * Get the number of bytes which are used by up to @a max_len first
  301.  * characters in the string @a str. If @a max_len is greater than
  302.  * the length of @a str, the entire string is measured (excluding the
  303.  * NULL-terminator).
  304.  *
  305.  * @param str     String to consider.
  306.  * @param max_len Maximum number of characters to measure.
  307.  *
  308.  * @return Number of bytes used by the characters.
  309.  *
  310.  */
  311. size_t str_lsize(const char *str, count_t max_len)
  312. {
  313.     count_t len = 0;
  314.     size_t offset = 0;
  315.    
  316.     while (len < max_len) {
  317.         if (str_decode(str, &offset, STR_NO_LIMIT) == 0)
  318.             break;
  319.        
  320.         len++;
  321.     }
  322.    
  323.     return offset;
  324. }
  325.  
  326. /** Get size of wide string with length limit.
  327.  *
  328.  * Get the number of bytes which are used by up to @a max_len first
  329.  * wide characters in the wide string @a str. If @a max_len is greater than
  330.  * the length of @a str, the entire wide string is measured (excluding the
  331.  * NULL-terminator).
  332.  *
  333.  * @param str     Wide string to consider.
  334.  * @param max_len Maximum number of wide characters to measure.
  335.  *
  336.  * @return Number of bytes used by the wide characters.
  337.  *
  338.  */
  339. size_t wstr_lsize(const wchar_t *str, count_t max_len)
  340. {
  341.     return (wstr_nlength(str, max_len * sizeof(wchar_t)) * sizeof(wchar_t));
  342. }
  343.  
  344. /** Get number of characters in a string.
  345.  *
  346.  * @param str NULL-terminated string.
  347.  *
  348.  * @return Number of characters in string.
  349.  *
  350.  */
  351. count_t str_length(const char *str)
  352. {
  353.     count_t len = 0;
  354.     size_t offset = 0;
  355.    
  356.     while (str_decode(str, &offset, STR_NO_LIMIT) != 0)
  357.         len++;
  358.    
  359.     return len;
  360. }
  361.  
  362. /** Get number of characters in a wide string.
  363.  *
  364.  * @param str NULL-terminated wide string.
  365.  *
  366.  * @return Number of characters in @a str.
  367.  *
  368.  */
  369. count_t wstr_length(const wchar_t *wstr)
  370. {
  371.     count_t len = 0;
  372.    
  373.     while (*wstr++ != 0)
  374.         len++;
  375.    
  376.     return len;
  377. }
  378.  
  379. /** Get number of characters in a string with size limit.
  380.  *
  381.  * @param str  NULL-terminated string.
  382.  * @param size Maximum number of bytes to consider.
  383.  *
  384.  * @return Number of characters in string.
  385.  *
  386.  */
  387. count_t str_nlength(const char *str, size_t size)
  388. {
  389.     count_t len = 0;
  390.     size_t offset = 0;
  391.    
  392.     while (str_decode(str, &offset, size) != 0)
  393.         len++;
  394.    
  395.     return len;
  396. }
  397.  
  398. /** Get number of characters in a string with size limit.
  399.  *
  400.  * @param str  NULL-terminated string.
  401.  * @param size Maximum number of bytes to consider.
  402.  *
  403.  * @return Number of characters in string.
  404.  *
  405.  */
  406. count_t wstr_nlength(const wchar_t *str, size_t size)
  407. {
  408.     count_t len = 0;
  409.     count_t limit = ALIGN_DOWN(size, sizeof(wchar_t));
  410.     count_t offset = 0;
  411.    
  412.     while ((offset < limit) && (*str++ != 0)) {
  413.         len++;
  414.         offset += sizeof(wchar_t);
  415.     }
  416.    
  417.     return len;
  418. }
  419.  
  /** Check whether character is plain ASCII.
   *
   * @param ch Character to check.
   *
   * @return True if @a ch lies in the range 0 .. 127.
   *
   */
  bool ascii_check(wchar_t ch)
  {
      return (ch >= 0) && (ch <= 127);
  }
  432.  
  /** Check whether character is valid
   *
   * @param ch Character to check.
   *
   * @return True if @a ch lies in the code point range 0 .. 1114111
   *         (0x10ffff).
   *
   */
  bool chr_check(wchar_t ch)
  {
      return (ch >= 0) && (ch <= 0x10ffff);
  }
  445.  
  446. /** Compare two NULL terminated strings.
  447.  *
  448.  * Do a char-by-char comparison of two NULL-terminated strings.
  449.  * The strings are considered equal iff they consist of the same
  450.  * characters on the minimum of their lengths.
  451.  *
  452.  * @param s1 First string to compare.
  453.  * @param s2 Second string to compare.
  454.  *
  455.  * @return 0 if the strings are equal, -1 if first is smaller,
  456.  *         1 if second smaller.
  457.  *
  458.  */
  459. int str_cmp(const char *s1, const char *s2)
  460. {
  461.     wchar_t c1 = 0;
  462.     wchar_t c2 = 0;
  463.    
  464.     size_t off1 = 0;
  465.     size_t off2 = 0;
  466.  
  467.     while (true) {
  468.         c1 = str_decode(s1, &off1, STR_NO_LIMIT);
  469.         c2 = str_decode(s2, &off2, STR_NO_LIMIT);
  470.  
  471.         if (c1 < c2)
  472.             return -1;
  473.        
  474.         if (c1 > c2)
  475.             return 1;
  476.  
  477.         if (c1 == 0 || c2 == 0)
  478.             break;     
  479.     }
  480.  
  481.     return 0;
  482. }
  483.  
  484. /** Compare two NULL terminated strings with length limit.
  485.  *
  486.  * Do a char-by-char comparison of two NULL-terminated strings.
  487.  * The strings are considered equal iff they consist of the same
  488.  * characters on the minimum of their lengths and the length limit.
  489.  *
  490.  * @param s1      First string to compare.
  491.  * @param s2      Second string to compare.
  492.  * @param max_len Maximum number of characters to consider.
  493.  *
  494.  * @return 0 if the strings are equal, -1 if first is smaller,
  495.  *         1 if second smaller.
  496.  *
  497.  */
  498. int str_lcmp(const char *s1, const char *s2, count_t max_len)
  499. {
  500.     wchar_t c1 = 0;
  501.     wchar_t c2 = 0;
  502.    
  503.     size_t off1 = 0;
  504.     size_t off2 = 0;
  505.    
  506.     count_t len = 0;
  507.  
  508.     while (true) {
  509.         if (len >= max_len)
  510.             break;
  511.  
  512.         c1 = str_decode(s1, &off1, STR_NO_LIMIT);
  513.         c2 = str_decode(s2, &off2, STR_NO_LIMIT);
  514.  
  515.         if (c1 < c2)
  516.             return -1;
  517.  
  518.         if (c1 > c2)
  519.             return 1;
  520.  
  521.         if (c1 == 0 || c2 == 0)
  522.             break;
  523.  
  524.         ++len; 
  525.     }
  526.  
  527.     return 0;
  528.  
  529. }
  530.  
  531. /** Copy NULL-terminated string.
  532.  *
  533.  * Copy source string @a src to destination buffer @a dst.
  534.  * No more than @a size bytes are written. NULL-terminator is always
  535.  * written after the last succesfully copied character (i.e. if the
  536.  * destination buffer is has at least 1 byte, it will be always
  537.  * NULL-terminated).
  538.  *
  539.  * @param src   Source string.
  540.  * @param dst   Destination buffer.
  541.  * @param count Size of the destination buffer.
  542.  *
  543.  */
  544. void str_ncpy(char *dst, const char *src, size_t size)
  545. {
  546.     /* No space for the NULL-terminator in the buffer */
  547.     if (size == 0)
  548.         return;
  549.    
  550.     wchar_t ch;
  551.     size_t str_off = 0;
  552.     size_t dst_off = 0;
  553.    
  554.     while ((ch = str_decode(src, &str_off, STR_NO_LIMIT)) != 0) {
  555.         if (chr_encode(ch, dst, &dst_off, size) != EOK)
  556.             break;
  557.     }
  558.    
  559.     if (dst_off >= size)
  560.         dst[size - 1] = 0;
  561.     else
  562.         dst[dst_off] = 0;
  563. }
  564.  
  565. /** Copy NULL-terminated wide string to string
  566.  *
  567.  * Copy source wide string @a src to destination buffer @a dst.
  568.  * No more than @a size bytes are written. NULL-terminator is always
  569.  * written after the last succesfully copied character (i.e. if the
  570.  * destination buffer is has at least 1 byte, it will be always
  571.  * NULL-terminated).
  572.  *
  573.  * @param src   Source wide string.
  574.  * @param dst   Destination buffer.
  575.  * @param count Size of the destination buffer.
  576.  *
  577.  */
  578. void wstr_nstr(char *dst, const wchar_t *src, size_t size)
  579. {
  580.     /* No space for the NULL-terminator in the buffer */
  581.     if (size == 0)
  582.         return;
  583.    
  584.     wchar_t ch;
  585.     count_t src_idx = 0;
  586.     size_t dst_off = 0;
  587.    
  588.     while ((ch = src[src_idx++]) != 0) {
  589.         if (chr_encode(ch, dst, &dst_off, size) != EOK)
  590.             break;
  591.     }
  592.    
  593.     if (dst_off >= size)
  594.         dst[size - 1] = 0;
  595.     else
  596.         dst[dst_off] = 0;
  597. }
  598.  
  599. /** Find first occurence of character in string.
  600.  *
  601.  * @param str String to search.
  602.  * @param ch  Character to look for.
  603.  *
  604.  * @return Pointer to character in @a str or NULL if not found.
  605.  *
  606.  */
  607. const char *str_chr(const char *str, wchar_t ch)
  608. {
  609.     wchar_t acc;
  610.     size_t off = 0;
  611.    
  612.     while ((acc = str_decode(str, &off, STR_NO_LIMIT)) != 0) {
  613.         if (acc == ch)
  614.             return (str + off);
  615.     }
  616.    
  617.     return NULL;
  618. }
  619.  
  620. /** Insert a wide character into a wide string.
  621.  *
  622.  * Insert a wide character into a wide string at position
  623.  * @a pos. The characters after the position are shifted.
  624.  *
  625.  * @param str     String to insert to.
  626.  * @param ch      Character to insert to.
  627.  * @param pos     Character index where to insert.
  628.  @ @param max_pos Characters in the buffer.
  629.  *
  630.  * @return True if the insertion was sucessful, false if the position
  631.  *         is out of bounds.
  632.  *
  633.  */
  634. bool wstr_linsert(wchar_t *str, wchar_t ch, count_t pos, count_t max_pos)
  635. {
  636.     count_t len = wstr_length(str);
  637.    
  638.     if ((pos > len) || (pos + 1 > max_pos))
  639.         return false;
  640.    
  641.     count_t i;
  642.     for (i = len; i + 1 > pos; i--)
  643.         str[i + 1] = str[i];
  644.    
  645.     str[pos] = ch;
  646.    
  647.     return true;
  648. }
  649.  
  650. /** Remove a wide character from a wide string.
  651.  *
  652.  * Remove a wide character from a wide string at position
  653.  * @a pos. The characters after the position are shifted.
  654.  *
  655.  * @param str String to remove from.
  656.  * @param pos Character index to remove.
  657.  *
  658.  * @return True if the removal was sucessful, false if the position
  659.  *         is out of bounds.
  660.  *
  661.  */
  662. bool wstr_remove(wchar_t *str, count_t pos)
  663. {
  664.     count_t len = wstr_length(str);
  665.    
  666.     if (pos >= len)
  667.         return false;
  668.    
  669.     count_t i;
  670.     for (i = pos + 1; i <= len; i++)
  671.         str[i - 1] = str[i];
  672.    
  673.     return true;
  674. }
  675.  
  676. /** @}
  677.  */
  678.