Subversion Repositories HelenOS

Rev

Rev 4212 | Rev 4224 | Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | Download | RSS feed

  1. /*
  2.  * Copyright (c) 2001-2004 Jakub Jermar
  3.  * All rights reserved.
  4.  *
  5.  * Redistribution and use in source and binary forms, with or without
  6.  * modification, are permitted provided that the following conditions
  7.  * are met:
  8.  *
  9.  * - Redistributions of source code must retain the above copyright
  10.  *   notice, this list of conditions and the following disclaimer.
  11.  * - Redistributions in binary form must reproduce the above copyright
  12.  *   notice, this list of conditions and the following disclaimer in the
  13.  *   documentation and/or other materials provided with the distribution.
  14.  * - The name of the author may not be used to endorse or promote products
  15.  *   derived from this software without specific prior written permission.
  16.  *
  17.  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  18.  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  19.  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  20.  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  21.  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  22.  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  23.  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  24.  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  25.  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  26.  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  27.  */
  28.  
  29. /** @addtogroup generic
  30.  * @{
  31.  */
  32.  
  33. /**
  34.  * @file
  35.  * @brief String functions.
  36.  *
  37.  * Strings and characters use the Universal Character Set (UCS). Standard
  38.  * strings, called simply strings, are encoded in UTF-8. Wide strings (encoded
  39.  * in UTF-32) are supported to a limited degree. A single character is
  40.  * represented as wchar_t.@n
  41.  *
  42.  * Overview of the terminology:@n
  43.  *
  44.  *  Term                  Meaning
  45.  *  --------------------  ----------------------------------------------------
  46.  *  byte                  8 bits stored in uint8_t (unsigned 8 bit integer)
  47.  *
  48.  *  character             UTF-32 encoded Unicode character, stored in wchar_t
  49.  *                        (signed 32 bit integer), code points 0 .. 1114111
  50.  *                        are valid
  51.  *
  52.  *  ASCII character       7 bit encoded ASCII character, stored in char
  53.  *                        (usually signed 8 bit integer), code points 0 .. 127
  54.  *                        are valid
  55.  *
  56.  *  string                UTF-8 encoded NULL-terminated Unicode string, char *
  57.  *
  58.  *  wide string           UTF-32 encoded NULL-terminated Unicode string,
  59.  *                        wchar_t *
  60.  *
  61.  *  [wide] string size    number of BYTES in a [wide] string (excluding
  62.  *                        the NULL-terminator), size_t
  63.  *
  64.  *  [wide] string length  number of CHARACTERS in a [wide] string (excluding
  65.  *                        the NULL-terminator), count_t
  66.  *
  67.  *  [wide] string width   number of display cells on a monospace display taken
  68.  *                        by a [wide] string, count_t
  69.  *
  70.  *
  71.  * Overview of string metrics:@n
  72.  *
  73.  *  Metric  Abbrev.  Type     Meaning
  74.  *  ------  ------   ------   -------------------------------------------------
  75.  *  size    n        size_t   number of BYTES in a string (excluding the
  76.  *                            NULL-terminator)
  77.  *
  78.  *  length  l        count_t  number of CHARACTERS in a string (excluding the
  79.  *                            NULL-terminator)
  80.  *
  81.  *  width   w        count_t  number of display cells on a monospace display
  82.  *                            taken by a string
  83.  *
  84.  *
  85.  * Function naming prefixes:@n
  86.  *
  87.  *  chr_    operate on characters
  88.  *  ascii_  operate on ASCII characters
  89.  *  str_    operate on strings
  90.  *  wstr_   operate on wide strings
  91.  *
  92.  *  [w]str_[n|l|w]  operate on a prefix limited by size, length
  93.  *                  or width
  94.  *
  95.  *
  96.  * A specific character inside a [wide] string can be referred to by:@n
  97.  *
  98.  *  pointer (char *, wchar_t *)
  99.  *  byte offset (size_t)
  100.  *  character index (count_t)
  101.  *
  102.  */
  103.  
  104. #include <string.h>
  105. #include <print.h>
  106. #include <cpu.h>
  107. #include <arch/asm.h>
  108. #include <arch.h>
  109. #include <errno.h>
  110. #include <align.h>
  111.  
  112. /** Byte mask consisting of lowest @n bits (out of 8) */
  113. #define LO_MASK_8(n)  ((uint8_t) ((1 << (n)) - 1))
  114.  
  115. /** Byte mask consisting of lowest @n bits (out of 32) */
  116. #define LO_MASK_32(n)  ((uint32_t) ((1 << (n)) - 1))
  117.  
  118. /** Byte mask consisting of highest @n bits (out of 8) */
  119. #define HI_MASK_8(n)  (~LO_MASK_8(8 - (n)))
  120.  
  121. /** Number of data bits in a UTF-8 continuation byte */
  122. #define CONT_BITS  6
  123.  
  124. /** Decode a single character from a string.
  125.  *
  126.  * Decode a single character from a string of size @a size. Decoding starts
  127.  * at @a offset and this offset is moved to the beginning of the next
  128.  * character. In case of decoding error, offset generally advances at least
  129.  * by one. However, offset is never moved beyond size.
  130.  *
  131.  * @param str    String (not necessarily NULL-terminated).
  132.  * @param offset Byte offset in string where to start decoding.
  133.  * @param size   Size of the string (in bytes).
  134.  *
  135.  * @return Value of decoded character, U_SPECIAL on decoding error or
  136.  *         NULL if attempt to decode beyond @a size.
  137.  *
  138.  */
  139. wchar_t str_decode(const char *str, size_t *offset, size_t size)
  140. {
  141.     if (*offset + 1 > size)
  142.         return 0;
  143.    
  144.     /* First byte read from string */
  145.     uint8_t b0 = (uint8_t) str[(*offset)++];
  146.    
  147.     /* Determine code length */
  148.    
  149.     unsigned int b0_bits;  /* Data bits in first byte */
  150.     unsigned int cbytes;   /* Number of continuation bytes */
  151.    
  152.     if ((b0 & 0x80) == 0) {
  153.         /* 0xxxxxxx (Plain ASCII) */
  154.         b0_bits = 7;
  155.         cbytes = 0;
  156.     } else if ((b0 & 0xe0) == 0xc0) {
  157.         /* 110xxxxx 10xxxxxx */
  158.         b0_bits = 5;
  159.         cbytes = 1;
  160.     } else if ((b0 & 0xf0) == 0xe0) {
  161.         /* 1110xxxx 10xxxxxx 10xxxxxx */
  162.         b0_bits = 4;
  163.         cbytes = 2;
  164.     } else if ((b0 & 0xf8) == 0xf0) {
  165.         /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
  166.         b0_bits = 3;
  167.         cbytes = 3;
  168.     } else {
  169.         /* 10xxxxxx -- unexpected continuation byte */
  170.         return U_SPECIAL;
  171.     }
  172.    
  173.     if (*offset + cbytes > size)
  174.         return U_SPECIAL;
  175.    
  176.     wchar_t ch = b0 & LO_MASK_8(b0_bits);
  177.    
  178.     /* Decode continuation bytes */
  179.     while (cbytes > 0) {
  180.         uint8_t b = (uint8_t) str[(*offset)++];
  181.        
  182.         /* Must be 10xxxxxx */
  183.         if ((b & 0xc0) != 0x80)
  184.             return U_SPECIAL;
  185.        
  186.         /* Shift data bits to ch */
  187.         ch = (ch << CONT_BITS) | (wchar_t) (b & LO_MASK_8(CONT_BITS));
  188.         cbytes--;
  189.     }
  190.    
  191.     return ch;
  192. }
  193.  
  194. /** Encode a single character to string representation.
  195.  *
  196.  * Encode a single character to string representation (i.e. UTF-8) and store
  197.  * it into a buffer at @a offset. Encoding starts at @a offset and this offset
  198.  * is moved to the position where the next character can be written to.
  199.  *
  200.  * @param ch     Input character.
  201.  * @param str    Output buffer.
  202.  * @param offset Byte offset where to start writing.
  203.  * @param size   Size of the output buffer (in bytes).
  204.  *
  205.  * @return EOK if the character was encoded successfully, EOVERFLOW if there
  206.  *     was not enough space in the output buffer or EINVAL if the character
  207.  *     code was invalid.
  208.  */
  209. int chr_encode(const wchar_t ch, char *str, size_t *offset, size_t size)
  210. {
  211.     if (*offset >= size)
  212.         return EOVERFLOW;
  213.    
  214.     if (!chr_check(ch))
  215.         return EINVAL;
  216.    
  217.     /* Unsigned version of ch (bit operations should only be done
  218.        on unsigned types). */
  219.     uint32_t cc = (uint32_t) ch;
  220.    
  221.     /* Determine how many continuation bytes are needed */
  222.    
  223.     unsigned int b0_bits;  /* Data bits in first byte */
  224.     unsigned int cbytes;   /* Number of continuation bytes */
  225.    
  226.     if ((cc & ~LO_MASK_32(7)) == 0) {
  227.         b0_bits = 7;
  228.         cbytes = 0;
  229.     } else if ((cc & ~LO_MASK_32(11)) == 0) {
  230.         b0_bits = 5;
  231.         cbytes = 1;
  232.     } else if ((cc & ~LO_MASK_32(16)) == 0) {
  233.         b0_bits = 4;
  234.         cbytes = 2;
  235.     } else if ((cc & ~LO_MASK_32(21)) == 0) {
  236.         b0_bits = 3;
  237.         cbytes = 3;
  238.     } else {
  239.         /* Codes longer than 21 bits are not supported */
  240.         return EINVAL;
  241.     }
  242.    
  243.     /* Check for available space in buffer */
  244.     if (*offset + cbytes >= size)
  245.         return EOVERFLOW;
  246.    
  247.     /* Encode continuation bytes */
  248.     unsigned int i;
  249.     for (i = cbytes; i > 0; i--) {
  250.         str[*offset + i] = 0x80 | (cc & LO_MASK_32(CONT_BITS));
  251.         cc = cc >> CONT_BITS;
  252.     }
  253.    
  254.     /* Encode first byte */
  255.     str[*offset] = (cc & LO_MASK_32(b0_bits)) | HI_MASK_8(8 - b0_bits - 1);
  256.    
  257.     /* Advance offset */
  258.     *offset += cbytes + 1;
  259.    
  260.     return EOK;
  261. }
  262.  
  263. /** Get size of string.
  264.  *
  265.  * Get the number of bytes which are used by the string @a str (excluding the
  266.  * NULL-terminator).
  267.  *
  268.  * @param str String to consider.
  269.  *
  270.  * @return Number of bytes used by the string
  271.  *
  272.  */
  273. size_t str_size(const char *str)
  274. {
  275.     size_t size = 0;
  276.    
  277.     while (*str++ != 0)
  278.         size++;
  279.    
  280.     return size;
  281. }
  282.  
  283. /** Get size of wide string.
  284.  *
  285.  * Get the number of bytes which are used by the wide string @a str (excluding the
  286.  * NULL-terminator).
  287.  *
  288.  * @param str Wide string to consider.
  289.  *
  290.  * @return Number of bytes used by the wide string
  291.  *
  292.  */
  293. size_t wstr_size(const wchar_t *str)
  294. {
  295.     return (wstr_length(str) * sizeof(wchar_t));
  296. }
  297.  
  298. /** Get size of string with length limit.
  299.  *
  300.  * Get the number of bytes which are used by up to @a max_len first
  301.  * characters in the string @a str. If @a max_len is greater than
  302.  * the length of @a str, the entire string is measured (excluding the
  303.  * NULL-terminator).
  304.  *
  305.  * @param str     String to consider.
  306.  * @param max_len Maximum number of characters to measure.
  307.  *
  308.  * @return Number of bytes used by the characters.
  309.  *
  310.  */
  311. size_t str_lsize(const char *str, count_t max_len)
  312. {
  313.     count_t len = 0;
  314.     size_t offset = 0;
  315.    
  316.     while (len < max_len) {
  317.         if (str_decode(str, &offset, STR_NO_LIMIT) == 0)
  318.             break;
  319.        
  320.         len++;
  321.     }
  322.    
  323.     return offset;
  324. }
  325.  
  326. /** Get size of wide string with length limit.
  327.  *
  328.  * Get the number of bytes which are used by up to @a max_len first
  329.  * wide characters in the wide string @a str. If @a max_len is greater than
  330.  * the length of @a str, the entire wide string is measured (excluding the
  331.  * NULL-terminator).
  332.  *
  333.  * @param str     Wide string to consider.
  334.  * @param max_len Maximum number of wide characters to measure.
  335.  *
  336.  * @return Number of bytes used by the wide characters.
  337.  *
  338.  */
  339. size_t wstr_lsize(const wchar_t *str, count_t max_len)
  340. {
  341.     return (wstr_nlength(str, max_len * sizeof(wchar_t)) * sizeof(wchar_t));
  342. }
  343.  
  344. /** Get number of characters in a string.
  345.  *
  346.  * @param str NULL-terminated string.
  347.  *
  348.  * @return Number of characters in string.
  349.  *
  350.  */
  351. count_t str_length(const char *str)
  352. {
  353.     count_t len = 0;
  354.     size_t offset = 0;
  355.    
  356.     while (str_decode(str, &offset, STR_NO_LIMIT) != 0)
  357.         len++;
  358.    
  359.     return len;
  360. }
  361.  
  362. /** Get number of characters in a wide string.
  363.  *
  364.  * @param str NULL-terminated wide string.
  365.  *
  366.  * @return Number of characters in @a str.
  367.  *
  368.  */
  369. count_t wstr_length(const wchar_t *wstr)
  370. {
  371.     count_t len = 0;
  372.    
  373.     while (*wstr++ != 0)
  374.         len++;
  375.    
  376.     return len;
  377. }
  378.  
  379. /** Get number of characters in a string with size limit.
  380.  *
  381.  * @param str  NULL-terminated string.
  382.  * @param size Maximum number of bytes to consider.
  383.  *
  384.  * @return Number of characters in string.
  385.  *
  386.  */
  387. count_t str_nlength(const char *str, size_t size)
  388. {
  389.     count_t len = 0;
  390.     size_t offset = 0;
  391.    
  392.     while (str_decode(str, &offset, size) != 0)
  393.         len++;
  394.    
  395.     return len;
  396. }
  397.  
  398. /** Get number of characters in a string with size limit.
  399.  *
  400.  * @param str  NULL-terminated string.
  401.  * @param size Maximum number of bytes to consider.
  402.  *
  403.  * @return Number of characters in string.
  404.  *
  405.  */
  406. count_t wstr_nlength(const wchar_t *str, size_t size)
  407. {
  408.     count_t len = 0;
  409.     count_t limit = ALIGN_DOWN(size, sizeof(wchar_t));
  410.     count_t offset = 0;
  411.    
  412.     while ((offset < limit) && (*str++ != 0)) {
  413.         len++;
  414.         offset += sizeof(wchar_t);
  415.     }
  416.    
  417.     return len;
  418. }
  419.  
  420. /** Check whether character is plain ASCII.
  421.  *
  422.  * @return True if character is plain ASCII.
  423.  *
  424.  */
  425. bool ascii_check(const wchar_t ch)
  426. {
  427.     if ((ch >= 0) && (ch <= 127))
  428.         return true;
  429.    
  430.     return false;
  431. }
  432.  
  433. /** Check whether character is valid
  434.  *
  435.  * @return True if character is a valid Unicode code point.
  436.  *
  437.  */
  438. bool chr_check(const wchar_t ch)
  439. {
  440.     if ((ch >= 0) && (ch <= 1114111))
  441.         return true;
  442.    
  443.     return false;
  444. }
  445.  
  446. /** Compare two NULL terminated strings.
  447.  *
  448.  * Do a char-by-char comparison of two NULL-terminated strings.
  449.  * The strings are considered equal iff they consist of the same
  450.  * characters on the minimum of their lengths.
  451.  *
  452.  * @param s1 First string to compare.
  453.  * @param s2 Second string to compare.
  454.  *
  455.  * @return 0 if the strings are equal, -1 if first is smaller,
  456.  *         1 if second smaller.
  457.  *
  458.  */
  459. int str_cmp(const char *s1, const char *s2)
  460. {
  461.     wchar_t c1;
  462.     wchar_t c2;
  463.    
  464.     size_t off1 = 0;
  465.     size_t off2 = 0;
  466.    
  467.     while ((c1 = str_decode(s1, &off1, STR_NO_LIMIT) != 0)
  468.         && (c2 = str_decode(s2, &off2, STR_NO_LIMIT) != 0)) {
  469.        
  470.         if (off1 != off2)
  471.             break;
  472.        
  473.         if (c1 < c2)
  474.             return -1;
  475.        
  476.         if (c1 > c2)
  477.             return 1;
  478.     }
  479.    
  480.     if ((off1 == off2) && (c1 == c2))
  481.         return 0;
  482.    
  483.     if ((c1 == 0) || (off1 < off2))
  484.         return -1;
  485.    
  486.     return 1;
  487. }
  488.  
  489. /** Compare two NULL terminated strings with length limit.
  490.  *
  491.  * Do a char-by-char comparison of two NULL-terminated strings.
  492.  * The strings are considered equal iff they consist of the same
  493.  * characters on the minimum of their lengths and the length limit.
  494.  *
  495.  * @param s1      First string to compare.
  496.  * @param s2      Second string to compare.
  497.  * @param max_len Maximum number of characters to consider.
  498.  *
  499.  * @return 0 if the strings are equal, -1 if first is smaller,
  500.  *         1 if second smaller.
  501.  *
  502.  */
  503. int str_lcmp(const char *s1, const char *s2, count_t max_len)
  504. {
  505.     wchar_t c1 = 0;
  506.     wchar_t c2 = 0;
  507.    
  508.     size_t off1 = 0;
  509.     size_t off2 = 0;
  510.    
  511.     count_t len = 0;
  512.    
  513.     while ((len < max_len)
  514.         && ((c1 = str_decode(s1, &off1, STR_NO_LIMIT)) != 0)
  515.         && ((c2 = str_decode(s2, &off2, STR_NO_LIMIT)) != 0)) {
  516.        
  517.         if (off1 != off2)
  518.             break;
  519.        
  520.         if (c1 < c2)
  521.             return -1;
  522.        
  523.         if (c1 > c2)
  524.             return 1;
  525.        
  526.         len++;
  527.     }
  528.    
  529.     if ((off1 == off2) && (len == max_len) && (c1 == c2))
  530.         return 0;
  531.    
  532.     if ((c1 == 0) || (off1 < off2))
  533.         return -1;
  534.    
  535.     return 1;
  536. }
  537.  
  538. /** Copy NULL-terminated string.
  539.  *
  540.  * Copy source string @a src to destination buffer @a dst.
  541.  * No more than @a size bytes are written. NULL-terminator is always
  542.  * written after the last succesfully copied character (i.e. if the
  543.  * destination buffer is has at least 1 byte, it will be always
  544.  * NULL-terminated).
  545.  *
  546.  * @param src   Source string.
  547.  * @param dst   Destination buffer.
  548.  * @param count Size of the destination buffer.
  549.  *
  550.  */
  551. void str_ncpy(char *dst, const char *src, size_t size)
  552. {
  553.     /* No space for the NULL-terminator in the buffer */
  554.     if (size == 0)
  555.         return;
  556.    
  557.     wchar_t ch;
  558.     size_t str_off = 0;
  559.     size_t dst_off = 0;
  560.    
  561.     while ((ch = str_decode(src, &str_off, STR_NO_LIMIT) != 0)) {
  562.         if (chr_encode(ch, dst, &dst_off, size) != EOK)
  563.             break;
  564.     }
  565.    
  566.     if (dst_off >= size)
  567.         dst[size - 1] = 0;
  568.     else
  569.         dst[dst_off] = 0;
  570. }
  571.  
  572. /** Copy NULL-terminated wide string to string
  573.  *
  574.  * Copy source wide string @a src to destination buffer @a dst.
  575.  * No more than @a size bytes are written. NULL-terminator is always
  576.  * written after the last succesfully copied character (i.e. if the
  577.  * destination buffer is has at least 1 byte, it will be always
  578.  * NULL-terminated).
  579.  *
  580.  * @param src   Source wide string.
  581.  * @param dst   Destination buffer.
  582.  * @param count Size of the destination buffer.
  583.  *
  584.  */
  585. void wstr_nstr(char *dst, const wchar_t *src, size_t size)
  586. {
  587.     /* No space for the NULL-terminator in the buffer */
  588.     if (size == 0)
  589.         return;
  590.    
  591.     wchar_t ch;
  592.     count_t src_idx = 0;
  593.     size_t dst_off = 0;
  594.    
  595.     while ((ch = src[src_idx++]) != 0) {
  596.         if (chr_encode(ch, dst, &dst_off, size) != EOK)
  597.             break;
  598.     }
  599.    
  600.     if (dst_off >= size)
  601.         dst[size - 1] = 0;
  602.     else
  603.         dst[dst_off] = 0;
  604. }
  605.  
  606. /** Find first occurence of character in string.
  607.  *
  608.  * @param str String to search.
  609.  * @param ch  Character to look for.
  610.  *
  611.  * @return Pointer to character in @a str or NULL if not found.
  612.  *
  613.  */
  614. const char *str_chr(const char *str, wchar_t ch)
  615. {
  616.     wchar_t acc;
  617.     size_t off = 0;
  618.    
  619.     while ((acc = str_decode(str, &off, STR_NO_LIMIT) != 0)) {
  620.         if (acc == ch)
  621.             return (str + off);
  622.     }
  623.    
  624.     return NULL;
  625. }
  626.  
  627. /** Insert a wide character into a wide string.
  628.  *
  629.  * Insert a wide character into a wide string at position
  630.  * @a pos. The characters after the position are shifted.
  631.  *
  632.  * @param str     String to insert to.
  633.  * @param ch      Character to insert to.
  634.  * @param pos     Character index where to insert.
  635.  @ @param max_pos Characters in the buffer.
  636.  *
  637.  * @return True if the insertion was sucessful, false if the position
  638.  *         is out of bounds.
  639.  *
  640.  */
  641. bool wstr_linsert(wchar_t *str, wchar_t ch, count_t pos, count_t max_pos)
  642. {
  643.     count_t len = wstr_length(str);
  644.    
  645.     if ((pos > len) || (pos + 1 > max_pos))
  646.         return false;
  647.    
  648.     count_t i;
  649.     for (i = len; i + 1 > pos; i--)
  650.         str[i + 1] = str[i];
  651.    
  652.     str[pos] = ch;
  653.    
  654.     return true;
  655. }
  656.  
  657. /** Remove a wide character from a wide string.
  658.  *
  659.  * Remove a wide character from a wide string at position
  660.  * @a pos. The characters after the position are shifted.
  661.  *
  662.  * @param str String to remove from.
  663.  * @param pos Character index to remove.
  664.  *
  665.  * @return True if the removal was sucessful, false if the position
  666.  *         is out of bounds.
  667.  *
  668.  */
  669. bool wstr_remove(wchar_t *str, count_t pos)
  670. {
  671.     count_t len = wstr_length(str);
  672.    
  673.     if (pos >= len)
  674.         return false;
  675.    
  676.     count_t i;
  677.     for (i = pos + 1; i <= len; i++)
  678.         str[i - 1] = str[i];
  679.    
  680.     return true;
  681. }
  682.  
  683. /** @}
  684.  */
  685.