32,7 → 32,73 |
|
/** |
* @file |
* @brief Miscellaneous functions. |
* @brief String functions. |
* |
 * Strings and characters use the Universal Character Set (UCS). Standard
 * strings (referred to simply as "strings") are encoded in UTF-8. Wide
 * strings (encoded in UTF-32) are supported to a limited degree. A single
 * character is represented as wchar_t.@n
* |
* Overview of the terminology:@n |
* |
* Term Meaning |
* -------------------- ---------------------------------------------------- |
* byte 8 bits stored in uint8_t (unsigned 8 bit integer) |
* |
* character UTF-32 encoded Unicode character, stored in wchar_t |
* (signed 32 bit integer), code points 0 .. 1114111 |
* are valid |
* |
* ASCII character 7 bit encoded ASCII character, stored in char |
* (usually signed 8 bit integer), code points 0 .. 127 |
* are valid |
* |
* string UTF-8 encoded NULL-terminated Unicode string, char * |
* |
* wide string UTF-32 encoded NULL-terminated Unicode string, |
* wchar_t * |
* |
* [wide] string size number of BYTES in a [wide] string (excluding |
* the NULL-terminator), size_t |
* |
* [wide] string length number of CHARACTERS in a [wide] string (excluding |
* the NULL-terminator), count_t |
* |
* [wide] string width number of display cells on a monospace display taken |
* by a [wide] string, count_t |
* |
* |
* Overview of string metrics:@n |
* |
* Metric Abbrev. Type Meaning |
* ------ ------ ------ ------------------------------------------------- |
* size n size_t number of BYTES in a string (excluding the |
* NULL-terminator) |
* |
 * length l count_t number of CHARACTERS in a string (excluding the
 * NULL-terminator)
* |
* width w count_t number of display cells on a monospace display |
* taken by a string |
* |
* |
* Function naming prefixes:@n |
* |
* chr_ operate on characters |
* ascii_ operate on ASCII characters |
* str_ operate on strings |
* wstr_ operate on wide strings |
* |
* [w]str_[n|l|w] operate on a prefix limited by size, length |
* or width |
* |
* |
* A specific character inside a [wide] string can be referred to by:@n |
* |
* pointer (char *, wchar_t *) |
* byte offset (size_t) |
* character index (count_t) |
* |
*/ |
|
#include <string.h> |
40,131 → 106,572 |
#include <cpu.h> |
#include <arch/asm.h> |
#include <arch.h> |
#include <console/kconsole.h> |
#include <errno.h> |
#include <align.h> |
|
/** Byte mask consisting of lowest @a n bits (out of 8), 0 <= n <= 8.
 *
 * The shift is done on an unsigned operand so that it never touches
 * the sign bit of an int.
 */
#define LO_MASK_8(n)  ((uint8_t) ((1U << (n)) - 1))

/** Mask consisting of lowest @a n bits (out of 32), 0 <= n < 32. */
#define LO_MASK_32(n)  ((uint32_t) (((uint32_t) 1 << (n)) - 1))

/** Byte mask consisting of highest @a n bits (out of 8), 0 <= n <= 8.
 *
 * The complement is evaluated in (promoted) int, so the result is cast
 * back to uint8_t to drop the set bits above bit 7.
 */
#define HI_MASK_8(n)  ((uint8_t) ~LO_MASK_8(8 - (n)))

/** Number of data bits in a UTF-8 continuation byte */
#define CONT_BITS  6
|
/** Decode a single character from a string. |
* |
* @param str NULL terminated string. |
* Decode a single character from a string of size @a size. Decoding starts |
* at @a offset and this offset is moved to the beginning of the next |
* character. In case of decoding error, offset generally advances at least |
* by one. However, offset is never moved beyond size. |
* |
* @return Number of characters in str. |
* @param str String (not necessarily NULL-terminated). |
* @param offset Byte offset in string where to start decoding. |
* @param size Size of the string (in bytes). |
* |
* @return Value of decoded character, U_SPECIAL on decoding error or |
* NULL if attempt to decode beyond @a size. |
* |
*/ |
size_t strlen(const char *str) |
wchar_t str_decode(const char *str, size_t *offset, size_t size) |
{ |
int i; |
if (*offset + 1 > size) |
return 0; |
|
for (i = 0; str[i]; i++); |
/* First byte read from string */ |
uint8_t b0 = (uint8_t) str[(*offset)++]; |
|
return i; |
/* Determine code length */ |
|
unsigned int b0_bits; /* Data bits in first byte */ |
unsigned int cbytes; /* Number of continuation bytes */ |
|
if ((b0 & 0x80) == 0) { |
/* 0xxxxxxx (Plain ASCII) */ |
b0_bits = 7; |
cbytes = 0; |
} else if ((b0 & 0xe0) == 0xc0) { |
/* 110xxxxx 10xxxxxx */ |
b0_bits = 5; |
cbytes = 1; |
} else if ((b0 & 0xf0) == 0xe0) { |
/* 1110xxxx 10xxxxxx 10xxxxxx */ |
b0_bits = 4; |
cbytes = 2; |
} else if ((b0 & 0xf8) == 0xf0) { |
/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ |
b0_bits = 3; |
cbytes = 3; |
} else { |
/* 10xxxxxx -- unexpected continuation byte */ |
return U_SPECIAL; |
} |
|
if (*offset + cbytes > size) |
return U_SPECIAL; |
|
wchar_t ch = b0 & LO_MASK_8(b0_bits); |
|
/* Decode continuation bytes */ |
while (cbytes > 0) { |
uint8_t b = (uint8_t) str[(*offset)++]; |
|
/* Must be 10xxxxxx */ |
if ((b & 0xc0) != 0x80) |
return U_SPECIAL; |
|
/* Shift data bits to ch */ |
ch = (ch << CONT_BITS) | (wchar_t) (b & LO_MASK_8(CONT_BITS)); |
cbytes--; |
} |
|
return ch; |
} |
|
/** Compare two NULL terminated strings |
/** Encode a single character to string representation. |
* |
* Do a char-by-char comparison of two NULL terminated strings. |
* The strings are considered equal iff they consist of the same |
* characters on the minimum of their lengths. |
* Encode a single character to string representation (i.e. UTF-8) and store |
* it into a buffer at @a offset. Encoding starts at @a offset and this offset |
* is moved to the position where the next character can be written to. |
* |
* @param src First string to compare. |
* @param dst Second string to compare. |
* @param ch Input character. |
* @param str Output buffer. |
* @param offset Byte offset where to start writing. |
* @param size Size of the output buffer (in bytes). |
* |
* @return 0 if the strings are equal, -1 if first is smaller, 1 if second smaller. |
* @return EOK if the character was encoded successfully, EOVERFLOW if there |
* was not enough space in the output buffer or EINVAL if the character |
* code was invalid. |
*/ |
int chr_encode(wchar_t ch, char *str, size_t *offset, size_t size) |
{ |
if (*offset >= size) |
return EOVERFLOW; |
|
if (!chr_check(ch)) |
return EINVAL; |
|
/* Unsigned version of ch (bit operations should only be done |
on unsigned types). */ |
uint32_t cc = (uint32_t) ch; |
|
/* Determine how many continuation bytes are needed */ |
|
unsigned int b0_bits; /* Data bits in first byte */ |
unsigned int cbytes; /* Number of continuation bytes */ |
|
if ((cc & ~LO_MASK_32(7)) == 0) { |
b0_bits = 7; |
cbytes = 0; |
} else if ((cc & ~LO_MASK_32(11)) == 0) { |
b0_bits = 5; |
cbytes = 1; |
} else if ((cc & ~LO_MASK_32(16)) == 0) { |
b0_bits = 4; |
cbytes = 2; |
} else if ((cc & ~LO_MASK_32(21)) == 0) { |
b0_bits = 3; |
cbytes = 3; |
} else { |
/* Codes longer than 21 bits are not supported */ |
return EINVAL; |
} |
|
/* Check for available space in buffer */ |
if (*offset + cbytes >= size) |
return EOVERFLOW; |
|
/* Encode continuation bytes */ |
unsigned int i; |
for (i = cbytes; i > 0; i--) { |
str[*offset + i] = 0x80 | (cc & LO_MASK_32(CONT_BITS)); |
cc = cc >> CONT_BITS; |
} |
|
/* Encode first byte */ |
str[*offset] = (cc & LO_MASK_32(b0_bits)) | HI_MASK_8(8 - b0_bits - 1); |
|
/* Advance offset */ |
*offset += cbytes + 1; |
|
return EOK; |
} |
|
/** Get size of string.
 *
 * Get the number of bytes which are used by the string @a str (excluding the
 * NULL-terminator).
 *
 * @param str String to consider.
 *
 * @return Number of bytes used by the string
 *
 */
size_t str_size(const char *str)
{
	size_t size = 0;
	
	/* Bytes are counted as-is; multi-byte characters contribute
	   every one of their bytes. */
	while (str[size] != 0)
		size++;
	
	return size;
}
|
/** Get size of wide string. |
* |
* Get the number of bytes which are used by the wide string @a str (excluding the |
* NULL-terminator). |
* |
* @param str Wide string to consider. |
* |
* @return Number of bytes used by the wide string |
* |
*/ |
size_t wstr_size(const wchar_t *str) |
{ |
return (wstr_length(str) * sizeof(wchar_t)); |
} |
|
/** Get size of string with length limit. |
* |
* Get the number of bytes which are used by up to @a max_len first |
* characters in the string @a str. If @a max_len is greater than |
* the length of @a str, the entire string is measured (excluding the |
* NULL-terminator). |
* |
* @param str String to consider. |
* @param max_len Maximum number of characters to measure. |
* |
* @return Number of bytes used by the characters. |
* |
*/ |
size_t str_lsize(const char *str, count_t max_len) |
{ |
count_t len = 0; |
size_t offset = 0; |
|
while (len < max_len) { |
if (str_decode(str, &offset, STR_NO_LIMIT) == 0) |
break; |
|
len++; |
} |
if (*src == *dst) |
return 0; |
|
if (!*src) |
return -1; |
return offset; |
} |
|
/** Get size of wide string with length limit. |
* |
* Get the number of bytes which are used by up to @a max_len first |
* wide characters in the wide string @a str. If @a max_len is greater than |
* the length of @a str, the entire wide string is measured (excluding the |
* NULL-terminator). |
* |
* @param str Wide string to consider. |
* @param max_len Maximum number of wide characters to measure. |
* |
* @return Number of bytes used by the wide characters. |
* |
*/ |
size_t wstr_lsize(const wchar_t *str, count_t max_len) |
{ |
return (wstr_nlength(str, max_len * sizeof(wchar_t)) * sizeof(wchar_t)); |
} |
|
/** Get number of characters in a string. |
* |
* @param str NULL-terminated string. |
* |
* @return Number of characters in string. |
* |
*/ |
count_t str_length(const char *str) |
{ |
count_t len = 0; |
size_t offset = 0; |
|
return 1; |
while (str_decode(str, &offset, STR_NO_LIMIT) != 0) |
len++; |
|
return len; |
} |
|
/** Get number of characters in a wide string. |
* |
* @param str NULL-terminated wide string. |
* |
* @return Number of characters in @a str. |
* |
*/ |
count_t wstr_length(const wchar_t *wstr) |
{ |
count_t len = 0; |
|
while (*wstr++ != 0) |
len++; |
|
return len; |
} |
|
/** Compare two NULL terminated strings |
/** Get number of characters in a string with size limit. |
* |
* Do a char-by-char comparison of two NULL terminated strings. |
* @param str NULL-terminated string. |
* @param size Maximum number of bytes to consider. |
* |
* @return Number of characters in string. |
* |
*/ |
count_t str_nlength(const char *str, size_t size) |
{ |
count_t len = 0; |
size_t offset = 0; |
|
while (str_decode(str, &offset, size) != 0) |
len++; |
|
return len; |
} |
|
/** Get number of characters in a string with size limit. |
* |
* @param str NULL-terminated string. |
* @param size Maximum number of bytes to consider. |
* |
* @return Number of characters in string. |
* |
*/ |
count_t wstr_nlength(const wchar_t *str, size_t size) |
{ |
count_t len = 0; |
count_t limit = ALIGN_DOWN(size, sizeof(wchar_t)); |
count_t offset = 0; |
|
while ((offset < limit) && (*str++ != 0)) { |
len++; |
offset += sizeof(wchar_t); |
} |
|
return len; |
} |
|
/** Check whether character is plain ASCII.
 *
 * @param ch Character to test.
 *
 * @return True if @a ch lies in the range 0 .. 127.
 *
 */
bool ascii_check(wchar_t ch)
{
	return (ch >= 0) && (ch <= 127);
}
|
/** Check whether character is valid
 *
 * @param ch Character to test.
 *
 * @return True if @a ch lies in the range 0 .. 1114111 (0x10ffff).
 *
 */
bool chr_check(wchar_t ch)
{
	return (ch >= 0) && (ch <= 1114111);
}
|
/** Compare two NULL terminated strings. |
* |
* Do a char-by-char comparison of two NULL-terminated strings. |
* The strings are considered equal iff they consist of the same |
* characters on the minimum of their lengths and specified maximal |
* length. |
* characters on the minimum of their lengths. |
* |
* @param src First string to compare. |
* @param dst Second string to compare. |
* @param len Maximal length for comparison. |
* @param s1 First string to compare. |
* @param s2 Second string to compare. |
* |
* @return 0 if the strings are equal, -1 if first is smaller, 1 if second smaller. |
* @return 0 if the strings are equal, -1 if first is smaller, |
* 1 if second smaller. |
* |
*/ |
int strncmp(const char *src, const char *dst, size_t len) |
int str_cmp(const char *s1, const char *s2) |
{ |
unsigned int i; |
wchar_t c1 = 0; |
wchar_t c2 = 0; |
|
for (i = 0; (*src) && (*dst) && (i < len); src++, dst++, i++) { |
if (*src < *dst) |
size_t off1 = 0; |
size_t off2 = 0; |
|
while (true) { |
c1 = str_decode(s1, &off1, STR_NO_LIMIT); |
c2 = str_decode(s2, &off2, STR_NO_LIMIT); |
|
if (c1 < c2) |
return -1; |
|
if (*src > *dst) |
if (c1 > c2) |
return 1; |
|
if (c1 == 0 || c2 == 0) |
break; |
} |
|
return 0; |
} |
|
/** Compare two NULL terminated strings with length limit. |
* |
* Do a char-by-char comparison of two NULL-terminated strings. |
* The strings are considered equal iff they consist of the same |
* characters on the minimum of their lengths and the length limit. |
* |
* @param s1 First string to compare. |
* @param s2 Second string to compare. |
* @param max_len Maximum number of characters to consider. |
* |
* @return 0 if the strings are equal, -1 if first is smaller, |
* 1 if second smaller. |
* |
*/ |
int str_lcmp(const char *s1, const char *s2, count_t max_len) |
{ |
wchar_t c1 = 0; |
wchar_t c2 = 0; |
|
if (i == len || *src == *dst) |
return 0; |
size_t off1 = 0; |
size_t off2 = 0; |
|
if (!*src) |
return -1; |
|
return 1; |
} |
count_t len = 0; |
|
while (true) { |
if (len >= max_len) |
break; |
|
c1 = str_decode(s1, &off1, STR_NO_LIMIT); |
c2 = str_decode(s2, &off2, STR_NO_LIMIT); |
|
/** Copy NULL terminated string. |
if (c1 < c2) |
return -1; |
|
if (c1 > c2) |
return 1; |
|
if (c1 == 0 || c2 == 0) |
break; |
|
++len; |
} |
|
return 0; |
|
} |
|
/** Copy NULL-terminated string. |
* |
* Copy at most 'len' characters from string 'src' to 'dest'. |
* If 'src' is shorter than 'len', '\0' is inserted behind the |
* last copied character. |
* Copy source string @a src to destination buffer @a dst. |
* No more than @a size bytes are written. NULL-terminator is always |
* written after the last succesfully copied character (i.e. if the |
* destination buffer is has at least 1 byte, it will be always |
* NULL-terminated). |
* |
* @param src Source string. |
* @param dest Destination buffer. |
* @param len Size of destination buffer. |
* @param src Source string. |
* @param dst Destination buffer. |
* @param count Size of the destination buffer. |
* |
*/ |
void strncpy(char *dest, const char *src, size_t len) |
void str_ncpy(char *dst, const char *src, size_t size) |
{ |
unsigned int i; |
/* No space for the NULL-terminator in the buffer */ |
if (size == 0) |
return; |
|
for (i = 0; i < len; i++) { |
if (!(dest[i] = src[i])) |
return; |
wchar_t ch; |
size_t str_off = 0; |
size_t dst_off = 0; |
|
while ((ch = str_decode(src, &str_off, STR_NO_LIMIT)) != 0) { |
if (chr_encode(ch, dst, &dst_off, size) != EOK) |
break; |
} |
|
dest[i - 1] = '\0'; |
if (dst_off >= size) |
dst[size - 1] = 0; |
else |
dst[dst_off] = 0; |
} |
|
/** Copy NULL-terminated wide string to string |
* |
* Copy source wide string @a src to destination buffer @a dst. |
* No more than @a size bytes are written. NULL-terminator is always |
* written after the last succesfully copied character (i.e. if the |
* destination buffer is has at least 1 byte, it will be always |
* NULL-terminated). |
* |
* @param src Source wide string. |
* @param dst Destination buffer. |
* @param count Size of the destination buffer. |
* |
*/ |
void wstr_nstr(char *dst, const wchar_t *src, size_t size) |
{ |
/* No space for the NULL-terminator in the buffer */ |
if (size == 0) |
return; |
|
wchar_t ch; |
count_t src_idx = 0; |
size_t dst_off = 0; |
|
while ((ch = src[src_idx++]) != 0) { |
if (chr_encode(ch, dst, &dst_off, size) != EOK) |
break; |
} |
|
if (dst_off >= size) |
dst[size - 1] = 0; |
else |
dst[dst_off] = 0; |
} |
|
/** Find first occurence of character in string. |
* |
* @param s String to search. |
* @param i Character to look for. |
* @param str String to search. |
* @param ch Character to look for. |
* |
* @return Pointer to character in @a s or NULL if not found. |
* @return Pointer to character in @a str or NULL if not found. |
* |
*/ |
extern char *strchr(const char *s, int i) |
const char *str_chr(const char *str, wchar_t ch) |
{ |
while (*s != '\0') { |
if (*s == i) |
return (char *) s; |
++s; |
wchar_t acc; |
size_t off = 0; |
|
while ((acc = str_decode(str, &off, STR_NO_LIMIT)) != 0) { |
if (acc == ch) |
return (str + off); |
} |
|
return NULL; |
} |
|
/** Insert a wide character into a wide string. |
* |
* Insert a wide character into a wide string at position |
* @a pos. The characters after the position are shifted. |
* |
* @param str String to insert to. |
* @param ch Character to insert to. |
* @param pos Character index where to insert. |
@ @param max_pos Characters in the buffer. |
* |
* @return True if the insertion was sucessful, false if the position |
* is out of bounds. |
* |
*/ |
bool wstr_linsert(wchar_t *str, wchar_t ch, count_t pos, count_t max_pos) |
{ |
count_t len = wstr_length(str); |
|
if ((pos > len) || (pos + 1 > max_pos)) |
return false; |
|
count_t i; |
for (i = len; i + 1 > pos; i--) |
str[i + 1] = str[i]; |
|
str[pos] = ch; |
|
return true; |
} |
|
/** Remove a wide character from a wide string. |
* |
* Remove a wide character from a wide string at position |
* @a pos. The characters after the position are shifted. |
* |
* @param str String to remove from. |
* @param pos Character index to remove. |
* |
* @return True if the removal was sucessful, false if the position |
* is out of bounds. |
* |
*/ |
bool wstr_remove(wchar_t *str, count_t pos) |
{ |
count_t len = wstr_length(str); |
|
if (pos >= len) |
return false; |
|
count_t i; |
for (i = pos + 1; i <= len; i++) |
str[i - 1] = str[i]; |
|
return true; |
} |
|
/** @} |
*/ |