Subversion Repositories HelenOS

Compare Revisions

Ignore whitespace Rev 4200 → Rev 4201

/branches/dd/kernel/generic/src/lib/string.c
42,22 → 42,271
#include <arch.h>
#include <console/kconsole.h>
 
/** Substitute character returned by chr_decode() when decoding fails. */
char invalch = '?';
 
/** Byte mask consisting of lowest @n bits (out of eight), 0 <= n <= 8. */
#define LO_MASK_8(n)  ((uint8_t) ((1u << (n)) - 1))

/** Mask consisting of lowest @n bits (out of 32), 0 <= n <= 31.
 *
 * The shift is done on an unsigned 32-bit operand so that no signed
 * overflow can occur for large @n.
 */
#define LO_MASK_32(n)  ((uint32_t) (((uint32_t) 1 << (n)) - 1))

/** Byte mask consisting of highest @n bits (out of eight), 0 <= n <= 8.
 *
 * The result of the complement is cast back to uint8_t; without the cast
 * integer promotion would turn the value into a negative int with all the
 * upper bits set instead of a byte-sized mask.
 */
#define HI_MASK_8(n)  ((uint8_t) ~LO_MASK_8(8 - (n)))

/** Number of data bits in a UTF-8 continuation byte. */
#define CONT_BITS  6
 
/** Decode a single character from a substring.
 *
 * Decode a single UTF-8 encoded character from a substring of size @a sz.
 * Decoding starts at @a offset and this offset is moved to the beginning
 * of the next character. In case of a decoding error the offset advances
 * past the first byte only (unless the substring is already exhausted, in
 * which case it is left untouched), so that a byte which is not a valid
 * continuation byte is examined again by the next call. The offset is
 * never moved beyond (str + sz).
 *
 * @param str    String (not necessarily NULL-terminated).
 * @param offset Byte offset (in @a str) where to start the decoding;
 *               updated to point behind the consumed bytes.
 * @param sz     Size of the substring in bytes.
 *
 * @return Value of the decoded character or @c invalch on decoding error
 *         (including the case when @a offset is not below @a sz).
 *
 */
wchar_t chr_decode(const char *str, size_t *offset, size_t sz)
{
	uint8_t b0, b;   /* Bytes read from str. */
	wchar_t ch;

	int b0_bits;     /* Data bits in first byte. */
	int cbytes;      /* Number of continuation bytes. */

	/* Nothing left to decode. */
	if (*offset >= sz)
		return invalch;

	b0 = (uint8_t) str[(*offset)++];

	/* Determine code length. */

	if ((b0 & 0x80) == 0) {
		/* 0xxxxxxx (Plain ASCII) */
		b0_bits = 7;
		cbytes = 0;
	} else if ((b0 & 0xe0) == 0xc0) {
		/* 110xxxxx 10xxxxxx */
		b0_bits = 5;
		cbytes = 1;
	} else if ((b0 & 0xf0) == 0xe0) {
		/* 1110xxxx 10xxxxxx 10xxxxxx */
		b0_bits = 4;
		cbytes = 2;
	} else if ((b0 & 0xf8) == 0xf0) {
		/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
		b0_bits = 3;
		cbytes = 3;
	} else {
		/* 10xxxxxx or 11111xxx -- not a valid leading byte. */
		return invalch;
	}

	/*
	 * All continuation bytes must fit into the substring. At this point
	 * *offset <= sz holds, so the subtraction cannot wrap around.
	 */
	if ((size_t) cbytes > sz - *offset)
		return invalch;

	ch = b0 & LO_MASK_8(b0_bits);

	/* Decode continuation bytes. */
	while (cbytes > 0) {
		b = (uint8_t) str[*offset];

		/*
		 * Must be 10xxxxxx. The offending byte is deliberately not
		 * consumed: it may be a string terminator or the start of
		 * the next character, and skipping it would let callers
		 * that scan for the terminator run past the end of the
		 * string.
		 */
		if ((b & 0xc0) != 0x80)
			return invalch;

		(*offset)++;

		/* Shift data bits to ch. */
		ch = (ch << CONT_BITS) | (wchar_t) (b & LO_MASK_8(CONT_BITS));
		--cbytes;
	}

	return ch;
}
 
/** Encode a single character to string representation.
 *
 * Store the UTF-8 representation of @a ch into @a str, starting at byte
 * position @a offset. On success @a offset is advanced behind the bytes
 * just written, i.e. to the position where the next character can be
 * stored. On failure neither the buffer nor @a offset is modified.
 *
 * @param ch     Input character.
 * @param str    Output buffer.
 * @param offset Offset (in bytes) where to start writing.
 * @param sz     Size of the output buffer.
 *
 * @return True if the character was encoded successfully or false if there
 *         was not enough space in the output buffer or the character code
 *         was invalid (negative or wider than 21 bits).
 */
bool chr_encode(const wchar_t ch, char *str, size_t *offset, size_t sz)
{
	uint32_t bits;   /* Unsigned copy of ch, consumed from the low end. */
	int lead_bits;   /* Number of data bits carried by the first byte. */
	int trail;       /* Number of continuation bytes. */
	int pos;

	/* A full buffer or a negative code cannot be handled at all. */
	if ((*offset >= sz) || (ch < 0))
		return false;

	/* Bit operations should only be done on unsigned numbers. */
	bits = (uint32_t) ch;

	/* Codes longer than 21 bits are not supported. */
	if ((bits & ~LO_MASK_32(21)) != 0)
		return false;

	/* Pick the shortest form able to hold the code. */
	if ((bits & ~LO_MASK_32(7)) == 0) {
		lead_bits = 7;
		trail = 0;
	} else if ((bits & ~LO_MASK_32(11)) == 0) {
		lead_bits = 5;
		trail = 1;
	} else if ((bits & ~LO_MASK_32(16)) == 0) {
		lead_bits = 4;
		trail = 2;
	} else {
		lead_bits = 3;
		trail = 3;
	}

	/* The last byte of the sequence must still lie inside the buffer. */
	if (*offset + trail >= sz)
		return false;

	/* Continuation bytes are emitted back to front, six bits at a time. */
	for (pos = trail; pos >= 1; pos--) {
		str[*offset + pos] = 0x80 | (bits & LO_MASK_32(CONT_BITS));
		bits >>= CONT_BITS;
	}

	/* The first byte carries the length prefix and the remaining bits. */
	str[*offset] = (bits & LO_MASK_32(lead_bits)) | HI_MASK_8(8 - lead_bits - 1);

	*offset += trail + 1;
	return true;
}
 
/** Get bytes used by UTF-8 characters.
 *
 * Compute how many bytes (plain characters) are occupied by the first
 * @a count UTF-8 characters of a string. Because UTF-8 is a multibyte
 * encoding, the number of characters and the number of bytes generally
 * differ. Counting stops early when the string terminator is decoded;
 * the terminator itself is not included in the result.
 *
 * @param str   UTF-8 string to consider.
 * @param count Number of UTF-8 characters to count.
 *
 * @return Number of bytes used by the characters.
 *
 */
size_t utf8_count_bytes(const char *str, count_t count)
{
	index_t pos = 0;    /* Decoding position within str. */
	index_t used = 0;   /* Position behind the last counted character. */
	size_t seen;        /* Characters counted so far. */

	for (seen = 0; seen < count; seen++) {
		if (chr_decode(str, &pos, UTF8_NO_LIMIT) == '\0')
			break;

		used = pos;
	}

	return used;
}
 
/** Check whether character is plain ASCII.
 *
 * @param ch Character to examine.
 *
 * @return True if @a ch lies in the range 0 to 127 (inclusive).
 *
 */
bool ascii_check(const wchar_t ch)
{
	return !((ch < 0) || (ch > 127));
}
 
/** Check whether character is Unicode.
 *
 * @param ch Character to examine.
 *
 * @return True if @a ch lies in the range 0 to 0x10FFFF (inclusive).
 *
 */
bool unicode_check(const wchar_t ch)
{
	if (ch < 0)
		return false;

	return ch <= 0x10FFFF;
}
 
/** Return number of plain characters in a string.
 *
 * Counts the bytes preceding the terminating zero byte.
 *
 * @param str NULL-terminated string.
 *
 * @return Number of characters in str.
 *
 */
size_t strlen(const char *str)
{
	size_t size;

	for (size = 0; str[size]; size++)
		;

	return size;
}
 
/** Return number of UTF-8 characters in a string.
 *
 * Decodes the string character by character with chr_decode() until the
 * terminating zero character is decoded. Every value returned by
 * chr_decode() before the terminator counts as one character.
 *
 * @param str NULL-terminated UTF-8 string.
 *
 * @return Number of UTF-8 characters in str.
 *
 */
size_t strlen_utf8(const char *str)
{
	size_t size = 0;
	index_t index = 0;

	while (chr_decode(str, &index, UTF8_NO_LIMIT) != 0)
		size++;

	return size;
}
 
/** Return number of UTF-32 characters in a string.
 *
 * Walks the string until the terminating zero character is found.
 *
 * @param str NULL-terminated UTF-32 string.
 *
 * @return Number of UTF-32 characters in str.
 *
 */
size_t strlen_utf32(const wchar_t *str)
{
	const wchar_t *end = str;

	while (*end != 0)
		end++;

	return (size_t) (end - str);
}
 
/** Compare two NULL terminated strings
*
* Do a char-by-char comparison of two NULL terminated strings.