WebSVN – HelenOS – Blame – /branches/tracing/kernel/generic/src/lib/string.c

Rev	Author	Line No.	Line
4377	svoboda	1	/*
		2	* Copyright (c) 2001-2004 Jakub Jermar
		3	* All rights reserved.
		4	*
		5	* Redistribution and use in source and binary forms, with or without
		6	* modification, are permitted provided that the following conditions
		7	* are met:
		8	*
		9	* - Redistributions of source code must retain the above copyright
		10	* notice, this list of conditions and the following disclaimer.
		11	* - Redistributions in binary form must reproduce the above copyright
		12	* notice, this list of conditions and the following disclaimer in the
		13	* documentation and/or other materials provided with the distribution.
		14	* - The name of the author may not be used to endorse or promote products
		15	* derived from this software without specific prior written permission.
		16	*
		17	* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
		18	* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
		19	* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
		20	* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
		21	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
		22	* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
		23	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
		24	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
		25	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
		26	* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
		27	*/
		28
		29	/** @addtogroup generic
		30	* @{
		31	*/
		32
		33	/**
		34	* @file
		35	* @brief String functions.
		36	*
		37	* Strings and characters use the Universal Character Set (UCS). The standard
		38	* strings, called just strings are encoded in UTF-8. Wide strings (encoded
		39	* in UTF-32) are supported to a limited degree. A single character is
		40	* represented as wchar_t.@n
		41	*
		42	* Overview of the terminology:@n
		43	*
		44	* Term Meaning
		45	* -------------------- ----------------------------------------------------
		46	* byte 8 bits stored in uint8_t (unsigned 8 bit integer)
		47	*
		48	* character UTF-32 encoded Unicode character, stored in wchar_t
		49	* (signed 32 bit integer), code points 0 .. 1114111
		50	* are valid
		51	*
		52	* ASCII character 7 bit encoded ASCII character, stored in char
		53	* (usually signed 8 bit integer), code points 0 .. 127
		54	* are valid
		55	*
		56	* string UTF-8 encoded NULL-terminated Unicode string, char *
		57	*
		58	* wide string UTF-32 encoded NULL-terminated Unicode string,
		59	* wchar_t *
		60	*
		61	* [wide] string size number of BYTES in a [wide] string (excluding
		62	* the NULL-terminator), size_t
		63	*
		64	* [wide] string length number of CHARACTERS in a [wide] string (excluding
4692	svoboda	65	* the NULL-terminator), size_t
4377	svoboda	66	*
		67	* [wide] string width number of display cells on a monospace display taken
4692	svoboda	68	* by a [wide] string, size_t
4377	svoboda	69	*
		70	*
		71	* Overview of string metrics:@n
		72	*
		73	* Metric Abbrev. Type Meaning
		74	* ------ ------ ------ -------------------------------------------------
		75	* size n size_t number of BYTES in a string (excluding the
		76	* NULL-terminator)
		77	*
4692	svoboda	78	* length l size_t number of CHARACTERS in a string (excluding the
4377	svoboda	79	* null terminator)
		80	*
4692	svoboda	81	* width w size_t number of display cells on a monospace display
4377	svoboda	82	* taken by a string
		83	*
		84	*
		85	* Function naming prefixes:@n
		86	*
		87	* chr_ operate on characters
		88	* ascii_ operate on ASCII characters
		89	* str_ operate on strings
		90	* wstr_ operate on wide strings
		91	*
		92	* [w]str_[n|l|w] operate on a prefix limited by size, length
		93	* or width
		94	*
		95	*
		96	* A specific character inside a [wide] string can be referred to by:@n
		97	*
		98	* pointer (char *, wchar_t *)
		99	* byte offset (size_t)
4692	svoboda	100	* character index (size_t)
4377	svoboda	101	*
		102	*/
		103
		104	#include <string.h>
		105	#include <print.h>
		106	#include <cpu.h>
		107	#include <arch/asm.h>
		108	#include <arch.h>
		109	#include <errno.h>
		110	#include <align.h>
		111	#include <debug.h>
		112
		113	/** Byte mask consisting of lowest @n bits (out of 8) */
		114	#define LO_MASK_8(n) ((uint8_t) ((1 << (n)) - 1))
		115
		116	/** Byte mask consisting of lowest @n bits (out of 32) */
		117	#define LO_MASK_32(n) ((uint32_t) ((1 << (n)) - 1))
		118
		119	/** Byte mask consisting of highest @n bits (out of 8) */
		120	#define HI_MASK_8(n) (~LO_MASK_8(8 - (n)))
		121
		122	/** Number of data bits in a UTF-8 continuation byte */
		123	#define CONT_BITS 6
		124
		125	/** Decode a single character from a string.
		126	*
		127	* Decode a single character from a string of size @a size. Decoding starts
		128	* at @a offset and this offset is moved to the beginning of the next
		129	* character. In case of decoding error, offset generally advances at least
		130	* by one. However, offset is never moved beyond size.
		131	*
		132	* @param str String (not necessarily NULL-terminated).
		133	* @param offset Byte offset in string where to start decoding.
		134	* @param size Size of the string (in bytes).
		135	*
		136	* @return Value of decoded character, U_SPECIAL on decoding error or
		137	* NULL if attempt to decode beyond @a size.
		138	*
		139	*/
		140	wchar_t str_decode(const char str, size_t offset, size_t size)
		141	{
		142	if (*offset + 1 > size)
		143	return 0;
		144
		145	/* First byte read from string */
		146	uint8_t b0 = (uint8_t) str[(*offset)++];
		147
		148	/* Determine code length */
		149
		150	unsigned int b0_bits; /* Data bits in first byte */
		151	unsigned int cbytes; /* Number of continuation bytes */
		152
		153	if ((b0 & 0x80) == 0) {
		154	/* 0xxxxxxx (Plain ASCII) */
		155	b0_bits = 7;
		156	cbytes = 0;
		157	} else if ((b0 & 0xe0) == 0xc0) {
		158	/* 110xxxxx 10xxxxxx */
		159	b0_bits = 5;
		160	cbytes = 1;
		161	} else if ((b0 & 0xf0) == 0xe0) {
		162	/* 1110xxxx 10xxxxxx 10xxxxxx */
		163	b0_bits = 4;
		164	cbytes = 2;
		165	} else if ((b0 & 0xf8) == 0xf0) {
		166	/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
		167	b0_bits = 3;
		168	cbytes = 3;
		169	} else {
		170	/* 10xxxxxx -- unexpected continuation byte */
		171	return U_SPECIAL;
		172	}
		173
		174	if (*offset + cbytes > size)
		175	return U_SPECIAL;
		176
		177	wchar_t ch = b0 & LO_MASK_8(b0_bits);
		178
		179	/* Decode continuation bytes */
		180	while (cbytes > 0) {
		181	uint8_t b = (uint8_t) str[(*offset)++];
		182
		183	/* Must be 10xxxxxx */
		184	if ((b & 0xc0) != 0x80)
		185	return U_SPECIAL;
		186
		187	/* Shift data bits to ch */
		188	ch = (ch << CONT_BITS) \| (wchar_t) (b & LO_MASK_8(CONT_BITS));
		189	cbytes--;
		190	}
		191
		192	return ch;
		193	}
		194
		195	/** Encode a single character to string representation.
		196	*
		197	* Encode a single character to string representation (i.e. UTF-8) and store
		198	* it into a buffer at @a offset. Encoding starts at @a offset and this offset
		199	* is moved to the position where the next character can be written to.
		200	*
		201	* @param ch Input character.
		202	* @param str Output buffer.
		203	* @param offset Byte offset where to start writing.
		204	* @param size Size of the output buffer (in bytes).
		205	*
		206	* @return EOK if the character was encoded successfully, EOVERFLOW if there
		207	* was not enough space in the output buffer or EINVAL if the character
		208	* code was invalid.
		209	*/
		210	int chr_encode(wchar_t ch, char str, size_t offset, size_t size)
		211	{
		212	if (*offset >= size)
		213	return EOVERFLOW;
		214
		215	if (!chr_check(ch))
		216	return EINVAL;
		217
		218	/* Unsigned version of ch (bit operations should only be done
		219	on unsigned types). */
		220	uint32_t cc = (uint32_t) ch;
		221
		222	/* Determine how many continuation bytes are needed */
		223
		224	unsigned int b0_bits; /* Data bits in first byte */
		225	unsigned int cbytes; /* Number of continuation bytes */
		226
		227	if ((cc & ~LO_MASK_32(7)) == 0) {
		228	b0_bits = 7;
		229	cbytes = 0;
		230	} else if ((cc & ~LO_MASK_32(11)) == 0) {
		231	b0_bits = 5;
		232	cbytes = 1;
		233	} else if ((cc & ~LO_MASK_32(16)) == 0) {
		234	b0_bits = 4;
		235	cbytes = 2;
		236	} else if ((cc & ~LO_MASK_32(21)) == 0) {
		237	b0_bits = 3;
		238	cbytes = 3;
		239	} else {
		240	/* Codes longer than 21 bits are not supported */
		241	return EINVAL;
		242	}
		243
		244	/* Check for available space in buffer */
		245	if (*offset + cbytes >= size)
		246	return EOVERFLOW;
		247
		248	/* Encode continuation bytes */
		249	unsigned int i;
		250	for (i = cbytes; i > 0; i--) {
		251	str[*offset + i] = 0x80 \| (cc & LO_MASK_32(CONT_BITS));
		252	cc = cc >> CONT_BITS;
		253	}
		254
		255	/* Encode first byte */
		256	str[*offset] = (cc & LO_MASK_32(b0_bits)) \| HI_MASK_8(8 - b0_bits - 1);
		257
		258	/* Advance offset */
		259	*offset += cbytes + 1;
		260
		261	return EOK;
		262	}
		263
		264	/** Get size of string.
		265	*
		266	* Get the number of bytes which are used by the string @a str (excluding the
		267	* NULL-terminator).
		268	*
		269	* @param str String to consider.
		270	*
		271	* @return Number of bytes used by the string
		272	*
		273	*/
		274	size_t str_size(const char *str)
		275	{
		276	size_t size = 0;
		277
		278	while (*str++ != 0)
		279	size++;
		280
		281	return size;
		282	}
		283
		284	/** Get size of wide string.
		285	*
		286	* Get the number of bytes which are used by the wide string @a str (excluding the
		287	* NULL-terminator).
		288	*
		289	* @param str Wide string to consider.
		290	*
		291	* @return Number of bytes used by the wide string
		292	*
		293	*/
		294	size_t wstr_size(const wchar_t *str)
		295	{
		296	return (wstr_length(str) * sizeof(wchar_t));
		297	}
		298
		299	/** Get size of string with length limit.
		300	*
		301	* Get the number of bytes which are used by up to @a max_len first
		302	* characters in the string @a str. If @a max_len is greater than
		303	* the length of @a str, the entire string is measured (excluding the
		304	* NULL-terminator).
		305	*
		306	* @param str String to consider.
		307	* @param max_len Maximum number of characters to measure.
		308	*
		309	* @return Number of bytes used by the characters.
		310	*
		311	*/
4692	svoboda	312	size_t str_lsize(const char *str, size_t max_len)
4377	svoboda	313	{
4692	svoboda	314	size_t len = 0;
4377	svoboda	315	size_t offset = 0;
		316
		317	while (len < max_len) {
		318	if (str_decode(str, &offset, STR_NO_LIMIT) == 0)
		319	break;
		320
		321	len++;
		322	}
		323
		324	return offset;
		325	}
		326
		327	/** Get size of wide string with length limit.
		328	*
		329	* Get the number of bytes which are used by up to @a max_len first
		330	* wide characters in the wide string @a str. If @a max_len is greater than
		331	* the length of @a str, the entire wide string is measured (excluding the
		332	* NULL-terminator).
		333	*
		334	* @param str Wide string to consider.
		335	* @param max_len Maximum number of wide characters to measure.
		336	*
		337	* @return Number of bytes used by the wide characters.
		338	*
		339	*/
4692	svoboda	340	size_t wstr_lsize(const wchar_t *str, size_t max_len)
4377	svoboda	341	{
		342	return (wstr_nlength(str, max_len * sizeof(wchar_t)) * sizeof(wchar_t));
		343	}
		344
		345	/** Get number of characters in a string.
		346	*
		347	* @param str NULL-terminated string.
		348	*
		349	* @return Number of characters in string.
		350	*
		351	*/
4692	svoboda	352	size_t str_length(const char *str)
4377	svoboda	353	{
4692	svoboda	354	size_t len = 0;
4377	svoboda	355	size_t offset = 0;
		356
		357	while (str_decode(str, &offset, STR_NO_LIMIT) != 0)
		358	len++;
		359
		360	return len;
		361	}
		362
		363	/** Get number of characters in a wide string.
		364	*
		365	* @param str NULL-terminated wide string.
		366	*
		367	* @return Number of characters in @a str.
		368	*
		369	*/
4692	svoboda	370	size_t wstr_length(const wchar_t *wstr)
4377	svoboda	371	{
4692	svoboda	372	size_t len = 0;
4377	svoboda	373
		374	while (*wstr++ != 0)
		375	len++;
		376
		377	return len;
		378	}
		379
		380	/** Get number of characters in a string with size limit.
		381	*
		382	* @param str NULL-terminated string.
		383	* @param size Maximum number of bytes to consider.
		384	*
		385	* @return Number of characters in string.
		386	*
		387	*/
4692	svoboda	388	size_t str_nlength(const char *str, size_t size)
4377	svoboda	389	{
4692	svoboda	390	size_t len = 0;
4377	svoboda	391	size_t offset = 0;
		392
		393	while (str_decode(str, &offset, size) != 0)
		394	len++;
		395
		396	return len;
		397	}
		398
		399	/** Get number of characters in a string with size limit.
		400	*
		401	* @param str NULL-terminated string.
		402	* @param size Maximum number of bytes to consider.
		403	*
		404	* @return Number of characters in string.
		405	*
		406	*/
4692	svoboda	407	size_t wstr_nlength(const wchar_t *str, size_t size)
4377	svoboda	408	{
4692	svoboda	409	size_t len = 0;
		410	size_t limit = ALIGN_DOWN(size, sizeof(wchar_t));
		411	size_t offset = 0;
4377	svoboda	412
		413	while ((offset < limit) && (*str++ != 0)) {
		414	len++;
		415	offset += sizeof(wchar_t);
		416	}
		417
		418	return len;
		419	}
		420
		421	/** Check whether character is plain ASCII.
		422	*
		423	* @return True if character is plain ASCII.
		424	*
		425	*/
		426	bool ascii_check(wchar_t ch)
		427	{
		428	if ((ch >= 0) && (ch <= 127))
		429	return true;
		430
		431	return false;
		432	}
		433
		434	/** Check whether character is valid
		435	*
		436	* @return True if character is a valid Unicode code point.
		437	*
		438	*/
		439	bool chr_check(wchar_t ch)
		440	{
		441	if ((ch >= 0) && (ch <= 1114111))
		442	return true;
		443
		444	return false;
		445	}
		446
		447	/** Compare two NULL terminated strings.
		448	*
		449	* Do a char-by-char comparison of two NULL-terminated strings.
		450	* The strings are considered equal iff they consist of the same
		451	* characters on the minimum of their lengths.
		452	*
		453	* @param s1 First string to compare.
		454	* @param s2 Second string to compare.
		455	*
		456	* @return 0 if the strings are equal, -1 if first is smaller,
		457	* 1 if second smaller.
		458	*
		459	*/
		460	int str_cmp(const char s1, const char s2)
		461	{
		462	wchar_t c1 = 0;
		463	wchar_t c2 = 0;
		464
		465	size_t off1 = 0;
		466	size_t off2 = 0;
		467
		468	while (true) {
		469	c1 = str_decode(s1, &off1, STR_NO_LIMIT);
		470	c2 = str_decode(s2, &off2, STR_NO_LIMIT);
		471
		472	if (c1 < c2)
		473	return -1;
		474
		475	if (c1 > c2)
		476	return 1;
		477
		478	if (c1 == 0 \|\| c2 == 0)
		479	break;
		480	}
		481
		482	return 0;
		483	}
		484
		485	/** Compare two NULL terminated strings with length limit.
		486	*
		487	* Do a char-by-char comparison of two NULL-terminated strings.
		488	* The strings are considered equal iff they consist of the same
		489	* characters on the minimum of their lengths and the length limit.
		490	*
		491	* @param s1 First string to compare.
		492	* @param s2 Second string to compare.
		493	* @param max_len Maximum number of characters to consider.
		494	*
		495	* @return 0 if the strings are equal, -1 if first is smaller,
		496	* 1 if second smaller.
		497	*
		498	*/
4692	svoboda	499	int str_lcmp(const char s1, const char s2, size_t max_len)
4377	svoboda	500	{
		501	wchar_t c1 = 0;
		502	wchar_t c2 = 0;
		503
		504	size_t off1 = 0;
		505	size_t off2 = 0;
		506
4692	svoboda	507	size_t len = 0;
4377	svoboda	508
		509	while (true) {
		510	if (len >= max_len)
		511	break;
		512
		513	c1 = str_decode(s1, &off1, STR_NO_LIMIT);
		514	c2 = str_decode(s2, &off2, STR_NO_LIMIT);
		515
		516	if (c1 < c2)
		517	return -1;
		518
		519	if (c1 > c2)
		520	return 1;
		521
		522	if (c1 == 0 \|\| c2 == 0)
		523	break;
		524
		525	++len;
		526	}
		527
		528	return 0;
		529
		530	}
		531
		532	/** Copy string.
		533	*
		534	* Copy source string @a src to destination buffer @a dest.
		535	* No more than @a size bytes are written. If the size of the output buffer
		536	* is at least one byte, the output string will always be well-formed, i.e.
		537	* null-terminated and containing only complete characters.
		538	*
		539	* @param dst Destination buffer.
		540	* @param count Size of the destination buffer (must be > 0).
		541	* @param src Source string.
		542	*/
		543	void str_cpy(char dest, size_t size, const char src)
		544	{
		545	wchar_t ch;
		546	size_t src_off;
		547	size_t dest_off;
		548
		549	/* There must be space for a null terminator in the buffer. */
		550	ASSERT(size > 0);
		551
		552	src_off = 0;
		553	dest_off = 0;
		554
		555	while ((ch = str_decode(src, &src_off, STR_NO_LIMIT)) != 0) {
		556	if (chr_encode(ch, dest, &dest_off, size - 1) != EOK)
		557	break;
		558	}
		559
		560	dest[dest_off] = '\0';
		561	}
		562
		563	/** Copy size-limited substring.
		564	*
		565	* Copy prefix of string @a src of max. size @a size to destination buffer
		566	* @a dest. No more than @a size bytes are written. The output string will
		567	* always be well-formed, i.e. null-terminated and containing only complete
		568	* characters.
		569	*
		570	* No more than @a n bytes are read from the input string, so it does not
		571	* have to be null-terminated.
		572	*
		573	* @param dst Destination buffer.
		574	* @param count Size of the destination buffer (must be > 0).
		575	* @param src Source string.
		576	* @param n Maximum number of bytes to read from @a src.
		577	*/
		578	void str_ncpy(char dest, size_t size, const char src, size_t n)
		579	{
		580	wchar_t ch;
		581	size_t src_off;
		582	size_t dest_off;
		583
		584	/* There must be space for a null terminator in the buffer. */
		585	ASSERT(size > 0);
		586
		587	src_off = 0;
		588	dest_off = 0;
		589
		590	while ((ch = str_decode(src, &src_off, n)) != 0) {
		591	if (chr_encode(ch, dest, &dest_off, size - 1) != EOK)
		592	break;
		593	}
		594
		595	dest[dest_off] = '\0';
		596	}
		597
		598	/** Copy NULL-terminated wide string to string
		599	*
		600	* Copy source wide string @a src to destination buffer @a dst.
		601	* No more than @a size bytes are written. NULL-terminator is always
		602	* written after the last succesfully copied character (i.e. if the
		603	* destination buffer is has at least 1 byte, it will be always
		604	* NULL-terminated).
		605	*
		606	* @param src Source wide string.
		607	* @param dst Destination buffer.
		608	* @param count Size of the destination buffer.
		609	*
		610	*/
		611	void wstr_nstr(char dst, const wchar_t src, size_t size)
		612	{
		613	/* No space for the NULL-terminator in the buffer */
		614	if (size == 0)
		615	return;
		616
		617	wchar_t ch;
4692	svoboda	618	size_t src_idx = 0;
4377	svoboda	619	size_t dst_off = 0;
		620
		621	while ((ch = src[src_idx++]) != 0) {
		622	if (chr_encode(ch, dst, &dst_off, size) != EOK)
		623	break;
		624	}
		625
		626	if (dst_off >= size)
		627	dst[size - 1] = 0;
		628	else
		629	dst[dst_off] = 0;
		630	}
		631
		632	/** Find first occurence of character in string.
		633	*
		634	* @param str String to search.
		635	* @param ch Character to look for.
		636	*
		637	* @return Pointer to character in @a str or NULL if not found.
		638	*
		639	*/
4692	svoboda	640	char str_chr(const char str, wchar_t ch)
4377	svoboda	641	{
		642	wchar_t acc;
		643	size_t off = 0;
		644	size_t last = 0;
		645
		646	while ((acc = str_decode(str, &off, STR_NO_LIMIT)) != 0) {
		647	if (acc == ch)
4692	svoboda	648	return (char *) (str + last);
4377	svoboda	649	last = off;
		650	}
		651
		652	return NULL;
		653	}
		654
		655	/** Insert a wide character into a wide string.
		656	*
		657	* Insert a wide character into a wide string at position
		658	* @a pos. The characters after the position are shifted.
		659	*
		660	* @param str String to insert to.
		661	* @param ch Character to insert to.
		662	* @param pos Character index where to insert.
		663	@ @param max_pos Characters in the buffer.
		664	*
		665	* @return True if the insertion was sucessful, false if the position
		666	* is out of bounds.
		667	*
		668	*/
4692	svoboda	669	bool wstr_linsert(wchar_t *str, wchar_t ch, size_t pos, size_t max_pos)
4377	svoboda	670	{
4692	svoboda	671	size_t len = wstr_length(str);
4377	svoboda	672
		673	if ((pos > len) \|\| (pos + 1 > max_pos))
		674	return false;
		675
4692	svoboda	676	size_t i;
4377	svoboda	677	for (i = len; i + 1 > pos; i--)
		678	str[i + 1] = str[i];
		679
		680	str[pos] = ch;
		681
		682	return true;
		683	}
		684
		685	/** Remove a wide character from a wide string.
		686	*
		687	* Remove a wide character from a wide string at position
		688	* @a pos. The characters after the position are shifted.
		689	*
		690	* @param str String to remove from.
		691	* @param pos Character index to remove.
		692	*
		693	* @return True if the removal was sucessful, false if the position
		694	* is out of bounds.
		695	*
		696	*/
4692	svoboda	697	bool wstr_remove(wchar_t *str, size_t pos)
4377	svoboda	698	{
4692	svoboda	699	size_t len = wstr_length(str);
4377	svoboda	700
		701	if (pos >= len)
		702	return false;
		703
4692	svoboda	704	size_t i;
4377	svoboda	705	for (i = pos + 1; i <= len; i++)
		706	str[i - 1] = str[i];
		707
		708	return true;
		709	}
		710
		711	/** @}
		712	*/

Subversion Repositories HelenOS

(root)/branches/tracing/kernel/generic/src/lib/string.c – Rev 4703