WebSVN – HelenOS – Blame – /branches/network/kernel/generic/src/lib/string.c

Rev	Author	Line No.	Line
4011	svoboda	1	/*
		2	* Copyright (c) 2001-2004 Jakub Jermar
		3	* All rights reserved.
		4	*
		5	* Redistribution and use in source and binary forms, with or without
		6	* modification, are permitted provided that the following conditions
		7	* are met:
		8	*
		9	* - Redistributions of source code must retain the above copyright
		10	* notice, this list of conditions and the following disclaimer.
		11	* - Redistributions in binary form must reproduce the above copyright
		12	* notice, this list of conditions and the following disclaimer in the
		13	* documentation and/or other materials provided with the distribution.
		14	* - The name of the author may not be used to endorse or promote products
		15	* derived from this software without specific prior written permission.
		16	*
		17	* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
		18	* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
		19	* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
		20	* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
		21	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
		22	* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
		23	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
		24	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
		25	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
		26	* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
		27	*/
		28
4014	decky	29	/** @addtogroup generic
4011	svoboda	30	* @{
		31	*/
		32
		33	/**
		34	* @file
4263	mejdrech	35	* @brief String functions.
		36	*
		37	* Strings and characters use the Universal Character Set (UCS). The standard
		38	* strings, called just strings are encoded in UTF-8. Wide strings (encoded
		39	* in UTF-32) are supported to a limited degree. A single character is
		40	* represented as wchar_t.@n
		41	*
		42	* Overview of the terminology:@n
		43	*
		44	* Term Meaning
		45	* -------------------- ----------------------------------------------------
		46	* byte 8 bits stored in uint8_t (unsigned 8 bit integer)
		47	*
		48	* character UTF-32 encoded Unicode character, stored in wchar_t
		49	* (signed 32 bit integer), code points 0 .. 1114111
		50	* are valid
		51	*
		52	* ASCII character 7 bit encoded ASCII character, stored in char
		53	* (usually signed 8 bit integer), code points 0 .. 127
		54	* are valid
		55	*
		56	* string UTF-8 encoded NULL-terminated Unicode string, char *
		57	*
		58	* wide string UTF-32 encoded NULL-terminated Unicode string,
		59	* wchar_t *
		60	*
		61	* [wide] string size number of BYTES in a [wide] string (excluding
		62	* the NULL-terminator), size_t
		63	*
		64	* [wide] string length number of CHARACTERS in a [wide] string (excluding
		65	* the NULL-terminator), count_t
		66	*
		67	* [wide] string width number of display cells on a monospace display taken
		68	* by a [wide] string, count_t
		69	*
		70	*
		71	* Overview of string metrics:@n
		72	*
		73	* Metric Abbrev. Type Meaning
		74	* ------ ------ ------ -------------------------------------------------
		75	* size n size_t number of BYTES in a string (excluding the
		76	* NULL-terminator)
		77	*
		78	* length l count_t number of CHARACTERS in a string (excluding the
		79	* null terminator)
		80	*
		81	* width w count_t number of display cells on a monospace display
		82	* taken by a string
		83	*
		84	*
		85	* Function naming prefixes:@n
		86	*
		87	* chr_ operate on characters
		88	* ascii_ operate on ASCII characters
		89	* str_ operate on strings
		90	* wstr_ operate on wide strings
		91	*
		92	* [w]str_[n|l|w] operate on a prefix limited by size, length
		93	* or width
		94	*
		95	*
		96	* A specific character inside a [wide] string can be referred to by:@n
		97	*
		98	* pointer (char *, wchar_t *)
		99	* byte offset (size_t)
		100	* character index (count_t)
		101	*
4011	svoboda	102	*/
		103
		104	#include <string.h>
		105	#include <print.h>
		106	#include <cpu.h>
		107	#include <arch/asm.h>
		108	#include <arch.h>
4263	mejdrech	109	#include <errno.h>
		110	#include <align.h>
4011	svoboda	111
/** Byte mask consisting of lowest @a n bits (out of 8) */
#define LO_MASK_8(n)  ((uint8_t) ((1 << (n)) - 1))

/** Mask consisting of lowest @a n bits (out of 32)
 *
 * The shift is performed on an unsigned 32-bit operand so that it does
 * not depend on the behaviour of shifting a signed int.
 */
#define LO_MASK_32(n)  ((uint32_t) (((uint32_t) 1 << (n)) - 1))

/** Byte mask consisting of highest @a n bits (out of 8)
 *
 * The complement is cast back to uint8_t. Without the cast, integer
 * promotion of the operand of ~ would leave all bits above bit 7 set.
 */
#define HI_MASK_8(n)  ((uint8_t) ~LO_MASK_8(8 - (n)))

/** Number of data bits in a UTF-8 continuation byte */
#define CONT_BITS  6
		123
		124	/** Decode a single character from a string.
4011	svoboda	125	*
4263	mejdrech	126	* Decode a single character from a string of size @a size. Decoding starts
		127	* at @a offset and this offset is moved to the beginning of the next
		128	* character. In case of decoding error, offset generally advances at least
		129	* by one. However, offset is never moved beyond size.
4011	svoboda	130	*
4263	mejdrech	131	* @param str String (not necessarily NULL-terminated).
		132	* @param offset Byte offset in string where to start decoding.
		133	* @param size Size of the string (in bytes).
4014	decky	134	*
4263	mejdrech	135	* @return Value of decoded character, U_SPECIAL on decoding error or
		136	* NULL if attempt to decode beyond @a size.
		137	*
4011	svoboda	138	*/
4263	mejdrech	139	wchar_t str_decode(const char str, size_t offset, size_t size)
4011	svoboda	140	{
4263	mejdrech	141	if (*offset + 1 > size)
		142	return 0;
4011	svoboda	143
4263	mejdrech	144	/* First byte read from string */
		145	uint8_t b0 = (uint8_t) str[(*offset)++];
4011	svoboda	146
4263	mejdrech	147	/* Determine code length */
		148
		149	unsigned int b0_bits; /* Data bits in first byte */
		150	unsigned int cbytes; /* Number of continuation bytes */
		151
		152	if ((b0 & 0x80) == 0) {
		153	/* 0xxxxxxx (Plain ASCII) */
		154	b0_bits = 7;
		155	cbytes = 0;
		156	} else if ((b0 & 0xe0) == 0xc0) {
		157	/* 110xxxxx 10xxxxxx */
		158	b0_bits = 5;
		159	cbytes = 1;
		160	} else if ((b0 & 0xf0) == 0xe0) {
		161	/* 1110xxxx 10xxxxxx 10xxxxxx */
		162	b0_bits = 4;
		163	cbytes = 2;
		164	} else if ((b0 & 0xf8) == 0xf0) {
		165	/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
		166	b0_bits = 3;
		167	cbytes = 3;
		168	} else {
		169	/* 10xxxxxx -- unexpected continuation byte */
		170	return U_SPECIAL;
		171	}
		172
		173	if (*offset + cbytes > size)
		174	return U_SPECIAL;
		175
		176	wchar_t ch = b0 & LO_MASK_8(b0_bits);
		177
		178	/* Decode continuation bytes */
		179	while (cbytes > 0) {
		180	uint8_t b = (uint8_t) str[(*offset)++];
		181
		182	/* Must be 10xxxxxx */
		183	if ((b & 0xc0) != 0x80)
		184	return U_SPECIAL;
		185
		186	/* Shift data bits to ch */
		187	ch = (ch << CONT_BITS) \| (wchar_t) (b & LO_MASK_8(CONT_BITS));
		188	cbytes--;
		189	}
		190
		191	return ch;
4011	svoboda	192	}
		193
4263	mejdrech	194	/** Encode a single character to string representation.
4011	svoboda	195	*
4263	mejdrech	196	* Encode a single character to string representation (i.e. UTF-8) and store
		197	* it into a buffer at @a offset. Encoding starts at @a offset and this offset
		198	* is moved to the position where the next character can be written to.
4011	svoboda	199	*
4263	mejdrech	200	* @param ch Input character.
		201	* @param str Output buffer.
		202	* @param offset Byte offset where to start writing.
		203	* @param size Size of the output buffer (in bytes).
4011	svoboda	204	*
4263	mejdrech	205	* @return EOK if the character was encoded successfully, EOVERFLOW if there
		206	* was not enough space in the output buffer or EINVAL if the character
		207	* code was invalid.
		208	*/
		209	int chr_encode(wchar_t ch, char str, size_t offset, size_t size)
		210	{
		211	if (*offset >= size)
		212	return EOVERFLOW;
		213
		214	if (!chr_check(ch))
		215	return EINVAL;
		216
		217	/* Unsigned version of ch (bit operations should only be done
		218	on unsigned types). */
		219	uint32_t cc = (uint32_t) ch;
		220
		221	/* Determine how many continuation bytes are needed */
		222
		223	unsigned int b0_bits; /* Data bits in first byte */
		224	unsigned int cbytes; /* Number of continuation bytes */
		225
		226	if ((cc & ~LO_MASK_32(7)) == 0) {
		227	b0_bits = 7;
		228	cbytes = 0;
		229	} else if ((cc & ~LO_MASK_32(11)) == 0) {
		230	b0_bits = 5;
		231	cbytes = 1;
		232	} else if ((cc & ~LO_MASK_32(16)) == 0) {
		233	b0_bits = 4;
		234	cbytes = 2;
		235	} else if ((cc & ~LO_MASK_32(21)) == 0) {
		236	b0_bits = 3;
		237	cbytes = 3;
		238	} else {
		239	/* Codes longer than 21 bits are not supported */
		240	return EINVAL;
		241	}
		242
		243	/* Check for available space in buffer */
		244	if (*offset + cbytes >= size)
		245	return EOVERFLOW;
		246
		247	/* Encode continuation bytes */
		248	unsigned int i;
		249	for (i = cbytes; i > 0; i--) {
		250	str[*offset + i] = 0x80 \| (cc & LO_MASK_32(CONT_BITS));
		251	cc = cc >> CONT_BITS;
		252	}
		253
		254	/* Encode first byte */
		255	str[*offset] = (cc & LO_MASK_32(b0_bits)) \| HI_MASK_8(8 - b0_bits - 1);
		256
		257	/* Advance offset */
		258	*offset += cbytes + 1;
		259
		260	return EOK;
		261	}
		262
		263	/** Get size of string.
4011	svoboda	264	*
4263	mejdrech	265	* Get the number of bytes which are used by the string @a str (excluding the
		266	* NULL-terminator).
		267	*
		268	* @param str String to consider.
		269	*
		270	* @return Number of bytes used by the string
		271	*
4011	svoboda	272	*/
4263	mejdrech	273	size_t str_size(const char *str)
4011	svoboda	274	{
4263	mejdrech	275	size_t size = 0;
		276
		277	while (*str++ != 0)
		278	size++;
		279
		280	return size;
		281	}
		282
		283	/** Get size of wide string.
		284	*
		285	* Get the number of bytes which are used by the wide string @a str (excluding the
		286	* NULL-terminator).
		287	*
		288	* @param str Wide string to consider.
		289	*
		290	* @return Number of bytes used by the wide string
		291	*
		292	*/
		293	size_t wstr_size(const wchar_t *str)
		294	{
		295	return (wstr_length(str) * sizeof(wchar_t));
		296	}
		297
		298	/** Get size of string with length limit.
		299	*
		300	* Get the number of bytes which are used by up to @a max_len first
		301	* characters in the string @a str. If @a max_len is greater than
		302	* the length of @a str, the entire string is measured (excluding the
		303	* NULL-terminator).
		304	*
		305	* @param str String to consider.
		306	* @param max_len Maximum number of characters to measure.
		307	*
		308	* @return Number of bytes used by the characters.
		309	*
		310	*/
		311	size_t str_lsize(const char *str, count_t max_len)
		312	{
		313	count_t len = 0;
		314	size_t offset = 0;
		315
		316	while (len < max_len) {
		317	if (str_decode(str, &offset, STR_NO_LIMIT) == 0)
		318	break;
		319
		320	len++;
4011	svoboda	321	}
4014	decky	322
4263	mejdrech	323	return offset;
		324	}
		325
		326	/** Get size of wide string with length limit.
		327	*
		328	* Get the number of bytes which are used by up to @a max_len first
		329	* wide characters in the wide string @a str. If @a max_len is greater than
		330	* the length of @a str, the entire wide string is measured (excluding the
		331	* NULL-terminator).
		332	*
		333	* @param str Wide string to consider.
		334	* @param max_len Maximum number of wide characters to measure.
		335	*
		336	* @return Number of bytes used by the wide characters.
		337	*
		338	*/
		339	size_t wstr_lsize(const wchar_t *str, count_t max_len)
		340	{
		341	return (wstr_nlength(str, max_len * sizeof(wchar_t)) * sizeof(wchar_t));
		342	}
		343
		344	/** Get number of characters in a string.
		345	*
		346	* @param str NULL-terminated string.
		347	*
		348	* @return Number of characters in string.
		349	*
		350	*/
		351	count_t str_length(const char *str)
		352	{
		353	count_t len = 0;
		354	size_t offset = 0;
4014	decky	355
4263	mejdrech	356	while (str_decode(str, &offset, STR_NO_LIMIT) != 0)
		357	len++;
		358
		359	return len;
4011	svoboda	360	}
		361
4263	mejdrech	362	/** Get number of characters in a wide string.
		363	*
		364	* @param str NULL-terminated wide string.
		365	*
		366	* @return Number of characters in @a str.
		367	*
		368	*/
		369	count_t wstr_length(const wchar_t *wstr)
		370	{
		371	count_t len = 0;
		372
		373	while (*wstr++ != 0)
		374	len++;
		375
		376	return len;
		377	}
4011	svoboda	378
4263	mejdrech	379	/** Get number of characters in a string with size limit.
4011	svoboda	380	*
4263	mejdrech	381	* @param str NULL-terminated string.
		382	* @param size Maximum number of bytes to consider.
		383	*
		384	* @return Number of characters in string.
		385	*
		386	*/
		387	count_t str_nlength(const char *str, size_t size)
		388	{
		389	count_t len = 0;
		390	size_t offset = 0;
		391
		392	while (str_decode(str, &offset, size) != 0)
		393	len++;
		394
		395	return len;
		396	}
		397
		398	/** Get number of characters in a string with size limit.
		399	*
		400	* @param str NULL-terminated string.
		401	* @param size Maximum number of bytes to consider.
		402	*
		403	* @return Number of characters in string.
		404	*
		405	*/
		406	count_t wstr_nlength(const wchar_t *str, size_t size)
		407	{
		408	count_t len = 0;
		409	count_t limit = ALIGN_DOWN(size, sizeof(wchar_t));
		410	count_t offset = 0;
		411
		412	while ((offset < limit) && (*str++ != 0)) {
		413	len++;
		414	offset += sizeof(wchar_t);
		415	}
		416
		417	return len;
		418	}
		419
		420	/** Check whether character is plain ASCII.
		421	*
		422	* @return True if character is plain ASCII.
		423	*
		424	*/
		425	bool ascii_check(wchar_t ch)
		426	{
		427	if ((ch >= 0) && (ch <= 127))
		428	return true;
		429
		430	return false;
		431	}
		432
		433	/** Check whether character is valid
		434	*
		435	* @return True if character is a valid Unicode code point.
		436	*
		437	*/
		438	bool chr_check(wchar_t ch)
		439	{
		440	if ((ch >= 0) && (ch <= 1114111))
		441	return true;
		442
		443	return false;
		444	}
		445
		446	/** Compare two NULL terminated strings.
		447	*
		448	* Do a char-by-char comparison of two NULL-terminated strings.
4011	svoboda	449	* The strings are considered equal iff they consist of the same
4263	mejdrech	450	* characters on the minimum of their lengths.
4011	svoboda	451	*
4263	mejdrech	452	* @param s1 First string to compare.
		453	* @param s2 Second string to compare.
4011	svoboda	454	*
4263	mejdrech	455	* @return 0 if the strings are equal, -1 if first is smaller,
		456	* 1 if second smaller.
4011	svoboda	457	*
		458	*/
4263	mejdrech	459	int str_cmp(const char s1, const char s2)
4011	svoboda	460	{
4263	mejdrech	461	wchar_t c1 = 0;
		462	wchar_t c2 = 0;
4011	svoboda	463
4263	mejdrech	464	size_t off1 = 0;
		465	size_t off2 = 0;
		466
		467	while (true) {
		468	c1 = str_decode(s1, &off1, STR_NO_LIMIT);
		469	c2 = str_decode(s2, &off2, STR_NO_LIMIT);
		470
		471	if (c1 < c2)
4011	svoboda	472	return -1;
4014	decky	473
4263	mejdrech	474	if (c1 > c2)
4011	svoboda	475	return 1;
4263	mejdrech	476
		477	if (c1 == 0 \|\| c2 == 0)
		478	break;
4011	svoboda	479	}
4263	mejdrech	480
		481	return 0;
		482	}
		483
		484	/** Compare two NULL terminated strings with length limit.
		485	*
		486	* Do a char-by-char comparison of two NULL-terminated strings.
		487	* The strings are considered equal iff they consist of the same
		488	* characters on the minimum of their lengths and the length limit.
		489	*
		490	* @param s1 First string to compare.
		491	* @param s2 Second string to compare.
		492	* @param max_len Maximum number of characters to consider.
		493	*
		494	* @return 0 if the strings are equal, -1 if first is smaller,
		495	* 1 if second smaller.
		496	*
		497	*/
		498	int str_lcmp(const char s1, const char s2, count_t max_len)
		499	{
		500	wchar_t c1 = 0;
		501	wchar_t c2 = 0;
4014	decky	502
4263	mejdrech	503	size_t off1 = 0;
		504	size_t off2 = 0;
4014	decky	505
4263	mejdrech	506	count_t len = 0;
4011	svoboda	507
4263	mejdrech	508	while (true) {
		509	if (len >= max_len)
		510	break;
4011	svoboda	511
4263	mejdrech	512	c1 = str_decode(s1, &off1, STR_NO_LIMIT);
		513	c2 = str_decode(s2, &off2, STR_NO_LIMIT);
4011	svoboda	514
4263	mejdrech	515	if (c1 < c2)
		516	return -1;
		517
		518	if (c1 > c2)
		519	return 1;
		520
		521	if (c1 == 0 \|\| c2 == 0)
		522	break;
		523
		524	++len;
		525	}
		526
		527	return 0;
		528
		529	}
		530
		531	/** Copy NULL-terminated string.
4011	svoboda	532	*
4263	mejdrech	533	* Copy source string @a src to destination buffer @a dst.
		534	* No more than @a size bytes are written. NULL-terminator is always
		535	* written after the last succesfully copied character (i.e. if the
		536	* destination buffer is has at least 1 byte, it will be always
		537	* NULL-terminated).
4011	svoboda	538	*
4263	mejdrech	539	* @param src Source string.
		540	* @param dst Destination buffer.
		541	* @param count Size of the destination buffer.
4014	decky	542	*
4011	svoboda	543	*/
4263	mejdrech	544	void str_ncpy(char dst, const char src, size_t size)
4011	svoboda	545	{
4263	mejdrech	546	/* No space for the NULL-terminator in the buffer */
		547	if (size == 0)
		548	return;
4014	decky	549
4263	mejdrech	550	wchar_t ch;
		551	size_t str_off = 0;
		552	size_t dst_off = 0;
		553
		554	while ((ch = str_decode(src, &str_off, STR_NO_LIMIT)) != 0) {
		555	if (chr_encode(ch, dst, &dst_off, size) != EOK)
		556	break;
4011	svoboda	557	}
4014	decky	558
4263	mejdrech	559	if (dst_off >= size)
		560	dst[size - 1] = 0;
		561	else
		562	dst[dst_off] = 0;
4011	svoboda	563	}
		564
4263	mejdrech	565	/** Copy NULL-terminated wide string to string
		566	*
		567	* Copy source wide string @a src to destination buffer @a dst.
		568	* No more than @a size bytes are written. NULL-terminator is always
		569	* written after the last succesfully copied character (i.e. if the
		570	* destination buffer is has at least 1 byte, it will be always
		571	* NULL-terminated).
		572	*
		573	* @param src Source wide string.
		574	* @param dst Destination buffer.
		575	* @param count Size of the destination buffer.
		576	*
		577	*/
		578	void wstr_nstr(char dst, const wchar_t src, size_t size)
		579	{
		580	/* No space for the NULL-terminator in the buffer */
		581	if (size == 0)
		582	return;
		583
		584	wchar_t ch;
		585	count_t src_idx = 0;
		586	size_t dst_off = 0;
		587
		588	while ((ch = src[src_idx++]) != 0) {
		589	if (chr_encode(ch, dst, &dst_off, size) != EOK)
		590	break;
		591	}
		592
		593	if (dst_off >= size)
		594	dst[size - 1] = 0;
		595	else
		596	dst[dst_off] = 0;
		597	}
		598
4012	svoboda	599	/** Find first occurence of character in string.
		600	*
4263	mejdrech	601	* @param str String to search.
		602	* @param ch Character to look for.
4012	svoboda	603	*
4263	mejdrech	604	* @return Pointer to character in @a str or NULL if not found.
		605	*
4012	svoboda	606	*/
4263	mejdrech	607	const char str_chr(const char str, wchar_t ch)
4012	svoboda	608	{
4263	mejdrech	609	wchar_t acc;
		610	size_t off = 0;
		611
		612	while ((acc = str_decode(str, &off, STR_NO_LIMIT)) != 0) {
		613	if (acc == ch)
		614	return (str + off);
4012	svoboda	615	}
4014	decky	616
4012	svoboda	617	return NULL;
		618	}
		619
4263	mejdrech	620	/** Insert a wide character into a wide string.
		621	*
		622	* Insert a wide character into a wide string at position
		623	* @a pos. The characters after the position are shifted.
		624	*
		625	* @param str String to insert to.
		626	* @param ch Character to insert to.
		627	* @param pos Character index where to insert.
		628	@ @param max_pos Characters in the buffer.
		629	*
		630	* @return True if the insertion was sucessful, false if the position
		631	* is out of bounds.
		632	*
		633	*/
		634	bool wstr_linsert(wchar_t *str, wchar_t ch, count_t pos, count_t max_pos)
		635	{
		636	count_t len = wstr_length(str);
		637
		638	if ((pos > len) \|\| (pos + 1 > max_pos))
		639	return false;
		640
		641	count_t i;
		642	for (i = len; i + 1 > pos; i--)
		643	str[i + 1] = str[i];
		644
		645	str[pos] = ch;
		646
		647	return true;
		648	}
		649
		650	/** Remove a wide character from a wide string.
		651	*
		652	* Remove a wide character from a wide string at position
		653	* @a pos. The characters after the position are shifted.
		654	*
		655	* @param str String to remove from.
		656	* @param pos Character index to remove.
		657	*
		658	* @return True if the removal was sucessful, false if the position
		659	* is out of bounds.
		660	*
		661	*/
		662	bool wstr_remove(wchar_t *str, count_t pos)
		663	{
		664	count_t len = wstr_length(str);
		665
		666	if (pos >= len)
		667	return false;
		668
		669	count_t i;
		670	for (i = pos + 1; i <= len; i++)
		671	str[i - 1] = str[i];
		672
		673	return true;
		674	}
		675
4011	svoboda	676	/** @}
		677	*/

Subversion Repositories HelenOS

(root)/branches/network/kernel/generic/src/lib/string.c – Rev 4263