gunicode.h File Reference

__G_UNICODE_H__

gunichar

G_BEGIN_DECLS typedef guint32

gunichar

Typedef gunichar2

typedef guint16

gunichar2

Enum GUnicodeType

Enumerator	Value	Description
G_UNICODE_CONTROL
G_UNICODE_FORMAT
G_UNICODE_UNASSIGNED
G_UNICODE_PRIVATE_USE
G_UNICODE_SURROGATE
G_UNICODE_LOWERCASE_LETTER
G_UNICODE_MODIFIER_LETTER
G_UNICODE_OTHER_LETTER
G_UNICODE_TITLECASE_LETTER
G_UNICODE_UPPERCASE_LETTER
G_UNICODE_COMBINING_MARK
G_UNICODE_ENCLOSING_MARK
G_UNICODE_NON_SPACING_MARK
G_UNICODE_DECIMAL_NUMBER
G_UNICODE_LETTER_NUMBER
G_UNICODE_OTHER_NUMBER
G_UNICODE_CONNECT_PUNCTUATION
G_UNICODE_DASH_PUNCTUATION
G_UNICODE_CLOSE_PUNCTUATION
G_UNICODE_FINAL_PUNCTUATION
G_UNICODE_INITIAL_PUNCTUATION
G_UNICODE_OTHER_PUNCTUATION
G_UNICODE_OPEN_PUNCTUATION
G_UNICODE_CURRENCY_SYMBOL
G_UNICODE_MODIFIER_SYMBOL
G_UNICODE_MATH_SYMBOL
G_UNICODE_OTHER_SYMBOL
G_UNICODE_LINE_SEPARATOR
G_UNICODE_PARAGRAPH_SEPARATOR
G_UNICODE_SPACE_SEPARATOR

Enum GUnicodeBreakType

Enumerator	Value	Description
G_UNICODE_BREAK_MANDATORY
G_UNICODE_BREAK_CARRIAGE_RETURN
G_UNICODE_BREAK_LINE_FEED
G_UNICODE_BREAK_COMBINING_MARK
G_UNICODE_BREAK_SURROGATE
G_UNICODE_BREAK_ZERO_WIDTH_SPACE
G_UNICODE_BREAK_INSEPARABLE
G_UNICODE_BREAK_NON_BREAKING_GLUE
G_UNICODE_BREAK_CONTINGENT
G_UNICODE_BREAK_SPACE
G_UNICODE_BREAK_AFTER
G_UNICODE_BREAK_BEFORE
G_UNICODE_BREAK_BEFORE_AND_AFTER
G_UNICODE_BREAK_HYPHEN
G_UNICODE_BREAK_NON_STARTER
G_UNICODE_BREAK_OPEN_PUNCTUATION
G_UNICODE_BREAK_CLOSE_PUNCTUATION
G_UNICODE_BREAK_QUOTATION
G_UNICODE_BREAK_EXCLAMATION
G_UNICODE_BREAK_IDEOGRAPHIC
G_UNICODE_BREAK_NUMERIC
G_UNICODE_BREAK_INFIX_SEPARATOR
G_UNICODE_BREAK_SYMBOL
G_UNICODE_BREAK_ALPHABETIC
G_UNICODE_BREAK_PREFIX
G_UNICODE_BREAK_POSTFIX
G_UNICODE_BREAK_COMPLEX_CONTEXT
G_UNICODE_BREAK_AMBIGUOUS
G_UNICODE_BREAK_UNKNOWN
G_UNICODE_BREAK_NEXT_LINE
G_UNICODE_BREAK_WORD_JOINER
G_UNICODE_BREAK_HANGUL_L_JAMO
G_UNICODE_BREAK_HANGUL_V_JAMO
G_UNICODE_BREAK_HANGUL_T_JAMO
G_UNICODE_BREAK_HANGUL_LV_SYLLABLE
G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE

Enum GUnicodeScript

Enumerator	Value	Description
G_UNICODE_SCRIPT_INVALID_CODE	-1
G_UNICODE_SCRIPT_COMMON	0
G_UNICODE_SCRIPT_INHERITED
G_UNICODE_SCRIPT_ARABIC
G_UNICODE_SCRIPT_ARMENIAN
G_UNICODE_SCRIPT_BENGALI
G_UNICODE_SCRIPT_BOPOMOFO
G_UNICODE_SCRIPT_CHEROKEE
G_UNICODE_SCRIPT_COPTIC
G_UNICODE_SCRIPT_CYRILLIC
G_UNICODE_SCRIPT_DESERET
G_UNICODE_SCRIPT_DEVANAGARI
G_UNICODE_SCRIPT_ETHIOPIC
G_UNICODE_SCRIPT_GEORGIAN
G_UNICODE_SCRIPT_GOTHIC
G_UNICODE_SCRIPT_GREEK
G_UNICODE_SCRIPT_GUJARATI
G_UNICODE_SCRIPT_GURMUKHI
G_UNICODE_SCRIPT_HAN
G_UNICODE_SCRIPT_HANGUL
G_UNICODE_SCRIPT_HEBREW
G_UNICODE_SCRIPT_HIRAGANA
G_UNICODE_SCRIPT_KANNADA
G_UNICODE_SCRIPT_KATAKANA
G_UNICODE_SCRIPT_KHMER
G_UNICODE_SCRIPT_LAO
G_UNICODE_SCRIPT_LATIN
G_UNICODE_SCRIPT_MALAYALAM
G_UNICODE_SCRIPT_MONGOLIAN
G_UNICODE_SCRIPT_MYANMAR
G_UNICODE_SCRIPT_OGHAM
G_UNICODE_SCRIPT_OLD_ITALIC
G_UNICODE_SCRIPT_ORIYA
G_UNICODE_SCRIPT_RUNIC
G_UNICODE_SCRIPT_SINHALA
G_UNICODE_SCRIPT_SYRIAC
G_UNICODE_SCRIPT_TAMIL
G_UNICODE_SCRIPT_TELUGU
G_UNICODE_SCRIPT_THAANA
G_UNICODE_SCRIPT_THAI
G_UNICODE_SCRIPT_TIBETAN
G_UNICODE_SCRIPT_CANADIAN_ABORIGINAL
G_UNICODE_SCRIPT_YI
G_UNICODE_SCRIPT_TAGALOG
G_UNICODE_SCRIPT_HANUNOO
G_UNICODE_SCRIPT_BUHID
G_UNICODE_SCRIPT_TAGBANWA
G_UNICODE_SCRIPT_BRAILLE
G_UNICODE_SCRIPT_CYPRIOT
G_UNICODE_SCRIPT_LIMBU
G_UNICODE_SCRIPT_OSMANYA
G_UNICODE_SCRIPT_SHAVIAN
G_UNICODE_SCRIPT_LINEAR_B
G_UNICODE_SCRIPT_TAI_LE
G_UNICODE_SCRIPT_UGARITIC
G_UNICODE_SCRIPT_NEW_TAI_LUE
G_UNICODE_SCRIPT_BUGINESE
G_UNICODE_SCRIPT_GLAGOLITIC
G_UNICODE_SCRIPT_TIFINAGH
G_UNICODE_SCRIPT_SYLOTI_NAGRI
G_UNICODE_SCRIPT_OLD_PERSIAN
G_UNICODE_SCRIPT_KHAROSHTHI
G_UNICODE_SCRIPT_UNKNOWN
G_UNICODE_SCRIPT_BALINESE
G_UNICODE_SCRIPT_CUNEIFORM
G_UNICODE_SCRIPT_PHOENICIAN
G_UNICODE_SCRIPT_PHAGS_PA
G_UNICODE_SCRIPT_NKO
G_UNICODE_SCRIPT_KAYAH_LI
G_UNICODE_SCRIPT_LEPCHA
G_UNICODE_SCRIPT_REJANG
G_UNICODE_SCRIPT_SUNDANESE
G_UNICODE_SCRIPT_SAURASHTRA
G_UNICODE_SCRIPT_CHAM
G_UNICODE_SCRIPT_OL_CHIKI
G_UNICODE_SCRIPT_VAI
G_UNICODE_SCRIPT_CARIAN
G_UNICODE_SCRIPT_LYCIAN
G_UNICODE_SCRIPT_LYDIAN

g_get_charset ( G_CONST_RETURN char ** )

IMPORT_C gboolean

g_get_charset

(

G_CONST_RETURN char **

charset

)

g_get_charset: charset: return location for character set name

Obtains the character set for the current locale (see setlocale()); you might use this character set as an argument to g_convert(), to convert from the current locale's encoding to some other encoding. (Frequently g_locale_to_utf8() and g_locale_from_utf8() are nice shortcuts, though.)

On Windows the character set returned by this function is the so-called system default ANSI code-page. That is the character set used by the "narrow" versions of C library and Win32 functions that handle file names. It might be different from the character set used by the C library's current locale.

The return value is TRUE if the locale's encoding is UTF-8; in that case you can perhaps avoid calling g_convert().

The string returned in charset is not allocated, and should not be freed.

Return value: TRUE if the returned charset is UTF-8

g_unichar_isalnum ( gunichar )

IMPORT_C gboolean

g_unichar_isalnum

(

gunichar

)

g_unichar_isalnum: : a Unicode character

Determines whether a character is alphanumeric. Given some UTF-8 text, obtain a character value with g_utf8_get_char().

Return value: TRUE if the character is an alphanumeric character

g_unichar_isalpha ( gunichar )

IMPORT_C gboolean

g_unichar_isalpha

(

gunichar

)

g_unichar_isalpha: : a Unicode character

Determines whether a character is alphabetic (i.e. a letter). Given some UTF-8 text, obtain a character value with g_utf8_get_char().

Return value: TRUE if the character is an alphabetic character

g_unichar_iscntrl ( gunichar )

IMPORT_C gboolean

g_unichar_iscntrl

(

gunichar

)

g_unichar_iscntrl: : a Unicode character

Determines whether a character is a control character. Given some UTF-8 text, obtain a character value with g_utf8_get_char().

Return value: TRUE if the character is a control character

g_unichar_isdigit ( gunichar )

IMPORT_C gboolean

g_unichar_isdigit

(

gunichar

)

g_unichar_isdigit: : a Unicode character

Determines whether a character is numeric (i.e. a digit). This covers ASCII 0-9 and also digits in other languages/scripts. Given some UTF-8 text, obtain a character value with g_utf8_get_char().

Return value: TRUE if the character is a digit

g_unichar_isgraph ( gunichar )

IMPORT_C gboolean

g_unichar_isgraph

(

gunichar

)

g_unichar_isgraph: : a Unicode character

Determines whether a character is printable and not a space (returns FALSE for control characters, format characters, and spaces). g_unichar_isprint() is similar, but returns TRUE for spaces. Given some UTF-8 text, obtain a character value with g_utf8_get_char().

Return value: TRUE if the character is printable and is not a space

g_unichar_islower ( gunichar )

IMPORT_C gboolean

g_unichar_islower

(

gunichar

)

g_unichar_islower: : a Unicode character

Determines whether a character is a lowercase letter. Given some UTF-8 text, obtain a character value with g_utf8_get_char().

Return value: TRUE if the character is a lowercase letter

g_unichar_isprint ( gunichar )

IMPORT_C gboolean

g_unichar_isprint

(

gunichar

)

g_unichar_isprint: : a Unicode character

Determines whether a character is printable. Unlike g_unichar_isgraph(), returns TRUE for spaces. Given some UTF-8 text, obtain a character value with g_utf8_get_char().

Return value: TRUE if the character is printable

g_unichar_ispunct ( gunichar )

IMPORT_C gboolean

g_unichar_ispunct

(

gunichar

)

g_unichar_ispunct: : a Unicode character

Determines whether a character is punctuation or a symbol. Given some UTF-8 text, obtain a character value with g_utf8_get_char().

Return value: TRUE if the character is a punctuation or symbol character

g_unichar_isspace ( gunichar )

IMPORT_C gboolean

g_unichar_isspace

(

gunichar

)

g_unichar_isspace: : a Unicode character

Determines whether a character is a space, tab, or line separator (newline, carriage return, etc.). Given some UTF-8 text, obtain a character value with g_utf8_get_char().

(Note: don't use this to do word breaking; you have to use Pango or equivalent to get word breaking right, the algorithm is fairly complex.)

Return value: TRUE if the character is a space character

g_unichar_isupper ( gunichar )

IMPORT_C gboolean

g_unichar_isupper

(

gunichar

)

g_unichar_isupper: : a Unicode character

Determines if a character is uppercase.

Return value: TRUE if the character is an uppercase character

g_unichar_isxdigit ( gunichar )

IMPORT_C gboolean

g_unichar_isxdigit

(

gunichar

)

g_unichar_isxdigit: : a Unicode character.

Determines if a character is a hexadecimal digit.

Return value: TRUE if the character is a hexadecimal digit

g_unichar_istitle ( gunichar )

IMPORT_C gboolean

g_unichar_istitle

(

gunichar

)

g_unichar_istitle: : a Unicode character

Determines if a character is titlecase. Some characters in Unicode which are composites, such as the DZ digraph, have three case variants instead of just two. The titlecase form is used at the beginning of a word where only the first letter is capitalized. The titlecase form of the DZ digraph is U+01F2 LATIN CAPITAL LETTER D WITH SMALL LETTER Z.

Return value: TRUE if the character is titlecase

g_unichar_isdefined ( gunichar )

IMPORT_C gboolean

g_unichar_isdefined

(

gunichar

)

g_unichar_isdefined: : a Unicode character

Determines if a given character is assigned in the Unicode standard.

Return value: TRUE if the character has an assigned value

g_unichar_iswide ( gunichar )

IMPORT_C gboolean

g_unichar_iswide

(

gunichar

)

g_unichar_iswide: : a Unicode character

Determines if a character is typically rendered in a double-width cell.

Return value: TRUE if the character is wide

g_unichar_iswide_cjk ( gunichar )

IMPORT_C gboolean

g_unichar_iswide_cjk

(

gunichar

)

g_unichar_iswide_cjk: : a Unicode character

Determines if a character is typically rendered in a double-width cell under legacy East Asian locales. If a character is wide according to g_unichar_iswide(), then it is also reported wide with this function, but the converse is not necessarily true. See Unicode Standard Annex #11 (http://www.unicode.org/reports/tr11/) for details.

If a character passes the g_unichar_iswide() test then it will also pass this test, but not the other way around. Note that some characters may pass both this test and g_unichar_iszerowidth().

Return value: TRUE if the character is wide in legacy East Asian locales

Since: 2.12

g_unichar_iszerowidth ( gunichar )

IMPORT_C gboolean

g_unichar_iszerowidth

(

gunichar

)

g_unichar_iszerowidth: : a Unicode character

Determines if a given character typically takes zero width when rendered. The return value is TRUE for all non-spacing and enclosing marks (e.g., combining accents), format characters, zero-width space, but not U+00AD SOFT HYPHEN.

A typical use of this function is with one of g_unichar_iswide() or g_unichar_iswide_cjk() to determine the number of cells a string occupies when displayed on a grid display (terminals). However, note that not all terminals support zero-width rendering of zero-width marks.

Return value: TRUE if the character has zero width

Since: 2.14

g_unichar_ismark ( gunichar )

IMPORT_C gboolean

g_unichar_ismark

(

gunichar

)

g_unichar_ismark: : a Unicode character

Determines whether a character is a mark (non-spacing mark, combining mark, or enclosing mark in Unicode speak). Given some UTF-8 text, obtain a character value with g_utf8_get_char().

Note: in most cases where isalpha characters are allowed, ismark characters should be allowed too, as they are essential for writing most European languages as well as many non-Latin scripts.

Return value: TRUE if the character is a mark character

Since: 2.14

g_unichar_toupper ( gunichar )

IMPORT_C gunichar

g_unichar_toupper

(

gunichar

)

g_unichar_toupper: : a Unicode character

Converts a character to uppercase.

Return value: the result of converting the character to uppercase. If the character is not a lowercase or titlecase character, or has no uppercase equivalent, it is returned unchanged.

g_unichar_tolower ( gunichar )

IMPORT_C gunichar

g_unichar_tolower

(

gunichar

)

g_unichar_tolower: : a Unicode character.

Converts a character to lower case.

Return value: the result of converting the character to lower case. If the character is not an uppercase or titlecase character, or has no lowercase equivalent, it is returned unchanged.

g_unichar_totitle ( gunichar )

IMPORT_C gunichar

g_unichar_totitle

(

gunichar

)

g_unichar_totitle: : a Unicode character

Converts a character to titlecase.

Return value: the result of converting the character to titlecase. If the character is not an uppercase or lowercase character, it is returned unchanged.

g_unichar_digit_value ( gunichar )

IMPORT_C gint

g_unichar_digit_value

(

gunichar

)

g_unichar_digit_value: : a Unicode character

Determines the numeric value of a character as a decimal digit.

Return value: If the character is a decimal digit (according to g_unichar_isdigit()), its numeric value. Otherwise, -1.

g_unichar_xdigit_value ( gunichar )

IMPORT_C gint

g_unichar_xdigit_value

(

gunichar

)

g_unichar_xdigit_value: : a Unicode character

Determines the numeric value of a character as a hexadecimal digit.

Return value: If the character is a hex digit (according to g_unichar_isxdigit()), its numeric value. Otherwise, -1.

g_unichar_type ( gunichar )

IMPORT_C GUnicodeType

g_unichar_type

(

gunichar

)

g_unichar_type: : a Unicode character

Classifies a Unicode character by type.

Return value: the type of the character.

g_unichar_break_type ( gunichar )

IMPORT_C GUnicodeBreakType

g_unichar_break_type

(

gunichar

)

g_unichar_break_type: : a Unicode character

Determines the break type of a character. The argument should be a Unicode character (to derive a character from UTF-8 encoded text, use g_utf8_get_char()). The break type is used to find word and line breaks ("text boundaries"). Pango implements the Unicode boundary resolution algorithms, and normally you would use a function such as pango_break() instead of caring about break types yourself.

Return value: the break type of the character

g_unichar_combining_class ( gunichar )

IMPORT_C gint

g_unichar_combining_class

(

gunichar

)

g_unichar_combining_class: : a Unicode character

Determines the canonical combining class of a Unicode character.

Return value: the combining class of the character

Since: 2.14

g_unicode_canonical_ordering ( gunichar *, gsize )

IMPORT_C void	g_unicode_canonical_ordering	(	gunichar *	string,
			gsize	len
		)

g_unicode_canonical_ordering: string: a UCS-4 encoded string. len: the maximum length of string to use.

Computes the canonical ordering of a string in-place. This rearranges decomposed characters in the string according to their combining classes. See the Unicode manual for more information.

g_unicode_canonical_decomposition ( gunichar, gsize * )

IMPORT_C gunichar *	g_unicode_canonical_decomposition	(	gunichar	ch,
			gsize *	result_len
		)

g_unicode_canonical_decomposition: ch: a Unicode character. result_len: location to store the length of the return value.

Computes the canonical decomposition of a Unicode character.

Return value: a newly allocated string of Unicode characters. result_len is set to the resulting length of the string.

_g_utf8_skip ( void )

IMPORT_C const gchar *const *

_g_utf8_skip

(

void

)

g_utf8_skip

GLIB_VAR const gchar *const

g_utf8_skip

g_utf8_next_char

g_utf8_get_char ( const gchar * )

IMPORT_C gunichar

g_utf8_get_char

(

const gchar *

)

g_utf8_get_char: : a pointer to Unicode character encoded as UTF-8

Converts a sequence of bytes encoded as UTF-8 to a Unicode character. If the pointer does not point to a valid UTF-8 encoded character, results are undefined. If you are not sure that the bytes are complete valid Unicode characters, you should use g_utf8_get_char_validated() instead.

Return value: the resulting character

g_utf8_get_char_validated ( const gchar *, gssize )

IMPORT_C gunichar	g_utf8_get_char_validated	(	const gchar *	p,
			gssize	max_len
		)

g_utf8_get_char_validated: p: a pointer to Unicode character encoded as UTF-8. max_len: the maximum number of bytes to read, or -1, for no maximum or if p is nul-terminated

Convert a sequence of bytes encoded as UTF-8 to a Unicode character. This function checks for incomplete characters, for invalid characters such as characters that are out of the range of Unicode, and for overlong encodings of valid characters.

Return value: the resulting character. If p points to a partial sequence at the end of a string that could begin a valid character (or if max_len is zero), returns (gunichar)-2; otherwise, if p does not point to a valid UTF-8 encoded Unicode character, returns (gunichar)-1.

g_utf8_offset_to_pointer ( const gchar *, glong )

IMPORT_C gchar *	g_utf8_offset_to_pointer	(	const gchar *	str,
			glong	offset
		)

g_utf8_offset_to_pointer: str: a UTF-8 encoded string. offset: a character offset within str

Converts from an integer character offset to a pointer to a position within the string.

Since 2.10, this function allows passing a negative offset to step backwards. It is usually worth stepping backwards from the end instead of forwards if offset is in the last fourth of the string, since moving forward is about 3 times faster than moving backward.

Note: This function doesn't abort when reaching the end of str. Therefore you should be sure that offset is within string boundaries before calling that function. Call g_utf8_strlen() when unsure.

This limitation exists as this function is called frequently during text rendering and therefore has to be as fast as possible.

Return value: the resulting pointer

g_utf8_pointer_to_offset ( const gchar *, const gchar * )

IMPORT_C glong	g_utf8_pointer_to_offset	(	const gchar *	str,
			const gchar *	pos
		)

g_utf8_pointer_to_offset: str: a UTF-8 encoded string. pos: a pointer to a position within str

Converts from a pointer to a position within a string to an integer character offset.

Since 2.10, this function allows pos to be before str, and returns a negative offset in this case.

Return value: the resulting character offset

g_utf8_prev_char ( const gchar * )

IMPORT_C gchar *

g_utf8_prev_char

(

const gchar *

)

g_utf8_prev_char: : a pointer to a position within a UTF-8 encoded string

Finds the previous UTF-8 character in the string before the given position.

The position does not have to be at the beginning of a UTF-8 character. No check is made to see if the character found is actually valid other than it starts with an appropriate byte. If the position might be the first character of the string, you must use g_utf8_find_prev_char() instead.

Return value: a pointer to the found character.

g_utf8_find_next_char ( const gchar *, const gchar * )

IMPORT_C gchar *	g_utf8_find_next_char	(	const gchar *	p,
			const gchar *	end
		)

g_utf8_find_next_char: p: a pointer to a position within a UTF-8 encoded string. end: a pointer to the byte following the end of the string, or NULL to indicate that the string is nul-terminated.

Finds the start of the next UTF-8 character in the string after p.

p does not have to be at the beginning of a UTF-8 character. No check is made to see if the character found is actually valid other than it starts with an appropriate byte.

Return value: a pointer to the found character or NULL

g_utf8_find_prev_char ( const gchar *, const gchar * )

IMPORT_C gchar *	g_utf8_find_prev_char	(	const gchar *	str,
			const gchar *	p
		)

g_utf8_strlen ( const gchar *, gssize )

IMPORT_C glong	g_utf8_strlen	(	const gchar *	p,
			gssize	max
		)

g_utf8_strlen: p: pointer to the start of a UTF-8 encoded string. max: the maximum number of bytes to examine. If max is less than 0, then the string is assumed to be nul-terminated. If max is 0, p will not be examined and may be NULL.

Returns the length of the string in characters.

Return value: the length of the string in characters

g_utf8_strncpy ( gchar *, const gchar *, gsize )

IMPORT_C gchar *	g_utf8_strncpy	(	gchar *	dest,
			const gchar *	src,
			gsize	n
		)

g_utf8_strncpy: dest: buffer to fill with characters from src. src: UTF-8 encoded string. n: character count

Like the standard C strncpy() function, but copies a given number of characters instead of a given number of bytes. The string must be valid UTF-8 encoded text. (Use g_utf8_validate() on all text before trying to use UTF-8 utility functions with it.)

Return value: dest

g_utf8_strchr ( const gchar *, gssize, gunichar )

IMPORT_C gchar *	g_utf8_strchr	(	const gchar *	p,
			gssize	len,
			gunichar	c
		)

g_utf8_strrchr ( const gchar *, gssize, gunichar )

IMPORT_C gchar *	g_utf8_strrchr	(	const gchar *	p,
			gssize	len,
			gunichar	c
		)

g_utf8_strreverse ( const gchar *, gssize )

IMPORT_C gchar *	g_utf8_strreverse	(	const gchar *	str,
			gssize	len
		)

g_utf8_strreverse: str: a UTF-8 encoded string. len: the maximum length of str to use, in bytes. If len < 0, then the string is nul-terminated.

Reverses a UTF-8 string. str must be valid UTF-8 encoded text. (Use g_utf8_validate() on all text before trying to use UTF-8 utility functions with it.)

This function is intended for programmatic uses of reversed strings. It pays no attention to decomposed characters, combining marks, byte order marks, directional indicators (LRM, LRO, etc) and similar characters which might need special handling when reversing a string for display purposes.

Note that unlike g_strreverse(), this function returns newly-allocated memory, which should be freed with g_free() when no longer needed.

Returns: a newly-allocated string which is the reverse of str.

Since: 2.2

g_utf8_to_utf16 ( const gchar *, glong, glong *, glong *, GError ** )

IMPORT_C gunichar2 *	g_utf8_to_utf16	(	const gchar *	str,
			glong	len,
			glong *	items_read,
			glong *	items_written,
			GError **	error
		)

g_utf8_to_utf16: str: a UTF-8 encoded string. len: the maximum length (number of characters) of str to use. If len < 0, then the string is nul-terminated. items_read: location to store number of bytes read, or NULL. If NULL, then G_CONVERT_ERROR_PARTIAL_INPUT will be returned in case str contains a trailing partial character. If an error occurs then the index of the invalid input is stored here. items_written: location to store number of gunichar2 written, or NULL. The value stored here does not include the trailing 0. error: location to store the error occurring, or NULL to ignore errors. Any of the errors in GConvertError other than G_CONVERT_ERROR_NO_CONVERSION may occur.

Convert a string from UTF-8 to UTF-16. A 0 character will be added to the result after the converted text.

Return value: a pointer to a newly allocated UTF-16 string. This value must be freed with g_free(). If an error occurs, NULL will be returned and error set.

g_utf8_to_ucs4 ( const gchar *, glong, glong *, glong *, GError ** )

IMPORT_C gunichar *	g_utf8_to_ucs4	(	const gchar *	str,
			glong	len,
			glong *	items_read,
			glong *	items_written,
			GError **	error
		)

g_utf8_to_ucs4: str: a UTF-8 encoded string. len: the maximum length of str to use, in bytes. If len < 0, then the string is nul-terminated. items_read: location to store number of bytes read, or NULL. If NULL, then G_CONVERT_ERROR_PARTIAL_INPUT will be returned in case str contains a trailing partial character. If an error occurs then the index of the invalid input is stored here. items_written: location to store number of characters written or NULL. The value stored here does not include the trailing 0 character. error: location to store the error occurring, or NULL to ignore errors. Any of the errors in GConvertError other than G_CONVERT_ERROR_NO_CONVERSION may occur.

Convert a string from UTF-8 to a 32-bit fixed width representation as UCS-4. A trailing 0 will be added to the string after the converted text.

Return value: a pointer to a newly allocated UCS-4 string. This value must be freed with g_free(). If an error occurs, NULL will be returned and error set.

g_utf8_to_ucs4_fast ( const gchar *, glong, glong * )

IMPORT_C gunichar *	g_utf8_to_ucs4_fast	(	const gchar *	str,
			glong	len,
			glong *	items_written
		)

g_utf8_to_ucs4_fast: str: a UTF-8 encoded string. len: the maximum length of str to use, in bytes. If len < 0, then the string is nul-terminated. items_written: location to store the number of characters in the result, or NULL.

Convert a string from UTF-8 to a 32-bit fixed width representation as UCS-4, assuming valid UTF-8 input. This function is roughly twice as fast as g_utf8_to_ucs4() but does no error checking on the input.

Return value: a pointer to a newly allocated UCS-4 string. This value must be freed with g_free().

g_utf16_to_ucs4 ( const gunichar2 *, glong, glong *, glong *, GError ** )

IMPORT_C gunichar *	g_utf16_to_ucs4	(	const gunichar2 *	str,
			glong	len,
			glong *	items_read,
			glong *	items_written,
			GError **	error
		)

g_utf16_to_ucs4: str: a UTF-16 encoded string. len: the maximum length (number of gunichar2) of str to use. If len < 0, then the string is nul-terminated. items_read: location to store number of words read, or NULL. If NULL, then G_CONVERT_ERROR_PARTIAL_INPUT will be returned in case str contains a trailing partial character. If an error occurs then the index of the invalid input is stored here. items_written: location to store number of characters written, or NULL. The value stored here does not include the trailing 0 character. error: location to store the error occurring, or NULL to ignore errors. Any of the errors in GConvertError other than G_CONVERT_ERROR_NO_CONVERSION may occur.

Convert a string from UTF-16 to UCS-4. The result will be nul-terminated.

Return value: a pointer to a newly allocated UCS-4 string. This value must be freed with g_free(). If an error occurs, NULL will be returned and error set.

g_utf16_to_utf8 ( const gunichar2 *, glong, glong *, glong *, GError ** )

IMPORT_C gchar *	g_utf16_to_utf8	(	const gunichar2 *	str,
			glong	len,
			glong *	items_read,
			glong *	items_written,
			GError **	error
		)

g_utf16_to_utf8: str: a UTF-16 encoded string. len: the maximum length (number of gunichar2) of str to use. If len < 0, then the string is nul-terminated. items_read: location to store number of words read, or NULL. If NULL, then G_CONVERT_ERROR_PARTIAL_INPUT will be returned in case str contains a trailing partial character. If an error occurs then the index of the invalid input is stored here. items_written: location to store number of bytes written, or NULL. The value stored here does not include the trailing 0 byte. error: location to store the error occurring, or NULL to ignore errors. Any of the errors in GConvertError other than G_CONVERT_ERROR_NO_CONVERSION may occur.

Convert a string from UTF-16 to UTF-8. The result will be terminated with a 0 byte.

Note that the input is expected to be already in native endianness; an initial byte-order-mark character is not handled specially. g_convert() can be used to convert a byte buffer of UTF-16 data of ambiguous endianness.

Return value: a pointer to a newly allocated UTF-8 string. This value must be freed with g_free(). If an error occurs, NULL will be returned and error set.

g_ucs4_to_utf16 ( const gunichar *, glong, glong *, glong *, GError ** )

IMPORT_C gunichar2 *	g_ucs4_to_utf16	(	const gunichar *	str,
			glong	len,
			glong *	items_read,
			glong *	items_written,
			GError **	error
		)

g_ucs4_to_utf16: str: a UCS-4 encoded string. len: the maximum length (number of characters) of str to use. If len < 0, then the string is nul-terminated. items_read: location to store number of bytes read, or NULL. If an error occurs then the index of the invalid input is stored here. items_written: location to store number of gunichar2 written, or NULL. The value stored here does not include the trailing 0. error: location to store the error occurring, or NULL to ignore errors. Any of the errors in GConvertError other than G_CONVERT_ERROR_NO_CONVERSION may occur.

Convert a string from UCS-4 to UTF-16. A 0 character will be added to the result after the converted text.

Return value: a pointer to a newly allocated UTF-16 string. This value must be freed with g_free(). If an error occurs, NULL will be returned and error set.

g_ucs4_to_utf8 ( const gunichar *, glong, glong *, glong *, GError ** )

IMPORT_C gchar *	g_ucs4_to_utf8	(	const gunichar *	str,
			glong	len,
			glong *	items_read,
			glong *	items_written,
			GError **	error
		)

g_ucs4_to_utf8: str: a UCS-4 encoded string. len: the maximum length (number of characters) of str to use. If len < 0, then the string is nul-terminated. items_read: location to store number of characters read, or NULL. items_written: location to store number of bytes written or NULL. The value stored here does not include the trailing 0 byte. error: location to store the error occurring, or NULL to ignore errors. Any of the errors in GConvertError other than G_CONVERT_ERROR_NO_CONVERSION may occur.

Convert a string from a 32-bit fixed width representation as UCS-4 to UTF-8. The result will be terminated with a 0 byte.

Return value: a pointer to a newly allocated UTF-8 string. This value must be freed with g_free(). If an error occurs, NULL will be returned and error set. In that case, items_read will be set to the position of the first invalid input character.

g_unichar_to_utf8 ( gunichar, gchar * )

IMPORT_C gint	g_unichar_to_utf8	(	gunichar	c,
			gchar *	outbuf
		)

g_unichar_to_utf8: c: a Unicode character code. outbuf: output buffer, must have at least 6 bytes of space. If NULL, the length will be computed and returned and nothing will be written to outbuf.

Converts a single character to UTF-8.

Return value: number of bytes written

g_utf8_validate ( const gchar *, gssize, const gchar ** )

IMPORT_C gboolean	g_utf8_validate	(	const gchar *	str,
			gssize	max_len,
			const gchar **	end
		)

g_unichar_validate ( gunichar )

IMPORT_C gboolean

g_unichar_validate

(

gunichar

)

g_unichar_validate: : a Unicode character

Checks whether the given value is a valid Unicode character. Some possible integer values of the argument will not be valid. 0 is considered a valid character, though it's normally a string terminator.

Return value: TRUE if the given value is a valid Unicode character

g_utf8_strup ( const gchar *, gssize )

IMPORT_C gchar *	g_utf8_strup	(	const gchar *	str,
			gssize	len
		)

g_utf8_strup: str: a UTF-8 encoded string. len: length of str, in bytes, or -1 if str is nul-terminated.

Converts all Unicode characters in the string that have a case to uppercase. The exact manner that this is done depends on the current locale, and may result in the number of characters in the string increasing. (For instance, the German ess-zet will be changed to SS.)

Return value: a newly allocated string, with all characters converted to uppercase.

g_utf8_strdown ( const gchar *, gssize )

IMPORT_C gchar *	g_utf8_strdown	(	const gchar *	str,
			gssize	len
		)

g_utf8_strdown: str: a UTF-8 encoded string. len: length of str, in bytes, or -1 if str is nul-terminated.

Converts all Unicode characters in the string that have a case to lowercase. The exact manner that this is done depends on the current locale, and may result in the number of characters in the string changing.

Return value: a newly allocated string, with all characters converted to lowercase.

g_utf8_casefold ( const gchar *, gssize )

IMPORT_C gchar *	g_utf8_casefold	(	const gchar *	str,
			gssize	len
		)

g_utf8_casefold: str: a UTF-8 encoded string. len: length of str, in bytes, or -1 if str is nul-terminated.

Converts a string into a form that is independent of case. The result will not correspond to any particular case, but can be compared for equality or ordered with the results of calling g_utf8_casefold() on other strings.

Note that calling g_utf8_casefold() followed by g_utf8_collate() is only an approximation to the correct linguistic case insensitive ordering, though it is a fairly good one. Getting this exactly right would require a more sophisticated collation function that takes case sensitivity into account. GLib does not currently provide such a function.

Return value: a newly allocated string, that is a case independent form of str.

Enum GNormalizeMode

Enumerator	Value	Description
G_NORMALIZE_DEFAULT
G_NORMALIZE_NFD	G_NORMALIZE_DEFAULT
G_NORMALIZE_DEFAULT_COMPOSE
G_NORMALIZE_NFC	G_NORMALIZE_DEFAULT_COMPOSE
G_NORMALIZE_ALL
G_NORMALIZE_NFKD	G_NORMALIZE_ALL
G_NORMALIZE_ALL_COMPOSE
G_NORMALIZE_NFKC	G_NORMALIZE_ALL_COMPOSE

g_utf8_normalize ( const gchar *, gssize, GNormalizeMode )

IMPORT_C gchar *	g_utf8_normalize	(	const gchar *	str,
			gssize	len,
			GNormalizeMode	mode
		)

g_utf8_normalize: str: a UTF-8 encoded string. len: length of str, in bytes, or -1 if str is nul-terminated. mode: the type of normalization to perform.

Converts a string into canonical form, standardizing such issues as whether a character with an accent is represented as a base character and combining accent or as a single precomposed character. The string has to be valid UTF-8, otherwise NULL is returned. You should generally call g_utf8_normalize() before comparing two Unicode strings.

The normalization mode G_NORMALIZE_DEFAULT only standardizes differences that do not affect the text content, such as the above-mentioned accent representation. G_NORMALIZE_ALL also standardizes the "compatibility" characters in Unicode, such as SUPERSCRIPT THREE to the standard forms (in this case DIGIT THREE). Formatting information may be lost but for most text operations such characters should be considered the same.

G_NORMALIZE_DEFAULT_COMPOSE and G_NORMALIZE_ALL_COMPOSE are like G_NORMALIZE_DEFAULT and G_NORMALIZE_ALL, but return a result with composed forms rather than a maximally decomposed form. This is often useful if you intend to convert the string to a legacy encoding or pass it to a system with less capable Unicode handling.

Return value: a newly allocated string, that is the normalized form of str, or NULL if str is not valid UTF-8.

g_utf8_collate ( const gchar *, const gchar * )

IMPORT_C gint	g_utf8_collate	(	const gchar *	str1,
			const gchar *	str2
		)

g_utf8_collate: str1: a UTF-8 encoded string. str2: a UTF-8 encoded string.

Compares two strings for ordering using the linguistically correct rules for the <link linkend="setlocale">current locale</link>. When sorting a large number of strings, it will be significantly faster to obtain collation keys with g_utf8_collate_key() and compare the keys with strcmp() when sorting instead of sorting the original strings.

Return value: < 0 if str1 compares before str2, 0 if they compare equal, > 0 if str1 compares after str2.

g_utf8_collate_key ( const gchar *, gssize )

IMPORT_C gchar *	g_utf8_collate_key	(	const gchar *	str,
			gssize	len
		)

g_utf8_collate_key: str: a UTF-8 encoded string. len: length of str, in bytes, or -1 if str is nul-terminated.

Converts a string into a collation key that can be compared with other collation keys produced by the same function using strcmp().

The results of comparing the collation keys of two strings with strcmp() will always be the same as comparing the two original strings with g_utf8_collate().

Note that this function depends on the <link linkend="setlocale">current locale</link>.

Return value: a newly allocated string. This string should be freed with g_free() when you are done with it.

g_utf8_collate_key_for_filename ( const gchar *, gssize )

IMPORT_C gchar *	g_utf8_collate_key_for_filename	(	const gchar *	str,
			gssize	len
		)

g_utf8_collate_key_for_filename: str: a UTF-8 encoded string. len: length of str, in bytes, or -1 if str is nul-terminated.

Converts a string into a collation key that can be compared with other collation keys produced by the same function using strcmp().

In order to sort filenames correctly, this function treats the dot '.' as a special case. Most dictionary orderings seem to consider it insignificant, thus producing the ordering "event.c" "eventgenerator.c" "event.h" instead of "event.c" "event.h" "eventgenerator.c". Also, we would like to treat numbers intelligently so that "file1" "file10" "file5" is sorted as "file1" "file5" "file10".

Note that this function depends on the <link linkend="setlocale">current locale</link>.

Return value: a newly allocated string. This string should be freed with g_free() when you are done with it.

Since: 2.8

g_unichar_get_mirror_char ( gunichar, gunichar * )

IMPORT_C gboolean	g_unichar_get_mirror_char	(	gunichar	ch,
			gunichar *	mirrored_ch
		)

g_unichar_get_mirror_char: ch: a Unicode character. mirrored_ch: location to store the mirrored character.

In Unicode, some characters are <firstterm>mirrored</firstterm>. This means that their images are mirrored horizontally in text that is laid out from right to left. For instance, "(" would become its mirror image, ")", in right-to-left text.

If ch has the Unicode mirrored property and there is another Unicode character that typically has a glyph that is the mirror image of ch's glyph and mirrored_ch is set, it puts that character in the address pointed to by mirrored_ch. Otherwise the original character is put.

Return value: TRUE if ch has a mirrored character, FALSE otherwise.

Since: 2.4

g_unichar_get_script ( gunichar )

IMPORT_C GUnicodeScript

g_unichar_get_script

(

gunichar

)

g_unichar_get_script: : a Unicode character

Looks up the GUnicodeScript for a particular character (as defined by Unicode Standard Annex #24). No check is made for the character being a valid Unicode character; if you pass in an invalid character, the result is undefined.

This function is equivalent to pango_script_for_unichar() and the two are interchangeable.

Return value: the GUnicodeScript for the character.

Since: 2.14

_g_utf8_make_valid ( const gchar * )

gchar *

_g_utf8_make_valid

(

const gchar *

name

)

gunicode.h File Reference

__G_UNICODE_H__

gunichar

Typedef gunichar2

Enum GUnicodeType

Enum GUnicodeBreakType

Enum GUnicodeScript

g_get_charset ( G_CONST_RETURN char ** )

g_unichar_isalnum ( gunichar )

g_unichar_isalpha ( gunichar )

g_unichar_iscntrl ( gunichar )

g_unichar_isdigit ( gunichar )

g_unichar_isgraph ( gunichar )

g_unichar_islower ( gunichar )

g_unichar_isprint ( gunichar )

g_unichar_ispunct ( gunichar )

g_unichar_isspace ( gunichar )

g_unichar_isupper ( gunichar )

g_unichar_isxdigit ( gunichar )

g_unichar_istitle ( gunichar )

g_unichar_isdefined ( gunichar )

g_unichar_iswide ( gunichar )

g_unichar_iswide_cjk ( gunichar )

g_unichar_iszerowidth ( gunichar )

g_unichar_ismark ( gunichar )

g_unichar_toupper ( gunichar )

g_unichar_tolower ( gunichar )

g_unichar_totitle ( gunichar )

g_unichar_digit_value ( gunichar )

g_unichar_xdigit_value ( gunichar )

g_unichar_type ( gunichar )

g_unichar_break_type ( gunichar )

g_unichar_combining_class ( gunichar )

g_unicode_canonical_ordering ( gunichar *, gsize )

g_unicode_canonical_decomposition ( gunichar, gsize * )

_g_utf8_skip ( void )

g_utf8_skip

g_utf8_next_char

g_utf8_get_char ( const gchar * )

g_utf8_get_char_validated ( const gchar *, gssize )

g_utf8_offset_to_pointer ( const gchar *, glong )

g_utf8_pointer_to_offset ( const gchar *, const gchar * )

g_utf8_prev_char ( const gchar * )

g_utf8_find_next_char ( const gchar *, const gchar * )

g_utf8_find_prev_char ( const gchar *, const gchar * )

g_utf8_strlen ( const gchar *, gssize )

g_utf8_strncpy ( gchar *, const gchar *, gsize )

g_utf8_strchr ( const gchar *, gssize, gunichar )

g_utf8_strrchr ( const gchar *, gssize, gunichar )

g_utf8_strreverse ( const gchar *, gssize )

g_utf8_to_utf16 ( const gchar *, glong, glong *, glong *, GError ** )

g_utf8_to_ucs4 ( const gchar *, glong, glong *, glong *, GError ** )

g_utf8_to_ucs4_fast ( const gchar *, glong, glong * )

g_utf16_to_ucs4 ( const gunichar2 *, glong, glong *, glong *, GError ** )

g_utf16_to_utf8 ( const gunichar2 *, glong, glong *, glong *, GError ** )

g_ucs4_to_utf16 ( const gunichar *, glong, glong *, glong *, GError ** )

g_ucs4_to_utf8 ( const gunichar *, glong, glong *, glong *, GError ** )

g_unichar_to_utf8 ( gunichar, gchar * )

g_utf8_validate ( const gchar *, gssize, const gchar ** )

g_unichar_validate ( gunichar )

g_utf8_strup ( const gchar *, gssize )

g_utf8_strdown ( const gchar *, gssize )

g_utf8_casefold ( const gchar *, gssize )

Enum GNormalizeMode

g_utf8_normalize ( const gchar *, gssize, GNormalizeMode )

g_utf8_collate ( const gchar *, const gchar * )

g_utf8_collate_key ( const gchar *, gssize )

g_utf8_collate_key_for_filename ( const gchar *, gssize )

g_unichar_get_mirror_char ( gunichar, gunichar * )

g_unichar_get_script ( gunichar )

_g_utf8_make_valid ( const gchar * )

g_utf8_pointer_to_offset ( const gchar *, const gchar * )

g_utf8_find_next_char ( const gchar *, const gchar * )

g_utf8_find_prev_char ( const gchar *, const gchar * )

g_utf8_strncpy ( gchar *, const gchar *, gsize )

g_utf8_to_utf16 ( const gchar *, glong, glong *, glong *, GError ** )

g_utf8_to_ucs4 ( const gchar *, glong, glong *, glong *, GError ** )

g_utf8_to_ucs4_fast ( const gchar *, glong, glong * )

g_utf16_to_ucs4 ( const gunichar2 *, glong, glong *, glong *, GError ** )

g_utf16_to_utf8 ( const gunichar2 *, glong, glong *, glong *, GError ** )

g_ucs4_to_utf16 ( const gunichar *, glong, glong *, glong *, GError ** )

g_ucs4_to_utf8 ( const gunichar *, glong, glong *, glong *, GError ** )

g_utf8_collate ( const gchar *, const gchar * )