#include <cassert>
#include <cstdint>
#include <cstring>
#include "mysql/strings/m_ctype.h"
#include "strings/m_ctype_internals.h"
#include "template_utils.h"

Macros
#define	is_mb_1(c) ((uint8_t)(c) <= 0x7F)

#define	is_mb_odd(c) (0x81 <= (uint8_t)(c) && (uint8_t)(c) <= 0xFE)

#define	is_mb_even_2(c)

#define	is_mb_even_4(c) (0x30 <= (uint8_t)(c) && (uint8_t)(c) <= 0x39)

Functions
static unsigned	gb18030_chs_to_code (const uint8_t *src, size_t srclen)
	Convert a gb18030 code in uchars to the code. The byte sequence in src must be a gb18030 sequence. More...

static size_t	code_to_gb18030_chs (uint8_t *dst, size_t dstlen, unsigned code)
	Convert a gb18030 code to a sequence of chars. More...

static unsigned	diff_to_gb18030_4 (uint8_t *dst, unsigned dstlen, unsigned diff)
	Calculate the 4-byte GB18030 code from a diff value. More...

static unsigned	gb18030_4_code_to_diff (unsigned code)
	Calculate the diff between the 4-byte gb18030 code and GB+81308130. More...

static unsigned	gb18030_4_chs_to_diff (const uint8_t *src)
	Calculate the diff between the 4-byte gb18030 code in bytes and GB+81308130. More...

static unsigned	my_ismbchar_gb18030 (const CHARSET_INFO *cs, const char *p, const char *e)
	Judge if a sequence of chars is in gb18030 multi-bytes code. More...

static unsigned	my_mbcharlen_gb18030 (const CHARSET_INFO *cs, unsigned c)
	Get the length of a possible gb18030 code according to its first byte or first two bytes. More...

static int	my_wc_mb_gb18030_chs (const CHARSET_INFO *cs, my_wc_t wc, uint8_t *s, uint8_t *e)
	Convert the Unicode code to its gb18030 code in bytes. More...

static int	my_mb_wc_gb18030 (const CHARSET_INFO *cs, my_wc_t *pwc, const uint8_t *s, const uint8_t *e)
	Convert a gb18030 code in bytes to unicode code. More...

static size_t	my_well_formed_len_gb18030 (const CHARSET_INFO *cs, const char *b, const char *e, size_t pos, int *error)
	Get the well formed length of a GB18030 string. More...

static const MY_UNICASE_CHARACTER *	get_case_info (const CHARSET_INFO *cs, const uint8_t *src, size_t srclen)
	Get the case info of one gb18030 code in bytes. More...

static unsigned	case_info_code_to_gb18030 (unsigned code)
	Convert the code in one MY_UNICASE_CHARACTER to real gb18030 code. More...

static unsigned	get_casefolded_code (const CHARSET_INFO *cs, const uint8_t *src, size_t srclen, size_t is_upper)
	Get the casefolded code of a given gb18030 code. More...

static size_t	my_casefold_gb18030 (const CHARSET_INFO *cs, char *src, size_t srclen, char *dst, size_t dstlen, const uint8_t *map, bool is_upper)
	Get the casefolded gb18030 codes of a given sequence of gb18030 codes. Store the casefolded result to a specified dest. More...

static size_t	my_casedn_gb18030 (const CHARSET_INFO *cs, char *src, size_t srclen, char *dst, size_t dstlen)
	Convert a gb18030 string to a corresponding lower-case gb18030 string. More...

static size_t	my_caseup_gb18030 (const CHARSET_INFO *cs, char *src, size_t srclen, char *dst, size_t dstlen)
	Convert a gb18030 string to a corresponding up-case gb18030 string. More...

static size_t	my_casedn_gb18030_uca (const CHARSET_INFO *cs, char *src, size_t srclen, char *dst, size_t dstlen)
	Get the casedown gb18030 code of a given gb18030 code. It's only for UCA, because the caseinfo for UCA is different. More...

static size_t	my_caseup_gb18030_uca (const CHARSET_INFO *cs, char *src, size_t srclen, char *dst, size_t dstlen)
	Get the caseup gb18030 code of a given gb18030 code. It's only for UCA, because the caseinfo for UCA is different. More...

static unsigned	get_weight_if_chinese_character (unsigned code)
	Get the weight for a multi-byte gb18030 code if the code point represents a Chinese character defined in collation PINYIN in CLDR24. The result will be PINYIN_WEIGHT_BASE + a non-zero seq NO. More...

static unsigned	get_weight_for_mbchar (const CHARSET_INFO *cs, const uint8_t *src, size_t mblen)
	Get the weight for a multi-byte gb18030 code, we get the weight by the case up form of gb18030 code if exists. More...

static unsigned	get_weight_for_gb18030_chs (const CHARSET_INFO *cs, const char *s, size_t s_len)
	Get the weight of a given gb18030 code. We can assert the code must be a valid gb18030 code. More...

static size_t	get_code_and_length (const CHARSET_INFO *cs, const char *s, const char *e, size_t *code)
	Get the code value and length of next code in given gb18030 string. More...

static int	my_strnncoll_gb18030_internal (const CHARSET_INFO *cs, const uint8_t **s_res, size_t s_length, const uint8_t **t_res, size_t t_length)
	Internal func to compare two strings according to gb18030; every gb18030 code should be compared by its upper-case form. More...

static int	my_strnncoll_gb18030 (const CHARSET_INFO *cs, const uint8_t *s, size_t s_length, const uint8_t *t, size_t t_length, bool t_is_prefix)
	Compare two strings according to gb18030; every gb18030 code should be compared by its caseup form. More...

static int	my_strnncollsp_gb18030 (const CHARSET_INFO *cs, const uint8_t *s, size_t s_length, const uint8_t *t, size_t t_length)
	Compare two strings according to gb18030, but ignore trailing spaces; every gb18030 code should be compared by its caseup form. More...

static size_t	my_strnxfrm_gb18030 (const CHARSET_INFO *cs, uint8_t *dst, size_t dstlen, unsigned nweights, const uint8_t *src, size_t srclen, unsigned flags)
	Make a sort key suitable for memcmp() corresponding to gb18030. Sort according to UPPER() for non-Chinese chars, and PINYIN for Chinese chars. More...

static int	my_strcasecmp_gb18030 (const CHARSET_INFO *cs, const char *s, const char *t)
	Compare 0-terminated gb18030 strings. More...

static unsigned	unicode_to_gb18030_code (const CHARSET_INFO *cs, int unicode)
	Convert a Unicode code to gb18030 code. More...

static int	my_wildcmp_gb18030_impl (const CHARSET_INFO *cs, const char *str, const char *str_end, const char *wildstr, const char *wildend, unsigned escape, unsigned w_one, unsigned w_many, int recurse_level)
	Compare string against string with wildcard. More...

static int	my_wildcmp_gb18030 (const CHARSET_INFO *cs, const char *str, const char *str_end, const char *wildstr, const char *wildend, int escape, int w_one, int w_many)
	Compare string against string with wildcard. More...

static void	my_hash_sort_gb18030 (const CHARSET_INFO *cs, const uint8_t *s, size_t slen, uint64_t *n1, uint64_t *n2)
	Calculate hash value for given gb18030 string. More...

Variables
static const unsigned	MIN_MB_ODD_BYTE = 0x81

static const unsigned	MIN_MB_EVEN_BYTE_2 = 0x40

static const unsigned	MIN_MB_EVEN_BYTE_4 = 0x30

static const unsigned	MAX_GB18030_DIFF = 0x18398F

static const unsigned	UNI2_TO_GB4_DIFF = 7456

static const unsigned	UNICASE_4_BYTE_OFFSET = 0x80

static const unsigned	MIN_2_BYTE_UNICASE = 0xA000

static const unsigned	MAX_2_BYTE_UNICASE = 0xDFFF

static const unsigned	MIN_3_BYTE_FROM_UNI = 0x2E600

static const unsigned	MAX_3_BYTE_FROM_UNI = 0x2E6FF

static const unsigned	PINYIN_2_BYTE_START = 0x8140

static const unsigned	PINYIN_2_BYTE_END = 0xFE9F

static const unsigned	PINYIN_4_BYTE_1_START = 0x8138FD38

static const unsigned	PINYIN_4_BYTE_1_END = 0x82359232

static const unsigned	PINYIN_4_1_DIFF = 11328

static const unsigned	PINYIN_4_BYTE_2_START = 0x95328236

static const unsigned	PINYIN_4_BYTE_2_END = 0x98399836

static const unsigned	PINYIN_4_2_DIFF = 254536

static const unsigned	PINYIN_WEIGHT_BASE = 0xFFA00000

static const unsigned	COMMON_WEIGHT_BASE = 0xFF000000

static const uint8_t	ctype_gb18030 [257]
	The array used for "type of characters" bit mask for each character. More...

static const uint8_t	to_lower_gb18030 []
	The array[256] used in casedn. More...

static const uint8_t	to_upper_gb18030 []
	The array[256] used in caseup. More...

static const uint8_t	sort_order_gb18030 []
	The array[256] used for strings comparison. More...

static const MY_UNICASE_CHARACTER	plane00 []
	Unicase array for 0x0000-0x00FF. More...

static const MY_UNICASE_CHARACTER	plane01 []
	Unicase array for 0x0100-0x01FF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points. More...

static const MY_UNICASE_CHARACTER	plane02 []
	Unicase array for 0x0200-0x02FF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points. More...

static const MY_UNICASE_CHARACTER	plane03 []
	Unicase array for 0x0300-0x03FF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points. More...

static const MY_UNICASE_CHARACTER	plane04 []
	Unicase array for 0x0400-0x04FF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points. More...

static const MY_UNICASE_CHARACTER	plane10 []
	Unicase array for 0x1000-0x10FF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points. More...

static const MY_UNICASE_CHARACTER	plane1D []
	Unicase array for 0x1D00-0x1DFF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points. More...

static const MY_UNICASE_CHARACTER	plane1E []
	Unicase array for 0x1E00-0x1EFF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points. More...

static const MY_UNICASE_CHARACTER	plane1F []
	Unicase array for 0x1F00-0x1FFF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points. More...

static const MY_UNICASE_CHARACTER	plane20 []
	Unicase array for 0x2000-0x20FF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points. More...

static const MY_UNICASE_CHARACTER	plane23 []
	Unicase array for 0x2300-0x23FF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points. More...

static const MY_UNICASE_CHARACTER	plane2A []
	Unicase array for 0x2A00-0x2AFF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points. More...

static const MY_UNICASE_CHARACTER	plane2B []
	Unicase array for 0x2B00-0x2BFF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points. More...

static const MY_UNICASE_CHARACTER	plane51 []
	Unicase array for 0x5100-0x51FF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points. More...

static const MY_UNICASE_CHARACTER	plane52 []
	Unicase array for 0x5200-0x52FF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points. More...

static const MY_UNICASE_CHARACTER	planeA2 []
	Unicase array for 0xA200-0xA2FF All are for 2-byte code points directly. More...

static const MY_UNICASE_CHARACTER	planeA3 []
	Unicase array for 0xA300-0xA3FF All are for 2-byte code points directly. More...

static const MY_UNICASE_CHARACTER	planeA6 []
	Unicase array for 0xA600-0xA6FF All are for 2-byte code points directly. More...

static const MY_UNICASE_CHARACTER	planeA7 []
	Unicase array for 0xA700-0xA7FF All are for 2-byte code points directly. More...

static const MY_UNICASE_CHARACTER	planeA8 []
	Unicase array for 0xA800-0xA8FF All are for 2-byte code points directly. More...

static const MY_UNICASE_CHARACTER	planeE6 []
	Unicase array for 0xE600-0xE6FF Plus 0x20000, they're the diff(diff between code and 81308130) for 4-byte code points. More...

static const MY_UNICASE_CHARACTER *	my_caseinfo_pages_gb18030 [256]
	The UNICASE array. More...

static MY_UNICASE_INFO	my_caseinfo_gb18030
	UNICASE INFO. More...

static const uint16_t	tab_gb18030_2_uni []
	Mapping table from 2-byte gb18030 to unicode including all 2-byte code points in [GB+8140, GB+FEFE], with 0 for those invalid code points. More...

static const uint16_t	tab_gb18030_4_uni []
	Mapping table from 4-byte gb18030 to Unicode. The values here are the diffs for 4-byte gb18030 code points including following ranges: [GB+81308130, GB+8130D330) (GB+8135F436, GB+8137A839) (GB+8138FD38, GB+82358F33) (GB+8336C738, GB+8336D030) (GB+84308534, GB+84309C38) (GB+84318537, GB+8431A439]. Others can be calculated algorithmically. More...

static const uint16_t	tab_uni_gb18030_p1 []
	Mapping table from Unicode to gb18030, part one. For Unicode in [0x80, 0x9FA6), if the leading byte is less than 0x81, the corresponding value represents the diff for 4-byte gb18030 code, otherwise, it's the corresponding 2-byte gb18030 code. More...

static const uint16_t	tab_uni_gb18030_p2 []
	Mapping table from Unicode to gb18030, part two. For Unicode in [0xE000, 0xE865) and (0xF92B, 0xFFFF]. The values here have the same meaning as those in tab_uni_gb18030_p1. More...

static const uint16_t	gb18030_2_weight_py []
	The following 3 Chinese character weight arrays are based upon the PINYIN collation in zh.xml file of CLDR24(http://cldr.unicode.org/) (http://unicode.org/Public/cldr/24/core.zip) More...

static const uint16_t	gb18030_4_weight_py_p1 []
	Weight array for one range of 4-byte gb18030 code points, which is [GB+8138FD38, GB+82359232]. More...

static const uint16_t	gb18030_4_weight_py_p2 []
	Weight array for the other range of 4-byte gb18030 code points, which is [GB+95328236, GB+98399836]. More...

static MY_COLLATION_HANDLER	my_collation_ci_handler

static MY_CHARSET_HANDLER	my_charset_gb18030_handler

MY_CHARSET_HANDLER	my_charset_gb18030_uca_handler

CHARSET_INFO	my_charset_gb18030_chinese_ci

CHARSET_INFO	my_charset_gb18030_bin

Macro Definition Documentation

◆ is_mb_1

#define is_mb_1 ( c ) ((uint8_t)(c) <= 0x7F)

◆ is_mb_even_2

#define is_mb_even_2 ( c )

Value:

((0x40 <= (uint8_t)(c) && (uint8_t)(c) <= 0x7E) || \

(0x80 <= (uint8_t)(c) && (uint8_t)(c) <= 0xFE))

◆ is_mb_even_4

#define is_mb_even_4 ( c ) (0x30 <= (uint8_t)(c) && (uint8_t)(c) <= 0x39)

◆ is_mb_odd

#define is_mb_odd ( c ) (0x81 <= (uint8_t)(c) && (uint8_t)(c) <= 0xFE)

Function Documentation

◆ case_info_code_to_gb18030()

static unsigned case_info_code_to_gb18030 ( unsigned code )

static

Convert the code in one MY_UNICASE_CHARACTER to real gb18030 code.

Parameters

[in] code code in one MY_UNICASE_CHARACTER

Returns: gb18030 code

◆ code_to_gb18030_chs()

static size_t code_to_gb18030_chs	(	uint8_t *	dst,
		size_t	dstlen,
		unsigned	code
	)

static

Convert a gb18030 code to a sequence of chars.

If the code number is too large to store, the trailing bytes will be cut off.

Parameters

[out]	dst	dest to store the result
[in]	dstlen	valid length of dest
[in]	code	gb18030 code

Returns: the length of dest used to store the gb18030 chars

◆ diff_to_gb18030_4()

static unsigned diff_to_gb18030_4	(	uint8_t *	dst,
		unsigned	dstlen,
		unsigned	diff
	)

static

Calculate the 4-byte GB18030 code from a diff value.

Parameters

[out]	dst	dest to store the gb18030 code in bytes
[in]	dstlen	valid length of dest
[in]	diff	the diff between gb18030 code and GB+81308130

Return values

4	if the diff is a valid value and there is enough space in dst; 0 otherwise

◆ gb18030_4_chs_to_diff()

static unsigned gb18030_4_chs_to_diff ( const uint8_t * src )

inline static

Calculate the diff between the 4-byte gb18030 code in bytes and GB+81308130.

Parameters

[in] src 4-byte gb18030 code in bytes

Returns: the diff

◆ gb18030_4_code_to_diff()

static unsigned gb18030_4_code_to_diff ( unsigned code )

static

Calculate the diff between the 4-byte gb18030 code and GB+81308130.

Parameters

[in] code 4-byte gb18030 code

Returns: the diff

◆ gb18030_chs_to_code()

static unsigned gb18030_chs_to_code	(	const uint8_t *	src,
		size_t	srclen
	)

inline static

Convert a gb18030 code in uchars to the code. The byte sequence in src must be a gb18030 sequence.

Parameters

[in]	src	gb18030 code in uchars
[in]	srclen	length of valid chars, should be 1, 2 or 4 only

Returns: the gb18030 code

◆ get_case_info()

static const MY_UNICASE_CHARACTER * get_case_info	(	const CHARSET_INFO *	cs,
		const uint8_t *	src,
		size_t	srclen
	)

static

Get the case info of one gb18030 code in bytes.

Parameters

[in]	cs	charset
[in]	src	start byte of gb18030 code
[in]	srclen	length in bytes of gb18030 code

Returns: the case info(MY_UNICASE_CHARACTER) of given gb18030 code

◆ get_casefolded_code()

static unsigned get_casefolded_code	(	const CHARSET_INFO *	cs,
		const uint8_t *	src,
		size_t	srclen,
		size_t	is_upper
	)

static

Get the casefolded code of a given gb18030 code.

Parameters

[in]	cs	charset
[in]	src	start byte of gb18030 code
[in]	srclen	length of gb18030 code in bytes
[in]	is_upper	true if a capital letter is wanted, false if a small letter is required

Return values

the	gb18030 code according to is_upper; 0 if no upper-case or lower-case exists

◆ get_code_and_length()

static size_t get_code_and_length	(	const CHARSET_INFO *	cs,
		const char *	s,
		const char *	e,
		size_t *	code
	)

static

Get the code value and length of next code in given gb18030 string.

Parameters

[in]	cs	charset
[in]	s	string
[in]	e	end of string
[out]	code	the code value of the next code, be valid when retval>0

Return values

the	length of the next code, if the code is valid; 0 if the given string is empty or the code is invalid

◆ get_weight_for_gb18030_chs()

static unsigned get_weight_for_gb18030_chs	(	const CHARSET_INFO *	cs,
		const char *	s,
		size_t	s_len
	)

static

Get the weight of a given gb18030 code. We can assert the code must be a valid gb18030 code.

Parameters

[in]	cs	charset
[in]	s	code chars
[in]	s_len	length of the code

Returns: weight the weight of the code

◆ get_weight_for_mbchar()

static unsigned get_weight_for_mbchar	(	const CHARSET_INFO *	cs,
		const uint8_t *	src,
		size_t	mblen
	)

static

Get the weight for a multi-byte gb18030 code, we get the weight by the case up form of gb18030 code if exists.

For any Chinese character, which has non-zero seq NO. defined in gb18030_2_weight_py/gb18030_4_weight_py_p1/gb18030_4_weight_py_p2 according to its gb18030 code, the final weight shall be 0xFFA00000+(seq No.)

For any non-Chinese gb18030 character C, let C'=C or UPPER(C) if exists. So C' would be 2 bytes or 4 bytes.

For any C' represented in 2 bytes, the final weight shall be C' itself. For example: GB+A2F1=UPPER(GB+A2A1), which is a 2-byte non-Chinese character. So C' would be GB+A2F1 for both GB+A2A1 and GB+A2F1, and both final weights of GB+A2A1 and GB+A2F1 are 0xA2F1
For any C' represented in 4 bytes, the final weight shall be COMMON_WEIGHT_BASE+diff(C'). We get the diff by gb18030_4_code_to_diff For example: The character GB+9030E833=UPPER(GB+9030EC33), which is a 4-byte non-Chinese character. So the C' would be GB+9030E833 for both GB+9030E833 and GB+9030EC33, and diff(C') would be 0x2E651 in this case. Both final weights of GB+9030E833 and GB+9030EC33 are 0xFF000000+0x2E651=0xFF02E651

The weight of the maximum code point GB+FE39FE39 is defined as 0xFFFFFFFF

Parameters

[in]	cs	charset
[in]	src	the first byte of multi-byte gb18030 code
[in]	mblen	the length of multi-bytes gb18030 code

Returns: the weight of the given gb18030 code point

◆ get_weight_if_chinese_character()

static unsigned get_weight_if_chinese_character ( unsigned code )

static

Get the weight for a multi-byte gb18030 code if the code point represents a Chinese character defined in collation PINYIN in CLDR24. The result will be PINYIN_WEIGHT_BASE + a non-zero seq NO. for the code; if not, the result will be PINYIN_WEIGHT_BASE.

Parameters

[in] code the multi-byte gb18030 code

Return values

a	non-zero weight if it's a Chinese character with PINYIN, which shall be PINYIN_WEIGHT_BASE + non-zero seq NO.; otherwise, PINYIN_WEIGHT_BASE

◆ my_casedn_gb18030()

static size_t my_casedn_gb18030	(	const CHARSET_INFO *	cs,
		char *	src,
		size_t	srclen,
		char *	dst,
		size_t	dstlen
	)

static

Convert a gb18030 string to a corresponding lower-case gb18030 string.

Parameters

[in]	cs	charset
[in]	src	start byte of given gb18030 code
[in]	srclen	length of given gb18030 code
[out]	dst	start byte of casedown gb18030 code
[in]	dstlen	length of the result array

Returns: length of the casedown gb18030 code

◆ my_casedn_gb18030_uca()

static size_t my_casedn_gb18030_uca	(	const CHARSET_INFO *	cs,
		char *	src,
		size_t	srclen,
		char *	dst,
		size_t	dstlen
	)

static

Get the casedown gb18030 code of a given gb18030 code. It's only for UCA, because the caseinfo for UCA is different.

Parameters

[in]	cs	charset
[in]	src	start byte of given gb18030 code
[in]	srclen	length of given gb18030 code
[out]	dst	start byte of casedown gb18030 code
[in]	dstlen	length of the result array

Returns: length of the casedown gb18030 code

◆ my_casefold_gb18030()

static size_t my_casefold_gb18030	(	const CHARSET_INFO *	cs,
		char *	src,
		size_t	srclen,
		char *	dst,
		size_t	dstlen,
		const uint8_t *	map,
		bool	is_upper
	)

static

Get the casefolded gb18030 codes of a given sequence of gb18030 codes. Store the casefolded result to a specified dest.

Parameters

[in]	cs	charset
[in]	src	start byte of given sequence of gb18030 codes
[in]	srclen	length of the sequence
[out]	dst	start byte of the output of casefolded codes
[in]	dstlen	length of the result array
[in]	map	the LOWER map or the UPPER map of gb18030
[in]	is_upper	true if a capital letter is wanted, false if a small letter is required

Returns: the length of result

◆ my_caseup_gb18030()

static size_t my_caseup_gb18030	(	const CHARSET_INFO *	cs,
		char *	src,
		size_t	srclen,
		char *	dst,
		size_t	dstlen
	)

static

Convert a gb18030 string to a corresponding up-case gb18030 string.

Parameters

[in]	cs	charset
[in]	src	start byte of given gb18030 code
[in]	srclen	length of given gb18030 code
[out]	dst	start byte of caseup gb18030 code
[in]	dstlen	length of the result array

Returns: length of the caseup gb18030 code

◆ my_caseup_gb18030_uca()

static size_t my_caseup_gb18030_uca	(	const CHARSET_INFO *	cs,
		char *	src,
		size_t	srclen,
		char *	dst,
		size_t	dstlen
	)

static

Get the caseup gb18030 code of a given gb18030 code. It's only for UCA, because the caseinfo for UCA is different.

Parameters

[in]	cs	charset
[in]	src	start byte of given gb18030 code
[in]	srclen	length of given gb18030 code
[out]	dst	start byte of caseup gb18030 code
[in]	dstlen	length of the result array

Returns: length of the caseup gb18030 code

◆ my_hash_sort_gb18030()

static void my_hash_sort_gb18030	(	const CHARSET_INFO *	cs,
		const uint8_t *	s,
		size_t	slen,
		uint64_t *	n1,
		uint64_t *	n2
	)

static

Calculate hash value for given gb18030 string.

Parameters

[in]	cs	charset
[in]	s	string
[in]	slen	the length of string
[in,out]	n1	n1
[in,out]	n2	n2

◆ my_ismbchar_gb18030()

static unsigned my_ismbchar_gb18030	(	const CHARSET_INFO *	cs,
		const char *	p,
		const char *	e
	)

static

Judge if a sequence of chars is in gb18030 multi-bytes code.

Parameters

[in]	cs	charset
[in]	p	start of gb18030 code
[in]	e	end of gb18030 code

Return values

2	or 4 if it is a gb18030 multi-byte code, 0 if not

◆ my_mb_wc_gb18030()

static int my_mb_wc_gb18030	(	const CHARSET_INFO *	cs,
		my_wc_t *	pwc,
		const uint8_t *	s,
		const uint8_t *	e
	)

static

Convert a gb18030 code in bytes to unicode code.

Parameters

[in]	cs	charset
[out]	pwc	unicode code
[in]	s	start of gb18030 code
[in]	e	end of gb18030 code

Return values

1)	the length of the converted gb18030 code if convertible; 2) MY_CS_TOOSMALL..MY_CS_TOOSMALL4 if the gb18030 code is too short; 3) MY_CS_ILSEQ if the gb18030 code is an illegal byte sequence

◆ my_mbcharlen_gb18030()

static unsigned my_mbcharlen_gb18030	(	const CHARSET_INFO *	cs,
		unsigned	c
	)

static

Get the length of a possible gb18030 code according to its first byte or first two bytes.

Parameters

[in]	cs	charset
[in]	c	first byte or first two bytes of the code

Return values

1/2/4 accordingly if the leading byte(s) indicate the code would be gb18030, otherwise 0

◆ my_strcasecmp_gb18030()

static int my_strcasecmp_gb18030	(	const CHARSET_INFO *	cs,
		const char *	s,
		const char *	t
	)

static

Compare 0-terminated gb18030 strings.

Parameters

[in]	cs	charset
[in]	s	first 0-terminated string to compare
[in]	t	second 0-terminated string to compare

Return values

negative number if s < t; positive number if s > t; 0 if the strings are equal

◆ my_strnncoll_gb18030()

static int my_strnncoll_gb18030	(	const CHARSET_INFO *	cs,
		const uint8_t *	s,
		size_t	s_length,
		const uint8_t *	t,
		size_t	t_length,
		bool	t_is_prefix
	)

static

Compare two strings according to gb18030; every gb18030 code should be compared by its caseup form.

Parameters

[in]	cs	charset
[in]	s	start of the first string
[in]	s_length	length of the first string
[in]	t	start of the second string
[in]	t_length	length of the second string
[in]	t_is_prefix	true if t is prefix, otherwise false

Return values

0	if two strings are equal; 1 if the first string is bigger; -1 if the second string is bigger

◆ my_strnncoll_gb18030_internal()

static int my_strnncoll_gb18030_internal	(	const CHARSET_INFO *	cs,
		const uint8_t **	s_res,
		size_t	s_length,
		const uint8_t **	t_res,
		size_t	t_length
	)

static

Internal func to compare two strings according to gb18030; every gb18030 code should be compared by its upper-case form.

Parameters

[in]	cs	charset
[in,out]	s_res	pointer to the start byte of first gb18030 code string, return the first byte unchecked
[in]	s_length	length of the first string
[in,out]	t_res	pointer to the start byte of second gb18030 code string, return the first byte unchecked
[in]	t_length	length of the second string

Return values

0	if the strings are equal; >0 if the first string is bigger; <0 if the second string is bigger

◆ my_strnncollsp_gb18030()

static int my_strnncollsp_gb18030	(	const CHARSET_INFO *	cs,
		const uint8_t *	s,
		size_t	s_length,
		const uint8_t *	t,
		size_t	t_length
	)

static

Compare two strings according to gb18030, but ignore trailing spaces; every gb18030 code should be compared by its caseup form.

Parameters

[in]	cs	charset
[in]	s	start of the first string
[in]	s_length	length of the first string
[in]	t	start of the second string
[in]	t_length	length of the second string

Return values

0	if two strings are equal; 1 if the first string is bigger; -1 if the second string is bigger

◆ my_strnxfrm_gb18030()

static size_t my_strnxfrm_gb18030	(	const CHARSET_INFO *	cs,
		uint8_t *	dst,
		size_t	dstlen,
		unsigned	nweights,
		const uint8_t *	src,
		size_t	srclen,
		unsigned	flags
	)

static

Make a sort key suitable for memcmp() corresponding to gb18030. Sort according to UPPER() for non-Chinese chars, and PINYIN for Chinese chars.

Parameters

[in]	cs	charset
[out]	dst	first byte of the result array
[in]	dstlen	the length of the result array
[in]	nweights	how many gb18030 codes we care about
[in]	src	first byte of the given gb18030 code
[in]	srclen	length of the given gb18030 code
[in]	flags	flags for strxfrm

Returns: the length of the sort key

◆ my_wc_mb_gb18030_chs()

static int my_wc_mb_gb18030_chs	(	const CHARSET_INFO *	cs,
		my_wc_t	wc,
		uint8_t *	s,
		uint8_t *	e
	)

static

Convert the Unicode code to its gb18030 code in bytes.

Parameters

[in]	cs	charset
[in]	wc	Unicode code
[out]	s	start of gb18030 code output
[out]	e	end of gb18030 code output

Return values

1)	the length of gb18030 code(1/2/4) if convertible; 2) MY_CS_TOOSMALL..MY_CS_TOOSMALL4 if the output space is too small; 3) MY_CS_ILUNI if we can't encode unicode to gb18030

◆ my_well_formed_len_gb18030()

static size_t my_well_formed_len_gb18030	(	const CHARSET_INFO *	cs,
		const char *	b,
		const char *	e,
		size_t	pos,
		int *	error
	)

static

Get the well formed length of a GB18030 string.

Parameters

[in]	cs	charset
[in]	b	start of gb18030 code
[in]	e	end of gb18030 code
[in]	pos	max chars we care about
[out]	error	0 if every gb18030 code we get is correct, otherwise 1

Returns: the length of all well formed bytes

◆ my_wildcmp_gb18030()

static int my_wildcmp_gb18030	(	const CHARSET_INFO *	cs,
		const char *	str,
		const char *	str_end,
		const char *	wildstr,
		const char *	wildend,
		int	escape,
		int	w_one,
		int	w_many
	)

static

Compare string against string with wildcard.

Parameters

[in]	cs	charset
[in]	str	string
[in]	str_end	end of the string
[in]	wildstr	string with wildcard
[in]	wildend	end of the string with wildcard
[in]	escape	escape char, which is a Unicode code
[in]	w_one	wild one char, which is a Unicode code
[in]	w_many	wild many char, which is a Unicode code

Returns: 0 if matched; -1 if not matched with wildcard; 1 if matched with wildcard

◆ my_wildcmp_gb18030_impl()

static int my_wildcmp_gb18030_impl	(	const CHARSET_INFO *	cs,
		const char *	str,
		const char *	str_end,
		const char *	wildstr,
		const char *	wildend,
		unsigned	escape,
		unsigned	w_one,
		unsigned	w_many,
		int	recurse_level
	)

static

Compare string against string with wildcard.

Parameters

[in]	cs	charset
[in]	str	string
[in]	str_end	end of the string
[in]	wildstr	string with wildcard
[in]	wildend	end of the string with wildcard
[in]	escape	escape char, which is a gb18030 code
[in]	w_one	wild one char, which is a gb18030 code
[in]	w_many	wild many char, which is a gb18030 code
[in]	recurse_level	current recurse level to do wild card

Returns: 0 if matched; -1 if not matched with wildcard; 1 if matched with wildcard

◆ unicode_to_gb18030_code()

static unsigned unicode_to_gb18030_code	(	const CHARSET_INFO *	cs,
		int	unicode
	)

static

Convert a Unicode code to gb18030 code.

Parameters

[in]	cs	charset
[in]	unicode	unicode code

Returns: gb18030 code

Variable Documentation

◆ COMMON_WEIGHT_BASE

const unsigned COMMON_WEIGHT_BASE = 0xFF000000

static

◆ ctype_gb18030

const uint8_t ctype_gb18030[257]

static

Initial value:

= {
    0, 
    32,  32,  32,  32,  32,  32,  32,  32,  32,  40,  40, 40, 40, 40, 32, 32,
    32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32, 32, 32, 32, 32, 32,
    72,  16,  16,  16,  16,  16,  16,  16,  16,  16,  16, 16, 16, 16, 16, 16,
    132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 16, 16, 16, 16, 16, 16,
    16,  129, 129, 129, 129, 129, 129, 1,   1,   1,   1,  1,  1,  1,  1,  1,
    1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,  16, 16, 16, 16, 16,
    16,  130, 130, 130, 130, 130, 130, 2,   2,   2,   2,  2,  2,  2,  2,  2,
    2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,  16, 16, 16, 16, 32,
    3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,  3,  3,  3,  3,  3,
    3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,  3,  3,  3,  3,  3,
    3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,  3,  3,  3,  3,  3,
    3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,  3,  3,  3,  3,  3,
    3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,  3,  3,  3,  3,  3,
    3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,  3,  3,  3,  3,  3,
    3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,  3,  3,  3,  3,  3,
    3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,  3,  3,  3,  3,  0}

The array used for "type of characters" bit mask for each character.

The ctype[0] is reserved for EOF(-1), so we use ctype[(char)+1]. Also refer to strings/CHARSET_INFO.txt

◆ gb18030_2_weight_py

const uint16_t gb18030_2_weight_py[]

static

The following 3 Chinese character weight arrays are based upon the PINYIN collation in zh.xml file of CLDR24(http://cldr.unicode.org/) (http://unicode.org/Public/cldr/24/core.zip)

Please note that we currently support only those Chinese characters with PINYIN in zh.xml; we support neither the CJK characters whose Unicode category is Symbol but which have the same shape as Chinese characters, nor the PINYIN characters.

The 3 arrays include the sort order of the code points accordingly. The weight here just means the order of the corresponding gb18030 code point. For example: currently U+963F(GB+B0A2) is the first character in the PINYIN collation, while U+5475(GB+BAC7) is the second, and so on. So the weight of U+963F is 1 and U+5475 is 2, and GB+8140 is the 15308th character, which has the value of 15308 in the first array.

The weights range from 1 to 41309 and there are no duplicate weights in the collation, so a simplified Chinese character is not equal to the corresponding traditional Chinese character.

Totally, there are 41309 Chinese characters being taken into account in these arrays

The weight of each code point shall be the corresponding weight in these arrays plus PINYIN_WEIGHT_BASE. Weight array for those 2-byte gb18030 code points in the range [GB+8140, GB+FE9F]. If it's not a Chinese char, the weight is 0.

◆ gb18030_4_weight_py_p1

const uint16_t gb18030_4_weight_py_p1[]

static

Weight array for one range of 4-byte gb18030 code points, which is [GB+8138FD38, GB+82359232].

The sequence NO. of this array is the diff between the code point and GB+8138FD38. The weights would be 0 for non-Chinese chars.

◆ gb18030_4_weight_py_p2

const uint16_t gb18030_4_weight_py_p2[]

static

Weight array for the other range of 4-byte gb18030 code points, which is [GB+95328236, GB+98399836].

The sequence NO. of this array is the diff between the code point and GB+95328236. The weights would be 0 for non-Chinese chars. This range maps from Unicode [U+20000, U+2FFFF], which is the CJK extension.

◆ MAX_2_BYTE_UNICASE

const unsigned MAX_2_BYTE_UNICASE = 0xDFFF

static

◆ MAX_3_BYTE_FROM_UNI

const unsigned MAX_3_BYTE_FROM_UNI = 0x2E6FF

static

◆ MAX_GB18030_DIFF

const unsigned MAX_GB18030_DIFF = 0x18398F

static

◆ MIN_2_BYTE_UNICASE

const unsigned MIN_2_BYTE_UNICASE = 0xA000

static

◆ MIN_3_BYTE_FROM_UNI

const unsigned MIN_3_BYTE_FROM_UNI = 0x2E600

static

◆ MIN_MB_EVEN_BYTE_2

const unsigned MIN_MB_EVEN_BYTE_2 = 0x40

static

◆ MIN_MB_EVEN_BYTE_4

const unsigned MIN_MB_EVEN_BYTE_4 = 0x30

static

◆ MIN_MB_ODD_BYTE

const unsigned MIN_MB_ODD_BYTE = 0x81

static

◆ my_caseinfo_gb18030

MY_UNICASE_INFO my_caseinfo_gb18030

static

Initial value:

= {0xFFFF,

my_caseinfo_pages_gb18030}

my_caseinfo_pages_gb18030

static const MY_UNICASE_CHARACTER * my_caseinfo_pages_gb18030[256]

The UNICASE array.

Definition: ctype-gb18030.cc:3033

UNICASE INFO.

◆ my_caseinfo_pages_gb18030

const MY_UNICASE_CHARACTER* my_caseinfo_pages_gb18030[256]

static

The UNICASE array.

◆ my_charset_gb18030_bin

CHARSET_INFO my_charset_gb18030_bin

◆ my_charset_gb18030_chinese_ci

CHARSET_INFO my_charset_gb18030_chinese_ci

◆ my_charset_gb18030_handler

MY_CHARSET_HANDLER my_charset_gb18030_handler

static

Initial value:

= {
    nullptr,
    my_ismbchar_gb18030,
    my_mbcharlen_gb18030,
    my_numchars_mb,
    my_charpos_mb3,
    my_well_formed_len_gb18030,
    my_lengthsp_8bit,
    my_numcells_mb,
    my_mb_wc_gb18030,
    my_wc_mb_gb18030_chs,
    my_mb_ctype_mb,
    my_caseup_str_mb,
    my_casedn_str_mb,
    my_caseup_gb18030,
    my_casedn_gb18030,
    my_snprintf_8bit,
    my_long10_to_str_8bit,
    my_longlong10_to_str_8bit,
    my_fill_8bit,
    my_strntol_8bit,
    my_strntoul_8bit,
    my_strntoll_8bit,
    my_strntoull_8bit,
    my_strntod_8bit,
    my_strtoll10_8bit,
    my_strntoull10rnd_8bit,
    my_scan_8bit}

◆ my_charset_gb18030_uca_handler

MY_CHARSET_HANDLER my_charset_gb18030_uca_handler

Initial value:

= {nullptr,
                                                     my_ismbchar_gb18030,
                                                     my_mbcharlen_gb18030,
                                                     my_numchars_mb,
                                                     my_charpos_mb3,
                                                     my_well_formed_len_gb18030,
                                                     my_lengthsp_8bit,
                                                     my_numcells_mb,
                                                     my_mb_wc_gb18030,
                                                     my_wc_mb_gb18030_chs,
                                                     my_mb_ctype_mb,
                                                     my_caseup_str_mb,
                                                     my_casedn_str_mb,
                                                     my_caseup_gb18030_uca,
                                                     my_casedn_gb18030_uca,
                                                     my_snprintf_8bit,
                                                     my_long10_to_str_8bit,
                                                     my_longlong10_to_str_8bit,
                                                     my_fill_8bit,
                                                     my_strntol_8bit,
                                                     my_strntoul_8bit,
                                                     my_strntoll_8bit,
                                                     my_strntoull_8bit,
                                                     my_strntod_8bit,
                                                     my_strtoll10_8bit,
                                                     my_strntoull10rnd_8bit,
                                                     my_scan_8bit}

◆ my_collation_ci_handler

MY_COLLATION_HANDLER my_collation_ci_handler

static

Initial value:

= {nullptr,
                                                       nullptr,
                                                       my_strnncoll_gb18030,
                                                       my_strnncollsp_gb18030,
                                                       my_strnxfrm_gb18030,
                                                       my_strnxfrmlen_simple,
                                                       my_like_range_mb,
                                                       my_wildcmp_gb18030,
                                                       my_strcasecmp_gb18030,
                                                       my_instr_mb,
                                                       my_hash_sort_gb18030,
                                                       my_propagate_simple}

◆ PINYIN_2_BYTE_END

const unsigned PINYIN_2_BYTE_END = 0xFE9F

static

◆ PINYIN_2_BYTE_START

const unsigned PINYIN_2_BYTE_START = 0x8140

static

◆ PINYIN_4_1_DIFF

const unsigned PINYIN_4_1_DIFF = 11328

static

◆ PINYIN_4_2_DIFF

const unsigned PINYIN_4_2_DIFF = 254536

static

◆ PINYIN_4_BYTE_1_END

const unsigned PINYIN_4_BYTE_1_END = 0x82359232

static

◆ PINYIN_4_BYTE_1_START

const unsigned PINYIN_4_BYTE_1_START = 0x8138FD38

static

◆ PINYIN_4_BYTE_2_END

const unsigned PINYIN_4_BYTE_2_END = 0x98399836

static

◆ PINYIN_4_BYTE_2_START

const unsigned PINYIN_4_BYTE_2_START = 0x95328236

static

◆ PINYIN_WEIGHT_BASE

const unsigned PINYIN_WEIGHT_BASE = 0xFFA00000

static

◆ plane00

const MY_UNICASE_CHARACTER plane00[]

static

Unicase array for 0x0000-0x00FF.

0x0000-0x007F is for 1-byte code points; the others, which represent the diff (diff between code and 81308130, plus 0x80), are for 4-byte code points.

◆ plane01

const MY_UNICASE_CHARACTER plane01[]

static

Unicase array for 0x0100-0x01FF. All are the diff (diff between code and 81308130, plus 0x80) for 4-byte code points.

◆ plane02

const MY_UNICASE_CHARACTER plane02[]

static

Unicase array for 0x0200-0x02FF. All are the diff (diff between code and 81308130, plus 0x80) for 4-byte code points.

◆ plane03

const MY_UNICASE_CHARACTER plane03[]

static

Unicase array for 0x0300-0x03FF. All are the diff (diff between code and 81308130, plus 0x80) for 4-byte code points.

◆ plane04

const MY_UNICASE_CHARACTER plane04[]

static

Unicase array for 0x0400-0x04FF. All are the diff (diff between code and 81308130, plus 0x80) for 4-byte code points.

◆ plane10

const MY_UNICASE_CHARACTER plane10[]

static

Unicase array for 0x1000-0x10FF. All are the diff (diff between code and 81308130, plus 0x80) for 4-byte code points.

◆ plane1D

const MY_UNICASE_CHARACTER plane1D[]

static

Unicase array for 0x1D00-0x1DFF. All are the diff (diff between code and 81308130, plus 0x80) for 4-byte code points.

◆ plane1E

const MY_UNICASE_CHARACTER plane1E[]

static

Unicase array for 0x1E00-0x1EFF. All are the diff (diff between code and 81308130, plus 0x80) for 4-byte code points.

◆ plane1F

const MY_UNICASE_CHARACTER plane1F[]

static

Unicase array for 0x1F00-0x1FFF. All are the diff (diff between code and 81308130, plus 0x80) for 4-byte code points.

◆ plane20

const MY_UNICASE_CHARACTER plane20[]

static

Unicase array for 0x2000-0x20FF. All are the diff (diff between code and 81308130, plus 0x80) for 4-byte code points.

◆ plane23

const MY_UNICASE_CHARACTER plane23[]

static

Unicase array for 0x2300-0x23FF. All are the diff (diff between code and 81308130, plus 0x80) for 4-byte code points.

◆ plane2A

const MY_UNICASE_CHARACTER plane2A[]

static

Unicase array for 0x2A00-0x2AFF. All are the diff (diff between code and 81308130, plus 0x80) for 4-byte code points.

◆ plane2B

const MY_UNICASE_CHARACTER plane2B[]

static

Unicase array for 0x2B00-0x2BFF. All are the diff (diff between code and 81308130, plus 0x80) for 4-byte code points.

◆ plane51

const MY_UNICASE_CHARACTER plane51[]

static

Unicase array for 0x5100-0x51FF. All are the diff (diff between code and 81308130, plus 0x80) for 4-byte code points.

◆ plane52

const MY_UNICASE_CHARACTER plane52[]

static

Unicase array for 0x5200-0x52FF. All are the diff (diff between code and 81308130, plus 0x80) for 4-byte code points.

◆ planeA2

const MY_UNICASE_CHARACTER planeA2[]

static

Unicase array for 0xA200-0xA2FF. All are for 2-byte code points directly.

◆ planeA3

const MY_UNICASE_CHARACTER planeA3[]

static

Unicase array for 0xA300-0xA3FF. All are for 2-byte code points directly.

◆ planeA6

const MY_UNICASE_CHARACTER planeA6[]

static

Unicase array for 0xA600-0xA6FF. All are for 2-byte code points directly.

◆ planeA7

const MY_UNICASE_CHARACTER planeA7[]

static

Unicase array for 0xA700-0xA7FF. All are for 2-byte code points directly.

◆ planeA8

const MY_UNICASE_CHARACTER planeA8[]

static

Unicase array for 0xA800-0xA8FF. All are for 2-byte code points directly.

◆ planeE6

const MY_UNICASE_CHARACTER planeE6[]

static

Unicase array for 0xE600-0xE6FF. Plus 0x20000, they're the diff (diff between code and 81308130) for 4-byte code points.

◆ sort_order_gb18030

const uint8_t sort_order_gb18030[]

static

Initial value:

= {
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B,
    0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, ' ',  '!',  '"',  '#',
    '$',  '%',  '&',  '\'', '(',  ')',  '*',  '+',  ',',  '-',  '.',  '/',
    '0',  '1',  '2',  '3',  '4',  '5',  '6',  '7',  '8',  '9',  ':',  ';',
    '<',  '=',  '>',  '?',  '@',  'A',  'B',  'C',  'D',  'E',  'F',  'G',
    'H',  'I',  'J',  'K',  'L',  'M',  'N',  'O',  'P',  'Q',  'R',  'S',
    'T',  'U',  'V',  'W',  'X',  'Y',  'Z',  '[',  '\\', ']',  '^',  '_',
    '`',  'A',  'B',  'C',  'D',  'E',  'F',  'G',  'H',  'I',  'J',  'K',
    'L',  'M',  'N',  'O',  'P',  'Q',  'R',  'S',  'T',  'U',  'V',  'W',
    'X',  'Y',  'Z',  '{',  '|',  '}',  '~',  0x7F, 0x80, 0x81, 0x82, 0x83,
    0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B,
    0x9C, 0x9D, 0x9E, 0x9F, 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7,
    0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF, 0xB0, 0xB1, 0xB2, 0xB3,
    0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
    0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB,
    0xCC, 0xCD, 0xCE, 0xCF, 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
    0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, 0xE0, 0xE1, 0xE2, 0xE3,
    0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB,
    0xFC, 0xFD, 0xFE, 0xFF}

The array[256] used for strings comparison.

◆ tab_gb18030_2_uni

const uint16_t tab_gb18030_2_uni[]

static

Mapping table from 2-byte gb18030 to unicode including all 2-byte code points in [GB+8140, GB+FEFE], with 0 for those invalid code points.

◆ tab_gb18030_4_uni

const uint16_t tab_gb18030_4_uni[]

static

Mapping table from 4-byte gb18030 to Unicode. The values here are the diffs for 4-byte gb18030 code points, including the following ranges: [GB+81308130, GB+8130D330), (GB+8135F436, GB+8137A839), (GB+8138FD38, GB+82358F33), (GB+8336C738, GB+8336D030), (GB+84308534, GB+84309C38), (GB+84318537, GB+8431A439]. Others can be calculated algorithmically.

◆ tab_uni_gb18030_p1

const uint16_t tab_uni_gb18030_p1[]

static

Mapping table from Unicode to gb18030, part one. For Unicode in [0x80, 0x9FA6), if the leading byte is less than 0x81, the corresponding value represents the diff for 4-byte gb18030 code, otherwise, it's the corresponding 2-byte gb18030 code.

◆ tab_uni_gb18030_p2

const uint16_t tab_uni_gb18030_p2[]

static

Mapping table from Unicode to gb18030, part two. For Unicode in [0xE000, 0xE865) and (0xF92B, 0xFFFF]. The values here have the same meaning as in tab_uni_gb18030_p1.

◆ to_lower_gb18030

const uint8_t to_lower_gb18030[]

static

Initial value:

= {
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B,
    0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, ' ',  '!',  '"',  '#',
    '$',  '%',  '&',  '\'', '(',  ')',  '*',  '+',  ',',  '-',  '.',  '/',
    '0',  '1',  '2',  '3',  '4',  '5',  '6',  '7',  '8',  '9',  ':',  ';',
    '<',  '=',  '>',  '?',  '@',  'a',  'b',  'c',  'd',  'e',  'f',  'g',
    'h',  'i',  'j',  'k',  'l',  'm',  'n',  'o',  'p',  'q',  'r',  's',
    't',  'u',  'v',  'w',  'x',  'y',  'z',  '[',  '\\', ']',  '^',  '_',
    '`',  'a',  'b',  'c',  'd',  'e',  'f',  'g',  'h',  'i',  'j',  'k',
    'l',  'm',  'n',  'o',  'p',  'q',  'r',  's',  't',  'u',  'v',  'w',
    'x',  'y',  'z',  '{',  '|',  '}',  '~',  0x7F, 0x80, 0x81, 0x82, 0x83,
    0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B,
    0x9C, 0x9D, 0x9E, 0x9F, 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7,
    0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF, 0xB0, 0xB1, 0xB2, 0xB3,
    0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
    0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB,
    0xCC, 0xCD, 0xCE, 0xCF, 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
    0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, 0xE0, 0xE1, 0xE2, 0xE3,
    0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB,
    0xFC, 0xFD, 0xFE, 0xFF}

The array[256] used in casedn.

◆ to_upper_gb18030

const uint8_t to_upper_gb18030[]

static

Initial value:

= {
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B,
    0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, ' ',  '!',  '"',  '#',
    '$',  '%',  '&',  '\'', '(',  ')',  '*',  '+',  ',',  '-',  '.',  '/',
    '0',  '1',  '2',  '3',  '4',  '5',  '6',  '7',  '8',  '9',  ':',  ';',
    '<',  '=',  '>',  '?',  '@',  'A',  'B',  'C',  'D',  'E',  'F',  'G',
    'H',  'I',  'J',  'K',  'L',  'M',  'N',  'O',  'P',  'Q',  'R',  'S',
    'T',  'U',  'V',  'W',  'X',  'Y',  'Z',  '[',  '\\', ']',  '^',  '_',
    '`',  'A',  'B',  'C',  'D',  'E',  'F',  'G',  'H',  'I',  'J',  'K',
    'L',  'M',  'N',  'O',  'P',  'Q',  'R',  'S',  'T',  'U',  'V',  'W',
    'X',  'Y',  'Z',  '{',  '|',  '}',  '~',  0x7F, 0x80, 0x81, 0x82, 0x83,
    0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B,
    0x9C, 0x9D, 0x9E, 0x9F, 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7,
    0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF, 0xB0, 0xB1, 0xB2, 0xB3,
    0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
    0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB,
    0xCC, 0xCD, 0xCE, 0xCF, 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
    0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, 0xE0, 0xE1, 0xE2, 0xE3,
    0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB,
    0xFC, 0xFD, 0xFE, 0xFF}

The array[256] used in caseup.

◆ UNI2_TO_GB4_DIFF

const unsigned UNI2_TO_GB4_DIFF = 7456

static

◆ UNICASE_4_BYTE_OFFSET

const unsigned UNICASE_4_BYTE_OFFSET = 0x80

static

Macros

Functions

Variables

Macro Definition Documentation

◆ is_mb_1

◆ is_mb_even_2

◆ is_mb_even_4

◆ is_mb_odd

Function Documentation

◆ case_info_code_to_gb18030()

◆ code_to_gb18030_chs()

◆ diff_to_gb18030_4()

◆ gb18030_4_chs_to_diff()

◆ gb18030_4_code_to_diff()

◆ gb18030_chs_to_code()

◆ get_case_info()

◆ get_casefolded_code()

◆ get_code_and_length()

◆ get_weight_for_gb18030_chs()

◆ get_weight_for_mbchar()

◆ get_weight_if_chinese_character()

◆ my_casedn_gb18030()

◆ my_casedn_gb18030_uca()

◆ my_casefold_gb18030()

◆ my_caseup_gb18030()

◆ my_caseup_gb18030_uca()

◆ my_hash_sort_gb18030()

◆ my_ismbchar_gb18030()

◆ my_mb_wc_gb18030()

◆ my_mbcharlen_gb18030()

◆ my_strcasecmp_gb18030()

◆ my_strnncoll_gb18030()

◆ my_strnncoll_gb18030_internal()

◆ my_strnncollsp_gb18030()

◆ my_strnxfrm_gb18030()

◆ my_wc_mb_gb18030_chs()

◆ my_well_formed_len_gb18030()

◆ my_wildcmp_gb18030()

◆ my_wildcmp_gb18030_impl()

◆ unicode_to_gb18030_code()

Variable Documentation

◆ COMMON_WEIGHT_BASE

◆ ctype_gb18030

◆ gb18030_2_weight_py

◆ gb18030_4_weight_py_p1

◆ gb18030_4_weight_py_p2

◆ MAX_2_BYTE_UNICASE

◆ MAX_3_BYTE_FROM_UNI

◆ MAX_GB18030_DIFF

◆ MIN_2_BYTE_UNICASE

◆ MIN_3_BYTE_FROM_UNI

◆ MIN_MB_EVEN_BYTE_2

◆ MIN_MB_EVEN_BYTE_4

◆ MIN_MB_ODD_BYTE

◆ my_caseinfo_gb18030

◆ my_caseinfo_pages_gb18030

◆ my_charset_gb18030_bin

◆ my_charset_gb18030_chinese_ci

◆ my_charset_gb18030_handler

◆ my_charset_gb18030_uca_handler

◆ my_collation_ci_handler

◆ PINYIN_2_BYTE_END

◆ PINYIN_2_BYTE_START

◆ PINYIN_4_1_DIFF

◆ PINYIN_4_2_DIFF

◆ PINYIN_4_BYTE_1_END

◆ PINYIN_4_BYTE_1_START

◆ PINYIN_4_BYTE_2_END

◆ PINYIN_4_BYTE_2_START

◆ PINYIN_WEIGHT_BASE

◆ plane00

◆ plane01

◆ plane02

◆ plane03

◆ plane04

◆ plane10

◆ plane1D

◆ plane1E

◆ plane1F

◆ plane20