MySQL 8.4.2
Source Code Documentation
ctype-gb18030.cc File Reference
#include <cassert>
#include <cstdint>
#include <cstring>
#include "my_compiler.h"
#include "mysql/strings/m_ctype.h"
#include "strings/m_ctype_internals.h"
#include "template_utils.h"

Macros

#define is_mb_1(c)   ((uint8_t)(c) <= 0x7F)
 
#define is_mb_odd(c)   (0x81 <= (uint8_t)(c) && (uint8_t)(c) <= 0xFE)
 
#define is_mb_even_2(c)
 
#define is_mb_even_4(c)   (0x30 <= (uint8_t)(c) && (uint8_t)(c) <= 0x39)
 

Functions

static unsigned gb18030_chs_to_code (const uint8_t *src, size_t srclen)
 Convert a gb18030 code in uchars to the code. The byte sequence in src must be a gb18030 sequence. More...
 
static size_t code_to_gb18030_chs (uint8_t *dst, size_t dstlen, unsigned code)
 Convert a gb18030 code to a sequence of chars. More...
 
static unsigned diff_to_gb18030_4 (uint8_t *dst, unsigned dstlen, unsigned diff)
 Calculate the 4-byte GB18030 code from a diff value. More...
 
static unsigned gb18030_4_code_to_diff (unsigned code)
 Calculate the diff between the 4-byte gb18030 code and GB+81308130. More...
 
static unsigned gb18030_4_chs_to_diff (const uint8_t *src)
 Calculate the diff between the 4-byte gb18030 code in bytes and GB+81308130. More...
 
static unsigned my_ismbchar_gb18030 (const CHARSET_INFO *cs, const char *p, const char *e)
 Judge if a sequence of chars is in gb18030 multi-bytes code. More...
 
static unsigned my_mbcharlen_gb18030 (const CHARSET_INFO *cs, unsigned c)
 Get the length of a possible gb18030 code according to its first byte or first two bytes. More...
 
static int my_wc_mb_gb18030_chs (const CHARSET_INFO *cs, my_wc_t wc, uint8_t *s, uint8_t *e)
 Convert the Unicode code to its gb18030 code in bytes. More...
 
static int my_mb_wc_gb18030 (const CHARSET_INFO *cs, my_wc_t *pwc, const uint8_t *s, const uint8_t *e)
 Convert a gb18030 code in bytes to unicode code. More...
 
static size_t my_well_formed_len_gb18030 (const CHARSET_INFO *cs, const char *b, const char *e, size_t pos, int *error)
 Get the well formed length of a GB18030 string. More...
 
static const MY_UNICASE_CHARACTER * get_case_info (const CHARSET_INFO *cs, const uint8_t *src, size_t srclen)
 Get the case info of one gb18030 code in bytes. More...
 
static unsigned case_info_code_to_gb18030 (unsigned code)
 Convert the code in one MY_UNICASE_CHARACTER to real gb18030 code. More...
 
static unsigned get_casefolded_code (const CHARSET_INFO *cs, const uint8_t *src, size_t srclen, size_t is_upper)
 Get the casefolded code of a given gb18030 code. More...
 
static size_t my_casefold_gb18030 (const CHARSET_INFO *cs, char *src, size_t srclen, char *dst, size_t dstlen, const uint8_t *map, bool is_upper)
 Get the casefolded gb18030 codes of a given sequence of gb18030 codes. Store the casefolded result to a specified dest. More...
 
static size_t my_casedn_gb18030 (const CHARSET_INFO *cs, char *src, size_t srclen, char *dst, size_t dstlen)
 Convert a gb18030 string to a corresponding lower-case gb18030 string. More...
 
static size_t my_caseup_gb18030 (const CHARSET_INFO *cs, char *src, size_t srclen, char *dst, size_t dstlen)
 Convert a gb18030 string to a corresponding up-case gb18030 string. More...
 
static size_t my_casedn_gb18030_uca (const CHARSET_INFO *cs, char *src, size_t srclen, char *dst, size_t dstlen)
 Get the casedown gb18030 code of a given gb18030 code. It's only for UCA, because the caseinfo for UCA is different. More...
 
static size_t my_caseup_gb18030_uca (const CHARSET_INFO *cs, char *src, size_t srclen, char *dst, size_t dstlen)
 Get the caseup gb18030 code of a given gb18030 code. It's only for UCA, because the caseinfo for UCA is different. More...
 
static unsigned get_weight_if_chinese_character (unsigned code)
 Get the weight for a multi-byte gb18030 code if the code point represents a Chinese character defined in collation PINYIN in CLDR24. The result will be PINYIN_WEIGHT_BASE + a non-zero seq NO. More...
 
static unsigned get_weight_for_mbchar (const CHARSET_INFO *cs, const uint8_t *src, size_t mblen)
 Get the weight for a multi-byte gb18030 code, we get the weight by the case up form of gb18030 code if exists. More...
 
static unsigned get_weight_for_gb18030_chs (const CHARSET_INFO *cs, const char *s, size_t s_len)
 Get the weight of a given gb18030 code. We can assert the code must be a valid gb18030 code. More...
 
static size_t get_code_and_length (const CHARSET_INFO *cs, const char *s, const char *e, size_t *code)
 Get the code value and length of next code in given gb18030 string. More...
 
static int my_strnncoll_gb18030_internal (const CHARSET_INFO *cs, const uint8_t **s_res, size_t s_length, const uint8_t **t_res, size_t t_length)
 Internal func to compare two strings according to gb18030; every gb18030 code should compare by its upper-case form. More...
 
static int my_strnncoll_gb18030 (const CHARSET_INFO *cs, const uint8_t *s, size_t s_length, const uint8_t *t, size_t t_length, bool t_is_prefix)
 Compare two strings according to gb18030; every gb18030 code should compare by its caseup form. More...
 
static int my_strnncollsp_gb18030 (const CHARSET_INFO *cs, const uint8_t *s, size_t s_length, const uint8_t *t, size_t t_length)
 Compare two strings according to gb18030, but ignore trailing spaces; every gb18030 code should compare by its caseup form. More...
 
static size_t my_strnxfrm_gb18030 (const CHARSET_INFO *cs, uint8_t *dst, size_t dstlen, unsigned nweights, const uint8_t *src, size_t srclen, unsigned flags)
 Make a sort key suitable for memcmp() corresponding to gb18030. Sort according to UPPER() for non-Chinese chars, and PINYIN for Chinese chars. More...
 
static int my_strcasecmp_gb18030 (const CHARSET_INFO *cs, const char *s, const char *t)
 Compare 0-terminated gb18030 strings. More...
 
static unsigned unicode_to_gb18030_code (const CHARSET_INFO *cs, int unicode)
 Convert a Unicode code to gb18030 code. More...
 
static int my_wildcmp_gb18030_impl (const CHARSET_INFO *cs, const char *str, const char *str_end, const char *wildstr, const char *wildend, unsigned escape, unsigned w_one, unsigned w_many, int recurse_level)
 Compare string against string with wildcard. More...
 
static int my_wildcmp_gb18030 (const CHARSET_INFO *cs, const char *str, const char *str_end, const char *wildstr, const char *wildend, int escape, int w_one, int w_many)
 Compare string against string with wildcard. More...
 
static void my_hash_sort_gb18030 (const CHARSET_INFO *cs, const uint8_t *s, size_t slen, uint64_t *n1, uint64_t *n2)
 Calculate hash value for given gb18030 string. More...
 

Variables

static const unsigned MIN_MB_ODD_BYTE = 0x81
 
static const unsigned MIN_MB_EVEN_BYTE_2 = 0x40
 
static const unsigned MIN_MB_EVEN_BYTE_4 = 0x30
 
static const unsigned MAX_GB18030_DIFF = 0x18398F
 
static const unsigned UNI2_TO_GB4_DIFF = 7456
 
static const unsigned UNICASE_4_BYTE_OFFSET = 0x80
 
static const unsigned MIN_2_BYTE_UNICASE = 0xA000
 
static const unsigned MAX_2_BYTE_UNICASE = 0xDFFF
 
static const unsigned MIN_3_BYTE_FROM_UNI = 0x2E600
 
static const unsigned MAX_3_BYTE_FROM_UNI = 0x2E6FF
 
static const unsigned PINYIN_2_BYTE_START = 0x8140
 
static const unsigned PINYIN_2_BYTE_END = 0xFE9F
 
static const unsigned PINYIN_4_BYTE_1_START = 0x8138FD38
 
static const unsigned PINYIN_4_BYTE_1_END = 0x82359232
 
static const unsigned PINYIN_4_1_DIFF = 11328
 
static const unsigned PINYIN_4_BYTE_2_START = 0x95328236
 
static const unsigned PINYIN_4_BYTE_2_END = 0x98399836
 
static const unsigned PINYIN_4_2_DIFF = 254536
 
static const unsigned PINYIN_WEIGHT_BASE = 0xFFA00000
 
static const unsigned COMMON_WEIGHT_BASE = 0xFF000000
 
static const uint8_t ctype_gb18030 [257]
 The array used for "type of characters" bit mask for each character. More...
 
static const uint8_t to_lower_gb18030 []
 The array[256] used in casedn. More...
 
static const uint8_t to_upper_gb18030 []
 The array[256] used in caseup. More...
 
static const uint8_t sort_order_gb18030 []
 The array[256] used for strings comparison. More...
 
static const MY_UNICASE_CHARACTER plane00 []
 Unicase array for 0x0000-0x00FF. More...
 
static const MY_UNICASE_CHARACTER plane01 []
 Unicase array for 0x0100-0x01FF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points. More...
 
static const MY_UNICASE_CHARACTER plane02 []
 Unicase array for 0x0200-0x02FF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points. More...
 
static const MY_UNICASE_CHARACTER plane03 []
 Unicase array for 0x0300-0x03FF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points. More...
 
static const MY_UNICASE_CHARACTER plane04 []
 Unicase array for 0x0400-0x04FF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points. More...
 
static const MY_UNICASE_CHARACTER plane10 []
 Unicase array for 0x1000-0x10FF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points. More...
 
static const MY_UNICASE_CHARACTER plane1D []
 Unicase array for 0x1D00-0x1DFF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points. More...
 
static const MY_UNICASE_CHARACTER plane1E []
 Unicase array for 0x1E00-0x1EFF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points. More...
 
static const MY_UNICASE_CHARACTER plane1F []
 Unicase array for 0x1F00-0x1FFF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points. More...
 
static const MY_UNICASE_CHARACTER plane20 []
 Unicase array for 0x2000-0x20FF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points. More...
 
static const MY_UNICASE_CHARACTER plane23 []
 Unicase array for 0x2300-0x23FF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points. More...
 
static const MY_UNICASE_CHARACTER plane2A []
 Unicase array for 0x2A00-0x2AFF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points. More...
 
static const MY_UNICASE_CHARACTER plane2B []
 Unicase array for 0x2B00-0x2BFF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points. More...
 
static const MY_UNICASE_CHARACTER plane51 []
 Unicase array for 0x5100-0x51FF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points. More...
 
static const MY_UNICASE_CHARACTER plane52 []
 Unicase array for 0x5200-0x52FF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points. More...
 
static const MY_UNICASE_CHARACTER planeA2 []
 Unicase array for 0xA200-0xA2FF All are for 2-byte code points directly. More...
 
static const MY_UNICASE_CHARACTER planeA3 []
 Unicase array for 0xA300-0xA3FF All are for 2-byte code points directly. More...
 
static const MY_UNICASE_CHARACTER planeA6 []
 Unicase array for 0xA600-0xA6FF All are for 2-byte code points directly. More...
 
static const MY_UNICASE_CHARACTER planeA7 []
 Unicase array for 0xA700-0xA7FF All are for 2-byte code points directly. More...
 
static const MY_UNICASE_CHARACTER planeA8 []
 Unicase array for 0xA800-0xA8FF All are for 2-byte code points directly. More...
 
static const MY_UNICASE_CHARACTER planeE6 []
 Unicase array for 0xE600-0xE6FF Plus 0x20000, they're the diff(diff between code and 81308130) for 4-byte code points. More...
 
static const MY_UNICASE_CHARACTER * my_caseinfo_pages_gb18030 [256]
 The UNICASE array. More...
 
static MY_UNICASE_INFO my_caseinfo_gb18030
 UNICASE INFO. More...
 
static const uint16_t tab_gb18030_2_uni []
 Mapping table from 2-byte gb18030 to unicode including all 2-byte code points in [GB+8140, GB+FEFE], with 0 for those invalid code points. More...
 
static const uint16_t tab_gb18030_4_uni []
 Mapping table from 4-byte gb18030 to Unicode. The values here are the diffs for 4-byte gb18030 code points including the following ranges: [GB+81308130, GB+8130D330) (GB+8135F436, GB+8137A839) (GB+8138FD38, GB+82358F33) (GB+8336C738, GB+8336D030) (GB+84308534, GB+84309C38) (GB+84318537, GB+8431A439]. Others can be calculated algorithmically. More...
 
static const uint16_t tab_uni_gb18030_p1 []
 Mapping table from Unicode to gb18030, part one. For Unicode in [0x80, 0x9FA6), if the leading byte is less than 0x81, the corresponding value represents the diff for 4-byte gb18030 code, otherwise, it's the corresponding 2-byte gb18030 code. More...
 
static const uint16_t tab_uni_gb18030_p2 []
 Mapping table from Unicode to gb18030, part two. For Unicode in [0xE000, 0xE865) and (0xF92B, 0xFFFF]. The values here have the same meaning as in tab_uni_gb18030_p1. More...
 
static const uint16_t gb18030_2_weight_py []
 The following 3 Chinese character weight arrays are based upon the PINYIN collation in zh.xml file of CLDR24(http://cldr.unicode.org/) (http://unicode.org/Public/cldr/24/core.zip) More...
 
static const uint16_t gb18030_4_weight_py_p1 []
 Weight array for one range of 4-byte gb18030 code points, which is [GB+8138FD38, GB+82359232]. More...
 
static const uint16_t gb18030_4_weight_py_p2 []
 Weight array for the other range of 4-byte gb18030 code points, which is [GB+95328236, GB+98399836]. More...
 
static MY_COLLATION_HANDLER my_collation_ci_handler
 
static MY_CHARSET_HANDLER my_charset_gb18030_handler
 
MY_CHARSET_HANDLER my_charset_gb18030_uca_handler
 
CHARSET_INFO my_charset_gb18030_chinese_ci
 
CHARSET_INFO my_charset_gb18030_bin
 

Macro Definition Documentation

◆ is_mb_1

#define is_mb_1 (   c)    ((uint8_t)(c) <= 0x7F)

◆ is_mb_even_2

#define is_mb_even_2 (   c)
Value:
((0x40 <= (uint8_t)(c) && (uint8_t)(c) <= 0x7E) || \
(0x80 <= (uint8_t)(c) && (uint8_t)(c) <= 0xFE))

◆ is_mb_even_4

#define is_mb_even_4 (   c)    (0x30 <= (uint8_t)(c) && (uint8_t)(c) <= 0x39)

◆ is_mb_odd

#define is_mb_odd (   c)    (0x81 <= (uint8_t)(c) && (uint8_t)(c) <= 0xFE)

Function Documentation

◆ case_info_code_to_gb18030()

static unsigned case_info_code_to_gb18030 ( unsigned  code)
static

Convert the code in one MY_UNICASE_CHARACTER to real gb18030 code.

Parameters
[in] code: code in one MY_UNICASE_CHARACTER
Returns
gb18030 code

◆ code_to_gb18030_chs()

static size_t code_to_gb18030_chs ( uint8_t *  dst,
size_t  dstlen,
unsigned  code 
)
static

Convert a gb18030 code to a sequence of chars.

If the code number is too large to store, the trailing bytes will be cut off.

Parameters
[out] dst: dest to store the result
[in] dstlen: valid length of dest
[in] code: gb18030 code
Returns
the length of dest used to store the gb18030 chars

◆ diff_to_gb18030_4()

static unsigned diff_to_gb18030_4 ( uint8_t *  dst,
unsigned  dstlen,
unsigned  diff 
)
static

Calculate the 4-byte GB18030 code from a diff value.

Parameters
[out] dst: dest to store the gb18030 code in bytes
[in] dstlen: valid length of dest
[in] diff: the diff between gb18030 code and GB+81308130
Return values
4 if the diff is a valid value and there is enough space in dst; 0 otherwise

◆ gb18030_4_chs_to_diff()

static unsigned gb18030_4_chs_to_diff ( const uint8_t *  src)
inline static

Calculate the diff between the 4-byte gb18030 code in bytes and GB+81308130.

Parameters
[in] src: 4-byte gb18030 code in bytes
Returns
the diff

◆ gb18030_4_code_to_diff()

static unsigned gb18030_4_code_to_diff ( unsigned  code)
static

Calculate the diff between the 4-byte gb18030 code and GB+81308130.

Parameters
[in] code: 4-byte gb18030 code
Returns
the diff

◆ gb18030_chs_to_code()

static unsigned gb18030_chs_to_code ( const uint8_t *  src,
size_t  srclen 
)
inline static

Convert a gb18030 code in uchars to the code. The byte sequence in src must be a gb18030 sequence.

Parameters
[in] src: gb18030 code in uchars
[in] srclen: length of valid chars, should be 1, 2 or 4 only
Returns
the gb18030 code

◆ get_case_info()

static const MY_UNICASE_CHARACTER * get_case_info ( const CHARSET_INFO *  cs,
const uint8_t *  src,
size_t  srclen 
)
static

Get the case info of one gb18030 code in bytes.

Parameters
[in] cs: charset
[in] src: start byte of gb18030 code
[in] srclen: length in bytes of gb18030 code
Returns
the case info(MY_UNICASE_CHARACTER) of given gb18030 code

◆ get_casefolded_code()

static unsigned get_casefolded_code ( const CHARSET_INFO *  cs,
const uint8_t *  src,
size_t  srclen,
size_t  is_upper 
)
static

Get the casefolded code of a given gb18030 code.

Parameters
[in] cs: charset
[in] src: start byte of gb18030 code
[in] srclen: length of gb18030 code in bytes
[in] is_upper: true if we want capital letter; false if small letter is required
Return values
the gb18030 code according to is_upper; 0 if no upper-case or lower-case exists

◆ get_code_and_length()

static size_t get_code_and_length ( const CHARSET_INFO *  cs,
const char *  s,
const char *  e,
size_t *  code 
)
static

Get the code value and length of next code in given gb18030 string.

Parameters
[in] cs: charset
[in] s: string
[in] e: end of string
[out] code: the code value of the next code, valid when retval>0
Return values
the length of the next code, if the code is valid; 0 if the given string is empty or the code is invalid

◆ get_weight_for_gb18030_chs()

static unsigned get_weight_for_gb18030_chs ( const CHARSET_INFO *  cs,
const char *  s,
size_t  s_len 
)
static

Get the weight of a given gb18030 code. We can assert the code must be a valid gb18030 code.

Parameters
[in] cs: charset
[in] s: code chars
[in] s_len: length of the code
Returns
the weight of the code

◆ get_weight_for_mbchar()

static unsigned get_weight_for_mbchar ( const CHARSET_INFO *  cs,
const uint8_t *  src,
size_t  mblen 
)
static

Get the weight for a multi-byte gb18030 code, we get the weight by the case up form of gb18030 code if exists.

For any Chinese character, which has non-zero seq NO. defined in gb18030_2_weight_py/gb18030_4_weight_py_p1/gb18030_4_weight_py_p2 according to its gb18030 code, the final weight shall be 0xFFA00000+(seq No.)

For any non-Chinese gb18030 character C, let C'=C or UPPER(C) if exists. So C' would be 2 bytes or 4 bytes.

  1. For any C' represented in 2 bytes, the final weight shall be C' itself. For example: GB+A2F1=UPPER(GB+A2A1), which is a 2-byte non-Chinese character. So C' would be GB+A2F1 for both GB+A2A1 and GB+A2F1, and both final weights of GB+A2A1 and GB+A2F1 are 0xA2F1.
  2. For any C' represented in 4 bytes, the final weight shall be COMMON_WEIGHT_BASE+diff(C'). We get the diff by gb18030_4_code_to_diff. For example: The character GB+9030E833=UPPER(GB+9030EC33), which is a 4-byte non-Chinese character. So the C' would be GB+9030E833 for both GB+9030E833 and GB+9030EC33, and diff(C') would be 0x2E651 in this case. Both final weights of GB+9030E833 and GB+9030EC33 are 0xFF000000+0x2E651=0xFF02E651.

The weight of the maximum code point GB+FE39FE39 is defined as 0xFFFFFFFF

Parameters
[in] cs: charset
[in] src: the first byte of multi-byte gb18030 code
[in] mblen: the length of multi-byte gb18030 code
Returns
the weight of the given gb18030 code point

◆ get_weight_if_chinese_character()

static unsigned get_weight_if_chinese_character ( unsigned  code)
static

Get the weight for a multi-byte gb18030 code if the code point represents a Chinese character defined in collation PINYIN in CLDR24. The result will be PINYIN_WEIGHT_BASE + a non-zero seq NO. for the code; if not, the result would be PINYIN_WEIGHT_BASE.

Parameters
[in] code: the multi-byte gb18030 code
Return values
a non-zero weight if it's a Chinese character with PINYIN, which shall be PINYIN_WEIGHT_BASE + non-zero seq NO.; otherwise, PINYIN_WEIGHT_BASE

◆ my_casedn_gb18030()

static size_t my_casedn_gb18030 ( const CHARSET_INFO *  cs,
char *  src,
size_t  srclen,
char *  dst,
size_t  dstlen 
)
static

Convert a gb18030 string to a corresponding lower-case gb18030 string.

Parameters
[in] cs: charset
[in] src: start byte of given gb18030 code
[in] srclen: length of given gb18030 code
[out] dst: start byte of casedown gb18030 code
[in] dstlen: length of the result array
Returns
length of the casedown gb18030 code

◆ my_casedn_gb18030_uca()

static size_t my_casedn_gb18030_uca ( const CHARSET_INFO *  cs,
char *  src,
size_t  srclen,
char *  dst,
size_t  dstlen 
)
static

Get the casedown gb18030 code of a given gb18030 code. It's only for UCA, because the caseinfo for UCA is different.

Parameters
[in] cs: charset
[in] src: start byte of given gb18030 code
[in] srclen: length of given gb18030 code
[out] dst: start byte of casedown gb18030 code
[in] dstlen: length of the result array
Returns
length of the casedown gb18030 code

◆ my_casefold_gb18030()

static size_t my_casefold_gb18030 ( const CHARSET_INFO *  cs,
char *  src,
size_t  srclen,
char *  dst,
size_t  dstlen,
const uint8_t *  map,
bool  is_upper 
)
static

Get the casefolded gb18030 codes of a given sequence of gb18030 codes. Store the casefolded result to a specified dest.

Parameters
[in] cs: charset
[in] src: start byte of given sequence of gb18030 codes
[in] srclen: length of the sequence
[out] dst: start byte of the output of casefolded codes
[in] dstlen: length of the result array
[in] map: the LOWER map or the UPPER map of gb18030
[in] is_upper: true if we want capital letter; false if small letter is required
Returns
the length of result

◆ my_caseup_gb18030()

static size_t my_caseup_gb18030 ( const CHARSET_INFO *  cs,
char *  src,
size_t  srclen,
char *  dst,
size_t  dstlen 
)
static

Convert a gb18030 string to a corresponding up-case gb18030 string.

Parameters
[in] cs: charset
[in] src: start byte of given gb18030 code
[in] srclen: length of given gb18030 code
[out] dst: start byte of caseup gb18030 code
[in] dstlen: length of the result array
Returns
length of the caseup gb18030 code

◆ my_caseup_gb18030_uca()

static size_t my_caseup_gb18030_uca ( const CHARSET_INFO *  cs,
char *  src,
size_t  srclen,
char *  dst,
size_t  dstlen 
)
static

Get the caseup gb18030 code of a given gb18030 code. It's only for UCA, because the caseinfo for UCA is different.

Parameters
[in] cs: charset
[in] src: start byte of given gb18030 code
[in] srclen: length of given gb18030 code
[out] dst: start byte of caseup gb18030 code
[in] dstlen: length of the result array
Returns
length of the caseup gb18030 code

◆ my_hash_sort_gb18030()

static void my_hash_sort_gb18030 ( const CHARSET_INFO *  cs,
const uint8_t *  s,
size_t  slen,
uint64_t *  n1,
uint64_t *  n2 
)
static

Calculate hash value for given gb18030 string.

Parameters
[in] cs: charset
[in] s: string
[in] slen: the length of string
[in,out] n1: n1
[in,out] n2: n2

◆ my_ismbchar_gb18030()

static unsigned my_ismbchar_gb18030 ( const CHARSET_INFO *  cs,
const char *  p,
const char *  e 
)
static

Judge if a sequence of chars is in gb18030 multi-byte code.

Parameters
[in] cs: charset
[in] p: start of gb18030 code
[in] e: end of gb18030 code
Return values
2 or 4 if it is gb18030 multi-byte code, 0 if not

◆ my_mb_wc_gb18030()

static int my_mb_wc_gb18030 ( const CHARSET_INFO *  cs,
my_wc_t *  pwc,
const uint8_t *  s,
const uint8_t *  e 
)
static

Convert a gb18030 code in bytes to unicode code.

Parameters
[in] cs: charset
[out] pwc: unicode code
[in] s: start of gb18030 code
[in] e: end of gb18030 code
Return values
1) the length of converted gb18030 code if convertible; 2) MY_CS_TOOSMALL..MY_CS_TOOSMALL4 if the gb18030 code is too short; 3) MY_CS_ILSEQ if gb18030 code is wrong by sequence

◆ my_mbcharlen_gb18030()

static unsigned my_mbcharlen_gb18030 ( const CHARSET_INFO *  cs,
unsigned  c 
)
static

Get the length of a possible gb18030 code according to its first byte or first two bytes.

Parameters
[in] cs: charset
[in] c: first byte or first two bytes of the code
Return values
1/2/4 accordingly if the leading byte(s) indicate the code would be gb18030, otherwise 0

◆ my_strcasecmp_gb18030()

static int my_strcasecmp_gb18030 ( const CHARSET_INFO *  cs,
const char *  s,
const char *  t 
)
static

Compare 0-terminated gb18030 strings.

Parameters
[in] cs: charset
[in] s: first 0-terminated string to compare
[in] t: second 0-terminated string to compare
Return values
negative number if s < t; positive number if s > t; 0 if the strings are equal

◆ my_strnncoll_gb18030()

static int my_strnncoll_gb18030 ( const CHARSET_INFO *  cs,
const uint8_t *  s,
size_t  s_length,
const uint8_t *  t,
size_t  t_length,
bool  t_is_prefix 
)
static

Compare two strings according to gb18030; every gb18030 code should compare by its caseup form.

Parameters
[in] cs: charset
[in] s: start of the first string
[in] s_length: length of the first string
[in] t: start of the second string
[in] t_length: length of the second string
[in] t_is_prefix: true if t is prefix, otherwise false
Return values
0 if two strings are equal; 1 if the first string is bigger; -1 if the second string is bigger

◆ my_strnncoll_gb18030_internal()

static int my_strnncoll_gb18030_internal ( const CHARSET_INFO *  cs,
const uint8_t **  s_res,
size_t  s_length,
const uint8_t **  t_res,
size_t  t_length 
)
static

Internal func to compare two strings according to gb18030; every gb18030 code should compare by its upper-case form.

Parameters
[in] cs: charset
[in,out] s_res: pointer to the start byte of first gb18030 code string, return the first byte unchecked
[in] s_length: length of the first string
[in,out] t_res: pointer to the start byte of second gb18030 code string, return the first byte unchecked
[in] t_length: length of the second string
Return values
0 if the strings are equal; >0 if the first string is bigger; <0 if the second string is bigger

◆ my_strnncollsp_gb18030()

static int my_strnncollsp_gb18030 ( const CHARSET_INFO *  cs,
const uint8_t *  s,
size_t  s_length,
const uint8_t *  t,
size_t  t_length 
)
static

Compare two strings according to gb18030, but ignore trailing spaces; every gb18030 code should compare by its caseup form.

Parameters
[in] cs: charset
[in] s: start of the first string
[in] s_length: length of the first string
[in] t: start of the second string
[in] t_length: length of the second string
Return values
0 if two strings are equal; 1 if the first string is bigger; -1 if the second string is bigger

◆ my_strnxfrm_gb18030()

static size_t my_strnxfrm_gb18030 ( const CHARSET_INFO *  cs,
uint8_t *  dst,
size_t  dstlen,
unsigned  nweights,
const uint8_t *  src,
size_t  srclen,
unsigned  flags 
)
static

Make a sort key suitable for memcmp() corresponding to gb18030. Sort according to UPPER() for non-Chinese chars, and PINYIN for Chinese chars.

Parameters
[in] cs: charset
[out] dst: first byte of the result array
[in] dstlen: the length of the result array
[in] nweights: how many gb18030 codes we care about
[in] src: first byte of the given gb18030 code
[in] srclen: length of the given gb18030 code
[in] flags: flags for strxfrm
Returns
the length of the sort key

◆ my_wc_mb_gb18030_chs()

static int my_wc_mb_gb18030_chs ( const CHARSET_INFO *  cs,
my_wc_t  wc,
uint8_t *  s,
uint8_t *  e 
)
static

Convert the Unicode code to its gb18030 code in bytes.

Parameters
[in] cs: charset
[in] wc: Unicode code
[out] s: start of gb18030 code output
[out] e: end of gb18030 code output
Return values
1) the length of gb18030 code (1/2/4) if convertible; 2) MY_CS_TOOSMALL..MY_CS_TOOSMALL4 if the output space is too small; 3) MY_CS_ILUNI if we can't encode unicode to gb18030

◆ my_well_formed_len_gb18030()

static size_t my_well_formed_len_gb18030 ( const CHARSET_INFO *  cs,
const char *  b,
const char *  e,
size_t  pos,
int *  error 
)
static

Get the well formed length of a GB18030 string.

Parameters
[in] cs: charset
[in] b: start of gb18030 code
[in] e: end of gb18030 code
[in] pos: max chars we care about
[out] error: 0 if every gb18030 code we get is correct, otherwise 1
Returns
the length of all well formed bytes

◆ my_wildcmp_gb18030()

static int my_wildcmp_gb18030 ( const CHARSET_INFO *  cs,
const char *  str,
const char *  str_end,
const char *  wildstr,
const char *  wildend,
int  escape,
int  w_one,
int  w_many 
)
static

Compare string against string with wildcard.

Parameters
[in] cs: charset
[in] str: string
[in] str_end: end of the string
[in] wildstr: string with wildcard
[in] wildend: end of the string with wildcard
[in] escape: escape char, which is a Unicode code
[in] w_one: wild one char, which is a Unicode code
[in] w_many: wild many char, which is a Unicode code
Returns
0 if matched; -1 if not matched with wildcard; 1 if matched with wildcard

◆ my_wildcmp_gb18030_impl()

static int my_wildcmp_gb18030_impl ( const CHARSET_INFO *  cs,
const char *  str,
const char *  str_end,
const char *  wildstr,
const char *  wildend,
unsigned  escape,
unsigned  w_one,
unsigned  w_many,
int  recurse_level 
)
static

Compare string against string with wildcard.

Parameters
[in] cs: charset
[in] str: string
[in] str_end: end of the string
[in] wildstr: string with wildcard
[in] wildend: end of the string with wildcard
[in] escape: escape char, which is a gb18030 code
[in] w_one: wild one char, which is a gb18030 code
[in] w_many: wild many char, which is a gb18030 code
[in] recurse_level: current recurse level to do wild card
Returns
0 if matched; -1 if not matched with wildcard; 1 if matched with wildcard

◆ unicode_to_gb18030_code()

static unsigned unicode_to_gb18030_code ( const CHARSET_INFO *  cs,
int  unicode 
)
static

Convert a Unicode code to gb18030 code.

Parameters
[in] cs: charset
[in] unicode: unicode code
Returns
gb18030 code

Variable Documentation

◆ COMMON_WEIGHT_BASE

const unsigned COMMON_WEIGHT_BASE = 0xFF000000
static

◆ ctype_gb18030

const uint8_t ctype_gb18030[257]
static
Initial value:
= {
0,
32, 32, 32, 32, 32, 32, 32, 32, 32, 40, 40, 40, 40, 40, 32, 32,
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
72, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 16, 16, 16, 16, 16, 16,
16, 129, 129, 129, 129, 129, 129, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 16, 16, 16, 16, 16,
16, 130, 130, 130, 130, 130, 130, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 16, 16, 16, 16, 32,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0}

The array used for "type of characters" bit mask for each character.

The ctype[0] is reserved for EOF(-1), so we use ctype[(char)+1]. Also refer to strings/CHARSET_INFO.txt

◆ gb18030_2_weight_py

const uint16_t gb18030_2_weight_py[]
static

The following 3 Chinese character weight arrays are based upon the PINYIN collation in zh.xml file of CLDR24(http://cldr.unicode.org/) (http://unicode.org/Public/cldr/24/core.zip)

Please note that we only support those Chinese characters with PINYIN in zh.xml currently, we support neither those CJK characters whose category defined in Unicode are Symbol with the same shape as Chinese characters nor the PINYIN characters

The 3 arrays include the sort order of the code points accordingly. The weight here just means the order of the corresponding gb18030 code point. For example: currently U+963F(GB+B0A2) is the first character in the PINYIN collation, while U+5475(GB+BAC7) is the second, and so on. So the weight of U+963F is 1 and U+5475 is 2, and GB+8140 is the 15308th character, which has the value of 15308 in the first array.

The weight starts from 1 to 41309 and there are no duplicate weights in the collation, so the simplified Chinese is not equal to the corresponding traditional Chinese

Totally, there are 41309 Chinese characters being taken into account in these arrays

The weight of each code point shall be the corresponding weight in these arrays plus PINYIN_WEIGHT_BASE. Weight array for those 2-byte gb18030 code points in the range [GB+8140, GB+FE9F]. If it's not a Chinese char, the weight is 0.

◆ gb18030_4_weight_py_p1

const uint16_t gb18030_4_weight_py_p1[]
static

Weight array for one range of 4-byte gb18030 code points, which is [GB+8138FD38, GB+82359232].

The sequence NO. of this array is the diff between the code point and GB+8138FD38. The weights would be 0 for non-Chinese chars.

◆ gb18030_4_weight_py_p2

const uint16_t gb18030_4_weight_py_p2[]
static

Weight array for the other range of 4-byte gb18030 code points, which is [GB+95328236, GB+98399836].

The sequence NO. of this array is the diff between the code point and GB+95328236. The weights would be 0 for non-Chinese chars. This range maps from Unicode [U+20000, U+2FFFF], which is CJK extension.

◆ MAX_2_BYTE_UNICASE

const unsigned MAX_2_BYTE_UNICASE = 0xDFFF
static

◆ MAX_3_BYTE_FROM_UNI

const unsigned MAX_3_BYTE_FROM_UNI = 0x2E6FF
static

◆ MAX_GB18030_DIFF

const unsigned MAX_GB18030_DIFF = 0x18398F
static

◆ MIN_2_BYTE_UNICASE

const unsigned MIN_2_BYTE_UNICASE = 0xA000
static

◆ MIN_3_BYTE_FROM_UNI

const unsigned MIN_3_BYTE_FROM_UNI = 0x2E600
static

◆ MIN_MB_EVEN_BYTE_2

const unsigned MIN_MB_EVEN_BYTE_2 = 0x40
static

◆ MIN_MB_EVEN_BYTE_4

const unsigned MIN_MB_EVEN_BYTE_4 = 0x30
static

◆ MIN_MB_ODD_BYTE

const unsigned MIN_MB_ODD_BYTE = 0x81
static

◆ my_caseinfo_gb18030

MY_UNICASE_INFO my_caseinfo_gb18030
static
Initial value:
= {0xFFFF,
static const MY_UNICASE_CHARACTER * my_caseinfo_pages_gb18030[256]
The UNICASE array.
Definition: ctype-gb18030.cc:3035

UNICASE INFO.

◆ my_caseinfo_pages_gb18030

const MY_UNICASE_CHARACTER* my_caseinfo_pages_gb18030[256]
static

The UNICASE array.

◆ my_charset_gb18030_bin

CHARSET_INFO my_charset_gb18030_bin

◆ my_charset_gb18030_chinese_ci

CHARSET_INFO my_charset_gb18030_chinese_ci

◆ my_charset_gb18030_handler

MY_CHARSET_HANDLER my_charset_gb18030_handler
static
Initial value:
= {
nullptr,
static int my_mb_wc_gb18030(const CHARSET_INFO *cs, my_wc_t *pwc, const uint8_t *s, const uint8_t *e)
Convert a gb18030 code in bytes to unicode code.
Definition: ctype-gb18030.cc:19422
static int my_wc_mb_gb18030_chs(const CHARSET_INFO *cs, my_wc_t wc, uint8_t *s, uint8_t *e)
Convert the Unicode code to its gb18030 code in bytes.
Definition: ctype-gb18030.cc:19332
static unsigned my_mbcharlen_gb18030(const CHARSET_INFO *cs, unsigned c)
Get the length of a possible gb18030 code according to its first byte or first two bytes.
Definition: ctype-gb18030.cc:19306
static size_t my_casedn_gb18030(const CHARSET_INFO *cs, char *src, size_t srclen, char *dst, size_t dstlen)
Convert a gb18030 string to a corresponding lower-case gb18030 string.
Definition: ctype-gb18030.cc:19696
static size_t my_well_formed_len_gb18030(const CHARSET_INFO *cs, const char *b, const char *e, size_t pos, int *error)
Get the well formed length of a GB18030 string.
Definition: ctype-gb18030.cc:19508
static unsigned my_ismbchar_gb18030(const CHARSET_INFO *cs, const char *p, const char *e)
Judge if a sequence of chars is in gb18030 multi-bytes code.
Definition: ctype-gb18030.cc:19282
static size_t my_caseup_gb18030(const CHARSET_INFO *cs, char *src, size_t srclen, char *dst, size_t dstlen)
Convert a gb18030 string to a corresponding up-case gb18030 string.
Definition: ctype-gb18030.cc:19714
int my_mb_ctype_mb(const CHARSET_INFO *cs, int *ctype, const uint8_t *s, const uint8_t *e)
Definition: ctype-mb.cc:1322
size_t my_numcells_mb(const CHARSET_INFO *cs, const char *b, const char *e)
Definition: ctype-mb.cc:1290
size_t my_numchars_mb(const CHARSET_INFO *cs, const char *pos, const char *end)
Definition: ctype-mb.cc:315
size_t my_casedn_str_mb(const CHARSET_INFO *cs, char *str)
Definition: ctype-mb.cc:58
size_t my_charpos_mb3(const CHARSET_INFO *cs, const char *pos, const char *end, size_t length)
Definition: ctype-mb.cc:326
size_t my_caseup_str_mb(const CHARSET_INFO *cs, char *str)
Definition: ctype-mb.cc:41
long long my_strtoll10_8bit(const CHARSET_INFO *cs, const char *nptr, const char **endptr, int *error)
Definition: ctype-simple.cc:1125
unsigned long long my_strntoull10rnd_8bit(const CHARSET_INFO *cs, const char *str, size_t length, int unsigned_flag, const char **endptr, int *error)
Definition: ctype-simple.cc:1221
double my_strntod_8bit(const CHARSET_INFO *cs, const char *str, size_t length, const char **end, int *err)
Definition: ctype-simple.cc:636
long long my_strntoll_8bit(const CHARSET_INFO *cs, const char *nptr, size_t l, int base, const char **endptr, int *err)
Definition: ctype-simple.cc:461
size_t my_longlong10_to_str_8bit(const CHARSET_INFO *cs, char *dst, size_t len, int radix, long long val)
Definition: ctype-simple.cc:686
size_t my_long10_to_str_8bit(const CHARSET_INFO *cs, char *dst, size_t len, int radix, long int val)
Definition: ctype-simple.cc:650
size_t my_lengthsp_8bit(const CHARSET_INFO *cs, const char *ptr, size_t length)
Definition: ctype-simple.cc:927
unsigned long long my_strntoull_8bit(const CHARSET_INFO *cs, const char *nptr, size_t l, int base, const char **endptr, int *err)
Definition: ctype-simple.cc:539
size_t my_snprintf_8bit(const CHARSET_INFO *cs, char *to, size_t n, const char *fmt,...)
Definition: ctype-simple.cc:278
void my_fill_8bit(const CHARSET_INFO *cs, char *s, size_t l, int fill)
Definition: ctype-simple.cc:898
unsigned long my_strntoul_8bit(const CHARSET_INFO *cs, const char *nptr, size_t l, int base, const char **endptr, int *err)
Definition: ctype-simple.cc:389
long my_strntol_8bit(const CHARSET_INFO *cs, const char *nptr, size_t l, int base, const char **endptr, int *err)
Definition: ctype-simple.cc:311
size_t my_scan_8bit(const CHARSET_INFO *cs, const char *str, const char *end, int sq)
Definition: ctype-simple.cc:876

◆ my_charset_gb18030_uca_handler

MY_CHARSET_HANDLER my_charset_gb18030_uca_handler
Initial value:
= {nullptr,
static size_t my_caseup_gb18030_uca(const CHARSET_INFO *cs, char *src, size_t srclen, char *dst, size_t dstlen)
Get the caseup gb18030 code of a given gb18030 code It's only for UCA, because the caseinfo for UCA i...
Definition: ctype-gb18030.cc:19777
static size_t my_casedn_gb18030_uca(const CHARSET_INFO *cs, char *src, size_t srclen, char *dst, size_t dstlen)
Get the casedown gb18030 code of a given gb18030 code It's only for UCA, because the caseinfo for UCA...
Definition: ctype-gb18030.cc:19733

◆ my_collation_ci_handler

MY_COLLATION_HANDLER my_collation_ci_handler
static
Initial value:
= {nullptr,
nullptr,
static void my_hash_sort_gb18030(const CHARSET_INFO *cs, const uint8_t *s, size_t slen, uint64_t *n1, uint64_t *n2)
Calculate hash value for given gb18030 string.
Definition: ctype-gb18030.cc:20320
static size_t my_strnxfrm_gb18030(const CHARSET_INFO *cs, uint8_t *dst, size_t dstlen, unsigned nweights, const uint8_t *src, size_t srclen, unsigned flags)
Make a sort key suitable for memcmp() corresponding to gb18030 Sort according to UPPER() for non-Chin...
Definition: ctype-gb18030.cc:20080
static int my_strnncoll_gb18030(const CHARSET_INFO *cs, const uint8_t *s, size_t s_length, const uint8_t *t, size_t t_length, bool t_is_prefix)
Compare two strings according to gb18030; every gb18030 code should be compared by its caseup form.
Definition: ctype-gb18030.cc:20014
static int my_wildcmp_gb18030(const CHARSET_INFO *cs, const char *str, const char *str_end, const char *wildstr, const char *wildend, int escape, int w_one, int w_many)
Compare string against string with wildcard.
Definition: ctype-gb18030.cc:20294
static int my_strnncollsp_gb18030(const CHARSET_INFO *cs, const uint8_t *s, size_t s_length, const uint8_t *t, size_t t_length)
Compare two strings according to gb18030, but ignore trailing spaces every gb18030 code should compar...
Definition: ctype-gb18030.cc:20040
static int my_strcasecmp_gb18030(const CHARSET_INFO *cs, const char *s, const char *t)
Compare 0-terminated gb18030 strings.
Definition: ctype-gb18030.cc:20118
unsigned my_instr_mb(const CHARSET_INFO *cs, const char *b, size_t b_length, const char *s, size_t s_length, my_match_t *match, unsigned nmatch)
Definition: ctype-mb.cc:357
bool my_like_range_mb(const CHARSET_INFO *cs, const char *ptr, size_t ptr_length, char escape, char w_one, char w_many, size_t res_length, char *min_str, char *max_str, size_t *min_length, size_t *max_length)
Definition: ctype-mb.cc:657
bool my_propagate_simple(const CHARSET_INFO *cs, const uint8_t *str, size_t length)
Definition: ctype-simple.cc:1474
size_t my_strnxfrmlen_simple(const CHARSET_INFO *cs, size_t len)
Definition: ctype-simple.cc:62

◆ PINYIN_2_BYTE_END

const unsigned PINYIN_2_BYTE_END = 0xFE9F
static

◆ PINYIN_2_BYTE_START

const unsigned PINYIN_2_BYTE_START = 0x8140
static

◆ PINYIN_4_1_DIFF

const unsigned PINYIN_4_1_DIFF = 11328
static

◆ PINYIN_4_2_DIFF

const unsigned PINYIN_4_2_DIFF = 254536
static

◆ PINYIN_4_BYTE_1_END

const unsigned PINYIN_4_BYTE_1_END = 0x82359232
static

◆ PINYIN_4_BYTE_1_START

const unsigned PINYIN_4_BYTE_1_START = 0x8138FD38
static

◆ PINYIN_4_BYTE_2_END

const unsigned PINYIN_4_BYTE_2_END = 0x98399836
static

◆ PINYIN_4_BYTE_2_START

const unsigned PINYIN_4_BYTE_2_START = 0x95328236
static

◆ PINYIN_WEIGHT_BASE

const unsigned PINYIN_WEIGHT_BASE = 0xFFA00000
static

◆ plane00

const MY_UNICASE_CHARACTER plane00[]
static

Unicase array for 0x0000-0x00FF.

0x0000-0x007F is for 1-byte code points, the others which represent the diff (diff between code and 81308130, plus 0x80), are for 4-byte code points

◆ plane01

const MY_UNICASE_CHARACTER plane01[]
static

Unicase array for 0x0100-0x01FF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points.

◆ plane02

const MY_UNICASE_CHARACTER plane02[]
static

Unicase array for 0x0200-0x02FF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points.

◆ plane03

const MY_UNICASE_CHARACTER plane03[]
static

Unicase array for 0x0300-0x03FF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points.

◆ plane04

const MY_UNICASE_CHARACTER plane04[]
static

Unicase array for 0x0400-0x04FF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points.

◆ plane10

const MY_UNICASE_CHARACTER plane10[]
static

Unicase array for 0x1000-0x10FF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points.

◆ plane1D

const MY_UNICASE_CHARACTER plane1D[]
static

Unicase array for 0x1D00-0x1DFF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points.

◆ plane1E

const MY_UNICASE_CHARACTER plane1E[]
static

Unicase array for 0x1E00-0x1EFF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points.

◆ plane1F

const MY_UNICASE_CHARACTER plane1F[]
static

Unicase array for 0x1F00-0x1FFF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points.

◆ plane20

const MY_UNICASE_CHARACTER plane20[]
static

Unicase array for 0x2000-0x20FF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points.

◆ plane23

const MY_UNICASE_CHARACTER plane23[]
static

Unicase array for 0x2300-0x23FF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points.

◆ plane2A

const MY_UNICASE_CHARACTER plane2A[]
static

Unicase array for 0x2A00-0x2AFF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points.

◆ plane2B

const MY_UNICASE_CHARACTER plane2B[]
static

Unicase array for 0x2B00-0x2BFF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points.

◆ plane51

const MY_UNICASE_CHARACTER plane51[]
static

Unicase array for 0x5100-0x51FF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points.

◆ plane52

const MY_UNICASE_CHARACTER plane52[]
static

Unicase array for 0x5200-0x52FF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points.

◆ planeA2

const MY_UNICASE_CHARACTER planeA2[]
static

Unicase array for 0xA200-0xA2FF All are for 2-byte code points directly.

◆ planeA3

const MY_UNICASE_CHARACTER planeA3[]
static

Unicase array for 0xA300-0xA3FF All are for 2-byte code points directly.

◆ planeA6

const MY_UNICASE_CHARACTER planeA6[]
static

Unicase array for 0xA600-0xA6FF All are for 2-byte code points directly.

◆ planeA7

const MY_UNICASE_CHARACTER planeA7[]
static

Unicase array for 0xA700-0xA7FF All are for 2-byte code points directly.

◆ planeA8

const MY_UNICASE_CHARACTER planeA8[]
static

Unicase array for 0xA800-0xA8FF All are for 2-byte code points directly.

◆ planeE6

const MY_UNICASE_CHARACTER planeE6[]
static

Unicase array for 0xE600-0xE6FF. Plus 0x20000, they're the diff (diff between code and 81308130) for 4-byte code points.

◆ sort_order_gb18030

const uint8_t sort_order_gb18030[]
static
Initial value:
= {
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B,
0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, ' ', '!', '"', '#',
'$', '%', '&', '\'', '(', ')', '*', '+', ',', '-', '.', '/',
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';',
'<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S',
'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_',
'`', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W',
'X', 'Y', 'Z', '{', '|', '}', '~', 0x7F, 0x80, 0x81, 0x82, 0x83,
0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B,
0x9C, 0x9D, 0x9E, 0x9F, 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7,
0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF, 0xB0, 0xB1, 0xB2, 0xB3,
0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB,
0xCC, 0xCD, 0xCE, 0xCF, 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, 0xE0, 0xE1, 0xE2, 0xE3,
0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB,
0xFC, 0xFD, 0xFE, 0xFF}

The array[256] used for strings comparison.

◆ tab_gb18030_2_uni

const uint16_t tab_gb18030_2_uni[]
static

Mapping table from 2-byte gb18030 to unicode including all 2-byte code points in [GB+8140, GB+FEFE], with 0 for those invalid code points.

◆ tab_gb18030_4_uni

const uint16_t tab_gb18030_4_uni[]
static

Mapping table from 4-byte gb18030 to Unicode. The values here are the diffs for 4-byte gb18030 code points, including the following ranges: [GB+81308130, GB+8130D330) (GB+8135F436, GB+8137A839) (GB+8138FD38, GB+82358F33) (GB+8336C738, GB+8336D030) (GB+84308534, GB+84309C38) (GB+84318537, GB+8431A439]. Others can be calculated algorithmically.

◆ tab_uni_gb18030_p1

const uint16_t tab_uni_gb18030_p1[]
static

Mapping table from Unicode to gb18030, part one. For Unicode in [0x80, 0x9FA6), if the leading byte is less than 0x81, the corresponding value represents the diff for 4-byte gb18030 code, otherwise, it's the corresponding 2-byte gb18030 code.

◆ tab_uni_gb18030_p2

const uint16_t tab_uni_gb18030_p2[]
static

Mapping table from Unicode to gb18030, part two. For Unicode in [0xE000, 0xE865) and (0xF92B, 0xFFFF]. The values here have the same meaning as in tab_uni_gb18030_p1.

◆ to_lower_gb18030

const uint8_t to_lower_gb18030[]
static
Initial value:
= {
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B,
0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, ' ', '!', '"', '#',
'$', '%', '&', '\'', '(', ')', '*', '+', ',', '-', '.', '/',
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';',
'<', '=', '>', '?', '@', 'a', 'b', 'c', 'd', 'e', 'f', 'g',
'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's',
't', 'u', 'v', 'w', 'x', 'y', 'z', '[', '\\', ']', '^', '_',
'`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k',
'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
'x', 'y', 'z', '{', '|', '}', '~', 0x7F, 0x80, 0x81, 0x82, 0x83,
0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B,
0x9C, 0x9D, 0x9E, 0x9F, 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7,
0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF, 0xB0, 0xB1, 0xB2, 0xB3,
0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB,
0xCC, 0xCD, 0xCE, 0xCF, 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, 0xE0, 0xE1, 0xE2, 0xE3,
0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB,
0xFC, 0xFD, 0xFE, 0xFF}

The array[256] used in casedn.

◆ to_upper_gb18030

const uint8_t to_upper_gb18030[]
static
Initial value:
= {
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B,
0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, ' ', '!', '"', '#',
'$', '%', '&', '\'', '(', ')', '*', '+', ',', '-', '.', '/',
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';',
'<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S',
'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_',
'`', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W',
'X', 'Y', 'Z', '{', '|', '}', '~', 0x7F, 0x80, 0x81, 0x82, 0x83,
0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B,
0x9C, 0x9D, 0x9E, 0x9F, 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7,
0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF, 0xB0, 0xB1, 0xB2, 0xB3,
0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB,
0xCC, 0xCD, 0xCE, 0xCF, 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, 0xE0, 0xE1, 0xE2, 0xE3,
0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB,
0xFC, 0xFD, 0xFE, 0xFF}

The array[256] used in caseup.

◆ UNI2_TO_GB4_DIFF

const unsigned UNI2_TO_GB4_DIFF = 7456
static

◆ UNICASE_4_BYTE_OFFSET

const unsigned UNICASE_4_BYTE_OFFSET = 0x80
static