MySQL 8.0.39
Source Code Documentation
|
#include <assert.h>
#include <string.h>
#include <sys/types.h>
#include "m_ctype.h"
#include "my_compiler.h"
#include "my_inttypes.h"
#include "template_utils.h"
Macros | |
#define | is_mb_1(c) ((uchar)(c) <= 0x7F) |
#define | is_mb_odd(c) (0x81 <= (uchar)(c) && (uchar)(c) <= 0xFE) |
#define | is_mb_even_2(c) |
#define | is_mb_even_4(c) (0x30 <= (uchar)(c) && (uchar)(c) <= 0x39) |
Functions | |
static uint | gb18030_chs_to_code (const uchar *src, size_t srclen) |
Convert a gb18030 code in uchars to the code. The byte sequence in src must be a gb18030 sequence. More... | |
static size_t | code_to_gb18030_chs (uchar *dst, size_t dstlen, uint code) |
Convert a gb18030 code to a sequence of chars. More... | |
static uint | diff_to_gb18030_4 (uchar *dst, uint dstlen, uint diff) |
Calculate the 4-byte GB18030 code from a diff value. More... | |
static uint | gb18030_4_code_to_diff (uint code) |
Calculate the diff between the 4-byte gb18030 code and GB+81308130. More... | |
static uint | gb18030_4_chs_to_diff (const uchar *src) |
Calculate the diff between the 4-byte gb18030 code in bytes and GB+81308130. More... | |
static uint | my_ismbchar_gb18030 (const CHARSET_INFO *cs, const char *p, const char *e) |
Judge if a sequence of chars is in gb18030 multi-bytes code. More... | |
static uint | my_mbcharlen_gb18030 (const CHARSET_INFO *cs, uint c) |
Get the length of a possible gb18030 code according to its first byte or first two bytes. More... | |
static int | my_wc_mb_gb18030_chs (const CHARSET_INFO *cs, my_wc_t wc, uchar *s, uchar *e) |
Convert the Unicode code to its gb18030 code in bytes. More... | |
static int | my_mb_wc_gb18030 (const CHARSET_INFO *cs, my_wc_t *pwc, const uchar *s, const uchar *e) |
Convert a gb18030 code in bytes to unicode code. More... | |
static size_t | my_well_formed_len_gb18030 (const CHARSET_INFO *cs, const char *b, const char *e, size_t pos, int *error) |
Get the well formed length of a GB18030 string. More... | |
static const MY_UNICASE_CHARACTER * | get_case_info (const CHARSET_INFO *cs, const uchar *src, size_t srclen) |
Get the case info of one gb18030 code in bytes. More... | |
static uint | case_info_code_to_gb18030 (uint code) |
Convert the code in one MY_UNICASE_CHARACTER to real gb18030 code. More... | |
static uint | get_casefolded_code (const CHARSET_INFO *cs, const uchar *src, size_t srclen, size_t is_upper) |
Get the casefolded code of a given gb18030 code. More... | |
static size_t | my_casefold_gb18030 (const CHARSET_INFO *cs, char *src, size_t srclen, char *dst, size_t dstlen, const uchar *map, bool is_upper) |
Get the casefolded gb18030 codes of a given sequence of gb18030 codes Store the casefolded result to a specified dest. More... | |
static size_t | my_casedn_gb18030 (const CHARSET_INFO *cs, char *src, size_t srclen, char *dst, size_t dstlen) |
Convert a gb18030 string to a corresponding lower-case gb18030 string. More... | |
static size_t | my_caseup_gb18030 (const CHARSET_INFO *cs, char *src, size_t srclen, char *dst, size_t dstlen) |
Convert a gb18030 string to a corresponding up-case gb18030 string. More... | |
static size_t | my_casedn_gb18030_uca (const CHARSET_INFO *cs, char *src, size_t srclen, char *dst, size_t dstlen) |
Get the casedown gb18030 code of a given gb18030 code It's only for UCA, because the caseinfo for UCA is different. More... | |
static size_t | my_caseup_gb18030_uca (const CHARSET_INFO *cs, char *src, size_t srclen, char *dst, size_t dstlen) |
Get the caseup gb18030 code of a given gb18030 code It's only for UCA, because the caseinfo for UCA is different. More... | |
static uint | get_weight_if_chinese_character (uint code) |
Get the weight for a multi-byte gb18030 code if the code point represents a Chinese character defined in collation PINYIN in CLDR24. The result will be PINYIN_WEIGHT_BASE + a non-zero seq NO. More... | |
static uint | get_weight_for_mbchar (const CHARSET_INFO *cs, const uchar *src, size_t mblen) |
Get the weight for a multi-byte gb18030 code, we get the weight by the case up form of gb18030 code if exists. More... | |
static uint | get_weight_for_gb18030_chs (const CHARSET_INFO *cs, const char *s, size_t s_len) |
Get the weight of a given gb18030 code We can assert the code must be a valid gb18030 code. More... | |
static size_t | get_code_and_length (const CHARSET_INFO *cs, const char *s, const char *e, size_t *code) |
Get the code value and length of next code in given gb18030 string. More... | |
static int | my_strnncoll_gb18030_internal (const CHARSET_INFO *cs, const uchar **s_res, size_t s_length, const uchar **t_res, size_t t_length) |
Internal func to compare two strings according to gb18030 every gb18030 code should compare by its upper-case form. More... | |
static int | my_strnncoll_gb18030 (const CHARSET_INFO *cs, const uchar *s, size_t s_length, const uchar *t, size_t t_length, bool t_is_prefix) |
Compare two strings according to gb18030 every gb18030 code should compare by its caseup form. More... | |
static int | my_strnncollsp_gb18030 (const CHARSET_INFO *cs, const uchar *s, size_t s_length, const uchar *t, size_t t_length) |
Compare two strings according to gb18030, but ignore trailing spaces every gb18030 code should compare by its caseup form. More... | |
static size_t | my_strnxfrm_gb18030 (const CHARSET_INFO *cs, uchar *dst, size_t dstlen, uint nweights, const uchar *src, size_t srclen, uint flags) |
Make a sort key suitable for memcmp() corresponding to gb18030 Sort according to UPPER() for non-Chinese chars, and PINYIN for Chinese chars. More... | |
static int | my_strcasecmp_gb18030 (const CHARSET_INFO *cs, const char *s, const char *t) |
Compare 0-terminated gb18030 strings. More... | |
static uint | unicode_to_gb18030_code (const CHARSET_INFO *cs, int unicode) |
Convert a Unicode code to gb18030 code. More... | |
static int | my_wildcmp_gb18030_impl (const CHARSET_INFO *cs, const char *str, const char *str_end, const char *wildstr, const char *wildend, uint escape, uint w_one, uint w_many, int recurse_level) |
Compare string against string with wildcard. More... | |
static int | my_wildcmp_gb18030 (const CHARSET_INFO *cs, const char *str, const char *str_end, const char *wildstr, const char *wildend, int escape, int w_one, int w_many) |
Compare string against string with wildcard. More... | |
static void | my_hash_sort_gb18030 (const CHARSET_INFO *cs, const uchar *s, size_t slen, uint64 *n1, uint64 *n2) |
Calculate hash value for given gb18030 string. More... | |
Variables | |
static const uint | MIN_MB_ODD_BYTE = 0x81 |
static const uint | MIN_MB_EVEN_BYTE_2 = 0x40 |
static const uint | MIN_MB_EVEN_BYTE_4 = 0x30 |
static const uint | MAX_GB18030_DIFF = 0x18398F |
static const uint | UNI2_TO_GB4_DIFF = 7456 |
static const uint | UNICASE_4_BYTE_OFFSET = 0x80 |
static const uint | MIN_2_BYTE_UNICASE = 0xA000 |
static const uint | MAX_2_BYTE_UNICASE = 0xDFFF |
static const uint | MIN_3_BYTE_FROM_UNI = 0x2E600 |
static const uint | MAX_3_BYTE_FROM_UNI = 0x2E6FF |
static const uint | PINYIN_2_BYTE_START = 0x8140 |
static const uint | PINYIN_2_BYTE_END = 0xFE9F |
static const uint | PINYIN_4_BYTE_1_START = 0x8138FD38 |
static const uint | PINYIN_4_BYTE_1_END = 0x82359232 |
static const uint | PINYIN_4_1_DIFF = 11328 |
static const uint | PINYIN_4_BYTE_2_START = 0x95328236 |
static const uint | PINYIN_4_BYTE_2_END = 0x98399836 |
static const uint | PINYIN_4_2_DIFF = 254536 |
static const uint | PINYIN_WEIGHT_BASE = 0xFFA00000 |
static const uint | COMMON_WEIGHT_BASE = 0xFF000000 |
static const uchar | ctype_gb18030 [257] |
The array used for "type of characters" bit mask for each character. More... | |
static const uchar | to_lower_gb18030 [] |
The array[256] used in casedn. More... | |
static const uchar | to_upper_gb18030 [] |
The array[256] used in caseup. More... | |
static const uchar | sort_order_gb18030 [] |
The array[256] used for strings comparison. More... | |
static const MY_UNICASE_CHARACTER | plane00 [] |
Unicase array for 0x0000-0x00FF. More... | |
static const MY_UNICASE_CHARACTER | plane01 [] |
Unicase array for 0x0100-0x01FF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points. More... | |
static const MY_UNICASE_CHARACTER | plane02 [] |
Unicase array for 0x0200-0x02FF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points. More... | |
static const MY_UNICASE_CHARACTER | plane03 [] |
Unicase array for 0x0300-0x03FF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points. More... | |
static const MY_UNICASE_CHARACTER | plane04 [] |
Unicase array for 0x0400-0x04FF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points. More... | |
static const MY_UNICASE_CHARACTER | plane10 [] |
Unicase array for 0x1000-0x10FF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points. More... | |
static const MY_UNICASE_CHARACTER | plane1D [] |
Unicase array for 0x1D00-0x1DFF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points. More... | |
static const MY_UNICASE_CHARACTER | plane1E [] |
Unicase array for 0x1E00-0x1EFF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points. More... | |
static const MY_UNICASE_CHARACTER | plane1F [] |
Unicase array for 0x1F00-0x1FFF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points. More... | |
static const MY_UNICASE_CHARACTER | plane20 [] |
Unicase array for 0x2000-0x20FF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points. More... | |
static const MY_UNICASE_CHARACTER | plane23 [] |
Unicase array for 0x2300-0x23FF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points. More... | |
static const MY_UNICASE_CHARACTER | plane2A [] |
Unicase array for 0x2A00-0x2AFF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points. More... | |
static const MY_UNICASE_CHARACTER | plane2B [] |
Unicase array for 0x2B00-0x2BFF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points. More... | |
static const MY_UNICASE_CHARACTER | plane51 [] |
Unicase array for 0x5100-0x51FF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points. More... | |
static const MY_UNICASE_CHARACTER | plane52 [] |
Unicase array for 0x5200-0x52FF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points. More... | |
static const MY_UNICASE_CHARACTER | planeA2 [] |
Unicase array for 0xA200-0xA2FF All are for 2-byte code points directly. More... | |
static const MY_UNICASE_CHARACTER | planeA3 [] |
Unicase array for 0xA300-0xA3FF All are for 2-byte code points directly. More... | |
static const MY_UNICASE_CHARACTER | planeA6 [] |
Unicase array for 0xA600-0xA6FF All are for 2-byte code points directly. More... | |
static const MY_UNICASE_CHARACTER | planeA7 [] |
Unicase array for 0xA700-0xA7FF All are for 2-byte code points directly. More... | |
static const MY_UNICASE_CHARACTER | planeA8 [] |
Unicase array for 0xA800-0xA8FF All are for 2-byte code points directly. More... | |
static const MY_UNICASE_CHARACTER | planeE6 [] |
Unicase array for 0xE600-0xE6FF Plus 0x20000, they're the diff(diff between code and 81308130) for 4-byte code points. More... | |
static const MY_UNICASE_CHARACTER * | my_caseinfo_pages_gb18030 [256] |
The UNICASE array. More... | |
static MY_UNICASE_INFO | my_caseinfo_gb18030 |
UNICASE INFO. More... | |
static const uint16 | tab_gb18030_2_uni [] |
Mapping table from 2-byte gb18030 to unicode including all 2-byte code points in [GB+8140, GB+FEFE], with 0 for those invalid code points. More... | |
static const uint16 | tab_gb18030_4_uni [] |
Mapping table from 4-byte gb18030 to Unicode The values here are the diffs for 4-byte gb18030 code points including following ranges: [GB+81308130, GB+8130D330) (GB+8135F436, GB+8137A839) (GB+8138FD38, GB+82358F33) (GB+8336C738, GB+8336D030) (GB+84308534, GB+84309C38) (GB+84318537, GB+8431A439] Others can be calculated algorithmically. More... | |
static const uint16 | tab_uni_gb18030_p1 [] |
Mapping table from Unicode to gb18030, part one For Unicode in [0x80, 0x9FA6), if the leading byte is less than 0x81, the corresponding value represents the diff for 4-byte gb18030 code, otherwise, it's the corresponding 2-byte gb18030 code. More... | |
static const uint16 | tab_uni_gb18030_p2 [] |
Mapping table from Unicode to gb18030, part two For Unicode in [0xE000, 0xE865) and (0xF92B, 0xFFFF] The values here have the same meaning with tab_uni_gb18030_p1. More... | |
static const uint16 | gb18030_2_weight_py [] |
The following 3 Chinese character weight arrays are based upon the PINYIN collation in zh.xml file of CLDR24(http://cldr.unicode.org/) (http://unicode.org/Public/cldr/24/core.zip) More... | |
static const uint16 | gb18030_4_weight_py_p1 [] |
Weight array for one range of 4-byte gb18030 code points, which is [GB+8138FE38, GB+82359232]. More... | |
static const uint16 | gb18030_4_weight_py_p2 [] |
Weight array for the other range of 4-byte gb18030 code points, which is [GB+95328236, GB+98399836]. More... | |
static MY_COLLATION_HANDLER | my_collation_ci_handler |
static MY_CHARSET_HANDLER | my_charset_gb18030_handler |
MY_CHARSET_HANDLER | my_charset_gb18030_uca_handler |
CHARSET_INFO | my_charset_gb18030_chinese_ci |
CHARSET_INFO | my_charset_gb18030_bin |
#define is_mb_1 | ( | c | ) | ((uchar)(c) <= 0x7F) |
#define is_mb_even_2 | ( | c | ) |
Convert the code in one MY_UNICASE_CHARACTER to real gb18030 code.
[in] | code | code in one MY_UNICASE_CHARACTER |
Convert a gb18030 code to a sequence of chars.
If the code number is too large to store, the trailing bytes will be cut off.
[out] | dst | dest to store the result |
[in] | dstlen | valid length of dest |
[in] | code | gb18030 code |
Calculate the 4-byte GB18030 code from a diff value.
[out] | dst | dest to store the gb18030 code in bytes |
[in] | dstlen | valid length of dest |
[in] | diff | the diff between gb18030 code and GB+81308130 |
4 | if the diff is a valid value and there is enough space in dst; 0 otherwise |
Calculate the diff between the 4-byte gb18030 code in bytes and GB+81308130.
[in] | src | 4-byte gb18030 code in bytes |
Calculate the diff between the 4-byte gb18030 code and GB+81308130.
[in] | code | 4-byte gb18030 code |
Convert a gb18030 code in uchars to the code. The byte sequence in src must be a gb18030 sequence.
[in] | src | gb18030 code in uchars |
[in] | srclen | length of valid chars, should be 1, 2 or 4 only |
|
static |
Get the case info of one gb18030 code in bytes.
[in] | cs | charset |
[in] | src | start byte of gb18030 code |
[in] | srclen | length in bytes of gb18030 code |
|
static |
Get the casefolded code of a given gb18030 code.
[in] | cs | charset |
[in] | src | start byte of gb18030 code |
[in] | srclen | length of gb18030 code in bytes |
[in] | is_upper | true if a capital letter is wanted, false if a small letter is required |
the | gb18030 code according to is_upper; 0 if no upper-case or lower-case exists |
|
static |
Get the code value and length of next code in given gb18030 string.
[in] | cs | charset |
[in] | s | string |
[in] | e | end of string |
[out] | code | the code value of the next code, be valid when retval>0 |
the | length of the next code if the code is valid; 0 if the given string is empty or the code is invalid |
|
static |
Get the weight of a given gb18030 code. We can assert the code must be a valid gb18030 code.
[in] | cs | charset |
[in] | s | code chars |
[in] | s_len | length of the code |
|
static |
Get the weight for a multi-byte gb18030 code, we get the weight by the case up form of gb18030 code if exists.
For any Chinese character, which has non-zero seq NO. defined in gb18030_2_weight_py/gb18030_4_weight_py_p1/gb18030_4_weight_py_p2 according to its gb18030 code, the final weight shall be 0xFFA00000+(seq No.)
For any non-Chinese gb18030 character C, let C'=C or UPPER(C) if exists. So C' would be 2 bytes or 4 bytes.
The weight of the maximum code point GB+FE39FE39 is defined as 0xFFFFFFFF
[in] | cs | charset |
[in] | src | the first byte of multi-byte gb18030 code |
[in] | mblen | the length of multi-bytes gb18030 code |
Get the weight for a multi-byte gb18030 code if the code point represents a Chinese character defined in collation PINYIN in CLDR24.
The result will be PINYIN_WEIGHT_BASE + a non-zero seq NO. for the code;
if not, the result will be PINYIN_WEIGHT_BASE.
[in] | code | the multi-byte gb18030 code |
a | non-zero weight if it's a Chinese character with PINYIN, which shall be PINYIN_WEIGHT_BASE + non-zero seq NO.; otherwise, PINYIN_WEIGHT_BASE |
|
static |
Convert a gb18030 string to a corresponding lower-case gb18030 string.
[in] | cs | charset |
[in] | src | start byte of given gb18030 code |
[in] | srclen | length of given gb18030 code |
[out] | dst | start byte of casedown gb18030 code |
[in] | dstlen | length of the result array |
|
static |
Get the casedown gb18030 code of a given gb18030 code. It's only for UCA, because the caseinfo for UCA is different.
[in] | cs | charset |
[in] | src | start byte of given gb18030 code |
[in] | srclen | length of given gb18030 code |
[out] | dst | start byte of casedown gb18030 code |
[in] | dstlen | length of the result array |
|
static |
Get the casefolded gb18030 codes of a given sequence of gb18030 codes. Store the casefolded result to a specified dest.
[in] | cs | charset |
[in] | src | start byte of given sequence of gb18030 codes |
[in] | srclen | length of the sequence |
[out] | dst | start byte of the output of casefolded codes |
[in] | dstlen | length of the result array |
[in] | map | the LOWER map or the UPPER map of gb18030 |
[in] | is_upper | true if a capital letter is wanted, false if a small letter is required |
|
static |
Convert a gb18030 string to a corresponding up-case gb18030 string.
[in] | cs | charset |
[in] | src | start byte of given gb18030 code |
[in] | srclen | length of given gb18030 code |
[out] | dst | start byte of caseup gb18030 code |
[in] | dstlen | length of the result array |
|
static |
Get the caseup gb18030 code of a given gb18030 code. It's only for UCA, because the caseinfo for UCA is different.
[in] | cs | charset |
[in] | src | start byte of given gb18030 code |
[in] | srclen | length of given gb18030 code |
[out] | dst | start byte of caseup gb18030 code |
[in] | dstlen | length of the result array |
|
static |
Calculate hash value for given gb18030 string.
[in] | cs | charset |
[in] | s | string |
[in] | slen | the length of string |
[in,out] | n1 | n1 |
[in,out] | n2 | n2 |
|
static |
Judge if a sequence of chars is in gb18030 multi-bytes code.
[in] | cs | charset |
[in] | p | start of gb18030 code |
[in] | e | end of gb18030 code |
2 | or 4 if it is gb18030 multi-byte code, 0 if not |
|
static |
Convert a gb18030 code in bytes to unicode code.
[in] | cs | charset |
[out] | pwc | unicode code |
[in] | s | start of gb18030 code |
[in] | e | end of gb18030 code |
1) | the length of converted gb18030 code if convertible; 2) MY_CS_TOOSMALL..MY_CS_TOOSMALL4 if the gb18030 code is too short; 3) MY_CS_ILSEQ if gb18030 code is wrong by sequence |
|
static |
Get the length of a possible gb18030 code according to its first byte or first two bytes.
[in] | cs | charset |
[in] | c | first byte or first two bytes of the code |
1/2/4 | accordingly if the leading byte(s) indicate the code would be gb18030, otherwise 0 |
|
static |
Compare 0-terminated gb18030 strings.
[in] | cs | charset |
[in] | s | first 0-terminated string to compare |
[in] | t | second 0-terminated string to compare |
negative | number if s < t; positive number if s > t; 0 if the strings are equal |
|
static |
Compare two strings according to gb18030. Every gb18030 code should compare by its caseup form.
[in] | cs | charset |
[in] | s | start of the first string |
[in] | s_length | length of the first string |
[in] | t | start of the second string |
[in] | t_length | length of the second string |
[in] | t_is_prefix | true if t is prefix, otherwise false |
0 | if two strings are equal; 1 if the first string is bigger; -1 if the second string is bigger |
|
static |
Internal func to compare two strings according to gb18030. Every gb18030 code should compare by its upper-case form.
[in] | cs | charset |
[in,out] | s_res | pointer to the start byte of first gb18030 code string, return the first byte unchecked |
[in] | s_length | length of the first string |
[in,out] | t_res | pointer to the start byte of second gb18030 code string, return the first byte unchecked |
[in] | t_length | length of the second string |
0 | if the strings are equal >0 if the first string is bigger <0 if the second string is bigger |
|
static |
Compare two strings according to gb18030, but ignore trailing spaces. Every gb18030 code should compare by its caseup form.
[in] | cs | charset |
[in] | s | start of the first string |
[in] | s_length | length of the first string |
[in] | t | start of the second string |
[in] | t_length | length of the second string |
0 | if two strings are equal; 1 if the first string is bigger; -1 if the second string is bigger |
|
static |
Make a sort key suitable for memcmp() corresponding to gb18030. Sort according to UPPER() for non-Chinese chars, and PINYIN for Chinese chars.
[in] | cs | charset |
[out] | dst | first byte of the result array |
[in] | dstlen | the length of the result array |
[in] | nweights | how many gb18030 codes we care about |
[in] | src | first byte of the given gb18030 code |
[in] | srclen | length of the given gb18030 code |
[in] | flags | flags for strxfrm |
|
static |
Convert the Unicode code to its gb18030 code in bytes.
[in] | cs | charset |
[in] | wc | Unicode code |
[out] | s | start of gb18030 code output |
[out] | e | end of gb18030 code output |
1) | the length of gb18030 code(1/2/4) if convertible, 2) MY_CS_TOOSMALL..MY_CS_TOOSMALL4 if the output space is too small 3) MY_CS_ILUNI if we can't encode unicode to gb18030 |
|
static |
Get the well formed length of a GB18030 string.
[in] | cs | charset |
[in] | b | start of gb18030 code |
[in] | e | end of gb18030 code |
[in] | pos | max chars we care about |
[out] | error | 0 if every gb18030 code we get is correct, otherwise 1 |
|
static |
Compare string against string with wildcard.
[in] | cs | charset |
[in] | str | string |
[in] | str_end | end of the string |
[in] | wildstr | string with wildcard |
[in] | wildend | end of the string with wildcard |
[in] | escape | escape char, which is a Unicode code |
[in] | w_one | wild one char, which is a Unicode code |
[in] | w_many | wild many char, which is a Unicode code |
|
static |
Compare string against string with wildcard.
[in] | cs | charset |
[in] | str | string |
[in] | str_end | end of the string |
[in] | wildstr | string with wildcard |
[in] | wildend | end of the string with wildcard |
[in] | escape | escape char, which is a gb18030 code |
[in] | w_one | wild one char, which is a gb18030 code |
[in] | w_many | wild many char, which is a gb18030 code |
[in] | recurse_level | current recurse level to do wild card |
|
static |
Convert a Unicode code to gb18030 code.
[in] | cs | charset |
[in] | unicode | unicode code |
|
static |
|
static |
The array used for "type of characters" bit mask for each character.
The ctype[0] is reserved for EOF(-1), so we use ctype[(char)+1]. Also refer to strings/CHARSET_INFO.txt
|
static |
The following 3 Chinese character weight arrays are based upon the PINYIN collation in zh.xml file of CLDR24(http://cldr.unicode.org/) (http://unicode.org/Public/cldr/24/core.zip)
Please note that we currently support only those Chinese characters with PINYIN in zh.xml; we support neither those CJK characters whose category defined in Unicode is Symbol, with the same shape as Chinese characters, nor the PINYIN characters.
The 3 arrays include the sort order of the code points accordingly. The weight here just means the order of the corresponding gb18030 code point. For example: currently U+963F(GB+B0A2) is the first character in the PINYIN collation, while U+5475(GB+BAC7) is the second, and so on. So the weight of U+963F is 1 and U+5475 is 2, and GB+8140 is the 15308th character, which has the value of 15308 in the first array.
The weights range from 1 to 41309 and there are no duplicate weights in the collation, so the simplified Chinese is not equal to the corresponding traditional Chinese.
In total, 41309 Chinese characters are taken into account in these arrays.
The weight of each code point shall be the corresponding weight in these arrays plus PINYIN_WEIGHT_BASE. Weight array for those 2-byte gb18030 code points in the range [GB+8140, GB+FE9F]. If it's not a Chinese char, the weight is 0.
|
static |
Weight array for one range of 4-byte gb18030 code points, which is [GB+8138FE38, GB+82359232].
The sequence NO. of this array is the diff between the code point and GB+8138FE38. The weights would be 0 for non-Chinese chars.
|
static |
Weight array for the other range of 4-byte gb18030 code points, which is [GB+95328236, GB+98399836].
The sequence NO. of this array is the diff between the code point and GB+95328236. The weights would be 0 for non-Chinese chars. This range maps from Unicode [U+20000, U+2FFFF], which is CJK extension.
|
static |
|
static |
|
static |
|
static |
|
static |
|
static |
|
static |
|
static |
|
static |
UNICASE INFO.
|
static |
The UNICASE array.
CHARSET_INFO my_charset_gb18030_bin |
CHARSET_INFO my_charset_gb18030_chinese_ci |
|
static |
MY_CHARSET_HANDLER my_charset_gb18030_uca_handler |
|
static |
|
static |
|
static |
|
static |
|
static |
|
static |
|
static |
|
static |
|
static |
|
static |
|
static |
Unicase array for 0x0000-0x00FF.
0x0000-0x007F is for 1-byte code points, the others which represent the diff (diff between code and 81308130, plus 0x80), are for 4-byte code points
|
static |
Unicase array for 0x0100-0x01FF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points.
|
static |
Unicase array for 0x0200-0x02FF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points.
|
static |
Unicase array for 0x0300-0x03FF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points.
|
static |
Unicase array for 0x0400-0x04FF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points.
|
static |
Unicase array for 0x1000-0x10FF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points.
|
static |
Unicase array for 0x1D00-0x1DFF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points.
|
static |
Unicase array for 0x1E00-0x1EFF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points.
|
static |
Unicase array for 0x1F00-0x1FFF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points.
|
static |
Unicase array for 0x2000-0x20FF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points.
|
static |
Unicase array for 0x2300-0x23FF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points.
|
static |
Unicase array for 0x2A00-0x2AFF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points.
|
static |
Unicase array for 0x2B00-0x2BFF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points.
|
static |
Unicase array for 0x5100-0x51FF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points.
|
static |
Unicase array for 0x5200-0x52FF All are the diff(diff between code and 81308130, plus 0x80) for 4-byte code points.
|
static |
Unicase array for 0xA200-0xA2FF All are for 2-byte code points directly.
|
static |
Unicase array for 0xA300-0xA3FF All are for 2-byte code points directly.
|
static |
Unicase array for 0xA600-0xA6FF All are for 2-byte code points directly.
|
static |
Unicase array for 0xA700-0xA7FF All are for 2-byte code points directly.
|
static |
Unicase array for 0xA800-0xA8FF All are for 2-byte code points directly.
|
static |
Unicase array for 0xE600-0xE6FF Plus 0x20000, they're the diff(diff between code and 81308130) for 4-byte code points.
|
static |
The array[256] used for strings comparison.
|
static |
Mapping table from 2-byte gb18030 to unicode including all 2-byte code points in [GB+8140, GB+FEFE], with 0 for those invalid code points.
|
static |
Mapping table from 4-byte gb18030 to Unicode The values here are the diffs for 4-byte gb18030 code points including following ranges: [GB+81308130, GB+8130D330) (GB+8135F436, GB+8137A839) (GB+8138FD38, GB+82358F33) (GB+8336C738, GB+8336D030) (GB+84308534, GB+84309C38) (GB+84318537, GB+8431A439] Others can be calculated algorithmically.
|
static |
Mapping table from Unicode to gb18030, part one For Unicode in [0x80, 0x9FA6), if the leading byte is less than 0x81, the corresponding value represents the diff for 4-byte gb18030 code, otherwise, it's the corresponding 2-byte gb18030 code.
|
static |
Mapping table from Unicode to gb18030, part two For Unicode in [0xE000, 0xE865) and (0xF92B, 0xFFFF] The values here have the same meaning with tab_uni_gb18030_p1.
|
static |
The array[256] used in casedn.
|
static |
The array[256] used in caseup.
|
static |
|
static |