MySQL  8.0.19
Source Code Documentation
str_uca_type.h
Go to the documentation of this file.
1 /* Copyright (c) 2016, 2018, Oracle and/or its affiliates. All rights reserved.
2 
3  This program is free software; you can redistribute it and/or modify
4  it under the terms of the GNU General Public License, version 2.0,
5  as published by the Free Software Foundation.
6 
7  This program is also distributed with certain software (including
8  but not limited to OpenSSL) that is licensed under separate terms,
9  as designated in a particular file or component or in included license
10  documentation. The authors of MySQL hereby grant you an additional
11  permission to link the program and your derivative works with the
12  separately licensed software that they have included with MySQL.
13 
14  This program is distributed in the hope that it will be useful,
15  but WITHOUT ANY WARRANTY; without even the implied warranty of
16  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17  GNU General Public License, version 2.0, for more details.
18 
19  You should have received a copy of the GNU General Public License
20  along with this program; if not, write to the Free Software
21  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
22 
23 /* This header file contains type declarations used by UCA code. */
24 
25 #ifndef STR_UCA_TYPE_H
26 #define STR_UCA_TYPE_H
27 
28 #include <vector>
29 
30 #include "my_inttypes.h"
31 
32 /*
33  So far only the Croatian collation needs to reorder the Latin and
34  Cyrillic groups of characters. More may be added in the future.
35 */
36 #define UCA_MAX_CHAR_GRP 4
38 
47 };
48 
52 };
53 
57 };
58 
59 struct Reorder_param {
64 };
65 
67 
68 struct Coll_param {
70  bool norm_enabled; // false = normalization off, default;
71  // true = on
73 };
74 
75 /*
76  NOTE: If you change MY_UCA_MAX_CONTRACTION, be sure to update the comment on
77  MY_UCA_CNT_MID1 in strings/uca_data.h, as it might cause us to run out of
78  bits in a byte flag.
79 */
80 #define MY_UCA_MAX_CONTRACTION 6
81 #define MY_UCA_MAX_WEIGHT_SIZE 25
82 #define MY_UCA_WEIGHT_LEVELS 1
83 
84 /*
85  We store all the contractions in a trie, indexed on the codepoints they
86  consist of. The trie is organized as:
87  1. Each node stores one code point (ch) of a contraction, and a list of
88  nodes (child_nodes) storing all possible following code points.
89  2. The vector in MY_UCA_INFO stores a list of nodes which store the first
90  code points of all contractions.
91  3. Each node has a boolean value (is_contraction_tail) which shows
92  whether the code point stored in the node is the end of a contraction.
93  This is necessary because even if one code point is the end of a
94  contraction, there might be a longer contraction that contains all the
95  code points in the path (e.g., for Hungarian, both 'DZ' and 'DZS' are
96  contractions).
97  4. A contraction is formed by all the code points in the path until the
98  end of the contraction.
99  5. If it is the end of a contraction (is_contraction_tail == true), the
100  weight of this contraction is stored in array weight.
101  6. If it is the end of a contraction (is_contraction_tail == true),
102  with_context shows whether it is a common contraction (with_context ==
103  false), or a previous-context contraction (with_context == true).
104  7. If it is the end of a contraction (is_contraction_tail == true),
105  contraction_len shows how many code points this contraction consists of.
106 */
109  // Lists of following nodes.
110  std::vector<MY_CONTRACTION> child_nodes;
111  std::vector<MY_CONTRACTION> child_nodes_context;
112 
113  // weight and with_context are only useful when is_contraction_tail is true.
114  uint16 weight[MY_UCA_MAX_WEIGHT_SIZE]; /* Its weight string, 0-terminated */
117 };
118 
119 struct MY_UCA_INFO {
121 
122  // Collation weights.
127  std::vector<MY_CONTRACTION> *contraction_nodes;
128  /*
129  contraction_flags is only used when a collation has contraction rules.
130  A UCA collation supports at least 65535 characters, but only a few of
131  them can be part of a contraction, so it would be a huge waste of time
132  to search the contraction list for every character.
133  contraction_flags points to memory which is allocated when a collation
134  has contraction rules. For a character in a contraction, its corresponding
135  byte (contraction_flags[ch & MY_UCA_CNT_FLAG_MASK]) will be set to a
136  value according to the position (head, tail or middle) of this character
137  in the contraction. This byte is used to quickly check whether a character
138  can be part of a contraction.
139  */
141 
142  /* Logical positions */
155  /*
156  extra_ce_pri_base, extra_ce_sec_base and extra_ce_ter_base are only used for
157  the UCA collations whose UCA version is not smaller than UCA_V900. For why
158  we need this extra CE, please see the comment in my_char_weight_put_900()
159  and apply_primary_shift_900().
160 
161  The value of these three variables is set by the definition of my_uca_v900.
162  The value of extra_ce_pri_base is usually 0x54A4 (which is the maximum
163  regular weight value plus one, 0x54A3 + 1 = 0x54A4). But for the Chinese
164  collation, the extra_ce_pri_base needs to change. This is because 0x54A4 has
165  already been used for reordering. There might be a weight conflict if we
166  still use 0x54A4. Please also see the comment on modify_all_zh_pages().
167  */
168  uint16 extra_ce_pri_base; // Primary weight of extra CE
169  uint16 extra_ce_sec_base; // Secondary weight of extra CE
170  uint16 extra_ce_ter_base; // Tertiary weight of extra CE
171 };
172 
173 #define MY_UCA_CNT_FLAG_SIZE 4096
174 #define MY_UCA_CNT_FLAG_MASK 4095
175 
176 /** Whether the given character can be the first in any contraction. */
177 #define MY_UCA_CNT_HEAD 1
178 
179 /** Whether the given character can be the last in any contraction. */
180 #define MY_UCA_CNT_TAIL 2
181 
182 /**
183  Whether the given character can be the second in any contraction.
184 
185  Also defined implicitly through shifting MY_UCA_CNT_MID1:
186 
187  \#define MY_UCA_CNT_MID2 8
188  \#define MY_UCA_CNT_MID3 16
189  \#define MY_UCA_CNT_MID4 32
190 
191  There's no need for MY_UCA_CNT_MID5 (which would cause us to run out of
192  bits) since MY_UCA_MAX_CONTRACTION is 6 (so head, four in the middle,
193  and then tail).
194 */
195 #define MY_UCA_CNT_MID1 4
196 
197 /**
198  Whether the given character is the first part of a context-sensitive
199  contraction. Context-sensitive contractions are like normal contractions,
200  except that for performance reasons, they trigger on the _last_ character
201  instead of the first. The case given in Unicode TR35 is that in some
202  scripts (such as katakana in Japanese), "a-" should sort as "aa"
203  (except on the tertiary level), "e-" should sort as "ee" and so on.
204  However, adding regular contractions on "a" and "e" would cause undue
205  performance loss, so instead, we add a special "context-sensitive"
206  contraction on "-" that then looks at the _previous_ character.
207 
208  We don't support context-sensitive contractions longer than two characters
209  at the moment, since none exist in CLDR. Thus, there is no
210  MY_UCA_PREVIOUS_CONTEXT_MID1 and so on.
211 */
212 #define MY_UCA_PREVIOUS_CONTEXT_HEAD 64
213 
214 /** Similar to MY_UCA_PREVIOUS_CONTEXT_HEAD, just for the tail. */
215 #define MY_UCA_PREVIOUS_CONTEXT_TAIL 128
216 
217 #define MY_UCA_PSHIFT 8
218 
219 /**
220  Check if a code point can be contraction head
221 
222  @param flags Pointer to UCA contraction flag data
223  @param wc Code point
224 
225  @retval 0 - cannot be contraction head
226  @retval 1 - can be contraction head
227 */
228 
229 inline bool my_uca_can_be_contraction_head(const char *flags, my_wc_t wc) {
231 }
232 
233 /**
234  Check if a code point can be contraction tail
235 
236  @param flags Pointer to UCA contraction flag data
237  @param wc Code point
238 
239  @retval 0 - cannot be contraction tail
240  @retval 1 - can be contraction tail
241 */
242 
243 inline bool my_uca_can_be_contraction_tail(const char *flags, my_wc_t wc) {
245 }
246 
248  const std::vector<MY_CONTRACTION> *cont_nodes, my_wc_t wc1, my_wc_t wc2);
249 #endif
UCA_V520
@ UCA_V520
Definition: str_uca_type.h:37
my_wc_t
ulong my_wc_t
Our own version of wchar_t, i.e., a type that holds a single Unicode code point ("wide character").
Definition: m_ctype.h:58
UCA_V400
@ UCA_V400
Definition: str_uca_type.h:37
MY_CONTRACTION::child_nodes_context
std::vector< MY_CONTRACTION > child_nodes_context
Definition: str_uca_type.h:111
UCA_MAX_CHAR_GRP
#define UCA_MAX_CHAR_GRP
Definition: str_uca_type.h:36
CHARGRP_ARAB
@ CHARGRP_ARAB
Definition: str_uca_type.h:44
MY_UCA_INFO::first_variable
my_wc_t first_variable
Definition: str_uca_type.h:153
my_uca_can_be_contraction_tail
bool my_uca_can_be_contraction_tail(const char *flags, my_wc_t wc)
Check if a code point can be contraction tail.
Definition: str_uca_type.h:243
MY_UCA_INFO::version
enum enum_uca_ver version
Definition: str_uca_type.h:120
enum_uca_ver
enum_uca_ver
Definition: str_uca_type.h:37
Coll_param::reorder_param
struct Reorder_param * reorder_param
Definition: str_uca_type.h:69
MY_UCA_MAX_WEIGHT_SIZE
#define MY_UCA_MAX_WEIGHT_SIZE
Definition: str_uca_type.h:81
MY_UCA_INFO::extra_ce_ter_base
uint16 extra_ce_ter_base
Definition: str_uca_type.h:170
Reorder_wt_rec::new_wt_bdy
struct Weight_boundary new_wt_bdy
Definition: str_uca_type.h:56
MY_UCA_INFO::first_tertiary_ignorable
my_wc_t first_tertiary_ignorable
Definition: str_uca_type.h:149
MY_UCA_INFO::maxchar
my_wc_t maxchar
Definition: str_uca_type.h:123
MY_UCA_INFO::contraction_nodes
std::vector< MY_CONTRACTION > * contraction_nodes
Definition: str_uca_type.h:127
MY_UCA_INFO::last_non_ignorable
my_wc_t last_non_ignorable
Definition: str_uca_type.h:144
CHARGRP_LATIN
@ CHARGRP_LATIN
Definition: str_uca_type.h:42
MY_UCA_CNT_TAIL
#define MY_UCA_CNT_TAIL
Whether the given character can be the last in any contraction.
Definition: str_uca_type.h:180
Coll_param::norm_enabled
bool norm_enabled
Definition: str_uca_type.h:70
MY_UCA_INFO::extra_ce_sec_base
uint16 extra_ce_sec_base
Definition: str_uca_type.h:169
MY_UCA_INFO::first_trailing
my_wc_t first_trailing
Definition: str_uca_type.h:151
CHARGRP_CYRILLIC
@ CHARGRP_CYRILLIC
Definition: str_uca_type.h:43
MY_UCA_INFO::have_contractions
bool have_contractions
Definition: str_uca_type.h:126
my_inttypes.h
MY_CONTRACTION::child_nodes
std::vector< MY_CONTRACTION > child_nodes
Definition: str_uca_type.h:110
CASE_FIRST_LOWER
@ CASE_FIRST_LOWER
Definition: str_uca_type.h:66
CHARGRP_CORE
@ CHARGRP_CORE
Definition: str_uca_type.h:41
MY_UCA_INFO::last_tertiary_ignorable
my_wc_t last_tertiary_ignorable
Definition: str_uca_type.h:150
MY_CONTRACTION
Definition: str_uca_type.h:107
MY_CONTRACTION::contraction_len
size_t contraction_len
Definition: str_uca_type.h:116
Coll_param::case_first
enum enum_case_first case_first
Definition: str_uca_type.h:72
MY_UCA_CNT_HEAD
#define MY_UCA_CNT_HEAD
Whether the given character can be the first in any contraction.
Definition: str_uca_type.h:177
uchar
unsigned char uchar
Definition: my_inttypes.h:51
MY_CONTRACTION::weight
uint16 weight[MY_UCA_MAX_WEIGHT_SIZE]
Definition: str_uca_type.h:114
Reorder_wt_rec::old_wt_bdy
struct Weight_boundary old_wt_bdy
Definition: str_uca_type.h:55
Reorder_param::wt_rec_num
int wt_rec_num
Definition: str_uca_type.h:62
MY_CONTRACTION::ch
my_wc_t ch
Definition: str_uca_type.h:108
Weight_boundary
Definition: str_uca_type.h:49
MY_UCA_INFO::first_primary_ignorable
my_wc_t first_primary_ignorable
Definition: str_uca_type.h:145
MY_UCA_INFO::first_secondary_ignorable
my_wc_t first_secondary_ignorable
Definition: str_uca_type.h:147
CHARGRP_KANA
@ CHARGRP_KANA
Definition: str_uca_type.h:45
CHARGRP_OTHERS
@ CHARGRP_OTHERS
Definition: str_uca_type.h:46
Reorder_wt_rec
Definition: str_uca_type.h:54
MY_CONTRACTION::is_contraction_tail
bool is_contraction_tail
Definition: str_uca_type.h:115
my_uca_contraction2_weight
const uint16 * my_uca_contraction2_weight(const std::vector< MY_CONTRACTION > *cont_nodes, my_wc_t wc1, my_wc_t wc2)
Find a contraction consisting of two code points and return its weight array.
Definition: ctype-uca.cc:941
enum_case_first
enum_case_first
Definition: str_uca_type.h:66
MY_UCA_INFO::lengths
uchar * lengths
Definition: str_uca_type.h:124
Reorder_param::wt_rec
struct Reorder_wt_rec wt_rec[2 *UCA_MAX_CHAR_GRP]
Definition: str_uca_type.h:61
CASE_FIRST_UPPER
@ CASE_FIRST_UPPER
Definition: str_uca_type.h:66
my_uca_can_be_contraction_head
bool my_uca_can_be_contraction_head(const char *flags, my_wc_t wc)
Check if a code point can be contraction head.
Definition: str_uca_type.h:229
MY_UCA_CNT_FLAG_MASK
#define MY_UCA_CNT_FLAG_MASK
Definition: str_uca_type.h:174
Weight_boundary::begin
uint16 begin
Definition: str_uca_type.h:50
Weight_boundary::end
uint16 end
Definition: str_uca_type.h:51
enum_char_grp
enum_char_grp
Definition: str_uca_type.h:39
UCA_V900
@ UCA_V900
Definition: str_uca_type.h:37
MY_UCA_INFO::last_trailing
my_wc_t last_trailing
Definition: str_uca_type.h:152
CASE_FIRST_OFF
@ CASE_FIRST_OFF
Definition: str_uca_type.h:66
Reorder_param::max_weight
uint16 max_weight
Definition: str_uca_type.h:63
Reorder_param::reorder_grp
enum enum_char_grp reorder_grp[UCA_MAX_CHAR_GRP]
Definition: str_uca_type.h:60
uint16
uint16_t uint16
Definition: my_inttypes.h:64
Reorder_param
Definition: str_uca_type.h:59
MY_UCA_INFO::last_variable
my_wc_t last_variable
Definition: str_uca_type.h:154
MY_UCA_INFO::weights
uint16 ** weights
Definition: str_uca_type.h:125
MY_UCA_INFO::first_non_ignorable
my_wc_t first_non_ignorable
Definition: str_uca_type.h:143
Coll_param
Definition: str_uca_type.h:68
MY_UCA_INFO::contraction_flags
char * contraction_flags
Definition: str_uca_type.h:140
MY_UCA_INFO::extra_ce_pri_base
uint16 extra_ce_pri_base
Definition: str_uca_type.h:168
MY_UCA_INFO
Definition: str_uca_type.h:119
MY_UCA_INFO::last_secondary_ignorable
my_wc_t last_secondary_ignorable
Definition: str_uca_type.h:148
CHARGRP_NONE
@ CHARGRP_NONE
Definition: str_uca_type.h:40
flags
static int flags[50]
Definition: hp_test1.cc:39
MY_UCA_INFO::last_primary_ignorable
my_wc_t last_primary_ignorable
Definition: str_uca_type.h:146