MySQL 8.4.0
Source Code Documentation
str_uca_type.h
Go to the documentation of this file.
1/* Copyright (c) 2016, 2024, Oracle and/or its affiliates.
2
3 This program is free software; you can redistribute it and/or modify
4 it under the terms of the GNU General Public License, version 2.0,
5 as published by the Free Software Foundation.
6
7 This program is designed to work with certain software (including
8 but not limited to OpenSSL) that is licensed under separate terms,
9 as designated in a particular file or component or in included license
10 documentation. The authors of MySQL hereby grant you an additional
11 permission to link the program and your derivative works with the
12 separately licensed software that they have either included with
13 the program or referenced in the documentation.
14
15 This program is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License, version 2.0, for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with this program; if not, write to the Free Software
22 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
23
24/* This header file contains type declarations used by UCA code. */
25
26#ifndef STR_UCA_TYPE_H
27#define STR_UCA_TYPE_H
28
29#include <array>
30#include <cstdint>
31#include <vector>
32
34
35constexpr int MY_UCA_CNT_FLAG_SIZE = 4096;
37
38/*
39 So far, only the Croatian collation needs to reorder the Latin and
40 Cyrillic groups of characters. More may be added in the future.
41*/
42#define UCA_MAX_CHAR_GRP 4
44
53};
54
56 uint16_t begin;
57 uint16_t end;
58};
59
63};
64
69 uint16_t max_weight;
70};
71
73
74struct Coll_param {
76 bool norm_enabled; // false = normalization off, default;
77 // true = on
79};
80
81/*
82 NOTE: If you change MY_UCA_MAX_CONTRACTION, be sure to update the comment on
83 MY_UCA_CNT_MID1 in strings/uca_data.h, as it might cause us to run out of
84 bits in a byte flag.
85*/
86#define MY_UCA_MAX_CONTRACTION 6
87#define MY_UCA_MAX_WEIGHT_SIZE 25
88#define MY_UCA_WEIGHT_LEVELS 1
89
90/*
91 We store all the contractions in a trie, indexed on the codepoints they
92 consist of. The trie is organized as:
93 1. Each node stores one code point (ch) of contraction, and a list of nodes
94 (child_nodes) store all possible following code points.
95 2. The vector in MY_UCA_INFO stores a list of nodes which store the first
96 code points of all contractions.
97 3. Each node has a boolean value (is_contraction_tail) which shows
98 whether the code point stored in the node is the end of a contraction.
99 This is necessary because even if one code point is the end of a
100 contraction, there might be a longer contraction that contains all
101 the code points in the path (e.g., for Hungarian, both 'DZ' and 'DZS'
102 are contractions).
103 4. A contraction is formed by all the code points in the path until the
104 end of the contraction.
105 5. If it is the end of a contraction (is_contraction_tail == true), the
106 weight of this contraction is stored in array weight.
107 6. If it is the end of a contraction (is_contraction_tail == true),
108 with_context shows whether it is a common contraction (with_context ==
109 false) or a previous-context contraction (with_context == true).
110 7. If it is the end of a contraction (is_contraction_tail == true),
111 contraction_len shows how many code points this contraction consists of.
112*/
115 // Lists of following nodes.
116 std::vector<MY_CONTRACTION> child_nodes;
117 std::vector<MY_CONTRACTION> child_nodes_context;
118
119 // weight and with_context are only useful when is_contraction_tail is true.
120 uint16_t weight[MY_UCA_MAX_WEIGHT_SIZE]; /* Its weight string, 0-terminated */
123};
124
128
129 // Collation weights.
131
132 uint8_t *lengths{nullptr};
133 std::vector<uint8_t> *m_allocated_weights{nullptr};
134 uint16_t **weights{nullptr};
135
136 bool have_contractions{false};
137 std::vector<MY_CONTRACTION> *contraction_nodes{nullptr};
138 /*
139 contraction_flags is only used when a collation has contraction rules.
140 A UCA collation supports at least 65535 characters, but only a few of
141 them can be part of a contraction, so it would be a huge waste of time
142 to search the contraction list for every character.
143 contraction_flags points to memory which is allocated when a collation
144 has contraction rules. For a character in a contraction, its
145 corresponding byte (contraction_flags[ch & MY_UCA_CNT_FLAG_MASK]) will
146 be set to a certain value according to the position (head, tail or
147 middle) of this character in the contraction. This byte is used to
148 quickly check whether a character can be part of a contraction.
149 */
150 using flags_type = std::array<char, MY_UCA_CNT_FLAG_SIZE>;
152
153 /* Logical positions */
166 /*
167 extra_ce_pri_base, extra_ce_sec_base and extra_ce_ter_base are only used for
168 the UCA collations whose UCA version is not smaller than UCA_V900. For why
169 we need this extra CE, please see the comment in my_char_weight_put_900()
170 and apply_primary_shift_900().
171
172 The value of these three variables is set by the definition of my_uca_v900.
173 The value of extra_ce_pri_base is usually 0x54A4 (which is the maximum
174 regular weight value plus one, 0x54A3 + 1 = 0x54A4). But for the Chinese
175 collation, extra_ce_pri_base needs to change. This is because 0x54A4 is
176 already occupied by reordering. There might be a weight conflict if we
177 still used 0x54A4. Please also see the comment on modify_all_zh_pages().
178 */
179 uint16_t extra_ce_pri_base{0}; // Primary weight of extra CE
180 uint16_t extra_ce_sec_base{0}; // Secondary weight of extra CE
181 uint16_t extra_ce_ter_base{0}; // Tertiary weight of extra CE
182};
183
184/** Whether the given character can be the first in any contraction. */
185#define MY_UCA_CNT_HEAD 1
186
187/** Whether the given character can be the last in any contraction. */
188#define MY_UCA_CNT_TAIL 2
189
190/**
191 Whether the given character can be the second in any contraction.
192
193 Also defined implicitly through shifting MY_UCA_CNT_MID1:
194
195 \#define MY_UCA_CNT_MID2 8
196 \#define MY_UCA_CNT_MID3 16
197 \#define MY_UCA_CNT_MID4 32
198
199 There's no need for MY_UCA_CNT_MID5 (which would cause us to run out of
200 bits) since MY_UCA_MAX_CONTRACTION is 6 (so head, four in the middle,
201 and then tail).
202*/
203#define MY_UCA_CNT_MID1 4
204
205/**
206 Whether the given character is the first part of a context-sensitive
207 contraction. Context-sensitive contractions are like normal contractions,
208 except that for performance reasons, they trigger on the _last_ character
209 instead of the first. The case given in Unicode TR35 is that in some
210 scripts (such as katakana in Japanese), "a-" should sort as "aa"
211 (except on the tertiary level), "e-" should sort as "ee" and so on.
212 However, adding regular contractions on "a" and "e" would cause undue
213 performance loss, so instead, we add a special "context-sensitive"
214 contraction on "-" that then looks at the _previous_ character.
215
216 We don't support context-sensitive contractions longer than two characters
217 at the moment, since none exist in CLDR. Thus, there is no
218 MY_UCA_PREVIOUS_CONTEXT_MID1 and so on.
219*/
220#define MY_UCA_PREVIOUS_CONTEXT_HEAD 64
221
222/** Similar to MY_UCA_PREVIOUS_CONTEXT_HEAD, just for the tail. */
223#define MY_UCA_PREVIOUS_CONTEXT_TAIL 128
224
225#define MY_UCA_PSHIFT 8
226
227/**
228 Check if a code point can be a contraction head.
229
230 @param flags Pointer to UCA contraction flag data
231 @param wc Code point
232
233 @retval false - cannot be a contraction head
234 @retval true - can be a contraction head
235*/
236
238 my_wc_t wc) {
240}
241
242/**
243 Check if a code point can be a contraction tail.
244
245 @param flags Pointer to UCA contraction flag data
246 @param wc Code point
247
248 @retval false - cannot be a contraction tail
249 @retval true - can be a contraction tail
250*/
251
253 my_wc_t wc) {
255}
256
257const uint16_t *my_uca_contraction2_weight(
258 const std::vector<MY_CONTRACTION> *cont_nodes, my_wc_t wc1, my_wc_t wc2);
259#endif
static int flags[50]
Definition: hp_test1.cc:40
A better implementation of the UNIX ctype(3) library.
unsigned long my_wc_t
Our own version of wchar_t, ie., a type that holds a single Unicode code point ("wide character").
Definition: m_ctype.h:57
enum_uca_ver
Definition: str_uca_type.h:43
@ UCA_V520
Definition: str_uca_type.h:43
@ UCA_V400
Definition: str_uca_type.h:43
@ UCA_V900
Definition: str_uca_type.h:43
#define MY_UCA_CNT_TAIL
Whether the given character can be the last in any contraction.
Definition: str_uca_type.h:188
enum_char_grp
Definition: str_uca_type.h:45
@ CHARGRP_NONE
Definition: str_uca_type.h:46
@ CHARGRP_ARAB
Definition: str_uca_type.h:50
@ CHARGRP_LATIN
Definition: str_uca_type.h:48
@ CHARGRP_CYRILLIC
Definition: str_uca_type.h:49
@ CHARGRP_KANA
Definition: str_uca_type.h:51
@ CHARGRP_CORE
Definition: str_uca_type.h:47
@ CHARGRP_OTHERS
Definition: str_uca_type.h:52
const uint16_t * my_uca_contraction2_weight(const std::vector< MY_CONTRACTION > *cont_nodes, my_wc_t wc1, my_wc_t wc2)
Find a contraction consisting of two code points and return its weight array.
Definition: ctype-uca.cc:953
constexpr my_wc_t MY_UCA_CNT_FLAG_MASK
Definition: str_uca_type.h:36
constexpr int MY_UCA_CNT_FLAG_SIZE
Definition: str_uca_type.h:35
#define MY_UCA_MAX_WEIGHT_SIZE
Definition: str_uca_type.h:87
#define UCA_MAX_CHAR_GRP
Definition: str_uca_type.h:42
#define MY_UCA_CNT_HEAD
Whether the given character can be the first in any contraction.
Definition: str_uca_type.h:185
bool my_uca_can_be_contraction_tail(const MY_UCA_INFO::flags_type *flags, my_wc_t wc)
Check if a code point can be contraction tail.
Definition: str_uca_type.h:252
enum_case_first
Definition: str_uca_type.h:72
@ CASE_FIRST_UPPER
Definition: str_uca_type.h:72
@ CASE_FIRST_LOWER
Definition: str_uca_type.h:72
@ CASE_FIRST_OFF
Definition: str_uca_type.h:72
bool my_uca_can_be_contraction_head(const MY_UCA_INFO::flags_type *flags, my_wc_t wc)
Check if a code point can be contraction head.
Definition: str_uca_type.h:237
Definition: str_uca_type.h:74
struct Reorder_param * reorder_param
Definition: str_uca_type.h:75
enum enum_case_first case_first
Definition: str_uca_type.h:78
bool norm_enabled
Definition: str_uca_type.h:76
Definition: str_uca_type.h:113
std::vector< MY_CONTRACTION > child_nodes_context
Definition: str_uca_type.h:117
std::vector< MY_CONTRACTION > child_nodes
Definition: str_uca_type.h:116
uint16_t weight[MY_UCA_MAX_WEIGHT_SIZE]
Definition: str_uca_type.h:120
my_wc_t ch
Definition: str_uca_type.h:114
size_t contraction_len
Definition: str_uca_type.h:122
bool is_contraction_tail
Definition: str_uca_type.h:121
Definition: str_uca_type.h:125
my_wc_t first_non_ignorable
Definition: str_uca_type.h:154
uint16_t extra_ce_pri_base
Definition: str_uca_type.h:179
uint16_t ** weights
Definition: str_uca_type.h:134
std::vector< uint8_t > * m_allocated_weights
Definition: str_uca_type.h:133
my_wc_t last_tertiary_ignorable
Definition: str_uca_type.h:161
enum_uca_ver version
Definition: str_uca_type.h:126
flags_type * contraction_flags
Definition: str_uca_type.h:151
my_wc_t last_secondary_ignorable
Definition: str_uca_type.h:159
uint8_t * lengths
Definition: str_uca_type.h:132
my_wc_t maxchar
Definition: str_uca_type.h:130
my_wc_t last_primary_ignorable
Definition: str_uca_type.h:157
uint16_t extra_ce_sec_base
Definition: str_uca_type.h:180
bool have_contractions
Definition: str_uca_type.h:136
my_wc_t first_variable
Definition: str_uca_type.h:164
MY_UCA_INFO * m_based_on
Definition: str_uca_type.h:127
my_wc_t first_tertiary_ignorable
Definition: str_uca_type.h:160
my_wc_t last_trailing
Definition: str_uca_type.h:163
my_wc_t first_secondary_ignorable
Definition: str_uca_type.h:158
my_wc_t last_non_ignorable
Definition: str_uca_type.h:155
my_wc_t last_variable
Definition: str_uca_type.h:165
my_wc_t first_primary_ignorable
Definition: str_uca_type.h:156
std::array< char, MY_UCA_CNT_FLAG_SIZE > flags_type
Definition: str_uca_type.h:150
uint16_t extra_ce_ter_base
Definition: str_uca_type.h:181
my_wc_t first_trailing
Definition: str_uca_type.h:162
std::vector< MY_CONTRACTION > * contraction_nodes
Definition: str_uca_type.h:137
Definition: str_uca_type.h:65
enum enum_char_grp reorder_grp[UCA_MAX_CHAR_GRP]
Definition: str_uca_type.h:66
int wt_rec_num
Definition: str_uca_type.h:68
uint16_t max_weight
Definition: str_uca_type.h:69
struct Reorder_wt_rec wt_rec[2 *UCA_MAX_CHAR_GRP]
Definition: str_uca_type.h:67
Definition: str_uca_type.h:60
struct Weight_boundary old_wt_bdy
Definition: str_uca_type.h:61
struct Weight_boundary new_wt_bdy
Definition: str_uca_type.h:62
Definition: str_uca_type.h:55
uint16_t begin
Definition: str_uca_type.h:56
uint16_t end
Definition: str_uca_type.h:57