mysql-server/latest/str__uca__type_8h_source.html

/* Copyright (c) 2016, 2025, Oracle and/or its affiliates.


   This program is free software; you can redistribute it and/or modify

   it under the terms of the GNU General Public License, version 2.0,

   as published by the Free Software Foundation.


   This program is designed to work with certain software (including

   but not limited to OpenSSL) that is licensed under separate terms,

   as designated in a particular file or component or in included license

   documentation.  The authors of MySQL hereby grant you an additional

   permission to link the program and your derivative works with the

   separately licensed software that they have either included with

   the program or referenced in the documentation.


   This program is distributed in the hope that it will be useful,

   but WITHOUT ANY WARRANTY; without even the implied warranty of

   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

   GNU General Public License, version 2.0, for more details.


   You should have received a copy of the GNU General Public License

   along with this program; if not, write to the Free Software

   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */


/* This header file contains type declarations used by UCA code. */


#ifndef STR_UCA_TYPE_H

#define STR_UCA_TYPE_H


#include <array>

#include <cstdint>

#include <vector>


#include "mysql/strings/m_ctype.h"


/// Number of entries in a contraction flag table (MY_UCA_INFO::flags_type).
constexpr int MY_UCA_CNT_FLAG_SIZE = 0x1000;

/// Mask that folds a code point into a flag-table index: table size minus one.
constexpr my_wc_t MY_UCA_CNT_FLAG_MASK = MY_UCA_CNT_FLAG_SIZE - 1;


/*
  Capacity of Reorder_param::reorder_grp, i.e. the maximum number of
  character groups a single collation can reorder. So far only the Croatian
  collation needs reordering (of the Latin and Cyrillic groups of
  characters). More may be added in the future.
*/
#define UCA_MAX_CHAR_GRP 4

/* UCA versions, listed in increasing order. */
enum enum_uca_ver {
  UCA_V400,
  UCA_V520,
  UCA_V900,
};


/* Character groups that can be listed in Reorder_param::reorder_grp. */
enum enum_char_grp {
  CHARGRP_NONE = 0,
  CHARGRP_CORE = 1,
  CHARGRP_LATIN = 2,
  CHARGRP_CYRILLIC = 3,
  CHARGRP_ARAB = 4,
  CHARGRP_KANA = 5,
  CHARGRP_OTHERS = 6,
};


/*
  A range of 16-bit weights described by its two boundary values.
  NOTE(review): whether `end` is inclusive is not visible in this header.
*/
struct Weight_boundary {

  uint16_t begin;  // first boundary of the range

  uint16_t end;  // second boundary of the range

};


/*
  One reordering record: a pair of weight ranges.
  NOTE(review): presumably weights inside old_wt_bdy are moved into
  new_wt_bdy when reordering is applied; confirm against the users of
  Reorder_param.
*/
struct Reorder_wt_rec {

  struct Weight_boundary old_wt_bdy;  // "old" weight range

  struct Weight_boundary new_wt_bdy;  // "new" weight range

};


/*
  Reordering description of a collation: up to UCA_MAX_CHAR_GRP character
  groups and up to 2 * UCA_MAX_CHAR_GRP weight range records.
*/
struct Reorder_param {

  // Character groups taking part in the reordering.
  enum enum_char_grp reorder_grp[UCA_MAX_CHAR_GRP];

  // Weight range records; the array holds twice as many slots as groups.
  struct Reorder_wt_rec wt_rec[2 * UCA_MAX_CHAR_GRP];

  // NOTE(review): presumably the number of wt_rec entries in use; confirm.
  int wt_rec_num;

  // NOTE(review): meaning not visible in this header; confirm with users.
  uint16_t max_weight;

};


/* Possible values of Coll_param::case_first. */
enum enum_case_first {
  CASE_FIRST_OFF,
  CASE_FIRST_UPPER,
  CASE_FIRST_LOWER,
};


/*
  Parameters of a collation: reordering data, normalization switch and
  case-first setting.
*/
struct Coll_param {

  // Reordering data. NOTE(review): presumably null when the collation does
  // not reorder; confirm against callers.
  struct Reorder_param *reorder_param;

  bool norm_enabled;  // false = normalization off (the default);

                      // true = normalization on

  // One of CASE_FIRST_OFF, CASE_FIRST_UPPER, CASE_FIRST_LOWER.
  enum enum_case_first case_first;

};


/*
  NOTE: If you change MY_UCA_MAX_CONTRACTION, be sure to update the comment on
  MY_UCA_CNT_MID1 (defined further down in this header), as a larger value
  might cause us to run out of bits in a byte flag.
  NOTE(review): this note used to point at strings/uca_data.h; confirm
  whether that file carries a comment that needs updating as well.
*/

#define MY_UCA_MAX_CONTRACTION 6

/* Number of uint16_t elements in MY_CONTRACTION::weight. */
#define MY_UCA_MAX_WEIGHT_SIZE 25

/* NOTE(review): not referenced in this header; meaning not visible here. */
#define MY_UCA_WEIGHT_LEVELS 1


/*
  We store all the contractions in a trie, indexed on the code points they
  consist of. The trie is organized as:
  1. Each node stores one code point (ch) of a contraction, and a list of
     nodes (child_nodes) that stores all possible following code points.
  2. The vector in MY_UCA_INFO (contraction_nodes) stores the list of nodes
     holding the first code points of all contractions.
  3. Each node has a boolean value (is_contraction_tail) which shows
     whether the code point stored in the node is the end of a contraction.
     This is necessary because even if one code point is the end of a
     contraction, there might be a longer contraction that contains all the
     code points in the path (e.g., for Hungarian, both 'DZ' and 'DZS' are
     contractions).
  4. A contraction is formed by all the code points in the path until the
     end of the contraction.
  5. If it is the end of a contraction (is_contraction_tail == true), the
     weight of this contraction is stored in the array weight.
  6. Besides child_nodes, each node has a second child list,
     child_nodes_context.
     NOTE(review): this item used to describe a with_context flag telling
     common contractions from previous-context contractions; the struct has
     no such member. Presumably previous-context contractions are now kept
     in child_nodes_context; confirm against the code that builds the trie.
  7. If it is the end of a contraction (is_contraction_tail == true),
     contraction_len shows how many code points this contraction consists of.
*/

struct MY_CONTRACTION {

  // The code point stored in this node.
  my_wc_t ch;

  // Lists of following nodes.

  std::vector<MY_CONTRACTION> child_nodes;

  std::vector<MY_CONTRACTION> child_nodes_context;


  // weight and contraction_len are only meaningful when is_contraction_tail
  // is true.

  uint16_t weight[MY_UCA_MAX_WEIGHT_SIZE]; /* Its weight string, 0-terminated */

  bool is_contraction_tail;

  size_t contraction_len;

};


/*
  UCA data of one collation: weight tables, contraction data, logical
  positions and the bases of the extra CE. Every member has an in-class
  initializer, so a default-constructed object holds null pointers, zeros,
  false, and version UCA_V400.
*/
struct MY_UCA_INFO {

  enum_uca_ver version{UCA_V400};

  MY_UCA_INFO *m_based_on{nullptr};


  // Collation weights.

  my_wc_t maxchar{0};


  uint8_t *lengths{nullptr};

  std::vector<uint8_t> *m_allocated_weights{nullptr};

  uint16_t **weights{nullptr};


  bool have_contractions{false};

  // First-level nodes of the contraction trie (see MY_CONTRACTION).
  std::vector<MY_CONTRACTION> *contraction_nodes{nullptr};

  /*
    contraction_flags is only used when a collation has contraction rules.
    A UCA collation supports at least 65535 characters, but only a few of
    them can be part of a contraction, so searching the contraction list
    for every character would be a huge waste of time.

    contraction_flags points to memory which is allocated when a collation
    has contraction rules. For a character in a contraction, its corresponding
    byte (contraction_flags[ch & MY_UCA_CNT_FLAG_MASK]) is set to a value
    that depends on the position (head, tail or middle) of this character in
    the contraction. This byte is used to quickly check whether a character
    can be part of a contraction.
  */

  using flags_type = std::array<char, MY_UCA_CNT_FLAG_SIZE>;

  flags_type *contraction_flags{nullptr};


  /* Logical positions */

  my_wc_t first_non_ignorable{0};

  my_wc_t last_non_ignorable{0};

  my_wc_t first_primary_ignorable{0};

  my_wc_t last_primary_ignorable{0};

  my_wc_t first_secondary_ignorable{0};

  my_wc_t last_secondary_ignorable{0};

  my_wc_t first_tertiary_ignorable{0};

  my_wc_t last_tertiary_ignorable{0};

  my_wc_t first_trailing{0};

  my_wc_t last_trailing{0};

  my_wc_t first_variable{0};

  my_wc_t last_variable{0};

  /*
    extra_ce_pri_base, extra_ce_sec_base and extra_ce_ter_base are only used
    for the UCA collations whose UCA version is not smaller than UCA_V900.
    For why we need this extra CE, please see the comments on
    my_char_weight_put_900() and apply_primary_shift_900().

    The value of these three variables is set by the definition of
    my_uca_v900. The value of extra_ce_pri_base is usually 0x54A4 (which is
    the maximum regular weight value plus one, 0x54A3 + 1 = 0x54A4). But for
    the Chinese collation, extra_ce_pri_base needs to change, because 0x54A4
    is already occupied by reordering and there might be a weight conflict if
    we still used 0x54A4. Please also see the comment on
    modify_all_zh_pages().
   */

  uint16_t extra_ce_pri_base{0};  // Primary weight of extra CE

  uint16_t extra_ce_sec_base{0};  // Secondary weight of extra CE

  uint16_t extra_ce_ter_base{0};  // Tertiary weight of extra CE

};


/**
 Whether the given character can be the first in any contraction.
 Lowest bit (value 1) of a contraction flag byte.
*/

#define MY_UCA_CNT_HEAD 1


/**
 Whether the given character can be the last in any contraction.
 Second bit (value 2) of a contraction flag byte.
*/

#define MY_UCA_CNT_TAIL 2


/**
 Whether the given character can be the second in any contraction.

 The flags for the following middle positions are not defined as macros;
 they are implied by shifting MY_UCA_CNT_MID1:

 \#define MY_UCA_CNT_MID2  8
 \#define MY_UCA_CNT_MID3  16
 \#define MY_UCA_CNT_MID4  32

 There's no need for MY_UCA_CNT_MID5 (which would cause us to run out of
 bits) since MY_UCA_MAX_CONTRACTION is 6 (so head, four in the middle,
 and then tail).
*/

#define MY_UCA_CNT_MID1 4


/**
 Whether the given character is the first part of a context-sensitive
 contraction. Context-sensitive contractions are like normal contractions,
 except that for performance reasons, they trigger on the _last_ character
 instead of the first. The case given in Unicode TR35 is that in some
 scripts (such as katakana in Japanese), "a-" should sort as "aa"
 (except on the tertiary level), "e-" should sort as "ee" and so on.
 However, adding regular contractions on "a" and "e" would cause undue
 performance loss, so instead, we add a special "context-sensitive"
 contraction on "-" that then looks at the _previous_ character.

 We don't support context-sensitive contractions longer than two characters
 at the moment, since none exist in CLDR. Thus, there is no
 MY_UCA_PREVIOUS_CONTEXT_MID1 and so on.
*/

#define MY_UCA_PREVIOUS_CONTEXT_HEAD 64


/** Similar to MY_UCA_PREVIOUS_CONTEXT_HEAD, just for the tail. */

#define MY_UCA_PREVIOUS_CONTEXT_TAIL 128


/**
 NOTE(review): not used in this header. Presumably the number of low bits
 of a code point that select an entry within one page of the weight tables;
 confirm against the code that indexes MY_UCA_INFO::weights.
*/
#define MY_UCA_PSHIFT 8


/**
  Tell whether a code point is flagged as a possible contraction head.

  The flag table is indexed with the code point masked by
  MY_UCA_CNT_FLAG_MASK, so different code points can share one entry: a true
  result only means the code point may start a contraction.

  @param flags    Pointer to UCA contraction flag data (dereferenced
                  unconditionally, must not be null)
  @param wc       Code point

  @retval   false - cannot be contraction head
  @retval   true  - can be contraction head
*/
inline bool my_uca_can_be_contraction_head(const MY_UCA_INFO::flags_type *flags,
                                           my_wc_t wc) {
  const auto &table = *flags;
  const int entry = table[wc & MY_UCA_CNT_FLAG_MASK];
  return (entry & MY_UCA_CNT_HEAD) != 0;
}


/**
  Tell whether a code point is flagged as a possible contraction tail.

  The flag table is indexed with the code point masked by
  MY_UCA_CNT_FLAG_MASK, so different code points can share one entry: a true
  result only means the code point may end a contraction.

  @param flags    Pointer to UCA contraction flag data (dereferenced
                  unconditionally, must not be null)
  @param wc       Code point

  @retval   false - cannot be contraction tail
  @retval   true  - can be contraction tail
*/
inline bool my_uca_can_be_contraction_tail(const MY_UCA_INFO::flags_type *flags,
                                           my_wc_t wc) {
  const auto &table = *flags;
  const int entry = table[wc & MY_UCA_CNT_FLAG_MASK];
  return (entry & MY_UCA_CNT_TAIL) != 0;
}


/**
  Find a contraction consisting of two code points and return its weight
  array. Only declared here; the definition lives in ctype-uca.cc.

  @param cont_nodes  Contraction nodes to search
  @param wc1         First code point of the contraction
  @param wc2         Second code point of the contraction

  @return Pointer to the weight array of the contraction.
          NOTE(review): the result when no such contraction exists is not
          visible in this header (presumably nullptr); confirm in
          ctype-uca.cc.
*/
const uint16_t *my_uca_contraction2_weight(

    const std::vector<MY_CONTRACTION> *cont_nodes, my_wc_t wc1, my_wc_t wc2);

#endif

flags
static int flags[50]
Definition: hp_test1.cc:40

m_ctype.h
A better implementation of the UNIX ctype(3) library.

my_wc_t
unsigned long my_wc_t
Our own version of wchar_t, ie., a type that holds a single Unicode code point ("wide character").
Definition: m_ctype.h:57

enum_uca_ver
enum_uca_ver
Definition: str_uca_type.h:43

UCA_V520
@ UCA_V520
Definition: str_uca_type.h:43

UCA_V400
@ UCA_V400
Definition: str_uca_type.h:43

UCA_V900
@ UCA_V900
Definition: str_uca_type.h:43

MY_UCA_CNT_TAIL
#define MY_UCA_CNT_TAIL
Whether the given character can be the last in any contraction.
Definition: str_uca_type.h:188

enum_char_grp
enum_char_grp
Definition: str_uca_type.h:45

CHARGRP_NONE
@ CHARGRP_NONE
Definition: str_uca_type.h:46

CHARGRP_ARAB
@ CHARGRP_ARAB
Definition: str_uca_type.h:50

CHARGRP_LATIN
@ CHARGRP_LATIN
Definition: str_uca_type.h:48

CHARGRP_CYRILLIC
@ CHARGRP_CYRILLIC
Definition: str_uca_type.h:49

CHARGRP_KANA
@ CHARGRP_KANA
Definition: str_uca_type.h:51

CHARGRP_CORE
@ CHARGRP_CORE
Definition: str_uca_type.h:47

CHARGRP_OTHERS
@ CHARGRP_OTHERS
Definition: str_uca_type.h:52

my_uca_contraction2_weight
const uint16_t * my_uca_contraction2_weight(const std::vector< MY_CONTRACTION > *cont_nodes, my_wc_t wc1, my_wc_t wc2)
Find a contraction consisting of two code points and return its weight array.
Definition: ctype-uca.cc:953

MY_UCA_CNT_FLAG_MASK
constexpr my_wc_t MY_UCA_CNT_FLAG_MASK
Definition: str_uca_type.h:36

MY_UCA_CNT_FLAG_SIZE
constexpr int MY_UCA_CNT_FLAG_SIZE
Definition: str_uca_type.h:35

MY_UCA_MAX_WEIGHT_SIZE
#define MY_UCA_MAX_WEIGHT_SIZE
Definition: str_uca_type.h:87

UCA_MAX_CHAR_GRP
#define UCA_MAX_CHAR_GRP
Definition: str_uca_type.h:42

MY_UCA_CNT_HEAD
#define MY_UCA_CNT_HEAD
Whether the given character can be the first in any contraction.
Definition: str_uca_type.h:185

my_uca_can_be_contraction_tail
bool my_uca_can_be_contraction_tail(const MY_UCA_INFO::flags_type *flags, my_wc_t wc)
Check if a code point can be contraction tail.
Definition: str_uca_type.h:252

enum_case_first
enum_case_first
Definition: str_uca_type.h:72

CASE_FIRST_UPPER
@ CASE_FIRST_UPPER
Definition: str_uca_type.h:72

CASE_FIRST_LOWER
@ CASE_FIRST_LOWER
Definition: str_uca_type.h:72

CASE_FIRST_OFF
@ CASE_FIRST_OFF
Definition: str_uca_type.h:72

my_uca_can_be_contraction_head
bool my_uca_can_be_contraction_head(const MY_UCA_INFO::flags_type *flags, my_wc_t wc)
Check if a code point can be contraction head.
Definition: str_uca_type.h:237

Coll_param
Definition: str_uca_type.h:74

Coll_param::reorder_param
struct Reorder_param * reorder_param
Definition: str_uca_type.h:75

Coll_param::case_first
enum enum_case_first case_first
Definition: str_uca_type.h:78

Coll_param::norm_enabled
bool norm_enabled
Definition: str_uca_type.h:76

MY_CONTRACTION
Definition: str_uca_type.h:113

MY_CONTRACTION::child_nodes_context
std::vector< MY_CONTRACTION > child_nodes_context
Definition: str_uca_type.h:117

MY_CONTRACTION::child_nodes
std::vector< MY_CONTRACTION > child_nodes
Definition: str_uca_type.h:116

MY_CONTRACTION::weight
uint16_t weight[MY_UCA_MAX_WEIGHT_SIZE]
Definition: str_uca_type.h:120

MY_CONTRACTION::ch
my_wc_t ch
Definition: str_uca_type.h:114

MY_CONTRACTION::contraction_len
size_t contraction_len
Definition: str_uca_type.h:122

MY_CONTRACTION::is_contraction_tail
bool is_contraction_tail
Definition: str_uca_type.h:121

MY_UCA_INFO
Definition: str_uca_type.h:125

MY_UCA_INFO::first_non_ignorable
my_wc_t first_non_ignorable
Definition: str_uca_type.h:154

MY_UCA_INFO::extra_ce_pri_base
uint16_t extra_ce_pri_base
Definition: str_uca_type.h:179

MY_UCA_INFO::weights
uint16_t ** weights
Definition: str_uca_type.h:134

MY_UCA_INFO::m_allocated_weights
std::vector< uint8_t > * m_allocated_weights
Definition: str_uca_type.h:133

MY_UCA_INFO::last_tertiary_ignorable
my_wc_t last_tertiary_ignorable
Definition: str_uca_type.h:161

MY_UCA_INFO::version
enum_uca_ver version
Definition: str_uca_type.h:126

MY_UCA_INFO::contraction_flags
flags_type * contraction_flags
Definition: str_uca_type.h:151

MY_UCA_INFO::last_secondary_ignorable
my_wc_t last_secondary_ignorable
Definition: str_uca_type.h:159

MY_UCA_INFO::lengths
uint8_t * lengths
Definition: str_uca_type.h:132

MY_UCA_INFO::maxchar
my_wc_t maxchar
Definition: str_uca_type.h:130

MY_UCA_INFO::last_primary_ignorable
my_wc_t last_primary_ignorable
Definition: str_uca_type.h:157

MY_UCA_INFO::extra_ce_sec_base
uint16_t extra_ce_sec_base
Definition: str_uca_type.h:180

MY_UCA_INFO::have_contractions
bool have_contractions
Definition: str_uca_type.h:136

MY_UCA_INFO::first_variable
my_wc_t first_variable
Definition: str_uca_type.h:164

MY_UCA_INFO::m_based_on
MY_UCA_INFO * m_based_on
Definition: str_uca_type.h:127

MY_UCA_INFO::first_tertiary_ignorable
my_wc_t first_tertiary_ignorable
Definition: str_uca_type.h:160

MY_UCA_INFO::last_trailing
my_wc_t last_trailing
Definition: str_uca_type.h:163

MY_UCA_INFO::first_secondary_ignorable
my_wc_t first_secondary_ignorable
Definition: str_uca_type.h:158

MY_UCA_INFO::last_non_ignorable
my_wc_t last_non_ignorable
Definition: str_uca_type.h:155

MY_UCA_INFO::last_variable
my_wc_t last_variable
Definition: str_uca_type.h:165

MY_UCA_INFO::first_primary_ignorable
my_wc_t first_primary_ignorable
Definition: str_uca_type.h:156

MY_UCA_INFO::flags_type
std::array< char, MY_UCA_CNT_FLAG_SIZE > flags_type
Definition: str_uca_type.h:150

MY_UCA_INFO::extra_ce_ter_base
uint16_t extra_ce_ter_base
Definition: str_uca_type.h:181

MY_UCA_INFO::first_trailing
my_wc_t first_trailing
Definition: str_uca_type.h:162

MY_UCA_INFO::contraction_nodes
std::vector< MY_CONTRACTION > * contraction_nodes
Definition: str_uca_type.h:137

Reorder_param
Definition: str_uca_type.h:65

Reorder_param::reorder_grp
enum enum_char_grp reorder_grp[UCA_MAX_CHAR_GRP]
Definition: str_uca_type.h:66

Reorder_param::wt_rec_num
int wt_rec_num
Definition: str_uca_type.h:68

Reorder_param::max_weight
uint16_t max_weight
Definition: str_uca_type.h:69

Reorder_param::wt_rec
struct Reorder_wt_rec wt_rec[2 *UCA_MAX_CHAR_GRP]
Definition: str_uca_type.h:67

Reorder_wt_rec
Definition: str_uca_type.h:60

Reorder_wt_rec::old_wt_bdy
struct Weight_boundary old_wt_bdy
Definition: str_uca_type.h:61

Reorder_wt_rec::new_wt_bdy
struct Weight_boundary new_wt_bdy
Definition: str_uca_type.h:62

Weight_boundary
Definition: str_uca_type.h:55

Weight_boundary::begin
uint16_t begin
Definition: str_uca_type.h:56

Weight_boundary::end
uint16_t end
Definition: str_uca_type.h:57