MySQL 8.4.2
Source Code Documentation
fts0tokenize.h
Go to the documentation of this file.
1/*****************************************************************************
2
3Copyright (c) 2014, 2024, Oracle and/or its affiliates.
4
5This program is free software; you can redistribute it and/or modify it under
6the terms of the GNU General Public License, version 2.0, as published by the
7Free Software Foundation.
8
9This program is designed to work with certain software (including
10but not limited to OpenSSL) that is licensed under separate terms,
11as designated in a particular file or component or in included license
12documentation. The authors of MySQL hereby grant you an additional
13permission to link the program and your derivative works with the
14separately licensed software that they have either included with
15the program or referenced in the documentation.
16
17This program is distributed in the hope that it will be useful, but WITHOUT
18ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
19FOR A PARTICULAR PURPOSE. See the GNU General Public License, version 2.0,
20for more details.
21
22You should have received a copy of the GNU General Public License along with
23this program; if not, write to the Free Software Foundation, Inc.,
2451 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
25
26*****************************************************************************/
27
28/** @file include/fts0tokenize.h
29 Full Text Search plugin tokenizer, adapted from the MyISAM tokenizer
30
31 Created 2014/11/17 Shaohua Wang
32 ***********************************************************************/
33
34#include <cstdint>
35
36#include "ft_global.h"
39
40/* Macros and structs below are from ftdefs.h in MyISAM */
41/** Check a char is true word */
42inline bool true_word_char(int c, uint8_t ch) {
43 return ((c & (MY_CHAR_U | MY_CHAR_L | MY_CHAR_NMR)) != 0) || ch == '_';
44}
45
46/** Boolean search syntax */
48
49#define FTB_YES (fts_boolean_syntax[0])
50#define FTB_EGAL (fts_boolean_syntax[1])
51#define FTB_NO (fts_boolean_syntax[2])
52#define FTB_INC (fts_boolean_syntax[3])
53#define FTB_DEC (fts_boolean_syntax[4])
54#define FTB_LBR (fts_boolean_syntax[5])
55#define FTB_RBR (fts_boolean_syntax[6])
56#define FTB_NEG (fts_boolean_syntax[7])
57#define FTB_TRUNC (fts_boolean_syntax[8])
58#define FTB_LQUOT (fts_boolean_syntax[10])
59#define FTB_RQUOT (fts_boolean_syntax[11])
60
61/** FTS query token */
62struct FT_WORD {
63 uchar *pos; /*!< word start pointer */
64 uint len; /*!< word len */
65 double weight; /*!< word weight, unused in innodb */
66};
67
68/** Tokenizer for ngram referring to ft_get_word(ft_parser.c) in MyISAM.
69Differences: a. code format changed; b. stopword processing removed.
70@param[in] cs charset
71@param[in,out] start doc start pointer
72@param[in,out] end doc end pointer
73@param[in,out] word token
74@param[in,out] info token info
75@retval 0 eof
76@retval 1 word found
77@retval 2 left bracket
78@retval 3 right bracket
79@retval 4 stopword found */
82 uchar *doc = *start;
83 int ctype;
84 int mbl;
85
86 info->yesno = (FTB_YES == ' ') ? 1 : (info->quot != nullptr);
87 info->weight_adjust = info->wasign = 0;
88 info->type = FT_TOKEN_EOF;
89
90 while (doc < end) {
91 for (; doc < end; doc += (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) {
92 mbl = cs->cset->ctype(cs, &ctype, doc, end);
93
94 if (true_word_char(ctype, *doc)) {
95 break;
96 }
97
98 if (*doc == FTB_RQUOT && info->quot) {
99 *start = doc + 1;
101
102 return (info->type);
103 }
104
105 if (!info->quot) {
106 if (*doc == FTB_LBR || *doc == FTB_RBR || *doc == FTB_LQUOT) {
107 /* param->prev=' '; */
108 *start = doc + 1;
109 if (*doc == FTB_LQUOT) {
110 info->quot = (char *)1;
111 }
112
113 info->type =
115
116 return (info->type);
117 }
118
119 if (info->prev == ' ') {
120 if (*doc == FTB_YES) {
121 info->yesno = +1;
122 continue;
123 } else if (*doc == FTB_EGAL) {
124 info->yesno = 0;
125 continue;
126 } else if (*doc == FTB_NO) {
127 info->yesno = -1;
128 continue;
129 } else if (*doc == FTB_INC) {
130 info->weight_adjust++;
131 continue;
132 } else if (*doc == FTB_DEC) {
133 info->weight_adjust--;
134 continue;
135 } else if (*doc == FTB_NEG) {
136 info->wasign = !info->wasign;
137 continue;
138 }
139 }
140 }
141
142 info->prev = *doc;
143 info->yesno = (FTB_YES == ' ') ? 1 : (info->quot != nullptr);
144 info->weight_adjust = info->wasign = 0;
145 }
146
147 for (word->pos = doc; doc < end;
148 doc += (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) {
149 mbl = cs->cset->ctype(cs, &ctype, doc, end);
150
151 if (!true_word_char(ctype, *doc)) {
152 break;
153 }
154 }
155
156 /* Be sure *prev is true_word_char. */
157 info->prev = 'A';
158 word->len = (uint)(doc - word->pos);
159
160 if ((info->trunc = (doc < end && *doc == FTB_TRUNC))) {
161 doc++;
162 }
163
164 /* We don't check stopword here. */
165 *start = doc;
166 info->type = FT_TOKEN_WORD;
167
168 return (info->type);
169 }
170
171 if (info->quot) {
172 *start = doc;
174 }
175
176 return (info->type);
177}
Some definitions for full-text indices.
#define DEFAULT_FTB_SYNTAX
Definition: ft_global.h:45
#define FTB_DEC
Definition: fts0tokenize.h:53
#define FTB_RQUOT
Definition: fts0tokenize.h:59
#define FTB_YES
Definition: fts0tokenize.h:49
#define FTB_LQUOT
Definition: fts0tokenize.h:58
#define FTB_EGAL
Definition: fts0tokenize.h:50
bool true_word_char(int c, uint8_t ch)
Check a char is true word.
Definition: fts0tokenize.h:42
#define FTB_NO
Definition: fts0tokenize.h:51
#define FTB_RBR
Definition: fts0tokenize.h:55
#define FTB_INC
Definition: fts0tokenize.h:52
#define FTB_TRUNC
Definition: fts0tokenize.h:57
static const char * fts_boolean_syntax
Boolean search syntax.
Definition: fts0tokenize.h:47
uchar fts_get_word(const CHARSET_INFO *cs, uchar **start, uchar *end, FT_WORD *word, MYSQL_FTPARSER_BOOLEAN_INFO *info)
Tokenizer for ngram referring to ft_get_word(ft_parser.c) in MyISAM.
Definition: fts0tokenize.h:80
#define FTB_LBR
Definition: fts0tokenize.h:54
#define FTB_NEG
Definition: fts0tokenize.h:56
static void start(mysql_harness::PluginFuncEnv *env)
Definition: http_auth_backend_plugin.cc:180
A better implementation of the UNIX ctype(3) library.
static constexpr uint8_t MY_CHAR_L
Definition: m_ctype.h:542
static constexpr uint8_t MY_CHAR_NMR
Definition: m_ctype.h:543
static constexpr uint8_t MY_CHAR_U
Definition: m_ctype.h:541
unsigned char uchar
Definition: my_inttypes.h:52
Definition: commit_order_queue.h:34
Cursor end()
A past-the-end Cursor.
Definition: rules_table_service.cc:192
@ FT_TOKEN_RIGHT_PAREN
Definition: plugin_ftparser.h:95
@ FT_TOKEN_LEFT_PAREN
Definition: plugin_ftparser.h:94
@ FT_TOKEN_WORD
Definition: plugin_ftparser.h:93
@ FT_TOKEN_EOF
Definition: plugin_ftparser.h:92
Definition: m_ctype.h:423
FTS query token.
Definition: fts0tokenize.h:62
uint len
word len
Definition: fts0tokenize.h:64
double weight
word weight, unused in innodb
Definition: fts0tokenize.h:65
uchar * pos
word start pointer
Definition: fts0tokenize.h:63
Definition: plugin_ftparser.h:133
char * quot
Definition: plugin_ftparser.h:142
char trunc
Definition: plugin_ftparser.h:138
int weight_adjust
Definition: plugin_ftparser.h:136
char prev
Definition: plugin_ftparser.h:141
enum enum_ft_token_type type
Definition: plugin_ftparser.h:134
int yesno
Definition: plugin_ftparser.h:135
char wasign
Definition: plugin_ftparser.h:137