MySQL 8.3.0
Source Code Documentation
fts0tokenize.h
Go to the documentation of this file.
1/*****************************************************************************
2
3Copyright (c) 2014, 2023, Oracle and/or its affiliates.
4
5This program is free software; you can redistribute it and/or modify it under
6the terms of the GNU General Public License, version 2.0, as published by the
7Free Software Foundation.
8
9This program is also distributed with certain software (including but not
10limited to OpenSSL) that is licensed under separate terms, as designated in a
11particular file or component or in included license documentation. The authors
12of MySQL hereby grant you an additional permission to link the program and
13your derivative works with the separately licensed software that they have
14included with MySQL.
15
16This program is distributed in the hope that it will be useful, but WITHOUT
17ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
18FOR A PARTICULAR PURPOSE. See the GNU General Public License, version 2.0,
19for more details.
20
21You should have received a copy of the GNU General Public License along with
22this program; if not, write to the Free Software Foundation, Inc.,
2351 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
24
25*****************************************************************************/
26
27/** @file include/fts0tokenize.h
28 Full Text Search plugin tokenizer, adapted from MyISAM
29
30 Created 2014/11/17 Shaohua Wang
31 ***********************************************************************/
32
33#include <cstdint>
34
35#include "ft_global.h"
38
39/* Macros and structs below are from ftdefs.h in MyISAM */
40/** Check a char is true word */
41inline bool true_word_char(int c, uint8_t ch) {
42 return ((c & (MY_CHAR_U | MY_CHAR_L | MY_CHAR_NMR)) != 0) || ch == '_';
43}
44
45/** Boolean search syntax */
47
48#define FTB_YES (fts_boolean_syntax[0])
49#define FTB_EGAL (fts_boolean_syntax[1])
50#define FTB_NO (fts_boolean_syntax[2])
51#define FTB_INC (fts_boolean_syntax[3])
52#define FTB_DEC (fts_boolean_syntax[4])
53#define FTB_LBR (fts_boolean_syntax[5])
54#define FTB_RBR (fts_boolean_syntax[6])
55#define FTB_NEG (fts_boolean_syntax[7])
56#define FTB_TRUNC (fts_boolean_syntax[8])
57#define FTB_LQUOT (fts_boolean_syntax[10])
58#define FTB_RQUOT (fts_boolean_syntax[11])
59
60/** FTS query token */
61struct FT_WORD {
62 uchar *pos; /*!< word start pointer */
63 uint len; /*!< word len */
64 double weight; /*!< word weight, unused in innodb */
65};
66
67/** Tokenizer for ngram referring to ft_get_word(ft_parser.c) in MyISAM.
68Differences: a. code format changed; b. stopword processing removed.
69@param[in] cs charset
70@param[in,out] start doc start pointer
71@param[in,out] end doc end pointer
72@param[in,out] word token
73@param[in,out] info token info
74@retval 0 eof
75@retval 1 word found
76@retval 2 left bracket
77@retval 3 right bracket
78@retval 4 stopword found */
81 uchar *doc = *start;
82 int ctype;
83 int mbl;
84
85 info->yesno = (FTB_YES == ' ') ? 1 : (info->quot != nullptr);
86 info->weight_adjust = info->wasign = 0;
87 info->type = FT_TOKEN_EOF;
88
89 while (doc < end) {
90 for (; doc < end; doc += (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) {
91 mbl = cs->cset->ctype(cs, &ctype, doc, end);
92
93 if (true_word_char(ctype, *doc)) {
94 break;
95 }
96
97 if (*doc == FTB_RQUOT && info->quot) {
98 *start = doc + 1;
100
101 return (info->type);
102 }
103
104 if (!info->quot) {
105 if (*doc == FTB_LBR || *doc == FTB_RBR || *doc == FTB_LQUOT) {
106 /* param->prev=' '; */
107 *start = doc + 1;
108 if (*doc == FTB_LQUOT) {
109 info->quot = (char *)1;
110 }
111
112 info->type =
114
115 return (info->type);
116 }
117
118 if (info->prev == ' ') {
119 if (*doc == FTB_YES) {
120 info->yesno = +1;
121 continue;
122 } else if (*doc == FTB_EGAL) {
123 info->yesno = 0;
124 continue;
125 } else if (*doc == FTB_NO) {
126 info->yesno = -1;
127 continue;
128 } else if (*doc == FTB_INC) {
129 info->weight_adjust++;
130 continue;
131 } else if (*doc == FTB_DEC) {
132 info->weight_adjust--;
133 continue;
134 } else if (*doc == FTB_NEG) {
135 info->wasign = !info->wasign;
136 continue;
137 }
138 }
139 }
140
141 info->prev = *doc;
142 info->yesno = (FTB_YES == ' ') ? 1 : (info->quot != nullptr);
143 info->weight_adjust = info->wasign = 0;
144 }
145
146 for (word->pos = doc; doc < end;
147 doc += (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) {
148 mbl = cs->cset->ctype(cs, &ctype, doc, end);
149
150 if (!true_word_char(ctype, *doc)) {
151 break;
152 }
153 }
154
155 /* Be sure *prev is true_word_char. */
156 info->prev = 'A';
157 word->len = (uint)(doc - word->pos);
158
159 if ((info->trunc = (doc < end && *doc == FTB_TRUNC))) {
160 doc++;
161 }
162
163 /* We don't check stopword here. */
164 *start = doc;
165 info->type = FT_TOKEN_WORD;
166
167 return (info->type);
168 }
169
170 if (info->quot) {
171 *start = doc;
173 }
174
175 return (info->type);
176}
Some definitions for full-text indices.
#define DEFAULT_FTB_SYNTAX
Definition: ft_global.h:44
#define FTB_DEC
Definition: fts0tokenize.h:52
#define FTB_RQUOT
Definition: fts0tokenize.h:58
#define FTB_YES
Definition: fts0tokenize.h:48
#define FTB_LQUOT
Definition: fts0tokenize.h:57
#define FTB_EGAL
Definition: fts0tokenize.h:49
bool true_word_char(int c, uint8_t ch)
Check a char is true word.
Definition: fts0tokenize.h:41
#define FTB_NO
Definition: fts0tokenize.h:50
#define FTB_RBR
Definition: fts0tokenize.h:54
#define FTB_INC
Definition: fts0tokenize.h:51
#define FTB_TRUNC
Definition: fts0tokenize.h:56
static const char * fts_boolean_syntax
Boolean search syntax.
Definition: fts0tokenize.h:46
uchar fts_get_word(const CHARSET_INFO *cs, uchar **start, uchar *end, FT_WORD *word, MYSQL_FTPARSER_BOOLEAN_INFO *info)
Tokenizer for ngram referring to ft_get_word(ft_parser.c) in MyISAM.
Definition: fts0tokenize.h:79
#define FTB_LBR
Definition: fts0tokenize.h:53
#define FTB_NEG
Definition: fts0tokenize.h:55
static void start(mysql_harness::PluginFuncEnv *env)
Definition: http_auth_backend_plugin.cc:176
A better implementation of the UNIX ctype(3) library.
static constexpr uint8_t MY_CHAR_L
Definition: m_ctype.h:541
static constexpr uint8_t MY_CHAR_NMR
Definition: m_ctype.h:542
static constexpr uint8_t MY_CHAR_U
Definition: m_ctype.h:540
unsigned char uchar
Definition: my_inttypes.h:51
Definition: commit_order_queue.h:33
Cursor end()
A past-the-end Cursor.
Definition: rules_table_service.cc:191
@ FT_TOKEN_RIGHT_PAREN
Definition: plugin_ftparser.h:94
@ FT_TOKEN_LEFT_PAREN
Definition: plugin_ftparser.h:93
@ FT_TOKEN_WORD
Definition: plugin_ftparser.h:92
@ FT_TOKEN_EOF
Definition: plugin_ftparser.h:91
Definition: m_ctype.h:422
FTS query token.
Definition: fts0tokenize.h:61
uint len
word len
Definition: fts0tokenize.h:63
double weight
word weight, unused in innodb
Definition: fts0tokenize.h:64
uchar * pos
word start pointer
Definition: fts0tokenize.h:62
Definition: plugin_ftparser.h:132