MySQL 8.0.31
Source Code Documentation
fts0tokenize.h
Go to the documentation of this file.
1/*****************************************************************************
2
3Copyright (c) 2014, 2022, Oracle and/or its affiliates.
4
5This program is free software; you can redistribute it and/or modify it under
6the terms of the GNU General Public License, version 2.0, as published by the
7Free Software Foundation.
8
9This program is also distributed with certain software (including but not
10limited to OpenSSL) that is licensed under separate terms, as designated in a
11particular file or component or in included license documentation. The authors
12of MySQL hereby grant you an additional permission to link the program and
13your derivative works with the separately licensed software that they have
14included with MySQL.
15
16This program is distributed in the hope that it will be useful, but WITHOUT
17ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
18FOR A PARTICULAR PURPOSE. See the GNU General Public License, version 2.0,
19for more details.
20
21You should have received a copy of the GNU General Public License along with
22this program; if not, write to the Free Software Foundation, Inc.,
2351 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
24
25*****************************************************************************/
26
/** @file include/fts0tokenize.h
 Full Text Search plugin tokenizer, adapted from MyISAM.

 Created 2014/11/17 Shaohua Wang
 ***********************************************************************/
32
33#include "ft_global.h"
34#include "m_ctype.h"
36
37/* Macros and structs below are from ftdefs.h in MyISAM */
/** Check whether a character belongs to a word.
@param c   character-class bits of the character
@param ch  the character itself
Evaluates to non-zero when c has any of the _MY_U, _MY_L or _MY_NMR bits set,
or when ch is an underscore. */
#define true_word_char(c, ch) \
  ((((c) & (_MY_U | _MY_L | _MY_NMR)) != 0) || ((ch) == '_'))
40
41/** Boolean search syntax */
43
44#define FTB_YES (fts_boolean_syntax[0])
45#define FTB_EGAL (fts_boolean_syntax[1])
46#define FTB_NO (fts_boolean_syntax[2])
47#define FTB_INC (fts_boolean_syntax[3])
48#define FTB_DEC (fts_boolean_syntax[4])
49#define FTB_LBR (fts_boolean_syntax[5])
50#define FTB_RBR (fts_boolean_syntax[6])
51#define FTB_NEG (fts_boolean_syntax[7])
52#define FTB_TRUNC (fts_boolean_syntax[8])
53#define FTB_LQUOT (fts_boolean_syntax[10])
54#define FTB_RQUOT (fts_boolean_syntax[11])
55
56/** FTS query token */
57struct FT_WORD {
58 uchar *pos; /*!< word start pointer */
59 uint len; /*!< word len */
60 double weight; /*!< word weight, unused in innodb */
61};
62
63/** Tokenizer for ngram referring to ft_get_word(ft_parser.c) in MyISAM.
64Differences: a. code format changed; b. stopword processing removed.
65@param[in] cs charset
66@param[in,out] start doc start pointer
67@param[in,out] end doc end pointer
68@param[in,out] word token
69@param[in,out] info token info
70@retval 0 eof
71@retval 1 word found
72@retval 2 left bracket
73@retval 3 right bracket
74@retval 4 stopword found */
77 uchar *doc = *start;
78 int ctype;
80 int mbl;
81
82 info->yesno = (FTB_YES == ' ') ? 1 : (info->quot != nullptr);
83 info->weight_adjust = info->wasign = 0;
84 info->type = FT_TOKEN_EOF;
85
86 while (doc < end) {
87 for (; doc < end; doc += (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) {
88 mbl = cs->cset->ctype(cs, &ctype, doc, end);
89
90 if (true_word_char(ctype, *doc)) {
91 break;
92 }
93
94 if (*doc == FTB_RQUOT && info->quot) {
95 *start = doc + 1;
97
98 return (info->type);
99 }
100
101 if (!info->quot) {
102 if (*doc == FTB_LBR || *doc == FTB_RBR || *doc == FTB_LQUOT) {
103 /* param->prev=' '; */
104 *start = doc + 1;
105 if (*doc == FTB_LQUOT) {
106 info->quot = (char *)1;
107 }
108
109 info->type =
111
112 return (info->type);
113 }
114
115 if (info->prev == ' ') {
116 if (*doc == FTB_YES) {
117 info->yesno = +1;
118 continue;
119 } else if (*doc == FTB_EGAL) {
120 info->yesno = 0;
121 continue;
122 } else if (*doc == FTB_NO) {
123 info->yesno = -1;
124 continue;
125 } else if (*doc == FTB_INC) {
126 info->weight_adjust++;
127 continue;
128 } else if (*doc == FTB_DEC) {
129 info->weight_adjust--;
130 continue;
131 } else if (*doc == FTB_NEG) {
132 info->wasign = !info->wasign;
133 continue;
134 }
135 }
136 }
137
138 info->prev = *doc;
139 info->yesno = (FTB_YES == ' ') ? 1 : (info->quot != nullptr);
140 info->weight_adjust = info->wasign = 0;
141 }
142
143 length = 0;
144 for (word->pos = doc; doc < end;
145 length++, doc += (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) {
146 mbl = cs->cset->ctype(cs, &ctype, doc, end);
147
148 if (!true_word_char(ctype, *doc)) {
149 break;
150 }
151 }
152
153 /* Be sure *prev is true_word_char. */
154 info->prev = 'A';
155 word->len = (uint)(doc - word->pos);
156
157 if ((info->trunc = (doc < end && *doc == FTB_TRUNC))) {
158 doc++;
159 }
160
161 /* We don't check stopword here. */
162 *start = doc;
163 info->type = FT_TOKEN_WORD;
164
165 return (info->type);
166 }
167
168 if (info->quot) {
169 *start = doc;
171 }
172
173 return (info->type);
174}
Some definitions for full-text indices.
#define DEFAULT_FTB_SYNTAX
Definition: ft_global.h:44
#define FTB_DEC
Definition: fts0tokenize.h:48
#define FTB_RQUOT
Definition: fts0tokenize.h:54
#define FTB_YES
Definition: fts0tokenize.h:44
#define FTB_LQUOT
Definition: fts0tokenize.h:53
#define FTB_EGAL
Definition: fts0tokenize.h:45
#define FTB_NO
Definition: fts0tokenize.h:46
#define FTB_RBR
Definition: fts0tokenize.h:50
#define FTB_INC
Definition: fts0tokenize.h:47
#define FTB_TRUNC
Definition: fts0tokenize.h:52
#define true_word_char(c, ch)
Check a char is true word.
Definition: fts0tokenize.h:39
static const char * fts_boolean_syntax
Boolean search syntax.
Definition: fts0tokenize.h:42
uchar fts_get_word(const CHARSET_INFO *cs, uchar **start, uchar *end, FT_WORD *word, MYSQL_FTPARSER_BOOLEAN_INFO *info)
Tokenizer for ngram referring to ft_get_word(ft_parser.c) in MyISAM.
Definition: fts0tokenize.h:75
#define FTB_LBR
Definition: fts0tokenize.h:49
#define FTB_NEG
Definition: fts0tokenize.h:51
static void start(mysql_harness::PluginFuncEnv *env)
Definition: http_auth_backend_plugin.cc:176
A better implementation of the UNIX ctype(3) library.
unsigned char uchar
Definition: my_inttypes.h:51
Log info(cout, "NOTE")
Definition: commit_order_queue.h:33
bool length(const dd::Spatial_reference_system *srs, const Geometry *g1, double *length, bool *null) noexcept
Computes the length of linestrings and multilinestrings.
Definition: length.cc:75
Cursor end()
A past-the-end Cursor.
Definition: rules_table_service.cc:191
@ FT_TOKEN_RIGHT_PAREN
Definition: plugin_ftparser.h:90
@ FT_TOKEN_LEFT_PAREN
Definition: plugin_ftparser.h:89
@ FT_TOKEN_WORD
Definition: plugin_ftparser.h:88
@ FT_TOKEN_EOF
Definition: plugin_ftparser.h:87
Definition: m_ctype.h:382
FTS query token.
Definition: fts0tokenize.h:57
uint len
word len
Definition: fts0tokenize.h:59
double weight
word weight, unused in innodb
Definition: fts0tokenize.h:60
uchar * pos
word start pointer
Definition: fts0tokenize.h:58
Definition: plugin_ftparser.h:128
unsigned int uint
Definition: uca-dump.cc:29