MySQL 8.0.40
Source Code Documentation
fts0tokenize.h
Go to the documentation of this file.
1/*****************************************************************************
2
3Copyright (c) 2014, 2024, Oracle and/or its affiliates.
4
5This program is free software; you can redistribute it and/or modify it under
6the terms of the GNU General Public License, version 2.0, as published by the
7Free Software Foundation.
8
9This program is designed to work with certain software (including
10but not limited to OpenSSL) that is licensed under separate terms,
11as designated in a particular file or component or in included license
12documentation. The authors of MySQL hereby grant you an additional
13permission to link the program and your derivative works with the
14separately licensed software that they have either included with
15the program or referenced in the documentation.
16
17This program is distributed in the hope that it will be useful, but WITHOUT
18ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
19FOR A PARTICULAR PURPOSE. See the GNU General Public License, version 2.0,
20for more details.
21
22You should have received a copy of the GNU General Public License along with
23this program; if not, write to the Free Software Foundation, Inc.,
2451 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
25
26*****************************************************************************/
27
28/** @file include/fts0tokenize.h
29 Full Text Search plugin tokenizer, adapted from the MyISAM tokenizer
30
31 Created 2014/11/17 Shaohua Wang
32 ***********************************************************************/
33
34#include "ft_global.h"
35#include "m_ctype.h"
37
38/* Macros and structs below are from ftdefs.h in MyISAM */
/** Check whether a character counts as part of a word.
@param c   ctype flags of the character; tested against the _MY_U, _MY_L and
           _MY_NMR bits
@param ch  the character itself; an underscore always counts as a word
           character regardless of its flags
@return non-zero if the character is a word character */
#define true_word_char(c, ch) \
  (((c) & (_MY_U | _MY_L | _MY_NMR)) || ((ch) == '_'))
41
42/** Boolean search syntax */
44
45#define FTB_YES (fts_boolean_syntax[0])
46#define FTB_EGAL (fts_boolean_syntax[1])
47#define FTB_NO (fts_boolean_syntax[2])
48#define FTB_INC (fts_boolean_syntax[3])
49#define FTB_DEC (fts_boolean_syntax[4])
50#define FTB_LBR (fts_boolean_syntax[5])
51#define FTB_RBR (fts_boolean_syntax[6])
52#define FTB_NEG (fts_boolean_syntax[7])
53#define FTB_TRUNC (fts_boolean_syntax[8])
54#define FTB_LQUOT (fts_boolean_syntax[10])
55#define FTB_RQUOT (fts_boolean_syntax[11])
56
57/** FTS query token */
58struct FT_WORD {
59 uchar *pos; /*!< word start pointer */
60 uint len; /*!< word len */
61 double weight; /*!< word weight, unused in innodb */
62};
63
64/** Tokenizer for ngram referring to ft_get_word(ft_parser.c) in MyISAM.
65Differences: a. code format changed; b. stopword processing removed.
66@param[in] cs charset
67@param[in,out] start doc start pointer
68@param[in,out] end doc end pointer
69@param[in,out] word token
70@param[in,out] info token info
71@retval 0 eof
72@retval 1 word found
73@retval 2 left bracket
74@retval 3 right bracket
75@retval 4 stopword found */
78 uchar *doc = *start;
79 int ctype;
80 int mbl;
81
82 info->yesno = (FTB_YES == ' ') ? 1 : (info->quot != nullptr);
83 info->weight_adjust = info->wasign = 0;
84 info->type = FT_TOKEN_EOF;
85
86 while (doc < end) {
87 for (; doc < end; doc += (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) {
88 mbl = cs->cset->ctype(cs, &ctype, doc, end);
89
90 if (true_word_char(ctype, *doc)) {
91 break;
92 }
93
94 if (*doc == FTB_RQUOT && info->quot) {
95 *start = doc + 1;
97
98 return (info->type);
99 }
100
101 if (!info->quot) {
102 if (*doc == FTB_LBR || *doc == FTB_RBR || *doc == FTB_LQUOT) {
103 /* param->prev=' '; */
104 *start = doc + 1;
105 if (*doc == FTB_LQUOT) {
106 info->quot = (char *)1;
107 }
108
109 info->type =
111
112 return (info->type);
113 }
114
115 if (info->prev == ' ') {
116 if (*doc == FTB_YES) {
117 info->yesno = +1;
118 continue;
119 } else if (*doc == FTB_EGAL) {
120 info->yesno = 0;
121 continue;
122 } else if (*doc == FTB_NO) {
123 info->yesno = -1;
124 continue;
125 } else if (*doc == FTB_INC) {
126 info->weight_adjust++;
127 continue;
128 } else if (*doc == FTB_DEC) {
129 info->weight_adjust--;
130 continue;
131 } else if (*doc == FTB_NEG) {
132 info->wasign = !info->wasign;
133 continue;
134 }
135 }
136 }
137
138 info->prev = *doc;
139 info->yesno = (FTB_YES == ' ') ? 1 : (info->quot != nullptr);
140 info->weight_adjust = info->wasign = 0;
141 }
142
143 for (word->pos = doc; doc < end;
144 doc += (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) {
145 mbl = cs->cset->ctype(cs, &ctype, doc, end);
146
147 if (!true_word_char(ctype, *doc)) {
148 break;
149 }
150 }
151
152 /* Be sure *prev is true_word_char. */
153 info->prev = 'A';
154 word->len = (uint)(doc - word->pos);
155
156 if ((info->trunc = (doc < end && *doc == FTB_TRUNC))) {
157 doc++;
158 }
159
160 /* We don't check stopword here. */
161 *start = doc;
162 info->type = FT_TOKEN_WORD;
163
164 return (info->type);
165 }
166
167 if (info->quot) {
168 *start = doc;
170 }
171
172 return (info->type);
173}
Some definitions for full-text indices.
#define DEFAULT_FTB_SYNTAX
Definition: ft_global.h:45
#define FTB_DEC
Definition: fts0tokenize.h:49
#define FTB_RQUOT
Definition: fts0tokenize.h:55
#define FTB_YES
Definition: fts0tokenize.h:45
#define FTB_LQUOT
Definition: fts0tokenize.h:54
#define FTB_EGAL
Definition: fts0tokenize.h:46
#define FTB_NO
Definition: fts0tokenize.h:47
#define FTB_RBR
Definition: fts0tokenize.h:51
#define FTB_INC
Definition: fts0tokenize.h:48
#define FTB_TRUNC
Definition: fts0tokenize.h:53
#define true_word_char(c, ch)
Check a char is true word.
Definition: fts0tokenize.h:40
static const char * fts_boolean_syntax
Boolean search syntax.
Definition: fts0tokenize.h:43
uchar fts_get_word(const CHARSET_INFO *cs, uchar **start, uchar *end, FT_WORD *word, MYSQL_FTPARSER_BOOLEAN_INFO *info)
Tokenizer for ngram referring to ft_get_word(ft_parser.c) in MyISAM.
Definition: fts0tokenize.h:76
#define FTB_LBR
Definition: fts0tokenize.h:50
#define FTB_NEG
Definition: fts0tokenize.h:52
static void start(mysql_harness::PluginFuncEnv *env)
Definition: http_auth_backend_plugin.cc:177
A better implementation of the UNIX ctype(3) library.
unsigned char uchar
Definition: my_inttypes.h:52
Log info(cout, "NOTE")
Definition: commit_order_queue.h:34
Cursor end()
A past-the-end Cursor.
Definition: rules_table_service.cc:192
@ FT_TOKEN_RIGHT_PAREN
Definition: plugin_ftparser.h:91
@ FT_TOKEN_LEFT_PAREN
Definition: plugin_ftparser.h:90
@ FT_TOKEN_WORD
Definition: plugin_ftparser.h:89
@ FT_TOKEN_EOF
Definition: plugin_ftparser.h:88
Definition: m_ctype.h:385
FTS query token.
Definition: fts0tokenize.h:58
uint len
word len
Definition: fts0tokenize.h:60
double weight
word weight, unused in innodb
Definition: fts0tokenize.h:61
uchar * pos
word start pointer
Definition: fts0tokenize.h:59
Definition: plugin_ftparser.h:129
unsigned int uint
Definition: uca9-dump.cc:75