MySQL 8.0.37
Source Code Documentation
fts0types.h
Go to the documentation of this file.
1/*****************************************************************************
2
3Copyright (c) 2007, 2024, Oracle and/or its affiliates.
4
5This program is free software; you can redistribute it and/or modify it under
6the terms of the GNU General Public License, version 2.0, as published by the
7Free Software Foundation.
8
9This program is designed to work with certain software (including
10but not limited to OpenSSL) that is licensed under separate terms,
11as designated in a particular file or component or in included license
12documentation. The authors of MySQL hereby grant you an additional
13permission to link the program and your derivative works with the
14separately licensed software that they have either included with
15the program or referenced in the documentation.
16
17This program is distributed in the hope that it will be useful, but WITHOUT
18ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
19FOR A PARTICULAR PURPOSE. See the GNU General Public License, version 2.0,
20for more details.
21
22You should have received a copy of the GNU General Public License along with
23this program; if not, write to the Free Software Foundation, Inc.,
2451 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
25
26*****************************************************************************/
27
28/** @file include/fts0types.h
29 Full text search types file
30
31 Created 2007-03-27 Sunny Bains
32 *******************************************************/
33
34#ifndef INNOBASE_FTS0TYPES_H
35#define INNOBASE_FTS0TYPES_H
36
37#include "fts0fts.h"
38#include "fut0fut.h"
39#include "pars0pars.h"
40#include "que0types.h"
41#include "univ.i"
42#include "ut0byte.h"
43#include "ut0rbt.h"
44
45/** Types used within FTS. */
46struct fts_que_t;
47struct fts_node_t;
48
49/** Callbacks used within FTS. */
51typedef void (*fts_filter)(void *, fts_node_t *, void *, ulint len);
52
53/** Statistics relevant to a particular document, used during retrieval. */
55 doc_id_t doc_id; /*!< Document id */
56 ulint word_count; /*!< Total words in the document */
57};
58
59/** It's main purpose is to store the SQL prepared statements that
60are required to retrieve a document from the database. */
62 fts_index_cache_t *index_cache; /*!< The index cache instance */
63
64 /*!< Parsed sql statement */
66 fts_cache_t *cache; /*!< The parent cache */
67};
68
69/** Since we can have multiple FTS indexes on a table, we keep a
70per index cache of words etc. */
72 dict_index_t *index; /*!< The FTS index instance */
73
74 ib_rbt_t *words; /*!< Nodes; indexed by fts_string_t*,
75 cells are fts_tokenizer_word_t*.*/
76
77 ib_vector_t *doc_stats; /*!< Array of the fts_doc_stats_t
78 contained in the memory buffer.
79 Must be in sorted order (ascending).
80 The ideal choice is an rb tree but
81 the rb tree imposes a space overhead
82 that we can do without */
83
84 que_t **ins_graph; /*!< Insert query graphs */
85
86 que_t **sel_graph; /*!< Select query graphs */
87 CHARSET_INFO *charset; /*!< charset */
88};
89
90/** For supporting the tracking of updates on multiple FTS indexes we need
91to track which FTS indexes need to be updated. For INSERT and DELETE we
92update all fts indexes. */
94 doc_id_t doc_id; /*!< The doc id affected */
95
96 ib_vector_t *fts_indexes; /*!< The FTS indexes that need to be
97 updated. A NULL value means all
98 indexes need to be updated. This
99 vector is not allocated on the heap
100 and so must be freed explicitly,
101 when we are done with it */
102};
103
104/** Stop word control infotmation. */
106 ulint status; /*!< Status of the stopword tree */
107 ib_alloc_t *heap; /*!< The memory allocator to use */
108 ib_rbt_t *cached_stopword; /*!< This stores all active stopwords */
109 CHARSET_INFO *charset; /*!< charset for stopword */
110};
111
112/** The SYNC state of the cache. There is one instance of this struct
113associated with each ADD thread. */
115 trx_t *trx; /*!< The transaction used for SYNCing
116 the cache to disk */
117 dict_table_t *table; /*!< Table with FTS index(es) */
118 ulint max_cache_size; /*!< Max size in bytes of the cache */
119 bool cache_full; /*!< flag, when true it indicates that
120 we need to sync the cache to disk */
121 ulint lower_index; /*!< the start index of the doc id
122 vector from where to start adding
123 documents to the FTS cache */
124 ulint upper_index; /*!< max index of the doc id vector to
125 add to the FTS cache */
126 bool interrupted; /*!< true if SYNC was interrupted */
127 doc_id_t min_doc_id; /*!< The smallest doc id added to the
128 cache. It should equal to
129 doc_ids[lower_index] */
130 doc_id_t max_doc_id; /*!< The doc id at which the cache was
131 noted as being full, we use this to
132 set the upper_limit field */
133 std::chrono::steady_clock::time_point start_time;
134 /*!< SYNC start time */
135 bool in_progress; /*!< flag whether sync is in progress.*/
136 bool unlock_cache; /*!< flag whether unlock cache when
137 write fts node */
138 os_event_t event; /*!< sync finish event */
139};
140
141/** The cache for the FTS system. It is a memory-based inverted index
142that new entries are added to, until it grows over the configured maximum
143size, at which time its contents are written to the INDEX table. */
145#ifndef UNIV_HOTBACKUP
146 rw_lock_t lock; /*!< lock protecting all access to the
147 memory buffer. FIXME: this needs to
148 be our new upgrade-capable rw-lock */
149
150 rw_lock_t init_lock; /*!< lock used for the cache
151 initialization, it has different
152 SYNC level as above cache lock */
153#endif /* !UNIV_HOTBACKUP */
154
155 ib_mutex_t optimize_lock; /*!< Lock for OPTIMIZE */
156
157 ib_mutex_t deleted_lock; /*!< Lock covering deleted_doc_ids */
158
159 ib_mutex_t doc_id_lock; /*!< Lock covering Doc ID */
160
161 ib_vector_t *deleted_doc_ids; /*!< Array of deleted doc ids, each
162 element is of type fts_update_t */
163
164 ib_vector_t *indexes; /*!< We store the stats and inverted
165 index for the individual FTS indexes
166 in this vector. Each element is
167 an instance of fts_index_cache_t */
168
169 ib_vector_t *get_docs; /*!< information required to read
170 the document from the table. Each
171 element is of type fts_doc_t */
172
173 ulint total_size; /*!< total size consumed by the ilist
174 field of all nodes. SYNC is run
175 whenever this gets too big */
176 uint64_t total_size_before_sync; /*!< total size of fts cache,
177 when last SYNC request was sent */
178 fts_sync_t *sync; /*!< sync structure to sync data to
179 disk */
180 ib_alloc_t *sync_heap; /*!< The heap allocator, for indexes
181 and deleted_doc_ids, ie. transient
182 objects, they are recreated after
183 a SYNC is completed */
184
185 ib_alloc_t *self_heap; /*!< This heap is the heap out of
186 which an instance of the cache itself
187 was created. Objects created using
188 this heap will last for the lifetime
189 of the cache */
190
191 doc_id_t next_doc_id; /*!< Next doc id */
192
193 doc_id_t synced_doc_id; /*!< Doc ID sync-ed to CONFIG table */
194
195 doc_id_t first_doc_id; /*!< first doc id since this table
196 was opened */
197
198 ulint deleted; /*!< Number of doc ids deleted since
199 last optimized. This variable is
200 covered by deleted_lock */
201
202 ulint added; /*!< Number of doc ids added since last
203 optimized. This variable is covered by
204 the deleted lock */
205
206 fts_stopword_t stopword_info; /*!< Cached stopwords for the FTS */
207 mem_heap_t *cache_heap; /*!< Cache Heap */
208};
209
210/** Columns of the FTS auxiliary INDEX table */
212 doc_id_t first_doc_id; /*!< First document id in ilist. */
213
214 doc_id_t last_doc_id; /*!< Last document id in ilist. */
215
216 byte *ilist; /*!< Binary list of documents & word
217 positions the token appears in.
218 TODO: For now, these are simply
219 ut_malloc'd, but if testing shows
220 that they waste memory unacceptably, a
221 special memory allocator will have
222 to be written */
223
224 ulint doc_count; /*!< Number of doc ids in ilist */
225
226 ulint ilist_size; /*!< Used size of ilist in bytes. */
227
229 /*!< Allocated size of ilist in
230 bytes */
231 bool synced; /*!< flag whether the node is synced */
232};
233
234/** A tokenizer word. Contains information about one word. */
236 fts_string_t text; /*!< Token text. */
237
238 ib_vector_t *nodes; /*!< Word node ilists, each element is
239 of type fts_node_t */
240};
241
242/** Word text plus it's array of nodes as on disk in FTS index */
244 fts_string_t text; /*!< Word value in UTF-8 */
245 ib_vector_t *nodes; /*!< Nodes read from disk */
246
247 ib_alloc_t *heap_alloc; /*!< For handling all allocations */
248};
249
250/** Callback for reading and filtering nodes that are read from FTS index */
252 void *read_arg; /*!< Arg for the sql_callback */
253
254 fts_sql_callback read_record; /*!< Callback for reading index
255 record */
256 ulint total_memory; /*!< Total memory used */
257};
258
259/** For horizontally splitting an FTS auxiliary index */
261 ulint value; /*!< Character value at which
262 to split */
263
264 const char *suffix; /*!< FTS aux index suffix */
265};
266
267/** This type represents a single document field.
268 When fulltext index spans multiple columns, the
269 entire document (all indexed text in a row)
270 is comprised of multiple fields, one per indexed
271 column. */
272struct fts_doc_t {
273 fts_string_t text; /*!< document text */
274
275 bool found; /*!< true if the document was found
276 successfully in the database */
277
278 ib_rbt_t *tokens; /*!< This is filled when the document
279 is tokenized. Tokens; indexed by
280 fts_string_t*, cells are of type
281 fts_token_t* */
282
283 ib_alloc_t *self_heap; /*!< An instance of this type is
284 allocated from this heap along
285 with any objects that have the
286 same lifespan, most notably
287 the vector of token positions */
288 CHARSET_INFO *charset; /*!< Document's charset info */
289
290 st_mysql_ftparser *parser; /*!< fts plugin parser */
291
292 bool is_ngram; /*!< Whether it is a ngram parser */
293
294 ib_rbt_t *stopwords; /*!< Stopwords */
295};
296
297/** A token and its positions within a document. */
299 fts_string_t text; /*!< token text */
300
301 ib_vector_t *positions; /*!< an array of the positions the
302 token is found in; each item is
303 actually an ulint. */
304};
305
306/** It's defined in fts/fts0fts.c */
308
309/** It's defined in fts/fts0fts.c */
311
312/** Compare two fts_trx_row_t instances doc_ids.
313@param[in] p1 id1
314@param[in] p2 id2
315@return < 0 if n1 < n2, < 0 if n1 < n2, > 0 if n1 > n2 */
316static inline int fts_trx_row_doc_id_cmp(const void *p1, const void *p2);
317
318/** Compare two fts_ranking_t instances doc_ids.
319@param[in] p1 id1
320@param[in] p2 id2
321@return < 0 if n1 < n2, < 0 if n1 < n2, > 0 if n1 > n2 */
322static inline int fts_ranking_doc_id_cmp(const void *p1, const void *p2);
323
324/** Compare two fts_update_t instances doc_ids.
325@param[in] p1 id1
326@param[in] p2 id2
327@return < 0 if n1 < n2, < 0 if n1 < n2, > 0 if n1 > n2 */
328static inline int fts_update_doc_id_cmp(const void *p1, const void *p2);
329
330/** Decode and return the integer that was encoded using our VLC scheme.*/
331static inline ulint fts_decode_vlc(
332 /*!< out: value decoded */
333 byte **ptr); /*!< in: ptr to decode from, this ptr is
334 incremented by the number of bytes decoded */
335
336/** Duplicate a string.
337@param[in] dst dup to here
338@param[in] src src string
339@param[in] heap heap to use
340*/
341static inline void fts_string_dup(fts_string_t *dst, const fts_string_t *src,
342 mem_heap_t *heap);
343
344/** Return length of val if it were encoded using our VLC scheme. */
346 /*!< out: length of value
347 encoded, in bytes */
348 ulint val); /*!< in: value to encode */
349
350/** Encode an integer using our VLC scheme and return the length in bytes.
351@param[in] val value to encode
352@param[in] buf buffer, must have enough space
353@return length of value encoded, in bytes */
354static inline ulint fts_encode_int(ulint val, byte *buf);
355
356/** Get the selected FTS aux INDEX suffix. */
357static inline const char *fts_get_suffix(
358 ulint selected); /*!< in: selected index */
359
360/** Return the selected FTS aux index suffix in 5.7 compatible format
361@param[in] selected selected index
362@return the suffix name */
363static inline const char *fts_get_suffix_5_7(ulint selected);
364
365/** Select the FTS auxiliary index for the given character.
366@param[in] cs charset
367@param[in] str string
368@param[in] len string length in bytes
369@return the index to use for the string */
370static inline ulint fts_select_index(const CHARSET_INFO *cs, const byte *str,
371 ulint len);
372
373#include "fts0types.ic"
374#include "fts0vlc.ic"
375
376#endif /* INNOBASE_FTS0TYPES_H */
Full text search header file.
uint64_t doc_id_t
Document id type.
Definition: fts0fts.h:77
const fts_index_selector_t fts_index_selector_5_7[]
It's defined in fts/fts0fts.c.
Definition: fts0fts.cc:156
static ulint fts_decode_vlc(byte **ptr)
Decode and return the integer that was encoded using our VLC scheme.
static ulint fts_encode_int(ulint val, byte *buf)
Encode an integer using our VLC scheme and return the length in bytes.
static void fts_string_dup(fts_string_t *dst, const fts_string_t *src, mem_heap_t *heap)
Duplicate a string.
static ulint fts_get_encoded_len(ulint val)
Return length of val if it were encoded using our VLC scheme.
static int fts_trx_row_doc_id_cmp(const void *p1, const void *p2)
Compare two fts_trx_row_t instances doc_ids.
const fts_index_selector_t fts_index_selector[]
It's defined in fts/fts0fts.c.
Definition: fts0fts.cc:151
void(* fts_filter)(void *, fts_node_t *, void *, ulint len)
Definition: fts0types.h:51
static const char * fts_get_suffix(ulint selected)
Get the selected FTS aux INDEX suffix.
static int fts_ranking_doc_id_cmp(const void *p1, const void *p2)
Compare two fts_ranking_t instances doc_ids.
static const char * fts_get_suffix_5_7(ulint selected)
Return the selected FTS aux index suffix in 5.7 compatible format.
static ulint fts_select_index(const CHARSET_INFO *cs, const byte *str, ulint len)
Select the FTS auxiliary index for the given character.
pars_user_func_cb_t fts_sql_callback
Callbacks used within FTS.
Definition: fts0types.h:47
static int fts_update_doc_id_cmp(const void *p1, const void *p2)
Compare two fts_update_t instances doc_ids.
Full text search types.
Full text variable length integer encoding/decoding.
File-based utilities.
std::string str(const mysqlrouter::ConfigGenerator::Options::Endpoint &ep)
Definition: config_generator.cc:1044
Definition: buf0block_hint.cc:30
Definition: commit_order_queue.h:34
SQL parser.
bool(* pars_user_func_cb_t)(void *arg, void *user_arg)
Type of the user functions.
Definition: pars0pars.h:50
Query graph global types.
Definition: m_ctype.h:385
Data structure for an index.
Definition: dict0mem.h:1046
Data structure for a database table.
Definition: dict0mem.h:1909
The cache for the FTS system.
Definition: fts0types.h:144
rw_lock_t lock
lock protecting all access to the memory buffer.
Definition: fts0types.h:146
fts_sync_t * sync
sync structure to sync data to disk
Definition: fts0types.h:178
ib_vector_t * get_docs
information required to read the document from the table.
Definition: fts0types.h:169
ib_alloc_t * sync_heap
The heap allocator, for indexes and deleted_doc_ids, ie.
Definition: fts0types.h:180
ulint deleted
Number of doc ids deleted since last optimized.
Definition: fts0types.h:198
uint64_t total_size_before_sync
total size of fts cache, when last SYNC request was sent
Definition: fts0types.h:176
rw_lock_t init_lock
lock used for the cache initialization, it has different SYNC level as above cache lock
Definition: fts0types.h:150
mem_heap_t * cache_heap
Cache Heap.
Definition: fts0types.h:207
ulint total_size
total size consumed by the ilist field of all nodes.
Definition: fts0types.h:173
doc_id_t first_doc_id
first doc id since this table was opened
Definition: fts0types.h:195
doc_id_t synced_doc_id
Doc ID sync-ed to CONFIG table.
Definition: fts0types.h:193
ulint added
Number of doc ids added since last optimized.
Definition: fts0types.h:202
ib_alloc_t * self_heap
This heap is the heap out of which an instance of the cache itself was created.
Definition: fts0types.h:185
ib_mutex_t deleted_lock
Lock covering deleted_doc_ids.
Definition: fts0types.h:157
ib_vector_t * deleted_doc_ids
Array of deleted doc ids, each element is of type fts_update_t.
Definition: fts0types.h:161
fts_stopword_t stopword_info
Cached stopwords for the FTS.
Definition: fts0types.h:206
ib_vector_t * indexes
We store the stats and inverted index for the individual FTS indexes in this vector.
Definition: fts0types.h:164
ib_mutex_t optimize_lock
Lock for OPTIMIZE.
Definition: fts0types.h:155
doc_id_t next_doc_id
Next doc id.
Definition: fts0types.h:191
ib_mutex_t doc_id_lock
Lock covering Doc ID.
Definition: fts0types.h:159
Statistics relevant to a particular document, used during retrieval.
Definition: fts0types.h:54
doc_id_t doc_id
Document id.
Definition: fts0types.h:55
ulint word_count
Total words in the document.
Definition: fts0types.h:56
This type represents a single document field.
Definition: fts0types.h:272
fts_string_t text
document text
Definition: fts0types.h:273
ib_alloc_t * self_heap
An instance of this type is allocated from this heap along with any objects that have the same lifesp...
Definition: fts0types.h:283
ib_rbt_t * tokens
This is filled when the document is tokenized.
Definition: fts0types.h:278
bool found
true if the document was found successfully in the database
Definition: fts0types.h:275
bool is_ngram
Whether it is a ngram parser.
Definition: fts0types.h:292
CHARSET_INFO * charset
Document's charset info.
Definition: fts0types.h:288
ib_rbt_t * stopwords
Stopwords.
Definition: fts0types.h:294
st_mysql_ftparser * parser
fts plugin parser
Definition: fts0types.h:290
Callback for reading and filtering nodes that are read from FTS index.
Definition: fts0types.h:251
void * read_arg
Arg for the sql_callback.
Definition: fts0types.h:252
fts_sql_callback read_record
Callback for reading index record.
Definition: fts0types.h:254
ulint total_memory
Total memory used.
Definition: fts0types.h:256
It's main purpose is to store the SQL prepared statements that are required to retrieve a document fr...
Definition: fts0types.h:61
fts_cache_t * cache
The parent cache.
Definition: fts0types.h:66
que_t * get_document_graph
Definition: fts0types.h:65
fts_index_cache_t * index_cache
The index cache instance.
Definition: fts0types.h:62
Since we can have multiple FTS indexes on a table, we keep a per index cache of words etc.
Definition: fts0types.h:71
que_t ** sel_graph
Select query graphs.
Definition: fts0types.h:86
que_t ** ins_graph
Insert query graphs.
Definition: fts0types.h:84
dict_index_t * index
The FTS index instance.
Definition: fts0types.h:72
ib_vector_t * doc_stats
Array of the fts_doc_stats_t contained in the memory buffer.
Definition: fts0types.h:77
ib_rbt_t * words
Nodes; indexed by fts_string_t*, cells are fts_tokenizer_word_t*.
Definition: fts0types.h:74
CHARSET_INFO * charset
charset
Definition: fts0types.h:87
For horizontally splitting an FTS auxiliary index.
Definition: fts0types.h:260
ulint value
Character value at which to split.
Definition: fts0types.h:261
const char * suffix
FTS aux index suffix.
Definition: fts0types.h:264
Columns of the FTS auxiliary INDEX table.
Definition: fts0types.h:211
bool synced
flag whether the node is synced
Definition: fts0types.h:231
ulint ilist_size_alloc
Allocated size of ilist in bytes.
Definition: fts0types.h:228
byte * ilist
Binary list of documents & word positions the token appears in.
Definition: fts0types.h:216
ulint doc_count
Number of doc ids in ilist.
Definition: fts0types.h:224
doc_id_t last_doc_id
Last document id in ilist.
Definition: fts0types.h:214
ulint ilist_size
Used size of ilist in bytes.
Definition: fts0types.h:226
doc_id_t first_doc_id
First document id in ilist.
Definition: fts0types.h:212
Stop word control infotmation.
Definition: fts0types.h:105
ulint status
Status of the stopword tree.
Definition: fts0types.h:106
CHARSET_INFO * charset
charset for stopword
Definition: fts0types.h:109
ib_rbt_t * cached_stopword
This stores all active stopwords.
Definition: fts0types.h:108
ib_alloc_t * heap
The memory allocator to use.
Definition: fts0types.h:107
An UTF-16 ro UTF-8 string.
Definition: fts0fts.h:292
The SYNC state of the cache.
Definition: fts0types.h:114
bool unlock_cache
flag whether unlock cache when write fts node
Definition: fts0types.h:136
ulint upper_index
max index of the doc id vector to add to the FTS cache
Definition: fts0types.h:124
bool interrupted
true if SYNC was interrupted
Definition: fts0types.h:126
ulint max_cache_size
Max size in bytes of the cache.
Definition: fts0types.h:118
std::chrono::steady_clock::time_point start_time
SYNC start time.
Definition: fts0types.h:133
doc_id_t min_doc_id
The smallest doc id added to the cache.
Definition: fts0types.h:127
dict_table_t * table
Table with FTS index(es)
Definition: fts0types.h:117
doc_id_t max_doc_id
The doc id at which the cache was noted as being full, we use this to set the upper_limit field.
Definition: fts0types.h:130
os_event_t event
sync finish event
Definition: fts0types.h:138
ulint lower_index
the start index of the doc id vector from where to start adding documents to the FTS cache
Definition: fts0types.h:121
bool in_progress
flag whether sync is in progress.
Definition: fts0types.h:135
trx_t * trx
The transaction used for SYNCing the cache to disk.
Definition: fts0types.h:115
bool cache_full
flag, when true it indicates that we need to sync the cache to disk
Definition: fts0types.h:119
A token and its positions within a document.
Definition: fts0types.h:298
fts_string_t text
token text
Definition: fts0types.h:299
ib_vector_t * positions
an array of the positions the token is found in; each item is actually an ulint.
Definition: fts0types.h:301
A tokenizer word.
Definition: fts0types.h:235
ib_vector_t * nodes
Word node ilists, each element is of type fts_node_t.
Definition: fts0types.h:238
fts_string_t text
Token text.
Definition: fts0types.h:236
For supporting the tracking of updates on multiple FTS indexes we need to track which FTS indexes nee...
Definition: fts0types.h:93
ib_vector_t * fts_indexes
The FTS indexes that need to be updated.
Definition: fts0types.h:96
doc_id_t doc_id
The doc id affected.
Definition: fts0types.h:94
Word text plus it's array of nodes as on disk in FTS index.
Definition: fts0types.h:243
ib_alloc_t * heap_alloc
For handling all allocations.
Definition: fts0types.h:247
fts_string_t text
Word value in UTF-8.
Definition: fts0types.h:244
ib_vector_t * nodes
Nodes read from disk.
Definition: fts0types.h:245
Definition: ut0vec.h:204
Red black tree instance.
Definition: ut0rbt.h:72
Definition: ut0vec.h:213
The info structure stored at the beginning of a heap block.
Definition: mem0mem.h:302
InnoDB condition variable.
Definition: os0event.cc:63
Definition: que0que.h:301
The structure used in the spin lock implementation of a read-write lock.
Definition: sync0rw.h:360
Definition: plugin_ftparser.h:212
Definition: trx0trx.h:684
Version control for database, common definitions, and include files.
unsigned long int ulint
Definition: univ.i:406
Utilities for byte operations.
Various utilities.