MySQL 8.0.32
Source Code Documentation
fts0types.h
Go to the documentation of this file.
1/*****************************************************************************
2
3Copyright (c) 2007, 2022, Oracle and/or its affiliates.
4
5This program is free software; you can redistribute it and/or modify it under
6the terms of the GNU General Public License, version 2.0, as published by the
7Free Software Foundation.
8
9This program is also distributed with certain software (including but not
10limited to OpenSSL) that is licensed under separate terms, as designated in a
11particular file or component or in included license documentation. The authors
12of MySQL hereby grant you an additional permission to link the program and
13your derivative works with the separately licensed software that they have
14included with MySQL.
15
16This program is distributed in the hope that it will be useful, but WITHOUT
17ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
18FOR A PARTICULAR PURPOSE. See the GNU General Public License, version 2.0,
19for more details.
20
21You should have received a copy of the GNU General Public License along with
22this program; if not, write to the Free Software Foundation, Inc.,
2351 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
24
25*****************************************************************************/
26
27/** @file include/fts0types.h
28 Full text search types file
29
30 Created 2007-03-27 Sunny Bains
31 *******************************************************/
32
33#ifndef INNOBASE_FTS0TYPES_H
34#define INNOBASE_FTS0TYPES_H
35
36#include "fts0fts.h"
37#include "fut0fut.h"
38#include "pars0pars.h"
39#include "que0types.h"
40#include "univ.i"
41#include "ut0byte.h"
42#include "ut0rbt.h"
43
44/** Types used within FTS. */
45struct fts_que_t;
46struct fts_node_t;
47
48/** Callbacks used within FTS. */
50typedef void (*fts_filter)(void *, fts_node_t *, void *, ulint len);
51
52/** Statistics relevant to a particular document, used during retrieval. */
54 doc_id_t doc_id; /*!< Document id */
55 ulint word_count; /*!< Total words in the document */
56};
57
58/** Its main purpose is to store the SQL prepared statements that
59are required to retrieve a document from the database. */
61 fts_index_cache_t *index_cache; /*!< The index cache instance */
62
63 /*!< Parsed sql statement */
65 fts_cache_t *cache; /*!< The parent cache */
66};
67
68/** Since we can have multiple FTS indexes on a table, we keep a
69per index cache of words etc. */
71 dict_index_t *index; /*!< The FTS index instance */
72
73 ib_rbt_t *words; /*!< Nodes; indexed by fts_string_t*,
74 cells are fts_tokenizer_word_t*.*/
75
76 ib_vector_t *doc_stats; /*!< Array of the fts_doc_stats_t
77 contained in the memory buffer.
78 Must be in sorted order (ascending).
79 The ideal choice is an rb tree but
80 the rb tree imposes a space overhead
81 that we can do without */
82
83 que_t **ins_graph; /*!< Insert query graphs */
84
85 que_t **sel_graph; /*!< Select query graphs */
86 CHARSET_INFO *charset; /*!< charset */
87};
88
89/** For supporting the tracking of updates on multiple FTS indexes we need
90to track which FTS indexes need to be updated. For INSERT and DELETE we
91update all fts indexes. */
93 doc_id_t doc_id; /*!< The doc id affected */
94
95 ib_vector_t *fts_indexes; /*!< The FTS indexes that need to be
96 updated. A NULL value means all
97 indexes need to be updated. This
98 vector is not allocated on the heap
99 and so must be freed explicitly,
100 when we are done with it */
101};
102
103/** Stop word control information. */
105 ulint status; /*!< Status of the stopword tree */
106 ib_alloc_t *heap; /*!< The memory allocator to use */
107 ib_rbt_t *cached_stopword; /*!< This stores all active stopwords */
108 CHARSET_INFO *charset; /*!< charset for stopword */
109};
110
111/** The SYNC state of the cache. There is one instance of this struct
112associated with each ADD thread. */
114 trx_t *trx; /*!< The transaction used for SYNCing
115 the cache to disk */
116 dict_table_t *table; /*!< Table with FTS index(es) */
117 ulint max_cache_size; /*!< Max size in bytes of the cache */
118 bool cache_full; /*!< flag, when true it indicates that
119 we need to sync the cache to disk */
120 ulint lower_index; /*!< the start index of the doc id
121 vector from where to start adding
122 documents to the FTS cache */
123 ulint upper_index; /*!< max index of the doc id vector to
124 add to the FTS cache */
125 bool interrupted; /*!< true if SYNC was interrupted */
126 doc_id_t min_doc_id; /*!< The smallest doc id added to the
127 cache. It should be equal to
128 doc_ids[lower_index] */
129 doc_id_t max_doc_id; /*!< The doc id at which the cache was
130 noted as being full, we use this to
131 set the upper_limit field */
132 std::chrono::steady_clock::time_point start_time;
133 /*!< SYNC start time */
134 bool in_progress; /*!< flag whether sync is in progress.*/
135 bool unlock_cache; /*!< flag whether unlock cache when
136 write fts node */
137 os_event_t event; /*!< sync finish event */
138};
139
140/** The cache for the FTS system. It is a memory-based inverted index
141that new entries are added to, until it grows over the configured maximum
142size, at which time its contents are written to the INDEX table. */
144#ifndef UNIV_HOTBACKUP
145 rw_lock_t lock; /*!< lock protecting all access to the
146 memory buffer. FIXME: this needs to
147 be our new upgrade-capable rw-lock */
148
149 rw_lock_t init_lock; /*!< lock used for the cache
150 initialization, it has different
151 SYNC level as above cache lock */
152#endif /* !UNIV_HOTBACKUP */
153
154 ib_mutex_t optimize_lock; /*!< Lock for OPTIMIZE */
155
156 ib_mutex_t deleted_lock; /*!< Lock covering deleted_doc_ids */
157
158 ib_mutex_t doc_id_lock; /*!< Lock covering Doc ID */
159
160 ib_vector_t *deleted_doc_ids; /*!< Array of deleted doc ids, each
161 element is of type fts_update_t */
162
163 ib_vector_t *indexes; /*!< We store the stats and inverted
164 index for the individual FTS indexes
165 in this vector. Each element is
166 an instance of fts_index_cache_t */
167
168 ib_vector_t *get_docs; /*!< information required to read
169 the document from the table. Each
170 element is of type fts_doc_t */
171
172 ulint total_size; /*!< total size consumed by the ilist
173 field of all nodes. SYNC is run
174 whenever this gets too big */
175 uint64_t total_size_before_sync; /*!< total size of fts cache,
176 when last SYNC request was sent */
177 fts_sync_t *sync; /*!< sync structure to sync data to
178 disk */
179 ib_alloc_t *sync_heap; /*!< The heap allocator, for indexes
180 and deleted_doc_ids, ie. transient
181 objects, they are recreated after
182 a SYNC is completed */
183
184 ib_alloc_t *self_heap; /*!< This heap is the heap out of
185 which an instance of the cache itself
186 was created. Objects created using
187 this heap will last for the lifetime
188 of the cache */
189
190 doc_id_t next_doc_id; /*!< Next doc id */
191
192 doc_id_t synced_doc_id; /*!< Doc ID sync-ed to CONFIG table */
193
194 doc_id_t first_doc_id; /*!< first doc id since this table
195 was opened */
196
197 ulint deleted; /*!< Number of doc ids deleted since
198 last optimized. This variable is
199 covered by deleted_lock */
200
201 ulint added; /*!< Number of doc ids added since last
202 optimized. This variable is covered by
203 the deleted lock */
204
205 fts_stopword_t stopword_info; /*!< Cached stopwords for the FTS */
206 mem_heap_t *cache_heap; /*!< Cache Heap */
207};
208
209/** Columns of the FTS auxiliary INDEX table */
211 doc_id_t first_doc_id; /*!< First document id in ilist. */
212
213 doc_id_t last_doc_id; /*!< Last document id in ilist. */
214
215 byte *ilist; /*!< Binary list of documents & word
216 positions the token appears in.
217 TODO: For now, these are simply
218 ut_malloc'd, but if testing shows
219 that they waste memory unacceptably, a
220 special memory allocator will have
221 to be written */
222
223 ulint doc_count; /*!< Number of doc ids in ilist */
224
225 ulint ilist_size; /*!< Used size of ilist in bytes. */
226
228 /*!< Allocated size of ilist in
229 bytes */
230 bool synced; /*!< flag whether the node is synced */
231};
232
233/** A tokenizer word. Contains information about one word. */
235 fts_string_t text; /*!< Token text. */
236
237 ib_vector_t *nodes; /*!< Word node ilists, each element is
238 of type fts_node_t */
239};
240
241/** Word text plus its array of nodes as on disk in FTS index */
243 fts_string_t text; /*!< Word value in UTF-8 */
244 ib_vector_t *nodes; /*!< Nodes read from disk */
245
246 ib_alloc_t *heap_alloc; /*!< For handling all allocations */
247};
248
249/** Callback for reading and filtering nodes that are read from FTS index */
251 void *read_arg; /*!< Arg for the sql_callback */
252
253 fts_sql_callback read_record; /*!< Callback for reading index
254 record */
255 ulint total_memory; /*!< Total memory used */
256};
257
258/** For horizontally splitting an FTS auxiliary index */
260 ulint value; /*!< Character value at which
261 to split */
262
263 const char *suffix; /*!< FTS aux index suffix */
264};
265
266/** This type represents a single document: its text, the tokens produced by tokenizing it, and the charset, parser and stopwords associated with it. */
267struct fts_doc_t {
268 fts_string_t text; /*!< Document text */
269
270 bool found; /*!< true if the document was found
271 successfully in the database */
272
273 ib_rbt_t *tokens; /*!< Red-black tree that is filled in when the
274 document is tokenized. Tokens; indexed by
275 fts_string_t*, cells are of type
276 fts_token_t* */
277
278 ib_alloc_t *self_heap; /*!< An instance of this type is
279 allocated from this heap along
280 with any objects that have the
281 same lifespan, most notably
282 the vector of token positions */
283 CHARSET_INFO *charset; /*!< Document's charset info */
284
285 st_mysql_ftparser *parser; /*!< FTS plugin parser */
286
287 bool is_ngram; /*!< Whether it is an ngram parser */
288
289 ib_rbt_t *stopwords; /*!< Stopwords */
290};
291
292/** A token and its positions within a document. */
294 fts_string_t text; /*!< token text */
295
296 ib_vector_t *positions; /*!< an array of the positions the
297 token is found in; each item is
298 actually an ulint. */
299};
300
301/** Defined in fts/fts0fts.cc */
303
304/** Defined in fts/fts0fts.cc */
306
307/** Compare two fts_trx_row_t instances doc_ids.
308@param[in] p1 id1
309@param[in] p2 id2
310@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
311static inline int fts_trx_row_doc_id_cmp(const void *p1, const void *p2);
312
313/** Compare two fts_ranking_t instances doc_ids.
314@param[in] p1 id1
315@param[in] p2 id2
316@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
317static inline int fts_ranking_doc_id_cmp(const void *p1, const void *p2);
318
319/** Compare two fts_update_t instances doc_ids.
320@param[in] p1 id1
321@param[in] p2 id2
322@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
323static inline int fts_update_doc_id_cmp(const void *p1, const void *p2);
324
325/** Decode and return the integer that was encoded using our VLC scheme.*/
326static inline ulint fts_decode_vlc(
327 /*!< out: value decoded */
328 byte **ptr); /*!< in: ptr to decode from, this ptr is
329 incremented by the number of bytes decoded */
330
331/** Duplicate a string.
332@param[out] dst dup to here
333@param[in] src src string
334@param[in] heap heap to use
335*/
336static inline void fts_string_dup(fts_string_t *dst, const fts_string_t *src,
337 mem_heap_t *heap);
338
339/** Return length of val if it were encoded using our VLC scheme. */
341 /*!< out: length of value
342 encoded, in bytes */
343 ulint val); /*!< in: value to encode */
344
345/** Encode an integer using our VLC scheme and return the length in bytes.
346@param[in] val value to encode
347@param[in] buf buffer, must have enough space
348@return length of value encoded, in bytes */
349static inline ulint fts_encode_int(ulint val, byte *buf);
350
351/** Get the selected FTS aux INDEX suffix. */
352static inline const char *fts_get_suffix(
353 ulint selected); /*!< in: selected index */
354
355/** Return the selected FTS aux index suffix in 5.7 compatible format
356@param[in] selected selected index
357@return the suffix name */
358static inline const char *fts_get_suffix_5_7(ulint selected);
359
360/** Select the FTS auxiliary index for the given character.
361@param[in] cs charset
362@param[in] str string
363@param[in] len string length in bytes
364@return the index to use for the string */
365static inline ulint fts_select_index(const CHARSET_INFO *cs, const byte *str,
366 ulint len);
367
368#include "fts0types.ic"
369#include "fts0vlc.ic"
370
371#endif /* INNOBASE_FTS0TYPES_H */
Full text search header file.
uint64_t doc_id_t
Document id type.
Definition: fts0fts.h:76
const fts_index_selector_t fts_index_selector_5_7[]
It's defined in fts/fts0fts.c.
Definition: fts0fts.cc:155
static ulint fts_decode_vlc(byte **ptr)
Decode and return the integer that was encoded using our VLC scheme.
static ulint fts_encode_int(ulint val, byte *buf)
Encode an integer using our VLC scheme and return the length in bytes.
static void fts_string_dup(fts_string_t *dst, const fts_string_t *src, mem_heap_t *heap)
Duplicate a string.
static ulint fts_get_encoded_len(ulint val)
Return length of val if it were encoded using our VLC scheme.
static int fts_trx_row_doc_id_cmp(const void *p1, const void *p2)
Compare two fts_trx_row_t instances doc_ids.
const fts_index_selector_t fts_index_selector[]
It's defined in fts/fts0fts.c.
Definition: fts0fts.cc:150
void(* fts_filter)(void *, fts_node_t *, void *, ulint len)
Definition: fts0types.h:50
static const char * fts_get_suffix(ulint selected)
Get the selected FTS aux INDEX suffix.
static int fts_ranking_doc_id_cmp(const void *p1, const void *p2)
Compare two fts_ranking_t instances doc_ids.
static const char * fts_get_suffix_5_7(ulint selected)
Return the selected FTS aux index suffix in 5.7 compatible format.
static ulint fts_select_index(const CHARSET_INFO *cs, const byte *str, ulint len)
Select the FTS auxiliary index for the given character.
pars_user_func_cb_t fts_sql_callback
Callbacks used within FTS.
Definition: fts0types.h:46
static int fts_update_doc_id_cmp(const void *p1, const void *p2)
Compare two fts_update_t instances doc_ids.
Full text search types.
Full text variable length integer encoding/decoding.
File-based utilities.
std::string str(const mysqlrouter::ConfigGenerator::Options::Endpoint &ep)
Definition: config_generator.cc:1063
Definition: buf0block_hint.cc:29
Definition: commit_order_queue.h:33
SQL parser.
bool(* pars_user_func_cb_t)(void *arg, void *user_arg)
Type of the user functions.
Definition: pars0pars.h:49
Query graph global types.
Definition: m_ctype.h:382
Data structure for an index.
Definition: dict0mem.h:1045
Data structure for a database table.
Definition: dict0mem.h:1908
The cache for the FTS system.
Definition: fts0types.h:143
rw_lock_t lock
lock protecting all access to the memory buffer.
Definition: fts0types.h:145
fts_sync_t * sync
sync structure to sync data to disk
Definition: fts0types.h:177
ib_vector_t * get_docs
information required to read the document from the table.
Definition: fts0types.h:168
ib_alloc_t * sync_heap
The heap allocator, for indexes and deleted_doc_ids, ie.
Definition: fts0types.h:179
ulint deleted
Number of doc ids deleted since last optimized.
Definition: fts0types.h:197
uint64_t total_size_before_sync
total size of fts cache, when last SYNC request was sent
Definition: fts0types.h:175
rw_lock_t init_lock
lock used for the cache initialization, it has different SYNC level as above cache lock
Definition: fts0types.h:149
mem_heap_t * cache_heap
Cache Heap.
Definition: fts0types.h:206
ulint total_size
total size consumed by the ilist field of all nodes.
Definition: fts0types.h:172
doc_id_t first_doc_id
first doc id since this table was opened
Definition: fts0types.h:194
doc_id_t synced_doc_id
Doc ID sync-ed to CONFIG table.
Definition: fts0types.h:192
ulint added
Number of doc ids added since last optimized.
Definition: fts0types.h:201
ib_alloc_t * self_heap
This heap is the heap out of which an instance of the cache itself was created.
Definition: fts0types.h:184
ib_mutex_t deleted_lock
Lock covering deleted_doc_ids.
Definition: fts0types.h:156
ib_vector_t * deleted_doc_ids
Array of deleted doc ids, each element is of type fts_update_t.
Definition: fts0types.h:160
fts_stopword_t stopword_info
Cached stopwords for the FTS.
Definition: fts0types.h:205
ib_vector_t * indexes
We store the stats and inverted index for the individual FTS indexes in this vector.
Definition: fts0types.h:163
ib_mutex_t optimize_lock
Lock for OPTIMIZE.
Definition: fts0types.h:154
doc_id_t next_doc_id
Next doc id.
Definition: fts0types.h:190
ib_mutex_t doc_id_lock
Lock covering Doc ID.
Definition: fts0types.h:158
Statistics relevant to a particular document, used during retrieval.
Definition: fts0types.h:53
doc_id_t doc_id
Document id.
Definition: fts0types.h:54
ulint word_count
Total words in the document.
Definition: fts0types.h:55
This type represents a single document.
Definition: fts0types.h:267
fts_string_t text
document text
Definition: fts0types.h:268
ib_alloc_t * self_heap
An instance of this type is allocated from this heap along with any objects that have the same lifesp...
Definition: fts0types.h:278
ib_rbt_t * tokens
This is filled when the document is tokenized.
Definition: fts0types.h:273
bool found
true if the document was found successfully in the database
Definition: fts0types.h:270
bool is_ngram
Whether it is a ngram parser.
Definition: fts0types.h:287
CHARSET_INFO * charset
Document's charset info.
Definition: fts0types.h:283
ib_rbt_t * stopwords
Stopwords.
Definition: fts0types.h:289
st_mysql_ftparser * parser
fts plugin parser
Definition: fts0types.h:285
Callback for reading and filtering nodes that are read from FTS index.
Definition: fts0types.h:250
void * read_arg
Arg for the sql_callback.
Definition: fts0types.h:251
fts_sql_callback read_record
Callback for reading index record.
Definition: fts0types.h:253
ulint total_memory
Total memory used.
Definition: fts0types.h:255
Its main purpose is to store the SQL prepared statements that are required to retrieve a document fr...
Definition: fts0types.h:60
fts_cache_t * cache
The parent cache.
Definition: fts0types.h:65
que_t * get_document_graph
Definition: fts0types.h:64
fts_index_cache_t * index_cache
The index cache instance.
Definition: fts0types.h:61
Since we can have multiple FTS indexes on a table, we keep a per index cache of words etc.
Definition: fts0types.h:70
que_t ** sel_graph
Select query graphs.
Definition: fts0types.h:85
que_t ** ins_graph
Insert query graphs.
Definition: fts0types.h:83
dict_index_t * index
The FTS index instance.
Definition: fts0types.h:71
ib_vector_t * doc_stats
Array of the fts_doc_stats_t contained in the memory buffer.
Definition: fts0types.h:76
ib_rbt_t * words
Nodes; indexed by fts_string_t*, cells are fts_tokenizer_word_t*.
Definition: fts0types.h:73
CHARSET_INFO * charset
charset
Definition: fts0types.h:86
For horizontally splitting an FTS auxiliary index.
Definition: fts0types.h:259
ulint value
Character value at which to split.
Definition: fts0types.h:260
const char * suffix
FTS aux index suffix.
Definition: fts0types.h:263
Columns of the FTS auxiliary INDEX table.
Definition: fts0types.h:210
bool synced
flag whether the node is synced
Definition: fts0types.h:230
ulint ilist_size_alloc
Allocated size of ilist in bytes.
Definition: fts0types.h:227
byte * ilist
Binary list of documents & word positions the token appears in.
Definition: fts0types.h:215
ulint doc_count
Number of doc ids in ilist.
Definition: fts0types.h:223
doc_id_t last_doc_id
Last document id in ilist.
Definition: fts0types.h:213
ulint ilist_size
Used size of ilist in bytes.
Definition: fts0types.h:225
doc_id_t first_doc_id
First document id in ilist.
Definition: fts0types.h:211
Stop word control information.
Definition: fts0types.h:104
ulint status
Status of the stopword tree.
Definition: fts0types.h:105
CHARSET_INFO * charset
charset for stopword
Definition: fts0types.h:108
ib_rbt_t * cached_stopword
This stores all active stopwords.
Definition: fts0types.h:107
ib_alloc_t * heap
The memory allocator to use.
Definition: fts0types.h:106
A UTF-16 or UTF-8 string.
Definition: fts0fts.h:291
The SYNC state of the cache.
Definition: fts0types.h:113
bool unlock_cache
flag whether unlock cache when write fts node
Definition: fts0types.h:135
ulint upper_index
max index of the doc id vector to add to the FTS cache
Definition: fts0types.h:123
bool interrupted
true if SYNC was interrupted
Definition: fts0types.h:125
ulint max_cache_size
Max size in bytes of the cache.
Definition: fts0types.h:117
std::chrono::steady_clock::time_point start_time
SYNC start time.
Definition: fts0types.h:132
doc_id_t min_doc_id
The smallest doc id added to the cache.
Definition: fts0types.h:126
dict_table_t * table
Table with FTS index(es)
Definition: fts0types.h:116
doc_id_t max_doc_id
The doc id at which the cache was noted as being full, we use this to set the upper_limit field.
Definition: fts0types.h:129
os_event_t event
sync finish event
Definition: fts0types.h:137
ulint lower_index
the start index of the doc id vector from where to start adding documents to the FTS cache
Definition: fts0types.h:120
bool in_progress
flag whether sync is in progress.
Definition: fts0types.h:134
trx_t * trx
The transaction used for SYNCing the cache to disk.
Definition: fts0types.h:114
bool cache_full
flag, when true it indicates that we need to sync the cache to disk
Definition: fts0types.h:118
A token and its positions within a document.
Definition: fts0types.h:293
fts_string_t text
token text
Definition: fts0types.h:294
ib_vector_t * positions
an array of the positions the token is found in; each item is actually an ulint.
Definition: fts0types.h:296
A tokenizer word.
Definition: fts0types.h:234
ib_vector_t * nodes
Word node ilists, each element is of type fts_node_t.
Definition: fts0types.h:237
fts_string_t text
Token text.
Definition: fts0types.h:235
For supporting the tracking of updates on multiple FTS indexes we need to track which FTS indexes nee...
Definition: fts0types.h:92
ib_vector_t * fts_indexes
The FTS indexes that need to be updated.
Definition: fts0types.h:95
doc_id_t doc_id
The doc id affected.
Definition: fts0types.h:93
Word text plus its array of nodes as on disk in FTS index.
Definition: fts0types.h:242
ib_alloc_t * heap_alloc
For handling all allocations.
Definition: fts0types.h:246
fts_string_t text
Word value in UTF-8.
Definition: fts0types.h:243
ib_vector_t * nodes
Nodes read from disk.
Definition: fts0types.h:244
Definition: ut0vec.h:203
Red black tree instance.
Definition: ut0rbt.h:71
Definition: ut0vec.h:212
The info structure stored at the beginning of a heap block.
Definition: mem0mem.h:301
InnoDB condition variable.
Definition: os0event.cc:62
Definition: que0que.h:300
The structure used in the spin lock implementation of a read-write lock.
Definition: sync0rw.h:359
Definition: plugin_ftparser.h:211
Definition: trx0trx.h:680
Version control for database, common definitions, and include files.
unsigned long int ulint
Definition: univ.i:407
Utilities for byte operations.
Various utilities.