MySQL 8.3.0
Source Code Documentation
fts0types.h
Go to the documentation of this file.
1/*****************************************************************************
2
3Copyright (c) 2007, 2023, Oracle and/or its affiliates.
4
5This program is free software; you can redistribute it and/or modify it under
6the terms of the GNU General Public License, version 2.0, as published by the
7Free Software Foundation.
8
9This program is also distributed with certain software (including but not
10limited to OpenSSL) that is licensed under separate terms, as designated in a
11particular file or component or in included license documentation. The authors
12of MySQL hereby grant you an additional permission to link the program and
13your derivative works with the separately licensed software that they have
14included with MySQL.
15
16This program is distributed in the hope that it will be useful, but WITHOUT
17ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
18FOR A PARTICULAR PURPOSE. See the GNU General Public License, version 2.0,
19for more details.
20
21You should have received a copy of the GNU General Public License along with
22this program; if not, write to the Free Software Foundation, Inc.,
2351 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
24
25*****************************************************************************/
26
27/** @file include/fts0types.h
28 Full text search types file
29
30 Created 2007-03-27 Sunny Bains
31 *******************************************************/
32
33#ifndef INNOBASE_FTS0TYPES_H
34#define INNOBASE_FTS0TYPES_H
35
36#include "fts0fts.h"
37#include "fut0fut.h"
38#include "pars0pars.h"
39#include "que0types.h"
40#include "univ.i"
41#include "ut0byte.h"
42#include "ut0rbt.h"
43
44struct CHARSET_INFO;
45
46/** Types used within FTS. */
47struct fts_que_t;
48struct fts_node_t;
49
50/** Callbacks used within FTS. */
52typedef void (*fts_filter)(void *, fts_node_t *, void *, ulint len);
53
54/** Statistics relevant to a particular document, used during retrieval. */
56 doc_id_t doc_id; /*!< Document id */
57 ulint word_count; /*!< Total words in the document */
58};
59
60/** It's main purpose is to store the SQL prepared statements that
61are required to retrieve a document from the database. */
63 fts_index_cache_t *index_cache; /*!< The index cache instance */
64
65 /*!< Parsed sql statement */
67 fts_cache_t *cache; /*!< The parent cache */
68};
69
70/** Since we can have multiple FTS indexes on a table, we keep a
71per index cache of words etc. */
73 dict_index_t *index; /*!< The FTS index instance */
74
75 ib_rbt_t *words; /*!< Nodes; indexed by fts_string_t*,
76 cells are fts_tokenizer_word_t*.*/
77
78 ib_vector_t *doc_stats; /*!< Array of the fts_doc_stats_t
79 contained in the memory buffer.
80 Must be in sorted order (ascending).
81 The ideal choice is an rb tree but
82 the rb tree imposes a space overhead
83 that we can do without */
84
85 que_t **ins_graph; /*!< Insert query graphs */
86
87 que_t **sel_graph; /*!< Select query graphs */
88 CHARSET_INFO *charset; /*!< charset */
89};
90
91/** For supporting the tracking of updates on multiple FTS indexes we need
92to track which FTS indexes need to be updated. For INSERT and DELETE we
93update all fts indexes. */
95 doc_id_t doc_id; /*!< The doc id affected */
96
97 ib_vector_t *fts_indexes; /*!< The FTS indexes that need to be
98 updated. A NULL value means all
99 indexes need to be updated. This
100 vector is not allocated on the heap
101 and so must be freed explicitly,
102 when we are done with it */
103};
104
105/** Stop word control infotmation. */
107 ulint status; /*!< Status of the stopword tree */
108 ib_alloc_t *heap; /*!< The memory allocator to use */
109 ib_rbt_t *cached_stopword; /*!< This stores all active stopwords */
110 CHARSET_INFO *charset; /*!< charset for stopword */
111};
112
113/** The SYNC state of the cache. There is one instance of this struct
114associated with each ADD thread. */
116 trx_t *trx; /*!< The transaction used for SYNCing
117 the cache to disk */
118 dict_table_t *table; /*!< Table with FTS index(es) */
119 ulint max_cache_size; /*!< Max size in bytes of the cache */
120 bool cache_full; /*!< flag, when true it indicates that
121 we need to sync the cache to disk */
122 ulint lower_index; /*!< the start index of the doc id
123 vector from where to start adding
124 documents to the FTS cache */
125 ulint upper_index; /*!< max index of the doc id vector to
126 add to the FTS cache */
127 bool interrupted; /*!< true if SYNC was interrupted */
128 doc_id_t min_doc_id; /*!< The smallest doc id added to the
129 cache. It should equal to
130 doc_ids[lower_index] */
131 doc_id_t max_doc_id; /*!< The doc id at which the cache was
132 noted as being full, we use this to
133 set the upper_limit field */
134 std::chrono::steady_clock::time_point start_time;
135 /*!< SYNC start time */
136 bool in_progress; /*!< flag whether sync is in progress.*/
137 bool unlock_cache; /*!< flag whether unlock cache when
138 write fts node */
139 os_event_t event; /*!< sync finish event */
140};
141
142/** The cache for the FTS system. It is a memory-based inverted index
143that new entries are added to, until it grows over the configured maximum
144size, at which time its contents are written to the INDEX table. */
146#ifndef UNIV_HOTBACKUP
147 rw_lock_t lock; /*!< lock protecting all access to the
148 memory buffer. FIXME: this needs to
149 be our new upgrade-capable rw-lock */
150
151 rw_lock_t init_lock; /*!< lock used for the cache
152 initialization, it has different
153 SYNC level as above cache lock */
154#endif /* !UNIV_HOTBACKUP */
155
156 ib_mutex_t optimize_lock; /*!< Lock for OPTIMIZE */
157
158 ib_mutex_t deleted_lock; /*!< Lock covering deleted_doc_ids */
159
160 ib_mutex_t doc_id_lock; /*!< Lock covering Doc ID */
161
162 ib_vector_t *deleted_doc_ids; /*!< Array of deleted doc ids, each
163 element is of type fts_update_t */
164
165 ib_vector_t *indexes; /*!< We store the stats and inverted
166 index for the individual FTS indexes
167 in this vector. Each element is
168 an instance of fts_index_cache_t */
169
170 ib_vector_t *get_docs; /*!< information required to read
171 the document from the table. Each
172 element is of type fts_doc_t */
173
174 ulint total_size; /*!< total size consumed by the ilist
175 field of all nodes. SYNC is run
176 whenever this gets too big */
177 uint64_t total_size_before_sync; /*!< total size of fts cache,
178 when last SYNC request was sent */
179 fts_sync_t *sync; /*!< sync structure to sync data to
180 disk */
181 ib_alloc_t *sync_heap; /*!< The heap allocator, for indexes
182 and deleted_doc_ids, ie. transient
183 objects, they are recreated after
184 a SYNC is completed */
185
186 ib_alloc_t *self_heap; /*!< This heap is the heap out of
187 which an instance of the cache itself
188 was created. Objects created using
189 this heap will last for the lifetime
190 of the cache */
191
192 doc_id_t next_doc_id; /*!< Next doc id */
193
194 doc_id_t synced_doc_id; /*!< Doc ID sync-ed to CONFIG table */
195
196 doc_id_t first_doc_id; /*!< first doc id since this table
197 was opened */
198
199 ulint deleted; /*!< Number of doc ids deleted since
200 last optimized. This variable is
201 covered by deleted_lock */
202
203 ulint added; /*!< Number of doc ids added since last
204 optimized. This variable is covered by
205 the deleted lock */
206
207 fts_stopword_t stopword_info; /*!< Cached stopwords for the FTS */
208 mem_heap_t *cache_heap; /*!< Cache Heap */
209};
210
211/** Columns of the FTS auxiliary INDEX table */
213 doc_id_t first_doc_id; /*!< First document id in ilist. */
214
215 doc_id_t last_doc_id; /*!< Last document id in ilist. */
216
217 byte *ilist; /*!< Binary list of documents & word
218 positions the token appears in.
219 TODO: For now, these are simply
220 ut_malloc'd, but if testing shows
221 that they waste memory unacceptably, a
222 special memory allocator will have
223 to be written */
224
225 ulint doc_count; /*!< Number of doc ids in ilist */
226
227 ulint ilist_size; /*!< Used size of ilist in bytes. */
228
230 /*!< Allocated size of ilist in
231 bytes */
232 bool synced; /*!< flag whether the node is synced */
233};
234
235/** A tokenizer word. Contains information about one word. */
237 fts_string_t text; /*!< Token text. */
238
239 ib_vector_t *nodes; /*!< Word node ilists, each element is
240 of type fts_node_t */
241};
242
243/** Word text plus it's array of nodes as on disk in FTS index */
245 fts_string_t text; /*!< Word value in UTF-8 */
246 ib_vector_t *nodes; /*!< Nodes read from disk */
247
248 ib_alloc_t *heap_alloc; /*!< For handling all allocations */
249};
250
251/** Callback for reading and filtering nodes that are read from FTS index */
253 void *read_arg; /*!< Arg for the sql_callback */
254
255 fts_sql_callback read_record; /*!< Callback for reading index
256 record */
257 ulint total_memory; /*!< Total memory used */
258};
259
260/** For horizontally splitting an FTS auxiliary index */
262 ulint value; /*!< Character value at which
263 to split */
264
265 const char *suffix; /*!< FTS aux index suffix */
266};
267
268/** This type represents a single document field.
269 When fulltext index spans multiple columns, the
270 entire document (all indexed text in a row)
271 is comprised of multiple fields, one per indexed
272 column. */
273struct fts_doc_t {
274 fts_string_t text; /*!< document text */
275
276 bool found; /*!< true if the document was found
277 successfully in the database */
278
279 ib_rbt_t *tokens; /*!< This is filled when the document
280 is tokenized. Tokens; indexed by
281 fts_string_t*, cells are of type
282 fts_token_t* */
283
284 ib_alloc_t *self_heap; /*!< An instance of this type is
285 allocated from this heap along
286 with any objects that have the
287 same lifespan, most notably
288 the vector of token positions */
289 CHARSET_INFO *charset; /*!< Document's charset info */
290
291 st_mysql_ftparser *parser; /*!< fts plugin parser */
292
293 bool is_ngram; /*!< Whether it is a ngram parser */
294
295 ib_rbt_t *stopwords; /*!< Stopwords */
296};
297
298/** A token and its positions within a document. */
300 fts_string_t text; /*!< token text */
301
302 ib_vector_t *positions; /*!< an array of the positions the
303 token is found in; each item is
304 actually an ulint. */
305};
306
307/** It's defined in fts/fts0fts.c */
309
310/** It's defined in fts/fts0fts.c */
312
313/** Compare two fts_trx_row_t instances doc_ids.
314@param[in] p1 id1
315@param[in] p2 id2
316@return < 0 if n1 < n2, < 0 if n1 < n2, > 0 if n1 > n2 */
317static inline int fts_trx_row_doc_id_cmp(const void *p1, const void *p2);
318
319/** Compare two fts_ranking_t instances doc_ids.
320@param[in] p1 id1
321@param[in] p2 id2
322@return < 0 if n1 < n2, < 0 if n1 < n2, > 0 if n1 > n2 */
323static inline int fts_ranking_doc_id_cmp(const void *p1, const void *p2);
324
325/** Compare two fts_update_t instances doc_ids.
326@param[in] p1 id1
327@param[in] p2 id2
328@return < 0 if n1 < n2, < 0 if n1 < n2, > 0 if n1 > n2 */
329static inline int fts_update_doc_id_cmp(const void *p1, const void *p2);
330
331/** Decode and return the integer that was encoded using our VLC scheme.*/
332static inline ulint fts_decode_vlc(
333 /*!< out: value decoded */
334 byte **ptr); /*!< in: ptr to decode from, this ptr is
335 incremented by the number of bytes decoded */
336
337/** Duplicate a string.
338@param[in] dst dup to here
339@param[in] src src string
340@param[in] heap heap to use
341*/
342static inline void fts_string_dup(fts_string_t *dst, const fts_string_t *src,
343 mem_heap_t *heap);
344
345/** Return length of val if it were encoded using our VLC scheme. */
347 /*!< out: length of value
348 encoded, in bytes */
349 ulint val); /*!< in: value to encode */
350
351/** Encode an integer using our VLC scheme and return the length in bytes.
352@param[in] val value to encode
353@param[in] buf buffer, must have enough space
354@return length of value encoded, in bytes */
355static inline ulint fts_encode_int(ulint val, byte *buf);
356
357/** Get the selected FTS aux INDEX suffix. */
358static inline const char *fts_get_suffix(
359 ulint selected); /*!< in: selected index */
360
361/** Return the selected FTS aux index suffix in 5.7 compatible format
362@param[in] selected selected index
363@return the suffix name */
364static inline const char *fts_get_suffix_5_7(ulint selected);
365
366/** Select the FTS auxiliary index for the given character.
367@param[in] cs charset
368@param[in] str string
369@param[in] len string length in bytes
370@return the index to use for the string */
371static inline ulint fts_select_index(const CHARSET_INFO *cs, const byte *str,
372 ulint len);
373
374#include "fts0types.ic"
375#include "fts0vlc.ic"
376
377#endif /* INNOBASE_FTS0TYPES_H */
Full text search header file.
uint64_t doc_id_t
Document id type.
Definition: fts0fts.h:78
const fts_index_selector_t fts_index_selector_5_7[]
It's defined in fts/fts0fts.c.
Definition: fts0fts.cc:157
static ulint fts_decode_vlc(byte **ptr)
Decode and return the integer that was encoded using our VLC scheme.
static ulint fts_encode_int(ulint val, byte *buf)
Encode an integer using our VLC scheme and return the length in bytes.
static void fts_string_dup(fts_string_t *dst, const fts_string_t *src, mem_heap_t *heap)
Duplicate a string.
static ulint fts_get_encoded_len(ulint val)
Return length of val if it were encoded using our VLC scheme.
static int fts_trx_row_doc_id_cmp(const void *p1, const void *p2)
Compare two fts_trx_row_t instances doc_ids.
const fts_index_selector_t fts_index_selector[]
It's defined in fts/fts0fts.c.
Definition: fts0fts.cc:152
void(* fts_filter)(void *, fts_node_t *, void *, ulint len)
Definition: fts0types.h:52
static const char * fts_get_suffix(ulint selected)
Get the selected FTS aux INDEX suffix.
static int fts_ranking_doc_id_cmp(const void *p1, const void *p2)
Compare two fts_ranking_t instances doc_ids.
static const char * fts_get_suffix_5_7(ulint selected)
Return the selected FTS aux index suffix in 5.7 compatible format.
static ulint fts_select_index(const CHARSET_INFO *cs, const byte *str, ulint len)
Select the FTS auxiliary index for the given character.
pars_user_func_cb_t fts_sql_callback
Callbacks used within FTS.
Definition: fts0types.h:48
static int fts_update_doc_id_cmp(const void *p1, const void *p2)
Compare two fts_update_t instances doc_ids.
Full text search types.
Full text variable length integer encoding/decoding.
File-based utilities.
std::string str(const mysqlrouter::ConfigGenerator::Options::Endpoint &ep)
Definition: config_generator.cc:1065
Definition: buf0block_hint.cc:29
Definition: commit_order_queue.h:33
SQL parser.
bool(* pars_user_func_cb_t)(void *arg, void *user_arg)
Type of the user functions.
Definition: pars0pars.h:49
Query graph global types.
Definition: m_ctype.h:422
Data structure for an index.
Definition: dict0mem.h:1045
Data structure for a database table.
Definition: dict0mem.h:1908
The cache for the FTS system.
Definition: fts0types.h:145
rw_lock_t lock
lock protecting all access to the memory buffer.
Definition: fts0types.h:147
fts_sync_t * sync
sync structure to sync data to disk
Definition: fts0types.h:179
ib_vector_t * get_docs
information required to read the document from the table.
Definition: fts0types.h:170
ib_alloc_t * sync_heap
The heap allocator, for indexes and deleted_doc_ids, ie.
Definition: fts0types.h:181
ulint deleted
Number of doc ids deleted since last optimized.
Definition: fts0types.h:199
uint64_t total_size_before_sync
total size of fts cache, when last SYNC request was sent
Definition: fts0types.h:177
rw_lock_t init_lock
lock used for the cache initialization, it has different SYNC level as above cache lock
Definition: fts0types.h:151
mem_heap_t * cache_heap
Cache Heap.
Definition: fts0types.h:208
ulint total_size
total size consumed by the ilist field of all nodes.
Definition: fts0types.h:174
doc_id_t first_doc_id
first doc id since this table was opened
Definition: fts0types.h:196
doc_id_t synced_doc_id
Doc ID sync-ed to CONFIG table.
Definition: fts0types.h:194
ulint added
Number of doc ids added since last optimized.
Definition: fts0types.h:203
ib_alloc_t * self_heap
This heap is the heap out of which an instance of the cache itself was created.
Definition: fts0types.h:186
ib_mutex_t deleted_lock
Lock covering deleted_doc_ids.
Definition: fts0types.h:158
ib_vector_t * deleted_doc_ids
Array of deleted doc ids, each element is of type fts_update_t.
Definition: fts0types.h:162
fts_stopword_t stopword_info
Cached stopwords for the FTS.
Definition: fts0types.h:207
ib_vector_t * indexes
We store the stats and inverted index for the individual FTS indexes in this vector.
Definition: fts0types.h:165
ib_mutex_t optimize_lock
Lock for OPTIMIZE.
Definition: fts0types.h:156
doc_id_t next_doc_id
Next doc id.
Definition: fts0types.h:192
ib_mutex_t doc_id_lock
Lock covering Doc ID.
Definition: fts0types.h:160
Statistics relevant to a particular document, used during retrieval.
Definition: fts0types.h:55
doc_id_t doc_id
Document id.
Definition: fts0types.h:56
ulint word_count
Total words in the document.
Definition: fts0types.h:57
This type represents a single document field.
Definition: fts0types.h:273
fts_string_t text
document text
Definition: fts0types.h:274
ib_alloc_t * self_heap
An instance of this type is allocated from this heap along with any objects that have the same lifesp...
Definition: fts0types.h:284
ib_rbt_t * tokens
This is filled when the document is tokenized.
Definition: fts0types.h:279
bool found
true if the document was found successfully in the database
Definition: fts0types.h:276
bool is_ngram
Whether it is a ngram parser.
Definition: fts0types.h:293
CHARSET_INFO * charset
Document's charset info.
Definition: fts0types.h:289
ib_rbt_t * stopwords
Stopwords.
Definition: fts0types.h:295
st_mysql_ftparser * parser
fts plugin parser
Definition: fts0types.h:291
Callback for reading and filtering nodes that are read from FTS index.
Definition: fts0types.h:252
void * read_arg
Arg for the sql_callback.
Definition: fts0types.h:253
fts_sql_callback read_record
Callback for reading index record.
Definition: fts0types.h:255
ulint total_memory
Total memory used.
Definition: fts0types.h:257
It's main purpose is to store the SQL prepared statements that are required to retrieve a document fr...
Definition: fts0types.h:62
fts_cache_t * cache
The parent cache.
Definition: fts0types.h:67
que_t * get_document_graph
Definition: fts0types.h:66
fts_index_cache_t * index_cache
The index cache instance.
Definition: fts0types.h:63
Since we can have multiple FTS indexes on a table, we keep a per index cache of words etc.
Definition: fts0types.h:72
que_t ** sel_graph
Select query graphs.
Definition: fts0types.h:87
que_t ** ins_graph
Insert query graphs.
Definition: fts0types.h:85
dict_index_t * index
The FTS index instance.
Definition: fts0types.h:73
ib_vector_t * doc_stats
Array of the fts_doc_stats_t contained in the memory buffer.
Definition: fts0types.h:78
ib_rbt_t * words
Nodes; indexed by fts_string_t*, cells are fts_tokenizer_word_t*.
Definition: fts0types.h:75
CHARSET_INFO * charset
charset
Definition: fts0types.h:88
For horizontally splitting an FTS auxiliary index.
Definition: fts0types.h:261
ulint value
Character value at which to split.
Definition: fts0types.h:262
const char * suffix
FTS aux index suffix.
Definition: fts0types.h:265
Columns of the FTS auxiliary INDEX table.
Definition: fts0types.h:212
bool synced
flag whether the node is synced
Definition: fts0types.h:232
ulint ilist_size_alloc
Allocated size of ilist in bytes.
Definition: fts0types.h:229
byte * ilist
Binary list of documents & word positions the token appears in.
Definition: fts0types.h:217
ulint doc_count
Number of doc ids in ilist.
Definition: fts0types.h:225
doc_id_t last_doc_id
Last document id in ilist.
Definition: fts0types.h:215
ulint ilist_size
Used size of ilist in bytes.
Definition: fts0types.h:227
doc_id_t first_doc_id
First document id in ilist.
Definition: fts0types.h:213
Stop word control infotmation.
Definition: fts0types.h:106
ulint status
Status of the stopword tree.
Definition: fts0types.h:107
CHARSET_INFO * charset
charset for stopword
Definition: fts0types.h:110
ib_rbt_t * cached_stopword
This stores all active stopwords.
Definition: fts0types.h:109
ib_alloc_t * heap
The memory allocator to use.
Definition: fts0types.h:108
An UTF-16 ro UTF-8 string.
Definition: fts0fts.h:293
The SYNC state of the cache.
Definition: fts0types.h:115
bool unlock_cache
flag whether unlock cache when write fts node
Definition: fts0types.h:137
ulint upper_index
max index of the doc id vector to add to the FTS cache
Definition: fts0types.h:125
bool interrupted
true if SYNC was interrupted
Definition: fts0types.h:127
ulint max_cache_size
Max size in bytes of the cache.
Definition: fts0types.h:119
std::chrono::steady_clock::time_point start_time
SYNC start time.
Definition: fts0types.h:134
doc_id_t min_doc_id
The smallest doc id added to the cache.
Definition: fts0types.h:128
dict_table_t * table
Table with FTS index(es)
Definition: fts0types.h:118
doc_id_t max_doc_id
The doc id at which the cache was noted as being full, we use this to set the upper_limit field.
Definition: fts0types.h:131
os_event_t event
sync finish event
Definition: fts0types.h:139
ulint lower_index
the start index of the doc id vector from where to start adding documents to the FTS cache
Definition: fts0types.h:122
bool in_progress
flag whether sync is in progress.
Definition: fts0types.h:136
trx_t * trx
The transaction used for SYNCing the cache to disk.
Definition: fts0types.h:116
bool cache_full
flag, when true it indicates that we need to sync the cache to disk
Definition: fts0types.h:120
A token and its positions within a document.
Definition: fts0types.h:299
fts_string_t text
token text
Definition: fts0types.h:300
ib_vector_t * positions
an array of the positions the token is found in; each item is actually an ulint.
Definition: fts0types.h:302
A tokenizer word.
Definition: fts0types.h:236
ib_vector_t * nodes
Word node ilists, each element is of type fts_node_t.
Definition: fts0types.h:239
fts_string_t text
Token text.
Definition: fts0types.h:237
For supporting the tracking of updates on multiple FTS indexes we need to track which FTS indexes nee...
Definition: fts0types.h:94
ib_vector_t * fts_indexes
The FTS indexes that need to be updated.
Definition: fts0types.h:97
doc_id_t doc_id
The doc id affected.
Definition: fts0types.h:95
Word text plus it's array of nodes as on disk in FTS index.
Definition: fts0types.h:244
ib_alloc_t * heap_alloc
For handling all allocations.
Definition: fts0types.h:248
fts_string_t text
Word value in UTF-8.
Definition: fts0types.h:245
ib_vector_t * nodes
Nodes read from disk.
Definition: fts0types.h:246
Definition: ut0vec.h:203
Red black tree instance.
Definition: ut0rbt.h:71
Definition: ut0vec.h:212
The info structure stored at the beginning of a heap block.
Definition: mem0mem.h:301
InnoDB condition variable.
Definition: os0event.cc:62
Definition: que0que.h:300
The structure used in the spin lock implementation of a read-write lock.
Definition: sync0rw.h:362
Definition: plugin_ftparser.h:215
Definition: trx0trx.h:683
Version control for database, common definitions, and include files.
unsigned long int ulint
Definition: univ.i:405
Utilities for byte operations.
Various utilities.