MySQL 9.2.0
Source Code Documentation
fts0types.h
Go to the documentation of this file.
1/*****************************************************************************
2
3Copyright (c) 2007, 2024, Oracle and/or its affiliates.
4
5This program is free software; you can redistribute it and/or modify it under
6the terms of the GNU General Public License, version 2.0, as published by the
7Free Software Foundation.
8
9This program is designed to work with certain software (including
10but not limited to OpenSSL) that is licensed under separate terms,
11as designated in a particular file or component or in included license
12documentation. The authors of MySQL hereby grant you an additional
13permission to link the program and your derivative works with the
14separately licensed software that they have either included with
15the program or referenced in the documentation.
16
17This program is distributed in the hope that it will be useful, but WITHOUT
18ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
19FOR A PARTICULAR PURPOSE. See the GNU General Public License, version 2.0,
20for more details.
21
22You should have received a copy of the GNU General Public License along with
23this program; if not, write to the Free Software Foundation, Inc.,
2451 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
25
26*****************************************************************************/
27
28/** @file include/fts0types.h
29 Full text search types file
30
31 Created 2007-03-27 Sunny Bains
32 *******************************************************/
33
34#ifndef INNOBASE_FTS0TYPES_H
35#define INNOBASE_FTS0TYPES_H
36
37#include <cstdint>
38#include "fts0fts.h"
39#include "fut0fut.h"
40#include "pars0pars.h"
41#include "que0types.h"
42#include "univ.i"
43#include "ut0byte.h"
44#include "ut0rbt.h"
45
46struct CHARSET_INFO;
47
48/** Types used within FTS. */
49struct fts_que_t;
50struct fts_node_t;
51
52/** Callbacks used within FTS. */
54typedef void (*fts_filter)(void *, fts_node_t *, void *, ulint len);
55
56/** Statistics relevant to a particular document, used during retrieval. */
58 doc_id_t doc_id; /*!< Document id */
59 ulint word_count; /*!< Total words in the document */
60};
61
62/** Its main purpose is to store the SQL prepared statements that
63are required to retrieve a document from the database. */
65 fts_index_cache_t *index_cache; /*!< The index cache instance */
66
67 /*!< Parsed sql statement */
69 fts_cache_t *cache; /*!< The parent cache */
70};
71
72/** Since we can have multiple FTS indexes on a table, we keep a
73per index cache of words etc. */
75 dict_index_t *index; /*!< The FTS index instance */
76
77 ib_rbt_t *words; /*!< Nodes; indexed by fts_string_t*,
78 cells are fts_tokenizer_word_t*.*/
79
80 ib_vector_t *doc_stats; /*!< Array of the fts_doc_stats_t
81 contained in the memory buffer.
82 Must be in sorted order (ascending).
83 The ideal choice is an rb tree but
84 the rb tree imposes a space overhead
85 that we can do without */
86
87 que_t **ins_graph; /*!< Insert query graphs */
88
89 que_t **sel_graph; /*!< Select query graphs */
90 CHARSET_INFO *charset; /*!< charset */
91};
92
93/** For supporting the tracking of updates on multiple FTS indexes we need
94to track which FTS indexes need to be updated. For INSERT and DELETE we
95update all fts indexes. */
97 doc_id_t doc_id; /*!< The doc id affected */
98
99 ib_vector_t *fts_indexes; /*!< The FTS indexes that need to be
100 updated. A NULL value means all
101 indexes need to be updated. This
102 vector is not allocated on the heap
103 and so must be freed explicitly,
104 when we are done with it */
105};
106
107/** Stop word control information. */
109 ulint status; /*!< Status of the stopword tree */
110 ib_alloc_t *heap; /*!< The memory allocator to use */
111 ib_rbt_t *cached_stopword; /*!< This stores all active stopwords */
112 CHARSET_INFO *charset; /*!< charset for stopword */
113};
114
115/** The SYNC state of the cache. There is one instance of this struct
116associated with each ADD thread. */
118 trx_t *trx; /*!< The transaction used for SYNCing
119 the cache to disk */
120 dict_table_t *table; /*!< Table with FTS index(es) */
121 ulint max_cache_size; /*!< Max size in bytes of the cache */
122 bool cache_full; /*!< flag, when true it indicates that
123 we need to sync the cache to disk */
124 ulint lower_index; /*!< the start index of the doc id
125 vector from where to start adding
126 documents to the FTS cache */
127 ulint upper_index; /*!< max index of the doc id vector to
128 add to the FTS cache */
129 bool interrupted; /*!< true if SYNC was interrupted */
130 doc_id_t min_doc_id; /*!< The smallest doc id added to the
131 cache. It should be equal to
132 doc_ids[lower_index] */
133 doc_id_t max_doc_id; /*!< The doc id at which the cache was
134 noted as being full, we use this to
135 set the upper_index field */
136 std::chrono::steady_clock::time_point start_time;
137 /*!< SYNC start time */
138 bool in_progress; /*!< flag whether sync is in progress.*/
139 bool unlock_cache; /*!< flag whether unlock cache when
140 write fts node */
141 os_event_t event; /*!< sync finish event */
142};
143
144/** The cache for the FTS system. It is a memory-based inverted index
145that new entries are added to, until it grows over the configured maximum
146size, at which time its contents are written to the INDEX table. */
148#ifndef UNIV_HOTBACKUP
149 rw_lock_t lock; /*!< lock protecting all access to the
150 memory buffer. FIXME: this needs to
151 be our new upgrade-capable rw-lock */
152
153 rw_lock_t init_lock; /*!< lock used for the cache
154 initialization, it has different
155 SYNC level as above cache lock */
156#endif /* !UNIV_HOTBACKUP */
157
158 ib_mutex_t optimize_lock; /*!< Lock for OPTIMIZE */
159
160 ib_mutex_t deleted_lock; /*!< Lock covering deleted_doc_ids */
161
162 ib_mutex_t doc_id_lock; /*!< Lock covering Doc ID */
163
164 ib_vector_t *deleted_doc_ids; /*!< Array of deleted doc ids, each
165 element is of type fts_update_t */
166
167 ib_vector_t *indexes; /*!< We store the stats and inverted
168 index for the individual FTS indexes
169 in this vector. Each element is
170 an instance of fts_index_cache_t */
171
172 ib_vector_t *get_docs; /*!< information required to read
173 the document from the table. Each
174 element is of type fts_doc_t */
175
176 ulint total_size; /*!< total size consumed by the ilist
177 field of all nodes. SYNC is run
178 whenever this gets too big */
179 uint64_t total_size_before_sync; /*!< total size of fts cache,
180 when last SYNC request was sent */
181 fts_sync_t *sync; /*!< sync structure to sync data to
182 disk */
183 ib_alloc_t *sync_heap; /*!< The heap allocator, for indexes
184 and deleted_doc_ids, ie. transient
185 objects, they are recreated after
186 a SYNC is completed */
187
188 ib_alloc_t *self_heap; /*!< This heap is the heap out of
189 which an instance of the cache itself
190 was created. Objects created using
191 this heap will last for the lifetime
192 of the cache */
193
194 doc_id_t next_doc_id; /*!< Next doc id */
195
196 doc_id_t synced_doc_id; /*!< Doc ID sync-ed to CONFIG table */
197
198 doc_id_t first_doc_id; /*!< first doc id since this table
199 was opened */
200
201 ulint deleted; /*!< Number of doc ids deleted since
202 last optimized. This variable is
203 covered by deleted_lock */
204
205 ulint added; /*!< Number of doc ids added since last
206 optimized. This variable is covered by
207 deleted_lock */
208
209 fts_stopword_t stopword_info; /*!< Cached stopwords for the FTS */
210 mem_heap_t *cache_heap; /*!< Cache Heap */
211};
212
213/** Columns of the FTS auxiliary INDEX table */
215 doc_id_t first_doc_id; /*!< First document id in ilist. */
216
217 doc_id_t last_doc_id; /*!< Last document id in ilist. */
218
219 byte *ilist; /*!< Binary list of documents & word
220 positions the token appears in.
221 TODO: For now, these are simply
222 ut_malloc'd, but if testing shows
223 that they waste memory unacceptably, a
224 special memory allocator will have
225 to be written */
226
227 ulint doc_count; /*!< Number of doc ids in ilist */
228
229 ulint ilist_size; /*!< Used size of ilist in bytes. */
230
232 /*!< Allocated size of ilist in
233 bytes */
234 bool synced; /*!< flag whether the node is synced */
235};
236
237/** A tokenizer word. Contains information about one word. */
239 fts_string_t text; /*!< Token text. */
240
241 ib_vector_t *nodes; /*!< Word node ilists, each element is
242 of type fts_node_t */
243};
244
245/** Word text plus its array of nodes as on disk in FTS index */
247 fts_string_t text; /*!< Word value in UTF-8 */
248 ib_vector_t *nodes; /*!< Nodes read from disk */
249
250 ib_alloc_t *heap_alloc; /*!< For handling all allocations */
251};
252
253/** Callback for reading and filtering nodes that are read from FTS index */
255 void *read_arg; /*!< Arg for the sql_callback */
256
257 fts_sql_callback read_record; /*!< Callback for reading index
258 record */
259 ulint total_memory; /*!< Total memory used */
260};
261
262/** For horizontally splitting an FTS auxiliary index */
264 ulint value; /*!< Character value at which
265 to split */
266
267 const char *suffix; /*!< FTS aux index suffix */
268};
269
270/** This type represents a single document field.
271 When a fulltext index spans multiple columns, the
272 entire document (all indexed text in a row)
273 comprises multiple fields, one per indexed
274 column. */
275struct fts_doc_t {
276 fts_string_t text; /*!< Text of this document field */
277
278 bool found; /*!< true if the document was found
279 successfully in the database */
280
281 ib_rbt_t *tokens; /*!< Filled in when the document
282 is tokenized. Tokens; indexed by
283 fts_string_t*, cells are of type
284 fts_token_t* */
285
286 ib_alloc_t *self_heap; /*!< An instance of this type is
287 allocated from this heap along
288 with any objects that have the
289 same lifespan, most notably
290 the vector of token positions */
291 CHARSET_INFO *charset; /*!< Document's charset info */
292
293 st_mysql_ftparser *parser; /*!< FTS plugin parser */
294
295 bool is_ngram; /*!< Whether the parser is an ngram parser */
296
297 ib_rbt_t *stopwords; /*!< Stopwords */
298};
299
300/** A token and its positions within a document. */
302 fts_string_t text; /*!< token text */
303
304 ib_vector_t *positions; /*!< an array of the positions the
305 token is found in; each item is
306 actually an ulint. */
307};
308
309/** Defined in fts/fts0fts.cc */
311
312/** Defined in fts/fts0fts.cc */
314
315/** Decode and return the integer that was encoded using our VLC scheme.
316@param[in,out] ptr pointer to decode from; this pointer is advanced
317 by the number of bytes decoded
318@return the decoded value */
319inline uint64_t fts_decode_vlc(byte **ptr);
320
321/** Duplicate a string.
322@param[out] dst destination string, receives the duplicate
323@param[in] src source string to duplicate
324@param[in] heap heap to use
325*/
326inline void fts_string_dup(fts_string_t *dst, const fts_string_t *src,
327 mem_heap_t *heap);
328
329/** Return the length of val if it were encoded using our VLC scheme.
330@param[in] val value whose encoded length is wanted
331@return length of the encoded value, in bytes */
332inline unsigned int fts_get_encoded_len(uint64_t val);
333
334/** Encode an integer using our VLC scheme and return the length in bytes.
335@param[in] val value to encode
336@param[out] buf buffer that receives the encoding, must have enough space
337@return length of the encoded value, in bytes */
338inline unsigned int fts_encode_int(uint64_t val, byte *buf);
339
340/** Get the FTS aux INDEX suffix for the selected index. */
341inline const char *fts_get_suffix(ulint selected); /*!< in: selected index */
342
343/** Return the selected FTS aux index suffix in 5.7 compatible format.
344@param[in] selected selected index
345@return the suffix name in 5.7 compatible format */
346inline const char *fts_get_suffix_5_7(ulint selected);
347
348/** Select the FTS auxiliary table for the given character.
349@param[in] cs charset
350@param[in] str string
351@param[in] len string length in bytes
352@return the auxiliary table number to use for the string, zero-based */
353inline ulint fts_select_index(const CHARSET_INFO *cs, const byte *str,
354 ulint len);
355
356#include "fts0types.ic"
357#include "fts0vlc.ic"
358
359#endif /* INNOBASE_FTS0TYPES_H */
Full text search header file.
uint64_t doc_id_t
Document id type.
Definition: fts0fts.h:79
uint64_t fts_decode_vlc(byte **ptr)
Decode and return the integer that was encoded using our VLC scheme.
Definition: fts0vlc.ic:91
unsigned int fts_get_encoded_len(uint64_t val)
Return length of val if it were encoded using our VLC scheme.
Definition: fts0vlc.ic:41
const fts_index_selector_t fts_index_selector_5_7[]
Defined in fts/fts0fts.cc.
Definition: fts0fts.cc:158
unsigned int fts_encode_int(uint64_t val, byte *buf)
Encode an integer using our VLC scheme and return the length in bytes.
Definition: fts0vlc.ic:54
const fts_index_selector_t fts_index_selector[]
Defined in fts/fts0fts.cc.
Definition: fts0fts.cc:153
void(* fts_filter)(void *, fts_node_t *, void *, ulint len)
Definition: fts0types.h:54
ulint fts_select_index(const CHARSET_INFO *cs, const byte *str, ulint len)
Select the FTS auxiliary table for the given character.
Definition: fts0types.ic:170
const char * fts_get_suffix(ulint selected)
Get the selected FTS aux INDEX suffix.
Definition: fts0types.ic:194
pars_user_func_cb_t fts_sql_callback
Callbacks used within FTS.
Definition: fts0types.h:50
const char * fts_get_suffix_5_7(ulint selected)
Return the selected FTS aux index suffix in 5.7 compatible format.
Definition: fts0types.ic:202
void fts_string_dup(fts_string_t *dst, const fts_string_t *src, mem_heap_t *heap)
Duplicate a string.
Definition: fts0types.ic:41
Full text search types.
Full text variable length integer encoding/decoding.
File-based utilities.
std::string str(const mysqlrouter::ConfigGenerator::Options::Endpoint &ep)
Definition: config_generator.cc:1117
Definition: buf0block_hint.cc:30
Definition: commit_order_queue.h:34
SQL parser.
bool(* pars_user_func_cb_t)(void *arg, void *user_arg)
Type of the user functions.
Definition: pars0pars.h:50
Query graph global types.
Definition: m_ctype.h:421
Data structure for an index.
Definition: dict0mem.h:1041
Data structure for a database table.
Definition: dict0mem.h:1913
The cache for the FTS system.
Definition: fts0types.h:147
rw_lock_t lock
lock protecting all access to the memory buffer.
Definition: fts0types.h:149
fts_sync_t * sync
sync structure to sync data to disk
Definition: fts0types.h:181
ib_vector_t * get_docs
information required to read the document from the table.
Definition: fts0types.h:172
ib_alloc_t * sync_heap
The heap allocator, for indexes and deleted_doc_ids, ie.
Definition: fts0types.h:183
ulint deleted
Number of doc ids deleted since last optimized.
Definition: fts0types.h:201
uint64_t total_size_before_sync
total size of fts cache, when last SYNC request was sent
Definition: fts0types.h:179
rw_lock_t init_lock
lock used for the cache initialization, it has different SYNC level as above cache lock
Definition: fts0types.h:153
mem_heap_t * cache_heap
Cache Heap.
Definition: fts0types.h:210
ulint total_size
total size consumed by the ilist field of all nodes.
Definition: fts0types.h:176
doc_id_t first_doc_id
first doc id since this table was opened
Definition: fts0types.h:198
doc_id_t synced_doc_id
Doc ID sync-ed to CONFIG table.
Definition: fts0types.h:196
ulint added
Number of doc ids added since last optimized.
Definition: fts0types.h:205
ib_alloc_t * self_heap
This heap is the heap out of which an instance of the cache itself was created.
Definition: fts0types.h:188
ib_mutex_t deleted_lock
Lock covering deleted_doc_ids.
Definition: fts0types.h:160
ib_vector_t * deleted_doc_ids
Array of deleted doc ids, each element is of type fts_update_t.
Definition: fts0types.h:164
fts_stopword_t stopword_info
Cached stopwords for the FTS.
Definition: fts0types.h:209
ib_vector_t * indexes
We store the stats and inverted index for the individual FTS indexes in this vector.
Definition: fts0types.h:167
ib_mutex_t optimize_lock
Lock for OPTIMIZE.
Definition: fts0types.h:158
doc_id_t next_doc_id
Next doc id.
Definition: fts0types.h:194
ib_mutex_t doc_id_lock
Lock covering Doc ID.
Definition: fts0types.h:162
Statistics relevant to a particular document, used during retrieval.
Definition: fts0types.h:57
doc_id_t doc_id
Document id.
Definition: fts0types.h:58
ulint word_count
Total words in the document.
Definition: fts0types.h:59
This type represents a single document field.
Definition: fts0types.h:275
fts_string_t text
document text
Definition: fts0types.h:276
ib_alloc_t * self_heap
An instance of this type is allocated from this heap along with any objects that have the same lifesp...
Definition: fts0types.h:286
ib_rbt_t * tokens
This is filled when the document is tokenized.
Definition: fts0types.h:281
bool found
true if the document was found successfully in the database
Definition: fts0types.h:278
bool is_ngram
Whether it is an ngram parser.
Definition: fts0types.h:295
CHARSET_INFO * charset
Document's charset info.
Definition: fts0types.h:291
ib_rbt_t * stopwords
Stopwords.
Definition: fts0types.h:297
st_mysql_ftparser * parser
fts plugin parser
Definition: fts0types.h:293
Callback for reading and filtering nodes that are read from FTS index.
Definition: fts0types.h:254
void * read_arg
Arg for the sql_callback.
Definition: fts0types.h:255
fts_sql_callback read_record
Callback for reading index record.
Definition: fts0types.h:257
ulint total_memory
Total memory used.
Definition: fts0types.h:259
Its main purpose is to store the SQL prepared statements that are required to retrieve a document fr...
Definition: fts0types.h:64
fts_cache_t * cache
The parent cache.
Definition: fts0types.h:69
que_t * get_document_graph
Definition: fts0types.h:68
fts_index_cache_t * index_cache
The index cache instance.
Definition: fts0types.h:65
Since we can have multiple FTS indexes on a table, we keep a per index cache of words etc.
Definition: fts0types.h:74
que_t ** sel_graph
Select query graphs.
Definition: fts0types.h:89
que_t ** ins_graph
Insert query graphs.
Definition: fts0types.h:87
dict_index_t * index
The FTS index instance.
Definition: fts0types.h:75
ib_vector_t * doc_stats
Array of the fts_doc_stats_t contained in the memory buffer.
Definition: fts0types.h:80
ib_rbt_t * words
Nodes; indexed by fts_string_t*, cells are fts_tokenizer_word_t*.
Definition: fts0types.h:77
CHARSET_INFO * charset
charset
Definition: fts0types.h:90
For horizontally splitting an FTS auxiliary index.
Definition: fts0types.h:263
ulint value
Character value at which to split.
Definition: fts0types.h:264
const char * suffix
FTS aux index suffix.
Definition: fts0types.h:267
Columns of the FTS auxiliary INDEX table.
Definition: fts0types.h:214
bool synced
flag whether the node is synced
Definition: fts0types.h:234
ulint ilist_size_alloc
Allocated size of ilist in bytes.
Definition: fts0types.h:231
byte * ilist
Binary list of documents & word positions the token appears in.
Definition: fts0types.h:219
ulint doc_count
Number of doc ids in ilist.
Definition: fts0types.h:227
doc_id_t last_doc_id
Last document id in ilist.
Definition: fts0types.h:217
ulint ilist_size
Used size of ilist in bytes.
Definition: fts0types.h:229
doc_id_t first_doc_id
First document id in ilist.
Definition: fts0types.h:215
Stop word control information.
Definition: fts0types.h:108
ulint status
Status of the stopword tree.
Definition: fts0types.h:109
CHARSET_INFO * charset
charset for stopword
Definition: fts0types.h:112
ib_rbt_t * cached_stopword
This stores all active stopwords.
Definition: fts0types.h:111
ib_alloc_t * heap
The memory allocator to use.
Definition: fts0types.h:110
A UTF-16 or UTF-8 string.
Definition: fts0fts.h:294
The SYNC state of the cache.
Definition: fts0types.h:117
bool unlock_cache
flag whether unlock cache when write fts node
Definition: fts0types.h:139
ulint upper_index
max index of the doc id vector to add to the FTS cache
Definition: fts0types.h:127
bool interrupted
true if SYNC was interrupted
Definition: fts0types.h:129
ulint max_cache_size
Max size in bytes of the cache.
Definition: fts0types.h:121
std::chrono::steady_clock::time_point start_time
SYNC start time.
Definition: fts0types.h:136
doc_id_t min_doc_id
The smallest doc id added to the cache.
Definition: fts0types.h:130
dict_table_t * table
Table with FTS index(es)
Definition: fts0types.h:120
doc_id_t max_doc_id
The doc id at which the cache was noted as being full, we use this to set the upper_index field.
Definition: fts0types.h:133
os_event_t event
sync finish event
Definition: fts0types.h:141
ulint lower_index
the start index of the doc id vector from where to start adding documents to the FTS cache
Definition: fts0types.h:124
bool in_progress
flag whether sync is in progress.
Definition: fts0types.h:138
trx_t * trx
The transaction used for SYNCing the cache to disk.
Definition: fts0types.h:118
bool cache_full
flag, when true it indicates that we need to sync the cache to disk
Definition: fts0types.h:122
A token and its positions within a document.
Definition: fts0types.h:301
fts_string_t text
token text
Definition: fts0types.h:302
ib_vector_t * positions
an array of the positions the token is found in; each item is actually an ulint.
Definition: fts0types.h:304
A tokenizer word.
Definition: fts0types.h:238
ib_vector_t * nodes
Word node ilists, each element is of type fts_node_t.
Definition: fts0types.h:241
fts_string_t text
Token text.
Definition: fts0types.h:239
For supporting the tracking of updates on multiple FTS indexes we need to track which FTS indexes nee...
Definition: fts0types.h:96
ib_vector_t * fts_indexes
The FTS indexes that need to be updated.
Definition: fts0types.h:99
doc_id_t doc_id
The doc id affected.
Definition: fts0types.h:97
Word text plus its array of nodes as on disk in FTS index.
Definition: fts0types.h:246
ib_alloc_t * heap_alloc
For handling all allocations.
Definition: fts0types.h:250
fts_string_t text
Word value in UTF-8.
Definition: fts0types.h:247
ib_vector_t * nodes
Nodes read from disk.
Definition: fts0types.h:248
Definition: ut0vec.h:204
Red black tree instance.
Definition: ut0rbt.h:72
Definition: ut0vec.h:213
The info structure stored at the beginning of a heap block.
Definition: mem0mem.h:302
InnoDB condition variable.
Definition: os0event.cc:63
Definition: que0que.h:301
The structure used in the spin lock implementation of a read-write lock.
Definition: sync0rw.h:363
Definition: plugin_ftparser.h:216
Definition: trx0trx.h:675
Version control for database, common definitions, and include files.
unsigned long int ulint
Definition: univ.i:406
Utilities for byte operations.
Various utilities.