MySQL 9.0.0
Source Code Documentation
fts0types.h
Go to the documentation of this file.
1/*****************************************************************************
2
3Copyright (c) 2007, 2024, Oracle and/or its affiliates.
4
5This program is free software; you can redistribute it and/or modify it under
6the terms of the GNU General Public License, version 2.0, as published by the
7Free Software Foundation.
8
9This program is designed to work with certain software (including
10but not limited to OpenSSL) that is licensed under separate terms,
11as designated in a particular file or component or in included license
12documentation. The authors of MySQL hereby grant you an additional
13permission to link the program and your derivative works with the
14separately licensed software that they have either included with
15the program or referenced in the documentation.
16
17This program is distributed in the hope that it will be useful, but WITHOUT
18ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
19FOR A PARTICULAR PURPOSE. See the GNU General Public License, version 2.0,
20for more details.
21
22You should have received a copy of the GNU General Public License along with
23this program; if not, write to the Free Software Foundation, Inc.,
2451 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
25
26*****************************************************************************/
27
28/** @file include/fts0types.h
29 Full text search types file
30
31 Created 2007-03-27 Sunny Bains
32 *******************************************************/
33
34#ifndef INNOBASE_FTS0TYPES_H
35#define INNOBASE_FTS0TYPES_H
36
37#include "fts0fts.h"
38#include "fut0fut.h"
39#include "pars0pars.h"
40#include "que0types.h"
41#include "univ.i"
42#include "ut0byte.h"
43#include "ut0rbt.h"
44
45struct CHARSET_INFO;
46
47/** Types used within FTS. */
48struct fts_que_t;
49struct fts_node_t;
50
51/** Callbacks used within FTS. */
53typedef void (*fts_filter)(void *, fts_node_t *, void *, ulint len);
54
55/** Statistics relevant to a particular document, used during retrieval. */
57 doc_id_t doc_id; /*!< Document id */
58 ulint word_count; /*!< Total words in the document */
59};
60
61/** It's main purpose is to store the SQL prepared statements that
62are required to retrieve a document from the database. */
64 fts_index_cache_t *index_cache; /*!< The index cache instance */
65
66 /*!< Parsed sql statement */
68 fts_cache_t *cache; /*!< The parent cache */
69};
70
71/** Since we can have multiple FTS indexes on a table, we keep a
72per index cache of words etc. */
74 dict_index_t *index; /*!< The FTS index instance */
75
76 ib_rbt_t *words; /*!< Nodes; indexed by fts_string_t*,
77 cells are fts_tokenizer_word_t*.*/
78
79 ib_vector_t *doc_stats; /*!< Array of the fts_doc_stats_t
80 contained in the memory buffer.
81 Must be in sorted order (ascending).
82 The ideal choice is an rb tree but
83 the rb tree imposes a space overhead
84 that we can do without */
85
86 que_t **ins_graph; /*!< Insert query graphs */
87
88 que_t **sel_graph; /*!< Select query graphs */
89 CHARSET_INFO *charset; /*!< charset */
90};
91
92/** For supporting the tracking of updates on multiple FTS indexes we need
93to track which FTS indexes need to be updated. For INSERT and DELETE we
94update all fts indexes. */
96 doc_id_t doc_id; /*!< The doc id affected */
97
98 ib_vector_t *fts_indexes; /*!< The FTS indexes that need to be
99 updated. A NULL value means all
100 indexes need to be updated. This
101 vector is not allocated on the heap
102 and so must be freed explicitly,
103 when we are done with it */
104};
105
106/** Stop word control infotmation. */
108 ulint status; /*!< Status of the stopword tree */
109 ib_alloc_t *heap; /*!< The memory allocator to use */
110 ib_rbt_t *cached_stopword; /*!< This stores all active stopwords */
111 CHARSET_INFO *charset; /*!< charset for stopword */
112};
113
114/** The SYNC state of the cache. There is one instance of this struct
115associated with each ADD thread. */
117 trx_t *trx; /*!< The transaction used for SYNCing
118 the cache to disk */
119 dict_table_t *table; /*!< Table with FTS index(es) */
120 ulint max_cache_size; /*!< Max size in bytes of the cache */
121 bool cache_full; /*!< flag, when true it indicates that
122 we need to sync the cache to disk */
123 ulint lower_index; /*!< the start index of the doc id
124 vector from where to start adding
125 documents to the FTS cache */
126 ulint upper_index; /*!< max index of the doc id vector to
127 add to the FTS cache */
128 bool interrupted; /*!< true if SYNC was interrupted */
129 doc_id_t min_doc_id; /*!< The smallest doc id added to the
130 cache. It should equal to
131 doc_ids[lower_index] */
132 doc_id_t max_doc_id; /*!< The doc id at which the cache was
133 noted as being full, we use this to
134 set the upper_limit field */
135 std::chrono::steady_clock::time_point start_time;
136 /*!< SYNC start time */
137 bool in_progress; /*!< flag whether sync is in progress.*/
138 bool unlock_cache; /*!< flag whether unlock cache when
139 write fts node */
140 os_event_t event; /*!< sync finish event */
141};
142
143/** The cache for the FTS system. It is a memory-based inverted index
144that new entries are added to, until it grows over the configured maximum
145size, at which time its contents are written to the INDEX table. */
147#ifndef UNIV_HOTBACKUP
148 rw_lock_t lock; /*!< lock protecting all access to the
149 memory buffer. FIXME: this needs to
150 be our new upgrade-capable rw-lock */
151
152 rw_lock_t init_lock; /*!< lock used for the cache
153 initialization, it has different
154 SYNC level as above cache lock */
155#endif /* !UNIV_HOTBACKUP */
156
157 ib_mutex_t optimize_lock; /*!< Lock for OPTIMIZE */
158
159 ib_mutex_t deleted_lock; /*!< Lock covering deleted_doc_ids */
160
161 ib_mutex_t doc_id_lock; /*!< Lock covering Doc ID */
162
163 ib_vector_t *deleted_doc_ids; /*!< Array of deleted doc ids, each
164 element is of type fts_update_t */
165
166 ib_vector_t *indexes; /*!< We store the stats and inverted
167 index for the individual FTS indexes
168 in this vector. Each element is
169 an instance of fts_index_cache_t */
170
171 ib_vector_t *get_docs; /*!< information required to read
172 the document from the table. Each
173 element is of type fts_doc_t */
174
175 ulint total_size; /*!< total size consumed by the ilist
176 field of all nodes. SYNC is run
177 whenever this gets too big */
178 uint64_t total_size_before_sync; /*!< total size of fts cache,
179 when last SYNC request was sent */
180 fts_sync_t *sync; /*!< sync structure to sync data to
181 disk */
182 ib_alloc_t *sync_heap; /*!< The heap allocator, for indexes
183 and deleted_doc_ids, ie. transient
184 objects, they are recreated after
185 a SYNC is completed */
186
187 ib_alloc_t *self_heap; /*!< This heap is the heap out of
188 which an instance of the cache itself
189 was created. Objects created using
190 this heap will last for the lifetime
191 of the cache */
192
193 doc_id_t next_doc_id; /*!< Next doc id */
194
195 doc_id_t synced_doc_id; /*!< Doc ID sync-ed to CONFIG table */
196
197 doc_id_t first_doc_id; /*!< first doc id since this table
198 was opened */
199
200 ulint deleted; /*!< Number of doc ids deleted since
201 last optimized. This variable is
202 covered by deleted_lock */
203
204 ulint added; /*!< Number of doc ids added since last
205 optimized. This variable is covered by
206 the deleted lock */
207
208 fts_stopword_t stopword_info; /*!< Cached stopwords for the FTS */
209 mem_heap_t *cache_heap; /*!< Cache Heap */
210};
211
212/** Columns of the FTS auxiliary INDEX table */
214 doc_id_t first_doc_id; /*!< First document id in ilist. */
215
216 doc_id_t last_doc_id; /*!< Last document id in ilist. */
217
218 byte *ilist; /*!< Binary list of documents & word
219 positions the token appears in.
220 TODO: For now, these are simply
221 ut_malloc'd, but if testing shows
222 that they waste memory unacceptably, a
223 special memory allocator will have
224 to be written */
225
226 ulint doc_count; /*!< Number of doc ids in ilist */
227
228 ulint ilist_size; /*!< Used size of ilist in bytes. */
229
231 /*!< Allocated size of ilist in
232 bytes */
233 bool synced; /*!< flag whether the node is synced */
234};
235
236/** A tokenizer word. Contains information about one word. */
238 fts_string_t text; /*!< Token text. */
239
240 ib_vector_t *nodes; /*!< Word node ilists, each element is
241 of type fts_node_t */
242};
243
244/** Word text plus it's array of nodes as on disk in FTS index */
246 fts_string_t text; /*!< Word value in UTF-8 */
247 ib_vector_t *nodes; /*!< Nodes read from disk */
248
249 ib_alloc_t *heap_alloc; /*!< For handling all allocations */
250};
251
252/** Callback for reading and filtering nodes that are read from FTS index */
254 void *read_arg; /*!< Arg for the sql_callback */
255
256 fts_sql_callback read_record; /*!< Callback for reading index
257 record */
258 ulint total_memory; /*!< Total memory used */
259};
260
261/** For horizontally splitting an FTS auxiliary index */
263 ulint value; /*!< Character value at which
264 to split */
265
266 const char *suffix; /*!< FTS aux index suffix */
267};
268
269/** This type represents a single document field.
270 When fulltext index spans multiple columns, the
271 entire document (all indexed text in a row)
272 is comprised of multiple fields, one per indexed
273 column. */
274struct fts_doc_t {
275 fts_string_t text; /*!< document text */
276
277 bool found; /*!< true if the document was found
278 successfully in the database */
279
280 ib_rbt_t *tokens; /*!< This is filled when the document
281 is tokenized. Tokens; indexed by
282 fts_string_t*, cells are of type
283 fts_token_t* */
284
285 ib_alloc_t *self_heap; /*!< An instance of this type is
286 allocated from this heap along
287 with any objects that have the
288 same lifespan, most notably
289 the vector of token positions */
290 CHARSET_INFO *charset; /*!< Document's charset info */
291
292 st_mysql_ftparser *parser; /*!< fts plugin parser */
293
294 bool is_ngram; /*!< Whether it is a ngram parser */
295
296 ib_rbt_t *stopwords; /*!< Stopwords */
297};
298
299/** A token and its positions within a document. */
301 fts_string_t text; /*!< token text */
302
303 ib_vector_t *positions; /*!< an array of the positions the
304 token is found in; each item is
305 actually an ulint. */
306};
307
308/** It's defined in fts/fts0fts.c */
310
311/** It's defined in fts/fts0fts.c */
313
314/** Compare two fts_trx_row_t instances doc_ids.
315@param[in] p1 id1
316@param[in] p2 id2
317@return < 0 if n1 < n2, < 0 if n1 < n2, > 0 if n1 > n2 */
318static inline int fts_trx_row_doc_id_cmp(const void *p1, const void *p2);
319
320/** Compare two fts_ranking_t instances doc_ids.
321@param[in] p1 id1
322@param[in] p2 id2
323@return < 0 if n1 < n2, < 0 if n1 < n2, > 0 if n1 > n2 */
324static inline int fts_ranking_doc_id_cmp(const void *p1, const void *p2);
325
326/** Compare two fts_update_t instances doc_ids.
327@param[in] p1 id1
328@param[in] p2 id2
329@return < 0 if n1 < n2, < 0 if n1 < n2, > 0 if n1 > n2 */
330static inline int fts_update_doc_id_cmp(const void *p1, const void *p2);
331
332/** Decode and return the integer that was encoded using our VLC scheme.*/
333static inline ulint fts_decode_vlc(
334 /*!< out: value decoded */
335 byte **ptr); /*!< in: ptr to decode from, this ptr is
336 incremented by the number of bytes decoded */
337
338/** Duplicate a string.
339@param[in] dst dup to here
340@param[in] src src string
341@param[in] heap heap to use
342*/
343static inline void fts_string_dup(fts_string_t *dst, const fts_string_t *src,
344 mem_heap_t *heap);
345
346/** Return length of val if it were encoded using our VLC scheme. */
348 /*!< out: length of value
349 encoded, in bytes */
350 ulint val); /*!< in: value to encode */
351
352/** Encode an integer using our VLC scheme and return the length in bytes.
353@param[in] val value to encode
354@param[in] buf buffer, must have enough space
355@return length of value encoded, in bytes */
356static inline ulint fts_encode_int(ulint val, byte *buf);
357
358/** Get the selected FTS aux INDEX suffix. */
359static inline const char *fts_get_suffix(
360 ulint selected); /*!< in: selected index */
361
362/** Return the selected FTS aux index suffix in 5.7 compatible format
363@param[in] selected selected index
364@return the suffix name */
365static inline const char *fts_get_suffix_5_7(ulint selected);
366
367/** Select the FTS auxiliary index for the given character.
368@param[in] cs charset
369@param[in] str string
370@param[in] len string length in bytes
371@return the index to use for the string */
372static inline ulint fts_select_index(const CHARSET_INFO *cs, const byte *str,
373 ulint len);
374
375#include "fts0types.ic"
376#include "fts0vlc.ic"
377
378#endif /* INNOBASE_FTS0TYPES_H */
Full text search header file.
uint64_t doc_id_t
Document id type.
Definition: fts0fts.h:79
const fts_index_selector_t fts_index_selector_5_7[]
It's defined in fts/fts0fts.c.
Definition: fts0fts.cc:158
static ulint fts_decode_vlc(byte **ptr)
Decode and return the integer that was encoded using our VLC scheme.
static ulint fts_encode_int(ulint val, byte *buf)
Encode an integer using our VLC scheme and return the length in bytes.
static void fts_string_dup(fts_string_t *dst, const fts_string_t *src, mem_heap_t *heap)
Duplicate a string.
static ulint fts_get_encoded_len(ulint val)
Return length of val if it were encoded using our VLC scheme.
static int fts_trx_row_doc_id_cmp(const void *p1, const void *p2)
Compare two fts_trx_row_t instances doc_ids.
const fts_index_selector_t fts_index_selector[]
It's defined in fts/fts0fts.c.
Definition: fts0fts.cc:153
void(* fts_filter)(void *, fts_node_t *, void *, ulint len)
Definition: fts0types.h:53
static const char * fts_get_suffix(ulint selected)
Get the selected FTS aux INDEX suffix.
static int fts_ranking_doc_id_cmp(const void *p1, const void *p2)
Compare two fts_ranking_t instances doc_ids.
static const char * fts_get_suffix_5_7(ulint selected)
Return the selected FTS aux index suffix in 5.7 compatible format.
static ulint fts_select_index(const CHARSET_INFO *cs, const byte *str, ulint len)
Select the FTS auxiliary index for the given character.
pars_user_func_cb_t fts_sql_callback
Callbacks used within FTS.
Definition: fts0types.h:49
static int fts_update_doc_id_cmp(const void *p1, const void *p2)
Compare two fts_update_t instances doc_ids.
Full text search types.
Full text variable length integer encoding/decoding.
File-based utilities.
std::string str(const mysqlrouter::ConfigGenerator::Options::Endpoint &ep)
Definition: config_generator.cc:1081
Definition: buf0block_hint.cc:30
Definition: commit_order_queue.h:34
SQL parser.
bool(* pars_user_func_cb_t)(void *arg, void *user_arg)
Type of the user functions.
Definition: pars0pars.h:50
Query graph global types.
Definition: m_ctype.h:421
Data structure for an index.
Definition: dict0mem.h:1046
Data structure for a database table.
Definition: dict0mem.h:1909
The cache for the FTS system.
Definition: fts0types.h:146
rw_lock_t lock
lock protecting all access to the memory buffer.
Definition: fts0types.h:148
fts_sync_t * sync
sync structure to sync data to disk
Definition: fts0types.h:180
ib_vector_t * get_docs
information required to read the document from the table.
Definition: fts0types.h:171
ib_alloc_t * sync_heap
The heap allocator, for indexes and deleted_doc_ids, ie.
Definition: fts0types.h:182
ulint deleted
Number of doc ids deleted since last optimized.
Definition: fts0types.h:200
uint64_t total_size_before_sync
total size of fts cache, when last SYNC request was sent
Definition: fts0types.h:178
rw_lock_t init_lock
lock used for the cache initialization, it has different SYNC level as above cache lock
Definition: fts0types.h:152
mem_heap_t * cache_heap
Cache Heap.
Definition: fts0types.h:209
ulint total_size
total size consumed by the ilist field of all nodes.
Definition: fts0types.h:175
doc_id_t first_doc_id
first doc id since this table was opened
Definition: fts0types.h:197
doc_id_t synced_doc_id
Doc ID sync-ed to CONFIG table.
Definition: fts0types.h:195
ulint added
Number of doc ids added since last optimized.
Definition: fts0types.h:204
ib_alloc_t * self_heap
This heap is the heap out of which an instance of the cache itself was created.
Definition: fts0types.h:187
ib_mutex_t deleted_lock
Lock covering deleted_doc_ids.
Definition: fts0types.h:159
ib_vector_t * deleted_doc_ids
Array of deleted doc ids, each element is of type fts_update_t.
Definition: fts0types.h:163
fts_stopword_t stopword_info
Cached stopwords for the FTS.
Definition: fts0types.h:208
ib_vector_t * indexes
We store the stats and inverted index for the individual FTS indexes in this vector.
Definition: fts0types.h:166
ib_mutex_t optimize_lock
Lock for OPTIMIZE.
Definition: fts0types.h:157
doc_id_t next_doc_id
Next doc id.
Definition: fts0types.h:193
ib_mutex_t doc_id_lock
Lock covering Doc ID.
Definition: fts0types.h:161
Statistics relevant to a particular document, used during retrieval.
Definition: fts0types.h:56
doc_id_t doc_id
Document id.
Definition: fts0types.h:57
ulint word_count
Total words in the document.
Definition: fts0types.h:58
This type represents a single document field.
Definition: fts0types.h:274
fts_string_t text
document text
Definition: fts0types.h:275
ib_alloc_t * self_heap
An instance of this type is allocated from this heap along with any objects that have the same lifesp...
Definition: fts0types.h:285
ib_rbt_t * tokens
This is filled when the document is tokenized.
Definition: fts0types.h:280
bool found
true if the document was found successfully in the database
Definition: fts0types.h:277
bool is_ngram
Whether it is a ngram parser.
Definition: fts0types.h:294
CHARSET_INFO * charset
Document's charset info.
Definition: fts0types.h:290
ib_rbt_t * stopwords
Stopwords.
Definition: fts0types.h:296
st_mysql_ftparser * parser
fts plugin parser
Definition: fts0types.h:292
Callback for reading and filtering nodes that are read from FTS index.
Definition: fts0types.h:253
void * read_arg
Arg for the sql_callback.
Definition: fts0types.h:254
fts_sql_callback read_record
Callback for reading index record.
Definition: fts0types.h:256
ulint total_memory
Total memory used.
Definition: fts0types.h:258
It's main purpose is to store the SQL prepared statements that are required to retrieve a document fr...
Definition: fts0types.h:63
fts_cache_t * cache
The parent cache.
Definition: fts0types.h:68
que_t * get_document_graph
Definition: fts0types.h:67
fts_index_cache_t * index_cache
The index cache instance.
Definition: fts0types.h:64
Since we can have multiple FTS indexes on a table, we keep a per index cache of words etc.
Definition: fts0types.h:73
que_t ** sel_graph
Select query graphs.
Definition: fts0types.h:88
que_t ** ins_graph
Insert query graphs.
Definition: fts0types.h:86
dict_index_t * index
The FTS index instance.
Definition: fts0types.h:74
ib_vector_t * doc_stats
Array of the fts_doc_stats_t contained in the memory buffer.
Definition: fts0types.h:79
ib_rbt_t * words
Nodes; indexed by fts_string_t*, cells are fts_tokenizer_word_t*.
Definition: fts0types.h:76
CHARSET_INFO * charset
charset
Definition: fts0types.h:89
For horizontally splitting an FTS auxiliary index.
Definition: fts0types.h:262
ulint value
Character value at which to split.
Definition: fts0types.h:263
const char * suffix
FTS aux index suffix.
Definition: fts0types.h:266
Columns of the FTS auxiliary INDEX table.
Definition: fts0types.h:213
bool synced
flag whether the node is synced
Definition: fts0types.h:233
ulint ilist_size_alloc
Allocated size of ilist in bytes.
Definition: fts0types.h:230
byte * ilist
Binary list of documents & word positions the token appears in.
Definition: fts0types.h:218
ulint doc_count
Number of doc ids in ilist.
Definition: fts0types.h:226
doc_id_t last_doc_id
Last document id in ilist.
Definition: fts0types.h:216
ulint ilist_size
Used size of ilist in bytes.
Definition: fts0types.h:228
doc_id_t first_doc_id
First document id in ilist.
Definition: fts0types.h:214
Stop word control infotmation.
Definition: fts0types.h:107
ulint status
Status of the stopword tree.
Definition: fts0types.h:108
CHARSET_INFO * charset
charset for stopword
Definition: fts0types.h:111
ib_rbt_t * cached_stopword
This stores all active stopwords.
Definition: fts0types.h:110
ib_alloc_t * heap
The memory allocator to use.
Definition: fts0types.h:109
An UTF-16 ro UTF-8 string.
Definition: fts0fts.h:294
The SYNC state of the cache.
Definition: fts0types.h:116
bool unlock_cache
flag whether unlock cache when write fts node
Definition: fts0types.h:138
ulint upper_index
max index of the doc id vector to add to the FTS cache
Definition: fts0types.h:126
bool interrupted
true if SYNC was interrupted
Definition: fts0types.h:128
ulint max_cache_size
Max size in bytes of the cache.
Definition: fts0types.h:120
std::chrono::steady_clock::time_point start_time
SYNC start time.
Definition: fts0types.h:135
doc_id_t min_doc_id
The smallest doc id added to the cache.
Definition: fts0types.h:129
dict_table_t * table
Table with FTS index(es)
Definition: fts0types.h:119
doc_id_t max_doc_id
The doc id at which the cache was noted as being full, we use this to set the upper_limit field.
Definition: fts0types.h:132
os_event_t event
sync finish event
Definition: fts0types.h:140
ulint lower_index
the start index of the doc id vector from where to start adding documents to the FTS cache
Definition: fts0types.h:123
bool in_progress
flag whether sync is in progress.
Definition: fts0types.h:137
trx_t * trx
The transaction used for SYNCing the cache to disk.
Definition: fts0types.h:117
bool cache_full
flag, when true it indicates that we need to sync the cache to disk
Definition: fts0types.h:121
A token and its positions within a document.
Definition: fts0types.h:300
fts_string_t text
token text
Definition: fts0types.h:301
ib_vector_t * positions
an array of the positions the token is found in; each item is actually an ulint.
Definition: fts0types.h:303
A tokenizer word.
Definition: fts0types.h:237
ib_vector_t * nodes
Word node ilists, each element is of type fts_node_t.
Definition: fts0types.h:240
fts_string_t text
Token text.
Definition: fts0types.h:238
For supporting the tracking of updates on multiple FTS indexes we need to track which FTS indexes nee...
Definition: fts0types.h:95
ib_vector_t * fts_indexes
The FTS indexes that need to be updated.
Definition: fts0types.h:98
doc_id_t doc_id
The doc id affected.
Definition: fts0types.h:96
Word text plus it's array of nodes as on disk in FTS index.
Definition: fts0types.h:245
ib_alloc_t * heap_alloc
For handling all allocations.
Definition: fts0types.h:249
fts_string_t text
Word value in UTF-8.
Definition: fts0types.h:246
ib_vector_t * nodes
Nodes read from disk.
Definition: fts0types.h:247
Definition: ut0vec.h:204
Red black tree instance.
Definition: ut0rbt.h:72
Definition: ut0vec.h:213
The info structure stored at the beginning of a heap block.
Definition: mem0mem.h:302
InnoDB condition variable.
Definition: os0event.cc:63
Definition: que0que.h:301
The structure used in the spin lock implementation of a read-write lock.
Definition: sync0rw.h:363
Definition: plugin_ftparser.h:216
Definition: trx0trx.h:684
Version control for database, common definitions, and include files.
unsigned long int ulint
Definition: univ.i:406
Utilities for byte operations.
Various utilities.