MySQL 8.0.40
Source Code Documentation
ddl0fts.h
Go to the documentation of this file.
1/*****************************************************************************
2
3Copyright (c) 2010, 2024, Oracle and/or its affiliates.
4
5This program is free software; you can redistribute it and/or modify it under
6the terms of the GNU General Public License, version 2.0, as published by the
7Free Software Foundation.
8
9This program is designed to work with certain software (including
10but not limited to OpenSSL) that is licensed under separate terms,
11as designated in a particular file or component or in included license
12documentation. The authors of MySQL hereby grant you an additional
13permission to link the program and your derivative works with the
14separately licensed software that they have either included with
15the program or referenced in the documentation.
16
17This program is distributed in the hope that it will be useful, but WITHOUT
18ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
19FOR A PARTICULAR PURPOSE. See the GNU General Public License, version 2.0,
20for more details.
21
22You should have received a copy of the GNU General Public License along with
23this program; if not, write to the Free Software Foundation, Inc.,
2451 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
25
26*****************************************************************************/
27
28/** @file include/ddl0fts.h
29 Create Full Text Index with (parallel) merge sort.
30 Created 10/13/2010 Jimmy Yang */
31
32#ifndef ddl0fts_h
33#define ddl0fts_h
34
35#include "btr0load.h"
36#include "data0data.h"
37#include "ddl0impl-buffer.h"
38#include "dict0types.h"
39#include "fts0priv.h"
40#include "fts0types.h"
41#include "ut0mpmcbq.h"
42
43/** The general architecture is that the work is done in two phases,
44roughly the read and write phase. The scanner pushes the document to
45a read handler queue for processing.
46
47Phase I:
48 Start several parsing/tokenization threads that read the document from
49 a queue, parse the document, tokenize the document, add them to a buffer,
50 sort the rows in the buffer and then write the buffer to a temporary file.
51 There is one file per auxiliary table per parser instance. So, if you
52 have 2 parse threads you will end up with:
53
54 2 x FTS_NUM_AUX_INDEX files.
55
56Phase II:
57 The temporary files generated during phase I are not closed but passed to
58 the second (write) phase so that these temporary files can be merged and
59 the rows inserted into the new FTS index. Using the example from above,
60 create FTS_NUM_AUX_INDEX threads and each thread will merge 2 files. */
61
62namespace ddl {
63// Forward declaration
64struct Builder;
65
66/** Full text search index builder. */
67struct FTS {
68 /** Information about temporary files used in merge sort. This structure
69 defines information the scan thread will fetch and put to the linked list
70 for parallel tokenization/sort threads to process */
71 struct Doc_item {
72 /** Field contains document string */
74
75 /** Document ID */
77 };
78
79 /** Constructor.
80 @param[in, out] ctx DDL context.
81 @param[in, out] index DDL index.
82 @param[in, out] table DDL table. */
83 FTS(Context &ctx, dict_index_t *index, dict_table_t *table) noexcept;
84
85 /** ~Destructor. */
86 ~FTS() noexcept;
87
88 /** Create the internal data structures.
89 @param[in] n_threads Number of parse threads to create.
90 @return DB_SUCCESS or error code. */
91 dberr_t init(size_t n_threads) noexcept;
92
93 /** @return the DDL index. */
94 dict_index_t *index() noexcept { return m_index; }
95
96 /** @return the temporary sort index. */
97 dict_index_t *sort_index() noexcept { return m_dup.m_index; }
98
99 /** Start the parsing, create the threads.
100 @return DB_SUCCESS or error code. */
101 dberr_t start_parse_threads(Builder *builder) noexcept;
102
103 /** For sending the documents to parse to the parsing threads.
104 @param[in,out] doc_item Document to parse, takes ownership.
105 @return DB_SUCCESS or error code, doc_item will be deleted either way. */
106 dberr_t enqueue(Doc_item *doc_item) noexcept;
107
108 /** Check for error status after the parsing has finished.
109 @return DB_SUCCESS or error code. */
110 dberr_t check_for_errors() noexcept;
111
112 /** Start the merging and insert threads.
113 @param[in,out] builder Builder instance to use.
114 @return DB_SUCCESS or error code. */
115 dberr_t insert(Builder *builder) noexcept;
116
117 /** Inform the parser threads that the scanning phase is complete so
118 that they can shutdown after emptying the doc item queue.
119 @param[in] err Error status of the scanning thread(s).
120 @return DB_SUCCESS or error code. */
122
123 private:
124 /** Create the data structures required to build the FTS index.
125 @param[in] n_threads Number of parser threads.
126 @return DB_SUCCESS or error code. */
127 dberr_t create(size_t n_threads) noexcept;
128
129 /** @return the number of parses. */
130 size_t get_n_parsers() const noexcept { return m_parsers.size(); }
131
132 /** Destroy the data structures and clean up. */
133 void destroy() noexcept;
134
135 /** Create a temporary "fts sort index" used to merge sort the
136 tokenized doc string. The index has three "fields":
137
138 1. Tokenized word,
139 2. Doc ID
140 3. Word's position in original 'doc'.
141
142 @param[in,out] index Index to sort.
143 @param[in,out] table Table that the FTS index is created on.
144 @param[out] doc_id_32_bit Whether to use 4 bytes instead of 7 bytes
145 integer to store the DOC ID during sort.
146 @return dict_index_t structure for the fts sort index */
147 [[nodiscard]] static dict_index_t *create_index(dict_index_t *index,
148 dict_table_t *table,
149 bool *doc_id_32_bit) noexcept;
150
151 /** Setup the insert phase inoput files generated by the parsers.
152 @return DB_SUCCESS or error code. */
153 dberr_t setup_insert_phase() noexcept;
154
155 private:
156 // Forward declaration
157 struct Parser;
158 struct Inserter;
159
160 using Threads = std::vector<std::thread>;
161 using Parsers = std::vector<Parser *, ut::allocator<Parser *>>;
162
163 /** For parsing the documents, there is one per thread. */
165
166 /** For inserting the rows parsed by the m_parsers. */
168
169 /** DDL context. */
171
172 /** Duplicate key reporting. */
174
175 /** true if document ID should be stored as a 32 bit instead of a 64 bit. */
177
178 /** DDL index instance. */
180
181 /** DDL table instance. */
183
184 /** Temporary index instance with relevant FTS columns. */
186
187 /** For tracking parser threads. */
189};
190
191} // namespace ddl
192#endif /* ddl0fts_h */
The B-tree bulk load.
SQL data field and tuple.
dberr_t
Definition: db0err.h:39
DDL buffer infrastructure.
Data dictionary global types.
uint64_t doc_id_t
Document id type.
Definition: fts0fts.h:77
Full text search internal header file.
Full text search types file.
The general architecture is that the work is done in two phases, roughly the read and write phase.
Definition: btr0load.cc:42
static Value err()
Create a Value object that represents an error condition.
Definition: json_binary.cc:910
Definition: gcs_xcom_synode.h:64
This file contains a set of libraries providing overloads for regular dynamic allocation routines whi...
Definition: aligned_alloc.h:48
std::vector< T, ut::allocator< T > > vector
Specialization of vector which uses allocator.
Definition: ut0new.h:2875
Definition: mysqltest.cc:385
For loading indexes.
Definition: ddl0impl-builder.h:48
DDL context/configuration.
Definition: ddl0ddl.h:321
Structure for reporting duplicate records.
Definition: ddl0ddl.h:132
dict_index_t * m_index
Index being sorted.
Definition: ddl0ddl.h:146
Information about temporary files used in merge sort.
Definition: ddl0fts.h:71
dfield_t * m_field
Field contains document string.
Definition: ddl0fts.h:73
doc_id_t m_doc_id
Document ID.
Definition: ddl0fts.h:76
Definition: ddl0fts.cc:258
For parsing and sorting the documents.
Definition: ddl0fts.cc:119
Full text search index builder.
Definition: ddl0fts.h:67
std::vector< Parser *, ut::allocator< Parser * > > Parsers
Definition: ddl0fts.h:161
Inserter * m_inserter
For inserting the rows parsed by the m_parsers.
Definition: ddl0fts.h:167
FTS(Context &ctx, dict_index_t *index, dict_table_t *table) noexcept
Constructor.
Definition: ddl0fts.cc:1467
dict_index_t * m_index
DDL index instance.
Definition: ddl0fts.h:179
dberr_t init(size_t n_threads) noexcept
Create the internal data structures.
Definition: ddl0fts.cc:1529
dict_index_t * sort_index() noexcept
Definition: ddl0fts.h:97
Parsers m_parsers
For parsing the documents, there is one per thread.
Definition: ddl0fts.h:164
dberr_t enqueue(Doc_item *doc_item) noexcept
For sending the documents to parse to the parsing threads.
Definition: ddl0fts.cc:1575
dict_table_t * m_table
DDL table instance.
Definition: ddl0fts.h:182
dberr_t setup_insert_phase() noexcept
Set up the insert phase input files generated by the parsers.
Definition: ddl0fts.cc:1661
~FTS() noexcept
Destructor.
Definition: ddl0fts.cc:1475
static dict_index_t * create_index(dict_index_t *index, dict_table_t *table, bool *doc_id_32_bit) noexcept
Create a temporary "fts sort index" used to merge sort the tokenized doc string.
Definition: ddl0fts.cc:443
dberr_t create(size_t n_threads) noexcept
Create the data structures required to build the FTS index.
Definition: ddl0fts.cc:1483
size_t get_n_parsers() const noexcept
Definition: ddl0fts.h:130
dict_index_t * index() noexcept
Definition: ddl0fts.h:94
dberr_t insert(Builder *builder) noexcept
Start the merging and insert threads.
Definition: ddl0fts.cc:1603
dberr_t scan_finished(dberr_t err) noexcept
Inform the parser threads that the scanning phase is complete so that they can shutdown after emptyin...
Definition: ddl0fts.cc:1682
std::vector< std::thread > Threads
Definition: ddl0fts.h:160
Context & m_ctx
DDL context.
Definition: ddl0fts.h:170
dberr_t check_for_errors() noexcept
Check for error status after the parsing has finished.
Definition: ddl0fts.cc:1581
dict_index_t * m_sort_index
Temporary index instance with relevant FTS columns.
Definition: ddl0fts.h:185
void destroy() noexcept
Destroy the data structures and clean up.
Definition: ddl0fts.cc:1516
Threads m_threads
For tracking parser threads.
Definition: ddl0fts.h:188
dberr_t start_parse_threads(Builder *builder) noexcept
Start the parsing, create the threads.
Definition: ddl0fts.cc:1539
Dup m_dup
Duplicate key reporting.
Definition: ddl0fts.h:173
bool m_doc_id_32_bit
true if document ID should be stored as a 32 bit instead of a 64 bit.
Definition: ddl0fts.h:176
Structure for an SQL data field.
Definition: data0data.h:617
Data structure for an index.
Definition: dict0mem.h:1046
Data structure for a database table.
Definition: dict0mem.h:1909