MySQL 8.3.0
Source Code Documentation
ddl0fts.h
Go to the documentation of this file.
1/*****************************************************************************
2
3Copyright (c) 2010, 2023, Oracle and/or its affiliates.
4
5This program is free software; you can redistribute it and/or modify it under
6the terms of the GNU General Public License, version 2.0, as published by the
7Free Software Foundation.
8
9This program is also distributed with certain software (including but not
10limited to OpenSSL) that is licensed under separate terms, as designated in a
11particular file or component or in included license documentation. The authors
12of MySQL hereby grant you an additional permission to link the program and
13your derivative works with the separately licensed software that they have
14included with MySQL.
15
16This program is distributed in the hope that it will be useful, but WITHOUT
17ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
18FOR A PARTICULAR PURPOSE. See the GNU General Public License, version 2.0,
19for more details.
20
21You should have received a copy of the GNU General Public License along with
22this program; if not, write to the Free Software Foundation, Inc.,
2351 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
24
25*****************************************************************************/
26
27/** @file include/ddl0fts.h
28 Create Full Text Index with (parallel) merge sort.
29 Created 10/13/2010 Jimmy Yang */
30
31#ifndef ddl0fts_h
32#define ddl0fts_h
33
34#include "btr0load.h"
35#include "data0data.h"
36#include "ddl0impl-buffer.h"
37#include "dict0types.h"
38#include "fts0priv.h"
39#include "fts0types.h"
40#include "ut0mpmcbq.h"
41
42/** The general architecture is that the work is done in two phases,
43roughly the read and write phase. The scanner pushes the document to
44a read handler queue for processing.
45
46Phase I:
47 Start several parsing/tokenization threads that read the document from
48 a queue, parse the document, tokenize the document, add them to a buffer,
49 sort the rows in the buffer and then write the buffer to a temporary file.
50 There is one file per auxiliary table per parser instance. So, if you
51 have 2 parse threads you will end up with:
52
53 2 x FTS_NUM_AUX_INDEX files.
54
55Phase II:
56 The temporary files generated during phase I are not closed but passed to
57 the second (write) phase so that these temporary files can be merged and
58 the rows inserted into the new FTS index. Using the example from above,
59 create FTS_NUM_AUX_INDEX threads and each thread will merge 2 files. */
60
61namespace ddl {
62// Forward declaration
63struct Builder;
64
65/** Full text search index builder. */
66struct FTS {
67 /** Describes a document that the scan thread fetches and queues for the
68 parallel tokenization/sort threads to process (see enqueue()). It carries
69 the field holding the document string and the document ID. */
70 struct Doc_item {
71 /** Field containing the document string. */
73
74 /** Document ID. */
76 };
77
78 /** Constructor.
79 @param[in,out] ctx DDL context.
80 @param[in,out] index DDL index.
81 @param[in,out] table DDL table. */
83
84 /** Destructor. */
85 ~FTS() noexcept;
86
87 /** Create the internal data structures.
88 @param[in] n_threads Number of parse threads to create.
89 @return DB_SUCCESS or error code. */
90 dberr_t init(size_t n_threads) noexcept;
91
92 /** @return the DDL index. */
93 dict_index_t *index() noexcept { return m_index; }
94
95 /** @return the temporary sort index. */
96 dict_index_t *sort_index() noexcept { return m_dup.m_index; }
97
98 /** Start the parsing, create the threads.
 @param[in,out] builder Builder instance to use.
99 @return DB_SUCCESS or error code. */
100 dberr_t start_parse_threads(Builder *builder) noexcept;
101
102 /** For sending the documents to parse to the parsing threads.
103 @param[in,out] doc_item Document to parse, takes ownership.
104 @return DB_SUCCESS or error code, doc_item will be deleted either way. */
105 dberr_t enqueue(Doc_item *doc_item) noexcept;
106
107 /** Check for error status after the parsing has finished.
108 @return DB_SUCCESS or error code. */
109 dberr_t check_for_errors() noexcept;
110
111 /** Start the merging and insert threads.
112 @param[in,out] builder Builder instance to use.
113 @return DB_SUCCESS or error code. */
114 dberr_t insert(Builder *builder) noexcept;
115
116 /** Inform the parser threads that the scanning phase is complete so
117 that they can shut down after emptying the doc item queue.
118 @param[in] err Error status of the scanning thread(s).
119 @return DB_SUCCESS or error code. */
121
122 private:
123 /** Create the data structures required to build the FTS index.
124 @param[in] n_threads Number of parser threads.
125 @return DB_SUCCESS or error code. */
126 dberr_t create(size_t n_threads) noexcept;
127
128 /** @return the number of parsers. */
129 size_t get_n_parsers() const noexcept { return m_parsers.size(); }
130
131 /** Destroy the data structures and clean up. */
132 void destroy() noexcept;
133
134 /** Create a temporary "fts sort index" used to merge sort the
135 tokenized doc string. The index has three "fields":
136
137 1. Tokenized word,
138 2. Doc ID
139 3. Word's position in original 'doc'.
140
141 @param[in,out] index Index to sort.
142 @param[in,out] table Table that the FTS index is created on.
143 @param[out] doc_id_32_bit Whether to use a 4 byte instead of an 8 byte
144 integer to store the DOC ID during sort.
145 @return dict_index_t structure for the fts sort index */
146 [[nodiscard]] static dict_index_t *create_index(dict_index_t *index,
148 bool *doc_id_32_bit) noexcept;
149
150 /** Set up the insert phase input files generated by the parsers.
151 @return DB_SUCCESS or error code. */
152 dberr_t setup_insert_phase() noexcept;
153
154 private:
155 // Forward declaration
156 struct Parser;
157 struct Inserter;
158
159 using Threads = std::vector<std::thread>;
160 using Parsers = std::vector<Parser *, ut::allocator<Parser *>>;
161
162 /** For parsing the documents, there is one per thread. */
164
165 /** For inserting the rows parsed by the m_parsers. */
167
168 /** DDL context. */
170
171 /** Duplicate key reporting. */
173
174 /** true if the document ID should be stored as a 32 bit integer instead of
 a 64 bit integer. */
176
177 /** DDL index instance. */
179
180 /** DDL table instance. */
182
183 /** Temporary index instance with relevant FTS columns. */
185
186 /** For tracking parser threads. */
188};
189
190} // namespace ddl
191#endif /* ddl0fts_h */
The B-tree bulk load.
SQL data field and tuple.
dberr_t
Definition: db0err.h:38
DDL buffer infrastructure.
Data dictionary global types.
uint64_t doc_id_t
Document id type.
Definition: fts0fts.h:78
Full text search internal header file.
Full text search types file.
static PFS_engine_table_share_proxy table
Definition: pfs.cc:60
The general architecture is that the work is done in two phases, roughly the read and write phase.
Definition: btr0load.cc:41
static Value err()
Create a Value object that represents an error condition.
Definition: json_binary.cc:926
Definition: varlen_sort.h:174
This file contains a set of libraries providing overloads for regular dynamic allocation routines whi...
Definition: aligned_alloc.h:47
std::vector< T, ut::allocator< T > > vector
Specialization of vector which uses allocator.
Definition: ut0new.h:2873
Definition: mysqltest.cc:391
For loading indexes.
Definition: ddl0impl-builder.h:47
DDL context/configuration.
Definition: ddl0ddl.h:320
Structure for reporting duplicate records.
Definition: ddl0ddl.h:131
dict_index_t * m_index
Index being sorted.
Definition: ddl0ddl.h:145
Information about temporary files used in merge sort.
Definition: ddl0fts.h:70
dfield_t * m_field
Field contains document string.
Definition: ddl0fts.h:72
doc_id_t m_doc_id
Document ID.
Definition: ddl0fts.h:75
Definition: ddl0fts.cc:257
For parsing and sorting the documents.
Definition: ddl0fts.cc:118
Full text search index builder.
Definition: ddl0fts.h:66
std::vector< Parser *, ut::allocator< Parser * > > Parsers
Definition: ddl0fts.h:160
Inserter * m_inserter
For inserting the rows parsed by the m_parsers.
Definition: ddl0fts.h:166
FTS(Context &ctx, dict_index_t *index, dict_table_t *table) noexcept
Constructor.
Definition: ddl0fts.cc:1464
dict_index_t * m_index
DDL index instance.
Definition: ddl0fts.h:178
dberr_t init(size_t n_threads) noexcept
Create the internal data structures.
Definition: ddl0fts.cc:1526
dict_index_t * sort_index() noexcept
Definition: ddl0fts.h:96
Parsers m_parsers
For parsing the documents, there is one per thread.
Definition: ddl0fts.h:163
dberr_t enqueue(Doc_item *doc_item) noexcept
For sending the documents to parse to the parsing threads.
Definition: ddl0fts.cc:1572
dict_table_t * m_table
DDL table instance.
Definition: ddl0fts.h:181
dberr_t setup_insert_phase() noexcept
Set up the insert phase input files generated by the parsers.
Definition: ddl0fts.cc:1658
~FTS() noexcept
~Destructor.
Definition: ddl0fts.cc:1472
static dict_index_t * create_index(dict_index_t *index, dict_table_t *table, bool *doc_id_32_bit) noexcept
Create a temporary "fts sort index" used to merge sort the tokenized doc string.
Definition: ddl0fts.cc:442
dberr_t create(size_t n_threads) noexcept
Create the data structures required to build the FTS index.
Definition: ddl0fts.cc:1480
size_t get_n_parsers() const noexcept
Definition: ddl0fts.h:129
dict_index_t * index() noexcept
Definition: ddl0fts.h:93
dberr_t insert(Builder *builder) noexcept
Start the merging and insert threads.
Definition: ddl0fts.cc:1600
dberr_t scan_finished(dberr_t err) noexcept
Inform the parser threads that the scanning phase is complete so that they can shut down after emptying the doc item queue.
Definition: ddl0fts.cc:1679
std::vector< std::thread > Threads
Definition: ddl0fts.h:159
Context & m_ctx
DDL context.
Definition: ddl0fts.h:169
dberr_t check_for_errors() noexcept
Check for error status after the parsing has finished.
Definition: ddl0fts.cc:1578
dict_index_t * m_sort_index
Temporary index instance with relevant FTS columns.
Definition: ddl0fts.h:184
void destroy() noexcept
Destroy the data structures and clean up.
Definition: ddl0fts.cc:1513
Threads m_threads
For tracking parser threads.
Definition: ddl0fts.h:187
dberr_t start_parse_threads(Builder *builder) noexcept
Start the parsing, create the threads.
Definition: ddl0fts.cc:1536
Dup m_dup
Duplicate key reporting.
Definition: ddl0fts.h:172
bool m_doc_id_32_bit
true if document ID should be stored as a 32 bit instead of a 64 bit.
Definition: ddl0fts.h:175
Structure for an SQL data field.
Definition: data0data.h:604
Data structure for an index.
Definition: dict0mem.h:1045
Data structure for a database table.
Definition: dict0mem.h:1908