MySQL 8.0.40
Source Code Documentation
regexp_engine.h
Go to the documentation of this file.
1#ifndef SQL_REGEXP_REGEXP_ENGINE_H_
2#define SQL_REGEXP_REGEXP_ENGINE_H_
3
4/* Copyright (c) 2017, 2024, Oracle and/or its affiliates.
5
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License, version 2.0,
8 as published by the Free Software Foundation.
9
10 This program is designed to work with certain software (including
11 but not limited to OpenSSL) that is licensed under separate terms,
12 as designated in a particular file or component or in included license
13 documentation. The authors of MySQL hereby grant you an additional
14 permission to link the program and your derivative works with the
15 separately licensed software that they have either included with
16 the program or referenced in the documentation.
17
18 This program is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License, version 2.0, for more details.
22
23 You should have received a copy of the GNU General Public License
24 along with this program; if not, write to the Free Software
25 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
26
27#include <unicode/uregex.h>
28
29#include <stddef.h>
30#include <stdint.h>
31#include <string>
32#include <utility>
33
34#include "m_ctype.h" // CHARSET_INFO.
35#include "my_config.h" // WORDS_BIGENDIAN
36#include "sql/current_thd.h"
37#include "sql/regexp/errors.h"
38#include "sql/sql_class.h" // THD
39#include "template_utils.h"
40
43
45class Mock_regexp_engine;
46}
47
48namespace regexp {
49
51#ifdef WORDS_BIGENDIAN
53#else
55#endif
56
57const char *icu_version_string();
58
59/**
60 Implements a match callback function for icu that aborts execution if the
61 query was killed.
62
63 @param context The session to check for killed query.
64 @param steps Not used.
65
66 @retval false Query was killed in the session and the match should abort.
67 @retval true Query was not killed, matching should continue.
68*/
69UBool QueryNotKilled(const void *context, int32_t steps);
70
71/**
72 This class exposes high-level regular expression operations to the
73 facade. It implements the algorithm for search-and-replace and the various
74 matching options.
75
76 A buffer is used for search-and-replace, whose initial size is that of the
77 subject string. The buffer uses ICU preflight features to probe the required
78 buffer size within each append operation, and the buffer can grow up until
79 max_allowed_packet, at which point an error will be thrown.
80*/
82 public:
83 /**
84 Compiles the URegularExpression object. If compilation fails, my_error()
85 is called and IsError() returns true. In this case, all subsequent
86 operations will be no-ops, reporting failure. This follows ICU's chaining
87 conventions, see http://icu-project.org/apiref/icu4c/utypes_8h.html.
88
89 @param pattern The pattern string in ICU's character set.
90
91 @param flags ICU flags.
92
93 @param stack_limit Sets the amount of heap storage, in bytes, that the
94 match backtracking stack is allowed to allocate.
95
96 @param time_limit Gets set on the URegularExpression. Please refer to the
97 ICU API docs for the definition of time limit.
98 */
99 Regexp_engine(const std::u16string &pattern, uint flags, int stack_limit,
100 int time_limit) {
101 UParseError error;
102 auto upattern = pattern.data();
103 int length = pattern.size();
104 m_re = uregex_open(pointer_cast<const UChar *>(upattern), length, flags,
106 uregex_setStackLimit(m_re, stack_limit, &m_error_code);
107 uregex_setTimeLimit(m_re, time_limit, &m_error_code);
108 uregex_setMatchCallback(m_re, QueryNotKilled, current_thd, &m_error_code);
110 }
111
113 uint flags = uregex_flags(m_re, &m_error_code);
115 return flags;
116 }
117
118 /**
119 Resets the engine with a new subject string. This also clears the
120 replacement buffer, see Replace().
121
122 @param subject The new string to match the regular
123 expression against.
124 */
125 void Reset(const std::u16string &subject);
126
127 /**
128 Tries to find match number `occurrence` in the string, starting on
129 `start`.
130
131 @param start Start position, 0-based.
132 @param occurrence Which occurrence to replace. If zero, replace all
133 occurrences.
134 */
135 bool Matches(int start, int occurrence);
136
137 /**
138 Returns the start position in the input string of the string where
139 Matches() found a match.
140 */
142 /*
143 The 0 is for capture group number, but we don't deal with those
144 here. Zero means the start of the whole match, which is what's needed.
145 */
146 return uregex_start(m_re, 0, &m_error_code);
147 }
148
149 /**
150 Returns the position in the input string right after the end of the text
151 where Matches() found a match.
152 */
154 // The 0 is the capture group number; group 0 is the whole match.
155 return uregex_end(m_re, 0, &m_error_code);
156 }
157
158 /**
159 Iterates over the subject string, replacing matches.
160
161 @param replacement The string to replace matches with.
162 @param start Start position, 0-based.
163 @param occurrence Which occurrence to replace. If zero, replace all
164 occurrences.
165
166 @return Reference to the result of the operation. It is guaranteed to
167 stay intact until a call is made to Reset().
168 */
169 const std::u16string &Replace(const std::u16string &replacement, int start,
170 int occurrence);
171
172 /**
173 The start of the match and its length.
174
175 @return The index of the first code point of the match, and the length of
176 the same.
177 */
178 std::pair<int, int> MatchedSubstring();
179
180 bool HasWarning() const {
181 return U_SUCCESS(m_error_code) && m_error_code != U_ZERO_ERROR;
182 }
183 bool IsError() const { return U_FAILURE(m_error_code); }
184 bool CheckError() const { return check_icu_status(m_error_code); }
185
186 virtual ~Regexp_engine() { uregex_close(m_re); }
187
188 /**
189 The hard limit for growing the replace buffer. The buffer cannot grow
190 beyond this size, and an error will be thrown if the limit is reached.
191 */
192 size_t HardLimit() {
193 return current_thd->variables.max_allowed_packet / sizeof(UChar);
194 }
195
196 /**
197 Fills in the prefix in case we are doing a replace operation starting on a
198 non-first occurrence of the pattern, or a non-first start
199 position. AppendReplacement() will fill in the section starting after the
200 previous match or start position, so a prefix must be appended first.
201
202 The part we have to worry about here, the part that ICU doesn't add for
203 us, is when the search didn't start on the first character or first
204 match for the regular expression. It's the longest such prefix that we
205 have to copy ourselves.
206 */
207 void AppendHead(size_t size);
208
209 /**
210 Tries to write the replacement, growing the buffer if needed.
211
212 @param replacement The replacement string.
213 */
214 void AppendReplacement(const std::u16string &replacement);
215
216 /// Appends the trailing segment after the last match to the subject string.
217 void AppendTail();
218
219 /**
220 The spare capacity in the replacement buffer, given in UTF-16 code units.
221
222 ICU communicates via a `capacity` variable, but we like to use an absolute
223 position instead, and we want to keep a single source of truth, so we
224 calculate it when needed and assert that the number is correct.
225 */
226 int SpareCapacity() const {
227 return m_replace_buffer.capacity() - m_replace_buffer.size();
228 }
229
231
232 private:
233 /**
234 Preflight function: If the buffer capacity is adequate, the replacement is
235 appended to the buffer, otherwise nothing is written. Either way, the
236 replacement's full size is returned.
237 */
238 int TryToAppendReplacement(const std::u16string &replacement);
239
240 /**
241 Tries to append the part of the subject string after the last match to the
242 buffer. This is a preflight function: If the buffer capacity is adequate,
243 the tail is appended to the buffer, otherwise nothing is written. Either
244 way, the tail's full size is returned.
245 */
246 int TryToAppendTail();
247
248 /**
249 Our handle to ICU's compiled regular expression, owned by instances of
250 this class. URegularExpression is a C struct, but this class follows RAII
251 and initializes this pointer in the constructor and cleans it up in the
252 destructor.
253 */
254 URegularExpression *m_re;
255 UErrorCode m_error_code = U_ZERO_ERROR;
256 std::u16string m_current_subject;
257 std::u16string m_replace_buffer;
258 /**
259 This is always the next index in m_replace_buffer where ICU can write
260 data.
261 */
263};
264
265} // namespace regexp
266
267#endif // SQL_REGEXP_REGEXP_ENGINE_H_
System_variables variables
Definition: sql_lexer_thd.h:62
This class exposes high-level regular expression operations to the facade.
Definition: regexp_engine.h:81
UErrorCode m_error_code
Definition: regexp_engine.h:255
std::pair< int, int > MatchedSubstring()
The start of the match and its length.
Definition: regexp_engine.cc:107
void AppendTail()
Appends the trailing segment after the last match to the subject string.
Definition: regexp_engine.cc:181
int StartOfMatch()
Returns the start position in the input string of the string where Matches() found a match.
Definition: regexp_engine.h:141
const std::u16string & Replace(const std::u16string &replacement, int start, int occurrence)
Iterates over the subject string, replacing matches.
Definition: regexp_engine.cc:65
int m_replace_buffer_pos
This is always the next index in m_replace_buffer where ICU can write data.
Definition: regexp_engine.h:262
bool IsError() const
Definition: regexp_engine.h:183
virtual ~Regexp_engine()
Definition: regexp_engine.h:186
void Reset(const std::u16string &subject)
Resets the engine with a new subject string.
Definition: regexp_engine.cc:44
std::u16string m_current_subject
Definition: regexp_engine.h:256
URegularExpression * m_re
Our handle to ICU's compiled regular expression, owned by instances of this class.
Definition: regexp_engine.h:254
bool HasWarning() const
Definition: regexp_engine.h:180
int TryToAppendReplacement(const std::u16string &replacement)
Preflight function: If the buffer capacity is adequate, the replacement is appended to the buffer,...
Definition: regexp_engine.cc:139
Regexp_engine(const std::u16string &pattern, uint flags, int stack_limit, int time_limit)
Compiles the URegularExpression object.
Definition: regexp_engine.h:99
friend class regexp_engine_unittest::Mock_regexp_engine
Definition: regexp_engine.h:230
uint flags()
Definition: regexp_engine.h:112
int EndOfMatch()
Returns the position in the input string right after the end of the text where Matches() found a matc...
Definition: regexp_engine.h:153
int TryToAppendTail()
Tries to append the part of the subject string after the last match to the buffer.
Definition: regexp_engine.cc:173
bool CheckError() const
Definition: regexp_engine.h:184
int SpareCapacity() const
The spare capacity in the replacement buffer, given in code points.
Definition: regexp_engine.h:226
void AppendHead(size_t size)
Fills in the prefix in case we are doing a replace operation starting on a non-first occurrence of th...
Definition: regexp_engine.cc:118
std::u16string m_replace_buffer
Definition: regexp_engine.h:257
size_t HardLimit()
The hard limit for growing the replace buffer.
Definition: regexp_engine.h:192
void AppendReplacement(const std::u16string &replacement)
Tries to write the replacement, growing the buffer if needed.
Definition: regexp_engine.cc:150
bool Matches(int start, int occurrence)
Tries to find match number occurrence in the string, starting on start.
Definition: regexp_engine.cc:55
thread_local THD * current_thd
Definition: current_thd.cc:26
static void start(mysql_harness::PluginFuncEnv *env)
Definition: http_auth_backend_plugin.cc:177
A better implementation of the UNIX ctype(3) library.
Log error(cerr, "ERROR")
bool length(const dd::Spatial_reference_system *srs, const Geometry *g1, double *length, bool *null) noexcept
Computes the length of linestrings and multilinestrings.
Definition: length.cc:76
Definition: regexp_engine.h:44
Definition: errors.cc:45
static constexpr CHARSET_INFO * regexp_lib_charset
Definition: regexp_engine.h:50
UBool QueryNotKilled(const void *thd, int32_t)
Implements a match callback function for icu that aborts execution if the query was killed.
Definition: regexp_engine.cc:38
bool check_icu_status(UErrorCode status, const UParseError *parse_error)
Definition: errors.cc:97
const char * icu_version_string()
Definition: regexp_engine.cc:42
CHARSET_INFO my_charset_utf16le_general_ci
Definition: ctype-ucs2.cc:1571
CHARSET_INFO my_charset_utf16_general_ci
Definition: ctype-ucs2.cc:1415
Definition: m_ctype.h:385
unsigned int uint
Definition: uca9-dump.cc:75