MySQL 8.4.0
Source Code Documentation
regexp_engine.h
Go to the documentation of this file.
1#ifndef SQL_REGEXP_REGEXP_ENGINE_H_
2#define SQL_REGEXP_REGEXP_ENGINE_H_
3
4/* Copyright (c) 2017, 2024, Oracle and/or its affiliates.
5
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License, version 2.0,
8 as published by the Free Software Foundation.
9
10 This program is designed to work with certain software (including
11 but not limited to OpenSSL) that is licensed under separate terms,
12 as designated in a particular file or component or in included license
13 documentation. The authors of MySQL hereby grant you an additional
14 permission to link the program and your derivative works with the
15 separately licensed software that they have either included with
16 the program or referenced in the documentation.
17
18 This program is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License, version 2.0, for more details.
22
23 You should have received a copy of the GNU General Public License
24 along with this program; if not, write to the Free Software
25 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
26
27#include <unicode/uregex.h>
28
29#include <stddef.h>
30#include <stdint.h>
31#include <string>
32#include <utility>
33
34#include "sql/current_thd.h"
35#include "sql/regexp/errors.h"
36#include "sql/sql_class.h" // THD
37#include "template_utils.h"
38
39namespace regexp_engine_unittest {
40class Mock_regexp_engine;
41}
42
43namespace regexp {
44
45const char *icu_version_string();
46
47/**
48 Implements a match callback function for icu that aborts execution if the
49 query was killed.
50
51 @param context The session to check for killed query.
52 @param steps Not used.
53
54 @retval false Query was killed in the session and the match should abort.
55 @retval true Query was not killed, matching should continue.
56*/
57UBool QueryNotKilled(const void *context, int32_t steps);
58
59/**
60 This class exposes high-level regular expression operations to the
61 facade. It implements the algorithm for search-and-replace and the various
62 matching options.
63
64 A buffer is used for search-and-replace, whose initial size is that of the
65 subject string. The buffer uses ICU preflight features to probe the required
66 buffer size within each append operation, and the buffer can grow up until
67 max_allowed_packet, at which point an error will be thrown.
68*/
69class Regexp_engine {
70 public:
71 /**
72 Compiles the URegularExpression object. If compilation fails, my_error()
73 is called and IsError() returns true. In this case, all subsequent
74 operations will be no-ops, reporting failure. This follows ICU's chaining
75 conventions, see http://icu-project.org/apiref/icu4c/utypes_8h.html.
76
77 @param pattern The pattern string in ICU's character set.
78
79 @param flags ICU flags.
80
81 @param stack_limit Sets the amount of heap storage, in bytes, that the
82 match backtracking stack is allowed to allocate.
83
84 @param time_limit Gets set on the URegularExpression. Please refer to the
85 ICU API docs for the definition of time limit.
86 */
87 Regexp_engine(const std::u16string &pattern, uint flags, int stack_limit,
88 int time_limit) {
89 UParseError error;
90 auto upattern = pattern.data();
91 int length = pattern.size();
92 m_re = uregex_open(pointer_cast<const UChar *>(upattern), length, flags,
93 &error, &m_error_code);
94 uregex_setStackLimit(m_re, stack_limit, &m_error_code);
95 uregex_setTimeLimit(m_re, time_limit, &m_error_code);
96 uregex_setMatchCallback(m_re, QueryNotKilled, current_thd, &m_error_code);
97 check_icu_status(m_error_code, &error);
98 }
99
100 uint flags() {  // The flags of the compiled pattern, as reported by ICU.
101 uint flags = uregex_flags(m_re, &m_error_code);
102 check_icu_status(m_error_code);
103 return flags;
104 }
105
106 /**
107 Resets the engine with a new subject string. This also clears the
108 replacement buffer, see Replace().
109
110 @param subject The new string to match the regular
111 expression against.
112 */
113 void Reset(const std::u16string &subject);
114
115 /**
116 Tries to find match number `occurrence` in the string, starting on
117 `start`.
118
119 @param start Start position, 0-based.
120 @param occurrence Which occurrence to replace. If zero, replace all
121 occurrences.
122 */
123 bool Matches(int start, int occurrence);
124
125 /**
126 Returns the start position in the input string of the string where
127 Matches() found a match.
128 */
129 int StartOfMatch() {
130 /*
131 The 0 is for capture group number, but we don't deal with those
132 here. Zero means the start of the whole match, which is what's needed.
133 */
134 return uregex_start(m_re, 0, &m_error_code);
135 }
136
137 /**
138 Returns the position in the input string right after the end of the text
139 where Matches() found a match.
140 */
141 int EndOfMatch() {
142 // The 0 means start of capture group 0, ie., the whole match.
143 return uregex_end(m_re, 0, &m_error_code);
144 }
145
146 /**
147 Iterates over the subject string, replacing matches.
148
149 @param replacement The string to replace matches with.
150 @param start Start position, 0-based.
151 @param occurrence Which occurrence to replace. If zero, replace all
152 occurrences.
153
154 @return Reference to the result of the operation. It is guaranteed to
155 stay intact until a call is made to Reset().
156 */
157 const std::u16string &Replace(const std::u16string &replacement, int start,
158 int occurrence);
159
160 /**
161 The start of the match and its length.
162
163 @return The index of the first code point of the match, and the length of
164 the same.
165 */
166 std::pair<int, int> MatchedSubstring();
167
168 bool HasWarning() const {  // Success status other than U_ZERO_ERROR.
169 return U_SUCCESS(m_error_code) && m_error_code != U_ZERO_ERROR;
170 }
171 bool IsError() const { return U_FAILURE(m_error_code); }
172 bool CheckError() const { return check_icu_status(m_error_code); }
173
174 virtual ~Regexp_engine() { uregex_close(m_re); }
175
176 /**
177 The hard limit for growing the replace buffer. The buffer cannot grow
178 beyond this size, and an error will be thrown if the limit is reached.
179 */
180 size_t HardLimit() {
181 return current_thd->variables.max_allowed_packet / sizeof(UChar);
182 }
183
184 /**
185 Fills in the prefix in case we are doing a replace operation starting on a
186 non-first occurrence of the pattern, or a non-first start
187 position. AppendReplacement() will fill in the section starting after the
188 previous match or start position, so a prefix must be appended first.
189
190 The part we have to worry about here, the part that ICU doesn't add for
191 us, is when the search didn't start on the first character or first
192 match for the regular expression. It's the longest such prefix that we
193 have to copy ourselves.
194 */
195 void AppendHead(size_t size);
196
197 /**
198 Tries to write the replacement, growing the buffer if needed.
199
200 @param replacement The replacement string.
201 */
202 void AppendReplacement(const std::u16string &replacement);
203
204 /// Appends the trailing segment after the last match to the subject string.
205 void AppendTail();
206
207 /**
208 The spare capacity in the replacement buffer, given in code points.
209
210 ICU communicates via a `capacity` variable, but we like to use an absolute
211 position instead, and we want to keep a single source of truth, so we
212 calculate it when needed and assert that the number is correct.
213 */
214 int SpareCapacity() const {
215 return m_replace_buffer.capacity() - m_replace_buffer.size();
216 }
217
218 friend class regexp_engine_unittest::Mock_regexp_engine;
219
220 private:
221 /**
222 Preflight function: If the buffer capacity is adequate, the replacement is
223 appended to the buffer, otherwise nothing is written. Either way, the
224 replacement's full size is returned.
225 */
226 int TryToAppendReplacement(const std::u16string &replacement);
227
228 /**
229 Tries to append the part of the subject string after the last match to the
230 buffer. This is a preflight function: If the buffer capacity is adequate,
231 the tail is appended to the buffer, otherwise nothing is written. Either
232 way, the tail's full size is returned.
233 */
234 int TryToAppendTail();
235
236 /**
237 Our handle to ICU's compiled regular expression, owned by instances of
238 this class. URegularExpression is a C struct, but this class follows RAII
239 and initializes this pointer in the constructor and cleans it up in the
240 destructor.
241 */
242 URegularExpression *m_re;
243 UErrorCode m_error_code = U_ZERO_ERROR;
244 std::u16string m_current_subject;
245 std::u16string m_replace_buffer;
246 /**
247 This is always the next index in m_replace_buffer where ICU can write
248 data.
249 */
250 int m_replace_buffer_pos = 0;
251};
252
253} // namespace regexp
254
255#endif // SQL_REGEXP_REGEXP_ENGINE_H_
System_variables variables
Definition: sql_lexer_thd.h:64
This class exposes high-level regular expression operations to the facade.
Definition: regexp_engine.h:69
UErrorCode m_error_code
Definition: regexp_engine.h:243
std::pair< int, int > MatchedSubstring()
The start of the match and its length.
Definition: regexp_engine.cc:107
void AppendTail()
Appends the trailing segment after the last match to the subject string.
Definition: regexp_engine.cc:181
int StartOfMatch()
Returns the start position in the input string of the string where Matches() found a match.
Definition: regexp_engine.h:129
const std::u16string & Replace(const std::u16string &replacement, int start, int occurrence)
Iterates over the subject string, replacing matches.
Definition: regexp_engine.cc:65
int m_replace_buffer_pos
This is always the next index in m_replace_buffer where ICU can write data.
Definition: regexp_engine.h:250
bool IsError() const
Definition: regexp_engine.h:171
virtual ~Regexp_engine()
Definition: regexp_engine.h:174
void Reset(const std::u16string &subject)
Resets the engine with a new subject string.
Definition: regexp_engine.cc:44
std::u16string m_current_subject
Definition: regexp_engine.h:244
URegularExpression * m_re
Our handle to ICU's compiled regular expression, owned by instances of this class.
Definition: regexp_engine.h:242
bool HasWarning() const
Definition: regexp_engine.h:168
int TryToAppendReplacement(const std::u16string &replacement)
Preflight function: If the buffer capacity is adequate, the replacement is appended to the buffer,...
Definition: regexp_engine.cc:139
Regexp_engine(const std::u16string &pattern, uint flags, int stack_limit, int time_limit)
Compiles the URegularExpression object.
Definition: regexp_engine.h:87
friend class regexp_engine_unittest::Mock_regexp_engine
Definition: regexp_engine.h:218
uint flags()
Definition: regexp_engine.h:100
int EndOfMatch()
Returns the position in the input string right after the end of the text where Matches() found a matc...
Definition: regexp_engine.h:141
int TryToAppendTail()
Tries to append the part of the subject string after the last match to the buffer.
Definition: regexp_engine.cc:173
bool CheckError() const
Definition: regexp_engine.h:172
int SpareCapacity() const
The spare capacity in the replacement buffer, given in code points.
Definition: regexp_engine.h:214
void AppendHead(size_t size)
Fills in the prefix in case we are doing a replace operation starting on a non-first occurrence of th...
Definition: regexp_engine.cc:118
std::u16string m_replace_buffer
Definition: regexp_engine.h:245
size_t HardLimit()
The hard limit for growing the replace buffer.
Definition: regexp_engine.h:180
void AppendReplacement(const std::u16string &replacement)
Tries to write the replacement, growing the buffer if needed.
Definition: regexp_engine.cc:150
bool Matches(int start, int occurrence)
Tries to find match number occurrence in the string, starting on start.
Definition: regexp_engine.cc:55
thread_local THD * current_thd
Definition: current_thd.cc:26
static void start(mysql_harness::PluginFuncEnv *env)
Definition: http_auth_backend_plugin.cc:180
void error(const char *format,...)
bool length(const dd::Spatial_reference_system *srs, const Geometry *g1, double *length, bool *null) noexcept
Computes the length of linestrings and multilinestrings.
Definition: length.cc:76
size_t size(const char *const c)
Definition: base64.h:46
Definition: regexp_engine.h:39
Definition: errors.cc:45
UBool QueryNotKilled(const void *thd, int32_t)
Implements a match callback function for icu that aborts execution if the query was killed.
Definition: regexp_engine.cc:38
bool check_icu_status(UErrorCode status, const UParseError *parse_error)
Definition: errors.cc:97
const char * icu_version_string()
Definition: regexp_engine.cc:42