MySQL 8.3.0
Source Code Documentation
regexp_engine.h
Go to the documentation of this file.
1#ifndef SQL_REGEXP_REGEXP_ENGINE_H_
2#define SQL_REGEXP_REGEXP_ENGINE_H_
3
4/* Copyright (c) 2017, 2023, Oracle and/or its affiliates.
5
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License, version 2.0,
8 as published by the Free Software Foundation.
9
10 This program is also distributed with certain software (including
11 but not limited to OpenSSL) that is licensed under separate terms,
12 as designated in a particular file or component or in included license
13 documentation. The authors of MySQL hereby grant you an additional
14 permission to link the program and your derivative works with the
15 separately licensed software that they have included with MySQL.
16
17 This program is distributed in the hope that it will be useful,
18 but WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 GNU General Public License, version 2.0, for more details.
21
22 You should have received a copy of the GNU General Public License
23 along with this program; if not, write to the Free Software
24 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
25
26#include <unicode/uregex.h>
27
28#include <stddef.h>
29#include <stdint.h>
30#include <string>
31#include <utility>
32
33#include "sql/current_thd.h"
34#include "sql/regexp/errors.h"
35#include "sql/sql_class.h" // THD
36#include "template_utils.h"
37
39class Mock_regexp_engine;
40}
41
42namespace regexp {
43
44const char *icu_version_string();
45
46/**
47 Implements a match callback function for icu that aborts execution if the
48 query was killed.
49
50 @param context The session to check for killed query.
51 @param steps Not used.
52
53 @retval false Query was killed in the session and the match should abort.
54 @retval true Query was not killed, matching should continue.
55*/
56UBool QueryNotKilled(const void *context, int32_t steps);
57
58/**
59 This class exposes high-level regular expression operations to the
60 facade. It implements the algorithm for search-and-replace and the various
61 matching options.
62
63 A buffer is used for search-and-replace, whose initial size is that of the
64 subject string. The buffer uses ICU preflight features to probe the required
65 buffer size within each append operation, and the buffer can grow up until
66 max_allowed_packet, at which point an error will be thrown.
67*/
69 public:
70 /**
71 Compiles the URegularExpression object. If compilation fails, my_error()
72 is called and IsError() returns true. In this case, all subsequent
73 operations will be no-ops, reporting failure. This follows ICU's chaining
74 conventions, see http://icu-project.org/apiref/icu4c/utypes_8h.html.
75
76 @param pattern The pattern string in ICU's character set.
77
78 @param flags ICU flags.
79
80 @param stack_limit Sets the amount of heap storage, in bytes, that the
81 match backtracking stack is allowed to allocate.
82
83 @param time_limit Gets set on the URegularExpression. Please refer to the
84 ICU API docs for the definition of time limit.
85 */
86 Regexp_engine(const std::u16string &pattern, uint flags, int stack_limit,
87 int time_limit) {
88 UParseError error;  // Passed to uregex_open() below as its parse-error out-parameter.
89 auto upattern = pattern.data();
90 int length = pattern.size();  // size() yields a size_t; it is narrowed to int here.
91 m_re = uregex_open(pointer_cast<const UChar *>(upattern), length, flags,
92 &error, &m_error_code);
// The setters below are handed the same m_error_code as uregex_open(), so a
// single status member records the outcome of the whole sequence.
93 uregex_setStackLimit(m_re, stack_limit, &m_error_code);
94 uregex_setTimeLimit(m_re, time_limit, &m_error_code);
95 uregex_setMatchCallback(m_re, QueryNotKilled, current_thd, &m_error_code);  // current_thd is registered as the callback's context argument.
// NOTE(review): the listing's numbering jumps from 95 to 97, so a statement
// appears to be missing from this rendering - confirm against the real header.
97 }
98
99 uint flags() {  // Returns the flags reported by ICU for the compiled expression m_re.
100 uint flags = uregex_flags(m_re, &m_error_code);  // Status of the query is written to m_error_code.
// NOTE(review): the listing's numbering jumps from 100 to 102, so a statement
// appears to be missing from this rendering - confirm against the real header.
102 return flags;
103 }
104
105 /**
106 Resets the engine with a new subject string. This also clears the
107 replacement buffer, see Replace().
108
109 @param subject The new string to match the regular
110 expression against.
111 */
112 void Reset(const std::u16string &subject);
113
114 /**
115 Tries to find match number `occurrence` in the string, starting on
116 `start`.
117
118 @param start Start position, 0-based.
119 @param occurrence Which occurrence to replace. If zero, replace all
120 occurrences.
121 */
122 bool Matches(int start, int occurrence);
123
124 /**
125 Returns the start position in the input string of the string where
126 Matches() found a match.
127 */
129 /*
130 The 0 is for capture group number, but we don't deal with those
131 here. Zero means the start of the whole match, which is what's needed.
132 */
133 return uregex_start(m_re, 0, &m_error_code);
134 }
135
136 /**
137 Returns the position in the input string right after the end of the text
138 where Matches() found a match.
139 */
141 // The 0 means capture group 0, i.e., the whole match.
142 return uregex_end(m_re, 0, &m_error_code);
143 }
144
145 /**
146 Iterates over the subject string, replacing matches.
147
148 @param replacement The string to replace matches with.
149 @param start Start position, 0-based.
150 @param occurrence Which occurrence to replace. If zero, replace all
151 occurrences.
152
153 @return Reference to the result of the operation. It is guaranteed to
154 stay intact until a call is made to Reset().
155 */
156 const std::u16string &Replace(const std::u16string &replacement, int start,
157 int occurrence);
158
159 /**
160 The start of the match and its length.
161
162 @return The index of the first code point of the match, and the length of
163 the same.
164 */
165 std::pair<int, int> MatchedSubstring();
166
167 bool HasWarning() const {  // True when the stored ICU status is a success code other than U_ZERO_ERROR.
168 return U_SUCCESS(m_error_code) && m_error_code != U_ZERO_ERROR;
169 }
170 bool IsError() const { return U_FAILURE(m_error_code); }  // True when the stored ICU status satisfies U_FAILURE.
171 bool CheckError() const { return check_icu_status(m_error_code); }  // Forwards the stored status to check_icu_status() and returns its result; NOTE(review): the meaning of that bool is defined with check_icu_status, not here - confirm.
172
173 virtual ~Regexp_engine() { uregex_close(m_re); }  // Closes the ICU handle opened in the constructor. NOTE(review): no copy/move operations are visible in this listing; if the class is copyable, two objects would close the same m_re - confirm.
174
175 /**
176 The hard limit for growing the replace buffer. The buffer cannot grow
177 beyond this size, and an error will be thrown if the limit is reached.
178 */
179 size_t HardLimit() {
// Divides the current session's max_allowed_packet by sizeof(UChar), so the
// limit is expressed as a number of UChar elements (presumably
// max_allowed_packet is a byte count - confirm).
180 return current_thd->variables.max_allowed_packet / sizeof(UChar);
181 }
182
183 /**
184 Fills in the prefix in case we are doing a replace operation starting on a
185 non-first occurrence of the pattern, or a non-first start
186 position. AppendReplacement() will fill in the section starting after the
187 previous match or start position, so a prefix must be appended first.
188
189 The part we have to worry about here, the part that ICU doesn't add for
190 us, is the case where the search didn't start on the first character or first
191 match for the regular expression. It's the longest such prefix that we
192 have to copy ourselves.
193 */
194 void AppendHead(size_t size);
195
196 /**
197 Tries to write the replacement, growing the buffer if needed.
198
199 @param replacement The replacement string.
200 */
201 void AppendReplacement(const std::u16string &replacement);
202
203 /// Appends the part of the subject string after the last match to the buffer.
204 void AppendTail();
205
206 /**
207 The spare capacity in the replacement buffer, given in UTF-16 code units.
208
209 ICU communicates via a `capacity` variable, but we like to use an absolute
210 position instead, and we want to keep a single source of truth, so we
211 calculate it when needed and assert that the number is correct.
212 */
213 int SpareCapacity() const {
// capacity() and size() are both size_t; their difference is converted to
// int by the return type.
214 return m_replace_buffer.capacity() - m_replace_buffer.size();
215 }
216
218
219 private:
220 /**
221 Preflight function: If the buffer capacity is adequate, the replacement is
222 appended to the buffer, otherwise nothing is written. Either way, the
223 replacement's full size is returned.
224 */
225 int TryToAppendReplacement(const std::u16string &replacement);
226
227 /**
228 Tries to append the part of the subject string after the last match to the
229 buffer. This is a preflight function: If the buffer capacity is adequate,
230 the tail is appended to the buffer, otherwise nothing is written. Either
231 way, the tail's full size is returned.
232 */
233 int TryToAppendTail();
234
235 /**
236 Our handle to ICU's compiled regular expression, owned by instances of
237 this class. URegularExpression is a C struct, but this class follows RAII
238 and initializes this pointer in the constructor and cleans it up in the
239 destructor.
240 */
241 URegularExpression *m_re;
242 UErrorCode m_error_code = U_ZERO_ERROR;
243 std::u16string m_current_subject;
244 std::u16string m_replace_buffer;
245 /**
246 This is always the next index in m_replace_buffer where ICU can write
247 data.
248 */
250};
251
252} // namespace regexp
253
254#endif // SQL_REGEXP_REGEXP_ENGINE_H_
System_variables variables
Definition: sql_lexer_thd.h:63
This class exposes high-level regular expression operations to the facade.
Definition: regexp_engine.h:68
UErrorCode m_error_code
Definition: regexp_engine.h:242
std::pair< int, int > MatchedSubstring()
The start of the match and its length.
Definition: regexp_engine.cc:106
void AppendTail()
Appends the part of the subject string after the last match to the buffer.
Definition: regexp_engine.cc:180
int StartOfMatch()
Returns the start position in the input string of the string where Matches() found a match.
Definition: regexp_engine.h:128
const std::u16string & Replace(const std::u16string &replacement, int start, int occurrence)
Iterates over the subject string, replacing matches.
Definition: regexp_engine.cc:64
int m_replace_buffer_pos
This is always the next index in m_replace_buffer where ICU can write data.
Definition: regexp_engine.h:249
bool IsError() const
Definition: regexp_engine.h:170
virtual ~Regexp_engine()
Definition: regexp_engine.h:173
void Reset(const std::u16string &subject)
Resets the engine with a new subject string.
Definition: regexp_engine.cc:43
std::u16string m_current_subject
Definition: regexp_engine.h:243
URegularExpression * m_re
Our handle to ICU's compiled regular expression, owned by instances of this class.
Definition: regexp_engine.h:241
bool HasWarning() const
Definition: regexp_engine.h:167
int TryToAppendReplacement(const std::u16string &replacement)
Preflight function: If the buffer capacity is adequate, the replacement is appended to the buffer,...
Definition: regexp_engine.cc:138
Regexp_engine(const std::u16string &pattern, uint flags, int stack_limit, int time_limit)
Compiles the URegularExpression object.
Definition: regexp_engine.h:86
friend class regexp_engine_unittest::Mock_regexp_engine
Definition: regexp_engine.h:217
uint flags()
Definition: regexp_engine.h:99
int EndOfMatch()
Returns the position in the input string right after the end of the text where Matches() found a matc...
Definition: regexp_engine.h:140
int TryToAppendTail()
Tries to append the part of the subject string after the last match to the buffer.
Definition: regexp_engine.cc:172
bool CheckError() const
Definition: regexp_engine.h:171
int SpareCapacity() const
The spare capacity in the replacement buffer, given in code points.
Definition: regexp_engine.h:213
void AppendHead(size_t size)
Fills in the prefix in case we are doing a replace operation starting on a non-first occurrence of th...
Definition: regexp_engine.cc:117
std::u16string m_replace_buffer
Definition: regexp_engine.h:244
size_t HardLimit()
The hard limit for growing the replace buffer.
Definition: regexp_engine.h:179
void AppendReplacement(const std::u16string &replacement)
Tries to write the replacement, growing the buffer if needed.
Definition: regexp_engine.cc:149
bool Matches(int start, int occurrence)
Tries to find match number occurrence in the string, starting on start.
Definition: regexp_engine.cc:54
thread_local THD * current_thd
Definition: current_thd.cc:25
static void start(mysql_harness::PluginFuncEnv *env)
Definition: http_auth_backend_plugin.cc:176
bool length(const dd::Spatial_reference_system *srs, const Geometry *g1, double *length, bool *null) noexcept
Computes the length of linestrings and multilinestrings.
Definition: length.cc:75
Definition: regexp_engine.h:38
Definition: errors.cc:44
UBool QueryNotKilled(const void *thd, int32_t)
Implements a match callback function for icu that aborts execution if the query was killed.
Definition: regexp_engine.cc:37
bool check_icu_status(UErrorCode status, const UParseError *parse_error)
Definition: errors.cc:96
const char * icu_version_string()
Definition: regexp_engine.cc:41