MySQL 8.4.0
Source Code Documentation
cost_model.h
Go to the documentation of this file.
1/* Copyright (c) 2020, 2024, Oracle and/or its affiliates.
2
3 This program is free software; you can redistribute it and/or modify
4 it under the terms of the GNU General Public License, version 2.0,
5 as published by the Free Software Foundation.
6
7 This program is designed to work with certain software (including
8 but not limited to OpenSSL) that is licensed under separate terms,
9 as designated in a particular file or component or in included license
10 documentation. The authors of MySQL hereby grant you an additional
11 permission to link the program and your derivative works with the
12 separately licensed software that they have either included with
13 the program or referenced in the documentation.
14
15 This program is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License, version 2.0, for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with this program; if not, write to the Free Software
22 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
23
24#ifndef SQL_JOIN_OPTIMIZER_COST_MODEL_H_
25#define SQL_JOIN_OPTIMIZER_COST_MODEL_H_
26
27#include "my_base.h"
30#include "sql/mem_root_array.h"
31
32struct AccessPath;
34class Item;
35class Query_block;
36class THD;
37struct TABLE;
38
39/**
40 When we make cost estimates, we use this as the maximal length the
41 values we get from evaluating an Item (in bytes). Actual values of
42 e.g. blobs may be much longer, but even so we use this as an upper
43 limit when doing cost calculations. (For context, @see Item#max_length .)
44*/
45constexpr size_t kMaxItemLengthEstimate = 4096;
46
47// These are extremely arbitrary cost model constants. We should revise them
48// based on actual query times (possibly using linear regression?), and then
49// put them into the cost model to make them user-tunable.
50constexpr double kApplyOneFilterCost = 0.1;
51constexpr double kAggregateOneRowCost = 0.1;
52constexpr double kSortOneRowCost = 0.1;
53constexpr double kHashBuildOneRowCost = 0.1;
54constexpr double kHashProbeOneRowCost = 0.1;
55constexpr double kHashReturnOneRowCost = 0.07;
56constexpr double kMaterializeOneRowCost = 0.1;
57constexpr double kWindowOneRowCost = 0.1;
58
59/// A fallback cardinality estimate that is used in case the storage engine
60/// cannot provide one (like for table functions). It's a fairly arbitrary
61/// non-zero value.
63
64/// See EstimateFilterCost.
65struct FilterCost {
66 /// Cost of evaluating the filter for all rows if subqueries are not
67 /// materialized. (Note that this includes the contribution from
68 /// init_cost_if_not_materialized.)
70
71 /// Initial cost before the filter can be applied for the first time.
72 /// Typically the cost of executing 'independent subquery' in queries like:
73 /// "SELECT * FROM tab WHERE field = <independent subquery>".
74 /// (That corresponds to the Item_singlerow_subselect class.)
76
77 /// Cost of evaluating the filter for all rows if all subqueries in
78 /// it have been materialized beforehand. If there are no subqueries
79 /// in the condition, equals cost_if_not_materialized.
81
82 /// Cost of materializing all subqueries present in the filter.
83 /// If there are no subqueries in the condition, equals zero.
85};
86
87/// Used internally by EstimateFilterCost() only.
88void AddCost(THD *thd, const ContainedSubquery &subquery, double num_rows,
89 FilterCost *cost);
90
91/**
92 Estimate the cost of evaluating “condition”, “num_rows” times.
93 This is a fairly rudimentary estimation, _but_ it includes the cost
94 of any subqueries that may be present and that need evaluation.
95 */
96FilterCost EstimateFilterCost(THD *thd, double num_rows, Item *condition,
97 const Query_block *outer_query_block);
98
99/**
100 A cheaper overload of EstimateFilterCost() that assumes that all
101 contained subqueries have already been extracted (ie., it skips the
102 walking, which can be fairly expensive). This data is typically
103 computed by FindContainedSubqueries().
104 */
106 THD *thd, double num_rows,
107 const Mem_root_array<ContainedSubquery> &contained_subqueries) {
108 FilterCost cost;
111
112 for (const ContainedSubquery &subquery : contained_subqueries) {
113 AddCost(thd, subquery, num_rows, &cost);
114 }
115 return cost;
116}
117
118double EstimateCostForRefAccess(THD *thd, TABLE *table, unsigned key_idx,
119 double num_output_rows);
120
121/**
122 Estimate costs and output rows for a SORT AccessPath.
123 @param thd Current thread.
124 @param path the AccessPath.
125 @param distinct_rows An estimate of the number of distinct rows, if
126 remove_duplicates==true and we have an estimate already.
127*/
129 double distinct_rows = kUnknownRowCount);
130
132
133/**
134 Estimate the number of rows with a distinct combination of values for
135 'terms'. @see EstimateDistinctRowsFromStatistics for additional details.
136 @param thd The current thread.
137 @param child_rows The number of input rows.
138 @param terms The terms for which we estimate the number of unique
139 combinations.
140 @returns The estimated number of output rows.
141*/
142double EstimateDistinctRows(THD *thd, double child_rows,
144/**
145 Estimate costs and result row count for an aggregate operation.
146 @param[in,out] thd The current thread.
147 @param[in,out] path The AGGREGATE path.
148 @param[in] query_block The Query_block to which 'path' belongs.
149 */
151 const Query_block *query_block);
154
155/// Estimate the costs and row count for a STREAM AccessPath.
157
158/**
159 Estimate the costs and row count for a WINDOW AccessPath. As described in
160 @see AccessPath::m_init_cost, the cost to read k out of N rows would be
161 init_cost + (k/N) * (cost - init_cost).
162*/
164
165/// Estimate the costs and row count for a WINDOW AccessPath.
167
168/**
169 Estimate the fan out for a left semijoin or a left antijoin. The fan out
170 is defined as the number of result rows, divided by the number of input
171 rows from the left hand relation. For a semijoin, J1:
172
173 SELECT ... FROM t1 WHERE EXISTS (SELECT ... FROM t2 WHERE predicate)
174
175 we know that the fan out of the corresponding inner join J2:
176
177 SELECT ... FROM t1, t2 WHERE predicate
178
179 is: F(J2) = CARD(t2) * SELECTIVITY(predicate) , where CARD(t2)=right_rows,
180 and SELECTIVITY(predicate)=edge.selectivity. If 'predicate' is a
181 deterministic function of t1 and t2 rows, then J1 is equivalent to an inner
182 join J3:
183
184 SELECT ... FROM t1 JOIN (SELECT DISTINCT f1,..fn FROM t2) d ON predicate
185
186 where f1,..fn are those fields from t2 that appear in the predicate.
187
188 Then F(J1) = F(J3) = F(J2) * CARD(d) / CARD(t2)
189 = CARD(d) * SELECTIVITY(predicate).
190
191 This function therefore collects f1..fn and estimates CARD(d). As a special
192 case, 'predicate' may be independent of t2. The query is then equivalent to:
193
194 SELECT ... FROM t1 WHERE predicate AND (SELECT COUNT(*) FROM t2) > 0
195
196 The fan out is then the selectivity of 'predicate' multiplied by the
197 probability of t2 having at least one row.
198
199 @param thd The current thread.
200 @param right_rows The number of input rows from the right hand relation.
201 @param edge Join edge.
202 @returns fan out.
203 */
204double EstimateSemijoinFanOut(THD *thd, double right_rows,
205 const JoinPredicate &edge);
206
207/**
208 Estimate the number of output rows from joining two relations.
209 @param thd The current thread.
210 @param left_rows Number of rows in the left hand relation.
211 @param right_rows Number of rows in the right hand relation.
212 @param edge The join between the two relations.
213*/
214inline double FindOutputRowsForJoin(THD *thd, double left_rows,
215 double right_rows,
216 const JoinPredicate *edge) {
217 switch (edge->expr->type) {
219 // For outer joins, every outer row produces at least one row (if none
220 // are matching, we get a NULL-complemented row).
221 // Note that this can cause inconsistent row counts; see bug #33550360
222 // and/or JoinHypergraph::has_reordered_left_joins.
223 return left_rows * std::max(right_rows * edge->selectivity, 1.0);
224
226 return left_rows * EstimateSemijoinFanOut(thd, right_rows, *edge);
227
229 // Antijoin are estimated as simply the opposite of semijoin (see above),
230 // but wrongly estimating 0 rows (or, of course, a negative amount) could
231 // be really bad, so we assume at least 10% coming out as a fudge factor.
232 // It's better to estimate too high than too low here.
233 return left_rows *
234 std::max(1.0 - EstimateSemijoinFanOut(thd, right_rows, *edge),
235 0.1);
236
239 return left_rows * right_rows * edge->selectivity;
240
241 case RelationalExpression::FULL_OUTER_JOIN: // Not implemented.
242 case RelationalExpression::MULTI_INNER_JOIN: // Should not appear here.
243 case RelationalExpression::TABLE: // Should not appear here.
244 assert(false);
245 return 0;
246
247 default:
248 assert(false);
249 return 0;
250 }
251}
252
253#endif // SQL_JOIN_OPTIMIZER_COST_MODEL_H_
constexpr double kUnknownRowCount
To indicate that a row estimate is not yet made.
Definition: access_path.h:184
A wrapper class which provides array bounds checking.
Definition: sql_array.h:47
Base class that is used to represent any kind of expression in a relational query.
Definition: item.h:934
A typesafe replacement for DYNAMIC_ARRAY.
Definition: mem_root_array.h:426
This class represents a query block, aka a query specification, which is a query consisting of a SELE...
Definition: sql_lex.h:1163
For each client connection we create a separate thread with THD serving as a thread/connection descri...
Definition: sql_lexer_thd.h:36
constexpr double kHashReturnOneRowCost
Definition: cost_model.h:55
constexpr ha_rows kRowEstimateFallback
A fallback cardinality estimate that is used in case the storage engine cannot provide one (like for ...
Definition: cost_model.h:62
void EstimateSortCost(THD *thd, AccessPath *path, double distinct_rows=kUnknownRowCount)
Estimate costs and output rows for a SORT AccessPath.
Definition: cost_model.cc:96
void EstimateWindowCost(AccessPath *path)
Estimate the costs and row count for a WINDOW AccessPath.
Definition: cost_model.cc:975
constexpr size_t kMaxItemLengthEstimate
When we make cost estimates, we use this as the maximal length the values we get from evaluating an I...
Definition: cost_model.h:45
void AddCost(THD *thd, const ContainedSubquery &subquery, double num_rows, FilterCost *cost)
Used internally by EstimateFilterCost() only.
Definition: cost_model.cc:153
constexpr double kHashBuildOneRowCost
Definition: cost_model.h:53
void EstimateLimitOffsetCost(AccessPath *path)
Estimate the costs and row count for a WINDOW AccessPath.
Definition: cost_model.cc:941
constexpr double kWindowOneRowCost
Definition: cost_model.h:57
constexpr double kApplyOneFilterCost
Definition: cost_model.h:50
void EstimateDeleteRowsCost(AccessPath *path)
Definition: cost_model.cc:891
constexpr double kMaterializeOneRowCost
Definition: cost_model.h:56
void EstimateStreamCost(AccessPath *path)
Estimate the costs and row count for a STREAM AccessPath.
Definition: cost_model.cc:925
double EstimateCostForRefAccess(THD *thd, TABLE *table, unsigned key_idx, double num_output_rows)
Definition: cost_model.cc:63
double FindOutputRowsForJoin(THD *thd, double left_rows, double right_rows, const JoinPredicate *edge)
Estimate the number of output rows from joining two relations.
Definition: cost_model.h:214
void EstimateUpdateRowsCost(AccessPath *path)
Definition: cost_model.cc:908
void EstimateMaterializeCost(THD *thd, AccessPath *path)
Definition: cost_model.cc:209
constexpr double kHashProbeOneRowCost
Definition: cost_model.h:54
constexpr double kSortOneRowCost
Definition: cost_model.h:52
double EstimateDistinctRows(THD *thd, double child_rows, Bounds_checked_array< const Item *const > terms)
Estimate the number of rows with a distinct combination of values for 'terms'.
constexpr double kAggregateOneRowCost
Definition: cost_model.h:51
void EstimateAggregateCost(THD *thd, AccessPath *path, const Query_block *query_block)
Estimate costs and result row count for an aggregate operation.
Definition: cost_model.cc:872
FilterCost EstimateFilterCost(THD *thd, double num_rows, Item *condition, const Query_block *outer_query_block)
Estimate the cost of evaluating “condition”, “num_rows” times.
Definition: cost_model.cc:194
double EstimateSemijoinFanOut(THD *thd, double right_rows, const JoinPredicate &edge)
Estimate the fan out for a left semijoin or a left antijoin.
Definition: cost_model.cc:985
This file includes constants used by all storage engines.
my_off_t ha_rows
Definition: my_base.h:1141
static char * path
Definition: mysqldump.cc:149
static PFS_engine_table_share_proxy table
Definition: pfs.cc:61
Access paths are a query planning structure that correspond 1:1 to iterators, in that an access path ...
Definition: access_path.h:213
This class represents a subquery contained in some subclass of Item_subselect,.
Definition: item.h:866
See EstimateFilterCost.
Definition: cost_model.h:65
double init_cost_if_not_materialized
Initial cost before the filter can be applied for the first time.
Definition: cost_model.h:75
double cost_to_materialize
Cost of materializing all subqueries present in the filter.
Definition: cost_model.h:84
double cost_if_materialized
Cost of evaluating the filter for all rows if all subqueries in it have been materialized beforehand.
Definition: cost_model.h:80
double cost_if_not_materialized
Cost of evaluating the filter for all rows if subqueries are not materialized.
Definition: cost_model.h:69
A specification that two specific relational expressions (e.g., two tables, or a table and a join bet...
Definition: access_path.h:78
RelationalExpression * expr
Definition: access_path.h:79
double selectivity
Definition: access_path.h:80
enum RelationalExpression::Type type
@ SEMIJOIN
Left semijoin.
Definition: relational_expression.h:151
@ MULTI_INNER_JOIN
Definition: relational_expression.h:173
@ STRAIGHT_INNER_JOIN
Definition: relational_expression.h:159
@ ANTIJOIN
Left antijoin.
Definition: relational_expression.h:154
@ INNER_JOIN
Definition: relational_expression.h:147
@ FULL_OUTER_JOIN
Definition: relational_expression.h:166
@ TABLE
Definition: relational_expression.h:175
@ LEFT_JOIN
Definition: relational_expression.h:148
Definition: table.h:1405