MySQL 8.3.0
Source Code Documentation
cost_model.h
Go to the documentation of this file.
1/* Copyright (c) 2020, 2023, Oracle and/or its affiliates.
2
3 This program is free software; you can redistribute it and/or modify
4 it under the terms of the GNU General Public License, version 2.0,
5 as published by the Free Software Foundation.
6
7 This program is also distributed with certain software (including
8 but not limited to OpenSSL) that is licensed under separate terms,
9 as designated in a particular file or component or in included license
10 documentation. The authors of MySQL hereby grant you an additional
11 permission to link the program and your derivative works with the
12 separately licensed software that they have included with MySQL.
13
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License, version 2.0, for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
22
23#ifndef SQL_JOIN_OPTIMIZER_COST_MODEL_H_
24#define SQL_JOIN_OPTIMIZER_COST_MODEL_H_
25
26#include "my_base.h"
29#include "sql/mem_root_array.h"
30
31struct AccessPath;
33class Item;
34class Query_block;
35class THD;
36struct TABLE;
37
38/**
39 When we make cost estimates, we use this as the maximal length the
40 values we get from evaluating an Item (in bytes). Actual values of
41 e.g. blobs may be much longer, but even so we use this as an upper
42 limit when doing cost calculations. (For context, @see Item#max_length .)
43*/
44constexpr size_t kMaxItemLengthEstimate = 4096;
45
46// These are extremely arbitrary cost model constants. We should revise them
47// based on actual query times (possibly using linear regression?), and then
48// put them into the cost model to make them user-tunable.
49constexpr double kApplyOneFilterCost = 0.1;
50constexpr double kAggregateOneRowCost = 0.1;
51constexpr double kSortOneRowCost = 0.1;
52constexpr double kHashBuildOneRowCost = 0.1;
53constexpr double kHashProbeOneRowCost = 0.1;
54constexpr double kHashReturnOneRowCost = 0.07;
55constexpr double kMaterializeOneRowCost = 0.1;
56constexpr double kWindowOneRowCost = 0.1;
57
58/// A fallback cardinality estimate that is used in case the storage engine
59/// cannot provide one (like for table functions). It's a fairly arbitrary
60/// non-zero value.
62
63/// See EstimateFilterCost.
64struct FilterCost {
65 /// Cost of evaluating the filter for all rows if subqueries are not
66 /// materialized. (Note that this includes the contribution from
67 /// init_cost_if_not_materialized.)
69
70 /// Initial cost before the filter can be applied for the first time.
71 /// Typically the cost of executing 'independent subquery' in queries like:
72 /// "SELECT * FROM tab WHERE field = <independent subquery>".
73 /// (That corresponds to the Item_singlerow_subselect class.)
75
76 /// Cost of evaluating the filter for all rows if all subqueries in
77 /// it have been materialized beforehand. If there are no subqueries
78 /// in the condition, equals cost_if_not_materialized.
80
81 /// Cost of materializing all subqueries present in the filter.
82 /// If there are no subqueries in the condition, equals zero.
84};
85
86/// Used internally by EstimateFilterCost() only.
87void AddCost(THD *thd, const ContainedSubquery &subquery, double num_rows,
88 FilterCost *cost);
89
90/**
91 Estimate the cost of evaluating “condition”, “num_rows” times.
92 This is a fairly rudimentary estimation, _but_ it includes the cost
93 of any subqueries that may be present and that need evaluation.
94 */
95FilterCost EstimateFilterCost(THD *thd, double num_rows, Item *condition,
96 const Query_block *outer_query_block);
97
98/**
99 A cheaper overload of EstimateFilterCost() that assumes that all
100 contained subqueries have already been extracted (ie., it skips the
101 walking, which can be fairly expensive). This data is typically
102 computed by FindContainedSubqueries().
103 */
105 THD *thd, double num_rows,
106 const Mem_root_array<ContainedSubquery> &contained_subqueries) {
107 FilterCost cost;
110
111 for (const ContainedSubquery &subquery : contained_subqueries) {
112 AddCost(thd, subquery, num_rows, &cost);
113 }
114 return cost;
115}
116
117double EstimateCostForRefAccess(THD *thd, TABLE *table, unsigned key_idx,
118 double num_output_rows);
121
122/**
123 Estimate the number of rows with a distinct combination of values for
124 'terms'. @see EstimateDistinctRowsFromStatistics for additional details.
125 @param child_rows The number of input rows.
126 @param terms The terms for which we estimate the number of unique
127 combinations.
128 @param trace Optimizer trace.
129 @returns The estimated number of output rows.
130*/
131double EstimateDistinctRows(double child_rows,
133 std::string *trace = nullptr);
134/**
135 Estimate costs and result row count for an aggregate operation.
136 @param[in,out] path The AGGREGATE path.
137 @param[in] query_block The Query_block to which 'path' belongs.
138 @param[in,out] trace Optimizer trace text.
139 */
141 std::string *trace = nullptr);
144
145/// Estimate the costs and row count for a STREAM AccessPath.
147
148/// Estimate the costs and row count for a LIMIT_OFFSET AccessPath.
150
151/// Estimate the costs and row count for a WINDOW AccessPath.
153
154/**
155 Estimate the fan out for a left semijoin or a left antijoin. The fan out
156 is defined as the number of result rows, divided by the number of input
157 rows from the left hand relation. For a semijoin, J1:
158
159 SELECT ... FROM t1 WHERE EXISTS (SELECT ... FROM t2 WHERE predicate)
160
161 we know that the fan out of the corresponding inner join J2:
162
163 SELECT ... FROM t1, t2 WHERE predicate
164
165 is: F(J2) = CARD(t2) * SELECTIVITY(predicate) , where CARD(t2)=right_rows,
166 and SELECTIVITY(predicate)=edge.selectivity. If 'predicate' is a
167 deterministic function of t1 and t2 rows, then J1 is equivalent to an inner
168 join J3:
169
170 SELECT ... FROM t1 JOIN (SELECT DISTINCT f1,..fn FROM t2) d ON predicate
171
172 where f1,..fn are those fields from t2 that appear in the predicate.
173
174 Then F(J1) = F(J3) = F(J2) * CARD(d) / CARD(t2)
175 = CARD(d) * SELECTIVITY(predicate).
176
177 This function therefore collects f1..fn and estimates CARD(d). As a special
178 case, 'predicate' may be independent of t2. The query is then equivalent to:
179
180 SELECT ... FROM t1 WHERE predicate AND (SELECT COUNT(*) FROM t2) > 0
181
182 The fan out is then the selectivity of 'predicate' multiplied by the
183 probability of t2 having at least one row.
184
185 @param right_rows The number of input rows from the right hand relation.
186 @param edge Join edge.
187 @returns fan out.
188 */
189double EstimateSemijoinFanOut(double right_rows, const JoinPredicate &edge);
190
191/**
192 Estimate the number of output rows from joining two relations.
193 @param left_rows Number of rows in the left hand relation.
194 @param right_rows Number of rows in the right hand relation.
195 @param edge The join between the two relations.
196*/
197inline double FindOutputRowsForJoin(double left_rows, double right_rows,
198 const JoinPredicate *edge) {
199 switch (edge->expr->type) {
201 // For outer joins, every outer row produces at least one row (if none
202 // are matching, we get a NULL-complemented row).
203 // Note that this can cause inconsistent row counts; see bug #33550360
204 // and/or JoinHypergraph::has_reordered_left_joins.
205 return left_rows * std::max(right_rows * edge->selectivity, 1.0);
206
208 return left_rows * EstimateSemijoinFanOut(right_rows, *edge);
209
211 // Antijoin are estimated as simply the opposite of semijoin (see above),
212 // but wrongly estimating 0 rows (or, of course, a negative amount) could
213 // be really bad, so we assume at least 10% coming out as a fudge factor.
214 // It's better to estimate too high than too low here.
215 return left_rows *
216 std::max(1.0 - EstimateSemijoinFanOut(right_rows, *edge), 0.1);
217
220 return left_rows * right_rows * edge->selectivity;
221
222 case RelationalExpression::FULL_OUTER_JOIN: // Not implemented.
223 case RelationalExpression::MULTI_INNER_JOIN: // Should not appear here.
224 case RelationalExpression::TABLE: // Should not appear here.
225 assert(false);
226 return 0;
227
228 default:
229 assert(false);
230 return 0;
231 }
232}
233
234#endif // SQL_JOIN_OPTIMIZER_COST_MODEL_H_
A wrapper class which provides array bounds checking.
Definition: sql_array.h:46
Base class that is used to represent any kind of expression in a relational query.
Definition: item.h:933
A typesafe replacement for DYNAMIC_ARRAY.
Definition: mem_root_array.h:425
This class represents a query block, aka a query specification, which is a query consisting of a SELE...
Definition: sql_lex.h:1161
For each client connection we create a separate thread with THD serving as a thread/connection descri...
Definition: sql_lexer_thd.h:35
constexpr double kHashReturnOneRowCost
Definition: cost_model.h:54
constexpr ha_rows kRowEstimateFallback
A fallback cardinality estimate that is used in case the storage engine cannot provide one (like for ...
Definition: cost_model.h:61
void EstimateWindowCost(AccessPath *path)
Estimate the costs and row count for a WINDOW AccessPath.
Definition: cost_model.cc:966
double FindOutputRowsForJoin(double left_rows, double right_rows, const JoinPredicate *edge)
Estimate the number of output rows from joining two relations.
Definition: cost_model.h:197
constexpr size_t kMaxItemLengthEstimate
When we make cost estimates, we use this as the maximal length the values we get from evaluating an I...
Definition: cost_model.h:44
void EstimateAggregateCost(AccessPath *path, const Query_block *query_block, std::string *trace=nullptr)
Estimate costs and result row count for an aggregate operation.
void AddCost(THD *thd, const ContainedSubquery &subquery, double num_rows, FilterCost *cost)
Used internally by EstimateFilterCost() only.
Definition: cost_model.cc:140
constexpr double kHashBuildOneRowCost
Definition: cost_model.h:52
void EstimateLimitOffsetCost(AccessPath *path)
Estimate the costs and row count for a LIMIT_OFFSET AccessPath.
Definition: cost_model.cc:932
constexpr double kWindowOneRowCost
Definition: cost_model.h:56
constexpr double kApplyOneFilterCost
Definition: cost_model.h:49
double EstimateSemijoinFanOut(double right_rows, const JoinPredicate &edge)
Estimate the fan out for a left semijoin or a left antijoin.
Definition: cost_model.cc:976
double EstimateDistinctRows(double child_rows, Bounds_checked_array< const Item *const > terms, std::string *trace=nullptr)
Estimate the number of rows with a distinct combination of values for 'terms'.
void EstimateDeleteRowsCost(AccessPath *path)
Definition: cost_model.cc:882
constexpr double kMaterializeOneRowCost
Definition: cost_model.h:55
void EstimateStreamCost(AccessPath *path)
Estimate the costs and row count for a STREAM AccessPath.
Definition: cost_model.cc:916
double EstimateCostForRefAccess(THD *thd, TABLE *table, unsigned key_idx, double num_output_rows)
Definition: cost_model.cc:61
void EstimateUpdateRowsCost(AccessPath *path)
Definition: cost_model.cc:899
void EstimateMaterializeCost(THD *thd, AccessPath *path)
Definition: cost_model.cc:196
constexpr double kHashProbeOneRowCost
Definition: cost_model.h:53
constexpr double kSortOneRowCost
Definition: cost_model.h:51
constexpr double kAggregateOneRowCost
Definition: cost_model.h:50
FilterCost EstimateFilterCost(THD *thd, double num_rows, Item *condition, const Query_block *outer_query_block)
Estimate the cost of evaluating “condition”, “num_rows” times.
Definition: cost_model.cc:181
void EstimateSortCost(AccessPath *path)
Definition: cost_model.cc:94
This file includes constants used by all storage engines.
my_off_t ha_rows
Definition: my_base.h:1140
static char * path
Definition: mysqldump.cc:148
static PFS_engine_table_share_proxy table
Definition: pfs.cc:60
Access paths are a query planning structure that correspond 1:1 to iterators, in that an access path ...
Definition: access_path.h:212
This class represents a subquery contained in some subclass of Item_subselect,.
Definition: item.h:865
See EstimateFilterCost.
Definition: cost_model.h:64
double init_cost_if_not_materialized
Initial cost before the filter can be applied for the first time.
Definition: cost_model.h:74
double cost_to_materialize
Cost of materializing all subqueries present in the filter.
Definition: cost_model.h:83
double cost_if_materialized
Cost of evaluating the filter for all rows if all subqueries in it have been materialized beforehand.
Definition: cost_model.h:79
double cost_if_not_materialized
Cost of evaluating the filter for all rows if subqueries are not materialized.
Definition: cost_model.h:68
A specification that two specific relational expressions (e.g., two tables, or a table and a join bet...
Definition: access_path.h:77
RelationalExpression * expr
Definition: access_path.h:78
double selectivity
Definition: access_path.h:79
enum RelationalExpression::Type type
@ SEMIJOIN
Left semijoin.
Definition: relational_expression.h:150
@ MULTI_INNER_JOIN
Definition: relational_expression.h:172
@ STRAIGHT_INNER_JOIN
Definition: relational_expression.h:158
@ ANTIJOIN
Left antijoin.
Definition: relational_expression.h:153
@ INNER_JOIN
Definition: relational_expression.h:146
@ FULL_OUTER_JOIN
Definition: relational_expression.h:165
@ TABLE
Definition: relational_expression.h:174
@ LEFT_JOIN
Definition: relational_expression.h:147
Definition: table.h:1403