MySQL 8.1.0
Source Code Documentation
cost_model.h
Go to the documentation of this file.
1/* Copyright (c) 2020, 2023, Oracle and/or its affiliates.
2
3 This program is free software; you can redistribute it and/or modify
4 it under the terms of the GNU General Public License, version 2.0,
5 as published by the Free Software Foundation.
6
7 This program is also distributed with certain software (including
8 but not limited to OpenSSL) that is licensed under separate terms,
9 as designated in a particular file or component or in included license
10 documentation. The authors of MySQL hereby grant you an additional
11 permission to link the program and your derivative works with the
12 separately licensed software that they have included with MySQL.
13
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License, version 2.0, for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
22
23#ifndef SQL_JOIN_OPTIMIZER_COST_MODEL_H_
24#define SQL_JOIN_OPTIMIZER_COST_MODEL_H_
25
26#include "my_base.h"
29#include "sql/mem_root_array.h"
30
31struct AccessPath;
33class Item;
34class Query_block;
35class THD;
36struct TABLE;
37
38/**
39 When we make cost estimates, we use this as the maximal length the
40 values we get from evaluating an Item (in bytes). Actual values of
41 e.g. blobs may be much longer, but even so we use this as an upper
42 limit when doing cost calculations. (For context, @see Item#max_length .)
43*/
44constexpr size_t kMaxItemLengthEstimate = 4096;
45
46// These are extremely arbitrary cost model constants. We should revise them
47// based on actual query times (possibly using linear regression?), and then
48// put them into the cost model to make them user-tunable.
49constexpr double kApplyOneFilterCost = 0.1;
50constexpr double kAggregateOneRowCost = 0.1;
51constexpr double kSortOneRowCost = 0.1;
52constexpr double kHashBuildOneRowCost = 0.1;
53constexpr double kHashProbeOneRowCost = 0.1;
54constexpr double kHashReturnOneRowCost = 0.07;
55constexpr double kMaterializeOneRowCost = 0.1;
56constexpr double kWindowOneRowCost = 0.1;
57
58/// A fallback cardinality estimate that is used in case the storage engine
59/// cannot provide one (like for table functions). It's a fairly arbitrary
60/// non-zero value.
62
63/// See EstimateFilterCost.
64struct FilterCost {
65 /// Cost of evaluating the filter for all rows if subqueries are not
66 /// materialized. (Note that this includes the contribution from
67 /// init_cost_if_not_materialized.)
69
70 /// Initial cost before the filter can be applied for the first time.
71 /// Typically the cost of executing 'independent subquery' in queries like:
72 /// "SELECT * FROM tab WHERE field = <independent subquery>".
73 /// (That corresponds to the Item_singlerow_subselect class.)
75
76 /// Cost of evaluating the filter for all rows if all subqueries in
77 /// it have been materialized beforehand. If there are no subqueries
78 /// in the condition, equals cost_if_not_materialized.
80
81 /// Cost of materializing all subqueries present in the filter.
82 /// If there are no subqueries in the condition, equals zero.
84};
85
86/// Used internally by EstimateFilterCost() only.
87void AddCost(THD *thd, const ContainedSubquery &subquery, double num_rows,
88 FilterCost *cost);
89
90/**
91 Estimate the cost of evaluating “condition”, “num_rows” times.
92 This is a fairly rudimentary estimation, _but_ it includes the cost
93 of any subqueries that may be present and that need evaluation.
94 */
95FilterCost EstimateFilterCost(THD *thd, double num_rows, Item *condition,
96 const Query_block *outer_query_block);
97
98/**
99 A cheaper overload of EstimateFilterCost() that assumes that all
100 contained subqueries have already been extracted (ie., it skips the
101 walking, which can be fairly expensive). This data is typically
102 computed by FindContainedSubqueries().
103 */
105 THD *thd, double num_rows,
106 const Mem_root_array<ContainedSubquery> &contained_subqueries) {
107 FilterCost cost;
110
111 for (const ContainedSubquery &subquery : contained_subqueries) {
112 AddCost(thd, subquery, num_rows, &cost);
113 }
114 return cost;
115}
116
117double EstimateCostForRefAccess(THD *thd, TABLE *table, unsigned key_idx,
118 double num_output_rows);
121
122/**
123 Estimate the number of output rows for an aggregate operation.
124 @param child_rows The number of input rows.
125 @param aggregate_terms The terms that we aggregate on.
126 @param trace Optimizer trace.
127 @returns The estimated number of output rows.
128*/
130 double child_rows, Bounds_checked_array<const Item *const> aggregate_terms,
131 std::string *trace = nullptr);
132/**
133 Estimate costs and result row count for an aggregate operation.
134 @param[in,out] path The AGGREGATE path.
135 @param[in] query_block The Query_block to which 'path' belongs.
136 @param[in,out] trace Optimizer trace text.
137 */
139 std::string *trace = nullptr);
142
143/// Estimate the costs and row count for a STREAM AccessPath.
145
146/// Estimate the costs and row count for a LIMIT_OFFSET AccessPath.
148
149inline double FindOutputRowsForJoin(double left_rows, double right_rows,
150 const JoinPredicate *edge) {
151 const auto aggregated_right_rows = [&]() {
153 right_rows,
154 {edge->semijoin_group, static_cast<size_t>(edge->semijoin_group_size)});
155 };
156
157 double fanout;
159 // For outer joins, every outer row produces at least one row (if none
160 // are matching, we get a NULL-complemented row).
161 // Note that this can cause inconsistent row counts; see bug #33550360
162 // and/or JoinHypergraph::has_reordered_left_joins.
163 fanout = std::max(right_rows * edge->selectivity, 1.0);
164 } else if (edge->expr->type == RelationalExpression::SEMIJOIN) {
165 // Semi- and antijoin estimation is pretty tricky, since we want isn't
166 // really selectivity; we want the probability that at least one row
167 // is matching, which is something else entirely. However, given that
168 // we only have selectivity to work with, we don't really have anything
169 // better than to estimate it as a normal join and cap the result
170 // at selectivity 1.0 (ie., each outer row generates at most one inner row).
171 // Note that this can cause inconsistent row counts; see bug #33550360 and
172 // CostingReceiver::has_semijoin_with_possibly_clamped_child.
173 //
174 // Note that 'edge->selecivity' may be too high. For a query like:
175 //
176 // SELECT 1 FROM t1 WHERE EXISTS (SELECT 1 FROM t2 WHERE t1.a=t2.b)
177 //
178 // we calculate the selectivity of the predicate t1.a=t2.b as the highest
179 // of the individual selecivities of t1.a and t2.b (see
180 // EstimateSelectivity(). But the selectivity of t2.b is irrelevant here. So
181 // if we could extract the selectivity of t1.a, we would get a better
182 // estimate.
183 fanout = std::min(1.0, aggregated_right_rows() * edge->selectivity);
184 } else if (edge->expr->type == RelationalExpression::ANTIJOIN) {
185 // Antijoin are estimated as simply the opposite of semijoin (see above),
186 // but wrongly estimating 0 rows (or, of course, a negative amount) could be
187 // really bad, so we assume at least 10% coming out as a fudge factor.
188 // It's better to estimate too high than too low here.
189 fanout = std::max(1.0 - aggregated_right_rows() * edge->selectivity, 0.1);
190 } else {
191 fanout = right_rows * edge->selectivity;
192 }
193 return left_rows * fanout;
194}
195
196#endif // SQL_JOIN_OPTIMIZER_COST_MODEL_H_
A wrapper class which provides array bounds checking.
Definition: sql_array.h:46
Base class that is used to represent any kind of expression in a relational query.
Definition: item.h:853
A typesafe replacement for DYNAMIC_ARRAY.
Definition: mem_root_array.h:425
This class represents a query block, aka a query specification, which is a query consisting of a SELE...
Definition: sql_lex.h:1162
For each client connection we create a separate thread with THD serving as a thread/connection descri...
Definition: sql_lexer_thd.h:33
constexpr double kHashReturnOneRowCost
Definition: cost_model.h:54
constexpr ha_rows kRowEstimateFallback
A fallback cardinality estimate that is used in case the storage engine cannot provide one (like for ...
Definition: cost_model.h:61
double EstimateAggregateRows(double child_rows, Bounds_checked_array< const Item *const > aggregate_terms, std::string *trace=nullptr)
Estimate the number of output rows for an aggregate operation.
double FindOutputRowsForJoin(double left_rows, double right_rows, const JoinPredicate *edge)
Definition: cost_model.h:149
constexpr size_t kMaxItemLengthEstimate
When we make cost estimates, we use this as the maximal length the values we get from evaluating an I...
Definition: cost_model.h:44
void EstimateAggregateCost(AccessPath *path, const Query_block *query_block, std::string *trace=nullptr)
Estimate costs and result row count for an aggregate operation.
void AddCost(THD *thd, const ContainedSubquery &subquery, double num_rows, FilterCost *cost)
Used internally by EstimateFilterCost() only.
Definition: cost_model.cc:122
constexpr double kHashBuildOneRowCost
Definition: cost_model.h:52
void EstimateLimitOffsetCost(AccessPath *path)
Estimate the costs and row count for a LIMIT_OFFSET AccessPath.
Definition: cost_model.cc:901
constexpr double kWindowOneRowCost
Definition: cost_model.h:56
constexpr double kApplyOneFilterCost
Definition: cost_model.h:49
void EstimateDeleteRowsCost(AccessPath *path)
Definition: cost_model.cc:851
constexpr double kMaterializeOneRowCost
Definition: cost_model.h:55
void EstimateStreamCost(AccessPath *path)
Estimate the costs and row count for a STREAM AccessPath.
Definition: cost_model.cc:885
double EstimateCostForRefAccess(THD *thd, TABLE *table, unsigned key_idx, double num_output_rows)
Definition: cost_model.cc:58
void EstimateUpdateRowsCost(AccessPath *path)
Definition: cost_model.cc:868
void EstimateMaterializeCost(THD *thd, AccessPath *path)
Definition: cost_model.cc:178
constexpr double kHashProbeOneRowCost
Definition: cost_model.h:53
constexpr double kSortOneRowCost
Definition: cost_model.h:51
constexpr double kAggregateOneRowCost
Definition: cost_model.h:50
FilterCost EstimateFilterCost(THD *thd, double num_rows, Item *condition, const Query_block *outer_query_block)
Estimate the cost of evaluating “condition”, “num_rows” times.
Definition: cost_model.cc:163
void EstimateSortCost(AccessPath *path)
Definition: cost_model.cc:91
This file includes constants used by all storage engines.
my_off_t ha_rows
Definition: my_base.h:1139
static char * path
Definition: mysqldump.cc:140
static PFS_engine_table_share_proxy table
Definition: pfs.cc:60
Access paths are a query planning structure that correspond 1:1 to iterators, in that an access path ...
Definition: access_path.h:193
This class represents a subquery contained in some subclass of Item_subselect,.
Definition: item.h:785
See EstimateFilterCost.
Definition: cost_model.h:64
double init_cost_if_not_materialized
Initial cost before the filter can be applied for the first time.
Definition: cost_model.h:74
double cost_to_materialize
Cost of materializing all subqueries present in the filter.
Definition: cost_model.h:83
double cost_if_materialized
Cost of evaluating the filter for all rows if all subqueries in it have been materialized beforehand.
Definition: cost_model.h:79
double cost_if_not_materialized
Cost of evaluating the filter for all rows if subqueries are not materialized.
Definition: cost_model.h:68
A specification that two specific relational expressions (e.g., two tables, or a table and a join bet...
Definition: access_path.h:76
int semijoin_group_size
Definition: access_path.h:109
RelationalExpression * expr
Definition: access_path.h:77
double selectivity
Definition: access_path.h:78
Item ** semijoin_group
Definition: access_path.h:108
enum RelationalExpression::Type type
@ SEMIJOIN
Definition: relational_expression.h:148
@ ANTIJOIN
Definition: relational_expression.h:149
@ LEFT_JOIN
Definition: relational_expression.h:147
Definition: table.h:1394