MySQL 8.0.40
Source Code Documentation
cost_model.h
Go to the documentation of this file.
1/* Copyright (c) 2020, 2024, Oracle and/or its affiliates.
2
3 This program is free software; you can redistribute it and/or modify
4 it under the terms of the GNU General Public License, version 2.0,
5 as published by the Free Software Foundation.
6
7 This program is designed to work with certain software (including
8 but not limited to OpenSSL) that is licensed under separate terms,
9 as designated in a particular file or component or in included license
10 documentation. The authors of MySQL hereby grant you an additional
11 permission to link the program and your derivative works with the
12 separately licensed software that they have either included with
13 the program or referenced in the documentation.
14
15 This program is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License, version 2.0, for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with this program; if not, write to the Free Software
22 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
23
24#ifndef SQL_JOIN_OPTIMIZER_COST_MODEL_H_
25#define SQL_JOIN_OPTIMIZER_COST_MODEL_H_
26
27#include "my_base.h"
30#include "sql/mem_root_array.h"
31
32struct AccessPath;
34class Item;
35class Query_block;
36class THD;
37struct TABLE;
38
39/**
40 When we make cost estimates, we use this as the maximal length the
41 values we get from evaluating an Item (in bytes). Actual values of
42 e.g. blobs may be much longer, but even so we use this as an upper
43 limit when doing cost calculations. (For context, @see Item#max_length .)
44*/
45constexpr size_t kMaxItemLengthEstimate = 4096;
46
47// These are extremely arbitrary cost model constants. We should revise them
48// based on actual query times (possibly using linear regression?), and then
49// put them into the cost model to make them user-tunable.
50constexpr double kApplyOneFilterCost = 0.1;
51constexpr double kAggregateOneRowCost = 0.1;
52constexpr double kSortOneRowCost = 0.1;
53constexpr double kHashBuildOneRowCost = 0.1;
54constexpr double kHashProbeOneRowCost = 0.1;
55constexpr double kHashReturnOneRowCost = 0.07;
56constexpr double kMaterializeOneRowCost = 0.1;
57constexpr double kWindowOneRowCost = 0.1;
58
59/// A fallback cardinality estimate that is used in case the storage engine
60/// cannot provide one (like for table functions). It's a fairly arbitrary
61/// non-zero value.
63
64/// See EstimateFilterCost.
65struct FilterCost {
66 /// Cost of evaluating the filter for all rows if subqueries are not
67 /// materialized. (Note that this includes the contribution from
68 /// init_cost_if_not_materialized.)
70
71 /// Initial cost before the filter can be applied for the first time.
72 /// Typically the cost of executing 'independent subquery' in queries like:
73 /// "SELECT * FROM tab WHERE field = <independent subquery>".
74 /// (That corresponds to the Item_singlerow_subselect class.)
76
77 /// Cost of evaluating the filter for all rows if all subqueries in
78 /// it have been materialized beforehand. If there are no subqueries
79 /// in the condition, equals cost_if_not_materialized.
81
82 /// Cost of materializing all subqueries present in the filter.
83 /// If there are no subqueries in the condition, equals zero.
85};
86
87/// Used internally by EstimateFilterCost() only.
88void AddCost(THD *thd, const ContainedSubquery &subquery, double num_rows,
89 FilterCost *cost);
90
91/**
92 Estimate the cost of evaluating “condition”, “num_rows” times.
93 This is a fairly rudimentary estimation, _but_ it includes the cost
94 of any subqueries that may be present and that need evaluation.
95 */
96FilterCost EstimateFilterCost(THD *thd, double num_rows, Item *condition,
97 const Query_block *outer_query_block);
98
99/**
100 A cheaper overload of EstimateFilterCost() that assumes that all
101 contained subqueries have already been extracted (ie., it skips the
102 walking, which can be fairly expensive). This data is typically
103 computed by FindContainedSubqueries().
104 */
106 THD *thd, double num_rows,
107 const Mem_root_array<ContainedSubquery> &contained_subqueries) {
108 FilterCost cost;
111
112 for (const ContainedSubquery &subquery : contained_subqueries) {
113 AddCost(thd, subquery, num_rows, &cost);
114 }
115 return cost;
116}
117
118double EstimateCostForRefAccess(THD *thd, TABLE *table, unsigned key_idx,
119 double num_output_rows);
122
123/**
124 Estimate costs and result row count for an aggregate operation.
125 @param[in,out] path The AGGREGATE path.
126 @param[in] query_block The Query_block to which 'path' belongs.
127 @param[in,out] trace Optimizer trace text.
128 */
130 std::string *trace = nullptr);
133
134/// Estimate the costs and row count for a STREAM AccessPath.
136
137/// Estimate the costs and row count for a LIMIT_OFFSET AccessPath.
139
140/// Estimate the costs and row count for a WINDOW AccessPath.
142
143inline double FindOutputRowsForJoin(double left_rows, double right_rows,
144 const JoinPredicate *edge) {
145 double fanout = right_rows * edge->selectivity;
147 // For outer joins, every outer row produces at least one row (if none
148 // are matching, we get a NULL-complemented row).
149 // Note that this can cause inconsistent row counts; see bug #33550360
150 // and/or JoinHypergraph::has_reordered_left_joins.
151 fanout = std::max(fanout, 1.0);
152 } else if (edge->expr->type == RelationalExpression::SEMIJOIN) {
153 // Semi- and antijoin estimation is pretty tricky, since we want isn't
154 // really selectivity; we want the probability that at least one row
155 // is matching, which is something else entirely. However, given that
156 // we only have selectivity to work with, we don't really have anything
157 // better than to estimate it as a normal join and cap the result
158 // at selectivity 1.0 (ie., each outer row generates at most one inner row).
159 // Note that this can cause inconsistent row counts; see bug #33550360 and
160 // CostingReceiver::has_semijoin_with_possibly_clamped_child.
161 fanout = std::min(fanout, 1.0);
162 } else if (edge->expr->type == RelationalExpression::ANTIJOIN) {
163 // Antijoin are estimated as simply the opposite of semijoin (see above),
164 // but wrongly estimating 0 rows (or, of course, a negative amount) could be
165 // really bad, so we assume at least 10% coming out as a fudge factor.
166 // It's better to estimate too high than too low here.
167 fanout = std::max(1.0 - fanout, 0.1);
168 }
169 return left_rows * fanout;
170}
171
172#endif // SQL_JOIN_OPTIMIZER_COST_MODEL_H_
Base class that is used to represent any kind of expression in a relational query.
Definition: item.h:853
A typesafe replacement for DYNAMIC_ARRAY.
Definition: mem_root_array.h:426
This class represents a query block, aka a query specification, which is a query consisting of a SELE...
Definition: sql_lex.h:1156
For each client connection we create a separate thread with THD serving as a thread/connection descri...
Definition: sql_lexer_thd.h:34
constexpr double kHashReturnOneRowCost
Definition: cost_model.h:55
constexpr ha_rows kRowEstimateFallback
A fallback cardinality estimate that is used in case the storage engine cannot provide one (like for ...
Definition: cost_model.h:62
void EstimateWindowCost(AccessPath *path)
Estimate the costs and row count for a WINDOW AccessPath.
Definition: cost_model.cc:919
double FindOutputRowsForJoin(double left_rows, double right_rows, const JoinPredicate *edge)
Definition: cost_model.h:143
constexpr size_t kMaxItemLengthEstimate
When we make cost estimates, we use this as the maximal length the values we get from evaluating an I...
Definition: cost_model.h:45
void EstimateAggregateCost(AccessPath *path, const Query_block *query_block, std::string *trace=nullptr)
Estimate costs and result row count for an aggregate operation.
void AddCost(THD *thd, const ContainedSubquery &subquery, double num_rows, FilterCost *cost)
Used internally by EstimateFilterCost() only.
Definition: cost_model.cc:123
constexpr double kHashBuildOneRowCost
Definition: cost_model.h:53
void EstimateLimitOffsetCost(AccessPath *path)
Estimate the costs and row count for a LIMIT_OFFSET AccessPath.
Definition: cost_model.cc:887
constexpr double kWindowOneRowCost
Definition: cost_model.h:57
constexpr double kApplyOneFilterCost
Definition: cost_model.h:50
void EstimateDeleteRowsCost(AccessPath *path)
Definition: cost_model.cc:837
constexpr double kMaterializeOneRowCost
Definition: cost_model.h:56
void EstimateStreamCost(AccessPath *path)
Estimate the costs and row count for a STREAM AccessPath.
Definition: cost_model.cc:871
double EstimateCostForRefAccess(THD *thd, TABLE *table, unsigned key_idx, double num_output_rows)
Definition: cost_model.cc:59
void EstimateUpdateRowsCost(AccessPath *path)
Definition: cost_model.cc:854
void EstimateMaterializeCost(THD *thd, AccessPath *path)
Definition: cost_model.cc:179
constexpr double kHashProbeOneRowCost
Definition: cost_model.h:54
constexpr double kSortOneRowCost
Definition: cost_model.h:52
constexpr double kAggregateOneRowCost
Definition: cost_model.h:51
FilterCost EstimateFilterCost(THD *thd, double num_rows, Item *condition, const Query_block *outer_query_block)
Estimate the cost of evaluating “condition”, “num_rows” times.
Definition: cost_model.cc:164
void EstimateSortCost(AccessPath *path)
Definition: cost_model.cc:92
This file includes constants used by all storage engines.
my_off_t ha_rows
Definition: my_base.h:1140
static char * path
Definition: mysqldump.cc:137
Access paths are a query planning structure that correspond 1:1 to iterators, in that an access path ...
Definition: access_path.h:193
This class represents a subquery contained in some subclass of Item_subselect,.
Definition: item.h:785
See EstimateFilterCost.
Definition: cost_model.h:65
double init_cost_if_not_materialized
Initial cost before the filter can be applied for the first time.
Definition: cost_model.h:75
double cost_to_materialize
Cost of materializing all subqueries present in the filter.
Definition: cost_model.h:84
double cost_if_materialized
Cost of evaluating the filter for all rows if all subqueries in it have been materialized beforehand.
Definition: cost_model.h:80
double cost_if_not_materialized
Cost of evaluating the filter for all rows if subqueries are not materialized.
Definition: cost_model.h:69
A specification that two specific relational expressions (e.g., two tables, or a table and a join bet...
Definition: access_path.h:76
RelationalExpression * expr
Definition: access_path.h:77
double selectivity
Definition: access_path.h:78
enum RelationalExpression::Type type
@ SEMIJOIN
Definition: relational_expression.h:92
@ ANTIJOIN
Definition: relational_expression.h:93
@ LEFT_JOIN
Definition: relational_expression.h:91
Definition: table.h:1399