MySQL 9.5.0
Source Code Documentation
cost_model.h
Go to the documentation of this file.
1/* Copyright (c) 2020, 2025, Oracle and/or its affiliates.
2
3 This program is free software; you can redistribute it and/or modify
4 it under the terms of the GNU General Public License, version 2.0,
5 as published by the Free Software Foundation.
6
7 This program is designed to work with certain software (including
8 but not limited to OpenSSL) that is licensed under separate terms,
9 as designated in a particular file or component or in included license
10 documentation. The authors of MySQL hereby grant you an additional
11 permission to link the program and your derivative works with the
12 separately licensed software that they have either included with
13 the program or referenced in the documentation.
14
15 This program is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License, version 2.0, for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with this program; if not, write to the Free Software
22 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
23
#ifndef SQL_JOIN_OPTIMIZER_COST_MODEL_H_
#define SQL_JOIN_OPTIMIZER_COST_MODEL_H_

#include <algorithm>  // std::clamp
#include <span>

#include "my_base.h"
#include "my_bitmap.h"  // bitmap_bits_set
#include "sql/join_optimizer/access_path.h"
#include "sql/join_optimizer/cost_constants.h"
#include "sql/join_optimizer/find_contained_subqueries.h"
#include "sql/join_optimizer/relational_expression.h"
#include "sql/mem_root_array.h"
#include "sql/table.h"

struct AccessPath;
struct ContainedSubquery;
struct JoinPredicate;
class Item;
class Query_block;
class THD;
45/**
46 When we make cost estimates, we use this as the maximal length the
47 values we get from evaluating an Item (in bytes). Actual values of
48 e.g. blobs may be much longer, but even so we use this as an upper
49 limit when doing cost calculations. (For context, @see Item#max_length .)
50*/
51constexpr size_t kMaxItemLengthEstimate = 4096;
52
53/// A fallback cardinality estimate that is used in case the storage engine
54/// cannot provide one (like for table functions). It's a fairly arbitrary
55/// non-zero value.
57
/**
  We model the IO cost for InnoDB tables with the DYNAMIC row format. For
  other storage engines the IO cost is currently set to zero. For other
  InnoDB row formats, the model may not be a good fit.

  We only count the cost of accessing the leaf pages of indexes (clustered
  or unclustered) that are not already in the buffer pool. For tables/indexes
  that are (estimated to be) fully cached we get an IO cost of zero.
  Tables that are small relative to the buffer pool are assumed to be fully
  cached (all pages are in the buffer pool).
  The only operations for which we count IO cost are random and sequential
  page reads.

  The cost of a disk IO operation is modeled as an affine function:

  io_cost = kIOStartCost + no_of_bytes * kIOByteCost

  Note that the granularity of no_of_bytes is the storage engine page size.
  The values used here are derived from measurements in a cloud setting.
  This means that they may be wrong in another context, such as when
  running against a local SSD. Cloud measurements may also give inconsistent
  results due to heterogeneous cloud hardware, and the impact of activity
  in other cloud VMs.
  Ideally we should be able to configure these parameters, or even set them
  dynamically based on observed behavior.
*/
constexpr double kIOStartCost{937.0};

/// The additional cost of reading an extra byte from disk.
constexpr double kIOByteCost{0.0549};

/// This is the estimated fraction of an (innodb) block that is in use
/// (i.e. not free for future inserts).
constexpr double kBlockFillFactor{0.75};
92
/// See EstimateFilterCost.
struct FilterCost {
  /// Cost of evaluating the filter for all rows if subqueries are not
  /// materialized. (Note that this includes the contribution from
  /// init_cost_if_not_materialized.)
  double cost_if_not_materialized{0.0};

  /// Initial cost before the filter can be applied for the first time.
  /// Typically the cost of executing 'independent subquery' in queries like:
  /// "SELECT * FROM tab WHERE field = <independent subquery>".
  /// (That corresponds to the Item_singlerow_subselect class.)
  double init_cost_if_not_materialized{0.0};

  /// Cost of evaluating the filter for all rows if all subqueries in
  /// it have been materialized beforehand. If there are no subqueries
  /// in the condition, equals cost_if_not_materialized.
  double cost_if_materialized{0.0};

  /// Cost of materializing all subqueries present in the filter.
  /// If there are no subqueries in the condition, equals zero.
  double cost_to_materialize{0.0};
};
115
116/// Used internally by EstimateFilterCost() only.
117void AddCost(THD *thd, const ContainedSubquery &subquery, double num_rows,
118 FilterCost *cost);
119
120/**
121 Estimate the cost of evaluating “condition”, “num_rows” times.
122 This is a fairly rudimentary estimation, _but_ it includes the cost
123 of any subqueries that may be present and that need evaluation.
124 */
125FilterCost EstimateFilterCost(THD *thd, double num_rows, Item *condition,
126 const Query_block *outer_query_block);
127
128/**
129 A cheaper overload of EstimateFilterCost() that assumes that all
130 contained subqueries have already been extracted (ie., it skips the
131 walking, which can be fairly expensive). This data is typically
132 computed by FindContainedSubqueries().
133 */
135 THD *thd, double num_rows,
136 const Mem_root_array<ContainedSubquery> &contained_subqueries) {
137 FilterCost cost;
140
141 for (const ContainedSubquery &subquery : contained_subqueries) {
142 AddCost(thd, subquery, num_rows, &cost);
143 }
144 return cost;
145}
146
147/**
148 Estimate costs and output rows for a SORT AccessPath.
149 @param thd Current thread.
150 @param path the AccessPath.
151 @param distinct_rows An estimate of the number of distinct rows, if
152 remove_duplicates==true and we have an estimate already.
153*/
155 double distinct_rows = kUnknownRowCount);
156
158
159/// Array of aggregation terms.
160using TermArray = std::span<const Item *const>;
161
162/**
163 Estimate the number of rows with a distinct combination of values for
164 'terms'. @see EstimateDistinctRowsFromStatistics for additional details.
165 @param thd The current thread.
166 @param child_rows The number of input rows.
167 @param terms The terms for which we estimate the number of unique
168 combinations.
169 @returns The estimated number of output rows.
170*/
171double EstimateDistinctRows(THD *thd, double child_rows, TermArray terms);
172/**
173 Estimate costs and result row count for an aggregate operation.
174 @param[in,out] thd The current thread.
175 @param[in,out] path The AGGREGATE path.
176 @param[in] query_block The Query_block to which 'path' belongs.
177 */
179 const Query_block *query_block);
180
181/**
182 Estimate costs and result row count for a skip scan operation.
183 @param[in] table The table which is to be scanned
184 @param[in] key_idx The location of the index to be scanned
185 @param[in] num_subrange_scans Number of subrange scans to be executed
186 @param[in] records Number of rows to be scanned
187 @retval Calculated cost value
188 */
189double EstimateSkipScanCost(TABLE *table, uint key_idx, uint num_subrange_scans,
190 ha_rows records);
191
192/**
193 Estimate costs for a group skip scan operation.
194 @param[in] table The table which is to be scanned
195 @param[in] key_idx The location of the index to be scanned
196 @param[in] num_groups Number of GROUP BY groups to be scanned
197 @param[in] has_max True if the query contains MAX(), else false
198 @retval Calculated cost value
199 */
200double EstimateGroupSkipScanCost(TABLE *table, uint key_idx, uint num_groups,
201 bool has_max);
202
205
206/// Estimate the costs and row count for a STREAM AccessPath.
208
209/**
210 Estimate the costs and row count for a WINDOW AccessPath. As described in
211 @see AccessPath::m_init_cost, the cost to read k out of N rows would be
212 init_cost + (k/N) * (cost - init_cost).
213*/
215
216/// Estimate the costs and row count for a Temp table Aggregate AccessPath.
218 const Query_block *query_block);
219
220/// Estimate the costs and row count for a WINDOW AccessPath.
222
223/**
224 Estimate the fan out for a left semijoin or a left antijoin. The fan out
225 is defined as the number of result rows, divided by the number of input
226 rows from the left hand relation. For a semijoin, J1:
227
228 SELECT ... FROM t1 WHERE EXISTS (SELECT ... FROM t2 WHERE predicate)
229
230 we know that the fan out of the corresponding inner join J2:
231
232 SELECT ... FROM t1, t2 WHERE predicate
233
234 is: F(J2) = CARD(t2) * SELECTIVITY(predicate) , where CARD(t2)=right_rows,
235 and SELECTIVITY(predicate)=edge.selectivity. If 'predicate' is a
236 deterministic function of t1 and t2 rows, then J1 is equivalent to an inner
237 join J3:
238
239 SELECT ... FROM t1 JOIN (SELECT DISTINCT f1,..fn FROM t2) d ON predicate
240
241 where f1,..fn are those fields from t2 that appear in the predicate.
242
243 Then F(J1) = F(J3) = F(J2) * CARD(d) / CARD(t2)
244 = CARD(d) * SELECTIVITY(predicate).
245
246 This function therefore collects f1..fn and estimates CARD(d). As a special
247 case, 'predicate' may be independent of t2. The query is then equivalent to:
248
249 SELECT ... FROM t1 WHERE predicate AND (SELECT COUNT(*) FROM t2) > 0
250
251 The fan out is then the selectivity of 'predicate' multiplied by the
252 probability of t2 having at least one row.
253
254 @param thd The current thread.
255 @param right_rows The number of input rows from the right hand relation.
256 @param edge Join edge.
257 @returns fan out.
258 */
259double EstimateSemijoinFanOut(THD *thd, double right_rows,
260 const JoinPredicate &edge);
261
262/**
263 Estimate the number of output rows from joining two relations.
264 @param thd The current thread.
265 @param left_rows Number of rows in the left hand relation.
266 @param right_rows Number of rows in the right hand relation.
267 @param edge The join between the two relations.
268*/
269inline double FindOutputRowsForJoin(THD *thd, double left_rows,
270 double right_rows,
271 const JoinPredicate *edge) {
272 switch (edge->expr->type) {
274 // For outer joins, every outer row produces at least one row (if none
275 // are matching, we get a NULL-complemented row).
276 // Note that this can cause inconsistent row counts; see bug #33550360
277 // and/or JoinHypergraph::has_reordered_left_joins.
278 return left_rows * std::max(right_rows * edge->selectivity, 1.0);
279
281 return left_rows * EstimateSemijoinFanOut(thd, right_rows, *edge);
282
284 // Antijoin are estimated as simply the opposite of semijoin (see above),
285 // but wrongly estimating 0 rows (or, of course, a negative amount) could
286 // be really bad, so we assume at least 10% coming out as a fudge factor.
287 // It's better to estimate too high than too low here.
288 return left_rows *
289 std::max(1.0 - EstimateSemijoinFanOut(thd, right_rows, *edge),
290 0.1);
291
294 return left_rows * right_rows * edge->selectivity;
295
296 case RelationalExpression::FULL_OUTER_JOIN: // Not implemented.
297 case RelationalExpression::MULTI_INNER_JOIN: // Should not appear here.
298 case RelationalExpression::TABLE: // Should not appear here.
299 assert(false);
300 return 0;
301
302 default:
303 assert(false);
304 return 0;
305 }
306}
307
308/**
309 Determines whether a given key on a table is both clustered and primary.
310
311 @param table The table to which the index belongs.
312 @param key_idx The position of the key in table->key_info[].
313
314 @return True if the key is clustered and primary, false otherwise.
315*/
316inline bool IsClusteredPrimaryKey(const TABLE *table, unsigned key_idx) {
317 if (table->s->is_missing_primary_key()) return false;
318 return key_idx == table->s->primary_key &&
319 table->file->primary_key_is_clustered();
320}
321
/// The minimum number of bytes to return for row length estimates. This is
/// mostly to guard against returning estimates of zero, which may or may not
/// actually be able to happen in practice.
constexpr unsigned kMinEstimatedBytesPerRow = 8;

/// The maximum number of bytes to return for row length estimates. The current
/// value of this constant is set to cover a wide range of row sizes and should
/// capture the most common row lengths in bytes. We place an upper limit on the
/// estimates since we have performed our calibration within this range, and we
/// would like to prevent cost estimates from running away in case the
/// underlying statistics are off in some instances. In such cases we prefer
/// capping the resulting estimate. As a reasonable upper limit we use half the
/// default InnoDB page size of 2^14 = 16384 bytes.
constexpr unsigned kMaxEstimatedBytesPerRow = 8 * 1024;
336
/**
  This struct represents the number of bytes we expect to read for a table row.
  Note that the split between b-tree and overflow pages is specific to InnoDB
  and may not be a good fit for other storage engines. Ideally we should
  calculate row size and IO-cost in the handler, in a way that is specific to
  each particular storage engine.
  When using the DYNAMIC row format, an InnoDB B-tree record cannot be bigger
  than about half a page. (The default page size is 16KB). Above that, the
  longest fields are stored in separate overflow pages.
*/
struct BytesPerTableRow {
  /**
    The number of bytes read from the B-tree record. This also includes those
    fields that were not in the projection.
  */
  int64_t record_bytes;

  /**
    The number of bytes read from overflow pages. This is the combined size
    of those long variable-sized fields that are in the projection but stored
    in overflow pages.
  */
  int64_t overflow_bytes;

  /*
    The probability of reading from an overflow page (i.e. an estimate
    of the probability that at least one of the columns in the
    projection overflows).
  */
  double overflow_probability;
};
368
369/**
370 Get an estimate of the row size of the read set of 'table'.
371*/
372int64_t GetReadSetWidth(const TABLE *table);
373
374/**
375 Estimate the average number of bytes that we need to read from the
376 storage engine when reading a row from 'table'. This is the size of
377 the (b-tree) record and the overflow pages of any field that is
378 part of the projection. This is similar to what EstimateBytesPerRowTable()
379 does, but this function is intended to do a more accurate but also more
380 expensive calculation for tables with potentially large rows (i.e. tables
381 with BLOBs or large VARCHAR fields).
382
383 Note that this function tailored for InnoDB (and also for the
384 DYNAMIC row format of InnoDB). At some point we may want to move
385 this logic to the handler, so that we can customize it for other
386 engines as well.
387
388 @param table The target table
389 @returns The estimated row size.
390*/
392
393/// We clamp the block size to lie in the interval between the max and min
394/// allowed block size for InnoDB (2^12 to 2^16). Ideally we would have a
395/// guarantee that stats.block_size has a reasonable value (across all storage
396/// engines, types of tables, state of statistics), but in the absence of such
397/// a guarantee we clamp to the values for the InnoDB storage engine since the
398/// cost model has been calibrated for these values.
399inline unsigned ClampedBlockSize(const TABLE *table) {
400 constexpr unsigned kMinEstimatedBlockSize = 4096;
401 constexpr unsigned kMaxEstimatedBlockSize = 65536;
402 return std::clamp(table->file->stats.block_size, kMinEstimatedBlockSize,
403 kMaxEstimatedBlockSize);
404}
405
406/**
407 Estimates the number of bytes that MySQL must process when reading a row from
408 a table, independently of the size of the read set.
409
410 @param table The table to produce an estimate for.
411
412 @returns The estimated row size.
413
414 @note There are two different relevant concepts of bytes per row:
415
416 1. The bytes per row on the storage engine side.
417 2. The bytes per row on the server side.
418
419 One the storage engine side, for InnoDB at least, we compute
420 stats.mean_rec_length as the size of the data file divided by the number of
421 rows in the table.
422
423 On the server side we are interested in the size of the MySQL representation
424 in bytes. This could be a more accurate statistic when determining the CPU
425 cost of processing a row (i.e., it does not matter very much if InnoDB pages
426 are only half-full). As an approximation to the size of a row in bytes on the
427 server side we use the length of the record buffer for rows that should
428 not exceed the maximal size of an InnoDB B-tree record. (Otherwise, we call
429 EstimateBytesPerRowWideTable() to make the estimate).
430
431 Note that when the table fits in a single page stats.mean_rec_length
432 will tend to overestimate the record length since it is computed as
433 stats.data_file_length / stats.records and the data file length is
434 at least a full page which defaults to 16384 bytes (for InnoDB at
435 least). We may then get a better estimate from table->s->rec_buff_length.
436*/
438 int64_t max_bytes{0};
439
440 for (uint i = 0; i < table->s->fields; i++) {
441 // max_data_length() is the maximal size (in bytes) of this field.
442 max_bytes += table->field[i]->max_data_length();
443 }
444
445 if (max_bytes < ClampedBlockSize(table) / 2) {
446 // The row should fit in a b-tree record.
447 return {.record_bytes =
448 std::clamp(table->s->rec_buff_length, kMinEstimatedBytesPerRow,
450 .overflow_bytes = 0,
451 .overflow_probability = 0.0};
452 }
453
454 // Make a more sophisticated estimate for tables that may have very
455 // large rows.
457}
458
459/**
460 Estimates the number of bytes that MySQL must process when reading a row from
461 a secondary index, independently of the size of the read set.
462
463 @param table The table to which the index belongs.
464 @param key_idx The position of the key in table->key_info[].
465
466 @return The estimated number of bytes per row in the index.
467*/
468inline unsigned EstimateBytesPerRowIndex(const TABLE *table, unsigned key_idx) {
469 // key_length should correspond to the length of the field(s) of the key in
470 // bytes and ref_length is the length of the field(s) of the primary key in
471 // bytes. Secondary indexes (in InnoDB) contain a copy of the value of the
472 // primary key associated with a given row, in order to make it possible to
473 // retrieve the corresponding row from the primary index in case we use a
474 // non-covering index operation.
475 unsigned estimate =
476 table->key_info[key_idx].key_length + table->file->ref_length;
477 return std::clamp(estimate, kMinEstimatedBytesPerRow,
479}
480
481/**
482 Estimates the height of a B-tree index.
483
484 We estimate the height of the index to be the smallest positive integer h such
485 that table_records <= (1 + records_per_page)^h.
486
487 This function supports both clustered primary indexes and secondary indexes.
488 Secondary indexes will tend to have more records per page compared to primary
489 clustered indexes and as a consequence they will tend to be shorter.
490
491 @param table The table to which the index belongs.
492 @param key_idx The position of the key in table->key_info[].
493
494 @return The estimated height of the index.
495*/
496inline int IndexHeight(const TABLE *table, unsigned key_idx) {
497 unsigned block_size = ClampedBlockSize(table);
498 unsigned bytes_per_row = IsClusteredPrimaryKey(table, key_idx)
499 ? table->bytes_per_row()->record_bytes
501
502 // Ideally we should always have that block_size >= bytes_per_row, but since
503 // the storage engine and MySQL row formats differ, this is not always the
504 // case. Therefore we manually ensure that records_per_page >= 1.0.
505 double records_per_page =
506 std::max(1.0, static_cast<double>(block_size) / bytes_per_row);
507
508 // Computing the height using a while loop instead of using std::log turns out
509 // to be about 5 times faster in microbenchmarks when the measurement is made
510 // using a somewhat realistic and representative set of values for the number
511 // of records per page and the number of records in the table. In the worst
512 // case, if the B-tree contains only a single record per page, the table would
513 // have to contain 2^30 pages (corresponding to more than 16 terabytes of
514 // data) for this loop to run 30 times. A B-tree with 1 billion records and
515 // 100 records per page only uses 4 iterations of the loop (the height is 5).
516 int height = 1;
517 double r = 1.0 + records_per_page;
518 while (r < table->file->stats.records) {
519 r = r * (1.0 + records_per_page);
520 height += 1;
521 }
522 return height;
523}
524
525/// Calculate the IO-cost of reading 'num_rows' rows from 'table'.
526double TableAccessIOCost(const TABLE *table, double num_rows,
527 BytesPerTableRow row_size);
528
529/// Calculate the IO-cost of doing a lookup on index 'key_idx' on 'table'
530/// and then read 'num_rows' rows.
531double CoveringIndexAccessIOCost(const TABLE *table, unsigned key_idx,
532 double num_rows);
533
534/**
535 Computes the expected cost of reading a number of rows. The cost model takes
536 into account the number of fields that is being read from the row and the
537 width of the row in bytes. Both RowReadCostTable() and RowReadCostIndex() call
538 this function and thus use the same cost model.
539
540 @param num_rows The (expected) number of rows to read.
541 @param fields_read_per_row The number of fields to read per row.
542 @param bytes_per_row The total length of the row to be processed (including
543 fields that are not read) in bytes.
544
545 @returns The expected cost of reading num_rows.
546
547 @note It is important that this function be robust to fractional row
548 estimates. For example, if we index nested loop join two primary key columns
549 and the inner table is smaller than the outer table, we should see that
550 num_rows for the inner index lookup is less than one. In this case it is
551 important that we return the expected cost of the operation. For example, if
552 we only expect to read 0.1 rows the cost should be 0.1 of the cost of reading
553 one row (we are working with a linear cost model, so we only have to take the
554 expected number of rows into account, and not the complete distribution).
555*/
556inline double RowReadCost(double num_rows, double fields_read_per_row,
557 double bytes_per_row) {
558 return (kReadOneRowCost + kReadOneFieldCost * fields_read_per_row +
559 kReadOneByteCost * bytes_per_row) *
560 num_rows;
561}
562
563/**
564 Computes the cost of reading a number of rows from a table.
565 @see ReadRowCost() for further details.
566
567 @param table The table to read from.
568 @param num_rows The (expected) number of rows to read.
569
570 @returns The cost of reading num_rows.
571*/
572inline double RowReadCostTable(const TABLE *table, double num_rows) {
573 double fields_read_per_row = bitmap_bits_set(table->read_set);
574 const BytesPerTableRow &bytes_per_row = *table->bytes_per_row();
575 return RowReadCost(
576 num_rows, fields_read_per_row,
577 bytes_per_row.record_bytes + bytes_per_row.overflow_bytes) +
578 TableAccessIOCost(table, num_rows, bytes_per_row);
579}
580
581/**
582 Computes the cost of reading a number of rows from an index.
583 @see ReadRowCost() for further details.
584
585 @param table The table to which the index belongs.
586 @param key_idx The position of the key in table->key_info[].
587 @param num_rows The (expected) number of rows to read.
588
589 @returns The cost of reading num_rows.
590*/
591inline double RowReadCostIndex(const TABLE *table, unsigned key_idx,
592 double num_rows) {
593 if (IsClusteredPrimaryKey(table, key_idx)) {
594 return RowReadCostTable(table, num_rows);
595 }
596 // Assume we read two fields from the index record if it is not covering. The
597 // exact assumption here is not very important as the cost should be dominated
598 // by the additional lookup into the primary index.
599 constexpr double kDefaultFieldsReadFromCoveringIndex = 2;
600 double fields_read_per_row = table->covering_keys.is_set(key_idx)
601 ? bitmap_bits_set(table->read_set)
602 : kDefaultFieldsReadFromCoveringIndex;
603
604 double bytes_per_row = EstimateBytesPerRowIndex(table, key_idx);
605 return RowReadCost(num_rows, fields_read_per_row, bytes_per_row) +
606 CoveringIndexAccessIOCost(table, key_idx, num_rows);
607}
608
609/**
610 Estimates the cost of a full table scan. Primarily used to assign a cost to
611 the TABLE_SCAN access path.
612
613 @param table The table to estimate cost for.
614
615 @returns The cost of scanning the table.
616*/
617inline double EstimateTableScanCost(const TABLE *table) {
618 return RowReadCostTable(table, table->file->stats.records);
619}
620
621/**
622 Estimates the cost of an index lookup.
623
624 @param table The table to which the index belongs.
625 @param key_idx The position of the key in table->key_info[].
626
627 @return The estimated cost of an index lookup.
628
629 @note The model "cost ~ index_height" works well when the Adaptive Hash Index
630 (AHI) is disabled. The AHI essentially works as a dynamic cache for the most
631 frequently accessed index pages that sits on top of the B-tree. With AHI
632 enabled the cost of random lookups does not appear to be predictable using
633 standard explanatory variables such as index height or the logarithm of the
634 number of rows in the index. The performance of AHI will also be dependent on
635 the access pattern, so it is fundamentally difficult to create an accurate
636 model. However, our calibration experiments reveal two things that hold true
637 both with and without AHI enabled:
638
639 1. Index lookups usually take 1-3 microseconds. The height of a B-tree grows
640 very slowly (proportional to log(N)/log(R) for tables with N rows and R
641 records per page), making it near-constant in the common case where tables
642 have many records per page, even without AHI.
643
644 2. Random lookups in large indexes tend to be slower. A random access pattern
645 will cause more cache misses, both for regular hardware caching and AHI. In
646 addition, for a larger B-tree only the top levels fit in cache and we will
647 invariably see a few cache misses with random accesses.
648
649 We model the cost of index lookups by interpolating between a model with
650 constant cost and a model that depends entirely on the height of the index.
651 The constants are based on calibration experiments with and without AHI.
652
653 Another factor that is important when calibrating the cost of index lookups is
654 whether we are interested in the average cost when performing many lookups
655 such as when performing an index nested loop join or scanning along a
656 secondary non-covering index and lookup into the primary index, or the cost of
657 a single lookup such as a point select that uses an index. From our
658 measurements we see that the average running time of an index lookup can
659 easily be a factor ~5x faster when performing thousands of successive lookups
660 compared to just one. This is likely due to hardware caching effects. Since
661 getting the cost right in the case when we perform many lookups is typically
662 more important, we have opted to calibrate costs based on operations that
663 perform many lookups.
664
665 For adding IO costs to this model (future work) we probably want to assume
666 that we only fetch a single page when performing an index lookup, as
667 everything but leaf pages will tend to be cached, at least when performing
668 many index lookups in a query plan, which is exactly the case where it is
669 important to get costs right.
670*/
671inline double IndexLookupCost(const TABLE *table, unsigned key_idx) {
672 assert(key_idx < table->s->keys);
673 double cost_with_ahi = kIndexLookupFixedCost;
674 double cost_without_ahi = kIndexLookupPageCost * IndexHeight(table, key_idx);
675 return 0.5 * (cost_with_ahi + cost_without_ahi);
676}
677
/// Type of range scan operation.
enum class RangeScanType : char {
  /// Using the MRR optimization.
  kMultiRange,

  /// Plain range scan, without the MRR optimization.
  kSingleRange
};
686
687/**
688 Estimates the cost of an index range scan.
689
690 The cost model for index range scans accounts for the index lookup cost as
691 well as the cost of reading rows. Both index scans and ref accesses can be
692 viewed as special cases of index range scans, so the cost functions for those
693 operations call this function under the hood.
694
695 @note In the future this function should be extended to account for IO cost.
696
697 @param table The table to which the index belongs.
698 @param key_idx The position of the key in table->key_info[].
699 @param scan_type Whether this an MRR or a regular index range scan.
700 @param num_ranges The number of ranges.
701 @param num_output_rows The estimated expected number of output rows.
702
703 @returns The estimated cost of the index range scan operation.
704*/
705double EstimateIndexRangeScanCost(const TABLE *table, unsigned key_idx,
706 RangeScanType scan_type, double num_ranges,
707 double num_output_rows);
708
709/**
710 Estimates the cost of an index scan. An index scan scans all rows in the
711 table along the supplied index.
712
713 @param table The table to which the index belongs.
714 @param key_idx The position of the key in table->key_info[].
715
716 @returns The estimated cost of the index scan.
717*/
718inline double EstimateIndexScanCost(const TABLE *table, unsigned key_idx) {
720 1.0, table->file->stats.records);
721}
722
723/**
724 Estimates the cost of an index lookup (ref access).
725
726 @param table The table to which the index belongs.
727 @param key_idx The position of the key in table->key_info[].
728 @param num_output_rows The estimated number of output rows.
729
730 @returns The estimated cost of the index scan.
731*/
732inline double EstimateRefAccessCost(const TABLE *table, unsigned key_idx,
733 double num_output_rows) {
734 // We want the optimizer to prefer ref acccesses to range scans when they both
735 // have the same cost. This is particularly important for the EQ_REF access
736 // path (index lookups with at most one matching row) since the EQ_REF
737 // iterator uses caching to improve performance.
738 constexpr double kRefAccessCostDiscount = 0.05;
739 return (1.0 - kRefAccessCostDiscount) *
741 1.0, num_output_rows);
742}
743
/**
  Input to HashJoinCost, for calculating the cost of a hash join.
*/
struct HashJoinMetrics final {
  // NOTE(review): all member names except 'key_size' were lost in the
  // documentation extract; the names below are reconstructed from the
  // comments — confirm against the upstream header.
  /// The number of rows in the 'build' input.
  double build_rows;
  /// The average size of rows in the 'build' input (in bytes).
  double build_row_size;
  /// The size of the join key, in bytes.
  double key_size;
  /// The number of rows in the 'probe' input.
  double probe_rows;
  /// The average size of rows in the 'probe' input (in bytes).
  double probe_row_size;
  /// The number of rows in the result set.
  double result_rows;
};
761
762/** This class represents the cost of a hash join (excluding the cost
763 of sub-paths).
764*/
765class HashJoinCost final {
766 public:
768
771 }
772
773 double init_cost() const { return m_init_cost; }
774
775 double cost() const { return m_cost; }
776
777 private:
778 /// The probability (range [0.0, 1.0]) of needing spill to disk.
780 /// The cost of preparing to produce the first result row.
782 /// The cost of the hash join.
783 double m_cost;
784};
785
786#endif // SQL_JOIN_OPTIMIZER_COST_MODEL_H_
constexpr double kUnknownRowCount
To indicate that a row estimate is not yet made.
Definition: access_path.h:196
This class represents the cost of a hash join (excluding the cost of sub-paths).
Definition: cost_model.h:765
double m_init_cost
The cost of preparing to produce the first result row.
Definition: cost_model.h:781
double spill_to_disk_probability() const
Definition: cost_model.h:769
double init_cost() const
Definition: cost_model.h:773
double m_spill_to_disk_probability
The probability (range [0.0, 1.0]) of needing spill to disk.
Definition: cost_model.h:779
HashJoinCost(THD *thd, const HashJoinMetrics &metrics)
Definition: cost_model.cc:1804
double cost() const
Definition: cost_model.h:775
double m_cost
The cost of the hash join.
Definition: cost_model.h:783
Base class that is used to represent any kind of expression in a relational query.
Definition: item.h:928
A typesafe replacement for DYNAMIC_ARRAY.
Definition: mem_root_array.h:432
This class represents a query block, aka a query specification, which is a query consisting of a SELE...
Definition: sql_lex.h:1179
For each client connection we create a separate thread with THD serving as a thread/connection descri...
Definition: sql_lexer_thd.h:36
Hypergraph optimizer cost constants.
constexpr double kReadOneFieldCost
Cost per field in the read set.
Definition: cost_constants.h:85
constexpr double kApplyOneFilterCost
Cost of evaluating one filter on one row.
Definition: cost_constants.h:117
constexpr double kReadOneRowCost
Fixed cost of reading a row from the storage engine into the record buffer.
Definition: cost_constants.h:81
constexpr double kIndexLookupPageCost
The cost per page that is visited when performing an index lookup in an InnoDB B-tree.
Definition: cost_constants.h:129
constexpr double kIndexLookupFixedCost
Fixed cost of an index lookup when AHI is enabled (default).
Definition: cost_constants.h:132
constexpr double kReadOneByteCost
Overhead per byte when reading a row.
Definition: cost_constants.h:95
double RowReadCost(double num_rows, double fields_read_per_row, double bytes_per_row)
Computes the expected cost of reading a number of rows.
Definition: cost_model.h:556
constexpr ha_rows kRowEstimateFallback
A fallback cardinality estimate that is used in case the storage engine cannot provide one (like for ...
Definition: cost_model.h:56
double EstimateRefAccessCost(const TABLE *table, unsigned key_idx, double num_output_rows)
Estimates the cost of an index lookup (ref access).
Definition: cost_model.h:732
double TableAccessIOCost(const TABLE *table, double num_rows, BytesPerTableRow row_size)
Calculate the IO-cost of reading 'num_rows' rows from 'table'.
Definition: cost_model.cc:471
void EstimateSortCost(THD *thd, AccessPath *path, double distinct_rows=kUnknownRowCount)
Estimate costs and output rows for a SORT AccessPath.
Definition: cost_model.cc:658
void EstimateTemptableAggregateCost(THD *thd, AccessPath *path, const Query_block *query_block)
Estimate the costs and row count for a Temp table Aggregate AccessPath.
Definition: cost_model.cc:1710
RangeScanType
Type of range scan operation.
Definition: cost_model.h:679
@ kSingleRange
Plain range scan, without the MRR optimization.
@ kMultiRange
Using the MRR optimization.
BytesPerTableRow EstimateBytesPerRowTable(const TABLE *table)
Estimates the number of bytes that MySQL must process when reading a row from a table,...
Definition: cost_model.h:437
void EstimateWindowCost(AccessPath *path)
Estimate the costs and row count for a WINDOW AccessPath.
Definition: cost_model.cc:1754
bool IsClusteredPrimaryKey(const TABLE *table, unsigned key_idx)
Determines whether a given key on a table is both clustered and primary.
Definition: cost_model.h:316
double CoveringIndexAccessIOCost(const TABLE *table, unsigned key_idx, double num_rows)
Calculate the IO-cost of doing a lookup on index 'key_idx' on 'table' and then read 'num_rows' rows.
Definition: cost_model.cc:526
double EstimateGroupSkipScanCost(TABLE *table, uint key_idx, uint num_groups, bool has_max)
Estimate costs for a group skip scan operation.
Definition: cost_model.cc:1612
double EstimateTableScanCost(const TABLE *table)
Estimates the cost of a full table scan.
Definition: cost_model.h:617
constexpr size_t kMaxItemLengthEstimate
When we make cost estimates, we use this as the maximal length the values we get from evaluating an I...
Definition: cost_model.h:51
void AddCost(THD *thd, const ContainedSubquery &subquery, double num_rows, FilterCost *cost)
Used internally by EstimateFilterCost() only.
Definition: cost_model.cc:715
double EstimateIndexScanCost(const TABLE *table, unsigned key_idx)
Estimates the cost of an index scan.
Definition: cost_model.h:718
void EstimateLimitOffsetCost(AccessPath *path)
Estimate the costs and row count for a LIMIT_OFFSET AccessPath.
Definition: cost_model.cc:1677
int64_t GetReadSetWidth(const TABLE *table)
Get an estimate of the row size of the read set of 'table'.
Definition: cost_model.cc:395
double RowReadCostIndex(const TABLE *table, unsigned key_idx, double num_rows)
Computes the cost of reading a number of rows from an index.
Definition: cost_model.h:591
std::span< const Item *const > TermArray
Array of aggregation terms.
Definition: cost_model.h:160
double EstimateSkipScanCost(TABLE *table, uint key_idx, uint num_subrange_scans, ha_rows records)
Estimate costs and result row count for a skip scan operation.
Definition: cost_model.cc:1603
void EstimateDeleteRowsCost(AccessPath *path)
Definition: cost_model.cc:1620
constexpr unsigned kMinEstimatedBytesPerRow
The minimum number of bytes to return for row length estimates.
Definition: cost_model.h:325
int IndexHeight(const TABLE *table, unsigned key_idx)
Estimates the height of a B-tree index.
Definition: cost_model.h:496
double EstimateIndexRangeScanCost(const TABLE *table, unsigned key_idx, RangeScanType scan_type, double num_ranges, double num_output_rows)
Estimates the cost of an index range scan.
Definition: cost_model.cc:571
double FindOutputRowsForJoin(THD *thd, double left_rows, double right_rows, const JoinPredicate *edge)
Estimate the number of output rows from joining two relations.
Definition: cost_model.h:269
void EstimateUpdateRowsCost(AccessPath *path)
Definition: cost_model.cc:1637
constexpr double kBlockFillFactor
This is the estimated fraction of an (innodb) block that is in use (i.e.
Definition: cost_model.h:91
unsigned ClampedBlockSize(const TABLE *table)
We clamp the block size to lie in the interval between the max and min allowed block size for InnoDB ...
Definition: cost_model.h:399
constexpr double kIOStartCost
We model the IO cost for InnoDB tables with the DYNAMIC row format.
Definition: cost_model.h:84
double EstimateDistinctRows(THD *thd, double child_rows, TermArray terms)
Estimate the number of rows with a distinct combination of values for 'terms'.
Definition: cost_model.cc:1552
void EstimateMaterializeCost(THD *thd, AccessPath *path)
Provide row estimates and costs for a MATERIALIZE AccessPath.
Definition: cost_model.cc:880
double IndexLookupCost(const TABLE *table, unsigned key_idx)
Estimates the cost of an index lookup.
Definition: cost_model.h:671
void EstimateStreamCost(THD *thd, AccessPath *path)
Estimate the costs and row count for a STREAM AccessPath.
Definition: cost_model.cc:1654
void EstimateAggregateCost(THD *thd, AccessPath *path, const Query_block *query_block)
Estimate costs and result row count for an aggregate operation.
Definition: cost_model.cc:1581
unsigned EstimateBytesPerRowIndex(const TABLE *table, unsigned key_idx)
Estimates the number of bytes that MySQL must process when reading a row from a secondary index,...
Definition: cost_model.h:468
BytesPerTableRow EstimateBytesPerRowWideTable(const TABLE *table)
Estimate the average number of bytes that we need to read from the storage engine when reading a row ...
Definition: cost_model.cc:408
FilterCost EstimateFilterCost(THD *thd, double num_rows, Item *condition, const Query_block *outer_query_block)
Estimate the cost of evaluating “condition”, “num_rows” times.
Definition: cost_model.cc:756
constexpr unsigned kMaxEstimatedBytesPerRow
The maximum number of bytes to return for row length estimates.
Definition: cost_model.h:335
double EstimateSemijoinFanOut(THD *thd, double right_rows, const JoinPredicate &edge)
Estimate the fan out for a left semijoin or a left antijoin.
Definition: cost_model.cc:1764
double RowReadCostTable(const TABLE *table, double num_rows)
Computes the cost of reading a number of rows from a table.
Definition: cost_model.h:572
constexpr double kIOByteCost
The additional cost of reading an extra byte from disk.
Definition: cost_model.h:87
This file includes constants used by all storage engines.
my_off_t ha_rows
Definition: my_base.h:1217
uint bitmap_bits_set(const MY_BITMAP *map)
Definition: my_bitmap.cc:416
static char * path
Definition: mysqldump.cc:150
static PFS_engine_table_share_proxy table
Definition: pfs.cc:61
Definition: os0file.h:89
ValueType max(X &&first)
Definition: gtid.h:103
static PSI_metric_info_v1 metrics[]
Definition: plugin.cc:83
T clamp(U x)
Definition: ut0ut.h:412
const mysql_service_registry_t * r
Definition: pfs_example_plugin_employee.cc:86
Access paths are a query planning structure that correspond 1:1 to iterators, in that an access path ...
Definition: access_path.h:238
This struct represents the number of bytes we expect to read for a table row.
Definition: cost_model.h:347
int64_t overflow_bytes
The number of bytes read from overflow pages.
Definition: cost_model.h:359
double overflow_probability
Definition: cost_model.h:366
int64_t record_bytes
The number of bytes read from the B-tree record.
Definition: cost_model.h:352
This class represents a subquery contained in some subclass of Item_subselect,.
Definition: item.h:860
See EstimateFilterCost.
Definition: cost_model.h:94
double init_cost_if_not_materialized
Initial cost before the filter can be applied for the first time.
Definition: cost_model.h:104
double cost_to_materialize
Cost of materializing all subqueries present in the filter.
Definition: cost_model.h:113
double cost_if_materialized
Cost of evaluating the filter for all rows if all subqueries in it have been materialized beforehand.
Definition: cost_model.h:109
double cost_if_not_materialized
Cost of evaluating the filter for all rows if subqueries are not materialized.
Definition: cost_model.h:98
Input to HashJoinCost, for calculating the cost of a hash join.
Definition: cost_model.h:747
double probe_row_size
The average size of rows in the 'probe' input (in bytes).
Definition: cost_model.h:757
double build_rows
The number of rows in the 'build' input.
Definition: cost_model.h:749
double key_size
The size of the join key, in bytes.
Definition: cost_model.h:753
double build_row_size
The average size of rows in the 'build' input (in bytes).
Definition: cost_model.h:751
double probe_rows
The number of rows in the 'probe' input.
Definition: cost_model.h:755
double result_rows
The number of rows in the result set.
Definition: cost_model.h:759
A specification that two specific relational expressions (e.g., two tables, or a table and a join bet...
Definition: access_path.h:80
RelationalExpression * expr
Definition: access_path.h:81
double selectivity
Definition: access_path.h:82
enum RelationalExpression::Type type
@ SEMIJOIN
Left semijoin.
Definition: relational_expression.h:161
@ MULTI_INNER_JOIN
Definition: relational_expression.h:183
@ STRAIGHT_INNER_JOIN
Definition: relational_expression.h:169
@ ANTIJOIN
Left antijoin.
Definition: relational_expression.h:164
@ INNER_JOIN
Definition: relational_expression.h:157
@ FULL_OUTER_JOIN
Definition: relational_expression.h:176
@ TABLE
Definition: relational_expression.h:185
@ LEFT_JOIN
Definition: relational_expression.h:158
Definition: table.h:1435