#include "sql/join_optimizer/cost_model.h"
#include <math.h>
#include <stdio.h>
#include <algorithm>
#include <bit>
#include <cmath>
#include <iterator>
#include "mem_root_deque.h"
#include "my_base.h"
#include "my_bitmap.h"
#include "sql/handler.h"
#include "sql/histograms/histogram.h"
#include "sql/item_func.h"
#include "sql/item_subselect.h"
#include "sql/join_optimizer/access_path.h"
#include "sql/join_optimizer/bit_utils.h"
#include "sql/join_optimizer/find_contained_subqueries.h"
#include "sql/join_optimizer/join_optimizer.h"
#include "sql/join_optimizer/materialize_path_parameters.h"
#include "sql/join_optimizer/optimizer_trace.h"
#include "sql/join_optimizer/overflow_bitset.h"
#include "sql/join_optimizer/print_utils.h"
#include "sql/join_optimizer/relational_expression.h"
#include "sql/join_optimizer/secondary_statistics.h"
#include "sql/mem_root_array.h"
#include "sql/mysqld.h"
#include "sql/opt_costmodel.h"
#include "sql/opt_trace.h"
#include "sql/sql_class.h"
#include "sql/sql_const.h"
#include "sql/sql_lex.h"
#include "sql/sql_optimizer.h"
#include "sql/sql_planner.h"
#include "sql/system_variables.h"
#include "sql/table.h"
#include "template_utils.h"

Classes
class	anonymous_namespace{cost_model.cc}::AggregateRowEstimator
	This class finds disjoint sets of aggregation terms that form prefixes of some non-hash index, and makes row estimates for those sets based on index metadata. More...

struct	anonymous_namespace{cost_model.cc}::AggregateRowEstimator::Prefix
	A prefix of some key where each key_part corresponds to an aggregation term. More...

Namespaces
namespace	anonymous_namespace{cost_model.cc}

Typedefs
using	anonymous_namespace{cost_model.cc}::TermArray = Bounds_checked_array< const Item *const >
	Array of aggregation terms. More...

Functions
static double	AggregateCost (THD *thd, double output_rows, double input_rows, int agg_count, int group_by_field_count)
	Calculate the estimated cost of Streaming Aggregation, i.e. More...

static double	MaterializationWithDedupCost (bool use_old_model, double output_rows, double input_rows, int field_count)
	Calculate the estimated initialization cost of a MATERIALIZE Accesspath that involves deduplication. More...

static double	MaterializationCost (bool use_old_model, double output_rows, int field_count)
	Calculate the estimated cost of a MATERIALIZE Accesspath that does not involve deduplication. More...

static double	TempTableScanCost (THD thd, AccessPath table_path, double output_rows)
	Calculate the estimated cost of a Table scan Accesspath for a temporary table created for materialization. More...

static double	StreamCost (THD *thd, double output_rows, int field_count)
	Calculate the estimated cost of a STREAM Accesspath. More...

static double	AddInnodbEngineCostOverhead (double temptable_engine_cost, double temp_table_size, double output_rows, const mem_root_deque< Item * > &join_fields)
	Add InnoDB engine cost overhead into the in-memory table cost if the estimated temp table size exceeds tmp_table_size. More...

static double	TempTableAggregationCost (THD thd, double output_rows, double input_rows, int agg_count, int group_by_fields, const mem_root_deque< Item > &join_fields)
	Calculate the estimated initialization cost of a TEMPTABLE_AGGREGATE Accesspath This involves the cost for deduplicating input rows, inserting them into the temp table, and processing the aggregation functions. More...

BytesPerTableRow	EstimateBytesPerRowWideTable (const TABLE *table, int64_t max_row_size)
	Estimate the average number of bytes that we need to read from the storage engine when reading a row from 'table'. More...

static double	LowerCacheHitRatio (const handler *file, double file_size)
	Estimate a lower limit for the cache hit ratio when reading from an index (or base table), based on the size of the index relative to that of the buffer pool. More...

double	TableAccessIOCost (const TABLE *table, double num_rows, BytesPerTableRow row_size)
	Calculate the IO-cost of reading 'num_rows' rows from 'table'. More...

double	CoveringIndexAccessIOCost (const TABLE *table, unsigned key_idx, double num_rows)
	Calculate the IO-cost of doing a lookup on index 'key_idx' on 'table' and then read 'num_rows' rows. More...

double	EstimateIndexRangeScanCost (const TABLE *table, unsigned key_idx, RangeScanType scan_type, double num_ranges, double num_output_rows)
	Estimates the cost of an index range scan. More...

double	anonymous_namespace{cost_model.cc}::EstimateAggregateRows (THD thd, const AccessPath child, const Query_block *query_block, bool rollup)
	Estimate the row count for an aggregate operation (including ROLLUP rows for GROUP BY ... WITH ROLLUP). More...

void	EstimateSortCost (THD thd, AccessPath path, double distinct_rows)
	Estimate costs and output rows for a SORT AccessPath. More...

void	AddCost (THD thd, const ContainedSubquery &subquery, double num_rows, FilterCost cost)
	Used internally by EstimateFilterCost() only. More...

FilterCost	EstimateFilterCost (THD thd, double num_rows, Item condition, const Query_block *outer_query_block)
	Estimate the cost of evaluating “condition”, “num_rows” times. More...

static void	AddOperandCosts (const MaterializePathParameters::Operand &operand, double subquery_cost, double cost_for_cacheable)

static void	SetDistinctGroupByOutputRowsAndSubqueryCosts (THD thd, AccessPath path, double subquery_cost, double cost_for_cacheable)

static double	GetDistinctOrGroupByInitCost (bool use_old_cost_model, AccessPath *path)
	Return the cost for materialization used for DISTINCT or GROUP BY, which essentially involves deduplication cost. More...

static void	AccummulateOutputRowsAndSubqueryCosts (AccessPath path, double subquery_cost, double *cost_for_cacheable)

void	EstimateMaterializeCost (THD thd, AccessPath path)
	Provide row estimates and costs for a MATERIALIZE AccessPath. More...

TermArray	anonymous_namespace{cost_model.cc}::GetAggregationTerms (const JOIN &join)

double	anonymous_namespace{cost_model.cc}::EstimateDistinctRowsFromStatistics (THD *thd, TermArray terms, double child_rows)
	Estimate the number of distinct tuples in the projection defined by 'terms'. More...

template<typename FunctionLow , typename FunctionHigh >
double	anonymous_namespace{cost_model.cc}::SmoothTransition (FunctionLow function_low, FunctionHigh function_high, double lower_limit, double upper_limit, double argument)
	For a function f(x) such that: f(x) = g(x) for x<=l f(x) = h(x) for x>l. More...

double	anonymous_namespace{cost_model.cc}::EstimateRollupRowsPrimitively (double aggregate_rows, size_t grouping_expressions)
	Do a cheap rollup row estimate for small result sets. More...

double	anonymous_namespace{cost_model.cc}::EstimateRollupRowsAdvanced (THD *thd, double aggregate_rows, TermArray terms)
	Do more precise rollup row estimate for larger result sets. More...

double	EstimateDistinctRows (THD *thd, double child_rows, TermArray terms)

void	EstimateAggregateCost (THD thd, AccessPath path, const Query_block *query_block)
	Estimate costs and result row count for an aggregate operation. More...

void	EstimateDeleteRowsCost (AccessPath *path)

void	EstimateUpdateRowsCost (AccessPath *path)

void	EstimateStreamCost (THD thd, AccessPath path)
	Estimate the costs and row count for a STREAM AccessPath. More...

void	EstimateLimitOffsetCost (AccessPath *path)
	Estimate the costs and row count for a WINDOW AccessPath. More...

void	EstimateTemptableAggregateCost (THD thd, AccessPath path, const Query_block *query_block)
	Estimate the costs and row count for a Temp table Aggregate AccessPath. More...

void	EstimateWindowCost (AccessPath *path)
	Estimate the costs and row count for a WINDOW AccessPath. More...

double	EstimateSemijoinFanOut (THD *thd, double right_rows, const JoinPredicate &edge)
	Estimate the fan out for a left semijoin or a left antijoin. More...

Variables
constexpr double	kTempTableCreationCost = 3

Function Documentation

◆ AccummulateOutputRowsAndSubqueryCosts()

static void AccummulateOutputRowsAndSubqueryCosts	(	AccessPath *	path,
		double *	subquery_cost,
		double *	cost_for_cacheable
	)

static

◆ AddCost()

void AddCost	(	THD *	thd,
		const ContainedSubquery &	subquery,
		double	num_rows,
		FilterCost *	cost
	)

Used internally by EstimateFilterCost() only.

◆ AddInnodbEngineCostOverhead()

static double AddInnodbEngineCostOverhead	(	double	temptable_engine_cost,
		double	temp_table_size,
		double	output_rows,
		const mem_root_deque< Item * > &	join_fields
	)

static

Add InnoDB engine cost overhead into the in-memory table cost if the estimated temp table size exceeds tmp_table_size.

Parameters

temptable_engine_cost	Pre-calculated cost of in-memory table
temp_table_size	'tmp_table_size' system variable value.
output_rows	Number of rows that the path outputs.
join_fields	Reference to join fields. These are used to estimate the temp table row length. See get_tmp_table_rec_length() for details.

Returns: cost after adding the disk cost overhead into the in-memory cost.

◆ AddOperandCosts()

static void AddOperandCosts	(	const MaterializePathParameters::Operand &	operand,
		double *	subquery_cost,
		double *	cost_for_cacheable
	)

static

◆ AggregateCost()

static double AggregateCost	(	THD *	thd,
		double	output_rows,
		double	input_rows,
		int	agg_count,
		int	group_by_field_count
	)

static

Calculate the estimated cost of Streaming Aggregation, i.e.

AGGREGATE Accesspath.

Parameters

thd	Current thread.
output_rows	Number of rows that the path outputs.
input_rows	Number of input rows to the path.
agg_count	Number of aggregation functions present in the path.
group_by_field_count	Number of GROUP BY columns used in the path. In the absence of GROUP BY clause, it can be 0.

Returns: The cost estimate.

Note: This aggregation cost is independent of the cost of Temp table aggregation, and these two paths do not share any logic or cost constants.

◆ CoveringIndexAccessIOCost()

double CoveringIndexAccessIOCost	(	const TABLE *	table,
		unsigned	key_idx,
		double	num_rows
	)

Calculate the IO-cost of doing a lookup on index 'key_idx' on 'table' and then read 'num_rows' rows.

◆ EstimateAggregateCost()

void EstimateAggregateCost	(	THD *	thd,
		AccessPath *	path,
		const Query_block *	query_block
	)

Estimate costs and result row count for an aggregate operation.

Parameters

[in,out]	thd	The current thread.
[in,out]	path	The AGGREGATE path.
[in]	query_block	The Query_block to which 'path' belongs.

◆ EstimateBytesPerRowWideTable()

BytesPerTableRow EstimateBytesPerRowWideTable	(	const TABLE *	table,
		int64_t	max_row_size
	)

Estimate the average number of bytes that we need to read from the storage engine when reading a row from 'table'.

This is the size of the (b-tree) record and the overflow pages of any field that is part of the projection. This is similar to what EstimateBytesPerRowTable() does, but this function is intended to do a more accurate but also more expensive calculation for tables with potentially large rows (i.e. tables with BLOBs or large VARCHAR fields).

Note that this function tailored for InnoDB (and also for the DYNAMIC row format of InnoDB). At some point we may want to move this logic to the handler, so that we can customize it for other engines as well.

Parameters

table	The target table
max_row_size	The row size if all variable-sized fields are full.

Returns: The estimated row size.

◆ EstimateDeleteRowsCost()

void EstimateDeleteRowsCost ( AccessPath * path )

◆ EstimateDistinctRows()

double EstimateDistinctRows	(	THD *	thd,
		double	child_rows,
		TermArray	terms
	)

◆ EstimateFilterCost()

FilterCost EstimateFilterCost	(	THD *	thd,
		double	num_rows,
		Item *	condition,
		const Query_block *	outer_query_block
	)

Estimate the cost of evaluating “condition”, “num_rows” times.

This is a fairly rudimentary estimation, but it includes the cost of any subqueries that may be present and that need evaluation.

◆ EstimateIndexRangeScanCost()

double EstimateIndexRangeScanCost	(	const TABLE *	table,
		unsigned	key_idx,
		RangeScanType	scan_type,
		double	num_ranges,
		double	num_output_rows
	)

Estimates the cost of an index range scan.

The cost model for index range scans accounts for the index lookup cost as well as the cost of reading rows. Both index scans and ref accesses can be viewed as special cases of index range scans, so the cost functions for those operations call this function under the hood.

Note: In the future this function should be extended to account for IO cost.

Parameters

table	The table to which the index belongs.
key_idx	The position of the key in table->key_info[].
scan_type	Whether this an MRR or a regular index range scan.
num_ranges	The number of ranges.
num_output_rows	The estimated expected number of output rows.

Returns: The estimated cost of the index range scan operation.

◆ EstimateLimitOffsetCost()

void EstimateLimitOffsetCost ( AccessPath * path )

Estimate the costs and row count for a WINDOW AccessPath.

As described in

See also: AccessPath::m_init_cost, the cost to read k out of N rows would be init_cost + (k/N) * (cost - init_cost).

◆ EstimateMaterializeCost()

void EstimateMaterializeCost	(	THD *	thd,
		AccessPath *	path
	)

Provide row estimates and costs for a MATERIALIZE AccessPath.

MATERIALIZE AccessPath is created by both old and new optimizer in many different contexts where temp table needs to be created, both with and without deduplication. E.g. materialization for a derived table, materializing deduplicated input rows for DISTINCT, GROUP BY clause without an aggregation functions, SET operations, etc

Note:

SET operations that do deduplication (such as UNION DISTINCT, EXCEPT and INTERSECT) currently do not consider deduplication cost. They should.
There is no aggregation involved in this path. Aggregation with temp table uses a different Access path.

◆ EstimateSemijoinFanOut()

double EstimateSemijoinFanOut	(	THD *	thd,
		double	right_rows,
		const JoinPredicate &	edge
	)

Estimate the fan out for a left semijoin or a left antijoin.

The fan out is defined as the number of result rows, divided by the number of input rows from the left hand relation. For a semijoin, J1:

SELECT ... FROM t1 WHERE EXISTS (SELECT ... FROM t2 WHERE predicate)

we know that the fan out of the corresponding inner join J2:

SELECT ... FROM t1, t2 WHERE predicate

is: F(J2) = CARD(t2) * SELECTIVITY(predicate) , where CARD(t2)=right_rows, and SELECTIVITY(predicate)=edge.selectivity. If 'predicate' is a deterministic function of t1 and t2 rows, then J1 is equivalent to an inner join J3:

SELECT ... FROM t1 JOIN (SELECT DISTINCT f1,..fn FROM t2) d ON predicate

where f1,..fn are those fields from t2 that appear in the predicate.

Then F(J1) = F(J3) = F(J2) * CARD(d) / CARD(t2) = CARD(d) * SELECTIVITY(predicate).

This function therefore collects f1..fn and estimates CARD(d). As a special case, 'predicate' may be independent of t2. The query is then equivalent to:

SELECT ... FROM t1 WHERE predicate AND (SELECT COUNT(*) FROM t2) > 0

The fan out is then the selectivity of 'predicate' multiplied by the probability of t2 having at least one row.

Parameters

thd	The current thread.
right_rows	The number of input rows from the right hand relation.
edge	Join edge.

Returns: fan out.

◆ EstimateSortCost()

void EstimateSortCost	(	THD *	thd,
		AccessPath *	path,
		double	distinct_rows = `kUnknownRowCount`
	)

Estimate costs and output rows for a SORT AccessPath.

Parameters

thd	Current thread.
path	the AccessPath.
distinct_rows	An estimate of the number of distinct rows, if remove_duplicates==true and we have an estimate already.

◆ EstimateStreamCost()

void EstimateStreamCost	(	THD *	thd,
		AccessPath *	path
	)

Estimate the costs and row count for a STREAM AccessPath.

◆ EstimateTemptableAggregateCost()

void EstimateTemptableAggregateCost	(	THD *	thd,
		AccessPath *	path,
		const Query_block *	query_block
	)

Estimate the costs and row count for a Temp table Aggregate AccessPath.

◆ EstimateUpdateRowsCost()

void EstimateUpdateRowsCost ( AccessPath * path )

◆ EstimateWindowCost()

void EstimateWindowCost ( AccessPath * path )

Estimate the costs and row count for a WINDOW AccessPath.

◆ GetDistinctOrGroupByInitCost()

static double GetDistinctOrGroupByInitCost	(	bool	use_old_cost_model,
		AccessPath *	path
	)

static

Return the cost for materialization used for DISTINCT or GROUP BY, which essentially involves deduplication cost.

◆ LowerCacheHitRatio()

static double LowerCacheHitRatio	(	const handler *	file,
		double	file_size
	)

static

Estimate a lower limit for the cache hit ratio when reading from an index (or base table), based on the size of the index relative to that of the buffer pool.

Parameters

file	The handler to which the index or base table belongs.
file_size	The size of the index or base table, in bytes.

Returns: A lower limit for the cache hit ratio.

◆ MaterializationCost()

static double MaterializationCost	(	bool	use_old_model,
		double	output_rows,
		int	field_count
	)

static

Calculate the estimated cost of a MATERIALIZE Accesspath that does not involve deduplication.

Parameters

use_old_model	If false, use the hypergraph cost model based on linear regression. If true, continue to use the earlier cost model.
output_rows	Number of rows that the path outputs.
field_count	Number of fields present in the materialized rows, which translates to the number of temp table columns.

Returns: The cost estimate.

◆ MaterializationWithDedupCost()

static double MaterializationWithDedupCost	(	bool	use_old_model,
		double	output_rows,
		double	input_rows,
		int	field_count
	)

static

Calculate the estimated initialization cost of a MATERIALIZE Accesspath that involves deduplication.

This involves the cost for deduplicating input rows and inserting them into the temp table.

Parameters

use_old_model	If false, use the hypergraph cost model based on linear regression. If true, continue to use the earlier cost model.
output_rows	Number of rows that the path outputs.
input_rows	Number of input rows to the path.
field_count	Number of GROUP BY or DISTINCT columns present.

Returns: The cost estimate.

◆ SetDistinctGroupByOutputRowsAndSubqueryCosts()

static void SetDistinctGroupByOutputRowsAndSubqueryCosts	(	THD *	thd,
		AccessPath *	path,
		double *	subquery_cost,
		double *	cost_for_cacheable
	)

static

◆ StreamCost()

static double StreamCost	(	THD *	thd,
		double	output_rows,
		int	field_count
	)

static

Calculate the estimated cost of a STREAM Accesspath.

Parameters

thd	Current thread.
output_rows	Number of rows that the path outputs, which is same as input rows.
field_count	Number of fields present in the streamed rows, which typically translates to the number of JOIN fields.

Returns: The cost estimate.

◆ TableAccessIOCost()

double TableAccessIOCost	(	const TABLE *	table,
		double	num_rows,
		BytesPerTableRow	row_size
	)

Calculate the IO-cost of reading 'num_rows' rows from 'table'.

◆ TempTableAggregationCost()

static double TempTableAggregationCost	(	THD *	thd,
		double	output_rows,
		double	input_rows,
		int	agg_count,
		int	group_by_fields,
		const mem_root_deque< Item * > &	join_fields
	)

static

Calculate the estimated initialization cost of a TEMPTABLE_AGGREGATE Accesspath This involves the cost for deduplicating input rows, inserting them into the temp table, and processing the aggregation functions.

Cost estimation for this path was introduced only in hypergraph optimizer.

Parameters

thd	Current thread.
output_rows	Number of rows that the path outputs.
input_rows	Number of input rows to the path.
agg_count	Number of aggregation functions present in the path.
group_by_fields	Number of GROUP BY columns present.
join_fields	Reference to join fields. These are used to estimate the temp table row length. See get_tmp_table_rec_length() for details.

Returns: The cost estimate.

◆ TempTableScanCost()

static double TempTableScanCost	(	THD *	thd,
		AccessPath *	table_path,
		double	output_rows
	)

static

Calculate the estimated cost of a Table scan Accesspath for a temporary table created for materialization.

Parameters

thd	Current thread.
table_path	The TABLE_SCAN AccessPath of the temporary table.
output_rows	Number of rows that the path outputs.

Returns: The cost estimate.

Variable Documentation

◆ kTempTableCreationCost

constexpr double kTempTableCreationCost = 3

constexpr

Classes

Namespaces

Typedefs

Functions

Variables

Function Documentation

◆ AccummulateOutputRowsAndSubqueryCosts()

◆ AddCost()

◆ AddInnodbEngineCostOverhead()

◆ AddOperandCosts()

◆ AggregateCost()

◆ CoveringIndexAccessIOCost()

◆ EstimateAggregateCost()

◆ EstimateBytesPerRowWideTable()

◆ EstimateDeleteRowsCost()

◆ EstimateDistinctRows()

◆ EstimateFilterCost()

◆ EstimateIndexRangeScanCost()

◆ EstimateLimitOffsetCost()

◆ EstimateMaterializeCost()

◆ EstimateSemijoinFanOut()

◆ EstimateSortCost()

◆ EstimateStreamCost()

◆ EstimateTemptableAggregateCost()

◆ EstimateUpdateRowsCost()

◆ EstimateWindowCost()

◆ GetDistinctOrGroupByInitCost()

◆ LowerCacheHitRatio()

◆ MaterializationCost()

◆ MaterializationWithDedupCost()

◆ SetDistinctGroupByOutputRowsAndSubqueryCosts()

◆ StreamCost()

◆ TableAccessIOCost()

◆ TempTableAggregationCost()

◆ TempTableScanCost()

Variable Documentation

◆ kTempTableCreationCost