/*-------------------------------------------------------------------------
 *
 * costsize.c
 *	  Routines to compute (and set) relation sizes and path costs
 *
 * Path costs are measured in arbitrary units established by these basic
 * parameters:
 *
 *	seq_page_cost		Cost of a sequential page fetch
 *	random_page_cost	Cost of a non-sequential page fetch
 *	cpu_tuple_cost		Cost of typical CPU time to process a tuple
 *	cpu_index_tuple_cost  Cost of typical CPU time to process an index tuple
 *	cpu_operator_cost	Cost of CPU time to execute an operator or function
 *
 * We expect that the kernel will typically do some amount of read-ahead
 * optimization; this in conjunction with seek costs means that seq_page_cost
 * is normally considerably less than random_page_cost.  (However, if the
 * database is fully cached in RAM, it is reasonable to set them equal.)
 *
 * We also use a rough estimate "effective_cache_size" of the number of
 * disk pages in Postgres + OS-level disk cache.  (We can't simply use
 * NBuffers for this purpose because that would ignore the effects of
 * the kernel's disk cache.)
 *
 * Obviously, taking constants for these values is an oversimplification,
 * but it's tough enough to get any useful estimates even at this level of
 * detail.	Note that all of these parameters are user-settable, in case
 * the default values are drastically off for a particular platform.
 *
 * We compute two separate costs for each path:
 *		total_cost: total estimated cost to fetch all tuples
 *		startup_cost: cost that is expended before first tuple is fetched
 * In some scenarios, such as when there is a LIMIT or we are implementing
 * an EXISTS(...) sub-select, it is not necessary to fetch all tuples of the
 * path's result.  A caller can estimate the cost of fetching a partial
 * result by interpolating between startup_cost and total_cost.  In detail:
 *		actual_cost = startup_cost +
 *			(total_cost - startup_cost) * tuples_to_fetch / path->parent->rows;
 * Note that a base relation's rows count (and, by extension, plan_rows for
 * plan nodes below the LIMIT node) are set without regard to any LIMIT, so
 * that this equation works properly.  (Also, these routines guarantee not to
 * set the rows count to zero, so there will be no zero divide.)  The LIMIT is
 * applied as a top-level plan node.
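 *
 * For example (purely illustrative numbers): if a path has startup_cost = 10,
 * total_cost = 1010, and is expected to return 1000 rows, then fetching only
 * the first 100 tuples should cost about 10 + (1010 - 10) * 100/1000 = 110.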
 *
 * For largely historical reasons, most of the routines in this module use
 * the passed result Path only to store their startup_cost and total_cost
 * results into.  All the input data they need is passed as separate
 * parameters, even though much of it could be extracted from the Path.
 * An exception is made for the cost_XXXjoin() routines, which expect all
 * the non-cost fields of the passed XXXPath to be filled in.
 *
 *
 * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.159 2006/07/01 18:38:32 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include <math.h>

#include "catalog/pg_statistic.h"
#include "executor/nodeHash.h"
#include "miscadmin.h"
#include "optimizer/clauses.h"
#include "optimizer/cost.h"
#include "optimizer/pathnode.h"
#include "optimizer/plancat.h"
#include "parser/parsetree.h"
#include "utils/array.h"
#include "utils/selfuncs.h"
#include "utils/lsyscache.h"
#include "utils/syscache.h"
#include "utils/tuplesort.h"


#define LOG2(x)  (log(x) / 0.693147180559945)

/*
 * Some Paths return less than the nominal number of rows of their parent
 * relations; join nodes need to do this to get the correct input count:
 */
#define PATH_ROWS(path) \
	(IsA(path, UniquePath) ? \
	 ((UniquePath *) (path))->rows : \
	 (path)->parent->rows)


double		seq_page_cost = DEFAULT_SEQ_PAGE_COST;
double		random_page_cost = DEFAULT_RANDOM_PAGE_COST;
double		cpu_tuple_cost = DEFAULT_CPU_TUPLE_COST;
double		cpu_index_tuple_cost = DEFAULT_CPU_INDEX_TUPLE_COST;
double		cpu_operator_cost = DEFAULT_CPU_OPERATOR_COST;

double		effective_cache_size = DEFAULT_EFFECTIVE_CACHE_SIZE;

Cost		disable_cost = 100000000.0;

bool		enable_seqscan = true;
bool		enable_indexscan = true;
bool		enable_bitmapscan = true;
bool		enable_tidscan = true;
bool		enable_sort = true;
bool		enable_hashagg = true;
bool		enable_nestloop = true;
bool		enable_mergejoin = true;
bool		enable_hashjoin = true;


static bool cost_qual_eval_walker(Node *node, QualCost *total);
static int	estimate_array_length(Node *arrayexpr);
static Selectivity approx_selectivity(PlannerInfo *root, List *quals,
				   JoinType jointype);
static Selectivity join_in_selectivity(JoinPath *path, PlannerInfo *root);
static void set_rel_width(PlannerInfo *root, RelOptInfo *rel);
static double relation_byte_size(double tuples, int width);
static double page_size(double tuples, int width);


/*
 * clamp_row_est
 *		Force a row-count estimate to a sane value.
 */
double
clamp_row_est(double nrows)
{
	/*
	 * Force estimate to be at least one row, to make explain output look
	 * better and to avoid possible divide-by-zero when interpolating costs.
	 * Make it an integer, too.
	 */
	if (nrows <= 1.0)
		nrows = 1.0;
	else
		nrows = rint(nrows);

	return nrows;
}


/*
 * cost_seqscan
 *	  Determines and returns the cost of scanning a relation sequentially.
 */
void
cost_seqscan(Path *path, PlannerInfo *root,
			 RelOptInfo *baserel)
{
	Cost		startup_cost = 0;
	Cost		run_cost = 0;
	Cost		cpu_per_tuple;

	/* Should only be applied to base relations */
	Assert(baserel->relid > 0);
	Assert(baserel->rtekind == RTE_RELATION);

	if (!enable_seqscan)
		startup_cost += disable_cost;

	/*
	 * disk costs
	 */
	run_cost += seq_page_cost * baserel->pages;

	/* CPU costs */
	startup_cost += baserel->baserestrictcost.startup;
	cpu_per_tuple = cpu_tuple_cost + baserel->baserestrictcost.per_tuple;
	run_cost += cpu_per_tuple * baserel->tuples;

	path->startup_cost = startup_cost;
	path->total_cost = startup_cost + run_cost;
}
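
/*
 * Worked example (illustrative figures; assumes the default seq_page_cost =
 * 1.0 and cpu_tuple_cost = 0.01): a 10,000-page relation holding 1,000,000
 * tuples with no restriction clauses would be costed roughly as
 *		run_cost = 1.0 * 10000 + 0.01 * 1000000 = 20000
 * with startup_cost = 0, so total_cost = 20000.
 */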

/*
 * cost_index
 *	  Determines and returns the cost of scanning a relation using an index.
 *
 * 'index' is the index to be used
 * 'indexQuals' is the list of applicable qual clauses (implicit AND semantics)
 * 'outer_rel' is the outer relation when we are considering using the index
 *		scan as the inside of a nestloop join (hence, some of the indexQuals
 *		are join clauses, and we should expect repeated scans of the index);
 *		NULL for a plain index scan
 *
 * cost_index() takes an IndexPath not just a Path, because it sets a few
 * additional fields of the IndexPath besides startup_cost and total_cost.
 * These fields are needed if the IndexPath is used in a BitmapIndexScan.
 *
 * NOTE: 'indexQuals' must contain only clauses usable as index restrictions.
 * Any additional quals evaluated as qpquals may reduce the number of returned
 * tuples, but they won't reduce the number of tuples we have to fetch from
 * the table, so they don't reduce the scan cost.
 *
 * NOTE: as of 8.0, indexQuals is a list of RestrictInfo nodes, where formerly
 * it was a list of bare clause expressions.
 */
void
cost_index(IndexPath *path, PlannerInfo *root,
		   IndexOptInfo *index,
		   List *indexQuals,
		   RelOptInfo *outer_rel)
{
	RelOptInfo *baserel = index->rel;
	Cost		startup_cost = 0;
	Cost		run_cost = 0;
	Cost		indexStartupCost;
	Cost		indexTotalCost;
	Selectivity indexSelectivity;
	double		indexCorrelation,
				csquared;
	Cost		min_IO_cost,
				max_IO_cost;
	Cost		cpu_per_tuple;
	double		tuples_fetched;
	double		pages_fetched;

	/* Should only be applied to base relations */
	Assert(IsA(baserel, RelOptInfo) &&
		   IsA(index, IndexOptInfo));
	Assert(baserel->relid > 0);
	Assert(baserel->rtekind == RTE_RELATION);

	if (!enable_indexscan)
		startup_cost += disable_cost;

	/*
	 * Call index-access-method-specific code to estimate the processing cost
	 * for scanning the index, as well as the selectivity of the index (ie,
	 * the fraction of main-table tuples we will have to retrieve) and its
	 * correlation to the main-table tuple order.
	 */
	OidFunctionCall8(index->amcostestimate,
					 PointerGetDatum(root),
					 PointerGetDatum(index),
					 PointerGetDatum(indexQuals),
					 PointerGetDatum(outer_rel),
					 PointerGetDatum(&indexStartupCost),
					 PointerGetDatum(&indexTotalCost),
					 PointerGetDatum(&indexSelectivity),
					 PointerGetDatum(&indexCorrelation));

	/*
	 * Save amcostestimate's results for possible use in bitmap scan planning.
	 * We don't bother to save indexStartupCost or indexCorrelation, because a
	 * bitmap scan doesn't care about either.
	 */
	path->indextotalcost = indexTotalCost;
	path->indexselectivity = indexSelectivity;

	/* all costs for touching index itself included here */
	startup_cost += indexStartupCost;
	run_cost += indexTotalCost - indexStartupCost;

	/* estimate number of main-table tuples fetched */
	tuples_fetched = clamp_row_est(indexSelectivity * baserel->tuples);

	/*----------
	 * Estimate number of main-table pages fetched, and compute I/O cost.
	 *
	 * When the index ordering is uncorrelated with the table ordering,
	 * we use an approximation proposed by Mackert and Lohman (see
	 * index_pages_fetched() for details) to compute the number of pages
	 * fetched, and then charge random_page_cost per page fetched.
	 *
	 * When the index ordering is exactly correlated with the table ordering
	 * (just after a CLUSTER, for example), the number of pages fetched should
	 * be exactly selectivity * table_size.  What's more, all but the first
	 * will be sequential fetches, not the random fetches that occur in the
	 * uncorrelated case.  So if the number of pages is more than 1, we
	 * ought to charge
	 *		random_page_cost + (pages_fetched - 1) * seq_page_cost
	 * For partially-correlated indexes, we ought to charge somewhere between
	 * these two estimates.  We currently interpolate linearly between the
	 * estimates based on the correlation squared (XXX is that appropriate?).
	 *----------
	 */
	if (outer_rel != NULL && outer_rel->rows > 1)
	{
		/*
		 * For repeated indexscans, scale up the number of tuples fetched
		 * in the Mackert and Lohman formula by the number of scans, so
		 * that we estimate the number of pages fetched by all the scans.
		 * Then pro-rate the costs for one scan.  In this case we assume
		 * all the fetches are random accesses.  XXX it'd be good to
		 * include correlation in this model, but it's not clear how to do
		 * that without double-counting cache effects.
		 */
		double		num_scans = outer_rel->rows;

		pages_fetched = index_pages_fetched(tuples_fetched * num_scans,
											baserel->pages,
											index->pages);

		run_cost += (pages_fetched * random_page_cost) / num_scans;
	}
	else
	{
		/*
		 * Normal case: apply the Mackert and Lohman formula, and then
		 * interpolate between that and the correlation-derived result.
		 */
		pages_fetched = index_pages_fetched(tuples_fetched,
											baserel->pages,
											index->pages);

		/* max_IO_cost is for the perfectly uncorrelated case (csquared=0) */
		max_IO_cost = pages_fetched * random_page_cost;

		/* min_IO_cost is for the perfectly correlated case (csquared=1) */
		pages_fetched = ceil(indexSelectivity * (double) baserel->pages);
		min_IO_cost = random_page_cost;
		if (pages_fetched > 1)
			min_IO_cost += (pages_fetched - 1) * seq_page_cost;

		/*
		 * Now interpolate based on estimated index order correlation to get
		 * total disk I/O cost for main table accesses.
		 */
		csquared = indexCorrelation * indexCorrelation;

		run_cost += max_IO_cost + csquared * (min_IO_cost - max_IO_cost);
	}

	/*
	 * Estimate CPU costs per tuple.
	 *
	 * Normally the indexquals will be removed from the list of restriction
	 * clauses that we have to evaluate as qpquals, so we should subtract
	 * their costs from baserestrictcost.  But if we are doing a join then
	 * some of the indexquals are join clauses and shouldn't be subtracted.
	 * Rather than work out exactly how much to subtract, we don't subtract
	 * anything.
	 */
	startup_cost += baserel->baserestrictcost.startup;
	cpu_per_tuple = cpu_tuple_cost + baserel->baserestrictcost.per_tuple;

	if (outer_rel == NULL)
	{
		QualCost	index_qual_cost;

		cost_qual_eval(&index_qual_cost, indexQuals);
		/* any startup cost still has to be paid ... */
		cpu_per_tuple -= index_qual_cost.per_tuple;
	}

	run_cost += cpu_per_tuple * tuples_fetched;

	path->path.startup_cost = startup_cost;
	path->path.total_cost = startup_cost + run_cost;
}
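
/*
 * Worked example of the correlation interpolation above (illustrative
 * figures): suppose the Mackert-Lohman estimate is 500 pages with
 * random_page_cost = 4.0 and seq_page_cost = 1.0, while selectivity times
 * baserel->pages gives 100 pages.  Then
 *		max_IO_cost = 500 * 4.0 = 2000
 *		min_IO_cost = 4.0 + 99 * 1.0 = 103
 * and an indexCorrelation of 0.5 (csquared = 0.25) yields a disk charge of
 *		2000 + 0.25 * (103 - 2000) = 1525.75
 */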

/*
 * index_pages_fetched
 *	  Estimate the number of pages actually fetched after accounting for
 *	  cache effects.
 *
 * We use an approximation proposed by Mackert and Lohman, "Index Scans
 * Using a Finite LRU Buffer: A Validated I/O Model", ACM Transactions
 * on Database Systems, Vol. 14, No. 3, September 1989, Pages 401-424.
 * The Mackert and Lohman approximation is that the number of pages
 * fetched is
 *	PF =
 *		min(2TNs/(2T+Ns), T)			when T <= b
 *		2TNs/(2T+Ns)					when T > b and Ns <= 2Tb/(2T-b)
 *		b + (Ns - 2Tb/(2T-b))*(T-b)/T	when T > b and Ns > 2Tb/(2T-b)
 * where
 *		T = # pages in table
 *		N = # tuples in table
 *		s = selectivity = fraction of table to be scanned
 *		b = # buffer pages available (we include kernel space here)
 *
 * We assume that effective_cache_size is the total number of buffer pages
 * available for both table and index, and pro-rate that space between the
 * table and index.  (Ideally other_pages should include all the other
 * tables and indexes used by the query too; but we don't have a good way
 * to get that number here.)
 *
 * The product Ns is the number of tuples fetched; we pass in that
 * product rather than calculating it here.
 *
 * Caller is expected to have ensured that tuples_fetched is greater than zero
 * and rounded to integer (see clamp_row_est).  The result will likewise be
 * greater than zero and integral.
 */
double
index_pages_fetched(double tuples_fetched, BlockNumber pages,
					BlockNumber other_pages)
{
	double		pages_fetched;
	double		T,
				b;

	/* T is # pages in table, but don't allow it to be zero */
	T = (pages > 1) ? (double) pages : 1.0;

	/* b is pro-rated share of effective_cache_size */
	b = effective_cache_size * T / (T + (double) other_pages);
	/* force it positive and integral */
	if (b <= 1.0)
		b = 1.0;
	else
		b = ceil(b);

	/* This part is the Mackert and Lohman formula */
	if (T <= b)
	{
		pages_fetched =
			(2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched);
		if (pages_fetched >= T)
			pages_fetched = T;
		else
			pages_fetched = ceil(pages_fetched);
	}
	else
	{
		double		lim;

		lim = (2.0 * T * b) / (2.0 * T - b);
		if (tuples_fetched <= lim)
		{
			pages_fetched =
				(2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched);
		}
		else
		{
			pages_fetched =
				b + (tuples_fetched - lim) * (T - b) / T;
		}
		pages_fetched = ceil(pages_fetched);
	}
	return pages_fetched;
}
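
/*
 * Worked example (illustrative figures): for a table of T = 100 pages whose
 * pro-rated cache share is b = 1000 and with Ns = 50 tuples fetched,
 *		PF = 2 * 100 * 50 / (2 * 100 + 50) = 40
 * which is below T, so 40 pages are estimated to be fetched.
 */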

/*
 * cost_bitmap_heap_scan
 *	  Determines and returns the cost of scanning a relation using a bitmap
 *	  index-then-heap plan.
 *
 * 'baserel' is the relation to be scanned
 * 'bitmapqual' is a tree of IndexPaths, BitmapAndPaths, and BitmapOrPaths
 *
 * Note: we take no explicit notice here of whether this is a join inner path.
 * If it is, the component IndexPaths should have been costed accordingly.
 */
void
cost_bitmap_heap_scan(Path *path, PlannerInfo *root, RelOptInfo *baserel,
					  Path *bitmapqual)
{
	Cost		startup_cost = 0;
	Cost		run_cost = 0;
	Cost		indexTotalCost;
	Selectivity indexSelectivity;
	Cost		cpu_per_tuple;
	Cost		cost_per_page;
	double		tuples_fetched;
	double		pages_fetched;
	double		T;

	/* Should only be applied to base relations */
	Assert(IsA(baserel, RelOptInfo));
	Assert(baserel->relid > 0);
	Assert(baserel->rtekind == RTE_RELATION);

	if (!enable_bitmapscan)
		startup_cost += disable_cost;

	/*
	 * Fetch total cost of obtaining the bitmap, as well as its total
	 * selectivity.
	 */
	cost_bitmap_tree_node(bitmapqual, &indexTotalCost, &indexSelectivity);

	startup_cost += indexTotalCost;

	/*
	 * The number of heap pages that need to be fetched is the same as the
	 * Mackert and Lohman formula for the case T <= b (ie, no re-reads
	 * needed).
	 */
	tuples_fetched = clamp_row_est(indexSelectivity * baserel->tuples);

	T = (baserel->pages > 1) ? (double) baserel->pages : 1.0;
	pages_fetched = (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched);
	if (pages_fetched >= T)
		pages_fetched = T;
	else
		pages_fetched = ceil(pages_fetched);

	/*
	 * For small numbers of pages we should charge random_page_cost apiece,
	 * while if nearly all the table's pages are being read, it's more
	 * appropriate to charge seq_page_cost apiece.  The effect is nonlinear,
	 * too. For lack of a better idea, interpolate like this to determine the
	 * cost per page.
	 */
	if (pages_fetched >= 2.0)
		cost_per_page = random_page_cost -
			(random_page_cost - seq_page_cost) * sqrt(pages_fetched / T);
	else
		cost_per_page = random_page_cost;

	run_cost += pages_fetched * cost_per_page;

	/*
	 * Estimate CPU costs per tuple.
	 *
	 * Often the indexquals don't need to be rechecked at each tuple ... but
	 * not always, especially not if there are enough tuples involved that the
	 * bitmaps become lossy.  For the moment, just assume they will be
	 * rechecked always.
	 */
	startup_cost += baserel->baserestrictcost.startup;
	cpu_per_tuple = cpu_tuple_cost + baserel->baserestrictcost.per_tuple;

	run_cost += cpu_per_tuple * tuples_fetched;

	path->startup_cost = startup_cost;
	path->total_cost = startup_cost + run_cost;
}
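
/*
 * Worked example of the per-page interpolation above (illustrative figures):
 * with random_page_cost = 4.0, seq_page_cost = 1.0, T = 10000 pages and
 * pages_fetched = 100,
 *		cost_per_page = 4.0 - (4.0 - 1.0) * sqrt(100 / 10000) = 3.7
 * so the heap I/O charge is 100 * 3.7 = 370.
 */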

/*
 * cost_bitmap_tree_node
 *		Extract cost and selectivity from a bitmap tree node (index/and/or)
 */
void
cost_bitmap_tree_node(Path *path, Cost *cost, Selectivity *selec)
{
	if (IsA(path, IndexPath))
	{
		*cost = ((IndexPath *) path)->indextotalcost;
		*selec = ((IndexPath *) path)->indexselectivity;
	}
	else if (IsA(path, BitmapAndPath))
	{
		*cost = path->total_cost;
		*selec = ((BitmapAndPath *) path)->bitmapselectivity;
	}
	else if (IsA(path, BitmapOrPath))
	{
		*cost = path->total_cost;
		*selec = ((BitmapOrPath *) path)->bitmapselectivity;
	}
	else
		elog(ERROR, "unrecognized node type: %d", nodeTag(path));
}

/*
 * cost_bitmap_and_node
 *		Estimate the cost of a BitmapAnd node
 *
 * Note that this considers only the costs of index scanning and bitmap
 * creation, not the eventual heap access.	In that sense the object isn't
 * truly a Path, but it has enough path-like properties (costs in particular)
 * to warrant treating it as one.
 */
void
cost_bitmap_and_node(BitmapAndPath *path, PlannerInfo *root)
{
	Cost		totalCost;
	Selectivity selec;
	ListCell   *l;

	/*
	 * We estimate AND selectivity on the assumption that the inputs are
	 * independent.  This is probably often wrong, but we don't have the info
	 * to do better.
	 *
	 * The runtime cost of the BitmapAnd itself is estimated at 100x
	 * cpu_operator_cost for each tbm_intersect needed.  Probably too small,
	 * definitely too simplistic?
	 */
	totalCost = 0.0;
	selec = 1.0;
	foreach(l, path->bitmapquals)
	{
		Path	   *subpath = (Path *) lfirst(l);
		Cost		subCost;
		Selectivity subselec;

		cost_bitmap_tree_node(subpath, &subCost, &subselec);

		selec *= subselec;

		totalCost += subCost;
		if (l != list_head(path->bitmapquals))
			totalCost += 100.0 * cpu_operator_cost;
	}
	path->bitmapselectivity = selec;
	path->path.startup_cost = totalCost;
	path->path.total_cost = totalCost;
}

/*
 * cost_bitmap_or_node
 *		Estimate the cost of a BitmapOr node
 *
 * See comments for cost_bitmap_and_node.
 */
void
cost_bitmap_or_node(BitmapOrPath *path, PlannerInfo *root)
{
	Cost		totalCost;
	Selectivity selec;
	ListCell   *l;

	/*
	 * We estimate OR selectivity on the assumption that the inputs are
	 * non-overlapping, since that's often the case in "x IN (list)" type
	 * situations.	Of course, we clamp to 1.0 at the end.
	 *
	 * The runtime cost of the BitmapOr itself is estimated at 100x
	 * cpu_operator_cost for each tbm_union needed.  Probably too small,
	 * definitely too simplistic?  We are aware that the tbm_unions are
	 * optimized out when the inputs are BitmapIndexScans.
	 */
	totalCost = 0.0;
	selec = 0.0;
	foreach(l, path->bitmapquals)
	{
		Path	   *subpath = (Path *) lfirst(l);
		Cost		subCost;
		Selectivity subselec;

		cost_bitmap_tree_node(subpath, &subCost, &subselec);

		selec += subselec;

		totalCost += subCost;
		if (l != list_head(path->bitmapquals) &&
			!IsA(subpath, IndexPath))
			totalCost += 100.0 * cpu_operator_cost;
	}
	path->bitmapselectivity = Min(selec, 1.0);
	path->path.startup_cost = totalCost;
	path->path.total_cost = totalCost;
}

/*
 * cost_tidscan
 *	  Determines and returns the cost of scanning a relation using TIDs.
 */
void
cost_tidscan(Path *path, PlannerInfo *root,
			 RelOptInfo *baserel, List *tidquals)
{
	Cost		startup_cost = 0;
	Cost		run_cost = 0;
	Cost		cpu_per_tuple;
	int			ntuples;
	ListCell   *l;

	/* Should only be applied to base relations */
	Assert(baserel->relid > 0);
	Assert(baserel->rtekind == RTE_RELATION);

	if (!enable_tidscan)
		startup_cost += disable_cost;

	/* Count how many tuples we expect to retrieve */
	ntuples = 0;
	foreach(l, tidquals)
	{
		if (IsA(lfirst(l), ScalarArrayOpExpr))
		{
			/* Each element of the array yields 1 tuple */
			ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) lfirst(l);
			Node *arraynode = (Node *) lsecond(saop->args);

			ntuples += estimate_array_length(arraynode);
		}
		else
		{
			/* It's just CTID = something, count 1 tuple */
			ntuples++;
		}
	}

	/* disk costs --- assume each tuple on a different page */
	run_cost += random_page_cost * ntuples;

	/* CPU costs */
	startup_cost += baserel->baserestrictcost.startup;
	cpu_per_tuple = cpu_tuple_cost + baserel->baserestrictcost.per_tuple;
	run_cost += cpu_per_tuple * ntuples;

	path->startup_cost = startup_cost;
	path->total_cost = startup_cost + run_cost;
}

/*
 * Estimate number of elements in the array yielded by an expression.
 */
static int
estimate_array_length(Node *arrayexpr)
{
	if (arrayexpr && IsA(arrayexpr, Const))
	{
		Datum		arraydatum = ((Const *) arrayexpr)->constvalue;
		bool		arrayisnull = ((Const *) arrayexpr)->constisnull;
		ArrayType  *arrayval;

		if (arrayisnull)
			return 0;
		arrayval = DatumGetArrayTypeP(arraydatum);
		return ArrayGetNItems(ARR_NDIM(arrayval), ARR_DIMS(arrayval));
	}
	else if (arrayexpr && IsA(arrayexpr, ArrayExpr))
	{
		return list_length(((ArrayExpr *) arrayexpr)->elements);
	}
	else
	{
		/* default guess */
		return 10;
	}
}

/*
 * cost_subqueryscan
 *	  Determines and returns the cost of scanning a subquery RTE.
 */
void
cost_subqueryscan(Path *path, RelOptInfo *baserel)
{
	Cost		startup_cost;
	Cost		run_cost;
	Cost		cpu_per_tuple;

	/* Should only be applied to base relations that are subqueries */
	Assert(baserel->relid > 0);
	Assert(baserel->rtekind == RTE_SUBQUERY);

	/*
	 * Cost of path is cost of evaluating the subplan, plus cost of evaluating
	 * any restriction clauses that will be attached to the SubqueryScan node,
	 * plus cpu_tuple_cost to account for selection and projection overhead.
	 */
	path->startup_cost = baserel->subplan->startup_cost;
	path->total_cost = baserel->subplan->total_cost;

	startup_cost = baserel->baserestrictcost.startup;
	cpu_per_tuple = cpu_tuple_cost + baserel->baserestrictcost.per_tuple;
	run_cost = cpu_per_tuple * baserel->tuples;

	path->startup_cost += startup_cost;
	path->total_cost += startup_cost + run_cost;
}

/*
 * cost_functionscan
 *	  Determines and returns the cost of scanning a function RTE.
 */
void
cost_functionscan(Path *path, PlannerInfo *root, RelOptInfo *baserel)
{
	Cost		startup_cost = 0;
	Cost		run_cost = 0;
	Cost		cpu_per_tuple;

	/* Should only be applied to base relations that are functions */
	Assert(baserel->relid > 0);
	Assert(baserel->rtekind == RTE_FUNCTION);

	/*
	 * For now, estimate function's cost at one operator eval per function
	 * call.  Someday we should revive the function cost estimate columns in
	 * pg_proc...
	 */
	cpu_per_tuple = cpu_operator_cost;

	/* Add scanning CPU costs */
	startup_cost += baserel->baserestrictcost.startup;
	cpu_per_tuple += cpu_tuple_cost + baserel->baserestrictcost.per_tuple;
	run_cost += cpu_per_tuple * baserel->tuples;

	path->startup_cost = startup_cost;
	path->total_cost = startup_cost + run_cost;
}

/*
 * cost_sort
 *	  Determines and returns the cost of sorting a relation, including
 *	  the cost of reading the input data.
 *
 * If the total volume of data to sort is less than work_mem, we will do
 * an in-memory sort, which requires no I/O and about t*log2(t) tuple
 * comparisons for t tuples.
 *
 * If the total volume exceeds work_mem, we switch to a tape-style merge
 * algorithm.  There will still be about t*log2(t) tuple comparisons in
 * total, but we will also need to write and read each tuple once per
 * merge pass.  We expect about ceil(logM(r)) merge passes where r is the
 * number of initial runs formed and M is the merge order used by tuplesort.c.
 * Since the average initial run should be about twice work_mem, we have
 *		disk traffic = 2 * relsize * ceil(logM(p / (2*work_mem)))
 *		cpu = comparison_cost * t * log2(t)
 *
 * The disk traffic is assumed to be 3/4ths sequential and 1/4th random
 * accesses (XXX can't we refine that guess?)
 *
 * We charge two operator evals per tuple comparison, which should be in
 * the right ballpark in most cases.
 *
 * 'pathkeys' is a list of sort keys
 * 'input_cost' is the total cost for reading the input data
 * 'tuples' is the number of tuples in the relation
 * 'width' is the average tuple width in bytes
 *
 * NOTE: some callers currently pass NIL for pathkeys because they
 * can't conveniently supply the sort keys.  Since this routine doesn't
 * currently do anything with pathkeys anyway, that doesn't matter...
 * but if it ever does, it should react gracefully to lack of key data.
 * (Actually, the thing we'd most likely be interested in is just the number
 * of sort keys, which all callers *could* supply.)
 */
void
cost_sort(Path *path, PlannerInfo *root,
		  List *pathkeys, Cost input_cost, double tuples, int width)
{
	Cost		startup_cost = input_cost;
	Cost		run_cost = 0;
	double		nbytes = relation_byte_size(tuples, width);
	long		work_mem_bytes = work_mem * 1024L;

	if (!enable_sort)
		startup_cost += disable_cost;

	/*
	 * We want to be sure the cost of a sort is never estimated as zero, even
	 * if passed-in tuple count is zero.  Besides, mustn't do log(0)...
	 */
	if (tuples < 2.0)
		tuples = 2.0;

	/*
	 * CPU costs
	 *
	 * Assume about two operator evals per tuple comparison and N log2 N
	 * comparisons
	 */
	startup_cost += 2.0 * cpu_operator_cost * tuples * LOG2(tuples);

	/* disk costs */
	if (nbytes > work_mem_bytes)
	{
		double		npages = ceil(nbytes / BLCKSZ);
		double		nruns = (nbytes / work_mem_bytes) * 0.5;
		double		mergeorder = tuplesort_merge_order(work_mem_bytes);
		double		log_runs;
		double		npageaccesses;

		/* Compute logM(r) as log(r) / log(M) */
		if (nruns > mergeorder)
			log_runs = ceil(log(nruns) / log(mergeorder));
		else
			log_runs = 1.0;
		npageaccesses = 2.0 * npages * log_runs;
		/* Assume 3/4ths of accesses are sequential, 1/4th are not */
		startup_cost += npageaccesses *
			(seq_page_cost * 0.75 + random_page_cost * 0.25);
	}

	/*
	 * Also charge a small amount (arbitrarily set equal to operator cost) per
	 * extracted tuple.
	 */
	run_cost += cpu_operator_cost * tuples;

	path->startup_cost = startup_cost;
	path->total_cost = startup_cost + run_cost;
}
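
/*
 * Worked example (illustrative figures; assumes the default
 * cpu_operator_cost = 0.0025): an in-memory sort of 10,000 tuples adds about
 *		2.0 * 0.0025 * 10000 * log2(10000) ~= 665
 * to the startup cost, plus 0.0025 * 10000 = 25 to the run cost for
 * extracting the sorted tuples.
 */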

/*
 * cost_material
 *	  Determines and returns the cost of materializing a relation, including
 *	  the cost of reading the input data.
 *
 * If the total volume of data to materialize exceeds work_mem, we will need
 * to write it to disk, so the cost is much higher in that case.
 */
void
cost_material(Path *path,
			  Cost input_cost, double tuples, int width)
{
	Cost		startup_cost = input_cost;
	Cost		run_cost = 0;
	double		nbytes = relation_byte_size(tuples, width);
	long		work_mem_bytes = work_mem * 1024L;

	/* disk costs */
	if (nbytes > work_mem_bytes)
	{
		double		npages = ceil(nbytes / BLCKSZ);

		/* We'll write during startup and read during retrieval */
		startup_cost += seq_page_cost * npages;
		run_cost += seq_page_cost * npages;
	}

	/*
	 * Charge a very small amount per inserted tuple, to reflect bookkeeping
	 * costs.  We use cpu_tuple_cost/10 for this.  This is needed to break the
	 * tie that would otherwise exist between nestloop with A outer,
	 * materialized B inner and nestloop with B outer, materialized A inner.
	 * The extra cost ensures we'll prefer materializing the smaller rel.
	 */
	startup_cost += cpu_tuple_cost * 0.1 * tuples;

	/*
	 * Also charge a small amount per extracted tuple.	We use cpu_tuple_cost
	 * so that it doesn't appear worthwhile to materialize a bare seqscan.
	 */
	run_cost += cpu_tuple_cost * tuples;

	path->startup_cost = startup_cost;
	path->total_cost = startup_cost + run_cost;
}

/*
 * cost_agg
 *		Determines and returns the cost of performing an Agg plan node,
 *		including the cost of its input.
 *
 * Note: when aggstrategy == AGG_SORTED, caller must ensure that input costs
 * are for appropriately-sorted input.
 */
void
cost_agg(Path *path, PlannerInfo *root,
		 AggStrategy aggstrategy, int numAggs,
		 int numGroupCols, double numGroups,
		 Cost input_startup_cost, Cost input_total_cost,
		 double input_tuples)
{
	Cost		startup_cost;
	Cost		total_cost;

	/*
	 * We charge one cpu_operator_cost per aggregate function per input tuple,
	 * and another one per output tuple (corresponding to transfn and finalfn
	 * calls respectively).  If we are grouping, we charge an additional
	 * cpu_operator_cost per grouping column per input tuple for grouping
	 * comparisons.
	 *
	 * We will produce a single output tuple if not grouping, and a tuple per
	 * group otherwise.  We charge cpu_tuple_cost for each output tuple.
	 *
	 * Note: in this cost model, AGG_SORTED and AGG_HASHED have exactly the
	 * same total CPU cost, but AGG_SORTED has lower startup cost.	If the
	 * input path is already sorted appropriately, AGG_SORTED should be
	 * preferred (since it has no risk of memory overflow).  This will happen
	 * as long as the computed total costs are indeed exactly equal --- but if
	 * there's roundoff error we might do the wrong thing.  So be sure that
	 * the computations below form the same intermediate values in the same
	 * order.
	 */
	if (aggstrategy == AGG_PLAIN)
	{
		startup_cost = input_total_cost;
		startup_cost += cpu_operator_cost * (input_tuples + 1) * numAggs;
		/* we aren't grouping */
		total_cost = startup_cost + cpu_tuple_cost;
	}
	else if (aggstrategy == AGG_SORTED)
	{
		/* Here we are able to deliver output on-the-fly */
		startup_cost = input_startup_cost;
		total_cost = input_total_cost;
		/* calcs phrased this way to match HASHED case, see note above */
		total_cost += cpu_operator_cost * input_tuples * numGroupCols;
		total_cost += cpu_operator_cost * input_tuples * numAggs;
		total_cost += cpu_operator_cost * numGroups * numAggs;
		total_cost += cpu_tuple_cost * numGroups;
	}
	else
	{
		/* must be AGG_HASHED */
		startup_cost = input_total_cost;
		startup_cost += cpu_operator_cost * input_tuples * numGroupCols;
		startup_cost += cpu_operator_cost * input_tuples * numAggs;
		total_cost = startup_cost;
		total_cost += cpu_operator_cost * numGroups * numAggs;
		total_cost += cpu_tuple_cost * numGroups;
	}

	path->startup_cost = startup_cost;
	path->total_cost = total_cost;
}
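
/*
 * Worked example for AGG_HASHED (illustrative figures; assumes the default
 * cpu_operator_cost = 0.0025 and cpu_tuple_cost = 0.01): with 100,000 input
 * tuples, 2 grouping columns, 1 aggregate and 1,000 groups,
 *		startup_cost = input_total_cost
 *			+ 0.0025 * 100000 * 2 + 0.0025 * 100000 * 1
 *			= input_total_cost + 750
 *		total_cost = startup_cost + 0.0025 * 1000 * 1 + 0.01 * 1000
 *			= startup_cost + 12.5
 */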

/*
 * cost_group
 *		Determines and returns the cost of performing a Group plan node,
 *		including the cost of its input.
 *
 * Note: caller must ensure that input costs are for appropriately-sorted
 * input.
 */
void
cost_group(Path *path, PlannerInfo *root,
		   int numGroupCols, double numGroups,
		   Cost input_startup_cost, Cost input_total_cost,
		   double input_tuples)
{
	Cost		startup_cost;
	Cost		total_cost;

	startup_cost = input_startup_cost;
	total_cost = input_total_cost;

	/*
	 * Charge one cpu_operator_cost per comparison per input tuple. We assume
	 * all columns get compared at most of the tuples.
	 */
	total_cost += cpu_operator_cost * input_tuples * numGroupCols;

	path->startup_cost = startup_cost;
	path->total_cost = total_cost;
}

/*
 * If a nestloop's inner path is an indexscan, be sure to use its estimated
 * output row count, which may be lower than the restriction-clause-only row
 * count of its parent.  (We don't include this case in the PATH_ROWS macro
 * because it applies *only* to a nestloop's inner relation.)  We have to
 * be prepared to recurse through Append nodes in case of an appendrel.
 */
static double
nestloop_inner_path_rows(Path *path)
{
	double		result;

	if (IsA(path, IndexPath))
		result = ((IndexPath *) path)->rows;
	else if (IsA(path, BitmapHeapPath))
		result = ((BitmapHeapPath *) path)->rows;
	else if (IsA(path, AppendPath))
	{
		ListCell   *l;

		result = 0;
		foreach(l, ((AppendPath *) path)->subpaths)
		{
			result += nestloop_inner_path_rows((Path *) lfirst(l));
		}
	}
	else
		result = PATH_ROWS(path);

	return result;
}

/*
 * cost_nestloop
 *	  Determines and returns the cost of joining two relations using the
 *	  nested loop algorithm.
 *
 * 'path' is already filled in except for the cost fields
 */
void
cost_nestloop(NestPath *path, PlannerInfo *root)
{
	Path	   *outer_path = path->outerjoinpath;
	Path	   *inner_path = path->innerjoinpath;
	Cost		startup_cost = 0;
	Cost		run_cost = 0;
	Cost		cpu_per_tuple;
	QualCost	restrict_qual_cost;
	double		outer_path_rows = PATH_ROWS(outer_path);
	double		inner_path_rows = nestloop_inner_path_rows(inner_path);
	double		ntuples;
	Selectivity joininfactor;

	if (!enable_nestloop)
		startup_cost += disable_cost;

1079
	/*
1080 1081 1082 1083 1084
	 * If we're doing JOIN_IN then we will stop scanning inner tuples for an
	 * outer tuple as soon as we have one match.  Account for the effects of
	 * this by scaling down the cost estimates in proportion to the JOIN_IN
	 * selectivity.  (This assumes that all the quals attached to the join are
	 * IN quals, which should be true.)
1085
	 */
1086
	joininfactor = join_in_selectivity(path, root);
1087

1088
	/* cost of source data */
1089

1090
	/*
1091
	 * NOTE: clearly, we must pay both outer and inner paths' startup_cost
1092 1093
	 * before we can start returning tuples, so the join's startup cost is
	 * their sum.  What's not so clear is whether the inner path's
1094 1095 1096
	 * startup_cost must be paid again on each rescan of the inner path. This
	 * is not true if the inner path is materialized or is a hashjoin, but
	 * probably is true otherwise.
1097 1098 1099
	 */
	startup_cost += outer_path->startup_cost + inner_path->startup_cost;
	run_cost += outer_path->total_cost - outer_path->startup_cost;
1100 1101 1102 1103 1104 1105 1106 1107
	if (IsA(inner_path, MaterialPath) ||
		IsA(inner_path, HashPath))
	{
		/* charge only run cost for each iteration of inner path */
	}
	else
	{
		/*
1108
		 * charge startup cost for each iteration of inner path, except we
1109 1110
		 * already charged the first startup_cost in our own startup
		 */
1111
		run_cost += (outer_path_rows - 1) * inner_path->startup_cost;
1112
	}
1113 1114
	run_cost += outer_path_rows *
		(inner_path->total_cost - inner_path->startup_cost) * joininfactor;
1115

1116
	/*
1117
	 * Compute number of tuples processed (not number emitted!)
1118
	 */
1119
	ntuples = outer_path_rows * inner_path_rows * joininfactor;
1120

1121
	/* CPU costs */
1122
	cost_qual_eval(&restrict_qual_cost, path->joinrestrictinfo);
1123 1124
	startup_cost += restrict_qual_cost.startup;
	cpu_per_tuple = cpu_tuple_cost + restrict_qual_cost.per_tuple;
1125 1126
	run_cost += cpu_per_tuple * ntuples;

1127 1128
	path->path.startup_cost = startup_cost;
	path->path.total_cost = startup_cost + run_cost;
1129 1130
}

1131
/*
1132
 * cost_mergejoin
1133 1134
 *	  Determines and returns the cost of joining two relations using the
 *	  merge join algorithm.
1135
 *
1136 1137 1138 1139 1140 1141
 * 'path' is already filled in except for the cost fields
 *
 * Notes: path's mergeclauses should be a subset of the joinrestrictinfo list;
 * outersortkeys and innersortkeys are lists of the keys to be used
 * to sort the outer and inner relations, or NIL if no explicit
 * sort is needed because the source path is already ordered.
1142
 */
1143
void
1144
cost_mergejoin(MergePath *path, PlannerInfo *root)
1145
{
1146 1147 1148 1149 1150
	Path	   *outer_path = path->jpath.outerjoinpath;
	Path	   *inner_path = path->jpath.innerjoinpath;
	List	   *mergeclauses = path->path_mergeclauses;
	List	   *outersortkeys = path->outersortkeys;
	List	   *innersortkeys = path->innersortkeys;
1151 1152 1153
	Cost		startup_cost = 0;
	Cost		run_cost = 0;
	Cost		cpu_per_tuple;
Bruce Momjian's avatar
1155 1156
	QualCost	merge_qual_cost;
	QualCost	qp_qual_cost;
1157
	RestrictInfo *firstclause;
1158 1159
	double		outer_path_rows = PATH_ROWS(outer_path);
	double		inner_path_rows = PATH_ROWS(inner_path);
1160 1161
	double		outer_rows,
				inner_rows;
1162 1163 1164
	double		mergejointuples,
				rescannedtuples;
	double		rescanratio;
Bruce Momjian's avatar
1166
				innerscansel;
Bruce Momjian's avatar
1168
	Path		sort_path;		/* dummy for result of cost_sort */
1169

1170
	if (!enable_mergejoin)
1171
		startup_cost += disable_cost;
1172

1173 1174
	/*
	 * Compute cost and selectivity of the mergequals and qpquals (other
1175 1176
	 * restriction clauses) separately.  We use approx_selectivity here for
	 * speed --- in most cases, any errors won't affect the result much.
1177
	 *
1178 1179
	 * Note: it's probably bogus to use the normal selectivity calculation
	 * here when either the outer or inner path is a UniquePath.
1180
	 */
1181 1182
	merge_selec = approx_selectivity(root, mergeclauses,
									 path->jpath.jointype);
1183
	cost_qual_eval(&merge_qual_cost, mergeclauses);
1184 1185 1186
	cost_qual_eval(&qp_qual_cost, path->jpath.joinrestrictinfo);
	qp_qual_cost.startup -= merge_qual_cost.startup;
	qp_qual_cost.per_tuple -= merge_qual_cost.per_tuple;
1187 1188

	/* approx # tuples passing the merge quals */
1189
	mergejointuples = clamp_row_est(merge_selec * outer_path_rows * inner_path_rows);
1190 1191

	/*
1192 1193 1194 1195 1196 1197
	 * When there are equal merge keys in the outer relation, the mergejoin
	 * must rescan any matching tuples in the inner relation. This means
	 * re-fetching inner tuples.  Our cost model for this is that a re-fetch
	 * costs the same as an original fetch, which is probably an overestimate;
	 * but on the other hand we ignore the bookkeeping costs of mark/restore.
	 * Not clear if it's worth developing a more refined model.
1198
	 *
1199 1200 1201 1202 1203
	 * The number of re-fetches can be estimated approximately as size of
	 * merge join output minus size of inner relation.	Assume that the
	 * distinct key values are 1, 2, ..., and denote the number of values of
	 * each key in the outer relation as m1, m2, ...; in the inner relation,
	 * n1, n2, ... Then we have
1204
	 *
Bruce Momjian's avatar
1206
	 *
1207 1208
	 * number of rescanned tuples = (m1 - 1) * n1 + (m2 - 1) * n2 + ... = m1 *
	 * n1 + m2 * n2 + ... - (n1 + n2 + ...) = size of join - size of inner
Bruce Momjian's avatar
1210
	 *
1211 1212 1213 1214
	 * This equation works correctly for outer tuples having no inner match
	 * (nk = 0), but not for inner tuples having no outer match (mk = 0); we
	 * are effectively subtracting those from the number of rescanned tuples,
	 * when we should not.	Can we do better without expensive selectivity
1215
	 * computations?
1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228
	 */
	if (IsA(outer_path, UniquePath))
		rescannedtuples = 0;
	else
	{
		rescannedtuples = mergejointuples - inner_path_rows;
		/* Must clamp because of possible underestimate */
		if (rescannedtuples < 0)
			rescannedtuples = 0;
	}
	/* We'll inflate inner run cost this much to account for rescanning */
	rescanratio = 1.0 + (rescannedtuples / inner_path_rows);

1229
	/*
1230 1231 1232 1233 1234
	 * A merge join will stop as soon as it exhausts either input stream
	 * (unless it's an outer join, in which case the outer side has to be
	 * scanned all the way anyway).  Estimate fraction of the left and right
	 * inputs that will actually need to be scanned. We use only the first
	 * (most significant) merge clause for this purpose.
1235
	 *
1236 1237 1238
	 * Since this calculation is somewhat expensive, and will be the same for
	 * all mergejoin paths associated with the merge clause, we cache the
	 * results in the RestrictInfo node.
1239
	 */
1240
	if (mergeclauses && path->jpath.jointype != JOIN_FULL)
1241
	{
1242
		firstclause = (RestrictInfo *) linitial(mergeclauses);
Bruce Momjian's avatar
1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259
			mergejoinscansel(root, (Node *) firstclause->clause,
							 &firstclause->left_mergescansel,
							 &firstclause->right_mergescansel);

		if (bms_is_subset(firstclause->left_relids, outer_path->parent->relids))
		{
			/* left side of clause is outer */
			outerscansel = firstclause->left_mergescansel;
			innerscansel = firstclause->right_mergescansel;
		}
		else
		{
			/* left side of clause is inner */
			outerscansel = firstclause->right_mergescansel;
			innerscansel = firstclause->left_mergescansel;
		}
1260 1261 1262 1263
		if (path->jpath.jointype == JOIN_LEFT)
			outerscansel = 1.0;
		else if (path->jpath.jointype == JOIN_RIGHT)
			innerscansel = 1.0;
1264 1265 1266
	}
	else
	{
1267
		/* cope with clauseless or full mergejoin */
1268
		outerscansel = innerscansel = 1.0;
1269 1270
	}

1271
	/* convert selectivity to row count; must scan at least one row */
1272 1273
	outer_rows = clamp_row_est(outer_path_rows * outerscansel);
	inner_rows = clamp_row_est(inner_path_rows * innerscansel);
1274 1275 1276

	/*
	 * Readjust scan selectivities to account for above rounding.  This is
1277 1278
	 * normally an insignificant effect, but when there are only a few rows in
	 * the inputs, failing to do this makes for a large percentage error.
1279
	 */
1280 1281
	outerscansel = outer_rows / outer_path_rows;
	innerscansel = inner_rows / inner_path_rows;
1282

1283
	/* cost of source data */
1284

1285 1286 1287
	if (outersortkeys)			/* do we need to sort outer? */
	{
		cost_sort(&sort_path,
1288
				  root,
1289
				  outersortkeys,
1290
				  outer_path->total_cost,
1291
				  outer_path_rows,
1292 1293
				  outer_path->parent->width);
		startup_cost += sort_path.startup_cost;
1294
		run_cost += (sort_path.total_cost - sort_path.startup_cost)
1295
			* outerscansel;
1296 1297 1298 1299
	}
	else
	{
		startup_cost += outer_path->startup_cost;
1300
		run_cost += (outer_path->total_cost - outer_path->startup_cost)
1301
			* outerscansel;
1302
	}
1303

1304 1305 1306
	if (innersortkeys)			/* do we need to sort inner? */
	{
		cost_sort(&sort_path,
1307
				  root,
1308
				  innersortkeys,
1309
				  inner_path->total_cost,
1310
				  inner_path_rows,
1311 1312
				  inner_path->parent->width);
		startup_cost += sort_path.startup_cost;
1313
		run_cost += (sort_path.total_cost - sort_path.startup_cost)
1314
			* innerscansel * rescanratio;
1315 1316 1317 1318
	}
	else
	{
		startup_cost += inner_path->startup_cost;
1319
		run_cost += (inner_path->total_cost - inner_path->startup_cost)
1320
			* innerscansel * rescanratio;
1321
	}
1322

1323 1324
	/* CPU costs */

1325
	/*
1326 1327 1328 1329 1330
	 * If we're doing JOIN_IN then we will stop outputting inner tuples for an
	 * outer tuple as soon as we have one match.  Account for the effects of
	 * this by scaling down the cost estimates in proportion to the expected
	 * output size.  (This assumes that all the quals attached to the join are
	 * IN quals, which should be true.)
1331
	 */
1332
	joininfactor = join_in_selectivity(&path->jpath, root);
1333 1334

	/*
1335 1336 1337 1338 1339
	 * The number of tuple comparisons needed is approximately number of outer
	 * rows plus number of inner rows plus number of rescanned tuples (can we
	 * refine this?).  At each one, we need to evaluate the mergejoin quals.
	 * NOTE: JOIN_IN mode does not save any work here, so do NOT include
	 * joininfactor.
1340 1341 1342 1343
	 */
	startup_cost += merge_qual_cost.startup;
	run_cost += merge_qual_cost.per_tuple *
		(outer_rows + inner_rows * rescanratio);
1344 1345 1346 1347

	/*
	 * For each tuple that gets through the mergejoin proper, we charge
	 * cpu_tuple_cost plus the cost of evaluating additional restriction
1348 1349 1350
	 * clauses that are to be applied at the join.	(This is pessimistic since
	 * not all of the quals may get evaluated at each tuple.)  This work is
	 * skipped in JOIN_IN mode, so apply the factor.
1351
	 */
1352 1353 1354
	startup_cost += qp_qual_cost.startup;
	cpu_per_tuple = cpu_tuple_cost + qp_qual_cost.per_tuple;
	run_cost += cpu_per_tuple * mergejointuples * joininfactor;
1355

1356 1357
	path->jpath.path.startup_cost = startup_cost;
	path->jpath.path.total_cost = startup_cost + run_cost;
1358 1359
}

1360
/*
1361
 * cost_hashjoin
1362 1363
 *	  Determines and returns the cost of joining two relations using the
 *	  hash join algorithm.
1364
 *
1365 1366 1367
 * 'path' is already filled in except for the cost fields
 *
 * Note: path's hashclauses should be a subset of the joinrestrictinfo list
1368
 */
1369
void
1370
cost_hashjoin(HashPath *path, PlannerInfo *root)
1371
{
1372 1373 1374
	Path	   *outer_path = path->jpath.outerjoinpath;
	Path	   *inner_path = path->jpath.innerjoinpath;
	List	   *hashclauses = path->path_hashclauses;
1375 1376 1377
	Cost		startup_cost = 0;
	Cost		run_cost = 0;
	Cost		cpu_per_tuple;
Bruce Momjian's avatar
1379 1380 1381 1382 1383 1384
	QualCost	hash_qual_cost;
	QualCost	qp_qual_cost;
	double		hashjointuples;
	double		outer_path_rows = PATH_ROWS(outer_path);
	double		inner_path_rows = PATH_ROWS(inner_path);
	double		outerbytes = relation_byte_size(outer_path_rows,
1385
												outer_path->parent->width);
1386
	double		innerbytes = relation_byte_size(inner_path_rows,
1387
												inner_path->parent->width);
1388
	int			num_hashclauses = list_length(hashclauses);
1389
	int			numbuckets;
1390
	int			numbatches;
1391
	double		virtualbuckets;
1392
	Selectivity innerbucketsize;
Bruce Momjian's avatar
1394
	ListCell   *hcl;
1395

1396
	if (!enable_hashjoin)
1397
		startup_cost += disable_cost;
1398

1399 1400
	/*
	 * Compute cost and selectivity of the hashquals and qpquals (other
1401 1402
	 * restriction clauses) separately.  We use approx_selectivity here for
	 * speed --- in most cases, any errors won't affect the result much.
1403
	 *
1404 1405
	 * Note: it's probably bogus to use the normal selectivity calculation
	 * here when either the outer or inner path is a UniquePath.
1406
	 */
1407 1408
	hash_selec = approx_selectivity(root, hashclauses,
									path->jpath.jointype);
1409
	cost_qual_eval(&hash_qual_cost, hashclauses);
1410 1411 1412
	cost_qual_eval(&qp_qual_cost, path->jpath.joinrestrictinfo);
	qp_qual_cost.startup -= hash_qual_cost.startup;
	qp_qual_cost.per_tuple -= hash_qual_cost.per_tuple;
1413 1414

	/* approx # tuples passing the hash quals */
1415
	hashjointuples = clamp_row_est(hash_selec * outer_path_rows * inner_path_rows);
1416

1417
	/* cost of source data */
1418 1419 1420
	startup_cost += outer_path->startup_cost;
	run_cost += outer_path->total_cost - outer_path->startup_cost;
	startup_cost += inner_path->total_cost;
1421

1422
	/*
1423 1424
	 * Cost of computing hash function: must do it once per input tuple. We
	 * charge one cpu_operator_cost for each column's hash function.
1425
	 *
1426 1427 1428
	 * XXX when a hashclause is more complex than a single operator, we really
	 * should charge the extra eval costs of the left or right side, as
	 * appropriate, here.  This seems more work than it's worth at the moment.
1429 1430 1431
	 */
	startup_cost += cpu_operator_cost * num_hashclauses * inner_path_rows;
	run_cost += cpu_operator_cost * num_hashclauses * outer_path_rows;
1432

1433
	/* Get hash table size that executor would use for inner relation */
1434
	ExecChooseHashTableSize(inner_path_rows,
1435
							inner_path->parent->width,
1436
							&numbuckets,
1437
							&numbatches);
1438
	virtualbuckets = (double) numbuckets *(double) numbatches;
1439

1440
	/*
1441 1442 1443
	 * Determine bucketsize fraction for inner relation.  We use the smallest
	 * bucketsize estimated for any individual hashclause; this is undoubtedly
	 * conservative.
1444
	 *
1445 1446
	 * BUT: if inner relation has been unique-ified, we can assume it's good
	 * for hashing.  This is important both because it's the right answer, and
1447 1448
	 * because we avoid contaminating the cache with a value that's wrong for
	 * non-unique-ified paths.
1449
	 */
1450 1451 1452
	if (IsA(inner_path, UniquePath))
		innerbucketsize = 1.0 / virtualbuckets;
	else
1453
	{
1454 1455 1456 1457 1458
		innerbucketsize = 1.0;
		foreach(hcl, hashclauses)
		{
			RestrictInfo *restrictinfo = (RestrictInfo *) lfirst(hcl);
			Selectivity thisbucketsize;
1459

1460
			Assert(IsA(restrictinfo, RestrictInfo));
1461

1462
			/*
1463 1464
			 * First we have to figure out which side of the hashjoin clause
			 * is the inner side.
1465 1466
			 *
			 * Since we tend to visit the same clauses over and over when
1467 1468
			 * planning a large query, we cache the bucketsize estimate in the
			 * RestrictInfo node to avoid repeated lookups of statistics.
1469
			 */
1470 1471
			if (bms_is_subset(restrictinfo->right_relids,
							  inner_path->parent->relids))
1472
			{
1473 1474 1475 1476 1477 1478 1479
				/* righthand side is inner */
				thisbucketsize = restrictinfo->right_bucketsize;
				if (thisbucketsize < 0)
				{
					/* not cached yet */
					thisbucketsize =
						estimate_hash_bucketsize(root,
1480
										   get_rightop(restrictinfo->clause),
1481 1482 1483
												 virtualbuckets);
					restrictinfo->right_bucketsize = thisbucketsize;
				}
1484
			}
1485
			else
1486
			{
1487 1488
				Assert(bms_is_subset(restrictinfo->left_relids,
									 inner_path->parent->relids));
1489 1490 1491 1492 1493 1494 1495
				/* lefthand side is inner */
				thisbucketsize = restrictinfo->left_bucketsize;
				if (thisbucketsize < 0)
				{
					/* not cached yet */
					thisbucketsize =
						estimate_hash_bucketsize(root,
1496
											get_leftop(restrictinfo->clause),
1497 1498 1499
												 virtualbuckets);
					restrictinfo->left_bucketsize = thisbucketsize;
				}
1500 1501
			}

1502 1503 1504
			if (innerbucketsize > thisbucketsize)
				innerbucketsize = thisbucketsize;
		}
1505 1506
	}

Bruce Momjian's avatar
1508
	 * If inner relation is too big then we will need to "batch" the join,
1509 1510 1511 1512
	 * which implies writing and reading most of the tuples to disk an extra
	 * time.  Charge one cost unit per page of I/O (correct since it should be
	 * nice and sequential...).  Writing the inner rel counts as startup cost,
	 * all the rest as run cost.
1513
	 */
1514
	if (numbatches > 1)
1515
	{
1516
		double		outerpages = page_size(outer_path_rows,
1517
										   outer_path->parent->width);
1518
		double		innerpages = page_size(inner_path_rows,
1519
										   inner_path->parent->width);
1520 1521 1522 1523

		startup_cost += innerpages;
		run_cost += innerpages + 2 * outerpages;
	}
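
	/*
	 * Illustrative arithmetic for the batching charge above (not from the
	 * original source): if the inner relation spans 1,000 pages and the
	 * outer relation 5,000 pages, a multi-batch join adds 1,000 cost units
	 * to startup (writing out the inner rel) and 1,000 + 2 * 5,000 = 11,000
	 * cost units to run cost (re-reading the inner rel, plus writing and
	 * re-reading the outer rel).
	 */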

	/* CPU costs */

	/*
	 * If we're doing JOIN_IN then we will stop comparing inner tuples to an
	 * outer tuple as soon as we have one match.  Account for the effects of
	 * this by scaling down the cost estimates in proportion to the expected
	 * output size.  (This assumes that all the quals attached to the join are
	 * IN quals, which should be true.)
	 */
	joininfactor = join_in_selectivity(&path->jpath, root);

	/*
	 * The number of tuple comparisons needed is the number of outer tuples
	 * times the typical number of tuples in a hash bucket, which is the inner
	 * relation size times its bucketsize fraction.  At each one, we need to
	 * evaluate the hashjoin quals.  (Note: charging the full qual eval cost
	 * at each tuple is pessimistic, since we don't evaluate the quals unless
	 * the hash values match exactly.)
	 */
	startup_cost += hash_qual_cost.startup;
	run_cost += hash_qual_cost.per_tuple *
		outer_path_rows * clamp_row_est(inner_path_rows * innerbucketsize) *
		joininfactor;
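
	/*
	 * Illustrative arithmetic for the charge above (not from the original
	 * source): with 10,000 outer rows, 1,000,000 inner rows, and an
	 * innerbucketsize of 0.0001, each outer tuple is compared against about
	 * 1,000,000 * 0.0001 = 100 bucket mates, so roughly 10,000 * 100 =
	 * 1,000,000 hashclause evaluations get charged at
	 * hash_qual_cost.per_tuple apiece (scaled by joininfactor for JOIN_IN).
	 */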

	/*
	 * For each tuple that gets through the hashjoin proper, we charge
	 * cpu_tuple_cost plus the cost of evaluating additional restriction
	 * clauses that are to be applied at the join.  (This is pessimistic since
	 * not all of the quals may get evaluated at each tuple.)
	 */
	startup_cost += qp_qual_cost.startup;
	cpu_per_tuple = cpu_tuple_cost + qp_qual_cost.per_tuple;
	run_cost += cpu_per_tuple * hashjointuples * joininfactor;

	/*
	 * Bias against putting larger relation on inside.  We don't want an
	 * absolute prohibition, though, since larger relation might have better
	 * bucketsize --- and we can't trust the size estimates unreservedly,
	 * anyway.  Instead, inflate the run cost by the square root of the size
	 * ratio.  (Why square root?  No real good reason, but it seems
	 * reasonable...)
	 *
	 * Note: before 7.4 we implemented this by inflating startup cost; but if
	 * there's a disable_cost component in the input paths' startup cost, that
	 * unfairly penalizes the hash.  Probably it'd be better to keep track of
	 * disable penalty separately from cost.
	 */
	if (innerbytes > outerbytes && outerbytes > 0)
		run_cost *= sqrt(innerbytes / outerbytes);
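
	/*
	 * Illustrative arithmetic for the bias above (not from the original
	 * source): if the inner relation is estimated at four times the byte
	 * size of the outer relation, run_cost is inflated by sqrt(4) = 2;
	 * a 100-to-1 size ratio inflates it by a factor of 10.
	 */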

	path->jpath.path.startup_cost = startup_cost;
	path->jpath.path.total_cost = startup_cost + run_cost;
}


/*
 * cost_qual_eval
 *		Estimate the CPU costs of evaluating a WHERE clause.
 *		The input can be either an implicitly-ANDed list of boolean
 *		expressions, or a list of RestrictInfo nodes.
 *		The result includes both a one-time (startup) component,
 *		and a per-evaluation component.
 */
void
cost_qual_eval(QualCost *cost, List *quals)
{
	ListCell   *l;

	cost->startup = 0;
	cost->per_tuple = 0;

	/* We don't charge any cost for the implicit ANDing at top level ... */

	foreach(l, quals)
	{
		Node	   *qual = (Node *) lfirst(l);

		/*
		 * RestrictInfo nodes contain an eval_cost field reserved for this
		 * routine's use, so that it's not necessary to evaluate the qual
		 * clause's cost more than once.  If the clause's cost hasn't been
		 * computed yet, the field's startup value will contain -1.
		 *
		 * If the RestrictInfo is marked pseudoconstant, it will be tested
		 * only once, so treat its cost as all startup cost.
		 */
		if (qual && IsA(qual, RestrictInfo))
		{
			RestrictInfo *rinfo = (RestrictInfo *) qual;

			if (rinfo->eval_cost.startup < 0)
			{
				rinfo->eval_cost.startup = 0;
				rinfo->eval_cost.per_tuple = 0;
				cost_qual_eval_walker((Node *) rinfo->clause,
									  &rinfo->eval_cost);
				if (rinfo->pseudoconstant)
				{
					/* count one execution during startup */
					rinfo->eval_cost.startup += rinfo->eval_cost.per_tuple;
					rinfo->eval_cost.per_tuple = 0;
				}
			}
			cost->startup += rinfo->eval_cost.startup;
			cost->per_tuple += rinfo->eval_cost.per_tuple;
		}
		else
		{
			/* If it's a bare expression, must always do it the hard way */
			cost_qual_eval_walker(qual, cost);
		}
	}
}
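
/*
 * Minimal usage sketch for cost_qual_eval (illustrative only; "quals" is a
 * hypothetical restriction list held by the caller, and the startup/run
 * variables are the caller's own cost accumulators):
 *
 *		QualCost	qual_cost;
 *
 *		cost_qual_eval(&qual_cost, quals);
 *		startup_cost += qual_cost.startup;
 *		cpu_per_tuple = cpu_tuple_cost + qual_cost.per_tuple;
 *		run_cost += cpu_per_tuple * tuples_scanned;
 */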

static bool
cost_qual_eval_walker(Node *node, QualCost *total)
{
	if (node == NULL)
		return false;

	/*
	 * Our basic strategy is to charge one cpu_operator_cost for each operator
	 * or function node in the given tree.  Vars and Consts are charged zero,
	 * and so are boolean operators (AND, OR, NOT).  Simplistic, but a lot
	 * better than no model at all.
	 *
	 * Should we try to account for the possibility of short-circuit
	 * evaluation of AND/OR?
	 */
	if (IsA(node, FuncExpr) ||
		IsA(node, OpExpr) ||
		IsA(node, DistinctExpr) ||
		IsA(node, NullIfExpr))
		total->per_tuple += cpu_operator_cost;
	else if (IsA(node, ScalarArrayOpExpr))
	{
		/*
		 * Estimate that the operator will be applied to about half of the
		 * array elements before the answer is determined.
		 */
		ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) node;
		Node	   *arraynode = (Node *) lsecond(saop->args);

		total->per_tuple +=
			cpu_operator_cost * estimate_array_length(arraynode) * 0.5;
	}
	else if (IsA(node, RowCompareExpr))
	{
		/* Conservatively assume we will check all the columns */
		RowCompareExpr *rcexpr = (RowCompareExpr *) node;

		total->per_tuple += cpu_operator_cost * list_length(rcexpr->opnos);
	}
	else if (IsA(node, SubLink))
	{
		/* This routine should not be applied to un-planned expressions */
		elog(ERROR, "cannot handle unplanned sub-select");
	}
	else if (IsA(node, SubPlan))
	{
		/*
		 * A subplan node in an expression typically indicates that the
		 * subplan will be executed on each evaluation, so charge accordingly.
		 * (Sub-selects that can be executed as InitPlans have already been
		 * removed from the expression.)
		 *
		 * An exception occurs when we have decided we can implement the
		 * subplan by hashing.
		 */
		SubPlan    *subplan = (SubPlan *) node;
		Plan	   *plan = subplan->plan;

		if (subplan->useHashTable)
		{
			/*
			 * If we are using a hash table for the subquery outputs, then the
			 * cost of evaluating the query is a one-time cost.  We charge one
			 * cpu_operator_cost per tuple for the work of loading the
			 * hashtable, too.
			 */
			total->startup += plan->total_cost +
				cpu_operator_cost * plan->plan_rows;

			/*
			 * The per-tuple costs include the cost of evaluating the lefthand
			 * expressions, plus the cost of probing the hashtable.  Recursion
			 * into the testexpr will handle the lefthand expressions
			 * properly, and will count one cpu_operator_cost for each
			 * comparison operator.  That is probably too low for the probing
			 * cost, but it's hard to make a better estimate, so live with it
			 * for now.
			 */
		}
		else
		{
			/*
			 * Otherwise we will be rescanning the subplan output on each
			 * evaluation.  We need to estimate how much of the output we will
			 * actually need to scan.  NOTE: this logic should agree with the
			 * estimates used by make_subplan() in plan/subselect.c.
			 */
			Cost		plan_run_cost = plan->total_cost - plan->startup_cost;

			if (subplan->subLinkType == EXISTS_SUBLINK)
			{
				/* we only need to fetch 1 tuple */
				total->per_tuple += plan_run_cost / plan->plan_rows;
			}
			else if (subplan->subLinkType == ALL_SUBLINK ||
					 subplan->subLinkType == ANY_SUBLINK)
			{
				/* assume we need 50% of the tuples */
				total->per_tuple += 0.50 * plan_run_cost;
				/* also charge a cpu_operator_cost per row examined */
				total->per_tuple += 0.50 * plan->plan_rows * cpu_operator_cost;
			}
			else
			{
				/* assume we need all tuples */
				total->per_tuple += plan_run_cost;
			}
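
			/*
			 * Illustrative arithmetic for the charges above (not from the
			 * original source): for a subplan with run cost 100 and 1,000
			 * output rows, an EXISTS sublink adds 100 / 1,000 = 0.1 per
			 * outer evaluation, while an ANY/ALL sublink adds 0.5 * 100 = 50
			 * plus 0.5 * 1,000 * cpu_operator_cost per evaluation.
			 */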

			/*
			 * Also account for subplan's startup cost.  If the subplan is
			 * uncorrelated or undirect correlated, AND its topmost node is a
			 * Sort or Material node, assume that we'll only need to pay its
			 * startup cost once; otherwise assume we pay the startup cost
			 * every time.
			 */
			if (subplan->parParam == NIL &&
				(IsA(plan, Sort) ||
				 IsA(plan, Material)))
				total->startup += plan->startup_cost;
			else
				total->per_tuple += plan->startup_cost;
		}
	}

	return expression_tree_walker(node, cost_qual_eval_walker,
								  (void *) total);
}
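
/*
 * Illustrative arithmetic for cost_qual_eval_walker (not from the original
 * source): for a qual such as "a > 5 AND b = ANY(arr)" where arr is a
 * 10-element array constant, the walker charges one cpu_operator_cost for
 * the ">" operator, nothing for the AND, and 10 * 0.5 = 5 operator costs
 * for the ScalarArrayOpExpr, for a total of about 6 * cpu_operator_cost
 * per tuple.
 */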


/*
 * approx_selectivity
 *		Quick-and-dirty estimation of clause selectivities.
 *		The input can be either an implicitly-ANDed list of boolean
 *		expressions, or a list of RestrictInfo nodes (typically the latter).
 *
 * This is quick-and-dirty because we bypass clauselist_selectivity, and
 * simply multiply the independent clause selectivities together.  Now
 * clauselist_selectivity often can't do any better than that anyhow, but
 * for some situations (such as range constraints) it is smarter.  However,
 * we can't effectively cache the results of clauselist_selectivity, whereas
 * the individual clause selectivities can be and are cached.
 *
 * Since we are only using the results to estimate how many potential
 * output tuples are generated and passed through qpqual checking, it
 * seems OK to live with the approximation.
 */
static Selectivity
approx_selectivity(PlannerInfo *root, List *quals, JoinType jointype)
{
	Selectivity total = 1.0;
	ListCell   *l;

	foreach(l, quals)
	{
		Node	   *qual = (Node *) lfirst(l);

		/* Note that clause_selectivity will be able to cache its result */
		total *= clause_selectivity(root, qual, 0, jointype);
	}
	return total;
}
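
/*
 * Illustrative arithmetic for approx_selectivity (not from the original
 * source): given two clauses with cached selectivities 0.10 and 0.20, the
 * approximate combined selectivity is simply 0.10 * 0.20 = 0.02, ignoring
 * any correlation that clauselist_selectivity might have detected.
 */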


/*
 * set_baserel_size_estimates
 *		Set the size estimates for the given base relation.
 *
 * The rel's targetlist and restrictinfo list must have been constructed
 * already.
 *
 * We set the following fields of the rel node:
 *	rows: the estimated number of output tuples (after applying
 *		  restriction clauses).
 *	width: the estimated average output tuple width in bytes.
 *	baserestrictcost: estimated cost of evaluating baserestrictinfo clauses.
 */
void
set_baserel_size_estimates(PlannerInfo *root, RelOptInfo *rel)
{
	double		nrows;

	/* Should only be applied to base relations */
	Assert(rel->relid > 0);

	nrows = rel->tuples *
		clauselist_selectivity(root,
							   rel->baserestrictinfo,
							   0,
							   JOIN_INNER);

	rel->rows = clamp_row_est(nrows);

	cost_qual_eval(&rel->baserestrictcost, rel->baserestrictinfo);

	set_rel_width(root, rel);
}
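
/*
 * Illustrative arithmetic for set_baserel_size_estimates (not from the
 * original source): for a relation with rel->tuples = 1,000,000 and a
 * baserestrictinfo list whose combined selectivity comes out to 0.05, the
 * estimated output is clamp_row_est(1,000,000 * 0.05) = 50,000 rows.
 */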

/*
 * set_joinrel_size_estimates
 *		Set the size estimates for the given join relation.
 *
 * The rel's targetlist must have been constructed already, and a
 * restriction clause list that matches the given component rels must
 * be provided.
 *
 * Since there is more than one way to make a joinrel for more than two
 * base relations, the results we get here could depend on which component
 * rel pair is provided.  In theory we should get the same answers no matter
 * which pair is provided; in practice, since the selectivity estimation
 * routines don't handle all cases equally well, we might not.  But there's
 * not much to be done about it.  (Would it make sense to repeat the
 * calculations for each pair of input rels that's encountered, and somehow
 * average the results?  Probably way more trouble than it's worth.)
 *
 * It's important that the results for symmetric JoinTypes be symmetric,
 * eg, (rel1, rel2, JOIN_LEFT) should produce the same result as (rel2,
 * rel1, JOIN_RIGHT).  Also, JOIN_IN should produce the same result as
 * JOIN_UNIQUE_INNER, likewise JOIN_REVERSE_IN == JOIN_UNIQUE_OUTER.
 *
 * We set only the rows field here.  The width field was already set by
 * build_joinrel_tlist, and baserestrictcost is not used for join rels.
 */
void
set_joinrel_size_estimates(PlannerInfo *root, RelOptInfo *rel,
						   RelOptInfo *outer_rel,
						   RelOptInfo *inner_rel,
						   JoinType jointype,
						   List *restrictlist)
{
	Selectivity selec;
	double		nrows;
	UniquePath *upath;

	/*
	 * Compute joinclause selectivity.  Note that we are only considering
	 * clauses that become restriction clauses at this join level; we are not
	 * double-counting them because they were not considered in estimating the
	 * sizes of the component rels.
	 */
	selec = clauselist_selectivity(root,
								   restrictlist,
								   0,
								   jointype);

	/*
	 * Basically, we multiply size of Cartesian product by selectivity.
	 *
	 * If we are doing an outer join, take that into account: the output must
	 * be at least as large as the non-nullable input.  (Is there any chance
	 * of being even smarter?)  (XXX this is not really right, because it
	 * assumes all the restriction clauses are join clauses; we should figure
	 * pushed-down clauses separately.)
	 *
	 * For JOIN_IN and variants, the Cartesian product is figured with respect
	 * to a unique-ified input, and then we can clamp to the size of the other
	 * input.
	 */
	switch (jointype)
	{
		case JOIN_INNER:
			nrows = outer_rel->rows * inner_rel->rows * selec;
			break;
		case JOIN_LEFT:
			nrows = outer_rel->rows * inner_rel->rows * selec;
			if (nrows < outer_rel->rows)
				nrows = outer_rel->rows;
			break;
		case JOIN_RIGHT:
			nrows = outer_rel->rows * inner_rel->rows * selec;
			if (nrows < inner_rel->rows)
				nrows = inner_rel->rows;
			break;
		case JOIN_FULL:
			nrows = outer_rel->rows * inner_rel->rows * selec;
			if (nrows < outer_rel->rows)
				nrows = outer_rel->rows;
			if (nrows < inner_rel->rows)
				nrows = inner_rel->rows;
			break;
		case JOIN_IN:
		case JOIN_UNIQUE_INNER:
			upath = create_unique_path(root, inner_rel,
									   inner_rel->cheapest_total_path);
			nrows = outer_rel->rows * upath->rows * selec;
			if (nrows > outer_rel->rows)
				nrows = outer_rel->rows;
			break;
		case JOIN_REVERSE_IN:
		case JOIN_UNIQUE_OUTER:
			upath = create_unique_path(root, outer_rel,
									   outer_rel->cheapest_total_path);
			nrows = upath->rows * inner_rel->rows * selec;
			if (nrows > inner_rel->rows)
				nrows = inner_rel->rows;
			break;
		default:
			elog(ERROR, "unrecognized join type: %d", (int) jointype);
			nrows = 0;			/* keep compiler quiet */
			break;
	}

	rel->rows = clamp_row_est(nrows);
}
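
/*
 * Illustrative arithmetic for the clamping above (not from the original
 * source): with outer_rel->rows = 1,000, inner_rel->rows = 10, and a join
 * selectivity of 0.001, the raw Cartesian estimate is 1,000 * 10 * 0.001 =
 * 10 rows; for JOIN_LEFT the result is clamped up to the 1,000 outer rows,
 * since every outer row must appear at least once in the output.
 */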

/*
 * join_in_selectivity
 *	  Determines the factor by which a JOIN_IN join's result is expected
 *	  to be smaller than an ordinary inner join.
 *
 * 'path' is already filled in except for the cost fields
 */
static Selectivity
join_in_selectivity(JoinPath *path, PlannerInfo *root)
{
	RelOptInfo *innerrel;
	UniquePath *innerunique;
	Selectivity selec;
	double		nrows;

	/* Return 1.0 whenever it's not JOIN_IN */
	if (path->jointype != JOIN_IN)
		return 1.0;

	/*
	 * Return 1.0 if the inner side is already known unique.  The case where
	 * the inner path is already a UniquePath probably cannot happen in
	 * current usage, but check it anyway for completeness.  The interesting
	 * case is where we've determined the inner relation itself is unique,
	 * which we can check by looking at the rows estimate for its UniquePath.
	 */
	if (IsA(path->innerjoinpath, UniquePath))
		return 1.0;
	innerrel = path->innerjoinpath->parent;
	innerunique = create_unique_path(root,
									 innerrel,
									 innerrel->cheapest_total_path);
	if (innerunique->rows >= innerrel->rows)
		return 1.0;

	/*
	 * Compute same result set_joinrel_size_estimates would compute for
	 * JOIN_INNER.  Note that we use the input rels' absolute size estimates,
	 * not PATH_ROWS() which might be less; if we used PATH_ROWS() we'd be
	 * double-counting the effects of any join clauses used in input scans.
	 */
	selec = clauselist_selectivity(root,
								   path->joinrestrictinfo,
								   0,
								   JOIN_INNER);
	nrows = path->outerjoinpath->parent->rows * innerrel->rows * selec;

	nrows = clamp_row_est(nrows);

	/* See if it's larger than the actual JOIN_IN size estimate */
	if (nrows > path->path.parent->rows)
		return path->path.parent->rows / nrows;
	else
		return 1.0;
}
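
/*
 * Illustrative arithmetic for join_in_selectivity (not from the original
 * source): if the hypothetical JOIN_INNER estimate works out to 50,000 rows
 * but the JOIN_IN joinrel was sized at 5,000 rows, the function returns
 * 5,000 / 50,000 = 0.1, and the join cost functions scale their per-tuple
 * comparison charges down by that factor.
 */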

/*
 * set_function_size_estimates
 *		Set the size estimates for a base relation that is a function call.
 *
 * The rel's targetlist and restrictinfo list must have been constructed
 * already.
 *
 * We set the same fields as set_baserel_size_estimates.
 */
void
set_function_size_estimates(PlannerInfo *root, RelOptInfo *rel)
{
	RangeTblEntry *rte;

	/* Should only be applied to base relations that are functions */
	Assert(rel->relid > 0);
	rte = rt_fetch(rel->relid, root->parse->rtable);
	Assert(rte->rtekind == RTE_FUNCTION);

	/*
	 * Estimate number of rows the function itself will return.
	 *
	 * XXX no idea how to do this yet; but we can at least check whether
	 * function returns set or not...
	 */
	if (expression_returns_set(rte->funcexpr))
		rel->tuples = 1000;
	else
		rel->tuples = 1;

	/* Now estimate number of output rows, etc */
	set_baserel_size_estimates(root, rel);
}


/*
 * set_rel_width
 *		Set the estimated output width of a base relation.
 *
 * NB: this works best on plain relations because it prefers to look at
 * real Vars.  It will fail to make use of pg_statistic info when applied
 * to a subquery relation, even if the subquery outputs are simple vars
 * that we could have gotten info for.  Is it worth trying to be smarter
 * about subqueries?
 *
 * The per-attribute width estimates are cached for possible re-use while
 * building join relations.
 */
static void
set_rel_width(PlannerInfo *root, RelOptInfo *rel)
{
	int32		tuple_width = 0;
	ListCell   *tllist;

	foreach(tllist, rel->reltargetlist)
	{
		Var		   *var = (Var *) lfirst(tllist);
		int			ndx;
		Oid			relid;
		int32		item_width;

		/* For now, punt on whole-row child Vars */
		if (!IsA(var, Var))
		{
			tuple_width += 32;	/* arbitrary */
			continue;
		}

		ndx = var->varattno - rel->min_attr;

		/*
		 * The width probably hasn't been cached yet, but may as well check
		 */
		if (rel->attr_widths[ndx] > 0)
		{
			tuple_width += rel->attr_widths[ndx];
			continue;
		}

		relid = getrelid(var->varno, root->parse->rtable);
		if (relid != InvalidOid)
		{
			item_width = get_attavgwidth(relid, var->varattno);
			if (item_width > 0)
			{
				rel->attr_widths[ndx] = item_width;
				tuple_width += item_width;
				continue;
			}
		}

		/*
		 * Not a plain relation, or can't find statistics for it.  Estimate
		 * using just the type info.
		 */
		item_width = get_typavgwidth(var->vartype, var->vartypmod);
		Assert(item_width > 0);
		rel->attr_widths[ndx] = item_width;
		tuple_width += item_width;
	}
	Assert(tuple_width >= 0);
	rel->width = tuple_width;
}

/*
 * relation_byte_size
 *	  Estimate the storage space in bytes for a given number of tuples
 *	  of a given width (size in bytes).
 */
static double
relation_byte_size(double tuples, int width)
{
	return tuples * (MAXALIGN(width) + MAXALIGN(sizeof(HeapTupleHeaderData)));
}

/*
 * page_size
 *	  Returns an estimate of the number of pages covered by a given
 *	  number of tuples of a given width (size in bytes).
 */
static double
page_size(double tuples, int width)
{
	return ceil(relation_byte_size(tuples, width) / BLCKSZ);
}
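
/*
 * Illustrative arithmetic for page_size (not from the original source, and
 * assuming a typical build with 8-byte MAXALIGN, a heap tuple header that
 * rounds to 32 bytes, and the default 8192-byte BLCKSZ): 10,000 tuples of
 * width 96 are charged 10,000 * (96 + 32) = 1,280,000 bytes, which
 * page_size rounds up to ceil(1,280,000 / 8192) = 157 pages.
 */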