diff options
Diffstat (limited to 'src/backend/optimizer/path/costsize.c')
| -rw-r--r-- | src/backend/optimizer/path/costsize.c | 235 |
1 files changed, 73 insertions, 162 deletions
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index d9d8c89864..14c3cf1f21 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -54,7 +54,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.193 2008/08/14 18:47:59 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.194 2008/08/16 00:01:36 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -118,10 +118,8 @@ static MergeScanSelCache *cached_scansel(PlannerInfo *root, RestrictInfo *rinfo, PathKey *pathkey); static bool cost_qual_eval_walker(Node *node, cost_qual_eval_context *context); -static Selectivity approx_selectivity(PlannerInfo *root, List *quals, - SpecialJoinInfo *sjinfo); -static Selectivity join_in_selectivity(JoinPath *path, PlannerInfo *root, - SpecialJoinInfo *sjinfo); +static double approx_tuple_count(PlannerInfo *root, JoinPath *path, + List *quals, SpecialJoinInfo *sjinfo); static void set_rel_width(PlannerInfo *root, RelOptInfo *rel); static double relation_byte_size(double tuples, int width); static double page_size(double tuples, int width); @@ -1288,20 +1286,10 @@ cost_nestloop(NestPath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo) double outer_path_rows = PATH_ROWS(outer_path); double inner_path_rows = nestloop_inner_path_rows(inner_path); double ntuples; - Selectivity joininfactor; if (!enable_nestloop) startup_cost += disable_cost; - /* - * If we're doing JOIN_IN then we will stop scanning inner tuples for an - * outer tuple as soon as we have one match. Account for the effects of - * this by scaling down the cost estimates in proportion to the JOIN_IN - * selectivity. (This assumes that all the quals attached to the join are - * IN quals, which should be true.) - */ - joininfactor = join_in_selectivity(path, root, sjinfo); - /* cost of source data */ /* @@ -1328,12 +1316,12 @@ cost_nestloop(NestPath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo) run_cost += (outer_path_rows - 1) * inner_path->startup_cost; } run_cost += outer_path_rows * - (inner_path->total_cost - inner_path->startup_cost) * joininfactor; + (inner_path->total_cost - inner_path->startup_cost); /* * Compute number of tuples processed (not number emitted!) */ - ntuples = outer_path_rows * inner_path_rows * joininfactor; + ntuples = outer_path_rows * inner_path_rows; /* CPU costs */ cost_qual_eval(&restrict_qual_cost, path->joinrestrictinfo, root); @@ -1369,7 +1357,6 @@ cost_mergejoin(MergePath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo) Cost startup_cost = 0; Cost run_cost = 0; Cost cpu_per_tuple; - Selectivity merge_selec; QualCost merge_qual_cost; QualCost qp_qual_cost; double outer_path_rows = PATH_ROWS(outer_path); @@ -1385,7 +1372,6 @@ cost_mergejoin(MergePath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo) outerendsel, innerstartsel, innerendsel; - Selectivity joininfactor; Path sort_path; /* dummy for result of cost_sort */ /* Protect some assumptions below that rowcounts aren't zero */ @@ -1398,21 +1384,21 @@ cost_mergejoin(MergePath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo) startup_cost += disable_cost; /* - * Compute cost and selectivity of the mergequals and qpquals (other - * restriction clauses) separately. We use approx_selectivity here for - * speed --- in most cases, any errors won't affect the result much. - * - * Note: it's probably bogus to use the normal selectivity calculation - * here when either the outer or inner path is a UniquePath. + * Compute cost of the mergequals and qpquals (other restriction clauses) + * separately. */ - merge_selec = approx_selectivity(root, mergeclauses, sjinfo); cost_qual_eval(&merge_qual_cost, mergeclauses, root); cost_qual_eval(&qp_qual_cost, path->jpath.joinrestrictinfo, root); qp_qual_cost.startup -= merge_qual_cost.startup; qp_qual_cost.per_tuple -= merge_qual_cost.per_tuple; - /* approx # tuples passing the merge quals */ - mergejointuples = clamp_row_est(merge_selec * outer_path_rows * inner_path_rows); + /* + * Get approx # tuples passing the mergequals. We use approx_tuple_count + * here for speed --- in most cases, any errors won't affect the result + * much. + */ + mergejointuples = approx_tuple_count(root, &path->jpath, + mergeclauses, sjinfo); /* * When there are equal merge keys in the outer relation, the mergejoin @@ -1422,11 +1408,11 @@ cost_mergejoin(MergePath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo) * but on the other hand we ignore the bookkeeping costs of mark/restore. * Not clear if it's worth developing a more refined model. * - * The number of re-fetches can be estimated approximately as size of - * merge join output minus size of inner relation. Assume that the - * distinct key values are 1, 2, ..., and denote the number of values of - * each key in the outer relation as m1, m2, ...; in the inner relation, - * n1, n2, ... Then we have + * For regular inner and outer joins, the number of re-fetches can be + * estimated approximately as size of merge join output minus size of + * inner relation. Assume that the distinct key values are 1, 2, ..., and + * denote the number of values of each key in the outer relation as m1, + * m2, ...; in the inner relation, n1, n2, ... Then we have * * size of join = m1 * n1 + m2 * n2 + ... * @@ -1439,8 +1425,17 @@ cost_mergejoin(MergePath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo) * are effectively subtracting those from the number of rescanned tuples, * when we should not. Can we do better without expensive selectivity * computations? + * + * For SEMI and ANTI joins, only one inner tuple need be rescanned for + * each group of same-keyed outer tuples (assuming that all joinquals + * are merge quals). This makes the effect small enough to ignore, + * so we just set rescannedtuples = 0. Likewise, the whole issue is + * moot if we are working from a unique-ified outer input. */ - if (IsA(outer_path, UniquePath)) + if (sjinfo->jointype == JOIN_SEMI || + sjinfo->jointype == JOIN_ANTI) + rescannedtuples = 0; + else if (IsA(outer_path, UniquePath)) rescannedtuples = 0; else { @@ -1505,7 +1500,8 @@ cost_mergejoin(MergePath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo) innerstartsel = cache->leftstartsel; innerendsel = cache->leftendsel; } - if (path->jpath.jointype == JOIN_LEFT) + if (path->jpath.jointype == JOIN_LEFT || + path->jpath.jointype == JOIN_ANTI) { outerstartsel = 0.0; outerendsel = 1.0; @@ -1601,20 +1597,9 @@ cost_mergejoin(MergePath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo) /* CPU costs */ /* - * If we're doing JOIN_IN then we will stop outputting inner tuples for an - * outer tuple as soon as we have one match. Account for the effects of - * this by scaling down the cost estimates in proportion to the expected - * output size. (This assumes that all the quals attached to the join are - * IN quals, which should be true.) - */ - joininfactor = join_in_selectivity(&path->jpath, root, sjinfo); - - /* * The number of tuple comparisons needed is approximately number of outer * rows plus number of inner rows plus number of rescanned tuples (can we * refine this?). At each one, we need to evaluate the mergejoin quals. - * NOTE: JOIN_IN mode does not save any work here, so do NOT include - * joininfactor. */ startup_cost += merge_qual_cost.startup; startup_cost += merge_qual_cost.per_tuple * @@ -1627,12 +1612,11 @@ cost_mergejoin(MergePath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo) * For each tuple that gets through the mergejoin proper, we charge * cpu_tuple_cost plus the cost of evaluating additional restriction * clauses that are to be applied at the join. (This is pessimistic since - * not all of the quals may get evaluated at each tuple.) This work is - * skipped in JOIN_IN mode, so apply the factor. + * not all of the quals may get evaluated at each tuple.) */ startup_cost += qp_qual_cost.startup; cpu_per_tuple = cpu_tuple_cost + qp_qual_cost.per_tuple; - run_cost += cpu_per_tuple * mergejointuples * joininfactor; + run_cost += cpu_per_tuple * mergejointuples; path->jpath.path.startup_cost = startup_cost; path->jpath.path.total_cost = startup_cost + run_cost; @@ -1711,7 +1695,6 @@ cost_hashjoin(HashPath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo) Cost startup_cost = 0; Cost run_cost = 0; Cost cpu_per_tuple; - Selectivity hash_selec; QualCost hash_qual_cost; QualCost qp_qual_cost; double hashjointuples; @@ -1722,28 +1705,27 @@ cost_hashjoin(HashPath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo) int numbatches; double virtualbuckets; Selectivity innerbucketsize; - Selectivity joininfactor; ListCell *hcl; if (!enable_hashjoin) startup_cost += disable_cost; /* - * Compute cost and selectivity of the hashquals and qpquals (other - * restriction clauses) separately. We use approx_selectivity here for - * speed --- in most cases, any errors won't affect the result much. - * - * Note: it's probably bogus to use the normal selectivity calculation - * here when either the outer or inner path is a UniquePath. + * Compute cost of the hashquals and qpquals (other restriction clauses) + * separately. */ - hash_selec = approx_selectivity(root, hashclauses, sjinfo); cost_qual_eval(&hash_qual_cost, hashclauses, root); cost_qual_eval(&qp_qual_cost, path->jpath.joinrestrictinfo, root); qp_qual_cost.startup -= hash_qual_cost.startup; qp_qual_cost.per_tuple -= hash_qual_cost.per_tuple; - /* approx # tuples passing the hash quals */ - hashjointuples = clamp_row_est(hash_selec * outer_path_rows * inner_path_rows); + /* + * Get approx # tuples passing the hashquals. We use approx_tuple_count + * here for speed --- in most cases, any errors won't affect the result + * much. + */ + hashjointuples = approx_tuple_count(root, &path->jpath, + hashclauses, sjinfo); /* cost of source data */ startup_cost += outer_path->startup_cost; @@ -1859,15 +1841,6 @@ cost_hashjoin(HashPath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo) /* CPU costs */ /* - * If we're doing JOIN_IN then we will stop comparing inner tuples to an - * outer tuple as soon as we have one match. Account for the effects of - * this by scaling down the cost estimates in proportion to the expected - * output size. (This assumes that all the quals attached to the join are - * IN quals, which should be true.) - */ - joininfactor = join_in_selectivity(&path->jpath, root, sjinfo); - - /* * The number of tuple comparisons needed is the number of outer tuples * times the typical number of tuples in a hash bucket, which is the inner * relation size times its bucketsize fraction. At each one, we need to @@ -1878,8 +1851,7 @@ cost_hashjoin(HashPath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo) */ startup_cost += hash_qual_cost.startup; run_cost += hash_qual_cost.per_tuple * - outer_path_rows * clamp_row_est(inner_path_rows * innerbucketsize) * - joininfactor * 0.5; + outer_path_rows * clamp_row_est(inner_path_rows * innerbucketsize) * 0.5; /* * For each tuple that gets through the hashjoin proper, we charge @@ -1889,7 +1861,7 @@ cost_hashjoin(HashPath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo) */ startup_cost += qp_qual_cost.startup; cpu_per_tuple = cpu_tuple_cost + qp_qual_cost.per_tuple; - run_cost += cpu_per_tuple * hashjointuples * joininfactor; + run_cost += cpu_per_tuple * hashjointuples; path->jpath.path.startup_cost = startup_cost; path->jpath.path.total_cost = startup_cost + run_cost; @@ -2213,13 +2185,12 @@ get_initplan_cost(PlannerInfo *root, SubPlan *subplan) /* - * approx_selectivity - * Quick-and-dirty estimation of clause selectivities. - * The input can be either an implicitly-ANDed list of boolean - * expressions, or a list of RestrictInfo nodes (typically the latter). + * approx_tuple_count + * Quick-and-dirty estimation of the number of join rows passing + * a set of qual conditions. * - * Currently this is only used in join estimation, so sjinfo should never - * be NULL. + * The quals can be either an implicitly-ANDed list of boolean expressions, + * or a list of RestrictInfo nodes (typically the latter). * * This is quick-and-dirty because we bypass clauselist_selectivity, and * simply multiply the independent clause selectivities together. Now @@ -2232,20 +2203,34 @@ get_initplan_cost(PlannerInfo *root, SubPlan *subplan) * output tuples are generated and passed through qpqual checking, it * seems OK to live with the approximation. */ -static Selectivity -approx_selectivity(PlannerInfo *root, List *quals, SpecialJoinInfo *sjinfo) +static double +approx_tuple_count(PlannerInfo *root, JoinPath *path, + List *quals, SpecialJoinInfo *sjinfo) { - Selectivity total = 1.0; + double tuples; + double outer_tuples = path->outerjoinpath->parent->rows; + double inner_tuples = path->innerjoinpath->parent->rows; + Selectivity selec = 1.0; ListCell *l; + /* Get the approximate selectivity */ foreach(l, quals) { Node *qual = (Node *) lfirst(l); /* Note that clause_selectivity will be able to cache its result */ - total *= clause_selectivity(root, qual, 0, sjinfo->jointype, sjinfo); + selec *= clause_selectivity(root, qual, 0, sjinfo->jointype, sjinfo); } - return total; + + /* Apply it correctly using the input relation sizes */ + if (sjinfo->jointype == JOIN_SEMI) + tuples = selec * outer_tuples; + else if (sjinfo->jointype == JOIN_ANTI) + tuples = (1.0 - selec) * outer_tuples; + else + tuples = selec * outer_tuples * inner_tuples; + + return clamp_row_est(tuples); } @@ -2315,7 +2300,6 @@ set_joinrel_size_estimates(PlannerInfo *root, RelOptInfo *rel, Selectivity jselec; Selectivity pselec; double nrows; - UniquePath *upath; /* * Compute joinclause selectivity. Note that we are only considering @@ -2380,9 +2364,8 @@ set_joinrel_size_estimates(PlannerInfo *root, RelOptInfo *rel, * pushed-down quals are applied after the outer join, so their * selectivity applies fully. * - * For JOIN_IN and variants, the Cartesian product is figured with respect - * to a unique-ified input, and then we can clamp to the size of the other - * input. + * For JOIN_SEMI and JOIN_ANTI, the selectivity is defined as the fraction + * of LHS rows that have matches, and we apply that straightforwardly. */ switch (jointype) { @@ -2404,22 +2387,11 @@ set_joinrel_size_estimates(PlannerInfo *root, RelOptInfo *rel, nrows *= pselec; break; case JOIN_SEMI: - /* XXX this is unsafe, could Assert? */ - upath = create_unique_path(root, inner_rel, - inner_rel->cheapest_total_path, - sjinfo); - if (upath) - nrows = outer_rel->rows * upath->rows * jselec; - else - nrows = outer_rel->rows * inner_rel->rows * jselec; - if (nrows > outer_rel->rows) - nrows = outer_rel->rows; + nrows = outer_rel->rows * jselec; + nrows *= pselec; break; case JOIN_ANTI: - /* XXX this is utterly wrong */ - nrows = outer_rel->rows * inner_rel->rows * jselec; - if (nrows < outer_rel->rows) - nrows = outer_rel->rows; + nrows = outer_rel->rows * (1.0 - jselec); nrows *= pselec; break; default: @@ -2433,67 +2405,6 @@ set_joinrel_size_estimates(PlannerInfo *root, RelOptInfo *rel, } /* - * join_in_selectivity - * Determines the factor by which a JOIN_IN join's result is expected - * to be smaller than an ordinary inner join. - * - * 'path' is already filled in except for the cost fields - * 'sjinfo' must be the JOIN_SEMI SpecialJoinInfo for the join - */ -static Selectivity -join_in_selectivity(JoinPath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo) -{ - RelOptInfo *innerrel; - UniquePath *innerunique; - Selectivity selec; - double nrows; - - /* Return 1.0 whenever it's not JOIN_IN */ - if (path->jointype != JOIN_SEMI) - return 1.0; - Assert(sjinfo && sjinfo->jointype == JOIN_SEMI); - - /* - * Return 1.0 if the inner side is already known unique. The case where - * the inner path is already a UniquePath probably cannot happen in - * current usage, but check it anyway for completeness. The interesting - * case is where we've determined the inner relation itself is unique, - * which we can check by looking at the rows estimate for its UniquePath. - */ - if (IsA(path->innerjoinpath, UniquePath)) - return 1.0; - innerrel = path->innerjoinpath->parent; - /* XXX might assert if sjinfo doesn't exactly match innerrel? */ - innerunique = create_unique_path(root, - innerrel, - innerrel->cheapest_total_path, - sjinfo); - if (innerunique && innerunique->rows >= innerrel->rows) - return 1.0; - - /* - * Compute same result set_joinrel_size_estimates would compute for - * JOIN_INNER. Note that we use the input rels' absolute size estimates, - * not PATH_ROWS() which might be less; if we used PATH_ROWS() we'd be - * double-counting the effects of any join clauses used in input scans. - */ - selec = clauselist_selectivity(root, - path->joinrestrictinfo, - 0, - JOIN_INNER, - NULL); - nrows = path->outerjoinpath->parent->rows * innerrel->rows * selec; - - nrows = clamp_row_est(nrows); - - /* See if it's larger than the actual JOIN_IN size estimate */ - if (nrows > path->path.parent->rows) - return path->path.parent->rows / nrows; - else - return 1.0; -} - -/* * set_function_size_estimates * Set the size estimates for a base relation that is a function call. * |
