Commit 596efd27 authored by Tom Lane's avatar Tom Lane

Optimize multi-batch hash joins when the outer relation has a nonuniform

distribution, by creating a special fast path for the (first few) most common
values of the outer relation.  Tuples having hashvalues matching the MCVs
are effectively forced to be in the first batch, so that we never write
them out to the batch temp files.

Bryce Cutt and Ramon Lawrence, with some editorialization by me.
parent 249d936b
This diff is collapsed.
......@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/executor/nodeHashjoin.c,v 1.97 2009/01/01 17:23:41 momjian Exp $
* $PostgreSQL: pgsql/src/backend/executor/nodeHashjoin.c,v 1.98 2009/03/21 00:04:38 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -198,19 +198,23 @@ ExecHashJoin(HashJoinState *node)
node->hj_MatchedOuter = false;
/*
* now we have an outer tuple, find the corresponding bucket for
* this tuple from the hash table
* Now we have an outer tuple; find the corresponding bucket for
* this tuple in the main hash table or skew hash table.
*/
node->hj_CurHashValue = hashvalue;
ExecHashGetBucketAndBatch(hashtable, hashvalue,
&node->hj_CurBucketNo, &batchno);
node->hj_CurSkewBucketNo = ExecHashGetSkewBucket(hashtable,
hashvalue);
node->hj_CurTuple = NULL;
/*
* Now we've got an outer tuple and the corresponding hash bucket,
* but this tuple may not belong to the current batch.
* but it might not belong to the current batch, or it might
* match a skew bucket.
*/
if (batchno != hashtable->curbatch)
if (batchno != hashtable->curbatch &&
node->hj_CurSkewBucketNo == INVALID_SKEW_BUCKET_NO)
{
/*
* Need to postpone this outer tuple to a later batch. Save it
......@@ -452,6 +456,7 @@ ExecInitHashJoin(HashJoin *node, EState *estate, int eflags)
hjstate->hj_CurHashValue = 0;
hjstate->hj_CurBucketNo = 0;
hjstate->hj_CurSkewBucketNo = INVALID_SKEW_BUCKET_NO;
hjstate->hj_CurTuple = NULL;
/*
......@@ -651,6 +656,19 @@ start_over:
BufFileClose(hashtable->outerBatchFile[curbatch]);
hashtable->outerBatchFile[curbatch] = NULL;
}
else /* we just finished the first batch */
{
/*
* Reset some of the skew optimization state variables, since we
* no longer need to consider skew tuples after the first batch.
* The memory context reset we are about to do will release the
* skew hashtable itself.
*/
hashtable->skewEnabled = false;
hashtable->skewBucket = NULL;
hashtable->skewBucketNums = NULL;
hashtable->spaceUsedSkew = 0;
}
/*
* We can always skip over any batches that are completely empty on both
......@@ -880,6 +898,7 @@ ExecReScanHashJoin(HashJoinState *node, ExprContext *exprCtxt)
/* Always reset intra-tuple state */
node->hj_CurHashValue = 0;
node->hj_CurBucketNo = 0;
node->hj_CurSkewBucketNo = INVALID_SKEW_BUCKET_NO;
node->hj_CurTuple = NULL;
node->js.ps.ps_TupFromTlist = false;
......
......@@ -15,7 +15,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/nodes/copyfuncs.c,v 1.426 2009/03/10 22:09:25 tgl Exp $
* $PostgreSQL: pgsql/src/backend/nodes/copyfuncs.c,v 1.427 2009/03/21 00:04:39 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -735,6 +735,10 @@ _copyHash(Hash *from)
/*
* copy remainder of node
*/
COPY_SCALAR_FIELD(skewTable);
COPY_SCALAR_FIELD(skewColumn);
COPY_SCALAR_FIELD(skewColType);
COPY_SCALAR_FIELD(skewColTypmod);
return newnode;
}
......
......@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/nodes/outfuncs.c,v 1.354 2009/03/10 22:09:25 tgl Exp $
* $PostgreSQL: pgsql/src/backend/nodes/outfuncs.c,v 1.355 2009/03/21 00:04:39 tgl Exp $
*
* NOTES
* Every node type that can appear in stored rules' parsetrees *must*
......@@ -675,6 +675,11 @@ _outHash(StringInfo str, Hash *node)
WRITE_NODE_TYPE("HASH");
_outPlanInfo(str, (Plan *) node);
WRITE_OID_FIELD(skewTable);
WRITE_INT_FIELD(skewColumn);
WRITE_OID_FIELD(skewColType);
WRITE_INT_FIELD(skewColTypmod);
}
static void
......
......@@ -54,7 +54,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.204 2009/02/06 23:43:23 tgl Exp $
* $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.205 2009/03/21 00:04:39 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -1821,6 +1821,7 @@ cost_hashjoin(HashPath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo)
int num_hashclauses = list_length(hashclauses);
int numbuckets;
int numbatches;
int num_skew_mcvs;
double virtualbuckets;
Selectivity innerbucketsize;
ListCell *hcl;
......@@ -1862,11 +1863,22 @@ cost_hashjoin(HashPath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo)
* inner_path_rows;
run_cost += cpu_operator_cost * num_hashclauses * outer_path_rows;
/* Get hash table size that executor would use for inner relation */
/*
* Get hash table size that executor would use for inner relation.
*
* XXX for the moment, always assume that skew optimization will be
* performed. As long as SKEW_WORK_MEM_PERCENT is small, it's not worth
* trying to determine that for sure.
*
* XXX at some point it might be interesting to try to account for skew
* optimization in the cost estimate, but for now, we don't.
*/
ExecChooseHashTableSize(inner_path_rows,
inner_path->parent->width,
true, /* useskew */
&numbuckets,
&numbatches);
&numbatches,
&num_skew_mcvs);
virtualbuckets = (double) numbuckets *(double) numbatches;
/*
......
......@@ -10,7 +10,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/optimizer/plan/createplan.c,v 1.255 2009/01/01 17:23:44 momjian Exp $
* $PostgreSQL: pgsql/src/backend/optimizer/plan/createplan.c,v 1.256 2009/03/21 00:04:39 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -112,7 +112,11 @@ static HashJoin *make_hashjoin(List *tlist,
List *hashclauses,
Plan *lefttree, Plan *righttree,
JoinType jointype);
static Hash *make_hash(Plan *lefttree);
static Hash *make_hash(Plan *lefttree,
Oid skewTable,
AttrNumber skewColumn,
Oid skewColType,
int32 skewColTypmod);
static MergeJoin *make_mergejoin(List *tlist,
List *joinclauses, List *otherclauses,
List *mergeclauses,
......@@ -1864,6 +1868,10 @@ create_hashjoin_plan(PlannerInfo *root,
List *joinclauses;
List *otherclauses;
List *hashclauses;
Oid skewTable = InvalidOid;
AttrNumber skewColumn = InvalidAttrNumber;
Oid skewColType = InvalidOid;
int32 skewColTypmod = -1;
HashJoin *join_plan;
Hash *hash_plan;
......@@ -1902,10 +1910,47 @@ create_hashjoin_plan(PlannerInfo *root,
/* We don't want any excess columns in the hashed tuples */
disuse_physical_tlist(inner_plan, best_path->jpath.innerjoinpath);
/*
* If there is a single join clause and we can identify the outer
* variable as a simple column reference, supply its identity for
* possible use in skew optimization. (Note: in principle we could
* do skew optimization with multiple join clauses, but we'd have to
* be able to determine the most common combinations of outer values,
* which we don't currently have enough stats for.)
*/
if (list_length(hashclauses) == 1)
{
OpExpr *clause = (OpExpr *) linitial(hashclauses);
Node *node;
Assert(is_opclause(clause));
node = (Node *) linitial(clause->args);
if (IsA(node, RelabelType))
node = (Node *) ((RelabelType *) node)->arg;
if (IsA(node, Var))
{
Var *var = (Var *) node;
RangeTblEntry *rte;
rte = root->simple_rte_array[var->varno];
if (rte->rtekind == RTE_RELATION)
{
skewTable = rte->relid;
skewColumn = var->varattno;
skewColType = var->vartype;
skewColTypmod = var->vartypmod;
}
}
}
/*
* Build the hash node and hash join node.
*/
hash_plan = make_hash(inner_plan);
hash_plan = make_hash(inner_plan,
skewTable,
skewColumn,
skewColType,
skewColTypmod);
join_plan = make_hashjoin(tlist,
joinclauses,
otherclauses,
......@@ -2713,7 +2758,11 @@ make_hashjoin(List *tlist,
}
static Hash *
make_hash(Plan *lefttree)
make_hash(Plan *lefttree,
Oid skewTable,
AttrNumber skewColumn,
Oid skewColType,
int32 skewColTypmod)
{
Hash *node = makeNode(Hash);
Plan *plan = &node->plan;
......@@ -2730,6 +2779,11 @@ make_hash(Plan *lefttree)
plan->lefttree = lefttree;
plan->righttree = NULL;
node->skewTable = skewTable;
node->skewColumn = skewColumn;
node->skewColType = skewColType;
node->skewColTypmod = skewColTypmod;
return node;
}
......
......@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/executor/hashjoin.h,v 1.49 2009/01/01 17:23:59 momjian Exp $
* $PostgreSQL: pgsql/src/include/executor/hashjoin.h,v 1.50 2009/03/21 00:04:40 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -72,6 +72,36 @@ typedef struct HashJoinTupleData
#define HJTUPLE_MINTUPLE(hjtup) \
((MinimalTuple) ((char *) (hjtup) + HJTUPLE_OVERHEAD))
/*
* If the outer relation's distribution is sufficiently nonuniform, we attempt
* to optimize the join by treating the hash values corresponding to the outer
* relation's MCVs specially. Inner relation tuples matching these hash
* values go into the "skew" hashtable instead of the main hashtable, and
* outer relation tuples with these hash values are matched against that
* table instead of the main one. Thus, tuples with these hash values are
* effectively handled as part of the first batch and will never go to disk.
* The skew hashtable is limited to SKEW_WORK_MEM_PERCENT of the total memory
* allowed for the join; while building the hashtables, we decrease the number
* of MCVs being specially treated if needed to stay under this limit.
*
* Note: you might wonder why we look at the outer relation stats for this,
* rather than the inner. One reason is that the outer relation is typically
* bigger, so we get more I/O savings by optimizing for its most common values.
* Also, for similarly-sized relations, the planner prefers to put the more
* uniformly distributed relation on the inside, so we're more likely to find
* interesting skew in the outer relation.
*/
typedef struct HashSkewBucket
{
uint32 hashvalue; /* common hash value */
HashJoinTuple tuples; /* linked list of inner-relation tuples */
} HashSkewBucket;
#define SKEW_BUCKET_OVERHEAD MAXALIGN(sizeof(HashSkewBucket))
#define INVALID_SKEW_BUCKET_NO (-1)
#define SKEW_WORK_MEM_PERCENT 2
#define SKEW_MIN_OUTER_FRACTION 0.01
typedef struct HashJoinTableData
{
......@@ -82,6 +112,12 @@ typedef struct HashJoinTableData
struct HashJoinTupleData **buckets;
/* buckets array is per-batch storage, as are all the tuples */
bool skewEnabled; /* are we using skew optimization? */
HashSkewBucket **skewBucket; /* hashtable of skew buckets */
int skewBucketLen; /* size of skewBucket array (a power of 2!) */
int nSkewBuckets; /* number of active skew buckets */
int *skewBucketNums; /* array indexes of active skew buckets */
int nbatch; /* number of batches */
int curbatch; /* current batch #; 0 during 1st pass */
......@@ -113,6 +149,8 @@ typedef struct HashJoinTableData
Size spaceUsed; /* memory space currently used by tuples */
Size spaceAllowed; /* upper limit for space used */
Size spaceUsedSkew; /* skew hash table's current space usage */
Size spaceAllowedSkew; /* upper limit for skew hashtable */
MemoryContext hashCxt; /* context for whole-hash-join storage */
MemoryContext batchCxt; /* context for this-batch-only storage */
......
......@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/executor/nodeHash.h,v 1.46 2009/01/01 17:23:59 momjian Exp $
* $PostgreSQL: pgsql/src/include/executor/nodeHash.h,v 1.47 2009/03/21 00:04:40 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -41,8 +41,10 @@ extern void ExecHashGetBucketAndBatch(HashJoinTable hashtable,
extern HashJoinTuple ExecScanHashBucket(HashJoinState *hjstate,
ExprContext *econtext);
extern void ExecHashTableReset(HashJoinTable hashtable);
extern void ExecChooseHashTableSize(double ntuples, int tupwidth,
extern void ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
int *numbuckets,
int *numbatches);
int *numbatches,
int *num_skew_mcvs);
extern int ExecHashGetSkewBucket(HashJoinTable hashtable, uint32 hashvalue);
#endif /* NODEHASH_H */
......@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.201 2009/01/12 05:10:45 tgl Exp $
* $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.202 2009/03/21 00:04:40 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -1374,11 +1374,12 @@ typedef struct MergeJoinState
* hj_HashTable hash table for the hashjoin
* (NULL if table not built yet)
* hj_CurHashValue hash value for current outer tuple
* hj_CurBucketNo bucket# for current outer tuple
* hj_CurBucketNo regular bucket# for current outer tuple
* hj_CurSkewBucketNo skew bucket# for current outer tuple
* hj_CurTuple last inner tuple matched to current outer
* tuple, or NULL if starting search
* (CurHashValue, CurBucketNo and CurTuple are
* undefined if OuterTupleSlot is empty!)
* (hj_CurXXX variables are undefined if
* OuterTupleSlot is empty!)
* hj_OuterHashKeys the outer hash keys in the hashjoin condition
* hj_InnerHashKeys the inner hash keys in the hashjoin condition
* hj_HashOperators the join operators in the hashjoin condition
......@@ -1403,6 +1404,7 @@ typedef struct HashJoinState
HashJoinTable hj_HashTable;
uint32 hj_CurHashValue;
int hj_CurBucketNo;
int hj_CurSkewBucketNo;
HashJoinTuple hj_CurTuple;
List *hj_OuterHashKeys; /* list of ExprState nodes */
List *hj_InnerHashKeys; /* list of ExprState nodes */
......
......@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/nodes/plannodes.h,v 1.108 2009/01/01 17:24:00 momjian Exp $
* $PostgreSQL: pgsql/src/include/nodes/plannodes.h,v 1.109 2009/03/21 00:04:40 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -460,7 +460,7 @@ typedef struct MergeJoin
} MergeJoin;
/* ----------------
* hash join (probe) node
* hash join node
* ----------------
*/
typedef struct HashJoin
......@@ -567,11 +567,20 @@ typedef struct Unique
/* ----------------
* hash build node
*
* If the executor is supposed to try to apply skew join optimization, then
* skewTable/skewColumn identify the outer relation's join key column, from
* which the relevant MCV statistics can be fetched. Also, its type
* information is provided to save a lookup.
* ----------------
*/
typedef struct Hash
{
Plan plan;
Oid skewTable; /* outer join key's table OID, or InvalidOid */
AttrNumber skewColumn; /* outer join key's column #, or zero */
Oid skewColType; /* datatype of the outer key column */
int32 skewColTypmod; /* typmod of the outer key column */
/* all other info is in the parent HashJoin node */
} Hash;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment