Support RIGHT and FULL OUTER JOIN in hash joins.

This is advantageous first because it allows us to hash the smaller table regardless of the outer-join type, and second because hash join can be more flexible than merge join in dealing with arbitrary join quals in a FULL join. For merge join all the join quals have to be mergejoinable, but hash join will work so long as there's at least one hashjoinable qual --- the others can be any condition. (This is true essentially because we don't keep per-inner-tuple match flags in merge join, while hash join can do so.) To do this, we need a has-it-been-matched flag for each tuple in the hashtable, not just one for the current outer tuple. The key idea that makes this practical is that we can store the match flag in the tuple's infomask, since there are lots of bits there that are of no interest for a MinimalTuple. So we aren't increasing the size of the hashtable at all for the feature. To write this without turning the hash code into even more of a pile of spaghetti than it already was, I rewrote ExecHashJoin in a state-machine style, similar to ExecMergeJoin. Other than that decision, it was pretty straightforward.

Support RIGHT and FULL OUTER JOIN in hash joins.
This is advantageous first because it allows us to hash the smaller table regardless of the outer-join type, and second because hash join can be more flexible than merge join in dealing with arbitrary join quals in a FULL join. For merge join all the join quals have to be mergejoinable, but hash join will work so long as there's at least one hashjoinable qual --- the others can be any condition. (This is true essentially because we don't keep per-inner-tuple match flags in merge join, while hash join can do so.) To do this, we need a has-it-been-matched flag for each tuple in the hashtable, not just one for the current outer tuple. The key idea that makes this practical is that we can store the match flag in the tuple's infomask, since there are lots of bits there that are of no interest for a MinimalTuple. So we aren't increasing the size of the hashtable at all for the feature. To write this without turning the hash code into even more of a pile of spaghetti than it already was, I rewrote ExecHashJoin in a state-machine style, similar to ExecMergeJoin. Other than that decision, it was pretty straightforward.
f4e4b327 · Tom Lane · 17cb9e8c · f4e4b327 · f4e4b327 · f4e4b327
Commit f4e4b327 authored Dec 30, 2010 by Tom Lane
10 changed files
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -105,7 +105,8 @@ MultiExecHash(HashState *node)
 			break;
 		/* We have to compute the hash value */
 		econtext->ecxt_innertuple = slot;
-		if (ExecHashGetHashValue(hashtable, econtext, hashkeys, false, false,
+		if (ExecHashGetHashValue(hashtable, econtext, hashkeys,
+								 false, hashtable->keepNulls,
 								 &hashvalue))
 		{
 			int			bucketNumber;
@@ -231,7 +232,7 @@ ExecEndHash(HashState *node)
 * ----------------------------------------------------------------
 */
 HashJoinTable
-ExecHashTableCreate(Hash *node, List *hashOperators)
+ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls)
 {
 	HashJoinTable hashtable;
 	Plan	   *outerNode;
@@ -273,6 +274,7 @@ ExecHashTableCreate(Hash *node, List *hashOperators)
 	hashtable->nbuckets = nbuckets;
 	hashtable->log2_nbuckets = log2_nbuckets;
 	hashtable->buckets = NULL;
+	hashtable->keepNulls = keepNulls;
 	hashtable->skewEnabled = false;
 	hashtable->skewBucket = NULL;
 	hashtable->skewBucketLen = 0;
@@ -712,13 +714,26 @@ ExecHashTableInsert(HashJoinTable hashtable,
 		HashJoinTuple hashTuple;
 		int			hashTupleSize;
+		/* Create the HashJoinTuple */
 		hashTupleSize = HJTUPLE_OVERHEAD + tuple->t_len;
 		hashTuple = (HashJoinTuple) MemoryContextAlloc(hashtable->batchCxt,
 													   hashTupleSize);
 		hashTuple->hashvalue = hashvalue;
 		memcpy(HJTUPLE_MINTUPLE(hashTuple), tuple, tuple->t_len);
+		/*
+		 * We always reset the tuple-matched flag on insertion.  This is okay
+		 * even when reloading a tuple from a batch file, since the tuple
+		 * could not possibly have been matched to an outer tuple before it
+		 * went into the batch file.
+		 */
+		HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(hashTuple));
+		/* Push it onto the front of the bucket's list */
 		hashTuple->next = hashtable->buckets[bucketno];
 		hashtable->buckets[bucketno] = hashTuple;
+		/* Account for space used, and back off if we've used too much */
 		hashtable->spaceUsed += hashTupleSize;
 		if (hashtable->spaceUsed > hashtable->spacePeak)
 			hashtable->spacePeak = hashtable->spaceUsed;
@@ -878,8 +893,12 @@ ExecHashGetBucketAndBatch(HashJoinTable hashtable,
 *		scan a hash bucket for matches to the current outer tuple
 *
 * The current outer tuple must be stored in econtext->ecxt_outertuple.
+ *
+ * On success, the inner tuple is stored into hjstate->hj_CurTuple and
+ * econtext->ecxt_innertuple, using hjstate->hj_HashTupleSlot as the slot
+ * for the latter.
 */
-HashJoinTuple
+bool
 ExecScanHashBucket(HashJoinState *hjstate,
 				   ExprContext *econtext)
 {
@@ -920,7 +939,7 @@ ExecScanHashBucket(HashJoinState *hjstate,
 			if (ExecQual(hjclauses, econtext, false))
 			{
 				hjstate->hj_CurTuple = hashTuple;
-				return hashTuple;
+				return true;
 			}
 		}
@@ -930,7 +949,99 @@ ExecScanHashBucket(HashJoinState *hjstate,
 	/*
 	 * no match
 	 */
-	return NULL;
+	return false;
+}
+/*
+ * ExecPrepHashTableForUnmatched
+ *		set up for a series of ExecScanHashTableForUnmatched calls
+ */
+void
+ExecPrepHashTableForUnmatched(HashJoinState *hjstate)
+{
+	/*
+	 *----------
+	 * During this scan we use the HashJoinState fields as follows:
+	 *
+	 * hj_CurBucketNo: next regular bucket to scan
+	 * hj_CurSkewBucketNo: next skew bucket (an index into skewBucketNums)
+	 * hj_CurTuple: last tuple returned, or NULL to start next bucket
+	 *----------
+	 */
+	hjstate->hj_CurBucketNo = 0;
+	hjstate->hj_CurSkewBucketNo = 0;
+	hjstate->hj_CurTuple = NULL;
+}
+/*
+ * ExecScanHashTableForUnmatched
+ *		scan the hash table for unmatched inner tuples
+ *
+ * On success, the inner tuple is stored into hjstate->hj_CurTuple and
+ * econtext->ecxt_innertuple, using hjstate->hj_HashTupleSlot as the slot
+ * for the latter.
+ */
+bool
+ExecScanHashTableForUnmatched(HashJoinState *hjstate, ExprContext *econtext)
+{
+	HashJoinTable hashtable = hjstate->hj_HashTable;
+	HashJoinTuple hashTuple = hjstate->hj_CurTuple;
+	for (;;)
+	{
+		/*
+		 * hj_CurTuple is the address of the tuple last returned from the
+		 * current bucket, or NULL if it's time to start scanning a new
+		 * bucket.
+		 */
+		if (hashTuple != NULL)
+			hashTuple = hashTuple->next;
+		else if (hjstate->hj_CurBucketNo < hashtable->nbuckets)
+		{
+			hashTuple = hashtable->buckets[hjstate->hj_CurBucketNo];
+			hjstate->hj_CurBucketNo++;
+		}
+		else if (hjstate->hj_CurSkewBucketNo < hashtable->nSkewBuckets)
+		{
+			int		j = hashtable->skewBucketNums[hjstate->hj_CurSkewBucketNo];
+			hashTuple = hashtable->skewBucket[j]->tuples;
+			hjstate->hj_CurSkewBucketNo++;
+		}
+		else
+			break;				/* finished all buckets */
+		while (hashTuple != NULL)
+		{
+			if (!HeapTupleHeaderHasMatch(HJTUPLE_MINTUPLE(hashTuple)))
+			{
+				TupleTableSlot *inntuple;
+				/* insert hashtable's tuple into exec slot */
+				inntuple = ExecStoreMinimalTuple(HJTUPLE_MINTUPLE(hashTuple),
+												 hjstate->hj_HashTupleSlot,
+												 false);	/* do not pfree */
+				econtext->ecxt_innertuple = inntuple;
+				/*
+				 * Reset temp memory each time; although this function doesn't
+				 * do any qual eval, the caller will, so let's keep it
+				 * parallel to ExecScanHashBucket.
+				 */
+				ResetExprContext(econtext);
+				hjstate->hj_CurTuple = hashTuple;
+				return true;
+			}
+			hashTuple = hashTuple->next;
+		}
+	}
+	/*
+	 * no more unmatched tuples
+	 */
+	return false;
 }
 /*
@@ -960,6 +1071,35 @@ ExecHashTableReset(HashJoinTable hashtable)
 	MemoryContextSwitchTo(oldcxt);
 }
+/*
+ * ExecHashTableResetMatchFlags
+ *		Clear all the HeapTupleHeaderHasMatch flags in the table
+ */
+void
+ExecHashTableResetMatchFlags(HashJoinTable hashtable)
+{
+	HashJoinTuple tuple;
+	int			i;
+	/* Reset all flags in the main table ... */
+	for (i = 0; i < hashtable->nbuckets; i++)
+	{
+		for (tuple = hashtable->buckets[i]; tuple != NULL; tuple = tuple->next)
+			HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(tuple));
+	}
+	/* ... and the same for the skew buckets, if any */
+	for (i = 0; i < hashtable->nSkewBuckets; i++)
+	{
+		int		j = hashtable->skewBucketNums[i];
+		HashSkewBucket *skewBucket = hashtable->skewBucket[j];
+		for (tuple = skewBucket->tuples; tuple != NULL; tuple = tuple->next)
+			HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(tuple));
+	}
+}
 void
 ExecReScanHash(HashState *node)
 {
@@ -1203,6 +1343,7 @@ ExecHashSkewTableInsert(HashJoinTable hashtable,
 												   hashTupleSize);
 	hashTuple->hashvalue = hashvalue;
 	memcpy(HJTUPLE_MINTUPLE(hashTuple), tuple, tuple->t_len);
+	HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(hashTuple));
 	/* Push it onto the front of the skew bucket's list */
 	hashTuple->next = hashtable->skewBucket[bucketNumber]->tuples;

--- a/src/backend/executor/nodeHashjoin.c
+++ b/src/backend/executor/nodeHashjoin.c
@@ -22,8 +22,20 @@
 #include "utils/memutils.h"
-/* Returns true for JOIN_LEFT and JOIN_ANTI jointypes */
+/*
-#define HASHJOIN_IS_OUTER(hjstate)	((hjstate)->hj_NullInnerTupleSlot != NULL)
+ * States of the ExecHashJoin state machine
+ */
+#define HJ_BUILD_HASHTABLE		1
+#define HJ_NEED_NEW_OUTER		2
+#define HJ_SCAN_BUCKET			3
+#define HJ_FILL_OUTER_TUPLE		4
+#define HJ_FILL_INNER_TUPLES	5
+#define HJ_NEED_NEW_BATCH		6
+/* Returns true if doing null-fill on outer relation */
+#define HJ_FILL_OUTER(hjstate)	((hjstate)->hj_NullInnerTupleSlot != NULL)
+/* Returns true if doing null-fill on inner relation */
+#define HJ_FILL_INNER(hjstate)	((hjstate)->hj_NullOuterTupleSlot != NULL)
 static TupleTableSlot *ExecHashJoinOuterGetTuple(PlanState *outerNode,
 						  HashJoinState *hjstate,
@@ -32,7 +44,7 @@ static TupleTableSlot *ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
 						  BufFile *file,
 						  uint32 *hashvalue,
 						  TupleTableSlot *tupleSlot);
-static int	ExecHashJoinNewBatch(HashJoinState *hjstate);
+static bool ExecHashJoinNewBatch(HashJoinState *hjstate);
 /* ----------------------------------------------------------------
@@ -52,11 +64,9 @@ ExecHashJoin(HashJoinState *node)
 	HashState  *hashNode;
 	List	   *joinqual;
 	List	   *otherqual;
-	TupleTableSlot *inntuple;
 	ExprContext *econtext;
 	ExprDoneCond isDone;
 	HashJoinTable hashtable;
-	HashJoinTuple curtuple;
 	TupleTableSlot *outerTupleSlot;
 	uint32		hashvalue;
 	int			batchno;
@@ -69,10 +79,6 @@ ExecHashJoin(HashJoinState *node)
 	otherqual = node->js.ps.qual;
 	hashNode = (HashState *) innerPlanState(node);
 	outerNode = outerPlanState(node);
-	/*
-	 * get information from HashJoin state
-	 */
 	hashtable = node->hj_HashTable;
 	econtext = node->js.ps.ps_ExprContext;
@@ -100,238 +106,300 @@ ExecHashJoin(HashJoinState *node)
 	ResetExprContext(econtext);
 	/*
-	 * if this is the first call, build the hash table for inner relation
+	 * run the hash join state machine
 	 */
-	if (hashtable == NULL)
+	for (;;)
 	{
-		/*
+		switch (node->hj_JoinState)
-		 * If the outer relation is completely empty, we can quit without
-		 * building the hash table.  However, for an inner join it is only a
-		 * win to check this when the outer relation's startup cost is less
-		 * than the projected cost of building the hash table.	Otherwise it's
-		 * best to build the hash table first and see if the inner relation is
-		 * empty.  (When it's an outer join, we should always make this check,
-		 * since we aren't going to be able to skip the join on the strength
-		 * of an empty inner relation anyway.)
-		 *
-		 * If we are rescanning the join, we make use of information gained on
-		 * the previous scan: don't bother to try the prefetch if the previous
-		 * scan found the outer relation nonempty.	This is not 100% reliable
-		 * since with new parameters the outer relation might yield different
-		 * results, but it's a good heuristic.
-		 *
-		 * The only way to make the check is to try to fetch a tuple from the
-		 * outer plan node.  If we succeed, we have to stash it away for later
-		 * consumption by ExecHashJoinOuterGetTuple.
-		 */
-		if (HASHJOIN_IS_OUTER(node) ||
-			(outerNode->plan->startup_cost < hashNode->ps.plan->total_cost &&
-			 !node->hj_OuterNotEmpty))
 		{
-			node->hj_FirstOuterTupleSlot = ExecProcNode(outerNode);
+			case HJ_BUILD_HASHTABLE:
-			if (TupIsNull(node->hj_FirstOuterTupleSlot))
+				/*
-			{
+				 * First time through: build hash table for inner relation.
-				node->hj_OuterNotEmpty = false;
+				 */
-				return NULL;
+				Assert(hashtable == NULL);
-			}
-			else
-				node->hj_OuterNotEmpty = true;
-		}
-		else
-			node->hj_FirstOuterTupleSlot = NULL;
-		/*
+				/*
-		 * create the hash table
+				 * If the outer relation is completely empty, and it's not
-		 */
+				 * right/full join, we can quit without building the hash
-		hashtable = ExecHashTableCreate((Hash *) hashNode->ps.plan,
+				 * table.  However, for an inner join it is only a win to
-										node->hj_HashOperators);
+				 * check this when the outer relation's startup cost is less
-		node->hj_HashTable = hashtable;
+				 * than the projected cost of building the hash
+				 * table.  Otherwise it's best to build the hash table first
+				 * and see if the inner relation is empty.  (When it's a left
+				 * join, we should always make this check, since we aren't
+				 * going to be able to skip the join on the strength of an
+				 * empty inner relation anyway.)
+				 *
+				 * If we are rescanning the join, we make use of information
+				 * gained on the previous scan: don't bother to try the
+				 * prefetch if the previous scan found the outer relation
+				 * nonempty. This is not 100% reliable since with new
+				 * parameters the outer relation might yield different
+				 * results, but it's a good heuristic.
+				 *
+				 * The only way to make the check is to try to fetch a tuple
+				 * from the outer plan node.  If we succeed, we have to stash
+				 * it away for later consumption by ExecHashJoinOuterGetTuple.
+				 */
+				if (HJ_FILL_INNER(node))
+				{
+					/* no chance to not build the hash table */
+					node->hj_FirstOuterTupleSlot = NULL;
+				}
+				else if (HJ_FILL_OUTER(node) ||
+						 (outerNode->plan->startup_cost < hashNode->ps.plan->total_cost &&
+						  !node->hj_OuterNotEmpty))
+				{
+					node->hj_FirstOuterTupleSlot = ExecProcNode(outerNode);
+					if (TupIsNull(node->hj_FirstOuterTupleSlot))
+					{
+						node->hj_OuterNotEmpty = false;
+						return NULL;
+					}
+					else
+						node->hj_OuterNotEmpty = true;
+				}
+				else
+					node->hj_FirstOuterTupleSlot = NULL;
-		/*
+				/*
-		 * execute the Hash node, to build the hash table
+				 * create the hash table
-		 */
+				 */
-		hashNode->hashtable = hashtable;
+				hashtable = ExecHashTableCreate((Hash *) hashNode->ps.plan,
-		(void) MultiExecProcNode((PlanState *) hashNode);
+												node->hj_HashOperators,
+												HJ_FILL_INNER(node));
+				node->hj_HashTable = hashtable;
-		/*
+				/*
-		 * If the inner relation is completely empty, and we're not doing an
+				 * execute the Hash node, to build the hash table
-		 * outer join, we can quit without scanning the outer relation.
+				 */
-		 */
+				hashNode->hashtable = hashtable;
-		if (hashtable->totalTuples == 0 && !HASHJOIN_IS_OUTER(node))
+				(void) MultiExecProcNode((PlanState *) hashNode);
-			return NULL;
-		/*
+				/*
-		 * need to remember whether nbatch has increased since we began
+				 * If the inner relation is completely empty, and we're not
-		 * scanning the outer relation
+				 * doing a left outer join, we can quit without scanning the
-		 */
+				 * outer relation.
-		hashtable->nbatch_outstart = hashtable->nbatch;
+				 */
+				if (hashtable->totalTuples == 0 && !HJ_FILL_OUTER(node))
+					return NULL;
-		/*
+				/*
-		 * Reset OuterNotEmpty for scan.  (It's OK if we fetched a tuple
+				 * need to remember whether nbatch has increased since we began
-		 * above, because ExecHashJoinOuterGetTuple will immediately set it
+				 * scanning the outer relation
-		 * again.)
+				 */
-		 */
+				hashtable->nbatch_outstart = hashtable->nbatch;
-		node->hj_OuterNotEmpty = false;
-	}
-	/*
+				/*
-	 * run the hash join process
+				 * Reset OuterNotEmpty for scan.  (It's OK if we fetched a
-	 */
+				 * tuple above, because ExecHashJoinOuterGetTuple will
-	for (;;)
+				 * immediately set it again.)
-	{
+				 */
-		/*
+				node->hj_OuterNotEmpty = false;
-		 * If we don't have an outer tuple, get the next one
-		 */
-		if (node->hj_NeedNewOuter)
-		{
-			outerTupleSlot = ExecHashJoinOuterGetTuple(outerNode,
-													   node,
-													   &hashvalue);
-			if (TupIsNull(outerTupleSlot))
-			{
-				/* end of join */
-				return NULL;
-			}
-			econtext->ecxt_outertuple = outerTupleSlot;
+				node->hj_JoinState = HJ_NEED_NEW_OUTER;
-			node->hj_NeedNewOuter = false;
-			node->hj_MatchedOuter = false;
-			/*
+				/* FALL THRU */
-			 * Now we have an outer tuple; find the corresponding bucket for
-			 * this tuple in the main hash table or skew hash table.
-			 */
-			node->hj_CurHashValue = hashvalue;
-			ExecHashGetBucketAndBatch(hashtable, hashvalue,
-									  &node->hj_CurBucketNo, &batchno);
-			node->hj_CurSkewBucketNo = ExecHashGetSkewBucket(hashtable,
-															 hashvalue);
-			node->hj_CurTuple = NULL;
-			/*
+			case HJ_NEED_NEW_OUTER:
-			 * Now we've got an outer tuple and the corresponding hash bucket,
-			 * but it might not belong to the current batch, or it might match
-			 * a skew bucket.
-			 */
-			if (batchno != hashtable->curbatch &&
-				node->hj_CurSkewBucketNo == INVALID_SKEW_BUCKET_NO)
-			{
 				/*
-				 * Need to postpone this outer tuple to a later batch. Save it
+				 * We don't have an outer tuple, try to get the next one
-				 * in the corresponding outer-batch file.
 				 */
-				Assert(batchno > hashtable->curbatch);
+				outerTupleSlot = ExecHashJoinOuterGetTuple(outerNode,
-				ExecHashJoinSaveTuple(ExecFetchSlotMinimalTuple(outerTupleSlot),
+														   node,
-									  hashvalue,
+														   &hashvalue);
-									  &hashtable->outerBatchFile[batchno]);
+				if (TupIsNull(outerTupleSlot))
-				node->hj_NeedNewOuter = true;
+				{
-				continue;		/* loop around for a new outer tuple */
+					/* end of batch, or maybe whole join */
-			}
+					if (HJ_FILL_INNER(node))
-		}
+					{
+						/* set up to scan for unmatched inner tuples */
+						ExecPrepHashTableForUnmatched(node);
+						node->hj_JoinState = HJ_FILL_INNER_TUPLES;
+					}
+					else
+						node->hj_JoinState = HJ_NEED_NEW_BATCH;
+					continue;
+				}
-		/*
+				econtext->ecxt_outertuple = outerTupleSlot;
-		 * OK, scan the selected hash bucket for matches
+				node->hj_MatchedOuter = false;
-		 */
-		for (;;)
-		{
-			curtuple = ExecScanHashBucket(node, econtext);
-			if (curtuple == NULL)
-				break;			/* out of matches */
-			/*
+				/*
-			 * we've got a match, but still need to test non-hashed quals
+				 * Find the corresponding bucket for this tuple in the main
-			 */
+				 * hash table or skew hash table.
-			inntuple = ExecStoreMinimalTuple(HJTUPLE_MINTUPLE(curtuple),
+				 */
-											 node->hj_HashTupleSlot,
+				node->hj_CurHashValue = hashvalue;
-											 false);	/* don't pfree */
+				ExecHashGetBucketAndBatch(hashtable, hashvalue,
-			econtext->ecxt_innertuple = inntuple;
+										  &node->hj_CurBucketNo, &batchno);
+				node->hj_CurSkewBucketNo = ExecHashGetSkewBucket(hashtable,
+																 hashvalue);
+				node->hj_CurTuple = NULL;
-			/* reset temp memory each time to avoid leaks from qual expr */
+				/*
-			ResetExprContext(econtext);
+				 * The tuple might not belong to the current batch (where
+				 * "current batch" includes the skew buckets if any).
+				 */
+				if (batchno != hashtable->curbatch &&
+					node->hj_CurSkewBucketNo == INVALID_SKEW_BUCKET_NO)
+				{
+					/*
+					 * Need to postpone this outer tuple to a later batch.
+					 * Save it in the corresponding outer-batch file.
+					 */
+					Assert(batchno > hashtable->curbatch);
+					ExecHashJoinSaveTuple(ExecFetchSlotMinimalTuple(outerTupleSlot),
+										  hashvalue,
+										  &hashtable->outerBatchFile[batchno]);
+					/* Loop around, staying in HJ_NEED_NEW_OUTER state */
+					continue;
+				}
-			/*
+				/* OK, let's scan the bucket for matches */
-			 * if we pass the qual, then save state for next call and have
+				node->hj_JoinState = HJ_SCAN_BUCKET;
-			 * ExecProject form the projection, store it in the tuple table,
-			 * and return the slot.
+				/* FALL THRU */
-			 *
-			 * Only the joinquals determine MatchedOuter status, but all quals
-			 * must pass to actually return the tuple.
-			 */
-			if (joinqual == NIL || ExecQual(joinqual, econtext, false))
-			{
-				node->hj_MatchedOuter = true;
-				/* In an antijoin, we never return a matched tuple */
+			case HJ_SCAN_BUCKET:
-				if (node->js.jointype == JOIN_ANTI)
+				/*
+				 * Scan the selected hash bucket for matches to current outer
+				 */
+				if (!ExecScanHashBucket(node, econtext))
 				{
-					node->hj_NeedNewOuter = true;
+					/* out of matches; check for possible outer-join fill */
-					break;		/* out of loop over hash bucket */
+					node->hj_JoinState = HJ_FILL_OUTER_TUPLE;
+					continue;
 				}
 				/*
-				 * In a semijoin, we'll consider returning the first match,
+				 * We've got a match, but still need to test non-hashed quals.
-				 * but after that we're done with this outer tuple.
+				 * ExecScanHashBucket already set up all the state needed to
+				 * call ExecQual.
+				 *
+				 * If we pass the qual, then save state for next call and have
+				 * ExecProject form the projection, store it in the tuple
+				 * table, and return the slot.
+				 *
+				 * Only the joinquals determine tuple match status, but all
+				 * quals must pass to actually return the tuple.
 				 */
-				if (node->js.jointype == JOIN_SEMI)
+				if (joinqual == NIL || ExecQual(joinqual, econtext, false))
-					node->hj_NeedNewOuter = true;
-				if (otherqual == NIL || ExecQual(otherqual, econtext, false))
 				{
-					TupleTableSlot *result;
+					node->hj_MatchedOuter = true;
+					HeapTupleHeaderSetMatch(HJTUPLE_MINTUPLE(node->hj_CurTuple));
-					result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
+					/* In an antijoin, we never return a matched tuple */
+					if (node->js.jointype == JOIN_ANTI)
+					{
+						node->hj_JoinState = HJ_NEED_NEW_OUTER;
+						continue;
+					}
-					if (isDone != ExprEndResult)
+					/*
+					 * In a semijoin, we'll consider returning the first match,
+					 * but after that we're done with this outer tuple.
+					 */
+					if (node->js.jointype == JOIN_SEMI)
+						node->hj_JoinState = HJ_NEED_NEW_OUTER;
+					if (otherqual == NIL ||
+						ExecQual(otherqual, econtext, false))
 					{
-						node->js.ps.ps_TupFromTlist =
+						TupleTableSlot *result;
-							(isDone == ExprMultipleResult);
-						return result;
+						result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
+						if (isDone != ExprEndResult)
+						{
+							node->js.ps.ps_TupFromTlist =
+								(isDone == ExprMultipleResult);
+							return result;
+						}
 					}
 				}
+				break;
+			case HJ_FILL_OUTER_TUPLE:
 				/*
-				 * If semijoin and we didn't return the tuple, we're still
+				 * The current outer tuple has run out of matches, so check
-				 * done with this outer tuple.
+				 * whether to emit a dummy outer-join tuple.  Whether we
+				 * emit one or not, the next state is NEED_NEW_OUTER.
 				 */
-				if (node->js.jointype == JOIN_SEMI)
+				node->hj_JoinState = HJ_NEED_NEW_OUTER;
-					break;		/* out of loop over hash bucket */
-			}
-		}
-		/*
+				if (!node->hj_MatchedOuter &&
-		 * Now the current outer tuple has run out of matches, so check
+					HJ_FILL_OUTER(node))
-		 * whether to emit a dummy outer-join tuple. If not, loop around to
+				{
-		 * get a new outer tuple.
+					/*
-		 */
+					 * Generate a fake join tuple with nulls for the inner
-		node->hj_NeedNewOuter = true;
+					 * tuple, and return it if it passes the non-join quals.
+					 */
+					econtext->ecxt_innertuple = node->hj_NullInnerTupleSlot;
+					if (otherqual == NIL ||
+						ExecQual(otherqual, econtext, false))
+					{
+						TupleTableSlot *result;
-		if (!node->hj_MatchedOuter &&
+						result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
-			HASHJOIN_IS_OUTER(node))
-		{
-			/*
-			 * We are doing an outer join and there were no join matches for
-			 * this outer tuple.  Generate a fake join tuple with nulls for
-			 * the inner tuple, and return it if it passes the non-join quals.
-			 */
-			econtext->ecxt_innertuple = node->hj_NullInnerTupleSlot;
-			if (otherqual == NIL || ExecQual(otherqual, econtext, false))
+						if (isDone != ExprEndResult)
-			{
+						{
+							node->js.ps.ps_TupFromTlist =
+								(isDone == ExprMultipleResult);
+							return result;
+						}
+					}
+				}
+				break;
+			case HJ_FILL_INNER_TUPLES:
 				/*
-				 * qualification was satisfied so we project and return the
+				 * We have finished a batch, but we are doing right/full join,
-				 * slot containing the result tuple using ExecProject().
+				 * so any unmatched inner tuples in the hashtable have to be
+				 * emitted before we continue to the next batch.
 				 */
-				TupleTableSlot *result;
+				if (!ExecScanHashTableForUnmatched(node, econtext))
+				{
+					/* no more unmatched tuples */
+					node->hj_JoinState = HJ_NEED_NEW_BATCH;
+					continue;
+				}
-				result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
+				/*
+				 * Generate a fake join tuple with nulls for the outer tuple,
+				 * and return it if it passes the non-join quals.
+				 */
+				econtext->ecxt_outertuple = node->hj_NullOuterTupleSlot;
-				if (isDone != ExprEndResult)
+				if (otherqual == NIL ||
+					ExecQual(otherqual, econtext, false))
 				{
-					node->js.ps.ps_TupFromTlist =
+					TupleTableSlot *result;
-						(isDone == ExprMultipleResult);
-					return result;
+					result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
+					if (isDone != ExprEndResult)
+					{
+						node->js.ps.ps_TupFromTlist =
+							(isDone == ExprMultipleResult);
+						return result;
+					}
 				}
-			}
+				break;
+			case HJ_NEED_NEW_BATCH:
+				/*
+				 * Try to advance to next batch.  Done if there are no more.
+				 */
+				if (!ExecHashJoinNewBatch(node))
+					return NULL;				/* end of join */
+				node->hj_JoinState = HJ_NEED_NEW_OUTER;
+				break;
+			default:
+				elog(ERROR, "unrecognized hashjoin state: %d",
+					 (int) node->hj_JoinState);
 		}
 	}
 }
@@ -406,7 +474,7 @@ ExecInitHashJoin(HashJoin *node, EState *estate, int eflags)
 	ExecInitResultTupleSlot(estate, &hjstate->js.ps);
 	hjstate->hj_OuterTupleSlot = ExecInitExtraTupleSlot(estate);
-	/* note: HASHJOIN_IS_OUTER macro depends on this initialization */
+	/* set up null tuples for outer joins, if needed */
 	switch (node->join.jointype)
 	{
 		case JOIN_INNER:
@@ -418,6 +486,19 @@ ExecInitHashJoin(HashJoin *node, EState *estate, int eflags)
 				ExecInitNullTupleSlot(estate,
 								 ExecGetResultType(innerPlanState(hjstate)));
 			break;
+		case JOIN_RIGHT:
+			hjstate->hj_NullOuterTupleSlot =
+				ExecInitNullTupleSlot(estate,
+								 ExecGetResultType(outerPlanState(hjstate)));
+			break;
+		case JOIN_FULL:
+			hjstate->hj_NullOuterTupleSlot =
+				ExecInitNullTupleSlot(estate,
+								 ExecGetResultType(outerPlanState(hjstate)));
+			hjstate->hj_NullInnerTupleSlot =
+				ExecInitNullTupleSlot(estate,
+								 ExecGetResultType(innerPlanState(hjstate)));
+			break;
 		default:
 			elog(ERROR, "unrecognized join type: %d",
 				 (int) node->join.jointype);
@@ -425,7 +506,7 @@ ExecInitHashJoin(HashJoin *node, EState *estate, int eflags)
 	/*
 	 * now for some voodoo.  our temporary tuple slot is actually the result
-	 * tuple slot of the Hash node (which is our inner plan).  we do this
+	 * tuple slot of the Hash node (which is our inner plan).  we can do this
 	 * because Hash nodes don't return tuples via ExecProcNode() -- instead
 	 * the hash join node uses ExecScanHashBucket() to get at the contents of
 	 * the hash table.	-cim 6/9/91
@@ -485,7 +566,7 @@ ExecInitHashJoin(HashJoin *node, EState *estate, int eflags)
 	((HashState *) innerPlanState(hjstate))->hashkeys = rclauses;
 	hjstate->js.ps.ps_TupFromTlist = false;
-	hjstate->hj_NeedNewOuter = true;
+	hjstate->hj_JoinState = HJ_BUILD_HASHTABLE;
 	hjstate->hj_MatchedOuter = false;
 	hjstate->hj_OuterNotEmpty = false;
@@ -533,12 +614,13 @@ ExecEndHashJoin(HashJoinState *node)
 * ExecHashJoinOuterGetTuple
 *
 *		get the next outer tuple for hashjoin: either by
- *		executing a plan node in the first pass, or from
+ *		executing the outer plan node in the first pass, or from
 *		the temp files for the hashjoin batches.
 *
- * Returns a null slot if no more outer tuples.  On success, the tuple's
+ * Returns a null slot if no more outer tuples (within the current batch).
- * hash value is stored at *hashvalue --- this is either originally computed,
+ *
- * or re-read from the temp file.
+ * On success, the tuple's hash value is stored at *hashvalue --- this is
+ * either originally computed, or re-read from the temp file.
 */
 static TupleTableSlot *
 ExecHashJoinOuterGetTuple(PlanState *outerNode,
@@ -572,7 +654,7 @@ ExecHashJoinOuterGetTuple(PlanState *outerNode,
 			if (ExecHashGetHashValue(hashtable, econtext,
 									 hjstate->hj_OuterHashKeys,
 									 true,		/* outer tuple */
-									 HASHJOIN_IS_OUTER(hjstate),
+									 HJ_FILL_OUTER(hjstate),
 									 hashvalue))
 			{
 				/* remember outer relation is not empty for possible rescan */
@@ -587,31 +669,27 @@ ExecHashJoinOuterGetTuple(PlanState *outerNode,
 			 */
 			slot = ExecProcNode(outerNode);
 		}
+	}
+	else if (curbatch < hashtable->nbatch)
+	{
+		BufFile    *file = hashtable->outerBatchFile[curbatch];
 		/*
-		 * We have just reached the end of the first pass. Try to switch to a
+		 * In outer-join cases, we could get here even though the batch file
-		 * saved batch.
+		 * is empty.
 		 */
-		curbatch = ExecHashJoinNewBatch(hjstate);
+		if (file == NULL)
-	}
+			return NULL;
-	/*
-	 * Try to read from a temp file. Loop allows us to advance to new batches
-	 * as needed.  NOTE: nbatch could increase inside ExecHashJoinNewBatch, so
-	 * don't try to optimize this loop.
-	 */
-	while (curbatch < hashtable->nbatch)
-	{
 		slot = ExecHashJoinGetSavedTuple(hjstate,
-										 hashtable->outerBatchFile[curbatch],
+										 file,
 										 hashvalue,
 										 hjstate->hj_OuterTupleSlot);
 		if (!TupIsNull(slot))
 			return slot;
-		curbatch = ExecHashJoinNewBatch(hjstate);
 	}
-	/* Out of batches... */
+	/* End of this batch */
 	return NULL;
 }
@@ -619,10 +697,9 @@ ExecHashJoinOuterGetTuple(PlanState *outerNode,
 * ExecHashJoinNewBatch
 *		switch to a new hashjoin batch
 *
- * Returns the number of the new batch (1..nbatch-1), or nbatch if no more.
+ * Returns true if successful, false if there are no more batches.
- * We will never return a batch number that has an empty outer batch file.
 */
-static int
+static bool
 ExecHashJoinNewBatch(HashJoinState *hjstate)
 {
 	HashJoinTable hashtable = hjstate->hj_HashTable;
@@ -632,7 +709,6 @@ ExecHashJoinNewBatch(HashJoinState *hjstate)
 	TupleTableSlot *slot;
 	uint32		hashvalue;
-start_over:
 	nbatch = hashtable->nbatch;
 	curbatch = hashtable->curbatch;
@@ -657,6 +733,7 @@ start_over:
 		hashtable->skewEnabled = false;
 		hashtable->skewBucket = NULL;
 		hashtable->skewBucketNums = NULL;
+		hashtable->nSkewBuckets = 0;
 		hashtable->spaceUsedSkew = 0;
 	}
@@ -665,8 +742,9 @@ start_over:
 	 * sides.  We can sometimes skip over batches that are empty on only one
 	 * side, but there are exceptions:
 	 *
-	 * 1. In an outer join, we have to process outer batches even if the inner
+	 * 1. In a left/full outer join, we have to process outer batches even if
-	 * batch is empty.
+	 * the inner batch is empty.  Similarly, in a right/full outer join, we
+	 * have to process inner batches even if the outer batch is empty.
 	 *
 	 * 2. If we have increased nbatch since the initial estimate, we have to
 	 * scan inner batches since they might contain tuples that need to be
@@ -682,7 +760,10 @@ start_over:
 			hashtable->innerBatchFile[curbatch] == NULL))
 	{
 		if (hashtable->outerBatchFile[curbatch] &&
-			HASHJOIN_IS_OUTER(hjstate))
+			HJ_FILL_OUTER(hjstate))
+			break;				/* must process due to rule 1 */
+		if (hashtable->innerBatchFile[curbatch] &&
+			HJ_FILL_INNER(hjstate))
 			break;				/* must process due to rule 1 */
 		if (hashtable->innerBatchFile[curbatch] &&
 			nbatch != hashtable->nbatch_original)
@@ -702,7 +783,7 @@ start_over:
 	}
 	if (curbatch >= nbatch)
-		return curbatch;		/* no more batches */
+		return false;		/* no more batches */
 	hashtable->curbatch = curbatch;
@@ -741,20 +822,17 @@ start_over:
 	}
 	/*
-	 * If there's no outer batch file, advance to next batch.
+	 * Rewind outer batch file (if present), so that we can start reading it.
 	 */
-	if (hashtable->outerBatchFile[curbatch] == NULL)
+	if (hashtable->outerBatchFile[curbatch] != NULL)
-		goto start_over;
+	{
+		if (BufFileSeek(hashtable->outerBatchFile[curbatch], 0, 0L, SEEK_SET))
-	/*
+			ereport(ERROR,
-	 * Rewind outer batch file, so that we can start reading it.
+					(errcode_for_file_access(),
-	 */
+					 errmsg("could not rewind hash-join temporary file: %m")));
-	if (BufFileSeek(hashtable->outerBatchFile[curbatch], 0, 0L, SEEK_SET))
+	}
-		ereport(ERROR,
-				(errcode_for_file_access(),
-				 errmsg("could not rewind hash-join temporary file: %m")));
-	return curbatch;
+	return true;
 }
 /*
@@ -857,9 +935,16 @@ ExecReScanHashJoin(HashJoinState *node)
 			node->js.ps.righttree->chgParam == NULL)
 		{
 			/*
-			 * okay to reuse the hash table; needn't rescan inner, either.
+			 * Okay to reuse the hash table; needn't rescan inner, either.
 			 *
-			 * What we do need to do is reset our state about the emptiness of
+			 * However, if it's a right/full join, we'd better reset the
+			 * inner-tuple match flags contained in the table.
+			 */
+			if (HJ_FILL_INNER(node))
+				ExecHashTableResetMatchFlags(node->hj_HashTable);
+			/*
+			 * Also, we need to reset our state about the emptiness of
 			 * the outer relation, so that the new scan of the outer will
 			 * update it correctly if it turns out to be empty this time.
 			 * (There's no harm in clearing it now because ExecHashJoin won't
@@ -869,12 +954,16 @@ ExecReScanHashJoin(HashJoinState *node)
 			 * through.)
 			 */
 			node->hj_OuterNotEmpty = false;
+			/* ExecHashJoin can skip the BUILD_HASHTABLE step */
+			node->hj_JoinState = HJ_NEED_NEW_OUTER;
 		}
 		else
 		{
 			/* must destroy and rebuild hash table */
 			ExecHashTableDestroy(node->hj_HashTable);
 			node->hj_HashTable = NULL;
+			node->hj_JoinState = HJ_BUILD_HASHTABLE;
 			/*
 			 * if chgParam of subnode is not null then plan will be re-scanned
@@ -892,7 +981,6 @@ ExecReScanHashJoin(HashJoinState *node)
 	node->hj_CurTuple = NULL;
 	node->js.ps.ps_TupFromTlist = false;
-	node->hj_NeedNewOuter = true;
 	node->hj_MatchedOuter = false;
 	node->hj_FirstOuterTupleSlot = NULL;

--- a/src/backend/optimizer/path/equivclass.c
+++ b/src/backend/optimizer/path/equivclass.c
@@ -889,8 +889,8 @@ generate_join_implied_equalities_normal(PlannerInfo *root,
 	 * combination for which an opfamily member operator exists.  If we have
 	 * choices, we prefer simple Var members (possibly with RelabelType) since
 	 * these are (a) cheapest to compute at runtime and (b) most likely to
-	 * have useful statistics.	Also, if enable_hashjoin is on, we prefer
+	 * have useful statistics. Also, prefer operators that are also
-	 * operators that are also hashjoinable.
+	 * hashjoinable.
 	 */
 	if (outer_members && inner_members)
 	{
@@ -925,8 +925,7 @@ generate_join_implied_equalities_normal(PlannerInfo *root,
 					(IsA(inner_em->em_expr, RelabelType) &&
 					 IsA(((RelabelType *) inner_em->em_expr)->arg, Var)))
 					score++;
-				if (!enable_hashjoin ||
+				if (op_hashjoinable(eq_op,
-					op_hashjoinable(eq_op,
 									exprType((Node *) outer_em->em_expr)))
 					score++;
 				if (score > best_score)

--- a/src/backend/optimizer/path/joinpath.c
+++ b/src/backend/optimizer/path/joinpath.c
@@ -41,7 +41,8 @@ static List *select_mergejoin_clauses(PlannerInfo *root,
 						 RelOptInfo *outerrel,
 						 RelOptInfo *innerrel,
 						 List *restrictlist,
-						 JoinType jointype);
+						 JoinType jointype,
+						 bool *have_nonmergeable_clause);
 /*
@@ -77,12 +78,13 @@ add_paths_to_joinrel(PlannerInfo *root,
 					 List *restrictlist)
 {
 	List	   *mergeclause_list = NIL;
+	bool		have_nonmergeable_clause = false;
 	/*
 	 * Find potential mergejoin clauses.  We can skip this if we are not
-	 * interested in doing a mergejoin.  However, mergejoin is currently our
+	 * interested in doing a mergejoin.  However, mergejoin may be our only
-	 * only way of implementing full outer joins, so override mergejoin
+	 * way of implementing a full outer join, so override enable_mergejoin if
-	 * disable if it's a full join.
+	 * it's a full join.
 	 */
 	if (enable_mergejoin || jointype == JOIN_FULL)
 		mergeclause_list = select_mergejoin_clauses(root,
@@ -90,22 +92,27 @@ add_paths_to_joinrel(PlannerInfo *root,
 													outerrel,
 													innerrel,
 													restrictlist,
-													jointype);
+													jointype,
+													&have_nonmergeable_clause);
 	/*
 	 * 1. Consider mergejoin paths where both relations must be explicitly
-	 * sorted.
+	 * sorted.  Skip this if we can't mergejoin.
 	 */
-	sort_inner_and_outer(root, joinrel, outerrel, innerrel,
+	if (!have_nonmergeable_clause)
-						 restrictlist, mergeclause_list, jointype, sjinfo);
+		sort_inner_and_outer(root, joinrel, outerrel, innerrel,
+							 restrictlist, mergeclause_list, jointype, sjinfo);
 	/*
 	 * 2. Consider paths where the outer relation need not be explicitly
 	 * sorted. This includes both nestloops and mergejoins where the outer
-	 * path is already ordered.
+	 * path is already ordered.  Again, skip this if we can't mergejoin.
+	 * (That's okay because we know that nestloop can't handle right/full
+	 * joins at all, so it wouldn't work in those cases either.)
 	 */
-	match_unsorted_outer(root, joinrel, outerrel, innerrel,
+	if (!have_nonmergeable_clause)
-						 restrictlist, mergeclause_list, jointype, sjinfo);
+		match_unsorted_outer(root, joinrel, outerrel, innerrel,
+							 restrictlist, mergeclause_list, jointype, sjinfo);
 #ifdef NOT_USED
@@ -120,15 +127,17 @@ add_paths_to_joinrel(PlannerInfo *root,
 	 * those made by match_unsorted_outer when add_paths_to_joinrel() is
 	 * invoked with the two rels given in the other order.
 	 */
-	match_unsorted_inner(root, joinrel, outerrel, innerrel,
+	if (!have_nonmergeable_clause)
-						 restrictlist, mergeclause_list, jointype, sjinfo);
+		match_unsorted_inner(root, joinrel, outerrel, innerrel,
+							 restrictlist, mergeclause_list, jointype, sjinfo);
 #endif
 	/*
 	 * 4. Consider paths where both outer and inner relations must be hashed
-	 * before being joined.
+	 * before being joined.  As above, disregard enable_hashjoin for full
+	 * joins, because there may be no other alternative.
 	 */
-	if (enable_hashjoin)
+	if (enable_hashjoin || jointype == JOIN_FULL)
 		hash_inner_and_outer(root, joinrel, outerrel, innerrel,
 							 restrictlist, jointype, sjinfo);
 }
@@ -189,37 +198,11 @@ sort_inner_and_outer(PlannerInfo *root,
 					 JoinType jointype,
 					 SpecialJoinInfo *sjinfo)
 {
-	bool		useallclauses;
 	Path	   *outer_path;
 	Path	   *inner_path;
 	List	   *all_pathkeys;
 	ListCell   *l;
-	/*
-	 * If we are doing a right or full join, we must use *all* the
-	 * mergeclauses as join clauses, else we will not have a valid plan.
-	 */
-	switch (jointype)
-	{
-		case JOIN_INNER:
-		case JOIN_LEFT:
-		case JOIN_SEMI:
-		case JOIN_ANTI:
-		case JOIN_UNIQUE_OUTER:
-		case JOIN_UNIQUE_INNER:
-			useallclauses = false;
-			break;
-		case JOIN_RIGHT:
-		case JOIN_FULL:
-			useallclauses = true;
-			break;
-		default:
-			elog(ERROR, "unrecognized join type: %d",
-				 (int) jointype);
-			useallclauses = false;		/* keep compiler quiet */
-			break;
-	}
 	/*
 	 * We only consider the cheapest-total-cost input paths, since we are
 	 * assuming here that a sort is required.  We will consider
@@ -390,9 +373,9 @@ match_unsorted_outer(PlannerInfo *root,
 	/*
 	 * Nestloop only supports inner, left, semi, and anti joins.  Also, if we
-	 * are doing a right or full join, we must use *all* the mergeclauses as
+	 * are doing a right or full mergejoin, we must use *all* the mergeclauses
-	 * join clauses, else we will not have a valid plan.  (Although these two
+	 * as join clauses, else we will not have a valid plan.  (Although these
-	 * flags are currently inverses, keep them separate for clarity and
+	 * two flags are currently inverses, keep them separate for clarity and
 	 * possible future changes.)
 	 */
 	switch (jointype)
@@ -574,8 +557,8 @@ match_unsorted_outer(PlannerInfo *root,
 		 * Special corner case: for "x FULL JOIN y ON true", there will be no
 		 * join clauses at all.  Ordinarily we'd generate a clauseless
 		 * nestloop path, but since mergejoin is our only join type that
-		 * supports FULL JOIN, it's necessary to generate a clauseless
+		 * supports FULL JOIN without any join clauses, it's necessary to
-		 * mergejoin path instead.
+		 * generate a clauseless mergejoin path instead.
 		 */
 		if (mergeclauses == NIL)
 		{
@@ -781,29 +764,10 @@ hash_inner_and_outer(PlannerInfo *root,
 					 JoinType jointype,
 					 SpecialJoinInfo *sjinfo)
 {
-	bool		isouterjoin;
+	bool		isouterjoin = IS_OUTER_JOIN(jointype);
 	List	   *hashclauses;
 	ListCell   *l;
-	/*
-	 * Hashjoin only supports inner, left, semi, and anti joins.
-	 */
-	switch (jointype)
-	{
-		case JOIN_INNER:
-		case JOIN_SEMI:
-		case JOIN_UNIQUE_OUTER:
-		case JOIN_UNIQUE_INNER:
-			isouterjoin = false;
-			break;
-		case JOIN_LEFT:
-		case JOIN_ANTI:
-			isouterjoin = true;
-			break;
-		default:
-			return;
-	}
 	/*
 	 * We need to build only one hashpath for any given pair of outer and
 	 * inner relations; all of the hashable clauses will be used as keys.
@@ -963,6 +927,11 @@ best_appendrel_indexscan(PlannerInfo *root, RelOptInfo *rel,
 *	  Select mergejoin clauses that are usable for a particular join.
 *	  Returns a list of RestrictInfo nodes for those clauses.
 *
+ * *have_nonmergeable_clause is set TRUE if this is a right/full join and
+ * there are nonmergejoinable join clauses.  The executor's mergejoin
+ * machinery cannot handle such cases, so we have to avoid generating a
+ * mergejoin plan.
+ *
 * We also mark each selected RestrictInfo to show which side is currently
 * being considered as outer.  These are transient markings that are only
 * good for the duration of the current add_paths_to_joinrel() call!
@@ -977,13 +946,15 @@ select_mergejoin_clauses(PlannerInfo *root,
 						 RelOptInfo *outerrel,
 						 RelOptInfo *innerrel,
 						 List *restrictlist,
-						 JoinType jointype)
+						 JoinType jointype,
+						 bool *have_nonmergeable_clause)
 {
 	List	   *result_list = NIL;
 	bool		isouterjoin = IS_OUTER_JOIN(jointype);
-	bool		have_nonmergeable_joinclause = false;
 	ListCell   *l;
+	*have_nonmergeable_clause = false;
 	foreach(l, restrictlist)
 	{
 		RestrictInfo *restrictinfo = (RestrictInfo *) lfirst(l);
@@ -991,7 +962,7 @@ select_mergejoin_clauses(PlannerInfo *root,
 		/*
 		 * If processing an outer join, only use its own join clauses in the
 		 * merge.  For inner joins we can use pushed-down clauses too. (Note:
-		 * we don't set have_nonmergeable_joinclause here because pushed-down
+		 * we don't set have_nonmergeable_clause here because pushed-down
 		 * clauses will become otherquals not joinquals.)
 		 */
 		if (isouterjoin && restrictinfo->is_pushed_down)
@@ -1008,7 +979,7 @@ select_mergejoin_clauses(PlannerInfo *root,
 			 * FALSE.)
 			 */
 			if (!restrictinfo->clause || !IsA(restrictinfo->clause, Const))
-				have_nonmergeable_joinclause = true;
+				*have_nonmergeable_clause = true;
 			continue;			/* not mergejoinable */
 		}
@@ -1017,7 +988,7 @@ select_mergejoin_clauses(PlannerInfo *root,
 		 */
 		if (!clause_sides_match_join(restrictinfo, outerrel, innerrel))
 		{
-			have_nonmergeable_joinclause = true;
+			*have_nonmergeable_clause = true;
 			continue;			/* no good for these input relations */
 		}
@@ -1046,7 +1017,7 @@ select_mergejoin_clauses(PlannerInfo *root,
 		if (EC_MUST_BE_REDUNDANT(restrictinfo->left_ec) ||
 			EC_MUST_BE_REDUNDANT(restrictinfo->right_ec))
 		{
-			have_nonmergeable_joinclause = true;
+			*have_nonmergeable_clause = true;
 			continue;			/* can't handle redundant eclasses */
 		}
@@ -1054,27 +1025,19 @@ select_mergejoin_clauses(PlannerInfo *root,
 	}
 	/*
-	 * If it is a right/full join then *all* the explicit join clauses must be
+	 * If it is not a right/full join then we don't need to insist on all the
-	 * mergejoinable, else the executor will fail. If we are asked for a right
+	 * joinclauses being mergejoinable, so reset the flag.  This simplifies
-	 * join then just return NIL to indicate no mergejoin is possible (we can
+	 * the logic in add_paths_to_joinrel.
-	 * handle it as a left join instead). If we are asked for a full join then
-	 * emit an error, because there is no fallback.
 	 */
-	if (have_nonmergeable_joinclause)
+	switch (jointype)
 	{
-		switch (jointype)
+		case JOIN_RIGHT:
-		{
+		case JOIN_FULL:
-			case JOIN_RIGHT:
+			break;
-				return NIL;		/* not mergejoinable */
+		default:
-			case JOIN_FULL:
+			/* otherwise, it's OK to have nonmergeable join quals */
-				ereport(ERROR,
+			*have_nonmergeable_clause = false;
-						(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+			break;
-						 errmsg("FULL JOIN is only supported with merge-joinable join conditions")));
-				break;
-			default:
-				/* otherwise, it's OK to have nonmergeable join quals */
-				break;
-		}
 	}
 	return result_list;

--- a/src/backend/optimizer/path/joinrels.c
+++ b/src/backend/optimizer/path/joinrels.c
@@ -658,6 +658,18 @@ make_join_rel(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2)
 			add_paths_to_joinrel(root, joinrel, rel2, rel1,
 								 JOIN_FULL, sjinfo,
 								 restrictlist);
+			/*
+			 * If there are join quals that aren't mergeable or hashable, we
+			 * may not be able to build any valid plan.  Complain here so that
+			 * we can give a somewhat-useful error message.  (Since we have no
+			 * flexibility of planning for a full join, there's no chance of
+			 * succeeding later with another pair of input rels.)
+			 */
+			if (joinrel->pathlist == NIL)
+				ereport(ERROR,
+						(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+						 errmsg("FULL JOIN is only supported with merge-joinable or hash-joinable join conditions")));
 			break;
 		case JOIN_SEMI:

--- a/src/backend/optimizer/plan/initsplan.c
+++ b/src/backend/optimizer/plan/initsplan.c
@@ -1328,10 +1328,9 @@ distribute_restrictinfo_to_rels(PlannerInfo *root,
 			/*
 			 * Check for hashjoinable operators.  (We don't bother setting the
-			 * hashjoin info if we're not going to need it.)
+			 * hashjoin info except in true join clauses.)
 			 */
-			if (enable_hashjoin)
+			check_hashjoinable(restrictinfo);
-				check_hashjoinable(restrictinfo);
 			/*
 			 * Add clause to the join lists of all the relevant relations.
@@ -1458,10 +1457,9 @@ build_implied_join_equality(Oid opno,
 									 qualscope, /* required_relids */
 									 NULL);		/* nullable_relids */
-	/* Set mergejoinability info always, and hashjoinability if enabled */
+	/* Set mergejoinability/hashjoinability flags */
 	check_mergejoinable(restrictinfo);
-	if (enable_hashjoin)
+	check_hashjoinable(restrictinfo);
-		check_hashjoinable(restrictinfo);
 	return restrictinfo;
 }

--- a/src/include/access/htup.h
+++ b/src/include/access/htup.h
@@ -195,6 +195,14 @@ typedef HeapTupleHeaderData *HeapTupleHeader;
 #define HEAP2_XACT_MASK			0xC000	/* visibility-related bits */
+/*
+ * HEAP_TUPLE_HAS_MATCH is a temporary flag used during hash joins.  It is
+ * only used in tuples that are in the hash table, and those don't need
+ * any visibility information, so we can overlay it on a visibility flag
+ * instead of using up a dedicated bit.
+ */
+#define HEAP_TUPLE_HAS_MATCH	HEAP_ONLY_TUPLE	/* tuple has a join match */
 /*
 * HeapTupleHeader accessor macros
 *
@@ -343,6 +351,21 @@ do { \
  (tup)->t_infomask2 &= ~HEAP_ONLY_TUPLE \
 )
+#define HeapTupleHeaderHasMatch(tup) \
+( \
+  (tup)->t_infomask2 & HEAP_TUPLE_HAS_MATCH \
+)
+#define HeapTupleHeaderSetMatch(tup) \
+( \
+  (tup)->t_infomask2 |= HEAP_TUPLE_HAS_MATCH \
+)
+#define HeapTupleHeaderClearMatch(tup) \
+( \
+  (tup)->t_infomask2 &= ~HEAP_TUPLE_HAS_MATCH \
+)
 #define HeapTupleHeaderGetNatts(tup) \
 	((tup)->t_infomask2 & HEAP_NATTS_MASK)

--- a/src/include/executor/hashjoin.h
+++ b/src/include/executor/hashjoin.h
@@ -113,6 +113,8 @@ typedef struct HashJoinTableData
 	struct HashJoinTupleData **buckets;
 	/* buckets array is per-batch storage, as are all the tuples */
+	bool		keepNulls;		/* true to store unmatchable NULL tuples */
 	bool		skewEnabled;	/* are we using skew optimization? */
 	HashSkewBucket **skewBucket;	/* hashtable of skew buckets */
 	int			skewBucketLen;	/* size of skewBucket array (a power of 2!) */

--- a/src/include/executor/nodeHash.h
+++ b/src/include/executor/nodeHash.h
@@ -22,7 +22,8 @@ extern Node *MultiExecHash(HashState *node);
 extern void ExecEndHash(HashState *node);
 extern void ExecReScanHash(HashState *node);
-extern HashJoinTable ExecHashTableCreate(Hash *node, List *hashOperators);
+extern HashJoinTable ExecHashTableCreate(Hash *node, List *hashOperators,
+										 bool keepNulls);
 extern void ExecHashTableDestroy(HashJoinTable hashtable);
 extern void ExecHashTableInsert(HashJoinTable hashtable,
 					TupleTableSlot *slot,
@@ -37,9 +38,12 @@ extern void ExecHashGetBucketAndBatch(HashJoinTable hashtable,
 						  uint32 hashvalue,
 						  int *bucketno,
 						  int *batchno);
-extern HashJoinTuple ExecScanHashBucket(HashJoinState *hjstate,
+extern bool ExecScanHashBucket(HashJoinState *hjstate, ExprContext *econtext);
-				   ExprContext *econtext);
+extern void ExecPrepHashTableForUnmatched(HashJoinState *hjstate);
+extern bool ExecScanHashTableForUnmatched(HashJoinState *hjstate,
+										  ExprContext *econtext);
 extern void ExecHashTableReset(HashJoinTable hashtable);
+extern void ExecHashTableResetMatchFlags(HashJoinTable hashtable);
 extern void ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 						int *numbuckets,
 						int *numbatches,

--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -1468,6 +1468,10 @@ typedef struct MergeJoinState
 /* ----------------
 *	 HashJoinState information
 *
+ *		hashclauses				original form of the hashjoin condition
+ *		hj_OuterHashKeys		the outer hash keys in the hashjoin condition
+ *		hj_InnerHashKeys		the inner hash keys in the hashjoin condition
+ *		hj_HashOperators		the join operators in the hashjoin condition
 *		hj_HashTable			hash table for the hashjoin
 *								(NULL if table not built yet)
 *		hj_CurHashValue			hash value for current outer tuple
@@ -1477,14 +1481,12 @@ typedef struct MergeJoinState
 *								tuple, or NULL if starting search
 *								(hj_CurXXX variables are undefined if
 *								OuterTupleSlot is empty!)
- *		hj_OuterHashKeys		the outer hash keys in the hashjoin condition
- *		hj_InnerHashKeys		the inner hash keys in the hashjoin condition
- *		hj_HashOperators		the join operators in the hashjoin condition
 *		hj_OuterTupleSlot		tuple slot for outer tuples
- *		hj_HashTupleSlot		tuple slot for hashed tuples
+ *		hj_HashTupleSlot		tuple slot for inner (hashed) tuples
- *		hj_NullInnerTupleSlot	prepared null tuple for left outer joins
+ *		hj_NullOuterTupleSlot	prepared null tuple for right/full outer joins
+ *		hj_NullInnerTupleSlot	prepared null tuple for left/full outer joins
 *		hj_FirstOuterTupleSlot	first tuple retrieved from outer plan
- *		hj_NeedNewOuter			true if need new outer tuple on next call
+ *		hj_JoinState			current state of ExecHashJoin state machine
 *		hj_MatchedOuter			true if found a join match for current outer
 *		hj_OuterNotEmpty		true if outer relation known not empty
 * ----------------
@@ -1498,19 +1500,20 @@ typedef struct HashJoinState
 {
 	JoinState	js;				/* its first field is NodeTag */
 	List	   *hashclauses;	/* list of ExprState nodes */
+	List	   *hj_OuterHashKeys;		/* list of ExprState nodes */
+	List	   *hj_InnerHashKeys;		/* list of ExprState nodes */
+	List	   *hj_HashOperators;		/* list of operator OIDs */
 	HashJoinTable hj_HashTable;
 	uint32		hj_CurHashValue;
 	int			hj_CurBucketNo;
 	int			hj_CurSkewBucketNo;
 	HashJoinTuple hj_CurTuple;
-	List	   *hj_OuterHashKeys;		/* list of ExprState nodes */
-	List	   *hj_InnerHashKeys;		/* list of ExprState nodes */
-	List	   *hj_HashOperators;		/* list of operator OIDs */
 	TupleTableSlot *hj_OuterTupleSlot;
 	TupleTableSlot *hj_HashTupleSlot;
+	TupleTableSlot *hj_NullOuterTupleSlot;
 	TupleTableSlot *hj_NullInnerTupleSlot;
 	TupleTableSlot *hj_FirstOuterTupleSlot;
-	bool		hj_NeedNewOuter;
+	int			hj_JoinState;
 	bool		hj_MatchedOuter;
 	bool		hj_OuterNotEmpty;
 } HashJoinState;