Commit e5a11a88 authored by Tom Lane

Improve hash method for bitmapsets: some examination of actual outputs
shows that adding a circular shift between words greatly improves the
distribution of hash outputs.
parent 1f01d59e
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
* Copyright (c) 2003-2005, PostgreSQL Global Development Group * Copyright (c) 2003-2005, PostgreSQL Global Development Group
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/nodes/bitmapset.c,v 1.8 2005/06/08 23:02:04 tgl Exp $ * $PostgreSQL: pgsql/src/backend/nodes/bitmapset.c,v 1.9 2005/06/15 16:24:07 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -769,22 +769,36 @@ bms_first_member(Bitmapset *a) ...@@ -769,22 +769,36 @@ bms_first_member(Bitmapset *a)
*
* Note: we must ensure that any two bitmapsets that are bms_equal() will
* hash to the same value; in practice this means that trailing all-zero
* words cannot affect the result.  The circular-shift-and-XOR hash method
* used here has this property, so long as we work from back to front.
*
* Note: you might wonder why we bother with the circular shift; at first
* glance a straight longitudinal XOR seems as good and much simpler.  The
* reason is empirical: this gives a better distribution of hash values on
* the bitmapsets actually generated by the planner.  A common way to have
* multiword bitmapsets is "a JOIN b JOIN c JOIN d ...", which gives rise
* to rangetables in which base tables and JOIN nodes alternate; so
* bitmapsets of base table RT indexes tend to use only odd-numbered or only
* even-numbered bits.  A straight longitudinal XOR would preserve this
* property, leading to a much smaller set of possible outputs than if
* we include a shift.
*/
uint32 uint32
bms_hash_value(const Bitmapset *a) bms_hash_value(const Bitmapset *a)
{ {
bitmapword result = 0; bitmapword result = 0;
int nwords;
int wordnum; int wordnum;
if (a == NULL) if (a == NULL || a->nwords <= 0)
return 0; /* All empty sets hash to 0 */ return 0; /* All empty sets hash to 0 */
nwords = a->nwords; for (wordnum = a->nwords; --wordnum > 0; )
for (wordnum = 0; wordnum < nwords; wordnum++)
{ {
result ^= a->words[wordnum]; result ^= a->words[wordnum];
if (result & ((bitmapword) 1 << (BITS_PER_BITMAPWORD - 1)))
result = (result << 1) | 1;
else
result = (result << 1);
} }
result ^= a->words[0];
return (uint32) result; return (uint32) result;
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment