Commit c9a1cc69 authored by Tom Lane

Change hash index creation so that rather than always establishing exactly
two buckets at the start, we create a number of buckets appropriate for the
estimated size of the table.  This avoids a lot of expensive bucket-split
actions during initial index build on an already-populated table.

This is one of the two core ideas of Tom Raney and Shreya Bhargava's patch
to reduce hash index build time.  I'm committing it separately to make it
easier for people to test the effects of this separately from the effects
of their other core idea (pre-sorting the index entries by bucket number).
parent 4873c96f
$PostgreSQL: pgsql/src/backend/access/hash/README,v 1.6 2007/04/19 20:24:04 tgl Exp $ $PostgreSQL: pgsql/src/backend/access/hash/README,v 1.7 2008/03/15 20:46:31 tgl Exp $
This directory contains an implementation of hash indexing for Postgres. Most This directory contains an implementation of hash indexing for Postgres. Most
of the core ideas are taken from Margo Seltzer and Ozan Yigit, A New Hashing of the core ideas are taken from Margo Seltzer and Ozan Yigit, A New Hashing
...@@ -65,6 +65,11 @@ hashm_spares[N] <= hashm_spares[N+1], since the latter count includes the ...@@ -65,6 +65,11 @@ hashm_spares[N] <= hashm_spares[N+1], since the latter count includes the
former. The difference between the two represents the number of overflow former. The difference between the two represents the number of overflow
pages appearing between the bucket page groups of splitpoints N and N+1. pages appearing between the bucket page groups of splitpoints N and N+1.
(Note: the above describes what happens when filling an initially minimally
sized hash index. In practice, we try to estimate the required index size
and allocate a suitable number of splitpoints immediately, to avoid
expensive re-splitting during initial index build.)
When S splitpoints exist altogether, the array entries hashm_spares[0] When S splitpoints exist altogether, the array entries hashm_spares[0]
through hashm_spares[S] are valid; hashm_spares[S] records the current through hashm_spares[S] are valid; hashm_spares[S] records the current
total number of overflow pages. New overflow pages are created as needed total number of overflow pages. New overflow pages are created as needed
...@@ -101,9 +106,9 @@ includes the bitmap pages, which is the reason for saying that bitmap ...@@ -101,9 +106,9 @@ includes the bitmap pages, which is the reason for saying that bitmap
pages are a subset of the overflow pages. It turns out in fact that each pages are a subset of the overflow pages. It turns out in fact that each
bitmap page's first bit represents itself --- this is not an essential bitmap page's first bit represents itself --- this is not an essential
property, but falls out of the fact that we only allocate another bitmap property, but falls out of the fact that we only allocate another bitmap
page when we really need one. Bit number zero always corresponds to block page when we really need one. Bit number zero always corresponds to the
number 3, which is the first bitmap page and is allocated during index first bitmap page, which is allocated during index creation just after all
creation. the initially created buckets.
Lock definitions Lock definitions
......
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/hash/hash.c,v 1.98 2008/01/01 19:45:46 momjian Exp $ * $PostgreSQL: pgsql/src/backend/access/hash/hash.c,v 1.99 2008/03/15 20:46:31 tgl Exp $
* *
* NOTES * NOTES
* This file contains only the public interface routines. * This file contains only the public interface routines.
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
#include "access/hash.h" #include "access/hash.h"
#include "catalog/index.h" #include "catalog/index.h"
#include "commands/vacuum.h" #include "commands/vacuum.h"
#include "optimizer/plancat.h"
/* Working state for hashbuild and its callback */ /* Working state for hashbuild and its callback */
...@@ -48,6 +49,7 @@ hashbuild(PG_FUNCTION_ARGS) ...@@ -48,6 +49,7 @@ hashbuild(PG_FUNCTION_ARGS)
Relation index = (Relation) PG_GETARG_POINTER(1); Relation index = (Relation) PG_GETARG_POINTER(1);
IndexInfo *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2); IndexInfo *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2);
IndexBuildResult *result; IndexBuildResult *result;
BlockNumber relpages;
double reltuples; double reltuples;
HashBuildState buildstate; HashBuildState buildstate;
...@@ -59,8 +61,11 @@ hashbuild(PG_FUNCTION_ARGS) ...@@ -59,8 +61,11 @@ hashbuild(PG_FUNCTION_ARGS)
elog(ERROR, "index \"%s\" already contains data", elog(ERROR, "index \"%s\" already contains data",
RelationGetRelationName(index)); RelationGetRelationName(index));
/* initialize the hash index metadata page */ /* estimate the number of rows currently present in the table */
_hash_metapinit(index); estimate_rel_size(heap, NULL, &relpages, &reltuples);
/* initialize the hash index metadata page and initial buckets */
_hash_metapinit(index, reltuples);
/* build the index */ /* build the index */
buildstate.indtuples = 0; buildstate.indtuples = 0;
......
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.72 2008/01/01 19:45:46 momjian Exp $ * $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.73 2008/03/15 20:46:31 tgl Exp $
* *
* NOTES * NOTES
* Postgres hash pages look like ordinary relation pages. The opaque * Postgres hash pages look like ordinary relation pages. The opaque
...@@ -312,15 +312,17 @@ _hash_chgbufaccess(Relation rel, ...@@ -312,15 +312,17 @@ _hash_chgbufaccess(Relation rel,
/* /*
* _hash_metapinit() -- Initialize the metadata page of a hash index, * _hash_metapinit() -- Initialize the metadata page of a hash index,
* the two buckets that we begin with and the initial * the initial buckets, and the initial bitmap page.
* bitmap page. *
* The initial number of buckets is dependent on num_tuples, an estimate
* of the number of tuples to be loaded into the index initially.
* *
* We are fairly cavalier about locking here, since we know that no one else * We are fairly cavalier about locking here, since we know that no one else
* could be accessing this index. In particular the rule about not holding * could be accessing this index. In particular the rule about not holding
* multiple buffer locks is ignored. * multiple buffer locks is ignored.
*/ */
void void
_hash_metapinit(Relation rel) _hash_metapinit(Relation rel, double num_tuples)
{ {
HashMetaPage metap; HashMetaPage metap;
HashPageOpaque pageopaque; HashPageOpaque pageopaque;
...@@ -330,7 +332,10 @@ _hash_metapinit(Relation rel) ...@@ -330,7 +332,10 @@ _hash_metapinit(Relation rel)
int32 data_width; int32 data_width;
int32 item_width; int32 item_width;
int32 ffactor; int32 ffactor;
uint16 i; double dnumbuckets;
uint32 num_buckets;
uint32 log2_num_buckets;
uint32 i;
/* safety check */ /* safety check */
if (RelationGetNumberOfBlocks(rel) != 0) if (RelationGetNumberOfBlocks(rel) != 0)
...@@ -354,7 +359,26 @@ _hash_metapinit(Relation rel) ...@@ -354,7 +359,26 @@ _hash_metapinit(Relation rel)
ffactor = 10; ffactor = 10;
/* /*
* We initialize the metapage, the first two bucket pages, and the first * Choose the number of initial bucket pages to match the fill factor
* given the estimated number of tuples. We round up the result to the
* next power of 2, however, and always force at least 2 bucket pages.
* The upper limit is determined by considerations explained in
* _hash_expandtable().
*/
dnumbuckets = num_tuples / ffactor;
if (dnumbuckets <= 2.0)
num_buckets = 2;
else if (dnumbuckets >= (double) 0x40000000)
num_buckets = 0x40000000;
else
num_buckets = ((uint32) 1) << _hash_log2((uint32) dnumbuckets);
log2_num_buckets = _hash_log2(num_buckets);
Assert(num_buckets == (((uint32) 1) << log2_num_buckets));
Assert(log2_num_buckets < HASH_MAX_SPLITPOINTS);
/*
* We initialize the metapage, the first N bucket pages, and the first
* bitmap page in sequence, using _hash_getnewbuf to cause smgrextend() * bitmap page in sequence, using _hash_getnewbuf to cause smgrextend()
* calls to occur. This ensures that the smgr level has the right idea of * calls to occur. This ensures that the smgr level has the right idea of
* the physical index length. * the physical index length.
...@@ -398,23 +422,25 @@ _hash_metapinit(Relation rel) ...@@ -398,23 +422,25 @@ _hash_metapinit(Relation rel)
metap->hashm_procid = index_getprocid(rel, 1, HASHPROC); metap->hashm_procid = index_getprocid(rel, 1, HASHPROC);
/* /*
* We initialize the index with two buckets, 0 and 1, occupying physical * We initialize the index with N buckets, 0 .. N-1, occupying physical
* blocks 1 and 2. The first freespace bitmap page is in block 3. * blocks 1 to N. The first freespace bitmap page is in block N+1.
* Since N is a power of 2, we can set the masks this way:
*/ */
metap->hashm_maxbucket = metap->hashm_lowmask = 1; /* nbuckets - 1 */ metap->hashm_maxbucket = metap->hashm_lowmask = num_buckets - 1;
metap->hashm_highmask = 3; /* (nbuckets << 1) - 1 */ metap->hashm_highmask = (num_buckets << 1) - 1;
MemSet(metap->hashm_spares, 0, sizeof(metap->hashm_spares)); MemSet(metap->hashm_spares, 0, sizeof(metap->hashm_spares));
MemSet(metap->hashm_mapp, 0, sizeof(metap->hashm_mapp)); MemSet(metap->hashm_mapp, 0, sizeof(metap->hashm_mapp));
metap->hashm_spares[1] = 1; /* the first bitmap page is only spare */ /* Set up mapping for one spare page after the initial splitpoints */
metap->hashm_ovflpoint = 1; metap->hashm_spares[log2_num_buckets] = 1;
metap->hashm_ovflpoint = log2_num_buckets;
metap->hashm_firstfree = 0; metap->hashm_firstfree = 0;
/* /*
* Initialize the first two buckets * Initialize the first N buckets
*/ */
for (i = 0; i <= 1; i++) for (i = 0; i < num_buckets; i++)
{ {
buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i)); buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i));
pg = BufferGetPage(buf); pg = BufferGetPage(buf);
...@@ -430,7 +456,7 @@ _hash_metapinit(Relation rel) ...@@ -430,7 +456,7 @@ _hash_metapinit(Relation rel)
/* /*
* Initialize first bitmap page * Initialize first bitmap page
*/ */
_hash_initbitmap(rel, metap, 3); _hash_initbitmap(rel, metap, num_buckets + 1);
/* all done */ /* all done */
_hash_wrtbuf(rel, metabuf); _hash_wrtbuf(rel, metabuf);
...@@ -511,6 +537,9 @@ _hash_expandtable(Relation rel, Buffer metabuf) ...@@ -511,6 +537,9 @@ _hash_expandtable(Relation rel, Buffer metabuf)
* index with 2^32 buckets would certainly overflow BlockNumber and hence * index with 2^32 buckets would certainly overflow BlockNumber and hence
* _hash_alloc_buckets() would fail, but if we supported buckets smaller * _hash_alloc_buckets() would fail, but if we supported buckets smaller
* than a disk block then this would be an independent constraint. * than a disk block then this would be an independent constraint.
*
* If you change this, see also the maximum initial number of buckets
* in _hash_metapinit().
*/ */
if (metap->hashm_maxbucket >= (uint32) 0x7FFFFFFE) if (metap->hashm_maxbucket >= (uint32) 0x7FFFFFFE)
goto fail; goto fail;
......
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/optimizer/util/plancat.c,v 1.140 2008/01/12 00:11:39 tgl Exp $ * $PostgreSQL: pgsql/src/backend/optimizer/util/plancat.c,v 1.141 2008/03/15 20:46:31 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -45,8 +45,6 @@ bool constraint_exclusion = false; ...@@ -45,8 +45,6 @@ bool constraint_exclusion = false;
get_relation_info_hook_type get_relation_info_hook = NULL; get_relation_info_hook_type get_relation_info_hook = NULL;
static void estimate_rel_size(Relation rel, int32 *attr_widths,
BlockNumber *pages, double *tuples);
static List *get_relation_constraints(Oid relationObjectId, RelOptInfo *rel, static List *get_relation_constraints(Oid relationObjectId, RelOptInfo *rel,
bool include_notnull); bool include_notnull);
...@@ -319,7 +317,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, ...@@ -319,7 +317,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
* relation's attr_width[] cache; we fill this in if we have need to compute * relation's attr_width[] cache; we fill this in if we have need to compute
* the attribute widths for estimation purposes. * the attribute widths for estimation purposes.
*/ */
static void void
estimate_rel_size(Relation rel, int32 *attr_widths, estimate_rel_size(Relation rel, int32 *attr_widths,
BlockNumber *pages, double *tuples) BlockNumber *pages, double *tuples)
{ {
......
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 1994, Regents of the University of California
* *
* $PostgreSQL: pgsql/src/include/access/hash.h,v 1.84 2008/01/01 19:45:56 momjian Exp $ * $PostgreSQL: pgsql/src/include/access/hash.h,v 1.85 2008/03/15 20:46:31 tgl Exp $
* *
* NOTES * NOTES
* modeled after Margo Seltzer's hash implementation for unix. * modeled after Margo Seltzer's hash implementation for unix.
...@@ -298,7 +298,7 @@ extern void _hash_dropbuf(Relation rel, Buffer buf); ...@@ -298,7 +298,7 @@ extern void _hash_dropbuf(Relation rel, Buffer buf);
extern void _hash_wrtbuf(Relation rel, Buffer buf); extern void _hash_wrtbuf(Relation rel, Buffer buf);
extern void _hash_chgbufaccess(Relation rel, Buffer buf, int from_access, extern void _hash_chgbufaccess(Relation rel, Buffer buf, int from_access,
int to_access); int to_access);
extern void _hash_metapinit(Relation rel); extern void _hash_metapinit(Relation rel, double num_tuples);
extern void _hash_pageinit(Page page, Size size); extern void _hash_pageinit(Page page, Size size);
extern void _hash_expandtable(Relation rel, Buffer metabuf); extern void _hash_expandtable(Relation rel, Buffer metabuf);
......
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 1994, Regents of the University of California
* *
* $PostgreSQL: pgsql/src/include/optimizer/plancat.h,v 1.47 2008/01/01 19:45:58 momjian Exp $ * $PostgreSQL: pgsql/src/include/optimizer/plancat.h,v 1.48 2008/03/15 20:46:31 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#define PLANCAT_H #define PLANCAT_H
#include "nodes/relation.h" #include "nodes/relation.h"
#include "utils/rel.h"
/* Hook for plugins to get control in get_relation_info() */ /* Hook for plugins to get control in get_relation_info() */
typedef void (*get_relation_info_hook_type) (PlannerInfo *root, typedef void (*get_relation_info_hook_type) (PlannerInfo *root,
...@@ -27,6 +28,9 @@ extern PGDLLIMPORT get_relation_info_hook_type get_relation_info_hook; ...@@ -27,6 +28,9 @@ extern PGDLLIMPORT get_relation_info_hook_type get_relation_info_hook;
extern void get_relation_info(PlannerInfo *root, Oid relationObjectId, extern void get_relation_info(PlannerInfo *root, Oid relationObjectId,
bool inhparent, RelOptInfo *rel); bool inhparent, RelOptInfo *rel);
extern void estimate_rel_size(Relation rel, int32 *attr_widths,
BlockNumber *pages, double *tuples);
extern bool relation_excluded_by_constraints(RelOptInfo *rel, extern bool relation_excluded_by_constraints(RelOptInfo *rel,
RangeTblEntry *rte); RangeTblEntry *rte);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment