Commit 569174f1 authored by Robert Haas's avatar Robert Haas

btree: Support parallel index scans.

This isn't exposed to the optimizer or the executor yet; we'll add
support for those things in a separate patch.  But this puts the
basic mechanism in place: several processes can attach to a parallel
btree index scan, and each one will get a subset of the tuples that
would have been produced by a non-parallel scan.  Each index page
becomes the responsibility of a single worker, which then returns
all of the TIDs on that page.

Rahila Syed, Amit Kapila, Robert Haas, reviewed and tested by
Anastasia Lubennikova, Tushar Ahuja, and Haribabu Kommi.
parent 8569955e
...@@ -1207,7 +1207,7 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser ...@@ -1207,7 +1207,7 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser
<entry>Waiting in an extension.</entry> <entry>Waiting in an extension.</entry>
</row> </row>
<row> <row>
<entry morerows="9"><literal>IPC</></entry> <entry morerows="10"><literal>IPC</></entry>
<entry><literal>BgWorkerShutdown</></entry> <entry><literal>BgWorkerShutdown</></entry>
<entry>Waiting for background worker to shut down.</entry> <entry>Waiting for background worker to shut down.</entry>
</row> </row>
...@@ -1215,6 +1215,10 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser ...@@ -1215,6 +1215,10 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser
<entry><literal>BgWorkerStartup</></entry> <entry><literal>BgWorkerStartup</></entry>
<entry>Waiting for background worker to start up.</entry> <entry>Waiting for background worker to start up.</entry>
</row> </row>
<row>
<entry><literal>BtreePage</></entry>
<entry>Waiting for the page number needed to continue a parallel btree scan to become available.</entry>
</row>
<row> <row>
<entry><literal>ExecuteGather</></entry> <entry><literal>ExecuteGather</></entry>
<entry>Waiting for activity from child process when executing <literal>Gather</> node.</entry> <entry>Waiting for activity from child process when executing <literal>Gather</> node.</entry>
......
...@@ -23,6 +23,8 @@ ...@@ -23,6 +23,8 @@
#include "access/xlog.h" #include "access/xlog.h"
#include "catalog/index.h" #include "catalog/index.h"
#include "commands/vacuum.h" #include "commands/vacuum.h"
#include "pgstat.h"
#include "storage/condition_variable.h"
#include "storage/indexfsm.h" #include "storage/indexfsm.h"
#include "storage/ipc.h" #include "storage/ipc.h"
#include "storage/lmgr.h" #include "storage/lmgr.h"
...@@ -63,6 +65,45 @@ typedef struct ...@@ -63,6 +65,45 @@ typedef struct
MemoryContext pagedelcontext; MemoryContext pagedelcontext;
} BTVacState; } BTVacState;
/*
* BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
*
* BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
* a new page; others must wait.
*
* BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
* to a new page; some process can start doing that.
*
* BTPARALLEL_DONE indicates that the scan is complete (including error exit).
* We reach this state once for every distinct combination of array keys.
*/
typedef enum
{
BTPARALLEL_NOT_INITIALIZED,
BTPARALLEL_ADVANCING,
BTPARALLEL_IDLE,
BTPARALLEL_DONE
} BTPS_State;
/*
* BTParallelScanDescData contains btree specific shared information required
* for parallel scan.
*/
typedef struct BTParallelScanDescData
{
BlockNumber btps_scanPage; /* latest or next page to be scanned */
BTPS_State btps_pageStatus;/* indicates whether next page is available
* for scan. see above for possible states of
* parallel scan. */
int btps_arrayKeyCount; /* count indicating number of array
* scan keys processed by parallel
* scan */
slock_t btps_mutex; /* protects above variables */
ConditionVariable btps_cv; /* used to synchronize parallel scan */
} BTParallelScanDescData;
typedef struct BTParallelScanDescData *BTParallelScanDesc;
static void btbuildCallback(Relation index, static void btbuildCallback(Relation index,
HeapTuple htup, HeapTuple htup,
...@@ -118,9 +159,9 @@ bthandler(PG_FUNCTION_ARGS) ...@@ -118,9 +159,9 @@ bthandler(PG_FUNCTION_ARGS)
amroutine->amendscan = btendscan; amroutine->amendscan = btendscan;
amroutine->ammarkpos = btmarkpos; amroutine->ammarkpos = btmarkpos;
amroutine->amrestrpos = btrestrpos; amroutine->amrestrpos = btrestrpos;
amroutine->amestimateparallelscan = NULL; amroutine->amestimateparallelscan = btestimateparallelscan;
amroutine->aminitparallelscan = NULL; amroutine->aminitparallelscan = btinitparallelscan;
amroutine->amparallelrescan = NULL; amroutine->amparallelrescan = btparallelrescan;
PG_RETURN_POINTER(amroutine); PG_RETURN_POINTER(amroutine);
} }
...@@ -491,6 +532,7 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, ...@@ -491,6 +532,7 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
} }
so->markItemIndex = -1; so->markItemIndex = -1;
so->arrayKeyCount = 0;
BTScanPosUnpinIfPinned(so->markPos); BTScanPosUnpinIfPinned(so->markPos);
BTScanPosInvalidate(so->markPos); BTScanPosInvalidate(so->markPos);
...@@ -652,6 +694,217 @@ btrestrpos(IndexScanDesc scan) ...@@ -652,6 +694,217 @@ btrestrpos(IndexScanDesc scan)
} }
} }
/*
* btestimateparallelscan -- estimate storage for BTParallelScanDescData
*/
Size
btestimateparallelscan(void)
{
return sizeof(BTParallelScanDescData);
}
/*
* btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
*/
void
btinitparallelscan(void *target)
{
BTParallelScanDesc bt_target = (BTParallelScanDesc) target;
SpinLockInit(&bt_target->btps_mutex);
bt_target->btps_scanPage = InvalidBlockNumber;
bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
bt_target->btps_arrayKeyCount = 0;
ConditionVariableInit(&bt_target->btps_cv);
}
/*
* btparallelrescan() -- reset parallel scan
*/
void
btparallelrescan(IndexScanDesc scan)
{
BTParallelScanDesc btscan;
ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
Assert(parallel_scan);
btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
parallel_scan->ps_offset);
/*
* In theory, we don't need to acquire the spinlock here, because there
* shouldn't be any other workers running at this point, but we do so for
* consistency.
*/
SpinLockAcquire(&btscan->btps_mutex);
btscan->btps_scanPage = InvalidBlockNumber;
btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
btscan->btps_arrayKeyCount = 0;
SpinLockRelease(&btscan->btps_mutex);
}
/*
* _bt_parallel_seize() -- Begin the process of advancing the scan to a new
* page. Other scans must wait until we call bt_parallel_release() or
* bt_parallel_done().
*
* The return value is true if we successfully seized the scan and false
* if we did not. The latter case occurs if no pages remain for the current
* set of scankeys.
*
* If the return value is true, *pageno returns the next or current page
* of the scan (depending on the scan direction). An invalid block number
* means the scan hasn't yet started, and P_NONE means we've reached the end.
* The first time a participating process reaches the last page, it will return
* true and set *pageno to P_NONE; after that, further attempts to seize the
* scan will return false.
*
* Callers should ignore the value of pageno if the return value is false.
*/
bool
_bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
{
BTScanOpaque so = (BTScanOpaque) scan->opaque;
BTPS_State pageStatus;
bool exit_loop = false;
bool status = true;
ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
BTParallelScanDesc btscan;
*pageno = P_NONE;
btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
parallel_scan->ps_offset);
while (1)
{
SpinLockAcquire(&btscan->btps_mutex);
pageStatus = btscan->btps_pageStatus;
if (so->arrayKeyCount < btscan->btps_arrayKeyCount)
{
/* Parallel scan has already advanced to a new set of scankeys. */
status = false;
}
else if (pageStatus == BTPARALLEL_DONE)
{
/*
* We're done with this set of scankeys. This may be the end, or
* there could be more sets to try.
*/
status = false;
}
else if (pageStatus != BTPARALLEL_ADVANCING)
{
/*
* We have successfully seized control of the scan for the purpose
* of advancing it to a new page!
*/
btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
*pageno = btscan->btps_scanPage;
exit_loop = true;
}
SpinLockRelease(&btscan->btps_mutex);
if (exit_loop || !status)
break;
ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
}
ConditionVariableCancelSleep();
return status;
}
/*
* _bt_parallel_release() -- Complete the process of advancing the scan to a
* new page. We now have the new value btps_scanPage; some other backend
* can now begin advancing the scan.
*/
void
_bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
{
ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
BTParallelScanDesc btscan;
btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
parallel_scan->ps_offset);
SpinLockAcquire(&btscan->btps_mutex);
btscan->btps_scanPage = scan_page;
btscan->btps_pageStatus = BTPARALLEL_IDLE;
SpinLockRelease(&btscan->btps_mutex);
ConditionVariableSignal(&btscan->btps_cv);
}
/*
* _bt_parallel_done() -- Mark the parallel scan as complete.
*
* When there are no pages left to scan, this function should be called to
* notify other workers. Otherwise, they might wait forever for the scan to
* advance to the next page.
*/
void
_bt_parallel_done(IndexScanDesc scan)
{
BTScanOpaque so = (BTScanOpaque) scan->opaque;
ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
BTParallelScanDesc btscan;
bool status_changed = false;
/* Do nothing, for non-parallel scans */
if (parallel_scan == NULL)
return;
btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
parallel_scan->ps_offset);
/*
* Mark the parallel scan as done for this combination of scan keys,
* unless some other process already did so. See also
* _bt_advance_array_keys.
*/
SpinLockAcquire(&btscan->btps_mutex);
if (so->arrayKeyCount >= btscan->btps_arrayKeyCount &&
btscan->btps_pageStatus != BTPARALLEL_DONE)
{
btscan->btps_pageStatus = BTPARALLEL_DONE;
status_changed = true;
}
SpinLockRelease(&btscan->btps_mutex);
/* wake up all the workers associated with this parallel scan */
if (status_changed)
ConditionVariableBroadcast(&btscan->btps_cv);
}
/*
* _bt_parallel_advance_array_keys() -- Advances the parallel scan for array
* keys.
*
* Updates the count of array keys processed for both local and parallel
* scans.
*/
void
_bt_parallel_advance_array_keys(IndexScanDesc scan)
{
BTScanOpaque so = (BTScanOpaque) scan->opaque;
ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
BTParallelScanDesc btscan;
btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
parallel_scan->ps_offset);
so->arrayKeyCount++;
SpinLockAcquire(&btscan->btps_mutex);
if (btscan->btps_pageStatus == BTPARALLEL_DONE)
{
btscan->btps_scanPage = InvalidBlockNumber;
btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
btscan->btps_arrayKeyCount++;
}
SpinLockRelease(&btscan->btps_mutex);
}
/* /*
* Bulk deletion of all index entries pointing to a set of heap tuples. * Bulk deletion of all index entries pointing to a set of heap tuples.
* The set of target tuples is specified via a callback routine that tells * The set of target tuples is specified via a callback routine that tells
......
This diff is collapsed.
...@@ -590,6 +590,10 @@ _bt_advance_array_keys(IndexScanDesc scan, ScanDirection dir) ...@@ -590,6 +590,10 @@ _bt_advance_array_keys(IndexScanDesc scan, ScanDirection dir)
break; break;
} }
/* advance parallel scan */
if (scan->parallel_scan != NULL)
_bt_parallel_advance_array_keys(scan);
return found; return found;
} }
......
...@@ -3374,6 +3374,9 @@ pgstat_get_wait_ipc(WaitEventIPC w) ...@@ -3374,6 +3374,9 @@ pgstat_get_wait_ipc(WaitEventIPC w)
case WAIT_EVENT_BGWORKER_STARTUP: case WAIT_EVENT_BGWORKER_STARTUP:
event_name = "BgWorkerStartup"; event_name = "BgWorkerStartup";
break; break;
case WAIT_EVENT_BTREE_PAGE:
event_name = "BtreePage";
break;
case WAIT_EVENT_EXECUTE_GATHER: case WAIT_EVENT_EXECUTE_GATHER:
event_name = "ExecuteGather"; event_name = "ExecuteGather";
break; break;
......
...@@ -383,6 +383,8 @@ typedef struct BTScanOpaqueData ...@@ -383,6 +383,8 @@ typedef struct BTScanOpaqueData
ScanKey arrayKeyData; /* modified copy of scan->keyData */ ScanKey arrayKeyData; /* modified copy of scan->keyData */
int numArrayKeys; /* number of equality-type array keys (-1 if int numArrayKeys; /* number of equality-type array keys (-1 if
* there are any unsatisfiable array keys) */ * there are any unsatisfiable array keys) */
int arrayKeyCount; /* count indicating number of array scan keys
* processed */
BTArrayKeyInfo *arrayKeys; /* info about each equality-type array key */ BTArrayKeyInfo *arrayKeys; /* info about each equality-type array key */
MemoryContext arrayContext; /* scan-lifespan context for array data */ MemoryContext arrayContext; /* scan-lifespan context for array data */
...@@ -426,7 +428,7 @@ typedef BTScanOpaqueData *BTScanOpaque; ...@@ -426,7 +428,7 @@ typedef BTScanOpaqueData *BTScanOpaque;
#define SK_BT_NULLS_FIRST (INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT) #define SK_BT_NULLS_FIRST (INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT)
/* /*
* prototypes for functions in nbtree.c (external entry points for btree) * external entry points for btree, in nbtree.c
*/ */
extern IndexBuildResult *btbuild(Relation heap, Relation index, extern IndexBuildResult *btbuild(Relation heap, Relation index,
struct IndexInfo *indexInfo); struct IndexInfo *indexInfo);
...@@ -436,10 +438,13 @@ extern bool btinsert(Relation rel, Datum *values, bool *isnull, ...@@ -436,10 +438,13 @@ extern bool btinsert(Relation rel, Datum *values, bool *isnull,
IndexUniqueCheck checkUnique, IndexUniqueCheck checkUnique,
struct IndexInfo *indexInfo); struct IndexInfo *indexInfo);
extern IndexScanDesc btbeginscan(Relation rel, int nkeys, int norderbys); extern IndexScanDesc btbeginscan(Relation rel, int nkeys, int norderbys);
extern Size btestimateparallelscan(void);
extern void btinitparallelscan(void *target);
extern bool btgettuple(IndexScanDesc scan, ScanDirection dir); extern bool btgettuple(IndexScanDesc scan, ScanDirection dir);
extern int64 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm); extern int64 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm);
extern void btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, extern void btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
ScanKey orderbys, int norderbys); ScanKey orderbys, int norderbys);
extern void btparallelrescan(IndexScanDesc scan);
extern void btendscan(IndexScanDesc scan); extern void btendscan(IndexScanDesc scan);
extern void btmarkpos(IndexScanDesc scan); extern void btmarkpos(IndexScanDesc scan);
extern void btrestrpos(IndexScanDesc scan); extern void btrestrpos(IndexScanDesc scan);
...@@ -451,6 +456,14 @@ extern IndexBulkDeleteResult *btvacuumcleanup(IndexVacuumInfo *info, ...@@ -451,6 +456,14 @@ extern IndexBulkDeleteResult *btvacuumcleanup(IndexVacuumInfo *info,
IndexBulkDeleteResult *stats); IndexBulkDeleteResult *stats);
extern bool btcanreturn(Relation index, int attno); extern bool btcanreturn(Relation index, int attno);
/*
* prototypes for internal functions in nbtree.c
*/
extern bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno);
extern void _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page);
extern void _bt_parallel_done(IndexScanDesc scan);
extern void _bt_parallel_advance_array_keys(IndexScanDesc scan);
/* /*
* prototypes for functions in nbtinsert.c * prototypes for functions in nbtinsert.c
*/ */
......
...@@ -780,6 +780,7 @@ typedef enum ...@@ -780,6 +780,7 @@ typedef enum
{ {
WAIT_EVENT_BGWORKER_SHUTDOWN = PG_WAIT_IPC, WAIT_EVENT_BGWORKER_SHUTDOWN = PG_WAIT_IPC,
WAIT_EVENT_BGWORKER_STARTUP, WAIT_EVENT_BGWORKER_STARTUP,
WAIT_EVENT_BTREE_PAGE,
WAIT_EVENT_EXECUTE_GATHER, WAIT_EVENT_EXECUTE_GATHER,
WAIT_EVENT_MQ_INTERNAL, WAIT_EVENT_MQ_INTERNAL,
WAIT_EVENT_MQ_PUT_MESSAGE, WAIT_EVENT_MQ_PUT_MESSAGE,
......
...@@ -161,6 +161,9 @@ BTPageOpaque ...@@ -161,6 +161,9 @@ BTPageOpaque
BTPageOpaqueData BTPageOpaqueData
BTPageStat BTPageStat
BTPageState BTPageState
BTParallelScanDesc
BTParallelScanDescData
BTPS_State
BTScanOpaque BTScanOpaque
BTScanOpaqueData BTScanOpaqueData
BTScanPos BTScanPos
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment