Abuhujair Javed / Postgres FD Implementation / Commits

Commit bc292937, authored Mar 03, 2007 by Bruce Momjian
Split _bt_insertonpg to two functions.
Heikki Linnakangas
parent 63c678d1

Showing 1 changed file with 190 additions and 142 deletions.

src/backend/access/nbtree/nbtinsert.c (+190 -142)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.152 2007/02/21 20:02:17 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.153 2007/03/03 20:13:06 momjian Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -46,13 +46,18 @@ typedef struct
 static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf);
 static TransactionId _bt_check_unique(Relation rel, IndexTuple itup,
-				 Relation heapRel, Buffer buf,
+				 Relation heapRel, Buffer buf, OffsetNumber ioffset,
 				 ScanKey itup_scankey);
+static void _bt_findinsertloc(Relation rel,
+				  Buffer *bufptr,
+				  OffsetNumber *offsetptr,
+				  int keysz,
+				  ScanKey scankey,
+				  IndexTuple newtup);
 static void _bt_insertonpg(Relation rel, Buffer buf,
 			   BTStack stack,
-			   int keysz, ScanKey scankey,
-			   IndexTuple itup,
-			   OffsetNumber afteritem,
+			   IndexTuple itup,
+			   OffsetNumber newitemoff,
 			   bool split_only_page);
 static Buffer _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
 		  OffsetNumber newitemoff, Size newitemsz,
@@ -86,6 +91,7 @@ _bt_doinsert(Relation rel, IndexTuple itup,
 	ScanKey		itup_scankey;
 	BTStack		stack;
 	Buffer		buf;
+	OffsetNumber offset;
 
 	/* we need an insertion scan key to do our search, so build one */
 	itup_scankey = _bt_mkscankey(rel, itup);
@@ -94,6 +100,8 @@ top:
 	/* find the first page containing this key */
 	stack = _bt_search(rel, natts, itup_scankey, false, &buf, BT_WRITE);
 
+	offset = InvalidOffsetNumber;
+
 	/* trade in our read lock for a write lock */
 	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
 	LockBuffer(buf, BT_WRITE);
@@ -128,7 +136,8 @@ top:
 	{
 		TransactionId xwait;
 
-		xwait = _bt_check_unique(rel, itup, heapRel, buf, itup_scankey);
+		offset = _bt_binsrch(rel, buf, natts, itup_scankey, false);
+		xwait = _bt_check_unique(rel, itup, heapRel, buf, offset, itup_scankey);
 
 		if (TransactionIdIsValid(xwait))
 		{
@@ -142,7 +151,8 @@ top:
 	}
 
 	/* do the insertion */
-	_bt_insertonpg(rel, buf, stack, natts, itup_scankey, itup, 0, false);
+	_bt_findinsertloc(rel, &buf, &offset, natts, itup_scankey, itup);
+	_bt_insertonpg(rel, buf, stack, itup, offset, false);
 
 	/* be tidy */
 	_bt_freestack(stack);
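A note on the new calling convention in the hunk above: on a unique index, _bt_doinsert now computes the insert offset once with _bt_binsrch, hands it to _bt_check_unique, and then passes it on to _bt_findinsertloc as a hint, so the page is not binary-searched a second time; on a non-unique index the hint stays InvalidOffsetNumber and _bt_findinsertloc searches for itself. The following sketch is illustrative only (toy stand-in types and functions, not PostgreSQL code) and just demonstrates that hint contract:

#include <stdio.h>

typedef unsigned short OffsetNumber;          /* toy stand-in for the real typedef */
#define InvalidOffsetNumber ((OffsetNumber) 0)

/* hypothetical stand-in: pretend the binary search always lands on slot 3 */
static OffsetNumber toy_binsrch(void)
{
    return 3;
}

/* mirrors the contract: a caller-supplied offset is reused, while
 * InvalidOffsetNumber means "no hint, search the page yourself" */
static void toy_findinsertloc(OffsetNumber *offsetptr)
{
    if (*offsetptr == InvalidOffsetNumber)
        *offsetptr = toy_binsrch();
}

int main(void)
{
    OffsetNumber off = InvalidOffsetNumber;   /* non-unique path: no hint */
    toy_findinsertloc(&off);
    printf("no hint   -> offset %u\n", off);

    off = 7;                                  /* unique path: hint from the earlier search */
    toy_findinsertloc(&off);
    printf("with hint -> offset %u\n", off);
    return 0;
}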
@@ -152,18 +162,21 @@ top:
 /*
  *	_bt_check_unique() -- Check for violation of unique index constraint
  *
+ * offset points to the first possible item that could conflict. It can
+ * also point to end-of-page, which means that the first tuple to check
+ * is the first tuple on the next page.
+ *
  * Returns InvalidTransactionId if there is no conflict, else an xact ID
  * we must wait for to see if it commits a conflicting tuple.  If an actual
  * conflict is detected, no return --- just ereport().
  */
 static TransactionId
 _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
-				 Buffer buf, ScanKey itup_scankey)
+				 Buffer buf, OffsetNumber offset, ScanKey itup_scankey)
 {
 	TupleDesc	itupdesc = RelationGetDescr(rel);
 	int			natts = rel->rd_rel->relnatts;
-	OffsetNumber offset,
-				maxoff;
+	OffsetNumber maxoff;
 	Page		page;
 	BTPageOpaque opaque;
 	Buffer		nbuf = InvalidBuffer;
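The end-of-page wording in the new header comment is the subtle part: the starting offset may be one past the last item on the page, in which case the first tuple actually examined is on the right sibling. A self-contained toy illustration of that boundary case (plain arrays standing in for pages and buffers; not the real scan code):

#include <stdio.h>

#define PAGE_ITEMS 4

int main(void)
{
    /* two "pages" of sorted keys; the run of 20s crosses the page boundary */
    int pages[2][PAGE_ITEMS] = {{10, 20, 20, 20}, {20, 20, 30, 40}};
    int key = 20;
    int pageno = 0;
    int offset = PAGE_ITEMS + 1;    /* one past maxoff: start on the next page */

    while (pageno < 2)
    {
        if (offset > PAGE_ITEMS)
        {
            pageno++;               /* step right and restart at the first item */
            offset = 1;
            continue;
        }
        if (pages[pageno][offset - 1] != key)
            break;                  /* end of the run of equal keys */
        printf("check tuple at page %d, offset %d\n", pageno, offset);
        offset++;
    }
    return 0;
}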
@@ -172,12 +185,6 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
 	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
 	maxoff = PageGetMaxOffsetNumber(page);
 
-	/*
-	 * Find first item >= proposed new item.  Note we could also get a pointer
-	 * to end-of-page here.
-	 */
-	offset = _bt_binsrch(rel, buf, natts, itup_scankey, false);
-
 	/*
 	 * Scan over all equal tuples, looking for live conflicts.
 	 */
@@ -342,33 +349,11 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
 	return InvalidTransactionId;
 }
 
-/*----------
- *	_bt_insertonpg() -- Insert a tuple on a particular page in the index.
+/*
+ *	_bt_findinsertloc() -- Finds an insert location for a tuple
  *
- *		This recursive procedure does the following things:
- *
- *			+  finds the right place to insert the tuple.
- *			+  if necessary, splits the target page (making sure that the
- *			   split is equitable as far as post-insert free space goes).
- *			+  inserts the tuple.
- *			+  if the page was split, pops the parent stack, and finds the
- *			   right place to insert the new child pointer (by walking
- *			   right using information stored in the parent stack).
- *			+  invokes itself with the appropriate tuple for the right
- *			   child page on the parent.
- *			+  updates the metapage if a true root or fast root is split.
- *
- *		On entry, we must have the right buffer in which to do the
- *		insertion, and the buffer must be pinned and write-locked.  On return,
- *		we will have dropped both the pin and the lock on the buffer.
- *
- *		If 'afteritem' is >0 then the new tuple must be inserted after the
- *		existing item of that number, noplace else.  If 'afteritem' is 0
- *		then the procedure finds the exact spot to insert it by searching.
- *		(keysz and scankey parameters are used ONLY if afteritem == 0.
- *		The scankey must be an insertion-type scankey.)
- *
- *		NOTE: if the new key is equal to one or more existing keys, we can
+ *		If the new key is equal to one or more existing keys, we can
  *		legitimately place it anywhere in the series of equal keys --- in fact,
  *		if the new key is equal to the page's "high key" we can place it on
  *		the next page.  If it is equal to the high key, and there's not room
@@ -379,36 +364,40 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
  *		Once we have chosen the page to put the key on, we'll insert it before
  *		any existing equal keys because of the way _bt_binsrch() works.
  *
- *		The locking interactions in this code are critical.  You should
- *		grok Lehman and Yao's paper before making any changes.  In addition,
- *		you need to understand how we disambiguate duplicate keys in this
- *		implementation, in order to be able to find our location using
- *		L&Y "move right" operations.  Since we may insert duplicate user
- *		keys, and since these dups may propagate up the tree, we use the
- *		'afteritem' parameter to position ourselves correctly for the
- *		insertion on internal pages.
- *----------
+ *		If there's not enough room in the space, we try to make room by
+ *		removing any LP_DELETEd tuples.
+ *
+ *		On entry, *buf and *offsetptr point to the first legal position
+ *		where the new tuple could be inserted. The caller should hold an
+ *		exclusive lock on *buf. *offsetptr can also be set to
+ *		InvalidOffsetNumber, in which case the function will search the right
+ *		location within the page if needed. On exit, they point to the chosen
+ *		insert location. If findinsertloc decided to move right, the lock and
+ *		pin on the original page will be released and the new page returned to
+ *		the caller is exclusively locked instead.
+ *
+ *		newtup is the new tuple we're inserting, and scankey is an insertion
+ *		type scan key for it.
  */
 static void
-_bt_insertonpg(Relation rel,
-			   Buffer buf,
-			   BTStack stack,
-			   int keysz,
-			   ScanKey scankey,
-			   IndexTuple itup,
-			   OffsetNumber afteritem,
-			   bool split_only_page)
+_bt_findinsertloc(Relation rel,
+				  Buffer *bufptr,
+				  OffsetNumber *offsetptr,
+				  int keysz,
+				  ScanKey scankey,
+				  IndexTuple newtup)
 {
-	Page		page;
+	Buffer		buf = *bufptr;
+	Page		page = BufferGetPage(buf);
+	Size		itemsz;
 	BTPageOpaque lpageop;
+	bool		movedright,
+				vacuumed;
 	OffsetNumber newitemoff;
-	OffsetNumber firstright = InvalidOffsetNumber;
-	Size		itemsz;
+	OffsetNumber firstlegaloff = *offsetptr;
 
-	page = BufferGetPage(buf);
 	lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
 
-	itemsz = IndexTupleDSize(*itup);
+	itemsz = IndexTupleDSize(*newtup);
 	itemsz = MAXALIGN(itemsz);	/* be safe, PageAddItem will do this but we
 								 * need to be consistent */
@@ -429,96 +418,155 @@ _bt_insertonpg(Relation rel,
 				 "Consider a function index of an MD5 hash of the value, "
 				 "or use full text indexing.")));
 
-	/*
-	 * Determine exactly where new item will go.
+	/*----------
+	 * If we will need to split the page to put the item on this page,
+	 * check whether we can put the tuple somewhere to the right,
+	 * instead.  Keep scanning right until we
+	 *		(a) find a page with enough free space,
+	 *		(b) reach the last page where the tuple can legally go, or
+	 *		(c) get tired of searching.
+	 * (c) is not flippant; it is important because if there are many
+	 * pages' worth of equal keys, it's better to split one of the early
+	 * pages than to scan all the way to the end of the run of equal keys
+	 * on every insert.  We implement "get tired" as a random choice,
+	 * since stopping after scanning a fixed number of pages wouldn't work
+	 * well (we'd never reach the right-hand side of previously split
+	 * pages).  Currently the probability of moving right is set at 0.99,
+	 * which may seem too high to change the behavior much, but it does an
+	 * excellent job of preventing O(N^2) behavior with many equal keys.
+	 *----------
 	 */
-	if (afteritem > 0)
-		newitemoff = afteritem + 1;
-	else
+	movedright = false;
+	vacuumed = false;
+	while (PageGetFreeSpace(page) < itemsz)
 	{
-		/*----------
-		 * If we will need to split the page to put the item here,
-		 * check whether we can put the tuple somewhere to the right,
-		 * instead.  Keep scanning right until we
-		 *		(a) find a page with enough free space,
-		 *		(b) reach the last page where the tuple can legally go, or
-		 *		(c) get tired of searching.
-		 * (c) is not flippant; it is important because if there are many
-		 * pages' worth of equal keys, it's better to split one of the early
-		 * pages than to scan all the way to the end of the run of equal keys
-		 * on every insert.  We implement "get tired" as a random choice,
-		 * since stopping after scanning a fixed number of pages wouldn't work
-		 * well (we'd never reach the right-hand side of previously split
-		 * pages).  Currently the probability of moving right is set at 0.99,
-		 * which may seem too high to change the behavior much, but it does an
-		 * excellent job of preventing O(N^2) behavior with many equal keys.
-		 *----------
-		 */
-		bool		movedright = false;
+		Buffer		rbuf;
 
-		while (PageGetFreeSpace(page) < itemsz)
+		/*
+		 * before considering moving right, see if we can obtain enough
+		 * space by erasing LP_DELETE items
+		 */
+		if (P_ISLEAF(lpageop) && P_HAS_GARBAGE(lpageop))
 		{
-			Buffer		rbuf;
+			_bt_vacuum_one_page(rel, buf);
 
-			/*
-			 * before considering moving right, see if we can obtain enough
-			 * space by erasing LP_DELETE items
-			 */
-			if (P_ISLEAF(lpageop) && P_HAS_GARBAGE(lpageop))
-			{
-				_bt_vacuum_one_page(rel, buf);
+			/* remember that we vacuumed this page, because that makes
+			 * the hint supplied by the caller invalid */
+			vacuumed = true;
 
-				if (PageGetFreeSpace(page) >= itemsz)
-					break;		/* OK, now we have enough space */
-			}
+			if (PageGetFreeSpace(page) >= itemsz)
+				break;			/* OK, now we have enough space */
+		}
 
-			/*
-			 * nope, so check conditions (b) and (c) enumerated above
-			 */
-			if (P_RIGHTMOST(lpageop) ||
-				_bt_compare(rel, keysz, scankey, page, P_HIKEY) != 0 ||
-				random() <= (MAX_RANDOM_VALUE / 100))
-				break;
+		/*
+		 * nope, so check conditions (b) and (c) enumerated above
+		 */
+		if (P_RIGHTMOST(lpageop) ||
+			_bt_compare(rel, keysz, scankey, page, P_HIKEY) != 0 ||
+			random() <= (MAX_RANDOM_VALUE / 100))
+			break;
 
-			/*
-			 * step right to next non-dead page
-			 *
-			 * must write-lock that page before releasing write lock on
-			 * current page; else someone else's _bt_check_unique scan could
-			 * fail to see our insertion.  write locks on intermediate dead
-			 * pages won't do because we don't know when they will get
-			 * de-linked from the tree.
-			 */
-			rbuf = InvalidBuffer;
+		/*
+		 * step right to next non-dead page
+		 *
+		 * must write-lock that page before releasing write lock on
+		 * current page; else someone else's _bt_check_unique scan could
+		 * fail to see our insertion.  write locks on intermediate dead
+		 * pages won't do because we don't know when they will get
+		 * de-linked from the tree.
+		 */
+		rbuf = InvalidBuffer;
 
-			for (;;)
-			{
-				BlockNumber rblkno = lpageop->btpo_next;
+		for (;;)
+		{
+			BlockNumber rblkno = lpageop->btpo_next;
 
-				rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
-				page = BufferGetPage(rbuf);
-				lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
-				if (!P_IGNORE(lpageop))
-					break;
-				if (P_RIGHTMOST(lpageop))
-					elog(ERROR, "fell off the end of \"%s\"",
-						 RelationGetRelationName(rel));
-			}
-			_bt_relbuf(rel, buf);
-			buf = rbuf;
-			movedright = true;
+			rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
+			page = BufferGetPage(rbuf);
+			lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
+			if (!P_IGNORE(lpageop))
+				break;
+			if (P_RIGHTMOST(lpageop))
+				elog(ERROR, "fell off the end of \"%s\"",
+					 RelationGetRelationName(rel));
 		}
+		_bt_relbuf(rel, buf);
+		buf = rbuf;
+		movedright = true;
+		vacuumed = false;
+	}
 
-		/*
-		 * Now we are on the right page, so find the insert position. If we
-		 * moved right at all, we know we should insert at the start of the
-		 * page, else must find the position by searching.
-		 */
-		if (movedright)
-			newitemoff = P_FIRSTDATAKEY(lpageop);
-		else
-			newitemoff = _bt_binsrch(rel, buf, keysz, scankey, false);
-	}
+	/*
+	 * Now we are on the right page, so find the insert position. If we
+	 * moved right at all, we know we should insert at the start of the
+	 * page. If we didn't move right, we can use the firstlegaloff hint
+	 * if the caller supplied one, unless we vacuumed the page which
+	 * might have moved tuples around making the hint invalid. If we
+	 * didn't move right or can't use the hint, find the position
+	 * by searching.
+	 */
+	if (movedright)
+		newitemoff = P_FIRSTDATAKEY(lpageop);
+	else if (firstlegaloff != InvalidOffsetNumber && !vacuumed)
+		newitemoff = firstlegaloff;
+	else
+		newitemoff = _bt_binsrch(rel, buf, keysz, scankey, false);
+
+	*bufptr = buf;
+	*offsetptr = newitemoff;
+}
+
+/*----------
+ *	_bt_insertonpg() -- Insert a tuple on a particular page in the index.
+ *
+ *		This recursive procedure does the following things:
+ *
+ *			+  if necessary, splits the target page (making sure that the
+ *			   split is equitable as far as post-insert free space goes).
+ *			+  inserts the tuple.
+ *			+  if the page was split, pops the parent stack, and finds the
+ *			   right place to insert the new child pointer (by walking
+ *			   right using information stored in the parent stack).
+ *			+  invokes itself with the appropriate tuple for the right
+ *			   child page on the parent.
+ *			+  updates the metapage if a true root or fast root is split.
+ *
+ *		On entry, we must have the right buffer in which to do the
+ *		insertion, and the buffer must be pinned and write-locked.  On return,
+ *		we will have dropped both the pin and the lock on the buffer.
+ *
+ *		The locking interactions in this code are critical.  You should
+ *		grok Lehman and Yao's paper before making any changes.  In addition,
+ *		you need to understand how we disambiguate duplicate keys in this
+ *		implementation, in order to be able to find our location using
+ *		L&Y "move right" operations.  Since we may insert duplicate user
+ *		keys, and since these dups may propagate up the tree, we use the
+ *		'afteritem' parameter to position ourselves correctly for the
+ *		insertion on internal pages.
+ *----------
+ */
+static void
+_bt_insertonpg(Relation rel,
+			   Buffer buf,
+			   BTStack stack,
+			   IndexTuple itup,
+			   OffsetNumber newitemoff,
+			   bool split_only_page)
+{
+	Page		page;
+	BTPageOpaque lpageop;
+	OffsetNumber firstright = InvalidOffsetNumber;
+	Size		itemsz;
+
+	page = BufferGetPage(buf);
+	lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
+
+	itemsz = IndexTupleDSize(*itup);
+	itemsz = MAXALIGN(itemsz);	/* be safe, PageAddItem will do this but we
+								 * need to be consistent */
 
 	/*
 	 * Do we need to split the page to fit the item on it?
 	 *
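The "get tired" rule carried over into _bt_findinsertloc above stops each move-right step with probability about 1/100 (random() <= MAX_RANDOM_VALUE / 100), so the number of extra pages visited follows a geometric distribution with a mean near 99, bounded in expectation no matter how long the run of equal keys is. A standalone simulation of just that stopping rule (POSIX random()/srandom(); MAX_RANDOM_VALUE is assumed here to be 2^31 - 1, matching its usual definition in the PostgreSQL sources):

#include <stdio.h>
#include <stdlib.h>

#define MAX_RANDOM_VALUE 0x7FFFFFFF    /* assumed value, see lead-in */

int main(void)
{
    long total = 0;
    int  trials = 100000;

    srandom(42);
    for (int t = 0; t < trials; t++)
    {
        int steps = 0;

        /* keep "moving right" until the ~1% get-tired check fires */
        while (random() > (MAX_RANDOM_VALUE / 100))
            steps++;
        total += steps;
    }
    /* the expectation of a geometric variable with p ~ 0.01 is ~99 */
    printf("average pages scanned before tiring: %.1f\n",
           (double) total / trials);
    return 0;
}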
@@ -1427,7 +1475,7 @@ _bt_insert_parent(Relation rel,
 		/* Recursively update the parent */
 		_bt_insertonpg(rel, pbuf, stack->bts_parent,
-					   0, NULL, new_item, stack->bts_offset,
+					   new_item, stack->bts_offset + 1,
 					   is_only);
 
 		/* be tidy */
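This last hunk shows the knock-on effect of the signature change: the old _bt_insertonpg took 'afteritem' and computed the slot itself (newitemoff = afteritem + 1, per the earlier hunk), while the new one takes the final offset directly, so _bt_insert_parent now adds the + 1 at the call site. In comment form:

/* old contract: callee computes the slot
 *     _bt_insertonpg(..., afteritem = stack->bts_offset, ...)
 *     => newitemoff = afteritem + 1
 * new contract: caller passes the slot directly
 *     _bt_insertonpg(..., newitemoff = stack->bts_offset + 1, ...) */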