summary | refs | log | tree | commit | diff
path: root/src/backend/access/nbtree/nbtinsert.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/access/nbtree/nbtinsert.c')
-rw-r--r-- src/backend/access/nbtree/nbtinsert.c 107
1 files changed, 62 insertions, 45 deletions
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index c1671ce333..775eaca242 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.155 2007/03/25 19:45:14 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.156 2007/04/11 20:47:37 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -49,7 +49,7 @@ static TransactionId _bt_check_unique(Relation rel, IndexTuple itup,
Relation heapRel, Buffer buf, OffsetNumber ioffset,
ScanKey itup_scankey);
static void _bt_findinsertloc(Relation rel,
- Buffer *bufptr,
+ Buffer *bufptr,
OffsetNumber *offsetptr,
int keysz,
ScanKey scankey,
@@ -66,7 +66,7 @@ static OffsetNumber _bt_findsplitloc(Relation rel, Page page,
OffsetNumber newitemoff,
Size newitemsz,
bool *newitemonleft);
-static void _bt_checksplitloc(FindSplitData *state,
+static void _bt_checksplitloc(FindSplitData *state,
OffsetNumber firstoldonright, bool newitemonleft,
int dataitemstoleft, Size firstoldonrightsz);
static void _bt_pgaddtup(Relation rel, Page page,
@@ -459,7 +459,7 @@ _bt_findinsertloc(Relation rel,
* the hint supplied by the caller invalid */
vacuumed = true;
- if (PageGetFreeSpace(page) >= itemsz)
+ if (PageGetFreeSpace(page) >= itemsz)
break; /* OK, now we have enough space */
}
@@ -506,7 +506,7 @@ _bt_findinsertloc(Relation rel,
* moved right at all, we know we should insert at the start of the
* page. If we didn't move right, we can use the firstlegaloff hint
* if the caller supplied one, unless we vacuumed the page which
- * might have moved tuples around making the hint invalid. If we
+ * might have moved tuples around making the hint invalid. If we
* didn't move right or can't use the hint, find the position
* by searching.
*/
@@ -779,8 +779,6 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
Buffer sbuf = InvalidBuffer;
Page spage = NULL;
BTPageOpaque sopaque = NULL;
- OffsetNumber itup_off = 0;
- BlockNumber itup_blkno = 0;
Size itemsz;
ItemId itemid;
IndexTuple item;
@@ -798,6 +796,14 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
_bt_pageinit(leftpage, BufferGetPageSize(buf));
/* rightpage was already initialized by _bt_getbuf */
+ /*
+ * Copy the original page's LSN and TLI into leftpage, which will become
+ * the updated version of the page. We need this because XLogInsert will
+ * examine these fields and possibly dump them in a page image.
+ */
+ PageSetLSN(leftpage, PageGetLSN(origpage));
+ PageSetTLI(leftpage, PageGetTLI(origpage));
+
/* init btree private data */
oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage);
lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage);
@@ -864,7 +870,10 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
leftoff = OffsetNumberNext(leftoff);
/*
- * Now transfer all the data items to the appropriate page
+ * Now transfer all the data items to the appropriate page.
+ *
+ * Note: we *must* insert at least the right page's items in item-number
+ * order, for the benefit of _bt_restore_page().
*/
maxoff = PageGetMaxOffsetNumber(origpage);
@@ -881,16 +890,12 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
{
_bt_pgaddtup(rel, leftpage, newitemsz, newitem, leftoff,
"left sibling");
- itup_off = leftoff;
- itup_blkno = BufferGetBlockNumber(buf);
leftoff = OffsetNumberNext(leftoff);
}
else
{
_bt_pgaddtup(rel, rightpage, newitemsz, newitem, rightoff,
"right sibling");
- itup_off = rightoff;
- itup_blkno = BufferGetBlockNumber(rbuf);
rightoff = OffsetNumberNext(rightoff);
}
}
@@ -921,8 +926,6 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
Assert(!newitemonleft);
_bt_pgaddtup(rel, rightpage, newitemsz, newitem, rightoff,
"right sibling");
- itup_off = rightoff;
- itup_blkno = BufferGetBlockNumber(rbuf);
rightoff = OffsetNumberNext(rightoff);
}
@@ -961,7 +964,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
/*
* Right sibling is locked, new siblings are prepared, but original page
- * is not updated yet. Log changes before continuing.
+ * is not updated yet.
*
* NO EREPORT(ERROR) till right sibling is updated. We can get away with
* not starting the critical section till here because we haven't been
@@ -970,15 +973,6 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
*/
START_CRIT_SECTION();
- MarkBufferDirty(buf);
- MarkBufferDirty(rbuf);
-
- if (!P_RIGHTMOST(ropaque))
- {
- sopaque->btpo_prev = BufferGetBlockNumber(rbuf);
- MarkBufferDirty(sbuf);
- }
-
/*
* By here, the original data page has been split into two new halves, and
* these are correct. The algorithm requires that the left page never
@@ -994,6 +988,15 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
*/
PageRestoreTempPage(leftpage, origpage);
+ MarkBufferDirty(buf);
+ MarkBufferDirty(rbuf);
+
+ if (!P_RIGHTMOST(ropaque))
+ {
+ sopaque->btpo_prev = BufferGetBlockNumber(rbuf);
+ MarkBufferDirty(sbuf);
+ }
+
/* XLOG stuff */
if (!rel->rd_istemp)
{
@@ -1006,9 +1009,9 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
xlrec.node = rel->rd_node;
xlrec.leftsib = BufferGetBlockNumber(buf);
xlrec.rightsib = BufferGetBlockNumber(rbuf);
- xlrec.firstright = firstright;
xlrec.rnext = ropaque->btpo_next;
xlrec.level = ropaque->btpo.level;
+ xlrec.firstright = firstright;
rdata[0].data = (char *) &xlrec;
rdata[0].len = SizeOfBtreeSplit;
@@ -1027,14 +1030,18 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
lastrdata->buffer = InvalidBuffer;
}
- /* Log the new item, if it was inserted on the left page. If it was
- * put on the right page, we don't need to explicitly WAL log it
- * because it's included with all the other items on the right page.
+ /*
+ * Log the new item and its offset, if it was inserted on the left
+ * page. (If it was put on the right page, we don't need to explicitly
+ * WAL log it because it's included with all the other items on the
+ * right page.) Show these as belonging to the left page buffer,
+ * so that they are not stored if XLogInsert decides it needs a
+ * full-page image of the left page.
*/
- lastrdata->next = lastrdata + 1;
- lastrdata++;
if (newitemonleft)
{
+ lastrdata->next = lastrdata + 1;
+ lastrdata++;
lastrdata->data = (char *) &newitemoff;
lastrdata->len = sizeof(OffsetNumber);
lastrdata->buffer = buf; /* backup block 1 */
@@ -1042,39 +1049,49 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
lastrdata->next = lastrdata + 1;
lastrdata++;
- lastrdata->data = (char *)newitem;
- lastrdata->len = newitemsz;
+ lastrdata->data = (char *) newitem;
+ lastrdata->len = MAXALIGN(newitemsz);
lastrdata->buffer = buf; /* backup block 1 */
lastrdata->buffer_std = true;
}
else
{
+ /*
+ * Although we don't need to WAL-log the new item, we still
+ * need XLogInsert to consider storing a full-page image of the
+ * left page, so make an empty entry referencing that buffer.
+ * This also ensures that the left page is always backup block 1.
+ */
+ lastrdata->next = lastrdata + 1;
+ lastrdata++;
lastrdata->data = NULL;
lastrdata->len = 0;
lastrdata->buffer = buf; /* backup block 1 */
lastrdata->buffer_std = true;
}
- /* Log the contents of the right page in the format understood by
+ /*
+ * Log the contents of the right page in the format understood by
* _bt_restore_page(). We set lastrdata->buffer to InvalidBuffer,
- * because we're going to recreate the whole page anyway.
+ * because we're going to recreate the whole page anyway, so it
+ * should never be stored by XLogInsert.
*
* Direct access to page is not good but faster - we should implement
* some new func in page API. Note we only store the tuples
- * themselves, knowing that the item pointers are in the same order
- * and can be reconstructed by scanning the tuples. See comments for
+ * themselves, knowing that they were inserted in item-number order
+ * and so the item pointers can be reconstructed. See comments for
* _bt_restore_page().
*/
lastrdata->next = lastrdata + 1;
lastrdata++;
- lastrdata->data = (char *) rightpage +
+ lastrdata->data = (char *) rightpage +
((PageHeader) rightpage)->pd_upper;
lastrdata->len = ((PageHeader) rightpage)->pd_special -
((PageHeader) rightpage)->pd_upper;
lastrdata->buffer = InvalidBuffer;
- /* Log the right sibling, because we've changed it's prev-pointer. */
+ /* Log the right sibling, because we've changed its prev-pointer. */
if (!P_RIGHTMOST(ropaque))
{
lastrdata->next = lastrdata + 1;
@@ -1216,7 +1233,7 @@ _bt_findsplitloc(Relation rel,
olddataitemstoleft = 0;
goodenoughfound = false;
maxoff = PageGetMaxOffsetNumber(page);
-
+
for (offnum = P_FIRSTDATAKEY(opaque);
offnum <= maxoff;
offnum = OffsetNumberNext(offnum))
@@ -1234,7 +1251,7 @@ _bt_findsplitloc(Relation rel,
olddataitemstoleft, itemsz);
else if (offnum < newitemoff)
- _bt_checksplitloc(&state, offnum, false,
+ _bt_checksplitloc(&state, offnum, false,
olddataitemstoleft, itemsz);
else
{
@@ -1285,11 +1302,11 @@ _bt_findsplitloc(Relation rel,
* items go to the left page and only the new item goes to the right page.
* In that case, firstoldonrightsz is not used.
*
- * olddataitemstoleft is the total size of all old items to the left of
- * firstoldonright.
+ * olddataitemstoleft is the total size of all old items to the left of
+ * firstoldonright.
*/
static void
-_bt_checksplitloc(FindSplitData *state,
+_bt_checksplitloc(FindSplitData *state,
OffsetNumber firstoldonright,
bool newitemonleft,
int olddataitemstoleft,
@@ -1311,7 +1328,7 @@ _bt_checksplitloc(FindSplitData *state,
/* Account for all the old tuples */
leftfree = state->leftspace - olddataitemstoleft;
- rightfree = state->rightspace -
+ rightfree = state->rightspace -
(state->olddataitemstotal - olddataitemstoleft);
/*
@@ -1854,7 +1871,7 @@ _bt_vacuum_one_page(Relation rel, Buffer buffer)
BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
/*
- * Scan over all items to see which ones need to be deleted
+ * Scan over all items to see which ones need to be deleted
* according to LP_DELETE flags.
*/
minoff = P_FIRSTDATAKEY(opaque);