Diffstat (limited to 'src/backend/access/nbtree')
-rw-r--r--  src/backend/access/nbtree/Makefile.inc  |   15
-rw-r--r--  src/backend/access/nbtree/README        |   68
-rw-r--r--  src/backend/access/nbtree/nbtcompare.c  |  173
-rw-r--r--  src/backend/access/nbtree/nbtinsert.c   |  831
-rw-r--r--  src/backend/access/nbtree/nbtpage.c     |  523
-rw-r--r--  src/backend/access/nbtree/nbtree.c      |  516
-rw-r--r--  src/backend/access/nbtree/nbtscan.c     |  164
-rw-r--r--  src/backend/access/nbtree/nbtsearch.c   | 1133
-rw-r--r--  src/backend/access/nbtree/nbtsort.c     | 1196
-rw-r--r--  src/backend/access/nbtree/nbtstrat.c    |  134
-rw-r--r--  src/backend/access/nbtree/nbtutils.c    |  239
11 files changed, 4992 insertions(+), 0 deletions(-)
diff --git a/src/backend/access/nbtree/Makefile.inc b/src/backend/access/nbtree/Makefile.inc
new file mode 100644
index 0000000000..50854008c0
--- /dev/null
+++ b/src/backend/access/nbtree/Makefile.inc
@@ -0,0 +1,15 @@
+#-------------------------------------------------------------------------
+#
+# Makefile.inc--
+# Makefile for access/nbtree (btree access methods)
+#
+# Copyright (c) 1994, Regents of the University of California
+#
+#
+# IDENTIFICATION
+# $Header: /cvsroot/pgsql/src/backend/access/nbtree/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:11 scrappy Exp $
+#
+#-------------------------------------------------------------------------
+
+SUBSRCS+= nbtcompare.c nbtinsert.c nbtpage.c nbtree.c nbtscan.c nbtsearch.c \
+ nbtstrat.c nbtutils.c nbtsort.c
diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README
new file mode 100644
index 0000000000..a204ad4af0
--- /dev/null
+++ b/src/backend/access/nbtree/README
@@ -0,0 +1,68 @@
+$Header: /cvsroot/pgsql/src/backend/access/nbtree/README,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $
+
+This directory contains a correct implementation of Lehman and Yao's
+btree management algorithm that supports concurrent access for Postgres.
+We have made the following changes in order to incorporate their algorithm
+into Postgres:
+
+ + The requirement that all btree keys be unique is too onerous,
+ but the algorithm won't work correctly without it. As a result,
+ this implementation adds an OID (guaranteed to be unique) to
+ every key in the index. This guarantees uniqueness within a set
+ of duplicates. Space overhead is four bytes.
+
+ For this reason, when the common access method code passes us an
+ index tuple to store, we allocate a larger one and copy the
+ supplied tuple into it (the resulting layout is sketched after
+ this list). No Postgres code outside of the btree access method
+ knows about this OID.
+
+ + Lehman and Yao don't require read locks, but assume that in-
+ memory copies of tree nodes are unshared. Postgres shares
+ in-memory buffers among backends. As a result, we do page-
+ level read locking on btree nodes in order to guarantee that
+ no record is modified while we are examining it. This reduces
+ concurrency but guarantees correct behavior.
+
+ + Read locks on a page are held for as long as a scan has a pointer
+ to the page. However, locks are always surrendered before the
+ sibling page lock is acquired (for readers), so we remain deadlock-
+ free. I will do a formal proof if I get bored anytime soon.
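+
+ In outline, a reader's move to the right sibling under this
+ protocol looks something like the following (a sketch, not the
+ actual scan code; _bt_getbuf and _bt_relbuf live in nbtpage.c):
+
+	next = opaque->btpo_next;		/* note the right sibling */
+	_bt_relbuf(rel, buf, BT_READ);		/* drop our lock first */
+	buf = _bt_getbuf(rel, next, BT_READ);	/* then lock the sibling */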
+
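+ As an illustration of the first note above, the item we actually
+ store is roughly the following (field names as used in this code;
+ see access/nbtree.h for the real definition, which may carry
+ alignment padding):
+
+	typedef struct BTItemData {
+	    Oid		   bti_oid;	/* disambiguates duplicate keys */
+	    IndexTupleData bti_itup;	/* the ordinary index tuple */
+	} BTItemData;
+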
+In addition, the following things are handy to know:
+
+ + Page zero of every btree is a meta-data page. This page stores
+ the location of the root page, a pointer to a list of free
+ pages, and other stuff that's handy to know.
+
+ + This algorithm doesn't really work, since it requires ordered
+ writes, and UNIX doesn't support ordered writes.
+
+ + There's one other case where we may screw up in this
+ implementation. When we start a scan, we descend the tree
+ to the key nearest the one in the qual, and once we get there,
+ position ourselves correctly for the qual type (eg, <, >=, etc).
+ If we happen to step off a page, decide we want to get back to
+ it, and fetch the page again, and if some bad person has split
+ the page and moved the last tuple we saw off of it, then the
+ code complains about botched concurrency in an elog(WARN, ...)
+ and gives up the ghost. This is the ONLY violation of Lehman
+ and Yao's guarantee of correct behavior that I am aware of in
+ this code.
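+
+ (For the meta-data page mentioned above, the layout is given by
+ BTMetaPageData in nbtpage.c:
+
+	typedef struct BTMetaPageData {
+	    uint32	btm_magic;	/* identifies the file as a btree */
+	    uint32	btm_version;	/* on-disk format version */
+	    BlockNumber	btm_root;	/* current block number of the root */
+	} BTMetaPageData;
+
+ Because the root location lives here, the root page can move without
+ anyone outside this module noticing.)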
+
+Notes to operator class implementors:
+
+ With this implementation, we require the user to supply us with
+ a procedure for pg_amproc. This procedure should take two keys
+ A and B and return < 0, 0, or > 0 if A < B, A = B, or A > B,
+ respectively. See the contents of that relation for the btree
+ access method for some samples.
+
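+ For example, a comparison procedure for a hypothetical fixed-width
+ integer key could be as simple as this sketch (the routines in
+ nbtcompare.c are the ones actually registered for the built-in
+ types):
+
+	int32
+	btmykeycmp(int32 a, int32 b)	/* hypothetical name */
+	{
+	    if (a > b)
+		return (1);
+	    else if (a == b)
+		return (0);
+	    else
+		return (-1);
+	}
+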
+Notes to mao for implementation document:
+
+ On deletions, we need to adjust the position of active scans on
+ the index. The code in nbtscan.c handles this. We don't need to
+ do this for splits because of the way splits are handled; if they
+ happen behind us, we'll automatically go to the next page, and if
+ they happen in front of us, we're not affected by them. For
+ insertions, if we inserted a tuple behind the current scan location
+ on the current scan page, we move one space ahead.
diff --git a/src/backend/access/nbtree/nbtcompare.c b/src/backend/access/nbtree/nbtcompare.c
new file mode 100644
index 0000000000..e567b3c44c
--- /dev/null
+++ b/src/backend/access/nbtree/nbtcompare.c
@@ -0,0 +1,173 @@
+/*-------------------------------------------------------------------------
+ *
+ * nbtcompare.c--
+ * Comparison functions for btree access method.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtcompare.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $
+ *
+ * NOTES
+ * These functions are stored in pg_amproc. For each operator class
+ * defined on btrees, they compute
+ *
+ * compare(a, b):
+ * < 0 if a < b,
+ * = 0 if a == b,
+ * > 0 if a > b.
+ *-------------------------------------------------------------------------
+ */
+#include <string.h>
+#include "postgres.h"
+#include "utils/nabstime.h"
+
+int32
+btint2cmp(int16 a, int16 b)
+{
+ return ((int32) (a - b));
+}
+
+int32
+btint4cmp(int32 a, int32 b)
+{
+ /* compare explicitly; (a - b) can overflow int32 */
+ if (a > b)
+ return (1);
+ else if (a == b)
+ return (0);
+ else
+ return (-1);
+}
+
+int32
+btint24cmp(int16 a, int32 b)
+{
+ if ((int32) a > b)
+ return (1);
+ else if ((int32) a == b)
+ return (0);
+ else
+ return (-1);
+}
+
+int32
+btint42cmp(int32 a, int16 b)
+{
+ if (a > (int32) b)
+ return (1);
+ else if (a == (int32) b)
+ return (0);
+ else
+ return (-1);
+}
+
+int32
+btfloat4cmp(float32 a, float32 b)
+{
+ if (*a > *b)
+ return (1);
+ else if (*a == *b)
+ return (0);
+ else
+ return (-1);
+}
+
+int32
+btfloat8cmp(float64 a, float64 b)
+{
+ if (*a > *b)
+ return (1);
+ else if (*a == *b)
+ return (0);
+ else
+ return (-1);
+}
+
+int32
+btoidcmp(Oid a, Oid b)
+{
+ if (a > b)
+ return (1);
+ else if (a == b)
+ return (0);
+ else
+ return (-1);
+}
+
+int32
+btabstimecmp(AbsoluteTime a, AbsoluteTime b)
+{
+ /* per the contract above: negative iff a precedes b */
+ if (AbsoluteTimeIsBefore(a, b))
+ return (-1);
+ else if (AbsoluteTimeIsBefore(b, a))
+ return (1);
+ else
+ return (0);
+}
+
+int32
+btcharcmp(char a, char b)
+{
+ return ((int32) (a - b));
+}
+
+int32
+btchar2cmp(uint16 a, uint16 b)
+{
+ return (strncmp((char *) &a, (char *) &b, 2));
+}
+
+int32
+btchar4cmp(uint32 a, uint32 b)
+{
+ return (strncmp((char *) &a, (char *) &b, 4));
+}
+
+int32
+btchar8cmp(char *a, char *b)
+{
+ return (strncmp(a, b, 8));
+}
+
+int32
+btchar16cmp(char *a, char *b)
+{
+ return (strncmp(a, b, 16));
+}
+
+int32
+btnamecmp(NameData *a, NameData *b)
+{
+ return (strncmp(a->data, b->data, NAMEDATALEN));
+}
+
+int32
+bttextcmp(struct varlena *a, struct varlena *b)
+{
+ char *ap, *bp;
+ int len;
+ int res;
+
+ ap = VARDATA(a);
+ bp = VARDATA(b);
+
+ /* len is the length of the shorter of the two strings */
+ if ((len = VARSIZE(a)) > VARSIZE(b))
+ len = VARSIZE(b);
+
+ /* len includes the four bytes in which string length is stored */
+ len -= sizeof(VARSIZE(a));
+
+ /*
+ * If the two strings differ in the first len bytes, or if they're
+ * the same in the first len bytes and they're both len bytes long,
+ * we're done.
+ */
+
+ res = 0;
+ if (len > 0) {
+ do {
+ res = (int) (*ap++ - *bp++);
+ len--;
+ } while (res == 0 && len != 0);
+ }
+
+ if (res != 0 || VARSIZE(a) == VARSIZE(b))
+ return (res);
+
+ /*
+ * The two strings are the same in the first len bytes, and they
+ * are of different lengths.
+ */
+
+ if (VARSIZE(a) < VARSIZE(b))
+ return (-1);
+ else
+ return (1);
+}
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
new file mode 100644
index 0000000000..536c0aa385
--- /dev/null
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -0,0 +1,831 @@
+/*-------------------------------------------------------------------------
+ *
+ * nbtinsert.c--
+ * Item insertion in Lehman and Yao btrees for Postgres.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
+
+#include "utils/elog.h"
+#include "utils/palloc.h"
+#include "utils/rel.h"
+#include "utils/excid.h"
+
+#include "access/heapam.h"
+#include "access/genam.h"
+#include "access/nbtree.h"
+
+static InsertIndexResult _bt_insertonpg(Relation rel, Buffer buf, BTStack stack, int keysz, ScanKey scankey, BTItem btitem, BTItem afteritem);
+static Buffer _bt_split(Relation rel, Buffer buf);
+static OffsetNumber _bt_findsplitloc(Relation rel, Page page, OffsetNumber start, OffsetNumber maxoff, Size llimit);
+static void _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf);
+static OffsetNumber _bt_pgaddtup(Relation rel, Buffer buf, int keysz, ScanKey itup_scankey, Size itemsize, BTItem btitem, BTItem afteritem);
+static bool _bt_goesonpg(Relation rel, Buffer buf, Size keysz, ScanKey scankey, BTItem afteritem);
+static void _bt_updateitem(Relation rel, Size keysz, Buffer buf, Oid bti_oid, BTItem newItem);
+
+/*
+ * _bt_doinsert() -- Handle insertion of a single btitem in the tree.
+ *
+ * This routine is called by the public interface routines, btbuild
+ * and btinsert. By here, btitem is filled in, and has a unique
+ * OID appended for duplicate-key disambiguation.
+ */
+InsertIndexResult
+_bt_doinsert(Relation rel, BTItem btitem)
+{
+ ScanKey itup_scankey;
+ IndexTuple itup;
+ BTStack stack;
+ Buffer buf;
+ BlockNumber blkno;
+ int natts;
+ InsertIndexResult res;
+
+ itup = &(btitem->bti_itup);
+
+ /* we need a scan key to do our search, so build one */
+ itup_scankey = _bt_mkscankey(rel, itup);
+ natts = rel->rd_rel->relnatts;
+
+ /* find the page containing this key */
+ stack = _bt_search(rel, natts, itup_scankey, &buf);
+ blkno = BufferGetBlockNumber(buf);
+
+ /* trade in our read lock for a write lock */
+ _bt_relbuf(rel, buf, BT_READ);
+ buf = _bt_getbuf(rel, blkno, BT_WRITE);
+
+ /*
+ * If the page was split between the time that we surrendered our
+ * read lock and acquired our write lock, then this page may no
+ * longer be the right place for the key we want to insert. In this
+ * case, we need to move right in the tree. See Lehman and Yao for
+ * an excruciatingly precise description.
+ */
+
+ buf = _bt_moveright(rel, buf, natts, itup_scankey, BT_WRITE);
+
+ /* do the insertion */
+ res = _bt_insertonpg(rel, buf, stack, natts, itup_scankey,
+ btitem, (BTItem) NULL);
+
+ /* be tidy */
+ _bt_freestack(stack);
+ _bt_freeskey(itup_scankey);
+
+ return (res);
+}
+
+/*
+ * _bt_insertonpg() -- Insert a tuple on a particular page in the index.
+ *
+ * This recursive procedure does the following things:
+ *
+ * + if necessary, splits the target page.
+ * + finds the right place to insert the tuple (taking into
+ * account any changes induced by a split).
+ * + inserts the tuple.
+ * + if the page was split, pops the parent stack, and finds the
+ * right place to insert the new child pointer (by walking
+ * right using information stored in the parent stack).
+ * + invokes itself with the appropriate tuple for the right
+ * child page on the parent.
+ *
+ * On entry, we must have the right buffer on which to do the
+ * insertion, and the buffer must be pinned and locked. On return,
+ * we will have dropped both the pin and the write lock on the buffer.
+ *
+ * The locking interactions in this code are critical. You should
+ * grok Lehman and Yao's paper before making any changes. In addition,
+ * you need to understand how we disambiguate duplicate keys in this
+ * implementation, in order to be able to find our location using
+ * L&Y "move right" operations. Since we may insert duplicate user
+ * keys, and since these dups may propagate up the tree, we use the
+ * 'afteritem' parameter to position ourselves correctly for the
+ * insertion on internal pages.
+ */
+static InsertIndexResult
+_bt_insertonpg(Relation rel,
+ Buffer buf,
+ BTStack stack,
+ int keysz,
+ ScanKey scankey,
+ BTItem btitem,
+ BTItem afteritem)
+{
+ InsertIndexResult res;
+ Page page;
+ Buffer rbuf;
+ Buffer pbuf;
+ Page rpage;
+ ScanKey newskey;
+ BTItem ritem;
+ BTPageOpaque rpageop;
+ BlockNumber rbknum, itup_blkno;
+ OffsetNumber itup_off;
+ int itemsz;
+ InsertIndexResult newres;
+ BTItem new_item = (BTItem) NULL;
+ BTItem lowLeftItem;
+
+ page = BufferGetPage(buf);
+ itemsz = IndexTupleDSize(btitem->bti_itup)
+ + (sizeof(BTItemData) - sizeof(IndexTupleData));
+
+ itemsz = DOUBLEALIGN(itemsz); /* be safe, PageAddItem will do this
+ but we need to be consistent */
+
+ if (PageGetFreeSpace(page) < itemsz) {
+
+ /* split the buffer into left and right halves */
+ rbuf = _bt_split(rel, buf);
+
+ /* which new page (left half or right half) gets the tuple? */
+ if (_bt_goesonpg(rel, buf, keysz, scankey, afteritem)) {
+ /* left page */
+ itup_off = _bt_pgaddtup(rel, buf, keysz, scankey,
+ itemsz, btitem, afteritem);
+ itup_blkno = BufferGetBlockNumber(buf);
+ } else {
+ /* right page */
+ itup_off = _bt_pgaddtup(rel, rbuf, keysz, scankey,
+ itemsz, btitem, afteritem);
+ itup_blkno = BufferGetBlockNumber(rbuf);
+ }
+
+ /*
+ * By here,
+ *
+ * + our target page has been split;
+ * + the original tuple has been inserted;
+ * + we have write locks on both the old (left half) and new
+ * (right half) buffers, after the split; and
+ * + we have the key we want to insert into the parent.
+ *
+ * Do the parent insertion. We need to hold onto the locks for
+ * the child pages until we locate the parent, but we can release
+ * them before doing the actual insertion (see Lehman and Yao for
+ * the reasoning).
+ */
+
+ if (stack == (BTStack) NULL) {
+
+ /* create a new root node and release the split buffers */
+ _bt_newroot(rel, buf, rbuf);
+ _bt_relbuf(rel, buf, BT_WRITE);
+ _bt_relbuf(rel, rbuf, BT_WRITE);
+
+ } else {
+
+ /* form an index tuple that points at the new right page */
+ rbknum = BufferGetBlockNumber(rbuf);
+ rpage = BufferGetPage(rbuf);
+ rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage);
+
+ /*
+ * By convention, the first entry (0) on every
+ * non-rightmost page is the high key for that page. In
+ * order to get the lowest key on the new right page, we
+ * actually look at its second (1) entry.
+ */
+
+ if (! P_RIGHTMOST(rpageop)) {
+ ritem = (BTItem) PageGetItem(rpage,
+ PageGetItemId(rpage, P_FIRSTKEY));
+ } else {
+ ritem = (BTItem) PageGetItem(rpage,
+ PageGetItemId(rpage, P_HIKEY));
+ }
+
+ /* get a unique btitem for this key */
+ new_item = _bt_formitem(&(ritem->bti_itup));
+
+ ItemPointerSet(&(new_item->bti_itup.t_tid), rbknum, P_HIKEY);
+
+ /* find the parent buffer */
+ pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
+
+ /*
+ * If the key of new_item is less than the key of the item
+ * in the parent page pointing to the left page
+ * (stack->bts_btitem), we have to update the latter key;
+ * otherwise the keys on the parent page wouldn't be
+ * monotonically increasing after we inserted the new
+ * pointer to the right page (new_item). This only
+ * happens if our left page is the leftmost page and a
+ * new minimum key had been inserted before, which is not
+ * reflected in the parent page but didn't matter so
+ * far. If there are duplicate keys and this new minimum
+ * key spills over to our new right page, we get an
+ * inconsistency if we don't update the left key in the
+ * parent page.
+ */
+
+ if (_bt_itemcmp(rel, keysz, stack->bts_btitem, new_item,
+ BTGreaterStrategyNumber)) {
+ lowLeftItem =
+ (BTItem) PageGetItem(page,
+ PageGetItemId(page, P_FIRSTKEY));
+ /* page must have right pointer after split */
+ _bt_updateitem(rel, keysz, pbuf, stack->bts_btitem->bti_oid,
+ lowLeftItem);
+ }
+
+ /* don't need the children anymore */
+ _bt_relbuf(rel, buf, BT_WRITE);
+ _bt_relbuf(rel, rbuf, BT_WRITE);
+
+ newskey = _bt_mkscankey(rel, &(new_item->bti_itup));
+ newres = _bt_insertonpg(rel, pbuf, stack->bts_parent,
+ keysz, newskey, new_item,
+ stack->bts_btitem);
+
+ /* be tidy */
+ pfree(newres);
+ pfree(newskey);
+ pfree(new_item);
+ }
+ } else {
+ itup_off = _bt_pgaddtup(rel, buf, keysz, scankey,
+ itemsz, btitem, afteritem);
+ itup_blkno = BufferGetBlockNumber(buf);
+
+ _bt_relbuf(rel, buf, BT_WRITE);
+ }
+
+ /* by here, the new tuple is inserted */
+ res = (InsertIndexResult) palloc(sizeof(InsertIndexResultData));
+ ItemPointerSet(&(res->pointerData), itup_blkno, itup_off);
+
+ return (res);
+}
+
+/*
+ * _bt_split() -- split a page in the btree.
+ *
+ * On entry, buf is the page to split, and is write-locked and pinned.
+ * Returns the new right sibling of buf, pinned and write-locked. The
+ * pin and lock on buf are maintained.
+ */
+static Buffer
+_bt_split(Relation rel, Buffer buf)
+{
+ Buffer rbuf;
+ Page origpage;
+ Page leftpage, rightpage;
+ BTPageOpaque ropaque, lopaque, oopaque;
+ Buffer sbuf;
+ Page spage;
+ BTPageOpaque sopaque;
+ Size itemsz;
+ ItemId itemid;
+ BTItem item;
+ OffsetNumber leftoff, rightoff;
+ OffsetNumber start;
+ OffsetNumber maxoff;
+ OffsetNumber firstright;
+ OffsetNumber i;
+ Size llimit;
+
+ rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
+ origpage = BufferGetPage(buf);
+ leftpage = PageGetTempPage(origpage, sizeof(BTPageOpaqueData));
+ rightpage = BufferGetPage(rbuf);
+
+ _bt_pageinit(rightpage, BufferGetPageSize(rbuf));
+ _bt_pageinit(leftpage, BufferGetPageSize(buf));
+
+ /* init btree private data */
+ oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage);
+ lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage);
+ ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage);
+
+ /* if we're splitting this page, it won't be the root when we're done */
+ oopaque->btpo_flags &= ~BTP_ROOT;
+ lopaque->btpo_flags = ropaque->btpo_flags = oopaque->btpo_flags;
+ lopaque->btpo_prev = oopaque->btpo_prev;
+ ropaque->btpo_prev = BufferGetBlockNumber(buf);
+ lopaque->btpo_next = BufferGetBlockNumber(rbuf);
+ ropaque->btpo_next = oopaque->btpo_next;
+
+ /*
+ * If the page we're splitting is not the rightmost page at its
+ * level in the tree, then the first (0) entry on the page is the
+ * high key for the page. We need to copy that to the right
+ * half. Otherwise (meaning the rightmost page case), we should
+ * treat the line pointers beginning at zero as user data.
+ *
+ * We leave a blank space at the start of the line table for the
+ * left page. We'll come back later and fill it in with the high
+ * key item we get from the right page.
+ */
+
+ leftoff = P_FIRSTKEY;
+ ropaque->btpo_next = oopaque->btpo_next;
+ if (! P_RIGHTMOST(oopaque)) {
+ /* splitting a non-rightmost page, start at the first data item */
+ start = P_FIRSTKEY;
+
+ /* copy the original high key to the new page */
+ itemid = PageGetItemId(origpage, P_HIKEY);
+ itemsz = ItemIdGetLength(itemid);
+ item = (BTItem) PageGetItem(origpage, itemid);
+ (void) PageAddItem(rightpage, (Item) item, itemsz, P_HIKEY, LP_USED);
+ rightoff = P_FIRSTKEY;
+ } else {
+ /* splitting a rightmost page, "high key" is the first data item */
+ start = P_HIKEY;
+
+ /* the new rightmost page will not have a high key */
+ rightoff = P_HIKEY;
+ }
+ maxoff = PageGetMaxOffsetNumber(origpage);
+ llimit = PageGetFreeSpace(leftpage) / 2;
+ firstright = _bt_findsplitloc(rel, origpage, start, maxoff, llimit);
+
+ for (i = start; i <= maxoff; i = OffsetNumberNext(i)) {
+ itemid = PageGetItemId(origpage, i);
+ itemsz = ItemIdGetLength(itemid);
+ item = (BTItem) PageGetItem(origpage, itemid);
+
+ /* decide which page to put it on */
+ if (i < firstright) {
+ (void) PageAddItem(leftpage, (Item) item, itemsz, leftoff,
+ LP_USED);
+ leftoff = OffsetNumberNext(leftoff);
+ } else {
+ (void) PageAddItem(rightpage, (Item) item, itemsz, rightoff,
+ LP_USED);
+ rightoff = OffsetNumberNext(rightoff);
+ }
+ }
+
+ /*
+ * Okay, page has been split, high key on right page is correct. Now
+ * set the high key on the left page to be the min key on the right
+ * page.
+ */
+
+ if (P_RIGHTMOST(ropaque)) {
+ itemid = PageGetItemId(rightpage, P_HIKEY);
+ } else {
+ itemid = PageGetItemId(rightpage, P_FIRSTKEY);
+ }
+ itemsz = ItemIdGetLength(itemid);
+ item = (BTItem) PageGetItem(rightpage, itemid);
+
+ /*
+ * We left a hole for the high key on the left page; fill it. The
+ * modal crap is to tell the page manager to put the new item on the
+ * page and not screw around with anything else. Whoever designed
+ * this interface has presumably crawled back into the dung heap they
+ * came from. No one here will admit to it.
+ */
+
+ PageManagerModeSet(OverwritePageManagerMode);
+ (void) PageAddItem(leftpage, (Item) item, itemsz, P_HIKEY, LP_USED);
+ PageManagerModeSet(ShufflePageManagerMode);
+
+ /*
+ * By here, the original data page has been split into two new halves,
+ * and these are correct. The algorithm requires that the left page
+ * never move during a split, so we copy the new left page back on top
+ * of the original. Note that this is not a waste of time, since we
+ * also require (in the page management code) that the center of a
+ * page always be clean, and the most efficient way to guarantee this
+ * is just to compact the data by reinserting it into a new left page.
+ */
+
+ PageRestoreTempPage(leftpage, origpage);
+
+ /* write these guys out */
+ _bt_wrtnorelbuf(rel, rbuf);
+ _bt_wrtnorelbuf(rel, buf);
+
+ /*
+ * Finally, we need to grab the right sibling (if any) and fix the
+ * prev pointer there. We are guaranteed that this is deadlock-free
+ * since no other writer will be holding a lock on that page
+ * and trying to move left, and all readers release locks on a page
+ * before trying to fetch its neighbors.
+ */
+
+ if (! P_RIGHTMOST(ropaque)) {
+ sbuf = _bt_getbuf(rel, ropaque->btpo_next, BT_WRITE);
+ spage = BufferGetPage(sbuf);
+ sopaque = (BTPageOpaque) PageGetSpecialPointer(spage);
+ sopaque->btpo_prev = BufferGetBlockNumber(rbuf);
+
+ /* write and release the old right sibling */
+ _bt_wrtbuf(rel, sbuf);
+ }
+
+ /* split's done */
+ return (rbuf);
+}
+
+/*
+ * _bt_findsplitloc() -- find a safe place to split a page.
+ *
+ * In order to guarantee the proper handling of searches for duplicate
+ * keys, the first duplicate in the chain must either be the first
+ * item on the page after the split, or the entire chain must be on
+ * one of the two pages. That is,
+ * [1 2 2 2 3 4 5]
+ * must become
+ * [1] [2 2 2 3 4 5]
+ * or
+ * [1 2 2 2] [3 4 5]
+ * but not
+ * [1 2 2] [2 3 4 5].
+ * However,
+ * [2 2 2 2 2 3 4]
+ * may be split as
+ * [2 2 2 2] [2 3 4].
+ */
+static OffsetNumber
+_bt_findsplitloc(Relation rel,
+ Page page,
+ OffsetNumber start,
+ OffsetNumber maxoff,
+ Size llimit)
+{
+ OffsetNumber i;
+ OffsetNumber saferight;
+ ItemId nxtitemid, safeitemid;
+ BTItem safeitem, nxtitem;
+ IndexTuple safetup, nxttup;
+ Size nbytes;
+ TupleDesc itupdesc;
+ int natts;
+ int attno;
+ Datum attsafe;
+ Datum attnext;
+ bool null;
+
+ itupdesc = RelationGetTupleDescriptor(rel);
+ natts = rel->rd_rel->relnatts;
+
+ saferight = start;
+ safeitemid = PageGetItemId(page, saferight);
+ nbytes = ItemIdGetLength(safeitemid) + sizeof(ItemIdData);
+ safeitem = (BTItem) PageGetItem(page, safeitemid);
+ safetup = &(safeitem->bti_itup);
+
+ i = OffsetNumberNext(start);
+
+ while (nbytes < llimit) {
+
+ /* check the next item on the page */
+ nxtitemid = PageGetItemId(page, i);
+ nbytes += (ItemIdGetLength(nxtitemid) + sizeof(ItemIdData));
+ nxtitem = (BTItem) PageGetItem(page, nxtitemid);
+ nxttup = &(nxtitem->bti_itup);
+
+ /* test against last known safe item */
+ for (attno = 1; attno <= natts; attno++) {
+ attsafe = index_getattr(safetup, attno, itupdesc, &null);
+ attnext = index_getattr(nxttup, attno, itupdesc, &null);
+
+ /*
+ * If the tuple we're looking at isn't equal to the last safe one
+ * we saw, then it's our new safe tuple.
+ */
+
+ if (!_bt_invokestrat(rel, attno, BTEqualStrategyNumber,
+ attsafe, attnext)) {
+ safetup = nxttup;
+ saferight = i;
+
+ /* break is for the attno for loop */
+ break;
+ }
+ }
+ i = OffsetNumberNext(i);
+ }
+
+ /*
+ * If the chain of dups starts at the beginning of the page and extends
+ * past the halfway mark, we can split it in the middle.
+ */
+
+ if (saferight == start)
+ saferight = i;
+
+ return (saferight);
+}
+
+/*
+ * _bt_newroot() -- Create a new root page for the index.
+ *
+ * We've just split the old root page and need to create a new one.
+ * In order to do this, we add a new root page to the file, then lock
+ * the metadata page and update it. This is guaranteed to be deadlock-
+ * free, because all readers release their locks on the metadata page
+ * before trying to lock the root, and all writers lock the root before
+ * trying to lock the metadata page. We have a write lock on the old
+ * root page, so we have not introduced any cycles into the waits-for
+ * graph.
+ *
+ * On entry, lbuf (the old root) and rbuf (its new peer) are write-
+ * locked. We don't drop the locks in this routine; that's done by
+ * the caller. On exit, a new root page exists with entries for the
+ * two new children. The new root page is neither pinned nor locked.
+ */
+static void
+_bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
+{
+ Buffer rootbuf;
+ Page lpage, rpage, rootpage;
+ BlockNumber lbkno, rbkno;
+ BlockNumber rootbknum;
+ BTPageOpaque rootopaque;
+ ItemId itemid;
+ BTItem item;
+ Size itemsz;
+ BTItem new_item;
+
+ /* get a new root page */
+ rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
+ rootpage = BufferGetPage(rootbuf);
+ _bt_pageinit(rootpage, BufferGetPageSize(rootbuf));
+
+ /* set btree special data */
+ rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
+ rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
+ rootopaque->btpo_flags |= BTP_ROOT;
+
+ /*
+ * Insert the internal tuple pointers.
+ */
+
+ lbkno = BufferGetBlockNumber(lbuf);
+ rbkno = BufferGetBlockNumber(rbuf);
+ lpage = BufferGetPage(lbuf);
+ rpage = BufferGetPage(rbuf);
+
+ /*
+ * step over the high key on the left page while building the
+ * left page pointer.
+ */
+ itemid = PageGetItemId(lpage, P_FIRSTKEY);
+ itemsz = ItemIdGetLength(itemid);
+ item = (BTItem) PageGetItem(lpage, itemid);
+ new_item = _bt_formitem(&(item->bti_itup));
+ ItemPointerSet(&(new_item->bti_itup.t_tid), lbkno, P_FIRSTKEY);
+
+ /*
+ * insert the left page pointer into the new root page. the root
+ * page is the rightmost page on its level so the "high key" item
+ * is the first data item.
+ */
+ (void) PageAddItem(rootpage, (Item) new_item, itemsz, P_HIKEY, LP_USED);
+ pfree(new_item);
+
+ /*
+ * the right page is the rightmost page on the second level, so
+ * the "high key" item is the first data item on that page as well.
+ */
+ itemid = PageGetItemId(rpage, P_HIKEY);
+ itemsz = ItemIdGetLength(itemid);
+ item = (BTItem) PageGetItem(rpage, itemid);
+ new_item = _bt_formitem(&(item->bti_itup));
+ ItemPointerSet(&(new_item->bti_itup.t_tid), rbkno, P_HIKEY);
+
+ /*
+ * insert the right page pointer into the new root page.
+ */
+ (void) PageAddItem(rootpage, (Item) new_item, itemsz, P_FIRSTKEY, LP_USED);
+ pfree(new_item);
+
+ /* write and let go of the root buffer */
+ rootbknum = BufferGetBlockNumber(rootbuf);
+ _bt_wrtbuf(rel, rootbuf);
+
+ /* update metadata page with new root block number */
+ _bt_metaproot(rel, rootbknum);
+}
+
+/*
+ * _bt_pgaddtup() -- add a tuple to a particular page in the index.
+ *
+ * This routine adds the tuple to the page as requested, and keeps the
+ * write lock and reference associated with the page's buffer. It is
+ * an error to call pgaddtup() without a write lock and reference. If
+ * afteritem is non-null, it's the item that we expect our new item
+ * to follow. Otherwise, we do a binary search for the correct place
+ * and insert the new item there.
+ */
+static OffsetNumber
+_bt_pgaddtup(Relation rel,
+ Buffer buf,
+ int keysz,
+ ScanKey itup_scankey,
+ Size itemsize,
+ BTItem btitem,
+ BTItem afteritem)
+{
+ OffsetNumber itup_off;
+ OffsetNumber first;
+ Page page;
+ BTPageOpaque opaque;
+ BTItem chkitem;
+ Oid afteroid;
+
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ first = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
+
+ if (afteritem == (BTItem) NULL) {
+ itup_off = _bt_binsrch(rel, buf, keysz, itup_scankey, BT_INSERTION);
+ } else {
+ afteroid = afteritem->bti_oid;
+ itup_off = first;
+
+ do {
+ chkitem =
+ (BTItem) PageGetItem(page, PageGetItemId(page, itup_off));
+ itup_off = OffsetNumberNext(itup_off);
+ } while (chkitem->bti_oid != afteroid);
+ }
+
+ (void) PageAddItem(page, (Item) btitem, itemsize, itup_off, LP_USED);
+
+ /* write the buffer, but hold our lock */
+ _bt_wrtnorelbuf(rel, buf);
+
+ return (itup_off);
+}
+
+/*
+ * _bt_goesonpg() -- Does a new tuple belong on this page?
+ *
+ * This is part of the complexity introduced by allowing duplicate
+ * keys into the index. The tuple belongs on this page if:
+ *
+ * + there is no page to the right of this one; or
+ * + it is less than the high key on the page; or
+ * + the item it is to follow ("afteritem") appears on this
+ * page.
+ */
+static bool
+_bt_goesonpg(Relation rel,
+ Buffer buf,
+ Size keysz,
+ ScanKey scankey,
+ BTItem afteritem)
+{
+ Page page;
+ ItemId hikey;
+ BTPageOpaque opaque;
+ BTItem chkitem;
+ OffsetNumber offnum, maxoff;
+ Oid afteroid;
+ bool found;
+
+ page = BufferGetPage(buf);
+
+ /* no right neighbor? */
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ if (P_RIGHTMOST(opaque))
+ return (true);
+
+ /*
+ * this is a non-rightmost page, so it must have a high key item.
+ *
+ * If the scan key is < the high key (the min key on the next page),
+ * then it for sure belongs here.
+ */
+ hikey = PageGetItemId(page, P_HIKEY);
+ if (_bt_skeycmp(rel, keysz, scankey, page, hikey, BTLessStrategyNumber))
+ return (true);
+
+ /*
+ * If the scan key is > the high key, then it for sure doesn't belong
+ * here.
+ */
+
+ if (_bt_skeycmp(rel, keysz, scankey, page, hikey, BTGreaterStrategyNumber))
+ return (false);
+
+ /*
+ * If we have no adjacency information, and the item is equal to the
+ * high key on the page (by here it is), then the item does not belong
+ * on this page.
+ */
+
+ if (afteritem == (BTItem) NULL)
+ return (false);
+
+ /* damn, have to work for it. i hate that. */
+ afteroid = afteritem->bti_oid;
+ maxoff = PageGetMaxOffsetNumber(page);
+
+ /*
+ * Search the entire page for the afteroid. We need to do this, rather
+ * than doing a binary search and starting from there, because if the
+ * key we're searching for is the leftmost key in the tree at this
+ * level, then a binary search will do the wrong thing. Splits are
+ * pretty infrequent, so the cost isn't as bad as it could be.
+ */
+
+ found = false;
+ for (offnum = P_FIRSTKEY;
+ offnum <= maxoff;
+ offnum = OffsetNumberNext(offnum)) {
+ chkitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
+ if (chkitem->bti_oid == afteroid) {
+ found = true;
+ break;
+ }
+ }
+
+ return (found);
+}
+
+/*
+ * _bt_itemcmp() -- compare item1 to item2 using a requested
+ * strategy (<, <=, =, >=, >)
+ *
+ */
+bool
+_bt_itemcmp(Relation rel,
+ Size keysz,
+ BTItem item1,
+ BTItem item2,
+ StrategyNumber strat)
+{
+ TupleDesc tupDes;
+ IndexTuple indexTuple1, indexTuple2;
+ Datum attrDatum1, attrDatum2;
+ int i;
+ bool isNull;
+ bool compare;
+
+ tupDes = RelationGetTupleDescriptor(rel);
+ indexTuple1 = &(item1->bti_itup);
+ indexTuple2 = &(item2->bti_itup);
+
+ for (i = 1; i <= keysz; i++) {
+ attrDatum1 = index_getattr(indexTuple1, i, tupDes, &isNull);
+ attrDatum2 = index_getattr(indexTuple2, i, tupDes, &isNull);
+ compare = _bt_invokestrat(rel, i, strat, attrDatum1, attrDatum2);
+ if (!compare) {
+ return (false);
+ }
+ }
+ return (true);
+}
+
+/*
+ * _bt_updateitem() -- updates the key of the item identified by the
+ * oid with the key of newItem (done in place)
+ *
+ */
+static void
+_bt_updateitem(Relation rel,
+ Size keysz,
+ Buffer buf,
+ Oid bti_oid,
+ BTItem newItem)
+{
+ Page page;
+ OffsetNumber maxoff;
+ OffsetNumber i;
+ ItemPointerData itemPtrData;
+ BTItem item;
+ IndexTuple oldIndexTuple, newIndexTuple;
+
+ page = BufferGetPage(buf);
+ maxoff = PageGetMaxOffsetNumber(page);
+
+ /* locate item on the page */
+ i = P_HIKEY;
+ do {
+ item = (BTItem) PageGetItem(page, PageGetItemId(page, i));
+ i = OffsetNumberNext(i);
+ } while (i <= maxoff && item->bti_oid != bti_oid);
+
+ /* this should never happen (in theory) */
+ if (item->bti_oid != bti_oid) {
+ elog(FATAL, "_bt_getstackbuf was lying!!");
+ }
+
+ oldIndexTuple = &(item->bti_itup);
+ newIndexTuple = &(newItem->bti_itup);
+
+ /* keep the original item pointer */
+ ItemPointerCopy(&(oldIndexTuple->t_tid), &itemPtrData);
+ CopyIndexTuple(newIndexTuple, &oldIndexTuple);
+ ItemPointerCopy(&itemPtrData, &(oldIndexTuple->t_tid));
+}
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
new file mode 100644
index 0000000000..ce411a80d1
--- /dev/null
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -0,0 +1,523 @@
+/*-------------------------------------------------------------------------
+ *
+ * nbtpage.c--
+ * BTree-specific page management code for the Postgres btree access
+ * method.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $
+ *
+ * NOTES
+ * Postgres btree pages look like ordinary relation pages. The opaque
+ * data at high addresses includes pointers to left and right siblings
+ * and flag data describing page state. The first page in a btree, page
+ * zero, is special -- it stores meta-information describing the tree.
+ * Pages one and higher store the actual tree data.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
+
+#include "utils/elog.h"
+#include "utils/rel.h"
+#include "utils/excid.h"
+
+#include "access/genam.h"
+#include "access/nbtree.h"
+
+#define BTREE_METAPAGE 0
+#define BTREE_MAGIC 0x053162
+#define BTREE_VERSION 0
+
+typedef struct BTMetaPageData {
+ uint32 btm_magic;
+ uint32 btm_version;
+ BlockNumber btm_root;
+} BTMetaPageData;
+
+#define BTPageGetMeta(p) \
+ ((BTMetaPageData *) &((PageHeader) p)->pd_linp[0])
+
+extern bool BuildingBtree;
+
+/*
+ * We use high-concurrency locking on btrees. There are two cases in
+ * which we don't do locking. One is when we're building the btree.
+ * Since the creating transaction has not committed, no one can see
+ * the index, and there's no reason to share locks. The second case
+ * is when we're just starting up the database system. We use some
+ * special-purpose initialization code in the relation cache manager
+ * (see utils/cache/relcache.c) to allow us to do indexed scans on
+ * the system catalogs before we'd normally be able to. This happens
+ * before the lock table is fully initialized, so we can't use it.
+ * Strictly speaking, this violates 2pl, but we don't do 2pl on the
+ * system catalogs anyway, so I declare this to be okay.
+ */
+
+#define USELOCKING (!BuildingBtree && !IsInitProcessingMode())
+
+/*
+ * _bt_metapinit() -- Initialize the metadata page of a btree.
+ */
+void
+_bt_metapinit(Relation rel)
+{
+ Buffer buf;
+ Page pg;
+ int nblocks;
+ BTMetaPageData metad;
+ BTPageOpaque op;
+
+ /* can't be sharing this with anyone, now... */
+ if (USELOCKING)
+ RelationSetLockForWrite(rel);
+
+ if ((nblocks = RelationGetNumberOfBlocks(rel)) != 0) {
+ elog(WARN, "Cannot initialize non-empty btree %s",
+ RelationGetRelationName(rel));
+ }
+
+ buf = ReadBuffer(rel, P_NEW);
+ pg = BufferGetPage(buf);
+ _bt_pageinit(pg, BufferGetPageSize(buf));
+
+ metad.btm_magic = BTREE_MAGIC;
+ metad.btm_version = BTREE_VERSION;
+ metad.btm_root = P_NONE;
+ memmove((char *) BTPageGetMeta(pg), (char *) &metad, sizeof(metad));
+
+ op = (BTPageOpaque) PageGetSpecialPointer(pg);
+ op->btpo_flags = BTP_META;
+
+ WriteBuffer(buf);
+
+ /* all done */
+ if (USELOCKING)
+ RelationUnsetLockForWrite(rel);
+}
+
+/*
+ * _bt_checkmeta() -- Verify that the metadata stored in a btree are
+ * reasonable.
+ */
+void
+_bt_checkmeta(Relation rel)
+{
+ Buffer metabuf;
+ Page metap;
+ BTMetaPageData *metad;
+ BTPageOpaque op;
+ int nblocks;
+
+ /* if the relation is empty, this is init time; don't complain */
+ if ((nblocks = RelationGetNumberOfBlocks(rel)) == 0)
+ return;
+
+ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
+ metap = BufferGetPage(metabuf);
+ op = (BTPageOpaque) PageGetSpecialPointer(metap);
+ if (!(op->btpo_flags & BTP_META)) {
+ elog(WARN, "Invalid metapage for index %s",
+ RelationGetRelationName(rel));
+ }
+ metad = BTPageGetMeta(metap);
+
+ if (metad->btm_magic != BTREE_MAGIC) {
+ elog(WARN, "Index %s is not a btree",
+ RelationGetRelationName(rel));
+ }
+
+ if (metad->btm_version != BTREE_VERSION) {
+ elog(WARN, "Version mismatch on %s: version %d file, version %d code",
+ RelationGetRelationName(rel),
+ metad->btm_version, BTREE_VERSION);
+ }
+
+ _bt_relbuf(rel, metabuf, BT_READ);
+}
+
+/*
+ * _bt_getroot() -- Get the root page of the btree.
+ *
+ * Since the root page can move around the btree file, we have to read
+ * its location from the metadata page, and then read the root page
+ * itself. If no root page exists yet, we have to create one. The
+ * standard class of race conditions exists here; I think I covered
+ * them all in the Hopi Indian rain dance of lock requests below.
+ *
+ * We pass in the access type (BT_READ or BT_WRITE), and return the
+ * root page's buffer with the appropriate lock type set. Reference
+ * count on the root page gets bumped by ReadBuffer. The metadata
+ * page is unlocked and unreferenced by this process when this routine
+ * returns.
+ */
+Buffer
+_bt_getroot(Relation rel, int access)
+{
+ Buffer metabuf;
+ Page metapg;
+ BTPageOpaque metaopaque;
+ Buffer rootbuf;
+ Page rootpg;
+ BTPageOpaque rootopaque;
+ BlockNumber rootblkno;
+ BTMetaPageData *metad;
+
+ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
+ metapg = BufferGetPage(metabuf);
+ metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
+ Assert(metaopaque->btpo_flags & BTP_META);
+ metad = BTPageGetMeta(metapg);
+
+ /* if no root page initialized yet, do it */
+ if (metad->btm_root == P_NONE) {
+
+ /* turn our read lock in for a write lock */
+ _bt_relbuf(rel, metabuf, BT_READ);
+ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
+ metapg = BufferGetPage(metabuf);
+ metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
+ Assert(metaopaque->btpo_flags & BTP_META);
+ metad = BTPageGetMeta(metapg);
+
+ /*
+ * Race condition: if someone else initialized the metadata between
+ * the time we released the read lock and acquired the write lock,
+ * above, we want to avoid doing it again.
+ */
+
+ if (metad->btm_root == P_NONE) {
+
+ /*
+ * Get, initialize, write, and leave a lock of the appropriate
+ * type on the new root page. Since this is the first page in
+ * the tree, it's a leaf.
+ */
+
+ rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
+ rootblkno = BufferGetBlockNumber(rootbuf);
+ rootpg = BufferGetPage(rootbuf);
+ metad->btm_root = rootblkno;
+ _bt_pageinit(rootpg, BufferGetPageSize(rootbuf));
+ rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpg);
+ rootopaque->btpo_flags |= (BTP_LEAF | BTP_ROOT);
+ _bt_wrtnorelbuf(rel, rootbuf);
+
+ /* swap write lock for read lock, if appropriate */
+ if (access != BT_WRITE) {
+ _bt_setpagelock(rel, rootblkno, BT_READ);
+ _bt_unsetpagelock(rel, rootblkno, BT_WRITE);
+ }
+
+ /* okay, metadata is correct */
+ _bt_wrtbuf(rel, metabuf);
+ } else {
+
+ /*
+ * Metadata initialized by someone else. In order to guarantee
+ * no deadlocks, we have to release the metadata page and start
+ * all over again.
+ */
+
+ _bt_relbuf(rel, metabuf, BT_WRITE);
+ return (_bt_getroot(rel, access));
+ }
+ } else {
+ rootbuf = _bt_getbuf(rel, metad->btm_root, access);
+
+ /* done with the meta page */
+ _bt_relbuf(rel, metabuf, BT_READ);
+ }
+
+ /*
+ * Race condition: If the root page split between the time we looked
+ * at the metadata page and got the root buffer, then we got the wrong
+ * buffer.
+ */
+
+ rootpg = BufferGetPage(rootbuf);
+ rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpg);
+ if (!(rootopaque->btpo_flags & BTP_ROOT)) {
+
+ /* it happened, try again */
+ _bt_relbuf(rel, rootbuf, access);
+ return (_bt_getroot(rel, access));
+ }
+
+ /*
+ * By here, we have a correct lock on the root block, its reference
+ * count is correct, and we have no lock set on the metadata page.
+ * Return the root block.
+ */
+
+ return (rootbuf);
+}
+
+/*
+ * _bt_getbuf() -- Get a buffer by block number for read or write.
+ *
+ * When this routine returns, the appropriate lock is set on the
+ * requested buffer and its reference count is correct.
+ */
+Buffer
+_bt_getbuf(Relation rel, BlockNumber blkno, int access)
+{
+ Buffer buf;
+ Page page;
+
+ /*
+ * If we want a new block, we can't set a lock of the appropriate type
+ * until we've instantiated the buffer.
+ */
+
+ if (blkno != P_NEW) {
+ if (access == BT_WRITE)
+ _bt_setpagelock(rel, blkno, BT_WRITE);
+ else
+ _bt_setpagelock(rel, blkno, BT_READ);
+
+ buf = ReadBuffer(rel, blkno);
+ } else {
+ buf = ReadBuffer(rel, blkno);
+ blkno = BufferGetBlockNumber(buf);
+ page = BufferGetPage(buf);
+ _bt_pageinit(page, BufferGetPageSize(buf));
+
+ if (access == BT_WRITE)
+ _bt_setpagelock(rel, blkno, BT_WRITE);
+ else
+ _bt_setpagelock(rel, blkno, BT_READ);
+ }
+
+ /* ref count and lock type are correct */
+ return (buf);
+}
+
+/*
+ * _bt_relbuf() -- release a locked buffer.
+ */
+void
+_bt_relbuf(Relation rel, Buffer buf, int access)
+{
+ BlockNumber blkno;
+
+ blkno = BufferGetBlockNumber(buf);
+
+ /* access had better be one of read or write */
+ if (access == BT_WRITE)
+ _bt_unsetpagelock(rel, blkno, BT_WRITE);
+ else
+ _bt_unsetpagelock(rel, blkno, BT_READ);
+
+ ReleaseBuffer(buf);
+}
+
+/*
+ * _bt_wrtbuf() -- write a btree page to disk.
+ *
+ * This routine releases the lock held on the buffer and our reference
+ * to it. It is an error to call _bt_wrtbuf() without a write lock
+ * or a reference to the buffer.
+ */
+void
+_bt_wrtbuf(Relation rel, Buffer buf)
+{
+ BlockNumber blkno;
+
+ blkno = BufferGetBlockNumber(buf);
+ WriteBuffer(buf);
+ _bt_unsetpagelock(rel, blkno, BT_WRITE);
+}
+
+/*
+ * _bt_wrtnorelbuf() -- write a btree page to disk, but do not release
+ * our reference or lock.
+ *
+ * It is an error to call _bt_wrtnorelbuf() without a write lock
+ * or a reference to the buffer.
+ */
+void
+_bt_wrtnorelbuf(Relation rel, Buffer buf)
+{
+ BlockNumber blkno;
+
+ blkno = BufferGetBlockNumber(buf);
+ WriteNoReleaseBuffer(buf);
+}
+
+/*
+ * _bt_pageinit() -- Initialize a new page.
+ */
+void
+_bt_pageinit(Page page, Size size)
+{
+ /*
+ * Cargo-cult programming -- don't really need this to be zero, but
+ * creating new pages is an infrequent occurrence and it makes me feel
+ * good when I know they're empty.
+ */
+
+ memset(page, 0, size);
+
+ PageInit(page, size, sizeof(BTPageOpaqueData));
+}
+
+/*
+ * _bt_metaproot() -- Change the root page of the btree.
+ *
+ * Lehman and Yao require that the root page move around in order to
+ * guarantee deadlock-free short-term, fine-granularity locking. When
+ * we split the root page, we record the new parent in the metadata page
+ * for the relation. This routine does the work.
+ *
+ * No direct preconditions, but if you don't have a write lock on
+ * at least the old root page when you call this, you're making a big
+ * mistake. On exit, metapage data is correct and we no longer have
+ * a reference to or lock on the metapage.
+ */
+void
+_bt_metaproot(Relation rel, BlockNumber rootbknum)
+{
+ Buffer metabuf;
+ Page metap;
+ BTPageOpaque metaopaque;
+ BTMetaPageData *metad;
+
+ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
+ metap = BufferGetPage(metabuf);
+ metaopaque = (BTPageOpaque) PageGetSpecialPointer(metap);
+ Assert(metaopaque->btpo_flags & BTP_META);
+ metad = BTPageGetMeta(metap);
+ metad->btm_root = rootbknum;
+ _bt_wrtbuf(rel, metabuf);
+}
+
+/*
+ * _bt_getstackbuf() -- Walk back up the tree one step, and find the item
+ * we last looked at in the parent.
+ *
+ * This is possible because we save a bit image of the last item
+ * we looked at in the parent, and the update algorithm guarantees
+ * that if items above us in the tree move, they only move right.
+ */
+Buffer
+_bt_getstackbuf(Relation rel, BTStack stack, int access)
+{
+ Buffer buf;
+ BlockNumber blkno;
+ OffsetNumber start, offnum, maxoff;
+ OffsetNumber i;
+ Page page;
+ ItemId itemid;
+ BTItem item;
+ BTPageOpaque opaque;
+
+ blkno = stack->bts_blkno;
+ buf = _bt_getbuf(rel, blkno, access);
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ maxoff = PageGetMaxOffsetNumber(page);
+
+ if (maxoff >= stack->bts_offset) {
+ itemid = PageGetItemId(page, stack->bts_offset);
+ item = (BTItem) PageGetItem(page, itemid);
+
+ /* if the item is where we left it, we're done */
+ if (item->bti_oid == stack->bts_btitem->bti_oid)
+ return (buf);
+
+ /* if the item has just moved right on this page, we're done */
+ for (i = OffsetNumberNext(stack->bts_offset);
+ i <= maxoff;
+ i = OffsetNumberNext(i)) {
+ itemid = PageGetItemId(page, i);
+ item = (BTItem) PageGetItem(page, itemid);
+
+ /* if the item is where we left it, we're done */
+ if (item->bti_oid == stack->bts_btitem->bti_oid)
+ return (buf);
+ }
+ }
+
+ /* by here, the item we're looking for moved right at least one page */
+ for (;;) {
+ blkno = opaque->btpo_next;
+ if (P_RIGHTMOST(opaque))
+ elog(FATAL, "my bits moved right off the end of the world!");
+
+ _bt_relbuf(rel, buf, access);
+ buf = _bt_getbuf(rel, blkno, access);
+ page = BufferGetPage(buf);
+ maxoff = PageGetMaxOffsetNumber(page);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ /* if we have a right sibling, step over the high key */
+ start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
+
+ /* see if it's on this page */
+ for (offnum = start;
+ offnum <= maxoff;
+ offnum = OffsetNumberNext(offnum)) {
+ itemid = PageGetItemId(page, offnum);
+ item = (BTItem) PageGetItem(page, itemid);
+ if (item->bti_oid == stack->bts_btitem->bti_oid)
+ return (buf);
+ }
+ }
+}
+
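+/*
+ * _bt_setpagelock() -- set a read or write lock on a btree page.
+ *
+ * The lock is taken on a surrogate ItemPointer constructed from
+ * the block number. Locking is skipped entirely while we're
+ * building a btree or bootstrapping (see USELOCKING above).
+ */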
+void
+_bt_setpagelock(Relation rel, BlockNumber blkno, int access)
+{
+ ItemPointerData iptr;
+
+ if (USELOCKING) {
+ ItemPointerSet(&iptr, blkno, P_HIKEY);
+
+ if (access == BT_WRITE)
+ RelationSetSingleWLockPage(rel, &iptr);
+ else
+ RelationSetSingleRLockPage(rel, &iptr);
+ }
+}
+
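+/*
+ * _bt_unsetpagelock() -- release a read or write lock on a btree page.
+ */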
+void
+_bt_unsetpagelock(Relation rel, BlockNumber blkno, int access)
+{
+ ItemPointerData iptr;
+
+ if (USELOCKING) {
+ ItemPointerSet(&iptr, blkno, P_HIKEY);
+
+ if (access == BT_WRITE)
+ RelationUnsetSingleWLockPage(rel, &iptr);
+ else
+ RelationUnsetSingleRLockPage(rel, &iptr);
+ }
+}
+
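+/*
+ * _bt_pagedel() -- delete an item from a btree page.
+ *
+ * The item is identified by the index tuple id supplied by the
+ * caller. We write-lock the page, delete the item, then write
+ * the buffer out and release the lock.
+ */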
+void
+_bt_pagedel(Relation rel, ItemPointer tid)
+{
+ Buffer buf;
+ Page page;
+ BlockNumber blkno;
+ OffsetNumber offno;
+
+ blkno = ItemPointerGetBlockNumber(tid);
+ offno = ItemPointerGetOffsetNumber(tid);
+
+ buf = _bt_getbuf(rel, blkno, BT_WRITE);
+ page = BufferGetPage(buf);
+
+ PageIndexTupleDelete(page, offno);
+
+ /* write the buffer and release the lock */
+ _bt_wrtbuf(rel, buf);
+}
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
new file mode 100644
index 0000000000..0601611996
--- /dev/null
+++ b/src/backend/access/nbtree/nbtree.c
@@ -0,0 +1,516 @@
+/*-------------------------------------------------------------------------
+ *
+ * nbtree.c--
+ * Implementation of Lehman and Yao's btree management algorithm for
+ * Postgres.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $
+ *
+ * NOTES
+ * This file contains only the public interface routines.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
+
+#include "utils/elog.h"
+#include "utils/palloc.h"
+#include "utils/rel.h"
+#include "utils/excid.h"
+
+#include "access/heapam.h"
+#include "access/genam.h"
+#include "access/sdir.h"
+#include "access/nbtree.h"
+#include "access/funcindex.h"
+
+#include "nodes/execnodes.h"
+#include "nodes/plannodes.h"
+
+#include "executor/executor.h"
+#include "executor/tuptable.h"
+
+#include "catalog/index.h"
+
+bool BuildingBtree = false;
+bool FastBuild = false; /* turn this on to make bulk builds work */
+
+/*
+ * btbuild() -- build a new btree index.
+ *
+ * We use a global variable to record the fact that we're creating
+ * a new index. This is used to avoid high-concurrency locking,
+ * since the index won't be visible until this transaction commits
+ * and since building is guaranteed to be single-threaded.
+ */
+void
+btbuild(Relation heap,
+ Relation index,
+ int natts,
+ AttrNumber *attnum,
+ IndexStrategy istrat,
+ uint16 pcount,
+ Datum *params,
+ FuncIndexInfo *finfo,
+ PredInfo *predInfo)
+{
+ HeapScanDesc hscan;
+ Buffer buffer;
+ HeapTuple htup;
+ IndexTuple itup;
+ TupleDesc htupdesc, itupdesc;
+ Datum *attdata;
+ bool *nulls;
+ InsertIndexResult res;
+ int nhtups, nitups;
+ int i;
+ BTItem btitem;
+ ExprContext *econtext;
+ TupleTable tupleTable;
+ TupleTableSlot *slot;
+ Oid hrelid, irelid;
+ Node *pred, *oldPred;
+ void *spool;
+
+ /* note that this is a new btree */
+ BuildingBtree = true;
+
+ pred = predInfo->pred;
+ oldPred = predInfo->oldPred;
+
+ /* initialize the btree index metadata page (if this is a new index) */
+ if (oldPred == NULL)
+ _bt_metapinit(index);
+
+ /* get tuple descriptors for heap and index relations */
+ htupdesc = RelationGetTupleDescriptor(heap);
+ itupdesc = RelationGetTupleDescriptor(index);
+
+ /* get space for data items that'll appear in the index tuple */
+ attdata = (Datum *) palloc(natts * sizeof(Datum));
+ nulls = (bool *) palloc(natts * sizeof(bool));
+
+ /*
+ * If this is a predicate (partial) index, we will need to evaluate the
+ * predicate using ExecQual, which requires the current tuple to be in a
+ * slot of a TupleTable. In addition, ExecQual must have an ExprContext
+ * referring to that slot. Here, we initialize dummy TupleTable and
+ * ExprContext objects for this purpose. --Nels, Feb '92
+ */
+#ifndef OMIT_PARTIAL_INDEX
+ if (pred != NULL || oldPred != NULL) {
+ tupleTable = ExecCreateTupleTable(1);
+ slot = ExecAllocTableSlot(tupleTable);
+ econtext = makeNode(ExprContext);
+ FillDummyExprContext(econtext, slot, htupdesc, InvalidBuffer);
+ }
+#endif /* OMIT_PARTIAL_INDEX */
+
+ /* start a heap scan */
+ hscan = heap_beginscan(heap, 0, NowTimeQual, 0, (ScanKey) NULL);
+ htup = heap_getnext(hscan, 0, &buffer);
+
+ /* build the index */
+ nhtups = nitups = 0;
+
+ if (FastBuild) {
+ spool = _bt_spoolinit(index, 7);
+ res = (InsertIndexResult) NULL;
+ }
+
+ for (; HeapTupleIsValid(htup); htup = heap_getnext(hscan, 0, &buffer)) {
+
+ nhtups++;
+
+ /*
+ * If oldPred != NULL, this is an EXTEND INDEX command, so skip
+ * this tuple if it was already in the existing partial index
+ */
+ if (oldPred != NULL) {
+#ifndef OMIT_PARTIAL_INDEX
+
+ /*SetSlotContents(slot, htup);*/
+ slot->val = htup;
+ if (ExecQual((List*)oldPred, econtext) == true) {
+ nitups++;
+ continue;
+ }
+#endif /* OMIT_PARTIAL_INDEX */
+ }
+
+ /* Skip this tuple if it doesn't satisfy the partial-index predicate */
+ if (pred != NULL) {
+#ifndef OMIT_PARTIAL_INDEX
+ /* SetSlotContents(slot, htup); */
+ slot->val = htup;
+ if (ExecQual((List*)pred, econtext) == false)
+ continue;
+#endif /* OMIT_PARTIAL_INDEX */
+ }
+
+ nitups++;
+
+ /*
+ * For the current heap tuple, extract all the attributes
+ * we use in this index, and note which are null.
+ */
+
+ for (i = 1; i <= natts; i++) {
+ int attoff;
+ bool attnull;
+
+ /*
+ * Offsets are from the start of the tuple, and are
+ * zero-based; indices are one-based. The next call
+ * returns i - 1. That's data hiding for you.
+ */
+
+ attoff = AttrNumberGetAttrOffset(i);
+ attdata[attoff] = GetIndexValue(htup,
+ htupdesc,
+ attoff,
+ attnum,
+ finfo,
+ &attnull,
+ buffer);
+ nulls[attoff] = (attnull ? 'n' : ' ');
+ }
+
+ /* form an index tuple and point it at the heap tuple */
+ itup = index_formtuple(itupdesc, attdata, nulls);
+
+ /*
+ * If the single index key is null, we don't insert it into
+ * the index. Btrees support scans on <, <=, =, >=, and >.
+ * Relational algebra says that A op B (where op is one of the
+ * operators above) returns null if either A or B is null. This
+ * means that no qualification used in an index scan could ever
+ * return true on a null attribute. It also means that indices
+ * can't be used by ISNULL or NOTNULL scans, but that's an
+ * artifact of the strategy map architecture chosen in 1986, not
+ * of the way nulls are handled here.
+ */
+
+ if (itup->t_info & INDEX_NULL_MASK) {
+ pfree(itup);
+ continue;
+ }
+
+ itup->t_tid = htup->t_ctid;
+ btitem = _bt_formitem(itup);
+
+ /*
+ * if we are doing bottom-up btree build, we insert the index
+ * into a spool page for subsequent processing. otherwise, we
+ * insert into the btree.
+ */
+ if (FastBuild) {
+ _bt_spool(index, btitem, spool);
+ } else {
+ res = _bt_doinsert(index, btitem);
+ }
+
+ pfree(btitem);
+ pfree(itup);
+ if (res) {
+ pfree(res);
+ }
+ }
+
+ /* okay, all heap tuples are indexed */
+ heap_endscan(hscan);
+
+ if (pred != NULL || oldPred != NULL) {
+#ifndef OMIT_PARTIAL_INDEX
+ ExecDestroyTupleTable(tupleTable, true);
+ pfree(econtext);
+#endif /* OMIT_PARTIAL_INDEX */
+ }
+
+ /*
+ * if we are doing bottom-up btree build, we now have a bunch of
+ * sorted runs in the spool pages. finish the build by (1)
+ * merging the runs, (2) inserting the sorted tuples into btree
+ * pages and (3) building the upper levels.
+ */
+ if (FastBuild) {
+ _bt_spool(index, (BTItem) NULL, spool); /* flush spool */
+ _bt_leafbuild(index, spool);
+ _bt_spooldestroy(spool);
+ }
+
+ /*
+ * Since we just counted the tuples in the heap, we update its
+ * stats in pg_class to guarantee that the planner takes advantage
+ * of the index we just created. Finally, only update statistics
+ * during normal index definitions, not for indices on system catalogs
+ * created during bootstrap processing. We must close the relations
+ * before updating statistics to guarantee that the relcache entries
+ * are flushed when we increment the command counter in UpdateStats().
+ */
+ if (IsNormalProcessingMode())
+ {
+ hrelid = heap->rd_id;
+ irelid = index->rd_id;
+ heap_close(heap);
+ index_close(index);
+ UpdateStats(hrelid, nhtups, true);
+ UpdateStats(irelid, nitups, false);
+ if (oldPred != NULL) {
+ if (nitups == nhtups) pred = NULL;
+ UpdateIndexPredicate(irelid, oldPred, pred);
+ }
+ }
+
+ /* be tidy */
+ pfree(nulls);
+ pfree(attdata);
+
+ /* all done */
+ BuildingBtree = false;
+}
+
+/*
+ * btinsert() -- insert an index tuple into a btree.
+ *
+ * Descend the tree recursively, find the appropriate location for our
+ * new tuple, put it there, set its unique OID as appropriate, and
+ * return an InsertIndexResult to the caller.
+ */
+InsertIndexResult
+btinsert(Relation rel, IndexTuple itup)
+{
+ BTItem btitem;
+ InsertIndexResult res;
+
+ if (itup->t_info & INDEX_NULL_MASK)
+ return ((InsertIndexResult) NULL);
+
+ btitem = _bt_formitem(itup);
+
+ res = _bt_doinsert(rel, btitem);
+ pfree(btitem);
+
+ return (res);
+}
+
+/*
+ * btgettuple() -- Get the next tuple in the scan.
+ */
+char *
+btgettuple(IndexScanDesc scan, ScanDirection dir)
+{
+ RetrieveIndexResult res;
+
+ /*
+ * If we've already initialized this scan, we can just advance it
+ * in the appropriate direction. If we haven't done so yet, we
+ * call a routine to get the first item in the scan.
+ */
+
+ if (ItemPointerIsValid(&(scan->currentItemData)))
+ res = _bt_next(scan, dir);
+ else
+ res = _bt_first(scan, dir);
+
+ return ((char *) res);
+}
+
+/*
+ * btbeginscan() -- start a scan on a btree index
+ */
+char *
+btbeginscan(Relation rel, bool fromEnd, uint16 keysz, ScanKey scankey)
+{
+ IndexScanDesc scan;
+ StrategyNumber strat;
+ BTScanOpaque so;
+
+ /* first order the keys in the qualification */
+ if (keysz > 1)
+ _bt_orderkeys(rel, &keysz, scankey);
+
+ /* now get the scan */
+ scan = RelationGetIndexScan(rel, fromEnd, keysz, scankey);
+ so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
+ so->btso_curbuf = so->btso_mrkbuf = InvalidBuffer;
+ scan->opaque = so;
+
+ /* finally, be sure that the scan exploits the tree order */
+ scan->scanFromEnd = false;
+ scan->flags = 0x0;
+ if (keysz > 0) {
+ strat = _bt_getstrat(scan->relation, 1 /* XXX */,
+ scankey[0].sk_procedure);
+
+ if (strat == BTLessStrategyNumber
+ || strat == BTLessEqualStrategyNumber)
+ scan->scanFromEnd = true;
+ } else {
+ scan->scanFromEnd = true;
+ }
+
+ /* register scan in case we change pages it's using */
+ _bt_regscan(scan);
+
+ return ((char *) scan);
+}
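+
+/*
+ * The following is an illustrative sketch only (it is not part of the
+ * original file): how a caller might drive the scan routines above.
+ * It assumes "rel" is an open btree index and "skey" is a one-element
+ * scan key built with ScanKeyEntryInitialize(); error handling is
+ * omitted.
+ */
+#ifdef BTREE_USAGE_EXAMPLE
+static void
+bt_scan_example(Relation rel, ScanKey skey)
+{
+ IndexScanDesc scan;
+ RetrieveIndexResult res;
+
+ scan = (IndexScanDesc) btbeginscan(rel, false, 1, skey);
+ while ((res = (RetrieveIndexResult) btgettuple(scan, ForwardScanDirection))
+ != (RetrieveIndexResult) NULL) {
+ /* res carries the heap tuple's ItemPointer; consume it, then free */
+ pfree(res);
+ }
+ btendscan(scan);
+}
+#endif /* BTREE_USAGE_EXAMPLE */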
+
+/*
+ * btrescan() -- rescan an index relation
+ */
+void
+btrescan(IndexScanDesc scan, bool fromEnd, ScanKey scankey)
+{
+ ItemPointer iptr;
+ BTScanOpaque so;
+
+ so = (BTScanOpaque) scan->opaque;
+
+ /* we hold a read lock on the current page in the scan */
+ if (ItemPointerIsValid(iptr = &(scan->currentItemData))) {
+ _bt_relbuf(scan->relation, so->btso_curbuf, BT_READ);
+ so->btso_curbuf = InvalidBuffer;
+ ItemPointerSetInvalid(iptr);
+ }
+
+ /* and we hold a read lock on the last marked item in the scan */
+ if (ItemPointerIsValid(iptr = &(scan->currentMarkData))) {
+ _bt_relbuf(scan->relation, so->btso_mrkbuf, BT_READ);
+ so->btso_mrkbuf = InvalidBuffer;
+ ItemPointerSetInvalid(iptr);
+ }
+
+ /* reset the scan key */
+ if (scan->numberOfKeys > 0) {
+ memmove(scan->keyData,
+ scankey,
+ scan->numberOfKeys * sizeof(ScanKeyData));
+ }
+}
+
+void
+btmovescan(IndexScanDesc scan, Datum v)
+{
+ ItemPointer iptr;
+ BTScanOpaque so;
+
+ so = (BTScanOpaque) scan->opaque;
+
+ /* release any locks we still hold */
+ if (ItemPointerIsValid(iptr = &(scan->currentItemData))) {
+ _bt_relbuf(scan->relation, so->btso_curbuf, BT_READ);
+ so->btso_curbuf = InvalidBuffer;
+ ItemPointerSetInvalid(iptr);
+ }
+
+ scan->keyData[0].sk_argument = v;
+}
+
+/*
+ * btendscan() -- close down a scan
+ */
+void
+btendscan(IndexScanDesc scan)
+{
+ ItemPointer iptr;
+ BTScanOpaque so;
+
+ so = (BTScanOpaque) scan->opaque;
+
+ /* release any locks we still hold */
+ if (ItemPointerIsValid(iptr = &(scan->currentItemData))) {
+ if (BufferIsValid(so->btso_curbuf))
+ _bt_relbuf(scan->relation, so->btso_curbuf, BT_READ);
+ so->btso_curbuf = InvalidBuffer;
+ ItemPointerSetInvalid(iptr);
+ }
+
+ if (ItemPointerIsValid(iptr = &(scan->currentMarkData))) {
+ if (BufferIsValid(so->btso_mrkbuf))
+ _bt_relbuf(scan->relation, so->btso_mrkbuf, BT_READ);
+ so->btso_mrkbuf = InvalidBuffer;
+ ItemPointerSetInvalid(iptr);
+ }
+
+ /* don't need scan registered anymore */
+ _bt_dropscan(scan);
+
+ /* be tidy */
+#ifdef PERFECT_MMGR
+ pfree (scan->opaque);
+#endif /* PERFECT_MMGR */
+}
+
+/*
+ * btmarkpos() -- save current scan position
+ */
+void
+btmarkpos(IndexScanDesc scan)
+{
+ ItemPointer iptr;
+ BTScanOpaque so;
+
+ so = (BTScanOpaque) scan->opaque;
+
+ /* release lock on old marked data, if any */
+ if (ItemPointerIsValid(iptr = &(scan->currentMarkData))) {
+ _bt_relbuf(scan->relation, so->btso_mrkbuf, BT_READ);
+ so->btso_mrkbuf = InvalidBuffer;
+ ItemPointerSetInvalid(iptr);
+ }
+
+ /* bump lock on currentItemData and copy to currentMarkData */
+ if (ItemPointerIsValid(&(scan->currentItemData))) {
+ so->btso_mrkbuf = _bt_getbuf(scan->relation,
+ BufferGetBlockNumber(so->btso_curbuf),
+ BT_READ);
+ scan->currentMarkData = scan->currentItemData;
+ }
+}
+
+/*
+ * btrestrpos() -- restore scan to last saved position
+ */
+void
+btrestrpos(IndexScanDesc scan)
+{
+ ItemPointer iptr;
+ BTScanOpaque so;
+
+ so = (BTScanOpaque) scan->opaque;
+
+ /* release lock on current data, if any */
+ if (ItemPointerIsValid(iptr = &(scan->currentItemData))) {
+ _bt_relbuf(scan->relation, so->btso_curbuf, BT_READ);
+ so->btso_curbuf = InvalidBuffer;
+ ItemPointerSetInvalid(iptr);
+ }
+
+ /* bump lock on currentMarkData and copy to currentItemData */
+ if (ItemPointerIsValid(&(scan->currentMarkData))) {
+ so->btso_curbuf = _bt_getbuf(scan->relation,
+ BufferGetBlockNumber(so->btso_mrkbuf),
+ BT_READ);
+
+ scan->currentItemData = scan->currentMarkData;
+ }
+}
+
+/* stubs */
+void
+btdelete(Relation rel, ItemPointer tid)
+{
+ /* adjust any active scans that will be affected by this deletion */
+ _bt_adjscans(rel, tid);
+
+ /* delete the data from the page */
+ _bt_pagedel(rel, tid);
+}
diff --git a/src/backend/access/nbtree/nbtscan.c b/src/backend/access/nbtree/nbtscan.c
new file mode 100644
index 0000000000..62a029bc06
--- /dev/null
+++ b/src/backend/access/nbtree/nbtscan.c
@@ -0,0 +1,164 @@
+/*-------------------------------------------------------------------------
+ *
+ * nbtscan.c--
+ * manage scans on btrees.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/Attic/nbtscan.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $
+ *
+ *
+ * NOTES
+ * Because we can be doing an index scan on a relation while we update
+ * it, we need to avoid missing data that moves around in the index.
+ * The routines and global variables in this file guarantee that all
+ * scans in the local address space stay correctly positioned. This
+ * is all we need to worry about, since write locking guarantees that
+ * no one else will be on the same page at the same time as we are.
+ *
+ * The scheme is to manage a list of active scans in the current backend.
+ * Whenever we add or remove records from an index, or whenever we
+ * split a leaf page, we check the list of active scans to see if any
+ * has been affected. A scan is affected only if it is on the same
+ * relation, and the same page, as the update.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
+
+#include "utils/elog.h"
+#include "utils/palloc.h"
+#include "utils/rel.h"
+#include "utils/excid.h"
+
+#include "access/heapam.h"
+#include "access/genam.h"
+#include "access/sdir.h"
+#include "access/nbtree.h"
+
+typedef struct BTScanListData {
+ IndexScanDesc btsl_scan;
+ struct BTScanListData *btsl_next;
+} BTScanListData;
+
+typedef BTScanListData *BTScanList;
+
+static BTScanList BTScans = (BTScanList) NULL;
+
+/*
+ * _bt_regscan() -- register a new scan.
+ */
+void
+_bt_regscan(IndexScanDesc scan)
+{
+ BTScanList new_el;
+
+ new_el = (BTScanList) palloc(sizeof(BTScanListData));
+ new_el->btsl_scan = scan;
+ new_el->btsl_next = BTScans;
+ BTScans = new_el;
+}
+
+/*
+ * _bt_dropscan() -- drop a scan from the scan list
+ */
+void
+_bt_dropscan(IndexScanDesc scan)
+{
+ BTScanList chk, last;
+
+ last = (BTScanList) NULL;
+ for (chk = BTScans;
+ chk != (BTScanList) NULL && chk->btsl_scan != scan;
+ chk = chk->btsl_next) {
+ last = chk;
+ }
+
+ if (chk == (BTScanList) NULL)
+ elog(WARN, "btree scan list trashed; can't find 0x%lx", scan);
+
+ if (last == (BTScanList) NULL)
+ BTScans = chk->btsl_next;
+ else
+ last->btsl_next = chk->btsl_next;
+
+#ifdef PERFECT_MEM
+ pfree (chk);
+#endif /* PERFECT_MEM */
+}
+
+void
+_bt_adjscans(Relation rel, ItemPointer tid)
+{
+ BTScanList l;
+ Oid relid;
+
+ relid = rel->rd_id;
+ for (l = BTScans; l != (BTScanList) NULL; l = l->btsl_next) {
+ if (relid == l->btsl_scan->relation->rd_id)
+ _bt_scandel(l->btsl_scan, ItemPointerGetBlockNumber(tid),
+ ItemPointerGetOffsetNumber(tid));
+ }
+}
+
+void
+_bt_scandel(IndexScanDesc scan, BlockNumber blkno, OffsetNumber offno)
+{
+ ItemPointer current;
+ Buffer buf;
+ BTScanOpaque so;
+
+ if (!_bt_scantouched(scan, blkno, offno))
+ return;
+
+ so = (BTScanOpaque) scan->opaque;
+ buf = so->btso_curbuf;
+
+ current = &(scan->currentItemData);
+ if (ItemPointerIsValid(current)
+ && ItemPointerGetBlockNumber(current) == blkno
+ && ItemPointerGetOffsetNumber(current) >= offno) {
+ _bt_step(scan, &buf, BackwardScanDirection);
+ so->btso_curbuf = buf;
+ }
+
+ current = &(scan->currentMarkData);
+ if (ItemPointerIsValid(current)
+ && ItemPointerGetBlockNumber(current) == blkno
+ && ItemPointerGetOffsetNumber(current) >= offno) {
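+ /*
+ * swap the mark into currentItemData so that _bt_step() can back
+ * it off the deleted item, then swap it back.
+ */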
+ ItemPointerData tmp;
+ tmp = *current;
+ *current = scan->currentItemData;
+ scan->currentItemData = tmp;
+ _bt_step(scan, &buf, BackwardScanDirection);
+ so->btso_mrkbuf = buf;
+ tmp = *current;
+ *current = scan->currentItemData;
+ scan->currentItemData = tmp;
+ }
+}
+
+bool
+_bt_scantouched(IndexScanDesc scan, BlockNumber blkno, OffsetNumber offno)
+{
+ ItemPointer current;
+
+ current = &(scan->currentItemData);
+ if (ItemPointerIsValid(current)
+ && ItemPointerGetBlockNumber(current) == blkno
+ && ItemPointerGetOffsetNumber(current) >= offno)
+ return (true);
+
+ current = &(scan->currentMarkData);
+ if (ItemPointerIsValid(current)
+ && ItemPointerGetBlockNumber(current) == blkno
+ && ItemPointerGetOffsetNumber(current) >= offno)
+ return (true);
+
+ return (false);
+}
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
new file mode 100644
index 0000000000..d7a7fc7d62
--- /dev/null
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -0,0 +1,1133 @@
+/*-------------------------------------------------------------------------
+ *
+ * nbtsearch.c--
+ * search code for postgres btrees.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
+
+#include "utils/elog.h"
+#include "utils/palloc.h"
+#include "utils/rel.h"
+#include "utils/excid.h"
+
+#include "fmgr.h"
+
+#include "access/heapam.h"
+#include "access/genam.h"
+#include "access/skey.h"
+#include "access/sdir.h"
+#include "access/nbtree.h"
+
+static BTStack _bt_searchr(Relation rel, int keysz, ScanKey scankey, Buffer *bufP, BTStack stack_in);
+static OffsetNumber _bt_firsteq(Relation rel, TupleDesc itupdesc, Page page, Size keysz, ScanKey scankey, OffsetNumber offnum);
+static int _bt_compare(Relation rel, TupleDesc itupdesc, Page page, int keysz, ScanKey scankey, OffsetNumber offnum);
+static bool _bt_twostep(IndexScanDesc scan, Buffer *bufP, ScanDirection dir);
+static RetrieveIndexResult _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
+
+/*
+ * _bt_search() -- Search for a scan key in the index.
+ *
+ * This routine is actually just a helper that sets things up and
+ * calls a recursive-descent search routine on the tree.
+ */
+BTStack
+_bt_search(Relation rel, int keysz, ScanKey scankey, Buffer *bufP)
+{
+ *bufP = _bt_getroot(rel, BT_READ);
+ return (_bt_searchr(rel, keysz, scankey, bufP, (BTStack) NULL));
+}
+
+/*
+ * _bt_searchr() -- Search the tree recursively for a particular scankey.
+ */
+static BTStack
+_bt_searchr(Relation rel,
+ int keysz,
+ ScanKey scankey,
+ Buffer *bufP,
+ BTStack stack_in)
+{
+ BTStack stack;
+ OffsetNumber offnum;
+ Page page;
+ BTPageOpaque opaque;
+ BlockNumber par_blkno;
+ BlockNumber blkno;
+ ItemId itemid;
+ BTItem btitem;
+ BTItem item_save;
+ int item_nbytes;
+ IndexTuple itup;
+
+ /* if this is a leaf page, we're done */
+ page = BufferGetPage(*bufP);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ if (opaque->btpo_flags & BTP_LEAF)
+ return (stack_in);
+
+ /*
+ * Find the appropriate item on the internal page, and get the child
+ * page that it points to.
+ */
+
+ par_blkno = BufferGetBlockNumber(*bufP);
+ offnum = _bt_binsrch(rel, *bufP, keysz, scankey, BT_DESCENT);
+ itemid = PageGetItemId(page, offnum);
+ btitem = (BTItem) PageGetItem(page, itemid);
+ itup = &(btitem->bti_itup);
+ blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
+
+ /*
+ * We need to save the bit image of the index entry we chose in the
+ * parent page on a stack. In case we split the tree, we'll use this
+ * bit image to figure out what our real parent page is, in case the
+ * parent splits while we're working lower in the tree. See the paper
+ * by Lehman and Yao for how this is detected and handled. (We use
+ * unique OIDs to disambiguate duplicate keys in the index -- Lehman
+ * and Yao disallow duplicate keys).
+ */
+
+ item_nbytes = ItemIdGetLength(itemid);
+ item_save = (BTItem) palloc(item_nbytes);
+ memmove((char *) item_save, (char *) btitem, item_nbytes);
+ stack = (BTStack) palloc(sizeof(BTStackData));
+ stack->bts_blkno = par_blkno;
+ stack->bts_offset = offnum;
+ stack->bts_btitem = item_save;
+ stack->bts_parent = stack_in;
+
+ /* drop the read lock on the parent page and acquire one on the child */
+ _bt_relbuf(rel, *bufP, BT_READ);
+ *bufP = _bt_getbuf(rel, blkno, BT_READ);
+
+ /*
+ * Race -- the page we just grabbed may have split since we read its
+ * pointer in the parent. If it has, we may need to move right to its
+ * new sibling. Do that.
+ */
+
+ *bufP = _bt_moveright(rel, *bufP, keysz, scankey, BT_READ);
+
+ /* okay, all set to move down a level */
+ return (_bt_searchr(rel, keysz, scankey, bufP, stack));
+}
+
+/*
+ * _bt_moveright() -- move right in the btree if necessary.
+ *
+ * When we drop and reacquire a pointer to a page, it is possible that
+ * the page has changed in the meanwhile. If this happens, we're
+ * guaranteed that the page has "split right" -- that is, that any
+ * data that appeared on the page originally is either on the page
+ * or strictly to the right of it.
+ *
+ * This routine decides whether or not we need to move right in the
+ * tree by examining the high key entry on the page. If that entry
+ * is strictly less than the key we are looking for, then our
+ * picture of the page is incorrect and we need to move right.
+ *
+ * On entry, we have the buffer pinned and a lock of the proper type.
+ * If we move right, we release the buffer and lock and acquire the
+ * same on the right sibling.
+ */
+Buffer
+_bt_moveright(Relation rel,
+ Buffer buf,
+ int keysz,
+ ScanKey scankey,
+ int access)
+{
+ Page page;
+ BTPageOpaque opaque;
+ ItemId hikey;
+ ItemId itemid;
+ BlockNumber rblkno;
+
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ /* if we're on a rightmost page, we don't need to move right */
+ if (P_RIGHTMOST(opaque))
+ return (buf);
+
+ /* by convention, item 0 on non-rightmost pages is the high key */
+ hikey = PageGetItemId(page, P_HIKEY);
+
+ /*
+ * If the scan key that brought us to this page is >= the high key
+ * stored on the page, then the page has split and we need to move
+ * right.
+ */
+
+ if (_bt_skeycmp(rel, keysz, scankey, page, hikey,
+ BTGreaterEqualStrategyNumber)) {
+
+ /* move right as long as we need to */
+ do {
+ /*
+ * If this page consists of all duplicate keys (hikey and first
+ * key on the page have the same value), then we don't need to
+ * step right.
+ */
+ if (PageGetMaxOffsetNumber(page) > P_HIKEY) {
+ itemid = PageGetItemId(page, P_FIRSTKEY);
+ if (_bt_skeycmp(rel, keysz, scankey, page, itemid,
+ BTEqualStrategyNumber)) {
+ /* break is for the "move right" while loop */
+ break;
+ }
+ }
+
+ /* step right one page */
+ rblkno = opaque->btpo_next;
+ _bt_relbuf(rel, buf, access);
+ buf = _bt_getbuf(rel, rblkno, access);
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ hikey = PageGetItemId(page, P_HIKEY);
+
+ } while (! P_RIGHTMOST(opaque)
+ && _bt_skeycmp(rel, keysz, scankey, page, hikey,
+ BTGreaterEqualStrategyNumber));
+ }
+ return (buf);
+}
+
+/*
+ * _bt_skeycmp() -- compare a scan key to a particular item on a page using
+ * a requested strategy (<, <=, =, >=, >).
+ *
+ * We ignore the unique OIDs stored in the btree item here. Those
+ * numbers are intended for use internally only, in repositioning a
+ * scan after a page split. They do not impose any meaningful ordering.
+ *
+ * The comparison is A <op> B, where A is the scan key and B is the
+ * tuple pointed at by itemid on page.
+ */
+bool
+_bt_skeycmp(Relation rel,
+ Size keysz,
+ ScanKey scankey,
+ Page page,
+ ItemId itemid,
+ StrategyNumber strat)
+{
+ BTItem item;
+ IndexTuple indexTuple;
+ TupleDesc tupDes;
+ ScanKey entry;
+ int i;
+ Datum attrDatum;
+ Datum keyDatum;
+ bool compare;
+ bool isNull;
+
+ item = (BTItem) PageGetItem(page, itemid);
+ indexTuple = &(item->bti_itup);
+
+ tupDes = RelationGetTupleDescriptor(rel);
+
+ /* see if the comparison is true for all of the key attributes */
+ for (i=1; i <= keysz; i++) {
+
+ entry = &scankey[i-1];
+ attrDatum = index_getattr(indexTuple,
+ entry->sk_attno,
+ tupDes,
+ &isNull);
+ keyDatum = entry->sk_argument;
+
+ compare = _bt_invokestrat(rel, i, strat, keyDatum, attrDatum);
+ if (!compare)
+ return (false);
+ }
+
+ return (true);
+}
+
+/*
+ * _bt_binsrch() -- Do a binary search for a key on a particular page.
+ *
+ * The scankey we get has the compare function stored in the procedure
+ * entry of each data struct. We invoke this regproc to do the
+ * comparison for every key in the scankey. _bt_binsrch() returns
+ * the OffsetNumber of the first matching key on the page, or the
+ * OffsetNumber at which the matching key would appear if it were
+ * on this page.
+ *
+ * By the time this procedure is called, we're sure we're looking
+ * at the right page -- don't need to walk right. _bt_binsrch() has
+ * no lock or refcount side effects on the buffer.
+ */
+OffsetNumber
+_bt_binsrch(Relation rel,
+ Buffer buf,
+ int keysz,
+ ScanKey scankey,
+ int srchtype)
+{
+ TupleDesc itupdesc;
+ Page page;
+ BTPageOpaque opaque;
+ OffsetNumber low, mid, high;
+ bool match;
+ int result;
+
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ /* by convention, item 0 on any non-rightmost page is the high key */
+ low = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
+
+ high = PageGetMaxOffsetNumber(page);
+
+ /*
+ * Since for non-rightmost pages, the zeroeth item on the page is the
+ * high key, there are two notions of emptiness. One is if nothing
+ * appears on the page. The other is if nothing but the high key does.
+ * The reason we test high <= low, rather than high == low, is that
+ * after vacuuming there may be nothing *but* the high key on a page.
+ * In that case, given the scheme above, low = 1 and high = 0.
+ */
+
+ if (PageIsEmpty(page) || (! P_RIGHTMOST(opaque) && high <= low))
+ return (low);
+
+ itupdesc = RelationGetTupleDescriptor(rel);
+ match = false;
+
+ while ((high - low) > 1) {
+ mid = low + ((high - low) / 2);
+ result = _bt_compare(rel, itupdesc, page, keysz, scankey, mid);
+
+ if (result > 0)
+ low = mid;
+ else if (result < 0)
+ high = mid - 1;
+ else {
+ match = true;
+ break;
+ }
+ }
+
+ /* if we found a match, we want to find the first one on the page */
+ if (match) {
+ return (_bt_firsteq(rel, itupdesc, page, keysz, scankey, mid));
+ } else {
+
+ /*
+ * We terminated because the endpoints got too close together. There
+ * are two cases to take care of.
+ *
+ * For non-insertion searches on internal pages, we want to point at
+ * the last key <, or first key =, the scankey on the page. This
+ * guarantees that we'll descend the tree correctly.
+ *
+ * For all other cases, we want to point at the first key >=
+ * the scankey on the page. This guarantees that scans and
+ * insertions will happen correctly.
+ */
+
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ if (!(opaque->btpo_flags & BTP_LEAF) && srchtype == BT_DESCENT) {
+
+ /*
+ * We want the last key <, or first key ==, the scan key.
+ */
+
+ result = _bt_compare(rel, itupdesc, page, keysz, scankey, high);
+
+ if (result == 0) {
+ return (_bt_firsteq(rel, itupdesc, page, keysz, scankey, high));
+ } else if (result > 0) {
+ return (high);
+ } else {
+ return (low);
+ }
+ } else {
+
+ /* we want the first key >= the scan key */
+ result = _bt_compare(rel, itupdesc, page, keysz, scankey, low);
+ if (result <= 0) {
+ return (low);
+ } else {
+ if (low == high)
+ return (OffsetNumberNext(low));
+
+ result = _bt_compare(rel, itupdesc, page, keysz, scankey, high);
+ if (result <= 0)
+ return (high);
+ else
+ return (OffsetNumberNext(high));
+ }
+ }
+ }
+}
+
+static OffsetNumber
+_bt_firsteq(Relation rel,
+ TupleDesc itupdesc,
+ Page page,
+ Size keysz,
+ ScanKey scankey,
+ OffsetNumber offnum)
+{
+ BTPageOpaque opaque;
+ OffsetNumber limit;
+
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ /* skip the high key, if any */
+ limit = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
+
+ /* walk backwards looking for the first key in the chain of duplicates */
+ while (offnum > limit
+ && _bt_compare(rel, itupdesc, page,
+ keysz, scankey, OffsetNumberPrev(offnum)) == 0) {
+ offnum = OffsetNumberPrev(offnum);
+ }
+
+ return (offnum);
+}
+
+/*
+ * _bt_compare() -- Compare scankey to a particular tuple on the page.
+ *
+ * This routine returns:
+ * -1 if scankey < tuple at offnum;
+ * 0 if scankey == tuple at offnum;
+ * +1 if scankey > tuple at offnum.
+ *
+ * In order to avoid having to propagate changes up the tree any time
+ * a new minimal key is inserted, the leftmost entry on the leftmost
+ * page is less than all possible keys, by definition.
+ */
+static int
+_bt_compare(Relation rel,
+ TupleDesc itupdesc,
+ Page page,
+ int keysz,
+ ScanKey scankey,
+ OffsetNumber offnum)
+{
+ Datum datum;
+ BTItem btitem;
+ ItemId itemid;
+ IndexTuple itup;
+ BTPageOpaque opaque;
+ ScanKey entry;
+ AttrNumber attno;
+ int result;
+ int i;
+ bool null;
+
+ /*
+ * If this is a leftmost internal page, and if our comparison is
+ * with the first key on the page, then the item at that position is
+ * by definition less than the scan key.
+ */
+
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ if (!(opaque->btpo_flags & BTP_LEAF)
+ && P_LEFTMOST(opaque)
+ && offnum == P_HIKEY) {
+ itemid = PageGetItemId(page, offnum);
+
+ /*
+ * we just have to believe that this will only be called with
+ * offnum == P_HIKEY when P_HIKEY is the OffsetNumber of the
+ * first actual data key (i.e., this is also a rightmost
+ * page). there doesn't seem to be any code that implies
+ * that the leftmost page is normally missing a high key as
+ * well as the rightmost page. but that implies that this
+ * code path only applies to the root -- which seems
+ * unlikely.
+ */
+ if (! P_RIGHTMOST(opaque)) {
+ elog(WARN, "_bt_compare: invalid comparison to high key");
+ }
+
+ /*
+ * If the item on the page is equal to the scankey, that's
+ * okay to admit. We just can't claim that the first key on
+ * the page is greater than anything.
+ */
+
+ if (_bt_skeycmp(rel, keysz, scankey, page, itemid,
+ BTEqualStrategyNumber)) {
+ return (0);
+ }
+ return (1);
+ }
+
+ btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
+ itup = &(btitem->bti_itup);
+
+ /*
+ * The scan key is set up with the attribute number associated with each
+ * term in the key. It is important that, if the index is multi-key,
+ * the scan contain the first k key attributes, and that they be in
+ * order. If you think about how multi-key ordering works, you'll
+ * understand why this is.
+ *
+ * We don't test for violation of this condition here.
+ */
+
+ for (i = 1; i <= keysz; i++) {
+ long tmpres;
+
+ entry = &scankey[i - 1];
+ attno = entry->sk_attno;
+ datum = index_getattr(itup, attno, itupdesc, &null);
+ tmpres = (long) FMGR_PTR2(entry->sk_func, entry->sk_procedure,
+ entry->sk_argument, datum);
+ result = tmpres;
+
+ /* if the keys are unequal, return the difference */
+ if (result != 0)
+ return (result);
+ }
+
+ /* by here, the keys are equal */
+ return (0);
+}
+
+/*
+ * _bt_next() -- Get the next item in a scan.
+ *
+ * On entry, we have a valid currentItemData in the scan, and a
+ * read lock on the page that contains that item. We do not have
+ * the page pinned. We return the next item in the scan. On
+ * exit, we have the page containing the next item locked but not
+ * pinned.
+ */
+RetrieveIndexResult
+_bt_next(IndexScanDesc scan, ScanDirection dir)
+{
+ Relation rel;
+ Buffer buf;
+ Page page;
+ OffsetNumber offnum;
+ RetrieveIndexResult res;
+ BlockNumber blkno;
+ ItemPointer current;
+ ItemPointer iptr;
+ BTItem btitem;
+ IndexTuple itup;
+ BTScanOpaque so;
+
+ rel = scan->relation;
+ so = (BTScanOpaque) scan->opaque;
+ current = &(scan->currentItemData);
+
+ /*
+ * XXX 10 may 91: somewhere there's a bug in our management of the
+ * cached buffer for this scan. wei discovered it. the following
+ * is a workaround so he can work until i figure out what's going on.
+ */
+
+ if (!BufferIsValid(so->btso_curbuf))
+ so->btso_curbuf = _bt_getbuf(rel, ItemPointerGetBlockNumber(current),
+ BT_READ);
+
+ /* we still have the buffer pinned and locked */
+ buf = so->btso_curbuf;
+ blkno = BufferGetBlockNumber(buf);
+
+ /* step one tuple in the appropriate direction */
+ if (!_bt_step(scan, &buf, dir))
+ return ((RetrieveIndexResult) NULL);
+
+ /* by here, current is the tuple we want to return */
+ offnum = ItemPointerGetOffsetNumber(current);
+ page = BufferGetPage(buf);
+ btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
+ itup = &btitem->bti_itup;
+
+ if (_bt_checkqual(scan, itup)) {
+ iptr = (ItemPointer) palloc(sizeof(ItemPointerData));
+ memmove((char *) iptr, (char *) &(itup->t_tid),
+ sizeof(ItemPointerData));
+ res = FormRetrieveIndexResult(current, iptr);
+
+ /* remember which buffer we have pinned and locked */
+ so->btso_curbuf = buf;
+ } else {
+ ItemPointerSetInvalid(current);
+ so->btso_curbuf = InvalidBuffer;
+ _bt_relbuf(rel, buf, BT_READ);
+ res = (RetrieveIndexResult) NULL;
+ }
+
+ return (res);
+}
+
+/*
+ * _bt_first() -- Find the first item in a scan.
+ *
+ * We need to be clever about the type of scan, the operation it's
+ * performing, and the tree ordering. We return the RetrieveIndexResult
+ * of the first item in the tree that satisfies the qualification
+ * associated with the scan descriptor. On exit, the page containing
+ * the current index tuple is read locked and pinned, and the scan's
+ * opaque data entry is updated to include the buffer.
+ */
+RetrieveIndexResult
+_bt_first(IndexScanDesc scan, ScanDirection dir)
+{
+ Relation rel;
+ TupleDesc itupdesc;
+ Buffer buf;
+ Page page;
+ BTStack stack;
+ OffsetNumber offnum, maxoff;
+ BTItem btitem;
+ IndexTuple itup;
+ ItemPointer current;
+ ItemPointer iptr;
+ BlockNumber blkno;
+ StrategyNumber strat;
+ RetrieveIndexResult res;
+ RegProcedure proc;
+ int result;
+ BTScanOpaque so;
+ ScanKeyData skdata;
+
+ /* if we just need to walk down one edge of the tree, do that */
+ if (scan->scanFromEnd)
+ return (_bt_endpoint(scan, dir));
+
+ rel = scan->relation;
+ itupdesc = RelationGetTupleDescriptor(scan->relation);
+ current = &(scan->currentItemData);
+ so = (BTScanOpaque) scan->opaque;
+
+ /*
+ * Okay, we want something more complicated. What we'll do is use
+ * the first item in the scan key passed in (which has been correctly
+ * ordered to take advantage of index ordering) to position ourselves
+ * at the right place in the scan.
+ */
+
+ /*
+ * XXX -- The attribute number stored in the scan key is the attno
+ * in the heap relation. We need to transmogrify this into
+ * the index relation attno here. For the moment, we have
+ * hardwired attno == 1.
+ */
+ proc = index_getprocid(rel, 1, BTORDER_PROC);
+ ScanKeyEntryInitialize(&skdata, 0x0, 1, proc,
+ scan->keyData[0].sk_argument);
+
+ stack = _bt_search(rel, 1, &skdata, &buf);
+ _bt_freestack(stack);
+
+ /* find the nearest match to the manufactured scan key on the page */
+ offnum = _bt_binsrch(rel, buf, 1, &skdata, BT_DESCENT);
+ page = BufferGetPage(buf);
+
+ /*
+ * This will happen if the tree we're searching is entirely empty,
+ * or if we're doing a search for a key that would appear on an
+ * entirely empty internal page. In either case, there are no
+ * matching tuples in the index.
+ */
+
+ if (PageIsEmpty(page)) {
+ ItemPointerSetInvalid(current);
+ so->btso_curbuf = InvalidBuffer;
+ _bt_relbuf(rel, buf, BT_READ);
+ return ((RetrieveIndexResult) NULL);
+ }
+
+ maxoff = PageGetMaxOffsetNumber(page);
+
+ if (offnum > maxoff)
+ offnum = maxoff;
+
+ blkno = BufferGetBlockNumber(buf);
+ ItemPointerSet(current, blkno, offnum);
+
+ /*
+ * Now find the right place to start the scan. Result is the
+ * value we're looking for minus the value we're looking at
+ * in the index.
+ */
+
+ result = _bt_compare(rel, itupdesc, page, 1, &skdata, offnum);
+ strat = _bt_getstrat(rel, 1, scan->keyData[0].sk_procedure);
+
+ switch (strat) {
+ case BTLessStrategyNumber:
+ if (result <= 0) {
+ do {
+ if (!_bt_twostep(scan, &buf, BackwardScanDirection))
+ break;
+
+ offnum = ItemPointerGetOffsetNumber(current);
+ page = BufferGetPage(buf);
+ result = _bt_compare(rel, itupdesc, page, 1, &skdata, offnum);
+ } while (result <= 0);
+
+ /* if this is true, the key we just looked at is gone */
+ if (result > 0)
+ (void) _bt_twostep(scan, &buf, ForwardScanDirection);
+ }
+ break;
+
+ case BTLessEqualStrategyNumber:
+ if (result >= 0) {
+ do {
+ if (!_bt_twostep(scan, &buf, ForwardScanDirection))
+ break;
+
+ offnum = ItemPointerGetOffsetNumber(current);
+ page = BufferGetPage(buf);
+ result = _bt_compare(rel, itupdesc, page, 1, &skdata, offnum);
+ } while (result >= 0);
+
+ if (result < 0)
+ (void) _bt_twostep(scan, &buf, BackwardScanDirection);
+ }
+ break;
+
+ case BTEqualStrategyNumber:
+ if (result != 0) {
+ _bt_relbuf(scan->relation, buf, BT_READ);
+ so->btso_curbuf = InvalidBuffer;
+ ItemPointerSetInvalid(&(scan->currentItemData));
+ return ((RetrieveIndexResult) NULL);
+ }
+ break;
+
+ case BTGreaterEqualStrategyNumber:
+ if (result < 0) {
+ do {
+ if (!_bt_twostep(scan, &buf, BackwardScanDirection))
+ break;
+
+ page = BufferGetPage(buf);
+ offnum = ItemPointerGetOffsetNumber(current);
+ result = _bt_compare(rel, itupdesc, page, 1, &skdata, offnum);
+ } while (result < 0);
+
+ if (result > 0)
+ (void) _bt_twostep(scan, &buf, ForwardScanDirection);
+ }
+ break;
+
+ case BTGreaterStrategyNumber:
+ if (result >= 0) {
+ do {
+ if (!_bt_twostep(scan, &buf, ForwardScanDirection))
+ break;
+
+ offnum = ItemPointerGetOffsetNumber(current);
+ page = BufferGetPage(buf);
+ result = _bt_compare(rel, itupdesc, page, 1, &skdata, offnum);
+ } while (result >= 0);
+ }
+ break;
+ }
+
+ /* okay, current item pointer for the scan is right */
+ offnum = ItemPointerGetOffsetNumber(current);
+ page = BufferGetPage(buf);
+ btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
+ itup = &btitem->bti_itup;
+
+ if (_bt_checkqual(scan, itup)) {
+ iptr = (ItemPointer) palloc(sizeof(ItemPointerData));
+ memmove((char *) iptr, (char *) &(itup->t_tid),
+ sizeof(ItemPointerData));
+ res = FormRetrieveIndexResult(current, iptr);
+ pfree(iptr);
+
+ /* remember which buffer we have pinned */
+ so->btso_curbuf = buf;
+ } else {
+ ItemPointerSetInvalid(current);
+ so->btso_curbuf = InvalidBuffer;
+ _bt_relbuf(rel, buf, BT_READ);
+ res = (RetrieveIndexResult) NULL;
+ }
+
+ return (res);
+}
+
+/*
+ * _bt_step() -- Step one item in the requested direction in a scan on
+ * the tree.
+ *
+ * If no adjacent record exists in the requested direction, return
+ * false. Else, return true and set the currentItemData for the
+ * scan to the right thing.
+ */
+bool
+_bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
+{
+ Page page;
+ BTPageOpaque opaque;
+ OffsetNumber offnum, maxoff;
+ OffsetNumber start;
+ BlockNumber blkno;
+ BlockNumber obknum;
+ BTScanOpaque so;
+ ItemPointer current;
+ Relation rel;
+
+ rel = scan->relation;
+ current = &(scan->currentItemData);
+ offnum = ItemPointerGetOffsetNumber(current);
+ page = BufferGetPage(*bufP);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ so = (BTScanOpaque) scan->opaque;
+ maxoff = PageGetMaxOffsetNumber(page);
+
+ /* get the next tuple */
+ if (ScanDirectionIsForward(dir)) {
+ if (!PageIsEmpty(page) && offnum < maxoff) {
+ offnum = OffsetNumberNext(offnum);
+ } else {
+
+ /* if we're at end of scan, release the buffer and return */
+ blkno = opaque->btpo_next;
+ if (P_RIGHTMOST(opaque)) {
+ _bt_relbuf(rel, *bufP, BT_READ);
+ ItemPointerSetInvalid(current);
+ *bufP = so->btso_curbuf = InvalidBuffer;
+ return (false);
+ } else {
+
+ /* walk right to the next page with data */
+ _bt_relbuf(rel, *bufP, BT_READ);
+ for (;;) {
+ *bufP = _bt_getbuf(rel, blkno, BT_READ);
+ page = BufferGetPage(*bufP);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ maxoff = PageGetMaxOffsetNumber(page);
+ start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
+
+ if (!PageIsEmpty(page) && start <= maxoff) {
+ break;
+ } else {
+ blkno = opaque->btpo_next;
+ _bt_relbuf(rel, *bufP, BT_READ);
+ if (blkno == P_NONE) {
+ *bufP = so->btso_curbuf = InvalidBuffer;
+ ItemPointerSetInvalid(current);
+ return (false);
+ }
+ }
+ }
+ offnum = start;
+ }
+ }
+ } else if (ScanDirectionIsBackward(dir)) {
+
+ /* remember that high key is item zero on non-rightmost pages */
+ start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
+
+ if (offnum > start) {
+ offnum = OffsetNumberPrev(offnum);
+ } else {
+
+ /* if we're at end of scan, release the buffer and return */
+ blkno = opaque->btpo_prev;
+ if (P_LEFTMOST(opaque)) {
+ _bt_relbuf(rel, *bufP, BT_READ);
+ *bufP = so->btso_curbuf = InvalidBuffer;
+ ItemPointerSetInvalid(current);
+ return (false);
+ } else {
+
+ obknum = BufferGetBlockNumber(*bufP);
+
+ /* walk right to the next page with data */
+ _bt_relbuf(rel, *bufP, BT_READ);
+ for (;;) {
+ *bufP = _bt_getbuf(rel, blkno, BT_READ);
+ page = BufferGetPage(*bufP);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ maxoff = PageGetMaxOffsetNumber(page);
+
+ /*
+ * If the adjacent page just split, then we may have the
+ * wrong block. Handle this case. Because pages only
+ * split right, we don't have to worry about this failing
+ * to terminate.
+ */
+
+ while (opaque->btpo_next != obknum) {
+ blkno = opaque->btpo_next;
+ _bt_relbuf(rel, *bufP, BT_READ);
+ *bufP = _bt_getbuf(rel, blkno, BT_READ);
+ page = BufferGetPage(*bufP);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ maxoff = PageGetMaxOffsetNumber(page);
+ }
+
+ /* don't consider the high key */
+ start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
+
+ /* anything to look at here? */
+ if (!PageIsEmpty(page) && maxoff >= start) {
+ break;
+ } else {
+ blkno = opaque->btpo_prev;
+ obknum = BufferGetBlockNumber(*bufP);
+ _bt_relbuf(rel, *bufP, BT_READ);
+ if (blkno == P_NONE) {
+ *bufP = so->btso_curbuf = InvalidBuffer;
+ ItemPointerSetInvalid(current);
+ return (false);
+ }
+ }
+ }
+ offnum = maxoff; /* XXX PageIsEmpty? */
+ }
+ }
+ }
+ blkno = BufferGetBlockNumber(*bufP);
+ so->btso_curbuf = *bufP;
+ ItemPointerSet(current, blkno, offnum);
+
+ return (true);
+}
+
+/*
+ * _bt_twostep() -- Move to an adjacent record in a scan on the tree,
+ * if an adjacent record exists.
+ *
+ * This is like _bt_step, except that if no adjacent record exists
+ * it restores us to where we were before trying the step. This is
+ * only hairy when you cross page boundaries, since the page you cross
+ * from could have records inserted or deleted, or could even split.
+ * This is unlikely, but we try to handle it correctly here anyway.
+ *
+ * This routine contains the only case in which our changes to Lehman
+ * and Yao's algorithm can be defeated by concurrent activity: if the
+ * step off the page fails, we may be unable to find our old position
+ * again (see the comments in the body below).
+ *
+ * Like step, this routine leaves the scan's currentItemData in the
+ * proper state and acquires a lock and pin on *bufP. If the twostep
+ * succeeded, we return true; otherwise, we return false.
+ */
+static bool
+_bt_twostep(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
+{
+ Page page;
+ BTPageOpaque opaque;
+ OffsetNumber offnum, maxoff;
+ OffsetNumber start;
+ ItemPointer current;
+ ItemId itemid;
+ int itemsz;
+ BTItem btitem;
+ BTItem svitem;
+ BlockNumber blkno;
+
+ blkno = BufferGetBlockNumber(*bufP);
+ page = BufferGetPage(*bufP);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ maxoff = PageGetMaxOffsetNumber(page);
+ current = &(scan->currentItemData);
+ offnum = ItemPointerGetOffsetNumber(current);
+
+ start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
+
+ /* if we're safe, just do it */
+ if (ScanDirectionIsForward(dir) && offnum < maxoff) { /* XXX PageIsEmpty? */
+ ItemPointerSet(current, blkno, OffsetNumberNext(offnum));
+ return (true);
+ } else if (ScanDirectionIsBackward(dir) && offnum > start) {
+ ItemPointerSet(current, blkno, OffsetNumberPrev(offnum));
+ return (true);
+ }
+
+ /* if we've hit end of scan we don't have to do any work */
+ if (ScanDirectionIsForward(dir) && P_RIGHTMOST(opaque)) {
+ return (false);
+ } else if (ScanDirectionIsBackward(dir) && P_LEFTMOST(opaque)) {
+ return (false);
+ }
+
+ /*
+ * Okay, it's off the page; let _bt_step() do the hard work, and we'll
+ * try to remember where we were. This is not guaranteed to work; this
+ * is the only place in the code where concurrency can screw us up,
+ * and it's because we want to be able to move in two directions in
+ * the scan.
+ */
+
+ itemid = PageGetItemId(page, offnum);
+ itemsz = ItemIdGetLength(itemid);
+ btitem = (BTItem) PageGetItem(page, itemid);
+ svitem = (BTItem) palloc(itemsz);
+ memmove((char *) svitem, (char *) btitem, itemsz);
+
+ if (_bt_step(scan, bufP, dir)) {
+ pfree(svitem);
+ return (true);
+ }
+
+ /* try to find our place again */
+ *bufP = _bt_getbuf(scan->relation, blkno, BT_READ);
+ page = BufferGetPage(*bufP);
+ maxoff = PageGetMaxOffsetNumber(page);
+
+ while (offnum <= maxoff) {
+ itemid = PageGetItemId(page, offnum);
+ btitem = (BTItem) PageGetItem(page, itemid);
+ if (btitem->bti_oid == svitem->bti_oid) {
+ pfree(svitem);
+ ItemPointerSet(current, blkno, offnum);
+ return (false);
+ }
+ /* advance to the next item; without this we would loop forever */
+ offnum = OffsetNumberNext(offnum);
+ }
+
+ /*
+ * XXX crash and burn -- can't find our place. We can be a little
+ * smarter -- walk to the next page to the right, for example, since
+ * that's the only direction that splits happen in. Deletions screw
+ * us up less often since they're only done by the vacuum daemon.
+ */
+
+ elog(WARN, "btree synchronization error: concurrent update botched scan");
+
+ return (false);
+}
+
+/*
+ * _bt_endpoint() -- Find the first or last key in the index.
+ */
+static RetrieveIndexResult
+_bt_endpoint(IndexScanDesc scan, ScanDirection dir)
+{
+ Relation rel;
+ Buffer buf;
+ Page page;
+ BTPageOpaque opaque;
+ ItemPointer current;
+ ItemPointer iptr;
+ OffsetNumber offnum, maxoff;
+ OffsetNumber start;
+ BlockNumber blkno;
+ BTItem btitem;
+ IndexTuple itup;
+ BTScanOpaque so;
+ RetrieveIndexResult res;
+
+ rel = scan->relation;
+ current = &(scan->currentItemData);
+
+ buf = _bt_getroot(rel, BT_READ);
+ blkno = BufferGetBlockNumber(buf);
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ for (;;) {
+ if (opaque->btpo_flags & BTP_LEAF)
+ break;
+
+ if (ScanDirectionIsForward(dir)) {
+ offnum = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
+ } else {
+ offnum = PageGetMaxOffsetNumber(page);
+ }
+
+ btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
+ itup = &(btitem->bti_itup);
+
+ blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
+
+ _bt_relbuf(rel, buf, BT_READ);
+ buf = _bt_getbuf(rel, blkno, BT_READ);
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ /*
+ * Race condition: If the child page we just stepped onto is
+ * in the process of being split, we need to make sure we're
+ * all the way at the right edge of the tree. See the paper
+ * by Lehman and Yao.
+ */
+
+ if (ScanDirectionIsBackward(dir) && ! P_RIGHTMOST(opaque)) {
+ do {
+ blkno = opaque->btpo_next;
+ _bt_relbuf(rel, buf, BT_READ);
+ buf = _bt_getbuf(rel, blkno, BT_READ);
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ } while (! P_RIGHTMOST(opaque));
+ }
+ }
+
+ /* okay, we've got the {left,right}-most page in the tree */
+
+ if (ScanDirectionIsForward(dir)) {
+ if (PageIsEmpty(page)) {
+ maxoff = FirstOffsetNumber;
+ } else {
+ maxoff = PageGetMaxOffsetNumber(page);
+ }
+ start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
+
+ if (PageIsEmpty(page) || start > maxoff) {
+ ItemPointerSet(current, blkno, maxoff);
+ if (!_bt_step(scan, &buf, BackwardScanDirection))
+ return ((RetrieveIndexResult) NULL);
+
+ start = ItemPointerGetOffsetNumber(current);
+ page = BufferGetPage(buf);
+ } else {
+ ItemPointerSet(current, blkno, start);
+ }
+ } else if (ScanDirectionIsBackward(dir)) {
+ if (PageIsEmpty(page)) {
+ ItemPointerSet(current, blkno, FirstOffsetNumber);
+ if (!_bt_step(scan, &buf, ForwardScanDirection))
+ return ((RetrieveIndexResult) NULL);
+
+ start = ItemPointerGetOffsetNumber(current);
+ page = BufferGetPage(buf);
+ } else {
+ start = PageGetMaxOffsetNumber(page);
+ ItemPointerSet(current, blkno, start);
+ }
+ } else {
+ elog(WARN, "Illegal scan direction %d", dir);
+ }
+
+ btitem = (BTItem) PageGetItem(page, PageGetItemId(page, start));
+ itup = &(btitem->bti_itup);
+
+ /* see if we picked a winner */
+ if (_bt_checkqual(scan, itup)) {
+ iptr = (ItemPointer) palloc(sizeof(ItemPointerData));
+ memmove((char *) iptr, (char *) &(itup->t_tid),
+ sizeof(ItemPointerData));
+ res = FormRetrieveIndexResult(current, iptr);
+
+ /* remember which buffer we have pinned */
+ so = (BTScanOpaque) scan->opaque;
+ so->btso_curbuf = buf;
+ } else {
+ _bt_relbuf(rel, buf, BT_READ);
+ res = (RetrieveIndexResult) NULL;
+ }
+
+ return (res);
+}
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
new file mode 100644
index 0000000000..3d2676324a
--- /dev/null
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -0,0 +1,1196 @@
+/*-------------------------------------------------------------------------
+ * nbtsort.c--
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Id: nbtsort.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $
+ *
+ * NOTES
+ *
+ * what we do is:
+ * - generate a set of initial one-block runs, distributed round-robin
+ * between the output tapes.
+ * - for each pass,
+ * - swap input and output tape sets, rewinding both and truncating
+ * the output tapes.
+ * - merge the current run in each input tape to the current output
+ * tape.
+ * - when each input run has been exhausted, switch to another output
+ * tape and start processing another run.
+ * - when we have fewer runs than tapes, we know we are ready to start
+ * merging into the btree leaf pages.
+ * - every time we complete a level of the btree, we can construct the
+ * next level up. when we have only one page on a level, it can be
+ * attached to the btree metapage and we are done.
+ *
+ * conventions:
+ * - external interface routines take in and return "void *" for their
+ * opaque handles. this is for modularity reasons (i prefer not to
+ * export these structures without good reason).
+ *
+ * this code is moderately slow (~10% slower) compared to the regular
+ * btree (insertion) build code on sorted or well-clustered data. on
+ * random data, however, the insertion build code is unusable -- the
+ * difference on a 60MB heap is a factor of 15 because the random
+ * probes into the btree thrash the buffer pool.
+ *
+ * this code currently packs the pages to 100% of capacity. this is
+ * not wise, since *any* insertion will cause splitting. filling to
+ * something like the standard 70% steady-state load factor for btrees
+ * would probably be better.
+ *
+ * somebody desperately needs to figure out how to do a better job of
+ * balancing the merge passes -- the fan-in on the final merges can be
+ * pretty poor, which is bad for performance.
+ *-------------------------------------------------------------------------
+ */
+
+#include <stdio.h>
+
+#include "c.h"
+
+#include "access/nbtree.h"
+
+#include "storage/bufmgr.h"
+#include "storage/fd.h"
+#include "utils/rel.h"
+#include "utils/palloc.h"
+#include "utils/elog.h"
+
+/*#define FASTBUILD_DEBUG*/ /* turn on debugging output */
+
+#define FASTBUILD
+
+#ifdef FASTBUILD
+
+#define MAXTAPES (7)
+#define TAPEBLCKSZ (BLCKSZ << 2)
+#define TAPETEMP "pg_btsortXXXXXX"
+
+
+/*-------------------------------------------------------------------------
+ * sorting comparison routine - returns {-1,0,1} depending on whether
+ * the key in the left BTItem is {<,=,>} the key in the right BTItem.
+ *
+ * we want to use _bt_isortcmp as a comparison function for qsort(3),
+ * but it needs extra arguments, so we "pass them in" as global
+ * variables. ick. fortunately, they are the same throughout the
+ * build, so we need do this only once. this is why you must call
+ * _bt_isortcmpinit before the call to qsort(3).
+ *
+ * a NULL BTItem is always assumed to be greater than any actual
+ * value; our heap routines (see below) assume that the smallest
+ * element in the heap is returned. that way, NULL values from the
+ * exhausted tapes can sift down to the bottom of the heap. in point
+ * of fact we just don't replace the elements of exhausted tapes, but
+ * what the heck.
+ *-------------------------------------------------------------------------
+ */
+static Relation _bt_sortrel;
+
+static void
+_bt_isortcmpinit(Relation index)
+{
+ _bt_sortrel = index;
+}
+
+static int
+_bt_isortcmp(BTItem *bti1p, BTItem *bti2p)
+{
+ BTItem bti1 = *bti1p;
+ BTItem bti2 = *bti2p;
+
+ if (bti1 == (BTItem) NULL) {
+ if (bti2 == (BTItem) NULL) {
+ return(0); /* 1 = 2 */
+ }
+ return(1); /* 1 > 2 */
+ } else if (bti2 == (BTItem) NULL) {
+ return(-1); /* 1 < 2 */
+ } else if (_bt_itemcmp(_bt_sortrel, 1, bti1, bti2,
+ BTGreaterStrategyNumber)) {
+ return(1); /* 1 > 2 */
+ } else if (_bt_itemcmp(_bt_sortrel, 1, bti2, bti1,
+ BTGreaterStrategyNumber)) {
+ return(-1); /* 1 < 2 */
+ }
+ return(0); /* 1 = 2 */
+}
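+
+/*
+ * usage note (illustrative only): because the comparison relation is
+ * hidden in the _bt_sortrel global, every sort of an array "arr" of
+ * "n" BTItem pointers on index "index" must look like
+ *
+ * _bt_isortcmpinit(index);
+ * qsort((void *) arr, n, sizeof(BTItem), _bt_isortcmp);
+ *
+ * as is done in _bt_spool() below.
+ */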
+
+/*-------------------------------------------------------------------------
+ * priority queue methods
+ *
+ * these were more-or-less lifted from the heap section of the 1984
+ * edition of gonnet's book on algorithms and data structures. they
+ * are coded so that the smallest element in the heap is returned (we
+ * use them for merging sorted runs).
+ *
+ * XXX these probably ought to be generic library functions.
+ *-------------------------------------------------------------------------
+ */
+
+typedef struct {
+ int btpqe_tape; /* tape identifier */
+ BTItem btpqe_item; /* pointer to BTItem in tape buffer */
+} BTPriQueueElem;
+
+#define MAXELEM MAXTAPES
+typedef struct {
+ int btpq_nelem;
+ BTPriQueueElem btpq_queue[MAXELEM];
+ Relation btpq_rel;
+} BTPriQueue;
+
+/* be sure to call _bt_isortcmpinit first */
+#define GREATER(a, b) \
+ (_bt_isortcmp(&((a)->btpqe_item), &((b)->btpqe_item)) > 0)
+
+static void
+_bt_pqsift(BTPriQueue *q, int parent)
+{
+ int child;
+ BTPriQueueElem e;
+
+ for (child = parent * 2 + 1;
+ child < q->btpq_nelem;
+ child = parent * 2 + 1) {
+ if (child < q->btpq_nelem - 1) {
+ if (GREATER(&(q->btpq_queue[child]), &(q->btpq_queue[child+1]))) {
+ ++child;
+ }
+ }
+ if (GREATER(&(q->btpq_queue[parent]), &(q->btpq_queue[child]))) {
+ e = q->btpq_queue[child]; /* struct = */
+ q->btpq_queue[child] = q->btpq_queue[parent]; /* struct = */
+ q->btpq_queue[parent] = e; /* struct = */
+ parent = child;
+ } else {
+ parent = child + 1;
+ }
+ }
+}
+
+static int
+_bt_pqnext(BTPriQueue *q, BTPriQueueElem *e)
+{
+ if (q->btpq_nelem < 1) { /* already empty */
+ return(-1);
+ }
+ *e = q->btpq_queue[0]; /* struct = */
+
+ if (--q->btpq_nelem < 1) { /* now empty, don't sift */
+ return(0);
+ }
+ q->btpq_queue[0] = q->btpq_queue[q->btpq_nelem]; /* struct = */
+ _bt_pqsift(q, 0);
+ return(0);
+}
+
+static void
+_bt_pqadd(BTPriQueue *q, BTPriQueueElem *e)
+{
+ int child, parent;
+
+ if (q->btpq_nelem >= MAXELEM) {
+ elog(WARN, "_bt_pqadd: queue overflow");
+ }
+
+ child = q->btpq_nelem++;
+ while (child > 0) {
+ parent = child / 2;
+ if (GREATER(e, &(q->btpq_queue[parent]))) {
+ break;
+ } else {
+ q->btpq_queue[child] = q->btpq_queue[parent]; /* struct = */
+ child = parent;
+ }
+ }
+
+ q->btpq_queue[child] = *e; /* struct = */
+}
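+
+/*
+ * illustrative sketch only (not part of the original file): the
+ * intended merge pattern for the queue above. each input tape
+ * contributes its smallest unconsumed BTItem; we repeatedly pull the
+ * global minimum and refill the queue from the tape it came from.
+ * "next_item_from_tape" is a hypothetical stand-in for the tape
+ * reading logic below, and _bt_isortcmpinit() must already have
+ * been called.
+ */
+#ifdef BTREE_USAGE_EXAMPLE
+extern BTItem next_item_from_tape(int tapenum); /* hypothetical */
+
+static void
+bt_merge_example(BTPriQueue *q)
+{
+ BTPriQueueElem e;
+ BTItem next;
+
+ while (_bt_pqnext(q, &e) >= 0) {
+ /* emit e.btpqe_item to the output run here */
+ next = next_item_from_tape(e.btpqe_tape);
+ if (next != (BTItem) NULL) {
+ e.btpqe_item = next;
+ _bt_pqadd(q, &e);
+ }
+ }
+}
+#endif /* BTREE_USAGE_EXAMPLE */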
+
+/*-------------------------------------------------------------------------
+ * tape methods
+ *-------------------------------------------------------------------------
+ */
+
+#define BTITEMSZ(btitem) \
+ ((btitem) ? \
+ (IndexTupleDSize((btitem)->bti_itup) + \
+ (sizeof(BTItemData) - sizeof(IndexTupleData))) : \
+ 0)
+#define SPCLEFT(tape) \
+ (sizeof((tape)->bttb_data) - (tape)->bttb_top)
+#define EMPTYTAPE(tape) \
+ ((tape)->bttb_ntup <= 0)
+#define BTTAPEMAGIC 0x19660226
+
+/*
+ * this is what we use to shovel BTItems in and out of memory. it's
+ * bigger than a standard block because we are doing a lot of strictly
+ * sequential i/o. this is obviously something of a tradeoff since we
+ * are potentially reading a bunch of zeroes off of disk in many
+ * cases.
+ *
+ * BTItems are packed in and DOUBLEALIGN'd.
+ *
+ * strictly speaking, the fd field should not be written out to disk
+ * along with the rest of the block, but it's the only field like that
+ * so i'm not going to worry about wasting a few bytes.
+ */
+typedef struct {
+ int bttb_magic; /* magic number */
+ int bttb_fd; /* file descriptor */
+ int bttb_top; /* top of free space within bttb_data */
+ short bttb_ntup; /* number of tuples in this block */
+ short bttb_eor; /* End-Of-Run marker */
+ char bttb_data[TAPEBLCKSZ - 2 * sizeof(double)];
+} BTTapeBlock;
+
+
+/*
+ * reset the tape header for its next use without doing anything to
+ * the physical tape file. (setting bttb_top to 0 makes the block
+ * empty.)
+ */
+static void
+_bt_tapereset(BTTapeBlock *tape)
+{
+ tape->bttb_eor = 0;
+ tape->bttb_top = 0;
+ tape->bttb_ntup = 0;
+}
+
+/*
+ * rewind the physical tape file.
+ */
+static void
+_bt_taperewind(BTTapeBlock *tape)
+{
+ (void) FileSeek(tape->bttb_fd, 0, SEEK_SET);
+}
+
+/*
+ * destroy the contents of the physical tape file without destroying
+ * the tape data structure or removing the physical tape file.
+ *
+ * we use the VFD version of ftruncate(2) to do this rather than
+ * unlinking and recreating the file. you still have to wait while
+ * the OS frees up all of the file system blocks and stuff, but at
+ * least you don't have to delete and reinsert the directory entries.
+ */
+static void
+_bt_tapeclear(BTTapeBlock *tape)
+{
+ /* blow away the contents of the old file */
+ _bt_taperewind(tape);
+#if 0
+ FileSync(tape->bttb_fd);
+#endif
+ FileTruncate(tape->bttb_fd, 0);
+
+ /* reset the buffer */
+ _bt_tapereset(tape);
+}
+
+/*
+ * create a new BTTapeBlock, allocating memory for the data structure
+ * as well as opening a physical tape file.
+ */
+static BTTapeBlock *
+_bt_tapecreate(char *fname)
+{
+ BTTapeBlock *tape = (BTTapeBlock *) palloc(sizeof(BTTapeBlock));
+
+ if (tape == (BTTapeBlock *) NULL) {
+ elog(WARN, "_bt_tapecreate: out of memory");
+ }
+
+ tape->bttb_magic = BTTAPEMAGIC;
+
+ tape->bttb_fd = FileNameOpenFile(fname, O_RDWR|O_CREAT|O_TRUNC, 0600);
+ Assert(tape->bttb_fd >= 0);
+
+ /* initialize the buffer */
+ _bt_tapereset(tape);
+
+ return(tape);
+}
+
+/*
+ * destroy the BTTapeBlock structure and its physical tape file.
+ */
+static void
+_bt_tapedestroy(BTTapeBlock *tape)
+{
+ FileUnlink(tape->bttb_fd);
+ pfree((void *) tape);
+}
+
+/*
+ * flush the tape block to the file, marking End-Of-Run if requested.
+ */
+static void
+_bt_tapewrite(BTTapeBlock *tape, int eor)
+{
+ tape->bttb_eor = eor;
+ FileWrite(tape->bttb_fd, (char*)tape, TAPEBLCKSZ);
+ _bt_tapereset(tape);
+}
+
+/*
+ * read a tape block from the file, overwriting the current contents
+ * of the buffer.
+ *
+ * returns:
+ * - 0 if there are no more blocks in the tape or in this run (call
+ * _bt_tapereset to clear the End-Of-Run marker)
+ * - 1 if a valid block was read
+ */
+static int
+_bt_taperead(BTTapeBlock *tape)
+{
+ int fd;
+ int nread;
+
+ if (tape->bttb_eor) {
+ return(0); /* we are at End-Of-Run */
+ }
+
+ /*
+ * we're clobbering the old tape block, but we do need to save the
+ * VFD (the one in the block we're reading is bogus).
+ */
+ fd = tape->bttb_fd;
+ nread = FileRead(fd, (char*) tape, TAPEBLCKSZ);
+ tape->bttb_fd = fd;
+
+ if (nread != TAPEBLCKSZ) {
+ Assert(nread == 0); /* we are at EOF */
+ return(0);
+ }
+ Assert(tape->bttb_magic == BTTAPEMAGIC);
+ return(1);
+}
+
+/*
+ * get the next BTItem from a tape block.
+ *
+ * returns:
+ * - NULL if we have run out of BTItems
+ * - a pointer to the BTItemData in the block otherwise
+ *
+ * side effects:
+ * - sets 'pos' to the current position within the block.
+ */
+static BTItem
+_bt_tapenext(BTTapeBlock *tape, char **pos)
+{
+ Size itemsz;
+ BTItem bti;
+
+ if (*pos >= tape->bttb_data + tape->bttb_top) {
+ return((BTItem) NULL);
+ }
+ bti = (BTItem) *pos;
+ itemsz = BTITEMSZ(bti);
+ *pos += DOUBLEALIGN(itemsz);
+ return(bti);
+}
+
+/*
+ * copy a BTItem into a tape block.
+ *
+ * assumes that we have already checked to see if the block has enough
+ * space for the item.
+ *
+ * side effects:
+ *
+ * - advances the 'top' pointer in the tape block header to point to
+ * the beginning of free space.
+ */
+static void
+_bt_tapeadd(BTTapeBlock *tape, BTItem item, int itemsz)
+{
+ (void) memcpy(tape->bttb_data + tape->bttb_top, item, itemsz);
+ ++tape->bttb_ntup;
+ tape->bttb_top += DOUBLEALIGN(itemsz);
+}
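+
+/*
+ * illustrative round trip (not part of the original file): buffer one
+ * BTItem, flush it as a one-block run, then read the run back and
+ * walk its items. assumes "bti" fits in the empty block; the itemsz
+ * handling mirrors _bt_spool() below.
+ */
+#ifdef BTREE_USAGE_EXAMPLE
+static void
+bt_tape_example(BTTapeBlock *tape, BTItem bti)
+{
+ Size itemsz = DOUBLEALIGN(BTITEMSZ(bti));
+ char *pos;
+ BTItem item;
+
+ _bt_tapeadd(tape, bti, itemsz); /* buffer the item */
+ _bt_tapewrite(tape, 1); /* write the block, mark End-Of-Run */
+ _bt_taperewind(tape);
+ while (_bt_taperead(tape) == 1) {
+ pos = tape->bttb_data;
+ while ((item = _bt_tapenext(tape, &pos)) != (BTItem) NULL) {
+ /* each item comes back in sorted-run order */
+ }
+ }
+ _bt_tapereset(tape); /* clear the End-Of-Run marker */
+}
+#endif /* BTREE_USAGE_EXAMPLE */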
+
+/*-------------------------------------------------------------------------
+ * spool methods
+ *-------------------------------------------------------------------------
+ */
+
+/*
+ * this structure holds the bookkeeping for a simple balanced multiway
+ * merge. (polyphase merging is hairier than i want to get into right
+ * now, and i don't see why i have to care how many "tapes" i use
+ * right now. though if psort was in a condition that i could hack it
+ * to do this, you bet i would.)
+ */
+typedef struct {
+ int bts_ntapes;
+ int bts_tape;
+ BTTapeBlock **bts_itape; /* input tape blocks */
+ BTTapeBlock **bts_otape; /* output tape blocks */
+} BTSpool;
+
+/*
+ * create and initialize a spool structure, including the underlying
+ * files.
+ */
+void *
+_bt_spoolinit(Relation index, int ntapes)
+{
+ char *mktemp();
+
+ BTSpool *btspool = (BTSpool *) palloc(sizeof(BTSpool));
+ int i;
+ char *fname = (char *) palloc(sizeof(TAPETEMP) + 1);
+
+ if (btspool == (BTSpool *) NULL || fname == (char *) NULL) {
+ elog(WARN, "_bt_spoolinit: out of memory");
+ }
+ (void) memset((char *) btspool, 0, sizeof(BTSpool));
+ btspool->bts_ntapes = ntapes;
+ btspool->bts_tape = 0;
+
+ btspool->bts_itape =
+ (BTTapeBlock **) palloc(sizeof(BTTapeBlock *) * ntapes);
+ btspool->bts_otape =
+ (BTTapeBlock **) palloc(sizeof(BTTapeBlock *) * ntapes);
+ if (btspool->bts_itape == (BTTapeBlock **) NULL ||
+ btspool->bts_otape == (BTTapeBlock **) NULL) {
+ elog(WARN, "_bt_spoolinit: out of memory");
+ }
+
+ for (i = 0; i < ntapes; ++i) {
+ btspool->bts_itape[i] =
+ _bt_tapecreate(mktemp(strcpy(fname, TAPETEMP)));
+ btspool->bts_otape[i] =
+ _bt_tapecreate(mktemp(strcpy(fname, TAPETEMP)));
+ }
+ pfree((void *) fname);
+
+ _bt_isortcmpinit(index);
+
+ return((void *) btspool);
+}
+
+/*
+ * clean up a spool structure and its substructures.
+ */
+void
+_bt_spooldestroy(void *spool)
+{
+ BTSpool *btspool = (BTSpool *) spool;
+ int i;
+
+ for (i = 0; i < btspool->bts_ntapes; ++i) {
+ _bt_tapedestroy(btspool->bts_otape[i]);
+ _bt_tapedestroy(btspool->bts_itape[i]);
+ }
+ pfree((void *) btspool);
+}
+
+/*
+ * flush out any dirty output tape blocks
+ */
+static void
+_bt_spoolflush(BTSpool *btspool)
+{
+ int i;
+
+ for (i = 0; i < btspool->bts_ntapes; ++i) {
+ if (!EMPTYTAPE(btspool->bts_otape[i])) {
+ _bt_tapewrite(btspool->bts_otape[i], 1);
+ }
+ }
+}
+
+/*
+ * swap input tapes and output tapes by swapping their file
+ * descriptors. additional preparation for the next merge pass
+ * includes rewinding the new input tapes and clearing out the new
+ * output tapes.
+ */
+static void
+_bt_spoolswap(BTSpool *btspool)
+{
+ File tmpfd;
+ BTTapeBlock *itape;
+ BTTapeBlock *otape;
+ int i;
+
+ for (i = 0; i < btspool->bts_ntapes; ++i) {
+ itape = btspool->bts_itape[i];
+ otape = btspool->bts_otape[i];
+
+ /*
+ * swap the input and output VFDs.
+ */
+ tmpfd = itape->bttb_fd;
+ itape->bttb_fd = otape->bttb_fd;
+ otape->bttb_fd = tmpfd;
+
+ /*
+ * rewind the new input tape.
+ */
+ _bt_taperewind(itape);
+ _bt_tapereset(itape);
+
+ /*
+ * clear the new output tape -- it's ok to throw away the old
+ * inputs.
+ */
+ _bt_tapeclear(otape);
+ }
+}
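+
+/*
+ * skeleton of a single merge pass (illustrative only; the real driver
+ * lives in the leaf-build code): swap the tape sets, then repeatedly
+ * merge one run from each input tape onto the current output tape,
+ * switching output tapes round-robin between runs. "merge_one_run"
+ * is a hypothetical stand-in for the priority-queue merge loop
+ * sketched earlier; it returns 0 once the input tapes are exhausted.
+ */
+#ifdef BTREE_USAGE_EXAMPLE
+extern int merge_one_run(BTSpool *btspool); /* hypothetical */
+
+static int
+bt_mergepass_example(BTSpool *btspool)
+{
+ int nruns = 0;
+
+ _bt_spoolswap(btspool); /* old outputs become the new inputs */
+ while (merge_one_run(btspool)) {
+ /* close out this run and advance to the next output tape */
+ _bt_tapewrite(btspool->bts_otape[btspool->bts_tape], 1);
+ btspool->bts_tape = (btspool->bts_tape + 1) % btspool->bts_ntapes;
+ ++nruns;
+ }
+ /* once nruns < bts_ntapes, we are ready to build the leaf pages */
+ return (nruns);
+}
+#endif /* BTREE_USAGE_EXAMPLE */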
+
+/*-------------------------------------------------------------------------
+ * sorting routines
+ *-------------------------------------------------------------------------
+ */
+
+/*
+ * spool 'btitem' into an initial run. as tape blocks are filled, the
+ * block BTItems are qsorted and written into some output tape (it
+ * doesn't matter which; we go round-robin for simplicity). the
+ * initial runs are therefore always just one block.
+ */
+void
+_bt_spool(Relation index, BTItem btitem, void *spool)
+{
+ BTSpool *btspool = (BTSpool *) spool;
+ BTTapeBlock *itape;
+ Size itemsz;
+
+ itape = btspool->bts_itape[btspool->bts_tape];
+ itemsz = BTITEMSZ(btitem);
+ itemsz = DOUBLEALIGN(itemsz);
+
+ /*
+ * if this buffer doesn't have room for this BTItemData, or if we
+ * have run out of BTItems (btitem is NULL), we need to sort the
+ * buffer and write it out. in the former case, the BTItemData
+ * will then go into the next tape's buffer.
+ */
+ if (btitem == (BTItem) NULL || SPCLEFT(itape) < itemsz) {
+ BTItem *parray;
+ BTTapeBlock *otape;
+ BTItem bti;
+ char *pos;
+ int btisz;
+ int i;
+
+ /*
+ * build an array of pointers to the BTItemDatas on the input
+ * block.
+ */
+ parray = (BTItem *) palloc(itape->bttb_ntup * sizeof(BTItem));
+ if (parray == (BTItem *) NULL) {
+ elog(WARN, "_bt_spool: out of memory");
+ }
+ pos = itape->bttb_data;
+ for (i = 0; i < itape->bttb_ntup; ++i) {
+ parray[i] = _bt_tapenext(itape, &pos);
+ }
+
+ /*
+ * qsort the pointer array.
+ */
+ _bt_isortcmpinit(index);
+ qsort((void *) parray, itape->bttb_ntup, sizeof(BTItem), _bt_isortcmp);
+
+ /*
+ * write the spooled run into the output tape. we copy the
+ * BTItemDatas in the order dictated by the sorted array of
+ * BTItems, not the original order.
+ *
+ * (since everything was DOUBLEALIGN'd and is all on a single
+ * page, everything had *better* still fit on one page..)
+ */
+ otape = btspool->bts_otape[btspool->bts_tape];
+ for (i = 0; i < itape->bttb_ntup; ++i) {
+ bti = parray[i];
+ btisz = BTITEMSZ(bti);
+ btisz = DOUBLEALIGN(btisz);
+ _bt_tapeadd(otape, bti, btisz);
+#ifdef FASTBUILD_DEBUG
+ {
+ bool isnull;
+ Datum d = index_getattr(&(bti->bti_itup), 1,
+ RelationGetTupleDescriptor(index),
+ &isnull);
+ printf("_bt_spool: inserted <%x> into output tape %d\n",
+ d, btspool->bts_tape);
+ }
+#endif /* FASTBUILD_DEBUG */
+ }
+
+ /*
+ * the initial runs are always single tape blocks. flush the
+ * output block, marking End-Of-Run.
+ */
+ _bt_tapewrite(otape, 1);
+
+ /*
+ * reset the input buffer for the next run. we don't have to
+ * write it out or anything -- we only use it to hold the
+ * unsorted BTItemDatas; the output tape contains all the
+ * sorted stuff.
+ *
+ * changing bts_tape changes both the output tape and the input
+ * tape; we update itape for the code below.
+ */
+ _bt_tapereset(itape);
+ btspool->bts_tape = (btspool->bts_tape + 1) % btspool->bts_ntapes;
+ itape = btspool->bts_itape[btspool->bts_tape];
+
+ /*
+ * destroy the pointer array.
+ */
+ pfree((void *) parray);
+ }
+
+ /* insert this item into the current buffer */
+ if (btitem != (BTItem) NULL) {
+ _bt_tapeadd(itape, btitem, itemsz);
+ }
+}
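+
+/*
+ * sketch (not in the original source): _bt_isortcmp, defined earlier
+ * in this file, must be a qsort-style comparator over the BTItem
+ * pointer array built above. assuming the relation stashed by
+ * _bt_isortcmpinit is available, a minimal single-attribute version
+ * would look like (the hypothetical _bt_sortrel stands in for that
+ * static state):
+ *
+ *	static int
+ *	isortcmp_sketch(BTItem *bti1, BTItem *bti2)
+ *	{
+ *		if (_bt_itemcmp(_bt_sortrel, 1, *bti1, *bti2,
+ *				BTLessStrategyNumber))
+ *			return (-1);
+ *		if (_bt_itemcmp(_bt_sortrel, 1, *bti2, *bti1,
+ *				BTLessStrategyNumber))
+ *			return (1);
+ *		return (0);
+ *	}
+ */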
+
+/*
+ * allocate a new, clean btree page, not linked to any siblings.
+ */
+static void
+_bt_blnewpage(Relation index, Buffer *buf, Page *page, int flags)
+{
+ BTPageOpaque opaque;
+
+ *buf = _bt_getbuf(index, P_NEW, BT_WRITE);
+ *page = BufferGetPage(*buf);
+ _bt_pageinit(*page, BufferGetPageSize(*buf));
+ opaque = (BTPageOpaque) PageGetSpecialPointer(*page);
+ opaque->btpo_prev = opaque->btpo_next = P_NONE;
+ opaque->btpo_flags = flags;
+}
+
+/*
+ * slide an array of ItemIds back one slot (from P_FIRSTKEY to
+ * P_HIKEY). we need to do this when we discover that we have built
+ * an ItemId array in what has turned out to be a P_RIGHTMOST page.
+ */
+static void
+_bt_slideleft(Relation index, Buffer buf, Page page)
+{
+ OffsetNumber off;
+ OffsetNumber maxoff;
+ ItemId previi;
+ ItemId thisii;
+
+ maxoff = PageGetMaxOffsetNumber(page);
+ previi = PageGetItemId(page, P_HIKEY);
+ for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off)) {
+ thisii = PageGetItemId(page, off);
+ *previi = *thisii;
+ previi = thisii;
+ }
+ ((PageHeader) page)->pd_lower -= sizeof(ItemIdData);
+}
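+
+/*
+ * illustration (not in the original source): with two data items,
+ * _bt_slideleft turns
+ *
+ *	linp: [P_HIKEY]->placeholder [P_FIRSTKEY]->item1 [next]->item2
+ * into
+ *	linp: [P_HIKEY]->item1 [P_FIRSTKEY]->item2
+ *
+ * reflecting the convention that a rightmost page stores no high key,
+ * so its data items start at P_HIKEY; pd_lower shrinks by one
+ * ItemIdData to drop the now-unused last slot.
+ */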
+
+typedef struct {
+ Buffer btps_buf;
+ Page btps_page;
+ BTItem btps_lastbti;
+ OffsetNumber btps_lastoff;
+ OffsetNumber btps_firstoff;
+} BTPageState;
+
+/*
+ * add an item to a disk page from a merge tape block.
+ *
+ * we must be careful to observe the following restrictions, placed
+ * upon us by the conventions in nbtsearch.c:
+ * - rightmost pages start data items at P_HIKEY instead of at
+ * P_FIRSTKEY.
+ * - duplicates cannot be split among pages unless the chain of
+ * duplicates starts at the first data item.
+ *
+ * a leaf page being built looks like:
+ *
+ * +----------------+---------------------------------+
+ * | PageHeaderData | linp0 linp1 linp2 ...           |
+ * +-----------+----+---------------------------------+
+ * | ... linpN |          ^ first                     |
+ * +-----------+--------------------------------------+
+ * |                      ^ last                      |
+ * |                                                  |
+ * |                v last                            |
+ * +-------------+------------------------------------+
+ * |             | itemN ...                          |
+ * +-------------+------------------+-----------------+
+ * |      ... item3 item2 item1     | "special space" |
+ * +--------------------------------+-----------------+
+ *                            ^ first
+ *
+ * contrast this with the diagram in bufpage.h; note the mismatch
+ * between linps and items. this is because we reserve linp0 as a
+ * placeholder for the pointer to the "high key" item; when we have
+ * filled up the page, we will set linp0 to point to itemN and clear
+ * linpN.
+ *
+ * 'last' pointers indicate the last offset/item added to the page.
+ * 'first' pointers indicate the first offset/item that is part of a
+ * chain of duplicates extending from 'first' to 'last'.
+ *
+ * if all keys are unique, 'first' will always be the same as 'last'.
+ */
+static void
+_bt_buildadd(Relation index, BTPageState *state, BTItem bti, int flags)
+{
+ Buffer nbuf;
+ Page npage;
+ BTItem last_bti;
+ OffsetNumber first_off;
+ OffsetNumber last_off;
+ OffsetNumber off;
+ Size pgspc;
+ Size btisz;
+
+ nbuf = state->btps_buf;
+ npage = state->btps_page;
+ first_off = state->btps_firstoff;
+ last_off = state->btps_lastoff;
+ last_bti = state->btps_lastbti;
+
+ pgspc = PageGetFreeSpace(npage);
+ btisz = BTITEMSZ(bti);
+ btisz = DOUBLEALIGN(btisz);
+ if (pgspc < btisz) {
+ Buffer obuf = nbuf;
+ Page opage = npage;
+ OffsetNumber o, n;
+ ItemId ii;
+ ItemId hii;
+
+ _bt_blnewpage(index, &nbuf, &npage, flags);
+
+ /*
+ * if 'last' is part of a chain of duplicates that does not
+ * start at the beginning of the old page, the entire chain is
+ * copied to the new page; we delete all of the duplicates
+ * from the old page except the first, which becomes the high
+ * key item of the old page.
+ *
+ * if the chain starts at the beginning of the page or there
+ * is no chain ('first' == 'last'), we need only copy 'last'
+ * to the new page. again, 'first' (== 'last') becomes the
+ * high key of the old page.
+ *
+ * note that in either case, we copy at least one item to the
+ * new page, so 'last_bti' will always be valid. 'bti' will
+ * never be the first data item on the new page.
+ */
+ if (first_off == P_FIRSTKEY) {
+ Assert(last_off != P_FIRSTKEY);
+ first_off = last_off;
+ }
+ for (o = first_off, n = P_FIRSTKEY;
+ o <= last_off;
+ o = OffsetNumberNext(o), n = OffsetNumberNext(n)) {
+ ii = PageGetItemId(opage, o);
+ (void) PageAddItem(npage, PageGetItem(opage, ii),
+ ii->lp_len, n, LP_USED);
+#ifdef FASTBUILD_DEBUG
+ {
+ bool isnull;
+ BTItem tmpbti =
+ (BTItem) PageGetItem(npage, PageGetItemId(npage, n));
+ Datum d = index_getattr(&(tmpbti->bti_itup), 1,
+ RelationGetTupleDescriptor(index),
+ &isnull);
+ printf("_bt_buildadd: moved <%x> to offset %d\n",
+ d, n);
+ }
+#endif /* FASTBUILD_DEBUG */
+ }
+ for (o = last_off; o > first_off; o = OffsetNumberPrev(o)) {
+ PageIndexTupleDelete(opage, o);
+ }
+ hii = PageGetItemId(opage, P_HIKEY);
+ ii = PageGetItemId(opage, first_off);
+ *hii = *ii;
+ ii->lp_flags &= ~LP_USED;
+ ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);
+
+ first_off = P_FIRSTKEY;
+ last_off = PageGetMaxOffsetNumber(npage);
+ last_bti = (BTItem) PageGetItem(npage, PageGetItemId(npage, last_off));
+
+ /*
+ * set the page (side link) pointers.
+ */
+ {
+ BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage);
+ BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(npage);
+
+ oopaque->btpo_next = BufferGetBlockNumber(nbuf);
+ nopaque->btpo_prev = BufferGetBlockNumber(obuf);
+ nopaque->btpo_next = P_NONE;
+ }
+
+ /*
+ * write out the old stuff. we never want to see it again, so
+ * we can give up our lock (if we had one; BuildingBtree is
+ * set, so we aren't locking).
+ */
+ _bt_wrtbuf(index, obuf);
+ }
+
+ /*
+ * if this item is different from the last item added, we start a
+ * new chain of duplicates.
+ */
+ off = OffsetNumberNext(last_off);
+ (void) PageAddItem(npage, (Item) bti, btisz, off, LP_USED);
+#ifdef FASTBUILD_DEBUG
+ {
+ bool isnull;
+ Datum d = index_getattr(&(bti->bti_itup), 1,
+ RelationGetTupleDescriptor(index),
+ &isnull);
+ printf("_bt_buildadd: inserted <%x> at offset %d\n",
+ d, off);
+ }
+#endif /* FASTBUILD_DEBUG */
+ if (last_bti == (BTItem) NULL) {
+ first_off = P_FIRSTKEY;
+ } else if (!_bt_itemcmp(index, 1, bti, last_bti, BTEqualStrategyNumber)) {
+ first_off = off;
+ }
+ last_off = off;
+ last_bti = (BTItem) PageGetItem(npage, PageGetItemId(npage, off));
+
+ state->btps_buf = nbuf;
+ state->btps_page = npage;
+ state->btps_lastbti = last_bti;
+ state->btps_lastoff = last_off;
+ state->btps_firstoff = first_off;
+}
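+
+/*
+ * example (not in the original source): suppose two equal keys A A
+ * are the most recently added items when the page fills. first_off
+ * points at the first A, so both A's are copied to the new page, the
+ * second A is deleted from the old page, and the first A's line
+ * pointer becomes the old page's high key. the duplicate chain thus
+ * lives entirely on the new page, satisfying the nbtsearch.c rule
+ * cited above.
+ */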
+
+/*
+ * take the input tapes stored by 'btspool' and perform successive
+ * merging passes until at most one run is left in each tape. at that
+ * point, merge the final tape runs into a set of btree leaves.
+ *
+ * XXX three nested loops? gross. cut me up into smaller routines.
+ */
+static BlockNumber
+_bt_merge(Relation index, BTSpool *btspool)
+{
+ BTPageState state;
+ BlockNumber firstblk;
+ BTPriQueue q;
+ BTPriQueueElem e;
+ BTItem bti;
+ BTTapeBlock *itape;
+ BTTapeBlock *otape;
+ char *tapepos[MAXTAPES];
+ int tapedone[MAXTAPES];
+ int t;
+ int goodtapes;
+ int nruns;
+ Size btisz;
+ bool doleaf = false;
+
+ /*
+ * initialize state needed for the merge into the btree leaf pages.
+ */
+ (void) memset((char *) &state, 0, sizeof(BTPageState));
+ _bt_blnewpage(index, &(state.btps_buf), &(state.btps_page), BTP_LEAF);
+ state.btps_lastoff = P_HIKEY;
+ state.btps_lastbti = (BTItem) NULL;
+ firstblk = BufferGetBlockNumber(state.btps_buf);
+
+ do { /* pass */
+ /*
+ * each pass starts by flushing the previous outputs and
+ * swapping inputs and outputs. this process also clears the
+ * new output tapes and rewinds the new input tapes.
+ */
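+ /* start at the last tape: the first run's round-robin bump picks tape 0 */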
+ btspool->bts_tape = btspool->bts_ntapes - 1;
+ _bt_spoolflush(btspool);
+ _bt_spoolswap(btspool);
+
+ nruns = 0;
+
+ for (;;) { /* run */
+ /*
+ * each run starts by selecting a new output tape. the
+ * merged results of a given run are always sent to this
+ * one tape.
+ */
+ btspool->bts_tape = (btspool->bts_tape + 1) % btspool->bts_ntapes;
+ otape = btspool->bts_otape[btspool->bts_tape];
+
+ /*
+ * initialize the priority queue by loading it with the
+ * first element of the given run in each tape. since we
+ * are starting a new run, we reset the tape (clearing the
+ * End-Of-Run marker) before reading it. this means that
+ * _bt_taperead will return 0 only if the tape is actually
+ * at EOF.
+ */
+ (void) memset((char *) &q, 0, sizeof(BTPriQueue));
+ goodtapes = 0;
+ for (t = 0; t < btspool->bts_ntapes; ++t) {
+ itape = btspool->bts_itape[t];
+ tapepos[t] = itape->bttb_data;
+ _bt_tapereset(itape);
+ if (_bt_taperead(itape) == 0) {
+ tapedone[t] = 1;
+ } else {
+ ++goodtapes;
+ tapedone[t] = 0;
+ e.btpqe_tape = t;
+ e.btpqe_item = _bt_tapenext(itape, &tapepos[t]);
+ if (e.btpqe_item != (BTItem) NULL) {
+ _bt_pqadd(&q, &e);
+ }
+ }
+ }
+ /*
+ * if we don't have any tapes with any input (i.e., they
+ * are all at EOF), we must be done with this pass.
+ */
+ if (goodtapes == 0) {
+ break; /* for */
+ }
+ ++nruns;
+
+ /*
+ * output the smallest element from the queue until there are no
+ * more.
+ */
+ while (_bt_pqnext(&q, &e) >= 0) { /* item */
+ /*
+ * replace the element taken from priority queue,
+ * fetching a new block if needed. a tape can run out
+ * if it hits either End-Of-Run or EOF.
+ */
+ t = e.btpqe_tape;
+ bti = e.btpqe_item;
+ if (bti != (BTItem) NULL) {
+ btisz = BTITEMSZ(bti);
+ btisz = DOUBLEALIGN(btisz);
+ if (doleaf) {
+ _bt_buildadd(index, &state, bti, BTP_LEAF);
+#ifdef FASTBUILD_DEBUG
+ {
+ bool isnull;
+ Datum d = index_getattr(&(bti->bti_itup), 1,
+ RelationGetTupleDescriptor(index),
+ &isnull);
+ printf("_bt_merge: inserted <%x> into block %d\n",
+ d, BufferGetBlockNumber(state.btps_buf));
+ }
+#endif /* FASTBUILD_DEBUG */
+ } else {
+ if (SPCLEFT(otape) < btisz) {
+ /*
+ * if it's full, write it out and add the
+ * item to the next block. (since we know
+ * there will be at least one more block,
+ * we know we do *not* want to set
+ * End-Of-Run here!)
+ */
+ _bt_tapewrite(otape, 0);
+ }
+ _bt_tapeadd(otape, bti, btisz);
+#ifdef FASTBUILD_DEBUG
+ {
+ bool isnull;
+ Datum d = index_getattr(&(bti->bti_itup), 1,
+ RelationGetTupleDescriptor(index), &isnull);
+ printf("_bt_merge: inserted <%x> into tape %d\n",
+ d, btspool->bts_tape);
+ }
+#endif /* FASTBUILD_DEBUG */
+ }
+ }
+#ifdef FASTBUILD_DEBUG
+ {
+ bool isnull;
+ Datum d = index_getattr(&(bti->bti_itup), 1,
+ RelationGetTupleDescriptor(index),
+ &isnull);
+ printf("_bt_merge: got <%x> from tape %d\n", d, t);
+ }
+#endif /* FASTBUILD_DEBUG */
+
+ itape = btspool->bts_itape[t];
+ if (!tapedone[t]) {
+ BTItem newbti = _bt_tapenext(itape, &tapepos[t]);
+
+ if (newbti == (BTItem) NULL) {
+ if (_bt_taperead(itape) == 0) {
+ tapedone[t] = 1;
+ } else {
+ tapepos[t] = itape->bttb_data;
+ newbti = _bt_tapenext(itape, &tapepos[t]);
+ }
+ }
+ if (newbti != (BTItem) NULL) {
+ BTPriQueueElem nexte;
+
+ nexte.btpqe_tape = t;
+ nexte.btpqe_item = newbti;
+ _bt_pqadd(&q, &nexte);
+ }
+ }
+ } /* item */
+ } /* run */
+
+ /*
+ * we are here because we ran out of input on all of the input
+ * tapes.
+ *
+ * if this pass did not generate more actual output runs than
+ * we have tapes, we know we have at most one run in each
+ * tape. this means that we are ready to merge into the final
+ * btree leaf pages instead of merging into a tape file.
+ */
+ if (nruns <= btspool->bts_ntapes) {
+ doleaf = true;
+ }
+ } while (nruns > 0); /* pass */
+
+ /*
+ * this is the rightmost page, so the ItemId array needs to be
+ * slid back one slot.
+ */
+ _bt_slideleft(index, state.btps_buf, state.btps_page);
+ _bt_wrtbuf(index, state.btps_buf);
+
+ return(firstblk);
+}
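+
+/*
+ * arithmetic note (not in the original source): with T tapes, a pass
+ * merges T runs at a time, cutting the run count from R to about
+ * ceil(R/T). e.g., for a hypothetical T = 7 and R = 100 one-block
+ * initial runs: 100 -> 15 -> 3. once a pass leaves nruns <= ntapes,
+ * doleaf is set and the next pass merges through _bt_buildadd into
+ * leaf pages instead of writing another tape file.
+ */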
+
+
+/*
+ * given the block number 'blk' of the first page of a set of linked
+ * siblings (i.e., the start of an entire level of the btree),
+ * construct the corresponding next level of the btree. we do this by
+ * inserting the minimum key from each page on this level into a new
+ * set of pages at the next level up. the format of the internal
+ * pages is otherwise the same as for leaf pages.
+ */
+void
+_bt_upperbuild(Relation index, BlockNumber blk, int level)
+{
+ Buffer rbuf;
+ Page rpage;
+ BTPageOpaque ropaque;
+ BTPageState state;
+ BlockNumber firstblk;
+ BTItem bti;
+ BTItem nbti;
+ OffsetNumber off;
+
+ rbuf = _bt_getbuf(index, blk, BT_WRITE);
+ rpage = BufferGetPage(rbuf);
+ ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage);
+
+ /*
+ * if we only have one page on a level, we can just make it the
+ * root.
+ */
+ if (P_RIGHTMOST(ropaque)) {
+ ropaque->btpo_flags |= BTP_ROOT;
+ _bt_wrtbuf(index, rbuf);
+ _bt_metaproot(index, blk);
+ return;
+ }
+ _bt_relbuf(index, rbuf, BT_WRITE);
+
+ (void) memset((char *) &state, 0, sizeof(BTPageState));
+ _bt_blnewpage(index, &(state.btps_buf), &(state.btps_page), 0);
+ state.btps_lastoff = P_HIKEY;
+ state.btps_lastbti = (BTItem) NULL;
+ firstblk = BufferGetBlockNumber(state.btps_buf);
+
+ /* for each page... */
+ do {
+ rbuf = _bt_getbuf(index, blk, BT_READ);
+ rpage = BufferGetPage(rbuf);
+ ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage);
+
+ /* for each item... */
+ if (!PageIsEmpty(rpage)) {
+ /*
+ * form a new index tuple corresponding to the minimum key
+ * of the lower page and insert it into a page at this
+ * level.
+ */
+ off = P_RIGHTMOST(ropaque) ? P_HIKEY : P_FIRSTKEY;
+ bti = (BTItem) PageGetItem(rpage, PageGetItemId(rpage, off));
+ nbti = _bt_formitem(&(bti->bti_itup));
+ ItemPointerSet(&(nbti->bti_itup.t_tid), blk, P_HIKEY);
+#ifdef FASTBUILD_DEBUG
+ {
+ bool isnull;
+ Datum d = index_getattr(&(nbti->bti_itup), 1,
+ RelationGetTupleDescriptor(index),
+ &isnull);
+ printf("_bt_upperbuild: inserting <%x> at %d\n",
+ d, level);
+ }
+#endif /* FASTBUILD_DEBUG */
+ _bt_buildadd(index, &state, nbti, 0);
+ pfree((void *) nbti);
+ }
+ blk = ropaque->btpo_next;
+ _bt_relbuf(index, rbuf, BT_READ);
+ } while (blk != P_NONE);
+
+ /*
+ * this is the rightmost page, so the ItemId array needs to be
+ * slid back one slot.
+ */
+ _bt_slideleft(index, state.btps_buf, state.btps_page);
+ _bt_wrtbuf(index, state.btps_buf);
+
+ _bt_upperbuild(index, firstblk, level + 1);
+}
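+
+/*
+ * arithmetic note (not in the original source): each recursive call
+ * shrinks the page count by roughly the internal-page fanout f, so
+ * the recursion is about log_f(nleaves) deep. e.g., f = 100 over
+ * 10^6 leaf pages gives 10^6 -> 10^4 -> 10^2 -> 1; the single-page
+ * level then trips the P_RIGHTMOST test above and becomes the root.
+ */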
+
+/*
+ * given a spool loading by successive calls to _bt_spool, create an
+ * entire btree.
+ */
+void
+_bt_leafbuild(Relation index, void *spool)
+{
+ BTSpool *btspool = (BTSpool *) spool;
+ BlockNumber firstblk;
+
+ /*
+ * merge the runs into btree leaf pages.
+ */
+ firstblk = _bt_merge(index, btspool);
+
+ /*
+ * build the upper levels of the btree.
+ */
+ _bt_upperbuild(index, firstblk, 0);
+}
+
+#else /* !FASTBUILD */
+
+void *_bt_spoolinit(Relation index, int ntapes) { return((void *) NULL); }
+void _bt_spooldestroy(void *spool) { }
+void _bt_spool(Relation index, BTItem btitem, void *spool) { }
+void _bt_upperbuild(Relation index, BlockNumber blk, int level) { }
+void _bt_leafbuild(Relation index, void *spool) { }
+
+#endif /* !FASTBUILD */
diff --git a/src/backend/access/nbtree/nbtstrat.c b/src/backend/access/nbtree/nbtstrat.c
new file mode 100644
index 0000000000..2214c60950
--- /dev/null
+++ b/src/backend/access/nbtree/nbtstrat.c
@@ -0,0 +1,134 @@
+/*-------------------------------------------------------------------------
+ *
+ * nbtstrat.c--
+ * Strategy map entries for the btree indexed access method
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/Attic/nbtstrat.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/bufpage.h"
+
+#include "utils/elog.h"
+#include "utils/rel.h"
+#include "utils/excid.h"
+
+#include "access/genam.h"
+#include "access/nbtree.h"
+
+/*
+ * Note:
+ * StrategyNegate, StrategyCommute, and StrategyNegateCommute
+ * assume <, <=, ==, >=, > ordering.
+ */
+static StrategyNumber BTNegate[5] = {
+ BTGreaterEqualStrategyNumber,
+ BTGreaterStrategyNumber,
+ InvalidStrategy,
+ BTLessStrategyNumber,
+ BTLessEqualStrategyNumber
+};
+
+static StrategyNumber BTCommute[5] = {
+ BTGreaterStrategyNumber,
+ BTGreaterEqualStrategyNumber,
+ InvalidStrategy,
+ BTLessEqualStrategyNumber,
+ BTLessStrategyNumber
+};
+
+static StrategyNumber BTNegateCommute[5] = {
+ BTLessEqualStrategyNumber,
+ BTLessStrategyNumber,
+ InvalidStrategy,
+ BTGreaterStrategyNumber,
+ BTGreaterEqualStrategyNumber
+};
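+
+/*
+ * examples (not in the original source), indexing by
+ * (strategy number - 1) per the <, <=, ==, >=, > ordering above:
+ *
+ *	BTNegate[BTLessStrategyNumber - 1] == BTGreaterEqualStrategyNumber
+ *		NOT (a < b)  is  a >= b
+ *	BTCommute[BTLessStrategyNumber - 1] == BTGreaterStrategyNumber
+ *		a < b  commutes to  b > a
+ *	BTNegateCommute[BTLessStrategyNumber - 1] == BTLessEqualStrategyNumber
+ *		NOT (a < b)  commuted  is  b <= a
+ */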
+
+static uint16 BTLessTermData[] = { /* XXX type clash */
+ 2,
+ BTLessStrategyNumber,
+ SK_NEGATE,
+ BTLessStrategyNumber,
+ SK_NEGATE | SK_COMMUTE
+};
+
+static uint16 BTLessEqualTermData[] = { /* XXX type clash */
+ 2,
+ BTLessEqualStrategyNumber,
+ 0x0,
+ BTLessEqualStrategyNumber,
+ SK_COMMUTE
+};
+
+static uint16 BTGreaterEqualTermData[] = { /* XXX type clash */
+ 2,
+ BTGreaterEqualStrategyNumber,
+ 0x0,
+ BTGreaterEqualStrategyNumber,
+ SK_COMMUTE
+};
+
+static uint16 BTGreaterTermData[] = { /* XXX type clash */
+ 2,
+ BTGreaterStrategyNumber,
+ SK_NEGATE,
+ BTGreaterStrategyNumber,
+ SK_NEGATE | SK_COMMUTE
+};
+
+static StrategyTerm BTEqualExpressionData[] = {
+ (StrategyTerm)BTLessTermData, /* XXX */
+ (StrategyTerm)BTLessEqualTermData, /* XXX */
+ (StrategyTerm)BTGreaterEqualTermData, /* XXX */
+ (StrategyTerm)BTGreaterTermData, /* XXX */
+ NULL
+};
+
+static StrategyEvaluationData BTEvaluationData = {
+ /* XXX static for simplicity */
+
+ BTMaxStrategyNumber,
+ (StrategyTransformMap)BTNegate, /* XXX */
+ (StrategyTransformMap)BTCommute, /* XXX */
+ (StrategyTransformMap)BTNegateCommute, /* XXX */
+
+ { NULL, NULL, (StrategyExpression)BTEqualExpressionData, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL }
+};
+
+/* ----------------------------------------------------------------
+ * _bt_getstrat
+ * ----------------------------------------------------------------
+ */
+
+StrategyNumber
+_bt_getstrat(Relation rel,
+ AttrNumber attno,
+ RegProcedure proc)
+{
+ StrategyNumber strat;
+
+ strat = RelationGetStrategy(rel, attno, &BTEvaluationData, proc);
+
+ Assert(StrategyNumberIsValid(strat));
+
+ return (strat);
+}
+
+bool
+_bt_invokestrat(Relation rel,
+ AttrNumber attno,
+ StrategyNumber strat,
+ Datum left,
+ Datum right)
+{
+ return (RelationInvokeStrategy(rel, &BTEvaluationData, attno, strat,
+ left, right));
+}
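+
+/*
+ * usage sketch (not in the original source): a hypothetical test of
+ * whether datum d1 sorts strictly before datum d2 on attribute 1:
+ *
+ *	if (_bt_invokestrat(rel, 1, BTLessStrategyNumber, d1, d2))
+ *		... d1 precedes d2 in the index ordering ...
+ */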
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
new file mode 100644
index 0000000000..695a2b637c
--- /dev/null
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -0,0 +1,239 @@
+/*-------------------------------------------------------------------------
+ *
+ * nbtutils.c--
+ * Utility code for the Postgres btree implementation.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtutils.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include <stdio.h>
+#include "postgres.h"
+
+#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
+
+#include "fmgr.h"
+#include "utils/elog.h"
+#include "utils/palloc.h"
+#include "utils/rel.h"
+#include "utils/excid.h"
+#include "utils/datum.h"
+
+#include "access/heapam.h"
+#include "access/genam.h"
+#include "access/iqual.h"
+#include "access/nbtree.h"
+
+ScanKey
+_bt_mkscankey(Relation rel, IndexTuple itup)
+{
+ ScanKey skey;
+ TupleDesc itupdesc;
+ int natts;
+ int i;
+ Datum arg;
+ RegProcedure proc;
+ bool null;
+
+ natts = rel->rd_rel->relnatts;
+ itupdesc = RelationGetTupleDescriptor(rel);
+
+ skey = (ScanKey) palloc(natts * sizeof(ScanKeyData));
+
+ for (i = 0; i < natts; i++) {
+ arg = index_getattr(itup, i + 1, itupdesc, &null);
+ proc = index_getprocid(rel, i + 1, BTORDER_PROC);
+ ScanKeyEntryInitialize(&skey[i],
+ 0x0, (AttrNumber) (i + 1), proc, arg);
+ }
+
+ return (skey);
+}
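+
+/*
+ * usage sketch (not in the original source): the typical pairing is
+ * to build an insertion key for a descent from the root and free it
+ * afterwards; the descent call is only illustrative:
+ *
+ *	ScanKey skey = _bt_mkscankey(rel, itup);
+ *	... descend, e.g. stack = _bt_search(rel, natts, skey, &buf) ...
+ *	_bt_freeskey(skey);
+ */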
+
+void
+_bt_freeskey(ScanKey skey)
+{
+ pfree(skey);
+}
+
+void
+_bt_freestack(BTStack stack)
+{
+ BTStack ostack;
+
+ while (stack != (BTStack) NULL) {
+ ostack = stack;
+ stack = stack->bts_parent;
+ pfree(ostack->bts_btitem);
+ pfree(ostack);
+ }
+}
+
+/*
+ * _bt_orderkeys() -- Put keys in a sensible order for conjunctive quals.
+ *
+ * Reorder the keys so that they match the ordering imposed by
+ * the index. This routine only needs to be called if more than
+ * one qual clause uses this index.
+ */
+void
+_bt_orderkeys(Relation relation, uint16 *numberOfKeys, ScanKey key)
+{
+ ScanKey xform;
+ ScanKeyData *cur;
+ StrategyMap map;
+ int nbytes;
+ long test;
+ int i, j;
+ int init[BTMaxStrategyNumber+1];
+
+ /* haven't looked at any strategies yet */
+ for (i = 0; i <= BTMaxStrategyNumber; i++)
+ init[i] = 0;
+
+ /* get space for the modified array of keys */
+ nbytes = BTMaxStrategyNumber * sizeof(ScanKeyData);
+ xform = (ScanKey) palloc(nbytes);
+ memset(xform, 0, nbytes);
+
+
+ /* get the strategy map for this index/attribute pair */
+ /*
+ * XXX
+ * When we support multiple keys in a single index, this is what
+ * we'll want to do. At present, the planner is hosed, so we
+ * hard-wire the attribute number below. Postgres only does single-
+ * key indices...
+ * map = IndexStrategyGetStrategyMap(RelationGetIndexStrategy(relation),
+ * BTMaxStrategyNumber,
+ * key->data[0].attributeNumber);
+ */
+ map = IndexStrategyGetStrategyMap(RelationGetIndexStrategy(relation),
+ BTMaxStrategyNumber,
+ 1 /* XXX */ );
+
+ /* check each key passed in */
+ for (i = *numberOfKeys; --i >= 0; ) {
+ cur = &key[i];
+ for (j = BTMaxStrategyNumber; --j >= 0; ) {
+ if (cur->sk_procedure == map->entry[j].sk_procedure)
+ break;
+ }
+
+ /* key's operator not in the strategy map? don't index xform[-1] */
+ if (j < 0)
+ elog(WARN, "_bt_orderkeys: unknown strategy procedure %d",
+ cur->sk_procedure);
+
+ /* have we seen one of these before? */
+ if (init[j]) {
+ /* yup, use the appropriate value */
+ test =
+ (long) FMGR_PTR2(cur->sk_func, cur->sk_procedure,
+ cur->sk_argument, xform[j].sk_argument);
+ if (test)
+ xform[j].sk_argument = cur->sk_argument;
+ } else {
+ /* nope, use this value */
+ memmove(&xform[j], cur, sizeof(*cur));
+
+ init[j] = 1;
+ }
+ }
+
+ /* if = has been specified, no other key will be used */
+ if (init[BTEqualStrategyNumber - 1]) {
+ init[BTLessStrategyNumber - 1] = 0;
+ init[BTLessEqualStrategyNumber - 1] = 0;
+ init[BTGreaterEqualStrategyNumber - 1] = 0;
+ init[BTGreaterStrategyNumber - 1] = 0;
+ }
+
+ /* only one of <, <= */
+ if (init[BTLessStrategyNumber - 1]
+ && init[BTLessEqualStrategyNumber - 1]) {
+
+ ScanKeyData *lt, *le;
+
+ lt = &xform[BTLessStrategyNumber - 1];
+ le = &xform[BTLessEqualStrategyNumber - 1];
+
+ /*
+ * DO NOT use the cached function stuff here -- this is key
+ * ordering, happens only when the user expresses a hokey
+ * qualification, and gets executed only once, anyway. The
+ * transform maps are hard-coded, and can't be initialized
+ * in the correct way.
+ */
+
+ /* if "< lt_arg" implies "<= le_arg", the <= key is redundant */
+ test = (long) fmgr(le->sk_procedure, lt->sk_argument, le->sk_argument);
+
+ if (test)
+ init[BTLessEqualStrategyNumber - 1] = 0;
+ else
+ init[BTLessStrategyNumber - 1] = 0;
+ }
+
+ /* only one of >, >= */
+ if (init[BTGreaterStrategyNumber - 1]
+ && init[BTGreaterEqualStrategyNumber - 1]) {
+
+ ScanKeyData *gt, *ge;
+
+ gt = &xform[BTGreaterStrategyNumber - 1];
+ ge = &xform[BTGreaterEqualStrategyNumber - 1];
+
+ /* see note above on function cache */
+
+ /* if "> gt_arg" implies ">= ge_arg", the >= key is redundant */
+ test = (long) fmgr(ge->sk_procedure, gt->sk_argument, ge->sk_argument);
+
+ if (test)
+ init[BTGreaterEqualStrategyNumber - 1] = 0;
+ else
+ init[BTGreaterStrategyNumber - 1] = 0;
+ }
+
+ /* okay, reorder and count */
+ j = 0;
+
+ for (i = BTMaxStrategyNumber; --i >= 0; )
+ if (init[i])
+ key[j++] = xform[i];
+
+ *numberOfKeys = j;
+
+ pfree(xform);
+}
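+
+/*
+ * worked example (not in the original source): given the qual
+ * "x > 3 AND x >= 5 AND x < 10", the loop above keeps one key per
+ * strategy. the >/>= elimination then evaluates 3 >= 5 (false), so
+ * the > key is discarded, and the caller gets back "x >= 5" followed
+ * by "x < 10", with *numberOfKeys set to 2.
+ */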
+
+bool
+_bt_checkqual(IndexScanDesc scan, IndexTuple itup)
+{
+ if (scan->numberOfKeys > 0)
+ return (index_keytest(itup, RelationGetTupleDescriptor(scan->relation),
+ scan->numberOfKeys, scan->keyData));
+ else
+ return (true);
+}
+
+BTItem
+_bt_formitem(IndexTuple itup)
+{
+ int nbytes_btitem;
+ BTItem btitem;
+ Size tuplen;
+ extern Oid newoid();
+
+ /* disallow nulls in btree keys */
+ if (itup->t_info & INDEX_NULL_MASK)
+ elog(WARN, "btree indices cannot include null keys");
+
+ /* make a copy of the index tuple with room for the sequence number */
+ tuplen = IndexTupleSize(itup);
+ nbytes_btitem = tuplen +
+ (sizeof(BTItemData) - sizeof(IndexTupleData));
+
+ btitem = (BTItem) palloc(nbytes_btitem);
+ memmove((char *) &(btitem->bti_itup), (char *) itup, tuplen);
+
+ btitem->bti_oid = newoid();
+ return (btitem);
+}