Diffstat (limited to 'src/backend/access/nbtree')
-rw-r--r--  src/backend/access/nbtree/Makefile.inc |   15
-rw-r--r--  src/backend/access/nbtree/README       |   68
-rw-r--r--  src/backend/access/nbtree/nbtcompare.c |  173
-rw-r--r--  src/backend/access/nbtree/nbtinsert.c  |  831
-rw-r--r--  src/backend/access/nbtree/nbtpage.c    |  523
-rw-r--r--  src/backend/access/nbtree/nbtree.c     |  516
-rw-r--r--  src/backend/access/nbtree/nbtscan.c    |  164
-rw-r--r--  src/backend/access/nbtree/nbtsearch.c  | 1133
-rw-r--r--  src/backend/access/nbtree/nbtsort.c    | 1196
-rw-r--r--  src/backend/access/nbtree/nbtstrat.c   |  134
-rw-r--r--  src/backend/access/nbtree/nbtutils.c   |  239
11 files changed, 4992 insertions(+), 0 deletions(-)
diff --git a/src/backend/access/nbtree/Makefile.inc b/src/backend/access/nbtree/Makefile.inc new file mode 100644 index 0000000000..50854008c0 --- /dev/null +++ b/src/backend/access/nbtree/Makefile.inc @@ -0,0 +1,15 @@ +#------------------------------------------------------------------------- +# +# Makefile.inc-- +# Makefile for access/nbtree (btree acess methods) +# +# Copyright (c) 1994, Regents of the University of California +# +# +# IDENTIFICATION +# $Header: /cvsroot/pgsql/src/backend/access/nbtree/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:11 scrappy Exp $ +# +#------------------------------------------------------------------------- + +SUBSRCS+= nbtcompare.c nbtinsert.c nbtpage.c nbtree.c nbtscan.c nbtsearch.c \ + nbtstrat.c nbtutils.c nbtsort.c diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README new file mode 100644 index 0000000000..a204ad4af0 --- /dev/null +++ b/src/backend/access/nbtree/README @@ -0,0 +1,68 @@ +$Header: /cvsroot/pgsql/src/backend/access/nbtree/README,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $ + +This directory contains a correct implementation of Lehman and Yao's +btree management algorithm that supports concurrent access for Postgres. +We have made the following changes in order to incorporate their algorithm +into Postgres: + + + The requirement that all btree keys be unique is too onerous, + but the algorithm won't work correctly without it. As a result, + this implementation adds an OID (guaranteed to be unique) to + every key in the index. This guarantees uniqueness within a set + of duplicates. Space overhead is four bytes. + + For this reason, when we're passed an index tuple to store by the + common access method code, we allocate a larger one and copy the + supplied tuple into it. No Postgres code outside of the btree + access method knows about this xid or sequence number. + + + Lehman and Yao don't require read locks, but assume that in- + memory copies of tree nodes are unshared. Postgres shares + in-memory buffers among backends. As a result, we do page- + level read locking on btree nodes in order to guarantee that + no record is modified while we are examining it. This reduces + concurrency but guaranteees correct behavior. + + + Read locks on a page are held for as long as a scan has a pointer + to the page. However, locks are always surrendered before the + sibling page lock is acquired (for readers), so we remain deadlock- + free. I will do a formal proof if I get bored anytime soon. + +In addition, the following things are handy to know: + + + Page zero of every btree is a meta-data page. This page stores + the location of the root page, a pointer to a list of free + pages, and other stuff that's handy to know. + + + This algorithm doesn't really work, since it requires ordered + writes, and UNIX doesn't support ordered writes. + + + There's one other case where we may screw up in this + implementation. When we start a scan, we descend the tree + to the key nearest the one in the qual, and once we get there, + position ourselves correctly for the qual type (eg, <, >=, etc). + If we happen to step off a page, decide we want to get back to + it, and fetch the page again, and if some bad person has split + the page and moved the last tuple we saw off of it, then the + code complains about botched concurrency in an elog(WARN, ...) + and gives up the ghost. This is the ONLY violation of Lehman + and Yao's guarantee of correct behavior that I am aware of in + this code. 
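/*
 * Editor's sketch (not part of this commit): the README's duplicate-key
 * trick in miniature.  Every key carried in the index is augmented with
 * a unique OID, so no two index entries ever compare equal -- the user
 * key is compared first and the OID breaks ties.  The type and field
 * names here are hypothetical stand-ins for the real BTItem layout.
 */
#include <stdint.h>

typedef struct {
    int32_t  key;   /* user key; duplicates allowed */
    uint32_t oid;   /* unique tiebreaker added by the btree code */
} AugmentedKey;

static int
augmented_cmp(const AugmentedKey *a, const AugmentedKey *b)
{
    if (a->key != b->key)
        return (a->key < b->key) ? -1 : 1;
    if (a->oid != b->oid)           /* equal user keys: OID decides */
        return (a->oid < b->oid) ? -1 : 1;
    return 0;                       /* same entry */
}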
+ +Notes to operator class implementors: + + With this implementation, we require the user to supply us with + a procedure for pg_amproc. This procedure should take two keys + A and B and return < 0, 0, or > 0 if A < B, A = B, or A > B, + respectively. See the contents of that relation for the btree + access method for some samples. + +Notes to mao for implementation document: + + On deletions, we need to adjust the position of active scans on + the index. The code in nbtscan.c handles this. We don't need to + do this for splits because of the way splits are handled; if they + happen behind us, we'll automatically go to the next page, and if + they happen in front of us, we're not affected by them. For + insertions, if we inserted a tuple behind the current scan location + on the current scan page, we move one space ahead. diff --git a/src/backend/access/nbtree/nbtcompare.c b/src/backend/access/nbtree/nbtcompare.c new file mode 100644 index 0000000000..e567b3c44c --- /dev/null +++ b/src/backend/access/nbtree/nbtcompare.c @@ -0,0 +1,173 @@ +/*------------------------------------------------------------------------- + * + * btcompare.c-- + * Comparison functions for btree access method. + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtcompare.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $ + * + * NOTES + * These functions are stored in pg_amproc. For each operator class + * defined on btrees, they compute + * + * compare(a, b): + * < 0 if a < b, + * = 0 if a == b, + * > 0 if a > b. + *------------------------------------------------------------------------- + */ +#include <string.h> +#include "postgres.h" +#include "utils/nabstime.h" + +int32 +btint2cmp(int16 a, int16 b) +{ + return ((int32) (a - b)); +} + +int32 +btint4cmp(int32 a, int32 b) +{ + return (a - b); +} + +int32 +btint24cmp(int16 a, int32 b) +{ + return (((int32) a) - b); +} + +int32 +btint42cmp(int32 a, int16 b) +{ + return (a - ((int32) b)); +} + +int32 +btfloat4cmp(float32 a, float32 b) +{ + if (*a > *b) + return (1); + else if (*a == *b) + return (0); + else + return (-1); +} + +int32 +btfloat8cmp(float64 a, float64 b) +{ + if (*a > *b) + return (1); + else if (*a == *b) + return (0); + else + return (-1); +} + +int32 +btoidcmp(Oid a, Oid b) +{ + if (a > b) + return (1); + else if (a == b) + return (0); + else + return (-1); +} + +int32 +btabstimecmp(AbsoluteTime a, AbsoluteTime b) +{ + if (AbsoluteTimeIsBefore(a, b)) + return (1); + else if (AbsoluteTimeIsBefore(b, a)) + return (-1); + else + return (0); +} + +int32 +btcharcmp(char a, char b) +{ + return ((int32) (a - b)); +} + +int32 +btchar2cmp(uint16 a, uint16 b) +{ + return (strncmp((char *) &a, (char *) &b, 2)); +} + +int32 +btchar4cmp(uint32 a, uint32 b) +{ + return (strncmp((char *) &a, (char *) &b, 4)); +} + +int32 +btchar8cmp(char *a, char *b) +{ + return (strncmp(a, b, 8)); +} + +int32 +btchar16cmp(char *a, char *b) +{ + return (strncmp(a, b, 16)); +} + +int32 +btnamecmp(NameData *a, NameData *b) +{ + return (strncmp(a->data, b->data, NAMEDATALEN)); +} + +int32 +bttextcmp(struct varlena *a, struct varlena *b) +{ + char *ap, *bp; + int len; + int res; + + ap = VARDATA(a); + bp = VARDATA(b); + + /* len is the length of the shorter of the two strings */ + if ((len = VARSIZE(a)) > VARSIZE(b)) + len = VARSIZE(b); + + /* len includes the four bytes in which string length is stored */ + len -= sizeof(VARSIZE(a)); + + /* + * If the two strings differ in the first len 
bytes, or if they're + * the same in the first len bytes and they're both len bytes long, + * we're done. + */ + + res = 0; + if (len > 0) { + do { + res = (int) (*ap++ - *bp++); + len--; + } while (res == 0 && len != 0); + } + + if (res != 0 || VARSIZE(a) == VARSIZE(b)) + return (res); + + /* + * The two strings are the same in the first len bytes, and they + * are of different lengths. + */ + + if (VARSIZE(a) < VARSIZE(b)) + return (-1); + else + return (1); +} diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c new file mode 100644 index 0000000000..536c0aa385 --- /dev/null +++ b/src/backend/access/nbtree/nbtinsert.c @@ -0,0 +1,831 @@ +/*------------------------------------------------------------------------- + * + * btinsert.c-- + * Item insertion in Lehman and Yao btrees for Postgres. + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "storage/bufmgr.h" +#include "storage/bufpage.h" + +#include "utils/elog.h" +#include "utils/palloc.h" +#include "utils/rel.h" +#include "utils/excid.h" + +#include "access/heapam.h" +#include "access/genam.h" +#include "access/nbtree.h" + +static InsertIndexResult _bt_insertonpg(Relation rel, Buffer buf, BTStack stack, int keysz, ScanKey scankey, BTItem btitem, BTItem afteritem); +static Buffer _bt_split(Relation rel, Buffer buf); +static OffsetNumber _bt_findsplitloc(Relation rel, Page page, OffsetNumber start, OffsetNumber maxoff, Size llimit); +static void _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf); +static OffsetNumber _bt_pgaddtup(Relation rel, Buffer buf, int keysz, ScanKey itup_scankey, Size itemsize, BTItem btitem, BTItem afteritem); +static bool _bt_goesonpg(Relation rel, Buffer buf, Size keysz, ScanKey scankey, BTItem afteritem); +static void _bt_updateitem(Relation rel, Size keysz, Buffer buf, Oid bti_oid, BTItem newItem); + +/* + * _bt_doinsert() -- Handle insertion of a single btitem in the tree. + * + * This routine is called by the public interface routines, btbuild + * and btinsert. By here, btitem is filled in, and has a unique + * (xid, seqno) pair. + */ +InsertIndexResult +_bt_doinsert(Relation rel, BTItem btitem) +{ + ScanKey itup_scankey; + IndexTuple itup; + BTStack stack; + Buffer buf; + BlockNumber blkno; + int natts; + InsertIndexResult res; + + itup = &(btitem->bti_itup); + + /* we need a scan key to do our search, so build one */ + itup_scankey = _bt_mkscankey(rel, itup); + natts = rel->rd_rel->relnatts; + + /* find the page containing this key */ + stack = _bt_search(rel, natts, itup_scankey, &buf); + blkno = BufferGetBlockNumber(buf); + + /* trade in our read lock for a write lock */ + _bt_relbuf(rel, buf, BT_READ); + buf = _bt_getbuf(rel, blkno, BT_WRITE); + + /* + * If the page was split between the time that we surrendered our + * read lock and acquired our write lock, then this page may no + * longer be the right place for the key we want to insert. In this + * case, we need to move right in the tree. See Lehman and Yao for + * an excruciatingly precise description. 
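/*
 * Editor's note on the pg_amproc comparison functions in nbtcompare.c
 * above (illustrative, not part of this commit).  The contract is to
 * return < 0, 0, or > 0.  btint4cmp satisfies it with "a - b", but that
 * subtraction can overflow a signed 32-bit value when the operands have
 * opposite signs (e.g. a = INT_MAX, b = -1), which is undefined
 * behavior in C.  A branching formulation meets the same contract
 * without overflow:
 */
static int32
btint4cmp_noverflow(int32 a, int32 b)
{
    if (a < b)
        return -1;
    if (a > b)
        return 1;
    return 0;
}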
+ */ + + buf = _bt_moveright(rel, buf, natts, itup_scankey, BT_WRITE); + + /* do the insertion */ + res = _bt_insertonpg(rel, buf, stack, natts, itup_scankey, + btitem, (BTItem) NULL); + + /* be tidy */ + _bt_freestack(stack); + _bt_freeskey(itup_scankey); + + return (res); +} + +/* + * _bt_insertonpg() -- Insert a tuple on a particular page in the index. + * + * This recursive procedure does the following things: + * + * + if necessary, splits the target page. + * + finds the right place to insert the tuple (taking into + * account any changes induced by a split). + * + inserts the tuple. + * + if the page was split, pops the parent stack, and finds the + * right place to insert the new child pointer (by walking + * right using information stored in the parent stack). + * + invoking itself with the appropriate tuple for the right + * child page on the parent. + * + * On entry, we must have the right buffer on which to do the + * insertion, and the buffer must be pinned and locked. On return, + * we will have dropped both the pin and the write lock on the buffer. + * + * The locking interactions in this code are critical. You should + * grok Lehman and Yao's paper before making any changes. In addition, + * you need to understand how we disambiguate duplicate keys in this + * implementation, in order to be able to find our location using + * L&Y "move right" operations. Since we may insert duplicate user + * keys, and since these dups may propogate up the tree, we use the + * 'afteritem' parameter to position ourselves correctly for the + * insertion on internal pages. + */ +static InsertIndexResult +_bt_insertonpg(Relation rel, + Buffer buf, + BTStack stack, + int keysz, + ScanKey scankey, + BTItem btitem, + BTItem afteritem) +{ + InsertIndexResult res; + Page page; + Buffer rbuf; + Buffer pbuf; + Page rpage; + ScanKey newskey; + BTItem ritem; + BTPageOpaque rpageop; + BlockNumber rbknum, itup_blkno; + OffsetNumber itup_off; + int itemsz; + InsertIndexResult newres; + BTItem new_item = (BTItem) NULL; + BTItem lowLeftItem; + + page = BufferGetPage(buf); + itemsz = IndexTupleDSize(btitem->bti_itup) + + (sizeof(BTItemData) - sizeof(IndexTupleData)); + + itemsz = DOUBLEALIGN(itemsz); /* be safe, PageAddItem will do this + but we need to be consistent */ + + if (PageGetFreeSpace(page) < itemsz) { + + /* split the buffer into left and right halves */ + rbuf = _bt_split(rel, buf); + + /* which new page (left half or right half) gets the tuple? */ + if (_bt_goesonpg(rel, buf, keysz, scankey, afteritem)) { + /* left page */ + itup_off = _bt_pgaddtup(rel, buf, keysz, scankey, + itemsz, btitem, afteritem); + itup_blkno = BufferGetBlockNumber(buf); + } else { + /* right page */ + itup_off = _bt_pgaddtup(rel, rbuf, keysz, scankey, + itemsz, btitem, afteritem); + itup_blkno = BufferGetBlockNumber(rbuf); + } + + /* + * By here, + * + * + our target page has been split; + * + the original tuple has been inserted; + * + we have write locks on both the old (left half) and new + * (right half) buffers, after the split; and + * + we have the key we want to insert into the parent. + * + * Do the parent insertion. We need to hold onto the locks for + * the child pages until we locate the parent, but we can release + * them before doing the actual insertion (see Lehman and Yao for + * the reasoning). 
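/*
 * Editor's sketch of the Lehman & Yao "move right" step performed by
 * _bt_moveright above (illustrative only; the real routine lives in
 * nbtsearch.c, outside this excerpt, and its exact signature is an
 * assumption here).  If our target page was split after we traded our
 * read lock for a write lock, the key may now belong on a right
 * sibling, so we chase btpo_next pointers until the scan key no longer
 * exceeds the page's high key.
 */
static Buffer
moveright_sketch(Relation rel, Buffer buf, Size keysz, ScanKey skey)
{
    Page page = BufferGetPage(buf);
    BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);

    while (!P_RIGHTMOST(opaque) &&
           _bt_skeycmp(rel, keysz, skey, page,
                       PageGetItemId(page, P_HIKEY),
                       BTGreaterStrategyNumber)) {
        BlockNumber next = opaque->btpo_next;

        _bt_relbuf(rel, buf, BT_WRITE);         /* release before ...  */
        buf = _bt_getbuf(rel, next, BT_WRITE);  /* ... locking sibling */
        page = BufferGetPage(buf);
        opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    }
    return buf;
}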
+ */ + + if (stack == (BTStack) NULL) { + + /* create a new root node and release the split buffers */ + _bt_newroot(rel, buf, rbuf); + _bt_relbuf(rel, buf, BT_WRITE); + _bt_relbuf(rel, rbuf, BT_WRITE); + + } else { + + /* form a index tuple that points at the new right page */ + rbknum = BufferGetBlockNumber(rbuf); + rpage = BufferGetPage(rbuf); + rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage); + + /* + * By convention, the first entry (0) on every + * non-rightmost page is the high key for that page. In + * order to get the lowest key on the new right page, we + * actually look at its second (1) entry. + */ + + if (! P_RIGHTMOST(rpageop)) { + ritem = (BTItem) PageGetItem(rpage, + PageGetItemId(rpage, P_FIRSTKEY)); + } else { + ritem = (BTItem) PageGetItem(rpage, + PageGetItemId(rpage, P_HIKEY)); + } + + /* get a unique btitem for this key */ + new_item = _bt_formitem(&(ritem->bti_itup)); + + ItemPointerSet(&(new_item->bti_itup.t_tid), rbknum, P_HIKEY); + + /* find the parent buffer */ + pbuf = _bt_getstackbuf(rel, stack, BT_WRITE); + + /* + * If the key of new_item is < than the key of the item + * in the parent page pointing to the left page + * (stack->bts_btitem), we have to update the latter key; + * otherwise the keys on the parent page wouldn't be + * monotonically increasing after we inserted the new + * pointer to the right page (new_item). This only + * happens if our left page is the leftmost page and a + * new minimum key had been inserted before, which is not + * reflected in the parent page but didn't matter so + * far. If there are duplicate keys and this new minimum + * key spills over to our new right page, we get an + * inconsistency if we don't update the left key in the + * parent page. + */ + + if (_bt_itemcmp(rel, keysz, stack->bts_btitem, new_item, + BTGreaterStrategyNumber)) { + lowLeftItem = + (BTItem) PageGetItem(page, + PageGetItemId(page, P_FIRSTKEY)); + /* page must have right pointer after split */ + _bt_updateitem(rel, keysz, pbuf, stack->bts_btitem->bti_oid, + lowLeftItem); + } + + /* don't need the children anymore */ + _bt_relbuf(rel, buf, BT_WRITE); + _bt_relbuf(rel, rbuf, BT_WRITE); + + newskey = _bt_mkscankey(rel, &(new_item->bti_itup)); + newres = _bt_insertonpg(rel, pbuf, stack->bts_parent, + keysz, newskey, new_item, + stack->bts_btitem); + + /* be tidy */ + pfree(newres); + pfree(newskey); + pfree(new_item); + } + } else { + itup_off = _bt_pgaddtup(rel, buf, keysz, scankey, + itemsz, btitem, afteritem); + itup_blkno = BufferGetBlockNumber(buf); + + _bt_relbuf(rel, buf, BT_WRITE); + } + + /* by here, the new tuple is inserted */ + res = (InsertIndexResult) palloc(sizeof(InsertIndexResultData)); + ItemPointerSet(&(res->pointerData), itup_blkno, itup_off); + + return (res); +} + +/* + * _bt_split() -- split a page in the btree. + * + * On entry, buf is the page to split, and is write-locked and pinned. + * Returns the new right sibling of buf, pinned and write-locked. The + * pin and lock on buf are maintained. 
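/*
 * Editor's note on the offset conventions used above (assumption: the
 * actual P_HIKEY/P_FIRSTKEY definitions live in nbtree.h, which is not
 * part of this excerpt).  On a non-rightmost page the first line
 * pointer holds the page's high key and user data starts at the
 * second; the rightmost page has no high key, so its data starts at
 * the first line pointer.
 */
#define P_HIKEY     ((OffsetNumber) 1)
#define P_FIRSTKEY  ((OffsetNumber) 2)

static OffsetNumber
first_data_offset(BTPageOpaque opaque)
{
    return P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
}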
+ */ +static Buffer +_bt_split(Relation rel, Buffer buf) +{ + Buffer rbuf; + Page origpage; + Page leftpage, rightpage; + BTPageOpaque ropaque, lopaque, oopaque; + Buffer sbuf; + Page spage; + BTPageOpaque sopaque; + Size itemsz; + ItemId itemid; + BTItem item; + OffsetNumber leftoff, rightoff; + OffsetNumber start; + OffsetNumber maxoff; + OffsetNumber firstright; + OffsetNumber i; + Size llimit; + + rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE); + origpage = BufferGetPage(buf); + leftpage = PageGetTempPage(origpage, sizeof(BTPageOpaqueData)); + rightpage = BufferGetPage(rbuf); + + _bt_pageinit(rightpage, BufferGetPageSize(rbuf)); + _bt_pageinit(leftpage, BufferGetPageSize(buf)); + + /* init btree private data */ + oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage); + lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage); + ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage); + + /* if we're splitting this page, it won't be the root when we're done */ + oopaque->btpo_flags &= ~BTP_ROOT; + lopaque->btpo_flags = ropaque->btpo_flags = oopaque->btpo_flags; + lopaque->btpo_prev = oopaque->btpo_prev; + ropaque->btpo_prev = BufferGetBlockNumber(buf); + lopaque->btpo_next = BufferGetBlockNumber(rbuf); + ropaque->btpo_next = oopaque->btpo_next; + + /* + * If the page we're splitting is not the rightmost page at its + * level in the tree, then the first (0) entry on the page is the + * high key for the page. We need to copy that to the right + * half. Otherwise (meaning the rightmost page case), we should + * treat the line pointers beginning at zero as user data. + * + * We leave a blank space at the start of the line table for the + * left page. We'll come back later and fill it in with the high + * key item we get from the right key. + */ + + leftoff = P_FIRSTKEY; + ropaque->btpo_next = oopaque->btpo_next; + if (! P_RIGHTMOST(oopaque)) { + /* splitting a non-rightmost page, start at the first data item */ + start = P_FIRSTKEY; + + /* copy the original high key to the new page */ + itemid = PageGetItemId(origpage, P_HIKEY); + itemsz = ItemIdGetLength(itemid); + item = (BTItem) PageGetItem(origpage, itemid); + (void) PageAddItem(rightpage, (Item) item, itemsz, P_HIKEY, LP_USED); + rightoff = P_FIRSTKEY; + } else { + /* splitting a rightmost page, "high key" is the first data item */ + start = P_HIKEY; + + /* the new rightmost page will not have a high key */ + rightoff = P_HIKEY; + } + maxoff = PageGetMaxOffsetNumber(origpage); + llimit = PageGetFreeSpace(leftpage) / 2; + firstright = _bt_findsplitloc(rel, origpage, start, maxoff, llimit); + + for (i = start; i <= maxoff; i = OffsetNumberNext(i)) { + itemid = PageGetItemId(origpage, i); + itemsz = ItemIdGetLength(itemid); + item = (BTItem) PageGetItem(origpage, itemid); + + /* decide which page to put it on */ + if (i < firstright) { + (void) PageAddItem(leftpage, (Item) item, itemsz, leftoff, + LP_USED); + leftoff = OffsetNumberNext(leftoff); + } else { + (void) PageAddItem(rightpage, (Item) item, itemsz, rightoff, + LP_USED); + rightoff = OffsetNumberNext(rightoff); + } + } + + /* + * Okay, page has been split, high key on right page is correct. Now + * set the high key on the left page to be the min key on the right + * page. + */ + + if (P_RIGHTMOST(ropaque)) { + itemid = PageGetItemId(rightpage, P_HIKEY); + } else { + itemid = PageGetItemId(rightpage, P_FIRSTKEY); + } + itemsz = ItemIdGetLength(itemid); + item = (BTItem) PageGetItem(rightpage, itemid); + + /* + * We left a hole for the high key on the left page; fill it. 
The + * modal crap is to tell the page manager to put the new item on the + * page and not screw around with anything else. Whoever designed + * this interface has presumably crawled back into the dung heap they + * came from. No one here will admit to it. + */ + + PageManagerModeSet(OverwritePageManagerMode); + (void) PageAddItem(leftpage, (Item) item, itemsz, P_HIKEY, LP_USED); + PageManagerModeSet(ShufflePageManagerMode); + + /* + * By here, the original data page has been split into two new halves, + * and these are correct. The algorithm requires that the left page + * never move during a split, so we copy the new left page back on top + * of the original. Note that this is not a waste of time, since we + * also require (in the page management code) that the center of a + * page always be clean, and the most efficient way to guarantee this + * is just to compact the data by reinserting it into a new left page. + */ + + PageRestoreTempPage(leftpage, origpage); + + /* write these guys out */ + _bt_wrtnorelbuf(rel, rbuf); + _bt_wrtnorelbuf(rel, buf); + + /* + * Finally, we need to grab the right sibling (if any) and fix the + * prev pointer there. We are guaranteed that this is deadlock-free + * since no other writer will be moving holding a lock on that page + * and trying to move left, and all readers release locks on a page + * before trying to fetch its neighbors. + */ + + if (! P_RIGHTMOST(ropaque)) { + sbuf = _bt_getbuf(rel, ropaque->btpo_next, BT_WRITE); + spage = BufferGetPage(sbuf); + sopaque = (BTPageOpaque) PageGetSpecialPointer(spage); + sopaque->btpo_prev = BufferGetBlockNumber(rbuf); + + /* write and release the old right sibling */ + _bt_wrtbuf(rel, sbuf); + } + + /* split's done */ + return (rbuf); +} + +/* + * _bt_findsplitloc() -- find a safe place to split a page. + * + * In order to guarantee the proper handling of searches for duplicate + * keys, the first duplicate in the chain must either be the first + * item on the page after the split, or the entire chain must be on + * one of the two pages. That is, + * [1 2 2 2 3 4 5] + * must become + * [1] [2 2 2 3 4 5] + * or + * [1 2 2 2] [3 4 5] + * but not + * [1 2 2] [2 3 4 5]. + * However, + * [2 2 2 2 2 3 4] + * may be split as + * [2 2 2 2] [2 3 4]. + */ +static OffsetNumber +_bt_findsplitloc(Relation rel, + Page page, + OffsetNumber start, + OffsetNumber maxoff, + Size llimit) +{ + OffsetNumber i; + OffsetNumber saferight; + ItemId nxtitemid, safeitemid; + BTItem safeitem, nxtitem; + IndexTuple safetup, nxttup; + Size nbytes; + TupleDesc itupdesc; + int natts; + int attno; + Datum attsafe; + Datum attnext; + bool null; + + itupdesc = RelationGetTupleDescriptor(rel); + natts = rel->rd_rel->relnatts; + + saferight = start; + safeitemid = PageGetItemId(page, saferight); + nbytes = ItemIdGetLength(safeitemid) + sizeof(ItemIdData); + safeitem = (BTItem) PageGetItem(page, safeitemid); + safetup = &(safeitem->bti_itup); + + i = OffsetNumberNext(start); + + while (nbytes < llimit) { + + /* check the next item on the page */ + nxtitemid = PageGetItemId(page, i); + nbytes += (ItemIdGetLength(nxtitemid) + sizeof(ItemIdData)); + nxtitem = (BTItem) PageGetItem(page, nxtitemid); + nxttup = &(nxtitem->bti_itup); + + /* test against last known safe item */ + for (attno = 1; attno <= natts; attno++) { + attsafe = index_getattr(safetup, attno, itupdesc, &null); + attnext = index_getattr(nxttup, attno, itupdesc, &null); + + /* + * If the tuple we're looking at isn't equal to the last safe one + * we saw, then it's our new safe tuple. 
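/*
 * Editor's model of the split rule documented above (plain ints stand
 * in for index tuples; this mirrors the shape of _bt_findsplitloc, it
 * is not the real code).  We walk items until the left-page size limit
 * is reached, recording a candidate split point only where the key
 * differs from the last "safe" one, so a run of duplicates is never
 * cut in the middle.
 */
static int
findsplit_model(const int *keys, int nkeys, int llimit)
{
    int saferight = 0;        /* first item destined for the right page */
    int safe = keys[0];       /* assumes nkeys > 0 */
    int nbytes = 1;           /* pretend every item costs one unit */
    int i;

    for (i = 1; nbytes < llimit && i < nkeys; i++, nbytes++) {
        if (keys[i] != safe) {          /* new value: safe to split here */
            safe = keys[i];
            saferight = i;
        }
    }

    /* a duplicate run filling the whole left half may be cut after all */
    if (saferight == 0)
        saferight = i;
    return saferight;
}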
+ */ + + if (!_bt_invokestrat(rel, attno, BTEqualStrategyNumber, + attsafe, attnext)) { + safetup = nxttup; + saferight = i; + + /* break is for the attno for loop */ + break; + } + } + i = OffsetNumberNext(i); + } + + /* + * If the chain of dups starts at the beginning of the page and extends + * past the halfway mark, we can split it in the middle. + */ + + if (saferight == start) + saferight = i; + + return (saferight); +} + +/* + * _bt_newroot() -- Create a new root page for the index. + * + * We've just split the old root page and need to create a new one. + * In order to do this, we add a new root page to the file, then lock + * the metadata page and update it. This is guaranteed to be deadlock- + * free, because all readers release their locks on the metadata page + * before trying to lock the root, and all writers lock the root before + * trying to lock the metadata page. We have a write lock on the old + * root page, so we have not introduced any cycles into the waits-for + * graph. + * + * On entry, lbuf (the old root) and rbuf (its new peer) are write- + * locked. We don't drop the locks in this routine; that's done by + * the caller. On exit, a new root page exists with entries for the + * two new children. The new root page is neither pinned nor locked. + */ +static void +_bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) +{ + Buffer rootbuf; + Page lpage, rpage, rootpage; + BlockNumber lbkno, rbkno; + BlockNumber rootbknum; + BTPageOpaque rootopaque; + ItemId itemid; + BTItem item; + Size itemsz; + BTItem new_item; + + /* get a new root page */ + rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE); + rootpage = BufferGetPage(rootbuf); + _bt_pageinit(rootpage, BufferGetPageSize(rootbuf)); + + /* set btree special data */ + rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage); + rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE; + rootopaque->btpo_flags |= BTP_ROOT; + + /* + * Insert the internal tuple pointers. + */ + + lbkno = BufferGetBlockNumber(lbuf); + rbkno = BufferGetBlockNumber(rbuf); + lpage = BufferGetPage(lbuf); + rpage = BufferGetPage(rbuf); + + /* + * step over the high key on the left page while building the + * left page pointer. + */ + itemid = PageGetItemId(lpage, P_FIRSTKEY); + itemsz = ItemIdGetLength(itemid); + item = (BTItem) PageGetItem(lpage, itemid); + new_item = _bt_formitem(&(item->bti_itup)); + ItemPointerSet(&(new_item->bti_itup.t_tid), lbkno, P_FIRSTKEY); + + /* + * insert the left page pointer into the new root page. the root + * page is the rightmost page on its level so the "high key" item + * is the first data item. + */ + (void) PageAddItem(rootpage, (Item) new_item, itemsz, P_HIKEY, LP_USED); + pfree(new_item); + + /* + * the right page is the rightmost page on the second level, so + * the "high key" item is the first data item on that page as well. + */ + itemid = PageGetItemId(rpage, P_HIKEY); + itemsz = ItemIdGetLength(itemid); + item = (BTItem) PageGetItem(rpage, itemid); + new_item = _bt_formitem(&(item->bti_itup)); + ItemPointerSet(&(new_item->bti_itup.t_tid), rbkno, P_HIKEY); + + /* + * insert the right page pointer into the new root page. + */ + (void) PageAddItem(rootpage, (Item) new_item, itemsz, P_FIRSTKEY, LP_USED); + pfree(new_item); + + /* write and let go of the root buffer */ + rootbknum = BufferGetBlockNumber(rootbuf); + _bt_wrtbuf(rel, rootbuf); + + /* update metadata page with new root block number */ + _bt_metaproot(rel, rootbknum); +} + +/* + * _bt_pgaddtup() -- add a tuple to a particular page in the index. 
+ * + * This routine adds the tuple to the page as requested, and keeps the + * write lock and reference associated with the page's buffer. It is + * an error to call pgaddtup() without a write lock and reference. If + * afteritem is non-null, it's the item that we expect our new item + * to follow. Otherwise, we do a binary search for the correct place + * and insert the new item there. + */ +static OffsetNumber +_bt_pgaddtup(Relation rel, + Buffer buf, + int keysz, + ScanKey itup_scankey, + Size itemsize, + BTItem btitem, + BTItem afteritem) +{ + OffsetNumber itup_off; + OffsetNumber first; + Page page; + BTPageOpaque opaque; + BTItem chkitem; + Oid afteroid; + + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + first = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY; + + if (afteritem == (BTItem) NULL) { + itup_off = _bt_binsrch(rel, buf, keysz, itup_scankey, BT_INSERTION); + } else { + afteroid = afteritem->bti_oid; + itup_off = first; + + do { + chkitem = + (BTItem) PageGetItem(page, PageGetItemId(page, itup_off)); + itup_off = OffsetNumberNext(itup_off); + } while (chkitem->bti_oid != afteroid); + } + + (void) PageAddItem(page, (Item) btitem, itemsize, itup_off, LP_USED); + + /* write the buffer, but hold our lock */ + _bt_wrtnorelbuf(rel, buf); + + return (itup_off); +} + +/* + * _bt_goesonpg() -- Does a new tuple belong on this page? + * + * This is part of the complexity introduced by allowing duplicate + * keys into the index. The tuple belongs on this page if: + * + * + there is no page to the right of this one; or + * + it is less than the high key on the page; or + * + the item it is to follow ("afteritem") appears on this + * page. + */ +static bool +_bt_goesonpg(Relation rel, + Buffer buf, + Size keysz, + ScanKey scankey, + BTItem afteritem) +{ + Page page; + ItemId hikey; + BTPageOpaque opaque; + BTItem chkitem; + OffsetNumber offnum, maxoff; + Oid afteroid; + bool found; + + page = BufferGetPage(buf); + + /* no right neighbor? */ + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + if (P_RIGHTMOST(opaque)) + return (true); + + /* + * this is a non-rightmost page, so it must have a high key item. + * + * If the scan key is < the high key (the min key on the next page), + * then it for sure belongs here. + */ + hikey = PageGetItemId(page, P_HIKEY); + if (_bt_skeycmp(rel, keysz, scankey, page, hikey, BTLessStrategyNumber)) + return (true); + + /* + * If the scan key is > the high key, then it for sure doesn't belong + * here. + */ + + if (_bt_skeycmp(rel, keysz, scankey, page, hikey, BTGreaterStrategyNumber)) + return (false); + + /* + * If we have no adjacency information, and the item is equal to the + * high key on the page (by here it is), then the item does not belong + * on this page. + */ + + if (afteritem == (BTItem) NULL) + return (false); + + /* damn, have to work for it. i hate that. */ + afteroid = afteritem->bti_oid; + maxoff = PageGetMaxOffsetNumber(page); + + /* + * Search the entire page for the afteroid. We need to do this, rather + * than doing a binary search and starting from there, because if the + * key we're searching for is the leftmost key in the tree at this + * level, then a binary search will do the wrong thing. Splits are + * pretty infrequent, so the cost isn't as bad as it could be. 
+ */ + + found = false; + for (offnum = P_FIRSTKEY; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) { + chkitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum)); + if (chkitem->bti_oid == afteroid) { + found = true; + break; + } + } + + return (found); +} + +/* + * _bt_itemcmp() -- compare item1 to item2 using a requested + * strategy (<, <=, =, >=, >) + * + */ +bool +_bt_itemcmp(Relation rel, + Size keysz, + BTItem item1, + BTItem item2, + StrategyNumber strat) +{ + TupleDesc tupDes; + IndexTuple indexTuple1, indexTuple2; + Datum attrDatum1, attrDatum2; + int i; + bool isNull; + bool compare; + + tupDes = RelationGetTupleDescriptor(rel); + indexTuple1 = &(item1->bti_itup); + indexTuple2 = &(item2->bti_itup); + + for (i = 1; i <= keysz; i++) { + attrDatum1 = index_getattr(indexTuple1, i, tupDes, &isNull); + attrDatum2 = index_getattr(indexTuple2, i, tupDes, &isNull); + compare = _bt_invokestrat(rel, i, strat, attrDatum1, attrDatum2); + if (!compare) { + return (false); + } + } + return (true); +} + +/* + * _bt_updateitem() -- updates the key of the item identified by the + * oid with the key of newItem (done in place) + * + */ +static void +_bt_updateitem(Relation rel, + Size keysz, + Buffer buf, + Oid bti_oid, + BTItem newItem) +{ + Page page; + OffsetNumber maxoff; + OffsetNumber i; + ItemPointerData itemPtrData; + BTItem item; + IndexTuple oldIndexTuple, newIndexTuple; + + page = BufferGetPage(buf); + maxoff = PageGetMaxOffsetNumber(page); + + /* locate item on the page */ + i = P_HIKEY; + do { + item = (BTItem) PageGetItem(page, PageGetItemId(page, i)); + i = OffsetNumberNext(i); + } while (i <= maxoff && item->bti_oid != bti_oid); + + /* this should never happen (in theory) */ + if (item->bti_oid != bti_oid) { + elog(FATAL, "_bt_getstackbuf was lying!!"); + } + + oldIndexTuple = &(item->bti_itup); + newIndexTuple = &(newItem->bti_itup); + + /* keep the original item pointer */ + ItemPointerCopy(&(oldIndexTuple->t_tid), &itemPtrData); + CopyIndexTuple(newIndexTuple, &oldIndexTuple); + ItemPointerCopy(&itemPtrData, &(oldIndexTuple->t_tid)); +} diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c new file mode 100644 index 0000000000..ce411a80d1 --- /dev/null +++ b/src/backend/access/nbtree/nbtpage.c @@ -0,0 +1,523 @@ +/*------------------------------------------------------------------------- + * + * btpage.c-- + * BTree-specific page management code for the Postgres btree access + * method. + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $ + * + * NOTES + * Postgres btree pages look like ordinary relation pages. The opaque + * data at high addresses includes pointers to left and right siblings + * and flag data describing page state. The first page in a btree, page + * zero, is special -- it stores meta-information describing the tree. + * Pages one and higher store the actual tree data. 
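/*
 * Editor's sketch of the per-page "opaque" special space described
 * above.  Assumption: the real BTPageOpaqueData is declared in
 * nbtree.h, outside this excerpt; the fields shown are exactly the
 * ones this file manipulates (btpo_prev, btpo_next, btpo_flags).
 */
typedef struct BTPageOpaqueDataSketch {
    BlockNumber btpo_prev;    /* left sibling, or P_NONE */
    BlockNumber btpo_next;    /* right sibling, or P_NONE */
    uint16      btpo_flags;   /* BTP_LEAF, BTP_ROOT, BTP_META, ... */
} BTPageOpaqueDataSketch;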
+ * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "storage/bufmgr.h" +#include "storage/bufpage.h" + +#include "utils/elog.h" +#include "utils/rel.h" +#include "utils/excid.h" + +#include "access/genam.h" +#include "access/nbtree.h" + +#define BTREE_METAPAGE 0 +#define BTREE_MAGIC 0x053162 +#define BTREE_VERSION 0 + +typedef struct BTMetaPageData { + uint32 btm_magic; + uint32 btm_version; + BlockNumber btm_root; +} BTMetaPageData; + +#define BTPageGetMeta(p) \ + ((BTMetaPageData *) &((PageHeader) p)->pd_linp[0]) + +extern bool BuildingBtree; + +/* + * We use high-concurrency locking on btrees. There are two cases in + * which we don't do locking. One is when we're building the btree. + * Since the creating transaction has not committed, no one can see + * the index, and there's no reason to share locks. The second case + * is when we're just starting up the database system. We use some + * special-purpose initialization code in the relation cache manager + * (see utils/cache/relcache.c) to allow us to do indexed scans on + * the system catalogs before we'd normally be able to. This happens + * before the lock table is fully initialized, so we can't use it. + * Strictly speaking, this violates 2pl, but we don't do 2pl on the + * system catalogs anyway, so I declare this to be okay. + */ + +#define USELOCKING (!BuildingBtree && !IsInitProcessingMode()) + +/* + * _bt_metapinit() -- Initialize the metadata page of a btree. + */ +void +_bt_metapinit(Relation rel) +{ + Buffer buf; + Page pg; + int nblocks; + BTMetaPageData metad; + BTPageOpaque op; + + /* can't be sharing this with anyone, now... */ + if (USELOCKING) + RelationSetLockForWrite(rel); + + if ((nblocks = RelationGetNumberOfBlocks(rel)) != 0) { + elog(WARN, "Cannot initialize non-empty btree %s", + RelationGetRelationName(rel)); + } + + buf = ReadBuffer(rel, P_NEW); + pg = BufferGetPage(buf); + _bt_pageinit(pg, BufferGetPageSize(buf)); + + metad.btm_magic = BTREE_MAGIC; + metad.btm_version = BTREE_VERSION; + metad.btm_root = P_NONE; + memmove((char *) BTPageGetMeta(pg), (char *) &metad, sizeof(metad)); + + op = (BTPageOpaque) PageGetSpecialPointer(pg); + op->btpo_flags = BTP_META; + + WriteBuffer(buf); + + /* all done */ + if (USELOCKING) + RelationUnsetLockForWrite(rel); +} + +/* + * _bt_checkmeta() -- Verify that the metadata stored in a btree are + * reasonable. + */ +void +_bt_checkmeta(Relation rel) +{ + Buffer metabuf; + Page metap; + BTMetaPageData *metad; + BTPageOpaque op; + int nblocks; + + /* if the relation is empty, this is init time; don't complain */ + if ((nblocks = RelationGetNumberOfBlocks(rel)) == 0) + return; + + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); + metap = BufferGetPage(metabuf); + op = (BTPageOpaque) PageGetSpecialPointer(metap); + if (!(op->btpo_flags & BTP_META)) { + elog(WARN, "Invalid metapage for index %s", + RelationGetRelationName(rel)); + } + metad = BTPageGetMeta(metap); + + if (metad->btm_magic != BTREE_MAGIC) { + elog(WARN, "Index %s is not a btree", + RelationGetRelationName(rel)); + } + + if (metad->btm_version != BTREE_VERSION) { + elog(WARN, "Version mismatch on %s: version %d file, version %d code", + RelationGetRelationName(rel), + metad->btm_version, BTREE_VERSION); + } + + _bt_relbuf(rel, metabuf, BT_READ); +} + +/* + * _bt_getroot() -- Get the root page of the btree. 
+ * + * Since the root page can move around the btree file, we have to read + * its location from the metadata page, and then read the root page + * itself. If no root page exists yet, we have to create one. The + * standard class of race conditions exists here; I think I covered + * them all in the Hopi Indian rain dance of lock requests below. + * + * We pass in the access type (BT_READ or BT_WRITE), and return the + * root page's buffer with the appropriate lock type set. Reference + * count on the root page gets bumped by ReadBuffer. The metadata + * page is unlocked and unreferenced by this process when this routine + * returns. + */ +Buffer +_bt_getroot(Relation rel, int access) +{ + Buffer metabuf; + Page metapg; + BTPageOpaque metaopaque; + Buffer rootbuf; + Page rootpg; + BTPageOpaque rootopaque; + BlockNumber rootblkno; + BTMetaPageData *metad; + + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); + metapg = BufferGetPage(metabuf); + metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg); + Assert(metaopaque->btpo_flags & BTP_META); + metad = BTPageGetMeta(metapg); + + /* if no root page initialized yet, do it */ + if (metad->btm_root == P_NONE) { + + /* turn our read lock in for a write lock */ + _bt_relbuf(rel, metabuf, BT_READ); + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE); + metapg = BufferGetPage(metabuf); + metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg); + Assert(metaopaque->btpo_flags & BTP_META); + metad = BTPageGetMeta(metapg); + + /* + * Race condition: if someone else initialized the metadata between + * the time we released the read lock and acquired the write lock, + * above, we want to avoid doing it again. + */ + + if (metad->btm_root == P_NONE) { + + /* + * Get, initialize, write, and leave a lock of the appropriate + * type on the new root page. Since this is the first page in + * the tree, it's a leaf. + */ + + rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE); + rootblkno = BufferGetBlockNumber(rootbuf); + rootpg = BufferGetPage(rootbuf); + metad->btm_root = rootblkno; + _bt_pageinit(rootpg, BufferGetPageSize(rootbuf)); + rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpg); + rootopaque->btpo_flags |= (BTP_LEAF | BTP_ROOT); + _bt_wrtnorelbuf(rel, rootbuf); + + /* swap write lock for read lock, if appropriate */ + if (access != BT_WRITE) { + _bt_setpagelock(rel, rootblkno, BT_READ); + _bt_unsetpagelock(rel, rootblkno, BT_WRITE); + } + + /* okay, metadata is correct */ + _bt_wrtbuf(rel, metabuf); + } else { + + /* + * Metadata initialized by someone else. In order to guarantee + * no deadlocks, we have to release the metadata page and start + * all over again. + */ + + _bt_relbuf(rel, metabuf, BT_WRITE); + return (_bt_getroot(rel, access)); + } + } else { + rootbuf = _bt_getbuf(rel, metad->btm_root, access); + + /* done with the meta page */ + _bt_relbuf(rel, metabuf, BT_READ); + } + + /* + * Race condition: If the root page split between the time we looked + * at the metadata page and got the root buffer, then we got the wrong + * buffer. + */ + + rootpg = BufferGetPage(rootbuf); + rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpg); + if (!(rootopaque->btpo_flags & BTP_ROOT)) { + + /* it happened, try again */ + _bt_relbuf(rel, rootbuf, access); + return (_bt_getroot(rel, access)); + } + + /* + * By here, we have a correct lock on the root block, its reference + * count is correct, and we have no lock set on the metadata page. + * Return the root block. 
+ */ + + return (rootbuf); +} + +/* + * _bt_getbuf() -- Get a buffer by block number for read or write. + * + * When this routine returns, the appropriate lock is set on the + * requested buffer its reference count is correct. + */ +Buffer +_bt_getbuf(Relation rel, BlockNumber blkno, int access) +{ + Buffer buf; + Page page; + + /* + * If we want a new block, we can't set a lock of the appropriate type + * until we've instantiated the buffer. + */ + + if (blkno != P_NEW) { + if (access == BT_WRITE) + _bt_setpagelock(rel, blkno, BT_WRITE); + else + _bt_setpagelock(rel, blkno, BT_READ); + + buf = ReadBuffer(rel, blkno); + } else { + buf = ReadBuffer(rel, blkno); + blkno = BufferGetBlockNumber(buf); + page = BufferGetPage(buf); + _bt_pageinit(page, BufferGetPageSize(buf)); + + if (access == BT_WRITE) + _bt_setpagelock(rel, blkno, BT_WRITE); + else + _bt_setpagelock(rel, blkno, BT_READ); + } + + /* ref count and lock type are correct */ + return (buf); +} + +/* + * _bt_relbuf() -- release a locked buffer. + */ +void +_bt_relbuf(Relation rel, Buffer buf, int access) +{ + BlockNumber blkno; + + blkno = BufferGetBlockNumber(buf); + + /* access had better be one of read or write */ + if (access == BT_WRITE) + _bt_unsetpagelock(rel, blkno, BT_WRITE); + else + _bt_unsetpagelock(rel, blkno, BT_READ); + + ReleaseBuffer(buf); +} + +/* + * _bt_wrtbuf() -- write a btree page to disk. + * + * This routine releases the lock held on the buffer and our reference + * to it. It is an error to call _bt_wrtbuf() without a write lock + * or a reference to the buffer. + */ +void +_bt_wrtbuf(Relation rel, Buffer buf) +{ + BlockNumber blkno; + + blkno = BufferGetBlockNumber(buf); + WriteBuffer(buf); + _bt_unsetpagelock(rel, blkno, BT_WRITE); +} + +/* + * _bt_wrtnorelbuf() -- write a btree page to disk, but do not release + * our reference or lock. + * + * It is an error to call _bt_wrtnorelbuf() without a write lock + * or a reference to the buffer. + */ +void +_bt_wrtnorelbuf(Relation rel, Buffer buf) +{ + BlockNumber blkno; + + blkno = BufferGetBlockNumber(buf); + WriteNoReleaseBuffer(buf); +} + +/* + * _bt_pageinit() -- Initialize a new page. + */ +void +_bt_pageinit(Page page, Size size) +{ + /* + * Cargo-cult programming -- don't really need this to be zero, but + * creating new pages is an infrequent occurrence and it makes me feel + * good when I know they're empty. + */ + + memset(page, 0, size); + + PageInit(page, size, sizeof(BTPageOpaqueData)); +} + +/* + * _bt_metaproot() -- Change the root page of the btree. + * + * Lehman and Yao require that the root page move around in order to + * guarantee deadlock-free short-term, fine-granularity locking. When + * we split the root page, we record the new parent in the metadata page + * for the relation. This routine does the work. + * + * No direct preconditions, but if you don't have the a write lock on + * at least the old root page when you call this, you're making a big + * mistake. On exit, metapage data is correct and we no longer have + * a reference to or lock on the metapage. 
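/*
 * Editor's usage sketch for the buffer routines above (hypothetical
 * caller; error handling omitted).  The discipline throughout this
 * module: _bt_getbuf takes the page lock and a pin, the caller works
 * on the page only while both are held, and _bt_relbuf gives both
 * back.
 */
static void
visit_page_sketch(Relation rel, BlockNumber blkno)
{
    Buffer buf = _bt_getbuf(rel, blkno, BT_READ);   /* lock + pin */
    Page page = BufferGetPage(buf);
    OffsetNumber maxoff = PageGetMaxOffsetNumber(page);

    /* ... examine items 1..maxoff while the read lock is held ... */
    (void) maxoff;

    _bt_relbuf(rel, buf, BT_READ);                  /* unlock + unpin */
}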
+ */ +void +_bt_metaproot(Relation rel, BlockNumber rootbknum) +{ + Buffer metabuf; + Page metap; + BTPageOpaque metaopaque; + BTMetaPageData *metad; + + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE); + metap = BufferGetPage(metabuf); + metaopaque = (BTPageOpaque) PageGetSpecialPointer(metap); + Assert(metaopaque->btpo_flags & BTP_META); + metad = BTPageGetMeta(metap); + metad->btm_root = rootbknum; + _bt_wrtbuf(rel, metabuf); +} + +/* + * _bt_getstackbuf() -- Walk back up the tree one step, and find the item + * we last looked at in the parent. + * + * This is possible because we save a bit image of the last item + * we looked at in the parent, and the update algorithm guarantees + * that if items above us in the tree move, they only move right. + */ +Buffer +_bt_getstackbuf(Relation rel, BTStack stack, int access) +{ + Buffer buf; + BlockNumber blkno; + OffsetNumber start, offnum, maxoff; + OffsetNumber i; + Page page; + ItemId itemid; + BTItem item; + BTPageOpaque opaque; + + blkno = stack->bts_blkno; + buf = _bt_getbuf(rel, blkno, access); + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + maxoff = PageGetMaxOffsetNumber(page); + + if (maxoff >= stack->bts_offset) { + itemid = PageGetItemId(page, stack->bts_offset); + item = (BTItem) PageGetItem(page, itemid); + + /* if the item is where we left it, we're done */ + if (item->bti_oid == stack->bts_btitem->bti_oid) + return (buf); + + /* if the item has just moved right on this page, we're done */ + for (i = OffsetNumberNext(stack->bts_offset); + i <= maxoff; + i = OffsetNumberNext(i)) { + itemid = PageGetItemId(page, i); + item = (BTItem) PageGetItem(page, itemid); + + /* if the item is where we left it, we're done */ + if (item->bti_oid == stack->bts_btitem->bti_oid) + return (buf); + } + } + + /* by here, the item we're looking for moved right at least one page */ + for (;;) { + blkno = opaque->btpo_next; + if (P_RIGHTMOST(opaque)) + elog(FATAL, "my bits moved right off the end of the world!"); + + _bt_relbuf(rel, buf, access); + buf = _bt_getbuf(rel, blkno, access); + page = BufferGetPage(buf); + maxoff = PageGetMaxOffsetNumber(page); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* if we have a right sibling, step over the high key */ + start = P_RIGHTMOST(opaque) ? 
P_HIKEY : P_FIRSTKEY; + + /* see if it's on this page */ + for (offnum = start; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) { + itemid = PageGetItemId(page, offnum); + item = (BTItem) PageGetItem(page, itemid); + if (item->bti_oid == stack->bts_btitem->bti_oid) + return (buf); + } + } +} + +void +_bt_setpagelock(Relation rel, BlockNumber blkno, int access) +{ + ItemPointerData iptr; + + if (USELOCKING) { + ItemPointerSet(&iptr, blkno, P_HIKEY); + + if (access == BT_WRITE) + RelationSetSingleWLockPage(rel, &iptr); + else + RelationSetSingleRLockPage(rel, &iptr); + } +} + +void +_bt_unsetpagelock(Relation rel, BlockNumber blkno, int access) +{ + ItemPointerData iptr; + + if (USELOCKING) { + ItemPointerSet(&iptr, blkno, P_HIKEY); + + if (access == BT_WRITE) + RelationUnsetSingleWLockPage(rel, &iptr); + else + RelationUnsetSingleRLockPage(rel, &iptr); + } +} + +void +_bt_pagedel(Relation rel, ItemPointer tid) +{ + Buffer buf; + Page page; + BlockNumber blkno; + OffsetNumber offno; + + blkno = ItemPointerGetBlockNumber(tid); + offno = ItemPointerGetOffsetNumber(tid); + + buf = _bt_getbuf(rel, blkno, BT_WRITE); + page = BufferGetPage(buf); + + PageIndexTupleDelete(page, offno); + + /* write the buffer and release the lock */ + _bt_wrtbuf(rel, buf); +} diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c new file mode 100644 index 0000000000..0601611996 --- /dev/null +++ b/src/backend/access/nbtree/nbtree.c @@ -0,0 +1,516 @@ +/*------------------------------------------------------------------------- + * + * btree.c-- + * Implementation of Lehman and Yao's btree management algorithm for + * Postgres. + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $ + * + * NOTES + * This file contains only the public interface routines. + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "storage/bufmgr.h" +#include "storage/bufpage.h" + +#include "utils/elog.h" +#include "utils/palloc.h" +#include "utils/rel.h" +#include "utils/excid.h" + +#include "access/heapam.h" +#include "access/genam.h" +#include "access/sdir.h" +#include "access/nbtree.h" +#include "access/funcindex.h" + +#include "nodes/execnodes.h" +#include "nodes/plannodes.h" + +#include "executor/executor.h" +#include "executor/tuptable.h" + +#include "catalog/index.h" + +bool BuildingBtree = false; +bool FastBuild = false; /* turn this on to make bulk builds work*/ + +/* + * btbuild() -- build a new btree index. + * + * We use a global variable to record the fact that we're creating + * a new index. This is used to avoid high-concurrency locking, + * since the index won't be visible until this transaction commits + * and since building is guaranteed to be single-threaded. 
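/*
 * Editor's sketch of the two insertion paths btbuild chooses between
 * in the body below (hypothetical helper): with FastBuild set, tuples
 * are spooled for a bottom-up bulk build; otherwise each tuple is
 * inserted through the ordinary _bt_doinsert path.
 */
static void
build_insert_sketch(Relation index, BTItem btitem, void *spool)
{
    if (FastBuild) {
        _bt_spool(index, btitem, spool);    /* defer: sort and merge later */
    } else {
        InsertIndexResult r = _bt_doinsert(index, btitem);

        if (r)
            pfree(r);
    }
}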
+ */ +void +btbuild(Relation heap, + Relation index, + int natts, + AttrNumber *attnum, + IndexStrategy istrat, + uint16 pcount, + Datum *params, + FuncIndexInfo *finfo, + PredInfo *predInfo) +{ + HeapScanDesc hscan; + Buffer buffer; + HeapTuple htup; + IndexTuple itup; + TupleDesc htupdesc, itupdesc; + Datum *attdata; + bool *nulls; + InsertIndexResult res; + int nhtups, nitups; + int i; + BTItem btitem; + ExprContext *econtext; + TupleTable tupleTable; + TupleTableSlot *slot; + Oid hrelid, irelid; + Node *pred, *oldPred; + void *spool; + + /* note that this is a new btree */ + BuildingBtree = true; + + pred = predInfo->pred; + oldPred = predInfo->oldPred; + + /* initialize the btree index metadata page (if this is a new index) */ + if (oldPred == NULL) + _bt_metapinit(index); + + /* get tuple descriptors for heap and index relations */ + htupdesc = RelationGetTupleDescriptor(heap); + itupdesc = RelationGetTupleDescriptor(index); + + /* get space for data items that'll appear in the index tuple */ + attdata = (Datum *) palloc(natts * sizeof(Datum)); + nulls = (bool *) palloc(natts * sizeof(bool)); + + /* + * If this is a predicate (partial) index, we will need to evaluate the + * predicate using ExecQual, which requires the current tuple to be in a + * slot of a TupleTable. In addition, ExecQual must have an ExprContext + * referring to that slot. Here, we initialize dummy TupleTable and + * ExprContext objects for this purpose. --Nels, Feb '92 + */ +#ifndef OMIT_PARTIAL_INDEX + if (pred != NULL || oldPred != NULL) { + tupleTable = ExecCreateTupleTable(1); + slot = ExecAllocTableSlot(tupleTable); + econtext = makeNode(ExprContext); + FillDummyExprContext(econtext, slot, htupdesc, InvalidBuffer); + } +#endif /* OMIT_PARTIAL_INDEX */ + + /* start a heap scan */ + hscan = heap_beginscan(heap, 0, NowTimeQual, 0, (ScanKey) NULL); + htup = heap_getnext(hscan, 0, &buffer); + + /* build the index */ + nhtups = nitups = 0; + + if (FastBuild) { + spool = _bt_spoolinit(index, 7); + res = (InsertIndexResult) NULL; + } + + for (; HeapTupleIsValid(htup); htup = heap_getnext(hscan, 0, &buffer)) { + + nhtups++; + + /* + * If oldPred != NULL, this is an EXTEND INDEX command, so skip + * this tuple if it was already in the existing partial index + */ + if (oldPred != NULL) { +#ifndef OMIT_PARTIAL_INDEX + + /*SetSlotContents(slot, htup);*/ + slot->val = htup; + if (ExecQual((List*)oldPred, econtext) == true) { + nitups++; + continue; + } +#endif /* OMIT_PARTIAL_INDEX */ + } + + /* Skip this tuple if it doesn't satisfy the partial-index predicate */ + if (pred != NULL) { +#ifndef OMIT_PARTIAL_INDEX + /* SetSlotContents(slot, htup); */ + slot->val = htup; + if (ExecQual((List*)pred, econtext) == false) + continue; +#endif /* OMIT_PARTIAL_INDEX */ + } + + nitups++; + + /* + * For the current heap tuple, extract all the attributes + * we use in this index, and note which are null. + */ + + for (i = 1; i <= natts; i++) { + int attoff; + bool attnull; + + /* + * Offsets are from the start of the tuple, and are + * zero-based; indices are one-based. The next call + * returns i - 1. That's data hiding for you. + */ + + attoff = AttrNumberGetAttrOffset(i); + attdata[attoff] = GetIndexValue(htup, + htupdesc, + attoff, + attnum, + finfo, + &attnull, + buffer); + nulls[attoff] = (attnull ? 'n' : ' '); + } + + /* form an index tuple and point it at the heap tuple */ + itup = index_formtuple(itupdesc, attdata, nulls); + + /* + * If the single index key is null, we don't insert it into + * the index. 
Btrees support scans on <, <=, =, >=, and >. + * Relational algebra says that A op B (where op is one of the + * operators above) returns null if either A or B is null. This + * means that no qualification used in an index scan could ever + * return true on a null attribute. It also means that indices + * can't be used by ISNULL or NOTNULL scans, but that's an + * artifact of the strategy map architecture chosen in 1986, not + * of the way nulls are handled here. + */ + + if (itup->t_info & INDEX_NULL_MASK) { + pfree(itup); + continue; + } + + itup->t_tid = htup->t_ctid; + btitem = _bt_formitem(itup); + + /* + * if we are doing bottom-up btree build, we insert the index + * into a spool page for subsequent processing. otherwise, we + * insert into the btree. + */ + if (FastBuild) { + _bt_spool(index, btitem, spool); + } else { + res = _bt_doinsert(index, btitem); + } + + pfree(btitem); + pfree(itup); + if (res) { + pfree(res); + } + } + + /* okay, all heap tuples are indexed */ + heap_endscan(hscan); + + if (pred != NULL || oldPred != NULL) { +#ifndef OMIT_PARTIAL_INDEX + ExecDestroyTupleTable(tupleTable, true); + pfree(econtext); +#endif /* OMIT_PARTIAL_INDEX */ + } + + /* + * if we are doing bottom-up btree build, we now have a bunch of + * sorted runs in the spool pages. finish the build by (1) + * merging the runs, (2) inserting the sorted tuples into btree + * pages and (3) building the upper levels. + */ + if (FastBuild) { + _bt_spool(index, (BTItem) NULL, spool); /* flush spool */ + _bt_leafbuild(index, spool); + _bt_spooldestroy(spool); + } + + /* + * Since we just counted the tuples in the heap, we update its + * stats in pg_class to guarantee that the planner takes advantage + * of the index we just created. Finally, only update statistics + * during normal index definitions, not for indices on system catalogs + * created during bootstrap processing. We must close the relations + * before updatings statistics to guarantee that the relcache entries + * are flushed when we increment the command counter in UpdateStats(). + */ + if (IsNormalProcessingMode()) + { + hrelid = heap->rd_id; + irelid = index->rd_id; + heap_close(heap); + index_close(index); + UpdateStats(hrelid, nhtups, true); + UpdateStats(irelid, nitups, false); + if (oldPred != NULL) { + if (nitups == nhtups) pred = NULL; + UpdateIndexPredicate(irelid, oldPred, pred); + } + } + + /* be tidy */ + pfree(nulls); + pfree(attdata); + + /* all done */ + BuildingBtree = false; +} + +/* + * btinsert() -- insert an index tuple into a btree. + * + * Descend the tree recursively, find the appropriate location for our + * new tuple, put it there, set its unique OID as appropriate, and + * return an InsertIndexResult to the caller. + */ +InsertIndexResult +btinsert(Relation rel, IndexTuple itup) +{ + BTItem btitem; + InsertIndexResult res; + + if (itup->t_info & INDEX_NULL_MASK) + return ((InsertIndexResult) NULL); + + btitem = _bt_formitem(itup); + + res = _bt_doinsert(rel, btitem); + pfree(btitem); + + return (res); +} + +/* + * btgettuple() -- Get the next tuple in the scan. + */ +char * +btgettuple(IndexScanDesc scan, ScanDirection dir) +{ + RetrieveIndexResult res; + + /* + * If we've already initialized this scan, we can just advance it + * in the appropriate direction. If we haven't done so yet, we + * call a routine to get the first item in the scan. 
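/*
 * Editor's usage sketch for the scan interface above (hypothetical
 * driver; the real callers are the executor and genam).  btgettuple
 * returns a palloc'd RetrieveIndexResult, or NULL when the scan is
 * exhausted; freeing the result with pfree is this sketch's assumption
 * about the caller's duty.
 */
static void
scan_all_forward_sketch(IndexScanDesc scan)
{
    RetrieveIndexResult res;

    while ((res = (RetrieveIndexResult)
                btgettuple(scan, ForwardScanDirection)) != NULL) {
        /* ... consume the heap and index item pointers in res ... */
        pfree(res);
    }
}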
+ */ + + if (ItemPointerIsValid(&(scan->currentItemData))) + res = _bt_next(scan, dir); + else + res = _bt_first(scan, dir); + + return ((char *) res); +} + +/* + * btbeginscan() -- start a scan on a btree index + */ +char * +btbeginscan(Relation rel, bool fromEnd, uint16 keysz, ScanKey scankey) +{ + IndexScanDesc scan; + StrategyNumber strat; + BTScanOpaque so; + + /* first order the keys in the qualification */ + if (keysz > 1) + _bt_orderkeys(rel, &keysz, scankey); + + /* now get the scan */ + scan = RelationGetIndexScan(rel, fromEnd, keysz, scankey); + so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData)); + so->btso_curbuf = so->btso_mrkbuf = InvalidBuffer; + scan->opaque = so; + + /* finally, be sure that the scan exploits the tree order */ + scan->scanFromEnd = false; + scan->flags = 0x0; + if (keysz > 0) { + strat = _bt_getstrat(scan->relation, 1 /* XXX */, + scankey[0].sk_procedure); + + if (strat == BTLessStrategyNumber + || strat == BTLessEqualStrategyNumber) + scan->scanFromEnd = true; + } else { + scan->scanFromEnd = true; + } + + /* register scan in case we change pages it's using */ + _bt_regscan(scan); + + return ((char *) scan); +} + +/* + * btrescan() -- rescan an index relation + */ +void +btrescan(IndexScanDesc scan, bool fromEnd, ScanKey scankey) +{ + ItemPointer iptr; + BTScanOpaque so; + + so = (BTScanOpaque) scan->opaque; + + /* we hold a read lock on the current page in the scan */ + if (ItemPointerIsValid(iptr = &(scan->currentItemData))) { + _bt_relbuf(scan->relation, so->btso_curbuf, BT_READ); + so->btso_curbuf = InvalidBuffer; + ItemPointerSetInvalid(iptr); + } + + /* and we hold a read lock on the last marked item in the scan */ + if (ItemPointerIsValid(iptr = &(scan->currentMarkData))) { + _bt_relbuf(scan->relation, so->btso_mrkbuf, BT_READ); + so->btso_mrkbuf = InvalidBuffer; + ItemPointerSetInvalid(iptr); + } + + /* reset the scan key */ + if (scan->numberOfKeys > 0) { + memmove(scan->keyData, + scankey, + scan->numberOfKeys * sizeof(ScanKeyData)); + } +} + +void +btmovescan(IndexScanDesc scan, Datum v) +{ + ItemPointer iptr; + BTScanOpaque so; + + so = (BTScanOpaque) scan->opaque; + + /* release any locks we still hold */ + if (ItemPointerIsValid(iptr = &(scan->currentItemData))) { + _bt_relbuf(scan->relation, so->btso_curbuf, BT_READ); + so->btso_curbuf = InvalidBuffer; + ItemPointerSetInvalid(iptr); + } + + scan->keyData[0].sk_argument = v; +} + +/* + * btendscan() -- close down a scan + */ +void +btendscan(IndexScanDesc scan) +{ + ItemPointer iptr; + BTScanOpaque so; + + so = (BTScanOpaque) scan->opaque; + + /* release any locks we still hold */ + if (ItemPointerIsValid(iptr = &(scan->currentItemData))) { + if (BufferIsValid(so->btso_curbuf)) + _bt_relbuf(scan->relation, so->btso_curbuf, BT_READ); + so->btso_curbuf = InvalidBuffer; + ItemPointerSetInvalid(iptr); + } + + if (ItemPointerIsValid(iptr = &(scan->currentMarkData))) { + if (BufferIsValid(so->btso_mrkbuf)) + _bt_relbuf(scan->relation, so->btso_mrkbuf, BT_READ); + so->btso_mrkbuf = InvalidBuffer; + ItemPointerSetInvalid(iptr); + } + + /* don't need scan registered anymore */ + _bt_dropscan(scan); + + /* be tidy */ +#ifdef PERFECT_MMGR + pfree (scan->opaque); +#endif /* PERFECT_MMGR */ +} + +/* + * btmarkpos() -- save current scan position + */ +void +btmarkpos(IndexScanDesc scan) +{ + ItemPointer iptr; + BTScanOpaque so; + + so = (BTScanOpaque) scan->opaque; + + /* release lock on old marked data, if any */ + if (ItemPointerIsValid(iptr = &(scan->currentMarkData))) { + _bt_relbuf(scan->relation, 
so->btso_mrkbuf, BT_READ); + so->btso_mrkbuf = InvalidBuffer; + ItemPointerSetInvalid(iptr); + } + + /* bump lock on currentItemData and copy to currentMarkData */ + if (ItemPointerIsValid(&(scan->currentItemData))) { + so->btso_mrkbuf = _bt_getbuf(scan->relation, + BufferGetBlockNumber(so->btso_curbuf), + BT_READ); + scan->currentMarkData = scan->currentItemData; + } +} + +/* + * btrestrpos() -- restore scan to last saved position + */ +void +btrestrpos(IndexScanDesc scan) +{ + ItemPointer iptr; + BTScanOpaque so; + + so = (BTScanOpaque) scan->opaque; + + /* release lock on current data, if any */ + if (ItemPointerIsValid(iptr = &(scan->currentItemData))) { + _bt_relbuf(scan->relation, so->btso_curbuf, BT_READ); + so->btso_curbuf = InvalidBuffer; + ItemPointerSetInvalid(iptr); + } + + /* bump lock on currentMarkData and copy to currentItemData */ + if (ItemPointerIsValid(&(scan->currentMarkData))) { + so->btso_curbuf = _bt_getbuf(scan->relation, + BufferGetBlockNumber(so->btso_mrkbuf), + BT_READ); + + scan->currentItemData = scan->currentMarkData; + } +} + +/* stubs */ +void +btdelete(Relation rel, ItemPointer tid) +{ + /* adjust any active scans that will be affected by this deletion */ + _bt_adjscans(rel, tid); + + /* delete the data from the page */ + _bt_pagedel(rel, tid); +} diff --git a/src/backend/access/nbtree/nbtscan.c b/src/backend/access/nbtree/nbtscan.c new file mode 100644 index 0000000000..62a029bc06 --- /dev/null +++ b/src/backend/access/nbtree/nbtscan.c @@ -0,0 +1,164 @@ +/*------------------------------------------------------------------------- + * + * btscan.c-- + * manage scans on btrees. + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/Attic/nbtscan.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $ + * + * + * NOTES + * Because we can be doing an index scan on a relation while we update + * it, we need to avoid missing data that moves around in the index. + * The routines and global variables in this file guarantee that all + * scans in the local address space stay correctly positioned. This + * is all we need to worry about, since write locking guarantees that + * no one else will be on the same page at the same time as we are. + * + * The scheme is to manage a list of active scans in the current backend. + * Whenever we add or remove records from an index, or whenever we + * split a leaf page, we check the list of active scans to see if any + * has been affected. A scan is affected only if it is on the same + * relation, and the same page, as the update. + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "storage/bufmgr.h" +#include "storage/bufpage.h" + +#include "utils/elog.h" +#include "utils/palloc.h" +#include "utils/rel.h" +#include "utils/excid.h" + +#include "access/heapam.h" +#include "access/genam.h" +#include "access/sdir.h" +#include "access/nbtree.h" + +typedef struct BTScanListData { + IndexScanDesc btsl_scan; + struct BTScanListData *btsl_next; +} BTScanListData; + +typedef BTScanListData *BTScanList; + +static BTScanList BTScans = (BTScanList) NULL; + +/* + * _bt_regscan() -- register a new scan. 
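+ *
+ *	Registration just pushes the scan onto the head of the private
+ *	BTScans list.  A sketch of the pattern (the real code follows
+ *	below):
+ *
+ *		new_el->btsl_scan = scan;
+ *		new_el->btsl_next = BTScans;
+ *		BTScans = new_el;
+ *
+ *	so registering is O(1); _bt_dropscan() pays for it with a
+ *	linear walk of the list.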
+ */ +void +_bt_regscan(IndexScanDesc scan) +{ + BTScanList new_el; + + new_el = (BTScanList) palloc(sizeof(BTScanListData)); + new_el->btsl_scan = scan; + new_el->btsl_next = BTScans; + BTScans = new_el; +} + +/* + * _bt_dropscan() -- drop a scan from the scan list + */ +void +_bt_dropscan(IndexScanDesc scan) +{ + BTScanList chk, last; + + last = (BTScanList) NULL; + for (chk = BTScans; + chk != (BTScanList) NULL && chk->btsl_scan != scan; + chk = chk->btsl_next) { + last = chk; + } + + if (chk == (BTScanList) NULL) + elog(WARN, "btree scan list trashed; can't find 0x%lx", scan); + + if (last == (BTScanList) NULL) + BTScans = chk->btsl_next; + else + last->btsl_next = chk->btsl_next; + +#ifdef PERFECT_MEM + pfree (chk); +#endif /* PERFECT_MEM */ +} + +void +_bt_adjscans(Relation rel, ItemPointer tid) +{ + BTScanList l; + Oid relid; + + relid = rel->rd_id; + for (l = BTScans; l != (BTScanList) NULL; l = l->btsl_next) { + if (relid == l->btsl_scan->relation->rd_id) + _bt_scandel(l->btsl_scan, ItemPointerGetBlockNumber(tid), + ItemPointerGetOffsetNumber(tid)); + } +} + +void +_bt_scandel(IndexScanDesc scan, BlockNumber blkno, OffsetNumber offno) +{ + ItemPointer current; + Buffer buf; + BTScanOpaque so; + + if (!_bt_scantouched(scan, blkno, offno)) + return; + + so = (BTScanOpaque) scan->opaque; + buf = so->btso_curbuf; + + current = &(scan->currentItemData); + if (ItemPointerIsValid(current) + && ItemPointerGetBlockNumber(current) == blkno + && ItemPointerGetOffsetNumber(current) >= offno) { + _bt_step(scan, &buf, BackwardScanDirection); + so->btso_curbuf = buf; + } + + current = &(scan->currentMarkData); + if (ItemPointerIsValid(current) + && ItemPointerGetBlockNumber(current) == blkno + && ItemPointerGetOffsetNumber(current) >= offno) { + ItemPointerData tmp; + tmp = *current; + *current = scan->currentItemData; + scan->currentItemData = tmp; + _bt_step(scan, &buf, BackwardScanDirection); + so->btso_mrkbuf = buf; + tmp = *current; + *current = scan->currentItemData; + scan->currentItemData = tmp; + } +} + +bool +_bt_scantouched(IndexScanDesc scan, BlockNumber blkno, OffsetNumber offno) +{ + ItemPointer current; + + current = &(scan->currentItemData); + if (ItemPointerIsValid(current) + && ItemPointerGetBlockNumber(current) == blkno + && ItemPointerGetOffsetNumber(current) >= offno) + return (true); + + current = &(scan->currentMarkData); + if (ItemPointerIsValid(current) + && ItemPointerGetBlockNumber(current) == blkno + && ItemPointerGetOffsetNumber(current) >= offno) + return (true); + + return (false); +} diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c new file mode 100644 index 0000000000..d7a7fc7d62 --- /dev/null +++ b/src/backend/access/nbtree/nbtsearch.c @@ -0,0 +1,1133 @@ +/*------------------------------------------------------------------------- + * + * btsearch.c-- + * search code for postgres btrees. 
+ * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "storage/bufmgr.h" +#include "storage/bufpage.h" + +#include "utils/elog.h" +#include "utils/palloc.h" +#include "utils/rel.h" +#include "utils/excid.h" + +#include "fmgr.h" + +#include "access/heapam.h" +#include "access/genam.h" +#include "access/skey.h" +#include "access/sdir.h" +#include "access/nbtree.h" + +static BTStack _bt_searchr(Relation rel, int keysz, ScanKey scankey, Buffer *bufP, BTStack stack_in); +static OffsetNumber _bt_firsteq(Relation rel, TupleDesc itupdesc, Page page, Size keysz, ScanKey scankey, OffsetNumber offnum); +static int _bt_compare(Relation rel, TupleDesc itupdesc, Page page, int keysz, ScanKey scankey, OffsetNumber offnum); +static bool _bt_twostep(IndexScanDesc scan, Buffer *bufP, ScanDirection dir); +static RetrieveIndexResult _bt_endpoint(IndexScanDesc scan, ScanDirection dir); + +/* + * _bt_search() -- Search for a scan key in the index. + * + * This routine is actually just a helper that sets things up and + * calls a recursive-descent search routine on the tree. + */ +BTStack +_bt_search(Relation rel, int keysz, ScanKey scankey, Buffer *bufP) +{ + *bufP = _bt_getroot(rel, BT_READ); + return (_bt_searchr(rel, keysz, scankey, bufP, (BTStack) NULL)); +} + +/* + * _bt_searchr() -- Search the tree recursively for a particular scankey. + */ +static BTStack +_bt_searchr(Relation rel, + int keysz, + ScanKey scankey, + Buffer *bufP, + BTStack stack_in) +{ + BTStack stack; + OffsetNumber offnum; + Page page; + BTPageOpaque opaque; + BlockNumber par_blkno; + BlockNumber blkno; + ItemId itemid; + BTItem btitem; + BTItem item_save; + int item_nbytes; + IndexTuple itup; + + /* if this is a leaf page, we're done */ + page = BufferGetPage(*bufP); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + if (opaque->btpo_flags & BTP_LEAF) + return (stack_in); + + /* + * Find the appropriate item on the internal page, and get the child + * page that it points to. + */ + + par_blkno = BufferGetBlockNumber(*bufP); + offnum = _bt_binsrch(rel, *bufP, keysz, scankey, BT_DESCENT); + itemid = PageGetItemId(page, offnum); + btitem = (BTItem) PageGetItem(page, itemid); + itup = &(btitem->bti_itup); + blkno = ItemPointerGetBlockNumber(&(itup->t_tid)); + + /* + * We need to save the bit image of the index entry we chose in the + * parent page on a stack. In case we split the tree, we'll use this + * bit image to figure out what our real parent page is, in case the + * parent splits while we're working lower in the tree. See the paper + * by Lehman and Yao for how this is detected and handled. (We use + * unique OIDs to disambiguate duplicate keys in the index -- Lehman + * and Yao disallow duplicate keys). 
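+ *
+ * To make that concrete, the fields saved just below are (a sketch of
+ * how an insertion would use them; nbtinsert.c owns the details):
+ *
+ *	stack->bts_blkno	-- where we thought the parent was
+ *	stack->bts_offset	-- the downlink we followed
+ *	stack->bts_btitem	-- bit image of that downlink
+ *
+ * If the item at (bts_blkno, bts_offset) no longer matches the saved
+ * image, the parent must have split, and we move right comparing
+ * images until we find the entry again.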
+ */ + + item_nbytes = ItemIdGetLength(itemid); + item_save = (BTItem) palloc(item_nbytes); + memmove((char *) item_save, (char *) btitem, item_nbytes); + stack = (BTStack) palloc(sizeof(BTStackData)); + stack->bts_blkno = par_blkno; + stack->bts_offset = offnum; + stack->bts_btitem = item_save; + stack->bts_parent = stack_in; + + /* drop the read lock on the parent page and acquire one on the child */ + _bt_relbuf(rel, *bufP, BT_READ); + *bufP = _bt_getbuf(rel, blkno, BT_READ); + + /* + * Race -- the page we just grabbed may have split since we read its + * pointer in the parent. If it has, we may need to move right to its + * new sibling. Do that. + */ + + *bufP = _bt_moveright(rel, *bufP, keysz, scankey, BT_READ); + + /* okay, all set to move down a level */ + return (_bt_searchr(rel, keysz, scankey, bufP, stack)); +} + +/* + * _bt_moveright() -- move right in the btree if necessary. + * + * When we drop and reacquire a pointer to a page, it is possible that + * the page has changed in the meanwhile. If this happens, we're + * guaranteed that the page has "split right" -- that is, that any + * data that appeared on the page originally is either on the page + * or strictly to the right of it. + * + * This routine decides whether or not we need to move right in the + * tree by examining the high key entry on the page. If that entry + * is strictly less than one we expect to be on the page, then our + * picture of the page is incorrect and we need to move right. + * + * On entry, we have the buffer pinned and a lock of the proper type. + * If we move right, we release the buffer and lock and acquire the + * same on the right sibling. + */ +Buffer +_bt_moveright(Relation rel, + Buffer buf, + int keysz, + ScanKey scankey, + int access) +{ + Page page; + BTPageOpaque opaque; + ItemId hikey; + ItemId itemid; + BlockNumber rblkno; + + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* if we're on a rightmost page, we don't need to move right */ + if (P_RIGHTMOST(opaque)) + return (buf); + + /* by convention, item 0 on non-rightmost pages is the high key */ + hikey = PageGetItemId(page, P_HIKEY); + + /* + * If the scan key that brought us to this page is >= the high key + * stored on the page, then the page has split and we need to move + * right. + */ + + if (_bt_skeycmp(rel, keysz, scankey, page, hikey, + BTGreaterEqualStrategyNumber)) { + + /* move right as long as we need to */ + do { + /* + * If this page consists of all duplicate keys (hikey and first + * key on the page have the same value), then we don't need to + * step right. + */ + if (PageGetMaxOffsetNumber(page) > P_HIKEY) { + itemid = PageGetItemId(page, P_FIRSTKEY); + if (_bt_skeycmp(rel, keysz, scankey, page, itemid, + BTEqualStrategyNumber)) { + /* break is for the "move right" while loop */ + break; + } + } + + /* step right one page */ + rblkno = opaque->btpo_next; + _bt_relbuf(rel, buf, access); + buf = _bt_getbuf(rel, rblkno, access); + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + hikey = PageGetItemId(page, P_HIKEY); + + } while (! P_RIGHTMOST(opaque) + && _bt_skeycmp(rel, keysz, scankey, page, hikey, + BTGreaterEqualStrategyNumber)); + } + return (buf); +} + +/* + * _bt_skeycmp() -- compare a scan key to a particular item on a page using + * a requested strategy (<, <=, =, >=, >). + * + * We ignore the unique OIDs stored in the btree item here. Those + * numbers are intended for use internally only, in repositioning a + * scan after a page split. 
They do not impose any meaningful ordering. + * + * The comparison is A <op> B, where A is the scan key and B is the + * tuple pointed at by itemid on page. + */ +bool +_bt_skeycmp(Relation rel, + Size keysz, + ScanKey scankey, + Page page, + ItemId itemid, + StrategyNumber strat) +{ + BTItem item; + IndexTuple indexTuple; + TupleDesc tupDes; + ScanKey entry; + int i; + Datum attrDatum; + Datum keyDatum; + bool compare; + bool isNull; + + item = (BTItem) PageGetItem(page, itemid); + indexTuple = &(item->bti_itup); + + tupDes = RelationGetTupleDescriptor(rel); + + /* see if the comparison is true for all of the key attributes */ + for (i=1; i <= keysz; i++) { + + entry = &scankey[i-1]; + attrDatum = index_getattr(indexTuple, + entry->sk_attno, + tupDes, + &isNull); + keyDatum = entry->sk_argument; + + compare = _bt_invokestrat(rel, i, strat, keyDatum, attrDatum); + if (!compare) + return (false); + } + + return (true); +} + +/* + * _bt_binsrch() -- Do a binary search for a key on a particular page. + * + * The scankey we get has the compare function stored in the procedure + * entry of each data struct. We invoke this regproc to do the + * comparison for every key in the scankey. _bt_binsrch() returns + * the OffsetNumber of the first matching key on the page, or the + * OffsetNumber at which the matching key would appear if it were + * on this page. + * + * By the time this procedure is called, we're sure we're looking + * at the right page -- don't need to walk right. _bt_binsrch() has + * no lock or refcount side effects on the buffer. + */ +OffsetNumber +_bt_binsrch(Relation rel, + Buffer buf, + int keysz, + ScanKey scankey, + int srchtype) +{ + TupleDesc itupdesc; + Page page; + BTPageOpaque opaque; + OffsetNumber low, mid, high; + bool match; + int result; + + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* by convention, item 0 on any non-rightmost page is the high key */ + low = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY; + + high = PageGetMaxOffsetNumber(page); + + /* + * Since for non-rightmost pages, the zeroeth item on the page is the + * high key, there are two notions of emptiness. One is if nothing + * appears on the page. The other is if nothing but the high key does. + * The reason we test high <= low, rather than high == low, is that + * after vacuuming there may be nothing *but* the high key on a page. + * In that case, given the scheme above, low = 1 and high = 0. + */ + + if (PageIsEmpty(page) || (! P_RIGHTMOST(opaque) && high <= low)) + return (low); + + itupdesc = RelationGetTupleDescriptor(rel); + match = false; + + while ((high - low) > 1) { + mid = low + ((high - low) / 2); + result = _bt_compare(rel, itupdesc, page, keysz, scankey, mid); + + if (result > 0) + low = mid; + else if (result < 0) + high = mid - 1; + else { + match = true; + break; + } + } + + /* if we found a match, we want to find the first one on the page */ + if (match) { + return (_bt_firsteq(rel, itupdesc, page, keysz, scankey, mid)); + } else { + + /* + * We terminated because the endpoints got too close together. There + * are two cases to take care of. + * + * For non-insertion searches on internal pages, we want to point at + * the last key <, or first key =, the scankey on the page. This + * guarantees that we'll descend the tree correctly. + * + * For all other cases, we want to point at the first key >= + * the scankey on the page. This guarantees that scans and + * insertions will happen correctly. 
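+ *
+ * A worked example (hypothetical page contents): with keys
+ * 10 20 30 40 and a scankey of 25, a BT_DESCENT search on an
+ * internal page returns the offset of 20 (the last key <), while
+ * any other search returns the offset of 30 (the first key >=).
+ * If the page held 10 20 20 30 and the scankey were 20, the match
+ * case above applies and _bt_firsteq() hands back the leftmost 20.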
+ */ + + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + if (!(opaque->btpo_flags & BTP_LEAF) && srchtype == BT_DESCENT) { + + /* + * We want the last key <, or first key ==, the scan key. + */ + + result = _bt_compare(rel, itupdesc, page, keysz, scankey, high); + + if (result == 0) { + return (_bt_firsteq(rel, itupdesc, page, keysz, scankey, high)); + } else if (result > 0) { + return (high); + } else { + return (low); + } + } else { + + /* we want the first key >= the scan key */ + result = _bt_compare(rel, itupdesc, page, keysz, scankey, low); + if (result <= 0) { + return (low); + } else { + if (low == high) + return (OffsetNumberNext(low)); + + result = _bt_compare(rel, itupdesc, page, keysz, scankey, high); + if (result <= 0) + return (high); + else + return (OffsetNumberNext(high)); + } + } + } +} + +static OffsetNumber +_bt_firsteq(Relation rel, + TupleDesc itupdesc, + Page page, + Size keysz, + ScanKey scankey, + OffsetNumber offnum) +{ + BTPageOpaque opaque; + OffsetNumber limit; + + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* skip the high key, if any */ + limit = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY; + + /* walk backwards looking for the first key in the chain of duplicates */ + while (offnum > limit + && _bt_compare(rel, itupdesc, page, + keysz, scankey, OffsetNumberPrev(offnum)) == 0) { + offnum = OffsetNumberPrev(offnum); + } + + return (offnum); +} + +/* + * _bt_compare() -- Compare scankey to a particular tuple on the page. + * + * This routine returns: + * -1 if scankey < tuple at offnum; + * 0 if scankey == tuple at offnum; + * +1 if scankey > tuple at offnum. + * + * In order to avoid having to propagate changes up the tree any time + * a new minimal key is inserted, the leftmost entry on the leftmost + * page is less than all possible keys, by definition. + */ +static int +_bt_compare(Relation rel, + TupleDesc itupdesc, + Page page, + int keysz, + ScanKey scankey, + OffsetNumber offnum) +{ + Datum datum; + BTItem btitem; + ItemId itemid; + IndexTuple itup; + BTPageOpaque opaque; + ScanKey entry; + AttrNumber attno; + int result; + int i; + bool null; + + /* + * If this is a leftmost internal page, and if our comparison is + * with the first key on the page, then the item at that position is + * by definition less than the scan key. + */ + + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + if (!(opaque->btpo_flags & BTP_LEAF) + && P_LEFTMOST(opaque) + && offnum == P_HIKEY) { + itemid = PageGetItemId(page, offnum); + + /* + * we just have to believe that this will only be called with + * offnum == P_HIKEY when P_HIKEY is the OffsetNumber of the + * first actual data key (i.e., this is also a rightmost + * page). there doesn't seem to be any code that implies + * that the leftmost page is normally missing a high key as + * well as the rightmost page. but that implies that this + * code path only applies to the root -- which seems + * unlikely.. + */ + if (! P_RIGHTMOST(opaque)) { + elog(WARN, "_bt_compare: invalid comparison to high key"); + } + + /* + * If the item on the page is equal to the scankey, that's + * okay to admit. We just can't claim that the first key on + * the page is greater than anything. + */ + + if (_bt_skeycmp(rel, keysz, scankey, page, itemid, + BTEqualStrategyNumber)) { + return (0); + } + return (1); + } + + btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum)); + itup = &(btitem->bti_itup); + + /* + * The scan key is set up with the attribute number associated with each + * term in the key. 
It is important that, if the index is multi-key, + * the scan contain the first k key attributes, and that they be in + * order. If you think about how multi-key ordering works, you'll + * understand why this is. + * + * We don't test for violation of this condition here. + */ + + for (i = 1; i <= keysz; i++) { + long tmpres; + + entry = &scankey[i - 1]; + attno = entry->sk_attno; + datum = index_getattr(itup, attno, itupdesc, &null); + tmpres = (long) FMGR_PTR2(entry->sk_func, entry->sk_procedure, + entry->sk_argument, datum); + result = tmpres; + + /* if the keys are unequal, return the difference */ + if (result != 0) + return (result); + } + + /* by here, the keys are equal */ + return (0); +} + +/* + * _bt_next() -- Get the next item in a scan. + * + * On entry, we have a valid currentItemData in the scan, and a + * read lock on the page that contains that item. We do not have + * the page pinned. We return the next item in the scan. On + * exit, we have the page containing the next item locked but not + * pinned. + */ +RetrieveIndexResult +_bt_next(IndexScanDesc scan, ScanDirection dir) +{ + Relation rel; + Buffer buf; + Page page; + OffsetNumber offnum; + RetrieveIndexResult res; + BlockNumber blkno; + ItemPointer current; + ItemPointer iptr; + BTItem btitem; + IndexTuple itup; + BTScanOpaque so; + + rel = scan->relation; + so = (BTScanOpaque) scan->opaque; + current = &(scan->currentItemData); + + /* + * XXX 10 may 91: somewhere there's a bug in our management of the + * cached buffer for this scan. wei discovered it. the following + * is a workaround so he can work until i figure out what's going on. + */ + + if (!BufferIsValid(so->btso_curbuf)) + so->btso_curbuf = _bt_getbuf(rel, ItemPointerGetBlockNumber(current), + BT_READ); + + /* we still have the buffer pinned and locked */ + buf = so->btso_curbuf; + blkno = BufferGetBlockNumber(buf); + + /* step one tuple in the appropriate direction */ + if (!_bt_step(scan, &buf, dir)) + return ((RetrieveIndexResult) NULL); + + /* by here, current is the tuple we want to return */ + offnum = ItemPointerGetOffsetNumber(current); + page = BufferGetPage(buf); + btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum)); + itup = &btitem->bti_itup; + + if (_bt_checkqual(scan, itup)) { + iptr = (ItemPointer) palloc(sizeof(ItemPointerData)); + memmove((char *) iptr, (char *) &(itup->t_tid), + sizeof(ItemPointerData)); + res = FormRetrieveIndexResult(current, iptr); + + /* remember which buffer we have pinned and locked */ + so->btso_curbuf = buf; + } else { + ItemPointerSetInvalid(current); + so->btso_curbuf = InvalidBuffer; + _bt_relbuf(rel, buf, BT_READ); + res = (RetrieveIndexResult) NULL; + } + + return (res); +} + +/* + * _bt_first() -- Find the first item in a scan. + * + * We need to be clever about the type of scan, the operation it's + * performing, and the tree ordering. We return the RetrieveIndexResult + * of the first item in the tree that satisfies the qualification + * associated with the scan descriptor. On exit, the page containing + * the current index tuple is read locked and pinned, and the scan's + * opaque data entry is updated to include the buffer. 
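+ *
+ *	For orientation, the caller's dispatch (a sketch; this is what
+ *	the btree gettuple routine in nbtree.c does):
+ *
+ *		if (ItemPointerIsValid(&(scan->currentItemData)))
+ *			res = _bt_next(scan, dir);
+ *		else
+ *			res = _bt_first(scan, dir);
+ *
+ *	so _bt_first() runs once per scan (or rescan), and every later
+ *	probe goes through _bt_next().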
+ */ +RetrieveIndexResult +_bt_first(IndexScanDesc scan, ScanDirection dir) +{ + Relation rel; + TupleDesc itupdesc; + Buffer buf; + Page page; + BTStack stack; + OffsetNumber offnum, maxoff; + BTItem btitem; + IndexTuple itup; + ItemPointer current; + ItemPointer iptr; + BlockNumber blkno; + StrategyNumber strat; + RetrieveIndexResult res; + RegProcedure proc; + int result; + BTScanOpaque so; + ScanKeyData skdata; + + /* if we just need to walk down one edge of the tree, do that */ + if (scan->scanFromEnd) + return (_bt_endpoint(scan, dir)); + + rel = scan->relation; + itupdesc = RelationGetTupleDescriptor(scan->relation); + current = &(scan->currentItemData); + so = (BTScanOpaque) scan->opaque; + + /* + * Okay, we want something more complicated. What we'll do is use + * the first item in the scan key passed in (which has been correctly + * ordered to take advantage of index ordering) to position ourselves + * at the right place in the scan. + */ + + /* + * XXX -- The attribute number stored in the scan key is the attno + * in the heap relation. We need to transmogrify this into + * the index relation attno here. For the moment, we have + * hardwired attno == 1. + */ + proc = index_getprocid(rel, 1, BTORDER_PROC); + ScanKeyEntryInitialize(&skdata, 0x0, 1, proc, + scan->keyData[0].sk_argument); + + stack = _bt_search(rel, 1, &skdata, &buf); + _bt_freestack(stack); + + /* find the nearest match to the manufactured scan key on the page */ + offnum = _bt_binsrch(rel, buf, 1, &skdata, BT_DESCENT); + page = BufferGetPage(buf); + + /* + * This will happen if the tree we're searching is entirely empty, + * or if we're doing a search for a key that would appear on an + * entirely empty internal page. In either case, there are no + * matching tuples in the index. + */ + + if (PageIsEmpty(page)) { + ItemPointerSetInvalid(current); + so->btso_curbuf = InvalidBuffer; + _bt_relbuf(rel, buf, BT_READ); + return ((RetrieveIndexResult) NULL); + } + + maxoff = PageGetMaxOffsetNumber(page); + + if (offnum > maxoff) + offnum = maxoff; + + blkno = BufferGetBlockNumber(buf); + ItemPointerSet(current, blkno, offnum); + + /* + * Now find the right place to start the scan. Result is the + * value we're looking for minus the value we're looking at + * in the index. 
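+ *
+ * Example (hypothetical qual): for "a > 5", if the binary search
+ * left us on a tuple whose key is 5, result is 0, so the
+ * BTGreaterStrategyNumber arm below steps forward until the
+ * comparison first goes negative -- that is, until the tuple key
+ * exceeds 5 -- and the scan starts there.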
+ */ + + result = _bt_compare(rel, itupdesc, page, 1, &skdata, offnum); + strat = _bt_getstrat(rel, 1, scan->keyData[0].sk_procedure); + + switch (strat) { + case BTLessStrategyNumber: + if (result <= 0) { + do { + if (!_bt_twostep(scan, &buf, BackwardScanDirection)) + break; + + offnum = ItemPointerGetOffsetNumber(current); + page = BufferGetPage(buf); + result = _bt_compare(rel, itupdesc, page, 1, &skdata, offnum); + } while (result <= 0); + + /* if this is true, the key we just looked at is gone */ + if (result > 0) + (void) _bt_twostep(scan, &buf, ForwardScanDirection); + } + break; + + case BTLessEqualStrategyNumber: + if (result >= 0) { + do { + if (!_bt_twostep(scan, &buf, ForwardScanDirection)) + break; + + offnum = ItemPointerGetOffsetNumber(current); + page = BufferGetPage(buf); + result = _bt_compare(rel, itupdesc, page, 1, &skdata, offnum); + } while (result >= 0); + + if (result < 0) + (void) _bt_twostep(scan, &buf, BackwardScanDirection); + } + break; + + case BTEqualStrategyNumber: + if (result != 0) { + _bt_relbuf(scan->relation, buf, BT_READ); + so->btso_curbuf = InvalidBuffer; + ItemPointerSetInvalid(&(scan->currentItemData)); + return ((RetrieveIndexResult) NULL); + } + break; + + case BTGreaterEqualStrategyNumber: + if (result < 0) { + do { + if (!_bt_twostep(scan, &buf, BackwardScanDirection)) + break; + + page = BufferGetPage(buf); + offnum = ItemPointerGetOffsetNumber(current); + result = _bt_compare(rel, itupdesc, page, 1, &skdata, offnum); + } while (result < 0); + + if (result > 0) + (void) _bt_twostep(scan, &buf, ForwardScanDirection); + } + break; + + case BTGreaterStrategyNumber: + if (result >= 0) { + do { + if (!_bt_twostep(scan, &buf, ForwardScanDirection)) + break; + + offnum = ItemPointerGetOffsetNumber(current); + page = BufferGetPage(buf); + result = _bt_compare(rel, itupdesc, page, 1, &skdata, offnum); + } while (result >= 0); + } + break; + } + + /* okay, current item pointer for the scan is right */ + offnum = ItemPointerGetOffsetNumber(current); + page = BufferGetPage(buf); + btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum)); + itup = &btitem->bti_itup; + + if (_bt_checkqual(scan, itup)) { + iptr = (ItemPointer) palloc(sizeof(ItemPointerData)); + memmove((char *) iptr, (char *) &(itup->t_tid), + sizeof(ItemPointerData)); + res = FormRetrieveIndexResult(current, iptr); + pfree(iptr); + + /* remember which buffer we have pinned */ + so->btso_curbuf = buf; + } else { + ItemPointerSetInvalid(current); + so->btso_curbuf = InvalidBuffer; + _bt_relbuf(rel, buf, BT_READ); + res = (RetrieveIndexResult) NULL; + } + + return (res); +} + +/* + * _bt_step() -- Step one item in the requested direction in a scan on + * the tree. + * + * If no adjacent record exists in the requested direction, return + * false. Else, return true and set the currentItemData for the + * scan to the right thing. 
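+ *
+ *	A usage sketch (not code in this file): draining the rest of a
+ *	scan forward is just
+ *
+ *		while (_bt_step(scan, &buf, ForwardScanDirection))
+ *			;
+ *
+ *	each true return leaves currentItemData pointing at the next
+ *	tuple; the false return has already released the buffer and
+ *	invalidated currentItemData.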
+ */
+bool
+_bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
+{
+    Page page;
+    BTPageOpaque opaque;
+    OffsetNumber offnum, maxoff;
+    OffsetNumber start;
+    BlockNumber blkno;
+    BlockNumber obknum;
+    BTScanOpaque so;
+    ItemPointer current;
+    Relation rel;
+
+    rel = scan->relation;
+    current = &(scan->currentItemData);
+    offnum = ItemPointerGetOffsetNumber(current);
+    page = BufferGetPage(*bufP);
+    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+    so = (BTScanOpaque) scan->opaque;
+    maxoff = PageGetMaxOffsetNumber(page);
+
+    /* get the next tuple */
+    if (ScanDirectionIsForward(dir)) {
+	if (!PageIsEmpty(page) && offnum < maxoff) {
+	    offnum = OffsetNumberNext(offnum);
+	} else {
+
+	    /* if we're at end of scan, release the buffer and return */
+	    blkno = opaque->btpo_next;
+	    if (P_RIGHTMOST(opaque)) {
+		_bt_relbuf(rel, *bufP, BT_READ);
+		ItemPointerSetInvalid(current);
+		*bufP = so->btso_curbuf = InvalidBuffer;
+		return (false);
+	    } else {
+
+		/* walk right to the next page with data */
+		_bt_relbuf(rel, *bufP, BT_READ);
+		for (;;) {
+		    *bufP = _bt_getbuf(rel, blkno, BT_READ);
+		    page = BufferGetPage(*bufP);
+		    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+		    maxoff = PageGetMaxOffsetNumber(page);
+		    start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
+
+		    if (!PageIsEmpty(page) && start <= maxoff) {
+			break;
+		    } else {
+			blkno = opaque->btpo_next;
+			_bt_relbuf(rel, *bufP, BT_READ);
+			if (blkno == P_NONE) {
+			    *bufP = so->btso_curbuf = InvalidBuffer;
+			    ItemPointerSetInvalid(current);
+			    return (false);
+			}
+		    }
+		}
+		offnum = start;
+	    }
+	}
+    } else if (ScanDirectionIsBackward(dir)) {
+
+	/* remember that high key is item zero on non-rightmost pages */
+	start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
+
+	if (offnum > start) {
+	    offnum = OffsetNumberPrev(offnum);
+	} else {
+
+	    /* if we're at end of scan, release the buffer and return */
+	    blkno = opaque->btpo_prev;
+	    if (P_LEFTMOST(opaque)) {
+		_bt_relbuf(rel, *bufP, BT_READ);
+		*bufP = so->btso_curbuf = InvalidBuffer;
+		ItemPointerSetInvalid(current);
+		return (false);
+	    } else {
+
+		obknum = BufferGetBlockNumber(*bufP);
+
+		/* walk left to the previous page with data */
+		_bt_relbuf(rel, *bufP, BT_READ);
+		for (;;) {
+		    *bufP = _bt_getbuf(rel, blkno, BT_READ);
+		    page = BufferGetPage(*bufP);
+		    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+		    maxoff = PageGetMaxOffsetNumber(page);
+
+		    /*
+		     * If the adjacent page just split, then we may have the
+		     * wrong block. Handle this case. Because pages only
+		     * split right, we don't have to worry about this failing
+		     * to terminate.
+		     */
+
+		    while (opaque->btpo_next != obknum) {
+			blkno = opaque->btpo_next;
+			_bt_relbuf(rel, *bufP, BT_READ);
+			*bufP = _bt_getbuf(rel, blkno, BT_READ);
+			page = BufferGetPage(*bufP);
+			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+			maxoff = PageGetMaxOffsetNumber(page);
+		    }
+
+		    /* don't consider the high key */
+		    start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
+
+		    /* anything to look at here? */
+		    if (!PageIsEmpty(page) && maxoff >= start) {
+			break;
+		    } else {
+			blkno = opaque->btpo_prev;
+			obknum = BufferGetBlockNumber(*bufP);
+			_bt_relbuf(rel, *bufP, BT_READ);
+			if (blkno == P_NONE) {
+			    *bufP = so->btso_curbuf = InvalidBuffer;
+			    ItemPointerSetInvalid(current);
+			    return (false);
+			}
+		    }
+		}
+		offnum = maxoff;	/* XXX PageIsEmpty?
*/
+	    }
+	}
+    }
+    blkno = BufferGetBlockNumber(*bufP);
+    so->btso_curbuf = *bufP;
+    ItemPointerSet(current, blkno, offnum);
+
+    return (true);
+}
+
+/*
+ * _bt_twostep() -- Move to an adjacent record in a scan on the tree,
+ *	if an adjacent record exists.
+ *
+ *	This is like _bt_step, except that if no adjacent record exists
+ *	it restores us to where we were before trying the step. This is
+ *	only hairy when you cross page boundaries, since the page you cross
+ *	from could have records inserted or deleted, or could even split.
+ *	This is unlikely, but we try to handle it correctly here anyway.
+ *
+ *	This routine contains the only case in which our changes to
+ *	Lehman and Yao's algorithm can be defeated by concurrent
+ *	activity: if we cannot re-find our place, we give up (see the
+ *	elog() at the bottom).
+ *
+ *	Like step, this routine leaves the scan's currentItemData in the
+ *	proper state and acquires a lock and pin on *bufP. If the twostep
+ *	succeeded, we return true; otherwise, we return false.
+ */
+static bool
+_bt_twostep(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
+{
+    Page page;
+    BTPageOpaque opaque;
+    OffsetNumber offnum, maxoff;
+    OffsetNumber start;
+    ItemPointer current;
+    ItemId itemid;
+    int itemsz;
+    BTItem btitem;
+    BTItem svitem;
+    BlockNumber blkno;
+
+    blkno = BufferGetBlockNumber(*bufP);
+    page = BufferGetPage(*bufP);
+    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+    maxoff = PageGetMaxOffsetNumber(page);
+    current = &(scan->currentItemData);
+    offnum = ItemPointerGetOffsetNumber(current);
+
+    start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
+
+    /* if we're safe, just do it */
+    if (ScanDirectionIsForward(dir) && offnum < maxoff) { /* XXX PageIsEmpty? */
+	ItemPointerSet(current, blkno, OffsetNumberNext(offnum));
+	return (true);
+    } else if (ScanDirectionIsBackward(dir) && offnum > start) {
+	ItemPointerSet(current, blkno, OffsetNumberPrev(offnum));
+	return (true);
+    }
+
+    /* if we've hit end of scan we don't have to do any work */
+    if (ScanDirectionIsForward(dir) && P_RIGHTMOST(opaque)) {
+	return (false);
+    } else if (ScanDirectionIsBackward(dir) && P_LEFTMOST(opaque)) {
+	return (false);
+    }
+
+    /*
+     * Okay, it's off the page; let _bt_step() do the hard work, and we'll
+     * try to remember where we were. This is not guaranteed to work; this
+     * is the only place in the code where concurrency can screw us up,
+     * and it's because we want to be able to move in two directions in
+     * the scan.
+     */
+
+    itemid = PageGetItemId(page, offnum);
+    itemsz = ItemIdGetLength(itemid);
+    btitem = (BTItem) PageGetItem(page, itemid);
+    svitem = (BTItem) palloc(itemsz);
+    memmove((char *) svitem, (char *) btitem, itemsz);
+
+    if (_bt_step(scan, bufP, dir)) {
+	pfree(svitem);
+	return (true);
+    }
+
+    /* try to find our place again */
+    *bufP = _bt_getbuf(scan->relation, blkno, BT_READ);
+    page = BufferGetPage(*bufP);
+    maxoff = PageGetMaxOffsetNumber(page);
+
+    while (offnum <= maxoff) {
+	itemid = PageGetItemId(page, offnum);
+	btitem = (BTItem) PageGetItem(page, itemid);
+	if (btitem->bti_oid == svitem->bti_oid) {
+	    pfree(svitem);
+	    ItemPointerSet(current, blkno, offnum);
+	    return (false);
+	}
+	offnum = OffsetNumberNext(offnum);	/* advance, or we never terminate */
+    }
+
+    /*
+     * XXX crash and burn -- can't find our place. We can be a little
+     * smarter -- walk to the next page to the right, for example, since
+     * that's the only direction that splits happen in. Deletions screw
+     * us up less often since they're only done by the vacuum daemon.
+     */
+
+    elog(WARN, "btree synchronization error: concurrent update botched scan");
+
+    return (false);
+}
+
+/*
+ * _bt_endpoint() -- Find the first or last key in the index.
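+ *
+ *	In outline (a sketch of the descent coded below):
+ *
+ *		buf = _bt_getroot(rel, BT_READ);
+ *		while (page is not a leaf)
+ *			follow the first downlink (forward scans) or
+ *			the last downlink (backward scans);
+ *
+ *	with an extra move-right pass on backward scans, because a
+ *	concurrent split can leave us short of the true right edge.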
+ */ +static RetrieveIndexResult +_bt_endpoint(IndexScanDesc scan, ScanDirection dir) +{ + Relation rel; + Buffer buf; + Page page; + BTPageOpaque opaque; + ItemPointer current; + ItemPointer iptr; + OffsetNumber offnum, maxoff; + OffsetNumber start; + BlockNumber blkno; + BTItem btitem; + IndexTuple itup; + BTScanOpaque so; + RetrieveIndexResult res; + + rel = scan->relation; + current = &(scan->currentItemData); + + buf = _bt_getroot(rel, BT_READ); + blkno = BufferGetBlockNumber(buf); + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + for (;;) { + if (opaque->btpo_flags & BTP_LEAF) + break; + + if (ScanDirectionIsForward(dir)) { + offnum = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY; + } else { + offnum = PageGetMaxOffsetNumber(page); + } + + btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum)); + itup = &(btitem->bti_itup); + + blkno = ItemPointerGetBlockNumber(&(itup->t_tid)); + + _bt_relbuf(rel, buf, BT_READ); + buf = _bt_getbuf(rel, blkno, BT_READ); + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* + * Race condition: If the child page we just stepped onto is + * in the process of being split, we need to make sure we're + * all the way at the right edge of the tree. See the paper + * by Lehman and Yao. + */ + + if (ScanDirectionIsBackward(dir) && ! P_RIGHTMOST(opaque)) { + do { + blkno = opaque->btpo_next; + _bt_relbuf(rel, buf, BT_READ); + buf = _bt_getbuf(rel, blkno, BT_READ); + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + } while (! P_RIGHTMOST(opaque)); + } + } + + /* okay, we've got the {left,right}-most page in the tree */ + maxoff = PageGetMaxOffsetNumber(page); + + if (ScanDirectionIsForward(dir)) { + if (PageIsEmpty(page)) { + maxoff = FirstOffsetNumber; + } else { + maxoff = PageGetMaxOffsetNumber(page); + } + start = P_RIGHTMOST(opaque) ? 
P_HIKEY : P_FIRSTKEY; + + if (PageIsEmpty(page) || start > maxoff) { + ItemPointerSet(current, blkno, maxoff); + if (!_bt_step(scan, &buf, BackwardScanDirection)) + return ((RetrieveIndexResult) NULL); + + start = ItemPointerGetOffsetNumber(current); + page = BufferGetPage(buf); + } else { + ItemPointerSet(current, blkno, start); + } + } else if (ScanDirectionIsBackward(dir)) { + if (PageIsEmpty(page)) { + ItemPointerSet(current, blkno, FirstOffsetNumber); + if (!_bt_step(scan, &buf, ForwardScanDirection)) + return ((RetrieveIndexResult) NULL); + + start = ItemPointerGetOffsetNumber(current); + page = BufferGetPage(buf); + } else { + start = PageGetMaxOffsetNumber(page); + ItemPointerSet(current, blkno, start); + } + } else { + elog(WARN, "Illegal scan direction %d", dir); + } + + btitem = (BTItem) PageGetItem(page, PageGetItemId(page, start)); + itup = &(btitem->bti_itup); + + /* see if we picked a winner */ + if (_bt_checkqual(scan, itup)) { + iptr = (ItemPointer) palloc(sizeof(ItemPointerData)); + memmove((char *) iptr, (char *) &(itup->t_tid), + sizeof(ItemPointerData)); + res = FormRetrieveIndexResult(current, iptr); + + /* remember which buffer we have pinned */ + so = (BTScanOpaque) scan->opaque; + so->btso_curbuf = buf; + } else { + _bt_relbuf(rel, buf, BT_READ); + res = (RetrieveIndexResult) NULL; + } + + return (res); +} diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c new file mode 100644 index 0000000000..3d2676324a --- /dev/null +++ b/src/backend/access/nbtree/nbtsort.c @@ -0,0 +1,1196 @@ +/*------------------------------------------------------------------------- + * btsort.c-- + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Id: nbtsort.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $ + * + * NOTES + * + * what we do is: + * - generate a set of initial one-block runs, distributed round-robin + * between the output tapes. + * - for each pass, + * - swap input and output tape sets, rewinding both and truncating + * the output tapes. + * - merge the current run in each input tape to the current output + * tape. + * - when each input run has been exhausted, switch to another output + * tape and start processing another run. + * - when we have fewer runs than tapes, we know we are ready to start + * merging into the btree leaf pages. + * - every time we complete a level of the btree, we can construct the + * next level up. when we have only one page on a level, it can be + * attached to the btree metapage and we are done. + * + * conventions: + * - external interface routines take in and return "void *" for their + * opaque handles. this is for modularity reasons (i prefer not to + * export these structures without good reason). + * + * this code is moderately slow (~10% slower) compared to the regular + * btree (insertion) build code on sorted or well-clustered data. on + * random data, however, the insertion build code is unusable -- the + * difference on a 60MB heap is a factor of 15 because the random + * probes into the btree thrash the buffer pool. + * + * this code currently packs the pages to 100% of capacity. this is + * not wise, since *any* insertion will cause splitting. filling to + * something like the standard 70% steady-state load factor for btrees + * would probably be better. + * + * somebody desperately needs to figure out how to do a better job of + * balancing the merge passes -- the fan-in on the final merges can be + * pretty poor, which is bad for performance. 
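+ *
+ * (for scale, a hedged estimate: a balanced merge with T input tapes
+ * multiplies run length by T on every pass, so r initial one-block
+ * runs need roughly log-base-T of r passes before the final merge
+ * into leaf pages; at the MAXTAPES value of 7 used below, a million
+ * initial runs collapses in about seven passes.)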
+ *------------------------------------------------------------------------- + */ + +#include <stdio.h> + +#include "c.h" + +#include "access/nbtree.h" + +#include "storage/bufmgr.h" +#include "storage/fd.h" +#include "utils/rel.h" +#include "utils/palloc.h" +#include "utils/elog.h" + +/*#define FASTBUILD_DEBUG*/ /* turn on debugging output */ + +#define FASTBUILD + +#ifdef FASTBUILD + +#define MAXTAPES (7) +#define TAPEBLCKSZ (BLCKSZ << 2) +#define TAPETEMP "pg_btsortXXXXXX" + + +/*------------------------------------------------------------------------- + * sorting comparison routine - returns {-1,0,1} depending on whether + * the key in the left BTItem is {<,=,>} the key in the right BTItem. + * + * we want to use _bt_isortcmp as a comparison function for qsort(3), + * but it needs extra arguments, so we "pass them in" as global + * variables. ick. fortunately, they are the same throughout the + * build, so we need do this only once. this is why you must call + * _bt_isortcmpinit before the call to qsort(3). + * + * a NULL BTItem is always assumed to be greater than any actual + * value; our heap routines (see below) assume that the smallest + * element in the heap is returned. that way, NULL values from the + * exhausted tapes can sift down to the bottom of the heap. in point + * of fact we just don't replace the elements of exhausted tapes, but + * what the heck. + * *------------------------------------------------------------------------- + */ +static Relation _bt_sortrel; + +static void +_bt_isortcmpinit(Relation index) +{ + _bt_sortrel = index; +} + +static int +_bt_isortcmp(BTItem *bti1p, BTItem *bti2p) +{ + BTItem bti1 = *bti1p; + BTItem bti2 = *bti2p; + + if (bti1 == (BTItem) NULL) { + if (bti2 == (BTItem) NULL) { + return(0); /* 1 = 2 */ + } + return(1); /* 1 > 2 */ + } else if (bti2 == (BTItem) NULL) { + return(-1); /* 1 < 2 */ + } else if (_bt_itemcmp(_bt_sortrel, 1, bti1, bti2, + BTGreaterStrategyNumber)) { + return(1); /* 1 > 2 */ + } else if (_bt_itemcmp(_bt_sortrel, 1, bti2, bti1, + BTGreaterStrategyNumber)) { + return(-1); /* 1 < 2 */ + } + return(0); /* 1 = 2 */ +} + +/*------------------------------------------------------------------------- + * priority queue methods + * + * these were more-or-less lifted from the heap section of the 1984 + * edition of gonnet's book on algorithms and data structures. they + * are coded so that the smallest element in the heap is returned (we + * use them for merging sorted runs). + * + * XXX these probably ought to be generic library functions. 
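+ *
+ * usage sketch (hedged; this is the shape of the merge loop in
+ * _bt_merge below): prime the queue with one element per live tape
+ * via _bt_pqadd, then
+ *
+ *	while (_bt_pqnext(&q, &e) >= 0)
+ *		write out e.btpqe_item and replace it with the next
+ *		item from tape e.btpqe_tape, if any;
+ *
+ * exhausted tapes simply stop contributing replacements.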
+ *------------------------------------------------------------------------- + */ + +typedef struct { + int btpqe_tape; /* tape identifier */ + BTItem btpqe_item; /* pointer to BTItem in tape buffer */ +} BTPriQueueElem; + +#define MAXELEM MAXTAPES +typedef struct { + int btpq_nelem; + BTPriQueueElem btpq_queue[MAXELEM]; + Relation btpq_rel; +} BTPriQueue; + +/* be sure to call _bt_isortcmpinit first */ +#define GREATER(a, b) \ + (_bt_isortcmp(&((a)->btpqe_item), &((b)->btpqe_item)) > 0) + +static void +_bt_pqsift(BTPriQueue *q, int parent) +{ + int child; + BTPriQueueElem e; + + for (child = parent * 2 + 1; + child < q->btpq_nelem; + child = parent * 2 + 1) { + if (child < q->btpq_nelem - 1) { + if (GREATER(&(q->btpq_queue[child]), &(q->btpq_queue[child+1]))) { + ++child; + } + } + if (GREATER(&(q->btpq_queue[parent]), &(q->btpq_queue[child]))) { + e = q->btpq_queue[child]; /* struct = */ + q->btpq_queue[child] = q->btpq_queue[parent]; /* struct = */ + q->btpq_queue[parent] = e; /* struct = */ + parent = child; + } else { + parent = child + 1; + } + } +} + +static int +_bt_pqnext(BTPriQueue *q, BTPriQueueElem *e) +{ + if (q->btpq_nelem < 1) { /* already empty */ + return(-1); + } + *e = q->btpq_queue[0]; /* struct = */ + + if (--q->btpq_nelem < 1) { /* now empty, don't sift */ + return(0); + } + q->btpq_queue[0] = q->btpq_queue[q->btpq_nelem]; /* struct = */ + _bt_pqsift(q, 0); + return(0); +} + +static void +_bt_pqadd(BTPriQueue *q, BTPriQueueElem *e) +{ + int child, parent; + + if (q->btpq_nelem >= MAXELEM) { + elog(WARN, "_bt_pqadd: queue overflow"); + } + + child = q->btpq_nelem++; + while (child > 0) { + parent = child / 2; + if (GREATER(e, &(q->btpq_queue[parent]))) { + break; + } else { + q->btpq_queue[child] = q->btpq_queue[parent]; /* struct = */ + child = parent; + } + } + + q->btpq_queue[child] = *e; /* struct = */ +} + +/*------------------------------------------------------------------------- + * tape methods + *------------------------------------------------------------------------- + */ + +#define BTITEMSZ(btitem) \ + ((btitem) ? \ + (IndexTupleDSize((btitem)->bti_itup) + \ + (sizeof(BTItemData) - sizeof(IndexTupleData))) : \ + 0) +#define SPCLEFT(tape) \ + (sizeof((tape)->bttb_data) - (tape)->bttb_top) +#define EMPTYTAPE(tape) \ + ((tape)->bttb_ntup <= 0) +#define BTTAPEMAGIC 0x19660226 + +/* + * this is what we use to shovel BTItems in and out of memory. it's + * bigger than a standard block because we are doing a lot of strictly + * sequential i/o. this is obviously something of a tradeoff since we + * are potentially reading a bunch of zeroes off of disk in many + * cases. + * + * BTItems are packed in and DOUBLEALIGN'd. + * + * the fd should not be going out to disk, strictly speaking, but it's + * the only thing like that so i'm not going to worry about wasting a + * few bytes. + */ +typedef struct { + int bttb_magic; /* magic number */ + int bttb_fd; /* file descriptor */ + int bttb_top; /* top of free space within bttb_data */ + short bttb_ntup; /* number of tuples in this block */ + short bttb_eor; /* End-Of-Run marker */ + char bttb_data[TAPEBLCKSZ - 2 * sizeof(double)]; +} BTTapeBlock; + + +/* + * reset the tape header for its next use without doing anything to + * the physical tape file. (setting bttb_top to 0 makes the block + * empty.) + */ +static void +_bt_tapereset(BTTapeBlock *tape) +{ + tape->bttb_eor = 0; + tape->bttb_top = 0; + tape->bttb_ntup = 0; +} + +/* + * rewind the physical tape file. 
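+ *
+ * note that this only repositions the VFD; it deliberately leaves the
+ * in-memory block alone, so callers that also want an empty buffer
+ * pair it with _bt_tapereset() (as _bt_spoolswap() does below).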
+ */ +static void +_bt_taperewind(BTTapeBlock *tape) +{ + (void) FileSeek(tape->bttb_fd, 0, SEEK_SET); +} + +/* + * destroy the contents of the physical tape file without destroying + * the tape data structure or removing the physical tape file. + * + * we use the VFD version of ftruncate(2) to do this rather than + * unlinking and recreating the file. you still have to wait while + * the OS frees up all of the file system blocks and stuff, but at + * least you don't have to delete and reinsert the directory entries. + */ +static void +_bt_tapeclear(BTTapeBlock *tape) +{ + /* blow away the contents of the old file */ + _bt_taperewind(tape); +#if 0 + FileSync(tape->bttb_fd); +#endif + FileTruncate(tape->bttb_fd, 0); + + /* reset the buffer */ + _bt_tapereset(tape); +} + +/* + * create a new BTTapeBlock, allocating memory for the data structure + * as well as opening a physical tape file. + */ +static BTTapeBlock * +_bt_tapecreate(char *fname) +{ + BTTapeBlock *tape = (BTTapeBlock *) palloc(sizeof(BTTapeBlock)); + + if (tape == (BTTapeBlock *) NULL) { + elog(WARN, "_bt_tapecreate: out of memory"); + } + + tape->bttb_magic = BTTAPEMAGIC; + + tape->bttb_fd = FileNameOpenFile(fname, O_RDWR|O_CREAT|O_TRUNC, 0600); + Assert(tape->bttb_fd >= 0); + + /* initialize the buffer */ + _bt_tapereset(tape); + + return(tape); +} + +/* + * destroy the BTTapeBlock structure and its physical tape file. + */ +static void +_bt_tapedestroy(BTTapeBlock *tape) +{ + FileUnlink(tape->bttb_fd); + pfree((void *) tape); +} + +/* + * flush the tape block to the file, marking End-Of-Run if requested. + */ +static void +_bt_tapewrite(BTTapeBlock *tape, int eor) +{ + tape->bttb_eor = eor; + FileWrite(tape->bttb_fd, (char*)tape, TAPEBLCKSZ); + _bt_tapereset(tape); +} + +/* + * read a tape block from the file, overwriting the current contents + * of the buffer. + * + * returns: + * - 0 if there are no more blocks in the tape or in this run (call + * _bt_tapereset to clear the End-Of-Run marker) + * - 1 if a valid block was read + */ +static int +_bt_taperead(BTTapeBlock *tape) +{ + int fd; + int nread; + + if (tape->bttb_eor) { + return(0); /* we are at End-Of-Run */ + } + + /* + * we're clobbering the old tape block, but we do need to save the + * VFD (the one in the block we're reading is bogus). + */ + fd = tape->bttb_fd; + nread = FileRead(fd, (char*) tape, TAPEBLCKSZ); + tape->bttb_fd = fd; + + if (nread != TAPEBLCKSZ) { + Assert(nread == 0); /* we are at EOF */ + return(0); + } + Assert(tape->bttb_magic == BTTAPEMAGIC); + return(1); +} + +/* + * get the next BTItem from a tape block. + * + * returns: + * - NULL if we have run out of BTItems + * - a pointer to the BTItemData in the block otherwise + * + * side effects: + * - sets 'pos' to the current position within the block. + */ +static BTItem +_bt_tapenext(BTTapeBlock *tape, char **pos) +{ + Size itemsz; + BTItem bti; + + if (*pos >= tape->bttb_data + tape->bttb_top) { + return((BTItem) NULL); + } + bti = (BTItem) *pos; + itemsz = BTITEMSZ(bti); + *pos += DOUBLEALIGN(itemsz); + return(bti); +} + +/* + * copy a BTItem into a tape block. + * + * assumes that we have already checked to see if the block has enough + * space for the item. + * + * side effects: + * + * - advances the 'top' pointer in the tape block header to point to + * the beginning of free space. 
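+ *
+ * a hedged size example: with 8-byte alignment, a 20-byte BTItem
+ * consumes DOUBLEALIGN(20) = 24 bytes of bttb_data, so bttb_top stays
+ * aligned and _bt_tapenext() can walk the block with nothing more
+ * than pointer arithmetic.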
+ */ +static void +_bt_tapeadd(BTTapeBlock *tape, BTItem item, int itemsz) +{ + (void) memcpy(tape->bttb_data + tape->bttb_top, item, itemsz); + ++tape->bttb_ntup; + tape->bttb_top += DOUBLEALIGN(itemsz); +} + +/*------------------------------------------------------------------------- + * spool methods + *------------------------------------------------------------------------- + */ + +/* + * this structure holds the bookkeeping for a simple balanced multiway + * merge. (polyphase merging is hairier than i want to get into right + * now, and i don't see why i have to care how many "tapes" i use + * right now. though if psort was in a condition that i could hack it + * to do this, you bet i would.) + */ +typedef struct { + int bts_ntapes; + int bts_tape; + BTTapeBlock **bts_itape; /* input tape blocks */ + BTTapeBlock **bts_otape; /* output tape blocks */ +} BTSpool; + +/* + * create and initialize a spool structure, including the underlying + * files. + */ +void * +_bt_spoolinit(Relation index, int ntapes) +{ + char *mktemp(); + + BTSpool *btspool = (BTSpool *) palloc(sizeof(BTSpool)); + int i; + char *fname = (char *) palloc(sizeof(TAPETEMP) + 1); + + if (btspool == (BTSpool *) NULL || fname == (char *) NULL) { + elog(WARN, "_bt_spoolinit: out of memory"); + } + (void) memset((char *) btspool, 0, sizeof(BTSpool)); + btspool->bts_ntapes = ntapes; + btspool->bts_tape = 0; + + btspool->bts_itape = + (BTTapeBlock **) palloc(sizeof(BTTapeBlock *) * ntapes); + btspool->bts_otape = + (BTTapeBlock **) palloc(sizeof(BTTapeBlock *) * ntapes); + if (btspool->bts_itape == (BTTapeBlock **) NULL || + btspool->bts_otape == (BTTapeBlock **) NULL) { + elog(WARN, "_bt_spoolinit: out of memory"); + } + + for (i = 0; i < ntapes; ++i) { + btspool->bts_itape[i] = + _bt_tapecreate(mktemp(strcpy(fname, TAPETEMP))); + btspool->bts_otape[i] = + _bt_tapecreate(mktemp(strcpy(fname, TAPETEMP))); + } + pfree((void *) fname); + + _bt_isortcmpinit(index); + + return((void *) btspool); +} + +/* + * clean up a spool structure and its substructures. + */ +void +_bt_spooldestroy(void *spool) +{ + BTSpool *btspool = (BTSpool *) spool; + int i; + + for (i = 0; i < btspool->bts_ntapes; ++i) { + _bt_tapedestroy(btspool->bts_otape[i]); + _bt_tapedestroy(btspool->bts_itape[i]); + } + pfree((void *) btspool); +} + +/* + * flush out any dirty output tape blocks + */ +static void +_bt_spoolflush(BTSpool *btspool) +{ + int i; + + for (i = 0; i < btspool->bts_ntapes; ++i) { + if (!EMPTYTAPE(btspool->bts_otape[i])) { + _bt_tapewrite(btspool->bts_otape[i], 1); + } + } +} + +/* + * swap input tapes and output tapes by swapping their file + * descriptors. additional preparation for the next merge pass + * includes rewinding the new input tapes and clearing out the new + * output tapes. + */ +static void +_bt_spoolswap(BTSpool *btspool) +{ + File tmpfd; + BTTapeBlock *itape; + BTTapeBlock *otape; + int i; + + for (i = 0; i < btspool->bts_ntapes; ++i) { + itape = btspool->bts_itape[i]; + otape = btspool->bts_otape[i]; + + /* + * swap the input and output VFDs. + */ + tmpfd = itape->bttb_fd; + itape->bttb_fd = otape->bttb_fd; + otape->bttb_fd = tmpfd; + + /* + * rewind the new input tape. + */ + _bt_taperewind(itape); + _bt_tapereset(itape); + + /* + * clear the new output tape -- it's ok to throw away the old + * inputs. 
+ */ + _bt_tapeclear(otape); + } +} + +/*------------------------------------------------------------------------- + * sorting routines + *------------------------------------------------------------------------- + */ + +/* + * spool 'btitem' into an initial run. as tape blocks are filled, the + * block BTItems are qsorted and written into some output tape (it + * doesn't matter which; we go round-robin for simplicity). the + * initial runs are therefore always just one block. + */ +void +_bt_spool(Relation index, BTItem btitem, void *spool) +{ + BTSpool *btspool = (BTSpool *) spool; + BTTapeBlock *itape; + Size itemsz; + + itape = btspool->bts_itape[btspool->bts_tape]; + itemsz = BTITEMSZ(btitem); + itemsz = DOUBLEALIGN(itemsz); + + /* + * if this buffer is too full for this BTItemData, or if we have + * run out of BTItems, we need to sort the buffer and write it + * out. in this case, the BTItemData will go into the next tape's + * buffer. + */ + if (btitem == (BTItem) NULL || SPCLEFT(itape) < itemsz) { + BTItem *parray; + BTTapeBlock *otape; + BTItem bti; + char *pos; + int btisz; + int i; + + /* + * build an array of pointers to the BTItemDatas on the input + * block. + */ + parray = (BTItem *) palloc(itape->bttb_ntup * sizeof(BTItem)); + if (parray == (BTItem *) NULL) { + elog(WARN, "_bt_spool: out of memory"); + } + pos = itape->bttb_data; + for (i = 0; i < itape->bttb_ntup; ++i) { + parray[i] = _bt_tapenext(itape, &pos); + } + + /* + * qsort the pointer array. + */ + _bt_isortcmpinit(index); + qsort((void *) parray, itape->bttb_ntup, sizeof(BTItem), _bt_isortcmp); + + /* + * write the spooled run into the output tape. we copy the + * BTItemDatas in the order dictated by the sorted array of + * BTItems, not the original order. + * + * (since everything was DOUBLEALIGN'd and is all on a single + * page, everything had *better* still fit on one page..) + */ + otape = btspool->bts_otape[btspool->bts_tape]; + for (i = 0; i < itape->bttb_ntup; ++i) { + bti = parray[i]; + btisz = BTITEMSZ(bti); + btisz = DOUBLEALIGN(btisz); + _bt_tapeadd(otape, bti, btisz); +#ifdef FASTBUILD_DEBUG + { + bool isnull; + Datum d = index_getattr(&(bti->bti_itup), 1, + RelationGetTupleDescriptor(index), + &isnull); + printf("_bt_spool: inserted <%x> into output tape %d\n", + d, btspool->bts_tape); + } +#endif /* FASTBUILD_DEBUG */ + } + + /* + * the initial runs are always single tape blocks. flush the + * output block, marking End-Of-Run. + */ + _bt_tapewrite(otape, 1); + + /* + * reset the input buffer for the next run. we don't have to + * write it out or anything -- we only use it to hold the + * unsorted BTItemDatas, the output tape contains all the + * sorted stuff. + * + * changing bts_tape changes the output tape and input tape; + * we change itape for the code below. + */ + _bt_tapereset(itape); + btspool->bts_tape = (btspool->bts_tape + 1) % btspool->bts_ntapes; + itape = btspool->bts_itape[btspool->bts_tape]; + + /* + * destroy the pointer array. + */ + pfree((void *) parray); + } + + /* insert this item into the current buffer */ + if (btitem != (BTItem) NULL) { + _bt_tapeadd(itape, btitem, itemsz); + } +} + +/* + * allocate a new, clean btree page, not linked to any siblings. 
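+ *
+ * (usage, for illustration: _bt_merge() below seeds the leaf level
+ * with _bt_blnewpage(index, &buf, &page, BTP_LEAF); linking the
+ * btpo_prev/btpo_next side pointers afterwards is the caller's job,
+ * as in _bt_buildadd().)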
+ */ +static void +_bt_blnewpage(Relation index, Buffer *buf, Page *page, int flags) +{ + BTPageOpaque opaque; + + *buf = _bt_getbuf(index, P_NEW, BT_WRITE); + *page = BufferGetPage(*buf); + _bt_pageinit(*page, BufferGetPageSize(*buf)); + opaque = (BTPageOpaque) PageGetSpecialPointer(*page); + opaque->btpo_prev = opaque->btpo_next = P_NONE; + opaque->btpo_flags = flags; +} + +/* + * slide an array of ItemIds back one slot (from P_FIRSTKEY to + * P_HIKEY). we need to do this when we discover that we have built + * an ItemId array in what has turned out to be a P_RIGHTMOST page. + */ +static void +_bt_slideleft(Relation index, Buffer buf, Page page) +{ + OffsetNumber off; + OffsetNumber maxoff; + ItemId previi; + ItemId thisii; + + maxoff = PageGetMaxOffsetNumber(page); + previi = PageGetItemId(page, P_HIKEY); + for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off)) { + thisii = PageGetItemId(page, off); + *previi = *thisii; + previi = thisii; + } + ((PageHeader) page)->pd_lower -= sizeof(ItemIdData); +} + +typedef struct { + Buffer btps_buf; + Page btps_page; + BTItem btps_lastbti; + OffsetNumber btps_lastoff; + OffsetNumber btps_firstoff; +} BTPageState; + +/* + * add an item to a disk page from a merge tape block. + * + * we must be careful to observe the following restrictions, placed + * upon us by the conventions in nbtsearch.c: + * - rightmost pages start data items at P_HIKEY instead of at + * P_FIRSTKEY. + * - duplicates cannot be split among pages unless the chain of + * duplicates starts at the first data item. + * + * a leaf page being built looks like: + * + * +----------------+---------------------------------+ + * | PageHeaderData | linp0 linp1 linp2 ... | + * +-----------+----+---------------------------------+ + * | ... linpN | ^ first | + * +-----------+--------------------------------------+ + * | ^ last | + * | | + * | v last | + * +-------------+------------------------------------+ + * | | itemN ... | + * +-------------+------------------+-----------------+ + * | ... item3 item2 item1 | "special space" | + * +--------------------------------+-----------------+ + * ^ first + * + * contrast this with the diagram in bufpage.h; note the mismatch + * between linps and items. this is because we reserve linp0 as a + * placeholder for the pointer to the "high key" item; when we have + * filled up the page, we will set linp0 to point to itemN and clear + * linpN. + * + * 'last' pointers indicate the last offset/item added to the page. + * 'first' pointers indicate the first offset/item that is part of a + * chain of duplicates extending from 'first' to 'last'. + * + * if all keys are unique, 'first' will always be the same as 'last'. 
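+ *
+ * worked example (hypothetical keys): suppose the page holds
+ * 7 8 9 9 9, 'first' points at the leftmost 9, and the incoming item
+ * (another 9) does not fit.  the whole chain of 9s is copied to the
+ * new page, the old page keeps 7 8, and the first 9 becomes the old
+ * page's high key -- so a run of duplicates stays contiguous across
+ * the split, which is what the move-right logic in nbtsearch.c
+ * depends on.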
+ */ +static void +_bt_buildadd(Relation index, BTPageState *state, BTItem bti, int flags) +{ + Buffer nbuf; + Page npage; + BTItem last_bti; + OffsetNumber first_off; + OffsetNumber last_off; + OffsetNumber off; + Size pgspc; + Size btisz; + + nbuf = state->btps_buf; + npage = state->btps_page; + first_off = state->btps_firstoff; + last_off = state->btps_lastoff; + last_bti = state->btps_lastbti; + + pgspc = PageGetFreeSpace(npage); + btisz = BTITEMSZ(bti); + btisz = DOUBLEALIGN(btisz); + if (pgspc < btisz) { + Buffer obuf = nbuf; + Page opage = npage; + OffsetNumber o, n; + ItemId ii; + ItemId hii; + + _bt_blnewpage(index, &nbuf, &npage, flags); + + /* + * if 'last' is part of a chain of duplicates that does not + * start at the beginning of the old page, the entire chain is + * copied to the new page; we delete all of the duplicates + * from the old page except the first, which becomes the high + * key item of the old page. + * + * if the chain starts at the beginning of the page or there + * is no chain ('first' == 'last'), we need only copy 'last' + * to the new page. again, 'first' (== 'last') becomes the + * high key of the old page. + * + * note that in either case, we copy at least one item to the + * new page, so 'last_bti' will always be valid. 'bti' will + * never be the first data item on the new page. + */ + if (first_off == P_FIRSTKEY) { + Assert(last_off != P_FIRSTKEY); + first_off = last_off; + } + for (o = first_off, n = P_FIRSTKEY; + o <= last_off; + o = OffsetNumberNext(o), n = OffsetNumberNext(n)) { + ii = PageGetItemId(opage, o); + (void) PageAddItem(npage, PageGetItem(opage, ii), + ii->lp_len, n, LP_USED); +#ifdef FASTBUILD_DEBUG + { + bool isnull; + BTItem tmpbti = + (BTItem) PageGetItem(npage, PageGetItemId(npage, n)); + Datum d = index_getattr(&(tmpbti->bti_itup), 1, + RelationGetTupleDescriptor(index), + &isnull); + printf("_bt_buildadd: moved <%x> to offset %d\n", + d, n); + } +#endif /* FASTBUILD_DEBUG */ + } + for (o = last_off; o > first_off; o = OffsetNumberPrev(o)) { + PageIndexTupleDelete(opage, o); + } + hii = PageGetItemId(opage, P_HIKEY); + ii = PageGetItemId(opage, first_off); + *hii = *ii; + ii->lp_flags &= ~LP_USED; + ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData); + + first_off = P_FIRSTKEY; + last_off = PageGetMaxOffsetNumber(npage); + last_bti = (BTItem) PageGetItem(npage, PageGetItemId(npage, last_off)); + + /* + * set the page (side link) pointers. + */ + { + BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage); + BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(npage); + + oopaque->btpo_next = BufferGetBlockNumber(nbuf); + nopaque->btpo_prev = BufferGetBlockNumber(obuf); + nopaque->btpo_next = P_NONE; + } + + /* + * write out the old stuff. we never want to see it again, so + * we can give up our lock (if we had one; BuildingBtree is + * set, so we aren't locking). + */ + _bt_wrtbuf(index, obuf); + } + + /* + * if this item is different from the last item added, we start a + * new chain of duplicates. 
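
The chain carry-over rule described above can be tried out away from the page machinery. Below is a toy packer under the same invariant, assuming fixed-capacity pages of ints (all figures made up): when a page fills, the trailing run of equal keys moves whole to the next page unless the run already starts at the page head.

    #include <stdio.h>

    #define CAP 4                      /* items per "page" */

    int
    main()
    {
        int keys[] = { 1, 2, 3, 3, 3, 4, 5, 5 };
        int n = sizeof(keys) / sizeof(keys[0]);
        int page[CAP];
        int used = 0, first = 0, i, j; /* 'first' starts the current chain */

        for (i = 0; i < n; i++) {
            if (used == CAP) {
                /* page full: unless the duplicate chain starts at the
                 * page head, carry the whole chain to the next page */
                int keep = (first > 0) ? first : used;

                for (j = 0; j < keep; j++)
                    printf("%d ", page[j]);
                printf("| ");
                for (j = 0; j < used - keep; j++)
                    page[j] = page[keep + j];
                used -= keep;
                first = 0;
            }
            if (used > 0 && page[used - 1] != keys[i])
                first = used;          /* new chain starts at this item */
            page[used++] = keys[i];
        }
        for (j = 0; j < used; j++)
            printf("%d ", page[j]);
        printf("\n");                  /* prints: 1 2 | 3 3 3 | 4 5 5 */
        return 0;
    }

The run of 3s moves to the second page in one piece, mirroring the way _bt_buildadd copies the chain from 'first' to 'last' and turns 'first' into the old page's high key.
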
+ */ + off = OffsetNumberNext(last_off); + (void) PageAddItem(npage, (Item) bti, btisz, off, LP_USED); +#ifdef FASTBUILD_DEBUG + { + bool isnull; + Datum d = index_getattr(&(bti->bti_itup), 1, + RelationGetTupleDescriptor(index), + &isnull); + printf("_bt_buildadd: inserted <%x> at offset %d\n", + d, off); + } +#endif /* FASTBUILD_DEBUG */ + if (last_bti == (BTItem) NULL) { + first_off = P_FIRSTKEY; + } else if (!_bt_itemcmp(index, 1, bti, last_bti, BTEqualStrategyNumber)) { + first_off = off; + } + last_off = off; + last_bti = (BTItem) PageGetItem(npage, PageGetItemId(npage, off)); + + state->btps_buf = nbuf; + state->btps_page = npage; + state->btps_lastbti = last_bti; + state->btps_lastoff = last_off; + state->btps_firstoff = first_off; +} + +/* + * take the input tapes stored by 'btspool' and perform successive + * merging passes until at most one run is left in each tape. at that + * point, merge the final tape runs into a set of btree leaves. + * + * XXX three nested loops? gross. cut me up into smaller routines. + */ +static BlockNumber +_bt_merge(Relation index, BTSpool *btspool) +{ + BTPageState state; + BlockNumber firstblk; + BTPriQueue q; + BTPriQueueElem e; + BTItem bti; + BTTapeBlock *itape; + BTTapeBlock *otape; + char *tapepos[MAXTAPES]; + int tapedone[MAXTAPES]; + int t; + int goodtapes; + int nruns; + Size btisz; + bool doleaf = false; + + /* + * initialize state needed for the merge into the btree leaf pages. + */ + (void) memset((char *) &state, 0, sizeof(BTPageState)); + _bt_blnewpage(index, &(state.btps_buf), &(state.btps_page), BTP_LEAF); + state.btps_lastoff = P_HIKEY; + state.btps_lastbti = (BTItem) NULL; + firstblk = BufferGetBlockNumber(state.btps_buf); + + do { /* pass */ + /* + * each pass starts by flushing the previous outputs and + * swapping inputs and outputs. this process also clears the + * new output tapes and rewinds the new input tapes. + */ + btspool->bts_tape = btspool->bts_ntapes - 1; + _bt_spoolflush(btspool); + _bt_spoolswap(btspool); + + nruns = 0; + + for (;;) { /* run */ + /* + * each run starts by selecting a new output tape. the + * merged results of a given run are always sent to this + * one tape. + */ + btspool->bts_tape = (btspool->bts_tape + 1) % btspool->bts_ntapes; + otape = btspool->bts_otape[btspool->bts_tape]; + + /* + * initialize the priority queue by loading it with the + * first element of the given run in each tape. since we + * are starting a new run, we reset the tape (clearing the + * End-Of-Run marker) before reading it. this means that + * _bt_taperead will return 0 only if the tape is actually + * at EOF. + */ + (void) memset((char *) &q, 0, sizeof(BTPriQueue)); + goodtapes = 0; + for (t = 0; t < btspool->bts_ntapes; ++t) { + itape = btspool->bts_itape[t]; + tapepos[t] = itape->bttb_data; + _bt_tapereset(itape); + if (_bt_taperead(itape) == 0) { + tapedone[t] = 1; + } else { + ++goodtapes; + tapedone[t] = 0; + e.btpqe_tape = t; + e.btpqe_item = _bt_tapenext(itape, &tapepos[t]); + if (e.btpqe_item != (BTItem) NULL) { + _bt_pqadd(&q, &e); + } + } + } + /* + * if we don't have any tapes with any input (i.e., they + * are all at EOF), we must be done with this pass. + */ + if (goodtapes == 0) { + break; /* for */ + } + ++nruns; + + /* + * output the smallest element from the queue until there are no + * more. + */ + while (_bt_pqnext(&q, &e) >= 0) { /* item */ + /* + * replace the element taken from priority queue, + * fetching a new block if needed. a tape can run out + * if it hits either End-Of-Run or EOF. 
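
The loop this comment introduces is a k-way merge: one cursor per input tape, repeatedly emit the smallest current item, then refill from the tape it came from. The stand-in below uses a linear scan where the real code keeps a priority queue (_bt_pqadd/_bt_pqnext), and uses -1 as a made-up End-Of-Run marker:

    #include <stdio.h>

    #define NTAPES 3

    int
    main()
    {
        /* three pre-sorted "runs", one per tape; -1 marks End-Of-Run */
        int runs[NTAPES][4] = {
            { 1, 5, 9, -1 },
            { 2, 3, 8, -1 },
            { 4, 6, 7, -1 }
        };
        int pos[NTAPES] = { 0, 0, 0 };

        for (;;) {
            int best = -1, t;

            /* find the tape whose current item is smallest */
            for (t = 0; t < NTAPES; t++) {
                if (runs[t][pos[t]] < 0)
                    continue;          /* this tape is exhausted */
                if (best < 0 || runs[t][pos[t]] < runs[best][pos[best]])
                    best = t;
            }
            if (best < 0)
                break;                 /* all tapes at End-Of-Run */

            /* output the minimum, then advance that tape's cursor, as
             * _bt_merge replaces the element it took from the queue */
            printf("%d ", runs[best][pos[best]]);
            pos[best]++;
        }
        printf("\n");                  /* prints: 1 2 3 4 5 6 7 8 9 */
        return 0;
    }
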
+ */ + t = e.btpqe_tape; + bti = e.btpqe_item; + if (bti != (BTItem) NULL) { + btisz = BTITEMSZ(bti); + btisz = DOUBLEALIGN(btisz); + if (doleaf) { + _bt_buildadd(index, &state, bti, BTP_LEAF); +#ifdef FASTBUILD_DEBUG + { + bool isnull; + Datum d = index_getattr(&(bti->bti_itup), 1, + RelationGetTupleDescriptor(index), + &isnull); + printf("_bt_merge: inserted <%x> into block %d\n", + d, BufferGetBlockNumber(state.btps_buf)); + } +#endif /* FASTBUILD_DEBUG */ + } else { + if (SPCLEFT(otape) < btisz) { + /* + * if it's full, write it out and add the + * item to the next block. (since we know + * there will be at least one more block, + * we know we do *not* want to set + * End-Of-Run here!) + */ + _bt_tapewrite(otape, 0); + } + _bt_tapeadd(otape, bti, btisz); +#ifdef FASTBUILD_DEBUG + { + bool isnull; + Datum d = index_getattr(&(bti->bti_itup), 1, + RelationGetTupleDescriptor(index), &isnull); + printf("_bt_merge: inserted <%x> into tape %d\n", + d, btspool->bts_tape); + } +#endif /* FASTBUILD_DEBUG */ + } + } +#ifdef FASTBUILD_DEBUG + { + bool isnull; + Datum d = index_getattr(&(bti->bti_itup), 1, + RelationGetTupleDescriptor(index), + &isnull); + printf("_bt_merge: got <%x> from tape %d\n", d, t); + } +#endif /* FASTBUILD_DEBUG */ + + itape = btspool->bts_itape[t]; + if (!tapedone[t]) { + BTItem newbti = _bt_tapenext(itape, &tapepos[t]); + + if (newbti == (BTItem) NULL) { + if (_bt_taperead(itape) == 0) { + tapedone[t] = 1; + } else { + tapepos[t] = itape->bttb_data; + newbti = _bt_tapenext(itape, &tapepos[t]); + } + } + if (newbti != (BTItem) NULL) { + BTPriQueueElem nexte; + + nexte.btpqe_tape = t; + nexte.btpqe_item = newbti; + _bt_pqadd(&q, &nexte); + } + } + } /* item */ + } /* run */ + + /* + * we are here because we ran out of input on all of the input + * tapes. + * + * if this pass did not generate more actual output runs than + * we have tapes, we know we have at most one run in each + * tape. this means that we are ready to merge into the final + * btree leaf pages instead of merging into a tape file. + */ + if (nruns <= btspool->bts_ntapes) { + doleaf = true; + } + } while (nruns > 0); /* pass */ + + /* + * this is the rightmost page, so the ItemId array needs to be + * slid back one slot. + */ + _bt_slideleft(index, state.btps_buf, state.btps_page); + _bt_wrtbuf(index, state.btps_buf); + + return(firstblk); +} + + +/* + * given the block number 'blk' of the first page of a set of linked + * siblings (i.e., the start of an entire level of the btree), + * construct the corresponding next level of the btree. we do this by + * placing minimum keys from each page into this page. the format of + * the internal pages is otherwise the same as for leaf pages. + */ +void +_bt_upperbuild(Relation index, BlockNumber blk, int level) +{ + Buffer rbuf; + Page rpage; + BTPageOpaque ropaque; + BTPageState state; + BlockNumber firstblk; + BTItem bti; + BTItem nbti; + OffsetNumber off; + + rbuf = _bt_getbuf(index, blk, BT_WRITE); + rpage = BufferGetPage(rbuf); + ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage); + + /* + * if we only have one page on a level, we can just make it the + * root. 
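+ */

Each level of the _bt_upperbuild recursion emits one item per child page, so a level of n pages shrinks to roughly n/fanout parent pages, and the recursion reaches a single root page after about log-base-fanout passes. A sketch of that bound, with made-up figures for leaf count and fanout:

    #include <stdio.h>

    int
    main()
    {
        long pages = 1000000;      /* leaf pages at level 0 (made up) */
        long fanout = 256;         /* separator items per page (made up) */
        int  level = 0;

        /* one parent item per child page: a level of n pages shrinks
         * to ceil(n/fanout) pages; stop when one (root) page remains */
        while (pages > 1) {
            pages = (pages + fanout - 1) / fanout;
            level++;
        }
        printf("root reached at level %d\n", level);   /* prints: 3 */
        return 0;
    }
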
+ if (P_RIGHTMOST(ropaque)) {
+ ropaque->btpo_flags |= BTP_ROOT;
+ _bt_wrtbuf(index, rbuf);
+ _bt_metaproot(index, blk);
+ return;
+ }
+ _bt_relbuf(index, rbuf, BT_WRITE);
+
+ (void) memset((char *) &state, 0, sizeof(BTPageState));
+ _bt_blnewpage(index, &(state.btps_buf), &(state.btps_page), 0);
+ state.btps_lastoff = P_HIKEY;
+ state.btps_lastbti = (BTItem) NULL;
+ firstblk = BufferGetBlockNumber(state.btps_buf);
+
+ /* for each page... */
+ do {
+ rbuf = _bt_getbuf(index, blk, BT_READ);
+ rpage = BufferGetPage(rbuf);
+ ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage);
+
+ /* for each item... */
+ if (!PageIsEmpty(rpage)) {
+ /*
+ * form a new index tuple corresponding to the minimum key
+ * of the lower page and insert it into a page at this
+ * level.
+ */
+ off = P_RIGHTMOST(ropaque) ? P_HIKEY : P_FIRSTKEY;
+ bti = (BTItem) PageGetItem(rpage, PageGetItemId(rpage, off));
+ nbti = _bt_formitem(&(bti->bti_itup));
+ ItemPointerSet(&(nbti->bti_itup.t_tid), blk, P_HIKEY);
+#ifdef FASTBUILD_DEBUG
+ {
+ bool isnull;
+ Datum d = index_getattr(&(nbti->bti_itup), 1,
+ RelationGetTupleDescriptor(index),
+ &isnull);
+ printf("_bt_upperbuild: inserting <%x> at %d\n",
+ d, level);
+ }
+#endif /* FASTBUILD_DEBUG */
+ _bt_buildadd(index, &state, nbti, 0);
+ pfree((void *) nbti);
+ }
+ blk = ropaque->btpo_next;
+ _bt_relbuf(index, rbuf, BT_READ);
+ } while (blk != P_NONE);
+
+ /*
+ * this is the rightmost page, so the ItemId array needs to be
+ * slid back one slot.
+ */
+ _bt_slideleft(index, state.btps_buf, state.btps_page);
+ _bt_wrtbuf(index, state.btps_buf);
+
+ _bt_upperbuild(index, firstblk, level + 1);
+}
+
+/*
+ * given a spool loaded by successive calls to _bt_spool, create an
+ * entire btree.
+ */
+void
+_bt_leafbuild(Relation index, void *spool)
+{
+ BTSpool *btspool = (BTSpool *) spool;
+ BlockNumber firstblk;
+
+ /*
+ * merge the runs into btree leaf pages.
+ */
+ firstblk = _bt_merge(index, btspool);
+
+ /*
+ * build the upper levels of the btree.
+ */
+ _bt_upperbuild(index, firstblk, 0);
+}
+
+#else /* !FASTBUILD */
+
+void *_bt_spoolinit(Relation index, int ntapes) { return((void *) NULL); }
+void _bt_spooldestroy(void *spool) { }
+void _bt_spool(Relation index, BTItem btitem, void *spool) { }
+void _bt_upperbuild(Relation index, BlockNumber blk, int level) { }
+void _bt_leafbuild(Relation index, void *spool) { }
+
+#endif /* !FASTBUILD */
diff --git a/src/backend/access/nbtree/nbtstrat.c b/src/backend/access/nbtree/nbtstrat.c
new file mode 100644
index 0000000000..2214c60950
--- /dev/null
+++ b/src/backend/access/nbtree/nbtstrat.c
@@ -0,0 +1,134 @@
+/*-------------------------------------------------------------------------
+ *
+ * btstrat.c--
+ * Strategy map entries for the btree indexed access method
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/Attic/nbtstrat.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/bufpage.h"
+
+#include "utils/elog.h"
+#include "utils/rel.h"
+#include "utils/excid.h"
+
+#include "access/genam.h"
+#include "access/nbtree.h"
+
+/*
+ * Note:
+ * StrategyNegate, StrategyCommute, and StrategyNegateCommute
+ * assume <, <=, ==, >=, > ordering.
+ */
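
The three transform maps defined just below encode ordinary logical identities over the assumed <, <=, ==, >=, > ordering: NOT(a < b) is a >= b, (a < b) commutes to (b > a), and NOT(a < b) commutes to (b <= a). A self-contained check of all three tables over a small integer domain (the enum here is illustrative; the real strategy numbers are simply 1 through 5 in the same order):

    #include <assert.h>
    #include <stdio.h>

    enum { LT = 1, LE, EQ, GE, GT };

    /* apply a strategy number to two ints */
    static int
    apply(int strat, int a, int b)
    {
        switch (strat) {
        case LT: return a < b;
        case LE: return a <= b;
        case EQ: return a == b;
        case GE: return a >= b;
        case GT: return a > b;
        }
        return 0;
    }

    /* the three maps, indexed by strategy - 1 as in BTNegate et al. */
    static int negate[5]  = { GE, GT, 0, LT, LE };
    static int commute[5] = { GT, GE, 0, LE, LT };
    static int negcom[5]  = { LE, LT, 0, GT, GE };

    int
    main()
    {
        int a, b, s;

        for (a = -2; a <= 2; a++)
            for (b = -2; b <= 2; b++)
                for (s = LT; s <= GT; s++) {
                    if (s == EQ)
                        continue;      /* maps to InvalidStrategy */
                    assert(!apply(s, a, b) == apply(negate[s - 1], a, b));
                    assert(apply(s, a, b) == apply(commute[s - 1], b, a));
                    assert(!apply(s, a, b) == apply(negcom[s - 1], b, a));
                }
        printf("transform maps verified\n");
        return 0;
    }
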
+static StrategyNumber BTNegate[5] = {
+ BTGreaterEqualStrategyNumber,
+ BTGreaterStrategyNumber,
+ InvalidStrategy,
+ BTLessStrategyNumber,
+ BTLessEqualStrategyNumber
+};
+
+static StrategyNumber BTCommute[5] = {
+ BTGreaterStrategyNumber,
+ BTGreaterEqualStrategyNumber,
+ InvalidStrategy,
+ BTLessEqualStrategyNumber,
+ BTLessStrategyNumber
+};
+
+static StrategyNumber BTNegateCommute[5] = {
+ BTLessEqualStrategyNumber,
+ BTLessStrategyNumber,
+ InvalidStrategy,
+ BTGreaterStrategyNumber,
+ BTGreaterEqualStrategyNumber
+};
+
+static uint16 BTLessTermData[] = { /* XXX type clash */
+ 2,
+ BTLessStrategyNumber,
+ SK_NEGATE,
+ BTLessStrategyNumber,
+ SK_NEGATE | SK_COMMUTE
+};
+
+static uint16 BTLessEqualTermData[] = { /* XXX type clash */
+ 2,
+ BTLessEqualStrategyNumber,
+ 0x0,
+ BTLessEqualStrategyNumber,
+ SK_COMMUTE
+};
+
+static uint16 BTGreaterEqualTermData[] = { /* XXX type clash */
+ 2,
+ BTGreaterEqualStrategyNumber,
+ 0x0,
+ BTGreaterEqualStrategyNumber,
+ SK_COMMUTE
+};
+
+static uint16 BTGreaterTermData[] = { /* XXX type clash */
+ 2,
+ BTGreaterStrategyNumber,
+ SK_NEGATE,
+ BTGreaterStrategyNumber,
+ SK_NEGATE | SK_COMMUTE
+};
+
+static StrategyTerm BTEqualExpressionData[] = {
+ (StrategyTerm)BTLessTermData, /* XXX */
+ (StrategyTerm)BTLessEqualTermData, /* XXX */
+ (StrategyTerm)BTGreaterEqualTermData, /* XXX */
+ (StrategyTerm)BTGreaterTermData, /* XXX */
+ NULL
+};
+
+static StrategyEvaluationData BTEvaluationData = {
+ /* XXX static for simplicity */
+
+ BTMaxStrategyNumber,
+ (StrategyTransformMap)BTNegate, /* XXX */
+ (StrategyTransformMap)BTCommute, /* XXX */
+ (StrategyTransformMap)BTNegateCommute, /* XXX */
+
+ { NULL, NULL, (StrategyExpression)BTEqualExpressionData, NULL, NULL,
+ NULL,NULL,NULL,NULL,NULL,NULL,NULL}
+};
+
+/* ----------------------------------------------------------------
+ * _bt_getstrat
+ * ----------------------------------------------------------------
+ */
+
+StrategyNumber
+_bt_getstrat(Relation rel,
+ AttrNumber attno,
+ RegProcedure proc)
+{
+ StrategyNumber strat;
+
+ strat = RelationGetStrategy(rel, attno, &BTEvaluationData, proc);
+
+ Assert(StrategyNumberIsValid(strat));
+
+ return (strat);
+}
+
+bool
+_bt_invokestrat(Relation rel,
+ AttrNumber attno,
+ StrategyNumber strat,
+ Datum left,
+ Datum right)
+{
+ return (RelationInvokeStrategy(rel, &BTEvaluationData, attno, strat,
+ left, right));
+}
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
new file mode 100644
index 0000000000..695a2b637c
--- /dev/null
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -0,0 +1,239 @@
+/*-------------------------------------------------------------------------
+ *
+ * btutils.c--
+ * Utility code for Postgres btree implementation.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtutils.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include <stdio.h>
+#include "postgres.h"
+
+#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
+
+#include "fmgr.h"
+#include "utils/elog.h"
+#include "utils/palloc.h"
+#include "utils/rel.h"
+#include "utils/excid.h"
+#include "utils/datum.h"
+
+#include "access/heapam.h"
+#include "access/genam.h"
+#include "access/iqual.h"
+#include "access/nbtree.h"
+
+ScanKey
+_bt_mkscankey(Relation rel, IndexTuple itup)
+{
+ ScanKey skey;
+ TupleDesc itupdesc;
+ int natts;
+ int i;
+ Datum arg;
+ RegProcedure proc;
+ bool null;
+
+ natts = rel->rd_rel->relnatts;
+ itupdesc = RelationGetTupleDescriptor(rel);
+
+ skey = (ScanKey) palloc(natts * sizeof(ScanKeyData));
+
+ for (i = 0; i < natts; i++) {
+ arg = index_getattr(itup, i + 1, itupdesc, &null);
+ proc = index_getprocid(rel, i + 1, BTORDER_PROC);
+ ScanKeyEntryInitialize(&skey[i],
+ 0x0, (AttrNumber) (i + 1), proc, arg);
+ }
+
+ return (skey);
+}
+
+void
+_bt_freeskey(ScanKey skey)
+{
+ pfree(skey);
+}
+
+void
+_bt_freestack(BTStack stack)
+{
+ BTStack ostack;
+
+ while (stack != (BTStack) NULL) {
+ ostack = stack;
+ stack = stack->bts_parent;
+ pfree(ostack->bts_btitem);
+ pfree(ostack);
+ }
+}
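
A scan key built by _bt_mkscankey above packages one (attribute number, comparison procedure, argument) triple per index attribute. The sketch below shows how such an array drives an attribute-at-a-time comparison during a descent; the function-pointer type is a made-up stand-in for the registered BTORDER_PROC, not the fmgr machinery the real code uses:

    #include <stdio.h>

    /* hypothetical scan-key entry: one comparator + argument per attribute */
    typedef struct {
        int (*proc)(int, int);
        int arg;
    } Key;

    static int
    int_cmp(int a, int b)
    {
        return (a > b) - (a < b);
    }

    /* compare a "tuple" against the key array, attribute by attribute;
     * the first unequal attribute decides, as in a btree descent */
    static int
    tuple_cmp(const int *tup, const Key *keys, int natts)
    {
        int i, r;

        for (i = 0; i < natts; i++) {
            r = keys[i].proc(tup[i], keys[i].arg);
            if (r != 0)
                return r;
        }
        return 0;
    }

    int
    main()
    {
        Key keys[2] = { { int_cmp, 42 }, { int_cmp, 7 } };
        int tup[2] = { 42, 9 };

        printf("%d\n", tuple_cmp(tup, keys, 2));   /* prints: 1 */
        return 0;
    }
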
+
+/*
+ * _bt_orderkeys() -- Put keys in a sensible order for conjunctive quals.
+ *
+ * The order of the keys in the qual must match the ordering imposed
+ * by the index. This routine only needs to be called if there is
+ * more than one qual clause using this index.
+ */
+void
+_bt_orderkeys(Relation relation, uint16 *numberOfKeys, ScanKey key)
+{
+ ScanKey xform;
+ ScanKeyData *cur;
+ StrategyMap map;
+ int nbytes;
+ long test;
+ int i, j;
+ int init[BTMaxStrategyNumber+1];
+
+ /* haven't looked at any strategies yet */
+ for (i = 0; i <= BTMaxStrategyNumber; i++)
+ init[i] = 0;
+
+ /* get space for the modified array of keys */
+ nbytes = BTMaxStrategyNumber * sizeof(ScanKeyData);
+ xform = (ScanKey) palloc(nbytes);
+ memset(xform, 0, nbytes);
+
+
+ /* get the strategy map for this index/attribute pair */
+ /*
+ * XXX
+ * When we support multiple keys in a single index, this is what
+ * we'll want to do. At present, the planner is hosed, so we
+ * hard-wire the attribute number below. Postgres only does single-
+ * key indices...
+ * map = IndexStrategyGetStrategyMap(RelationGetIndexStrategy(relation),
+ * BTMaxStrategyNumber,
+ * key->data[0].attributeNumber);
+ */
+ map = IndexStrategyGetStrategyMap(RelationGetIndexStrategy(relation),
+ BTMaxStrategyNumber,
+ 1 /* XXX */ );
+
+ /* check each key passed in */
+ for (i = *numberOfKeys; --i >= 0; ) {
+ cur = &key[i];
+ for (j = BTMaxStrategyNumber; --j >= 0; ) {
+ if (cur->sk_procedure == map->entry[j].sk_procedure)
+ break;
+ }
+
+ /* sanity check: we had better have found a matching strategy */
+ if (j < 0)
+ elog(WARN, "_bt_orderkeys: key has no matching strategy");
+
+ /* have we seen one of these before? */
+ if (init[j]) {
+ /* yup, use the appropriate value */
+ test =
+ (long) FMGR_PTR2(cur->sk_func, cur->sk_procedure,
+ cur->sk_argument, xform[j].sk_argument);
+ if (test)
+ xform[j].sk_argument = cur->sk_argument;
+ } else {
+ /* nope, use this value */
+ memmove(&xform[j], cur, sizeof(*cur));
+
+ init[j] = 1;
+ }
+ }
+
+ /* if = has been specified, no other key will be used */
+ if (init[BTEqualStrategyNumber - 1]) {
+ init[BTLessStrategyNumber - 1] = 0;
+ init[BTLessEqualStrategyNumber - 1] = 0;
+ init[BTGreaterEqualStrategyNumber - 1] = 0;
+ init[BTGreaterStrategyNumber - 1] = 0;
+ }
+
+ /* only one of <, <= */
+ if (init[BTLessStrategyNumber - 1]
+ && init[BTLessEqualStrategyNumber - 1]) {
+
+ ScanKeyData *lt, *le;
+
+ lt = &xform[BTLessStrategyNumber - 1];
+ le = &xform[BTLessEqualStrategyNumber - 1];
+
+ /*
+ * DO NOT use the cached function stuff here -- this is key
+ * ordering, happens only when the user expresses a hokey
+ * qualification, and gets executed only once, anyway. The
+ * transform maps are hard-coded, and can't be initialized
+ * in the correct way.
+ */
+
+ test = (long) fmgr(le->sk_procedure, le->sk_argument, lt->sk_argument);
+
+ if (test)
+ init[BTLessEqualStrategyNumber - 1] = 0;
+ else
+ init[BTLessStrategyNumber - 1] = 0;
+ }
+
+ /* only one of >, >= */
+ if (init[BTGreaterStrategyNumber - 1]
+ && init[BTGreaterEqualStrategyNumber - 1]) {
+
+ ScanKeyData *gt, *ge;
+
+ gt = &xform[BTGreaterStrategyNumber - 1];
+ ge = &xform[BTGreaterEqualStrategyNumber - 1];
+
+ /* see note above on function cache; this mirrors the <, <= case */
+ test = (long) fmgr(ge->sk_procedure, ge->sk_argument, gt->sk_argument);
+
+ if (test)
+ init[BTGreaterEqualStrategyNumber - 1] = 0;
+ else
+ init[BTGreaterStrategyNumber - 1] = 0;
+ }
+
+ /* okay, reorder and count */
+ j = 0;
+
+ for (i = BTMaxStrategyNumber; --i >= 0; )
+ if (init[i])
+ key[j++] = xform[i];
+
+ *numberOfKeys = j;
+
+ pfree(xform);
+}
+
+bool
+_bt_checkqual(IndexScanDesc scan, IndexTuple itup)
+{
+ if (scan->numberOfKeys > 0)
+ return (index_keytest(itup, RelationGetTupleDescriptor(scan->relation),
+ scan->numberOfKeys, scan->keyData));
+ else
+ return (true);
+}
+
+BTItem
+_bt_formitem(IndexTuple itup)
+{
+ int nbytes_btitem;
+ BTItem btitem;
+ Size tuplen;
+ extern Oid newoid();
+
+ /* disallow nulls in btree keys */
+ if (itup->t_info & INDEX_NULL_MASK)
+ elog(WARN, "btree indices cannot include null keys");
+
+ /* make a copy of the index tuple with room for the sequence number */
+ tuplen = IndexTupleSize(itup);
+ nbytes_btitem = tuplen +
+ (sizeof(BTItemData) - sizeof(IndexTupleData));
+
+ btitem = (BTItem) palloc(nbytes_btitem);
+ memmove((char *) &(btitem->bti_itup), (char *) itup, tuplen);
+
+ btitem->bti_oid = newoid();
+ return (btitem);
+}
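
The net effect of _bt_orderkeys on a redundant conjunctive qual can be seen with plain integers. This is a sketch of the intended dominance rules only, stated with direct comparisons rather than the fmgr-based tests in the code above: of two lower bounds keep the stricter, of two upper bounds likewise, with the strict operator winning ties; an equality key, when present, supersedes everything else.

    #include <stdio.h>

    int
    main()
    {
        /* qual: x > 5 AND x >= 7 AND x < 20 AND x <= 20 */
        int gt_arg = 5, ge_arg = 7;    /* two lower bounds */
        int lt_arg = 20, le_arg = 20;  /* two upper bounds */

        /* only one of >, >= survives: x >= b implies x > a when b > a */
        if (ge_arg > gt_arg)
            printf("keep: x >= %d\n", ge_arg);
        else
            printf("keep: x > %d\n", gt_arg);

        /* only one of <, <= survives: x <= b implies x < a when b < a;
         * on a tie the strict bound is the tighter one */
        if (le_arg < lt_arg)
            printf("keep: x <= %d\n", le_arg);
        else
            printf("keep: x < %d\n", lt_arg);
        return 0;
    }

This prints "keep: x >= 7" and "keep: x < 20", the two-key qual the reordering leaves behind.
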
