Diffstat (limited to 'src/backend/access/nbtree')
-rw-r--r--  src/backend/access/nbtree/Makefile.inc |   15
-rw-r--r--  src/backend/access/nbtree/README       |   68
-rw-r--r--  src/backend/access/nbtree/nbtcompare.c |  173
-rw-r--r--  src/backend/access/nbtree/nbtinsert.c  |  831
-rw-r--r--  src/backend/access/nbtree/nbtpage.c    |  523
-rw-r--r--  src/backend/access/nbtree/nbtree.c     |  516
-rw-r--r--  src/backend/access/nbtree/nbtscan.c    |  164
-rw-r--r--  src/backend/access/nbtree/nbtsearch.c  | 1133
-rw-r--r--  src/backend/access/nbtree/nbtsort.c    | 1196
-rw-r--r--  src/backend/access/nbtree/nbtstrat.c   |  134
-rw-r--r--  src/backend/access/nbtree/nbtutils.c   |  239
11 files changed, 4992 insertions(+), 0 deletions(-)
diff --git a/src/backend/access/nbtree/Makefile.inc b/src/backend/access/nbtree/Makefile.inc new file mode 100644 index 0000000000..50854008c0 --- /dev/null +++ b/src/backend/access/nbtree/Makefile.inc @@ -0,0 +1,15 @@ +#------------------------------------------------------------------------- +# +# Makefile.inc-- +# Makefile for access/nbtree (btree acess methods) +# +# Copyright (c) 1994, Regents of the University of California +# +# +# IDENTIFICATION +# $Header: /cvsroot/pgsql/src/backend/access/nbtree/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:11 scrappy Exp $ +# +#------------------------------------------------------------------------- + +SUBSRCS+= nbtcompare.c nbtinsert.c nbtpage.c nbtree.c nbtscan.c nbtsearch.c \ + nbtstrat.c nbtutils.c nbtsort.c diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README new file mode 100644 index 0000000000..a204ad4af0 --- /dev/null +++ b/src/backend/access/nbtree/README @@ -0,0 +1,68 @@ +$Header: /cvsroot/pgsql/src/backend/access/nbtree/README,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $ + +This directory contains a correct implementation of Lehman and Yao's +btree management algorithm that supports concurrent access for Postgres. +We have made the following changes in order to incorporate their algorithm +into Postgres: + + + The requirement that all btree keys be unique is too onerous, + but the algorithm won't work correctly without it. As a result, + this implementation adds an OID (guaranteed to be unique) to + every key in the index. This guarantees uniqueness within a set + of duplicates. Space overhead is four bytes. + + For this reason, when we're passed an index tuple to store by the + common access method code, we allocate a larger one and copy the + supplied tuple into it. No Postgres code outside of the btree + access method knows about this xid or sequence number. + + + Lehman and Yao don't require read locks, but assume that in- + memory copies of tree nodes are unshared. Postgres shares + in-memory buffers among backends. As a result, we do page- + level read locking on btree nodes in order to guarantee that + no record is modified while we are examining it. This reduces + concurrency but guaranteees correct behavior. + + + Read locks on a page are held for as long as a scan has a pointer + to the page. However, locks are always surrendered before the + sibling page lock is acquired (for readers), so we remain deadlock- + free. I will do a formal proof if I get bored anytime soon. + +In addition, the following things are handy to know: + + + Page zero of every btree is a meta-data page. This page stores + the location of the root page, a pointer to a list of free + pages, and other stuff that's handy to know. + + + This algorithm doesn't really work, since it requires ordered + writes, and UNIX doesn't support ordered writes. + + + There's one other case where we may screw up in this + implementation. When we start a scan, we descend the tree + to the key nearest the one in the qual, and once we get there, + position ourselves correctly for the qual type (eg, <, >=, etc). + If we happen to step off a page, decide we want to get back to + it, and fetch the page again, and if some bad person has split + the page and moved the last tuple we saw off of it, then the + code complains about botched concurrency in an elog(WARN, ...) + and gives up the ghost. This is the ONLY violation of Lehman + and Yao's guarantee of correct behavior that I am aware of in + this code. 
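/*
 * Editor's sketch (not part of this commit): the README's duplicate-key
 * trick in miniature.  Every key carried in the index is augmented with
 * a unique OID, so no two index entries ever compare equal -- the user
 * key is compared first and the OID breaks ties.  The type and field
 * names here are hypothetical stand-ins for the real BTItem layout.
 */
#include <stdint.h>

typedef struct {
    int32_t  key;   /* user key; duplicates allowed */
    uint32_t oid;   /* unique tiebreaker added by the btree code */
} AugmentedKey;

static int
augmented_cmp(const AugmentedKey *a, const AugmentedKey *b)
{
    if (a->key != b->key)
        return (a->key < b->key) ? -1 : 1;
    if (a->oid != b->oid)           /* equal user keys: OID decides */
        return (a->oid < b->oid) ? -1 : 1;
    return 0;                       /* same entry */
}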
+ +Notes to operator class implementors: + + With this implementation, we require the user to supply us with + a procedure for pg_amproc. This procedure should take two keys + A and B and return < 0, 0, or > 0 if A < B, A = B, or A > B, + respectively. See the contents of that relation for the btree + access method for some samples. + +Notes to mao for implementation document: + + On deletions, we need to adjust the position of active scans on + the index. The code in nbtscan.c handles this. We don't need to + do this for splits because of the way splits are handled; if they + happen behind us, we'll automatically go to the next page, and if + they happen in front of us, we're not affected by them. For + insertions, if we inserted a tuple behind the current scan location + on the current scan page, we move one space ahead. diff --git a/src/backend/access/nbtree/nbtcompare.c b/src/backend/access/nbtree/nbtcompare.c new file mode 100644 index 0000000000..e567b3c44c --- /dev/null +++ b/src/backend/access/nbtree/nbtcompare.c @@ -0,0 +1,173 @@ +/*------------------------------------------------------------------------- + * + * btcompare.c-- + * Comparison functions for btree access method. + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtcompare.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $ + * + * NOTES + * These functions are stored in pg_amproc. For each operator class + * defined on btrees, they compute + * + * compare(a, b): + * < 0 if a < b, + * = 0 if a == b, + * > 0 if a > b. + *------------------------------------------------------------------------- + */ +#include <string.h> +#include "postgres.h" +#include "utils/nabstime.h" + +int32 +btint2cmp(int16 a, int16 b) +{ + return ((int32) (a - b)); +} + +int32 +btint4cmp(int32 a, int32 b) +{ + return (a - b); +} + +int32 +btint24cmp(int16 a, int32 b) +{ + return (((int32) a) - b); +} + +int32 +btint42cmp(int32 a, int16 b) +{ + return (a - ((int32) b)); +} + +int32 +btfloat4cmp(float32 a, float32 b) +{ + if (*a > *b) + return (1); + else if (*a == *b) + return (0); + else + return (-1); +} + +int32 +btfloat8cmp(float64 a, float64 b) +{ + if (*a > *b) + return (1); + else if (*a == *b) + return (0); + else + return (-1); +} + +int32 +btoidcmp(Oid a, Oid b) +{ + if (a > b) + return (1); + else if (a == b) + return (0); + else + return (-1); +} + +int32 +btabstimecmp(AbsoluteTime a, AbsoluteTime b) +{ + if (AbsoluteTimeIsBefore(a, b)) + return (1); + else if (AbsoluteTimeIsBefore(b, a)) + return (-1); + else + return (0); +} + +int32 +btcharcmp(char a, char b) +{ + return ((int32) (a - b)); +} + +int32 +btchar2cmp(uint16 a, uint16 b) +{ + return (strncmp((char *) &a, (char *) &b, 2)); +} + +int32 +btchar4cmp(uint32 a, uint32 b) +{ + return (strncmp((char *) &a, (char *) &b, 4)); +} + +int32 +btchar8cmp(char *a, char *b) +{ + return (strncmp(a, b, 8)); +} + +int32 +btchar16cmp(char *a, char *b) +{ + return (strncmp(a, b, 16)); +} + +int32 +btnamecmp(NameData *a, NameData *b) +{ + return (strncmp(a->data, b->data, NAMEDATALEN)); +} + +int32 +bttextcmp(struct varlena *a, struct varlena *b) +{ + char *ap, *bp; + int len; + int res; + + ap = VARDATA(a); + bp = VARDATA(b); + + /* len is the length of the shorter of the two strings */ + if ((len = VARSIZE(a)) > VARSIZE(b)) + len = VARSIZE(b); + + /* len includes the four bytes in which string length is stored */ + len -= sizeof(VARSIZE(a)); + + /* + * If the two strings differ in the first len 
bytes, or if they're + * the same in the first len bytes and they're both len bytes long, + * we're done. + */ + + res = 0; + if (len > 0) { + do { + res = (int) (*ap++ - *bp++); + len--; + } while (res == 0 && len != 0); + } + + if (res != 0 || VARSIZE(a) == VARSIZE(b)) + return (res); + + /* + * The two strings are the same in the first len bytes, and they + * are of different lengths. + */ + + if (VARSIZE(a) < VARSIZE(b)) + return (-1); + else + return (1); +} diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c new file mode 100644 index 0000000000..536c0aa385 --- /dev/null +++ b/src/backend/access/nbtree/nbtinsert.c @@ -0,0 +1,831 @@ +/*------------------------------------------------------------------------- + * + * btinsert.c-- + * Item insertion in Lehman and Yao btrees for Postgres. + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "storage/bufmgr.h" +#include "storage/bufpage.h" + +#include "utils/elog.h" +#include "utils/palloc.h" +#include "utils/rel.h" +#include "utils/excid.h" + +#include "access/heapam.h" +#include "access/genam.h" +#include "access/nbtree.h" + +static InsertIndexResult _bt_insertonpg(Relation rel, Buffer buf, BTStack stack, int keysz, ScanKey scankey, BTItem btitem, BTItem afteritem); +static Buffer _bt_split(Relation rel, Buffer buf); +static OffsetNumber _bt_findsplitloc(Relation rel, Page page, OffsetNumber start, OffsetNumber maxoff, Size llimit); +static void _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf); +static OffsetNumber _bt_pgaddtup(Relation rel, Buffer buf, int keysz, ScanKey itup_scankey, Size itemsize, BTItem btitem, BTItem afteritem); +static bool _bt_goesonpg(Relation rel, Buffer buf, Size keysz, ScanKey scankey, BTItem afteritem); +static void _bt_updateitem(Relation rel, Size keysz, Buffer buf, Oid bti_oid, BTItem newItem); + +/* + * _bt_doinsert() -- Handle insertion of a single btitem in the tree. + * + * This routine is called by the public interface routines, btbuild + * and btinsert. By here, btitem is filled in, and has a unique + * (xid, seqno) pair. + */ +InsertIndexResult +_bt_doinsert(Relation rel, BTItem btitem) +{ + ScanKey itup_scankey; + IndexTuple itup; + BTStack stack; + Buffer buf; + BlockNumber blkno; + int natts; + InsertIndexResult res; + + itup = &(btitem->bti_itup); + + /* we need a scan key to do our search, so build one */ + itup_scankey = _bt_mkscankey(rel, itup); + natts = rel->rd_rel->relnatts; + + /* find the page containing this key */ + stack = _bt_search(rel, natts, itup_scankey, &buf); + blkno = BufferGetBlockNumber(buf); + + /* trade in our read lock for a write lock */ + _bt_relbuf(rel, buf, BT_READ); + buf = _bt_getbuf(rel, blkno, BT_WRITE); + + /* + * If the page was split between the time that we surrendered our + * read lock and acquired our write lock, then this page may no + * longer be the right place for the key we want to insert. In this + * case, we need to move right in the tree. See Lehman and Yao for + * an excruciatingly precise description. 
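/*
 * Editor's note on the pg_amproc comparison functions in nbtcompare.c
 * above (illustrative, not part of this commit).  The contract is to
 * return < 0, 0, or > 0.  btint4cmp satisfies it with "a - b", but that
 * subtraction can overflow a signed 32-bit value when the operands have
 * opposite signs (e.g. a = INT_MAX, b = -1), which is undefined
 * behavior in C.  A branching formulation meets the same contract
 * without overflow:
 */
static int32
btint4cmp_noverflow(int32 a, int32 b)
{
    if (a < b)
        return -1;
    if (a > b)
        return 1;
    return 0;
}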
+ */ + + buf = _bt_moveright(rel, buf, natts, itup_scankey, BT_WRITE); + + /* do the insertion */ + res = _bt_insertonpg(rel, buf, stack, natts, itup_scankey, + btitem, (BTItem) NULL); + + /* be tidy */ + _bt_freestack(stack); + _bt_freeskey(itup_scankey); + + return (res); +} + +/* + * _bt_insertonpg() -- Insert a tuple on a particular page in the index. + * + * This recursive procedure does the following things: + * + * + if necessary, splits the target page. + * + finds the right place to insert the tuple (taking into + * account any changes induced by a split). + * + inserts the tuple. + * + if the page was split, pops the parent stack, and finds the + * right place to insert the new child pointer (by walking + * right using information stored in the parent stack). + * + invoking itself with the appropriate tuple for the right + * child page on the parent. + * + * On entry, we must have the right buffer on which to do the + * insertion, and the buffer must be pinned and locked. On return, + * we will have dropped both the pin and the write lock on the buffer. + * + * The locking interactions in this code are critical. You should + * grok Lehman and Yao's paper before making any changes. In addition, + * you need to understand how we disambiguate duplicate keys in this + * implementation, in order to be able to find our location using + * L&Y "move right" operations. Since we may insert duplicate user + * keys, and since these dups may propogate up the tree, we use the + * 'afteritem' parameter to position ourselves correctly for the + * insertion on internal pages. + */ +static InsertIndexResult +_bt_insertonpg(Relation rel, + Buffer buf, + BTStack stack, + int keysz, + ScanKey scankey, + BTItem btitem, + BTItem afteritem) +{ + InsertIndexResult res; + Page page; + Buffer rbuf; + Buffer pbuf; + Page rpage; + ScanKey newskey; + BTItem ritem; + BTPageOpaque rpageop; + BlockNumber rbknum, itup_blkno; + OffsetNumber itup_off; + int itemsz; + InsertIndexResult newres; + BTItem new_item = (BTItem) NULL; + BTItem lowLeftItem; + + page = BufferGetPage(buf); + itemsz = IndexTupleDSize(btitem->bti_itup) + + (sizeof(BTItemData) - sizeof(IndexTupleData)); + + itemsz = DOUBLEALIGN(itemsz); /* be safe, PageAddItem will do this + but we need to be consistent */ + + if (PageGetFreeSpace(page) < itemsz) { + + /* split the buffer into left and right halves */ + rbuf = _bt_split(rel, buf); + + /* which new page (left half or right half) gets the tuple? */ + if (_bt_goesonpg(rel, buf, keysz, scankey, afteritem)) { + /* left page */ + itup_off = _bt_pgaddtup(rel, buf, keysz, scankey, + itemsz, btitem, afteritem); + itup_blkno = BufferGetBlockNumber(buf); + } else { + /* right page */ + itup_off = _bt_pgaddtup(rel, rbuf, keysz, scankey, + itemsz, btitem, afteritem); + itup_blkno = BufferGetBlockNumber(rbuf); + } + + /* + * By here, + * + * + our target page has been split; + * + the original tuple has been inserted; + * + we have write locks on both the old (left half) and new + * (right half) buffers, after the split; and + * + we have the key we want to insert into the parent. + * + * Do the parent insertion. We need to hold onto the locks for + * the child pages until we locate the parent, but we can release + * them before doing the actual insertion (see Lehman and Yao for + * the reasoning). 
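/*
 * Editor's sketch of the Lehman & Yao "move right" step performed by
 * _bt_moveright above (illustrative only; the real routine lives in
 * nbtsearch.c, outside this excerpt, and its exact signature is an
 * assumption here).  If our target page was split after we traded our
 * read lock for a write lock, the key may now belong on a right
 * sibling, so we chase btpo_next pointers until the scan key no longer
 * exceeds the page's high key.
 */
static Buffer
moveright_sketch(Relation rel, Buffer buf, Size keysz, ScanKey skey)
{
    Page page = BufferGetPage(buf);
    BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);

    while (!P_RIGHTMOST(opaque) &&
           _bt_skeycmp(rel, keysz, skey, page,
                       PageGetItemId(page, P_HIKEY),
                       BTGreaterStrategyNumber)) {
        BlockNumber next = opaque->btpo_next;

        _bt_relbuf(rel, buf, BT_WRITE);         /* release before ...  */
        buf = _bt_getbuf(rel, next, BT_WRITE);  /* ... locking sibling */
        page = BufferGetPage(buf);
        opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    }
    return buf;
}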
+ */ + + if (stack == (BTStack) NULL) { + + /* create a new root node and release the split buffers */ + _bt_newroot(rel, buf, rbuf); + _bt_relbuf(rel, buf, BT_WRITE); + _bt_relbuf(rel, rbuf, BT_WRITE); + + } else { + + /* form a index tuple that points at the new right page */ + rbknum = BufferGetBlockNumber(rbuf); + rpage = BufferGetPage(rbuf); + rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage); + + /* + * By convention, the first entry (0) on every + * non-rightmost page is the high key for that page. In + * order to get the lowest key on the new right page, we + * actually look at its second (1) entry. + */ + + if (! P_RIGHTMOST(rpageop)) { + ritem = (BTItem) PageGetItem(rpage, + PageGetItemId(rpage, P_FIRSTKEY)); + } else { + ritem = (BTItem) PageGetItem(rpage, + PageGetItemId(rpage, P_HIKEY)); + } + + /* get a unique btitem for this key */ + new_item = _bt_formitem(&(ritem->bti_itup)); + + ItemPointerSet(&(new_item->bti_itup.t_tid), rbknum, P_HIKEY); + + /* find the parent buffer */ + pbuf = _bt_getstackbuf(rel, stack, BT_WRITE); + + /* + * If the key of new_item is < than the key of the item + * in the parent page pointing to the left page + * (stack->bts_btitem), we have to update the latter key; + * otherwise the keys on the parent page wouldn't be + * monotonically increasing after we inserted the new + * pointer to the right page (new_item). This only + * happens if our left page is the leftmost page and a + * new minimum key had been inserted before, which is not + * reflected in the parent page but didn't matter so + * far. If there are duplicate keys and this new minimum + * key spills over to our new right page, we get an + * inconsistency if we don't update the left key in the + * parent page. + */ + + if (_bt_itemcmp(rel, keysz, stack->bts_btitem, new_item, + BTGreaterStrategyNumber)) { + lowLeftItem = + (BTItem) PageGetItem(page, + PageGetItemId(page, P_FIRSTKEY)); + /* page must have right pointer after split */ + _bt_updateitem(rel, keysz, pbuf, stack->bts_btitem->bti_oid, + lowLeftItem); + } + + /* don't need the children anymore */ + _bt_relbuf(rel, buf, BT_WRITE); + _bt_relbuf(rel, rbuf, BT_WRITE); + + newskey = _bt_mkscankey(rel, &(new_item->bti_itup)); + newres = _bt_insertonpg(rel, pbuf, stack->bts_parent, + keysz, newskey, new_item, + stack->bts_btitem); + + /* be tidy */ + pfree(newres); + pfree(newskey); + pfree(new_item); + } + } else { + itup_off = _bt_pgaddtup(rel, buf, keysz, scankey, + itemsz, btitem, afteritem); + itup_blkno = BufferGetBlockNumber(buf); + + _bt_relbuf(rel, buf, BT_WRITE); + } + + /* by here, the new tuple is inserted */ + res = (InsertIndexResult) palloc(sizeof(InsertIndexResultData)); + ItemPointerSet(&(res->pointerData), itup_blkno, itup_off); + + return (res); +} + +/* + * _bt_split() -- split a page in the btree. + * + * On entry, buf is the page to split, and is write-locked and pinned. + * Returns the new right sibling of buf, pinned and write-locked. The + * pin and lock on buf are maintained. 
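/*
 * Editor's note on the offset conventions used above (assumption: the
 * actual P_HIKEY/P_FIRSTKEY definitions live in nbtree.h, which is not
 * part of this excerpt).  On a non-rightmost page the first line
 * pointer holds the page's high key and user data starts at the
 * second; the rightmost page has no high key, so its data starts at
 * the first line pointer.
 */
#define P_HIKEY     ((OffsetNumber) 1)
#define P_FIRSTKEY  ((OffsetNumber) 2)

static OffsetNumber
first_data_offset(BTPageOpaque opaque)
{
    return P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
}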
+ */ +static Buffer +_bt_split(Relation rel, Buffer buf) +{ + Buffer rbuf; + Page origpage; + Page leftpage, rightpage; + BTPageOpaque ropaque, lopaque, oopaque; + Buffer sbuf; + Page spage; + BTPageOpaque sopaque; + Size itemsz; + ItemId itemid; + BTItem item; + OffsetNumber leftoff, rightoff; + OffsetNumber start; + OffsetNumber maxoff; + OffsetNumber firstright; + OffsetNumber i; + Size llimit; + + rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE); + origpage = BufferGetPage(buf); + leftpage = PageGetTempPage(origpage, sizeof(BTPageOpaqueData)); + rightpage = BufferGetPage(rbuf); + + _bt_pageinit(rightpage, BufferGetPageSize(rbuf)); + _bt_pageinit(leftpage, BufferGetPageSize(buf)); + + /* init btree private data */ + oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage); + lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage); + ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage); + + /* if we're splitting this page, it won't be the root when we're done */ + oopaque->btpo_flags &= ~BTP_ROOT; + lopaque->btpo_flags = ropaque->btpo_flags = oopaque->btpo_flags; + lopaque->btpo_prev = oopaque->btpo_prev; + ropaque->btpo_prev = BufferGetBlockNumber(buf); + lopaque->btpo_next = BufferGetBlockNumber(rbuf); + ropaque->btpo_next = oopaque->btpo_next; + + /* + * If the page we're splitting is not the rightmost page at its + * level in the tree, then the first (0) entry on the page is the + * high key for the page. We need to copy that to the right + * half. Otherwise (meaning the rightmost page case), we should + * treat the line pointers beginning at zero as user data. + * + * We leave a blank space at the start of the line table for the + * left page. We'll come back later and fill it in with the high + * key item we get from the right key. + */ + + leftoff = P_FIRSTKEY; + ropaque->btpo_next = oopaque->btpo_next; + if (! P_RIGHTMOST(oopaque)) { + /* splitting a non-rightmost page, start at the first data item */ + start = P_FIRSTKEY; + + /* copy the original high key to the new page */ + itemid = PageGetItemId(origpage, P_HIKEY); + itemsz = ItemIdGetLength(itemid); + item = (BTItem) PageGetItem(origpage, itemid); + (void) PageAddItem(rightpage, (Item) item, itemsz, P_HIKEY, LP_USED); + rightoff = P_FIRSTKEY; + } else { + /* splitting a rightmost page, "high key" is the first data item */ + start = P_HIKEY; + + /* the new rightmost page will not have a high key */ + rightoff = P_HIKEY; + } + maxoff = PageGetMaxOffsetNumber(origpage); + llimit = PageGetFreeSpace(leftpage) / 2; + firstright = _bt_findsplitloc(rel, origpage, start, maxoff, llimit); + + for (i = start; i <= maxoff; i = OffsetNumberNext(i)) { + itemid = PageGetItemId(origpage, i); + itemsz = ItemIdGetLength(itemid); + item = (BTItem) PageGetItem(origpage, itemid); + + /* decide which page to put it on */ + if (i < firstright) { + (void) PageAddItem(leftpage, (Item) item, itemsz, leftoff, + LP_USED); + leftoff = OffsetNumberNext(leftoff); + } else { + (void) PageAddItem(rightpage, (Item) item, itemsz, rightoff, + LP_USED); + rightoff = OffsetNumberNext(rightoff); + } + } + + /* + * Okay, page has been split, high key on right page is correct. Now + * set the high key on the left page to be the min key on the right + * page. + */ + + if (P_RIGHTMOST(ropaque)) { + itemid = PageGetItemId(rightpage, P_HIKEY); + } else { + itemid = PageGetItemId(rightpage, P_FIRSTKEY); + } + itemsz = ItemIdGetLength(itemid); + item = (BTItem) PageGetItem(rightpage, itemid); + + /* + * We left a hole for the high key on the left page; fill it. 
The + * modal crap is to tell the page manager to put the new item on the + * page and not screw around with anything else. Whoever designed + * this interface has presumably crawled back into the dung heap they + * came from. No one here will admit to it. + */ + + PageManagerModeSet(OverwritePageManagerMode); + (void) PageAddItem(leftpage, (Item) item, itemsz, P_HIKEY, LP_USED); + PageManagerModeSet(ShufflePageManagerMode); + + /* + * By here, the original data page has been split into two new halves, + * and these are correct. The algorithm requires that the left page + * never move during a split, so we copy the new left page back on top + * of the original. Note that this is not a waste of time, since we + * also require (in the page management code) that the center of a + * page always be clean, and the most efficient way to guarantee this + * is just to compact the data by reinserting it into a new left page. + */ + + PageRestoreTempPage(leftpage, origpage); + + /* write these guys out */ + _bt_wrtnorelbuf(rel, rbuf); + _bt_wrtnorelbuf(rel, buf); + + /* + * Finally, we need to grab the right sibling (if any) and fix the + * prev pointer there. We are guaranteed that this is deadlock-free + * since no other writer will be moving holding a lock on that page + * and trying to move left, and all readers release locks on a page + * before trying to fetch its neighbors. + */ + + if (! P_RIGHTMOST(ropaque)) { + sbuf = _bt_getbuf(rel, ropaque->btpo_next, BT_WRITE); + spage = BufferGetPage(sbuf); + sopaque = (BTPageOpaque) PageGetSpecialPointer(spage); + sopaque->btpo_prev = BufferGetBlockNumber(rbuf); + + /* write and release the old right sibling */ + _bt_wrtbuf(rel, sbuf); + } + + /* split's done */ + return (rbuf); +} + +/* + * _bt_findsplitloc() -- find a safe place to split a page. + * + * In order to guarantee the proper handling of searches for duplicate + * keys, the first duplicate in the chain must either be the first + * item on the page after the split, or the entire chain must be on + * one of the two pages. That is, + * [1 2 2 2 3 4 5] + * must become + * [1] [2 2 2 3 4 5] + * or + * [1 2 2 2] [3 4 5] + * but not + * [1 2 2] [2 3 4 5]. + * However, + * [2 2 2 2 2 3 4] + * may be split as + * [2 2 2 2] [2 3 4]. + */ +static OffsetNumber +_bt_findsplitloc(Relation rel, + Page page, + OffsetNumber start, + OffsetNumber maxoff, + Size llimit) +{ + OffsetNumber i; + OffsetNumber saferight; + ItemId nxtitemid, safeitemid; + BTItem safeitem, nxtitem; + IndexTuple safetup, nxttup; + Size nbytes; + TupleDesc itupdesc; + int natts; + int attno; + Datum attsafe; + Datum attnext; + bool null; + + itupdesc = RelationGetTupleDescriptor(rel); + natts = rel->rd_rel->relnatts; + + saferight = start; + safeitemid = PageGetItemId(page, saferight); + nbytes = ItemIdGetLength(safeitemid) + sizeof(ItemIdData); + safeitem = (BTItem) PageGetItem(page, safeitemid); + safetup = &(safeitem->bti_itup); + + i = OffsetNumberNext(start); + + while (nbytes < llimit) { + + /* check the next item on the page */ + nxtitemid = PageGetItemId(page, i); + nbytes += (ItemIdGetLength(nxtitemid) + sizeof(ItemIdData)); + nxtitem = (BTItem) PageGetItem(page, nxtitemid); + nxttup = &(nxtitem->bti_itup); + + /* test against last known safe item */ + for (attno = 1; attno <= natts; attno++) { + attsafe = index_getattr(safetup, attno, itupdesc, &null); + attnext = index_getattr(nxttup, attno, itupdesc, &null); + + /* + * If the tuple we're looking at isn't equal to the last safe one + * we saw, then it's our new safe tuple. 
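/*
 * Editor's model of the split rule documented above (plain ints stand
 * in for index tuples; this mirrors the shape of _bt_findsplitloc, it
 * is not the real code).  We walk items until the left-page size limit
 * is reached, recording a candidate split point only where the key
 * differs from the last "safe" one, so a run of duplicates is never
 * cut in the middle.
 */
static int
findsplit_model(const int *keys, int nkeys, int llimit)
{
    int saferight = 0;        /* first item destined for the right page */
    int safe = keys[0];       /* assumes nkeys > 0 */
    int nbytes = 1;           /* pretend every item costs one unit */
    int i;

    for (i = 1; nbytes < llimit && i < nkeys; i++, nbytes++) {
        if (keys[i] != safe) {          /* new value: safe to split here */
            safe = keys[i];
            saferight = i;
        }
    }

    /* a duplicate run filling the whole left half may be cut after all */
    if (saferight == 0)
        saferight = i;
    return saferight;
}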
+ */ + + if (!_bt_invokestrat(rel, attno, BTEqualStrategyNumber, + attsafe, attnext)) { + safetup = nxttup; + saferight = i; + + /* break is for the attno for loop */ + break; + } + } + i = OffsetNumberNext(i); + } + + /* + * If the chain of dups starts at the beginning of the page and extends + * past the halfway mark, we can split it in the middle. + */ + + if (saferight == start) + saferight = i; + + return (saferight); +} + +/* + * _bt_newroot() -- Create a new root page for the index. + * + * We've just split the old root page and need to create a new one. + * In order to do this, we add a new root page to the file, then lock + * the metadata page and update it. This is guaranteed to be deadlock- + * free, because all readers release their locks on the metadata page + * before trying to lock the root, and all writers lock the root before + * trying to lock the metadata page. We have a write lock on the old + * root page, so we have not introduced any cycles into the waits-for + * graph. + * + * On entry, lbuf (the old root) and rbuf (its new peer) are write- + * locked. We don't drop the locks in this routine; that's done by + * the caller. On exit, a new root page exists with entries for the + * two new children. The new root page is neither pinned nor locked. + */ +static void +_bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) +{ + Buffer rootbuf; + Page lpage, rpage, rootpage; + BlockNumber lbkno, rbkno; + BlockNumber rootbknum; + BTPageOpaque rootopaque; + ItemId itemid; + BTItem item; + Size itemsz; + BTItem new_item; + + /* get a new root page */ + rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE); + rootpage = BufferGetPage(rootbuf); + _bt_pageinit(rootpage, BufferGetPageSize(rootbuf)); + + /* set btree special data */ + rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage); + rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE; + rootopaque->btpo_flags |= BTP_ROOT; + + /* + * Insert the internal tuple pointers. + */ + + lbkno = BufferGetBlockNumber(lbuf); + rbkno = BufferGetBlockNumber(rbuf); + lpage = BufferGetPage(lbuf); + rpage = BufferGetPage(rbuf); + + /* + * step over the high key on the left page while building the + * left page pointer. + */ + itemid = PageGetItemId(lpage, P_FIRSTKEY); + itemsz = ItemIdGetLength(itemid); + item = (BTItem) PageGetItem(lpage, itemid); + new_item = _bt_formitem(&(item->bti_itup)); + ItemPointerSet(&(new_item->bti_itup.t_tid), lbkno, P_FIRSTKEY); + + /* + * insert the left page pointer into the new root page. the root + * page is the rightmost page on its level so the "high key" item + * is the first data item. + */ + (void) PageAddItem(rootpage, (Item) new_item, itemsz, P_HIKEY, LP_USED); + pfree(new_item); + + /* + * the right page is the rightmost page on the second level, so + * the "high key" item is the first data item on that page as well. + */ + itemid = PageGetItemId(rpage, P_HIKEY); + itemsz = ItemIdGetLength(itemid); + item = (BTItem) PageGetItem(rpage, itemid); + new_item = _bt_formitem(&(item->bti_itup)); + ItemPointerSet(&(new_item->bti_itup.t_tid), rbkno, P_HIKEY); + + /* + * insert the right page pointer into the new root page. + */ + (void) PageAddItem(rootpage, (Item) new_item, itemsz, P_FIRSTKEY, LP_USED); + pfree(new_item); + + /* write and let go of the root buffer */ + rootbknum = BufferGetBlockNumber(rootbuf); + _bt_wrtbuf(rel, rootbuf); + + /* update metadata page with new root block number */ + _bt_metaproot(rel, rootbknum); +} + +/* + * _bt_pgaddtup() -- add a tuple to a particular page in the index. 
+ * + * This routine adds the tuple to the page as requested, and keeps the + * write lock and reference associated with the page's buffer. It is + * an error to call pgaddtup() without a write lock and reference. If + * afteritem is non-null, it's the item that we expect our new item + * to follow. Otherwise, we do a binary search for the correct place + * and insert the new item there. + */ +static OffsetNumber +_bt_pgaddtup(Relation rel, + Buffer buf, + int keysz, + ScanKey itup_scankey, + Size itemsize, + BTItem btitem, + BTItem afteritem) +{ + OffsetNumber itup_off; + OffsetNumber first; + Page page; + BTPageOpaque opaque; + BTItem chkitem; + Oid afteroid; + + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + first = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY; + + if (afteritem == (BTItem) NULL) { + itup_off = _bt_binsrch(rel, buf, keysz, itup_scankey, BT_INSERTION); + } else { + afteroid = afteritem->bti_oid; + itup_off = first; + + do { + chkitem = + (BTItem) PageGetItem(page, PageGetItemId(page, itup_off)); + itup_off = OffsetNumberNext(itup_off); + } while (chkitem->bti_oid != afteroid); + } + + (void) PageAddItem(page, (Item) btitem, itemsize, itup_off, LP_USED); + + /* write the buffer, but hold our lock */ + _bt_wrtnorelbuf(rel, buf); + + return (itup_off); +} + +/* + * _bt_goesonpg() -- Does a new tuple belong on this page? + * + * This is part of the complexity introduced by allowing duplicate + * keys into the index. The tuple belongs on this page if: + * + * + there is no page to the right of this one; or + * + it is less than the high key on the page; or + * + the item it is to follow ("afteritem") appears on this + * page. + */ +static bool +_bt_goesonpg(Relation rel, + Buffer buf, + Size keysz, + ScanKey scankey, + BTItem afteritem) +{ + Page page; + ItemId hikey; + BTPageOpaque opaque; + BTItem chkitem; + OffsetNumber offnum, maxoff; + Oid afteroid; + bool found; + + page = BufferGetPage(buf); + + /* no right neighbor? */ + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + if (P_RIGHTMOST(opaque)) + return (true); + + /* + * this is a non-rightmost page, so it must have a high key item. + * + * If the scan key is < the high key (the min key on the next page), + * then it for sure belongs here. + */ + hikey = PageGetItemId(page, P_HIKEY); + if (_bt_skeycmp(rel, keysz, scankey, page, hikey, BTLessStrategyNumber)) + return (true); + + /* + * If the scan key is > the high key, then it for sure doesn't belong + * here. + */ + + if (_bt_skeycmp(rel, keysz, scankey, page, hikey, BTGreaterStrategyNumber)) + return (false); + + /* + * If we have no adjacency information, and the item is equal to the + * high key on the page (by here it is), then the item does not belong + * on this page. + */ + + if (afteritem == (BTItem) NULL) + return (false); + + /* damn, have to work for it. i hate that. */ + afteroid = afteritem->bti_oid; + maxoff = PageGetMaxOffsetNumber(page); + + /* + * Search the entire page for the afteroid. We need to do this, rather + * than doing a binary search and starting from there, because if the + * key we're searching for is the leftmost key in the tree at this + * level, then a binary search will do the wrong thing. Splits are + * pretty infrequent, so the cost isn't as bad as it could be. 
+ */ + + found = false; + for (offnum = P_FIRSTKEY; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) { + chkitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum)); + if (chkitem->bti_oid == afteroid) { + found = true; + break; + } + } + + return (found); +} + +/* + * _bt_itemcmp() -- compare item1 to item2 using a requested + * strategy (<, <=, =, >=, >) + * + */ +bool +_bt_itemcmp(Relation rel, + Size keysz, + BTItem item1, + BTItem item2, + StrategyNumber strat) +{ + TupleDesc tupDes; + IndexTuple indexTuple1, indexTuple2; + Datum attrDatum1, attrDatum2; + int i; + bool isNull; + bool compare; + + tupDes = RelationGetTupleDescriptor(rel); + indexTuple1 = &(item1->bti_itup); + indexTuple2 = &(item2->bti_itup); + + for (i = 1; i <= keysz; i++) { + attrDatum1 = index_getattr(indexTuple1, i, tupDes, &isNull); + attrDatum2 = index_getattr(indexTuple2, i, tupDes, &isNull); + compare = _bt_invokestrat(rel, i, strat, attrDatum1, attrDatum2); + if (!compare) { + return (false); + } + } + return (true); +} + +/* + * _bt_updateitem() -- updates the key of the item identified by the + * oid with the key of newItem (done in place) + * + */ +static void +_bt_updateitem(Relation rel, + Size keysz, + Buffer buf, + Oid bti_oid, + BTItem newItem) +{ + Page page; + OffsetNumber maxoff; + OffsetNumber i; + ItemPointerData itemPtrData; + BTItem item; + IndexTuple oldIndexTuple, newIndexTuple; + + page = BufferGetPage(buf); + maxoff = PageGetMaxOffsetNumber(page); + + /* locate item on the page */ + i = P_HIKEY; + do { + item = (BTItem) PageGetItem(page, PageGetItemId(page, i)); + i = OffsetNumberNext(i); + } while (i <= maxoff && item->bti_oid != bti_oid); + + /* this should never happen (in theory) */ + if (item->bti_oid != bti_oid) { + elog(FATAL, "_bt_getstackbuf was lying!!"); + } + + oldIndexTuple = &(item->bti_itup); + newIndexTuple = &(newItem->bti_itup); + + /* keep the original item pointer */ + ItemPointerCopy(&(oldIndexTuple->t_tid), &itemPtrData); + CopyIndexTuple(newIndexTuple, &oldIndexTuple); + ItemPointerCopy(&itemPtrData, &(oldIndexTuple->t_tid)); +} diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c new file mode 100644 index 0000000000..ce411a80d1 --- /dev/null +++ b/src/backend/access/nbtree/nbtpage.c @@ -0,0 +1,523 @@ +/*------------------------------------------------------------------------- + * + * btpage.c-- + * BTree-specific page management code for the Postgres btree access + * method. + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $ + * + * NOTES + * Postgres btree pages look like ordinary relation pages. The opaque + * data at high addresses includes pointers to left and right siblings + * and flag data describing page state. The first page in a btree, page + * zero, is special -- it stores meta-information describing the tree. + * Pages one and higher store the actual tree data. 
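/*
 * Editor's sketch of the per-page "opaque" special space described
 * above.  Assumption: the real BTPageOpaqueData is declared in
 * nbtree.h, outside this excerpt; the fields shown are exactly the
 * ones this file manipulates (btpo_prev, btpo_next, btpo_flags).
 */
typedef struct BTPageOpaqueDataSketch {
    BlockNumber btpo_prev;    /* left sibling, or P_NONE */
    BlockNumber btpo_next;    /* right sibling, or P_NONE */
    uint16      btpo_flags;   /* BTP_LEAF, BTP_ROOT, BTP_META, ... */
} BTPageOpaqueDataSketch;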
+ * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "storage/bufmgr.h" +#include "storage/bufpage.h" + +#include "utils/elog.h" +#include "utils/rel.h" +#include "utils/excid.h" + +#include "access/genam.h" +#include "access/nbtree.h" + +#define BTREE_METAPAGE 0 +#define BTREE_MAGIC 0x053162 +#define BTREE_VERSION 0 + +typedef struct BTMetaPageData { + uint32 btm_magic; + uint32 btm_version; + BlockNumber btm_root; +} BTMetaPageData; + +#define BTPageGetMeta(p) \ + ((BTMetaPageData *) &((PageHeader) p)->pd_linp[0]) + +extern bool BuildingBtree; + +/* + * We use high-concurrency locking on btrees. There are two cases in + * which we don't do locking. One is when we're building the btree. + * Since the creating transaction has not committed, no one can see + * the index, and there's no reason to share locks. The second case + * is when we're just starting up the database system. We use some + * special-purpose initialization code in the relation cache manager + * (see utils/cache/relcache.c) to allow us to do indexed scans on + * the system catalogs before we'd normally be able to. This happens + * before the lock table is fully initialized, so we can't use it. + * Strictly speaking, this violates 2pl, but we don't do 2pl on the + * system catalogs anyway, so I declare this to be okay. + */ + +#define USELOCKING (!BuildingBtree && !IsInitProcessingMode()) + +/* + * _bt_metapinit() -- Initialize the metadata page of a btree. + */ +void +_bt_metapinit(Relation rel) +{ + Buffer buf; + Page pg; + int nblocks; + BTMetaPageData metad; + BTPageOpaque op; + + /* can't be sharing this with anyone, now... */ + if (USELOCKING) + RelationSetLockForWrite(rel); + + if ((nblocks = RelationGetNumberOfBlocks(rel)) != 0) { + elog(WARN, "Cannot initialize non-empty btree %s", + RelationGetRelationName(rel)); + } + + buf = ReadBuffer(rel, P_NEW); + pg = BufferGetPage(buf); + _bt_pageinit(pg, BufferGetPageSize(buf)); + + metad.btm_magic = BTREE_MAGIC; + metad.btm_version = BTREE_VERSION; + metad.btm_root = P_NONE; + memmove((char *) BTPageGetMeta(pg), (char *) &metad, sizeof(metad)); + + op = (BTPageOpaque) PageGetSpecialPointer(pg); + op->btpo_flags = BTP_META; + + WriteBuffer(buf); + + /* all done */ + if (USELOCKING) + RelationUnsetLockForWrite(rel); +} + +/* + * _bt_checkmeta() -- Verify that the metadata stored in a btree are + * reasonable. + */ +void +_bt_checkmeta(Relation rel) +{ + Buffer metabuf; + Page metap; + BTMetaPageData *metad; + BTPageOpaque op; + int nblocks; + + /* if the relation is empty, this is init time; don't complain */ + if ((nblocks = RelationGetNumberOfBlocks(rel)) == 0) + return; + + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); + metap = BufferGetPage(metabuf); + op = (BTPageOpaque) PageGetSpecialPointer(metap); + if (!(op->btpo_flags & BTP_META)) { + elog(WARN, "Invalid metapage for index %s", + RelationGetRelationName(rel)); + } + metad = BTPageGetMeta(metap); + + if (metad->btm_magic != BTREE_MAGIC) { + elog(WARN, "Index %s is not a btree", + RelationGetRelationName(rel)); + } + + if (metad->btm_version != BTREE_VERSION) { + elog(WARN, "Version mismatch on %s: version %d file, version %d code", + RelationGetRelationName(rel), + metad->btm_version, BTREE_VERSION); + } + + _bt_relbuf(rel, metabuf, BT_READ); +} + +/* + * _bt_getroot() -- Get the root page of the btree. 
+ * + * Since the root page can move around the btree file, we have to read + * its location from the metadata page, and then read the root page + * itself. If no root page exists yet, we have to create one. The + * standard class of race conditions exists here; I think I covered + * them all in the Hopi Indian rain dance of lock requests below. + * + * We pass in the access type (BT_READ or BT_WRITE), and return the + * root page's buffer with the appropriate lock type set. Reference + * count on the root page gets bumped by ReadBuffer. The metadata + * page is unlocked and unreferenced by this process when this routine + * returns. + */ +Buffer +_bt_getroot(Relation rel, int access) +{ + Buffer metabuf; + Page metapg; + BTPageOpaque metaopaque; + Buffer rootbuf; + Page rootpg; + BTPageOpaque rootopaque; + BlockNumber rootblkno; + BTMetaPageData *metad; + + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); + metapg = BufferGetPage(metabuf); + metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg); + Assert(metaopaque->btpo_flags & BTP_META); + metad = BTPageGetMeta(metapg); + + /* if no root page initialized yet, do it */ + if (metad->btm_root == P_NONE) { + + /* turn our read lock in for a write lock */ + _bt_relbuf(rel, metabuf, BT_READ); + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE); + metapg = BufferGetPage(metabuf); + metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg); + Assert(metaopaque->btpo_flags & BTP_META); + metad = BTPageGetMeta(metapg); + + /* + * Race condition: if someone else initialized the metadata between + * the time we released the read lock and acquired the write lock, + * above, we want to avoid doing it again. + */ + + if (metad->btm_root == P_NONE) { + + /* + * Get, initialize, write, and leave a lock of the appropriate + * type on the new root page. Since this is the first page in + * the tree, it's a leaf. + */ + + rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE); + rootblkno = BufferGetBlockNumber(rootbuf); + rootpg = BufferGetPage(rootbuf); + metad->btm_root = rootblkno; + _bt_pageinit(rootpg, BufferGetPageSize(rootbuf)); + rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpg); + rootopaque->btpo_flags |= (BTP_LEAF | BTP_ROOT); + _bt_wrtnorelbuf(rel, rootbuf); + + /* swap write lock for read lock, if appropriate */ + if (access != BT_WRITE) { + _bt_setpagelock(rel, rootblkno, BT_READ); + _bt_unsetpagelock(rel, rootblkno, BT_WRITE); + } + + /* okay, metadata is correct */ + _bt_wrtbuf(rel, metabuf); + } else { + + /* + * Metadata initialized by someone else. In order to guarantee + * no deadlocks, we have to release the metadata page and start + * all over again. + */ + + _bt_relbuf(rel, metabuf, BT_WRITE); + return (_bt_getroot(rel, access)); + } + } else { + rootbuf = _bt_getbuf(rel, metad->btm_root, access); + + /* done with the meta page */ + _bt_relbuf(rel, metabuf, BT_READ); + } + + /* + * Race condition: If the root page split between the time we looked + * at the metadata page and got the root buffer, then we got the wrong + * buffer. + */ + + rootpg = BufferGetPage(rootbuf); + rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpg); + if (!(rootopaque->btpo_flags & BTP_ROOT)) { + + /* it happened, try again */ + _bt_relbuf(rel, rootbuf, access); + return (_bt_getroot(rel, access)); + } + + /* + * By here, we have a correct lock on the root block, its reference + * count is correct, and we have no lock set on the metadata page. + * Return the root block. 
+ */ + + return (rootbuf); +} + +/* + * _bt_getbuf() -- Get a buffer by block number for read or write. + * + * When this routine returns, the appropriate lock is set on the + * requested buffer its reference count is correct. + */ +Buffer +_bt_getbuf(Relation rel, BlockNumber blkno, int access) +{ + Buffer buf; + Page page; + + /* + * If we want a new block, we can't set a lock of the appropriate type + * until we've instantiated the buffer. + */ + + if (blkno != P_NEW) { + if (access == BT_WRITE) + _bt_setpagelock(rel, blkno, BT_WRITE); + else + _bt_setpagelock(rel, blkno, BT_READ); + + buf = ReadBuffer(rel, blkno); + } else { + buf = ReadBuffer(rel, blkno); + blkno = BufferGetBlockNumber(buf); + page = BufferGetPage(buf); + _bt_pageinit(page, BufferGetPageSize(buf)); + + if (access == BT_WRITE) + _bt_setpagelock(rel, blkno, BT_WRITE); + else + _bt_setpagelock(rel, blkno, BT_READ); + } + + /* ref count and lock type are correct */ + return (buf); +} + +/* + * _bt_relbuf() -- release a locked buffer. + */ +void +_bt_relbuf(Relation rel, Buffer buf, int access) +{ + BlockNumber blkno; + + blkno = BufferGetBlockNumber(buf); + + /* access had better be one of read or write */ + if (access == BT_WRITE) + _bt_unsetpagelock(rel, blkno, BT_WRITE); + else + _bt_unsetpagelock(rel, blkno, BT_READ); + + ReleaseBuffer(buf); +} + +/* + * _bt_wrtbuf() -- write a btree page to disk. + * + * This routine releases the lock held on the buffer and our reference + * to it. It is an error to call _bt_wrtbuf() without a write lock + * or a reference to the buffer. + */ +void +_bt_wrtbuf(Relation rel, Buffer buf) +{ + BlockNumber blkno; + + blkno = BufferGetBlockNumber(buf); + WriteBuffer(buf); + _bt_unsetpagelock(rel, blkno, BT_WRITE); +} + +/* + * _bt_wrtnorelbuf() -- write a btree page to disk, but do not release + * our reference or lock. + * + * It is an error to call _bt_wrtnorelbuf() without a write lock + * or a reference to the buffer. + */ +void +_bt_wrtnorelbuf(Relation rel, Buffer buf) +{ + BlockNumber blkno; + + blkno = BufferGetBlockNumber(buf); + WriteNoReleaseBuffer(buf); +} + +/* + * _bt_pageinit() -- Initialize a new page. + */ +void +_bt_pageinit(Page page, Size size) +{ + /* + * Cargo-cult programming -- don't really need this to be zero, but + * creating new pages is an infrequent occurrence and it makes me feel + * good when I know they're empty. + */ + + memset(page, 0, size); + + PageInit(page, size, sizeof(BTPageOpaqueData)); +} + +/* + * _bt_metaproot() -- Change the root page of the btree. + * + * Lehman and Yao require that the root page move around in order to + * guarantee deadlock-free short-term, fine-granularity locking. When + * we split the root page, we record the new parent in the metadata page + * for the relation. This routine does the work. + * + * No direct preconditions, but if you don't have the a write lock on + * at least the old root page when you call this, you're making a big + * mistake. On exit, metapage data is correct and we no longer have + * a reference to or lock on the metapage. 
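/*
 * Editor's usage sketch for the buffer routines above (hypothetical
 * caller; error handling omitted).  The discipline throughout this
 * module: _bt_getbuf takes the page lock and a pin, the caller works
 * on the page only while both are held, and _bt_relbuf gives both
 * back.
 */
static void
visit_page_sketch(Relation rel, BlockNumber blkno)
{
    Buffer buf = _bt_getbuf(rel, blkno, BT_READ);   /* lock + pin */
    Page page = BufferGetPage(buf);
    OffsetNumber maxoff = PageGetMaxOffsetNumber(page);

    /* ... examine items 1..maxoff while the read lock is held ... */
    (void) maxoff;

    _bt_relbuf(rel, buf, BT_READ);                  /* unlock + unpin */
}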
+ */ +void +_bt_metaproot(Relation rel, BlockNumber rootbknum) +{ + Buffer metabuf; + Page metap; + BTPageOpaque metaopaque; + BTMetaPageData *metad; + + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE); + metap = BufferGetPage(metabuf); + metaopaque = (BTPageOpaque) PageGetSpecialPointer(metap); + Assert(metaopaque->btpo_flags & BTP_META); + metad = BTPageGetMeta(metap); + metad->btm_root = rootbknum; + _bt_wrtbuf(rel, metabuf); +} + +/* + * _bt_getstackbuf() -- Walk back up the tree one step, and find the item + * we last looked at in the parent. + * + * This is possible because we save a bit image of the last item + * we looked at in the parent, and the update algorithm guarantees + * that if items above us in the tree move, they only move right. + */ +Buffer +_bt_getstackbuf(Relation rel, BTStack stack, int access) +{ + Buffer buf; + BlockNumber blkno; + OffsetNumber start, offnum, maxoff; + OffsetNumber i; + Page page; + ItemId itemid; + BTItem item; + BTPageOpaque opaque; + + blkno = stack->bts_blkno; + buf = _bt_getbuf(rel, blkno, access); + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + maxoff = PageGetMaxOffsetNumber(page); + + if (maxoff >= stack->bts_offset) { + itemid = PageGetItemId(page, stack->bts_offset); + item = (BTItem) PageGetItem(page, itemid); + + /* if the item is where we left it, we're done */ + if (item->bti_oid == stack->bts_btitem->bti_oid) + return (buf); + + /* if the item has just moved right on this page, we're done */ + for (i = OffsetNumberNext(stack->bts_offset); + i <= maxoff; + i = OffsetNumberNext(i)) { + itemid = PageGetItemId(page, i); + item = (BTItem) PageGetItem(page, itemid); + + /* if the item is where we left it, we're done */ + if (item->bti_oid == stack->bts_btitem->bti_oid) + return (buf); + } + } + + /* by here, the item we're looking for moved right at least one page */ + for (;;) { + blkno = opaque->btpo_next; + if (P_RIGHTMOST(opaque)) + elog(FATAL, "my bits moved right off the end of the world!"); + + _bt_relbuf(rel, buf, access); + buf = _bt_getbuf(rel, blkno, access); + page = BufferGetPage(buf); + maxoff = PageGetMaxOffsetNumber(page); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* if we have a right sibling, step over the high key */ + start = P_RIGHTMOST(opaque) ? 
P_HIKEY : P_FIRSTKEY; + + /* see if it's on this page */ + for (offnum = start; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) { + itemid = PageGetItemId(page, offnum); + item = (BTItem) PageGetItem(page, itemid); + if (item->bti_oid == stack->bts_btitem->bti_oid) + return (buf); + } + } +} + +void +_bt_setpagelock(Relation rel, BlockNumber blkno, int access) +{ + ItemPointerData iptr; + + if (USELOCKING) { + ItemPointerSet(&iptr, blkno, P_HIKEY); + + if (access == BT_WRITE) + RelationSetSingleWLockPage(rel, &iptr); + else + RelationSetSingleRLockPage(rel, &iptr); + } +} + +void +_bt_unsetpagelock(Relation rel, BlockNumber blkno, int access) +{ + ItemPointerData iptr; + + if (USELOCKING) { + ItemPointerSet(&iptr, blkno, P_HIKEY); + + if (access == BT_WRITE) + RelationUnsetSingleWLockPage(rel, &iptr); + else + RelationUnsetSingleRLockPage(rel, &iptr); + } +} + +void +_bt_pagedel(Relation rel, ItemPointer tid) +{ + Buffer buf; + Page page; + BlockNumber blkno; + OffsetNumber offno; + + blkno = ItemPointerGetBlockNumber(tid); + offno = ItemPointerGetOffsetNumber(tid); + + buf = _bt_getbuf(rel, blkno, BT_WRITE); + page = BufferGetPage(buf); + + PageIndexTupleDelete(page, offno); + + /* write the buffer and release the lock */ + _bt_wrtbuf(rel, buf); +} diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c new file mode 100644 index 0000000000..0601611996 --- /dev/null +++ b/src/backend/access/nbtree/nbtree.c @@ -0,0 +1,516 @@ +/*------------------------------------------------------------------------- + * + * btree.c-- + * Implementation of Lehman and Yao's btree management algorithm for + * Postgres. + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $ + * + * NOTES + * This file contains only the public interface routines. + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "storage/bufmgr.h" +#include "storage/bufpage.h" + +#include "utils/elog.h" +#include "utils/palloc.h" +#include "utils/rel.h" +#include "utils/excid.h" + +#include "access/heapam.h" +#include "access/genam.h" +#include "access/sdir.h" +#include "access/nbtree.h" +#include "access/funcindex.h" + +#include "nodes/execnodes.h" +#include "nodes/plannodes.h" + +#include "executor/executor.h" +#include "executor/tuptable.h" + +#include "catalog/index.h" + +bool BuildingBtree = false; +bool FastBuild = false; /* turn this on to make bulk builds work*/ + +/* + * btbuild() -- build a new btree index. + * + * We use a global variable to record the fact that we're creating + * a new index. This is used to avoid high-concurrency locking, + * since the index won't be visible until this transaction commits + * and since building is guaranteed to be single-threaded. 
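/*
 * Editor's sketch of the two insertion paths btbuild chooses between
 * in the body below (hypothetical helper): with FastBuild set, tuples
 * are spooled for a bottom-up bulk build; otherwise each tuple is
 * inserted through the ordinary _bt_doinsert path.
 */
static void
build_insert_sketch(Relation index, BTItem btitem, void *spool)
{
    if (FastBuild) {
        _bt_spool(index, btitem, spool);    /* defer: sort and merge later */
    } else {
        InsertIndexResult r = _bt_doinsert(index, btitem);

        if (r)
            pfree(r);
    }
}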
+ */ +void +btbuild(Relation heap, + Relation index, + int natts, + AttrNumber *attnum, + IndexStrategy istrat, + uint16 pcount, + Datum *params, + FuncIndexInfo *finfo, + PredInfo *predInfo) +{ + HeapScanDesc hscan; + Buffer buffer; + HeapTuple htup; + IndexTuple itup; + TupleDesc htupdesc, itupdesc; + Datum *attdata; + bool *nulls; + InsertIndexResult res; + int nhtups, nitups; + int i; + BTItem btitem; + ExprContext *econtext; + TupleTable tupleTable; + TupleTableSlot *slot; + Oid hrelid, irelid; + Node *pred, *oldPred; + void *spool; + + /* note that this is a new btree */ + BuildingBtree = true; + + pred = predInfo->pred; + oldPred = predInfo->oldPred; + + /* initialize the btree index metadata page (if this is a new index) */ + if (oldPred == NULL) + _bt_metapinit(index); + + /* get tuple descriptors for heap and index relations */ + htupdesc = RelationGetTupleDescriptor(heap); + itupdesc = RelationGetTupleDescriptor(index); + + /* get space for data items that'll appear in the index tuple */ + attdata = (Datum *) palloc(natts * sizeof(Datum)); + nulls = (bool *) palloc(natts * sizeof(bool)); + + /* + * If this is a predicate (partial) index, we will need to evaluate the + * predicate using ExecQual, which requires the current tuple to be in a + * slot of a TupleTable. In addition, ExecQual must have an ExprContext + * referring to that slot. Here, we initialize dummy TupleTable and + * ExprContext objects for this purpose. --Nels, Feb '92 + */ +#ifndef OMIT_PARTIAL_INDEX + if (pred != NULL || oldPred != NULL) { + tupleTable = ExecCreateTupleTable(1); + slot = ExecAllocTableSlot(tupleTable); + econtext = makeNode(ExprContext); + FillDummyExprContext(econtext, slot, htupdesc, InvalidBuffer); + } +#endif /* OMIT_PARTIAL_INDEX */ + + /* start a heap scan */ + hscan = heap_beginscan(heap, 0, NowTimeQual, 0, (ScanKey) NULL); + htup = heap_getnext(hscan, 0, &buffer); + + /* build the index */ + nhtups = nitups = 0; + + if (FastBuild) { + spool = _bt_spoolinit(index, 7); + res = (InsertIndexResult) NULL; + } + + for (; HeapTupleIsValid(htup); htup = heap_getnext(hscan, 0, &buffer)) { + + nhtups++; + + /* + * If oldPred != NULL, this is an EXTEND INDEX command, so skip + * this tuple if it was already in the existing partial index + */ + if (oldPred != NULL) { +#ifndef OMIT_PARTIAL_INDEX + + /*SetSlotContents(slot, htup);*/ + slot->val = htup; + if (ExecQual((List*)oldPred, econtext) == true) { + nitups++; + continue; + } +#endif /* OMIT_PARTIAL_INDEX */ + } + + /* Skip this tuple if it doesn't satisfy the partial-index predicate */ + if (pred != NULL) { +#ifndef OMIT_PARTIAL_INDEX + /* SetSlotContents(slot, htup); */ + slot->val = htup; + if (ExecQual((List*)pred, econtext) == false) + continue; +#endif /* OMIT_PARTIAL_INDEX */ + } + + nitups++; + + /* + * For the current heap tuple, extract all the attributes + * we use in this index, and note which are null. + */ + + for (i = 1; i <= natts; i++) { + int attoff; + bool attnull; + + /* + * Offsets are from the start of the tuple, and are + * zero-based; indices are one-based. The next call + * returns i - 1. That's data hiding for you. + */ + + attoff = AttrNumberGetAttrOffset(i); + attdata[attoff] = GetIndexValue(htup, + htupdesc, + attoff, + attnum, + finfo, + &attnull, + buffer); + nulls[attoff] = (attnull ? 'n' : ' '); + } + + /* form an index tuple and point it at the heap tuple */ + itup = index_formtuple(itupdesc, attdata, nulls); + + /* + * If the single index key is null, we don't insert it into + * the index. 
Btrees support scans on <, <=, =, >=, and >. + * Relational algebra says that A op B (where op is one of the + * operators above) returns null if either A or B is null. This + * means that no qualification used in an index scan could ever + * return true on a null attribute. It also means that indices + * can't be used by ISNULL or NOTNULL scans, but that's an + * artifact of the strategy map architecture chosen in 1986, not + * of the way nulls are handled here. + */ + + if (itup->t_info & INDEX_NULL_MASK) { + pfree(itup); + continue; + } + + itup->t_tid = htup->t_ctid; + btitem = _bt_formitem(itup); + + /* + * if we are doing bottom-up btree build, we insert the index + * into a spool page for subsequent processing. otherwise, we + * insert into the btree. + */ + if (FastBuild) { + _bt_spool(index, btitem, spool); + } else { + res = _bt_doinsert(index, btitem); + } + + pfree(btitem); + pfree(itup); + if (res) { + pfree(res); + } + } + + /* okay, all heap tuples are indexed */ + heap_endscan(hscan); + + if (pred != NULL || oldPred != NULL) { +#ifndef OMIT_PARTIAL_INDEX + ExecDestroyTupleTable(tupleTable, true); + pfree(econtext); +#endif /* OMIT_PARTIAL_INDEX */ + } + + /* + * if we are doing bottom-up btree build, we now have a bunch of + * sorted runs in the spool pages. finish the build by (1) + * merging the runs, (2) inserting the sorted tuples into btree + * pages and (3) building the upper levels. + */ + if (FastBuild) { + _bt_spool(index, (BTItem) NULL, spool); /* flush spool */ + _bt_leafbuild(index, spool); + _bt_spooldestroy(spool); + } + + /* + * Since we just counted the tuples in the heap, we update its + * stats in pg_class to guarantee that the planner takes advantage + * of the index we just created. Finally, only update statistics + * during normal index definitions, not for indices on system catalogs + * created during bootstrap processing. We must close the relations + * before updatings statistics to guarantee that the relcache entries + * are flushed when we increment the command counter in UpdateStats(). + */ + if (IsNormalProcessingMode()) + { + hrelid = heap->rd_id; + irelid = index->rd_id; + heap_close(heap); + index_close(index); + UpdateStats(hrelid, nhtups, true); + UpdateStats(irelid, nitups, false); + if (oldPred != NULL) { + if (nitups == nhtups) pred = NULL; + UpdateIndexPredicate(irelid, oldPred, pred); + } + } + + /* be tidy */ + pfree(nulls); + pfree(attdata); + + /* all done */ + BuildingBtree = false; +} + +/* + * btinsert() -- insert an index tuple into a btree. + * + * Descend the tree recursively, find the appropriate location for our + * new tuple, put it there, set its unique OID as appropriate, and + * return an InsertIndexResult to the caller. + */ +InsertIndexResult +btinsert(Relation rel, IndexTuple itup) +{ + BTItem btitem; + InsertIndexResult res; + + if (itup->t_info & INDEX_NULL_MASK) + return ((InsertIndexResult) NULL); + + btitem = _bt_formitem(itup); + + res = _bt_doinsert(rel, btitem); + pfree(btitem); + + return (res); +} + +/* + * btgettuple() -- Get the next tuple in the scan. + */ +char * +btgettuple(IndexScanDesc scan, ScanDirection dir) +{ + RetrieveIndexResult res; + + /* + * If we've already initialized this scan, we can just advance it + * in the appropriate direction. If we haven't done so yet, we + * call a routine to get the first item in the scan. 
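/*
 * Editor's usage sketch for the scan interface above (hypothetical
 * driver; the real callers are the executor and genam).  btgettuple
 * returns a palloc'd RetrieveIndexResult, or NULL when the scan is
 * exhausted; freeing the result with pfree is this sketch's assumption
 * about the caller's duty.
 */
static void
scan_all_forward_sketch(IndexScanDesc scan)
{
    RetrieveIndexResult res;

    while ((res = (RetrieveIndexResult)
                btgettuple(scan, ForwardScanDirection)) != NULL) {
        /* ... consume the heap and index item pointers in res ... */
        pfree(res);
    }
}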
+ */ + + if (ItemPointerIsValid(&(scan->currentItemData))) + res = _bt_next(scan, dir); + else + res = _bt_first(scan, dir); + + return ((char *) res); +} + +/* + * btbeginscan() -- start a scan on a btree index + */ +char * +btbeginscan(Relation rel, bool fromEnd, uint16 keysz, ScanKey scankey) +{ + IndexScanDesc scan; + StrategyNumber strat; + BTScanOpaque so; + + /* first order the keys in the qualification */ + if (keysz > 1) + _bt_orderkeys(rel, &keysz, scankey); + + /* now get the scan */ + scan = RelationGetIndexScan(rel, fromEnd, keysz, scankey); + so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData)); + so->btso_curbuf = so->btso_mrkbuf = InvalidBuffer; + scan->opaque = so; + + /* finally, be sure that the scan exploits the tree order */ + scan->scanFromEnd = false; + scan->flags = 0x0; + if (keysz > 0) { + strat = _bt_getstrat(scan->relation, 1 /* XXX */, + scankey[0].sk_procedure); + + if (strat == BTLessStrategyNumber + || strat == BTLessEqualStrategyNumber) + scan->scanFromEnd = true; + } else { + scan->scanFromEnd = true; + } + + /* register scan in case we change pages it's using */ + _bt_regscan(scan); + + return ((char *) scan); +} + +/* + * btrescan() -- rescan an index relation + */ +void +btrescan(IndexScanDesc scan, bool fromEnd, ScanKey scankey) +{ + ItemPointer iptr; + BTScanOpaque so; + + so = (BTScanOpaque) scan->opaque; + + /* we hold a read lock on the current page in the scan */ + if (ItemPointerIsValid(iptr = &(scan->currentItemData))) { + _bt_relbuf(scan->relation, so->btso_curbuf, BT_READ); + so->btso_curbuf = InvalidBuffer; + ItemPointerSetInvalid(iptr); + } + + /* and we hold a read lock on the last marked item in the scan */ + if (ItemPointerIsValid(iptr = &(scan->currentMarkData))) { + _bt_relbuf(scan->relation, so->btso_mrkbuf, BT_READ); + so->btso_mrkbuf = InvalidBuffer; + ItemPointerSetInvalid(iptr); + } + + /* reset the scan key */ + if (scan->numberOfKeys > 0) { + memmove(scan->keyData, + scankey, + scan->numberOfKeys * sizeof(ScanKeyData)); + } +} + +void +btmovescan(IndexScanDesc scan, Datum v) +{ + ItemPointer iptr; + BTScanOpaque so; + + so = (BTScanOpaque) scan->opaque; + + /* release any locks we still hold */ + if (ItemPointerIsValid(iptr = &(scan->currentItemData))) { + _bt_relbuf(scan->relation, so->btso_curbuf, BT_READ); + so->btso_curbuf = InvalidBuffer; + ItemPointerSetInvalid(iptr); + } + + scan->keyData[0].sk_argument = v; +} + +/* + * btendscan() -- close down a scan + */ +void +btendscan(IndexScanDesc scan) +{ + ItemPointer iptr; + BTScanOpaque so; + + so = (BTScanOpaque) scan->opaque; + + /* release any locks we still hold */ + if (ItemPointerIsValid(iptr = &(scan->currentItemData))) { + if (BufferIsValid(so->btso_curbuf)) + _bt_relbuf(scan->relation, so->btso_curbuf, BT_READ); + so->btso_curbuf = InvalidBuffer; + ItemPointerSetInvalid(iptr); + } + + if (ItemPointerIsValid(iptr = &(scan->currentMarkData))) { + if (BufferIsValid(so->btso_mrkbuf)) + _bt_relbuf(scan->relation, so->btso_mrkbuf, BT_READ); + so->btso_mrkbuf = InvalidBuffer; + ItemPointerSetInvalid(iptr); + } + + /* don't need scan registered anymore */ + _bt_dropscan(scan); + + /* be tidy */ +#ifdef PERFECT_MMGR + pfree (scan->opaque); +#endif /* PERFECT_MMGR */ +} + +/* + * btmarkpos() -- save current scan position + */ +void +btmarkpos(IndexScanDesc scan) +{ + ItemPointer iptr; + BTScanOpaque so; + + so = (BTScanOpaque) scan->opaque; + + /* release lock on old marked data, if any */ + if (ItemPointerIsValid(iptr = &(scan->currentMarkData))) { + _bt_relbuf(scan->relation, 
so->btso_mrkbuf, BT_READ); + so->btso_mrkbuf = InvalidBuffer; + ItemPointerSetInvalid(iptr); + } + + /* bump lock on currentItemData and copy to currentMarkData */ + if (ItemPointerIsValid(&(scan->currentItemData))) { + so->btso_mrkbuf = _bt_getbuf(scan->relation, + BufferGetBlockNumber(so->btso_curbuf), + BT_READ); + scan->currentMarkData = scan->currentItemData; + } +} + +/* + * btrestrpos() -- restore scan to last saved position + */ +void +btrestrpos(IndexScanDesc scan) +{ + ItemPointer iptr; + BTScanOpaque so; + + so = (BTScanOpaque) scan->opaque; + + /* release lock on current data, if any */ + if (ItemPointerIsValid(iptr = &(scan->currentItemData))) { + _bt_relbuf(scan->relation, so->btso_curbuf, BT_READ); + so->btso_curbuf = InvalidBuffer; + ItemPointerSetInvalid(iptr); + } + + /* bump lock on currentMarkData and copy to currentItemData */ + if (ItemPointerIsValid(&(scan->currentMarkData))) { + so->btso_curbuf = _bt_getbuf(scan->relation, + BufferGetBlockNumber(so->btso_mrkbuf), + BT_READ); + + scan->currentItemData = scan->currentMarkData; + } +} + +/* stubs */ +void +btdelete(Relation rel, ItemPointer tid) +{ + /* adjust any active scans that will be affected by this deletion */ + _bt_adjscans(rel, tid); + + /* delete the data from the page */ + _bt_pagedel(rel, tid); +} diff --git a/src/backend/access/nbtree/nbtscan.c b/src/backend/access/nbtree/nbtscan.c new file mode 100644 index 0000000000..62a029bc06 --- /dev/null +++ b/src/backend/access/nbtree/nbtscan.c @@ -0,0 +1,164 @@ +/*------------------------------------------------------------------------- + * + * btscan.c-- + * manage scans on btrees. + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/Attic/nbtscan.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $ + * + * + * NOTES + * Because we can be doing an index scan on a relation while we update + * it, we need to avoid missing data that moves around in the index. + * The routines and global variables in this file guarantee that all + * scans in the local address space stay correctly positioned. This + * is all we need to worry about, since write locking guarantees that + * no one else will be on the same page at the same time as we are. + * + * The scheme is to manage a list of active scans in the current backend. + * Whenever we add or remove records from an index, or whenever we + * split a leaf page, we check the list of active scans to see if any + * has been affected. A scan is affected only if it is on the same + * relation, and the same page, as the update. + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "storage/bufmgr.h" +#include "storage/bufpage.h" + +#include "utils/elog.h" +#include "utils/palloc.h" +#include "utils/rel.h" +#include "utils/excid.h" + +#include "access/heapam.h" +#include "access/genam.h" +#include "access/sdir.h" +#include "access/nbtree.h" + +typedef struct BTScanListData { + IndexScanDesc btsl_scan; + struct BTScanListData *btsl_next; +} BTScanListData; + +typedef BTScanListData *BTScanList; + +static BTScanList BTScans = (BTScanList) NULL; + +/* + * _bt_regscan() -- register a new scan. 
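+ *
+ *	Registration just pushes the scan onto the head of the private
+ *	BTScans list.  A sketch of the pattern (the real code follows
+ *	below):
+ *
+ *		new_el->btsl_scan = scan;
+ *		new_el->btsl_next = BTScans;
+ *		BTScans = new_el;
+ *
+ *	so registering is O(1); _bt_dropscan() pays for it with a
+ *	linear walk of the list.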
+ */ +void +_bt_regscan(IndexScanDesc scan) +{ + BTScanList new_el; + + new_el = (BTScanList) palloc(sizeof(BTScanListData)); + new_el->btsl_scan = scan; + new_el->btsl_next = BTScans; + BTScans = new_el; +} + +/* + * _bt_dropscan() -- drop a scan from the scan list + */ +void +_bt_dropscan(IndexScanDesc scan) +{ + BTScanList chk, last; + + last = (BTScanList) NULL; + for (chk = BTScans; + chk != (BTScanList) NULL && chk->btsl_scan != scan; + chk = chk->btsl_next) { + last = chk; + } + + if (chk == (BTScanList) NULL) + elog(WARN, "btree scan list trashed; can't find 0x%lx", scan); + + if (last == (BTScanList) NULL) + BTScans = chk->btsl_next; + else + last->btsl_next = chk->btsl_next; + +#ifdef PERFECT_MEM + pfree (chk); +#endif /* PERFECT_MEM */ +} + +void +_bt_adjscans(Relation rel, ItemPointer tid) +{ + BTScanList l; + Oid relid; + + relid = rel->rd_id; + for (l = BTScans; l != (BTScanList) NULL; l = l->btsl_next) { + if (relid == l->btsl_scan->relation->rd_id) + _bt_scandel(l->btsl_scan, ItemPointerGetBlockNumber(tid), + ItemPointerGetOffsetNumber(tid)); + } +} + +void +_bt_scandel(IndexScanDesc scan, BlockNumber blkno, OffsetNumber offno) +{ + ItemPointer current; + Buffer buf; + BTScanOpaque so; + + if (!_bt_scantouched(scan, blkno, offno)) + return; + + so = (BTScanOpaque) scan->opaque; + buf = so->btso_curbuf; + + current = &(scan->currentItemData); + if (ItemPointerIsValid(current) + && ItemPointerGetBlockNumber(current) == blkno + && ItemPointerGetOffsetNumber(current) >= offno) { + _bt_step(scan, &buf, BackwardScanDirection); + so->btso_curbuf = buf; + } + + current = &(scan->currentMarkData); + if (ItemPointerIsValid(current) + && ItemPointerGetBlockNumber(current) == blkno + && ItemPointerGetOffsetNumber(current) >= offno) { + ItemPointerData tmp; + tmp = *current; + *current = scan->currentItemData; + scan->currentItemData = tmp; + _bt_step(scan, &buf, BackwardScanDirection); + so->btso_mrkbuf = buf; + tmp = *current; + *current = scan->currentItemData; + scan->currentItemData = tmp; + } +} + +bool +_bt_scantouched(IndexScanDesc scan, BlockNumber blkno, OffsetNumber offno) +{ + ItemPointer current; + + current = &(scan->currentItemData); + if (ItemPointerIsValid(current) + && ItemPointerGetBlockNumber(current) == blkno + && ItemPointerGetOffsetNumber(current) >= offno) + return (true); + + current = &(scan->currentMarkData); + if (ItemPointerIsValid(current) + && ItemPointerGetBlockNumber(current) == blkno + && ItemPointerGetOffsetNumber(current) >= offno) + return (true); + + return (false); +} diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c new file mode 100644 index 0000000000..d7a7fc7d62 --- /dev/null +++ b/src/backend/access/nbtree/nbtsearch.c @@ -0,0 +1,1133 @@ +/*------------------------------------------------------------------------- + * + * btsearch.c-- + * search code for postgres btrees. 
+ * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "storage/bufmgr.h" +#include "storage/bufpage.h" + +#include "utils/elog.h" +#include "utils/palloc.h" +#include "utils/rel.h" +#include "utils/excid.h" + +#include "fmgr.h" + +#include "access/heapam.h" +#include "access/genam.h" +#include "access/skey.h" +#include "access/sdir.h" +#include "access/nbtree.h" + +static BTStack _bt_searchr(Relation rel, int keysz, ScanKey scankey, Buffer *bufP, BTStack stack_in); +static OffsetNumber _bt_firsteq(Relation rel, TupleDesc itupdesc, Page page, Size keysz, ScanKey scankey, OffsetNumber offnum); +static int _bt_compare(Relation rel, TupleDesc itupdesc, Page page, int keysz, ScanKey scankey, OffsetNumber offnum); +static bool _bt_twostep(IndexScanDesc scan, Buffer *bufP, ScanDirection dir); +static RetrieveIndexResult _bt_endpoint(IndexScanDesc scan, ScanDirection dir); + +/* + * _bt_search() -- Search for a scan key in the index. + * + * This routine is actually just a helper that sets things up and + * calls a recursive-descent search routine on the tree. + */ +BTStack +_bt_search(Relation rel, int keysz, ScanKey scankey, Buffer *bufP) +{ + *bufP = _bt_getroot(rel, BT_READ); + return (_bt_searchr(rel, keysz, scankey, bufP, (BTStack) NULL)); +} + +/* + * _bt_searchr() -- Search the tree recursively for a particular scankey. + */ +static BTStack +_bt_searchr(Relation rel, + int keysz, + ScanKey scankey, + Buffer *bufP, + BTStack stack_in) +{ + BTStack stack; + OffsetNumber offnum; + Page page; + BTPageOpaque opaque; + BlockNumber par_blkno; + BlockNumber blkno; + ItemId itemid; + BTItem btitem; + BTItem item_save; + int item_nbytes; + IndexTuple itup; + + /* if this is a leaf page, we're done */ + page = BufferGetPage(*bufP); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + if (opaque->btpo_flags & BTP_LEAF) + return (stack_in); + + /* + * Find the appropriate item on the internal page, and get the child + * page that it points to. + */ + + par_blkno = BufferGetBlockNumber(*bufP); + offnum = _bt_binsrch(rel, *bufP, keysz, scankey, BT_DESCENT); + itemid = PageGetItemId(page, offnum); + btitem = (BTItem) PageGetItem(page, itemid); + itup = &(btitem->bti_itup); + blkno = ItemPointerGetBlockNumber(&(itup->t_tid)); + + /* + * We need to save the bit image of the index entry we chose in the + * parent page on a stack. In case we split the tree, we'll use this + * bit image to figure out what our real parent page is, in case the + * parent splits while we're working lower in the tree. See the paper + * by Lehman and Yao for how this is detected and handled. (We use + * unique OIDs to disambiguate duplicate keys in the index -- Lehman + * and Yao disallow duplicate keys). 
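+ *
+ * To make that concrete, the fields saved just below are (a sketch of
+ * how an insertion would use them; nbtinsert.c owns the details):
+ *
+ *	stack->bts_blkno	-- where we thought the parent was
+ *	stack->bts_offset	-- the downlink we followed
+ *	stack->bts_btitem	-- bit image of that downlink
+ *
+ * If the item at (bts_blkno, bts_offset) no longer matches the saved
+ * image, the parent must have split, and we move right comparing
+ * images until we find the entry again.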
+ */ + + item_nbytes = ItemIdGetLength(itemid); + item_save = (BTItem) palloc(item_nbytes); + memmove((char *) item_save, (char *) btitem, item_nbytes); + stack = (BTStack) palloc(sizeof(BTStackData)); + stack->bts_blkno = par_blkno; + stack->bts_offset = offnum; + stack->bts_btitem = item_save; + stack->bts_parent = stack_in; + + /* drop the read lock on the parent page and acquire one on the child */ + _bt_relbuf(rel, *bufP, BT_READ); + *bufP = _bt_getbuf(rel, blkno, BT_READ); + + /* + * Race -- the page we just grabbed may have split since we read its + * pointer in the parent. If it has, we may need to move right to its + * new sibling. Do that. + */ + + *bufP = _bt_moveright(rel, *bufP, keysz, scankey, BT_READ); + + /* okay, all set to move down a level */ + return (_bt_searchr(rel, keysz, scankey, bufP, stack)); +} + +/* + * _bt_moveright() -- move right in the btree if necessary. + * + * When we drop and reacquire a pointer to a page, it is possible that + * the page has changed in the meanwhile. If this happens, we're + * guaranteed that the page has "split right" -- that is, that any + * data that appeared on the page originally is either on the page + * or strictly to the right of it. + * + * This routine decides whether or not we need to move right in the + * tree by examining the high key entry on the page. If that entry + * is strictly less than one we expect to be on the page, then our + * picture of the page is incorrect and we need to move right. + * + * On entry, we have the buffer pinned and a lock of the proper type. + * If we move right, we release the buffer and lock and acquire the + * same on the right sibling. + */ +Buffer +_bt_moveright(Relation rel, + Buffer buf, + int keysz, + ScanKey scankey, + int access) +{ + Page page; + BTPageOpaque opaque; + ItemId hikey; + ItemId itemid; + BlockNumber rblkno; + + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* if we're on a rightmost page, we don't need to move right */ + if (P_RIGHTMOST(opaque)) + return (buf); + + /* by convention, item 0 on non-rightmost pages is the high key */ + hikey = PageGetItemId(page, P_HIKEY); + + /* + * If the scan key that brought us to this page is >= the high key + * stored on the page, then the page has split and we need to move + * right. + */ + + if (_bt_skeycmp(rel, keysz, scankey, page, hikey, + BTGreaterEqualStrategyNumber)) { + + /* move right as long as we need to */ + do { + /* + * If this page consists of all duplicate keys (hikey and first + * key on the page have the same value), then we don't need to + * step right. + */ + if (PageGetMaxOffsetNumber(page) > P_HIKEY) { + itemid = PageGetItemId(page, P_FIRSTKEY); + if (_bt_skeycmp(rel, keysz, scankey, page, itemid, + BTEqualStrategyNumber)) { + /* break is for the "move right" while loop */ + break; + } + } + + /* step right one page */ + rblkno = opaque->btpo_next; + _bt_relbuf(rel, buf, access); + buf = _bt_getbuf(rel, rblkno, access); + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + hikey = PageGetItemId(page, P_HIKEY); + + } while (! P_RIGHTMOST(opaque) + && _bt_skeycmp(rel, keysz, scankey, page, hikey, + BTGreaterEqualStrategyNumber)); + } + return (buf); +} + +/* + * _bt_skeycmp() -- compare a scan key to a particular item on a page using + * a requested strategy (<, <=, =, >=, >). + * + * We ignore the unique OIDs stored in the btree item here. Those + * numbers are intended for use internally only, in repositioning a + * scan after a page split. 
They do not impose any meaningful ordering. + * + * The comparison is A <op> B, where A is the scan key and B is the + * tuple pointed at by itemid on page. + */ +bool +_bt_skeycmp(Relation rel, + Size keysz, + ScanKey scankey, + Page page, + ItemId itemid, + StrategyNumber strat) +{ + BTItem item; + IndexTuple indexTuple; + TupleDesc tupDes; + ScanKey entry; + int i; + Datum attrDatum; + Datum keyDatum; + bool compare; + bool isNull; + + item = (BTItem) PageGetItem(page, itemid); + indexTuple = &(item->bti_itup); + + tupDes = RelationGetTupleDescriptor(rel); + + /* see if the comparison is true for all of the key attributes */ + for (i=1; i <= keysz; i++) { + + entry = &scankey[i-1]; + attrDatum = index_getattr(indexTuple, + entry->sk_attno, + tupDes, + &isNull); + keyDatum = entry->sk_argument; + + compare = _bt_invokestrat(rel, i, strat, keyDatum, attrDatum); + if (!compare) + return (false); + } + + return (true); +} + +/* + * _bt_binsrch() -- Do a binary search for a key on a particular page. + * + * The scankey we get has the compare function stored in the procedure + * entry of each data struct. We invoke this regproc to do the + * comparison for every key in the scankey. _bt_binsrch() returns + * the OffsetNumber of the first matching key on the page, or the + * OffsetNumber at which the matching key would appear if it were + * on this page. + * + * By the time this procedure is called, we're sure we're looking + * at the right page -- don't need to walk right. _bt_binsrch() has + * no lock or refcount side effects on the buffer. + */ +OffsetNumber +_bt_binsrch(Relation rel, + Buffer buf, + int keysz, + ScanKey scankey, + int srchtype) +{ + TupleDesc itupdesc; + Page page; + BTPageOpaque opaque; + OffsetNumber low, mid, high; + bool match; + int result; + + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* by convention, item 0 on any non-rightmost page is the high key */ + low = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY; + + high = PageGetMaxOffsetNumber(page); + + /* + * Since for non-rightmost pages, the zeroeth item on the page is the + * high key, there are two notions of emptiness. One is if nothing + * appears on the page. The other is if nothing but the high key does. + * The reason we test high <= low, rather than high == low, is that + * after vacuuming there may be nothing *but* the high key on a page. + * In that case, given the scheme above, low = 1 and high = 0. + */ + + if (PageIsEmpty(page) || (! P_RIGHTMOST(opaque) && high <= low)) + return (low); + + itupdesc = RelationGetTupleDescriptor(rel); + match = false; + + while ((high - low) > 1) { + mid = low + ((high - low) / 2); + result = _bt_compare(rel, itupdesc, page, keysz, scankey, mid); + + if (result > 0) + low = mid; + else if (result < 0) + high = mid - 1; + else { + match = true; + break; + } + } + + /* if we found a match, we want to find the first one on the page */ + if (match) { + return (_bt_firsteq(rel, itupdesc, page, keysz, scankey, mid)); + } else { + + /* + * We terminated because the endpoints got too close together. There + * are two cases to take care of. + * + * For non-insertion searches on internal pages, we want to point at + * the last key <, or first key =, the scankey on the page. This + * guarantees that we'll descend the tree correctly. + * + * For all other cases, we want to point at the first key >= + * the scankey on the page. This guarantees that scans and + * insertions will happen correctly. 
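+ *
+ * A worked example (hypothetical page contents): with keys
+ * 10 20 30 40 and a scankey of 25, a BT_DESCENT search on an
+ * internal page returns the offset of 20 (the last key <), while
+ * any other search returns the offset of 30 (the first key >=).
+ * If the page held 10 20 20 30 and the scankey were 20, the match
+ * case above applies and _bt_firsteq() hands back the leftmost 20.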
+ */ + + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + if (!(opaque->btpo_flags & BTP_LEAF) && srchtype == BT_DESCENT) { + + /* + * We want the last key <, or first key ==, the scan key. + */ + + result = _bt_compare(rel, itupdesc, page, keysz, scankey, high); + + if (result == 0) { + return (_bt_firsteq(rel, itupdesc, page, keysz, scankey, high)); + } else if (result > 0) { + return (high); + } else { + return (low); + } + } else { + + /* we want the first key >= the scan key */ + result = _bt_compare(rel, itupdesc, page, keysz, scankey, low); + if (result <= 0) { + return (low); + } else { + if (low == high) + return (OffsetNumberNext(low)); + + result = _bt_compare(rel, itupdesc, page, keysz, scankey, high); + if (result <= 0) + return (high); + else + return (OffsetNumberNext(high)); + } + } + } +} + +static OffsetNumber +_bt_firsteq(Relation rel, + TupleDesc itupdesc, + Page page, + Size keysz, + ScanKey scankey, + OffsetNumber offnum) +{ + BTPageOpaque opaque; + OffsetNumber limit; + + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* skip the high key, if any */ + limit = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY; + + /* walk backwards looking for the first key in the chain of duplicates */ + while (offnum > limit + && _bt_compare(rel, itupdesc, page, + keysz, scankey, OffsetNumberPrev(offnum)) == 0) { + offnum = OffsetNumberPrev(offnum); + } + + return (offnum); +} + +/* + * _bt_compare() -- Compare scankey to a particular tuple on the page. + * + * This routine returns: + * -1 if scankey < tuple at offnum; + * 0 if scankey == tuple at offnum; + * +1 if scankey > tuple at offnum. + * + * In order to avoid having to propagate changes up the tree any time + * a new minimal key is inserted, the leftmost entry on the leftmost + * page is less than all possible keys, by definition. + */ +static int +_bt_compare(Relation rel, + TupleDesc itupdesc, + Page page, + int keysz, + ScanKey scankey, + OffsetNumber offnum) +{ + Datum datum; + BTItem btitem; + ItemId itemid; + IndexTuple itup; + BTPageOpaque opaque; + ScanKey entry; + AttrNumber attno; + int result; + int i; + bool null; + + /* + * If this is a leftmost internal page, and if our comparison is + * with the first key on the page, then the item at that position is + * by definition less than the scan key. + */ + + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + if (!(opaque->btpo_flags & BTP_LEAF) + && P_LEFTMOST(opaque) + && offnum == P_HIKEY) { + itemid = PageGetItemId(page, offnum); + + /* + * we just have to believe that this will only be called with + * offnum == P_HIKEY when P_HIKEY is the OffsetNumber of the + * first actual data key (i.e., this is also a rightmost + * page). there doesn't seem to be any code that implies + * that the leftmost page is normally missing a high key as + * well as the rightmost page. but that implies that this + * code path only applies to the root -- which seems + * unlikely.. + */ + if (! P_RIGHTMOST(opaque)) { + elog(WARN, "_bt_compare: invalid comparison to high key"); + } + + /* + * If the item on the page is equal to the scankey, that's + * okay to admit. We just can't claim that the first key on + * the page is greater than anything. + */ + + if (_bt_skeycmp(rel, keysz, scankey, page, itemid, + BTEqualStrategyNumber)) { + return (0); + } + return (1); + } + + btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum)); + itup = &(btitem->bti_itup); + + /* + * The scan key is set up with the attribute number associated with each + * term in the key. 
It is important that, if the index is multi-key, + * the scan contain the first k key attributes, and that they be in + * order. If you think about how multi-key ordering works, you'll + * understand why this is. + * + * We don't test for violation of this condition here. + */ + + for (i = 1; i <= keysz; i++) { + long tmpres; + + entry = &scankey[i - 1]; + attno = entry->sk_attno; + datum = index_getattr(itup, attno, itupdesc, &null); + tmpres = (long) FMGR_PTR2(entry->sk_func, entry->sk_procedure, + entry->sk_argument, datum); + result = tmpres; + + /* if the keys are unequal, return the difference */ + if (result != 0) + return (result); + } + + /* by here, the keys are equal */ + return (0); +} + +/* + * _bt_next() -- Get the next item in a scan. + * + * On entry, we have a valid currentItemData in the scan, and a + * read lock on the page that contains that item. We do not have + * the page pinned. We return the next item in the scan. On + * exit, we have the page containing the next item locked but not + * pinned. + */ +RetrieveIndexResult +_bt_next(IndexScanDesc scan, ScanDirection dir) +{ + Relation rel; + Buffer buf; + Page page; + OffsetNumber offnum; + RetrieveIndexResult res; + BlockNumber blkno; + ItemPointer current; + ItemPointer iptr; + BTItem btitem; + IndexTuple itup; + BTScanOpaque so; + + rel = scan->relation; + so = (BTScanOpaque) scan->opaque; + current = &(scan->currentItemData); + + /* + * XXX 10 may 91: somewhere there's a bug in our management of the + * cached buffer for this scan. wei discovered it. the following + * is a workaround so he can work until i figure out what's going on. + */ + + if (!BufferIsValid(so->btso_curbuf)) + so->btso_curbuf = _bt_getbuf(rel, ItemPointerGetBlockNumber(current), + BT_READ); + + /* we still have the buffer pinned and locked */ + buf = so->btso_curbuf; + blkno = BufferGetBlockNumber(buf); + + /* step one tuple in the appropriate direction */ + if (!_bt_step(scan, &buf, dir)) + return ((RetrieveIndexResult) NULL); + + /* by here, current is the tuple we want to return */ + offnum = ItemPointerGetOffsetNumber(current); + page = BufferGetPage(buf); + btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum)); + itup = &btitem->bti_itup; + + if (_bt_checkqual(scan, itup)) { + iptr = (ItemPointer) palloc(sizeof(ItemPointerData)); + memmove((char *) iptr, (char *) &(itup->t_tid), + sizeof(ItemPointerData)); + res = FormRetrieveIndexResult(current, iptr); + + /* remember which buffer we have pinned and locked */ + so->btso_curbuf = buf; + } else { + ItemPointerSetInvalid(current); + so->btso_curbuf = InvalidBuffer; + _bt_relbuf(rel, buf, BT_READ); + res = (RetrieveIndexResult) NULL; + } + + return (res); +} + +/* + * _bt_first() -- Find the first item in a scan. + * + * We need to be clever about the type of scan, the operation it's + * performing, and the tree ordering. We return the RetrieveIndexResult + * of the first item in the tree that satisfies the qualification + * associated with the scan descriptor. On exit, the page containing + * the current index tuple is read locked and pinned, and the scan's + * opaque data entry is updated to include the buffer. 
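+ *
+ *	For orientation, the caller's dispatch (a sketch; this is what
+ *	the btree gettuple routine in nbtree.c does):
+ *
+ *		if (ItemPointerIsValid(&(scan->currentItemData)))
+ *			res = _bt_next(scan, dir);
+ *		else
+ *			res = _bt_first(scan, dir);
+ *
+ *	so _bt_first() runs once per scan (or rescan), and every later
+ *	probe goes through _bt_next().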
+ */ +RetrieveIndexResult +_bt_first(IndexScanDesc scan, ScanDirection dir) +{ + Relation rel; + TupleDesc itupdesc; + Buffer buf; + Page page; + BTStack stack; + OffsetNumber offnum, maxoff; + BTItem btitem; + IndexTuple itup; + ItemPointer current; + ItemPointer iptr; + BlockNumber blkno; + StrategyNumber strat; + RetrieveIndexResult res; + RegProcedure proc; + int result; + BTScanOpaque so; + ScanKeyData skdata; + + /* if we just need to walk down one edge of the tree, do that */ + if (scan->scanFromEnd) + return (_bt_endpoint(scan, dir)); + + rel = scan->relation; + itupdesc = RelationGetTupleDescriptor(scan->relation); + current = &(scan->currentItemData); + so = (BTScanOpaque) scan->opaque; + + /* + * Okay, we want something more complicated. What we'll do is use + * the first item in the scan key passed in (which has been correctly + * ordered to take advantage of index ordering) to position ourselves + * at the right place in the scan. + */ + + /* + * XXX -- The attribute number stored in the scan key is the attno + * in the heap relation. We need to transmogrify this into + * the index relation attno here. For the moment, we have + * hardwired attno == 1. + */ + proc = index_getprocid(rel, 1, BTORDER_PROC); + ScanKeyEntryInitialize(&skdata, 0x0, 1, proc, + scan->keyData[0].sk_argument); + + stack = _bt_search(rel, 1, &skdata, &buf); + _bt_freestack(stack); + + /* find the nearest match to the manufactured scan key on the page */ + offnum = _bt_binsrch(rel, buf, 1, &skdata, BT_DESCENT); + page = BufferGetPage(buf); + + /* + * This will happen if the tree we're searching is entirely empty, + * or if we're doing a search for a key that would appear on an + * entirely empty internal page. In either case, there are no + * matching tuples in the index. + */ + + if (PageIsEmpty(page)) { + ItemPointerSetInvalid(current); + so->btso_curbuf = InvalidBuffer; + _bt_relbuf(rel, buf, BT_READ); + return ((RetrieveIndexResult) NULL); + } + + maxoff = PageGetMaxOffsetNumber(page); + + if (offnum > maxoff) + offnum = maxoff; + + blkno = BufferGetBlockNumber(buf); + ItemPointerSet(current, blkno, offnum); + + /* + * Now find the right place to start the scan. Result is the + * value we're looking for minus the value we're looking at + * in the index. 
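+ *
+ * Example (hypothetical qual): for "a > 5", if the binary search
+ * left us on a tuple whose key is 5, result is 0, so the
+ * BTGreaterStrategyNumber arm below steps forward until the
+ * comparison first goes negative -- that is, until the tuple key
+ * exceeds 5 -- and the scan starts there.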
+ */ + + result = _bt_compare(rel, itupdesc, page, 1, &skdata, offnum); + strat = _bt_getstrat(rel, 1, scan->keyData[0].sk_procedure); + + switch (strat) { + case BTLessStrategyNumber: + if (result <= 0) { + do { + if (!_bt_twostep(scan, &buf, BackwardScanDirection)) + break; + + offnum = ItemPointerGetOffsetNumber(current); + page = BufferGetPage(buf); + result = _bt_compare(rel, itupdesc, page, 1, &skdata, offnum); + } while (result <= 0); + + /* if this is true, the key we just looked at is gone */ + if (result > 0) + (void) _bt_twostep(scan, &buf, ForwardScanDirection); + } + break; + + case BTLessEqualStrategyNumber: + if (result >= 0) { + do { + if (!_bt_twostep(scan, &buf, ForwardScanDirection)) + break; + + offnum = ItemPointerGetOffsetNumber(current); + page = BufferGetPage(buf); + result = _bt_compare(rel, itupdesc, page, 1, &skdata, offnum); + } while (result >= 0); + + if (result < 0) + (void) _bt_twostep(scan, &buf, BackwardScanDirection); + } + break; + + case BTEqualStrategyNumber: + if (result != 0) { + _bt_relbuf(scan->relation, buf, BT_READ); + so->btso_curbuf = InvalidBuffer; + ItemPointerSetInvalid(&(scan->currentItemData)); + return ((RetrieveIndexResult) NULL); + } + break; + + case BTGreaterEqualStrategyNumber: + if (result < 0) { + do { + if (!_bt_twostep(scan, &buf, BackwardScanDirection)) + break; + + page = BufferGetPage(buf); + offnum = ItemPointerGetOffsetNumber(current); + result = _bt_compare(rel, itupdesc, page, 1, &skdata, offnum); + } while (result < 0); + + if (result > 0) + (void) _bt_twostep(scan, &buf, ForwardScanDirection); + } + break; + + case BTGreaterStrategyNumber: + if (result >= 0) { + do { + if (!_bt_twostep(scan, &buf, ForwardScanDirection)) + break; + + offnum = ItemPointerGetOffsetNumber(current); + page = BufferGetPage(buf); + result = _bt_compare(rel, itupdesc, page, 1, &skdata, offnum); + } while (result >= 0); + } + break; + } + + /* okay, current item pointer for the scan is right */ + offnum = ItemPointerGetOffsetNumber(current); + page = BufferGetPage(buf); + btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum)); + itup = &btitem->bti_itup; + + if (_bt_checkqual(scan, itup)) { + iptr = (ItemPointer) palloc(sizeof(ItemPointerData)); + memmove((char *) iptr, (char *) &(itup->t_tid), + sizeof(ItemPointerData)); + res = FormRetrieveIndexResult(current, iptr); + pfree(iptr); + + /* remember which buffer we have pinned */ + so->btso_curbuf = buf; + } else { + ItemPointerSetInvalid(current); + so->btso_curbuf = InvalidBuffer; + _bt_relbuf(rel, buf, BT_READ); + res = (RetrieveIndexResult) NULL; + } + + return (res); +} + +/* + * _bt_step() -- Step one item in the requested direction in a scan on + * the tree. + * + * If no adjacent record exists in the requested direction, return + * false. Else, return true and set the currentItemData for the + * scan to the right thing. 
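+ *
+ *	A usage sketch (not code in this file): draining the rest of a
+ *	scan forward is just
+ *
+ *		while (_bt_step(scan, &buf, ForwardScanDirection))
+ *			;
+ *
+ *	each true return leaves currentItemData pointing at the next
+ *	tuple; the false return has already released the buffer and
+ *	invalidated currentItemData.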
+ */
+bool
+_bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
+{
+    Page page;
+    BTPageOpaque opaque;
+    OffsetNumber offnum, maxoff;
+    OffsetNumber start;
+    BlockNumber blkno;
+    BlockNumber obknum;
+    BTScanOpaque so;
+    ItemPointer current;
+    Relation rel;
+
+    rel = scan->relation;
+    current = &(scan->currentItemData);
+    offnum = ItemPointerGetOffsetNumber(current);
+    page = BufferGetPage(*bufP);
+    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+    so = (BTScanOpaque) scan->opaque;
+    maxoff = PageGetMaxOffsetNumber(page);
+
+    /* get the next tuple */
+    if (ScanDirectionIsForward(dir)) {
+	if (!PageIsEmpty(page) && offnum < maxoff) {
+	    offnum = OffsetNumberNext(offnum);
+	} else {
+
+	    /* if we're at end of scan, release the buffer and return */
+	    blkno = opaque->btpo_next;
+	    if (P_RIGHTMOST(opaque)) {
+		_bt_relbuf(rel, *bufP, BT_READ);
+		ItemPointerSetInvalid(current);
+		*bufP = so->btso_curbuf = InvalidBuffer;
+		return (false);
+	    } else {
+
+		/* walk right to the next page with data */
+		_bt_relbuf(rel, *bufP, BT_READ);
+		for (;;) {
+		    *bufP = _bt_getbuf(rel, blkno, BT_READ);
+		    page = BufferGetPage(*bufP);
+		    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+		    maxoff = PageGetMaxOffsetNumber(page);
+		    start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
+
+		    if (!PageIsEmpty(page) && start <= maxoff) {
+			break;
+		    } else {
+			blkno = opaque->btpo_next;
+			_bt_relbuf(rel, *bufP, BT_READ);
+			if (blkno == P_NONE) {
+			    *bufP = so->btso_curbuf = InvalidBuffer;
+			    ItemPointerSetInvalid(current);
+			    return (false);
+			}
+		    }
+		}
+		offnum = start;
+	    }
+	}
+    } else if (ScanDirectionIsBackward(dir)) {
+
+	/* remember that high key is item zero on non-rightmost pages */
+	start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
+
+	if (offnum > start) {
+	    offnum = OffsetNumberPrev(offnum);
+	} else {
+
+	    /* if we're at end of scan, release the buffer and return */
+	    blkno = opaque->btpo_prev;
+	    if (P_LEFTMOST(opaque)) {
+		_bt_relbuf(rel, *bufP, BT_READ);
+		*bufP = so->btso_curbuf = InvalidBuffer;
+		ItemPointerSetInvalid(current);
+		return (false);
+	    } else {
+
+		obknum = BufferGetBlockNumber(*bufP);
+
+		/* walk left to the previous page with data */
+		_bt_relbuf(rel, *bufP, BT_READ);
+		for (;;) {
+		    *bufP = _bt_getbuf(rel, blkno, BT_READ);
+		    page = BufferGetPage(*bufP);
+		    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+		    maxoff = PageGetMaxOffsetNumber(page);
+
+		    /*
+		     * If the adjacent page just split, then we may have the
+		     * wrong block. Handle this case. Because pages only
+		     * split right, we don't have to worry about this failing
+		     * to terminate.
+		     */
+
+		    while (opaque->btpo_next != obknum) {
+			blkno = opaque->btpo_next;
+			_bt_relbuf(rel, *bufP, BT_READ);
+			*bufP = _bt_getbuf(rel, blkno, BT_READ);
+			page = BufferGetPage(*bufP);
+			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+			maxoff = PageGetMaxOffsetNumber(page);
+		    }
+
+		    /* don't consider the high key */
+		    start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
+
+		    /* anything to look at here? */
+		    if (!PageIsEmpty(page) && maxoff >= start) {
+			break;
+		    } else {
+			blkno = opaque->btpo_prev;
+			obknum = BufferGetBlockNumber(*bufP);
+			_bt_relbuf(rel, *bufP, BT_READ);
+			if (blkno == P_NONE) {
+			    *bufP = so->btso_curbuf = InvalidBuffer;
+			    ItemPointerSetInvalid(current);
+			    return (false);
+			}
+		    }
+		}
+		offnum = maxoff;	/* XXX PageIsEmpty?
*/
+	    }
+	}
+    }
+    blkno = BufferGetBlockNumber(*bufP);
+    so->btso_curbuf = *bufP;
+    ItemPointerSet(current, blkno, offnum);
+
+    return (true);
+}
+
+/*
+ * _bt_twostep() -- Move to an adjacent record in a scan on the tree,
+ *	if an adjacent record exists.
+ *
+ *	This is like _bt_step, except that if no adjacent record exists
+ *	it restores us to where we were before trying the step. This is
+ *	only hairy when you cross page boundaries, since the page you cross
+ *	from could have records inserted or deleted, or could even split.
+ *	This is unlikely, but we try to handle it correctly here anyway.
+ *
+ *	This routine contains the only case in which our changes to
+ *	Lehman and Yao's algorithm can be defeated by concurrent
+ *	activity: if we cannot re-find our place, we give up (see the
+ *	elog() at the bottom).
+ *
+ *	Like step, this routine leaves the scan's currentItemData in the
+ *	proper state and acquires a lock and pin on *bufP. If the twostep
+ *	succeeded, we return true; otherwise, we return false.
+ */
+static bool
+_bt_twostep(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
+{
+    Page page;
+    BTPageOpaque opaque;
+    OffsetNumber offnum, maxoff;
+    OffsetNumber start;
+    ItemPointer current;
+    ItemId itemid;
+    int itemsz;
+    BTItem btitem;
+    BTItem svitem;
+    BlockNumber blkno;
+
+    blkno = BufferGetBlockNumber(*bufP);
+    page = BufferGetPage(*bufP);
+    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+    maxoff = PageGetMaxOffsetNumber(page);
+    current = &(scan->currentItemData);
+    offnum = ItemPointerGetOffsetNumber(current);
+
+    start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
+
+    /* if we're safe, just do it */
+    if (ScanDirectionIsForward(dir) && offnum < maxoff) { /* XXX PageIsEmpty? */
+	ItemPointerSet(current, blkno, OffsetNumberNext(offnum));
+	return (true);
+    } else if (ScanDirectionIsBackward(dir) && offnum > start) {
+	ItemPointerSet(current, blkno, OffsetNumberPrev(offnum));
+	return (true);
+    }
+
+    /* if we've hit end of scan we don't have to do any work */
+    if (ScanDirectionIsForward(dir) && P_RIGHTMOST(opaque)) {
+	return (false);
+    } else if (ScanDirectionIsBackward(dir) && P_LEFTMOST(opaque)) {
+	return (false);
+    }
+
+    /*
+     * Okay, it's off the page; let _bt_step() do the hard work, and we'll
+     * try to remember where we were. This is not guaranteed to work; this
+     * is the only place in the code where concurrency can screw us up,
+     * and it's because we want to be able to move in two directions in
+     * the scan.
+     */
+
+    itemid = PageGetItemId(page, offnum);
+    itemsz = ItemIdGetLength(itemid);
+    btitem = (BTItem) PageGetItem(page, itemid);
+    svitem = (BTItem) palloc(itemsz);
+    memmove((char *) svitem, (char *) btitem, itemsz);
+
+    if (_bt_step(scan, bufP, dir)) {
+	pfree(svitem);
+	return (true);
+    }
+
+    /* try to find our place again */
+    *bufP = _bt_getbuf(scan->relation, blkno, BT_READ);
+    page = BufferGetPage(*bufP);
+    maxoff = PageGetMaxOffsetNumber(page);
+
+    while (offnum <= maxoff) {
+	itemid = PageGetItemId(page, offnum);
+	btitem = (BTItem) PageGetItem(page, itemid);
+	if (btitem->bti_oid == svitem->bti_oid) {
+	    pfree(svitem);
+	    ItemPointerSet(current, blkno, offnum);
+	    return (false);
+	}
+	offnum = OffsetNumberNext(offnum);	/* advance, or we never terminate */
+    }
+
+    /*
+     * XXX crash and burn -- can't find our place. We can be a little
+     * smarter -- walk to the next page to the right, for example, since
+     * that's the only direction that splits happen in. Deletions screw
+     * us up less often since they're only done by the vacuum daemon.
+     */
+
+    elog(WARN, "btree synchronization error: concurrent update botched scan");
+
+    return (false);
+}
+
+/*
+ * _bt_endpoint() -- Find the first or last key in the index.
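+ *
+ *	In outline (a sketch of the descent coded below):
+ *
+ *		buf = _bt_getroot(rel, BT_READ);
+ *		while (page is not a leaf)
+ *			follow the first downlink (forward scans) or
+ *			the last downlink (backward scans);
+ *
+ *	with an extra move-right pass on backward scans, because a
+ *	concurrent split can leave us short of the true right edge.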
+ */ +static RetrieveIndexResult +_bt_endpoint(IndexScanDesc scan, ScanDirection dir) +{ + Relation rel; + Buffer buf; + Page page; + BTPageOpaque opaque; + ItemPointer current; + ItemPointer iptr; + OffsetNumber offnum, maxoff; + OffsetNumber start; + BlockNumber blkno; + BTItem btitem; + IndexTuple itup; + BTScanOpaque so; + RetrieveIndexResult res; + + rel = scan->relation; + current = &(scan->currentItemData); + + buf = _bt_getroot(rel, BT_READ); + blkno = BufferGetBlockNumber(buf); + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + for (;;) { + if (opaque->btpo_flags & BTP_LEAF) + break; + + if (ScanDirectionIsForward(dir)) { + offnum = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY; + } else { + offnum = PageGetMaxOffsetNumber(page); + } + + btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum)); + itup = &(btitem->bti_itup); + + blkno = ItemPointerGetBlockNumber(&(itup->t_tid)); + + _bt_relbuf(rel, buf, BT_READ); + buf = _bt_getbuf(rel, blkno, BT_READ); + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* + * Race condition: If the child page we just stepped onto is + * in the process of being split, we need to make sure we're + * all the way at the right edge of the tree. See the paper + * by Lehman and Yao. + */ + + if (ScanDirectionIsBackward(dir) && ! P_RIGHTMOST(opaque)) { + do { + blkno = opaque->btpo_next; + _bt_relbuf(rel, buf, BT_READ); + buf = _bt_getbuf(rel, blkno, BT_READ); + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + } while (! P_RIGHTMOST(opaque)); + } + } + + /* okay, we've got the {left,right}-most page in the tree */ + maxoff = PageGetMaxOffsetNumber(page); + + if (ScanDirectionIsForward(dir)) { + if (PageIsEmpty(page)) { + maxoff = FirstOffsetNumber; + } else { + maxoff = PageGetMaxOffsetNumber(page); + } + start = P_RIGHTMOST(opaque) ? 
P_HIKEY : P_FIRSTKEY; + + if (PageIsEmpty(page) || start > maxoff) { + ItemPointerSet(current, blkno, maxoff); + if (!_bt_step(scan, &buf, BackwardScanDirection)) + return ((RetrieveIndexResult) NULL); + + start = ItemPointerGetOffsetNumber(current); + page = BufferGetPage(buf); + } else { + ItemPointerSet(current, blkno, start); + } + } else if (ScanDirectionIsBackward(dir)) { + if (PageIsEmpty(page)) { + ItemPointerSet(current, blkno, FirstOffsetNumber); + if (!_bt_step(scan, &buf, ForwardScanDirection)) + return ((RetrieveIndexResult) NULL); + + start = ItemPointerGetOffsetNumber(current); + page = BufferGetPage(buf); + } else { + start = PageGetMaxOffsetNumber(page); + ItemPointerSet(current, blkno, start); + } + } else { + elog(WARN, "Illegal scan direction %d", dir); + } + + btitem = (BTItem) PageGetItem(page, PageGetItemId(page, start)); + itup = &(btitem->bti_itup); + + /* see if we picked a winner */ + if (_bt_checkqual(scan, itup)) { + iptr = (ItemPointer) palloc(sizeof(ItemPointerData)); + memmove((char *) iptr, (char *) &(itup->t_tid), + sizeof(ItemPointerData)); + res = FormRetrieveIndexResult(current, iptr); + + /* remember which buffer we have pinned */ + so = (BTScanOpaque) scan->opaque; + so->btso_curbuf = buf; + } else { + _bt_relbuf(rel, buf, BT_READ); + res = (RetrieveIndexResult) NULL; + } + + return (res); +} diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c new file mode 100644 index 0000000000..3d2676324a --- /dev/null +++ b/src/backend/access/nbtree/nbtsort.c @@ -0,0 +1,1196 @@ +/*------------------------------------------------------------------------- + * btsort.c-- + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Id: nbtsort.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $ + * + * NOTES + * + * what we do is: + * - generate a set of initial one-block runs, distributed round-robin + * between the output tapes. + * - for each pass, + * - swap input and output tape sets, rewinding both and truncating + * the output tapes. + * - merge the current run in each input tape to the current output + * tape. + * - when each input run has been exhausted, switch to another output + * tape and start processing another run. + * - when we have fewer runs than tapes, we know we are ready to start + * merging into the btree leaf pages. + * - every time we complete a level of the btree, we can construct the + * next level up. when we have only one page on a level, it can be + * attached to the btree metapage and we are done. + * + * conventions: + * - external interface routines take in and return "void *" for their + * opaque handles. this is for modularity reasons (i prefer not to + * export these structures without good reason). + * + * this code is moderately slow (~10% slower) compared to the regular + * btree (insertion) build code on sorted or well-clustered data. on + * random data, however, the insertion build code is unusable -- the + * difference on a 60MB heap is a factor of 15 because the random + * probes into the btree thrash the buffer pool. + * + * this code currently packs the pages to 100% of capacity. this is + * not wise, since *any* insertion will cause splitting. filling to + * something like the standard 70% steady-state load factor for btrees + * would probably be better. + * + * somebody desperately needs to figure out how to do a better job of + * balancing the merge passes -- the fan-in on the final merges can be + * pretty poor, which is bad for performance. 
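+ *
+ * (for scale, a hedged estimate: a balanced merge with T input tapes
+ * multiplies run length by T on every pass, so r initial one-block
+ * runs need roughly log-base-T of r passes before the final merge
+ * into leaf pages; at the MAXTAPES value of 7 used below, a million
+ * initial runs collapses in about seven passes.)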
+ *------------------------------------------------------------------------- + */ + +#include <stdio.h> + +#include "c.h" + +#include "access/nbtree.h" + +#include "storage/bufmgr.h" +#include "storage/fd.h" +#include "utils/rel.h" +#include "utils/palloc.h" +#include "utils/elog.h" + +/*#define FASTBUILD_DEBUG*/ /* turn on debugging output */ + +#define FASTBUILD + +#ifdef FASTBUILD + +#define MAXTAPES (7) +#define TAPEBLCKSZ (BLCKSZ << 2) +#define TAPETEMP "pg_btsortXXXXXX" + + +/*------------------------------------------------------------------------- + * sorting comparison routine - returns {-1,0,1} depending on whether + * the key in the left BTItem is {<,=,>} the key in the right BTItem. + * + * we want to use _bt_isortcmp as a comparison function for qsort(3), + * but it needs extra arguments, so we "pass them in" as global + * variables. ick. fortunately, they are the same throughout the + * build, so we need do this only once. this is why you must call + * _bt_isortcmpinit before the call to qsort(3). + * + * a NULL BTItem is always assumed to be greater than any actual + * value; our heap routines (see below) assume that the smallest + * element in the heap is returned. that way, NULL values from the + * exhausted tapes can sift down to the bottom of the heap. in point + * of fact we just don't replace the elements of exhausted tapes, but + * what the heck. + * *------------------------------------------------------------------------- + */ +static Relation _bt_sortrel; + +static void +_bt_isortcmpinit(Relation index) +{ + _bt_sortrel = index; +} + +static int +_bt_isortcmp(BTItem *bti1p, BTItem *bti2p) +{ + BTItem bti1 = *bti1p; + BTItem bti2 = *bti2p; + + if (bti1 == (BTItem) NULL) { + if (bti2 == (BTItem) NULL) { + return(0); /* 1 = 2 */ + } + return(1); /* 1 > 2 */ + } else if (bti2 == (BTItem) NULL) { + return(-1); /* 1 < 2 */ + } else if (_bt_itemcmp(_bt_sortrel, 1, bti1, bti2, + BTGreaterStrategyNumber)) { + return(1); /* 1 > 2 */ + } else if (_bt_itemcmp(_bt_sortrel, 1, bti2, bti1, + BTGreaterStrategyNumber)) { + return(-1); /* 1 < 2 */ + } + return(0); /* 1 = 2 */ +} + +/*------------------------------------------------------------------------- + * priority queue methods + * + * these were more-or-less lifted from the heap section of the 1984 + * edition of gonnet's book on algorithms and data structures. they + * are coded so that the smallest element in the heap is returned (we + * use them for merging sorted runs). + * + * XXX these probably ought to be generic library functions. 
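+ *
+ * usage sketch (hedged; this is the shape of the merge loop in
+ * _bt_merge below): prime the queue with one element per live tape
+ * via _bt_pqadd, then
+ *
+ *	while (_bt_pqnext(&q, &e) >= 0)
+ *		write out e.btpqe_item and replace it with the next
+ *		item from tape e.btpqe_tape, if any;
+ *
+ * exhausted tapes simply stop contributing replacements.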
+ *------------------------------------------------------------------------- + */ + +typedef struct { + int btpqe_tape; /* tape identifier */ + BTItem btpqe_item; /* pointer to BTItem in tape buffer */ +} BTPriQueueElem; + +#define MAXELEM MAXTAPES +typedef struct { + int btpq_nelem; + BTPriQueueElem btpq_queue[MAXELEM]; + Relation btpq_rel; +} BTPriQueue; + +/* be sure to call _bt_isortcmpinit first */ +#define GREATER(a, b) \ + (_bt_isortcmp(&((a)->btpqe_item), &((b)->btpqe_item)) > 0) + +static void +_bt_pqsift(BTPriQueue *q, int parent) +{ + int child; + BTPriQueueElem e; + + for (child = parent * 2 + 1; + child < q->btpq_nelem; + child = parent * 2 + 1) { + if (child < q->btpq_nelem - 1) { + if (GREATER(&(q->btpq_queue[child]), &(q->btpq_queue[child+1]))) { + ++child; + } + } + if (GREATER(&(q->btpq_queue[parent]), &(q->btpq_queue[child]))) { + e = q->btpq_queue[child]; /* struct = */ + q->btpq_queue[child] = q->btpq_queue[parent]; /* struct = */ + q->btpq_queue[parent] = e; /* struct = */ + parent = child; + } else { + parent = child + 1; + } + } +} + +static int +_bt_pqnext(BTPriQueue *q, BTPriQueueElem *e) +{ + if (q->btpq_nelem < 1) { /* already empty */ + return(-1); + } + *e = q->btpq_queue[0]; /* struct = */ + + if (--q->btpq_nelem < 1) { /* now empty, don't sift */ + return(0); + } + q->btpq_queue[0] = q->btpq_queue[q->btpq_nelem]; /* struct = */ + _bt_pqsift(q, 0); + return(0); +} + +static void +_bt_pqadd(BTPriQueue *q, BTPriQueueElem *e) +{ + int child, parent; + + if (q->btpq_nelem >= MAXELEM) { + elog(WARN, "_bt_pqadd: queue overflow"); + } + + child = q->btpq_nelem++; + while (child > 0) { + parent = child / 2; + if (GREATER(e, &(q->btpq_queue[parent]))) { + break; + } else { + q->btpq_queue[child] = q->btpq_queue[parent]; /* struct = */ + child = parent; + } + } + + q->btpq_queue[child] = *e; /* struct = */ +} + +/*------------------------------------------------------------------------- + * tape methods + *------------------------------------------------------------------------- + */ + +#define BTITEMSZ(btitem) \ + ((btitem) ? \ + (IndexTupleDSize((btitem)->bti_itup) + \ + (sizeof(BTItemData) - sizeof(IndexTupleData))) : \ + 0) +#define SPCLEFT(tape) \ + (sizeof((tape)->bttb_data) - (tape)->bttb_top) +#define EMPTYTAPE(tape) \ + ((tape)->bttb_ntup <= 0) +#define BTTAPEMAGIC 0x19660226 + +/* + * this is what we use to shovel BTItems in and out of memory. it's + * bigger than a standard block because we are doing a lot of strictly + * sequential i/o. this is obviously something of a tradeoff since we + * are potentially reading a bunch of zeroes off of disk in many + * cases. + * + * BTItems are packed in and DOUBLEALIGN'd. + * + * the fd should not be going out to disk, strictly speaking, but it's + * the only thing like that so i'm not going to worry about wasting a + * few bytes. + */ +typedef struct { + int bttb_magic; /* magic number */ + int bttb_fd; /* file descriptor */ + int bttb_top; /* top of free space within bttb_data */ + short bttb_ntup; /* number of tuples in this block */ + short bttb_eor; /* End-Of-Run marker */ + char bttb_data[TAPEBLCKSZ - 2 * sizeof(double)]; +} BTTapeBlock; + + +/* + * reset the tape header for its next use without doing anything to + * the physical tape file. (setting bttb_top to 0 makes the block + * empty.) + */ +static void +_bt_tapereset(BTTapeBlock *tape) +{ + tape->bttb_eor = 0; + tape->bttb_top = 0; + tape->bttb_ntup = 0; +} + +/* + * rewind the physical tape file. 
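+ *
+ * note that this only repositions the VFD; it deliberately leaves the
+ * in-memory block alone, so callers that also want an empty buffer
+ * pair it with _bt_tapereset() (as _bt_spoolswap() does below).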
+ */ +static void +_bt_taperewind(BTTapeBlock *tape) +{ + (void) FileSeek(tape->bttb_fd, 0, SEEK_SET); +} + +/* + * destroy the contents of the physical tape file without destroying + * the tape data structure or removing the physical tape file. + * + * we use the VFD version of ftruncate(2) to do this rather than + * unlinking and recreating the file. you still have to wait while + * the OS frees up all of the file system blocks and stuff, but at + * least you don't have to delete and reinsert the directory entries. + */ +static void +_bt_tapeclear(BTTapeBlock *tape) +{ + /* blow away the contents of the old file */ + _bt_taperewind(tape); +#if 0 + FileSync(tape->bttb_fd); +#endif + FileTruncate(tape->bttb_fd, 0); + + /* reset the buffer */ + _bt_tapereset(tape); +} + +/* + * create a new BTTapeBlock, allocating memory for the data structure + * as well as opening a physical tape file. + */ +static BTTapeBlock * +_bt_tapecreate(char *fname) +{ + BTTapeBlock *tape = (BTTapeBlock *) palloc(sizeof(BTTapeBlock)); + + if (tape == (BTTapeBlock *) NULL) { + elog(WARN, "_bt_tapecreate: out of memory"); + } + + tape->bttb_magic = BTTAPEMAGIC; + + tape->bttb_fd = FileNameOpenFile(fname, O_RDWR|O_CREAT|O_TRUNC, 0600); + Assert(tape->bttb_fd >= 0); + + /* initialize the buffer */ + _bt_tapereset(tape); + + return(tape); +} + +/* + * destroy the BTTapeBlock structure and its physical tape file. + */ +static void +_bt_tapedestroy(BTTapeBlock *tape) +{ + FileUnlink(tape->bttb_fd); + pfree((void *) tape); +} + +/* + * flush the tape block to the file, marking End-Of-Run if requested. + */ +static void +_bt_tapewrite(BTTapeBlock *tape, int eor) +{ + tape->bttb_eor = eor; + FileWrite(tape->bttb_fd, (char*)tape, TAPEBLCKSZ); + _bt_tapereset(tape); +} + +/* + * read a tape block from the file, overwriting the current contents + * of the buffer. + * + * returns: + * - 0 if there are no more blocks in the tape or in this run (call + * _bt_tapereset to clear the End-Of-Run marker) + * - 1 if a valid block was read + */ +static int +_bt_taperead(BTTapeBlock *tape) +{ + int fd; + int nread; + + if (tape->bttb_eor) { + return(0); /* we are at End-Of-Run */ + } + + /* + * we're clobbering the old tape block, but we do need to save the + * VFD (the one in the block we're reading is bogus). + */ + fd = tape->bttb_fd; + nread = FileRead(fd, (char*) tape, TAPEBLCKSZ); + tape->bttb_fd = fd; + + if (nread != TAPEBLCKSZ) { + Assert(nread == 0); /* we are at EOF */ + return(0); + } + Assert(tape->bttb_magic == BTTAPEMAGIC); + return(1); +} + +/* + * get the next BTItem from a tape block. + * + * returns: + * - NULL if we have run out of BTItems + * - a pointer to the BTItemData in the block otherwise + * + * side effects: + * - sets 'pos' to the current position within the block. + */ +static BTItem +_bt_tapenext(BTTapeBlock *tape, char **pos) +{ + Size itemsz; + BTItem bti; + + if (*pos >= tape->bttb_data + tape->bttb_top) { + return((BTItem) NULL); + } + bti = (BTItem) *pos; + itemsz = BTITEMSZ(bti); + *pos += DOUBLEALIGN(itemsz); + return(bti); +} + +/* + * copy a BTItem into a tape block. + * + * assumes that we have already checked to see if the block has enough + * space for the item. + * + * side effects: + * + * - advances the 'top' pointer in the tape block header to point to + * the beginning of free space. 
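+ *
+ * a hedged size example: with 8-byte alignment, a 20-byte BTItem
+ * consumes DOUBLEALIGN(20) = 24 bytes of bttb_data, so bttb_top stays
+ * aligned and _bt_tapenext() can walk the block with nothing more
+ * than pointer arithmetic.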
+ */ +static void +_bt_tapeadd(BTTapeBlock *tape, BTItem item, int itemsz) +{ + (void) memcpy(tape->bttb_data + tape->bttb_top, item, itemsz); + ++tape->bttb_ntup; + tape->bttb_top += DOUBLEALIGN(itemsz); +} + +/*------------------------------------------------------------------------- + * spool methods + *------------------------------------------------------------------------- + */ + +/* + * this structure holds the bookkeeping for a simple balanced multiway + * merge. (polyphase merging is hairier than i want to get into right + * now, and i don't see why i have to care how many "tapes" i use + * right now. though if psort was in a condition that i could hack it + * to do this, you bet i would.) + */ +typedef struct { + int bts_ntapes; + int bts_tape; + BTTapeBlock **bts_itape; /* input tape blocks */ + BTTapeBlock **bts_otape; /* output tape blocks */ +} BTSpool; + +/* + * create and initialize a spool structure, including the underlying + * files. + */ +void * +_bt_spoolinit(Relation index, int ntapes) +{ + char *mktemp(); + + BTSpool *btspool = (BTSpool *) palloc(sizeof(BTSpool)); + int i; + char *fname = (char *) palloc(sizeof(TAPETEMP) + 1); + + if (btspool == (BTSpool *) NULL || fname == (char *) NULL) { + elog(WARN, "_bt_spoolinit: out of memory"); + } + (void) memset((char *) btspool, 0, sizeof(BTSpool)); + btspool->bts_ntapes = ntapes; + btspool->bts_tape = 0; + + btspool->bts_itape = + (BTTapeBlock **) palloc(sizeof(BTTapeBlock *) * ntapes); + btspool->bts_otape = + (BTTapeBlock **) palloc(sizeof(BTTapeBlock *) * ntapes); + if (btspool->bts_itape == (BTTapeBlock **) NULL || + btspool->bts_otape == (BTTapeBlock **) NULL) { + elog(WARN, "_bt_spoolinit: out of memory"); + } + + for (i = 0; i < ntapes; ++i) { + btspool->bts_itape[i] = + _bt_tapecreate(mktemp(strcpy(fname, TAPETEMP))); + btspool->bts_otape[i] = + _bt_tapecreate(mktemp(strcpy(fname, TAPETEMP))); + } + pfree((void *) fname); + + _bt_isortcmpinit(index); + + return((void *) btspool); +} + +/* + * clean up a spool structure and its substructures. + */ +void +_bt_spooldestroy(void *spool) +{ + BTSpool *btspool = (BTSpool *) spool; + int i; + + for (i = 0; i < btspool->bts_ntapes; ++i) { + _bt_tapedestroy(btspool->bts_otape[i]); + _bt_tapedestroy(btspool->bts_itape[i]); + } + pfree((void *) btspool); +} + +/* + * flush out any dirty output tape blocks + */ +static void +_bt_spoolflush(BTSpool *btspool) +{ + int i; + + for (i = 0; i < btspool->bts_ntapes; ++i) { + if (!EMPTYTAPE(btspool->bts_otape[i])) { + _bt_tapewrite(btspool->bts_otape[i], 1); + } + } +} + +/* + * swap input tapes and output tapes by swapping their file + * descriptors. additional preparation for the next merge pass + * includes rewinding the new input tapes and clearing out the new + * output tapes. + */ +static void +_bt_spoolswap(BTSpool *btspool) +{ + File tmpfd; + BTTapeBlock *itape; + BTTapeBlock *otape; + int i; + + for (i = 0; i < btspool->bts_ntapes; ++i) { + itape = btspool->bts_itape[i]; + otape = btspool->bts_otape[i]; + + /* + * swap the input and output VFDs. + */ + tmpfd = itape->bttb_fd; + itape->bttb_fd = otape->bttb_fd; + otape->bttb_fd = tmpfd; + + /* + * rewind the new input tape. + */ + _bt_taperewind(itape); + _bt_tapereset(itape); + + /* + * clear the new output tape -- it's ok to throw away the old + * inputs. 
+ */ + _bt_tapeclear(otape); + } +} + +/*------------------------------------------------------------------------- + * sorting routines + *------------------------------------------------------------------------- + */ + +/* + * spool 'btitem' into an initial run. as tape blocks are filled, the + * block BTItems are qsorted and written into some output tape (it + * doesn't matter which; we go round-robin for simplicity). the + * initial runs are therefore always just one block. + */ +void +_bt_spool(Relation index, BTItem btitem, void *spool) +{ + BTSpool *btspool = (BTSpool *) spool; + BTTapeBlock *itape; + Size itemsz; + + itape = btspool->bts_itape[btspool->bts_tape]; + itemsz = BTITEMSZ(btitem); + itemsz = DOUBLEALIGN(itemsz); + + /* + * if this buffer is too full for this BTItemData, or if we have + * run out of BTItems, we need to sort the buffer and write it + * out. in this case, the BTItemData will go into the next tape's + * buffer. + */ + if (btitem == (BTItem) NULL || SPCLEFT(itape) < itemsz) { + BTItem *parray; + BTTapeBlock *otape; + BTItem bti; + char *pos; + int btisz; + int i; + + /* + * build an array of pointers to the BTItemDatas on the input + * block. + */ + parray = (BTItem *) palloc(itape->bttb_ntup * sizeof(BTItem)); + if (parray == (BTItem *) NULL) { + elog(WARN, "_bt_spool: out of memory"); + } + pos = itape->bttb_data; + for (i = 0; i < itape->bttb_ntup; ++i) { + parray[i] = _bt_tapenext(itape, &pos); + } + + /* + * qsort the pointer array. + */ + _bt_isortcmpinit(index); + qsort((void *) parray, itape->bttb_ntup, sizeof(BTItem), _bt_isortcmp); + + /* + * write the spooled run into the output tape. we copy the + * BTItemDatas in the order dictated by the sorted array of + * BTItems, not the original order. + * + * (since everything was DOUBLEALIGN'd and is all on a single + * page, everything had *better* still fit on one page..) + */ + otape = btspool->bts_otape[btspool->bts_tape]; + for (i = 0; i < itape->bttb_ntup; ++i) { + bti = parray[i]; + btisz = BTITEMSZ(bti); + btisz = DOUBLEALIGN(btisz); + _bt_tapeadd(otape, bti, btisz); +#ifdef FASTBUILD_DEBUG + { + bool isnull; + Datum d = index_getattr(&(bti->bti_itup), 1, + RelationGetTupleDescriptor(index), + &isnull); + printf("_bt_spool: inserted <%x> into output tape %d\n", + d, btspool->bts_tape); + } +#endif /* FASTBUILD_DEBUG */ + } + + /* + * the initial runs are always single tape blocks. flush the + * output block, marking End-Of-Run. + */ + _bt_tapewrite(otape, 1); + + /* + * reset the input buffer for the next run. we don't have to + * write it out or anything -- we only use it to hold the + * unsorted BTItemDatas, the output tape contains all the + * sorted stuff. + * + * changing bts_tape changes the output tape and input tape; + * we change itape for the code below. + */ + _bt_tapereset(itape); + btspool->bts_tape = (btspool->bts_tape + 1) % btspool->bts_ntapes; + itape = btspool->bts_itape[btspool->bts_tape]; + + /* + * destroy the pointer array. + */ + pfree((void *) parray); + } + + /* insert this item into the current buffer */ + if (btitem != (BTItem) NULL) { + _bt_tapeadd(itape, btitem, itemsz); + } +} + +/* + * allocate a new, clean btree page, not linked to any siblings. 
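+ *
+ * (usage, for illustration: _bt_merge() below seeds the leaf level
+ * with _bt_blnewpage(index, &buf, &page, BTP_LEAF); linking the
+ * btpo_prev/btpo_next side pointers afterwards is the caller's job,
+ * as in _bt_buildadd().)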
+ */ +static void +_bt_blnewpage(Relation index, Buffer *buf, Page *page, int flags) +{ + BTPageOpaque opaque; + + *buf = _bt_getbuf(index, P_NEW, BT_WRITE); + *page = BufferGetPage(*buf); + _bt_pageinit(*page, BufferGetPageSize(*buf)); + opaque = (BTPageOpaque) PageGetSpecialPointer(*page); + opaque->btpo_prev = opaque->btpo_next = P_NONE; + opaque->btpo_flags = flags; +} + +/* + * slide an array of ItemIds back one slot (from P_FIRSTKEY to + * P_HIKEY). we need to do this when we discover that we have built + * an ItemId array in what has turned out to be a P_RIGHTMOST page. + */ +static void +_bt_slideleft(Relation index, Buffer buf, Page page) +{ + OffsetNumber off; + OffsetNumber maxoff; + ItemId previi; + ItemId thisii; + + maxoff = PageGetMaxOffsetNumber(page); + previi = PageGetItemId(page, P_HIKEY); + for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off)) { + thisii = PageGetItemId(page, off); + *previi = *thisii; + previi = thisii; + } + ((PageHeader) page)->pd_lower -= sizeof(ItemIdData); +} + +typedef struct { + Buffer btps_buf; + Page btps_page; + BTItem btps_lastbti; + OffsetNumber btps_lastoff; + OffsetNumber btps_firstoff; +} BTPageState; + +/* + * add an item to a disk page from a merge tape block. + * + * we must be careful to observe the following restrictions, placed + * upon us by the conventions in nbtsearch.c: + * - rightmost pages start data items at P_HIKEY instead of at + * P_FIRSTKEY. + * - duplicates cannot be split among pages unless the chain of + * duplicates starts at the first data item. + * + * a leaf page being built looks like: + * + * +----------------+---------------------------------+ + * | PageHeaderData | linp0 linp1 linp2 ... | + * +-----------+----+---------------------------------+ + * | ... linpN | ^ first | + * +-----------+--------------------------------------+ + * | ^ last | + * | | + * | v last | + * +-------------+------------------------------------+ + * | | itemN ... | + * +-------------+------------------+-----------------+ + * | ... item3 item2 item1 | "special space" | + * +--------------------------------+-----------------+ + * ^ first + * + * contrast this with the diagram in bufpage.h; note the mismatch + * between linps and items. this is because we reserve linp0 as a + * placeholder for the pointer to the "high key" item; when we have + * filled up the page, we will set linp0 to point to itemN and clear + * linpN. + * + * 'last' pointers indicate the last offset/item added to the page. + * 'first' pointers indicate the first offset/item that is part of a + * chain of duplicates extending from 'first' to 'last'. + * + * if all keys are unique, 'first' will always be the same as 'last'. 
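+ *
+ * worked example (hypothetical keys): suppose the page holds
+ * 7 8 9 9 9, 'first' points at the leftmost 9, and the incoming item
+ * (another 9) does not fit.  the whole chain of 9s is copied to the
+ * new page, the old page keeps 7 8, and the first 9 becomes the old
+ * page's high key -- so a run of duplicates stays contiguous across
+ * the split, which is what the move-right logic in nbtsearch.c
+ * depends on.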
+ */ +static void +_bt_buildadd(Relation index, BTPageState *state, BTItem bti, int flags) +{ + Buffer nbuf; + Page npage; + BTItem last_bti; + OffsetNumber first_off; + OffsetNumber last_off; + OffsetNumber off; + Size pgspc; + Size btisz; + + nbuf = state->btps_buf; + npage = state->btps_page; + first_off = state->btps_firstoff; + last_off = state->btps_lastoff; + last_bti = state->btps_lastbti; + + pgspc = PageGetFreeSpace(npage); + btisz = BTITEMSZ(bti); + btisz = DOUBLEALIGN(btisz); + if (pgspc < btisz) { + Buffer obuf = nbuf; + Page opage = npage; + OffsetNumber o, n; + ItemId ii; + ItemId hii; + + _bt_blnewpage(index, &nbuf, &npage, flags); + + /* + * if 'last' is part of a chain of duplicates that does not + * start at the beginning of the old page, the entire chain is + * copied to the new page; we delete all of the duplicates + * from the old page except the first, which becomes the high + * key item of the old page. + * + * if the chain starts at the beginning of the page or there + * is no chain ('first' == 'last'), we need only copy 'last' + * to the new page. again, 'first' (== 'last') becomes the + * high key of the old page. + * + * note that in either case, we copy at least one item to the + * new page, so 'last_bti' will always be valid. 'bti' will + * never be the first data item on the new page. + */ + if (first_off == P_FIRSTKEY) { + Assert(last_off != P_FIRSTKEY); + first_off = last_off; + } + for (o = first_off, n = P_FIRSTKEY; + o <= last_off; + o = OffsetNumberNext(o), n = OffsetNumberNext(n)) { + ii = PageGetItemId(opage, o); + (void) PageAddItem(npage, PageGetItem(opage, ii), + ii->lp_len, n, LP_USED); +#ifdef FASTBUILD_DEBUG + { + bool isnull; + BTItem tmpbti = + (BTItem) PageGetItem(npage, PageGetItemId(npage, n)); + Datum d = index_getattr(&(tmpbti->bti_itup), 1, + RelationGetTupleDescriptor(index), + &isnull); + printf("_bt_buildadd: moved <%x> to offset %d\n", + d, n); + } +#endif /* FASTBUILD_DEBUG */ + } + for (o = last_off; o > first_off; o = OffsetNumberPrev(o)) { + PageIndexTupleDelete(opage, o); + } + hii = PageGetItemId(opage, P_HIKEY); + ii = PageGetItemId(opage, first_off); + *hii = *ii; + ii->lp_flags &= ~LP_USED; + ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData); + + first_off = P_FIRSTKEY; + last_off = PageGetMaxOffsetNumber(npage); + last_bti = (BTItem) PageGetItem(npage, PageGetItemId(npage, last_off)); + + /* + * set the page (side link) pointers. + */ + { + BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage); + BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(npage); + + oopaque->btpo_next = BufferGetBlockNumber(nbuf); + nopaque->btpo_prev = BufferGetBlockNumber(obuf); + nopaque->btpo_next = P_NONE; + } + + /* + * write out the old stuff. we never want to see it again, so + * we can give up our lock (if we had one; BuildingBtree is + * set, so we aren't locking). + */ + _bt_wrtbuf(index, obuf); + } + + /* + * if this item is different from the last item added, we start a + * new chain of duplicates. 
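
The chain carry-over rule described above can be tried out away from the page machinery. Below is a toy packer under the same invariant, assuming fixed-capacity pages of ints (all figures made up): when a page fills, the trailing run of equal keys moves whole to the next page unless the run already starts at the page head.

    #include <stdio.h>

    #define CAP 4                      /* items per "page" */

    int
    main()
    {
        int keys[] = { 1, 2, 3, 3, 3, 4, 5, 5 };
        int n = sizeof(keys) / sizeof(keys[0]);
        int page[CAP];
        int used = 0, first = 0, i, j; /* 'first' starts the current chain */

        for (i = 0; i < n; i++) {
            if (used == CAP) {
                /* page full: unless the duplicate chain starts at the
                 * page head, carry the whole chain to the next page */
                int keep = (first > 0) ? first : used;

                for (j = 0; j < keep; j++)
                    printf("%d ", page[j]);
                printf("| ");
                for (j = 0; j < used - keep; j++)
                    page[j] = page[keep + j];
                used -= keep;
                first = 0;
            }
            if (used > 0 && page[used - 1] != keys[i])
                first = used;          /* new chain starts at this item */
            page[used++] = keys[i];
        }
        for (j = 0; j < used; j++)
            printf("%d ", page[j]);
        printf("\n");                  /* prints: 1 2 | 3 3 3 | 4 5 5 */
        return 0;
    }

The run of 3s moves to the second page in one piece, mirroring the way _bt_buildadd copies the chain from 'first' to 'last' and turns 'first' into the old page's high key.
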
+ */ + off = OffsetNumberNext(last_off); + (void) PageAddItem(npage, (Item) bti, btisz, off, LP_USED); +#ifdef FASTBUILD_DEBUG + { + bool isnull; + Datum d = index_getattr(&(bti->bti_itup), 1, + RelationGetTupleDescriptor(index), + &isnull); + printf("_bt_buildadd: inserted <%x> at offset %d\n", + d, off); + } +#endif /* FASTBUILD_DEBUG */ + if (last_bti == (BTItem) NULL) { + first_off = P_FIRSTKEY; + } else if (!_bt_itemcmp(index, 1, bti, last_bti, BTEqualStrategyNumber)) { + first_off = off; + } + last_off = off; + last_bti = (BTItem) PageGetItem(npage, PageGetItemId(npage, off)); + + state->btps_buf = nbuf; + state->btps_page = npage; + state->btps_lastbti = last_bti; + state->btps_lastoff = last_off; + state->btps_firstoff = first_off; +} + +/* + * take the input tapes stored by 'btspool' and perform successive + * merging passes until at most one run is left in each tape. at that + * point, merge the final tape runs into a set of btree leaves. + * + * XXX three nested loops? gross. cut me up into smaller routines. + */ +static BlockNumber +_bt_merge(Relation index, BTSpool *btspool) +{ + BTPageState state; + BlockNumber firstblk; + BTPriQueue q; + BTPriQueueElem e; + BTItem bti; + BTTapeBlock *itape; + BTTapeBlock *otape; + char *tapepos[MAXTAPES]; + int tapedone[MAXTAPES]; + int t; + int goodtapes; + int nruns; + Size btisz; + bool doleaf = false; + + /* + * initialize state needed for the merge into the btree leaf pages. + */ + (void) memset((char *) &state, 0, sizeof(BTPageState)); + _bt_blnewpage(index, &(state.btps_buf), &(state.btps_page), BTP_LEAF); + state.btps_lastoff = P_HIKEY; + state.btps_lastbti = (BTItem) NULL; + firstblk = BufferGetBlockNumber(state.btps_buf); + + do { /* pass */ + /* + * each pass starts by flushing the previous outputs and + * swapping inputs and outputs. this process also clears the + * new output tapes and rewinds the new input tapes. + */ + btspool->bts_tape = btspool->bts_ntapes - 1; + _bt_spoolflush(btspool); + _bt_spoolswap(btspool); + + nruns = 0; + + for (;;) { /* run */ + /* + * each run starts by selecting a new output tape. the + * merged results of a given run are always sent to this + * one tape. + */ + btspool->bts_tape = (btspool->bts_tape + 1) % btspool->bts_ntapes; + otape = btspool->bts_otape[btspool->bts_tape]; + + /* + * initialize the priority queue by loading it with the + * first element of the given run in each tape. since we + * are starting a new run, we reset the tape (clearing the + * End-Of-Run marker) before reading it. this means that + * _bt_taperead will return 0 only if the tape is actually + * at EOF. + */ + (void) memset((char *) &q, 0, sizeof(BTPriQueue)); + goodtapes = 0; + for (t = 0; t < btspool->bts_ntapes; ++t) { + itape = btspool->bts_itape[t]; + tapepos[t] = itape->bttb_data; + _bt_tapereset(itape); + if (_bt_taperead(itape) == 0) { + tapedone[t] = 1; + } else { + ++goodtapes; + tapedone[t] = 0; + e.btpqe_tape = t; + e.btpqe_item = _bt_tapenext(itape, &tapepos[t]); + if (e.btpqe_item != (BTItem) NULL) { + _bt_pqadd(&q, &e); + } + } + } + /* + * if we don't have any tapes with any input (i.e., they + * are all at EOF), we must be done with this pass. + */ + if (goodtapes == 0) { + break; /* for */ + } + ++nruns; + + /* + * output the smallest element from the queue until there are no + * more. + */ + while (_bt_pqnext(&q, &e) >= 0) { /* item */ + /* + * replace the element taken from priority queue, + * fetching a new block if needed. a tape can run out + * if it hits either End-Of-Run or EOF. 
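
The loop this comment introduces is a k-way merge: one cursor per input tape, repeatedly emit the smallest current item, then refill from the tape it came from. The stand-in below uses a linear scan where the real code keeps a priority queue (_bt_pqadd/_bt_pqnext), and uses -1 as a made-up End-Of-Run marker:

    #include <stdio.h>

    #define NTAPES 3

    int
    main()
    {
        /* three pre-sorted "runs", one per tape; -1 marks End-Of-Run */
        int runs[NTAPES][4] = {
            { 1, 5, 9, -1 },
            { 2, 3, 8, -1 },
            { 4, 6, 7, -1 }
        };
        int pos[NTAPES] = { 0, 0, 0 };

        for (;;) {
            int best = -1, t;

            /* find the tape whose current item is smallest */
            for (t = 0; t < NTAPES; t++) {
                if (runs[t][pos[t]] < 0)
                    continue;          /* this tape is exhausted */
                if (best < 0 || runs[t][pos[t]] < runs[best][pos[best]])
                    best = t;
            }
            if (best < 0)
                break;                 /* all tapes at End-Of-Run */

            /* output the minimum, then advance that tape's cursor, as
             * _bt_merge replaces the element it took from the queue */
            printf("%d ", runs[best][pos[best]]);
            pos[best]++;
        }
        printf("\n");                  /* prints: 1 2 3 4 5 6 7 8 9 */
        return 0;
    }
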
+ */ + t = e.btpqe_tape; + bti = e.btpqe_item; + if (bti != (BTItem) NULL) { + btisz = BTITEMSZ(bti); + btisz = DOUBLEALIGN(btisz); + if (doleaf) { + _bt_buildadd(index, &state, bti, BTP_LEAF); +#ifdef FASTBUILD_DEBUG + { + bool isnull; + Datum d = index_getattr(&(bti->bti_itup), 1, + RelationGetTupleDescriptor(index), + &isnull); + printf("_bt_merge: inserted <%x> into block %d\n", + d, BufferGetBlockNumber(state.btps_buf)); + } +#endif /* FASTBUILD_DEBUG */ + } else { + if (SPCLEFT(otape) < btisz) { + /* + * if it's full, write it out and add the + * item to the next block. (since we know + * there will be at least one more block, + * we know we do *not* want to set + * End-Of-Run here!) + */ + _bt_tapewrite(otape, 0); + } + _bt_tapeadd(otape, bti, btisz); +#ifdef FASTBUILD_DEBUG + { + bool isnull; + Datum d = index_getattr(&(bti->bti_itup), 1, + RelationGetTupleDescriptor(index), &isnull); + printf("_bt_merge: inserted <%x> into tape %d\n", + d, btspool->bts_tape); + } +#endif /* FASTBUILD_DEBUG */ + } + } +#ifdef FASTBUILD_DEBUG + { + bool isnull; + Datum d = index_getattr(&(bti->bti_itup), 1, + RelationGetTupleDescriptor(index), + &isnull); + printf("_bt_merge: got <%x> from tape %d\n", d, t); + } +#endif /* FASTBUILD_DEBUG */ + + itape = btspool->bts_itape[t]; + if (!tapedone[t]) { + BTItem newbti = _bt_tapenext(itape, &tapepos[t]); + + if (newbti == (BTItem) NULL) { + if (_bt_taperead(itape) == 0) { + tapedone[t] = 1; + } else { + tapepos[t] = itape->bttb_data; + newbti = _bt_tapenext(itape, &tapepos[t]); + } + } + if (newbti != (BTItem) NULL) { + BTPriQueueElem nexte; + + nexte.btpqe_tape = t; + nexte.btpqe_item = newbti; + _bt_pqadd(&q, &nexte); + } + } + } /* item */ + } /* run */ + + /* + * we are here because we ran out of input on all of the input + * tapes. + * + * if this pass did not generate more actual output runs than + * we have tapes, we know we have at most one run in each + * tape. this means that we are ready to merge into the final + * btree leaf pages instead of merging into a tape file. + */ + if (nruns <= btspool->bts_ntapes) { + doleaf = true; + } + } while (nruns > 0); /* pass */ + + /* + * this is the rightmost page, so the ItemId array needs to be + * slid back one slot. + */ + _bt_slideleft(index, state.btps_buf, state.btps_page); + _bt_wrtbuf(index, state.btps_buf); + + return(firstblk); +} + + +/* + * given the block number 'blk' of the first page of a set of linked + * siblings (i.e., the start of an entire level of the btree), + * construct the corresponding next level of the btree. we do this by + * placing minimum keys from each page into this page. the format of + * the internal pages is otherwise the same as for leaf pages. + */ +void +_bt_upperbuild(Relation index, BlockNumber blk, int level) +{ + Buffer rbuf; + Page rpage; + BTPageOpaque ropaque; + BTPageState state; + BlockNumber firstblk; + BTItem bti; + BTItem nbti; + OffsetNumber off; + + rbuf = _bt_getbuf(index, blk, BT_WRITE); + rpage = BufferGetPage(rbuf); + ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage); + + /* + * if we only have one page on a level, we can just make it the + * root. 
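+ */

Each level of the _bt_upperbuild recursion emits one item per child page, so a level of n pages shrinks to roughly n/fanout parent pages, and the recursion reaches a single root page after about log-base-fanout passes. A sketch of that bound, with made-up figures for leaf count and fanout:

    #include <stdio.h>

    int
    main()
    {
        long pages = 1000000;      /* leaf pages at level 0 (made up) */
        long fanout = 256;         /* separator items per page (made up) */
        int  level = 0;

        /* one parent item per child page: a level of n pages shrinks
         * to ceil(n/fanout) pages; stop when one (root) page remains */
        while (pages > 1) {
            pages = (pages + fanout - 1) / fanout;
            level++;
        }
        printf("root reached at level %d\n", level);   /* prints: 3 */
        return 0;
    }
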
+ if (P_RIGHTMOST(ropaque)) {
+ ropaque->btpo_flags |= BTP_ROOT;
+ _bt_wrtbuf(index, rbuf);
+ _bt_metaproot(index, blk);
+ return;
+ }
+ _bt_relbuf(index, rbuf, BT_WRITE);
+
+ (void) memset((char *) &state, 0, sizeof(BTPageState));
+ _bt_blnewpage(index, &(state.btps_buf), &(state.btps_page), 0);
+ state.btps_lastoff = P_HIKEY;
+ state.btps_lastbti = (BTItem) NULL;
+ firstblk = BufferGetBlockNumber(state.btps_buf);
+
+ /* for each page... */
+ do {
+ rbuf = _bt_getbuf(index, blk, BT_READ);
+ rpage = BufferGetPage(rbuf);
+ ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage);
+
+ /* for each item... */
+ if (!PageIsEmpty(rpage)) {
+ /*
+ * form a new index tuple corresponding to the minimum key
+ * of the lower page and insert it into a page at this
+ * level.
+ */
+ off = P_RIGHTMOST(ropaque) ? P_HIKEY : P_FIRSTKEY;
+ bti = (BTItem) PageGetItem(rpage, PageGetItemId(rpage, off));
+ nbti = _bt_formitem(&(bti->bti_itup));
+ ItemPointerSet(&(nbti->bti_itup.t_tid), blk, P_HIKEY);
+#ifdef FASTBUILD_DEBUG
+ {
+ bool isnull;
+ Datum d = index_getattr(&(nbti->bti_itup), 1,
+ RelationGetTupleDescriptor(index),
+ &isnull);
+ printf("_bt_upperbuild: inserting <%x> at %d\n",
+ d, level);
+ }
+#endif /* FASTBUILD_DEBUG */
+ _bt_buildadd(index, &state, nbti, 0);
+ pfree((void *) nbti);
+ }
+ blk = ropaque->btpo_next;
+ _bt_relbuf(index, rbuf, BT_READ);
+ } while (blk != P_NONE);
+
+ /*
+ * this is the rightmost page, so the ItemId array needs to be
+ * slid back one slot.
+ */
+ _bt_slideleft(index, state.btps_buf, state.btps_page);
+ _bt_wrtbuf(index, state.btps_buf);
+
+ _bt_upperbuild(index, firstblk, level + 1);
+}
+
+/*
+ * given a spool loaded by successive calls to _bt_spool, create an
+ * entire btree.
+ */
+void
+_bt_leafbuild(Relation index, void *spool)
+{
+ BTSpool *btspool = (BTSpool *) spool;
+ BlockNumber firstblk;
+
+ /*
+ * merge the runs into btree leaf pages.
+ */
+ firstblk = _bt_merge(index, btspool);
+
+ /*
+ * build the upper levels of the btree.
+ */
+ _bt_upperbuild(index, firstblk, 0);
+}
+
+#else /* !FASTBUILD */
+
+void *_bt_spoolinit(Relation index, int ntapes) { return((void *) NULL); }
+void _bt_spooldestroy(void *spool) { }
+void _bt_spool(Relation index, BTItem btitem, void *spool) { }
+void _bt_upperbuild(Relation index, BlockNumber blk, int level) { }
+void _bt_leafbuild(Relation index, void *spool) { }
+
+#endif /* !FASTBUILD */
diff --git a/src/backend/access/nbtree/nbtstrat.c b/src/backend/access/nbtree/nbtstrat.c
new file mode 100644
index 0000000000..2214c60950
--- /dev/null
+++ b/src/backend/access/nbtree/nbtstrat.c
@@ -0,0 +1,134 @@
+/*-------------------------------------------------------------------------
+ *
+ * btstrat.c--
+ * Strategy map entries for the btree indexed access method
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/Attic/nbtstrat.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/bufpage.h"
+
+#include "utils/elog.h"
+#include "utils/rel.h"
+#include "utils/excid.h"
+
+#include "access/genam.h"
+#include "access/nbtree.h"
+
+/*
+ * Note:
+ * StrategyNegate, StrategyCommute, and StrategyNegateCommute
+ * assume <, <=, ==, >=, > ordering.
+ */
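
The three transform maps defined just below encode ordinary logical identities over the assumed <, <=, ==, >=, > ordering: NOT(a < b) is a >= b, (a < b) commutes to (b > a), and NOT(a < b) commutes to (b <= a). A self-contained check of all three tables over a small integer domain (the enum here is illustrative; the real strategy numbers are simply 1 through 5 in the same order):

    #include <assert.h>
    #include <stdio.h>

    enum { LT = 1, LE, EQ, GE, GT };

    /* apply a strategy number to two ints */
    static int
    apply(int strat, int a, int b)
    {
        switch (strat) {
        case LT: return a < b;
        case LE: return a <= b;
        case EQ: return a == b;
        case GE: return a >= b;
        case GT: return a > b;
        }
        return 0;
    }

    /* the three maps, indexed by strategy - 1 as in BTNegate et al. */
    static int negate[5]  = { GE, GT, 0, LT, LE };
    static int commute[5] = { GT, GE, 0, LE, LT };
    static int negcom[5]  = { LE, LT, 0, GT, GE };

    int
    main()
    {
        int a, b, s;

        for (a = -2; a <= 2; a++)
            for (b = -2; b <= 2; b++)
                for (s = LT; s <= GT; s++) {
                    if (s == EQ)
                        continue;      /* maps to InvalidStrategy */
                    assert(!apply(s, a, b) == apply(negate[s - 1], a, b));
                    assert(apply(s, a, b) == apply(commute[s - 1], b, a));
                    assert(!apply(s, a, b) == apply(negcom[s - 1], b, a));
                }
        printf("transform maps verified\n");
        return 0;
    }
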
+static StrategyNumber BTNegate[5] = {
+ BTGreaterEqualStrategyNumber,
+ BTGreaterStrategyNumber,
+ InvalidStrategy,
+ BTLessStrategyNumber,
+ BTLessEqualStrategyNumber
+};
+
+static StrategyNumber BTCommute[5] = {
+ BTGreaterStrategyNumber,
+ BTGreaterEqualStrategyNumber,
+ InvalidStrategy,
+ BTLessEqualStrategyNumber,
+ BTLessStrategyNumber
+};
+
+static StrategyNumber BTNegateCommute[5] = {
+ BTLessEqualStrategyNumber,
+ BTLessStrategyNumber,
+ InvalidStrategy,
+ BTGreaterStrategyNumber,
+ BTGreaterEqualStrategyNumber
+};
+
+static uint16 BTLessTermData[] = { /* XXX type clash */
+ 2,
+ BTLessStrategyNumber,
+ SK_NEGATE,
+ BTLessStrategyNumber,
+ SK_NEGATE | SK_COMMUTE
+};
+
+static uint16 BTLessEqualTermData[] = { /* XXX type clash */
+ 2,
+ BTLessEqualStrategyNumber,
+ 0x0,
+ BTLessEqualStrategyNumber,
+ SK_COMMUTE
+};
+
+static uint16 BTGreaterEqualTermData[] = { /* XXX type clash */
+ 2,
+ BTGreaterEqualStrategyNumber,
+ 0x0,
+ BTGreaterEqualStrategyNumber,
+ SK_COMMUTE
+};
+
+static uint16 BTGreaterTermData[] = { /* XXX type clash */
+ 2,
+ BTGreaterStrategyNumber,
+ SK_NEGATE,
+ BTGreaterStrategyNumber,
+ SK_NEGATE | SK_COMMUTE
+};
+
+static StrategyTerm BTEqualExpressionData[] = {
+ (StrategyTerm)BTLessTermData, /* XXX */
+ (StrategyTerm)BTLessEqualTermData, /* XXX */
+ (StrategyTerm)BTGreaterEqualTermData, /* XXX */
+ (StrategyTerm)BTGreaterTermData, /* XXX */
+ NULL
+};
+
+static StrategyEvaluationData BTEvaluationData = {
+ /* XXX static for simplicity */
+
+ BTMaxStrategyNumber,
+ (StrategyTransformMap)BTNegate, /* XXX */
+ (StrategyTransformMap)BTCommute, /* XXX */
+ (StrategyTransformMap)BTNegateCommute, /* XXX */
+
+ { NULL, NULL, (StrategyExpression)BTEqualExpressionData, NULL, NULL,
+ NULL,NULL,NULL,NULL,NULL,NULL,NULL}
+};
+
+/* ----------------------------------------------------------------
+ * _bt_getstrat
+ * ----------------------------------------------------------------
+ */
+
+StrategyNumber
+_bt_getstrat(Relation rel,
+ AttrNumber attno,
+ RegProcedure proc)
+{
+ StrategyNumber strat;
+
+ strat = RelationGetStrategy(rel, attno, &BTEvaluationData, proc);
+
+ Assert(StrategyNumberIsValid(strat));
+
+ return (strat);
+}
+
+bool
+_bt_invokestrat(Relation rel,
+ AttrNumber attno,
+ StrategyNumber strat,
+ Datum left,
+ Datum right)
+{
+ return (RelationInvokeStrategy(rel, &BTEvaluationData, attno, strat,
+ left, right));
+}
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
new file mode 100644
index 0000000000..695a2b637c
--- /dev/null
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -0,0 +1,239 @@
+/*-------------------------------------------------------------------------
+ *
+ * btutils.c--
+ * Utility code for Postgres btree implementation.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtutils.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include <stdio.h>
+#include "postgres.h"
+
+#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
+
+#include "fmgr.h"
+#include "utils/elog.h"
+#include "utils/palloc.h"
+#include "utils/rel.h"
+#include "utils/excid.h"
+#include "utils/datum.h"
+
+#include "access/heapam.h"
+#include "access/genam.h"
+#include "access/iqual.h"
+#include "access/nbtree.h"
+
+ScanKey
+_bt_mkscankey(Relation rel, IndexTuple itup)
+{
+ ScanKey skey;
+ TupleDesc itupdesc;
+ int natts;
+ int i;
+ Datum arg;
+ RegProcedure proc;
+ bool null;
+
+ natts = rel->rd_rel->relnatts;
+ itupdesc = RelationGetTupleDescriptor(rel);
+
+ skey = (ScanKey) palloc(natts * sizeof(ScanKeyData));
+
+ for (i = 0; i < natts; i++) {
+ arg = index_getattr(itup, i + 1, itupdesc, &null);
+ proc = index_getprocid(rel, i + 1, BTORDER_PROC);
+ ScanKeyEntryInitialize(&skey[i],
+ 0x0, (AttrNumber) (i + 1), proc, arg);
+ }
+
+ return (skey);
+}
+
+void
+_bt_freeskey(ScanKey skey)
+{
+ pfree(skey);
+}
+
+void
+_bt_freestack(BTStack stack)
+{
+ BTStack ostack;
+
+ while (stack != (BTStack) NULL) {
+ ostack = stack;
+ stack = stack->bts_parent;
+ pfree(ostack->bts_btitem);
+ pfree(ostack);
+ }
+}
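
A scan key built by _bt_mkscankey above packages one (attribute number, comparison procedure, argument) triple per index attribute. The sketch below shows how such an array drives an attribute-at-a-time comparison during a descent; the function-pointer type is a made-up stand-in for the registered BTORDER_PROC, not the fmgr machinery the real code uses:

    #include <stdio.h>

    /* hypothetical scan-key entry: one comparator + argument per attribute */
    typedef struct {
        int (*proc)(int, int);
        int arg;
    } Key;

    static int
    int_cmp(int a, int b)
    {
        return (a > b) - (a < b);
    }

    /* compare a "tuple" against the key array, attribute by attribute;
     * the first unequal attribute decides, as in a btree descent */
    static int
    tuple_cmp(const int *tup, const Key *keys, int natts)
    {
        int i, r;

        for (i = 0; i < natts; i++) {
            r = keys[i].proc(tup[i], keys[i].arg);
            if (r != 0)
                return r;
        }
        return 0;
    }

    int
    main()
    {
        Key keys[2] = { { int_cmp, 42 }, { int_cmp, 7 } };
        int tup[2] = { 42, 9 };

        printf("%d\n", tuple_cmp(tup, keys, 2));   /* prints: 1 */
        return 0;
    }
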
+
+/*
+ * _bt_orderkeys() -- Put keys in a sensible order for conjunctive quals.
+ *
+ * The order of the keys in the qual must match the ordering imposed
+ * by the index. This routine only needs to be called if there is
+ * more than one qual clause using this index.
+ */
+void
+_bt_orderkeys(Relation relation, uint16 *numberOfKeys, ScanKey key)
+{
+ ScanKey xform;
+ ScanKeyData *cur;
+ StrategyMap map;
+ int nbytes;
+ long test;
+ int i, j;
+ int init[BTMaxStrategyNumber+1];
+
+ /* haven't looked at any strategies yet */
+ for (i = 0; i <= BTMaxStrategyNumber; i++)
+ init[i] = 0;
+
+ /* get space for the modified array of keys */
+ nbytes = BTMaxStrategyNumber * sizeof(ScanKeyData);
+ xform = (ScanKey) palloc(nbytes);
+ memset(xform, 0, nbytes);
+
+
+ /* get the strategy map for this index/attribute pair */
+ /*
+ * XXX
+ * When we support multiple keys in a single index, this is what
+ * we'll want to do. At present, the planner is hosed, so we
+ * hard-wire the attribute number below. Postgres only does single-
+ * key indices...
+ * map = IndexStrategyGetStrategyMap(RelationGetIndexStrategy(relation),
+ * BTMaxStrategyNumber,
+ * key->data[0].attributeNumber);
+ */
+ map = IndexStrategyGetStrategyMap(RelationGetIndexStrategy(relation),
+ BTMaxStrategyNumber,
+ 1 /* XXX */ );
+
+ /* check each key passed in */
+ for (i = *numberOfKeys; --i >= 0; ) {
+ cur = &key[i];
+ for (j = BTMaxStrategyNumber; --j >= 0; ) {
+ if (cur->sk_procedure == map->entry[j].sk_procedure)
+ break;
+ }
+
+ /* sanity check: we had better have found a matching strategy */
+ if (j < 0)
+ elog(WARN, "_bt_orderkeys: key has no matching strategy");
+
+ /* have we seen one of these before? */
+ if (init[j]) {
+ /* yup, use the appropriate value */
+ test =
+ (long) FMGR_PTR2(cur->sk_func, cur->sk_procedure,
+ cur->sk_argument, xform[j].sk_argument);
+ if (test)
+ xform[j].sk_argument = cur->sk_argument;
+ } else {
+ /* nope, use this value */
+ memmove(&xform[j], cur, sizeof(*cur));
+
+ init[j] = 1;
+ }
+ }
+
+ /* if = has been specified, no other key will be used */
+ if (init[BTEqualStrategyNumber - 1]) {
+ init[BTLessStrategyNumber - 1] = 0;
+ init[BTLessEqualStrategyNumber - 1] = 0;
+ init[BTGreaterEqualStrategyNumber - 1] = 0;
+ init[BTGreaterStrategyNumber - 1] = 0;
+ }
+
+ /* only one of <, <= */
+ if (init[BTLessStrategyNumber - 1]
+ && init[BTLessEqualStrategyNumber - 1]) {
+
+ ScanKeyData *lt, *le;
+
+ lt = &xform[BTLessStrategyNumber - 1];
+ le = &xform[BTLessEqualStrategyNumber - 1];
+
+ /*
+ * DO NOT use the cached function stuff here -- this is key
+ * ordering, happens only when the user expresses a hokey
+ * qualification, and gets executed only once, anyway. The
+ * transform maps are hard-coded, and can't be initialized
+ * in the correct way.
+ */
+
+ test = (long) fmgr(le->sk_procedure, le->sk_argument, lt->sk_argument);
+
+ if (test)
+ init[BTLessEqualStrategyNumber - 1] = 0;
+ else
+ init[BTLessStrategyNumber - 1] = 0;
+ }
+
+ /* only one of >, >= */
+ if (init[BTGreaterStrategyNumber - 1]
+ && init[BTGreaterEqualStrategyNumber - 1]) {
+
+ ScanKeyData *gt, *ge;
+
+ gt = &xform[BTGreaterStrategyNumber - 1];
+ ge = &xform[BTGreaterEqualStrategyNumber - 1];
+
+ /* see note above on function cache; this mirrors the <, <= case */
+ test = (long) fmgr(ge->sk_procedure, ge->sk_argument, gt->sk_argument);
+
+ if (test)
+ init[BTGreaterEqualStrategyNumber - 1] = 0;
+ else
+ init[BTGreaterStrategyNumber - 1] = 0;
+ }
+
+ /* okay, reorder and count */
+ j = 0;
+
+ for (i = BTMaxStrategyNumber; --i >= 0; )
+ if (init[i])
+ key[j++] = xform[i];
+
+ *numberOfKeys = j;
+
+ pfree(xform);
+}
+
+bool
+_bt_checkqual(IndexScanDesc scan, IndexTuple itup)
+{
+ if (scan->numberOfKeys > 0)
+ return (index_keytest(itup, RelationGetTupleDescriptor(scan->relation),
+ scan->numberOfKeys, scan->keyData));
+ else
+ return (true);
+}
+
+BTItem
+_bt_formitem(IndexTuple itup)
+{
+ int nbytes_btitem;
+ BTItem btitem;
+ Size tuplen;
+ extern Oid newoid();
+
+ /* disallow nulls in btree keys */
+ if (itup->t_info & INDEX_NULL_MASK)
+ elog(WARN, "btree indices cannot include null keys");
+
+ /* make a copy of the index tuple with room for the sequence number */
+ tuplen = IndexTupleSize(itup);
+ nbytes_btitem = tuplen +
+ (sizeof(BTItemData) - sizeof(IndexTupleData));
+
+ btitem = (BTItem) palloc(nbytes_btitem);
+ memmove((char *) &(btitem->bti_itup), (char *) itup, tuplen);
+
+ btitem->bti_oid = newoid();
+ return (btitem);
+}
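
The net effect of _bt_orderkeys on a redundant conjunctive qual can be seen with plain integers. This is a sketch of the intended dominance rules only, stated with direct comparisons rather than the fmgr-based tests in the code above: of two lower bounds keep the stricter, of two upper bounds likewise, with the strict operator winning ties; an equality key, when present, supersedes everything else.

    #include <stdio.h>

    int
    main()
    {
        /* qual: x > 5 AND x >= 7 AND x < 20 AND x <= 20 */
        int gt_arg = 5, ge_arg = 7;    /* two lower bounds */
        int lt_arg = 20, le_arg = 20;  /* two upper bounds */

        /* only one of >, >= survives: x >= b implies x > a when b > a */
        if (ge_arg > gt_arg)
            printf("keep: x >= %d\n", ge_arg);
        else
            printf("keep: x > %d\n", gt_arg);

        /* only one of <, <= survives: x <= b implies x < a when b < a;
         * on a tie the strict bound is the tighter one */
        if (le_arg < lt_arg)
            printf("keep: x <= %d\n", le_arg);
        else
            printf("keep: x < %d\n", lt_arg);
        return 0;
    }

This prints "keep: x >= 7" and "keep: x < 20", the two-key qual the reordering leaves behind.
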
