1 files changed, 614 insertions, 0 deletions
diff --git a/src/backend/access/hash/hashovfl.c b/src/backend/access/hash/hashovfl.c
new file mode 100644
index 0000000000..55ee9e9ce7
--- /dev/null
+++ b/src/backend/access/hash/hashovfl.c
@@ -0,0 +1,614 @@
+/*-------------------------------------------------------------------------
+ *
+ * hashovfl.c--
+ *    Overflow page management code for the Postgres hash access method
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *    $Header: /cvsroot/pgsql/src/backend/access/hash/hashovfl.c,v 1.1.1.1 1996/07/09 06:21:10 scrappy Exp $
+ *
+ * NOTES
+ *    Overflow pages look like ordinary relation pages.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
+
+#include "utils/elog.h"
+#include "utils/rel.h"
+#include "utils/excid.h"
+
+#include "access/genam.h"
+#include "access/hash.h"
+
+static OverflowPageAddress _hash_getovfladdr(Relation rel, Buffer *metabufp);
+static uint32 _hash_firstfreebit(uint32 map);
+
+/*
+ *  _hash_addovflpage
+ *
+ *  Add an overflow page to the page currently pointed to by the buffer 
+ *  argument 'buf'. 
+ *
+ *  *Metabufp has a read lock upon entering the function; buf has a 
+ *  write lock. 
+ *  
+ */
+Buffer
+_hash_addovflpage(Relation rel, Buffer *metabufp, Buffer buf)
+{
+    
+    OverflowPageAddress oaddr;
+    BlockNumber ovflblkno;
+    Buffer ovflbuf;
+    HashMetaPage metap;
+    HashPageOpaque ovflopaque;
+    HashPageOpaque pageopaque;
+    Page page;
+    Page ovflpage;
+    
+    /* this had better be the last page in a bucket chain */
+    page = BufferGetPage(buf);
+    _hash_checkpage(page, LH_BUCKET_PAGE|LH_OVERFLOW_PAGE);
+    pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
+    Assert(!BlockNumberIsValid(pageopaque->hasho_nextblkno));
+    
+    metap = (HashMetaPage) BufferGetPage(*metabufp);
+    _hash_checkpage((Page) metap, LH_META_PAGE);
+
+    /* allocate an empty overflow page */
+    oaddr = _hash_getovfladdr(rel, metabufp);
+    if (oaddr == InvalidOvflAddress) {
+	elog(WARN, "_hash_addovflpage: problem with _hash_getovfladdr.");
+    }
+    ovflblkno = OADDR_TO_BLKNO(OADDR_OF(SPLITNUM(oaddr), OPAGENUM(oaddr)));
+    Assert(BlockNumberIsValid(ovflblkno));
+    ovflbuf = _hash_getbuf(rel, ovflblkno, HASH_WRITE);
+    Assert(BufferIsValid(ovflbuf));
+    ovflpage = BufferGetPage(ovflbuf);
+
+    /* initialize the new overflow page */
+    _hash_pageinit(ovflpage, BufferGetPageSize(ovflbuf));
+    ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
+    ovflopaque->hasho_prevblkno = BufferGetBlockNumber(buf);
+    ovflopaque->hasho_nextblkno = InvalidBlockNumber;
+    ovflopaque->hasho_flag = LH_OVERFLOW_PAGE;
+    ovflopaque->hasho_oaddr = oaddr;
+    ovflopaque->hasho_bucket = pageopaque->hasho_bucket;
+    _hash_wrtnorelbuf(rel, ovflbuf);
+    
+    /* logically chain overflow page to previous page */
+    pageopaque->hasho_nextblkno = ovflblkno;
+    _hash_wrtnorelbuf(rel, buf);
+    return (ovflbuf);
+}
+
+/*
+ *  _hash_getovfladdr()
+ *
+ *  Find an available overflow page and return its address. 
+ *
+ *  When we enter this function, we have a read lock on *metabufp which
+ *  we change to a write lock immediately. Before exiting, the write lock
+ *  is exchanged for a read lock. 
+ *
+ */
+static OverflowPageAddress
+_hash_getovfladdr(Relation rel, Buffer *metabufp)
+{
+    HashMetaPage metap;
+    Buffer mapbuf;
+    BlockNumber blkno;
+    PageOffset offset;
+    OverflowPageAddress oaddr;
+    SplitNumber splitnum;
+    uint32 *freep;
+    uint32 max_free; 
+    uint32 bit;
+    uint32 first_page; 
+    uint32 free_bit; 
+    uint32 free_page; 
+    uint32 in_use_bits;
+    uint32 i, j;
+    
+    metap = (HashMetaPage) _hash_chgbufaccess(rel, metabufp, HASH_READ, HASH_WRITE);
+    
+    splitnum = metap->OVFL_POINT;
+    max_free = metap->SPARES[splitnum];
+    
+    free_page = (max_free - 1) >> (metap->BSHIFT + BYTE_TO_BIT);
+    free_bit = (max_free - 1) & (BMPGSZ_BIT(metap) - 1);
+    
+    /* Look through all the free maps to find the first free block */
+    first_page = metap->LAST_FREED >> (metap->BSHIFT + BYTE_TO_BIT);
+    for ( i = first_page; i <= free_page; i++ ) {
+	Page mappage;
+
+	blkno = metap->hashm_mapp[i];
+	mapbuf = _hash_getbuf(rel, blkno, HASH_WRITE);
+	mappage = BufferGetPage(mapbuf);
+	_hash_checkpage(mappage, LH_BITMAP_PAGE);
+	freep = HashPageGetBitmap(mappage);
+	Assert(freep);
+	
+	if (i == free_page)
+	    in_use_bits = free_bit;
+	else
+	    in_use_bits = BMPGSZ_BIT(metap) - 1;
+	
+	if (i == first_page) {
+	    bit = metap->LAST_FREED & (BMPGSZ_BIT(metap) - 1);
+	    j = bit / BITS_PER_MAP;
+	    bit = bit & ~(BITS_PER_MAP - 1);
+	} else {
+	    bit = 0;
+	    j = 0;
+	}
+	for (; bit <= in_use_bits; j++, bit += BITS_PER_MAP)
+	    if (freep[j] != ALL_SET)
+		goto found;
+    }
+    
+    /* No Free Page Found - have to allocate a new page */
+    metap->LAST_FREED = metap->SPARES[splitnum];
+    metap->SPARES[splitnum]++;
+    offset = metap->SPARES[splitnum] -
+	(splitnum ? metap->SPARES[splitnum - 1] : 0);
+    
+#define	OVMSG	"HASH: Out of overflow pages.  Out of luck.\n"
+    
+    if (offset > SPLITMASK) {
+	if (++splitnum >= NCACHED) {
+	    elog(WARN, OVMSG);
+	}
+	metap->OVFL_POINT = splitnum;
+	metap->SPARES[splitnum] = metap->SPARES[splitnum-1];
+	metap->SPARES[splitnum-1]--;
+	offset = 0;
+    }
+    
+    /* Check if we need to allocate a new bitmap page */
+    if (free_bit == BMPGSZ_BIT(metap) - 1) {
+	/* won't be needing old map page */
+
+	_hash_relbuf(rel, mapbuf, HASH_WRITE);
+
+	free_page++;
+	if (free_page >= NCACHED) {
+	    elog(WARN, OVMSG);
+	}
+	
+	/*
+	 * This is tricky.  The 1 indicates that you want the new page
+	 * allocated with 1 clear bit.  Actually, you are going to
+	 * allocate 2 pages from this map.  The first is going to be
+	 * the map page, the second is the overflow page we were
+	 * looking for.  The init_bitmap routine automatically, sets
+	 * the first bit of itself to indicate that the bitmap itself
+	 * is in use.  We would explicitly set the second bit, but
+	 * don't have to if we tell init_bitmap not to leave it clear
+	 * in the first place.
+	 */
+	if (_hash_initbitmap(rel, metap, OADDR_OF(splitnum, offset),
+			     1, free_page)) {
+	    elog(WARN, "overflow_page: problem with _hash_initbitmap.");
+	}
+	metap->SPARES[splitnum]++;
+	offset++;
+	if (offset > SPLITMASK) {
+	    if (++splitnum >= NCACHED) {
+		elog(WARN, OVMSG);
+	    }
+	    metap->OVFL_POINT = splitnum;
+	    metap->SPARES[splitnum] = metap->SPARES[splitnum-1];
+	    metap->SPARES[splitnum-1]--;
+	    offset = 0;
+	}
+    } else {
+	
+	/*
+	 * Free_bit addresses the last used bit.  Bump it to address
+	 * the first available bit.
+	 */
+	free_bit++;
+	SETBIT(freep, free_bit);
+	_hash_wrtbuf(rel, mapbuf);
+    }
+    
+    /* Calculate address of the new overflow page */
+    oaddr = OADDR_OF(splitnum, offset);
+    _hash_chgbufaccess(rel, metabufp, HASH_WRITE, HASH_READ);
+    return (oaddr);
+    
+ found:
+    bit = bit + _hash_firstfreebit(freep[j]);
+    SETBIT(freep, bit);
+    _hash_wrtbuf(rel, mapbuf);
+    
+    /*
+     * Bits are addressed starting with 0, but overflow pages are addressed
+     * beginning at 1. Bit is a bit addressnumber, so we need to increment
+     * it to convert it to a page number.
+     */
+    
+    bit = 1 + bit + (i * BMPGSZ_BIT(metap));
+    if (bit >= metap->LAST_FREED) {
+	metap->LAST_FREED = bit - 1;
+    }
+    
+    /* Calculate the split number for this page */
+    for (i = 0; (i < splitnum) && (bit > metap->SPARES[i]); i++)
+	;
+    offset = (i ? bit - metap->SPARES[i - 1] : bit);
+    if (offset >= SPLITMASK) {
+	elog(WARN, OVMSG);
+    }
+    
+    /* initialize this page */
+    oaddr = OADDR_OF(i, offset);
+    _hash_chgbufaccess(rel, metabufp, HASH_WRITE, HASH_READ);
+    return (oaddr);
+}
+
+/*
+ *  _hash_firstfreebit()
+ *
+ *  Return the first bit that is not set in the argument 'map'. This
+ *  function is used to find an available overflow page within a
+ *  splitnumber. 
+ * 
+ */
+static uint32
+_hash_firstfreebit(uint32 map)
+{
+    uint32 i, mask;
+    
+    mask = 0x1;
+    for (i = 0; i < BITS_PER_MAP; i++) {
+	if (!(mask & map))
+	    return (i);
+	mask = mask << 1;
+    }
+    return (i);
+}
+
+/*
+ *  _hash_freeovflpage() - 
+ *
+ *  Mark this overflow page as free and return a buffer with 
+ *  the page that follows it (which may be defined as
+ *  InvalidBuffer). 
+ *
+ */
+Buffer
+_hash_freeovflpage(Relation rel, Buffer ovflbuf)
+{
+    HashMetaPage metap;
+    Buffer metabuf;
+    Buffer mapbuf;
+    BlockNumber prevblkno;
+    BlockNumber blkno;
+    BlockNumber nextblkno;
+    HashPageOpaque ovflopaque;
+    Page ovflpage;
+    Page mappage;
+    OverflowPageAddress addr;
+    SplitNumber splitnum;
+    uint32 *freep;
+    uint32 ovflpgno;
+    int32 bitmappage, bitmapbit;
+    Bucket bucket;
+    
+    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE);
+    metap = (HashMetaPage) BufferGetPage(metabuf);
+    _hash_checkpage((Page) metap, LH_META_PAGE);
+    
+    ovflpage = BufferGetPage(ovflbuf);
+    _hash_checkpage(ovflpage, LH_OVERFLOW_PAGE);
+    ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
+    addr = ovflopaque->hasho_oaddr;
+    nextblkno = ovflopaque->hasho_nextblkno;
+    prevblkno = ovflopaque->hasho_prevblkno;
+    bucket = ovflopaque->hasho_bucket;
+    (void) memset(ovflpage, 0, BufferGetPageSize(ovflbuf));
+    _hash_wrtbuf(rel, ovflbuf);
+    
+    /* 
+     * fix up the bucket chain.  this is a doubly-linked list, so we
+     * must fix up the bucket chain members behind and ahead of the
+     * overflow page being deleted.
+     *
+     * XXX this should look like:
+     * - lock prev/next
+     * - modify/write prev/next (how to do write ordering with a
+     * doubly-linked list???)
+     * - unlock prev/next
+     */
+    if (BlockNumberIsValid(prevblkno)) {
+	Buffer prevbuf = _hash_getbuf(rel, prevblkno, HASH_WRITE);
+	Page prevpage = BufferGetPage(prevbuf);
+	HashPageOpaque prevopaque =
+	    (HashPageOpaque) PageGetSpecialPointer(prevpage);
+
+	_hash_checkpage(prevpage, LH_BUCKET_PAGE|LH_OVERFLOW_PAGE);
+	Assert(prevopaque->hasho_bucket == bucket);
+	prevopaque->hasho_nextblkno = nextblkno;
+	_hash_wrtbuf(rel, prevbuf);
+    }
+    if (BlockNumberIsValid(nextblkno)) {
+	Buffer nextbuf = _hash_getbuf(rel, nextblkno, HASH_WRITE);
+	Page nextpage = BufferGetPage(nextbuf);
+	HashPageOpaque nextopaque =
+	    (HashPageOpaque) PageGetSpecialPointer(nextpage);
+	
+	_hash_checkpage(nextpage, LH_OVERFLOW_PAGE);
+	Assert(nextopaque->hasho_bucket == bucket);
+	nextopaque->hasho_prevblkno = prevblkno;
+	_hash_wrtbuf(rel, nextbuf);
+    }
+    
+    /* 
+     * Fix up the overflow page bitmap that tracks this particular
+     * overflow page. The bitmap can be found in the MetaPageData
+     * array element hashm_mapp[bitmappage].
+     */
+    splitnum = (addr >> SPLITSHIFT);
+    ovflpgno =
+	(splitnum ? metap->SPARES[splitnum - 1] : 0) + (addr & SPLITMASK) - 1;
+    
+    if (ovflpgno < metap->LAST_FREED) {
+	metap->LAST_FREED = ovflpgno;
+    }
+    
+    bitmappage = (ovflpgno >> (metap->BSHIFT + BYTE_TO_BIT));
+    bitmapbit = ovflpgno & (BMPGSZ_BIT(metap) - 1);
+    
+    blkno = metap->hashm_mapp[bitmappage];
+    mapbuf = _hash_getbuf(rel, blkno, HASH_WRITE);
+    mappage = BufferGetPage(mapbuf);
+    _hash_checkpage(mappage, LH_BITMAP_PAGE);
+    freep = HashPageGetBitmap(mappage);
+    CLRBIT(freep, bitmapbit);
+    _hash_wrtbuf(rel, mapbuf);
+    
+    _hash_relbuf(rel, metabuf, HASH_WRITE);
+    
+    /* 
+     * now instantiate the page that replaced this one, 
+     * if it exists, and return that buffer with a write lock.
+     */
+    if (BlockNumberIsValid(nextblkno)) {
+	return (_hash_getbuf(rel, nextblkno, HASH_WRITE));
+    } else {
+	return (InvalidBuffer);
+    }
+}
+
+
+/*
+ *  _hash_initbitmap()
+ *  
+ *   Initialize a new bitmap page.  The metapage has a write-lock upon
+ *   entering the function.
+ *
+ * 'pnum' is the OverflowPageAddress of the new bitmap page.
+ * 'nbits' is how many bits to clear (i.e., make available) in the new
+ * bitmap page.  the remainder of the bits (as well as the first bit,
+ * representing the bitmap page itself) will be set.
+ * 'ndx' is the 0-based offset of the new bitmap page within the
+ * metapage's array of bitmap page OverflowPageAddresses.
+ */
+
+#define INT_MASK	((1 << INT_TO_BIT) -1)
+
+int32
+_hash_initbitmap(Relation rel,
+		 HashMetaPage metap,
+		 int32 pnum,
+		 int32 nbits,
+		 int32 ndx)
+{
+    Buffer buf;
+    BlockNumber blkno;
+    Page pg;
+    HashPageOpaque op;
+    uint32 *freep;
+    int clearbytes, clearints;
+    
+    blkno = OADDR_TO_BLKNO(pnum);
+    buf = _hash_getbuf(rel, blkno, HASH_WRITE);
+    pg = BufferGetPage(buf);
+    _hash_pageinit(pg, BufferGetPageSize(buf));
+    op = (HashPageOpaque) PageGetSpecialPointer(pg);
+    op->hasho_oaddr = InvalidOvflAddress;
+    op->hasho_prevblkno = InvalidBlockNumber;
+    op->hasho_nextblkno = InvalidBlockNumber;
+    op->hasho_flag = LH_BITMAP_PAGE;
+    op->hasho_bucket = -1;
+
+    freep = HashPageGetBitmap(pg);
+
+    /* set all of the bits above 'nbits' to 1 */
+    clearints = ((nbits - 1) >> INT_TO_BIT) + 1;
+    clearbytes = clearints << INT_TO_BYTE;
+    (void) memset((char *) freep, 0, clearbytes);
+    (void) memset(((char *) freep) + clearbytes, 0xFF,
+		  BMPGSZ_BYTE(metap) - clearbytes);
+    freep[clearints - 1] = ALL_SET << (nbits & INT_MASK);
+
+    /* bit 0 represents the new bitmap page */
+    SETBIT(freep, 0);
+        
+    /* metapage already has a write lock */
+    metap->hashm_nmaps++;
+    metap->hashm_mapp[ndx] = blkno;
+    
+    /* write out the new bitmap page (releasing its locks) */
+    _hash_wrtbuf(rel, buf);
+
+    return (0);
+}
+
+
+/*
+ *  _hash_squeezebucket(rel, bucket)
+ *
+ *  Try to squeeze the tuples onto pages occuring earlier in the
+ *  bucket chain in an attempt to free overflow pages. When we start
+ *  the "squeezing", the page from which we start taking tuples (the
+ *  "read" page) is the last bucket in the bucket chain and the page
+ *  onto which we start squeezing tuples (the "write" page) is the
+ *  first page in the bucket chain.  The read page works backward and
+ *  the write page works forward; the procedure terminates when the
+ *  read page and write page are the same page.
+ */
+void
+_hash_squeezebucket(Relation rel,
+		    HashMetaPage metap, 
+		    Bucket bucket)
+{
+    Buffer wbuf;
+    Buffer rbuf;
+    BlockNumber wblkno;		
+    BlockNumber rblkno;		
+    Page wpage;
+    Page rpage;
+    HashPageOpaque wopaque;
+    HashPageOpaque ropaque;
+    OffsetNumber woffnum;
+    OffsetNumber roffnum;
+    HashItem hitem;
+    int itemsz;
+    
+/*    elog(DEBUG, "_hash_squeezebucket: squeezing bucket %d", bucket); */
+
+    /*
+     * start squeezing into the base bucket page.
+     */
+    wblkno = BUCKET_TO_BLKNO(bucket);
+    wbuf = _hash_getbuf(rel, wblkno, HASH_WRITE);
+    wpage = BufferGetPage(wbuf);
+    _hash_checkpage(wpage, LH_BUCKET_PAGE);
+    wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);
+    
+    /*
+     * if there aren't any overflow pages, there's nothing to squeeze.
+     */
+    if (!BlockNumberIsValid(wopaque->hasho_nextblkno)) {
+	_hash_relbuf(rel, wbuf, HASH_WRITE);
+	return;
+    }
+    
+    /*
+     * find the last page in the bucket chain by starting at the base
+     * bucket page and working forward.
+     *
+     * XXX if chains tend to be long, we should probably move forward
+     * using HASH_READ and then _hash_chgbufaccess to HASH_WRITE when
+     * we reach the end.  if they are short we probably don't care
+     * very much.  if the hash function is working at all, they had
+     * better be short..
+     */
+    ropaque = wopaque;
+    do {
+	rblkno = ropaque->hasho_nextblkno;
+	if (ropaque != wopaque) {
+	    _hash_relbuf(rel, rbuf, HASH_WRITE);
+	}
+	rbuf = _hash_getbuf(rel, rblkno, HASH_WRITE);
+	rpage = BufferGetPage(rbuf);
+	_hash_checkpage(rpage, LH_OVERFLOW_PAGE);
+	Assert(!PageIsEmpty(rpage));
+	ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
+	Assert(ropaque->hasho_bucket == bucket);
+    } while (BlockNumberIsValid(ropaque->hasho_nextblkno));
+
+    /*
+     * squeeze the tuples.
+     */
+    roffnum = FirstOffsetNumber;
+    for(;;) {
+	hitem = (HashItem) PageGetItem(rpage, PageGetItemId(rpage, roffnum));
+	itemsz = IndexTupleDSize(hitem->hash_itup) 
+	    + (sizeof(HashItemData) - sizeof(IndexTupleData));
+	itemsz = DOUBLEALIGN(itemsz);
+	
+	/*
+	 * walk up the bucket chain, looking for a page big enough for
+	 * this item.
+	 */
+	while (PageGetFreeSpace(wpage) < itemsz) {
+	    wblkno = wopaque->hasho_nextblkno;
+
+	    _hash_wrtbuf(rel, wbuf);
+
+	    if (!BlockNumberIsValid(wblkno) || (rblkno == wblkno)) {
+		_hash_wrtbuf(rel, rbuf);
+		/* wbuf is already released */
+		return;
+	    }
+	    
+	    wbuf = _hash_getbuf(rel, wblkno, HASH_WRITE);
+	    wpage = BufferGetPage(wbuf);
+	    _hash_checkpage(wpage, LH_OVERFLOW_PAGE);
+	    Assert(!PageIsEmpty(wpage));
+	    wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);
+	    Assert(wopaque->hasho_bucket == bucket);
+	}
+	
+	/* 
+	 * if we're here, we have found room so insert on the "write"
+	 * page.
+	 */
+	woffnum = OffsetNumberNext(PageGetMaxOffsetNumber(wpage));
+	(void) PageAddItem(wpage, (Item) hitem, itemsz, woffnum, LP_USED);
+	
+	/* 
+	 * delete the tuple from the "read" page.
+	 * PageIndexTupleDelete repacks the ItemId array, so 'roffnum'
+	 * will be "advanced" to the "next" ItemId.
+	 */
+	PageIndexTupleDelete(rpage, roffnum);
+	_hash_wrtnorelbuf(rel, rbuf);
+	
+	/*
+	 * if the "read" page is now empty because of the deletion,
+	 * free it.
+	 */
+	if (PageIsEmpty(rpage) && (ropaque->hasho_flag & LH_OVERFLOW_PAGE)) {
+	    rblkno = ropaque->hasho_prevblkno;
+	    Assert(BlockNumberIsValid(rblkno));
+
+	    /*
+	     * free this overflow page.  the extra _hash_relbuf is
+	     * because _hash_freeovflpage gratuitously returns the
+	     * next page (we want the previous page and will get it
+	     * ourselves later).
+	     */
+	    rbuf = _hash_freeovflpage(rel, rbuf);
+	    if (BufferIsValid(rbuf)) {
+		_hash_relbuf(rel, rbuf, HASH_WRITE);
+	    }
+	    
+	    if (rblkno == wblkno) {
+		/* rbuf is already released */
+		_hash_wrtbuf(rel, wbuf);
+		return;
+	    }
+	    
+	    rbuf = _hash_getbuf(rel, rblkno, HASH_WRITE);
+	    rpage = BufferGetPage(rbuf);
+	    _hash_checkpage(rpage, LH_OVERFLOW_PAGE);
+	    Assert(!PageIsEmpty(rpage));
+	    ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
+	    Assert(ropaque->hasho_bucket == bucket);
+
+	    roffnum = FirstOffsetNumber;
+	}
+    }
+}