Diffstat (limited to 'src/backend/access/hash/hashovfl.c')
-rw-r--r--  src/backend/access/hash/hashovfl.c  614
1 files changed, 614 insertions, 0 deletions
diff --git a/src/backend/access/hash/hashovfl.c b/src/backend/access/hash/hashovfl.c
new file mode 100644
index 0000000000..55ee9e9ce7
--- /dev/null
+++ b/src/backend/access/hash/hashovfl.c
@@ -0,0 +1,614 @@
+/*-------------------------------------------------------------------------
+ *
+ * hashovfl.c--
+ *    Overflow page management code for the Postgres hash access method
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *    $Header: /cvsroot/pgsql/src/backend/access/hash/hashovfl.c,v 1.1.1.1 1996/07/09 06:21:10 scrappy Exp $
+ *
+ * NOTES
+ *    Overflow pages look like ordinary relation pages.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
+
+#include "utils/elog.h"
+#include "utils/rel.h"
+#include "utils/excid.h"
+
+#include "access/genam.h"
+#include "access/hash.h"
+
+static OverflowPageAddress _hash_getovfladdr(Relation rel, Buffer *metabufp);
+static uint32 _hash_firstfreebit(uint32 map);
+
+/*
+ * _hash_addovflpage
+ *
+ * Add an overflow page to the page currently pointed to by the buffer
+ * argument 'buf'.
+ *
+ * *Metabufp has a read lock upon entering the function; buf has a
+ * write lock.
+ *
+ */
+Buffer
+_hash_addovflpage(Relation rel, Buffer *metabufp, Buffer buf)
+{
+
+    OverflowPageAddress oaddr;
+    BlockNumber ovflblkno;
+    Buffer ovflbuf;
+    HashMetaPage metap;
+    HashPageOpaque ovflopaque;
+    HashPageOpaque pageopaque;
+    Page page;
+    Page ovflpage;
+
+    /* this had better be the last page in a bucket chain */
+    page = BufferGetPage(buf);
+    _hash_checkpage(page, LH_BUCKET_PAGE|LH_OVERFLOW_PAGE);
+    pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
+    Assert(!BlockNumberIsValid(pageopaque->hasho_nextblkno));
+
+    metap = (HashMetaPage) BufferGetPage(*metabufp);
+    _hash_checkpage((Page) metap, LH_META_PAGE);
+
+    /* allocate an empty overflow page */
+    oaddr = _hash_getovfladdr(rel, metabufp);
+    if (oaddr == InvalidOvflAddress) {
+        elog(WARN, "_hash_addovflpage: problem with _hash_getovfladdr.");
+    }
+    ovflblkno = OADDR_TO_BLKNO(OADDR_OF(SPLITNUM(oaddr), OPAGENUM(oaddr)));
+    Assert(BlockNumberIsValid(ovflblkno));
+    ovflbuf = _hash_getbuf(rel, ovflblkno, HASH_WRITE);
+    Assert(BufferIsValid(ovflbuf));
+    ovflpage = BufferGetPage(ovflbuf);
+
+    /* initialize the new overflow page */
+    _hash_pageinit(ovflpage, BufferGetPageSize(ovflbuf));
+    ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
+    ovflopaque->hasho_prevblkno = BufferGetBlockNumber(buf);
+    ovflopaque->hasho_nextblkno = InvalidBlockNumber;
+    ovflopaque->hasho_flag = LH_OVERFLOW_PAGE;
+    ovflopaque->hasho_oaddr = oaddr;
+    ovflopaque->hasho_bucket = pageopaque->hasho_bucket;
+    _hash_wrtnorelbuf(rel, ovflbuf);
+
+    /* logically chain overflow page to previous page */
+    pageopaque->hasho_nextblkno = ovflblkno;
+    _hash_wrtnorelbuf(rel, buf);
+    return (ovflbuf);
+}
+
+/*
+ * _hash_getovfladdr()
+ *
+ * Find an available overflow page and return its address.
+ *
+ * When we enter this function, we have a read lock on *metabufp which
+ * we change to a write lock immediately.  Before exiting, the write lock
+ * is exchanged for a read lock.
+ *
+ */
+static OverflowPageAddress
+_hash_getovfladdr(Relation rel, Buffer *metabufp)
+{
+    HashMetaPage metap;
+    Buffer mapbuf;
+    BlockNumber blkno;
+    PageOffset offset;
+    OverflowPageAddress oaddr;
+    SplitNumber splitnum;
+    uint32 *freep;
+    uint32 max_free;
+    uint32 bit;
+    uint32 first_page;
+    uint32 free_bit;
+    uint32 free_page;
+    uint32 in_use_bits;
+    uint32 i, j;
+
+    metap = (HashMetaPage) _hash_chgbufaccess(rel, metabufp, HASH_READ, HASH_WRITE);
+
+    splitnum = metap->OVFL_POINT;
+    max_free = metap->SPARES[splitnum];
+
+    free_page = (max_free - 1) >> (metap->BSHIFT + BYTE_TO_BIT);
+    free_bit = (max_free - 1) & (BMPGSZ_BIT(metap) - 1);
+
+    /* Look through all the free maps to find the first free block */
+    first_page = metap->LAST_FREED >> (metap->BSHIFT + BYTE_TO_BIT);
+    for (i = first_page; i <= free_page; i++) {
+        Page mappage;
+
+        blkno = metap->hashm_mapp[i];
+        mapbuf = _hash_getbuf(rel, blkno, HASH_WRITE);
+        mappage = BufferGetPage(mapbuf);
+        _hash_checkpage(mappage, LH_BITMAP_PAGE);
+        freep = HashPageGetBitmap(mappage);
+        Assert(freep);
+
+        if (i == free_page)
+            in_use_bits = free_bit;
+        else
+            in_use_bits = BMPGSZ_BIT(metap) - 1;
+
+        if (i == first_page) {
+            bit = metap->LAST_FREED & (BMPGSZ_BIT(metap) - 1);
+            j = bit / BITS_PER_MAP;
+            bit = bit & ~(BITS_PER_MAP - 1);
+        } else {
+            bit = 0;
+            j = 0;
+        }
+        for (; bit <= in_use_bits; j++, bit += BITS_PER_MAP)
+            if (freep[j] != ALL_SET)
+                goto found;
+    }
+
+    /* No Free Page Found - have to allocate a new page */
+    metap->LAST_FREED = metap->SPARES[splitnum];
+    metap->SPARES[splitnum]++;
+    offset = metap->SPARES[splitnum] -
+        (splitnum ? metap->SPARES[splitnum - 1] : 0);
+
+#define OVMSG "HASH: Out of overflow pages.  Out of luck.\n"
+
+    if (offset > SPLITMASK) {
+        if (++splitnum >= NCACHED) {
+            elog(WARN, OVMSG);
+        }
+        metap->OVFL_POINT = splitnum;
+        metap->SPARES[splitnum] = metap->SPARES[splitnum-1];
+        metap->SPARES[splitnum-1]--;
+        offset = 0;
+    }
+
+    /* Check if we need to allocate a new bitmap page */
+    if (free_bit == BMPGSZ_BIT(metap) - 1) {
+        /* won't be needing old map page */
+
+        _hash_relbuf(rel, mapbuf, HASH_WRITE);
+
+        free_page++;
+        if (free_page >= NCACHED) {
+            elog(WARN, OVMSG);
+        }
+
+        /*
+         * This is tricky.  The 1 indicates that you want the new page
+         * allocated with 1 clear bit.  Actually, you are going to
+         * allocate 2 pages from this map.  The first is going to be
+         * the map page, the second is the overflow page we were
+         * looking for.  The init_bitmap routine automatically sets
+         * the first bit of itself to indicate that the bitmap itself
+         * is in use.  We would explicitly set the second bit, but
+         * don't have to if we tell init_bitmap not to leave it clear
+         * in the first place.
+         */
+        if (_hash_initbitmap(rel, metap, OADDR_OF(splitnum, offset),
+                             1, free_page)) {
+            elog(WARN, "overflow_page: problem with _hash_initbitmap.");
+        }
+        metap->SPARES[splitnum]++;
+        offset++;
+        if (offset > SPLITMASK) {
+            if (++splitnum >= NCACHED) {
+                elog(WARN, OVMSG);
+            }
+            metap->OVFL_POINT = splitnum;
+            metap->SPARES[splitnum] = metap->SPARES[splitnum-1];
+            metap->SPARES[splitnum-1]--;
+            offset = 0;
+        }
+    } else {
+
+        /*
+         * Free_bit addresses the last used bit.  Bump it to address
+         * the first available bit.
+         */
+        free_bit++;
+        SETBIT(freep, free_bit);
+        _hash_wrtbuf(rel, mapbuf);
+    }
+
+    /* Calculate address of the new overflow page */
+    oaddr = OADDR_OF(splitnum, offset);
+    _hash_chgbufaccess(rel, metabufp, HASH_WRITE, HASH_READ);
+    return (oaddr);
+
+found:
+    bit = bit + _hash_firstfreebit(freep[j]);
+    SETBIT(freep, bit);
+    _hash_wrtbuf(rel, mapbuf);
+
+    /*
+     * Bits are addressed starting with 0, but overflow pages are addressed
+     * beginning at 1.  Bit is a bit address number, so we need to increment
+     * it to convert it to a page number.
+     */
+
+    bit = 1 + bit + (i * BMPGSZ_BIT(metap));
+    if (bit >= metap->LAST_FREED) {
+        metap->LAST_FREED = bit - 1;
+    }
+
+    /* Calculate the split number for this page */
+    for (i = 0; (i < splitnum) && (bit > metap->SPARES[i]); i++)
+        ;
+    offset = (i ? bit - metap->SPARES[i - 1] : bit);
+    if (offset >= SPLITMASK) {
+        elog(WARN, OVMSG);
+    }
+
+    /* initialize this page */
+    oaddr = OADDR_OF(i, offset);
+    _hash_chgbufaccess(rel, metabufp, HASH_WRITE, HASH_READ);
+    return (oaddr);
+}
+
+/*
+ * _hash_firstfreebit()
+ *
+ * Return the first bit that is not set in the argument 'map'.  This
+ * function is used to find an available overflow page within a
+ * splitnumber.
+ *
+ */
+static uint32
+_hash_firstfreebit(uint32 map)
+{
+    uint32 i, mask;
+
+    mask = 0x1;
+    for (i = 0; i < BITS_PER_MAP; i++) {
+        if (!(mask & map))
+            return (i);
+        mask = mask << 1;
+    }
+    return (i);
+}
+
+/*
+ * _hash_freeovflpage() -
+ *
+ * Mark this overflow page as free and return a buffer with
+ * the page that follows it (which may be defined as
+ * InvalidBuffer).
+ *
+ */
+Buffer
+_hash_freeovflpage(Relation rel, Buffer ovflbuf)
+{
+    HashMetaPage metap;
+    Buffer metabuf;
+    Buffer mapbuf;
+    BlockNumber prevblkno;
+    BlockNumber blkno;
+    BlockNumber nextblkno;
+    HashPageOpaque ovflopaque;
+    Page ovflpage;
+    Page mappage;
+    OverflowPageAddress addr;
+    SplitNumber splitnum;
+    uint32 *freep;
+    uint32 ovflpgno;
+    int32 bitmappage, bitmapbit;
+    Bucket bucket;
+
+    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE);
+    metap = (HashMetaPage) BufferGetPage(metabuf);
+    _hash_checkpage((Page) metap, LH_META_PAGE);
+
+    ovflpage = BufferGetPage(ovflbuf);
+    _hash_checkpage(ovflpage, LH_OVERFLOW_PAGE);
+    ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
+    addr = ovflopaque->hasho_oaddr;
+    nextblkno = ovflopaque->hasho_nextblkno;
+    prevblkno = ovflopaque->hasho_prevblkno;
+    bucket = ovflopaque->hasho_bucket;
+    (void) memset(ovflpage, 0, BufferGetPageSize(ovflbuf));
+    _hash_wrtbuf(rel, ovflbuf);
+
+    /*
+     * fix up the bucket chain.  this is a doubly-linked list, so we
+     * must fix up the bucket chain members behind and ahead of the
+     * overflow page being deleted.
+     *
+     * XXX this should look like:
+     *    - lock prev/next
+     *    - modify/write prev/next  (how to do write ordering with a
+     *      doubly-linked list???)
+     *    - unlock prev/next
+     */
+    if (BlockNumberIsValid(prevblkno)) {
+        Buffer prevbuf = _hash_getbuf(rel, prevblkno, HASH_WRITE);
+        Page prevpage = BufferGetPage(prevbuf);
+        HashPageOpaque prevopaque =
+            (HashPageOpaque) PageGetSpecialPointer(prevpage);
+
+        _hash_checkpage(prevpage, LH_BUCKET_PAGE|LH_OVERFLOW_PAGE);
+        Assert(prevopaque->hasho_bucket == bucket);
+        prevopaque->hasho_nextblkno = nextblkno;
+        _hash_wrtbuf(rel, prevbuf);
+    }
+    if (BlockNumberIsValid(nextblkno)) {
+        Buffer nextbuf = _hash_getbuf(rel, nextblkno, HASH_WRITE);
+        Page nextpage = BufferGetPage(nextbuf);
+        HashPageOpaque nextopaque =
+            (HashPageOpaque) PageGetSpecialPointer(nextpage);
+
+        _hash_checkpage(nextpage, LH_OVERFLOW_PAGE);
+        Assert(nextopaque->hasho_bucket == bucket);
+        nextopaque->hasho_prevblkno = prevblkno;
+        _hash_wrtbuf(rel, nextbuf);
+    }
+
+    /*
+     * Fix up the overflow page bitmap that tracks this particular
+     * overflow page.  The bitmap can be found in the MetaPageData
+     * array element hashm_mapp[bitmappage].
+     */
+    splitnum = (addr >> SPLITSHIFT);
+    ovflpgno =
+        (splitnum ? metap->SPARES[splitnum - 1] : 0) + (addr & SPLITMASK) - 1;
+
+    if (ovflpgno < metap->LAST_FREED) {
+        metap->LAST_FREED = ovflpgno;
+    }
+
+    bitmappage = (ovflpgno >> (metap->BSHIFT + BYTE_TO_BIT));
+    bitmapbit = ovflpgno & (BMPGSZ_BIT(metap) - 1);
+
+    blkno = metap->hashm_mapp[bitmappage];
+    mapbuf = _hash_getbuf(rel, blkno, HASH_WRITE);
+    mappage = BufferGetPage(mapbuf);
+    _hash_checkpage(mappage, LH_BITMAP_PAGE);
+    freep = HashPageGetBitmap(mappage);
+    CLRBIT(freep, bitmapbit);
+    _hash_wrtbuf(rel, mapbuf);
+
+    _hash_relbuf(rel, metabuf, HASH_WRITE);
+
+    /*
+     * now instantiate the page that replaced this one,
+     * if it exists, and return that buffer with a write lock.
+     */
+    if (BlockNumberIsValid(nextblkno)) {
+        return (_hash_getbuf(rel, nextblkno, HASH_WRITE));
+    } else {
+        return (InvalidBuffer);
+    }
+}
+
+
+/*
+ * _hash_initbitmap()
+ *
+ * Initialize a new bitmap page.  The metapage has a write-lock upon
+ * entering the function.
+ *
+ * 'pnum' is the OverflowPageAddress of the new bitmap page.
+ * 'nbits' is how many bits to clear (i.e., make available) in the new
+ * bitmap page.  the remainder of the bits (as well as the first bit,
+ * representing the bitmap page itself) will be set.
+ * 'ndx' is the 0-based offset of the new bitmap page within the
+ * metapage's array of bitmap page OverflowPageAddresses.
+ */
+
+#define INT_MASK ((1 << INT_TO_BIT) - 1)
+
+int32
+_hash_initbitmap(Relation rel,
+                 HashMetaPage metap,
+                 int32 pnum,
+                 int32 nbits,
+                 int32 ndx)
+{
+    Buffer buf;
+    BlockNumber blkno;
+    Page pg;
+    HashPageOpaque op;
+    uint32 *freep;
+    int clearbytes, clearints;
+
+    blkno = OADDR_TO_BLKNO(pnum);
+    buf = _hash_getbuf(rel, blkno, HASH_WRITE);
+    pg = BufferGetPage(buf);
+    _hash_pageinit(pg, BufferGetPageSize(buf));
+    op = (HashPageOpaque) PageGetSpecialPointer(pg);
+    op->hasho_oaddr = InvalidOvflAddress;
+    op->hasho_prevblkno = InvalidBlockNumber;
+    op->hasho_nextblkno = InvalidBlockNumber;
+    op->hasho_flag = LH_BITMAP_PAGE;
+    op->hasho_bucket = -1;
+
+    freep = HashPageGetBitmap(pg);
+
+    /* set all of the bits above 'nbits' to 1 */
+    clearints = ((nbits - 1) >> INT_TO_BIT) + 1;
+    clearbytes = clearints << INT_TO_BYTE;
+    (void) memset((char *) freep, 0, clearbytes);
+    (void) memset(((char *) freep) + clearbytes, 0xFF,
+                  BMPGSZ_BYTE(metap) - clearbytes);
+    freep[clearints - 1] = ALL_SET << (nbits & INT_MASK);
+
+    /* bit 0 represents the new bitmap page */
+    SETBIT(freep, 0);
+
+    /* metapage already has a write lock */
+    metap->hashm_nmaps++;
+    metap->hashm_mapp[ndx] = blkno;
+
+    /* write out the new bitmap page (releasing its locks) */
+    _hash_wrtbuf(rel, buf);
+
+    return (0);
+}
+
+
+/*
+ * _hash_squeezebucket(rel, bucket)
+ *
+ * Try to squeeze the tuples onto pages occurring earlier in the
+ * bucket chain in an attempt to free overflow pages.  When we start
+ * the "squeezing", the page from which we start taking tuples (the
+ * "read" page) is the last page in the bucket chain and the page
+ * onto which we start squeezing tuples (the "write" page) is the
+ * first page in the bucket chain.  The read page works backward and
+ * the write page works forward; the procedure terminates when the
+ * read page and write page are the same page.
+ */
+void
+_hash_squeezebucket(Relation rel,
+                    HashMetaPage metap,
+                    Bucket bucket)
+{
+    Buffer wbuf;
+    Buffer rbuf;
+    BlockNumber wblkno;
+    BlockNumber rblkno;
+    Page wpage;
+    Page rpage;
+    HashPageOpaque wopaque;
+    HashPageOpaque ropaque;
+    OffsetNumber woffnum;
+    OffsetNumber roffnum;
+    HashItem hitem;
+    int itemsz;
+
+/*  elog(DEBUG, "_hash_squeezebucket: squeezing bucket %d", bucket); */
+
+    /*
+     * start squeezing into the base bucket page.
+     */
+    wblkno = BUCKET_TO_BLKNO(bucket);
+    wbuf = _hash_getbuf(rel, wblkno, HASH_WRITE);
+    wpage = BufferGetPage(wbuf);
+    _hash_checkpage(wpage, LH_BUCKET_PAGE);
+    wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);
+
+    /*
+     * if there aren't any overflow pages, there's nothing to squeeze.
+     */
+    if (!BlockNumberIsValid(wopaque->hasho_nextblkno)) {
+        _hash_relbuf(rel, wbuf, HASH_WRITE);
+        return;
+    }
+
+    /*
+     * find the last page in the bucket chain by starting at the base
+     * bucket page and working forward.
+     *
+     * XXX if chains tend to be long, we should probably move forward
+     * using HASH_READ and then _hash_chgbufaccess to HASH_WRITE when
+     * we reach the end.  if they are short we probably don't care
+     * very much.  if the hash function is working at all, they had
+     * better be short..
+     */
+    ropaque = wopaque;
+    do {
+        rblkno = ropaque->hasho_nextblkno;
+        if (ropaque != wopaque) {
+            _hash_relbuf(rel, rbuf, HASH_WRITE);
+        }
+        rbuf = _hash_getbuf(rel, rblkno, HASH_WRITE);
+        rpage = BufferGetPage(rbuf);
+        _hash_checkpage(rpage, LH_OVERFLOW_PAGE);
+        Assert(!PageIsEmpty(rpage));
+        ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
+        Assert(ropaque->hasho_bucket == bucket);
+    } while (BlockNumberIsValid(ropaque->hasho_nextblkno));
+
+    /*
+     * squeeze the tuples.
+     */
+    roffnum = FirstOffsetNumber;
+    for (;;) {
+        hitem = (HashItem) PageGetItem(rpage, PageGetItemId(rpage, roffnum));
+        itemsz = IndexTupleDSize(hitem->hash_itup)
+            + (sizeof(HashItemData) - sizeof(IndexTupleData));
+        itemsz = DOUBLEALIGN(itemsz);
+
+        /*
+         * walk up the bucket chain, looking for a page big enough for
+         * this item.
+         */
+        while (PageGetFreeSpace(wpage) < itemsz) {
+            wblkno = wopaque->hasho_nextblkno;
+
+            _hash_wrtbuf(rel, wbuf);
+
+            if (!BlockNumberIsValid(wblkno) || (rblkno == wblkno)) {
+                _hash_wrtbuf(rel, rbuf);
+                /* wbuf is already released */
+                return;
+            }
+
+            wbuf = _hash_getbuf(rel, wblkno, HASH_WRITE);
+            wpage = BufferGetPage(wbuf);
+            _hash_checkpage(wpage, LH_OVERFLOW_PAGE);
+            Assert(!PageIsEmpty(wpage));
+            wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);
+            Assert(wopaque->hasho_bucket == bucket);
+        }
+
+        /*
+         * if we're here, we have found room so insert on the "write"
+         * page.
+         */
+        woffnum = OffsetNumberNext(PageGetMaxOffsetNumber(wpage));
+        (void) PageAddItem(wpage, (Item) hitem, itemsz, woffnum, LP_USED);
+
+        /*
+         * delete the tuple from the "read" page.
+         * PageIndexTupleDelete repacks the ItemId array, so 'roffnum'
+         * will be "advanced" to the "next" ItemId.
+         */
+        PageIndexTupleDelete(rpage, roffnum);
+        _hash_wrtnorelbuf(rel, rbuf);
+
+        /*
+         * if the "read" page is now empty because of the deletion,
+         * free it.
+         */
+        if (PageIsEmpty(rpage) && (ropaque->hasho_flag & LH_OVERFLOW_PAGE)) {
+            rblkno = ropaque->hasho_prevblkno;
+            Assert(BlockNumberIsValid(rblkno));
+
+            /*
+             * free this overflow page.  the extra _hash_relbuf is
+             * because _hash_freeovflpage gratuitously returns the
+             * next page (we want the previous page and will get it
+             * ourselves later).
+             */
+            rbuf = _hash_freeovflpage(rel, rbuf);
+            if (BufferIsValid(rbuf)) {
+                _hash_relbuf(rel, rbuf, HASH_WRITE);
+            }
+
+            if (rblkno == wblkno) {
+                /* rbuf is already released */
+                _hash_wrtbuf(rel, wbuf);
+                return;
+            }
+
+            rbuf = _hash_getbuf(rel, rblkno, HASH_WRITE);
+            rpage = BufferGetPage(rbuf);
+            _hash_checkpage(rpage, LH_OVERFLOW_PAGE);
+            Assert(!PageIsEmpty(rpage));
+            ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
+            Assert(ropaque->hasho_bucket == bucket);
+
+            roffnum = FirstOffsetNumber;
+        }
+    }
+}
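For readers following the SPARES/OVFL_POINT arithmetic above, the split-number/page-offset packing performed by OADDR_OF, SPLITNUM and OPAGENUM is easy to check in isolation. The following is a minimal sketch, not part of the patch, assuming the SPLITSHIFT (11) and SPLITMASK (0x7FF) values that hash.h of this vintage defines; the macros are re-declared locally so the example compiles on its own.

/*
 * Standalone sketch of the OverflowPageAddress encoding.  SPLITSHIFT
 * and SPLITMASK are assumed values matching the era's hash.h; treat
 * them as illustrative.
 */
#include <stdio.h>
#include <stdint.h>

#define SPLITSHIFT  11
#define SPLITMASK   0x7FF
#define SPLITNUM(N) (((uint32_t) (N)) >> SPLITSHIFT)
#define OPAGENUM(N) ((N) & SPLITMASK)
#define OADDR_OF(S, O) ((uint32_t) (((uint32_t) (S) << SPLITSHIFT) + (O)))

int
main(void)
{
    /* pack split number 3 and page offset 5 into one address */
    uint32_t oaddr = OADDR_OF(3, 5);

    /* unpacking recovers both fields */
    printf("oaddr = 0x%x, splitnum = %u, pagenum = %u\n",
           oaddr, SPLITNUM(oaddr), OPAGENUM(oaddr));
    /* prints: oaddr = 0x1805, splitnum = 3, pagenum = 5 */
    return 0;
}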
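The free-map scan in _hash_getovfladdr and the CLRBIT in _hash_freeovflpage form an allocate/free pair over the same bitmap: a set bit means the overflow page is in use, a clear bit means it is free, and fully-used words (ALL_SET) are skipped whole. A self-contained sketch of that discipline, with SETBIT/CLRBIT re-declared locally as stand-ins for the real macros:

/*
 * Sketch of the bitmap free-list discipline.  first_free_bit() is the
 * same scan as _hash_firstfreebit; the macros are local stand-ins.
 */
#include <stdio.h>
#include <stdint.h>

#define BITS_PER_MAP 32
#define ALL_SET      ((uint32_t) ~0)
#define SETBIT(map, i)  ((map)[(i) / 32] |=  ((uint32_t) 1 << ((i) % 32)))
#define CLRBIT(map, i)  ((map)[(i) / 32] &= ~((uint32_t) 1 << ((i) % 32)))

/* index of the first clear bit in 'map', or BITS_PER_MAP if none */
static uint32_t
first_free_bit(uint32_t map)
{
    uint32_t i, mask = 0x1;

    for (i = 0; i < BITS_PER_MAP; i++) {
        if (!(mask & map))
            return i;
        mask <<= 1;
    }
    return i;
}

int
main(void)
{
    /* word 0 fully used; word 1 has bits 0..3 used */
    uint32_t freep[2] = {ALL_SET, 0x0000000F};
    uint32_t j, bit;

    /* skip fully-used words, then locate the first clear bit */
    for (j = 0; freep[j] == ALL_SET; j++)
        ;
    bit = j * BITS_PER_MAP + first_free_bit(freep[j]);
    SETBIT(freep, bit);                  /* allocate: mark in use */
    printf("allocated bit %u\n", bit);   /* prints 36 */

    CLRBIT(freep, bit);                  /* free: make reusable again */
    return 0;
}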
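Finally, _hash_squeezebucket is at heart two cursors converging over a doubly-linked chain: the write cursor advances when its page fills, the read cursor frees drained pages and steps backward, and the loop ends when they meet. The sketch below reproduces just that control flow over an in-memory chain; the Page struct and item counts are hypothetical stand-ins, not PostgreSQL's buffer and opaque machinery.

/*
 * In-memory analogue of the squeeze loop's converging cursors.
 */
#include <stdio.h>
#include <stdlib.h>

typedef struct Page {
    struct Page *prev, *next;
    int nitems;              /* tuples currently on the page */
    int capacity;            /* tuples the page can hold */
} Page;

static void
squeeze_chain(Page *head, Page *tail)
{
    Page *w = head;          /* write cursor: front of the chain */
    Page *r = tail;          /* read cursor: back of the chain */

    while (r != w) {
        if (w->nitems == w->capacity) {
            w = w->next;     /* write page full: move forward */
        } else if (r->nitems > 0) {
            r->nitems--;     /* move one item from the read page... */
            w->nitems++;     /* ...onto the write page */
        } else {
            Page *prev = r->prev;   /* read page drained: free it */

            prev->next = NULL;
            free(r);
            r = prev;        /* and step backward */
        }
    }
}

int
main(void)
{
    /* three-page chain holding 1, 2 and 3 items (capacity 4 each) */
    Page head = {NULL, NULL, 1, 4};
    Page *p2 = calloc(1, sizeof(Page));
    Page *p3 = calloc(1, sizeof(Page));

    head.next = p2;
    p2->prev = &head;  p2->next = p3;  p2->nitems = 2;  p2->capacity = 4;
    p3->prev = p2;     p3->nitems = 3; p3->capacity = 4;

    squeeze_chain(&head, p3);    /* all 6 items fit on two pages; p3 freed */
    printf("head: %d items, second page: %d items\n",
           head.nitems, head.next ? head.next->nitems : 0);
    return 0;
}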
