Diffstat (limited to 'src/backend/access/hash/hashpage.c')
| -rw-r--r-- | src/backend/access/hash/hashpage.c | 669 |
1 file changed, 669 insertions, 0 deletions
diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c
new file mode 100644
index 0000000000..2c6ebed835
--- /dev/null
+++ b/src/backend/access/hash/hashpage.c
@@ -0,0 +1,669 @@
+/*-------------------------------------------------------------------------
+ *
+ * hashpage.c--
+ * Hash table page management code for the Postgres hash access method
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/hash/hashpage.c,v 1.1.1.1 1996/07/09 06:21:10 scrappy Exp $
+ *
+ * NOTES
+ * Postgres hash pages look like ordinary relation pages. The opaque
+ * data at high addresses includes information about the page including
+ * whether a page is an overflow page or a true bucket, the block
+ * numbers of the preceding and following pages, and the overflow
+ * address of the page if it is an overflow page.
+ *
+ * The first page in a hash relation, page zero, is special -- it stores
+ * information describing the hash table; it is referred to as the
+ * "meta page." Pages one and higher store the actual data.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
+
+#include "utils/elog.h"
+#include "utils/rel.h"
+#include "utils/excid.h"
+
+#include "access/genam.h"
+#include "access/hash.h"
+
+static void _hash_setpagelock(Relation rel, BlockNumber blkno, int access);
+static void _hash_unsetpagelock(Relation rel, BlockNumber blkno, int access);
+static void _hash_splitpage(Relation rel, Buffer metabuf, Bucket obucket, Bucket nbucket);
+
+/*
+ * We use high-concurrency locking on hash indices. There are two cases in
+ * which we don't do locking. One is when we're building the index.
+ * Since the creating transaction has not committed, no one can see
+ * the index, and there's no reason to share locks. The second case
+ * is when we're just starting up the database system. We use some
+ * special-purpose initialization code in the relation cache manager
+ * (see utils/cache/relcache.c) to allow us to do indexed scans on
+ * the system catalogs before we'd normally be able to. This happens
+ * before the lock table is fully initialized, so we can't use it.
+ * Strictly speaking, this violates 2pl, but we don't do 2pl on the
+ * system catalogs anyway.
+ */
+
+
+#define USELOCKING (!BuildingHash && !IsInitProcessingMode())
+
+
+/*
+ * _hash_metapinit() -- Initialize the metadata page of a hash index,
+ * the two buckets that we begin with and the initial
+ * bitmap page.
+ */
+void
+_hash_metapinit(Relation rel)
+{
+ HashMetaPage metap;
+ HashPageOpaque pageopaque;
+ Buffer metabuf;
+ Buffer buf;
+ Page pg;
+ int nbuckets;
+ uint32 nelem; /* number of elements */
+ uint32 lg2nelem; /* _hash_log2(nelem) */
+ uint32 nblocks;
+ uint16 i;
+
+ /* can't be sharing this with anyone, now... */
+ if (USELOCKING)
+ RelationSetLockForWrite(rel);
+
+ if ((nblocks = RelationGetNumberOfBlocks(rel)) != 0) {
+ elog(WARN, "Cannot initialize non-empty hash table %s",
+ RelationGetRelationName(rel));
+ }
+
+ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE);
+ pg = BufferGetPage(metabuf);
+ metap = (HashMetaPage) pg;
+ _hash_pageinit(pg, BufferGetPageSize(metabuf));
+
+ metap->hashm_magic = HASH_MAGIC;
+ metap->hashm_version = HASH_VERSION;
+ metap->hashm_nkeys = 0;
+ metap->hashm_nmaps = 0;
+ metap->hashm_ffactor = DEFAULT_FFACTOR;
+ metap->hashm_bsize = BufferGetPageSize(metabuf);
+ metap->hashm_bshift = _hash_log2(metap->hashm_bsize);
+ for (i = metap->hashm_bshift; i > 0; --i) {
+ if ((1 << i) < (metap->hashm_bsize -
+ (DOUBLEALIGN(sizeof(PageHeaderData)) +
+ DOUBLEALIGN(sizeof(HashPageOpaqueData))))) {
+ break;
+ }
+ }
+ Assert(i);
+ metap->hashm_bmsize = 1 << i;
+ metap->hashm_procid = index_getprocid(rel, 1, HASHPROC);
+
+ /*
+ * Make nelem = 2 rather than 0 so that we end up allocating space
+ * for the next greater power of two number of buckets.
+ */
+ nelem = 2;
+ lg2nelem = 1; /*_hash_log2(MAX(nelem, 2)) */
+ nbuckets = 2; /*1 << lg2nelem */
+
+ memset((char *) metap->hashm_spares, 0, sizeof(metap->hashm_spares));
+ memset((char *) metap->hashm_mapp, 0, sizeof(metap->hashm_mapp));
+
+ metap->hashm_spares[lg2nelem] = 2; /* lg2nelem + 1 */
+ metap->hashm_spares[lg2nelem + 1] = 2; /* lg2nelem + 1 */
+ metap->hashm_ovflpoint = 1; /* lg2nelem */
+ metap->hashm_lastfreed = 2;
+
+ metap->hashm_maxbucket = metap->hashm_lowmask = 1; /* nbuckets - 1 */
+ metap->hashm_highmask = 3; /* (nbuckets << 1) - 1 */
+
+ pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
+ pageopaque->hasho_oaddr = InvalidOvflAddress;
+ pageopaque->hasho_prevblkno = InvalidBlockNumber;
+ pageopaque->hasho_nextblkno = InvalidBlockNumber;
+ pageopaque->hasho_flag = LH_META_PAGE;
+ pageopaque->hasho_bucket = -1;
+
+ /*
+ * First bitmap page is at: splitpoint lg2nelem page offset 1 which
+ * turns out to be page 3. Couldn't initialize page 3 until we created
+ * the first two buckets above.
+ */
+ if (_hash_initbitmap(rel, metap, OADDR_OF(lg2nelem, 1), lg2nelem + 1, 0))
+ elog(WARN, "Problem with _hash_initbitmap.");
+
+ /* all done */
+ _hash_wrtnorelbuf(rel, metabuf);
+
+ /*
+ * initialize the first two buckets
+ */
+ for (i = 0; i <= 1; i++) {
+ buf = _hash_getbuf(rel, BUCKET_TO_BLKNO(i), HASH_WRITE);
+ pg = BufferGetPage(buf);
+ _hash_pageinit(pg, BufferGetPageSize(buf));
+ pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
+ pageopaque->hasho_oaddr = InvalidOvflAddress;
+ pageopaque->hasho_prevblkno = InvalidBlockNumber;
+ pageopaque->hasho_nextblkno = InvalidBlockNumber;
+ pageopaque->hasho_flag = LH_BUCKET_PAGE;
+ pageopaque->hasho_bucket = i;
+ _hash_wrtbuf(rel, buf);
+ }
+
+ _hash_relbuf(rel, metabuf, HASH_WRITE);
+
+ if (USELOCKING)
+ RelationUnsetLockForWrite(rel);
+}
+
+/*
+ * _hash_getbuf() -- Get a buffer by block number for read or write.
+ *
+ * When this routine returns, the appropriate lock is set on the
+ * requested buffer and its reference count is correct.
+ *
+ * XXX P_NEW is not used because, unlike the tree structures, we
+ * need the bucket blocks to be at certain block numbers. we must
+ * depend on the caller to call _hash_pageinit on the block if it
+ * knows that this is a new block.
+ */ +Buffer +_hash_getbuf(Relation rel, BlockNumber blkno, int access) +{ + Buffer buf; + + if (blkno == P_NEW) { + elog(WARN, "_hash_getbuf: internal error: hash AM does not use P_NEW"); + } + switch (access) { + case HASH_WRITE: + case HASH_READ: + _hash_setpagelock(rel, blkno, access); + break; + default: + elog(WARN, "_hash_getbuf: invalid access (%d) on new blk: %.*s", + access, NAMEDATALEN, RelationGetRelationName(rel)); + break; + } + buf = ReadBuffer(rel, blkno); + + /* ref count and lock type are correct */ + return (buf); +} + +/* + * _hash_relbuf() -- release a locked buffer. + */ +void +_hash_relbuf(Relation rel, Buffer buf, int access) +{ + BlockNumber blkno; + + blkno = BufferGetBlockNumber(buf); + + switch (access) { + case HASH_WRITE: + case HASH_READ: + _hash_unsetpagelock(rel, blkno, access); + break; + default: + elog(WARN, "_hash_relbuf: invalid access (%d) on blk %x: %.*s", + access, blkno, NAMEDATALEN, RelationGetRelationName(rel)); + } + + ReleaseBuffer(buf); +} + +/* + * _hash_wrtbuf() -- write a hash page to disk. + * + * This routine releases the lock held on the buffer and our reference + * to it. It is an error to call _hash_wrtbuf() without a write lock + * or a reference to the buffer. + */ +void +_hash_wrtbuf(Relation rel, Buffer buf) +{ + BlockNumber blkno; + + blkno = BufferGetBlockNumber(buf); + WriteBuffer(buf); + _hash_unsetpagelock(rel, blkno, HASH_WRITE); +} + +/* + * _hash_wrtnorelbuf() -- write a hash page to disk, but do not release + * our reference or lock. + * + * It is an error to call _hash_wrtnorelbuf() without a write lock + * or a reference to the buffer. + */ +void +_hash_wrtnorelbuf(Relation rel, Buffer buf) +{ + BlockNumber blkno; + + blkno = BufferGetBlockNumber(buf); + WriteNoReleaseBuffer(buf); +} + +Page +_hash_chgbufaccess(Relation rel, + Buffer *bufp, + int from_access, + int to_access) +{ + BlockNumber blkno; + + blkno = BufferGetBlockNumber(*bufp); + + switch (from_access) { + case HASH_WRITE: + _hash_wrtbuf(rel, *bufp); + break; + case HASH_READ: + _hash_relbuf(rel, *bufp, from_access); + break; + default: + elog(WARN, "_hash_chgbufaccess: invalid access (%d) on blk %x: %.*s", + from_access, blkno, NAMEDATALEN, RelationGetRelationName(rel)); + break; + } + *bufp = _hash_getbuf(rel, blkno, to_access); + return (BufferGetPage(*bufp)); +} + +/* + * _hash_pageinit() -- Initialize a new page. + */ +void +_hash_pageinit(Page page, Size size) +{ + Assert(((PageHeader) page)->pd_lower == 0); + Assert(((PageHeader) page)->pd_upper == 0); + Assert(((PageHeader) page)->pd_special == 0); + + /* + * Cargo-cult programming -- don't really need this to be zero, but + * creating new pages is an infrequent occurrence and it makes me feel + * good when I know they're empty. 
+ */
+ memset(page, 0, size);
+
+ PageInit(page, size, sizeof(HashPageOpaqueData));
+}
+
+static void
+_hash_setpagelock(Relation rel,
+ BlockNumber blkno,
+ int access)
+{
+ ItemPointerData iptr;
+
+ if (USELOCKING) {
+ ItemPointerSet(&iptr, blkno, 1);
+
+ switch (access) {
+ case HASH_WRITE:
+ RelationSetSingleWLockPage(rel, &iptr);
+ break;
+ case HASH_READ:
+ RelationSetSingleRLockPage(rel, &iptr);
+ break;
+ default:
+ elog(WARN, "_hash_setpagelock: invalid access (%d) on blk %x: %.*s",
+ access, blkno, NAMEDATALEN, RelationGetRelationName(rel));
+ break;
+ }
+ }
+}
+
+static void
+_hash_unsetpagelock(Relation rel,
+ BlockNumber blkno,
+ int access)
+{
+ ItemPointerData iptr;
+
+ if (USELOCKING) {
+ ItemPointerSet(&iptr, blkno, 1);
+
+ switch (access) {
+ case HASH_WRITE:
+ RelationUnsetSingleWLockPage(rel, &iptr);
+ break;
+ case HASH_READ:
+ RelationUnsetSingleRLockPage(rel, &iptr);
+ break;
+ default:
+ elog(WARN, "_hash_unsetpagelock: invalid access (%d) on blk %x: %.*s",
+ access, blkno, NAMEDATALEN, RelationGetRelationName(rel));
+ break;
+ }
+ }
+}
+
+void
+_hash_pagedel(Relation rel, ItemPointer tid)
+{
+ Buffer buf;
+ Buffer metabuf;
+ Page page;
+ BlockNumber blkno;
+ OffsetNumber offno;
+ HashMetaPage metap;
+ HashPageOpaque opaque;
+
+ blkno = ItemPointerGetBlockNumber(tid);
+ offno = ItemPointerGetOffsetNumber(tid);
+
+ buf = _hash_getbuf(rel, blkno, HASH_WRITE);
+ page = BufferGetPage(buf);
+ _hash_checkpage(page, LH_BUCKET_PAGE|LH_OVERFLOW_PAGE);
+ opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+
+ PageIndexTupleDelete(page, offno);
+ _hash_wrtnorelbuf(rel, buf);
+
+ if (PageIsEmpty(page) && (opaque->hasho_flag & LH_OVERFLOW_PAGE)) {
+ buf = _hash_freeovflpage(rel, buf);
+ if (BufferIsValid(buf)) {
+ _hash_relbuf(rel, buf, HASH_WRITE);
+ }
+ } else {
+ _hash_relbuf(rel, buf, HASH_WRITE);
+ }
+
+ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE);
+ metap = (HashMetaPage) BufferGetPage(metabuf);
+ _hash_checkpage((Page) metap, LH_META_PAGE);
+ --metap->hashm_nkeys;
+ _hash_wrtbuf(rel, metabuf);
+}
+
+void
+_hash_expandtable(Relation rel, Buffer metabuf)
+{
+ HashMetaPage metap;
+ Bucket old_bucket;
+ Bucket new_bucket;
+ uint32 spare_ndx;
+
+/* elog(DEBUG, "_hash_expandtable: expanding..."); */
+
+ metap = (HashMetaPage) BufferGetPage(metabuf);
+ _hash_checkpage((Page) metap, LH_META_PAGE);
+
+ metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_READ, HASH_WRITE);
+ new_bucket = ++metap->MAX_BUCKET;
+ metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_WRITE, HASH_READ);
+ old_bucket = (metap->MAX_BUCKET & metap->LOW_MASK);
+
+ /*
+ * If the split point is increasing (MAX_BUCKET's log base 2
+ * increases), we need to copy the current contents of the spare
+ * split bucket to the next bucket.
+ */ + spare_ndx = _hash_log2(metap->MAX_BUCKET + 1); + if (spare_ndx > metap->OVFL_POINT) { + + metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_READ, HASH_WRITE); + metap->SPARES[spare_ndx] = metap->SPARES[metap->OVFL_POINT]; + metap->OVFL_POINT = spare_ndx; + metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_WRITE, HASH_READ); + } + + if (new_bucket > metap->HIGH_MASK) { + + /* Starting a new doubling */ + metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_READ, HASH_WRITE); + metap->LOW_MASK = metap->HIGH_MASK; + metap->HIGH_MASK = new_bucket | metap->LOW_MASK; + metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_WRITE, HASH_READ); + + } + /* Relocate records to the new bucket */ + _hash_splitpage(rel, metabuf, old_bucket, new_bucket); +} + + +/* + * _hash_splitpage -- split 'obucket' into 'obucket' and 'nbucket' + * + * this routine is actually misnamed -- we are splitting a bucket that + * consists of a base bucket page and zero or more overflow (bucket + * chain) pages. + */ +static void +_hash_splitpage(Relation rel, + Buffer metabuf, + Bucket obucket, + Bucket nbucket) +{ + Bucket bucket; + Buffer obuf; + Buffer nbuf; + Buffer ovflbuf; + BlockNumber oblkno; + BlockNumber nblkno; + bool null; + Datum datum; + HashItem hitem; + HashPageOpaque oopaque; + HashPageOpaque nopaque; + HashMetaPage metap; + IndexTuple itup; + int itemsz; + OffsetNumber ooffnum; + OffsetNumber noffnum; + OffsetNumber omaxoffnum; + Page opage; + Page npage; + TupleDesc itupdesc; + +/* elog(DEBUG, "_hash_splitpage: splitting %d into %d,%d", + obucket, obucket, nbucket); +*/ + metap = (HashMetaPage) BufferGetPage(metabuf); + _hash_checkpage((Page) metap, LH_META_PAGE); + + /* get the buffers & pages */ + oblkno = BUCKET_TO_BLKNO(obucket); + nblkno = BUCKET_TO_BLKNO(nbucket); + obuf = _hash_getbuf(rel, oblkno, HASH_WRITE); + nbuf = _hash_getbuf(rel, nblkno, HASH_WRITE); + opage = BufferGetPage(obuf); + npage = BufferGetPage(nbuf); + + /* initialize the new bucket */ + _hash_pageinit(npage, BufferGetPageSize(nbuf)); + nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); + nopaque->hasho_prevblkno = InvalidBlockNumber; + nopaque->hasho_nextblkno = InvalidBlockNumber; + nopaque->hasho_flag = LH_BUCKET_PAGE; + nopaque->hasho_oaddr = InvalidOvflAddress; + nopaque->hasho_bucket = nbucket; + _hash_wrtnorelbuf(rel, nbuf); + + /* + * make sure the old bucket isn't empty. advance 'opage' and + * friends through the overflow bucket chain until we find a + * non-empty page. + * + * XXX we should only need this once, if we are careful to + * preserve the invariant that overflow pages are never empty. + */ + _hash_checkpage(opage, LH_BUCKET_PAGE); + oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); + if (PageIsEmpty(opage)) { + oblkno = oopaque->hasho_nextblkno; + _hash_relbuf(rel, obuf, HASH_WRITE); + if (!BlockNumberIsValid(oblkno)) { + /* + * the old bucket is completely empty; of course, the new + * bucket will be as well, but since it's a base bucket + * page we don't care. + */ + _hash_relbuf(rel, nbuf, HASH_WRITE); + return; + } + obuf = _hash_getbuf(rel, oblkno, HASH_WRITE); + opage = BufferGetPage(obuf); + _hash_checkpage(opage, LH_OVERFLOW_PAGE); + if (PageIsEmpty(opage)) { + elog(WARN, "_hash_splitpage: empty overflow page %d", oblkno); + } + oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); + } + + /* + * we are now guaranteed that 'opage' is not empty. 
partition the + * tuples in the old bucket between the old bucket and the new + * bucket, advancing along their respective overflow bucket chains + * and adding overflow pages as needed. + */ + ooffnum = FirstOffsetNumber; + omaxoffnum = PageGetMaxOffsetNumber(opage); + for (;;) { + /* + * at each iteration through this loop, each of these variables + * should be up-to-date: obuf opage oopaque ooffnum omaxoffnum + */ + + /* check if we're at the end of the page */ + if (ooffnum > omaxoffnum) { + /* at end of page, but check for overflow page */ + oblkno = oopaque->hasho_nextblkno; + if (BlockNumberIsValid(oblkno)) { + /* + * we ran out of tuples on this particular page, but + * we have more overflow pages; re-init values. + */ + _hash_wrtbuf(rel, obuf); + obuf = _hash_getbuf(rel, oblkno, HASH_WRITE); + opage = BufferGetPage(obuf); + _hash_checkpage(opage, LH_OVERFLOW_PAGE); + oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); + + /* we're guaranteed that an ovfl page has at least 1 tuple */ + if (PageIsEmpty(opage)) { + elog(WARN, "_hash_splitpage: empty ovfl page %d!", + oblkno); + } + ooffnum = FirstOffsetNumber; + omaxoffnum = PageGetMaxOffsetNumber(opage); + } else { + /* + * we're at the end of the bucket chain, so now we're + * really done with everything. before quitting, call + * _hash_squeezebucket to ensure the tuples in the + * bucket (including the overflow pages) are packed as + * tightly as possible. + */ + _hash_wrtbuf(rel, obuf); + _hash_wrtbuf(rel, nbuf); + _hash_squeezebucket(rel, metap, obucket); + return; + } + } + + /* hash on the tuple */ + hitem = (HashItem) PageGetItem(opage, PageGetItemId(opage, ooffnum)); + itup = &(hitem->hash_itup); + itupdesc = RelationGetTupleDescriptor(rel); + datum = index_getattr(itup, 1, itupdesc, &null); + bucket = _hash_call(rel, metap, datum); + + if (bucket == nbucket) { + /* + * insert the tuple into the new bucket. if it doesn't + * fit on the current page in the new bucket, we must + * allocate a new overflow page and place the tuple on + * that page instead. + */ + itemsz = IndexTupleDSize(hitem->hash_itup) + + (sizeof(HashItemData) - sizeof(IndexTupleData)); + + itemsz = DOUBLEALIGN(itemsz); + + if (PageGetFreeSpace(npage) < itemsz) { + ovflbuf = _hash_addovflpage(rel, &metabuf, nbuf); + _hash_wrtbuf(rel, nbuf); + nbuf = ovflbuf; + npage = BufferGetPage(nbuf); + _hash_checkpage(npage, LH_BUCKET_PAGE|LH_OVERFLOW_PAGE); + } + + noffnum = OffsetNumberNext(PageGetMaxOffsetNumber(npage)); + (void) PageAddItem(npage, (Item) hitem, itemsz, noffnum, LP_USED); + _hash_wrtnorelbuf(rel, nbuf); + + /* + * now delete the tuple from the old bucket. after this + * section of code, 'ooffnum' will actually point to the + * ItemId to which we would point if we had advanced it + * before the deletion (PageIndexTupleDelete repacks the + * ItemId array). this also means that 'omaxoffnum' is + * exactly one less than it used to be, so we really can + * just decrement it instead of calling + * PageGetMaxOffsetNumber. + */ + PageIndexTupleDelete(opage, ooffnum); + _hash_wrtnorelbuf(rel, obuf); + omaxoffnum = OffsetNumberPrev(omaxoffnum); + + /* + * tidy up. if the old page was an overflow page and it + * is now empty, we must free it (we want to preserve the + * invariant that overflow pages cannot be empty). 
+ */
+ if (PageIsEmpty(opage) &&
+ (oopaque->hasho_flag & LH_OVERFLOW_PAGE)) {
+ obuf = _hash_freeovflpage(rel, obuf);
+
+ /* check that we're not through the bucket chain */
+ if (BufferIsInvalid(obuf)) {
+ _hash_wrtbuf(rel, nbuf);
+ _hash_squeezebucket(rel, metap, obucket);
+ return;
+ }
+
+ /*
+ * re-init. again, we're guaranteed that an ovfl page
+ * has at least one tuple.
+ */
+ opage = BufferGetPage(obuf);
+ _hash_checkpage(opage, LH_OVERFLOW_PAGE);
+ oblkno = BufferGetBlockNumber(obuf);
+ oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
+ if (PageIsEmpty(opage)) {
+ elog(WARN, "_hash_splitpage: empty overflow page %d",
+ oblkno);
+ }
+ ooffnum = FirstOffsetNumber;
+ omaxoffnum = PageGetMaxOffsetNumber(opage);
+ }
+ } else {
+ /*
+ * the tuple stays on this page. we didn't move anything,
+ * so we didn't delete anything and therefore we don't
+ * have to change 'omaxoffnum'.
+ *
+ * XXX any hash value from [0, nbucket-1] will map to this
+ * bucket, which doesn't make sense to me.
+ */
+ ooffnum = OffsetNumberNext(ooffnum);
+ }
+ }
+ /*NOTREACHED*/
+}
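A note on the metapage arithmetic above: the loop in _hash_metapinit that derives hashm_bmsize simply finds the largest power of two that still fits in a page after the page header and the hash opaque data. Below is a minimal standalone sketch of that computation, not part of the patch; the TOY_* sizes and names are illustrative assumptions, where the real code uses BufferGetPageSize(), DOUBLEALIGN(sizeof(PageHeaderData)) and DOUBLEALIGN(sizeof(HashPageOpaqueData)).

#include <stdio.h>
#include <stdint.h>

/* Illustrative sizes only; the real values come from the page headers. */
#define TOY_PAGE_SIZE   8192
#define TOY_PAGE_HEADER 24
#define TOY_OPAQUE      16

int
main(void)
{
	uint32_t bshift = 0;
	uint32_t i;

	/* like _hash_log2(): smallest bshift with (1 << bshift) >= page size */
	while (((uint32_t) 1 << bshift) < TOY_PAGE_SIZE)
		bshift++;

	/* walk down until a power of two fits in the usable part of the page */
	for (i = bshift; i > 0; --i) {
		if ((1u << i) < (uint32_t) (TOY_PAGE_SIZE - (TOY_PAGE_HEADER + TOY_OPAQUE)))
			break;
	}
	printf("bshift = %u, bitmap size = %u bytes\n", bshift, 1u << i);
	return 0;
}

With these assumed numbers the loop stops at i = 12, so the bitmap size works out to 4096 bytes, half the page.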
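Similarly, the mask handling in _hash_expandtable and the bucket choice in _hash_splitpage follow the usual linear-hashing scheme: _hash_metapinit starts hashm_maxbucket, hashm_highmask and hashm_lowmask at 1, 3 and 1; each expansion adds one bucket and repartitions the bucket new_bucket & lowmask; and the masks double whenever new_bucket exceeds highmask. The standalone sketch below illustrates this; toy_meta, toy_call and toy_expand are made-up names, and the key-to-bucket mapping rule is inferred from the masks kept on the meta page rather than taken from code in this patch.

#include <stdio.h>
#include <stdint.h>

/* Toy stand-ins for the fields kept in the hash meta page. */
typedef struct {
	uint32_t maxbucket;	/* highest bucket id currently in use */
	uint32_t highmask;	/* mask for the current table size */
	uint32_t lowmask;	/* mask for half the current table size */
} toy_meta;

/* Map a hash value to an existing bucket using the two masks. */
static uint32_t
toy_call(const toy_meta *m, uint32_t hashval)
{
	uint32_t bucket = hashval & m->highmask;

	if (bucket > m->maxbucket)
		bucket = bucket & m->lowmask;	/* that bucket doesn't exist yet */
	return bucket;
}

/* Add one bucket; return the bucket whose tuples must be repartitioned. */
static uint32_t
toy_expand(toy_meta *m)
{
	uint32_t new_bucket = ++m->maxbucket;
	uint32_t old_bucket = new_bucket & m->lowmask;

	if (new_bucket > m->highmask) {
		/* starting a new doubling, as in _hash_expandtable */
		m->lowmask = m->highmask;
		m->highmask = new_bucket | m->lowmask;
	}
	return old_bucket;
}

int
main(void)
{
	/* initial state set up by _hash_metapinit: two buckets */
	toy_meta m = { 1, 3, 1 };
	uint32_t h;
	int i;

	for (i = 0; i < 4; i++) {
		uint32_t old_bucket = toy_expand(&m);

		printf("added bucket %u, splitting bucket %u (highmask=%u lowmask=%u)\n",
			   m.maxbucket, old_bucket, m.highmask, m.lowmask);
	}
	for (h = 0; h < 8; h++)
		printf("hash %u -> bucket %u\n", h, toy_call(&m, h));
	return 0;
}

Running it shows bucket 2 splitting bucket 0, bucket 3 splitting bucket 1, then bucket 4 splitting bucket 0 again after the masks double to 7 and 3, while hash values that would land in a not-yet-created bucket fall back to hash & lowmask.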
