| author | Tom Lane <tgl@sss.pgh.pa.us> | 2003-09-04 22:06:27 +0000 |
|---|---|---|
| committer | Tom Lane <tgl@sss.pgh.pa.us> | 2003-09-04 22:06:27 +0000 |
| commit | 7a3693716dad968569d3b91ce203841f1293370d (patch) | |
| tree | 60e513010279ef799fb63bab5cdc6c3187b1ae7e /src/backend/access/hash/hashpage.c | |
| parent | ca43f71ca5c24497ad6f2904fd7ac9ce9b2bf75a (diff) | |
Reimplement hash index locking algorithms, per my recent proposal to
pghackers. This fixes the problem recently reported by Markus Kräutner
(hash bucket split corrupts the state of scans being done concurrently),
and I believe it also fixes all the known problems with deadlocks in
hash index operations. Hash indexes are still not really ready for prime
time (since they aren't WAL-logged), but this is a step forward.
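
The heart of the deadlock fix, visible in _hash_expandtable further down the diff, is that a would-be splitter first acquires the split-control lock and then only *tries* to lock the bucket it wants to split, silently abandoning the split if that lock is busy. Below is a minimal standalone sketch of that try-lock pattern; it uses POSIX rwlocks purely as a stand-in for PostgreSQL's lmgr page locks and omits the metapage and new-bucket locking, so the names and structure are illustrative, not the backend's actual API.

```c
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/*
 * Illustrative stand-ins for the hash AM's split-control lock and one
 * per-bucket lock (the real code uses lmgr page locks, not rwlocks).
 */
static pthread_rwlock_t split_control = PTHREAD_RWLOCK_INITIALIZER;
static pthread_rwlock_t bucket_lock   = PTHREAD_RWLOCK_INITIALIZER;

/*
 * Attempt a "split": take the split-control lock, then merely *try* to
 * get exclusive rights to the bucket being split.  If a scan already
 * holds the bucket, give up instead of waiting -- never block on a
 * bucket while holding the control lock, so no deadlock cycle can form.
 */
static bool
try_split(void)
{
    pthread_rwlock_wrlock(&split_control);

    if (pthread_rwlock_trywrlock(&bucket_lock) != 0)
    {
        /* bucket busy: abandon the split, like the diff's "goto fail" path */
        pthread_rwlock_unlock(&split_control);
        return false;
    }

    /* ...update the shared bucket-mapping metadata here... */

    /* other splits may begin while we move tuples */
    pthread_rwlock_unlock(&split_control);

    /* ...relocate tuples while still holding the bucket lock... */

    pthread_rwlock_unlock(&bucket_lock);
    return true;
}

int
main(void)
{
    printf("split %s\n", try_split() ? "performed" : "skipped");
    return 0;
}
```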
Diffstat (limited to 'src/backend/access/hash/hashpage.c')
| -rw-r--r-- | src/backend/access/hash/hashpage.c | 710 |
1 file changed, 377 insertions(+), 333 deletions(-)
diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c index 1c16df33cd..5b9d19acf1 100644 --- a/src/backend/access/hash/hashpage.c +++ b/src/backend/access/hash/hashpage.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/hash/hashpage.c,v 1.41 2003/09/02 18:13:31 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/hash/hashpage.c,v 1.42 2003/09/04 22:06:27 tgl Exp $ * * NOTES * Postgres hash pages look like ordinary relation pages. The opaque @@ -26,54 +26,201 @@ * *------------------------------------------------------------------------- */ - #include "postgres.h" #include "access/genam.h" #include "access/hash.h" -#include "miscadmin.h" #include "storage/lmgr.h" +#include "utils/lsyscache.h" + + +static void _hash_splitbucket(Relation rel, Buffer metabuf, + Bucket obucket, Bucket nbucket, + BlockNumber start_oblkno, + BlockNumber start_nblkno, + uint32 maxbucket, + uint32 highmask, uint32 lowmask); + + +/* + * We use high-concurrency locking on hash indexes (see README for an overview + * of the locking rules). There are two cases in which we don't do locking. + * One is when the index is newly created in the current transaction. Since + * the creating transaction has not committed, no one else can see the index, + * and there's no reason to take locks. The second case is for temp + * relations, which no one else can see either. (We still take buffer-level + * locks, but not lmgr locks.) + */ +#define USELOCKING(rel) (!((rel)->rd_isnew || (rel)->rd_istemp)) /* - * We use high-concurrency locking on hash indices. There are two cases in - * which we don't do locking. One is when we're building the index. - * Since the creating transaction has not committed, no one can see - * the index, and there's no reason to share locks. The second case - * is when we're just starting up the database system. We use some - * special-purpose initialization code in the relation cache manager - * (see utils/cache/relcache.c) to allow us to do indexed scans on - * the system catalogs before we'd normally be able to. This happens - * before the lock table is fully initialized, so we can't use it. - * Strictly speaking, this violates 2pl, but we don't do 2pl on the - * system catalogs anyway. + * _hash_getlock() -- Acquire an lmgr lock. * - * Note that our page locks are actual lockmanager locks, not buffer - * locks (as are used by btree, for example). This is a good idea because - * the algorithms are not deadlock-free, and we'd better be able to detect - * and recover from deadlocks. + * 'whichlock' should be zero to acquire the split-control lock, or the + * block number of a bucket's primary bucket page to acquire the per-bucket + * lock. (See README for details of the use of these locks.) * - * Another important difference from btree is that a hash indexscan - * retains both a lock and a buffer pin on the current index page - * between hashgettuple() calls (btree keeps only a buffer pin). - * Because of this, it's safe to do item deletions with only a regular - * write lock on a hash page --- there cannot be an indexscan stopped on - * the page being deleted, other than an indexscan of our own backend, - * which will be taken care of by _hash_adjscans. + * 'access' must be HASH_SHARE or HASH_EXCLUSIVE. 
*/ -#define USELOCKING (!BuildingHash && !IsInitProcessingMode()) +void +_hash_getlock(Relation rel, BlockNumber whichlock, int access) +{ + if (USELOCKING(rel)) + LockPage(rel, whichlock, access); +} +/* + * _hash_try_getlock() -- Acquire an lmgr lock, but only if it's free. + * + * Same as above except we return FALSE without blocking if lock isn't free. + */ +bool +_hash_try_getlock(Relation rel, BlockNumber whichlock, int access) +{ + if (USELOCKING(rel)) + return ConditionalLockPage(rel, whichlock, access); + else + return true; +} -static void _hash_setpagelock(Relation rel, BlockNumber blkno, int access); -static void _hash_unsetpagelock(Relation rel, BlockNumber blkno, int access); -static void _hash_splitbucket(Relation rel, Buffer metabuf, - Bucket obucket, Bucket nbucket); +/* + * _hash_droplock() -- Release an lmgr lock. + */ +void +_hash_droplock(Relation rel, BlockNumber whichlock, int access) +{ + if (USELOCKING(rel)) + UnlockPage(rel, whichlock, access); +} + +/* + * _hash_getbuf() -- Get a buffer by block number for read or write. + * + * 'access' must be HASH_READ, HASH_WRITE, or HASH_NOLOCK. + * + * When this routine returns, the appropriate lock is set on the + * requested buffer and its reference count has been incremented + * (ie, the buffer is "locked and pinned"). + * + * XXX P_NEW is not used because, unlike the tree structures, we + * need the bucket blocks to be at certain block numbers. we must + * depend on the caller to call _hash_pageinit on the block if it + * knows that this is a new block. + */ +Buffer +_hash_getbuf(Relation rel, BlockNumber blkno, int access) +{ + Buffer buf; + + if (blkno == P_NEW) + elog(ERROR, "hash AM does not use P_NEW"); + + buf = ReadBuffer(rel, blkno); + + if (access != HASH_NOLOCK) + LockBuffer(buf, access); + + /* ref count and lock type are correct */ + return buf; +} + +/* + * _hash_relbuf() -- release a locked buffer. + * + * Lock and pin (refcount) are both dropped. Note that either read or + * write lock can be dropped this way, but if we modified the buffer, + * this is NOT the right way to release a write lock. + */ +void +_hash_relbuf(Relation rel, Buffer buf) +{ + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + ReleaseBuffer(buf); +} + +/* + * _hash_dropbuf() -- release an unlocked buffer. + * + * This is used to unpin a buffer on which we hold no lock. It is assumed + * that the buffer is not dirty. + */ +void +_hash_dropbuf(Relation rel, Buffer buf) +{ + ReleaseBuffer(buf); +} + +/* + * _hash_wrtbuf() -- write a hash page to disk. + * + * This routine releases the lock held on the buffer and our refcount + * for it. It is an error to call _hash_wrtbuf() without a write lock + * and a pin on the buffer. + * + * NOTE: actually, the buffer manager just marks the shared buffer page + * dirty here; the real I/O happens later. This is okay since we are not + * relying on write ordering anyway. The WAL mechanism is responsible for + * guaranteeing correctness after a crash. + */ +void +_hash_wrtbuf(Relation rel, Buffer buf) +{ + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + WriteBuffer(buf); +} + +/* + * _hash_wrtnorelbuf() -- write a hash page to disk, but do not release + * our reference or lock. + * + * It is an error to call _hash_wrtnorelbuf() without a write lock + * and a pin on the buffer. + * + * See above NOTE. + */ +void +_hash_wrtnorelbuf(Relation rel, Buffer buf) +{ + WriteNoReleaseBuffer(buf); +} + +/* + * _hash_chgbufaccess() -- Change the lock type on a buffer, without + * dropping our pin on it. 
+ * + * from_access and to_access may be HASH_READ, HASH_WRITE, or HASH_NOLOCK, + * the last indicating that no buffer-level lock is held or wanted. + * + * When from_access == HASH_WRITE, we assume the buffer is dirty and tell + * bufmgr it must be written out. If the caller wants to release a write + * lock on a page that's not been modified, it's okay to pass from_access + * as HASH_READ (a bit ugly, but handy in some places). + */ +void +_hash_chgbufaccess(Relation rel, + Buffer buf, + int from_access, + int to_access) +{ + if (from_access != HASH_NOLOCK) + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + if (from_access == HASH_WRITE) + WriteNoReleaseBuffer(buf); + + if (to_access != HASH_NOLOCK) + LockBuffer(buf, to_access); +} /* * _hash_metapinit() -- Initialize the metadata page of a hash index, * the two buckets that we begin with and the initial * bitmap page. + * + * We are fairly cavalier about locking here, since we know that no one else + * could be accessing this index. In particular the rule about not holding + * multiple buffer locks is ignored. */ void _hash_metapinit(Relation rel) @@ -83,16 +230,31 @@ _hash_metapinit(Relation rel) Buffer metabuf; Buffer buf; Page pg; + int32 data_width; + int32 item_width; + int32 ffactor; uint16 i; - /* can't be sharing this with anyone, now... */ - if (USELOCKING) - LockRelation(rel, AccessExclusiveLock); - + /* safety check */ if (RelationGetNumberOfBlocks(rel) != 0) elog(ERROR, "cannot initialize non-empty hash index \"%s\"", RelationGetRelationName(rel)); + /* + * Determine the target fill factor (tuples per bucket) for this index. + * The idea is to make the fill factor correspond to pages about 3/4ths + * full. We can compute it exactly if the index datatype is fixed-width, + * but for var-width there's some guessing involved. + */ + data_width = get_typavgwidth(RelationGetDescr(rel)->attrs[0]->atttypid, + RelationGetDescr(rel)->attrs[0]->atttypmod); + item_width = MAXALIGN(sizeof(HashItemData)) + MAXALIGN(data_width) + + sizeof(ItemIdData); /* include the line pointer */ + ffactor = (BLCKSZ * 3 / 4) / item_width; + /* keep to a sane range */ + if (ffactor < 10) + ffactor = 10; + metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE); pg = BufferGetPage(metabuf); _hash_pageinit(pg, BufferGetPageSize(metabuf)); @@ -110,7 +272,7 @@ _hash_metapinit(Relation rel) metap->hashm_version = HASH_VERSION; metap->hashm_ntuples = 0; metap->hashm_nmaps = 0; - metap->hashm_ffactor = DEFAULT_FFACTOR; + metap->hashm_ffactor = ffactor; metap->hashm_bsize = BufferGetPageSize(metabuf); /* find largest bitmap array size that will fit in page size */ for (i = _hash_log2(metap->hashm_bsize); i > 0; --i) @@ -142,7 +304,7 @@ _hash_metapinit(Relation rel) metap->hashm_firstfree = 0; /* - * initialize the first two buckets + * Initialize the first two buckets */ for (i = 0; i <= 1; i++) { @@ -159,135 +321,17 @@ _hash_metapinit(Relation rel) } /* - * Initialize bitmap page. Can't do this until we + * Initialize first bitmap page. Can't do this until we * create the first two buckets, else smgr will complain. */ _hash_initbitmap(rel, metap, 3); /* all done */ _hash_wrtbuf(rel, metabuf); - - if (USELOCKING) - UnlockRelation(rel, AccessExclusiveLock); } /* - * _hash_getbuf() -- Get a buffer by block number for read or write. - * - * When this routine returns, the appropriate lock is set on the - * requested buffer its reference count is correct. 
- * - * XXX P_NEW is not used because, unlike the tree structures, we - * need the bucket blocks to be at certain block numbers. we must - * depend on the caller to call _hash_pageinit on the block if it - * knows that this is a new block. - */ -Buffer -_hash_getbuf(Relation rel, BlockNumber blkno, int access) -{ - Buffer buf; - - if (blkno == P_NEW) - elog(ERROR, "hash AM does not use P_NEW"); - switch (access) - { - case HASH_WRITE: - case HASH_READ: - _hash_setpagelock(rel, blkno, access); - break; - default: - elog(ERROR, "unrecognized hash access code: %d", access); - break; - } - buf = ReadBuffer(rel, blkno); - - /* ref count and lock type are correct */ - return buf; -} - -/* - * _hash_relbuf() -- release a locked buffer. - */ -void -_hash_relbuf(Relation rel, Buffer buf, int access) -{ - BlockNumber blkno; - - blkno = BufferGetBlockNumber(buf); - - switch (access) - { - case HASH_WRITE: - case HASH_READ: - _hash_unsetpagelock(rel, blkno, access); - break; - default: - elog(ERROR, "unrecognized hash access code: %d", access); - break; - } - - ReleaseBuffer(buf); -} - -/* - * _hash_wrtbuf() -- write a hash page to disk. - * - * This routine releases the lock held on the buffer and our reference - * to it. It is an error to call _hash_wrtbuf() without a write lock - * or a reference to the buffer. - */ -void -_hash_wrtbuf(Relation rel, Buffer buf) -{ - BlockNumber blkno; - - blkno = BufferGetBlockNumber(buf); - WriteBuffer(buf); - _hash_unsetpagelock(rel, blkno, HASH_WRITE); -} - -/* - * _hash_wrtnorelbuf() -- write a hash page to disk, but do not release - * our reference or lock. - * - * It is an error to call _hash_wrtnorelbuf() without a write lock - * or a reference to the buffer. - */ -void -_hash_wrtnorelbuf(Buffer buf) -{ - BlockNumber blkno; - - blkno = BufferGetBlockNumber(buf); - WriteNoReleaseBuffer(buf); -} - -/* - * _hash_chgbufaccess() -- Change from read to write access or vice versa. - * - * When changing from write to read, we assume the buffer is dirty and tell - * bufmgr it must be written out. - */ -void -_hash_chgbufaccess(Relation rel, - Buffer buf, - int from_access, - int to_access) -{ - BlockNumber blkno; - - blkno = BufferGetBlockNumber(buf); - - if (from_access == HASH_WRITE) - _hash_wrtnorelbuf(buf); - - _hash_unsetpagelock(rel, blkno, from_access); - - _hash_setpagelock(rel, blkno, to_access); -} - -/* - * _hash_pageinit() -- Initialize a new page. + * _hash_pageinit() -- Initialize a new hash index page. */ void _hash_pageinit(Page page, Size size) @@ -297,57 +341,14 @@ _hash_pageinit(Page page, Size size) } /* - * _hash_setpagelock() -- Acquire the requested type of lock on a page. - */ -static void -_hash_setpagelock(Relation rel, - BlockNumber blkno, - int access) -{ - if (USELOCKING) - { - switch (access) - { - case HASH_WRITE: - LockPage(rel, blkno, ExclusiveLock); - break; - case HASH_READ: - LockPage(rel, blkno, ShareLock); - break; - default: - elog(ERROR, "unrecognized hash access code: %d", access); - break; - } - } -} - -/* - * _hash_unsetpagelock() -- Release the specified type of lock on a page. - */ -static void -_hash_unsetpagelock(Relation rel, - BlockNumber blkno, - int access) -{ - if (USELOCKING) - { - switch (access) - { - case HASH_WRITE: - UnlockPage(rel, blkno, ExclusiveLock); - break; - case HASH_READ: - UnlockPage(rel, blkno, ShareLock); - break; - default: - elog(ERROR, "unrecognized hash access code: %d", access); - break; - } - } -} - -/* - * Expand the hash table by creating one new bucket. 
+ * Attempt to expand the hash table by creating one new bucket. + * + * This will silently do nothing if it cannot get the needed locks. + * + * The caller should hold no locks on the hash index. + * + * The caller must hold a pin, but no lock, on the metapage buffer. + * The buffer is returned in the same state. */ void _hash_expandtable(Relation rel, Buffer metabuf) @@ -356,15 +357,72 @@ _hash_expandtable(Relation rel, Buffer metabuf) Bucket old_bucket; Bucket new_bucket; uint32 spare_ndx; + BlockNumber start_oblkno; + BlockNumber start_nblkno; + uint32 maxbucket; + uint32 highmask; + uint32 lowmask; + + /* + * Obtain the page-zero lock to assert the right to begin a split + * (see README). + * + * Note: deadlock should be impossible here. Our own backend could only + * be holding bucket sharelocks due to stopped indexscans; those will not + * block other holders of the page-zero lock, who are only interested in + * acquiring bucket sharelocks themselves. Exclusive bucket locks are + * only taken here and in hashbulkdelete, and neither of these operations + * needs any additional locks to complete. (If, due to some flaw in this + * reasoning, we manage to deadlock anyway, it's okay to error out; the + * index will be left in a consistent state.) + */ + _hash_getlock(rel, 0, HASH_EXCLUSIVE); + + /* Write-lock the meta page */ + _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE); metap = (HashMetaPage) BufferGetPage(metabuf); _hash_checkpage(rel, (Page) metap, LH_META_PAGE); - _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_WRITE); + /* + * Check to see if split is still needed; someone else might have already + * done one while we waited for the lock. + * + * Make sure this stays in sync with_hash_doinsert() + */ + if (metap->hashm_ntuples <= + (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1)) + goto fail; - new_bucket = ++metap->hashm_maxbucket; + /* + * Determine which bucket is to be split, and attempt to lock the old + * bucket. If we can't get the lock, give up. + * + * The lock protects us against other backends, but not against our own + * backend. Must check for active scans separately. + * + * Ideally we would lock the new bucket too before proceeding, but if + * we are about to cross a splitpoint then the BUCKET_TO_BLKNO mapping + * isn't correct yet. For simplicity we update the metapage first and + * then lock. This should be okay because no one else should be trying + * to lock the new bucket yet... + */ + new_bucket = metap->hashm_maxbucket + 1; old_bucket = (new_bucket & metap->hashm_lowmask); + start_oblkno = BUCKET_TO_BLKNO(metap, old_bucket); + + if (_hash_has_active_scan(rel, old_bucket)) + goto fail; + + if (!_hash_try_getlock(rel, start_oblkno, HASH_EXCLUSIVE)) + goto fail; + + /* + * Okay to proceed with split. Update the metapage bucket mapping info. + */ + metap->hashm_maxbucket = new_bucket; + if (new_bucket > metap->hashm_highmask) { /* Starting a new doubling */ @@ -379,7 +437,7 @@ _hash_expandtable(Relation rel, Buffer metabuf) * this new batch of bucket pages. * * XXX should initialize new bucket pages to prevent out-of-order - * page creation. + * page creation? Don't wanna do it right here though. 
*/ spare_ndx = _hash_log2(metap->hashm_maxbucket + 1); if (spare_ndx > metap->hashm_ovflpoint) @@ -389,10 +447,50 @@ _hash_expandtable(Relation rel, Buffer metabuf) metap->hashm_ovflpoint = spare_ndx; } - _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_READ); + /* now we can compute the new bucket's primary block number */ + start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket); + + Assert(!_hash_has_active_scan(rel, new_bucket)); + + if (!_hash_try_getlock(rel, start_nblkno, HASH_EXCLUSIVE)) + elog(PANIC, "could not get lock on supposedly new bucket"); + + /* + * Copy bucket mapping info now; this saves re-accessing the meta page + * inside _hash_splitbucket's inner loop. Note that once we drop the + * split lock, other splits could begin, so these values might be out of + * date before _hash_splitbucket finishes. That's okay, since all it + * needs is to tell which of these two buckets to map hashkeys into. + */ + maxbucket = metap->hashm_maxbucket; + highmask = metap->hashm_highmask; + lowmask = metap->hashm_lowmask; + + /* Write out the metapage and drop lock, but keep pin */ + _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK); + + /* Release split lock; okay for other splits to occur now */ + _hash_droplock(rel, 0, HASH_EXCLUSIVE); /* Relocate records to the new bucket */ - _hash_splitbucket(rel, metabuf, old_bucket, new_bucket); + _hash_splitbucket(rel, metabuf, old_bucket, new_bucket, + start_oblkno, start_nblkno, + maxbucket, highmask, lowmask); + + /* Release bucket locks, allowing others to access them */ + _hash_droplock(rel, start_oblkno, HASH_EXCLUSIVE); + _hash_droplock(rel, start_nblkno, HASH_EXCLUSIVE); + + return; + + /* Here if decide not to split or fail to acquire old bucket lock */ +fail: + + /* We didn't write the metapage, so just drop lock */ + _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK); + + /* Release split lock */ + _hash_droplock(rel, 0, HASH_EXCLUSIVE); } @@ -403,27 +501,35 @@ _hash_expandtable(Relation rel, Buffer metabuf) * or more overflow (bucket chain) pages. We must relocate tuples that * belong in the new bucket, and compress out any free space in the old * bucket. + * + * The caller must hold exclusive locks on both buckets to ensure that + * no one else is trying to access them (see README). + * + * The caller must hold a pin, but no lock, on the metapage buffer. + * The buffer is returned in the same state. (The metapage is only + * touched if it becomes necessary to add or remove overflow pages.) 
*/ static void _hash_splitbucket(Relation rel, Buffer metabuf, Bucket obucket, - Bucket nbucket) + Bucket nbucket, + BlockNumber start_oblkno, + BlockNumber start_nblkno, + uint32 maxbucket, + uint32 highmask, + uint32 lowmask) { Bucket bucket; Buffer obuf; Buffer nbuf; - Buffer ovflbuf; BlockNumber oblkno; BlockNumber nblkno; - BlockNumber start_oblkno; - BlockNumber start_nblkno; bool null; Datum datum; HashItem hitem; HashPageOpaque oopaque; HashPageOpaque nopaque; - HashMetaPage metap; IndexTuple itup; Size itemsz; OffsetNumber ooffnum; @@ -433,12 +539,11 @@ _hash_splitbucket(Relation rel, Page npage; TupleDesc itupdesc = RelationGetDescr(rel); - metap = (HashMetaPage) BufferGetPage(metabuf); - _hash_checkpage(rel, (Page) metap, LH_META_PAGE); - - /* get the buffers & pages */ - start_oblkno = BUCKET_TO_BLKNO(metap, obucket); - start_nblkno = BUCKET_TO_BLKNO(metap, nbucket); + /* + * It should be okay to simultaneously write-lock pages from each + * bucket, since no one else can be trying to acquire buffer lock + * on pages of either bucket. + */ oblkno = start_oblkno; nblkno = start_nblkno; obuf = _hash_getbuf(rel, oblkno, HASH_WRITE); @@ -446,7 +551,10 @@ _hash_splitbucket(Relation rel, opage = BufferGetPage(obuf); npage = BufferGetPage(nbuf); - /* initialize the new bucket page */ + _hash_checkpage(rel, opage, LH_BUCKET_PAGE); + oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); + + /* initialize the new bucket's primary page */ _hash_pageinit(npage, BufferGetPageSize(nbuf)); nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); nopaque->hasho_prevblkno = InvalidBlockNumber; @@ -454,44 +562,11 @@ _hash_splitbucket(Relation rel, nopaque->hasho_bucket = nbucket; nopaque->hasho_flag = LH_BUCKET_PAGE; nopaque->hasho_filler = HASHO_FILL; - _hash_wrtnorelbuf(nbuf); - - /* - * make sure the old bucket isn't empty. advance 'opage' and friends - * through the overflow bucket chain until we find a non-empty page. - * - * XXX we should only need this once, if we are careful to preserve the - * invariant that overflow pages are never empty. - */ - _hash_checkpage(rel, opage, LH_BUCKET_PAGE); - oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); - if (PageIsEmpty(opage)) - { - oblkno = oopaque->hasho_nextblkno; - _hash_relbuf(rel, obuf, HASH_WRITE); - if (!BlockNumberIsValid(oblkno)) - { - /* - * the old bucket is completely empty; of course, the new - * bucket will be as well, but since it's a base bucket page - * we don't care. - */ - _hash_relbuf(rel, nbuf, HASH_WRITE); - return; - } - obuf = _hash_getbuf(rel, oblkno, HASH_WRITE); - opage = BufferGetPage(obuf); - _hash_checkpage(rel, opage, LH_OVERFLOW_PAGE); - if (PageIsEmpty(opage)) - elog(ERROR, "empty hash overflow page %u", oblkno); - oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); - } /* - * we are now guaranteed that 'opage' is not empty. partition the - * tuples in the old bucket between the old bucket and the new bucket, - * advancing along their respective overflow bucket chains and adding - * overflow pages as needed. + * Partition the tuples in the old bucket between the old bucket and the + * new bucket, advancing along the old bucket's overflow bucket chain + * and adding overflow pages to the new bucket as needed. 
*/ ooffnum = FirstOffsetNumber; omaxoffnum = PageGetMaxOffsetNumber(opage); @@ -505,48 +580,39 @@ _hash_splitbucket(Relation rel, /* check if we're at the end of the page */ if (ooffnum > omaxoffnum) { - /* at end of page, but check for overflow page */ + /* at end of page, but check for an(other) overflow page */ oblkno = oopaque->hasho_nextblkno; - if (BlockNumberIsValid(oblkno)) - { - /* - * we ran out of tuples on this particular page, but we - * have more overflow pages; re-init values. - */ - _hash_wrtbuf(rel, obuf); - obuf = _hash_getbuf(rel, oblkno, HASH_WRITE); - opage = BufferGetPage(obuf); - _hash_checkpage(rel, opage, LH_OVERFLOW_PAGE); - oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); - /* we're guaranteed that an ovfl page has at least 1 tuple */ - if (PageIsEmpty(opage)) - elog(ERROR, "empty hash overflow page %u", oblkno); - ooffnum = FirstOffsetNumber; - omaxoffnum = PageGetMaxOffsetNumber(opage); - } - else - { - /* - * We're at the end of the bucket chain, so now we're - * really done with everything. Before quitting, call - * _hash_squeezebucket to ensure the tuples remaining in the - * old bucket (including the overflow pages) are packed as - * tightly as possible. The new bucket is already tight. - */ - _hash_wrtbuf(rel, obuf); - _hash_wrtbuf(rel, nbuf); - _hash_squeezebucket(rel, obucket, start_oblkno); - return; - } + if (!BlockNumberIsValid(oblkno)) + break; + /* + * we ran out of tuples on this particular page, but we + * have more overflow pages; advance to next page. + */ + _hash_wrtbuf(rel, obuf); + + obuf = _hash_getbuf(rel, oblkno, HASH_WRITE); + opage = BufferGetPage(obuf); + _hash_checkpage(rel, opage, LH_OVERFLOW_PAGE); + oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); + ooffnum = FirstOffsetNumber; + omaxoffnum = PageGetMaxOffsetNumber(opage); + continue; } - /* hash on the tuple */ + /* + * Re-hash the tuple to determine which bucket it now belongs in. + * + * It is annoying to call the hash function while holding locks, + * but releasing and relocking the page for each tuple is unappealing + * too. + */ hitem = (HashItem) PageGetItem(opage, PageGetItemId(opage, ooffnum)); itup = &(hitem->hash_itup); datum = index_getattr(itup, 1, itupdesc, &null); Assert(!null); - bucket = _hash_call(rel, metap, datum); + bucket = _hash_hashkey2bucket(_hash_datum2hashkey(rel, datum), + maxbucket, highmask, lowmask); if (bucket == nbucket) { @@ -562,11 +628,13 @@ _hash_splitbucket(Relation rel, if (PageGetFreeSpace(npage) < itemsz) { - ovflbuf = _hash_addovflpage(rel, metabuf, nbuf); - _hash_wrtbuf(rel, nbuf); - nbuf = ovflbuf; + /* write out nbuf and drop lock, but keep pin */ + _hash_chgbufaccess(rel, nbuf, HASH_WRITE, HASH_NOLOCK); + /* chain to a new overflow page */ + nbuf = _hash_addovflpage(rel, metabuf, nbuf); npage = BufferGetPage(nbuf); - _hash_checkpage(rel, npage, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); + _hash_checkpage(rel, npage, LH_OVERFLOW_PAGE); + /* we don't need nopaque within the loop */ } noffnum = OffsetNumberNext(PageGetMaxOffsetNumber(npage)); @@ -574,7 +642,6 @@ _hash_splitbucket(Relation rel, == InvalidOffsetNumber) elog(ERROR, "failed to add index item to \"%s\"", RelationGetRelationName(rel)); - _hash_wrtnorelbuf(nbuf); /* * now delete the tuple from the old bucket. after this @@ -586,40 +653,7 @@ _hash_splitbucket(Relation rel, * instead of calling PageGetMaxOffsetNumber. */ PageIndexTupleDelete(opage, ooffnum); - _hash_wrtnorelbuf(obuf); omaxoffnum = OffsetNumberPrev(omaxoffnum); - - /* - * tidy up. 
if the old page was an overflow page and it is - * now empty, we must free it (we want to preserve the - * invariant that overflow pages cannot be empty). - */ - if (PageIsEmpty(opage) && - (oopaque->hasho_flag & LH_OVERFLOW_PAGE)) - { - oblkno = _hash_freeovflpage(rel, obuf); - - /* check that we're not through the bucket chain */ - if (!BlockNumberIsValid(oblkno)) - { - _hash_wrtbuf(rel, nbuf); - _hash_squeezebucket(rel, obucket, start_oblkno); - return; - } - - /* - * re-init. again, we're guaranteed that an ovfl page has - * at least one tuple. - */ - obuf = _hash_getbuf(rel, oblkno, HASH_WRITE); - opage = BufferGetPage(obuf); - _hash_checkpage(rel, opage, LH_OVERFLOW_PAGE); - oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); - if (PageIsEmpty(opage)) - elog(ERROR, "empty hash overflow page %u", oblkno); - ooffnum = FirstOffsetNumber; - omaxoffnum = PageGetMaxOffsetNumber(opage); - } } else { @@ -632,5 +666,15 @@ _hash_splitbucket(Relation rel, ooffnum = OffsetNumberNext(ooffnum); } } - /* NOTREACHED */ + + /* + * We're at the end of the old bucket chain, so we're done partitioning + * the tuples. Before quitting, call _hash_squeezebucket to ensure the + * tuples remaining in the old bucket (including the overflow pages) are + * packed as tightly as possible. The new bucket is already tight. + */ + _hash_wrtbuf(rel, obuf); + _hash_wrtbuf(rel, nbuf); + + _hash_squeezebucket(rel, obucket, start_oblkno); } |
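
The split path above deliberately copies hashm_maxbucket, hashm_highmask, and hashm_lowmask out of the metapage and hands them to _hash_splitbucket, which re-hashes each tuple with _hash_hashkey2bucket (defined elsewhere in the hash AM). The following standalone sketch of that masking scheme is consistent with how the diff maintains the masks and with its choice of old bucket (new_bucket & lowmask); treat the function as an illustration, not a copy of the backend source.

```c
#include <stdint.h>
#include <stdio.h>

/*
 * Linear-hashing bucket choice: mask the hash key with highmask (which
 * covers the current doubling); if that names a bucket that does not
 * exist yet, fall back to the previous doubling via lowmask.
 */
static uint32_t
hashkey2bucket(uint32_t hashkey, uint32_t maxbucket,
               uint32_t highmask, uint32_t lowmask)
{
    uint32_t bucket = hashkey & highmask;

    if (bucket > maxbucket)
        bucket &= lowmask;
    return bucket;
}

int
main(void)
{
    /* Example state: buckets 0..5 exist, so highmask = 7 and lowmask = 3. */
    uint32_t key = 0xABCDEF6;   /* low three bits are 110, i.e. 6 */

    /* Bucket 6 does not exist yet, so the key falls back to 6 & 3 = 2. */
    printf("before split: bucket %u\n", hashkey2bucket(key, 5, 7, 3));

    /*
     * After a split creates bucket 6 -- whose tuples are pulled from old
     * bucket 6 & lowmask = 2, exactly as in _hash_expandtable -- the same
     * key maps straight to the new bucket.
     */
    printf("after split:  bucket %u\n", hashkey2bucket(key, 6, 7, 3));
    return 0;
}
```

The worked example shows why only one old bucket needs to be locked and scanned per split: every key that will map to the new bucket currently lives in exactly the bucket obtained by masking the new bucket number with lowmask.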

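Another behavioural change buried in the diff: _hash_metapinit no longer uses the fixed DEFAULT_FFACTOR but derives the fill factor from the key type's average width, sizing one entry (hash item header plus key, each MAXALIGNed, plus a line pointer) and targeting pages about three-quarters full; _hash_expandtable then splits once hashm_ntuples exceeds ffactor times the number of buckets. Here is a back-of-the-envelope version of that arithmetic, with made-up stand-ins for BLCKSZ, MAXALIGN, and the struct sizes (the real values come from the backend's headers).

```c
#include <stdint.h>
#include <stdio.h>

/* Made-up stand-ins; the backend takes these from its own headers. */
#define BLCKSZ           8192   /* assumed page size */
#define MAXIMUM_ALIGNOF  8
#define MAXALIGN(LEN)    (((LEN) + (MAXIMUM_ALIGNOF - 1)) & ~(MAXIMUM_ALIGNOF - 1))
#define HASHITEM_HDRSZ   12     /* assumed sizeof(HashItemData) */
#define ITEMID_SZ        4      /* assumed sizeof(ItemIdData), the line pointer */

int
main(void)
{
    int32_t data_width = 4;     /* e.g. a fixed-width four-byte key */
    int32_t item_width = MAXALIGN(HASHITEM_HDRSZ) + MAXALIGN(data_width)
                         + ITEMID_SZ;
    int32_t ffactor = (BLCKSZ * 3 / 4) / item_width;    /* ~3/4-full pages */

    if (ffactor < 10)           /* clamp to a sane minimum, as the diff does */
        ffactor = 10;

    printf("item_width = %d bytes, fill factor = %d tuples per bucket\n",
           item_width, ffactor);

    /* A split is triggered once ntuples exceeds ffactor * (maxbucket + 1). */
    return 0;
}
```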