Diffstat (limited to 'src/backend/access/hash/hashpage.c')
-rw-r--r--   src/backend/access/hash/hashpage.c | 710
1 file changed, 377 insertions(+), 333 deletions(-)
diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c index 1c16df33cd..5b9d19acf1 100644 --- a/src/backend/access/hash/hashpage.c +++ b/src/backend/access/hash/hashpage.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/hash/hashpage.c,v 1.41 2003/09/02 18:13:31 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/hash/hashpage.c,v 1.42 2003/09/04 22:06:27 tgl Exp $ * * NOTES * Postgres hash pages look like ordinary relation pages. The opaque @@ -26,54 +26,201 @@ * *------------------------------------------------------------------------- */ - #include "postgres.h" #include "access/genam.h" #include "access/hash.h" -#include "miscadmin.h" #include "storage/lmgr.h" +#include "utils/lsyscache.h" + + +static void _hash_splitbucket(Relation rel, Buffer metabuf, + Bucket obucket, Bucket nbucket, + BlockNumber start_oblkno, + BlockNumber start_nblkno, + uint32 maxbucket, + uint32 highmask, uint32 lowmask); + + +/* + * We use high-concurrency locking on hash indexes (see README for an overview + * of the locking rules). There are two cases in which we don't do locking. + * One is when the index is newly created in the current transaction. Since + * the creating transaction has not committed, no one else can see the index, + * and there's no reason to take locks. The second case is for temp + * relations, which no one else can see either. (We still take buffer-level + * locks, but not lmgr locks.) + */ +#define USELOCKING(rel) (!((rel)->rd_isnew || (rel)->rd_istemp)) /* - * We use high-concurrency locking on hash indices. There are two cases in - * which we don't do locking. One is when we're building the index. - * Since the creating transaction has not committed, no one can see - * the index, and there's no reason to share locks. The second case - * is when we're just starting up the database system. We use some - * special-purpose initialization code in the relation cache manager - * (see utils/cache/relcache.c) to allow us to do indexed scans on - * the system catalogs before we'd normally be able to. This happens - * before the lock table is fully initialized, so we can't use it. - * Strictly speaking, this violates 2pl, but we don't do 2pl on the - * system catalogs anyway. + * _hash_getlock() -- Acquire an lmgr lock. * - * Note that our page locks are actual lockmanager locks, not buffer - * locks (as are used by btree, for example). This is a good idea because - * the algorithms are not deadlock-free, and we'd better be able to detect - * and recover from deadlocks. + * 'whichlock' should be zero to acquire the split-control lock, or the + * block number of a bucket's primary bucket page to acquire the per-bucket + * lock. (See README for details of the use of these locks.) * - * Another important difference from btree is that a hash indexscan - * retains both a lock and a buffer pin on the current index page - * between hashgettuple() calls (btree keeps only a buffer pin). - * Because of this, it's safe to do item deletions with only a regular - * write lock on a hash page --- there cannot be an indexscan stopped on - * the page being deleted, other than an indexscan of our own backend, - * which will be taken care of by _hash_adjscans. + * 'access' must be HASH_SHARE or HASH_EXCLUSIVE. 
*/ -#define USELOCKING (!BuildingHash && !IsInitProcessingMode()) +void +_hash_getlock(Relation rel, BlockNumber whichlock, int access) +{ + if (USELOCKING(rel)) + LockPage(rel, whichlock, access); +} +/* + * _hash_try_getlock() -- Acquire an lmgr lock, but only if it's free. + * + * Same as above except we return FALSE without blocking if lock isn't free. + */ +bool +_hash_try_getlock(Relation rel, BlockNumber whichlock, int access) +{ + if (USELOCKING(rel)) + return ConditionalLockPage(rel, whichlock, access); + else + return true; +} -static void _hash_setpagelock(Relation rel, BlockNumber blkno, int access); -static void _hash_unsetpagelock(Relation rel, BlockNumber blkno, int access); -static void _hash_splitbucket(Relation rel, Buffer metabuf, - Bucket obucket, Bucket nbucket); +/* + * _hash_droplock() -- Release an lmgr lock. + */ +void +_hash_droplock(Relation rel, BlockNumber whichlock, int access) +{ + if (USELOCKING(rel)) + UnlockPage(rel, whichlock, access); +} + +/* + * _hash_getbuf() -- Get a buffer by block number for read or write. + * + * 'access' must be HASH_READ, HASH_WRITE, or HASH_NOLOCK. + * + * When this routine returns, the appropriate lock is set on the + * requested buffer and its reference count has been incremented + * (ie, the buffer is "locked and pinned"). + * + * XXX P_NEW is not used because, unlike the tree structures, we + * need the bucket blocks to be at certain block numbers. we must + * depend on the caller to call _hash_pageinit on the block if it + * knows that this is a new block. + */ +Buffer +_hash_getbuf(Relation rel, BlockNumber blkno, int access) +{ + Buffer buf; + + if (blkno == P_NEW) + elog(ERROR, "hash AM does not use P_NEW"); + + buf = ReadBuffer(rel, blkno); + + if (access != HASH_NOLOCK) + LockBuffer(buf, access); + + /* ref count and lock type are correct */ + return buf; +} + +/* + * _hash_relbuf() -- release a locked buffer. + * + * Lock and pin (refcount) are both dropped. Note that either read or + * write lock can be dropped this way, but if we modified the buffer, + * this is NOT the right way to release a write lock. + */ +void +_hash_relbuf(Relation rel, Buffer buf) +{ + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + ReleaseBuffer(buf); +} + +/* + * _hash_dropbuf() -- release an unlocked buffer. + * + * This is used to unpin a buffer on which we hold no lock. It is assumed + * that the buffer is not dirty. + */ +void +_hash_dropbuf(Relation rel, Buffer buf) +{ + ReleaseBuffer(buf); +} + +/* + * _hash_wrtbuf() -- write a hash page to disk. + * + * This routine releases the lock held on the buffer and our refcount + * for it. It is an error to call _hash_wrtbuf() without a write lock + * and a pin on the buffer. + * + * NOTE: actually, the buffer manager just marks the shared buffer page + * dirty here; the real I/O happens later. This is okay since we are not + * relying on write ordering anyway. The WAL mechanism is responsible for + * guaranteeing correctness after a crash. + */ +void +_hash_wrtbuf(Relation rel, Buffer buf) +{ + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + WriteBuffer(buf); +} + +/* + * _hash_wrtnorelbuf() -- write a hash page to disk, but do not release + * our reference or lock. + * + * It is an error to call _hash_wrtnorelbuf() without a write lock + * and a pin on the buffer. + * + * See above NOTE. + */ +void +_hash_wrtnorelbuf(Relation rel, Buffer buf) +{ + WriteNoReleaseBuffer(buf); +} + +/* + * _hash_chgbufaccess() -- Change the lock type on a buffer, without + * dropping our pin on it. 
+ * + * from_access and to_access may be HASH_READ, HASH_WRITE, or HASH_NOLOCK, + * the last indicating that no buffer-level lock is held or wanted. + * + * When from_access == HASH_WRITE, we assume the buffer is dirty and tell + * bufmgr it must be written out. If the caller wants to release a write + * lock on a page that's not been modified, it's okay to pass from_access + * as HASH_READ (a bit ugly, but handy in some places). + */ +void +_hash_chgbufaccess(Relation rel, + Buffer buf, + int from_access, + int to_access) +{ + if (from_access != HASH_NOLOCK) + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + if (from_access == HASH_WRITE) + WriteNoReleaseBuffer(buf); + + if (to_access != HASH_NOLOCK) + LockBuffer(buf, to_access); +} /* * _hash_metapinit() -- Initialize the metadata page of a hash index, * the two buckets that we begin with and the initial * bitmap page. + * + * We are fairly cavalier about locking here, since we know that no one else + * could be accessing this index. In particular the rule about not holding + * multiple buffer locks is ignored. */ void _hash_metapinit(Relation rel) @@ -83,16 +230,31 @@ _hash_metapinit(Relation rel) Buffer metabuf; Buffer buf; Page pg; + int32 data_width; + int32 item_width; + int32 ffactor; uint16 i; - /* can't be sharing this with anyone, now... */ - if (USELOCKING) - LockRelation(rel, AccessExclusiveLock); - + /* safety check */ if (RelationGetNumberOfBlocks(rel) != 0) elog(ERROR, "cannot initialize non-empty hash index \"%s\"", RelationGetRelationName(rel)); + /* + * Determine the target fill factor (tuples per bucket) for this index. + * The idea is to make the fill factor correspond to pages about 3/4ths + * full. We can compute it exactly if the index datatype is fixed-width, + * but for var-width there's some guessing involved. + */ + data_width = get_typavgwidth(RelationGetDescr(rel)->attrs[0]->atttypid, + RelationGetDescr(rel)->attrs[0]->atttypmod); + item_width = MAXALIGN(sizeof(HashItemData)) + MAXALIGN(data_width) + + sizeof(ItemIdData); /* include the line pointer */ + ffactor = (BLCKSZ * 3 / 4) / item_width; + /* keep to a sane range */ + if (ffactor < 10) + ffactor = 10; + metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE); pg = BufferGetPage(metabuf); _hash_pageinit(pg, BufferGetPageSize(metabuf)); @@ -110,7 +272,7 @@ _hash_metapinit(Relation rel) metap->hashm_version = HASH_VERSION; metap->hashm_ntuples = 0; metap->hashm_nmaps = 0; - metap->hashm_ffactor = DEFAULT_FFACTOR; + metap->hashm_ffactor = ffactor; metap->hashm_bsize = BufferGetPageSize(metabuf); /* find largest bitmap array size that will fit in page size */ for (i = _hash_log2(metap->hashm_bsize); i > 0; --i) @@ -142,7 +304,7 @@ _hash_metapinit(Relation rel) metap->hashm_firstfree = 0; /* - * initialize the first two buckets + * Initialize the first two buckets */ for (i = 0; i <= 1; i++) { @@ -159,135 +321,17 @@ _hash_metapinit(Relation rel) } /* - * Initialize bitmap page. Can't do this until we + * Initialize first bitmap page. Can't do this until we * create the first two buckets, else smgr will complain. */ _hash_initbitmap(rel, metap, 3); /* all done */ _hash_wrtbuf(rel, metabuf); - - if (USELOCKING) - UnlockRelation(rel, AccessExclusiveLock); } /* - * _hash_getbuf() -- Get a buffer by block number for read or write. - * - * When this routine returns, the appropriate lock is set on the - * requested buffer its reference count is correct. 
- * - * XXX P_NEW is not used because, unlike the tree structures, we - * need the bucket blocks to be at certain block numbers. we must - * depend on the caller to call _hash_pageinit on the block if it - * knows that this is a new block. - */ -Buffer -_hash_getbuf(Relation rel, BlockNumber blkno, int access) -{ - Buffer buf; - - if (blkno == P_NEW) - elog(ERROR, "hash AM does not use P_NEW"); - switch (access) - { - case HASH_WRITE: - case HASH_READ: - _hash_setpagelock(rel, blkno, access); - break; - default: - elog(ERROR, "unrecognized hash access code: %d", access); - break; - } - buf = ReadBuffer(rel, blkno); - - /* ref count and lock type are correct */ - return buf; -} - -/* - * _hash_relbuf() -- release a locked buffer. - */ -void -_hash_relbuf(Relation rel, Buffer buf, int access) -{ - BlockNumber blkno; - - blkno = BufferGetBlockNumber(buf); - - switch (access) - { - case HASH_WRITE: - case HASH_READ: - _hash_unsetpagelock(rel, blkno, access); - break; - default: - elog(ERROR, "unrecognized hash access code: %d", access); - break; - } - - ReleaseBuffer(buf); -} - -/* - * _hash_wrtbuf() -- write a hash page to disk. - * - * This routine releases the lock held on the buffer and our reference - * to it. It is an error to call _hash_wrtbuf() without a write lock - * or a reference to the buffer. - */ -void -_hash_wrtbuf(Relation rel, Buffer buf) -{ - BlockNumber blkno; - - blkno = BufferGetBlockNumber(buf); - WriteBuffer(buf); - _hash_unsetpagelock(rel, blkno, HASH_WRITE); -} - -/* - * _hash_wrtnorelbuf() -- write a hash page to disk, but do not release - * our reference or lock. - * - * It is an error to call _hash_wrtnorelbuf() without a write lock - * or a reference to the buffer. - */ -void -_hash_wrtnorelbuf(Buffer buf) -{ - BlockNumber blkno; - - blkno = BufferGetBlockNumber(buf); - WriteNoReleaseBuffer(buf); -} - -/* - * _hash_chgbufaccess() -- Change from read to write access or vice versa. - * - * When changing from write to read, we assume the buffer is dirty and tell - * bufmgr it must be written out. - */ -void -_hash_chgbufaccess(Relation rel, - Buffer buf, - int from_access, - int to_access) -{ - BlockNumber blkno; - - blkno = BufferGetBlockNumber(buf); - - if (from_access == HASH_WRITE) - _hash_wrtnorelbuf(buf); - - _hash_unsetpagelock(rel, blkno, from_access); - - _hash_setpagelock(rel, blkno, to_access); -} - -/* - * _hash_pageinit() -- Initialize a new page. + * _hash_pageinit() -- Initialize a new hash index page. */ void _hash_pageinit(Page page, Size size) @@ -297,57 +341,14 @@ _hash_pageinit(Page page, Size size) } /* - * _hash_setpagelock() -- Acquire the requested type of lock on a page. - */ -static void -_hash_setpagelock(Relation rel, - BlockNumber blkno, - int access) -{ - if (USELOCKING) - { - switch (access) - { - case HASH_WRITE: - LockPage(rel, blkno, ExclusiveLock); - break; - case HASH_READ: - LockPage(rel, blkno, ShareLock); - break; - default: - elog(ERROR, "unrecognized hash access code: %d", access); - break; - } - } -} - -/* - * _hash_unsetpagelock() -- Release the specified type of lock on a page. - */ -static void -_hash_unsetpagelock(Relation rel, - BlockNumber blkno, - int access) -{ - if (USELOCKING) - { - switch (access) - { - case HASH_WRITE: - UnlockPage(rel, blkno, ExclusiveLock); - break; - case HASH_READ: - UnlockPage(rel, blkno, ShareLock); - break; - default: - elog(ERROR, "unrecognized hash access code: %d", access); - break; - } - } -} - -/* - * Expand the hash table by creating one new bucket. 
+ * Attempt to expand the hash table by creating one new bucket. + * + * This will silently do nothing if it cannot get the needed locks. + * + * The caller should hold no locks on the hash index. + * + * The caller must hold a pin, but no lock, on the metapage buffer. + * The buffer is returned in the same state. */ void _hash_expandtable(Relation rel, Buffer metabuf) @@ -356,15 +357,72 @@ _hash_expandtable(Relation rel, Buffer metabuf) Bucket old_bucket; Bucket new_bucket; uint32 spare_ndx; + BlockNumber start_oblkno; + BlockNumber start_nblkno; + uint32 maxbucket; + uint32 highmask; + uint32 lowmask; + + /* + * Obtain the page-zero lock to assert the right to begin a split + * (see README). + * + * Note: deadlock should be impossible here. Our own backend could only + * be holding bucket sharelocks due to stopped indexscans; those will not + * block other holders of the page-zero lock, who are only interested in + * acquiring bucket sharelocks themselves. Exclusive bucket locks are + * only taken here and in hashbulkdelete, and neither of these operations + * needs any additional locks to complete. (If, due to some flaw in this + * reasoning, we manage to deadlock anyway, it's okay to error out; the + * index will be left in a consistent state.) + */ + _hash_getlock(rel, 0, HASH_EXCLUSIVE); + + /* Write-lock the meta page */ + _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE); metap = (HashMetaPage) BufferGetPage(metabuf); _hash_checkpage(rel, (Page) metap, LH_META_PAGE); - _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_WRITE); + /* + * Check to see if split is still needed; someone else might have already + * done one while we waited for the lock. + * + * Make sure this stays in sync with_hash_doinsert() + */ + if (metap->hashm_ntuples <= + (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1)) + goto fail; - new_bucket = ++metap->hashm_maxbucket; + /* + * Determine which bucket is to be split, and attempt to lock the old + * bucket. If we can't get the lock, give up. + * + * The lock protects us against other backends, but not against our own + * backend. Must check for active scans separately. + * + * Ideally we would lock the new bucket too before proceeding, but if + * we are about to cross a splitpoint then the BUCKET_TO_BLKNO mapping + * isn't correct yet. For simplicity we update the metapage first and + * then lock. This should be okay because no one else should be trying + * to lock the new bucket yet... + */ + new_bucket = metap->hashm_maxbucket + 1; old_bucket = (new_bucket & metap->hashm_lowmask); + start_oblkno = BUCKET_TO_BLKNO(metap, old_bucket); + + if (_hash_has_active_scan(rel, old_bucket)) + goto fail; + + if (!_hash_try_getlock(rel, start_oblkno, HASH_EXCLUSIVE)) + goto fail; + + /* + * Okay to proceed with split. Update the metapage bucket mapping info. + */ + metap->hashm_maxbucket = new_bucket; + if (new_bucket > metap->hashm_highmask) { /* Starting a new doubling */ @@ -379,7 +437,7 @@ _hash_expandtable(Relation rel, Buffer metabuf) * this new batch of bucket pages. * * XXX should initialize new bucket pages to prevent out-of-order - * page creation. + * page creation? Don't wanna do it right here though. 
*/ spare_ndx = _hash_log2(metap->hashm_maxbucket + 1); if (spare_ndx > metap->hashm_ovflpoint) @@ -389,10 +447,50 @@ _hash_expandtable(Relation rel, Buffer metabuf) metap->hashm_ovflpoint = spare_ndx; } - _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_READ); + /* now we can compute the new bucket's primary block number */ + start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket); + + Assert(!_hash_has_active_scan(rel, new_bucket)); + + if (!_hash_try_getlock(rel, start_nblkno, HASH_EXCLUSIVE)) + elog(PANIC, "could not get lock on supposedly new bucket"); + + /* + * Copy bucket mapping info now; this saves re-accessing the meta page + * inside _hash_splitbucket's inner loop. Note that once we drop the + * split lock, other splits could begin, so these values might be out of + * date before _hash_splitbucket finishes. That's okay, since all it + * needs is to tell which of these two buckets to map hashkeys into. + */ + maxbucket = metap->hashm_maxbucket; + highmask = metap->hashm_highmask; + lowmask = metap->hashm_lowmask; + + /* Write out the metapage and drop lock, but keep pin */ + _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK); + + /* Release split lock; okay for other splits to occur now */ + _hash_droplock(rel, 0, HASH_EXCLUSIVE); /* Relocate records to the new bucket */ - _hash_splitbucket(rel, metabuf, old_bucket, new_bucket); + _hash_splitbucket(rel, metabuf, old_bucket, new_bucket, + start_oblkno, start_nblkno, + maxbucket, highmask, lowmask); + + /* Release bucket locks, allowing others to access them */ + _hash_droplock(rel, start_oblkno, HASH_EXCLUSIVE); + _hash_droplock(rel, start_nblkno, HASH_EXCLUSIVE); + + return; + + /* Here if decide not to split or fail to acquire old bucket lock */ +fail: + + /* We didn't write the metapage, so just drop lock */ + _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK); + + /* Release split lock */ + _hash_droplock(rel, 0, HASH_EXCLUSIVE); } @@ -403,27 +501,35 @@ _hash_expandtable(Relation rel, Buffer metabuf) * or more overflow (bucket chain) pages. We must relocate tuples that * belong in the new bucket, and compress out any free space in the old * bucket. + * + * The caller must hold exclusive locks on both buckets to ensure that + * no one else is trying to access them (see README). + * + * The caller must hold a pin, but no lock, on the metapage buffer. + * The buffer is returned in the same state. (The metapage is only + * touched if it becomes necessary to add or remove overflow pages.) 
*/ static void _hash_splitbucket(Relation rel, Buffer metabuf, Bucket obucket, - Bucket nbucket) + Bucket nbucket, + BlockNumber start_oblkno, + BlockNumber start_nblkno, + uint32 maxbucket, + uint32 highmask, + uint32 lowmask) { Bucket bucket; Buffer obuf; Buffer nbuf; - Buffer ovflbuf; BlockNumber oblkno; BlockNumber nblkno; - BlockNumber start_oblkno; - BlockNumber start_nblkno; bool null; Datum datum; HashItem hitem; HashPageOpaque oopaque; HashPageOpaque nopaque; - HashMetaPage metap; IndexTuple itup; Size itemsz; OffsetNumber ooffnum; @@ -433,12 +539,11 @@ _hash_splitbucket(Relation rel, Page npage; TupleDesc itupdesc = RelationGetDescr(rel); - metap = (HashMetaPage) BufferGetPage(metabuf); - _hash_checkpage(rel, (Page) metap, LH_META_PAGE); - - /* get the buffers & pages */ - start_oblkno = BUCKET_TO_BLKNO(metap, obucket); - start_nblkno = BUCKET_TO_BLKNO(metap, nbucket); + /* + * It should be okay to simultaneously write-lock pages from each + * bucket, since no one else can be trying to acquire buffer lock + * on pages of either bucket. + */ oblkno = start_oblkno; nblkno = start_nblkno; obuf = _hash_getbuf(rel, oblkno, HASH_WRITE); @@ -446,7 +551,10 @@ _hash_splitbucket(Relation rel, opage = BufferGetPage(obuf); npage = BufferGetPage(nbuf); - /* initialize the new bucket page */ + _hash_checkpage(rel, opage, LH_BUCKET_PAGE); + oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); + + /* initialize the new bucket's primary page */ _hash_pageinit(npage, BufferGetPageSize(nbuf)); nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); nopaque->hasho_prevblkno = InvalidBlockNumber; @@ -454,44 +562,11 @@ _hash_splitbucket(Relation rel, nopaque->hasho_bucket = nbucket; nopaque->hasho_flag = LH_BUCKET_PAGE; nopaque->hasho_filler = HASHO_FILL; - _hash_wrtnorelbuf(nbuf); - - /* - * make sure the old bucket isn't empty. advance 'opage' and friends - * through the overflow bucket chain until we find a non-empty page. - * - * XXX we should only need this once, if we are careful to preserve the - * invariant that overflow pages are never empty. - */ - _hash_checkpage(rel, opage, LH_BUCKET_PAGE); - oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); - if (PageIsEmpty(opage)) - { - oblkno = oopaque->hasho_nextblkno; - _hash_relbuf(rel, obuf, HASH_WRITE); - if (!BlockNumberIsValid(oblkno)) - { - /* - * the old bucket is completely empty; of course, the new - * bucket will be as well, but since it's a base bucket page - * we don't care. - */ - _hash_relbuf(rel, nbuf, HASH_WRITE); - return; - } - obuf = _hash_getbuf(rel, oblkno, HASH_WRITE); - opage = BufferGetPage(obuf); - _hash_checkpage(rel, opage, LH_OVERFLOW_PAGE); - if (PageIsEmpty(opage)) - elog(ERROR, "empty hash overflow page %u", oblkno); - oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); - } /* - * we are now guaranteed that 'opage' is not empty. partition the - * tuples in the old bucket between the old bucket and the new bucket, - * advancing along their respective overflow bucket chains and adding - * overflow pages as needed. + * Partition the tuples in the old bucket between the old bucket and the + * new bucket, advancing along the old bucket's overflow bucket chain + * and adding overflow pages to the new bucket as needed. 
*/ ooffnum = FirstOffsetNumber; omaxoffnum = PageGetMaxOffsetNumber(opage); @@ -505,48 +580,39 @@ _hash_splitbucket(Relation rel, /* check if we're at the end of the page */ if (ooffnum > omaxoffnum) { - /* at end of page, but check for overflow page */ + /* at end of page, but check for an(other) overflow page */ oblkno = oopaque->hasho_nextblkno; - if (BlockNumberIsValid(oblkno)) - { - /* - * we ran out of tuples on this particular page, but we - * have more overflow pages; re-init values. - */ - _hash_wrtbuf(rel, obuf); - obuf = _hash_getbuf(rel, oblkno, HASH_WRITE); - opage = BufferGetPage(obuf); - _hash_checkpage(rel, opage, LH_OVERFLOW_PAGE); - oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); - /* we're guaranteed that an ovfl page has at least 1 tuple */ - if (PageIsEmpty(opage)) - elog(ERROR, "empty hash overflow page %u", oblkno); - ooffnum = FirstOffsetNumber; - omaxoffnum = PageGetMaxOffsetNumber(opage); - } - else - { - /* - * We're at the end of the bucket chain, so now we're - * really done with everything. Before quitting, call - * _hash_squeezebucket to ensure the tuples remaining in the - * old bucket (including the overflow pages) are packed as - * tightly as possible. The new bucket is already tight. - */ - _hash_wrtbuf(rel, obuf); - _hash_wrtbuf(rel, nbuf); - _hash_squeezebucket(rel, obucket, start_oblkno); - return; - } + if (!BlockNumberIsValid(oblkno)) + break; + /* + * we ran out of tuples on this particular page, but we + * have more overflow pages; advance to next page. + */ + _hash_wrtbuf(rel, obuf); + + obuf = _hash_getbuf(rel, oblkno, HASH_WRITE); + opage = BufferGetPage(obuf); + _hash_checkpage(rel, opage, LH_OVERFLOW_PAGE); + oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); + ooffnum = FirstOffsetNumber; + omaxoffnum = PageGetMaxOffsetNumber(opage); + continue; } - /* hash on the tuple */ + /* + * Re-hash the tuple to determine which bucket it now belongs in. + * + * It is annoying to call the hash function while holding locks, + * but releasing and relocking the page for each tuple is unappealing + * too. + */ hitem = (HashItem) PageGetItem(opage, PageGetItemId(opage, ooffnum)); itup = &(hitem->hash_itup); datum = index_getattr(itup, 1, itupdesc, &null); Assert(!null); - bucket = _hash_call(rel, metap, datum); + bucket = _hash_hashkey2bucket(_hash_datum2hashkey(rel, datum), + maxbucket, highmask, lowmask); if (bucket == nbucket) { @@ -562,11 +628,13 @@ _hash_splitbucket(Relation rel, if (PageGetFreeSpace(npage) < itemsz) { - ovflbuf = _hash_addovflpage(rel, metabuf, nbuf); - _hash_wrtbuf(rel, nbuf); - nbuf = ovflbuf; + /* write out nbuf and drop lock, but keep pin */ + _hash_chgbufaccess(rel, nbuf, HASH_WRITE, HASH_NOLOCK); + /* chain to a new overflow page */ + nbuf = _hash_addovflpage(rel, metabuf, nbuf); npage = BufferGetPage(nbuf); - _hash_checkpage(rel, npage, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); + _hash_checkpage(rel, npage, LH_OVERFLOW_PAGE); + /* we don't need nopaque within the loop */ } noffnum = OffsetNumberNext(PageGetMaxOffsetNumber(npage)); @@ -574,7 +642,6 @@ _hash_splitbucket(Relation rel, == InvalidOffsetNumber) elog(ERROR, "failed to add index item to \"%s\"", RelationGetRelationName(rel)); - _hash_wrtnorelbuf(nbuf); /* * now delete the tuple from the old bucket. after this @@ -586,40 +653,7 @@ _hash_splitbucket(Relation rel, * instead of calling PageGetMaxOffsetNumber. */ PageIndexTupleDelete(opage, ooffnum); - _hash_wrtnorelbuf(obuf); omaxoffnum = OffsetNumberPrev(omaxoffnum); - - /* - * tidy up. 
if the old page was an overflow page and it is - * now empty, we must free it (we want to preserve the - * invariant that overflow pages cannot be empty). - */ - if (PageIsEmpty(opage) && - (oopaque->hasho_flag & LH_OVERFLOW_PAGE)) - { - oblkno = _hash_freeovflpage(rel, obuf); - - /* check that we're not through the bucket chain */ - if (!BlockNumberIsValid(oblkno)) - { - _hash_wrtbuf(rel, nbuf); - _hash_squeezebucket(rel, obucket, start_oblkno); - return; - } - - /* - * re-init. again, we're guaranteed that an ovfl page has - * at least one tuple. - */ - obuf = _hash_getbuf(rel, oblkno, HASH_WRITE); - opage = BufferGetPage(obuf); - _hash_checkpage(rel, opage, LH_OVERFLOW_PAGE); - oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); - if (PageIsEmpty(opage)) - elog(ERROR, "empty hash overflow page %u", oblkno); - ooffnum = FirstOffsetNumber; - omaxoffnum = PageGetMaxOffsetNumber(opage); - } } else { @@ -632,5 +666,15 @@ _hash_splitbucket(Relation rel, ooffnum = OffsetNumberNext(ooffnum); } } - /* NOTREACHED */ + + /* + * We're at the end of the old bucket chain, so we're done partitioning + * the tuples. Before quitting, call _hash_squeezebucket to ensure the + * tuples remaining in the old bucket (including the overflow pages) are + * packed as tightly as possible. The new bucket is already tight. + */ + _hash_wrtbuf(rel, obuf); + _hash_wrtbuf(rel, nbuf); + + _hash_squeezebucket(rel, obucket, start_oblkno); } |
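The new _hash_metapinit code in this patch sizes hashm_ffactor so that bucket pages come out roughly three-quarters full, instead of using the old DEFAULT_FFACTOR constant. As a rough check of that arithmetic, here is a small standalone sketch. The page size, alignment, and struct sizes are assumptions (8 KB pages, 4-byte MAXALIGN, an 8-byte HashItemData header, a 4-byte line pointer, and a fixed-width int4 key); the real code gets them from BLCKSZ, sizeof(), and get_typavgwidth().

/*
 * Back-of-the-envelope check of the fill-factor computation in
 * _hash_metapinit().  All sizes below are assumptions for a 32-bit
 * build, not values taken from the patch.
 */
#include <stdio.h>

#define ASSUMED_BLCKSZ    8192   /* default PostgreSQL page size */
#define ASSUMED_MAXALIGN  4      /* MAXIMUM_ALIGNOF on many 32-bit builds */

static int
maxalign(int len)
{
    return (len + ASSUMED_MAXALIGN - 1) & ~(ASSUMED_MAXALIGN - 1);
}

int
main(void)
{
    int hash_item_hdr = 8;  /* assumed sizeof(HashItemData): one IndexTupleData header */
    int line_pointer  = 4;  /* assumed sizeof(ItemIdData) */
    int data_width    = 4;  /* int4 key, as get_typavgwidth() would report */

    int item_width = maxalign(hash_item_hdr) + maxalign(data_width) + line_pointer;
    int ffactor    = (ASSUMED_BLCKSZ * 3 / 4) / item_width;

    if (ffactor < 10)       /* same sanity floor as the patch */
        ffactor = 10;

    /* With these assumptions: item_width = 16, ffactor = 384 */
    printf("item_width = %d, ffactor = %d\n", item_width, ffactor);
    return 0;
}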
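_hash_expandtable now re-checks the split trigger after acquiring the split-control lock, since another backend may already have split while we waited; the comment in the hunk notes the test must stay in sync with _hash_doinsert(). The toy program below just replays that test with the illustrative fill factor of 384 worked out above; the numbers are examples, not values from the patch.

/*
 * Shape of the split trigger in _hash_expandtable(): split only when
 * the average tuples-per-bucket exceeds hashm_ffactor.
 */
#include <stdio.h>
#include <stdbool.h>

static bool
needs_split(double ntuples, unsigned ffactor, unsigned maxbucket)
{
    /* same condition as the patch, written as "should we proceed" */
    return ntuples > (double) ffactor * (maxbucket + 1);
}

int
main(void)
{
    /* with ffactor = 384 and buckets 0..1 in use (maxbucket = 1), the
     * threshold is 768 tuples */
    printf("%d\n", needs_split(768, 384, 1));   /* 0: exactly at the threshold, no split */
    printf("%d\n", needs_split(769, 384, 1));   /* 1: over the threshold, attempt a split */
    return 0;
}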
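_hash_splitbucket now receives maxbucket, highmask, and lowmask copied from the metapage and calls _hash_hashkey2bucket() to decide which of the two buckets each tuple belongs to. That helper is not part of this diff (it lives in hashutil.c), so the sketch below only restates the usual linear-hashing rule it is expected to implement; treat the exact form as an assumption. It also shows why _hash_expandtable computes old_bucket as new_bucket & lowmask.

/*
 * Sketch of the linear-hashing bucket mapping used during the split;
 * an assumption-labelled restatement, not a copy of hashutil.c.
 */
#include <stdio.h>
#include <stdint.h>

static uint32_t
hashkey2bucket(uint32_t hashkey, uint32_t maxbucket,
               uint32_t highmask, uint32_t lowmask)
{
    uint32_t bucket = hashkey & highmask;

    /* buckets beyond maxbucket have not been split off yet: fold back */
    if (bucket > maxbucket)
        bucket &= lowmask;
    return bucket;
}

int
main(void)
{
    /* say maxbucket = 5, so lowmask = 3 and highmask = 7 */
    uint32_t maxbucket = 5, lowmask = 3, highmask = 7;

    /* a key whose low bits are 6: bucket 6 does not exist yet, so it
     * maps to 6 & 3 = 2 -- exactly the "old" bucket _hash_expandtable
     * picks when it creates bucket 6 */
    printf("%u\n", hashkey2bucket(6, maxbucket, highmask, lowmask));  /* prints 2 */

    /* after the split, maxbucket = 6 and the same key maps to bucket 6 */
    printf("%u\n", hashkey2bucket(6, 6, highmask, lowmask));          /* prints 6 */
    return 0;
}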
