Diffstat (limited to 'src/backend/access')
-rw-r--r--  src/backend/access/hash/Makefile      4
-rw-r--r--  src/backend/access/hash/hash.c       45
-rw-r--r--  src/backend/access/hash/hashpage.c   23
-rw-r--r--  src/backend/access/hash/hashsort.c  116
-rw-r--r--  src/backend/access/nbtree/nbtsort.c   6
5 files changed, 179 insertions(+), 15 deletions(-)
diff --git a/src/backend/access/hash/Makefile b/src/backend/access/hash/Makefile
index 38eb291253..80f9ea61e9 100644
--- a/src/backend/access/hash/Makefile
+++ b/src/backend/access/hash/Makefile
@@ -4,7 +4,7 @@
# Makefile for access/hash
#
# IDENTIFICATION
-# $PostgreSQL: pgsql/src/backend/access/hash/Makefile,v 1.14 2008/02/19 10:30:06 petere Exp $
+# $PostgreSQL: pgsql/src/backend/access/hash/Makefile,v 1.15 2008/03/16 23:15:08 tgl Exp $
#
#-------------------------------------------------------------------------
@@ -13,6 +13,6 @@ top_builddir = ../../../..
include $(top_builddir)/src/Makefile.global
OBJS = hash.o hashfunc.o hashinsert.o hashovfl.o hashpage.o hashscan.o \
- hashsearch.o hashutil.o
+ hashsearch.o hashsort.o hashutil.o
include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index b008c0aa4a..01da35ec9f 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/hash/hash.c,v 1.99 2008/03/15 20:46:31 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/hash/hash.c,v 1.100 2008/03/16 23:15:08 tgl Exp $
*
* NOTES
* This file contains only the public interface routines.
@@ -22,13 +22,15 @@
#include "access/hash.h"
#include "catalog/index.h"
#include "commands/vacuum.h"
+#include "optimizer/cost.h"
#include "optimizer/plancat.h"
/* Working state for hashbuild and its callback */
typedef struct
{
- double indtuples;
+ HSpool *spool; /* NULL if not using spooling */
+ double indtuples; /* # tuples accepted into index */
} HashBuildState;
static void hashbuildCallback(Relation index,
@@ -51,6 +53,7 @@ hashbuild(PG_FUNCTION_ARGS)
IndexBuildResult *result;
BlockNumber relpages;
double reltuples;
+ uint32 num_buckets;
HashBuildState buildstate;
/*
@@ -61,19 +64,43 @@ hashbuild(PG_FUNCTION_ARGS)
elog(ERROR, "index \"%s\" already contains data",
RelationGetRelationName(index));
- /* estimate the number of rows currently present in the table */
+ /* Estimate the number of rows currently present in the table */
estimate_rel_size(heap, NULL, &relpages, &reltuples);
- /* initialize the hash index metadata page and initial buckets */
- _hash_metapinit(index, reltuples);
+ /* Initialize the hash index metadata page and initial buckets */
+ num_buckets = _hash_metapinit(index, reltuples);
- /* build the index */
+ /*
+ * If we just insert the tuples into the index in scan order, then
+ * (assuming their hash codes are pretty random) there will be no locality
+ * of access to the index, and if the index is bigger than available RAM
+ * then we'll thrash horribly. To prevent that scenario, we can sort the
+ * tuples by (expected) bucket number. However, such a sort is useless
+ * overhead when the index does fit in RAM. We choose to sort if the
+ * initial index size exceeds effective_cache_size.
+ *
+ * NOTE: this test will need adjustment if a bucket is ever different
+ * from one page.
+ */
+ if (num_buckets >= (uint32) effective_cache_size)
+ buildstate.spool = _h_spoolinit(index, num_buckets);
+ else
+ buildstate.spool = NULL;
+
+ /* prepare to build the index */
buildstate.indtuples = 0;
/* do the heap scan */
reltuples = IndexBuildHeapScan(heap, index, indexInfo,
hashbuildCallback, (void *) &buildstate);
+ if (buildstate.spool)
+ {
+ /* sort the tuples and insert them into the index */
+ _h_indexbuild(buildstate.spool);
+ _h_spooldestroy(buildstate.spool);
+ }
+
/*
* Return statistics
*/
@@ -110,7 +137,11 @@ hashbuildCallback(Relation index,
return;
}
- _hash_doinsert(index, itup);
+ /* Either spool the tuple for sorting, or just put it into the index */
+ if (buildstate->spool)
+ _h_spool(itup, buildstate->spool);
+ else
+ _hash_doinsert(index, itup);
buildstate->indtuples += 1;
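A quick worked example of the sort-vs-insert threshold in the hunk above (the numbers are illustrative assumptions, not taken from this commit). Each initial bucket occupies one page, so with the default 8 kB block size an effective_cache_size of 16384 pages is 128 MB; an index initialized with at least that many buckets is already as large as the cache, and the build spools and sorts rather than inserting in heap-scan order:

    /*
     * Standalone sketch of the decision made in hashbuild() above.
     * The values are assumed defaults, for illustration only.
     */
    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
        uint32_t effective_cache_size = 16384;  /* pages: 128 MB at 8 kB/page */
        uint32_t num_buckets = 32768;           /* initial buckets, one page each */

        if (num_buckets >= effective_cache_size)
            printf("index (%u pages) exceeds cache (%u pages): sort by bucket first\n",
                   num_buckets, effective_cache_size);
        else
            printf("index fits in cache: insert in heap-scan order\n");
        return 0;
    }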
diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c
index ec6f4b390f..d179af0121 100644
--- a/src/backend/access/hash/hashpage.c
+++ b/src/backend/access/hash/hashpage.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.73 2008/03/15 20:46:31 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.74 2008/03/16 23:15:08 tgl Exp $
*
* NOTES
* Postgres hash pages look like ordinary relation pages. The opaque
@@ -315,13 +315,14 @@ _hash_chgbufaccess(Relation rel,
* the initial buckets, and the initial bitmap page.
*
* The initial number of buckets is dependent on num_tuples, an estimate
- * of the number of tuples to be loaded into the index initially.
+ * of the number of tuples to be loaded into the index initially. The
+ * chosen number of buckets is returned.
*
* We are fairly cavalier about locking here, since we know that no one else
* could be accessing this index. In particular the rule about not holding
* multiple buffer locks is ignored.
*/
-void
+uint32
_hash_metapinit(Relation rel, double num_tuples)
{
HashMetaPage metap;
@@ -438,10 +439,21 @@ _hash_metapinit(Relation rel, double num_tuples)
metap->hashm_firstfree = 0;
/*
+ * Release buffer lock on the metapage while we initialize buckets.
+ * Otherwise, we'll be in interrupt holdoff and the CHECK_FOR_INTERRUPTS
+ * won't accomplish anything. It's a bad idea to hold buffer locks
+ * for long intervals in any case, since that can block the bgwriter.
+ */
+ _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);
+
+ /*
* Initialize the first N buckets
*/
for (i = 0; i < num_buckets; i++)
{
+ /* Allow interrupts, in case N is huge */
+ CHECK_FOR_INTERRUPTS();
+
buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i));
pg = BufferGetPage(buf);
pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
@@ -453,6 +465,9 @@ _hash_metapinit(Relation rel, double num_tuples)
_hash_wrtbuf(rel, buf);
}
+ /* Now reacquire buffer lock on metapage */
+ _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);
+
/*
* Initialize first bitmap page
*/
@@ -460,6 +475,8 @@ _hash_metapinit(Relation rel, double num_tuples)
/* all done */
_hash_wrtbuf(rel, metabuf);
+
+ return num_buckets;
}
/*
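A note on the interrupt-holdoff comment in this hunk: holding a buffer content lock (an LWLock) keeps interrupts held off in PostgreSQL, so the CHECK_FOR_INTERRUPTS() in the bucket-initialization loop would be a no-op if the metapage lock were still held. Releasing the lock first is what lets a cancel request actually take effect while a huge number of initial buckets is being written.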
diff --git a/src/backend/access/hash/hashsort.c b/src/backend/access/hash/hashsort.c
new file mode 100644
index 0000000000..71a656842e
--- /dev/null
+++ b/src/backend/access/hash/hashsort.c
@@ -0,0 +1,116 @@
+/*-------------------------------------------------------------------------
+ *
+ * hashsort.c
+ * Sort tuples for insertion into a new hash index.
+ *
+ * When building a very large hash index, we pre-sort the tuples by bucket
+ * number to improve locality of access to the index, and thereby avoid
+ * thrashing. We use tuplesort.c to sort the given index tuples into order.
+ *
+ * Note: if the number of rows in the table has been underestimated,
+ * bucket splits may occur during the index build. In that case we'd
+ * be inserting into two or more buckets for each possible masked-off
+ * hash code value. That's no big problem though, since we'll still have
+ * plenty of locality of access.
+ *
+ *
+ * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * $PostgreSQL: pgsql/src/backend/access/hash/hashsort.c,v 1.1 2008/03/16 23:15:08 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/hash.h"
+#include "miscadmin.h"
+#include "utils/tuplesort.h"
+
+
+/*
+ * Status record for spooling/sorting phase.
+ */
+struct HSpool
+{
+ Tuplesortstate *sortstate; /* state data for tuplesort.c */
+ Relation index;
+};
+
+
+/*
+ * create and initialize a spool structure
+ */
+HSpool *
+_h_spoolinit(Relation index, uint32 num_buckets)
+{
+ HSpool *hspool = (HSpool *) palloc0(sizeof(HSpool));
+ uint32 hash_mask;
+
+ hspool->index = index;
+
+ /*
+ * Determine the bitmask for hash code values. Since there are currently
+ * num_buckets buckets in the index, the appropriate mask can be computed
+ * as follows.
+ *
+ * Note: at present, the passed-in num_buckets is always a power of 2,
+ * so we could just compute num_buckets - 1. We prefer not to assume
+ * that here, though.
+ */
+ hash_mask = (((uint32) 1) << _hash_log2(num_buckets)) - 1;
+
+ /*
+ * We size the sort area as maintenance_work_mem rather than work_mem to
+ * speed index creation. This should be OK since a single backend can't
+ * run multiple index creations in parallel.
+ */
+ hspool->sortstate = tuplesort_begin_index_hash(index,
+ hash_mask,
+ maintenance_work_mem,
+ false);
+
+ return hspool;
+}
+
+/*
+ * clean up a spool structure and its substructures.
+ */
+void
+_h_spooldestroy(HSpool *hspool)
+{
+ tuplesort_end(hspool->sortstate);
+ pfree(hspool);
+}
+
+/*
+ * spool an index entry into the sort file.
+ */
+void
+_h_spool(IndexTuple itup, HSpool *hspool)
+{
+ tuplesort_putindextuple(hspool->sortstate, itup);
+}
+
+/*
+ * given a spool loaded by successive calls to _h_spool,
+ * create an entire index.
+ */
+void
+_h_indexbuild(HSpool *hspool)
+{
+ IndexTuple itup;
+ bool should_free;
+
+ tuplesort_performsort(hspool->sortstate);
+
+ while ((itup = tuplesort_getindextuple(hspool->sortstate,
+ true, &should_free)) != NULL)
+ {
+ _hash_doinsert(hspool->index, itup);
+ if (should_free)
+ pfree(itup);
+ }
+}
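To make the bucket-mask arithmetic in _h_spoolinit() concrete, here is a standalone sketch (an illustration, under the assumption that hash_log2 below mirrors what _hash_log2() in hashutil.c computes: the smallest n with (1 << n) >= num). Sorting on hashkey & mask orders tuples by expected bucket, because with a power-of-2 bucket count the bucket number is just the hash code's low-order bits:

    #include <stdio.h>
    #include <stdint.h>

    /* ceil(log2(num)): smallest n such that (1 << n) >= num */
    static uint32_t
    hash_log2(uint32_t num)
    {
        uint32_t i = 0;
        uint32_t limit = 1;

        while (limit < num)
        {
            limit <<= 1;
            i++;
        }
        return i;
    }

    /* expected bucket for a hash code, given the initial bucket count */
    static uint32_t
    expected_bucket(uint32_t hashkey, uint32_t num_buckets)
    {
        uint32_t mask = (UINT32_C(1) << hash_log2(num_buckets)) - 1;

        return hashkey & mask;
    }

    int
    main(void)
    {
        /* with 1024 buckets the mask is 0x3FF: the low 10 bits pick the bucket */
        printf("0x%08x -> bucket %u\n",
               0xdeadbeefU, expected_bucket(0xdeadbeefU, 1024));
        return 0;
    }

The spool lifecycle that hash.c drives is then: _h_spoolinit() once, _h_spool() per tuple during the heap scan, and _h_indexbuild() followed by _h_spooldestroy() to sort and load the index.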
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index 6ecd21150a..96274b6ba3 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -57,7 +57,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsort.c,v 1.114 2008/01/01 19:45:46 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsort.c,v 1.115 2008/03/16 23:15:08 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -158,8 +158,8 @@ _bt_spoolinit(Relation index, bool isunique, bool isdead)
* work_mem.
*/
btKbytes = isdead ? work_mem : maintenance_work_mem;
- btspool->sortstate = tuplesort_begin_index(index, isunique,
- btKbytes, false);
+ btspool->sortstate = tuplesort_begin_index_btree(index, isunique,
+ btKbytes, false);
return btspool;
}
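For context on this last hunk: the commit splits the old tuplesort_begin_index() entry point into b-tree and hash variants. The declarations below show the shape of the new API as paraphrased from the utils/tuplesort.h of this era; treat the exact signatures as an assumption and verify against the tree.

    /* paraphrased from src/include/utils/tuplesort.h (assumed, not part of this diff) */
    extern Tuplesortstate *tuplesort_begin_index_btree(Relation indexRel,
                                                       bool enforceUnique,
                                                       int workMem,
                                                       bool randomAccess);
    extern Tuplesortstate *tuplesort_begin_index_hash(Relation indexRel,
                                                      uint32 hash_mask,
                                                      int workMem,
                                                      bool randomAccess);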