I've attached a patch which implements Bob Jenkin's hash function for

PostgreSQL. This hash function replaces the one used by hash indexes and the catalog cache. Hash joins use a different, relatively poor-quality hash function, but I'll fix that later. As suggested by Tom Lane, this patch also changes the size of the fixed hash table used by the catalog cache to be a power-of-2 (instead of a prime: I chose 256 instead of 257). This allows the catcache to lookup hash buckets using a simple bitmask. This should improve the performance of the catalog cache slightly, since the previous method (modulo a prime) was slow. In my tests, this improves the performance of hash indexes by between 4% and 8%; the performance when using btree indexes or seqscans is basically unchanged. Neil Conway <neilconway@rogers.com>
author: Bruce Momjian <bruce@momjian.us> 2002-03-06 20:49:46 +0000
committer: Bruce Momjian <bruce@momjian.us> 2002-03-06 20:49:46 +0000
commit: 7ab746731812fb4574f04a7402eaa40d15e29720 (patch)
tree: 11883b3a5a44cf0401c3647599a3d1502c711623 /src/backend/access/hash/hashfunc.c
parent: 5b5cef9abd92e6e17f78cbf2838ac898f3427255 (diff)
download: postgresql-7ab746731812fb4574f04a7402eaa40d15e29720.tar.gz
1 files changed, 83 insertions, 53 deletions
diff --git a/src/backend/access/hash/hashfunc.c b/src/backend/access/hash/hashfunc.c
index 9cb2a02cf3..2e0f181939 100644
--- a/src/backend/access/hash/hashfunc.c
+++ b/src/backend/access/hash/hashfunc.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hashfunc.c,v 1.31 2002/02/25 04:06:47 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hashfunc.c,v 1.32 2002/03/06 20:49:38 momjian Exp $
  *
  * NOTES
  *	  These functions are stored in pg_amproc.	For each operator class
@@ -95,6 +95,8 @@ hashname(PG_FUNCTION_ARGS)
 {
 	char	   *key = NameStr(*PG_GETARG_NAME(0));
 
+	Assert(strlen(key) <= NAMEDATALEN);
+
 	return hash_any(key, strlen(key));
 }
 
@@ -116,61 +118,89 @@ hashvarlena(PG_FUNCTION_ARGS)
 	return result;
 }
 
+/* This hash function was written by Bob Jenkins
+ * (bob_jenkins@burtleburtle.net), and superficially adapted
+ * for PostgreSQL by Neil Conway. For more information on this
+ * hash function, see http://burtleburtle.net/bob/hash/doobs.html
+ */
 
 /*
- * hash_any --- compute a hash function for any specified chunk of memory
- *
- * This can be used as the underlying hash function for any pass-by-reference
- * data type in which there are no non-significant bits.
- *
- * (Comment from the original db3 hashing code: )
- *
- * This is INCREDIBLY ugly, but fast.  We break the string up into 8 byte
- * units.  On the first time through the loop we get the 'leftover bytes'
- * (strlen % 8).  On every later iteration, we perform 8 HASHC's so we handle
- * all 8 bytes.  Essentially, this saves us 7 cmp & branch instructions.  If
- * this routine is heavily used enough, it's worth the ugly coding.
- *
- * "OZ's original sdbm hash"
+ * mix -- mix 3 32-bit values reversibly.
+ * For every delta with one or two bits set, and the deltas of all three
+ * high bits or all three low bits, whether the original value of a,b,c
+ * is almost all zero or is uniformly distributed,
+ * - If mix() is run forward or backward, at least 32 bits in a,b,c
+ *   have at least 1/4 probability of changing.
+ * - If mix() is run forward, every bit of c will change between 1/3 and
+ *   2/3 of the time.  (Well, 22/100 and 78/100 for some 2-bit deltas.)
+ */
+#define mix(a,b,c) \
+{ \
+  a -= b; a -= c; a ^= (c>>13); \
+  b -= c; b -= a; b ^= (a<<8); \
+  c -= a; c -= b; c ^= (b>>13); \
+  a -= b; a -= c; a ^= (c>>12);  \
+  b -= c; b -= a; b ^= (a<<16); \
+  c -= a; c -= b; c ^= (b>>5); \
+  a -= b; a -= c; a ^= (c>>3);  \
+  b -= c; b -= a; b ^= (a<<10); \
+  c -= a; c -= b; c ^= (b>>15); \
+}
+
+/*
+ * hash_any() -- hash a variable-length key into a 32-bit value
+ *      k       : the key (the unaligned variable-length array of bytes)
+ *      len     : the length of the key, counting by bytes
+ * Returns a 32-bit value.  Every bit of the key affects every bit of
+ * the return value.  Every 1-bit and 2-bit delta achieves avalanche.
+ * About 6*len+35 instructions. The best hash table sizes are powers
+ * of 2.  There is no need to do mod a prime (mod is sooo slow!).
+ * If you need less than 32 bits, use a bitmask.
  */
 Datum
-hash_any(const char *keydata, int keylen)
+hash_any(register const char *k, register int keylen)
 {
-	uint32		n;
-	int			loop;
-
-#define HASHC	n = *keydata++ + 65599 * n
-
-	n = 0;
-	if (keylen > 0)
-	{
-		loop = (keylen + 8 - 1) >> 3;
-
-		switch (keylen & (8 - 1))
-		{
-			case 0:
-				do
-				{				/* All fall throughs */
-					HASHC;
-			case 7:
-					HASHC;
-			case 6:
-					HASHC;
-			case 5:
-					HASHC;
-			case 4:
-					HASHC;
-			case 3:
-					HASHC;
-			case 2:
-					HASHC;
-			case 1:
-					HASHC;
-				} while (--loop);
-		}
-	}
-
-#undef HASHC
-
-	PG_RETURN_UINT32(n);
+   register Datum a,b,c,len;
+
+   /* Set up the internal state */
+   len = keylen;
+   a = b = 0x9e3779b9;  /* the golden ratio; an arbitrary value */
+   /* Another arbitrary value. If the hash function is called
+    * multiple times, this could be the previously generated
+    * hash value; however, the interface currently doesn't allow
+    * this. AFAIK this isn't a big deal.
+    */
+   c = 3923095;
+
+   /* handle most of the key */
+   while (len >= 12)
+   {
+      a += (k[0] +((Datum)k[1]<<8) +((Datum)k[2]<<16) +((Datum)k[3]<<24));
+      b += (k[4] +((Datum)k[5]<<8) +((Datum)k[6]<<16) +((Datum)k[7]<<24));
+      c += (k[8] +((Datum)k[9]<<8) +((Datum)k[10]<<16)+((Datum)k[11]<<24));
+      mix(a,b,c);
+      k += 12; len -= 12;
+   }
+
+   /* handle the last 11 bytes */
+   c += keylen;
+   switch(len)              /* all the case statements fall through */
+   {
+   case 11: c+=((Datum)k[10]<<24);
+   case 10: c+=((Datum)k[9]<<16);
+   case 9 : c+=((Datum)k[8]<<8);
+      /* the first byte of c is reserved for the length */
+   case 8 : b+=((Datum)k[7]<<24);
+   case 7 : b+=((Datum)k[6]<<16);
+   case 6 : b+=((Datum)k[5]<<8);
+   case 5 : b+=k[4];
+   case 4 : a+=((Datum)k[3]<<24);
+   case 3 : a+=((Datum)k[2]<<16);
+   case 2 : a+=((Datum)k[1]<<8);
+   case 1 : a+=k[0];
+     /* case 0: nothing left to add */
+   }
+   mix(a,b,c);
+   /* report the result */
+   return c;
 }
author	Bruce Momjian <bruce@momjian.us>	2002-03-06 20:49:46 +0000
committer	Bruce Momjian <bruce@momjian.us>	2002-03-06 20:49:46 +0000
commit	7ab746731812fb4574f04a7402eaa40d15e29720 (patch)
tree	11883b3a5a44cf0401c3647599a3d1502c711623 /src/backend/access/hash/hashfunc.c
parent	5b5cef9abd92e6e17f78cbf2838ac898f3427255 (diff)
download	postgresql-7ab746731812fb4574f04a7402eaa40d15e29720.tar.gz