author    Heikki Linnakangas <heikki.linnakangas@iki.fi>  2011-09-08 17:51:23 +0300
committer Heikki Linnakangas <heikki.linnakangas@iki.fi>  2011-09-08 17:51:23 +0300
commit    5edb24a8983e4a103e26153853d91141f818227c
tree      9e3102de6e2149b0d3678b403c91955e97f3bdc8 /src/backend/access/gist/gist.c
parent    09b68c70af855a0a69cede14da70968ddd97ba05
Buffering GiST index build algorithm.
When building a GiST index that doesn't fit in cache, buffers are attached to some internal nodes in the index. This speeds up the build by avoiding the random I/O that would otherwise be needed to traverse all the way down the tree to find the right leaf page for each tuple.

Alexander Korotkov
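The message is terse, so here is a hedged sketch of the core idea in plain C. This is a toy written for this note, not the committed algorithm, and every identifier in it is invented: each internal node parks incoming tuples in a small buffer and pushes the whole batch one level down when the buffer fills, so lower levels are touched once per batch instead of once per tuple.

/*
 * Toy illustration of a buffered tree build (NOT PostgreSQL code).
 */
#include <stdio.h>
#include <stdlib.h>

#define BUF_CAP 4                       /* tuples buffered per internal node */

typedef struct Node Node;
struct Node
{
    int     splitkey;                   /* key < splitkey goes left, else right */
    Node   *left;
    Node   *right;                      /* NULL children mean this is a leaf */
    int     buf[BUF_CAP];               /* tuples parked at this node */
    int     nbuf;
    int     nleaf;                      /* tuples that reached this leaf */
};

static void flush_buffer(Node *node);

static void
buffered_insert(Node *node, int key)
{
    if (node->left == NULL)             /* leaf: the tuple has arrived */
    {
        node->nleaf++;
        return;
    }
    node->buf[node->nbuf++] = key;      /* park it instead of descending */
    if (node->nbuf == BUF_CAP)
        flush_buffer(node);             /* one batched descent per BUF_CAP tuples */
}

static void
flush_buffer(Node *node)
{
    for (int i = 0; i < node->nbuf; i++)
    {
        int     key = node->buf[i];

        buffered_insert(key < node->splitkey ? node->left : node->right, key);
    }
    node->nbuf = 0;
}

int
main(void)
{
    Node    leaves[2] = {{0}, {0}};
    Node    root = {.splitkey = 50, .left = &leaves[0], .right = &leaves[1]};

    for (int i = 0; i < 100; i++)
        buffered_insert(&root, rand() % 100);
    flush_buffer(&root);                /* final flush, as at the end of a build */

    printf("left leaf: %d, right leaf: %d\n", leaves[0].nleaf, leaves[1].nleaf);
    return 0;
}

The rest of this diff is the enabling refactoring: gistplacetopage() and gistdoinsert() lose their 'static' qualifiers so the new build code, which lives in other files, can reuse them.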
Diffstat (limited to 'src/backend/access/gist/gist.c')
-rw-r--r--  src/backend/access/gist/gist.c | 211
1 file changed, 35 insertions(+), 176 deletions(-)
diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index 4fc7a213b6..24f30099a1 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -24,33 +24,7 @@
#include "utils/memutils.h"
#include "utils/rel.h"
-/* Working state for gistbuild and its callback */
-typedef struct
-{
- GISTSTATE giststate;
- int numindexattrs;
- double indtuples;
- MemoryContext tmpCtx;
-} GISTBuildState;
-
-/* A List of these is used to represent a split-in-progress. */
-typedef struct
-{
- Buffer buf; /* the split page "half" */
- IndexTuple downlink; /* downlink for this half. */
-} GISTPageSplitInfo;
-
/* non-export function prototypes */
-static void gistbuildCallback(Relation index,
- HeapTuple htup,
- Datum *values,
- bool *isnull,
- bool tupleIsAlive,
- void *state);
-static void gistdoinsert(Relation r,
- IndexTuple itup,
- Size freespace,
- GISTSTATE *GISTstate);
static void gistfixsplit(GISTInsertState *state, GISTSTATE *giststate);
static bool gistinserttuples(GISTInsertState *state, GISTInsertStack *stack,
GISTSTATE *giststate,
@@ -89,138 +63,6 @@ createTempGistContext(void)
}
/*
- * Routine to build an index. Basically calls insert over and over.
- *
- * XXX: it would be nice to implement some sort of bulk-loading
- * algorithm, but it is not clear how to do that.
- */
-Datum
-gistbuild(PG_FUNCTION_ARGS)
-{
- Relation heap = (Relation) PG_GETARG_POINTER(0);
- Relation index = (Relation) PG_GETARG_POINTER(1);
- IndexInfo *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2);
- IndexBuildResult *result;
- double reltuples;
- GISTBuildState buildstate;
- Buffer buffer;
- Page page;
-
- /*
- * We expect to be called exactly once for any index relation. If that's
- * not the case, big trouble's what we have.
- */
- if (RelationGetNumberOfBlocks(index) != 0)
- elog(ERROR, "index \"%s\" already contains data",
- RelationGetRelationName(index));
-
- /* no locking is needed */
- initGISTstate(&buildstate.giststate, index);
-
- /* initialize the root page */
- buffer = gistNewBuffer(index);
- Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO);
- page = BufferGetPage(buffer);
-
- START_CRIT_SECTION();
-
- GISTInitBuffer(buffer, F_LEAF);
-
- MarkBufferDirty(buffer);
-
- if (RelationNeedsWAL(index))
- {
- XLogRecPtr recptr;
- XLogRecData rdata;
-
- rdata.data = (char *) &(index->rd_node);
- rdata.len = sizeof(RelFileNode);
- rdata.buffer = InvalidBuffer;
- rdata.next = NULL;
-
- recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_CREATE_INDEX, &rdata);
- PageSetLSN(page, recptr);
- PageSetTLI(page, ThisTimeLineID);
- }
- else
- PageSetLSN(page, GetXLogRecPtrForTemp());
-
- UnlockReleaseBuffer(buffer);
-
- END_CRIT_SECTION();
-
- /* build the index */
- buildstate.numindexattrs = indexInfo->ii_NumIndexAttrs;
- buildstate.indtuples = 0;
-
- /*
- * create a temporary memory context that is reset once for each tuple
- * inserted into the index
- */
- buildstate.tmpCtx = createTempGistContext();
-
- /* do the heap scan */
- reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
- gistbuildCallback, (void *) &buildstate);
-
- /* okay, all heap tuples are indexed */
- MemoryContextDelete(buildstate.tmpCtx);
-
- freeGISTstate(&buildstate.giststate);
-
- /*
- * Return statistics
- */
- result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
-
- result->heap_tuples = reltuples;
- result->index_tuples = buildstate.indtuples;
-
- PG_RETURN_POINTER(result);
-}
-
-/*
- * Per-tuple callback from IndexBuildHeapScan
- */
-static void
-gistbuildCallback(Relation index,
- HeapTuple htup,
- Datum *values,
- bool *isnull,
- bool tupleIsAlive,
- void *state)
-{
- GISTBuildState *buildstate = (GISTBuildState *) state;
- IndexTuple itup;
- MemoryContext oldCtx;
-
- oldCtx = MemoryContextSwitchTo(buildstate->tmpCtx);
-
- /* form an index tuple and point it at the heap tuple */
- itup = gistFormTuple(&buildstate->giststate, index,
- values, isnull, true /* size is currently bogus */ );
- itup->t_tid = htup->t_self;
-
- /*
- * Since we already have the index relation locked, we call gistdoinsert
- * directly. Normal access method calls dispatch through gistinsert,
- * which locks the relation for write. This is the right thing to do if
- * you're inserting single tups, but not when you're initializing the
- * whole index at once.
- *
- * In this path we respect the fillfactor setting, whereas insertions
- * after initial build do not.
- */
- gistdoinsert(index, itup,
- RelationGetTargetPageFreeSpace(index, GIST_DEFAULT_FILLFACTOR),
- &buildstate->giststate);
-
- buildstate->indtuples += 1;
- MemoryContextSwitchTo(oldCtx);
- MemoryContextReset(buildstate->tmpCtx);
-}
-
-/*
* gistbuildempty() -- build an empty gist index in the initialization fork
*/
Datum
@@ -285,6 +127,11 @@ gistinsert(PG_FUNCTION_ARGS)
* to the right of 'leftchildbuf', or updating the downlink for 'leftchildbuf'.
* F_FOLLOW_RIGHT flag on 'leftchildbuf' is cleared and NSN is set.
*
+ * If 'markfollowright' is true and the page is split, the left child is
+ * marked with the F_FOLLOW_RIGHT flag. That is the normal case. During a
+ * buffered index build, however, there is no concurrent access and page
+ * splitting is done in a slightly simpler fashion, so 'false' is passed.
+ *
* If there is not enough room on the page, it is split. All the split
* pages are kept pinned and locked and returned in *splitinfo, the caller
* is responsible for inserting the downlinks for them. However, if
@@ -293,13 +140,16 @@ gistinsert(PG_FUNCTION_ARGS)
* In that case, we continue to hold the root page locked, and the child
* pages are released; note that new tuple(s) are *not* on the root page
* but in one of the new child pages.
+ *
+ * Returns 'true' if the page was split, 'false' otherwise.
*/
-static bool
-gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
+bool
+gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
Buffer buffer,
IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
Buffer leftchildbuf,
- List **splitinfo)
+ List **splitinfo,
+ bool markfollowright)
{
Page page = BufferGetPage(buffer);
bool is_leaf = (GistPageIsLeaf(page)) ? true : false;
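For orientation, a hedged sketch of how a buffering-build caller outside this file might use the newly exported function. Only the gistplacetopage() signature and the splitinfo contract above come from this diff; buffered_place() and its argument choices are hypothetical placeholders.

static void
buffered_place(Relation rel, GISTSTATE *giststate, Buffer buffer,
               IndexTuple *itup, int ntup)
{
    List       *splitinfo;

    /*
     * No concurrent access during a buffered build, so markfollowright is
     * false; passing freespace = 0 (pack pages completely) is an assumption,
     * not something this commit specifies.
     */
    (void) gistplacetopage(rel, 0, giststate,
                           buffer,
                           itup, ntup, InvalidOffsetNumber,
                           InvalidBuffer,       /* no left child to update */
                           &splitinfo,
                           false);
    if (splitinfo != NIL)
    {
        /*
         * Per the contract above: the split pages are still pinned and
         * locked, and it is now this caller's job to insert their
         * downlinks into the parent.
         */
    }
}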
@@ -331,7 +181,7 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
* one-element todelete array; in the split case, it's handled implicitly
* because the tuple vector passed to gistSplit won't include this tuple.
*/
- is_split = gistnospace(page, itup, ntup, oldoffnum, state->freespace);
+ is_split = gistnospace(page, itup, ntup, oldoffnum, freespace);
if (is_split)
{
/* no space for insertion */
@@ -362,7 +212,7 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
memmove(itvec + pos, itvec + pos + 1, sizeof(IndexTuple) * (tlen - pos));
}
itvec = gistjoinvector(itvec, &tlen, itup, ntup);
- dist = gistSplit(state->r, page, itvec, tlen, giststate);
+ dist = gistSplit(rel, page, itvec, tlen, giststate);
/*
* Set up pages to work with. Allocate new buffers for all but the
@@ -392,7 +242,7 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
for (; ptr; ptr = ptr->next)
{
/* Allocate new page */
- ptr->buffer = gistNewBuffer(state->r);
+ ptr->buffer = gistNewBuffer(rel);
GISTInitBuffer(ptr->buffer, (is_leaf) ? F_LEAF : 0);
ptr->page = BufferGetPage(ptr->buffer);
ptr->block.blkno = BufferGetBlockNumber(ptr->buffer);
@@ -463,7 +313,7 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
for (i = 0; i < ptr->block.num; i++)
{
if (PageAddItem(ptr->page, (Item) data, IndexTupleSize((IndexTuple) data), i + FirstOffsetNumber, false, false) == InvalidOffsetNumber)
- elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(state->r));
+ elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(rel));
data += IndexTupleSize((IndexTuple) data);
}
@@ -474,7 +324,15 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
else
GistPageGetOpaque(ptr->page)->rightlink = oldrlink;
- if (ptr->next && !is_rootsplit)
+ /*
+ * Mark all but the right-most page with the follow-right
+ * flag. It will be cleared as soon as the downlink is inserted
+ * into the parent, but this ensures that if we error out before
+ * that, the index is still consistent. (In buffering build mode,
+ * any error will abort the index build anyway, so this is not
+ * needed.)
+ */
+ if (ptr->next && !is_rootsplit && markfollowright)
GistMarkFollowRight(ptr->page);
else
GistClearFollowRight(ptr->page);
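To make that comment concrete, a hedged paraphrase of the recovery path the flag enables: a later insertion that descends onto a page whose split was never completed detects the leftover flag and finishes the job. gistfixsplit() is real (its prototype is kept near the top of this file); the surrounding control flow is reconstructed for illustration, not quoted from the source.

/* inside the descent loop of gistdoinsert(), roughly: */
if (GistFollowRight(stack->page))
{
    /*
     * A previous inserter errored out after splitting this page but
     * before inserting the downlink into the parent.  Finish that
     * split now, then retry the descent from the parent.
     */
    gistfixsplit(state, giststate);
    continue;
}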
@@ -506,9 +364,10 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
dist->page = BufferGetPage(dist->buffer);
/* Write the WAL record */
- if (RelationNeedsWAL(state->r))
- recptr = gistXLogSplit(state->r->rd_node, blkno, is_leaf,
- dist, oldrlink, oldnsn, leftchildbuf);
+ if (RelationNeedsWAL(rel))
+ recptr = gistXLogSplit(rel->rd_node, blkno, is_leaf,
+ dist, oldrlink, oldnsn, leftchildbuf,
+ markfollowright);
else
recptr = GetXLogRecPtrForTemp();
@@ -547,7 +406,7 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
if (BufferIsValid(leftchildbuf))
MarkBufferDirty(leftchildbuf);
- if (RelationNeedsWAL(state->r))
+ if (RelationNeedsWAL(rel))
{
OffsetNumber ndeloffs = 0,
deloffs[1];
@@ -558,7 +417,7 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
ndeloffs = 1;
}
- recptr = gistXLogUpdate(state->r->rd_node, buffer,
+ recptr = gistXLogUpdate(rel->rd_node, buffer,
deloffs, ndeloffs, itup, ntup,
leftchildbuf);
@@ -570,8 +429,6 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
recptr = GetXLogRecPtrForTemp();
PageSetLSN(page, recptr);
}
-
- *splitinfo = NIL;
}
/*
@@ -608,7 +465,7 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
* this routine assumes it is invoked in a short-lived memory context,
* so it does not bother releasing palloc'd allocations.
*/
-static void
+void
gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate)
{
ItemId iid;
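The short-lived-context contract mentioned above is the same pattern the removed gistbuildCallback() used earlier in this diff; a minimal sketch of a caller honoring it:

MemoryContext tmpCtx = createTempGistContext();
MemoryContext oldCtx = MemoryContextSwitchTo(tmpCtx);

gistdoinsert(r, itup, freespace, giststate);    /* allocates freely into tmpCtx */

MemoryContextSwitchTo(oldCtx);
MemoryContextReset(tmpCtx);                     /* reclaim everything at once */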
@@ -1192,10 +1049,12 @@ gistinserttuples(GISTInsertState *state, GISTInsertStack *stack,
List *splitinfo;
bool is_split;
- is_split = gistplacetopage(state, giststate, stack->buffer,
+ is_split = gistplacetopage(state->r, state->freespace, giststate,
+ stack->buffer,
tuples, ntup, oldoffnum,
leftchild,
- &splitinfo);
+ &splitinfo,
+ true);
if (splitinfo)
gistfinishsplit(state, stack, giststate, splitinfo);