From 9de3aa65f01fb51cbc725e8508ea233e4e92c46c Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas
Date: Thu, 23 Dec 2010 16:03:08 +0200
Subject: Rewrite the GiST insertion logic so that we don't need the
 post-recovery cleanup stage to finish incomplete inserts or splits anymore.

There were two reasons for the cleanup step:

1. When a new tuple was inserted into a leaf page, the downlink in the
parent needed to be updated to contain (i.e. to be consistent with) the
new key. Updating the parent in turn might require recursively updating
the parent of the parent. We now handle that by updating the parent while
traversing down the tree, so that when we insert the leaf tuple, all the
parents are already consistent with the new key, and the tree is
consistent at every step.

2. When a page is split, we need to insert the downlink for the new right
page(s), and update the downlink for the original page to not include
keys that moved to the right page(s). We now handle that by setting a new
flag, F_FOLLOW_RIGHT, on the non-rightmost pages in the split. When that
flag is set, scans always follow the rightlink, regardless of the NSN
mechanism used to detect concurrent page splits. That way the tree is
consistent right after the split, even though the downlink is still
missing. This is very similar to the way B-tree splits are handled. When
the downlink is inserted in the parent, the flag is cleared. To keep the
insertion algorithm simple, when an insertion sees an incomplete split,
indicated by the F_FOLLOW_RIGHT flag, it finishes the split before doing
anything else.

These changes allow removing the whole "invalid tuple" mechanism, but I
retained the scan code to still follow invalid tuples correctly. While we
don't create any such tuples anymore, we want to handle them gracefully
in case you pg_upgrade a GiST index that has them. If we encounter any on
an insert, though, we just throw an error saying that you need to
REINDEX.

The issue that got me into doing this is that if you did a checkpoint
while an insert or split was in progress, and the checkpoint finished
quickly so that there was no WAL record related to the insert between
RedoRecPtr and the checkpoint record, recovery from that checkpoint would
not know to finish the incomplete insert. IOW, we have the same issue we
solved with the rm_safe_restartpoint mechanism during normal operation
too. It's highly unlikely to happen in practice, and this fix is far too
large to backpatch, so we're just going to live with it in previous
versions, but this refactoring fixes it going forward.

With this patch, you don't get the annoying 'index "FOO" needs VACUUM or
REINDEX to finish crash recovery' notices anymore if you crash at an
unfortunate moment.
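To make point 2 concrete, here is a minimal, self-contained sketch of the
scan-side decision it describes. All names here (PageSketch,
must_follow_right, the lsn_t typedef, the follow_right field) are
simplified stand-ins for illustration, not PostgreSQL's actual
definitions; the real check works with Page, GISTPageOpaque,
GistFollowRight() and the XLByteLT() comparison:

    #include <stdbool.h>
    #include <stdint.h>

    typedef uint64_t lsn_t;        /* stand-in for XLogRecPtr */

    /* Stand-in for the fields of GISTPageOpaque that matter here. */
    typedef struct PageSketch
    {
        lsn_t    nsn;              /* stamped on the left page at split */
        uint32_t rightlink;        /* block number of the right sibling */
        bool     follow_right;     /* stand-in for F_FOLLOW_RIGHT */
    } PageSketch;

    /* Should a scan that reached this page also visit its right sibling? */
    static bool
    must_follow_right(const PageSketch *page, lsn_t parent_lsn)
    {
        /*
         * Incomplete split: the parent has no downlink to the right
         * sibling yet, so the rightlink is the only path to the keys
         * that moved there.  Follow it unconditionally.
         */
        if (page->follow_right)
            return true;

        /*
         * Completed split: fall back to the NSN interlock.  If the page
         * was split after we read the parent (its NSN is newer than the
         * LSN the parent page had then), keys may have moved right since
         * we chose this downlink.
         */
        return page->nsn > parent_lsn;
    }

On the insert side the same flag drives the finish-the-split-first
behavior: an inserter that steps onto a flagged page installs the missing
downlink and clears the flag before doing its own work, so the tree never
depends on a post-recovery pass.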
---
 src/backend/access/gist/gistvacuum.c | 57 +++++++++++++-----------------------
 1 file changed, 20 insertions(+), 37 deletions(-)

(limited to 'src/backend/access/gist/gistvacuum.c')

diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c
index e02e72d431..86af414dce 100644
--- a/src/backend/access/gist/gistvacuum.c
+++ b/src/backend/access/gist/gistvacuum.c
@@ -26,13 +26,6 @@
 #include "utils/memutils.h"
 
-typedef struct GistBulkDeleteResult
-{
-	IndexBulkDeleteResult std;	/* common state */
-	bool		needReindex;
-} GistBulkDeleteResult;
-
-
 /*
  * VACUUM cleanup: update FSM
  */
@@ -40,7 +33,7 @@ Datum
 gistvacuumcleanup(PG_FUNCTION_ARGS)
 {
 	IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0);
-	GistBulkDeleteResult *stats = (GistBulkDeleteResult *) PG_GETARG_POINTER(1);
+	IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1);
 	Relation	rel = info->index;
 	BlockNumber npages,
 				blkno;
@@ -56,10 +49,10 @@ gistvacuumcleanup(PG_FUNCTION_ARGS)
 	/* Set up all-zero stats if gistbulkdelete wasn't called */
 	if (stats == NULL)
 	{
-		stats = (GistBulkDeleteResult *) palloc0(sizeof(GistBulkDeleteResult));
+		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
 		/* use heap's tuple count */
-		stats->std.num_index_tuples = info->num_heap_tuples;
-		stats->std.estimated_count = info->estimated_count;
+		stats->num_index_tuples = info->num_heap_tuples;
+		stats->estimated_count = info->estimated_count;
 
 		/*
 		 * XXX the above is wrong if index is partial. Would it be OK to just
@@ -67,11 +60,6 @@ gistvacuumcleanup(PG_FUNCTION_ARGS)
 		 */
 	}
 
-	if (stats->needReindex)
-		ereport(NOTICE,
-				(errmsg("index \"%s\" needs VACUUM FULL or REINDEX to finish crash recovery",
-						RelationGetRelationName(rel))));
-
 	/*
 	 * Need lock unless it's local to this backend.
 	 */
@@ -112,10 +100,10 @@ gistvacuumcleanup(PG_FUNCTION_ARGS)
 	IndexFreeSpaceMapVacuum(info->index);
 
 	/* return statistics */
-	stats->std.pages_free = totFreePages;
+	stats->pages_free = totFreePages;
 	if (needLock)
 		LockRelationForExtension(rel, ExclusiveLock);
-	stats->std.num_pages = RelationGetNumberOfBlocks(rel);
+	stats->num_pages = RelationGetNumberOfBlocks(rel);
 	if (needLock)
 		UnlockRelationForExtension(rel, ExclusiveLock);
 
@@ -135,7 +123,7 @@ pushStackIfSplited(Page page, GistBDItem *stack)
 	GISTPageOpaque opaque = GistPageGetOpaque(page);
 
 	if (stack->blkno != GIST_ROOT_BLKNO && !XLogRecPtrIsInvalid(stack->parentlsn) &&
-		XLByteLT(stack->parentlsn, opaque->nsn) &&
+		(GistFollowRight(page) || XLByteLT(stack->parentlsn, opaque->nsn)) &&
 		opaque->rightlink != InvalidBlockNumber /* sanity check */ )
 	{
 		/* split page detected, install right link to the stack */
@@ -162,7 +150,7 @@ Datum
 gistbulkdelete(PG_FUNCTION_ARGS)
 {
 	IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0);
-	GistBulkDeleteResult *stats = (GistBulkDeleteResult *) PG_GETARG_POINTER(1);
+	IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1);
 	IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(2);
 	void	   *callback_state = (void *) PG_GETARG_POINTER(3);
 	Relation	rel = info->index;
 	GistBDItem *stack,
@@ -171,10 +159,10 @@ gistbulkdelete(PG_FUNCTION_ARGS)
 
 	/* first time through? */
 	if (stats == NULL)
-		stats = (GistBulkDeleteResult *) palloc0(sizeof(GistBulkDeleteResult));
+		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
 	/* we'll re-count the tuples each time */
-	stats->std.estimated_count = false;
-	stats->std.num_index_tuples = 0;
+	stats->estimated_count = false;
+	stats->num_index_tuples = 0;
 
 	stack = (GistBDItem *) palloc0(sizeof(GistBDItem));
 	stack->blkno = GIST_ROOT_BLKNO;
@@ -232,10 +220,10 @@ gistbulkdelete(PG_FUNCTION_ARGS)
 				{
 					todelete[ntodelete] = i - ntodelete;
 					ntodelete++;
-					stats->std.tuples_removed += 1;
+					stats->tuples_removed += 1;
 				}
 				else
-					stats->std.num_index_tuples += 1;
+					stats->num_index_tuples += 1;
 			}
 
 			if (ntodelete)
@@ -250,22 +238,13 @@ gistbulkdelete(PG_FUNCTION_ARGS)
 
 			if (RelationNeedsWAL(rel))
 			{
-				XLogRecData *rdata;
 				XLogRecPtr	recptr;
-				gistxlogPageUpdate *xlinfo;
 
-				rdata = formUpdateRdata(rel->rd_node, buffer,
+				recptr = gistXLogUpdate(rel->rd_node, buffer,
 										todelete, ntodelete,
-										NULL, 0,
-										NULL);
-				xlinfo = (gistxlogPageUpdate *) rdata->next->data;
-
-				recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_UPDATE, rdata);
+										NULL, 0, InvalidBuffer);
 				PageSetLSN(page, recptr);
 				PageSetTLI(page, ThisTimeLineID);
-
-				pfree(xlinfo);
-				pfree(rdata);
 			}
 			else
 				PageSetLSN(page, GetXLogRecPtrForTemp());
@@ -293,7 +272,11 @@ gistbulkdelete(PG_FUNCTION_ARGS)
 				stack->next = ptr;
 
 				if (GistTupleIsInvalid(idxtuple))
-					stats->needReindex = true;
+					ereport(LOG,
+							(errmsg("index \"%s\" contains an inner tuple marked as invalid",
+									RelationGetRelationName(rel)),
+							 errdetail("This is caused by an incomplete page split at crash recovery before upgrading to 9.1."),
+							 errhint("Please REINDEX it.")));
 			}
 		}
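The pushStackIfSplited() hunk above is the vacuum-side counterpart of the
scan rule from the commit message: the physical scan must also queue
right siblings whose split the parent does not reflect yet. A sketch of
that push, again with hypothetical simplified types (BDItem, PageInfo and
push_if_split are illustrative stand-ins, not the code from
gistvacuum.c):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdlib.h>

    typedef uint64_t lsn_t;        /* stand-in for XLogRecPtr */

    typedef struct BDItem          /* stand-in for GistBDItem */
    {
        uint32_t       blkno;      /* page to visit */
        lsn_t          parent_lsn; /* parent LSN when downlink was read */
        struct BDItem *next;
    } BDItem;

    typedef struct PageInfo        /* stand-in for Page + GISTPageOpaque */
    {
        lsn_t    nsn;
        uint32_t rightlink;        /* 0 plays the role of InvalidBlockNumber */
        bool     follow_right;     /* incomplete split, downlink missing */
    } PageInfo;

    /*
     * Queue the right sibling if the page was split after the parent was
     * last seen (NSN test) or the split is still missing its downlink
     * (follow-right flag); otherwise tuples that moved right would be
     * skipped.  Allocation error handling omitted for brevity.
     */
    static void
    push_if_split(const PageInfo *page, BDItem *stack)
    {
        if ((page->follow_right || page->nsn > stack->parent_lsn) &&
            page->rightlink != 0)  /* sanity check */
        {
            BDItem *ptr = malloc(sizeof(BDItem));

            ptr->blkno = page->rightlink;
            ptr->parent_lsn = stack->parent_lsn;
            ptr->next = stack->next;
            stack->next = ptr;
        }
    }

The remaining hunks are mechanical fallout: with the post-recovery
cleanup gone, the needReindex flag and the GistBulkDeleteResult wrapper
around IndexBulkDeleteResult have nothing left to carry, and an invalid
tuple found during vacuum is now merely reported (LOG plus a REINDEX
hint) instead of scheduling a repair.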