diff options
Diffstat (limited to 'src/backend/access/transam')
-rw-r--r--   src/backend/access/transam/README       165
-rw-r--r--   src/backend/access/transam/xact.c         6
-rw-r--r--   src/backend/access/transam/xlog.c        64
-rw-r--r--   src/backend/access/transam/xlogutils.c  108
4 files changed, 278 insertions, 65 deletions
diff --git a/src/backend/access/transam/README b/src/backend/access/transam/README index 177ba26cf3..4ebf7a8946 100644 --- a/src/backend/access/transam/README +++ b/src/backend/access/transam/README @@ -1,4 +1,4 @@ -$PostgreSQL: pgsql/src/backend/access/transam/README,v 1.3 2005/05/19 21:35:45 tgl Exp $ +$PostgreSQL: pgsql/src/backend/access/transam/README,v 1.4 2006/03/29 21:17:37 tgl Exp $ The Transaction System ---------------------- @@ -252,3 +252,166 @@ slru.c is the supporting mechanism for both pg_clog and pg_subtrans. It implements the LRU policy for in-memory buffer pages. The high-level routines for pg_clog are implemented in transam.c, while the low-level functions are in clog.c. pg_subtrans is contained completely in subtrans.c. + + +Write-Ahead Log coding +---------------------- + +The WAL subsystem (also called XLOG in the code) exists to guarantee crash +recovery. It can also be used to provide point-in-time recovery, as well as +hot-standby replication via log shipping. Here are some notes about +non-obvious aspects of its design. + +A basic assumption of a write AHEAD log is that log entries must reach stable +storage before the data-page changes they describe. This ensures that +replaying the log to its end will bring us to a consistent state where there +are no partially-performed transactions. To guarantee this, each data page +(either heap or index) is marked with the LSN (log sequence number --- in +practice, a WAL file location) of the latest XLOG record affecting the page. +Before the bufmgr can write out a dirty page, it must ensure that xlog has +been flushed to disk at least up to the page's LSN. This low-level +interaction improves performance by not waiting for XLOG I/O until necessary. +The LSN check exists only in the shared-buffer manager, not in the local +buffer manager used for temp tables; hence operations on temp tables must not +be WAL-logged. 
+ +During WAL replay, we can check the LSN of a page to detect whether the change +recorded by the current log entry is already applied (it has been, if the page +LSN is >= the log entry's WAL location). + +Usually, log entries contain just enough information to redo a single +incremental update on a page (or small group of pages). This will work only +if the filesystem and hardware implement data page writes as atomic actions, +so that a page is never left in a corrupt partly-written state. Since that's +often an untenable assumption in practice, we log additional information to +allow complete reconstruction of modified pages. The first WAL record +affecting a given page after a checkpoint is made to contain a copy of the +entire page, and we implement replay by restoring that page copy instead of +redoing the update. (This is more reliable than the data storage itself would +be because we can check the validity of the WAL record's CRC.) We can detect +the "first change after checkpoint" by noting whether the page's old LSN +precedes the end of WAL as of the last checkpoint (the RedoRecPtr). + +The general schema for executing a WAL-logged action is + +1. Pin and exclusive-lock the shared buffer(s) containing the data page(s) +to be modified. + +2. START_CRIT_SECTION() (Any error during the next two steps must cause a +PANIC because the shared buffers will contain unlogged changes, which we +have to ensure don't get to disk. Obviously, you should check conditions +such as whether there's enough free space on the page before you start the +critical section.) + +3. Apply the required changes to the shared buffer(s). + +4. Build a WAL log record and pass it to XLogInsert(); then update the page's +LSN and TLI using the returned XLOG location. For instance, + + recptr = XLogInsert(rmgr_id, info, rdata); + + PageSetLSN(dp, recptr); + PageSetTLI(dp, ThisTimeLineID); + +5. END_CRIT_SECTION() + +6. 
Unlock and write the buffer(s): + + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + WriteBuffer(buffer); + +(Note: WriteBuffer doesn't really "write" the buffer anymore, it just marks it +dirty and unpins it. The write will not happen until a checkpoint occurs or +the shared buffer is needed for another page.) + +XLogInsert's "rdata" argument is an array of pointer/size items identifying +chunks of data to be written in the XLOG record, plus optional shared-buffer +IDs for chunks that are in shared buffers rather than temporary variables. +The "rdata" array must mention (at least once) each of the shared buffers +being modified, unless the action is such that the WAL replay routine can +reconstruct the entire page contents. XLogInsert includes the logic that +tests to see whether a shared buffer has been modified since the last +checkpoint. If not, the entire page contents are logged rather than just the +portion(s) pointed to by "rdata". + +Because XLogInsert drops the rdata components associated with buffers it +chooses to log in full, the WAL replay routines normally need to test to see +which buffers were handled that way --- otherwise they may be misled about +what the XLOG record actually contains. XLOG records that describe multi-page +changes therefore require some care to design: you must be certain that you +know what data is indicated by each "BKP" bit. An example of the trickiness +is that in a HEAP_UPDATE record, BKP(1) normally is associated with the source +page and BKP(2) is associated with the destination page --- but if these are +the same page, only BKP(1) would have been set. + +For this reason as well as the risk of deadlocking on buffer locks, it's best +to design WAL records so that they reflect small atomic actions involving just +one or a few pages. The current XLOG infrastructure cannot handle WAL records +involving references to more than three shared buffers, anyway. 
+ +In the case where the WAL record contains enough information to re-generate +the entire contents of a page, do *not* show that page's buffer ID in the +rdata array, even if some of the rdata items point into the buffer. This is +because you don't want XLogInsert to log the whole page contents. The +standard replay-routine pattern for this case is + + reln = XLogOpenRelation(rnode); + buffer = XLogReadBuffer(reln, blkno, true); + Assert(BufferIsValid(buffer)); + page = (Page) BufferGetPage(buffer); + + ... initialize the page ... + + PageSetLSN(page, lsn); + PageSetTLI(page, ThisTimeLineID); + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + WriteBuffer(buffer); + +In the case where the WAL record provides only enough information to +incrementally update the page, the rdata array *must* mention the buffer +ID at least once; otherwise there is no defense against torn-page problems. +The standard replay-routine pattern for this case is + + if (record->xl_info & XLR_BKP_BLOCK_n) + << do nothing, page was rewritten from logged copy >>; + + reln = XLogOpenRelation(rnode); + buffer = XLogReadBuffer(reln, blkno, false); + if (!BufferIsValid(buffer)) + << do nothing, page has been deleted >>; + page = (Page) BufferGetPage(buffer); + + if (XLByteLE(lsn, PageGetLSN(page))) + { + /* changes are already applied */ + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + ReleaseBuffer(buffer); + return; + } + + ... apply the change ... + + PageSetLSN(page, lsn); + PageSetTLI(page, ThisTimeLineID); + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + WriteBuffer(buffer); + +As noted above, for a multi-page update you need to be able to determine +which XLR_BKP_BLOCK_n flag applies to each page. If a WAL record reflects +a combination of fully-rewritable and incremental updates, then the rewritable +pages don't count for the XLR_BKP_BLOCK_n numbering. 
(XLR_BKP_BLOCK_n is +associated with the n'th distinct buffer ID seen in the "rdata" array, and +per the above discussion, fully-rewritable buffers shouldn't be mentioned in +"rdata".) + +Due to all these constraints, complex changes (such as a multilevel index +insertion) normally need to be described by a series of atomic-action WAL +records. What do you do if the intermediate states are not self-consistent? +The answer is that the WAL replay logic has to be able to fix things up. +In btree indexes, for example, a page split requires insertion of a new key in +the parent btree level, but for locking reasons this has to be reflected by +two separate WAL records. The replay code has to remember "unfinished" split +operations, and match them up to subsequent insertions in the parent level. +If no matching insert has been found by the time the WAL replay ends, the +replay code has to do the insertion on its own to restore the index to +consistency. diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index a33e0df7c4..0bbe2c0d49 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -10,7 +10,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.218 2006/03/24 04:32:12 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.219 2006/03/29 21:17:37 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -4097,7 +4097,7 @@ xact_redo_commit(xl_xact_commit *xlrec, TransactionId xid) /* Make sure files supposed to be dropped are dropped */ for (i = 0; i < xlrec->nrels; i++) { - XLogCloseRelation(xlrec->xnodes[i]); + XLogDropRelation(xlrec->xnodes[i]); smgrdounlink(smgropen(xlrec->xnodes[i]), false, true); } } @@ -4132,7 +4132,7 @@ xact_redo_abort(xl_xact_abort *xlrec, TransactionId xid) /* Make sure files supposed to be dropped are dropped */ for (i = 0; i < xlrec->nrels; i++) { - XLogCloseRelation(xlrec->xnodes[i]); + 
XLogDropRelation(xlrec->xnodes[i]); smgrdounlink(smgropen(xlrec->xnodes[i]), false, true); } } diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 27149fd375..753b300fee 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.229 2006/03/28 22:01:16 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.230 2006/03/29 21:17:37 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -2509,35 +2509,29 @@ RestoreBkpBlocks(XLogRecord *record, XLogRecPtr lsn) blk += sizeof(BkpBlock); reln = XLogOpenRelation(bkpb.node); + buffer = XLogReadBuffer(reln, bkpb.block, true); + Assert(BufferIsValid(buffer)); + page = (Page) BufferGetPage(buffer); - if (reln) + if (bkpb.hole_length == 0) { - buffer = XLogReadBuffer(true, reln, bkpb.block); - if (BufferIsValid(buffer)) - { - page = (Page) BufferGetPage(buffer); - - if (bkpb.hole_length == 0) - { - memcpy((char *) page, blk, BLCKSZ); - } - else - { - /* must zero-fill the hole */ - MemSet((char *) page, 0, BLCKSZ); - memcpy((char *) page, blk, bkpb.hole_offset); - memcpy((char *) page + (bkpb.hole_offset + bkpb.hole_length), - blk + bkpb.hole_offset, - BLCKSZ - (bkpb.hole_offset + bkpb.hole_length)); - } - - PageSetLSN(page, lsn); - PageSetTLI(page, ThisTimeLineID); - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - WriteBuffer(buffer); - } + memcpy((char *) page, blk, BLCKSZ); + } + else + { + /* must zero-fill the hole */ + MemSet((char *) page, 0, BLCKSZ); + memcpy((char *) page, blk, bkpb.hole_offset); + memcpy((char *) page + (bkpb.hole_offset + bkpb.hole_length), + blk + bkpb.hole_offset, + BLCKSZ - (bkpb.hole_offset + bkpb.hole_length)); } + PageSetLSN(page, lsn); + PageSetTLI(page, 
ThisTimeLineID); + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + WriteBuffer(buffer); + blk += BLCKSZ - bkpb.hole_length; } } @@ -5451,25 +5445,19 @@ xlog_desc(StringInfo buf, uint8 xl_info, char *rec) static void xlog_outrec(StringInfo buf, XLogRecord *record) { - int bkpb; int i; appendStringInfo(buf, "prev %X/%X; xid %u", - record->xl_prev.xlogid, record->xl_prev.xrecoff, - record->xl_xid); + record->xl_prev.xlogid, record->xl_prev.xrecoff, + record->xl_xid); - for (i = 0, bkpb = 0; i < XLR_MAX_BKP_BLOCKS; i++) + for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++) { - if (!(record->xl_info & (XLR_SET_BKP_BLOCK(i)))) - continue; - bkpb++; + if (record->xl_info & XLR_SET_BKP_BLOCK(i)) + appendStringInfo(buf, "; bkpb%d", i+1); } - if (bkpb) - appendStringInfo(buf, "; bkpb %d", bkpb); - - appendStringInfo(buf, ": %s", - RmgrTable[record->xl_rmid].rm_name); + appendStringInfo(buf, ": %s", RmgrTable[record->xl_rmid].rm_name); } #endif /* WAL_DEBUG */ diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index 2f85bb32ce..fb771fe2fd 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -11,7 +11,7 @@ * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/backend/access/transam/xlogutils.c,v 1.41 2006/03/05 15:58:22 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/access/transam/xlogutils.c,v 1.42 2006/03/29 21:17:38 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -19,44 +19,81 @@ #include "access/xlogutils.h" #include "storage/bufmgr.h" +#include "storage/bufpage.h" #include "storage/smgr.h" #include "utils/hsearch.h" /* + * XLogReadBuffer + * Read a page during XLOG replay * - * Storage related support functions + * This is functionally comparable to ReadBuffer followed by + * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE): you get back a pinned 
+ * and locked buffer. (The lock is not really necessary, since we + * expect that this is only done during single-process XLOG replay, + * but in some places it simplifies sharing code with the non-XLOG case.) * + * If "init" is true then the caller intends to rewrite the page fully + * using the info in the XLOG record. In this case we will extend the + * relation if needed to make the page exist, and we will not complain about + * the page being "new" (all zeroes). + * + * If "init" is false then the caller needs the page to be valid already. + * If the page doesn't exist or contains zeroes, we report failure. + * + * If the return value is InvalidBuffer (only possible when init = false), + * the caller should silently skip the update on this page. This currently + * never happens, but we retain it as part of the API spec for possible future + * use. */ - Buffer -XLogReadBuffer(bool extend, Relation reln, BlockNumber blkno) +XLogReadBuffer(Relation reln, BlockNumber blkno, bool init) { BlockNumber lastblock = RelationGetNumberOfBlocks(reln); Buffer buffer; - if (blkno >= lastblock) + Assert(blkno != P_NEW); + + if (blkno < lastblock) { + /* page exists in file */ + buffer = ReadBuffer(reln, blkno); + } + else + { + /* hm, page doesn't exist in file */ + if (!init) + elog(PANIC, "block %u of relation %u/%u/%u does not exist", + blkno, reln->rd_node.spcNode, + reln->rd_node.dbNode, reln->rd_node.relNode); + /* OK to extend the file */ + /* we do this in recovery only - no rel-extension lock needed */ + Assert(InRecovery); buffer = InvalidBuffer; - if (extend) /* we do this in recovery only - no locks */ + while (blkno >= lastblock) { - Assert(InRecovery); - while (lastblock <= blkno) - { - if (buffer != InvalidBuffer) - ReleaseBuffer(buffer); /* must be WriteBuffer()? */ - buffer = ReadBuffer(reln, P_NEW); - lastblock++; - } + if (buffer != InvalidBuffer) + ReleaseBuffer(buffer); /* must be WriteBuffer()? 
*/ + buffer = ReadBuffer(reln, P_NEW); + lastblock++; } - if (buffer != InvalidBuffer) - LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); - return buffer; + Assert(BufferGetBlockNumber(buffer) == blkno); + } + + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + + if (!init) + { + /* check that page has been initialized */ + Page page = (Page) BufferGetPage(buffer); + + if (PageIsNew((PageHeader) page)) + elog(PANIC, "block %u of relation %u/%u/%u is uninitialized", + blkno, reln->rd_node.spcNode, + reln->rd_node.dbNode, reln->rd_node.relNode); } - buffer = ReadBuffer(reln, blkno); - if (buffer != InvalidBuffer) - LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); return buffer; } @@ -184,6 +221,9 @@ XLogCloseRelationCache(void) /* * Open a relation during XLOG replay + * + * Note: this once had an API that allowed NULL return on failure, but it + * no longer does; any failure results in elog(). */ Relation XLogOpenRelation(RelFileNode rnode) @@ -224,7 +264,7 @@ XLogOpenRelation(RelFileNode rnode) hash_search(_xlrelcache, (void *) &rnode, HASH_ENTER, &found); if (found) - elog(PANIC, "XLogOpenRelation: file found on insert into cache"); + elog(PANIC, "xlog relation already present on insert into cache"); hentry->rdesc = res; @@ -253,7 +293,7 @@ XLogOpenRelation(RelFileNode rnode) } /* - * Close a relation during XLOG replay + * Drop a relation during XLOG replay * * This is called when the relation is about to be deleted; we need to ensure * that there is no dangling smgr reference in the xlog relation cache. @@ -262,7 +302,7 @@ XLogOpenRelation(RelFileNode rnode) * cache, we just let it age out normally. 
*/ void -XLogCloseRelation(RelFileNode rnode) +XLogDropRelation(RelFileNode rnode) { XLogRelDesc *rdesc; XLogRelCacheEntry *hentry; @@ -277,3 +317,25 @@ XLogCloseRelation(RelFileNode rnode) RelationCloseSmgr(&(rdesc->reldata)); } + +/* + * Drop a whole database during XLOG replay + * + * As above, but for DROP DATABASE instead of dropping a single rel + */ +void +XLogDropDatabase(Oid dbid) +{ + HASH_SEQ_STATUS status; + XLogRelCacheEntry *hentry; + + hash_seq_init(&status, _xlrelcache); + + while ((hentry = (XLogRelCacheEntry *) hash_seq_search(&status)) != NULL) + { + XLogRelDesc *rdesc = hentry->rdesc; + + if (hentry->rnode.dbNode == dbid) + RelationCloseSmgr(&(rdesc->reldata)); + } +} |
