diff options
Diffstat (limited to 'src/bin/pg_rewind/pg_rewind.c')
| -rw-r--r-- | src/bin/pg_rewind/pg_rewind.c | 550 |
1 files changed, 550 insertions, 0 deletions
diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c new file mode 100644 index 0000000000..6d458b034c --- /dev/null +++ b/src/bin/pg_rewind/pg_rewind.c @@ -0,0 +1,550 @@ +/*------------------------------------------------------------------------- + * + * pg_rewind.c + * Synchronizes an old master server to a new timeline + * + * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group + * + *------------------------------------------------------------------------- + */ +#include "postgres_fe.h" + +#include <sys/stat.h> +#include <fcntl.h> +#include <time.h> +#include <unistd.h> + +#include "pg_rewind.h" +#include "fetch.h" +#include "file_ops.h" +#include "filemap.h" +#include "logging.h" + +#include "access/timeline.h" +#include "access/xlog_internal.h" +#include "catalog/catversion.h" +#include "catalog/pg_control.h" +#include "getopt_long.h" +#include "storage/bufpage.h" + +static void usage(const char *progname); + +static void createBackupLabel(XLogRecPtr startpoint, TimeLineID starttli, + XLogRecPtr checkpointloc); + +static void digestControlFile(ControlFileData *ControlFile, char *source, + size_t size); +static void updateControlFile(ControlFileData *ControlFile); +static void sanityChecks(void); +static void findCommonAncestorTimeline(XLogRecPtr *recptr, TimeLineID *tli); + +static ControlFileData ControlFile_target; +static ControlFileData ControlFile_source; + +const char *progname; + +/* Configuration options */ +char *datadir_target = NULL; +char *datadir_source = NULL; +char *connstr_source = NULL; + +bool debug = false; +bool showprogress = false; +bool dry_run = false; + +static void +usage(const char *progname) +{ + printf(_("%s resynchronizes a cluster with another copy of the cluster.\n\n"), progname); + printf(_("Usage:\n %s [OPTION]...\n\n"), progname); + printf(_("Options:\n")); + printf(_(" -D, --target-pgdata=DIRECTORY\n")); + printf(_(" existing data directory to modify\n")); + printf(_(" --source-pgdata=DIRECTORY\n")); + printf(_(" source data directory to sync with\n")); + printf(_(" --source-server=CONNSTR\n")); + printf(_(" source server to sync with\n")); + printf(_(" -P, --progress write progress messages\n")); + printf(_(" -n, --dry-run stop before modifying anything\n")); + printf(_(" --debug write a lot of debug messages\n")); + printf(_(" -V, --version output version information, then exit\n")); + printf(_(" -?, --help show this help, then exit\n")); + printf(_("\n")); + printf(_("Report bugs to <pgsql-bugs@postgresql.org>.\n")); +} + + +int +main(int argc, char **argv) +{ + static struct option long_options[] = { + {"help", no_argument, NULL, '?'}, + {"target-pgdata", required_argument, NULL, 'D'}, + {"source-pgdata", required_argument, NULL, 1}, + {"source-server", required_argument, NULL, 2}, + {"version", no_argument, NULL, 'V'}, + {"dry-run", no_argument, NULL, 'n'}, + {"progress", no_argument, NULL, 'P'}, + {"debug", no_argument, NULL, 3}, + {NULL, 0, NULL, 0} + }; + int option_index; + int c; + XLogRecPtr divergerec; + TimeLineID lastcommontli; + XLogRecPtr chkptrec; + TimeLineID chkpttli; + XLogRecPtr chkptredo; + size_t size; + char *buffer; + bool rewind_needed; + XLogRecPtr endrec; + TimeLineID endtli; + ControlFileData ControlFile_new; + + progname = get_progname(argv[0]); + + /* Process command-line arguments */ + if (argc > 1) + { + if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0) + { + usage(progname); + exit(0); + } + if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0) + { + puts("pg_rewind (PostgreSQL) " PG_VERSION); + exit(0); + } + } + + while ((c = getopt_long(argc, argv, "D:NnP", long_options, &option_index)) != -1) + { + switch (c) + { + case '?': + fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname); + exit(1); + + case 'P': + showprogress = true; + break; + + case 'n': + dry_run = true; + break; + + case 3: + debug = true; + break; + + case 'D': /* -D or --target-pgdata */ + datadir_target = pg_strdup(optarg); + break; + + case 1: /* --source-pgdata */ + datadir_source = pg_strdup(optarg); + break; + case 2: /* --source-server */ + connstr_source = pg_strdup(optarg); + break; + } + } + + /* No source given? Show usage */ + if (datadir_source == NULL && connstr_source == NULL) + { + pg_fatal("no source specified (--source-pgdata or --source-server)\n"); + pg_fatal("Try \"%s --help\" for more information.\n", progname); + exit(1); + } + + if (datadir_target == NULL) + { + pg_fatal("no target data directory specified (--target-pgdata)\n"); + fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname); + exit(1); + } + + if (argc != optind) + { + pg_fatal("%s: invalid arguments\n", progname); + fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname); + exit(1); + } + + /* Connect to remote server */ + if (connstr_source) + libpqConnect(connstr_source); + + /* + * Ok, we have all the options and we're ready to start. Read in all the + * information we need from both clusters. + */ + buffer = slurpFile(datadir_target, "global/pg_control", &size); + digestControlFile(&ControlFile_target, buffer, size); + pg_free(buffer); + + buffer = fetchFile("global/pg_control", &size); + digestControlFile(&ControlFile_source, buffer, size); + pg_free(buffer); + + sanityChecks(); + + /* + * If both clusters are already on the same timeline, there's nothing to + * do. + */ + if (ControlFile_target.checkPointCopy.ThisTimeLineID == ControlFile_source.checkPointCopy.ThisTimeLineID) + pg_fatal("source and target cluster are on the same timeline\n"); + + findCommonAncestorTimeline(&divergerec, &lastcommontli); + printf(_("The servers diverged at WAL position %X/%X on timeline %u.\n"), + (uint32) (divergerec >> 32), (uint32) divergerec, lastcommontli); + + /* + * Check for the possibility that the target is in fact a direct ancestor + * of the source. In that case, there is no divergent history in the + * target that needs rewinding. + */ + if (ControlFile_target.checkPoint >= divergerec) + { + rewind_needed = true; + } + else + { + XLogRecPtr chkptendrec; + + /* Read the checkpoint record on the target to see where it ends. */ + chkptendrec = readOneRecord(datadir_target, + ControlFile_target.checkPoint, + ControlFile_target.checkPointCopy.ThisTimeLineID); + + /* + * If the histories diverged exactly at the end of the shutdown + * checkpoint record on the target, there are no WAL records in the + * target that don't belong in the source's history, and no rewind is + * needed. + */ + if (chkptendrec == divergerec) + rewind_needed = false; + else + rewind_needed = true; + } + + if (!rewind_needed) + { + printf(_("No rewind required.\n")); + exit(0); + } + + findLastCheckpoint(datadir_target, divergerec, lastcommontli, + &chkptrec, &chkpttli, &chkptredo); + printf(_("Rewinding from last common checkpoint at %X/%X on timeline %u\n"), + (uint32) (chkptrec >> 32), (uint32) chkptrec, + chkpttli); + + /* + * Build the filemap, by comparing the remote and local data directories. + */ + (void) filemap_create(); + pg_log(PG_PROGRESS, "reading source file list\n"); + fetchRemoteFileList(); + pg_log(PG_PROGRESS, "reading target file list\n"); + traverse_datadir(datadir_target, &process_local_file); + + /* + * Read the target WAL from last checkpoint before the point of fork, to + * extract all the pages that were modified on the target cluster after + * the fork. We can stop reading after reaching the final shutdown record. + * XXX: If we supported rewinding a server that was not shut down cleanly, + * we would need to replay until the end of WAL here. + */ + pg_log(PG_PROGRESS, "reading WAL in target\n"); + extractPageMap(datadir_target, chkptrec, lastcommontli, + ControlFile_target.checkPoint); + filemap_finalize(); + + if (showprogress) + calculate_totals(); + + /* this is too verbose even for verbose mode */ + if (debug) + print_filemap(); + + /* + * Ok, we're ready to start copying things over. + */ + if (showprogress) + { + pg_log(PG_PROGRESS, "Need to copy %lu MB (total source directory size is %lu MB)\n", + (unsigned long) (filemap->fetch_size / (1024 * 1024)), + (unsigned long) (filemap->total_size / (1024 * 1024))); + + fetch_size = filemap->fetch_size; + fetch_done = 0; + } + + /* + * This is the point of no return. Once we start copying things, we have + * modified the target directory and there is no turning back! + */ + + executeFileMap(); + + progress_report(true); + + pg_log(PG_PROGRESS, "\ncreating backup label and updating control file\n"); + createBackupLabel(chkptredo, chkpttli, chkptrec); + + /* + * Update control file of target. Make it ready to perform archive + * recovery when restarting. + * + * minRecoveryPoint is set to the current WAL insert location in the + * source server. Like in an online backup, it's important that we recover + * all the WAL that was generated while we copied the files over. + */ + memcpy(&ControlFile_new, &ControlFile_source, sizeof(ControlFileData)); + + if (connstr_source) + { + endrec = libpqGetCurrentXlogInsertLocation(); + endtli = ControlFile_source.checkPointCopy.ThisTimeLineID; + } + else + { + endrec = ControlFile_source.checkPoint; + endtli = ControlFile_source.checkPointCopy.ThisTimeLineID; + } + ControlFile_new.minRecoveryPoint = endrec; + ControlFile_new.minRecoveryPointTLI = endtli; + ControlFile_new.state = DB_IN_ARCHIVE_RECOVERY; + updateControlFile(&ControlFile_new); + + printf(_("Done!\n")); + + return 0; +} + +static void +sanityChecks(void) +{ + /* TODO Check that there's no backup_label in either cluster */ + + /* Check system_id match */ + if (ControlFile_target.system_identifier != ControlFile_source.system_identifier) + pg_fatal("source and target clusters are from different systems\n"); + + /* check version */ + if (ControlFile_target.pg_control_version != PG_CONTROL_VERSION || + ControlFile_source.pg_control_version != PG_CONTROL_VERSION || + ControlFile_target.catalog_version_no != CATALOG_VERSION_NO || + ControlFile_source.catalog_version_no != CATALOG_VERSION_NO) + { + pg_fatal("clusters are not compatible with this version of pg_rewind\n"); + } + + /* + * Target cluster need to use checksums or hint bit wal-logging, this to + * prevent from data corruption that could occur because of hint bits. + */ + if (ControlFile_target.data_checksum_version != PG_DATA_CHECKSUM_VERSION && + !ControlFile_target.wal_log_hints) + { + pg_fatal("target server need to use either data checksums or \"wal_log_hints = on\"\n"); + } + + /* + * Target cluster better not be running. This doesn't guard against + * someone starting the cluster concurrently. Also, this is probably more + * strict than necessary; it's OK if the master was not shut down cleanly, + * as long as it isn't running at the moment. + */ + if (ControlFile_target.state != DB_SHUTDOWNED) + pg_fatal("target server must be shut down cleanly\n"); + + /* + * When the source is a data directory, also require that the source + * server is shut down. There isn't any very strong reason for this + * limitation, but better safe than sorry. + */ + if (datadir_source && ControlFile_source.state != DB_SHUTDOWNED) + pg_fatal("source data directory must be shut down cleanly\n"); +} + +/* + * Determine the TLI of the last common timeline in the histories of the two + * clusters. *tli is set to the last common timeline, and *recptr is set to + * the position where the histories diverged (ie. the first WAL record that's + * not the same in both clusters). + * + * Control files of both clusters must be read into ControlFile_target/source + * before calling this. + */ +static void +findCommonAncestorTimeline(XLogRecPtr *recptr, TimeLineID *tli) +{ + TimeLineID targettli; + TimeLineHistoryEntry *sourceHistory; + int nentries; + int i; + TimeLineID sourcetli; + + targettli = ControlFile_target.checkPointCopy.ThisTimeLineID; + sourcetli = ControlFile_source.checkPointCopy.ThisTimeLineID; + + /* Timeline 1 does not have a history file, so no need to check */ + if (sourcetli == 1) + { + sourceHistory = (TimeLineHistoryEntry *) pg_malloc(sizeof(TimeLineHistoryEntry)); + sourceHistory->tli = sourcetli; + sourceHistory->begin = sourceHistory->end = InvalidXLogRecPtr; + nentries = 1; + } + else + { + char path[MAXPGPATH]; + char *histfile; + + TLHistoryFilePath(path, sourcetli); + histfile = fetchFile(path, NULL); + + sourceHistory = rewind_parseTimeLineHistory(histfile, + ControlFile_source.checkPointCopy.ThisTimeLineID, + &nentries); + pg_free(histfile); + } + + /* + * Trace the history backwards, until we hit the target timeline. + * + * TODO: This assumes that there are no timeline switches on the target + * cluster after the fork. + */ + for (i = nentries - 1; i >= 0; i--) + { + TimeLineHistoryEntry *entry = &sourceHistory[i]; + + if (entry->tli == targettli) + { + /* found it */ + *recptr = entry->end; + *tli = entry->tli; + + free(sourceHistory); + return; + } + } + + pg_fatal("could not find common ancestor of the source and target cluster's timelines\n"); +} + + +/* + * Create a backup_label file that forces recovery to begin at the last common + * checkpoint. + */ +static void +createBackupLabel(XLogRecPtr startpoint, TimeLineID starttli, XLogRecPtr checkpointloc) +{ + XLogSegNo startsegno; + time_t stamp_time; + char strfbuf[128]; + char xlogfilename[MAXFNAMELEN]; + struct tm *tmp; + char buf[1000]; + int len; + + XLByteToSeg(startpoint, startsegno); + XLogFileName(xlogfilename, starttli, startsegno); + + /* + * Construct backup label file + */ + stamp_time = time(NULL); + tmp = localtime(&stamp_time); + strftime(strfbuf, sizeof(strfbuf), "%Y-%m-%d %H:%M:%S %Z", tmp); + + len = snprintf(buf, sizeof(buf), + "START WAL LOCATION: %X/%X (file %s)\n" + "CHECKPOINT LOCATION: %X/%X\n" + "BACKUP METHOD: pg_rewind\n" + "BACKUP FROM: standby\n" + "START TIME: %s\n", + /* omit LABEL: line */ + (uint32) (startpoint >> 32), (uint32) startpoint, xlogfilename, + (uint32) (checkpointloc >> 32), (uint32) checkpointloc, + strfbuf); + if (len >= sizeof(buf)) + pg_fatal("backup label buffer too small\n"); /* shouldn't happen */ + + /* TODO: move old file out of the way, if any. */ + open_target_file("backup_label", true); /* BACKUP_LABEL_FILE */ + write_target_range(buf, 0, len); +} + +/* + * Check CRC of control file + */ +static void +checkControlFile(ControlFileData *ControlFile) +{ + pg_crc32 crc; + + /* Calculate CRC */ + INIT_CRC32C(crc); + COMP_CRC32C(crc, (char *) ControlFile, offsetof(ControlFileData, crc)); + FIN_CRC32C(crc); + + /* And simply compare it */ + if (!EQ_CRC32C(crc, ControlFile->crc)) + pg_fatal("unexpected control file CRC\n"); +} + +/* + * Verify control file contents in the buffer src, and copy it to *ControlFile. + */ +static void +digestControlFile(ControlFileData *ControlFile, char *src, size_t size) +{ + if (size != PG_CONTROL_SIZE) + pg_fatal("unexpected control file size %d, expected %d\n", + (int) size, PG_CONTROL_SIZE); + + memcpy(ControlFile, src, sizeof(ControlFileData)); + + /* Additional checks on control file */ + checkControlFile(ControlFile); +} + +/* + * Update the target's control file. + */ +static void +updateControlFile(ControlFileData *ControlFile) +{ + char buffer[PG_CONTROL_SIZE]; + + /* Recalculate CRC of control file */ + INIT_CRC32C(ControlFile->crc); + COMP_CRC32C(ControlFile->crc, + (char *) ControlFile, + offsetof(ControlFileData, crc)); + FIN_CRC32C(ControlFile->crc); + + /* + * Write out PG_CONTROL_SIZE bytes into pg_control by zero-padding the + * excess over sizeof(ControlFileData) to avoid premature EOF related + * errors when reading it. + */ + memset(buffer, 0, PG_CONTROL_SIZE); + memcpy(buffer, ControlFile, sizeof(ControlFileData)); + + open_target_file("global/pg_control", false); + + write_target_range(buffer, 0, PG_CONTROL_SIZE); + + close_target_file(); +} |
