Sunday, August 31, 2008

Re: [PATCHES] [HACKERS] Infrastructure changes for recovery

Index: src/backend/access/transam/xlog.c
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/backend/access/transam/xlog.c,v
retrieving revision 1.317
diff -c -r1.317 xlog.c
*** src/backend/access/transam/xlog.c 11 Aug 2008 11:05:10 -0000 1.317
--- src/backend/access/transam/xlog.c 31 Aug 2008 19:07:40 -0000
***************
*** 131,137 ****
static bool recoveryTarget = false;
static bool recoveryTargetExact = false;
static bool recoveryTargetInclusive = true;
- static bool recoveryLogRestartpoints = false;
static TransactionId recoveryTargetXid;
static TimestampTz recoveryTargetTime;
static TimestampTz recoveryLastXTime = 0;
--- 131,136 ----
***************
*** 386,392 ****
static XLogRecord *nextRecord = NULL;
static TimeLineID lastPageTLI = 0;

! static bool InRedo = false;


static void XLogArchiveNotify(const char *xlog);
--- 385,391 ----
static XLogRecord *nextRecord = NULL;
static TimeLineID lastPageTLI = 0;

! bool InRedo = false;


static void XLogArchiveNotify(const char *xlog);
***************
*** 480,485 ****
--- 479,488 ----
bool doPageWrites;
bool isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);

+ /* cross-check on whether we should be here or not */
+ if (InRedo)
+ elog(FATAL, "cannot write new WAL data during recovery mode");
+
/* info's high bits are reserved for use by me */
if (info & XLR_INFO_MASK)
elog(PANIC, "invalid xlog info mask %02X", info);
***************
*** 2051,2057 ****
unlink(tmppath);
}

! elog(DEBUG2, "done creating and filling new WAL file");

/* Set flag to tell caller there was no existent file */
*use_existent = false;
--- 2054,2061 ----
unlink(tmppath);
}

! XLogFileName(tmppath, ThisTimeLineID, log, seg);
! elog(DEBUG2, "done creating and filling new WAL file %s", tmppath);

/* Set flag to tell caller there was no existent file */
*use_existent = false;
***************
*** 4532,4546 ****
}
else if (strcmp(tok1, "log_restartpoints") == 0)
{
- /*
- * does nothing if a recovery_target is not also set
- */
- if (!parse_bool(tok2, &recoveryLogRestartpoints))
- ereport(ERROR,
- (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("parameter \"log_restartpoints\" requires a Boolean value")));
ereport(LOG,
! (errmsg("log_restartpoints = %s", tok2)));
}
else
ereport(FATAL,
--- 4536,4544 ----
}
else if (strcmp(tok1, "log_restartpoints") == 0)
{
ereport(LOG,
! (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
! errmsg("parameter \"log_restartpoints\" has been deprecated")));
}
else
ereport(FATAL,
***************
*** 4811,4816 ****
--- 4809,4815 ----
CheckPoint checkPoint;
bool wasShutdown;
bool reachedStopPoint = false;
+ bool reachedSafeStopPoint = false;
bool haveBackupLabel = false;
XLogRecPtr RecPtr,
LastRec,
***************
*** 5039,5044 ****
--- 5038,5048 ----
UpdateControlFile();

/*
+ * Reset pgstat data, because it may be invalid after recovery.
+ */
+ pgstat_reset_all();
+
+ /*
* If there was a backup label file, it's done its job and the info
* has now been propagated into pg_control. We must get rid of the
* label file so that if we crash during recovery, we'll pick up at
***************
*** 5148,5153 ****
--- 5152,5172 ----

LastRec = ReadRecPtr;

+ /*
+ * Have we reached our safe stopping point? If so, we can
+ * signal Postmaster to enter consistent recovery mode
+ */
+ if (!reachedSafeStopPoint &&
+ XLByteLE(ControlFile->minRecoveryPoint, EndRecPtr))
+ {
+ reachedSafeStopPoint = true;
+ ereport(LOG,
+ (errmsg("consistent recovery state reached at %X/%X",
+ EndRecPtr.xlogid, EndRecPtr.xrecoff)));
+ if (IsUnderPostmaster)
+ SendPostmasterSignal(PMSIGNAL_RECOVERY_START);
+ }
+
record = ReadRecord(NULL, LOG);
} while (record != NULL && recoveryContinue);

***************
*** 5169,5174 ****
--- 5188,5194 ----
/* there are no WAL records following the checkpoint */
ereport(LOG,
(errmsg("redo is not required")));
+ reachedSafeStopPoint = true;
}
}

***************
*** 5184,5190 ****
* Complain if we did not roll forward far enough to render the backup
* dump consistent.
*/
! if (XLByteLT(EndOfLog, ControlFile->minRecoveryPoint))
{
if (reachedStopPoint) /* stopped because of stop request */
ereport(FATAL,
--- 5204,5210 ----
* Complain if we did not roll forward far enough to render the backup
* dump consistent.
*/
! if (InRecovery && !reachedSafeStopPoint)
{
if (reachedStopPoint) /* stopped because of stop request */
ereport(FATAL,
***************
*** 5305,5314 ****
*/
XLogCheckInvalidPages();

! /*
! * Reset pgstat data, because it may be invalid after recovery.
! */
! pgstat_reset_all();

/*
* Perform a checkpoint to update all our recovery activity to disk.
--- 5325,5332 ----
*/
XLogCheckInvalidPages();

! if (IsUnderPostmaster)
! BgWriterCompleteRestartPointImmediately();

/*
* Perform a checkpoint to update all our recovery activity to disk.
***************
*** 5318,5323 ****
--- 5336,5344 ----
* assigning a new TLI, using a shutdown checkpoint allows us to have
* the rule that TLI only changes in shutdown checkpoints, which
* allows some extra error checking in xlog_redo.
+ *
+ * Note that this will wait behind any restartpoint that the bgwriter
+ * is currently performing, though will be much faster as a result.
*/
CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
}
***************
*** 5372,5377 ****
--- 5393,5401 ----
readRecordBuf = NULL;
readRecordBufSize = 0;
}
+
+ if (IsUnderPostmaster)
+ BgWriterRecoveryComplete();
}

/*
***************
*** 5642,5648 ****
* Log end of a checkpoint.
*/
static void
! LogCheckpointEnd(void)
{
long write_secs,
sync_secs,
--- 5666,5672 ----
* Log end of a checkpoint.
*/
static void
! LogCheckpointEnd(bool checkpoint)
{
long write_secs,
sync_secs,
***************
*** 5665,5673 ****
CheckpointStats.ckpt_sync_end_t,
&sync_secs, &sync_usecs);

! elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
"%d transaction log file(s) added, %d removed, %d recycled; "
"write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
CheckpointStats.ckpt_bufs_written,
(double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
CheckpointStats.ckpt_segs_added,
--- 5689,5698 ----
CheckpointStats.ckpt_sync_end_t,
&sync_secs, &sync_usecs);

! elog(LOG, "%s complete: wrote %d buffers (%.1f%%); "
"%d transaction log file(s) added, %d removed, %d recycled; "
"write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
+ (checkpoint ? " checkpoint" : "restartpoint"),
CheckpointStats.ckpt_bufs_written,
(double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
CheckpointStats.ckpt_segs_added,
***************
*** 6002,6008 ****

/* All real work is done, but log before releasing lock. */
if (log_checkpoints)
! LogCheckpointEnd();

LWLockRelease(CheckpointLock);
}
--- 6027,6033 ----

/* All real work is done, but log before releasing lock. */
if (log_checkpoints)
! LogCheckpointEnd(true);

LWLockRelease(CheckpointLock);
}
***************
*** 6071,6099 ****
}
}

/*
! * OK, force data out to disk
! */
! CheckPointGuts(checkPoint->redo, CHECKPOINT_IMMEDIATE);
!
! /*
! * Update pg_control so that any subsequent crash will restart from this
! * checkpoint. Note: ReadRecPtr gives the XLOG address of the checkpoint
! * record itself.
*/
ControlFile->prevCheckPoint = ControlFile->checkPoint;
ControlFile->checkPoint = ReadRecPtr;
ControlFile->checkPointCopy = *checkPoint;
ControlFile->time = (pg_time_t) time(NULL);
UpdateControlFile();

! ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
(errmsg("recovery restart point at %X/%X",
checkPoint->redo.xlogid, checkPoint->redo.xrecoff)));
! if (recoveryLastXTime)
! ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
! (errmsg("last completed transaction was at log time %s",
! timestamptz_to_str(recoveryLastXTime))));
}

/*
--- 6096,6164 ----
}
}

+ if (recoveryLastXTime)
+ ereport((log_checkpoints ? LOG : DEBUG2),
+ (errmsg("last completed transaction was at log time %s",
+ timestamptz_to_str(recoveryLastXTime))));
/*
! * Update ControlFile data in shared memory.
! * Note: ReadRecPtr gives the XLOG address of the checkpoint record itself.
*/
+ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
ControlFile->prevCheckPoint = ControlFile->checkPoint;
ControlFile->checkPoint = ReadRecPtr;
ControlFile->checkPointCopy = *checkPoint;
+ RequestRestartPoint();
+ LWLockRelease(ControlFileLock);
+ }
+
+ /*
+ * As of 8.4, RestartPoints are always created by the bgwriter
+ */
+ void
+ CreateRestartPoint(void)
+ {
+ CheckPoint *checkPoint;
+
+ if (log_checkpoints)
+ {
+ /*
+ * Prepare to accumulate statistics.
+ */
+
+ MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
+ CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
+
+ elog(LOG, "restartpoint starting:");
+ }
+
+ LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
+
+ checkPoint = &ControlFile->checkPointCopy;
+
+ /*
+ * OK, write out dirty blocks smoothly (flags = 0, i.e. not CHECKPOINT_IMMEDIATE)
+ */
+ CheckPointGuts(checkPoint->redo, 0);
+
+ /*
+ * Update pg_control, using current time
+ */
+ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
ControlFile->time = (pg_time_t) time(NULL);
UpdateControlFile();
+ LWLockRelease(ControlFileLock);
+
+ /* All real work is done, but log before releasing lock; pass false so the message reads "restartpoint complete". */
+ if (log_checkpoints)
+ LogCheckpointEnd(false);

! ereport((log_checkpoints ? LOG : DEBUG2),
(errmsg("recovery restart point at %X/%X",
checkPoint->redo.xlogid, checkPoint->redo.xrecoff)));
!
! LWLockRelease(CheckpointLock);
!
}

/*
Index: src/backend/postmaster/bgwriter.c
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/backend/postmaster/bgwriter.c,v
retrieving revision 1.51
diff -c -r1.51 bgwriter.c
*** src/backend/postmaster/bgwriter.c 11 Aug 2008 11:05:11 -0000 1.51
--- src/backend/postmaster/bgwriter.c 31 Aug 2008 19:45:39 -0000
***************
*** 122,127 ****
--- 122,129 ----
{
pid_t bgwriter_pid; /* PID of bgwriter (0 if not started) */

+ bool InRedo;
+
slock_t ckpt_lck; /* protects all the ckpt_* fields */

int ckpt_started; /* advances when checkpoint starts */
***************
*** 166,172 ****

/* these values are valid when ckpt_active is true: */
static pg_time_t ckpt_start_time;
! static XLogRecPtr ckpt_start_recptr;
static double ckpt_cached_elapsed;

static pg_time_t last_checkpoint_time;
--- 168,174 ----

/* these values are valid when ckpt_active is true: */
static pg_time_t ckpt_start_time;
! static XLogRecPtr ckpt_start_recptr; /* not used if InRedo */
static double ckpt_cached_elapsed;

static pg_time_t last_checkpoint_time;
***************
*** 186,191 ****
--- 188,208 ----
static void ReqCheckpointHandler(SIGNAL_ARGS);
static void ReqShutdownHandler(SIGNAL_ARGS);

+ /* notify bgwriter of change in mode: clears the shared-memory InRedo flag, which the bgwriter main loop polls to leave recovery mode */
+ void
+ BgWriterRecoveryComplete(void)
+ {
+ BgWriterShmem->InRedo = false;
+ elog(DEBUG1, "recovery complete");
+ }
+
+ /* ask bgwriter to complete any restartpoint, if any, with zero delay, by overwriting the shared ckpt_flags with CHECKPOINT_IMMEDIATE. NOTE(review): written without taking ckpt_lck -- confirm this is safe */
+ void
+ BgWriterCompleteRestartPointImmediately(void)
+ {
+ BgWriterShmem->ckpt_flags = CHECKPOINT_IMMEDIATE;
+ elog(DEBUG2, "asking bgwriter to complete any restartpoint with zero delay");
+ }

/*
* Main entry point for bgwriter process
***************
*** 202,207 ****
--- 219,230 ----
BgWriterShmem->bgwriter_pid = MyProcPid;
am_bg_writer = true;

+ /*
+ * Follow the postmaster's current mode at startup. If we are InRedo then
+ * the startup process will later tell us when it is complete.
+ */
+ BgWriterShmem->InRedo = InRedo;
+
/*
* If possible, make this process a group leader, so that the postmaster
* can signal any child processes too. (bgwriter probably never has any
***************
*** 356,371 ****
*/
PG_SETMASK(&UnBlockSig);

/*
* Loop forever
*/
for (;;)
{
- bool do_checkpoint = false;
- int flags = 0;
- pg_time_t now;
- int elapsed_secs;
-
/*
* Emergency bailout if postmaster has died. This is to avoid the
* necessity for manual cleanup of all postmaster children.
--- 379,393 ----
*/
PG_SETMASK(&UnBlockSig);

+ if (InRedo)
+ elog(DEBUG1, "bgwriter starting in recovery mode, pid = %u",
+ BgWriterShmem->bgwriter_pid);
+
/*
* Loop forever
*/
for (;;)
{
/*
* Emergency bailout if postmaster has died. This is to avoid the
* necessity for manual cleanup of all postmaster children.
***************
*** 383,501 ****
got_SIGHUP = false;
ProcessConfigFile(PGC_SIGHUP);
}
- if (checkpoint_requested)
- {
- checkpoint_requested = false;
- do_checkpoint = true;
- BgWriterStats.m_requested_checkpoints++;
- }
- if (shutdown_requested)
- {
- /*
- * From here on, elog(ERROR) should end with exit(1), not send
- * control back to the sigsetjmp block above
- */
- ExitOnAnyError = true;
- /* Close down the database */
- ShutdownXLOG(0, 0);
- DumpFreeSpaceMap(0, 0);
- /* Normal exit from the bgwriter is here */
- proc_exit(0); /* done */
- }
-
- /*
- * Force a checkpoint if too much time has elapsed since the last one.
- * Note that we count a timed checkpoint in stats only when this
- * occurs without an external request, but we set the CAUSE_TIME flag
- * bit even if there is also an external request.
- */
- now = (pg_time_t) time(NULL);
- elapsed_secs = now - last_checkpoint_time;
- if (elapsed_secs >= CheckPointTimeout)
- {
- if (!do_checkpoint)
- BgWriterStats.m_timed_checkpoints++;
- do_checkpoint = true;
- flags |= CHECKPOINT_CAUSE_TIME;
- }

! /*
! * Do a checkpoint if requested, otherwise do one cycle of
! * dirty-buffer writing.
! */
! if (do_checkpoint)
{
- /* use volatile pointer to prevent code rearrangement */
- volatile BgWriterShmemStruct *bgs = BgWriterShmem;
-
/*
! * Atomically fetch the request flags to figure out what kind of a
! * checkpoint we should perform, and increase the started-counter
! * to acknowledge that we've started a new checkpoint.
*/
! SpinLockAcquire(&bgs->ckpt_lck);
! flags |= bgs->ckpt_flags;
! bgs->ckpt_flags = 0;
! bgs->ckpt_started++;
! SpinLockRelease(&bgs->ckpt_lck);

! /*
! * We will warn if (a) too soon since last checkpoint (whatever
! * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag
! * since the last checkpoint start. Note in particular that this
! * implementation will not generate warnings caused by
! * CheckPointTimeout < CheckPointWarning.
! */
! if ((flags & CHECKPOINT_CAUSE_XLOG) &&
! elapsed_secs < CheckPointWarning)
! ereport(LOG,
! (errmsg("checkpoints are occurring too frequently (%d seconds apart)",
! elapsed_secs),
! errhint("Consider increasing the configuration parameter \"checkpoint_segments\".")));

! /*
! * Initialize bgwriter-private variables used during checkpoint.
! */
! ckpt_active = true;
! ckpt_start_recptr = GetInsertRecPtr();
! ckpt_start_time = now;
! ckpt_cached_elapsed = 0;

! /*
! * Do the checkpoint.
! */
! CreateCheckPoint(flags);

/*
! * After any checkpoint, close all smgr files. This is so we
! * won't hang onto smgr references to deleted files indefinitely.
*/
! smgrcloseall();

/*
! * Indicate checkpoint completion to any waiting backends.
*/
! SpinLockAcquire(&bgs->ckpt_lck);
! bgs->ckpt_done = bgs->ckpt_started;
! SpinLockRelease(&bgs->ckpt_lck);
!
! ckpt_active = false;

/*
! * Note we record the checkpoint start time not end time as
! * last_checkpoint_time. This is so that time-driven checkpoints
! * happen at a predictable spacing.
*/
! last_checkpoint_time = now;
! }
! else
! BgBufferSync();

! /* Check for archive_timeout and switch xlog files if necessary. */
! CheckArchiveTimeout();

! /* Nap for the configured time. */
! BgWriterNap();
}
}

--- 405,610 ----
got_SIGHUP = false;
ProcessConfigFile(PGC_SIGHUP);
}

! if (InRedo)
{
/*
! * Check to see whether startup process has completed redo.
! * If so, we can permanently change out of recovery mode.
*/
! if (BgWriterShmem->InRedo == false)
! {

! elog(LOG, "changing to InRedo = false");

! InitXLOGAccess();
! InRedo = false;

! /*
! * Start time-driven events from now
! */
! last_checkpoint_time = last_xlog_switch_time = (pg_time_t) time(NULL);
! }
!
! if (checkpoint_requested)
! {
! /*
! * Initialize bgwriter-private variables used during checkpoint.
! */
! ckpt_active = true;
! ckpt_start_time = (pg_time_t) time(NULL);
! ckpt_cached_elapsed = 0;
!
! CreateRestartPoint();
!
! ckpt_active = false;
! checkpoint_requested = false;
! /*
! * Reset any flags if we requested immediate completion part
! * way through the restart point
! */
! BgWriterShmem->ckpt_flags = 0;
! }
! else
! {
! /* Clean buffers dirtied by recovery */
! BgBufferSync();
!
! /* Nap for the configured time. */
! BgWriterNap();
! }
!
! if (shutdown_requested)
! {
! /*
! * From here on, elog(ERROR) should end with exit(1), not send
! * control back to the sigsetjmp block above
! */
! ExitOnAnyError = true;
! /* Normal exit from the bgwriter is here */
! proc_exit(0); /* done */
! }

/*
! * Check to see whether startup process has completed redo.
! * If so, we can permanently change out of recovery mode.
*/
! if (BgWriterShmem->InRedo == false)
! {
! elog(DEBUG2, "changing to InRedo = false");
!
! InitXLOGAccess();
! InRedo = false;
!
! /*
! * Start time-driven events from now
! */
! last_checkpoint_time = last_xlog_switch_time = (pg_time_t) time(NULL);
! }
! }
! else /* Normal processing */
! {
! bool do_checkpoint = false;
! int flags = 0;
! pg_time_t now;
! int elapsed_secs;
!
! Assert(!InRedo);
!
! if (checkpoint_requested)
! {
! checkpoint_requested = false;
! do_checkpoint = true;
! BgWriterStats.m_requested_checkpoints++;
! }
! if (shutdown_requested)
! {
! /*
! * From here on, elog(ERROR) should end with exit(1), not send
! * control back to the sigsetjmp block above
! */
! ExitOnAnyError = true;
! /* Close down the database */
! ShutdownXLOG(0, 0);
! DumpFreeSpaceMap(0, 0);
! /* Normal exit from the bgwriter is here */
! proc_exit(0); /* done */
! }

/*
! * Force a checkpoint if too much time has elapsed since the last one.
! * Note that we count a timed checkpoint in stats only when this
! * occurs without an external request, but we set the CAUSE_TIME flag
! * bit even if there is also an external request.
*/
! now = (pg_time_t) time(NULL);
! elapsed_secs = now - last_checkpoint_time;
! if (elapsed_secs >= CheckPointTimeout)
! {
! if (!do_checkpoint)
! BgWriterStats.m_timed_checkpoints++;
! do_checkpoint = true;
! flags |= CHECKPOINT_CAUSE_TIME;
! }

/*
! * Do a checkpoint if requested, otherwise do one cycle of
! * dirty-buffer writing.
*/
! if (do_checkpoint)
! {
! /* use volatile pointer to prevent code rearrangement */
! volatile BgWriterShmemStruct *bgs = BgWriterShmem;
!
! /*
! * Atomically fetch the request flags to figure out what kind of a
! * checkpoint we should perform, and increase the started-counter
! * to acknowledge that we've started a new checkpoint.
! */
! SpinLockAcquire(&bgs->ckpt_lck);
! flags |= bgs->ckpt_flags;
! bgs->ckpt_flags = 0;
! bgs->ckpt_started++;
! SpinLockRelease(&bgs->ckpt_lck);
!
! /*
! * We will warn if (a) too soon since last checkpoint (whatever
! * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag
! * since the last checkpoint start. Note in particular that this
! * implementation will not generate warnings caused by
! * CheckPointTimeout < CheckPointWarning.
! */
! if ((flags & CHECKPOINT_CAUSE_XLOG) &&
! elapsed_secs < CheckPointWarning)
! ereport(LOG,
! (errmsg("checkpoints are occurring too frequently (%d seconds apart)",
! elapsed_secs),
! errhint("Consider increasing the configuration parameter \"checkpoint_segments\".")));
!
! /*
! * Initialize bgwriter-private variables used during checkpoint.
! */
! ckpt_active = true;
! ckpt_start_recptr = GetInsertRecPtr();
! ckpt_start_time = now;
! ckpt_cached_elapsed = 0;
!
! /*
! * Do the checkpoint.
! */
! CreateCheckPoint(flags);
!
! /*
! * After any checkpoint, close all smgr files. This is so we
! * won't hang onto smgr references to deleted files indefinitely.
! */
! smgrcloseall();
!
! /*
! * Indicate checkpoint completion to any waiting backends.
! */
! SpinLockAcquire(&bgs->ckpt_lck);
! bgs->ckpt_done = bgs->ckpt_started;
! SpinLockRelease(&bgs->ckpt_lck);
!
! ckpt_active = false;
!
! /*
! * Note we record the checkpoint start time not end time as
! * last_checkpoint_time. This is so that time-driven checkpoints
! * happen at a predictable spacing.
! */
! last_checkpoint_time = now;
! }
! else
! BgBufferSync();

! /* Check for archive_timeout and switch xlog files if necessary. */
! CheckArchiveTimeout();

! /* Nap for the configured time. */
! BgWriterNap();
! }
}
}

***************
*** 588,594 ****
(ckpt_active ? ImmediateCheckpointRequested() : checkpoint_requested))
break;
pg_usleep(1000000L);
! AbsorbFsyncRequests();
udelay -= 1000000L;
}

--- 697,704 ----
(ckpt_active ? ImmediateCheckpointRequested() : checkpoint_requested))
break;
pg_usleep(1000000L);
! if (!InRedo)
! AbsorbFsyncRequests();
udelay -= 1000000L;
}

***************
*** 642,647 ****
--- 752,770 ----
if (!am_bg_writer)
return;

+ /* Perform minimal duties during recovery and skip wait if requested */
+ if (InRedo)
+ {
+ BgBufferSync();
+
+ if (!ImmediateCheckpointRequested() &&
+ !shutdown_requested &&
+ IsCheckpointOnSchedule(progress))
+ BgWriterNap();
+
+ return;
+ }
+
/*
* Perform the usual bgwriter duties and take a nap, unless we're behind
* schedule, in which case we just try to catch up as quickly as possible.
***************
*** 716,731 ****
* However, it's good enough for our purposes, we're only calculating an
* estimate anyway.
*/
! recptr = GetInsertRecPtr();
! elapsed_xlogs =
! (((double) (int32) (recptr.xlogid - ckpt_start_recptr.xlogid)) * XLogSegsPerFile +
! ((double) recptr.xrecoff - (double) ckpt_start_recptr.xrecoff) / XLogSegSize) /
! CheckPointSegments;
!
! if (progress < elapsed_xlogs)
{
! ckpt_cached_elapsed = elapsed_xlogs;
! return false;
}

/*
--- 839,857 ----
* However, it's good enough for our purposes, we're only calculating an
* estimate anyway.
*/
! if (!InRedo)
{
! recptr = GetInsertRecPtr();
! elapsed_xlogs =
! (((double) (int32) (recptr.xlogid - ckpt_start_recptr.xlogid)) * XLogSegsPerFile +
! ((double) recptr.xrecoff - (double) ckpt_start_recptr.xrecoff) / XLogSegSize) /
! CheckPointSegments;
!
! if (progress < elapsed_xlogs)
! {
! ckpt_cached_elapsed = elapsed_xlogs;
! return false;
! }
}

/*
***************
*** 966,971 ****
--- 1092,1118 ----
}
}

+ void
+ RequestRestartPoint(void)
+ {
+ /*
+ * Request a restartpoint. If in a standalone backend, just do it ourselves.
+ */
+ if (!IsPostmasterEnvironment)
+ {
+ CreateRestartPoint();
+ return;
+ }
+
+ /*
+ * Send SIGINT to the bgwriter to request a restartpoint. Never call kill() with pid 0: that would signal our whole process group.
+ */
+ if (BgWriterShmem->bgwriter_pid == 0)
+ elog(LOG, "could not request restartpoint because bgwriter not running");
+ else if (kill(BgWriterShmem->bgwriter_pid, SIGINT) != 0)
+ elog(LOG, "could not signal for restartpoint: %m");
+ }
+
/*
* ForwardFsyncRequest
* Forward a file-fsync request from a backend to the bgwriter
Index: src/backend/postmaster/postmaster.c
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/backend/postmaster/postmaster.c,v
retrieving revision 1.561
diff -c -r1.561 postmaster.c
*** src/backend/postmaster/postmaster.c 26 Jun 2008 02:47:19 -0000 1.561
--- src/backend/postmaster/postmaster.c 31 Aug 2008 17:10:31 -0000
***************
*** 254,259 ****
--- 254,264 ----
{
PM_INIT, /* postmaster starting */
PM_STARTUP, /* waiting for startup subprocess */
+ PM_RECOVERY, /* consistent recovery mode; state only
+ * entered for archive and streaming recovery,
+ * and only after the point where
+ * all data is in a consistent state.
+ */
PM_RUN, /* normal "database is alive" state */
PM_WAIT_BACKUP, /* waiting for online backup mode to end */
PM_WAIT_BACKENDS, /* waiting for live backends to exit */
***************
*** 2104,2110 ****
if (pid == StartupPID)
{
StartupPID = 0;
! Assert(pmState == PM_STARTUP);

/* FATAL exit of startup is treated as catastrophic */
if (!EXIT_STATUS_0(exitstatus))
--- 2109,2115 ----
if (pid == StartupPID)
{
StartupPID = 0;
! Assert(pmState == PM_STARTUP || pmState == PM_RECOVERY);

/* FATAL exit of startup is treated as catastrophic */
if (!EXIT_STATUS_0(exitstatus))
***************
*** 2136,2141 ****
--- 2141,2147 ----
* Otherwise, commence normal operations.
*/
pmState = PM_RUN;
+ InRedo = false;

/*
* Load the flat authorization file into postmaster's cache. The
***************
*** 2148,2155 ****
* Crank up the background writer. It doesn't matter if this
* fails, we'll just try again later.
*/
! Assert(BgWriterPID == 0);
! BgWriterPID = StartBackgroundWriter();

/*
* Likewise, start other special children as needed. In a restart
--- 2154,2161 ----
* Crank up the background writer. It doesn't matter if this
* fails, we'll just try again later.
*/
! if (BgWriterPID == 0)
! BgWriterPID = StartBackgroundWriter();

/*
* Likewise, start other special children as needed. In a restart
***************
*** 2812,2817 ****
--- 2818,2825 ----
*/
MyCancelKey = PostmasterRandom();

+ InRedo = (pmState != PM_RUN);
+
/*
* Make room for backend data structure. Better before the fork() so we
* can handle failure cleanly.
***************
*** 3821,3826 ****
--- 3829,3880 ----

PG_SETMASK(&BlockSig);

+ if (CheckPostmasterSignal(PMSIGNAL_RECOVERY_START))
+ {
+ Assert(pmState == PM_STARTUP);
+
+ /*
+ * Go to shutdown mode if a shutdown request was pending.
+ */
+ if (Shutdown > NoShutdown)
+ {
+ pmState = PM_WAIT_BACKENDS;
+ /* PostmasterStateMachine logic does the rest */
+ }
+ else
+ {
+ /*
+ * Startup process has entered recovery
+ */
+ pmState = PM_RECOVERY;
+ InRedo = true;
+
+ /*
+ * Load the flat authorization file into postmaster's cache. The
+ * startup process won't have recomputed this from the database yet,
+ * so it may change following recovery.
+ */
+ load_role();
+
+ /*
+ * Crank up the background writer. It doesn't matter if this
+ * fails, we'll just try again later.
+ */
+ Assert(BgWriterPID == 0);
+ BgWriterPID = StartBackgroundWriter();
+
+ /*
+ * Likewise, start other special children as needed.
+ */
+ Assert(PgStatPID == 0);
+ PgStatPID = pgstat_start();
+
+ /* XXX at this point we could accept read-only connections */
+ ereport(DEBUG1,
+ (errmsg("database system is in consistent recovery mode")));
+ }
+ }
+
if (CheckPostmasterSignal(PMSIGNAL_PASSWORD_CHANGE))
{
/*
Index: src/include/miscadmin.h
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/include/miscadmin.h,v
retrieving revision 1.202
diff -c -r1.202 miscadmin.h
*** src/include/miscadmin.h 23 Apr 2008 13:44:59 -0000 1.202
--- src/include/miscadmin.h 31 Aug 2008 11:23:45 -0000
***************
*** 64,69 ****
--- 64,71 ----
*
*****************************************************************************/

+ extern bool InRedo;
+
/* in globals.c */
/* these are marked volatile because they are set by signal handlers: */
extern PGDLLIMPORT volatile bool InterruptPending;
Index: src/include/access/xlog.h
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/include/access/xlog.h,v
retrieving revision 1.88
diff -c -r1.88 xlog.h
*** src/include/access/xlog.h 12 May 2008 08:35:05 -0000 1.88
--- src/include/access/xlog.h 31 Aug 2008 13:33:43 -0000
***************
*** 205,210 ****
--- 205,211 ----
extern void ShutdownXLOG(int code, Datum arg);
extern void InitXLOGAccess(void);
extern void CreateCheckPoint(int flags);
+ extern void CreateRestartPoint(void);
extern void XLogPutNextOid(Oid nextOid);
extern XLogRecPtr GetRedoRecPtr(void);
extern XLogRecPtr GetInsertRecPtr(void);
Index: src/include/postmaster/bgwriter.h
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/include/postmaster/bgwriter.h,v
retrieving revision 1.12
diff -c -r1.12 bgwriter.h
*** src/include/postmaster/bgwriter.h 11 Aug 2008 11:05:11 -0000 1.12
--- src/include/postmaster/bgwriter.h 31 Aug 2008 15:35:30 -0000
***************
*** 25,36 ****
--- 25,40 ----
extern void BackgroundWriterMain(void);

extern void RequestCheckpoint(int flags);
+ extern void RequestRestartPoint(void);
extern void CheckpointWriteDelay(int flags, double progress);

extern bool ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum,
BlockNumber segno);
extern void AbsorbFsyncRequests(void);

+ extern void BgWriterRecoveryComplete(void);
+ extern void BgWriterCompleteRestartPointImmediately(void);
+
extern Size BgWriterShmemSize(void);
extern void BgWriterShmemInit(void);

Index: src/include/storage/pmsignal.h
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/include/storage/pmsignal.h,v
retrieving revision 1.20
diff -c -r1.20 pmsignal.h
*** src/include/storage/pmsignal.h 19 Jun 2008 21:32:56 -0000 1.20
--- src/include/storage/pmsignal.h 31 Aug 2008 11:26:40 -0000
***************
*** 22,27 ****
--- 22,28 ----
*/
typedef enum
{
+ PMSIGNAL_RECOVERY_START, /* move to PM_RECOVERY state */
PMSIGNAL_PASSWORD_CHANGE, /* pg_auth file has changed */
PMSIGNAL_WAKEN_ARCHIVER, /* send a NOTIFY signal to xlog archiver */
PMSIGNAL_ROTATE_LOGFILE, /* send SIGUSR1 to syslogger to rotate logfile */
On Thu, 2008-08-07 at 12:44 +0100, Simon Riggs wrote:
> I would like to propose some changes to the infrastructure for recovery.
> These changes are beneficial in themselves, but also form the basis for
> other work we might later contemplate.
>
> Currently
> * the startup process performs restartpoints during recovery
> * the death of the startup process is tied directly to the change of
> state in the postmaster following recovery
>
> I propose to
> * have startup process signal postmaster when it starts Redo phase (if
> it starts it)

> Decoupling things in this way allows us to
> 1. arrange for the bgwriter to start during Redo, so it can:
> i) clean dirty blocks for the startup process
> ii) perform restartpoints in background
> Both of these aspects will increase performance of recovery

Taking into account comments from Tom and Alvaro

Included patch with the following changes:

* new postmaster mode known as consistent recovery, entered only when
recovery passes safe/consistent point. InRedo is now set in all
processes when started/connected in consistent recovery mode.

* bgwriter and stats process start in consistent recovery mode.
bgwriter changes mode when startup process completes.

* bgwriter now performs restartpoints and also cleans shared_buffers
while the startup process performs redo apply

* recovery.conf parameter log_restartpoints is now deprecated, since
function overlaps with log_checkpoints too much. I've kept the
distinction between restartpoints and checkpoints in code, to avoid
convoluted code. Minor change, not critical.

[Replying to one of Alvaro's other comments: Startup process still uses
XLogReadBuffer. I'm not planning on changing that either, at least not
in this patch.]

Patch doesn't conflict with rmgr plugin patch.

Passes make check, but that's easy.
Various other tests all seem to be working.

--
Simon Riggs www.2ndQuadrant.com
PostgreSQL Training, Services and Support

No comments: