Re: Hot standby, recovery infra
| От | Heikki Linnakangas |
|---|---|
| Тема | Re: Hot standby, recovery infra |
| Дата | |
| Msg-id | 499047FE.9090407@enterprisedb.com обсуждение исходный текст |
| Ответ на | Re: Hot standby, recovery infra (Simon Riggs <simon@2ndQuadrant.com>) |
| Ответы |
Re: Hot standby, recovery infra
|
| Список | pgsql-hackers |
Simon Riggs wrote:
> On Fri, 2009-02-06 at 10:06 +0200, Heikki Linnakangas wrote:
>> Simon Riggs wrote:
>>> On Thu, 2009-02-05 at 21:54 +0200, Heikki Linnakangas wrote:
>>>> - If you perform a fast shutdown while startup process is waiting for
>>>> the restore command, startup process sometimes throws a FATAL error
which escalates into an immediate shutdown. That leads to
>>>> different messages in the logs, and skipping of the shutdown
>>>> restartpoint that we now otherwise perform.
>>> Sometimes?
>> I think what happens is that if the restore command receives the SIGTERM
>> and dies before the startup process that's waiting for the restore
>> command receives the SIGTERM, the startup process throws a FATAL error
>> because the restore command died unexpectedly. I put this
>>
>>> if (shutdown_requested && InRedo)
>>> {
>>> /* XXX: Is EndRecPtr always the right value here? */
>>> UpdateMinRecoveryPoint(EndRecPtr);
>>> proc_exit(0);
>>> }
>> right after the "system(xlogRestoreCmd)" call, to exit gracefully if we
>> were requested to shut down while restore command was running, but it
>> seems that that's not enough because of the race condition.
>
> Can we trap the death of the restorecmd and handle it differently from
> the death of the startup process?
The startup process launches the restore command, so it's the startup
process that needs to handle its death.
Anyway, I think I've found a solution. While we're executing the restore
command, we're in a state that it's safe to proc_exit(0). We can set a
flag to indicate to the signal handler when we're executing the restore
command, so that the signal handler can do proc_exit(0) on SIGTERM. So
if the startup process receives the SIGTERM first, it will proc_exit(0)
immediately, and if the restore command dies first due to the SIGTERM,
startup process exits with proc_exit(0) when it sees that restore
command exited because of the SIGTERM. If either process receives
SIGTERM for some other reason than a fast shutdown request, postmaster
will see that the startup process exited unexpectedly, and handle that
like a child process crash.
Attached is an updated patch that does that, and I've fixed all the
other outstanding issues I listed earlier as well. Now I'm feeling again
that this is in pretty good shape.
--
Heikki Linnakangas
EnterpriseDB http://www.enterprisedb.com
*** a/src/backend/access/transam/xlog.c
--- b/src/backend/access/transam/xlog.c
***************
*** 36,41 ****
--- 36,42 ----
#include "catalog/pg_control.h"
#include "catalog/pg_type.h"
#include "funcapi.h"
+ #include "libpq/pqsignal.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "postmaster/bgwriter.h"
***************
*** 47,52 ****
--- 48,54 ----
#include "storage/smgr.h"
#include "storage/spin.h"
#include "utils/builtins.h"
+ #include "utils/flatfiles.h"
#include "utils/guc.h"
#include "utils/ps_status.h"
#include "pg_trace.h"
***************
*** 119,130 **** CheckpointStatsData CheckpointStats;
*/
TimeLineID ThisTimeLineID = 0;
! /* Are we doing recovery from XLOG? */
bool InRecovery = false;
/* Are we recovering using offline XLOG archives? */
static bool InArchiveRecovery = false;
/* Was the last xlog file restored from archive, or local? */
static bool restoredFromArchive = false;
--- 121,146 ----
*/
TimeLineID ThisTimeLineID = 0;
! /*
! * Are we doing recovery from XLOG?
! *
! * This is only ever true in the startup process, when it's replaying WAL.
! * It's used in functions that need to act differently when called from a
! * redo function (e.g. skip WAL logging). To check whether the system is in
! * recovery regardless of what process you're running in, use
! * IsRecoveryProcessingMode().
! */
bool InRecovery = false;
/* Are we recovering using offline XLOG archives? */
static bool InArchiveRecovery = false;
+ /*
+ * Local copy of shared RecoveryProcessingMode variable. True actually
+ * means "not known, need to check the shared state"
+ */
+ static bool LocalRecoveryProcessingMode = true;
+
/* Was the last xlog file restored from archive, or local? */
static bool restoredFromArchive = false;
***************
*** 133,139 **** static char *recoveryRestoreCommand = NULL;
static bool recoveryTarget = false;
static bool recoveryTargetExact = false;
static bool recoveryTargetInclusive = true;
- static bool recoveryLogRestartpoints = false;
static TransactionId recoveryTargetXid;
static TimestampTz recoveryTargetTime;
static TimestampTz recoveryLastXTime = 0;
--- 149,154 ----
***************
*** 242,250 **** static XLogRecPtr RedoRecPtr;
* ControlFileLock: must be held to read/update control file or create
* new log file.
*
! * CheckpointLock: must be held to do a checkpoint (ensures only one
! * checkpointer at a time; currently, with all checkpoints done by the
! * bgwriter, this is just pro forma).
*
*----------
*/
--- 257,264 ----
* ControlFileLock: must be held to read/update control file or create
* new log file.
*
! * CheckpointLock: must be held to do a checkpoint or restartpoint (ensures
! * only one checkpointer at a time)
*
*----------
*/
***************
*** 313,318 **** typedef struct XLogCtlData
--- 327,351 ----
int XLogCacheBlck; /* highest allocated xlog buffer index */
TimeLineID ThisTimeLineID;
+ /*
+ * SharedRecoveryProcessingMode indicates if we're still in crash or
+ * archive recovery. It's checked by IsRecoveryProcessingMode().
+ */
+ bool SharedRecoveryProcessingMode;
+
+ /*
+ * During recovery, we keep a copy of the latest checkpoint record
+ * here. Used by the background writer when it wants to create
+ * a restartpoint.
+ *
+ * Protected by info_lck.
+ */
+ XLogRecPtr lastCheckPointRecPtr;
+ CheckPoint lastCheckPoint;
+
+ /* end+1 of the last record replayed (or being replayed) */
+ XLogRecPtr replayEndRecPtr;
+
slock_t info_lck; /* locks shared variables shown above */
} XLogCtlData;
***************
*** 387,395 **** static XLogRecPtr ReadRecPtr; /* start of last record read */
--- 420,440 ----
static XLogRecPtr EndRecPtr; /* end+1 of last record read */
static XLogRecord *nextRecord = NULL;
static TimeLineID lastPageTLI = 0;
+ static XLogRecPtr minRecoveryPoint; /* local copy of ControlFile->minRecoveryPoint */
+ static bool updateMinRecoveryPoint = true;
static bool InRedo = false;
+ /*
+ * Flag set by interrupt handlers for later service in the redo loop.
+ */
+ static volatile sig_atomic_t shutdown_requested = false;
+ /*
+ * Flag set when executing a restore command, to tell SIGTERM signal handler
+ * that it's safe to just proc_exit(0).
+ */
+ static volatile sig_atomic_t in_restore_command = false;
+
static void XLogArchiveNotify(const char *xlog);
static void XLogArchiveNotifySeg(uint32 log, uint32 seg);
***************
*** 420,425 **** static void PreallocXlogFiles(XLogRecPtr endptr);
--- 465,471 ----
static void RemoveOldXlogFiles(uint32 log, uint32 seg, XLogRecPtr endptr);
static void ValidateXLOGDirectoryStructure(void);
static void CleanupBackupHistory(void);
+ static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
static XLogRecord *ReadRecord(XLogRecPtr *RecPtr, int emode);
static bool ValidXLOGHeader(XLogPageHeader hdr, int emode);
static XLogRecord *ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt);
***************
*** 484,489 **** XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
--- 530,539 ----
bool doPageWrites;
bool isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
+ /* cross-check on whether we should be here or not */
+ if (IsRecoveryProcessingMode())
+ elog(FATAL, "cannot make new WAL entries during recovery");
+
/* info's high bits are reserved for use by me */
if (info & XLR_INFO_MASK)
elog(PANIC, "invalid xlog info mask %02X", info);
***************
*** 1718,1723 **** XLogSetAsyncCommitLSN(XLogRecPtr asyncCommitLSN)
--- 1768,1830 ----
}
/*
+ * Advance minRecoveryPoint in control file.
+ *
+ * If we crash during recovery, we must reach this point again before the
+ * database is consistent.
+ *
+ * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
+ * is only updated if it's currently less than 'lsn'.
+ */
+ static void
+ UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
+ {
+ /* Quick check using our local copy of the variable */
+ if (!updateMinRecoveryPoint || (!force && XLByteLE(lsn, minRecoveryPoint)))
+ return;
+
+ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+
+ /* update local copy */
+ minRecoveryPoint = ControlFile->minRecoveryPoint;
+
+ /*
+ * An invalid minRecoveryPoint means that we need to recover all the WAL,
+ * ie. crash recovery. Don't update the control file in that case.
+ */
+ if (minRecoveryPoint.xlogid == 0 && minRecoveryPoint.xrecoff == 0)
+ updateMinRecoveryPoint = false;
+ else if (force || XLByteLT(minRecoveryPoint, lsn))
+ {
+ /* use volatile pointer to prevent code rearrangement */
+ volatile XLogCtlData *xlogctl = XLogCtl;
+ XLogRecPtr newMinRecoveryPoint;
+
+ /*
+ * To avoid having to update the control file too often, we update
+ * it all the way to the last record being replayed, even though 'lsn'
+ * would suffice for correctness.
+ */
+ SpinLockAcquire(&xlogctl->info_lck);
+ newMinRecoveryPoint = xlogctl->replayEndRecPtr;
+ SpinLockRelease(&xlogctl->info_lck);
+
+ /* update control file */
+ if (XLByteLT(ControlFile->minRecoveryPoint, newMinRecoveryPoint))
+ {
+ ControlFile->minRecoveryPoint = newMinRecoveryPoint;
+ UpdateControlFile();
+ minRecoveryPoint = newMinRecoveryPoint;
+ }
+
+ ereport(DEBUG2,
+ (errmsg("updated min recovery point to %X/%X",
+ minRecoveryPoint.xlogid, minRecoveryPoint.xrecoff)));
+ }
+ LWLockRelease(ControlFileLock);
+ }
+
+ /*
* Ensure that all XLOG data through the given position is flushed to disk.
*
* NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
***************
*** 1729,1737 **** XLogFlush(XLogRecPtr record)
XLogRecPtr WriteRqstPtr;
XLogwrtRqst WriteRqst;
! /* Disabled during REDO */
! if (InRedo)
return;
/* Quick exit if already known flushed */
if (XLByteLE(record, LogwrtResult.Flush))
--- 1836,1850 ----
XLogRecPtr WriteRqstPtr;
XLogwrtRqst WriteRqst;
! /*
! * During REDO, we don't try to flush the WAL, but update minRecoveryPoint
! * instead.
! */
! if (IsRecoveryProcessingMode())
! {
! UpdateMinRecoveryPoint(record, false);
return;
+ }
/* Quick exit if already known flushed */
if (XLByteLE(record, LogwrtResult.Flush))
***************
*** 1818,1826 **** XLogFlush(XLogRecPtr record)
* the bad page is encountered again during recovery then we would be
* unable to restart the database at all! (This scenario has actually
* happened in the field several times with 7.1 releases. Note that we
! * cannot get here while InRedo is true, but if the bad page is brought in
! * and marked dirty during recovery then CreateCheckPoint will try to
! * flush it at the end of recovery.)
*
* The current approach is to ERROR under normal conditions, but only
* WARNING during recovery, so that the system can be brought up even if
--- 1931,1939 ----
* the bad page is encountered again during recovery then we would be
* unable to restart the database at all! (This scenario has actually
* happened in the field several times with 7.1 releases. Note that we
! * cannot get here while IsRecoveryProcessingMode(), but if the bad page is
! * brought in and marked dirty during recovery, then a checkpoint
! * performed at the end of recovery would try to flush it.
*
* The current approach is to ERROR under normal conditions, but only
* WARNING during recovery, so that the system can be brought up even if
***************
*** 1857,1862 **** XLogBackgroundFlush(void)
--- 1970,1979 ----
XLogRecPtr WriteRqstPtr;
bool flexible = true;
+ /* XLOG doesn't need flushing during recovery */
+ if (IsRecoveryProcessingMode())
+ return;
+
/* read LogwrtResult and update local state */
{
/* use volatile pointer to prevent code rearrangement */
***************
*** 1928,1933 **** XLogAsyncCommitFlush(void)
--- 2045,2054 ----
/* use volatile pointer to prevent code rearrangement */
volatile XLogCtlData *xlogctl = XLogCtl;
+ /* There's no asynchronously committed transactions during recovery */
+ if (IsRecoveryProcessingMode())
+ return;
+
SpinLockAcquire(&xlogctl->info_lck);
WriteRqstPtr = xlogctl->asyncCommitLSN;
SpinLockRelease(&xlogctl->info_lck);
***************
*** 1944,1949 **** XLogAsyncCommitFlush(void)
--- 2065,2074 ----
bool
XLogNeedsFlush(XLogRecPtr record)
{
+ /* XLOG doesn't need flushing during recovery */
+ if (IsRecoveryProcessingMode())
+ return false;
+
/* Quick exit if already known flushed */
if (XLByteLE(record, LogwrtResult.Flush))
return false;
***************
*** 2619,2627 **** RestoreArchivedFile(char *path, const char *xlogfname,
--- 2744,2765 ----
xlogRestoreCmd)));
/*
+ * Set in_restore_command to tell the signal handler that we should exit
+ * right away on SIGTERM. We know that we're in a safe point to do that.
+ * Check if we had already received the signal, so that we don't miss
+ * a shutdown request received just before this.
+ */
+ in_restore_command = true;
+ if (shutdown_requested)
+ proc_exit(0);
+
+ /*
* Copy xlog from archival storage to XLOGDIR
*/
rc = system(xlogRestoreCmd);
+
+ in_restore_command = false;
+
if (rc == 0)
{
/*
***************
*** 2674,2687 **** RestoreArchivedFile(char *path, const char *xlogfname,
* assume that recovery is complete and start up the database!) It's
* essential to abort on child SIGINT and SIGQUIT, because per spec
* system() ignores SIGINT and SIGQUIT while waiting; if we see one of
! * those it's a good bet we should have gotten it too. Aborting on other
! * signals such as SIGTERM seems a good idea as well.
*
* Per the Single Unix Spec, shells report exit status > 128 when a called
* command died on a signal. Also, 126 and 127 are used to report
* problems such as an unfindable command; treat those as fatal errors
* too.
*/
signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125;
ereport(signaled ? FATAL : DEBUG2,
--- 2812,2835 ----
* assume that recovery is complete and start up the database!) It's
* essential to abort on child SIGINT and SIGQUIT, because per spec
* system() ignores SIGINT and SIGQUIT while waiting; if we see one of
! * those it's a good bet we should have gotten it too.
! *
! * On SIGTERM, assume we have received a fast shutdown request, and exit
! * cleanly. It's pure chance whether we receive the SIGTERM first, or the
! * child process. If we receive it first, the signal handler will call
! * proc_exit(0), otherwise we do it here. If we or the child process
! * received SIGTERM for any other reason than a fast shutdown request,
! * postmaster will perform an immediate shutdown when it sees us exiting
! * unexpectedly.
*
* Per the Single Unix Spec, shells report exit status > 128 when a called
* command died on a signal. Also, 126 and 127 are used to report
* problems such as an unfindable command; treat those as fatal errors
* too.
*/
+ if (WTERMSIG(rc) == SIGTERM)
+ proc_exit(0);
+
signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125;
ereport(signaled ? FATAL : DEBUG2,
***************
*** 4590,4607 **** readRecoveryCommandFile(void)
ereport(LOG,
(errmsg("recovery_target_inclusive = %s", tok2)));
}
- else if (strcmp(tok1, "log_restartpoints") == 0)
- {
- /*
- * does nothing if a recovery_target is not also set
- */
- if (!parse_bool(tok2, &recoveryLogRestartpoints))
- ereport(ERROR,
- (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("parameter \"log_restartpoints\" requires a Boolean value")));
- ereport(LOG,
- (errmsg("log_restartpoints = %s", tok2)));
- }
else
ereport(FATAL,
(errmsg("unrecognized recovery parameter \"%s\"",
--- 4738,4743 ----
***************
*** 4883,4889 **** StartupXLOG(void)
XLogRecPtr RecPtr,
LastRec,
checkPointLoc,
! minRecoveryLoc,
EndOfLog;
uint32 endLogId;
uint32 endLogSeg;
--- 5019,5025 ----
XLogRecPtr RecPtr,
LastRec,
checkPointLoc,
! backupStopLoc,
EndOfLog;
uint32 endLogId;
uint32 endLogSeg;
***************
*** 4891,4896 **** StartupXLOG(void)
--- 5027,5034 ----
uint32 freespace;
TransactionId oldestActiveXID;
+ XLogCtl->SharedRecoveryProcessingMode = true;
+
/*
* Read control file and check XLOG status looks valid.
*
***************
*** 4970,4976 **** StartupXLOG(void)
recoveryTargetTLI,
ControlFile->checkPointCopy.ThisTimeLineID)));
! if (read_backup_label(&checkPointLoc, &minRecoveryLoc))
{
/*
* When a backup_label file is present, we want to roll forward from
--- 5108,5114 ----
recoveryTargetTLI,
ControlFile->checkPointCopy.ThisTimeLineID)));
! if (read_backup_label(&checkPointLoc, &backupStopLoc))
{
/*
* When a backup_label file is present, we want to roll forward from
***************
*** 5108,5118 **** StartupXLOG(void)
ControlFile->prevCheckPoint = ControlFile->checkPoint;
ControlFile->checkPoint = checkPointLoc;
ControlFile->checkPointCopy = checkPoint;
! if (minRecoveryLoc.xlogid != 0 || minRecoveryLoc.xrecoff != 0)
! ControlFile->minRecoveryPoint = minRecoveryLoc;
ControlFile->time = (pg_time_t) time(NULL);
UpdateControlFile();
/*
* If there was a backup label file, it's done its job and the info
* has now been propagated into pg_control. We must get rid of the
--- 5246,5268 ----
ControlFile->prevCheckPoint = ControlFile->checkPoint;
ControlFile->checkPoint = checkPointLoc;
ControlFile->checkPointCopy = checkPoint;
! if (backupStopLoc.xlogid != 0 || backupStopLoc.xrecoff != 0)
! {
! if (XLByteLT(ControlFile->minRecoveryPoint, backupStopLoc))
! ControlFile->minRecoveryPoint = backupStopLoc;
! }
ControlFile->time = (pg_time_t) time(NULL);
+ /* No need to hold ControlFileLock yet, we aren't up far enough */
UpdateControlFile();
+ /* update our local copy of minRecoveryPoint */
+ minRecoveryPoint = ControlFile->minRecoveryPoint;
+
+ /*
+ * Reset pgstat data, because it may be invalid after recovery.
+ */
+ pgstat_reset_all();
+
/*
* If there was a backup label file, it's done its job and the info
* has now been propagated into pg_control. We must get rid of the
***************
*** 5157,5168 **** StartupXLOG(void)
{
bool recoveryContinue = true;
bool recoveryApply = true;
ErrorContextCallback errcontext;
InRedo = true;
! ereport(LOG,
! (errmsg("redo starts at %X/%X",
! ReadRecPtr.xlogid, ReadRecPtr.xrecoff)));
/*
* main redo apply loop
--- 5307,5347 ----
{
bool recoveryContinue = true;
bool recoveryApply = true;
+ bool reachedMinRecoveryPoint = false;
ErrorContextCallback errcontext;
+ /* use volatile pointer to prevent code rearrangement */
+ volatile XLogCtlData *xlogctl = XLogCtl;
+
+ /* Update shared replayEndRecPtr */
+ SpinLockAcquire(&xlogctl->info_lck);
+ xlogctl->replayEndRecPtr = ReadRecPtr;
+ SpinLockRelease(&xlogctl->info_lck);
InRedo = true;
!
! if (minRecoveryPoint.xlogid == 0 && minRecoveryPoint.xrecoff == 0)
! ereport(LOG,
! (errmsg("redo starts at %X/%X",
! ReadRecPtr.xlogid, ReadRecPtr.xrecoff)));
! else
! ereport(LOG,
! (errmsg("redo starts at %X/%X, consistency will be reached at %X/%X",
! ReadRecPtr.xlogid, ReadRecPtr.xrecoff,
! minRecoveryPoint.xlogid, minRecoveryPoint.xrecoff)));
!
! /*
! * Let postmaster know we've started redo now, so that it can
! * launch bgwriter to perform restartpoints. We don't bother
! * during crash recovery as restartpoints can only be performed
! * during archive recovery. And we'd like to keep crash recovery
! * simple, to avoid introducing bugs that could prevent you from
! * recovering after a crash.
! *
! * After this point, we can no longer assume that we're the only
! * process in addition to postmaster!
! */
! if (InArchiveRecovery && IsUnderPostmaster)
! SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
/*
* main redo apply loop
***************
*** 5189,5194 **** StartupXLOG(void)
--- 5368,5397 ----
#endif
/*
+ * Check if we were requested to exit without finishing
+ * recovery.
+ */
+ if (shutdown_requested)
+ proc_exit(0);
+
+ /*
+ * Have we reached our safe starting point? If so, we can
+ * tell postmaster that the database is consistent now.
+ */
+ if (!reachedMinRecoveryPoint &&
+ XLByteLE(minRecoveryPoint, EndRecPtr))
+ {
+ reachedMinRecoveryPoint = true;
+ if (InArchiveRecovery)
+ {
+ ereport(LOG,
+ (errmsg("consistent recovery state reached")));
+ if (IsUnderPostmaster)
+ SendPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT);
+ }
+ }
+
+ /*
* Have we reached our recovery target?
*/
if (recoveryStopsHere(record, &recoveryApply))
***************
*** 5213,5218 **** StartupXLOG(void)
--- 5416,5430 ----
TransactionIdAdvance(ShmemVariableCache->nextXid);
}
+ /*
+ * Update shared replayEndRecPtr before replaying this
+ * record, so that XLogFlush will update minRecoveryPoint
+ * correctly.
+ */
+ SpinLockAcquire(&xlogctl->info_lck);
+ xlogctl->replayEndRecPtr = EndRecPtr;
+ SpinLockRelease(&xlogctl->info_lck);
+
RmgrTable[record->xl_rmid].rm_redo(EndRecPtr, record);
/* Pop the error context stack */
***************
*** 5256,5269 **** StartupXLOG(void)
* Complain if we did not roll forward far enough to render the backup
* dump consistent.
*/
! if (XLByteLT(EndOfLog, ControlFile->minRecoveryPoint))
{
if (reachedStopPoint) /* stopped because of stop request */
ereport(FATAL,
! (errmsg("requested recovery stop point is before end time of backup dump")));
else /* ran off end of WAL */
ereport(FATAL,
! (errmsg("WAL ends before end time of backup dump")));
}
/*
--- 5468,5481 ----
* Complain if we did not roll forward far enough to render the backup
* dump consistent.
*/
! if (InRecovery && XLByteLT(EndOfLog, minRecoveryPoint))
{
if (reachedStopPoint) /* stopped because of stop request */
ereport(FATAL,
! (errmsg("requested recovery stop point is before consistent recovery point")));
else /* ran off end of WAL */
ereport(FATAL,
! (errmsg("WAL ends before consistent recovery point")));
}
/*
***************
*** 5358,5363 **** StartupXLOG(void)
--- 5570,5581 ----
/* Pre-scan prepared transactions to find out the range of XIDs present */
oldestActiveXID = PrescanPreparedTransactions();
+ /*
+ * Allow writing WAL for us, so that we can create a checkpoint record.
+ * But not yet for other backends!
+ */
+ LocalRecoveryProcessingMode = false;
+
if (InRecovery)
{
int rmid;
***************
*** 5378,5388 **** StartupXLOG(void)
XLogCheckInvalidPages();
/*
- * Reset pgstat data, because it may be invalid after recovery.
- */
- pgstat_reset_all();
-
- /*
* Perform a checkpoint to update all our recovery activity to disk.
*
* Note that we write a shutdown checkpoint rather than an on-line
--- 5596,5601 ----
***************
*** 5404,5415 **** StartupXLOG(void)
*/
InRecovery = false;
ControlFile->state = DB_IN_PRODUCTION;
ControlFile->time = (pg_time_t) time(NULL);
UpdateControlFile();
/* start the archive_timeout timer running */
! XLogCtl->Write.lastSegSwitchTime = ControlFile->time;
/* initialize shared-memory copy of latest checkpoint XID/epoch */
XLogCtl->ckptXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
--- 5617,5630 ----
*/
InRecovery = false;
+ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
ControlFile->state = DB_IN_PRODUCTION;
ControlFile->time = (pg_time_t) time(NULL);
UpdateControlFile();
+ LWLockRelease(ControlFileLock);
/* start the archive_timeout timer running */
! XLogCtl->Write.lastSegSwitchTime = (pg_time_t) time(NULL);
/* initialize shared-memory copy of latest checkpoint XID/epoch */
XLogCtl->ckptXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
***************
*** 5444,5449 **** StartupXLOG(void)
--- 5659,5703 ----
readRecordBuf = NULL;
readRecordBufSize = 0;
}
+
+ /*
+ * All done. Allow others to write WAL.
+ */
+ XLogCtl->SharedRecoveryProcessingMode = false;
+ }
+
+ /*
+ * Is the system still in recovery?
+ *
+ * As a side-effect, we initialize the local TimeLineID and RedoRecPtr
+ * variables the first time we see that recovery is finished.
+ */
+ bool
+ IsRecoveryProcessingMode(void)
+ {
+ /*
+ * We check shared state each time only until we leave recovery mode.
+ * We can't re-enter recovery, so we rely on the local state variable
+ * after that.
+ */
+ if (!LocalRecoveryProcessingMode)
+ return false;
+ else
+ {
+ /* use volatile pointer to prevent code rearrangement */
+ volatile XLogCtlData *xlogctl = XLogCtl;
+
+ LocalRecoveryProcessingMode = xlogctl->SharedRecoveryProcessingMode;
+
+ /*
+ * Initialize TimeLineID and RedoRecPtr the first time we see that
+ * recovery is finished.
+ */
+ if (!LocalRecoveryProcessingMode)
+ InitXLOGAccess();
+
+ return LocalRecoveryProcessingMode;
+ }
}
/*
***************
*** 5575,5580 **** InitXLOGAccess(void)
--- 5829,5836 ----
{
/* ThisTimeLineID doesn't change so we need no lock to copy it */
ThisTimeLineID = XLogCtl->ThisTimeLineID;
+ Assert(ThisTimeLineID != 0);
+
/* Use GetRedoRecPtr to copy the RedoRecPtr safely */
(void) GetRedoRecPtr();
}
***************
*** 5686,5692 **** ShutdownXLOG(int code, Datum arg)
ereport(LOG,
(errmsg("shutting down")));
! CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
ShutdownCLOG();
ShutdownSUBTRANS();
ShutdownMultiXact();
--- 5942,5951 ----
ereport(LOG,
(errmsg("shutting down")));
! if (IsRecoveryProcessingMode())
! CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
! else
! CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
ShutdownCLOG();
ShutdownSUBTRANS();
ShutdownMultiXact();
***************
*** 5699,5707 **** ShutdownXLOG(int code, Datum arg)
* Log start of a checkpoint.
*/
static void
! LogCheckpointStart(int flags)
{
! elog(LOG, "checkpoint starting:%s%s%s%s%s%s",
(flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
(flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
(flags & CHECKPOINT_FORCE) ? " force" : "",
--- 5958,5977 ----
* Log start of a checkpoint.
*/
static void
! LogCheckpointStart(int flags, bool restartpoint)
{
! char *msg;
!
! /*
! * XXX: This is hopelessly untranslatable. We could call gettext_noop
! * for the main message, but what about all the flags?
! */
! if (restartpoint)
! msg = "restartpoint starting:%s%s%s%s%s%s";
! else
! msg = "checkpoint starting:%s%s%s%s%s%s";
!
! elog(LOG, msg,
(flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
(flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
(flags & CHECKPOINT_FORCE) ? " force" : "",
***************
*** 5714,5720 **** LogCheckpointStart(int flags)
* Log end of a checkpoint.
*/
static void
! LogCheckpointEnd(void)
{
long write_secs,
sync_secs,
--- 5984,5990 ----
* Log end of a checkpoint.
*/
static void
! LogCheckpointEnd(bool restartpoint)
{
long write_secs,
sync_secs,
***************
*** 5737,5753 **** LogCheckpointEnd(void)
CheckpointStats.ckpt_sync_end_t,
&sync_secs, &sync_usecs);
! elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
! "%d transaction log file(s) added, %d removed, %d recycled; "
! "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
! CheckpointStats.ckpt_bufs_written,
! (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
! CheckpointStats.ckpt_segs_added,
! CheckpointStats.ckpt_segs_removed,
! CheckpointStats.ckpt_segs_recycled,
! write_secs, write_usecs / 1000,
! sync_secs, sync_usecs / 1000,
! total_secs, total_usecs / 1000);
}
/*
--- 6007,6032 ----
CheckpointStats.ckpt_sync_end_t,
&sync_secs, &sync_usecs);
! if (restartpoint)
! elog(LOG, "restartpoint complete: wrote %d buffers (%.1f%%); "
! "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
! CheckpointStats.ckpt_bufs_written,
! (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
! write_secs, write_usecs / 1000,
! sync_secs, sync_usecs / 1000,
! total_secs, total_usecs / 1000);
! else
! elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
! "%d transaction log file(s) added, %d removed, %d recycled; "
! "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
! CheckpointStats.ckpt_bufs_written,
! (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
! CheckpointStats.ckpt_segs_added,
! CheckpointStats.ckpt_segs_removed,
! CheckpointStats.ckpt_segs_recycled,
! write_secs, write_usecs / 1000,
! sync_secs, sync_usecs / 1000,
! total_secs, total_usecs / 1000);
}
/*
***************
*** 5778,5790 **** CreateCheckPoint(int flags)
TransactionId *inCommitXids;
int nInCommit;
/*
* Acquire CheckpointLock to ensure only one checkpoint happens at a time.
! * (This is just pro forma, since in the present system structure there is
! * only one process that is allowed to issue checkpoints at any given
! * time.)
*/
! LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
/*
* Prepare to accumulate statistics.
--- 6057,6089 ----
TransactionId *inCommitXids;
int nInCommit;
+ /* shouldn't happen */
+ if (IsRecoveryProcessingMode())
+ elog(ERROR, "can't create a checkpoint during recovery");
+
/*
* Acquire CheckpointLock to ensure only one checkpoint happens at a time.
! * During normal operation, bgwriter is the only process that creates
! * checkpoints, but at the end of archive recovery, the bgwriter can be busy
! * creating a restartpoint while the startup process tries to perform the
! * startup checkpoint.
*/
! if (!LWLockConditionalAcquire(CheckpointLock, LW_EXCLUSIVE))
! {
! Assert(InRecovery);
!
! /*
! * A restartpoint is in progress. Wait until it finishes. This can
! * cause an extra restartpoint to be performed, but that's OK because
! * we're just about to perform a checkpoint anyway. Flushing the
! * buffers in this restartpoint can take some time, but that time is
! * saved from the upcoming checkpoint so the net effect is zero.
! */
! ereport(DEBUG2, (errmsg("hurrying in-progress restartpoint")));
! RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_WAIT);
!
! LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
! }
/*
* Prepare to accumulate statistics.
***************
*** 5803,5811 **** CreateCheckPoint(int flags)
--- 6102,6112 ----
if (shutdown)
{
+ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
ControlFile->state = DB_SHUTDOWNING;
ControlFile->time = (pg_time_t) time(NULL);
UpdateControlFile();
+ LWLockRelease(ControlFileLock);
}
/*
***************
*** 5909,5915 **** CreateCheckPoint(int flags)
* to log anything if we decided to skip the checkpoint.
*/
if (log_checkpoints)
! LogCheckpointStart(flags);
TRACE_POSTGRESQL_CHECKPOINT_START(flags);
--- 6210,6216 ----
* to log anything if we decided to skip the checkpoint.
*/
if (log_checkpoints)
! LogCheckpointStart(flags, false);
TRACE_POSTGRESQL_CHECKPOINT_START(flags);
***************
*** 6076,6082 **** CreateCheckPoint(int flags)
/* All real work is done, but log before releasing lock. */
if (log_checkpoints)
! LogCheckpointEnd();
TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
NBuffers, CheckpointStats.ckpt_segs_added,
--- 6377,6383 ----
/* All real work is done, but log before releasing lock. */
if (log_checkpoints)
! LogCheckpointEnd(false);
TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
NBuffers, CheckpointStats.ckpt_segs_added,
***************
*** 6104,6135 **** CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
}
/*
! * Set a recovery restart point if appropriate
! *
! * This is similar to CreateCheckPoint, but is used during WAL recovery
! * to establish a point from which recovery can roll forward without
! * replaying the entire recovery log. This function is called each time
! * a checkpoint record is read from XLOG; it must determine whether a
! * restartpoint is needed or not.
*/
static void
RecoveryRestartPoint(const CheckPoint *checkPoint)
{
- int elapsed_secs;
int rmid;
!
! /*
! * Do nothing if the elapsed time since the last restartpoint is less than
! * half of checkpoint_timeout. (We use a value less than
! * checkpoint_timeout so that variations in the timing of checkpoints on
! * the master, or speed of transmission of WAL segments to a slave, won't
! * make the slave skip a restartpoint once it's synced with the master.)
! * Checking true elapsed time keeps us from doing restartpoints too often
! * while rapidly scanning large amounts of WAL.
! */
! elapsed_secs = (pg_time_t) time(NULL) - ControlFile->time;
! if (elapsed_secs < CheckPointTimeout / 2)
! return;
/*
* Is it safe to checkpoint? We must ask each of the resource managers
--- 6405,6421 ----
}
/*
! * This is used during WAL recovery to establish a point from which recovery
! * can roll forward without replaying the entire recovery log. This function
! * is called each time a checkpoint record is read from XLOG. The checkpoint
! * record is stored in shared memory, so that it can be used as a restartpoint later on.
*/
static void
RecoveryRestartPoint(const CheckPoint *checkPoint)
{
int rmid;
! /* use volatile pointer to prevent code rearrangement */
! volatile XLogCtlData *xlogctl = XLogCtl;
/*
* Is it safe to checkpoint? We must ask each of the resource managers
***************
*** 6151,6178 **** RecoveryRestartPoint(const CheckPoint *checkPoint)
}
/*
! * OK, force data out to disk
*/
! CheckPointGuts(checkPoint->redo, CHECKPOINT_IMMEDIATE);
/*
! * Update pg_control so that any subsequent crash will restart from this
! * checkpoint. Note: ReadRecPtr gives the XLOG address of the checkpoint
! * record itself.
*/
ControlFile->prevCheckPoint = ControlFile->checkPoint;
! ControlFile->checkPoint = ReadRecPtr;
! ControlFile->checkPointCopy = *checkPoint;
ControlFile->time = (pg_time_t) time(NULL);
UpdateControlFile();
! ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
(errmsg("recovery restart point at %X/%X",
! checkPoint->redo.xlogid, checkPoint->redo.xrecoff)));
if (recoveryLastXTime)
! ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
! (errmsg("last completed transaction was at log time %s",
! timestamptz_to_str(recoveryLastXTime))));
}
/*
--- 6437,6564 ----
}
/*
! * Copy the checkpoint record to shared memory, so that bgwriter can
! * use it the next time it wants to perform a restartpoint.
! */
! SpinLockAcquire(&xlogctl->info_lck);
! XLogCtl->lastCheckPointRecPtr = ReadRecPtr;
! memcpy(&XLogCtl->lastCheckPoint, checkPoint, sizeof(CheckPoint));
! SpinLockRelease(&xlogctl->info_lck);
! }
!
! /*
! * This is similar to CreateCheckPoint, but is used during WAL recovery
! * to establish a point from which recovery can roll forward without
! * replaying the entire recovery log.
! *
! * Returns true if a new restartpoint was established. We can only establish
! * a restartpoint if we have replayed a checkpoint record since last
! * restartpoint.
! */
! bool
! CreateRestartPoint(int flags)
! {
! XLogRecPtr lastCheckPointRecPtr;
! CheckPoint lastCheckPoint;
! /* use volatile pointer to prevent code rearrangement */
! volatile XLogCtlData *xlogctl = XLogCtl;
!
! /*
! * Acquire CheckpointLock to ensure only one restartpoint or checkpoint
! * happens at a time.
! */
! LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
!
! /* Get a local copy of the last checkpoint record. */
! SpinLockAcquire(&xlogctl->info_lck);
! lastCheckPointRecPtr = xlogctl->lastCheckPointRecPtr;
! memcpy(&lastCheckPoint, &XLogCtl->lastCheckPoint, sizeof(CheckPoint));
! SpinLockRelease(&xlogctl->info_lck);
!
! /*
! * Check that we're still in recovery mode. It's ok if we exit recovery
! * mode after this check, the restart point is valid anyway.
! */
! if (!IsRecoveryProcessingMode())
! {
! ereport(DEBUG2,
! (errmsg("skipping restartpoint, recovery has already ended")));
! LWLockRelease(CheckpointLock);
! return false;
! }
!
! /*
! * If the last checkpoint record we've replayed is already our last
! * restartpoint, we can't perform a new restart point. We still update
! * minRecoveryPoint in that case, so that if this is a shutdown restart
! * point, we won't start up earlier than before. That's not strictly
! * necessary, but when we get hot standby capability, it would be rather
! * weird if the database opened up for read-only connections at a
! * point-in-time before the last shutdown. Such time travel is still
! * possible in case of immediate shutdown, though.
! *
! * We don't explicitly advance minRecoveryPoint when we do create a
! * restartpoint. It's assumed that flushing the buffers will do that
! * as a side-effect.
*/
! if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
! XLByteLE(lastCheckPoint.redo, ControlFile->checkPointCopy.redo))
! {
! XLogRecPtr InvalidXLogRecPtr = {0, 0};
! ereport(DEBUG2,
! (errmsg("skipping restartpoint, already performed at %X/%X",
! lastCheckPoint.redo.xlogid, lastCheckPoint.redo.xrecoff)));
!
! UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
! LWLockRelease(CheckpointLock);
! return false;
! }
!
! if (log_checkpoints)
! {
! /*
! * Prepare to accumulate statistics.
! */
! MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
! CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
!
! LogCheckpointStart(flags, true);
! }
!
! CheckPointGuts(lastCheckPoint.redo, flags);
/*
! * Update pg_control, using current time
*/
+ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
ControlFile->prevCheckPoint = ControlFile->checkPoint;
! ControlFile->checkPoint = lastCheckPointRecPtr;
! ControlFile->checkPointCopy = lastCheckPoint;
ControlFile->time = (pg_time_t) time(NULL);
UpdateControlFile();
+ LWLockRelease(ControlFileLock);
! /*
! * Currently, there is no need to truncate pg_subtrans during recovery.
! * If we did do that, we would need to have called StartupSUBTRANS()
! * already and then TruncateSUBTRANS() would go here.
! */
!
! /* All real work is done, but log before releasing lock. */
! if (log_checkpoints)
! LogCheckpointEnd(true);
!
! ereport((log_checkpoints ? LOG : DEBUG2),
(errmsg("recovery restart point at %X/%X",
! lastCheckPoint.redo.xlogid, lastCheckPoint.redo.xrecoff)));
!
if (recoveryLastXTime)
! ereport((log_checkpoints ? LOG : DEBUG2),
! (errmsg("last completed transaction was at log time %s",
! timestamptz_to_str(recoveryLastXTime))));
!
! LWLockRelease(CheckpointLock);
! return true;
}
/*
***************
*** 6238,6243 **** RequestXLogSwitch(void)
--- 6624,6632 ----
/*
* XLOG resource manager's routines
+ *
+ * Definitions of message info are in include/catalog/pg_control.h,
+ * though not all messages relate to control file processing.
*/
void
xlog_redo(XLogRecPtr lsn, XLogRecord *record)
***************
*** 6284,6292 **** xlog_redo(XLogRecPtr lsn, XLogRecord *record)
(int) checkPoint.ThisTimeLineID))
ereport(PANIC,
(errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
! checkPoint.ThisTimeLineID, ThisTimeLineID)));
! /* Following WAL records should be run with new TLI */
! ThisTimeLineID = checkPoint.ThisTimeLineID;
}
RecoveryRestartPoint(&checkPoint);
--- 6673,6681 ----
(int) checkPoint.ThisTimeLineID))
ereport(PANIC,
(errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
! checkPoint.ThisTimeLineID, ThisTimeLineID)));
! /* Following WAL records should be run with new TLI */
! ThisTimeLineID = checkPoint.ThisTimeLineID;
}
RecoveryRestartPoint(&checkPoint);
***************
*** 7227,7229 **** CancelBackup(void)
--- 7616,7707 ----
}
}
+ /* ------------------------------------------------------
+ * Startup Process main entry point and signal handlers
+ * ------------------------------------------------------
+ */
+
+ /*
+ * startupproc_quickdie() occurs when signalled SIGQUIT by the postmaster.
+ *
+ * Some backend has bought the farm,
+ * so we need to stop what we're doing and exit.
+ */
+ static void
+ startupproc_quickdie(SIGNAL_ARGS)
+ {
+ PG_SETMASK(&BlockSig);
+
+ /*
+ * DO NOT proc_exit() -- we're here because shared memory may be
+ * corrupted, so we don't want to try to clean up our transaction. Just
+ * nail the windows shut and get out of town.
+ *
+ * Note we do exit(2) not exit(0). This is to force the postmaster into a
+ * system reset cycle if some idiot DBA sends a manual SIGQUIT to a random
+ * backend. This is necessary precisely because we don't clean up our
+ * shared memory state.
+ */
+ exit(2);
+ }
+
+
+ /* SIGTERM: set flag to abort redo and exit */
+ static void
+ StartupProcShutdownHandler(SIGNAL_ARGS)
+ {
+ if (in_restore_command)
+ proc_exit(0);
+ else
+ shutdown_requested = true;
+ }
+
+ /* Main entry point for startup process */
+ void
+ StartupProcessMain(void)
+ {
+ /*
+ * If possible, make this process a group leader, so that the postmaster
+ * can signal any child processes too.
+ */
+ #ifdef HAVE_SETSID
+ if (setsid() < 0)
+ elog(FATAL, "setsid() failed: %m");
+ #endif
+
+ /*
+ * Properly accept or ignore signals the postmaster might send us
+ */
+ pqsignal(SIGHUP, SIG_IGN); /* ignore config file updates */
+ pqsignal(SIGINT, SIG_IGN); /* ignore query cancel */
+ pqsignal(SIGTERM, StartupProcShutdownHandler); /* request shutdown */
+ pqsignal(SIGQUIT, startupproc_quickdie); /* hard crash time */
+ pqsignal(SIGALRM, SIG_IGN);
+ pqsignal(SIGPIPE, SIG_IGN);
+ pqsignal(SIGUSR1, SIG_IGN);
+ pqsignal(SIGUSR2, SIG_IGN);
+
+ /*
+ * Reset some signals that are accepted by postmaster but not here
+ */
+ pqsignal(SIGCHLD, SIG_DFL);
+ pqsignal(SIGTTIN, SIG_DFL);
+ pqsignal(SIGTTOU, SIG_DFL);
+ pqsignal(SIGCONT, SIG_DFL);
+ pqsignal(SIGWINCH, SIG_DFL);
+
+ /*
+ * Unblock signals (they were blocked when the postmaster forked us)
+ */
+ PG_SETMASK(&UnBlockSig);
+
+ StartupXLOG();
+
+ BuildFlatFiles(false);
+
+ /* Let postmaster know that startup is finished */
+ SendPostmasterSignal(PMSIGNAL_RECOVERY_COMPLETED);
+
+ /* exit normally */
+ proc_exit(0);
+ }
*** a/src/backend/bootstrap/bootstrap.c
--- b/src/backend/bootstrap/bootstrap.c
***************
*** 37,43 ****
#include "storage/proc.h"
#include "tcop/tcopprot.h"
#include "utils/builtins.h"
- #include "utils/flatfiles.h"
#include "utils/fmgroids.h"
#include "utils/memutils.h"
#include "utils/ps_status.h"
--- 37,42 ----
***************
*** 416,429 **** AuxiliaryProcessMain(int argc, char *argv[])
proc_exit(1); /* should never return */
case StartupProcess:
! bootstrap_signals();
! StartupXLOG();
! BuildFlatFiles(false);
! proc_exit(0); /* startup done */
case BgWriterProcess:
/* don't set signals, bgwriter has its own agenda */
- InitXLOGAccess();
BackgroundWriterMain();
proc_exit(1); /* should never return */
--- 415,426 ----
proc_exit(1); /* should never return */
case StartupProcess:
! /* don't set signals, startup process has its own agenda */
! StartupProcessMain();
! proc_exit(1); /* should never return */
case BgWriterProcess:
/* don't set signals, bgwriter has its own agenda */
BackgroundWriterMain();
proc_exit(1); /* should never return */
*** a/src/backend/postmaster/bgwriter.c
--- b/src/backend/postmaster/bgwriter.c
***************
*** 49,54 ****
--- 49,55 ----
#include <unistd.h>
#include "access/xlog_internal.h"
+ #include "catalog/pg_control.h"
#include "libpq/pqsignal.h"
#include "miscadmin.h"
#include "pgstat.h"
***************
*** 197,202 **** BackgroundWriterMain(void)
--- 198,204 ----
{
sigjmp_buf local_sigjmp_buf;
MemoryContext bgwriter_context;
+ bool BgWriterRecoveryMode = true;
BgWriterShmem->bgwriter_pid = MyProcPid;
am_bg_writer = true;
***************
*** 418,428 **** BackgroundWriterMain(void)
--- 420,446 ----
}
/*
+ * Check if we've exited recovery. We do this after determining
+ * whether to perform a checkpoint or not, to be sure that we
+ * perform a real checkpoint and not a restartpoint, if someone
+ * requested a checkpoint immediately after exiting recovery. And
+ * we must have the right TimeLineID when we perform a checkpoint;
+ * IsRecoveryProcessingMode() initializes that as a side-effect.
+ */
+ if (BgWriterRecoveryMode && !IsRecoveryProcessingMode())
+ {
+ elog(DEBUG1, "bgwriter changing from recovery to normal mode");
+ BgWriterRecoveryMode = false;
+ }
+
+ /*
* Do a checkpoint if requested, otherwise do one cycle of
* dirty-buffer writing.
*/
if (do_checkpoint)
{
+ bool ckpt_performed = false;
+
/* use volatile pointer to prevent code rearrangement */
volatile BgWriterShmemStruct *bgs = BgWriterShmem;
***************
*** 444,450 **** BackgroundWriterMain(void)
* implementation will not generate warnings caused by
* CheckPointTimeout < CheckPointWarning.
*/
! if ((flags & CHECKPOINT_CAUSE_XLOG) &&
elapsed_secs < CheckPointWarning)
ereport(LOG,
(errmsg("checkpoints are occurring too frequently (%d seconds apart)",
--- 462,469 ----
* implementation will not generate warnings caused by
* CheckPointTimeout < CheckPointWarning.
*/
! if (!BgWriterRecoveryMode &&
! (flags & CHECKPOINT_CAUSE_XLOG) &&
elapsed_secs < CheckPointWarning)
ereport(LOG,
(errmsg("checkpoints are occurring too frequently (%d seconds apart)",
***************
*** 455,468 **** BackgroundWriterMain(void)
* Initialize bgwriter-private variables used during checkpoint.
*/
ckpt_active = true;
! ckpt_start_recptr = GetInsertRecPtr();
ckpt_start_time = now;
ckpt_cached_elapsed = 0;
/*
* Do the checkpoint.
*/
! CreateCheckPoint(flags);
/*
* After any checkpoint, close all smgr files. This is so we
--- 474,494 ----
* Initialize bgwriter-private variables used during checkpoint.
*/
ckpt_active = true;
! if (!BgWriterRecoveryMode)
! ckpt_start_recptr = GetInsertRecPtr();
ckpt_start_time = now;
ckpt_cached_elapsed = 0;
/*
* Do the checkpoint.
*/
! if (!BgWriterRecoveryMode)
! {
! CreateCheckPoint(flags);
! ckpt_performed = true;
! }
! else
! ckpt_performed = CreateRestartPoint(flags);
/*
* After any checkpoint, close all smgr files. This is so we
***************
*** 477,490 **** BackgroundWriterMain(void)
bgs->ckpt_done = bgs->ckpt_started;
SpinLockRelease(&bgs->ckpt_lck);
! ckpt_active = false;
! /*
! * Note we record the checkpoint start time not end time as
! * last_checkpoint_time. This is so that time-driven checkpoints
! * happen at a predictable spacing.
! */
! last_checkpoint_time = now;
}
else
BgBufferSync();
--- 503,529 ----
bgs->ckpt_done = bgs->ckpt_started;
SpinLockRelease(&bgs->ckpt_lck);
! if (ckpt_performed)
! {
! /*
! * Note we record the checkpoint start time not end time as
! * last_checkpoint_time. This is so that time-driven
! * checkpoints happen at a predictable spacing.
! */
! last_checkpoint_time = now;
! }
! else
! {
! /*
! * We were not able to perform the restartpoint (checkpoints
! * throw an ERROR in case of error). Most likely because we
! * have not received any new checkpoint WAL records since the
! * last restartpoint. Try again in 15 s.
! */
! last_checkpoint_time = now - CheckPointTimeout + 15;
! }
! ckpt_active = false;
}
else
BgBufferSync();
***************
*** 507,513 **** CheckArchiveTimeout(void)
pg_time_t now;
pg_time_t last_time;
! if (XLogArchiveTimeout <= 0)
return;
now = (pg_time_t) time(NULL);
--- 546,552 ----
pg_time_t now;
pg_time_t last_time;
! if (XLogArchiveTimeout <= 0 || IsRecoveryProcessingMode())
return;
now = (pg_time_t) time(NULL);
***************
*** 586,592 **** BgWriterNap(void)
(ckpt_active ? ImmediateCheckpointRequested() : checkpoint_requested))
break;
pg_usleep(1000000L);
! AbsorbFsyncRequests();
udelay -= 1000000L;
}
--- 625,632 ----
(ckpt_active ? ImmediateCheckpointRequested() : checkpoint_requested))
break;
pg_usleep(1000000L);
! if (!IsRecoveryProcessingMode())
! AbsorbFsyncRequests();
udelay -= 1000000L;
}
***************
*** 714,729 **** IsCheckpointOnSchedule(double progress)
* However, it's good enough for our purposes, we're only calculating an
* estimate anyway.
*/
! recptr = GetInsertRecPtr();
! elapsed_xlogs =
! (((double) (int32) (recptr.xlogid - ckpt_start_recptr.xlogid)) * XLogSegsPerFile +
! ((double) recptr.xrecoff - (double) ckpt_start_recptr.xrecoff) / XLogSegSize) /
! CheckPointSegments;
!
! if (progress < elapsed_xlogs)
{
! ckpt_cached_elapsed = elapsed_xlogs;
! return false;
}
/*
--- 754,772 ----
* However, it's good enough for our purposes, we're only calculating an
* estimate anyway.
*/
! if (!IsRecoveryProcessingMode())
{
! recptr = GetInsertRecPtr();
! elapsed_xlogs =
! (((double) (int32) (recptr.xlogid - ckpt_start_recptr.xlogid)) * XLogSegsPerFile +
! ((double) recptr.xrecoff - (double) ckpt_start_recptr.xrecoff) / XLogSegSize) /
! CheckPointSegments;
!
! if (progress < elapsed_xlogs)
! {
! ckpt_cached_elapsed = elapsed_xlogs;
! return false;
! }
}
/*
*** a/src/backend/postmaster/postmaster.c
--- b/src/backend/postmaster/postmaster.c
***************
*** 225,235 **** static pid_t StartupPID = 0,
--- 225,262 ----
static int Shutdown = NoShutdown;
static bool FatalError = false; /* T if recovering from backend crash */
+ static bool RecoveryError = false; /* T if recovery failed */
+
+ /* State of WAL redo */
+ #define NoRecovery 0
+ #define RecoveryStarted 1
+ #define RecoveryConsistent 2
+ #define RecoveryCompleted 3
+
+ static int RecoveryStatus = NoRecovery;
/*
* We use a simple state machine to control startup, shutdown, and
* crash recovery (which is rather like shutdown followed by startup).
*
+ * After doing all the postmaster initialization work, we enter PM_STARTUP
+ * state and the startup process is launched. The startup process begins by
+ * reading the control file and other preliminary initialization steps. When
+ * it's ready to start WAL redo, it signals postmaster, and we switch to
+ * PM_RECOVERY phase. The background writer is launched, while the startup
+ * process continues applying WAL.
+ *
+ * After reaching a consistent point in WAL redo, startup process signals
+ * us again, and we switch to PM_RECOVERY_CONSISTENT phase. There's currently
+ * no difference between PM_RECOVERY and PM_RECOVERY_CONSISTENT, but we
+ * could start accepting connections to perform read-only queries at this
+ * point, if we had the infrastructure to do that.
+ *
+ * When the WAL redo is finished, the startup process signals us the third
+ * time, and we switch to PM_RUN state. The startup process can also skip the
+ * recovery and consistent recovery phases altogether, as it will during
+ * normal startup when there's no recovery to be done, for example.
+ *
* Normal child backends can only be launched when we are in PM_RUN state.
* (We also allow it in PM_WAIT_BACKUP state, but only for superusers.)
* In other states we handle connection requests by launching "dead_end"
***************
*** 245,259 **** static bool FatalError = false; /* T if recovering from backend crash */
*
* Notice that this state variable does not distinguish *why* we entered
* states later than PM_RUN --- Shutdown and FatalError must be consulted
! * to find that out. FatalError is never true in PM_RUN state, nor in
! * PM_SHUTDOWN states (because we don't enter those states when trying to
! * recover from a crash). It can be true in PM_STARTUP state, because we
! * don't clear it until we've successfully recovered.
*/
typedef enum
{
PM_INIT, /* postmaster starting */
PM_STARTUP, /* waiting for startup subprocess */
PM_RUN, /* normal "database is alive" state */
PM_WAIT_BACKUP, /* waiting for online backup mode to end */
PM_WAIT_BACKENDS, /* waiting for live backends to exit */
--- 272,288 ----
*
* Notice that this state variable does not distinguish *why* we entered
* states later than PM_RUN --- Shutdown and FatalError must be consulted
! * to find that out. FatalError is never true in PM_RECOVERY_* or PM_RUN
! * states, nor in PM_SHUTDOWN states (because we don't enter those states
! * when trying to recover from a crash). It can be true in PM_STARTUP state,
! * because we don't clear it until we've successfully started WAL redo.
*/
typedef enum
{
PM_INIT, /* postmaster starting */
PM_STARTUP, /* waiting for startup subprocess */
+ PM_RECOVERY, /* in recovery mode */
+ PM_RECOVERY_CONSISTENT, /* consistent recovery mode */
PM_RUN, /* normal "database is alive" state */
PM_WAIT_BACKUP, /* waiting for online backup mode to end */
PM_WAIT_BACKENDS, /* waiting for live backends to exit */
***************
*** 307,312 **** static void pmdie(SIGNAL_ARGS);
--- 336,342 ----
static void reaper(SIGNAL_ARGS);
static void sigusr1_handler(SIGNAL_ARGS);
static void dummy_handler(SIGNAL_ARGS);
+ static void CheckRecoverySignals(void);
static void CleanupBackend(int pid, int exitstatus);
static void HandleChildCrash(int pid, int exitstatus, const char *procname);
static void LogChildExit(int lev, const char *procname,
***************
*** 1302,1308 **** ServerLoop(void)
* state that prevents it, start one. It doesn't matter if this
* fails, we'll just try again later.
*/
! if (BgWriterPID == 0 && pmState == PM_RUN)
BgWriterPID = StartBackgroundWriter();
/*
--- 1332,1340 ----
* state that prevents it, start one. It doesn't matter if this
* fails, we'll just try again later.
*/
! if (BgWriterPID == 0 &&
! (pmState == PM_RUN || pmState == PM_RECOVERY ||
! pmState == PM_RECOVERY_CONSISTENT))
BgWriterPID = StartBackgroundWriter();
/*
***************
*** 1752,1758 **** canAcceptConnections(void)
return CAC_WAITBACKUP; /* allow superusers only */
if (Shutdown > NoShutdown)
return CAC_SHUTDOWN; /* shutdown is pending */
! if (pmState == PM_STARTUP && !FatalError)
return CAC_STARTUP; /* normal startup */
return CAC_RECOVERY; /* else must be crash recovery */
}
--- 1784,1793 ----
return CAC_WAITBACKUP; /* allow superusers only */
if (Shutdown > NoShutdown)
return CAC_SHUTDOWN; /* shutdown is pending */
! if (!FatalError &&
! (pmState == PM_STARTUP ||
! pmState == PM_RECOVERY ||
! pmState == PM_RECOVERY_CONSISTENT))
return CAC_STARTUP; /* normal startup */
return CAC_RECOVERY; /* else must be crash recovery */
}
***************
*** 1982,1988 **** pmdie(SIGNAL_ARGS)
ereport(LOG,
(errmsg("received smart shutdown request")));
! if (pmState == PM_RUN)
{
/* autovacuum workers are told to shut down immediately */
SignalAutovacWorkers(SIGTERM);
--- 2017,2023 ----
ereport(LOG,
(errmsg("received smart shutdown request")));
! if (pmState == PM_RUN || pmState == PM_RECOVERY || pmState == PM_RECOVERY_CONSISTENT)
{
/* autovacuum workers are told to shut down immediately */
SignalAutovacWorkers(SIGTERM);
***************
*** 2019,2025 **** pmdie(SIGNAL_ARGS)
if (StartupPID != 0)
signal_child(StartupPID, SIGTERM);
! if (pmState == PM_RUN || pmState == PM_WAIT_BACKUP)
{
ereport(LOG,
(errmsg("aborting any active transactions")));
--- 2054,2067 ----
if (StartupPID != 0)
signal_child(StartupPID, SIGTERM);
! if (pmState == PM_RECOVERY)
! {
! /* only bgwriter is active in this state */
! pmState = PM_WAIT_BACKENDS;
! }
! if (pmState == PM_RUN ||
! pmState == PM_WAIT_BACKUP ||
! pmState == PM_RECOVERY_CONSISTENT)
{
ereport(LOG,
(errmsg("aborting any active transactions")));
***************
*** 2116,2125 **** reaper(SIGNAL_ARGS)
if (pid == StartupPID)
{
StartupPID = 0;
- Assert(pmState == PM_STARTUP);
! /* FATAL exit of startup is treated as catastrophic */
! if (!EXIT_STATUS_0(exitstatus))
{
LogChildExit(LOG, _("startup process"),
pid, exitstatus);
--- 2158,2179 ----
if (pid == StartupPID)
{
StartupPID = 0;
! /*
! * Check if we've received a signal from the startup process
! * first. This can change pmState. If the startup process sends
! * a signal, and exits immediately after that, we might not have
! * processed the signal yet, and we need to know if it completed
! * recovery before exiting.
! */
! CheckRecoverySignals();
!
! /*
! * Unexpected exit of startup process (including FATAL exit)
! * during PM_STARTUP is treated as catastrophic. There is no
! * other processes running yet.
! */
! if (pmState == PM_STARTUP)
{
LogChildExit(LOG, _("startup process"),
pid, exitstatus);
***************
*** 2127,2186 **** reaper(SIGNAL_ARGS)
(errmsg("aborting startup due to startup process failure")));
ExitPostmaster(1);
}
-
/*
! * Startup succeeded - we are done with system startup or
! * recovery.
*/
! FatalError = false;
!
! /*
! * Go to shutdown mode if a shutdown request was pending.
! */
! if (Shutdown > NoShutdown)
{
! pmState = PM_WAIT_BACKENDS;
! /* PostmasterStateMachine logic does the rest */
continue;
}
-
/*
! * Otherwise, commence normal operations.
! */
! pmState = PM_RUN;
!
! /*
! * Load the flat authorization file into postmaster's cache. The
! * startup process has recomputed this from the database contents,
! * so we wait till it finishes before loading it.
! */
! load_role();
!
! /*
! * Crank up the background writer. It doesn't matter if this
! * fails, we'll just try again later.
*/
! Assert(BgWriterPID == 0);
! BgWriterPID = StartBackgroundWriter();
!
! /*
! * Likewise, start other special children as needed. In a restart
! * situation, some of them may be alive already.
! */
! if (WalWriterPID == 0)
! WalWriterPID = StartWalWriter();
! if (AutoVacuumingActive() && AutoVacPID == 0)
! AutoVacPID = StartAutoVacLauncher();
! if (XLogArchivingActive() && PgArchPID == 0)
! PgArchPID = pgarch_start();
! if (PgStatPID == 0)
! PgStatPID = pgstat_start();
!
! /* at this point we are really open for business */
! ereport(LOG,
! (errmsg("database system is ready to accept connections")));
!
! continue;
}
/*
--- 2181,2210 ----
(errmsg("aborting startup due to startup process failure")));
ExitPostmaster(1);
}
/*
! * Any unexpected exit (including FATAL exit) of the startup
! * process is treated as a crash, except that we don't want
! * to reinitialize.
*/
! if (!EXIT_STATUS_0(exitstatus))
{
! RecoveryError = true;
! HandleChildCrash(pid, exitstatus,
! _("startup process"));
continue;
}
/*
! * Startup process exited normally, but didn't finish recovery.
! * This can happen if someone other than the postmaster kills the
! * startup process with SIGTERM. Treat it like a crash.
*/
! if (pmState == PM_RECOVERY || pmState == PM_RECOVERY_CONSISTENT)
! {
! RecoveryError = true;
! HandleChildCrash(pid, exitstatus,
! _("startup process"));
! continue;
! }
}
/*
***************
*** 2443,2448 **** HandleChildCrash(int pid, int exitstatus, const char *procname)
--- 2467,2484 ----
}
}
+ /* Take care of the startup process too */
+ if (pid == StartupPID)
+ StartupPID = 0;
+ else if (StartupPID != 0 && !FatalError)
+ {
+ ereport(DEBUG2,
+ (errmsg_internal("sending %s to process %d",
+ (SendStop ? "SIGSTOP" : "SIGQUIT"),
+ (int) StartupPID)));
+ signal_child(StartupPID, (SendStop ? SIGSTOP : SIGQUIT));
+ }
+
/* Take care of the bgwriter too */
if (pid == BgWriterPID)
BgWriterPID = 0;
***************
*** 2514,2520 **** HandleChildCrash(int pid, int exitstatus, const char *procname)
FatalError = true;
/* We now transit into a state of waiting for children to die */
! if (pmState == PM_RUN ||
pmState == PM_WAIT_BACKUP ||
pmState == PM_SHUTDOWN)
pmState = PM_WAIT_BACKENDS;
--- 2550,2558 ----
FatalError = true;
/* We now transit into a state of waiting for children to die */
! if (pmState == PM_RECOVERY ||
! pmState == PM_RECOVERY_CONSISTENT ||
! pmState == PM_RUN ||
pmState == PM_WAIT_BACKUP ||
pmState == PM_SHUTDOWN)
pmState = PM_WAIT_BACKENDS;
***************
*** 2582,2587 **** LogChildExit(int lev, const char *procname, int pid, int exitstatus)
--- 2620,2746 ----
static void
PostmasterStateMachine(void)
{
+ /* Startup states */
+
+ if (pmState == PM_STARTUP && RecoveryStatus > NoRecovery)
+ {
+ /* WAL redo has started. We're out of reinitialization. */
+ FatalError = false;
+
+ /*
+ * Go to shutdown mode if a shutdown request was pending.
+ */
+ if (Shutdown > NoShutdown)
+ {
+ pmState = PM_WAIT_BACKENDS;
+ /* PostmasterStateMachine logic does the rest */
+ }
+ else
+ {
+ /*
+ * Crank up the background writer. It doesn't matter if this
+ * fails, we'll just try again later.
+ */
+ Assert(BgWriterPID == 0);
+ BgWriterPID = StartBackgroundWriter();
+
+ pmState = PM_RECOVERY;
+ }
+ }
+ if (pmState == PM_RECOVERY && RecoveryStatus >= RecoveryConsistent)
+ {
+ /*
+ * Go to shutdown mode if a shutdown request was pending.
+ */
+ if (Shutdown > NoShutdown)
+ {
+ pmState = PM_WAIT_BACKENDS;
+ /* PostmasterStateMachine logic does the rest */
+ }
+ else
+ {
+ /*
+ * Startup process has reached a consistent recovery point.
+ * Switch to consistent recovery phase.
+ */
+ pmState = PM_RECOVERY_CONSISTENT;
+
+ /*
+ * Load the flat authorization file into postmaster's cache. The
+ * startup process won't have recomputed this from the database yet,
+ * so it may change following recovery.
+ */
+ load_role();
+
+ /*
+ * Likewise, start other special children as needed.
+ */
+ Assert(PgStatPID == 0);
+ PgStatPID = pgstat_start();
+
+ /* XXX at this point we could accept read-only connections */
+ ereport(DEBUG1,
+ (errmsg("database system is in consistent recovery mode")));
+ }
+ }
+ if ((pmState == PM_RECOVERY ||
+ pmState == PM_RECOVERY_CONSISTENT ||
+ pmState == PM_STARTUP) &&
+ RecoveryStatus == RecoveryCompleted)
+ {
+ /*
+ * Startup succeeded.
+ *
+ * Go to shutdown mode if a shutdown request was pending.
+ */
+ if (Shutdown > NoShutdown)
+ {
+ pmState = PM_WAIT_BACKENDS;
+ /* PostmasterStateMachine logic does the rest */
+ }
+ else
+ {
+ /*
+ * Otherwise, commence normal operations.
+ */
+ pmState = PM_RUN;
+
+ /*
+ * Load the flat authorization file into postmaster's cache. The
+ * startup process has recomputed this from the database contents,
+ * so we wait till it finishes before loading it.
+ */
+ load_role();
+
+ /*
+ * Crank up the background writer, if we didn't do that already
+ * when we entered consistent recovery phase. It doesn't matter
+ * if this fails, we'll just try again later.
+ */
+ if (BgWriterPID == 0)
+ BgWriterPID = StartBackgroundWriter();
+
+ /*
+ * Likewise, start other special children as needed. In a restart
+ * situation, some of them may be alive already.
+ */
+ if (WalWriterPID == 0)
+ WalWriterPID = StartWalWriter();
+ if (AutoVacuumingActive() && AutoVacPID == 0)
+ AutoVacPID = StartAutoVacLauncher();
+ if (XLogArchivingActive() && PgArchPID == 0)
+ PgArchPID = pgarch_start();
+ if (PgStatPID == 0)
+ PgStatPID = pgstat_start();
+
+ /* at this point we are really open for business */
+ ereport(LOG,
+ (errmsg("database system is ready to accept connections")));
+ }
+ }
+
+ /* Shutdown states */
+
if (pmState == PM_WAIT_BACKUP)
{
/*
***************
*** 2723,2728 **** PostmasterStateMachine(void)
--- 2882,2896 ----
}
/*
+ * If recovery failed, wait for all non-syslogger children to exit,
+ * and then exit postmaster. We don't try to reinitialize when recovery
+ * fails, because more than likely it will just fail again and we will
+ * keep trying forever.
+ */
+ if (RecoveryError && pmState == PM_NO_CHILDREN)
+ ExitPostmaster(1);
+
+ /*
* If we need to recover from a crash, wait for all non-syslogger
* children to exit, then reset shmem and StartupDataBase.
*/
***************
*** 2734,2739 **** PostmasterStateMachine(void)
--- 2902,2909 ----
shmem_exit(1);
reset_shared(PostPortNumber);
+ RecoveryStatus = NoRecovery;
+
StartupPID = StartupDataBase();
Assert(StartupPID != 0);
pmState = PM_STARTUP;
***************
*** 3838,3843 **** ExitPostmaster(int status)
--- 4008,4044 ----
}
/*
+ * common code used in sigusr1_handler() and reaper() to handle
+ * recovery-related signals from startup process
+ */
+ static void
+ CheckRecoverySignals(void)
+ {
+ bool changed = false;
+
+ if (CheckPostmasterSignal(PMSIGNAL_RECOVERY_STARTED))
+ {
+ Assert(pmState == PM_STARTUP);
+
+ RecoveryStatus = RecoveryStarted;
+ changed = true;
+ }
+ if (CheckPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT))
+ {
+ RecoveryStatus = RecoveryConsistent;
+ changed = true;
+ }
+ if (CheckPostmasterSignal(PMSIGNAL_RECOVERY_COMPLETED))
+ {
+ RecoveryStatus = RecoveryCompleted;
+ changed = true;
+ }
+
+ if (changed)
+ PostmasterStateMachine();
+ }
+
+ /*
* sigusr1_handler - handle signal conditions from child processes
*/
static void
***************
*** 3847,3852 **** sigusr1_handler(SIGNAL_ARGS)
--- 4048,4055 ----
PG_SETMASK(&BlockSig);
+ CheckRecoverySignals();
+
if (CheckPostmasterSignal(PMSIGNAL_PASSWORD_CHANGE))
{
/*
*** a/src/backend/storage/buffer/README
--- b/src/backend/storage/buffer/README
***************
*** 268,270 **** out (and anyone else who flushes buffer contents to disk must do so too).
--- 268,279 ----
This ensures that the page image transferred to disk is reasonably consistent.
We might miss a hint-bit update or two but that isn't a problem, for the same
reasons mentioned under buffer access rules.
+
+ As of 8.4, the background writer starts during recovery mode when there
+ is some form of potentially extended recovery to perform. It performs
+ the same service as in normal processing, except that the checkpoints
+ it writes are technically restartpoints. Flushing outstanding WAL for
+ dirty buffers is also skipped, though there shouldn't ever be new WAL
+ entries at that time in any case. We could choose to start the
+ background writer immediately, but we hold off until we can prove that
+ the database is in a consistent state, so that the postmaster has a
+ single, clean state change.
*** a/src/backend/utils/init/postinit.c
--- b/src/backend/utils/init/postinit.c
***************
*** 324,330 **** InitCommunication(void)
* If you're wondering why this is separate from InitPostgres at all:
* the critical distinction is that this stuff has to happen before we can
* run XLOG-related initialization, which is done before InitPostgres --- in
! * fact, for cases such as checkpoint creation processes, InitPostgres may
* never be done at all.
*/
void
--- 324,330 ----
* If you're wondering why this is separate from InitPostgres at all:
* the critical distinction is that this stuff has to happen before we can
* run XLOG-related initialization, which is done before InitPostgres --- in
! * fact, for cases such as the background writer process, InitPostgres may
* never be done at all.
*/
void
*** a/src/include/access/xlog.h
--- b/src/include/access/xlog.h
***************
*** 133,139 **** typedef struct XLogRecData
} XLogRecData;
extern TimeLineID ThisTimeLineID; /* current TLI */
! extern bool InRecovery;
extern XLogRecPtr XactLastRecEnd;
/* these variables are GUC parameters related to XLOG */
--- 133,148 ----
} XLogRecData;
extern TimeLineID ThisTimeLineID; /* current TLI */
!
! /*
! * Prior to 8.4, all activity during recovery was carried out by the
! * Startup process. This local variable continues to be used in many
! * parts of the code to indicate actions taken by RecoveryManagers.
! * Other processes that potentially perform work during recovery
! * should check IsRecoveryProcessingMode(); see the XLogCtl notes in
! * xlog.c.
! */
! extern bool InRecovery;
!
extern XLogRecPtr XactLastRecEnd;
/* these variables are GUC parameters related to XLOG */
***************
*** 199,204 **** extern void RestoreBkpBlocks(XLogRecPtr lsn, XLogRecord *record, bool cleanup);
--- 208,215 ----
extern void xlog_redo(XLogRecPtr lsn, XLogRecord *record);
extern void xlog_desc(StringInfo buf, uint8 xl_info, char *rec);
+ extern bool IsRecoveryProcessingMode(void);
+
extern void UpdateControlFile(void);
extern Size XLOGShmemSize(void);
extern void XLOGShmemInit(void);
***************
*** 207,215 **** extern void StartupXLOG(void);
--- 218,229 ----
extern void ShutdownXLOG(int code, Datum arg);
extern void InitXLOGAccess(void);
extern void CreateCheckPoint(int flags);
+ extern bool CreateRestartPoint(int flags);
extern void XLogPutNextOid(Oid nextOid);
extern XLogRecPtr GetRedoRecPtr(void);
extern XLogRecPtr GetInsertRecPtr(void);
extern void GetNextXidAndEpoch(TransactionId *xid, uint32 *epoch);
+ extern void StartupProcessMain(void);
+
#endif /* XLOG_H */
*** a/src/include/storage/pmsignal.h
--- b/src/include/storage/pmsignal.h
***************
*** 22,27 ****
--- 22,30 ----
*/
typedef enum
{
+ PMSIGNAL_RECOVERY_STARTED, /* recovery has started */
+ PMSIGNAL_RECOVERY_CONSISTENT, /* recovery has reached consistent state */
+ PMSIGNAL_RECOVERY_COMPLETED, /* recovery completed */
PMSIGNAL_PASSWORD_CHANGE, /* pg_auth file has changed */
PMSIGNAL_WAKEN_ARCHIVER, /* send a NOTIFY signal to xlog archiver */
PMSIGNAL_ROTATE_LOGFILE, /* send SIGUSR1 to syslogger to rotate logfile */
В списке pgsql-hackers по дате отправления: