Re: Hot standby, recovery infra

Поиск
Список
Период
Сортировка
От	Heikki Linnakangas
Тема	Re: Hot standby, recovery infra
Дата	9 февраля 2009 г. 11:13:22
Msg-id	499047FE.9090407@enterprisedb.com обсуждение исходный текст
Ответ на	Re: Hot standby, recovery infra (Simon Riggs <simon@2ndQuadrant.com>)
Ответы	Re: Hot standby, recovery infra
Список	pgsql-hackers
Дерево обсуждения
Simon Riggs wrote:
> On Fri, 2009-02-06 at 10:06 +0200, Heikki Linnakangas wrote:
>> Simon Riggs wrote:
>>> On Thu, 2009-02-05 at 21:54 +0200, Heikki Linnakangas wrote:
>>>> - If you perform a fast shutdown while startup process is waiting for
>>>> the restore command, startup process sometimes throws a FATAL error
>>>> which escalates into an immediate shutdown. That leads to
>>>> different messages in the logs, and skipping of the shutdown
>>>> restartpoint that we now otherwise perform.
>>> Sometimes?
>> I think what happens is that if the restore command receives the SIGTERM
>> and dies before the startup process that's waiting for the restore
>> command receives the SIGTERM, the startup process throws a FATAL error
>> because the restore command died unexpectedly. I put this
>>
>>>     if (shutdown_requested && InRedo)
>>>     {
>>>         /* XXX: Is EndRecPtr always the right value here? */
>>>         UpdateMinRecoveryPoint(EndRecPtr);
>>>         proc_exit(0);
>>>     }
>> right after the "system(xlogRestoreCmd)" call, to exit gracefully if we
>> were requested to shut down while restore command was running, but it
>> seems that that's not enough because of the race condition.
>
> Can we trap the death of the restorecmd and handle it differently from
> the death of the startup process?

The startup process launches the restore command, so it's the startup
process that needs to handle its death.

Anyway, I think I've found a solution. While we're executing the restore
command, we're in a state where it's safe to proc_exit(0). We can set a
flag to indicate to the signal handler when we're executing the restore
command, so that the signal handler can do proc_exit(0) on SIGTERM. So
if the startup process receives the SIGTERM first, it will proc_exit(0)
immediately, and if the restore command dies first due to the SIGTERM,
startup process exits with proc_exit(0) when it sees that restore
command exited because of the SIGTERM. If either process receives
SIGTERM for some other reason than a fast shutdown request, postmaster
will see that the startup process exited unexpectedly, and handle that
like a child process crash.

Attached is an updated patch that does that, and I've fixed all the
other outstanding issues I listed earlier as well. Now I'm feeling again
that this is in pretty good shape.

--
   Heikki Linnakangas
   EnterpriseDB   http://www.enterprisedb.com
*** a/src/backend/access/transam/xlog.c
--- b/src/backend/access/transam/xlog.c
***************
*** 36,41 ****
--- 36,42 ----
  #include "catalog/pg_control.h"
  #include "catalog/pg_type.h"
  #include "funcapi.h"
+ #include "libpq/pqsignal.h"
  #include "miscadmin.h"
  #include "pgstat.h"
  #include "postmaster/bgwriter.h"
***************
*** 47,52 ****
--- 48,54 ----
  #include "storage/smgr.h"
  #include "storage/spin.h"
  #include "utils/builtins.h"
+ #include "utils/flatfiles.h"
  #include "utils/guc.h"
  #include "utils/ps_status.h"
  #include "pg_trace.h"
***************
*** 119,130 **** CheckpointStatsData CheckpointStats;
   */
  TimeLineID    ThisTimeLineID = 0;

! /* Are we doing recovery from XLOG? */
  bool        InRecovery = false;

  /* Are we recovering using offline XLOG archives? */
  static bool InArchiveRecovery = false;

  /* Was the last xlog file restored from archive, or local? */
  static bool restoredFromArchive = false;

--- 121,146 ----
   */
  TimeLineID    ThisTimeLineID = 0;

! /*
!  * Are we doing recovery from XLOG?
!  *
!  * This is only ever true in the startup process, when it's replaying WAL.
!  * It's used in functions that need to act differently when called from a
!  * redo function (e.g skip WAL logging).  To check whether the system is in
!  * recovery regardless of what process you're running in, use
!  * IsRecoveryProcessingMode().
!  */
  bool        InRecovery = false;

  /* Are we recovering using offline XLOG archives? */
  static bool InArchiveRecovery = false;

+ /*
+  * Local copy of shared RecoveryProcessingMode variable. True actually
+  * means "not known, need to check the shared state"
+  */
+ static bool LocalRecoveryProcessingMode = true;
+
  /* Was the last xlog file restored from archive, or local? */
  static bool restoredFromArchive = false;

***************
*** 133,139 **** static char *recoveryRestoreCommand = NULL;
  static bool recoveryTarget = false;
  static bool recoveryTargetExact = false;
  static bool recoveryTargetInclusive = true;
- static bool recoveryLogRestartpoints = false;
  static TransactionId recoveryTargetXid;
  static TimestampTz recoveryTargetTime;
  static TimestampTz recoveryLastXTime = 0;
--- 149,154 ----
***************
*** 242,250 **** static XLogRecPtr RedoRecPtr;
   * ControlFileLock: must be held to read/update control file or create
   * new log file.
   *
!  * CheckpointLock: must be held to do a checkpoint (ensures only one
!  * checkpointer at a time; currently, with all checkpoints done by the
!  * bgwriter, this is just pro forma).
   *
   *----------
   */
--- 257,264 ----
   * ControlFileLock: must be held to read/update control file or create
   * new log file.
   *
!  * CheckpointLock: must be held to do a checkpoint or restartpoint (ensures
!  * only one checkpointer at a time)
   *
   *----------
   */
***************
*** 313,318 **** typedef struct XLogCtlData
--- 327,351 ----
      int            XLogCacheBlck;    /* highest allocated xlog buffer index */
      TimeLineID    ThisTimeLineID;

+     /*
+      * SharedRecoveryProcessingMode indicates if we're still in crash or
+      * archive recovery.  It's checked by IsRecoveryProcessingMode().
+      */
+     bool        SharedRecoveryProcessingMode;
+
+     /*
+      * During recovery, we keep a copy of the latest checkpoint record
+      * here.  Used by the background writer when it wants to create
+      * a restartpoint.
+      *
+      * Protected by info_lck.
+      */
+     XLogRecPtr    lastCheckPointRecPtr;
+     CheckPoint    lastCheckPoint;
+
+     /* end+1 of the last record replayed (or being replayed) */
+     XLogRecPtr    replayEndRecPtr;
+
      slock_t        info_lck;        /* locks shared variables shown above */
  } XLogCtlData;

***************
*** 387,395 **** static XLogRecPtr ReadRecPtr;    /* start of last record read */
--- 420,440 ----
  static XLogRecPtr EndRecPtr;    /* end+1 of last record read */
  static XLogRecord *nextRecord = NULL;
  static TimeLineID lastPageTLI = 0;
+ static XLogRecPtr minRecoveryPoint; /* local copy of ControlFile->minRecoveryPoint */
+ static bool    updateMinRecoveryPoint = true;

  static bool InRedo = false;

+ /*
+  * Flag set by interrupt handlers for later service in the redo loop.
+  */
+ static volatile sig_atomic_t shutdown_requested = false;
+ /*
+  * Flag set when executing a restore command, to tell SIGTERM signal handler
+  * that it's safe to just proc_exit(0).
+  */
+ static volatile sig_atomic_t in_restore_command = false;
+

  static void XLogArchiveNotify(const char *xlog);
  static void XLogArchiveNotifySeg(uint32 log, uint32 seg);
***************
*** 420,425 **** static void PreallocXlogFiles(XLogRecPtr endptr);
--- 465,471 ----
  static void RemoveOldXlogFiles(uint32 log, uint32 seg, XLogRecPtr endptr);
  static void ValidateXLOGDirectoryStructure(void);
  static void CleanupBackupHistory(void);
+ static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
  static XLogRecord *ReadRecord(XLogRecPtr *RecPtr, int emode);
  static bool ValidXLOGHeader(XLogPageHeader hdr, int emode);
  static XLogRecord *ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt);
***************
*** 484,489 **** XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
--- 530,539 ----
      bool        doPageWrites;
      bool        isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);

+     /* cross-check on whether we should be here or not */
+     if (IsRecoveryProcessingMode())
+         elog(FATAL, "cannot make new WAL entries during recovery");
+
      /* info's high bits are reserved for use by me */
      if (info & XLR_INFO_MASK)
          elog(PANIC, "invalid xlog info mask %02X", info);
***************
*** 1718,1723 **** XLogSetAsyncCommitLSN(XLogRecPtr asyncCommitLSN)
--- 1768,1830 ----
  }

  /*
+  * Advance minRecoveryPoint in control file.
+  *
+  * If we crash during recovery, we must reach this point again before the
+  * database is consistent.
+  *
+  * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
+  * is only updated if it's not already greater than or equal to 'lsn'.
+  */
+ static void
+ UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
+ {
+     /* Quick check using our local copy of the variable */
+     if (!updateMinRecoveryPoint || (!force && XLByteLE(lsn, minRecoveryPoint)))
+         return;
+
+     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+
+     /* update local copy */
+     minRecoveryPoint = ControlFile->minRecoveryPoint;
+
+     /*
+      * An invalid minRecoveryPoint means that we need to recover all the WAL,
+      * ie. crash recovery. Don't update the control file in that case.
+      */
+     if (minRecoveryPoint.xlogid == 0 && minRecoveryPoint.xrecoff == 0)
+         updateMinRecoveryPoint = false;
+     else if (force || XLByteLT(minRecoveryPoint, lsn))
+     {
+         /* use volatile pointer to prevent code rearrangement */
+         volatile XLogCtlData *xlogctl = XLogCtl;
+         XLogRecPtr newMinRecoveryPoint;
+
+         /*
+          * To avoid having to update the control file too often, we update
+          * it all the way to the last record being replayed, even though 'lsn'
+          * would suffice for correctness.
+          */
+         SpinLockAcquire(&xlogctl->info_lck);
+         newMinRecoveryPoint = xlogctl->replayEndRecPtr;
+         SpinLockRelease(&xlogctl->info_lck);
+
+         /* update control file */
+         if (XLByteLT(ControlFile->minRecoveryPoint, newMinRecoveryPoint))
+         {
+             ControlFile->minRecoveryPoint = newMinRecoveryPoint;
+             UpdateControlFile();
+             minRecoveryPoint = newMinRecoveryPoint;
+         }
+
+         ereport(DEBUG2,
+                 (errmsg("updated min recovery point to %X/%X",
+                         minRecoveryPoint.xlogid, minRecoveryPoint.xrecoff)));
+     }
+     LWLockRelease(ControlFileLock);
+ }
+
+ /*
   * Ensure that all XLOG data through the given position is flushed to disk.
   *
   * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
***************
*** 1729,1737 **** XLogFlush(XLogRecPtr record)
      XLogRecPtr    WriteRqstPtr;
      XLogwrtRqst WriteRqst;

!     /* Disabled during REDO */
!     if (InRedo)
          return;

      /* Quick exit if already known flushed */
      if (XLByteLE(record, LogwrtResult.Flush))
--- 1836,1850 ----
      XLogRecPtr    WriteRqstPtr;
      XLogwrtRqst WriteRqst;

!     /*
!      * During REDO, we don't try to flush the WAL, but update minRecoveryPoint
!      * instead.
!      */
!     if (IsRecoveryProcessingMode())
!     {
!         UpdateMinRecoveryPoint(record, false);
          return;
+     }

      /* Quick exit if already known flushed */
      if (XLByteLE(record, LogwrtResult.Flush))
***************
*** 1818,1826 **** XLogFlush(XLogRecPtr record)
       * the bad page is encountered again during recovery then we would be
       * unable to restart the database at all!  (This scenario has actually
       * happened in the field several times with 7.1 releases. Note that we
!      * cannot get here while InRedo is true, but if the bad page is brought in
!      * and marked dirty during recovery then CreateCheckPoint will try to
!      * flush it at the end of recovery.)
       *
       * The current approach is to ERROR under normal conditions, but only
       * WARNING during recovery, so that the system can be brought up even if
--- 1931,1939 ----
       * the bad page is encountered again during recovery then we would be
       * unable to restart the database at all!  (This scenario has actually
       * happened in the field several times with 7.1 releases. Note that we
!      * cannot get here while IsRecoveryProcessingMode(), but if the bad page is
!      * brought in and marked dirty during recovery then if a checkpoint were
!      * performed at the end of recovery it will try to flush it.
       *
       * The current approach is to ERROR under normal conditions, but only
       * WARNING during recovery, so that the system can be brought up even if
***************
*** 1857,1862 **** XLogBackgroundFlush(void)
--- 1970,1979 ----
      XLogRecPtr    WriteRqstPtr;
      bool        flexible = true;

+     /* XLOG doesn't need flushing during recovery */
+     if (IsRecoveryProcessingMode())
+         return;
+
      /* read LogwrtResult and update local state */
      {
          /* use volatile pointer to prevent code rearrangement */
***************
*** 1928,1933 **** XLogAsyncCommitFlush(void)
--- 2045,2054 ----
      /* use volatile pointer to prevent code rearrangement */
      volatile XLogCtlData *xlogctl = XLogCtl;

+     /* There are no asynchronously committed transactions during recovery */
+     if (IsRecoveryProcessingMode())
+         return;
+
      SpinLockAcquire(&xlogctl->info_lck);
      WriteRqstPtr = xlogctl->asyncCommitLSN;
      SpinLockRelease(&xlogctl->info_lck);
***************
*** 1944,1949 **** XLogAsyncCommitFlush(void)
--- 2065,2074 ----
  bool
  XLogNeedsFlush(XLogRecPtr record)
  {
+     /* XLOG doesn't need flushing during recovery */
+     if (IsRecoveryProcessingMode())
+         return false;
+
      /* Quick exit if already known flushed */
      if (XLByteLE(record, LogwrtResult.Flush))
          return false;
***************
*** 2619,2627 **** RestoreArchivedFile(char *path, const char *xlogfname,
--- 2744,2765 ----
                               xlogRestoreCmd)));

      /*
+      * Set in_restore_command to tell the signal handler that we should exit
+      * right away on SIGTERM. We know that we're in a safe point to do that.
+      * Check if we had already received the signal, so that we don't miss
+      * a shutdown request received just before this.
+      */
+     in_restore_command = true;
+     if (shutdown_requested)
+         proc_exit(0);
+
+     /*
       * Copy xlog from archival storage to XLOGDIR
       */
      rc = system(xlogRestoreCmd);
+
+     in_restore_command = false;
+
      if (rc == 0)
      {
          /*
***************
*** 2674,2687 **** RestoreArchivedFile(char *path, const char *xlogfname,
       * assume that recovery is complete and start up the database!) It's
       * essential to abort on child SIGINT and SIGQUIT, because per spec
       * system() ignores SIGINT and SIGQUIT while waiting; if we see one of
!      * those it's a good bet we should have gotten it too.  Aborting on other
!      * signals such as SIGTERM seems a good idea as well.
       *
       * Per the Single Unix Spec, shells report exit status > 128 when a called
       * command died on a signal.  Also, 126 and 127 are used to report
       * problems such as an unfindable command; treat those as fatal errors
       * too.
       */
      signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125;

      ereport(signaled ? FATAL : DEBUG2,
--- 2812,2835 ----
       * assume that recovery is complete and start up the database!) It's
       * essential to abort on child SIGINT and SIGQUIT, because per spec
       * system() ignores SIGINT and SIGQUIT while waiting; if we see one of
!      * those it's a good bet we should have gotten it too.
!      *
!      * On SIGTERM, assume we have received a fast shutdown request, and exit
!      * cleanly. It's pure chance whether we receive the SIGTERM first, or the
!      * child process. If we receive it first, the signal handler will call
!      * proc_exit(0), otherwise we do it here. If we or the child process
!      * received SIGTERM for any other reason than a fast shutdown request,
!      * postmaster will perform an immediate shutdown when it sees us exiting
!      * unexpectedly.
       *
       * Per the Single Unix Spec, shells report exit status > 128 when a called
       * command died on a signal.  Also, 126 and 127 are used to report
       * problems such as an unfindable command; treat those as fatal errors
       * too.
       */
+     if (WTERMSIG(rc) == SIGTERM)
+         proc_exit(0);
+
      signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125;

      ereport(signaled ? FATAL : DEBUG2,
***************
*** 4590,4607 **** readRecoveryCommandFile(void)
              ereport(LOG,
                      (errmsg("recovery_target_inclusive = %s", tok2)));
          }
-         else if (strcmp(tok1, "log_restartpoints") == 0)
-         {
-             /*
-              * does nothing if a recovery_target is not also set
-              */
-             if (!parse_bool(tok2, &recoveryLogRestartpoints))
-                   ereport(ERROR,
-                             (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
-                       errmsg("parameter \"log_restartpoints\" requires a Boolean value")));
-             ereport(LOG,
-                     (errmsg("log_restartpoints = %s", tok2)));
-         }
          else
              ereport(FATAL,
                      (errmsg("unrecognized recovery parameter \"%s\"",
--- 4738,4743 ----
***************
*** 4883,4889 **** StartupXLOG(void)
      XLogRecPtr    RecPtr,
                  LastRec,
                  checkPointLoc,
!                 minRecoveryLoc,
                  EndOfLog;
      uint32        endLogId;
      uint32        endLogSeg;
--- 5019,5025 ----
      XLogRecPtr    RecPtr,
                  LastRec,
                  checkPointLoc,
!                 backupStopLoc,
                  EndOfLog;
      uint32        endLogId;
      uint32        endLogSeg;
***************
*** 4891,4896 **** StartupXLOG(void)
--- 5027,5034 ----
      uint32        freespace;
      TransactionId oldestActiveXID;

+     XLogCtl->SharedRecoveryProcessingMode = true;
+
      /*
       * Read control file and check XLOG status looks valid.
       *
***************
*** 4970,4976 **** StartupXLOG(void)
                          recoveryTargetTLI,
                          ControlFile->checkPointCopy.ThisTimeLineID)));

!     if (read_backup_label(&checkPointLoc, &minRecoveryLoc))
      {
          /*
           * When a backup_label file is present, we want to roll forward from
--- 5108,5114 ----
                          recoveryTargetTLI,
                          ControlFile->checkPointCopy.ThisTimeLineID)));

!     if (read_backup_label(&checkPointLoc, &backupStopLoc))
      {
          /*
           * When a backup_label file is present, we want to roll forward from
***************
*** 5108,5118 **** StartupXLOG(void)
          ControlFile->prevCheckPoint = ControlFile->checkPoint;
          ControlFile->checkPoint = checkPointLoc;
          ControlFile->checkPointCopy = checkPoint;
!         if (minRecoveryLoc.xlogid != 0 || minRecoveryLoc.xrecoff != 0)
!             ControlFile->minRecoveryPoint = minRecoveryLoc;
          ControlFile->time = (pg_time_t) time(NULL);
          UpdateControlFile();

          /*
           * If there was a backup label file, it's done its job and the info
           * has now been propagated into pg_control.  We must get rid of the
--- 5246,5268 ----
          ControlFile->prevCheckPoint = ControlFile->checkPoint;
          ControlFile->checkPoint = checkPointLoc;
          ControlFile->checkPointCopy = checkPoint;
!         if (backupStopLoc.xlogid != 0 || backupStopLoc.xrecoff != 0)
!         {
!             if (XLByteLT(ControlFile->minRecoveryPoint, backupStopLoc))
!                 ControlFile->minRecoveryPoint = backupStopLoc;
!         }
          ControlFile->time = (pg_time_t) time(NULL);
+         /* No need to hold ControlFileLock yet, we aren't up far enough */
          UpdateControlFile();

+         /* update our local copy of minRecoveryPoint */
+         minRecoveryPoint = ControlFile->minRecoveryPoint;
+
+         /*
+          * Reset pgstat data, because it may be invalid after recovery.
+          */
+         pgstat_reset_all();
+
          /*
           * If there was a backup label file, it's done its job and the info
           * has now been propagated into pg_control.  We must get rid of the
***************
*** 5157,5168 **** StartupXLOG(void)
          {
              bool        recoveryContinue = true;
              bool        recoveryApply = true;
              ErrorContextCallback errcontext;

              InRedo = true;
!             ereport(LOG,
!                     (errmsg("redo starts at %X/%X",
!                             ReadRecPtr.xlogid, ReadRecPtr.xrecoff)));

              /*
               * main redo apply loop
--- 5307,5347 ----
          {
              bool        recoveryContinue = true;
              bool        recoveryApply = true;
+             bool        reachedMinRecoveryPoint = false;
              ErrorContextCallback errcontext;
+             /* use volatile pointer to prevent code rearrangement */
+             volatile XLogCtlData *xlogctl = XLogCtl;
+
+             /* Update shared replayEndRecPtr */
+             SpinLockAcquire(&xlogctl->info_lck);
+             xlogctl->replayEndRecPtr = ReadRecPtr;
+             SpinLockRelease(&xlogctl->info_lck);

              InRedo = true;
!
!             if (minRecoveryPoint.xlogid == 0 && minRecoveryPoint.xrecoff == 0)
!                 ereport(LOG,
!                         (errmsg("redo starts at %X/%X",
!                                 ReadRecPtr.xlogid, ReadRecPtr.xrecoff)));
!             else
!                 ereport(LOG,
!                         (errmsg("redo starts at %X/%X, consistency will be reached at %X/%X",
!                         ReadRecPtr.xlogid, ReadRecPtr.xrecoff,
!                         minRecoveryPoint.xlogid, minRecoveryPoint.xrecoff)));
!
!             /*
!              * Let postmaster know we've started redo now, so that it can
!              * launch bgwriter to perform restartpoints.  We don't bother
!              * during crash recovery as restartpoints can only be performed
!              * during archive recovery.  And we'd like to keep crash recovery
+              * simple, to avoid introducing bugs that could prevent you from
!              * recovering after crash.
!              *
!              * After this point, we can no longer assume that we're the only
!              * process in addition to postmaster!
!              */
!             if (InArchiveRecovery && IsUnderPostmaster)
!                 SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);

              /*
               * main redo apply loop
***************
*** 5189,5194 **** StartupXLOG(void)
--- 5368,5397 ----
  #endif

                  /*
+                  * Check if we were requested to exit without finishing
+                  * recovery.
+                  */
+                 if (shutdown_requested)
+                     proc_exit(0);
+
+                 /*
+                  * Have we reached our safe starting point? If so, we can
+                  * tell postmaster that the database is consistent now.
+                  */
+                 if (!reachedMinRecoveryPoint &&
+                      XLByteLE(minRecoveryPoint, EndRecPtr))
+                 {
+                     reachedMinRecoveryPoint = true;
+                     if (InArchiveRecovery)
+                     {
+                         ereport(LOG,
+                                 (errmsg("consistent recovery state reached")));
+                         if (IsUnderPostmaster)
+                             SendPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT);
+                     }
+                 }
+
+                 /*
                   * Have we reached our recovery target?
                   */
                  if (recoveryStopsHere(record, &recoveryApply))
***************
*** 5213,5218 **** StartupXLOG(void)
--- 5416,5430 ----
                      TransactionIdAdvance(ShmemVariableCache->nextXid);
                  }

+                 /*
+                  * Update shared replayEndRecPtr before replaying this
+                  * record, so that XLogFlush will update minRecoveryPoint
+                  * correctly.
+                  */
+                 SpinLockAcquire(&xlogctl->info_lck);
+                 xlogctl->replayEndRecPtr = EndRecPtr;
+                 SpinLockRelease(&xlogctl->info_lck);
+
                  RmgrTable[record->xl_rmid].rm_redo(EndRecPtr, record);

                  /* Pop the error context stack */
***************
*** 5256,5269 **** StartupXLOG(void)
       * Complain if we did not roll forward far enough to render the backup
       * dump consistent.
       */
!     if (XLByteLT(EndOfLog, ControlFile->minRecoveryPoint))
      {
          if (reachedStopPoint)    /* stopped because of stop request */
              ereport(FATAL,
!                     (errmsg("requested recovery stop point is before end time of backup dump")));
          else    /* ran off end of WAL */
              ereport(FATAL,
!                     (errmsg("WAL ends before end time of backup dump")));
      }

      /*
--- 5468,5481 ----
       * Complain if we did not roll forward far enough to render the backup
       * dump consistent.
       */
!     if (InRecovery && XLByteLT(EndOfLog, minRecoveryPoint))
      {
          if (reachedStopPoint)    /* stopped because of stop request */
              ereport(FATAL,
!                     (errmsg("requested recovery stop point is before consistent recovery point")));
          else    /* ran off end of WAL */
              ereport(FATAL,
!                     (errmsg("WAL ends before consistent recovery point")));
      }

      /*
***************
*** 5358,5363 **** StartupXLOG(void)
--- 5570,5581 ----
      /* Pre-scan prepared transactions to find out the range of XIDs present */
      oldestActiveXID = PrescanPreparedTransactions();

+     /*
+      * Allow writing WAL for us, so that we can create a checkpoint record.
+      * But not yet for other backends!
+      */
+     LocalRecoveryProcessingMode = false;
+
      if (InRecovery)
      {
          int            rmid;
***************
*** 5378,5388 **** StartupXLOG(void)
          XLogCheckInvalidPages();

          /*
-          * Reset pgstat data, because it may be invalid after recovery.
-          */
-         pgstat_reset_all();
-
-         /*
           * Perform a checkpoint to update all our recovery activity to disk.
           *
           * Note that we write a shutdown checkpoint rather than an on-line
--- 5596,5601 ----
***************
*** 5404,5415 **** StartupXLOG(void)
       */
      InRecovery = false;

      ControlFile->state = DB_IN_PRODUCTION;
      ControlFile->time = (pg_time_t) time(NULL);
      UpdateControlFile();

      /* start the archive_timeout timer running */
!     XLogCtl->Write.lastSegSwitchTime = ControlFile->time;

      /* initialize shared-memory copy of latest checkpoint XID/epoch */
      XLogCtl->ckptXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
--- 5617,5630 ----
       */
      InRecovery = false;

+     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
      ControlFile->state = DB_IN_PRODUCTION;
      ControlFile->time = (pg_time_t) time(NULL);
      UpdateControlFile();
+     LWLockRelease(ControlFileLock);

      /* start the archive_timeout timer running */
!     XLogCtl->Write.lastSegSwitchTime = (pg_time_t) time(NULL);

      /* initialize shared-memory copy of latest checkpoint XID/epoch */
      XLogCtl->ckptXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
***************
*** 5444,5449 **** StartupXLOG(void)
--- 5659,5703 ----
          readRecordBuf = NULL;
          readRecordBufSize = 0;
      }
+
+     /*
+      * All done. Allow others to write WAL.
+      */
+     XLogCtl->SharedRecoveryProcessingMode = false;
+ }
+
+ /*
+  * Is the system still in recovery?
+  *
+  * As a side-effect, we initialize the local TimeLineID and RedoRecPtr
+  * variables the first time we see that recovery is finished.
+  */
+ bool
+ IsRecoveryProcessingMode(void)
+ {
+     /*
+      * We check shared state each time only until we leave recovery mode.
+      * We can't re-enter recovery, so we rely on the local state variable
+      * after that.
+      */
+     if (!LocalRecoveryProcessingMode)
+         return false;
+     else
+     {
+         /* use volatile pointer to prevent code rearrangement */
+         volatile XLogCtlData *xlogctl = XLogCtl;
+
+         LocalRecoveryProcessingMode = xlogctl->SharedRecoveryProcessingMode;
+
+         /*
+          * Initialize TimeLineID and RedoRecPtr the first time we see that
+          * recovery is finished.
+          */
+         if (!LocalRecoveryProcessingMode)
+             InitXLOGAccess();
+
+         return LocalRecoveryProcessingMode;
+     }
  }

  /*
***************
*** 5575,5580 **** InitXLOGAccess(void)
--- 5829,5836 ----
  {
      /* ThisTimeLineID doesn't change so we need no lock to copy it */
      ThisTimeLineID = XLogCtl->ThisTimeLineID;
+     Assert(ThisTimeLineID != 0);
+
      /* Use GetRedoRecPtr to copy the RedoRecPtr safely */
      (void) GetRedoRecPtr();
  }
***************
*** 5686,5692 **** ShutdownXLOG(int code, Datum arg)
      ereport(LOG,
              (errmsg("shutting down")));

!     CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
      ShutdownCLOG();
      ShutdownSUBTRANS();
      ShutdownMultiXact();
--- 5942,5951 ----
      ereport(LOG,
              (errmsg("shutting down")));

!     if (IsRecoveryProcessingMode())
!         CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
!     else
!         CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
      ShutdownCLOG();
      ShutdownSUBTRANS();
      ShutdownMultiXact();
***************
*** 5699,5707 **** ShutdownXLOG(int code, Datum arg)
   * Log start of a checkpoint.
   */
  static void
! LogCheckpointStart(int flags)
  {
!     elog(LOG, "checkpoint starting:%s%s%s%s%s%s",
           (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
           (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
           (flags & CHECKPOINT_FORCE) ? " force" : "",
--- 5958,5977 ----
   * Log start of a checkpoint.
   */
  static void
! LogCheckpointStart(int flags, bool restartpoint)
  {
!     char *msg;
!
!     /*
!      * XXX: This is hopelessly untranslatable. We could call gettext_noop
!      * for the main message, but what about all the flags?
!      */
!     if (restartpoint)
!         msg = "restartpoint starting:%s%s%s%s%s%s";
!     else
!         msg = "checkpoint starting:%s%s%s%s%s%s";
!
!     elog(LOG, msg,
           (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
           (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
           (flags & CHECKPOINT_FORCE) ? " force" : "",
***************
*** 5714,5720 **** LogCheckpointStart(int flags)
   * Log end of a checkpoint.
   */
  static void
! LogCheckpointEnd(void)
  {
      long        write_secs,
                  sync_secs,
--- 5984,5990 ----
   * Log end of a checkpoint.
   */
  static void
! LogCheckpointEnd(bool restartpoint)
  {
      long        write_secs,
                  sync_secs,
***************
*** 5737,5753 **** LogCheckpointEnd(void)
                          CheckpointStats.ckpt_sync_end_t,
                          &sync_secs, &sync_usecs);

!     elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
!          "%d transaction log file(s) added, %d removed, %d recycled; "
!          "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
!          CheckpointStats.ckpt_bufs_written,
!          (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
!          CheckpointStats.ckpt_segs_added,
!          CheckpointStats.ckpt_segs_removed,
!          CheckpointStats.ckpt_segs_recycled,
!          write_secs, write_usecs / 1000,
!          sync_secs, sync_usecs / 1000,
!          total_secs, total_usecs / 1000);
  }

  /*
--- 6007,6032 ----
                          CheckpointStats.ckpt_sync_end_t,
                          &sync_secs, &sync_usecs);

!     if (restartpoint)
!         elog(LOG, "restartpoint complete: wrote %d buffers (%.1f%%); "
!              "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
!              CheckpointStats.ckpt_bufs_written,
!              (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
!              write_secs, write_usecs / 1000,
!              sync_secs, sync_usecs / 1000,
!              total_secs, total_usecs / 1000);
!     else
!         elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
!              "%d transaction log file(s) added, %d removed, %d recycled; "
!              "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
!              CheckpointStats.ckpt_bufs_written,
!              (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
!              CheckpointStats.ckpt_segs_added,
!              CheckpointStats.ckpt_segs_removed,
!              CheckpointStats.ckpt_segs_recycled,
!              write_secs, write_usecs / 1000,
!              sync_secs, sync_usecs / 1000,
!              total_secs, total_usecs / 1000);
  }

  /*
***************
*** 5778,5790 **** CreateCheckPoint(int flags)
      TransactionId *inCommitXids;
      int            nInCommit;

      /*
       * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
!      * (This is just pro forma, since in the present system structure there is
!      * only one process that is allowed to issue checkpoints at any given
!      * time.)
       */
!     LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);

      /*
       * Prepare to accumulate statistics.
--- 6057,6089 ----
      TransactionId *inCommitXids;
      int            nInCommit;

+     /* shouldn't happen */
+     if (IsRecoveryProcessingMode())
+         elog(ERROR, "can't create a checkpoint during recovery");
+
      /*
       * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
!      * During normal operation, bgwriter is the only process that creates
!      * checkpoints, but at the end of archive recovery, the bgwriter can be
!      * busy creating a restartpoint while the startup process tries to perform
!      * the startup checkpoint.
       */
!     if (!LWLockConditionalAcquire(CheckpointLock, LW_EXCLUSIVE))
!     {
!         Assert(InRecovery);
!
!         /*
!          * A restartpoint is in progress. Wait until it finishes. This can
!          * cause an extra restartpoint to be performed, but that's OK because
!          * we're just about to perform a checkpoint anyway. Flushing the
!          * buffers in this restartpoint can take some time, but that time is
!          * saved from the upcoming checkpoint so the net effect is zero.
!          */
!         ereport(DEBUG2, (errmsg("hurrying in-progress restartpoint")));
!         RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_WAIT);
!
!         LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
!     }

      /*
       * Prepare to accumulate statistics.
***************
*** 5803,5811 **** CreateCheckPoint(int flags)
--- 6102,6112 ----

      if (shutdown)
      {
+         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
          ControlFile->state = DB_SHUTDOWNING;
          ControlFile->time = (pg_time_t) time(NULL);
          UpdateControlFile();
+         LWLockRelease(ControlFileLock);
      }

      /*
***************
*** 5909,5915 **** CreateCheckPoint(int flags)
       * to log anything if we decided to skip the checkpoint.
       */
      if (log_checkpoints)
!         LogCheckpointStart(flags);

      TRACE_POSTGRESQL_CHECKPOINT_START(flags);

--- 6210,6216 ----
       * to log anything if we decided to skip the checkpoint.
       */
      if (log_checkpoints)
!         LogCheckpointStart(flags, false);

      TRACE_POSTGRESQL_CHECKPOINT_START(flags);

***************
*** 6076,6082 **** CreateCheckPoint(int flags)

      /* All real work is done, but log before releasing lock. */
      if (log_checkpoints)
!         LogCheckpointEnd();

          TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
                                  NBuffers, CheckpointStats.ckpt_segs_added,
--- 6377,6383 ----

      /* All real work is done, but log before releasing lock. */
      if (log_checkpoints)
!         LogCheckpointEnd(false);

          TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
                                  NBuffers, CheckpointStats.ckpt_segs_added,
***************
*** 6104,6135 **** CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
  }

  /*
!  * Set a recovery restart point if appropriate
!  *
!  * This is similar to CreateCheckPoint, but is used during WAL recovery
!  * to establish a point from which recovery can roll forward without
!  * replaying the entire recovery log.  This function is called each time
!  * a checkpoint record is read from XLOG; it must determine whether a
!  * restartpoint is needed or not.
   */
  static void
  RecoveryRestartPoint(const CheckPoint *checkPoint)
  {
-     int            elapsed_secs;
      int            rmid;
!
!     /*
!      * Do nothing if the elapsed time since the last restartpoint is less than
!      * half of checkpoint_timeout.    (We use a value less than
!      * checkpoint_timeout so that variations in the timing of checkpoints on
!      * the master, or speed of transmission of WAL segments to a slave, won't
!      * make the slave skip a restartpoint once it's synced with the master.)
!      * Checking true elapsed time keeps us from doing restartpoints too often
!      * while rapidly scanning large amounts of WAL.
!      */
!     elapsed_secs = (pg_time_t) time(NULL) - ControlFile->time;
!     if (elapsed_secs < CheckPointTimeout / 2)
!         return;

      /*
       * Is it safe to checkpoint?  We must ask each of the resource managers
--- 6405,6421 ----
  }

  /*
!  * This is used during WAL recovery to establish a point from which recovery
!  * can roll forward without replaying the entire recovery log.  This function
!  * is called each time a checkpoint record is read from XLOG. The record is
!  * stored in shared memory, so that it can be used as a restartpoint later on.
   */
  static void
  RecoveryRestartPoint(const CheckPoint *checkPoint)
  {
      int            rmid;
!     /* use volatile pointer to prevent code rearrangement */
!     volatile XLogCtlData *xlogctl = XLogCtl;

      /*
       * Is it safe to checkpoint?  We must ask each of the resource managers
***************
*** 6151,6178 **** RecoveryRestartPoint(const CheckPoint *checkPoint)
      }

      /*
!      * OK, force data out to disk
       */
!     CheckPointGuts(checkPoint->redo, CHECKPOINT_IMMEDIATE);

      /*
!      * Update pg_control so that any subsequent crash will restart from this
!      * checkpoint.    Note: ReadRecPtr gives the XLOG address of the checkpoint
!      * record itself.
       */
      ControlFile->prevCheckPoint = ControlFile->checkPoint;
!     ControlFile->checkPoint = ReadRecPtr;
!     ControlFile->checkPointCopy = *checkPoint;
      ControlFile->time = (pg_time_t) time(NULL);
      UpdateControlFile();

!     ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
              (errmsg("recovery restart point at %X/%X",
!                     checkPoint->redo.xlogid, checkPoint->redo.xrecoff)));
      if (recoveryLastXTime)
!         ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
!                 (errmsg("last completed transaction was at log time %s",
!                         timestamptz_to_str(recoveryLastXTime))));
  }

  /*
--- 6437,6564 ----
      }

      /*
!      * Copy the checkpoint record to shared memory, so that bgwriter can
!      * use it the next time it wants to perform a restartpoint.
!      */
!     SpinLockAcquire(&xlogctl->info_lck);
!     XLogCtl->lastCheckPointRecPtr = ReadRecPtr;
!     memcpy(&XLogCtl->lastCheckPoint, checkPoint, sizeof(CheckPoint));
!     SpinLockRelease(&xlogctl->info_lck);
! }
!
! /*
!  * This is similar to CreateCheckPoint, but is used during WAL recovery
!  * to establish a point from which recovery can roll forward without
!  * replaying the entire recovery log.
!  *
!  * Returns true if a new restartpoint was established. We can only establish
!  * a restartpoint if we have replayed a checkpoint record since last
!  * restartpoint.
!  */
! bool
! CreateRestartPoint(int flags)
! {
!     XLogRecPtr lastCheckPointRecPtr;
!     CheckPoint lastCheckPoint;
!     /* use volatile pointer to prevent code rearrangement */
!     volatile XLogCtlData *xlogctl = XLogCtl;
!
!     /*
!      * Acquire CheckpointLock to ensure only one restartpoint or checkpoint
!      * happens at a time.
!      */
!     LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
!
!     /* Get a local copy of the last checkpoint record. */
!     SpinLockAcquire(&xlogctl->info_lck);
!     lastCheckPointRecPtr = xlogctl->lastCheckPointRecPtr;
!     memcpy(&lastCheckPoint, &XLogCtl->lastCheckPoint, sizeof(CheckPoint));
!     SpinLockRelease(&xlogctl->info_lck);
!
!     /*
!      * Check that we're still in recovery mode. It's ok if we exit recovery
!      * mode after this check, the restart point is valid anyway.
!      */
!     if (!IsRecoveryProcessingMode())
!     {
!         ereport(DEBUG2,
!                 (errmsg("skipping restartpoint, recovery has already ended")));
!         LWLockRelease(CheckpointLock);
!         return false;
!     }
!
!     /*
!      * If the last checkpoint record we've replayed is already our last
!      * restartpoint, we can't perform a new restart point. We still update
!      * minRecoveryPoint in that case, so that if this is a shutdown restart
!      * point, we won't start up earlier than before. That's not strictly
!      * necessary, but when we get hot standby capability, it would be rather
!      * weird if the database opened up for read-only connections at a
!      * point-in-time before the last shutdown. Such time travel is still
!      * possible in case of immediate shutdown, though.
!      *
!      * We don't explicitly advance minRecoveryPoint when we do create a
!      * restartpoint. It's assumed that flushing the buffers will do that
!      * as a side-effect.
       */
!     if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
!         XLByteLE(lastCheckPoint.redo, ControlFile->checkPointCopy.redo))
!     {
!         XLogRecPtr InvalidXLogRecPtr = {0, 0};
!         ereport(DEBUG2,
!                 (errmsg("skipping restartpoint, already performed at %X/%X",
!                         lastCheckPoint.redo.xlogid, lastCheckPoint.redo.xrecoff)));
!
!         UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
!         LWLockRelease(CheckpointLock);
!         return false;
!     }
!
!     if (log_checkpoints)
!     {
!         /*
!          * Prepare to accumulate statistics.
!          */
!         MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
!         CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
!
!         LogCheckpointStart(flags, true);
!     }
!
!     CheckPointGuts(lastCheckPoint.redo, flags);

      /*
!      * Update pg_control, using current time
       */
+     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
      ControlFile->prevCheckPoint = ControlFile->checkPoint;
!     ControlFile->checkPoint = lastCheckPointRecPtr;
!     ControlFile->checkPointCopy = lastCheckPoint;
      ControlFile->time = (pg_time_t) time(NULL);
      UpdateControlFile();
+     LWLockRelease(ControlFileLock);

!     /*
!      * Currently, there is no need to truncate pg_subtrans during recovery.
!      * If we did that, we would need to have called StartupSUBTRANS()
!      * already, and then TruncateSUBTRANS() would go here.
!      */
!
!     /* All real work is done, but log before releasing lock. */
!     if (log_checkpoints)
!         LogCheckpointEnd(true);
!
!     ereport((log_checkpoints ? LOG : DEBUG2),
              (errmsg("recovery restart point at %X/%X",
!                     lastCheckPoint.redo.xlogid, lastCheckPoint.redo.xrecoff)));
!
      if (recoveryLastXTime)
!         ereport((log_checkpoints ? LOG : DEBUG2),
!             (errmsg("last completed transaction was at log time %s",
!                     timestamptz_to_str(recoveryLastXTime))));
!
!     LWLockRelease(CheckpointLock);
!     return true;
  }

  /*
***************
*** 6238,6243 **** RequestXLogSwitch(void)
--- 6624,6632 ----

  /*
   * XLOG resource manager's routines
+  *
+  * Definitions of message info are in include/catalog/pg_control.h,
+  * though not all messages relate to control file processing.
   */
  void
  xlog_redo(XLogRecPtr lsn, XLogRecord *record)
***************
*** 6284,6292 **** xlog_redo(XLogRecPtr lsn, XLogRecord *record)
                                   (int) checkPoint.ThisTimeLineID))
                  ereport(PANIC,
                          (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
!                                 checkPoint.ThisTimeLineID, ThisTimeLineID)));
!             /* Following WAL records should be run with new TLI */
!             ThisTimeLineID = checkPoint.ThisTimeLineID;
          }

          RecoveryRestartPoint(&checkPoint);
--- 6673,6681 ----
                                   (int) checkPoint.ThisTimeLineID))
                  ereport(PANIC,
                          (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
!                                checkPoint.ThisTimeLineID, ThisTimeLineID)));
!            /* Following WAL records should be run with new TLI */
!            ThisTimeLineID = checkPoint.ThisTimeLineID;
          }

          RecoveryRestartPoint(&checkPoint);
***************
*** 7227,7229 **** CancelBackup(void)
--- 7616,7707 ----
      }
  }

+ /* ------------------------------------------------------
+  *  Startup Process main entry point and signal handlers
+  * ------------------------------------------------------
+  */
+
+ /*
+  * startupproc_quickdie() occurs when signalled SIGQUIT by the postmaster.
+  *
+  * Some backend has bought the farm,
+  * so we need to stop what we're doing and exit.
+  */
+ static void
+ startupproc_quickdie(SIGNAL_ARGS)
+ {
+     PG_SETMASK(&BlockSig);
+
+     /*
+      * DO NOT proc_exit() -- we're here because shared memory may be
+      * corrupted, so we don't want to try to clean up our transaction. Just
+      * nail the windows shut and get out of town.
+      *
+      * Note we do exit(2) not exit(0).    This is to force the postmaster into a
+      * system reset cycle if some idiot DBA sends a manual SIGQUIT to a random
+      * backend.  This is necessary precisely because we don't clean up our
+      * shared memory state.
+      */
+     exit(2);
+ }
+
+
+ /* SIGTERM: exit at once if in restore command, else set flag to abort redo */
+ static void
+ StartupProcShutdownHandler(SIGNAL_ARGS)
+ {
+     if (in_restore_command)
+         proc_exit(0);
+     else
+         shutdown_requested = true;
+ }
+
+ /* Main entry point for startup process */
+ void
+ StartupProcessMain(void)
+ {
+     /*
+      * If possible, make this process a group leader, so that the postmaster
+      * can signal any child processes too.
+      */
+ #ifdef HAVE_SETSID
+     if (setsid() < 0)
+         elog(FATAL, "setsid() failed: %m");
+ #endif
+
+     /*
+      * Properly accept or ignore signals the postmaster might send us
+      */
+     pqsignal(SIGHUP, SIG_IGN);    /* ignore config file updates */
+     pqsignal(SIGINT, SIG_IGN);        /* ignore query cancel */
+     pqsignal(SIGTERM, StartupProcShutdownHandler); /* request shutdown */
+     pqsignal(SIGQUIT, startupproc_quickdie);        /* hard crash time */
+     pqsignal(SIGALRM, SIG_IGN);
+     pqsignal(SIGPIPE, SIG_IGN);
+     pqsignal(SIGUSR1, SIG_IGN);
+     pqsignal(SIGUSR2, SIG_IGN);
+
+     /*
+      * Reset some signals that are accepted by postmaster but not here
+      */
+     pqsignal(SIGCHLD, SIG_DFL);
+     pqsignal(SIGTTIN, SIG_DFL);
+     pqsignal(SIGTTOU, SIG_DFL);
+     pqsignal(SIGCONT, SIG_DFL);
+     pqsignal(SIGWINCH, SIG_DFL);
+
+     /*
+      * Unblock signals (they were blocked when the postmaster forked us)
+      */
+     PG_SETMASK(&UnBlockSig);
+
+     StartupXLOG();
+
+     BuildFlatFiles(false);
+
+     /* Let postmaster know that startup is finished */
+     SendPostmasterSignal(PMSIGNAL_RECOVERY_COMPLETED);
+
+     /* exit normally */
+     proc_exit(0);
+ }
*** a/src/backend/bootstrap/bootstrap.c
--- b/src/backend/bootstrap/bootstrap.c
***************
*** 37,43 ****
  #include "storage/proc.h"
  #include "tcop/tcopprot.h"
  #include "utils/builtins.h"
- #include "utils/flatfiles.h"
  #include "utils/fmgroids.h"
  #include "utils/memutils.h"
  #include "utils/ps_status.h"
--- 37,42 ----
***************
*** 416,429 **** AuxiliaryProcessMain(int argc, char *argv[])
              proc_exit(1);        /* should never return */

          case StartupProcess:
!             bootstrap_signals();
!             StartupXLOG();
!             BuildFlatFiles(false);
!             proc_exit(0);        /* startup done */

          case BgWriterProcess:
              /* don't set signals, bgwriter has its own agenda */
-             InitXLOGAccess();
              BackgroundWriterMain();
              proc_exit(1);        /* should never return */

--- 415,426 ----
              proc_exit(1);        /* should never return */

          case StartupProcess:
!             /* don't set signals, startup process has its own agenda */
!             StartupProcessMain();
!             proc_exit(1);        /* should never return */

          case BgWriterProcess:
              /* don't set signals, bgwriter has its own agenda */
              BackgroundWriterMain();
              proc_exit(1);        /* should never return */

*** a/src/backend/postmaster/bgwriter.c
--- b/src/backend/postmaster/bgwriter.c
***************
*** 49,54 ****
--- 49,55 ----
  #include <unistd.h>

  #include "access/xlog_internal.h"
+ #include "catalog/pg_control.h"
  #include "libpq/pqsignal.h"
  #include "miscadmin.h"
  #include "pgstat.h"
***************
*** 197,202 **** BackgroundWriterMain(void)
--- 198,204 ----
  {
      sigjmp_buf    local_sigjmp_buf;
      MemoryContext bgwriter_context;
+     bool        BgWriterRecoveryMode = true;

      BgWriterShmem->bgwriter_pid = MyProcPid;
      am_bg_writer = true;
***************
*** 418,428 **** BackgroundWriterMain(void)
--- 420,446 ----
          }

          /*
+          * Check if we've exited recovery. We do this after determining
+          * whether to perform a checkpoint or not, to be sure that we
+          * perform a real checkpoint and not a restartpoint, if someone
+          * requested a checkpoint immediately after exiting recovery. And
+          * we must have the right TimeLineID when we perform a checkpoint;
+          * IsRecoveryProcessingMode() initializes that as a side-effect.
+          */
+          if (BgWriterRecoveryMode && !IsRecoveryProcessingMode())
+           {
+             elog(DEBUG1, "bgwriter changing from recovery to normal mode");
+             BgWriterRecoveryMode = false;
+         }
+
+         /*
           * Do a checkpoint if requested, otherwise do one cycle of
           * dirty-buffer writing.
           */
          if (do_checkpoint)
          {
+             bool    ckpt_performed = false;
+
              /* use volatile pointer to prevent code rearrangement */
              volatile BgWriterShmemStruct *bgs = BgWriterShmem;

***************
*** 444,450 **** BackgroundWriterMain(void)
               * implementation will not generate warnings caused by
               * CheckPointTimeout < CheckPointWarning.
               */
!             if ((flags & CHECKPOINT_CAUSE_XLOG) &&
                  elapsed_secs < CheckPointWarning)
                  ereport(LOG,
                          (errmsg("checkpoints are occurring too frequently (%d seconds apart)",
--- 462,469 ----
               * implementation will not generate warnings caused by
               * CheckPointTimeout < CheckPointWarning.
               */
!             if (!BgWriterRecoveryMode &&
!                 (flags & CHECKPOINT_CAUSE_XLOG) &&
                  elapsed_secs < CheckPointWarning)
                  ereport(LOG,
                          (errmsg("checkpoints are occurring too frequently (%d seconds apart)",
***************
*** 455,468 **** BackgroundWriterMain(void)
               * Initialize bgwriter-private variables used during checkpoint.
               */
              ckpt_active = true;
!             ckpt_start_recptr = GetInsertRecPtr();
              ckpt_start_time = now;
              ckpt_cached_elapsed = 0;

              /*
               * Do the checkpoint.
               */
!             CreateCheckPoint(flags);

              /*
               * After any checkpoint, close all smgr files.    This is so we
--- 474,494 ----
               * Initialize bgwriter-private variables used during checkpoint.
               */
              ckpt_active = true;
!             if (!BgWriterRecoveryMode)
!                 ckpt_start_recptr = GetInsertRecPtr();
              ckpt_start_time = now;
              ckpt_cached_elapsed = 0;

              /*
               * Do the checkpoint.
               */
!             if (!BgWriterRecoveryMode)
!             {
!                 CreateCheckPoint(flags);
!                 ckpt_performed = true;
!             }
!             else
!                 ckpt_performed = CreateRestartPoint(flags);

              /*
               * After any checkpoint, close all smgr files.    This is so we
***************
*** 477,490 **** BackgroundWriterMain(void)
              bgs->ckpt_done = bgs->ckpt_started;
              SpinLockRelease(&bgs->ckpt_lck);

!             ckpt_active = false;

!             /*
!              * Note we record the checkpoint start time not end time as
!              * last_checkpoint_time.  This is so that time-driven checkpoints
!              * happen at a predictable spacing.
!              */
!             last_checkpoint_time = now;
          }
          else
              BgBufferSync();
--- 503,529 ----
              bgs->ckpt_done = bgs->ckpt_started;
              SpinLockRelease(&bgs->ckpt_lck);

!             if (ckpt_performed)
!             {
!                 /*
!                  * Note we record the checkpoint start time not end time as
!                  * last_checkpoint_time.  This is so that time-driven
!                  * checkpoints happen at a predictable spacing.
!                  */
!                 last_checkpoint_time = now;
!             }
!             else
!             {
!                 /*
!                  * We were not able to perform the restartpoint (checkpoints
!                  * throw an ERROR in case of error).  Most likely because we
!                  * have not received any new checkpoint WAL records since the
!                  * last restartpoint. Try again in 15 s.
!                  */
!                 last_checkpoint_time = now - CheckPointTimeout + 15;
!             }

!             ckpt_active = false;
          }
          else
              BgBufferSync();
***************
*** 507,513 **** CheckArchiveTimeout(void)
      pg_time_t    now;
      pg_time_t    last_time;

!     if (XLogArchiveTimeout <= 0)
          return;

      now = (pg_time_t) time(NULL);
--- 546,552 ----
      pg_time_t    now;
      pg_time_t    last_time;

!     if (XLogArchiveTimeout <= 0 || IsRecoveryProcessingMode())
          return;

      now = (pg_time_t) time(NULL);
***************
*** 586,592 **** BgWriterNap(void)
          (ckpt_active ? ImmediateCheckpointRequested() : checkpoint_requested))
              break;
          pg_usleep(1000000L);
!         AbsorbFsyncRequests();
          udelay -= 1000000L;
      }

--- 625,632 ----
          (ckpt_active ? ImmediateCheckpointRequested() : checkpoint_requested))
              break;
          pg_usleep(1000000L);
!         if (!IsRecoveryProcessingMode())
!             AbsorbFsyncRequests();
          udelay -= 1000000L;
      }

***************
*** 714,729 **** IsCheckpointOnSchedule(double progress)
       * However, it's good enough for our purposes, we're only calculating an
       * estimate anyway.
       */
!     recptr = GetInsertRecPtr();
!     elapsed_xlogs =
!         (((double) (int32) (recptr.xlogid - ckpt_start_recptr.xlogid)) * XLogSegsPerFile +
!          ((double) recptr.xrecoff - (double) ckpt_start_recptr.xrecoff) / XLogSegSize) /
!         CheckPointSegments;
!
!     if (progress < elapsed_xlogs)
      {
!         ckpt_cached_elapsed = elapsed_xlogs;
!         return false;
      }

      /*
--- 754,772 ----
       * However, it's good enough for our purposes, we're only calculating an
       * estimate anyway.
       */
!     if (!IsRecoveryProcessingMode())
      {
!         recptr = GetInsertRecPtr();
!         elapsed_xlogs =
!             (((double) (int32) (recptr.xlogid - ckpt_start_recptr.xlogid)) * XLogSegsPerFile +
!              ((double) recptr.xrecoff - (double) ckpt_start_recptr.xrecoff) / XLogSegSize) /
!             CheckPointSegments;
!
!         if (progress < elapsed_xlogs)
!         {
!             ckpt_cached_elapsed = elapsed_xlogs;
!             return false;
!         }
      }

      /*
*** a/src/backend/postmaster/postmaster.c
--- b/src/backend/postmaster/postmaster.c
***************
*** 225,235 **** static pid_t StartupPID = 0,
--- 225,262 ----
  static int    Shutdown = NoShutdown;

  static bool FatalError = false; /* T if recovering from backend crash */
+ static bool RecoveryError = false; /* T if recovery failed */
+
+ /* State of WAL redo */
+ #define            NoRecovery            0
+ #define            RecoveryStarted        1
+ #define            RecoveryConsistent    2
+ #define            RecoveryCompleted    3
+
+ static int    RecoveryStatus = NoRecovery;

  /*
   * We use a simple state machine to control startup, shutdown, and
   * crash recovery (which is rather like shutdown followed by startup).
   *
+  * After doing all the postmaster initialization work, we enter PM_STARTUP
+  * state and the startup process is launched. The startup process begins by
+  * reading the control file and other preliminary initialization steps. When
+  * it's ready to start WAL redo, it signals postmaster, and we switch to
+  * PM_RECOVERY phase. The background writer is launched, while the startup
+  * process continues applying WAL.
+  *
+  * After reaching a consistent point in WAL redo, startup process signals
+  * us again, and we switch to PM_RECOVERY_CONSISTENT phase. There's currently
+  * no difference between PM_RECOVERY and PM_RECOVERY_CONSISTENT, but we
+  * could start accepting connections to perform read-only queries at this
+  * point, if we had the infrastructure to do that.
+  *
+  * When the WAL redo is finished, the startup process signals us the third
+  * time, and we switch to PM_RUN state. The startup process can also skip the
+  * recovery and consistent recovery phases altogether, as it will during
+  * normal startup when there's no recovery to be done, for example.
+  *
   * Normal child backends can only be launched when we are in PM_RUN state.
   * (We also allow it in PM_WAIT_BACKUP state, but only for superusers.)
   * In other states we handle connection requests by launching "dead_end"
***************
*** 245,259 **** static bool FatalError = false; /* T if recovering from backend crash */
   *
   * Notice that this state variable does not distinguish *why* we entered
   * states later than PM_RUN --- Shutdown and FatalError must be consulted
!  * to find that out.  FatalError is never true in PM_RUN state, nor in
!  * PM_SHUTDOWN states (because we don't enter those states when trying to
!  * recover from a crash).  It can be true in PM_STARTUP state, because we
!  * don't clear it until we've successfully recovered.
   */
  typedef enum
  {
      PM_INIT,                    /* postmaster starting */
      PM_STARTUP,                    /* waiting for startup subprocess */
      PM_RUN,                        /* normal "database is alive" state */
      PM_WAIT_BACKUP,                /* waiting for online backup mode to end */
      PM_WAIT_BACKENDS,            /* waiting for live backends to exit */
--- 272,288 ----
   *
   * Notice that this state variable does not distinguish *why* we entered
   * states later than PM_RUN --- Shutdown and FatalError must be consulted
!  * to find that out.  FatalError is never true in PM_RECOVERY_* or PM_RUN
!  * states, nor in PM_SHUTDOWN states (because we don't enter those states
!  * when trying to recover from a crash).  It can be true in PM_STARTUP state,
!  * because we don't clear it until we've successfully started WAL redo.
   */
  typedef enum
  {
      PM_INIT,                    /* postmaster starting */
      PM_STARTUP,                    /* waiting for startup subprocess */
+     PM_RECOVERY,                /* in recovery mode */
+     PM_RECOVERY_CONSISTENT,        /* consistent recovery mode */
      PM_RUN,                        /* normal "database is alive" state */
      PM_WAIT_BACKUP,                /* waiting for online backup mode to end */
      PM_WAIT_BACKENDS,            /* waiting for live backends to exit */
***************
*** 307,312 **** static void pmdie(SIGNAL_ARGS);
--- 336,342 ----
  static void reaper(SIGNAL_ARGS);
  static void sigusr1_handler(SIGNAL_ARGS);
  static void dummy_handler(SIGNAL_ARGS);
+ static void CheckRecoverySignals(void);
  static void CleanupBackend(int pid, int exitstatus);
  static void HandleChildCrash(int pid, int exitstatus, const char *procname);
  static void LogChildExit(int lev, const char *procname,
***************
*** 1302,1308 **** ServerLoop(void)
           * state that prevents it, start one.  It doesn't matter if this
           * fails, we'll just try again later.
           */
!         if (BgWriterPID == 0 && pmState == PM_RUN)
              BgWriterPID = StartBackgroundWriter();

          /*
--- 1332,1340 ----
           * state that prevents it, start one.  It doesn't matter if this
           * fails, we'll just try again later.
           */
!         if (BgWriterPID == 0 &&
!             (pmState == PM_RUN || pmState == PM_RECOVERY ||
!              pmState == PM_RECOVERY_CONSISTENT))
              BgWriterPID = StartBackgroundWriter();

          /*
***************
*** 1752,1758 **** canAcceptConnections(void)
              return CAC_WAITBACKUP;    /* allow superusers only */
          if (Shutdown > NoShutdown)
              return CAC_SHUTDOWN;    /* shutdown is pending */
!         if (pmState == PM_STARTUP && !FatalError)
              return CAC_STARTUP; /* normal startup */
          return CAC_RECOVERY;    /* else must be crash recovery */
      }
--- 1784,1793 ----
              return CAC_WAITBACKUP;    /* allow superusers only */
          if (Shutdown > NoShutdown)
              return CAC_SHUTDOWN;    /* shutdown is pending */
!         if (!FatalError &&
!             (pmState == PM_STARTUP ||
!              pmState == PM_RECOVERY ||
!              pmState == PM_RECOVERY_CONSISTENT))
              return CAC_STARTUP; /* normal startup */
          return CAC_RECOVERY;    /* else must be crash recovery */
      }
***************
*** 1982,1988 **** pmdie(SIGNAL_ARGS)
              ereport(LOG,
                      (errmsg("received smart shutdown request")));

!             if (pmState == PM_RUN)
              {
                  /* autovacuum workers are told to shut down immediately */
                  SignalAutovacWorkers(SIGTERM);
--- 2017,2023 ----
              ereport(LOG,
                      (errmsg("received smart shutdown request")));

!             if (pmState == PM_RUN || pmState == PM_RECOVERY || pmState == PM_RECOVERY_CONSISTENT)
              {
                  /* autovacuum workers are told to shut down immediately */
                  SignalAutovacWorkers(SIGTERM);
***************
*** 2019,2025 **** pmdie(SIGNAL_ARGS)

              if (StartupPID != 0)
                  signal_child(StartupPID, SIGTERM);
!             if (pmState == PM_RUN || pmState == PM_WAIT_BACKUP)
              {
                  ereport(LOG,
                          (errmsg("aborting any active transactions")));
--- 2054,2067 ----

              if (StartupPID != 0)
                  signal_child(StartupPID, SIGTERM);
!             if (pmState == PM_RECOVERY)
!             {
!                 /* only bgwriter is active in this state */
!                 pmState = PM_WAIT_BACKENDS;
!             }
!             if (pmState == PM_RUN ||
!                 pmState == PM_WAIT_BACKUP ||
!                 pmState == PM_RECOVERY_CONSISTENT)
              {
                  ereport(LOG,
                          (errmsg("aborting any active transactions")));
***************
*** 2116,2125 **** reaper(SIGNAL_ARGS)
          if (pid == StartupPID)
          {
              StartupPID = 0;
-             Assert(pmState == PM_STARTUP);

!             /* FATAL exit of startup is treated as catastrophic */
!             if (!EXIT_STATUS_0(exitstatus))
              {
                  LogChildExit(LOG, _("startup process"),
                               pid, exitstatus);
--- 2158,2179 ----
          if (pid == StartupPID)
          {
              StartupPID = 0;

!             /*
!              * Check if we've received a signal from the startup process
!              * first. This can change pmState. If the startup process sends
!              * a signal, and exits immediately after that, we might not have
!              * processed the signal yet, and we need to know if it completed
!              * recovery before exiting.
!              */
!             CheckRecoverySignals();
!
!             /*
!              * Unexpected exit of startup process (including FATAL exit)
!              * during PM_STARTUP is treated as catastrophic. There are no
!              * other processes running yet.
!              */
!             if (pmState == PM_STARTUP)
              {
                  LogChildExit(LOG, _("startup process"),
                               pid, exitstatus);
***************
*** 2127,2186 **** reaper(SIGNAL_ARGS)
                  (errmsg("aborting startup due to startup process failure")));
                  ExitPostmaster(1);
              }
-
              /*
!              * Startup succeeded - we are done with system startup or
!              * recovery.
               */
!             FatalError = false;
!
!             /*
!              * Go to shutdown mode if a shutdown request was pending.
!              */
!             if (Shutdown > NoShutdown)
              {
!                 pmState = PM_WAIT_BACKENDS;
!                 /* PostmasterStateMachine logic does the rest */
                  continue;
              }
-
              /*
!              * Otherwise, commence normal operations.
!              */
!             pmState = PM_RUN;
!
!             /*
!              * Load the flat authorization file into postmaster's cache. The
!              * startup process has recomputed this from the database contents,
!              * so we wait till it finishes before loading it.
!              */
!             load_role();
!
!             /*
!              * Crank up the background writer.    It doesn't matter if this
!              * fails, we'll just try again later.
               */
!             Assert(BgWriterPID == 0);
!             BgWriterPID = StartBackgroundWriter();
!
!             /*
!              * Likewise, start other special children as needed.  In a restart
!              * situation, some of them may be alive already.
!              */
!             if (WalWriterPID == 0)
!                 WalWriterPID = StartWalWriter();
!             if (AutoVacuumingActive() && AutoVacPID == 0)
!                 AutoVacPID = StartAutoVacLauncher();
!             if (XLogArchivingActive() && PgArchPID == 0)
!                 PgArchPID = pgarch_start();
!             if (PgStatPID == 0)
!                 PgStatPID = pgstat_start();
!
!             /* at this point we are really open for business */
!             ereport(LOG,
!                  (errmsg("database system is ready to accept connections")));
!
!             continue;
          }

          /*
--- 2181,2210 ----
                  (errmsg("aborting startup due to startup process failure")));
                  ExitPostmaster(1);
              }
              /*
!              * Any unexpected exit (including FATAL exit) of the startup
!              * process is treated as a crash, except that we don't want
!              * to reinitialize.
               */
!             if (!EXIT_STATUS_0(exitstatus))
              {
!                 RecoveryError = true;
!                 HandleChildCrash(pid, exitstatus,
!                                  _("startup process"));
                  continue;
              }
              /*
!              * Startup process exited normally, but didn't finish recovery.
!              * This can happen if someone other than postmaster kills the
!              * startup process with SIGTERM. Treat it like a crash.
               */
!             if (pmState == PM_RECOVERY || pmState == PM_RECOVERY_CONSISTENT)
!             {
!                 RecoveryError = true;
!                 HandleChildCrash(pid, exitstatus,
!                                  _("startup process"));
!                 continue;
!             }
          }

          /*
***************
*** 2443,2448 **** HandleChildCrash(int pid, int exitstatus, const char *procname)
--- 2467,2484 ----
          }
      }

+     /* Take care of the startup process too (forget it if it is the one that died) */
+     if (pid == StartupPID)
+         StartupPID = 0;
+     else if (StartupPID != 0 && !FatalError)
+     {
+         ereport(DEBUG2,
+                 (errmsg_internal("sending %s to process %d",
+                                  (SendStop ? "SIGSTOP" : "SIGQUIT"),
+                                  (int) StartupPID)));
+         signal_child(StartupPID, (SendStop ? SIGSTOP : SIGQUIT));
+     }
+
      /* Take care of the bgwriter too */
      if (pid == BgWriterPID)
          BgWriterPID = 0;
***************
*** 2514,2520 **** HandleChildCrash(int pid, int exitstatus, const char *procname)

      FatalError = true;
      /* We now transit into a state of waiting for children to die */
!     if (pmState == PM_RUN ||
          pmState == PM_WAIT_BACKUP ||
          pmState == PM_SHUTDOWN)
          pmState = PM_WAIT_BACKENDS;
--- 2550,2558 ----

      FatalError = true;
      /* We now transit into a state of waiting for children to die */
!     if (pmState == PM_RECOVERY ||
!         pmState == PM_RECOVERY_CONSISTENT ||
!         pmState == PM_RUN ||
          pmState == PM_WAIT_BACKUP ||
          pmState == PM_SHUTDOWN)
          pmState = PM_WAIT_BACKENDS;
***************
*** 2582,2587 **** LogChildExit(int lev, const char *procname, int pid, int exitstatus)
--- 2620,2746 ----
  static void
  PostmasterStateMachine(void)
  {
+     /* Startup states */
+
+     if (pmState == PM_STARTUP && RecoveryStatus > NoRecovery)
+     {
+         /* WAL redo has started. We're out of reinitialization. */
+         FatalError = false;
+
+         /*
+          * Go to shutdown mode if a shutdown request was pending.
+          */
+         if (Shutdown > NoShutdown)
+         {
+             pmState = PM_WAIT_BACKENDS;
+             /* PostmasterStateMachine logic does the rest */
+         }
+         else
+         {
+             /*
+              * Crank up the background writer.    It doesn't matter if this
+              * fails, we'll just try again later.
+              */
+             Assert(BgWriterPID == 0);
+             BgWriterPID = StartBackgroundWriter();
+
+             pmState = PM_RECOVERY;
+         }
+     }
+     if (pmState == PM_RECOVERY && RecoveryStatus >= RecoveryConsistent)
+     {
+         /*
+          * Go to shutdown mode if a shutdown request was pending.
+          */
+         if (Shutdown > NoShutdown)
+         {
+             pmState = PM_WAIT_BACKENDS;
+             /* PostmasterStateMachine logic does the rest */
+         }
+         else
+         {
+             /*
+              * Recovery has reached a consistent state. (FatalError was
+              * already reset when WAL redo started.)
+              */
+             pmState = PM_RECOVERY_CONSISTENT;
+
+             /*
+              * Load the flat authorization file into postmaster's cache. The
+              * startup process won't have recomputed this from the database yet,
+              * so it may change following recovery.
+              */
+             load_role();
+
+             /*
+              * Likewise, start other special children as needed.
+              */
+             Assert(PgStatPID == 0);
+             PgStatPID = pgstat_start();
+
+             /* XXX at this point we could accept read-only connections */
+             ereport(DEBUG1,
+                  (errmsg("database system is in consistent recovery mode")));
+         }
+     }
+     if ((pmState == PM_RECOVERY ||
+          pmState == PM_RECOVERY_CONSISTENT ||
+          pmState == PM_STARTUP) &&
+         RecoveryStatus == RecoveryCompleted)
+     {
+         /*
+          * Startup succeeded.
+          *
+          * Go to shutdown mode if a shutdown request was pending.
+          */
+         if (Shutdown > NoShutdown)
+         {
+             pmState = PM_WAIT_BACKENDS;
+             /* PostmasterStateMachine logic does the rest */
+         }
+         else
+         {
+             /*
+              * Otherwise, commence normal operations.
+              */
+             pmState = PM_RUN;
+
+             /*
+              * Load the flat authorization file into postmaster's cache. The
+              * startup process has recomputed this from the database contents,
+              * so we wait till it finishes before loading it.
+              */
+             load_role();
+
+             /*
+              * Crank up the background writer, if we didn't do that already
+              * when WAL redo started (PM_RECOVERY).  It doesn't matter
+              * if this fails, we'll just try again later.
+              */
+             if (BgWriterPID == 0)
+                 BgWriterPID = StartBackgroundWriter();
+
+             /*
+              * Likewise, start other special children as needed.  In a restart
+              * situation, some of them may be alive already.
+              */
+             if (WalWriterPID == 0)
+                 WalWriterPID = StartWalWriter();
+             if (AutoVacuumingActive() && AutoVacPID == 0)
+                 AutoVacPID = StartAutoVacLauncher();
+             if (XLogArchivingActive() && PgArchPID == 0)
+                 PgArchPID = pgarch_start();
+             if (PgStatPID == 0)
+                 PgStatPID = pgstat_start();
+
+             /* at this point we are really open for business */
+             ereport(LOG,
+                 (errmsg("database system is ready to accept connections")));
+         }
+     }
+
+     /* Shutdown states */
+
      if (pmState == PM_WAIT_BACKUP)
      {
          /*
***************
*** 2723,2728 **** PostmasterStateMachine(void)
--- 2882,2896 ----
      }

      /*
+      * If recovery failed, wait for all non-syslogger children to exit,
+      * and then exit postmaster. We don't try to reinitialize when recovery
+      * fails, because more than likely it will just fail again and we will
+      * keep trying forever.
+      */
+     if (RecoveryError && pmState == PM_NO_CHILDREN)
+         ExitPostmaster(1);
+
+     /*
       * If we need to recover from a crash, wait for all non-syslogger
       * children to exit, then reset shmem and StartupDataBase.
       */
***************
*** 2734,2739 **** PostmasterStateMachine(void)
--- 2902,2909 ----
          shmem_exit(1);
          reset_shared(PostPortNumber);

+         RecoveryStatus = NoRecovery;
+
          StartupPID = StartupDataBase();
          Assert(StartupPID != 0);
          pmState = PM_STARTUP;
***************
*** 3838,3843 **** ExitPostmaster(int status)
--- 4008,4044 ----
  }

  /*
+  * Common code used by sigusr1_handler() and reaper() to handle
+  * recovery-related signals from the startup process.
+  */
+ static void
+ CheckRecoverySignals(void)
+ {
+     bool changed = false;    /* set when any recovery signal was pending */
+
+     if (CheckPostmasterSignal(PMSIGNAL_RECOVERY_STARTED))
+     {
+         Assert(pmState == PM_STARTUP);
+
+         RecoveryStatus = RecoveryStarted;
+         changed = true;
+     }
+     if (CheckPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT))
+     {
+         RecoveryStatus = RecoveryConsistent;    /* overrides RecoveryStarted if both were pending */
+         changed = true;
+     }
+     if (CheckPostmasterSignal(PMSIGNAL_RECOVERY_COMPLETED))
+     {
+         RecoveryStatus = RecoveryCompleted;    /* checked last, so it wins over the others */
+         changed = true;
+     }
+
+     if (changed)    /* run the state machine only when RecoveryStatus was updated */
+         PostmasterStateMachine();
+ }
+
+ /*
   * sigusr1_handler - handle signal conditions from child processes
   */
  static void
***************
*** 3847,3852 **** sigusr1_handler(SIGNAL_ARGS)
--- 4048,4055 ----

      PG_SETMASK(&BlockSig);

+     CheckRecoverySignals();
+
      if (CheckPostmasterSignal(PMSIGNAL_PASSWORD_CHANGE))
      {
          /*
*** a/src/backend/storage/buffer/README
--- b/src/backend/storage/buffer/README
***************
*** 268,270 **** out (and anyone else who flushes buffer contents to disk must do so too).
--- 268,279 ----
  This ensures that the page image transferred to disk is reasonably consistent.
  We might miss a hint-bit update or two but that isn't a problem, for the same
  reasons mentioned under buffer access rules.
+
+ As of 8.4, background writer starts during recovery mode when there is
+ some form of potentially extended recovery to perform. It performs an
+ identical service to normal processing, except that checkpoints it
+ writes are technically restartpoints. Flushing outstanding WAL for dirty
+ buffers is also skipped, though there shouldn't ever be new WAL entries
+ at that time in any case. We could choose to start background writer
+ immediately but we hold off until we can prove the database is in a
+ consistent state so that postmaster has a single, clean state change.
*** a/src/backend/utils/init/postinit.c
--- b/src/backend/utils/init/postinit.c
***************
*** 324,330 **** InitCommunication(void)
   * If you're wondering why this is separate from InitPostgres at all:
   * the critical distinction is that this stuff has to happen before we can
   * run XLOG-related initialization, which is done before InitPostgres --- in
!  * fact, for cases such as checkpoint creation processes, InitPostgres may
   * never be done at all.
   */
  void
--- 324,330 ----
   * If you're wondering why this is separate from InitPostgres at all:
   * the critical distinction is that this stuff has to happen before we can
   * run XLOG-related initialization, which is done before InitPostgres --- in
!  * fact, for cases such as the background writer process, InitPostgres may
   * never be done at all.
   */
  void
*** a/src/include/access/xlog.h
--- b/src/include/access/xlog.h
***************
*** 133,139 **** typedef struct XLogRecData
  } XLogRecData;

  extern TimeLineID ThisTimeLineID;        /* current TLI */
! extern bool InRecovery;
  extern XLogRecPtr XactLastRecEnd;

  /* these variables are GUC parameters related to XLOG */
--- 133,148 ----
  } XLogRecData;

  extern TimeLineID ThisTimeLineID;        /* current TLI */
!
! /*
!  * Prior to 8.4, all activity during recovery was carried out by the Startup
!  * process. This local variable continues to be used in many parts of the
!  * code to indicate actions taken by RecoveryManagers. Other processes that
!  * potentially perform work during recovery should check
!  * IsRecoveryProcessingMode(), see XLogCtl notes in xlog.c
!  */
! extern bool InRecovery;
!
  extern XLogRecPtr XactLastRecEnd;

  /* these variables are GUC parameters related to XLOG */
***************
*** 199,204 **** extern void RestoreBkpBlocks(XLogRecPtr lsn, XLogRecord *record, bool cleanup);
--- 208,215 ----
  extern void xlog_redo(XLogRecPtr lsn, XLogRecord *record);
  extern void xlog_desc(StringInfo buf, uint8 xl_info, char *rec);

+ extern bool IsRecoveryProcessingMode(void);
+
  extern void UpdateControlFile(void);
  extern Size XLOGShmemSize(void);
  extern void XLOGShmemInit(void);
***************
*** 207,215 **** extern void StartupXLOG(void);
--- 218,229 ----
  extern void ShutdownXLOG(int code, Datum arg);
  extern void InitXLOGAccess(void);
  extern void CreateCheckPoint(int flags);
+ extern bool CreateRestartPoint(int flags);
  extern void XLogPutNextOid(Oid nextOid);
  extern XLogRecPtr GetRedoRecPtr(void);
  extern XLogRecPtr GetInsertRecPtr(void);
  extern void GetNextXidAndEpoch(TransactionId *xid, uint32 *epoch);

+ extern void StartupProcessMain(void);
+
  #endif   /* XLOG_H */
*** a/src/include/storage/pmsignal.h
--- b/src/include/storage/pmsignal.h
***************
*** 22,27 ****
--- 22,30 ----
   */
  typedef enum
  {
+     PMSIGNAL_RECOVERY_STARTED,    /* recovery has started */
+     PMSIGNAL_RECOVERY_CONSISTENT, /* recovery has reached consistent state */
+     PMSIGNAL_RECOVERY_COMPLETED, /* recovery completed */
      PMSIGNAL_PASSWORD_CHANGE,    /* pg_auth file has changed */
      PMSIGNAL_WAKEN_ARCHIVER,    /* send a NOTIFY signal to xlog archiver */
      PMSIGNAL_ROTATE_LOGFILE,    /* send SIGUSR1 to syslogger to rotate logfile */
В списке pgsql-hackers по дате отправления:
Вход в личный кабинет

Восстановление пароля

Подтверждение аккаунта

Изменение пароля

Re: Hot standby, recovery infra