Re: BUG #4879: bgwriter fails to fsync the file in recovery mode

Поиск
Список
Период
Сортировка
От Heikki Linnakangas
Тема Re: BUG #4879: bgwriter fails to fsync the file in recovery mode
Дата
Msg-id 4A43C4A1.5020201@enterprisedb.com
обсуждение исходный текст
Ответ на Re: BUG #4879: bgwriter fails to fsync the file in recovery mode  (Tom Lane <tgl@sss.pgh.pa.us>)
Ответы Re: BUG #4879: bgwriter fails to fsync the file in recovery mode  (Tom Lane <tgl@sss.pgh.pa.us>)
Re: BUG #4879: bgwriter fails to fsync the file in recovery mode  (Tom Lane <tgl@sss.pgh.pa.us>)
Список pgsql-bugs
Tom Lane wrote:
> Heikki Linnakangas <heikki.linnakangas@enterprisedb.com> writes:
>> Simon Riggs wrote:
>>> On Thu, 2009-06-25 at 12:43 -0400, Tom Lane wrote:
>>>> What about "revert the patch"?
>>> That's probably just as dangerous.
>
>> I don't feel comfortable either reverting such a big patch at last
>> minute.
>
> Yeah, I'm not very happy with that either.  However, I've still got no
> confidence in anything proposed so far.
>
>> I'm testing the attached patch at the moment. It's the same as the
>> previous one, with the elog() in mdsync() issue fixed.
>
> This seems like a kluge on top of a hack.  Can't we have the bgwriter
> do the final checkpoint instead?

Here's a patch taking that approach, and I think it's better than the
previous  one. I was afraid we would lose robustness if we have to set
the shared state as "out of recovery" before requesting the checkpoint,
but we can use the same trick we were using in startup process and set
LocalRecoveryInProgress=false before setting the shared variable. I
introduced a new CHECKPOINT_IS_STARTUP flag, which is otherwise
identical to CHECKPOINT_IS_SHUTDOWN, but in a startup checkpoint
CreateCheckPoint() excpects to be called while recovery is still active,
and sets LocalRecoveryInProgress=false. It also tells bgwriter that it
needs to do a checkpoint instead of a restartpoint, even though recovery
is still in progress.

--
  Heikki Linnakangas
  EnterpriseDB   http://www.enterprisedb.com
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 4fd9d41..18c47aa 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -5156,6 +5156,7 @@ StartupXLOG(void)
     XLogRecord *record;
     uint32        freespace;
     TransactionId oldestActiveXID;
+    bool        bgwriterLaunched;

     XLogCtl->SharedRecoveryInProgress = true;

@@ -5472,7 +5473,11 @@ StartupXLOG(void)
              * process in addition to postmaster!
              */
             if (InArchiveRecovery && IsUnderPostmaster)
+            {
+                SetForwardFsyncRequests();
                 SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
+                bgwriterLaunched = true;
+            }

             /*
              * main redo apply loop
@@ -5709,12 +5714,6 @@ StartupXLOG(void)
     /* Pre-scan prepared transactions to find out the range of XIDs present */
     oldestActiveXID = PrescanPreparedTransactions();

-    /*
-     * Allow writing WAL for us, so that we can create a checkpoint record.
-     * But not yet for other backends!
-     */
-    LocalRecoveryInProgress = false;
-
     if (InRecovery)
     {
         int            rmid;
@@ -5743,7 +5742,11 @@ StartupXLOG(void)
          * the rule that TLI only changes in shutdown checkpoints, which
          * allows some extra error checking in xlog_redo.
          */
-        CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
+        if (bgwriterLaunched)
+            RequestCheckpoint(CHECKPOINT_IS_STARTUP | CHECKPOINT_IMMEDIATE |
+                              CHECKPOINT_WAIT);
+        else
+            CreateCheckPoint(CHECKPOINT_IS_STARTUP | CHECKPOINT_IMMEDIATE);

         /*
          * And finally, execute the recovery_end_command, if any.
@@ -5806,7 +5809,7 @@ StartupXLOG(void)
     }

     /*
-     * All done. Allow others to write WAL.
+     * All done. Allow backends to write WAL.
      */
     XLogCtl->SharedRecoveryInProgress = false;
 }
@@ -6123,12 +6126,13 @@ LogCheckpointStart(int flags, bool restartpoint)
      * the main message, but what about all the flags?
      */
     if (restartpoint)
-        msg = "restartpoint starting:%s%s%s%s%s%s";
+        msg = "restartpoint starting:%s%s%s%s%s%s%s";
     else
-        msg = "checkpoint starting:%s%s%s%s%s%s";
+        msg = "checkpoint starting:%s%s%s%s%s%s%s";

     elog(LOG, msg,
          (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
+         (flags & CHECKPOINT_IS_STARTUP) ? " startup" : "",
          (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
          (flags & CHECKPOINT_FORCE) ? " force" : "",
          (flags & CHECKPOINT_WAIT) ? " wait" : "",
@@ -6190,10 +6194,12 @@ LogCheckpointEnd(bool restartpoint)
  *
  * flags is a bitwise OR of the following:
  *    CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
+ *    CHECKPOINT_IS_STARTUP: checkpoint is for database startup.
  *    CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
  *        ignoring checkpoint_completion_target parameter.
  *    CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occured
- *        since the last one (implied by CHECKPOINT_IS_SHUTDOWN).
+ *        since the last one (implied by CHECKPOINT_IS_SHUTDOWN and
+ *        CHECKPOINT_IS_STARTUP).
  *
  * Note: flags contains other bits, of interest here only for logging purposes.
  * In particular note that this routine is synchronous and does not pay
@@ -6202,7 +6208,7 @@ LogCheckpointEnd(bool restartpoint)
 void
 CreateCheckPoint(int flags)
 {
-    bool        shutdown = (flags & CHECKPOINT_IS_SHUTDOWN) != 0;
+    bool        shutdown;
     CheckPoint    checkPoint;
     XLogRecPtr    recptr;
     XLogCtlInsert *Insert = &XLogCtl->Insert;
@@ -6213,35 +6219,40 @@ CreateCheckPoint(int flags)
     TransactionId *inCommitXids;
     int            nInCommit;

-    /* shouldn't happen */
-    if (RecoveryInProgress())
-        elog(ERROR, "can't create a checkpoint during recovery");
+    /*
+     * A startup checkpoint is really a shutdown checkpoint, just issued at
+     * a different time.
+     */
+    shutdown = (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IS_STARTUP)) != 0;

     /*
-     * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
-     * During normal operation, bgwriter is the only process that creates
-     * checkpoints, but at the end of archive recovery, the bgwriter can be
-     * busy creating a restartpoint while the startup process tries to perform
-     * the startup checkpoint.
+     * A startup checkpoint is created before anyone else is allowed to
+     * write WAL. To allow us to write the checkpoint record, set
+     * LocalRecoveryInProgress to false. This lets us write WAL, but others
+     * are still not allowed to do so.
      */
-    if (!LWLockConditionalAcquire(CheckpointLock, LW_EXCLUSIVE))
+    if (flags & CHECKPOINT_IS_STARTUP)
     {
-        Assert(InRecovery);
-
-        /*
-         * A restartpoint is in progress. Wait until it finishes. This can
-         * cause an extra restartpoint to be performed, but that's OK because
-         * we're just about to perform a checkpoint anyway. Flushing the
-         * buffers in this restartpoint can take some time, but that time is
-         * saved from the upcoming checkpoint so the net effect is zero.
-         */
-        ereport(DEBUG2, (errmsg("hurrying in-progress restartpoint")));
-        RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_WAIT);
-
-        LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
+        Assert(RecoveryInProgress());
+        LocalRecoveryInProgress = false;
+        InitXLOGAccess();
+    }
+    else
+    {
+        /* shouldn't happen */
+        if (RecoveryInProgress())
+            elog(ERROR, "can't create a checkpoint during recovery");
     }

     /*
+     * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
+     * (This is just pro forma, since in the present system structure there is
+     * only one process that is allowed to issue checkpoints at any given
+     * time.)
+     */
+    LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
+
+    /*
      * Prepare to accumulate statistics.
      *
      * Note: because it is possible for log_checkpoints to change while a
@@ -6298,7 +6309,8 @@ CreateCheckPoint(int flags)
      * the end of the last checkpoint record, and its redo pointer must point
      * to itself.
      */
-    if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_FORCE)) == 0)
+    if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IS_STARTUP |
+                  CHECKPOINT_FORCE)) == 0)
     {
         XLogRecPtr    curInsert;

@@ -6528,7 +6540,7 @@ CreateCheckPoint(int flags)
      * in subtrans.c).    During recovery, though, we mustn't do this because
      * StartupSUBTRANS hasn't been called yet.
      */
-    if (!InRecovery)
+    if (!InRecovery && (flags & CHECKPOINT_IS_STARTUP) == 0)
         TruncateSUBTRANS(GetOldestXmin(true, false));

     /* All real work is done, but log before releasing lock. */
diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c
index 55dff57..2c42584 100644
--- a/src/backend/postmaster/bgwriter.c
+++ b/src/backend/postmaster/bgwriter.c
@@ -449,6 +449,13 @@ BackgroundWriterMain(void)
             SpinLockRelease(&bgs->ckpt_lck);

             /*
+             * A startup checkpoint is a real checkpoint that's performed
+             * while we're still in recovery.
+             */
+            if (flags & CHECKPOINT_IS_STARTUP)
+                do_restartpoint = false;
+
+            /*
              * We will warn if (a) too soon since last checkpoint (whatever
              * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag
              * since the last checkpoint start.  Note in particular that this
@@ -895,10 +902,12 @@ BgWriterShmemInit(void)
  *
  * flags is a bitwise OR of the following:
  *    CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
+ *    CHECKPOINT_IS_STARTUP: checkpoint is for database startup.
  *    CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
  *        ignoring checkpoint_completion_target parameter.
  *    CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occured
- *        since the last one (implied by CHECKPOINT_IS_SHUTDOWN).
+ *        since the last one (implied by CHECKPOINT_IS_SHUTDOWN and
+ *        CHECKPOINT_IS_STARTUP).
  *    CHECKPOINT_WAIT: wait for completion before returning (otherwise,
  *        just signal bgwriter to do it, and return).
  *    CHECKPOINT_CAUSE_XLOG: checkpoint is requested due to xlog filling.
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index d42e86a..b01b2ab 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -204,6 +204,21 @@ mdinit(void)
 }

 /*
+ * In archive recovery, we rely on bgwriter to do fsyncs(), but we don't
+ * know that we do archive recovery at process startup when pendingOpsTable
+ * has already been created. Calling this function drops pendingOpsTable
+ * and causes any subsequent requests to be forwarded to bgwriter.
+ */
+void
+SetForwardFsyncRequests(void)
+{
+    /* Perform any pending ops we may have queued up */
+    if (pendingOpsTable)
+        mdsync();
+    pendingOpsTable = NULL;
+}
+
+/*
  *    mdexists() -- Does the physical file exist?
  *
  * Note: this will return true for lingering files, with pending deletions
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index f8720bb..d206b93 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -166,6 +166,7 @@ extern bool XLOG_DEBUG;
 /* These indicate the cause of a checkpoint request */
 #define CHECKPOINT_CAUSE_XLOG    0x0010    /* XLOG consumption */
 #define CHECKPOINT_CAUSE_TIME    0x0020    /* Elapsed time */
+#define CHECKPOINT_IS_STARTUP    0x0040    /* Checkpoint is for startup */

 /* Checkpoint statistics */
 typedef struct CheckpointStatsData
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index 7556b14..fd79d7b 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -109,6 +109,7 @@ extern void mdpreckpt(void);
 extern void mdsync(void);
 extern void mdpostckpt(void);

+extern void SetForwardFsyncRequests(void);
 extern void RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum,
                      BlockNumber segno);
 extern void ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum);

В списке pgsql-bugs по дате отправления:

Предыдущее
От: Heikki Linnakangas
Дата:
Сообщение: Re: BUG #4879: bgwriter fails to fsync the file in recovery mode
Следующее
От: Tom Lane
Дата:
Сообщение: Re: BUG #4879: bgwriter fails to fsync the file in recovery mode