Re: Corruption during WAL replay

Поиск
Список
Период
Сортировка
От Kyotaro Horiguchi
Тема Re: Corruption during WAL replay
Дата
Msg-id 20220126.172533.404912466361180867.horikyota.ntt@gmail.com
обсуждение исходный текст
Ответ на Re: Corruption during WAL replay  (Daniel Shelepanov <deniel1495@mail.ru>)
Ответы Re: Corruption during WAL replay  (Robert Haas <robertmhaas@gmail.com>)
Список pgsql-hackers
At Mon, 24 Jan 2022 23:33:20 +0300, Daniel Shelepanov <deniel1495@mail.ru> wrote in 
> Hi. This is my first attempt to review a patch so feel free to tell me
> if I missed something.

Welcome!

> As of today's state of REL_14_STABLE
> (ef9706bbc8ce917a366e4640df8c603c9605817a), the problem is
> reproducible using the script provided by Daniel Wood in this
> (1335373813.287510.1573611814107@connect.xfinity.com) message. Also,
> the latest patch seems not to be applicable and requires some minor
> tweaks.

Thanks for the info.  The reason for my failure is checksum was
enabled.. After disalbing both fpw and checksum (and wal_log_hints)
allows me to reproduce the issue.  And what I found is:

v3 patch:
 >    vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_COMPLETE);
!>    if (0 && nvxids > 0)
 >    {

Ugggggggh!  It looks like a debugging tweak but it prevents everything
from working.

The attached is the fixed version and it surely works with the repro.

regards.

-- 
Kyotaro Horiguchi
NTT Open Source Software Center
From e4155f4e777f31c019cbbd00f41c7b5eb2320ef7 Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <horikyota.ntt@gmail.com>
Date: Wed, 26 Jan 2022 15:07:58 +0900
Subject: [PATCH v4] Delay checkpoint completion until truncation completes

When a relation is being truncated, the buffers for the relation are
dropped without being written to underlying relation file, so after
that moment the content of the to-be-truncated file is not guaranteed
to be consistent. This is fine in the normal cases where the file is
eventually removed. However, if a checkpoint after the buffer dropping
but finally the file truncation doesn't happen due to server crash or
filesystem failure, the files left behind is inconsistent with WAL.

This commit delays checkpoint completion until all ongoing file
truncation are gone to prevent such inconsistency.
---
 src/backend/access/transam/multixact.c  |  6 +++---
 src/backend/access/transam/twophase.c   | 12 +++++++-----
 src/backend/access/transam/xact.c       |  5 +++--
 src/backend/access/transam/xlog.c       | 16 +++++++++++++--
 src/backend/access/transam/xloginsert.c |  2 +-
 src/backend/catalog/storage.c           | 13 +++++++++++++
 src/backend/storage/buffer/bufmgr.c     |  6 ++++--
 src/backend/storage/ipc/procarray.c     | 26 ++++++++++++++++++-------
 src/backend/storage/lmgr/proc.c         |  4 ++--
 src/include/storage/proc.h              |  7 ++++++-
 src/include/storage/procarray.h         |  7 ++++---
 11 files changed, 76 insertions(+), 28 deletions(-)

diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index 806f2e43ba..70593238ab 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -3075,8 +3075,8 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB)
      * crash/basebackup, even though the state of the data directory would
      * require it.
      */
-    Assert(!MyProc->delayChkpt);
-    MyProc->delayChkpt = true;
+    Assert((MyProc->delayChkpt & DELAY_CHKPT_START) == 0);
+    MyProc->delayChkpt |= DELAY_CHKPT_START;
 
     /* WAL log truncation */
     WriteMTruncateXlogRec(newOldestMultiDB,
@@ -3102,7 +3102,7 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB)
     /* Then offsets */
     PerformOffsetsTruncation(oldestMulti, newOldestMulti);
 
-    MyProc->delayChkpt = false;
+    MyProc->delayChkpt &= ~DELAY_CHKPT_START;
 
     END_CRIT_SECTION();
     LWLockRelease(MultiXactTruncationLock);
diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c
index 271a3146db..2b1a25e723 100644
--- a/src/backend/access/transam/twophase.c
+++ b/src/backend/access/transam/twophase.c
@@ -474,7 +474,7 @@ MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, const char *gid,
     }
     proc->xid = xid;
     Assert(proc->xmin == InvalidTransactionId);
-    proc->delayChkpt = false;
+    proc->delayChkpt = 0;
     proc->statusFlags = 0;
     proc->pid = 0;
     proc->databaseId = databaseid;
@@ -1165,7 +1165,8 @@ EndPrepare(GlobalTransaction gxact)
 
     START_CRIT_SECTION();
 
-    MyProc->delayChkpt = true;
+    Assert((MyProc->delayChkpt & DELAY_CHKPT_START) == 0);
+    MyProc->delayChkpt |= DELAY_CHKPT_START;
 
     XLogBeginInsert();
     for (record = records.head; record != NULL; record = record->next)
@@ -1208,7 +1209,7 @@ EndPrepare(GlobalTransaction gxact)
      * checkpoint starting after this will certainly see the gxact as a
      * candidate for fsyncing.
      */
-    MyProc->delayChkpt = false;
+    MyProc->delayChkpt &= ~DELAY_CHKPT_START;
 
     /*
      * Remember that we have this GlobalTransaction entry locked for us.  If
@@ -2267,7 +2268,8 @@ RecordTransactionCommitPrepared(TransactionId xid,
     START_CRIT_SECTION();
 
     /* See notes in RecordTransactionCommit */
-    MyProc->delayChkpt = true;
+    Assert((MyProc->delayChkpt & DELAY_CHKPT_START) == 0);
+    MyProc->delayChkpt |= DELAY_CHKPT_START;
 
     /*
      * Emit the XLOG commit record. Note that we mark 2PC commits as
@@ -2315,7 +2317,7 @@ RecordTransactionCommitPrepared(TransactionId xid,
     TransactionIdCommitTree(xid, nchildren, children);
 
     /* Checkpoint can proceed now */
-    MyProc->delayChkpt = false;
+    MyProc->delayChkpt &= ~DELAY_CHKPT_START;
 
     END_CRIT_SECTION();
 
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index c9516e03fa..9e6acf954c 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -1386,8 +1386,9 @@ RecordTransactionCommit(void)
          * This makes checkpoint's determination of which xacts are delayChkpt
          * a bit fuzzy, but it doesn't matter.
          */
+        Assert((MyProc->delayChkpt & DELAY_CHKPT_START) == 0);
         START_CRIT_SECTION();
-        MyProc->delayChkpt = true;
+        MyProc->delayChkpt |= DELAY_CHKPT_START;
 
         SetCurrentTransactionStopTimestamp();
 
@@ -1488,7 +1489,7 @@ RecordTransactionCommit(void)
      */
     if (markXidCommitted)
     {
-        MyProc->delayChkpt = false;
+        MyProc->delayChkpt &= ~DELAY_CHKPT_START;
         END_CRIT_SECTION();
     }
 
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index dfe2a0bcce..c2729e4222 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -9304,18 +9304,30 @@ CreateCheckPoint(int flags)
      * and we will correctly flush the update below.  So we cannot miss any
      * xacts we need to wait for.
      */
-    vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
+    vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_START);
     if (nvxids > 0)
     {
         do
         {
             pg_usleep(10000L);    /* wait for 10 msec */
-        } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));
+        } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids,
+                                              DELAY_CHKPT_START));
     }
     pfree(vxids);
 
     CheckPointGuts(checkPoint.redo, flags);
 
+    vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_COMPLETE);
+    if (nvxids > 0)
+    {
+        do
+        {
+            pg_usleep(10000L);    /* wait for 10 msec */
+        } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids,
+                                              DELAY_CHKPT_COMPLETE));
+    }
+    pfree(vxids);
+
     /*
      * Take a snapshot of running transactions and write this to WAL. This
      * allows us to reconstruct the state of running transactions during
diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c
index c260310c4c..d66ec015ea 100644
--- a/src/backend/access/transam/xloginsert.c
+++ b/src/backend/access/transam/xloginsert.c
@@ -978,7 +978,7 @@ XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
     /*
      * Ensure no checkpoint can change our view of RedoRecPtr.
      */
-    Assert(MyProc->delayChkpt);
+    Assert((MyProc->delayChkpt & DELAY_CHKPT_START) != 0);
 
     /*
      * Update RedoRecPtr so that we can make the right decision
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index 9b8075536a..7af8278a50 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -325,6 +325,15 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
 
     RelationPreTruncate(rel);
 
+    /*
+     * Delay the concurrent checkpoint's completion until this truncation
+     * successfully completes. Otherwise the checkpoint may leave inconsistent
+     * relation files if the following file truncation doesn't happen due to
+     * file system failure or server crash.
+     */
+    Assert((MyProc->delayChkpt & DELAY_CHKPT_COMPLETE) == 0);
+    MyProc->delayChkpt |= DELAY_CHKPT_COMPLETE;
+
     /*
      * We WAL-log the truncation before actually truncating, which means
      * trouble if the truncation fails. If we then crash, the WAL replay
@@ -366,6 +375,10 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
     /* Do the real work to truncate relation forks */
     smgrtruncate(RelationGetSmgr(rel), forks, nforks, blocks);
 
+
+    /* FSM is not WAL-logged, close the critical section here. */
+    MyProc->delayChkpt &= ~DELAY_CHKPT_COMPLETE;
+
     /*
      * Update upper-level FSM pages to account for the truncation. This is
      * important because the just-truncated pages were likely marked as
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index a2512e750c..1bf064ba5e 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -3910,7 +3910,9 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
              * essential that CreateCheckPoint waits for virtual transactions
              * rather than full transactionids.
              */
-            MyProc->delayChkpt = delayChkpt = true;
+            Assert((MyProc->delayChkpt & DELAY_CHKPT_START) == 0);
+            MyProc->delayChkpt |= DELAY_CHKPT_START;
+            delayChkpt = true;
             lsn = XLogSaveBufferForHint(buffer, buffer_std);
         }
 
@@ -3943,7 +3945,7 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
         UnlockBufHdr(bufHdr, buf_state);
 
         if (delayChkpt)
-            MyProc->delayChkpt = false;
+            MyProc->delayChkpt &= ~DELAY_CHKPT_START;
 
         if (dirtied)
         {
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c
index 3be6040289..90628d150b 100644
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -689,7 +689,10 @@ ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid)
 
         proc->lxid = InvalidLocalTransactionId;
         proc->xmin = InvalidTransactionId;
-        proc->delayChkpt = false;    /* be sure this is cleared in abort */
+
+        /* be sure this is cleared in abort */
+        proc->delayChkpt = 0;
+
         proc->recoveryConflictPending = false;
 
         /* must be cleared with xid/xmin: */
@@ -728,7 +731,10 @@ ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid)
     proc->xid = InvalidTransactionId;
     proc->lxid = InvalidLocalTransactionId;
     proc->xmin = InvalidTransactionId;
-    proc->delayChkpt = false;    /* be sure this is cleared in abort */
+
+    /* be sure this is cleared in abort */
+    proc->delayChkpt = 0;
+
     proc->recoveryConflictPending = false;
 
     /* must be cleared with xid/xmin: */
@@ -3039,7 +3045,8 @@ GetOldestSafeDecodingTransactionId(bool catalogOnly)
  * delaying checkpoint because they have critical actions in progress.
  *
  * Constructs an array of VXIDs of transactions that are currently in commit
- * critical sections, as shown by having delayChkpt set in their PGPROC.
+ * critical sections, as shown by having specified delayChkpt bits set in their
+ * PGPROC.
  *
  * Returns a palloc'd array that should be freed by the caller.
  * *nvxids is the number of valid entries.
@@ -3053,13 +3060,15 @@ GetOldestSafeDecodingTransactionId(bool catalogOnly)
  * for clearing of delayChkpt to propagate is unimportant for correctness.
  */
 VirtualTransactionId *
-GetVirtualXIDsDelayingChkpt(int *nvxids)
+GetVirtualXIDsDelayingChkpt(int *nvxids, int type)
 {
     VirtualTransactionId *vxids;
     ProcArrayStruct *arrayP = procArray;
     int            count = 0;
     int            index;
 
+    Assert(type != 0);
+
     /* allocate what's certainly enough result space */
     vxids = (VirtualTransactionId *)
         palloc(sizeof(VirtualTransactionId) * arrayP->maxProcs);
@@ -3071,7 +3080,7 @@ GetVirtualXIDsDelayingChkpt(int *nvxids)
         int            pgprocno = arrayP->pgprocnos[index];
         PGPROC       *proc = &allProcs[pgprocno];
 
-        if (proc->delayChkpt)
+        if ((proc->delayChkpt & type) != 0)
         {
             VirtualTransactionId vxid;
 
@@ -3097,12 +3106,14 @@ GetVirtualXIDsDelayingChkpt(int *nvxids)
  * those numbers should be small enough for it not to be a problem.
  */
 bool
-HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids)
+HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids, int type)
 {
     bool        result = false;
     ProcArrayStruct *arrayP = procArray;
     int            index;
 
+    Assert(type != 0);
+
     LWLockAcquire(ProcArrayLock, LW_SHARED);
 
     for (index = 0; index < arrayP->numProcs; index++)
@@ -3113,7 +3124,8 @@ HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids)
 
         GET_VXID_FROM_PGPROC(vxid, *proc);
 
-        if (proc->delayChkpt && VirtualTransactionIdIsValid(vxid))
+        if ((proc->delayChkpt & type) != 0 &&
+            VirtualTransactionIdIsValid(vxid))
         {
             int            i;
 
diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c
index e306b04738..7611ad0f0c 100644
--- a/src/backend/storage/lmgr/proc.c
+++ b/src/backend/storage/lmgr/proc.c
@@ -392,7 +392,7 @@ InitProcess(void)
     MyProc->roleId = InvalidOid;
     MyProc->tempNamespaceId = InvalidOid;
     MyProc->isBackgroundWorker = IsBackgroundWorker;
-    MyProc->delayChkpt = false;
+    MyProc->delayChkpt = 0;
     MyProc->statusFlags = 0;
     /* NB -- autovac launcher intentionally does not set IS_AUTOVACUUM */
     if (IsAutoVacuumWorkerProcess())
@@ -577,7 +577,7 @@ InitAuxiliaryProcess(void)
     MyProc->roleId = InvalidOid;
     MyProc->tempNamespaceId = InvalidOid;
     MyProc->isBackgroundWorker = IsBackgroundWorker;
-    MyProc->delayChkpt = false;
+    MyProc->delayChkpt = 0;
     MyProc->statusFlags = 0;
     MyProc->lwWaiting = false;
     MyProc->lwWaitMode = 0;
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index a58888f9e9..7b1f7a6ac7 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -86,6 +86,10 @@ struct XidCache
  */
 #define INVALID_PGPROCNO        PG_INT32_MAX
 
+/* symbols for PGPROC.delayChkpt */
+#define DELAY_CHKPT_START        (1<<0)
+#define DELAY_CHKPT_COMPLETE    (1<<1)
+
 typedef enum
 {
     PROC_WAIT_STATUS_OK,
@@ -191,7 +195,8 @@ struct PGPROC
     pg_atomic_uint64 waitStart; /* time at which wait for lock acquisition
                                  * started */
 
-    bool        delayChkpt;        /* true if this proc delays checkpoint start */
+    int            delayChkpt;        /* if this proc delays checkpoint start and/or
+                                 * completion.  */
 
     uint8        statusFlags;    /* this backend's status flags, see PROC_*
                                  * above. mirrored in
diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h
index e03692053e..dcdb01be53 100644
--- a/src/include/storage/procarray.h
+++ b/src/include/storage/procarray.h
@@ -15,11 +15,11 @@
 #define PROCARRAY_H
 
 #include "storage/lock.h"
+#include "storage/proc.h"
 #include "storage/standby.h"
 #include "utils/relcache.h"
 #include "utils/snapshot.h"
 
-
 extern Size ProcArrayShmemSize(void);
 extern void CreateSharedProcArray(void);
 extern void ProcArrayAdd(PGPROC *proc);
@@ -59,8 +59,9 @@ extern TransactionId GetOldestActiveTransactionId(void);
 extern TransactionId GetOldestSafeDecodingTransactionId(bool catalogOnly);
 extern void GetReplicationHorizons(TransactionId *slot_xmin, TransactionId *catalog_xmin);
 
-extern VirtualTransactionId *GetVirtualXIDsDelayingChkpt(int *nvxids);
-extern bool HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids);
+extern VirtualTransactionId *GetVirtualXIDsDelayingChkpt(int *nvxids, int type);
+extern bool HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids,
+                                         int nvxids, int type);
 
 extern PGPROC *BackendPidGetProc(int pid);
 extern PGPROC *BackendPidGetProcWithLock(int pid);
-- 
2.27.0


В списке pgsql-hackers по дате отправления:

Предыдущее
От: Michail Nikolaev
Дата:
Сообщение: Re: [PATCH] Full support for index LP_DEAD hint bits on standby
Следующее
От: "osumi.takamichi@fujitsu.com"
Дата:
Сообщение: RE: logical replication empty transactions