Re: [HACKERS] WAL logging problem in 9.4.3?

From: Kyotaro HORIGUCHI
Subject: Re: [HACKERS] WAL logging problem in 9.4.3?
Date:
Msg-id: 20190130.102634.246232856.horiguchi.kyotaro@lab.ntt.co.jp
In reply to: Re: [HACKERS] WAL logging problem in 9.4.3?  (Kyotaro HORIGUCHI <horiguchi.kyotaro@lab.ntt.co.jp>)
Responses: Re: [HACKERS] WAL logging problem in 9.4.3?  (Kyotaro HORIGUCHI <horiguchi.kyotaro@lab.ntt.co.jp>)
List: pgsql-hackers
Rebased.

regards.

-- 
Kyotaro Horiguchi
NTT Open Source Software Center
From d5f2b47b6ba191d0ad1673f9bd9c5851d91a1b59 Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <horiguchi.kyotaro@lab.ntt.co.jp>
Date: Thu, 11 Oct 2018 10:03:21 +0900
Subject: [PATCH 1/4] TAP test for copy-truncation optimization.
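
The statement sequences these tests replay look like the following minimal
sketch (assuming wal_level = minimal; table and data-file names are
illustrative):

    BEGIN;
    CREATE TABLE t (id serial PRIMARY KEY, id2 int);
    INSERT INTO t (id, id2) VALUES (DEFAULT, 1);
    TRUNCATE t;                                     -- may skip WAL
    COPY t FROM '/tmp/copy_data.txt' DELIMITER ','; -- may skip WAL, too
    COMMIT;
    -- after a crash ("immediate" stop) and restart, the table must contain
    -- exactly the rows committed above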

---
 src/test/recovery/t/016_wal_optimize.pl | 192 ++++++++++++++++++++++++++++++++
 1 file changed, 192 insertions(+)
 create mode 100644 src/test/recovery/t/016_wal_optimize.pl

diff --git a/src/test/recovery/t/016_wal_optimize.pl b/src/test/recovery/t/016_wal_optimize.pl
new file mode 100644
index 0000000000..310772a2b3
--- /dev/null
+++ b/src/test/recovery/t/016_wal_optimize.pl
@@ -0,0 +1,192 @@
+# Test WAL replay for optimized TRUNCATE and COPY records
+#
+# WAL-logging is skipped in some cases for TRUNCATE and COPY queries,
+# and those optimizations can interact badly with each other depending
+# on the wal_level setting, particularly "minimal" and "replica".  The
+# optimization may be enabled or disabled depending on the scenarios
+# dealt with here, and replay should never result in any kind of
+# failure or data loss.
+use strict;
+use warnings;
+
+use PostgresNode;
+use TestLib;
+use Test::More tests => 14;
+
+# Wrapper routine to run the test suite for a given wal_level.
+sub run_wal_optimize
+{
+    my $wal_level = shift;
+
+    # Set up a node with the wal_level under test
+    my $node = get_new_node("node_$wal_level");
+    $node->init;
+    $node->append_conf('postgresql.conf', qq(
+wal_level = $wal_level
+));
+    $node->start;
+
+    # Test direct truncation optimization, with no tuples.
+    $node->safe_psql('postgres', "
+        BEGIN;
+        CREATE TABLE test1 (id serial PRIMARY KEY);
+        TRUNCATE test1;
+        COMMIT;");
+
+    $node->stop('immediate');
+    $node->start;
+
+    my $result = $node->safe_psql('postgres', "SELECT count(*) FROM test1;");
+    is($result, qq(0),
+       "wal_level = $wal_level, optimized truncation with empty table");
+
+    # Test truncation with inserted tuples within the same transaction.
+    # Tuples inserted after the truncation should be seen.
+    $node->safe_psql('postgres', "
+        BEGIN;
+        CREATE TABLE test2 (id serial PRIMARY KEY);
+        INSERT INTO test2 VALUES (DEFAULT);
+        TRUNCATE test2;
+        INSERT INTO test2 VALUES (DEFAULT);
+        COMMIT;");
+
+    $node->stop('immediate');
+    $node->start;
+
+    $result = $node->safe_psql('postgres', "SELECT count(*) FROM test2;");
+    is($result, qq(1),
+       "wal_level = $wal_level, optimized truncation with inserted table");
+
+    # Data file for COPY query in follow-up tests.
+    my $basedir = $node->basedir;
+    my $copy_file = "$basedir/copy_data.txt";
+    TestLib::append_to_file($copy_file, qq(20000,30000
+20001,30001
+20002,30002));
+
+    # Test truncation with inserted tuples using COPY.  Tuples copied after the
+    # truncation should be seen.
+    $node->safe_psql('postgres', "
+        BEGIN;
+        CREATE TABLE test3 (id serial PRIMARY KEY, id2 int);
+        INSERT INTO test3 (id, id2) VALUES (DEFAULT, generate_series(1,10000));
+        TRUNCATE test3;
+        COPY test3 FROM '$copy_file' DELIMITER ',';
+        COMMIT;");
+    $node->stop('immediate');
+    $node->start;
+    $result = $node->safe_psql('postgres', "SELECT count(*) FROM test3;");
+    is($result, qq(3),
+       "wal_level = $wal_level, optimized truncation with copied table");
+
+    # Test truncation with inserted tuples using both INSERT and COPY. Tuples
+    # inserted after the truncation should be seen.
+    $node->safe_psql('postgres', "
+        BEGIN;
+        CREATE TABLE test4 (id serial PRIMARY KEY, id2 int);
+        INSERT INTO test4 (id, id2) VALUES (DEFAULT, generate_series(1,10000));
+        TRUNCATE test4;
+        INSERT INTO test4 (id, id2) VALUES (DEFAULT, 10000);
+        COPY test4 FROM '$copy_file' DELIMITER ',';
+        INSERT INTO test4 (id, id2) VALUES (DEFAULT, 10000);
+        COMMIT;");
+
+    $node->stop('immediate');
+    $node->start;
+    $result = $node->safe_psql('postgres', "SELECT count(*) FROM test4;");
+    is($result, qq(5),
+       "wal_level = $wal_level, optimized truncation with inserted/copied table");
+
+    # Test consistency of COPY with INSERT for table created in the same
+    # transaction.
+    $node->safe_psql('postgres', "
+        BEGIN;
+        CREATE TABLE test5 (id serial PRIMARY KEY, id2 int);
+        INSERT INTO test5 VALUES (DEFAULT, 1);
+        COPY test5 FROM '$copy_file' DELIMITER ',';
+        COMMIT;");
+    $node->stop('immediate');
+    $node->start;
+    $result = $node->safe_psql('postgres', "SELECT count(*) FROM test5;");
+    is($result, qq(4),
+       "wal_level = $wal_level, replay of optimized copy with inserted table");
+
+    # Test consistency of COPY that inserts more rows into the same table
+    # using triggers.  If the INSERTs from the triggers go to the same block
+    # the data is copied to, and those INSERTs are WAL-logged, WAL replay
+    # fails when it tries to apply the WAL record: the "before" image does
+    # not match, because not all changes to the block were WAL-logged.
+    $node->safe_psql('postgres', "
+        BEGIN;
+        CREATE TABLE test6 (id serial PRIMARY KEY, id2 text);
+        CREATE FUNCTION test6_before_row_trig() RETURNS trigger
+          LANGUAGE plpgsql as \$\$
+          BEGIN
+            IF new.id2 NOT LIKE 'triggered%' THEN
+              INSERT INTO test6 VALUES (DEFAULT, 'triggered row before' || NEW.id2);
+            END IF;
+            RETURN NEW;
+          END; \$\$;
+        CREATE FUNCTION test6_after_row_trig() RETURNS trigger
+          LANGUAGE plpgsql as \$\$
+          BEGIN
+            IF new.id2 NOT LIKE 'triggered%' THEN
+              INSERT INTO test6 VALUES (DEFAULT, 'triggered row after' || NEW.id2);
+            END IF;
+            RETURN NEW;
+          END; \$\$;
+        CREATE TRIGGER test6_before_row_insert
+          BEFORE INSERT ON test6
+          FOR EACH ROW EXECUTE PROCEDURE test6_before_row_trig();
+        CREATE TRIGGER test6_after_row_insert
+          AFTER INSERT ON test6
+          FOR EACH ROW EXECUTE PROCEDURE test6_after_row_trig();
+        COPY test6 FROM '$copy_file' DELIMITER ',';
+        COMMIT;");
+    $node->stop('immediate');
+    $node->start;
+    $result = $node->safe_psql('postgres', "SELECT count(*) FROM test6;");
+    is($result, qq(9),
+       "wal_level = $wal_level, replay of optimized copy with before trigger");
+
+    # Test consistency of INSERT, COPY and TRUNCATE in same transaction block
+    # with TRUNCATE triggers.
+    $node->safe_psql('postgres', "
+        BEGIN;
+        CREATE TABLE test7 (id serial PRIMARY KEY, id2 text);
+        CREATE FUNCTION test7_before_stat_trig() RETURNS trigger
+          LANGUAGE plpgsql as \$\$
+          BEGIN
+            INSERT INTO test7 VALUES (DEFAULT, 'triggered stat before');
+            RETURN NULL;
+          END; \$\$;
+        CREATE FUNCTION test7_after_stat_trig() RETURNS trigger
+          LANGUAGE plpgsql as \$\$
+          BEGIN
+            INSERT INTO test7 VALUES (DEFAULT, 'triggered stat after');
+            RETURN NULL;
+          END; \$\$;
+        CREATE TRIGGER test7_before_stat_truncate
+          BEFORE TRUNCATE ON test7
+          FOR EACH STATEMENT EXECUTE PROCEDURE test7_before_stat_trig();
+        CREATE TRIGGER test7_after_stat_truncate
+          AFTER TRUNCATE ON test7
+          FOR EACH STATEMENT EXECUTE PROCEDURE test7_after_stat_trig();
+        INSERT INTO test7 VALUES (DEFAULT, 1);
+        TRUNCATE test7;
+        COPY test7 FROM '$copy_file' DELIMITER ',';
+        COMMIT;");
+    $node->stop('immediate');
+    $node->start;
+    $result = $node->safe_psql('postgres', "SELECT count(*) FROM test7;");
+    is($result, qq(4),
+       "wal_level = $wal_level, replay of optimized copy with before trigger");
+
+    $node->teardown_node;
+    $node->clean_node;
+    return;
+}
+
+# Run same test suite for multiple wal_level values.
+run_wal_optimize("minimal");
+run_wal_optimize("replica");
-- 
2.16.3
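
The new test file can be run on its own with the recovery TAP harness;
something like this should work from the source tree:

    make -C src/test/recovery check PROVE_TESTS='t/016_wal_optimize.pl'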

From 5613d41deca5a5691d18457db6bfd177ee2febe1 Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <horiguchi.kyotaro@lab.ntt.co.jp>
Date: Thu, 11 Oct 2018 10:03:08 +0900
Subject: [PATCH 2/4] Write WAL for empty nbtree index build

After relation truncation, indexes are also rebuilt. In minimal mode,
if the truncation happened within the index's creation transaction, the
rebuild emits no WAL and crash recovery leaves an empty index heap,
which is considered broken. This patch forces WAL to be emitted when an
index build produces an empty nbtree index.
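
As a minimal sketch, the failing sequence looks like this (assuming
wal_level = minimal; this is the scenario the first case of the new TAP
test exercises):

    BEGIN;
    CREATE TABLE t (id serial PRIMARY KEY);  -- builds the nbtree index
    TRUNCATE t;                              -- rebuilds an empty index, no WAL
    COMMIT;
    -- crash here: recovery cannot reconstruct the rebuilt metapage, and
    -- the index is left in an invalid state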
---
 src/backend/access/nbtree/nbtsort.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index dc398e1186..70d4380533 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -611,8 +611,14 @@ _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
     /* Ensure rd_smgr is open (could have been closed by relcache flush!) */
     RelationOpenSmgr(wstate->index);
 
-    /* XLOG stuff */
-    if (wstate->btws_use_wal)
+    /* XLOG stuff
+     *
+     * Even in minimal mode, WAL is required here if the relation was
+     * truncated after being created in the same transaction. It is not
+     * needed otherwise, but we don't bother identifying the case precisely.
+     */
+    if (wstate->btws_use_wal ||
+        (blkno == BTREE_METAPAGE && BTPageGetMeta(page)->btm_root == 0))
     {
         /* We use the heap NEWPAGE record type for this */
         log_newpage(&wstate->index->rd_node, MAIN_FORKNUM, blkno, page, true);
@@ -1056,6 +1062,11 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
      * set to point to "P_NONE").  This changes the index to the "valid" state
      * by filling in a valid magic number in the metapage.
      */
+    /*
+     * If no tuple was inserted, it's possible that we are truncating a
+     * relation. We need to emit WAL for the metapage in that case, but it
+     * is not required otherwise.
+     */
     metapage = (Page) palloc(BLCKSZ);
     _bt_initmetapage(metapage, rootblkno, rootlevel);
     _bt_blwritepage(wstate, metapage, BTREE_METAPAGE);
-- 
2.16.3

From ec2f481feb39247584e06b92aaee42c21c9dec2c Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <horiguchi.kyotaro@lab.ntt.co.jp>
Date: Thu, 11 Oct 2018 16:00:44 +0900
Subject: [PATCH 3/4] Add infrastructure to WAL-logging skip feature

We used to optimize WAL-logging away for truncation of tables created
in the same transaction in minimal mode, by just signaling it with the
HEAP_INSERT_SKIP_WAL option on heap operations. That mechanism can emit
WAL records that result in a corrupt state for certain sequences of
in-transaction operations. This patch provides infrastructure to track
pending at-commit fsyncs and in-transaction truncations for a relation.
heap_register_sync() should be used to start tracking before batch
operations like COPY and CLUSTER, and BufferNeedsWAL() should be used
instead of RelationNeedsWAL() at the places that decide whether
heap-modifying operations are WAL-logged.
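
As a sketch, a batch-loading code path would use the new infrastructure
like this (hypothetical caller; heap_register_sync(), BufferNeedsWAL() and
smgrDoPendingSyncs() are the functions added by this patch):

    /* once, before the batch operation, when wal_level = minimal */
    heap_register_sync(rel);    /* remember current size; skip WAL above it */

    /* at each heap-modifying site, instead of RelationNeedsWAL(rel) */
    if (BufferNeedsWAL(rel, buffer))
    {
        /* emit the WAL record as before */
    }

    /* nothing to do at the end: smgrDoPendingSyncs() flushes and fsyncs the
       relation at commit, and forgets the entries on abort */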
---
 src/backend/access/heap/heapam.c    |  31 ++++
 src/backend/access/transam/xact.c   |   7 +
 src/backend/catalog/storage.c       | 317 +++++++++++++++++++++++++++++++++---
 src/backend/commands/tablecmds.c    |   3 +-
 src/backend/storage/buffer/bufmgr.c |  40 ++++-
 src/backend/utils/cache/relcache.c  |  13 ++
 src/include/access/heapam.h         |   1 +
 src/include/catalog/storage.h       |   5 +-
 src/include/storage/bufmgr.h        |   2 +
 src/include/utils/rel.h             |   8 +
 10 files changed, 395 insertions(+), 32 deletions(-)

diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 4406a69ef2..5972e9d190 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -50,6 +50,7 @@
 #include "access/xloginsert.h"
 #include "access/xlogutils.h"
 #include "catalog/catalog.h"
+#include "catalog/storage.h"
 #include "miscadmin.h"
 #include "pgstat.h"
 #include "port/atomics.h"
@@ -9080,3 +9081,33 @@ heap_mask(char *pagedata, BlockNumber blkno)
         }
     }
 }
+
+/*
+ *    heap_register_sync    - register a heap to be synced to disk at commit
+ *
+ * This can be used to skip WAL-logging changes on a relation file that has
+ * been created in the same transaction. This makes note of the current size of
+ * the relation, and ensures that when the relation is extended, any changes
+ * to the new blocks in the heap, in the same transaction, will not be
+ * WAL-logged. Instead, the heap contents are flushed to disk at commit,
+ * like heap_sync() does.
+ *
+ * This does the same for the TOAST heap, if any. Indexes are not affected.
+ */
+void
+heap_register_sync(Relation rel)
+{
+    /* non-WAL-logged tables never need fsync */
+    if (!RelationNeedsWAL(rel))
+        return;
+
+    RecordPendingSync(rel);
+    if (OidIsValid(rel->rd_rel->reltoastrelid))
+    {
+        Relation    toastrel;
+
+        toastrel = heap_open(rel->rd_rel->reltoastrelid, AccessShareLock);
+        RecordPendingSync(toastrel);
+        heap_close(toastrel, AccessShareLock);
+    }
+}
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index 0181976964..fa845bfd45 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -2020,6 +2020,9 @@ CommitTransaction(void)
     /* close large objects before lower-level cleanup */
     AtEOXact_LargeObject(true);
 
+    /* Flush updates to relations that we didn't WAL-log */
+    smgrDoPendingSyncs(true);
+
     /*
      * Mark serializable transaction as complete for predicate locking
      * purposes.  This should be done as late as we can put it and still allow
@@ -2249,6 +2252,9 @@ PrepareTransaction(void)
     /* close large objects before lower-level cleanup */
     AtEOXact_LargeObject(true);
 
+    /* Flush updates to relations that we didn't WAL-log */
+    smgrDoPendingSyncs(true);
+
     /*
      * Mark serializable transaction as complete for predicate locking
      * purposes.  This should be done as late as we can put it and still allow
@@ -2568,6 +2574,7 @@ AbortTransaction(void)
     AtAbort_Notify();
     AtEOXact_RelationMap(false, is_parallel_worker);
     AtAbort_Twophase();
+    smgrDoPendingSyncs(false);    /* abandon pending syncs */
 
     /*
      * Advertise the fact that we aborted in pg_xact (assuming that we got as
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index 0302507e6f..68947b017f 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -28,6 +28,7 @@
 #include "catalog/storage_xlog.h"
 #include "storage/freespace.h"
 #include "storage/smgr.h"
+#include "utils/hsearch.h"
 #include "utils/memutils.h"
 #include "utils/rel.h"
 
@@ -62,6 +63,49 @@ typedef struct PendingRelDelete
 
 static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */
 
+/*
+ * We also track relation files (RelFileNode values) that have been created
+ * in the same transaction, and that have been modified without WAL-logging
+ * the action (an optimization possible with wal_level=minimal). When we are
+ * about to skip WAL-logging, a PendingRelSync entry is created, and
+ * 'sync_above' is set to the current size of the relation. Any operations
+ * on blocks < sync_above need to be WAL-logged as usual, but for operations
+ * on higher blocks, WAL-logging is skipped.
+ *
+ * NB: after WAL-logging has been skipped for a block, we must not WAL-log
+ * any subsequent actions on the same block either. Replaying the WAL record
+ * of the subsequent action might fail otherwise, as the "before" state of
+ * the block might not match, as the earlier actions were not WAL-logged.
+ * Likewise, after we have WAL-logged an operation for a block, we must
+ * WAL-log any subsequent operations on the same page as well. Replaying
+ * a possible full-page-image from the earlier WAL record would otherwise
+ * revert the page to the old state, even if we sync the relation at end
+ * of transaction.
+ *
+ * If a relation is truncated (without creating a new relfilenode), and we
+ * emit a WAL record of the truncation, we can't skip WAL-logging for any
+ * of the truncated blocks anymore, as replaying the truncation record will
+ * destroy all the data inserted after that. But if we have already decided
+ * to skip WAL-logging changes to a relation, and the relation is truncated,
+ * we don't need to WAL-log the truncation either.
+ *
+ * This mechanism is currently only used by heaps. Indexes are always
+ * WAL-logged. Also, this only applies for wal_level=minimal; with higher
+ * WAL levels we need the WAL for PITR/replication anyway.
+ */
+typedef struct PendingRelSync
+{
+    RelFileNode relnode;        /* relation created in same xact */
+    BlockNumber sync_above;        /* WAL-logging skipped for blocks >=
+                                 * sync_above */
+    BlockNumber truncated_to;    /* truncation WAL record was written */
+}    PendingRelSync;
+
+/* Relations that need to be fsync'd at commit */
+static HTAB *pendingSyncs = NULL;
+
+static PendingRelSync *getPendingSyncEntry(Relation rel, bool create);
+
 /*
  * RelationCreateStorage
  *        Create physical storage for a relation.
@@ -259,37 +303,117 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
      */
     if (RelationNeedsWAL(rel))
     {
-        /*
-         * Make an XLOG entry reporting the file truncation.
-         */
-        XLogRecPtr    lsn;
-        xl_smgr_truncate xlrec;
+        PendingRelSync *pending_sync;
 
-        xlrec.blkno = nblocks;
-        xlrec.rnode = rel->rd_node;
-        xlrec.flags = SMGR_TRUNCATE_ALL;
+        /* get the pending sync entry; create it if there is none yet */
+        pending_sync = getPendingSyncEntry(rel, true);
 
-        XLogBeginInsert();
-        XLogRegisterData((char *) &xlrec, sizeof(xlrec));
+        if (pending_sync->sync_above == InvalidBlockNumber ||
+            pending_sync->sync_above < nblocks)
+        {
+            /*
+             * This is the first time truncation of this relation in this
+             * transaction or truncation that leaves pages that need at-commit
+             * fsync.  Make an XLOG entry reporting the file truncation.
+             */
+            XLogRecPtr        lsn;
+            xl_smgr_truncate xlrec;
 
-        lsn = XLogInsert(RM_SMGR_ID,
-                         XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE);
+            xlrec.blkno = nblocks;
+            xlrec.rnode = rel->rd_node;
+            xlrec.flags = SMGR_TRUNCATE_ALL;
 
-        /*
-         * Flush, because otherwise the truncation of the main relation might
-         * hit the disk before the WAL record, and the truncation of the FSM
-         * or visibility map. If we crashed during that window, we'd be left
-         * with a truncated heap, but the FSM or visibility map would still
-         * contain entries for the non-existent heap pages.
-         */
-        if (fsm || vm)
-            XLogFlush(lsn);
+            XLogBeginInsert();
+            XLogRegisterData((char *) &xlrec, sizeof(xlrec));
+
+            lsn = XLogInsert(RM_SMGR_ID,
+                             XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE);
+
+            elog(DEBUG2, "WAL-logged truncation of rel %u/%u/%u to %u blocks",
+                 rel->rd_node.spcNode, rel->rd_node.dbNode, rel->rd_node.relNode,
+                 nblocks);
+
+            /*
+             * Flush, because otherwise the truncation of the main relation
+             * might hit the disk before the WAL record, and the truncation of
+             * the FSM or visibility map. If we crashed during that window,
+             * we'd be left with a truncated heap, but the FSM or visibility
+             * map would still contain entries for the non-existent heap
+             * pages.
+             */
+            if (fsm || vm)
+                XLogFlush(lsn);
+
+            pending_sync->truncated_to = nblocks;
+        }
     }
 
     /* Do the real work */
     smgrtruncate(rel->rd_smgr, MAIN_FORKNUM, nblocks);
 }
 
+/*
+ * getPendingSyncEntry: get pending sync entry.
+ *
+ * Returns the pending sync entry for the relation. The entry tracks pending
+ * at-commit fsyncs for the relation.  Creates one if needed when create is
+ * true.
+ */
+static PendingRelSync *
+getPendingSyncEntry(Relation rel, bool create)
+{
+    PendingRelSync *pendsync_entry = NULL;
+    bool            found;
+
+    if (rel->pending_sync)
+        return rel->pending_sync;
+
+    /* we know we don't have pending sync entry */
+    if (!create && rel->no_pending_sync)
+        return NULL;
+
+    if (!pendingSyncs)
+    {
+        /* First time through: initialize the hash table */
+        HASHCTL        ctl;
+
+        if (!create)
+            return NULL;
+
+        MemSet(&ctl, 0, sizeof(ctl));
+        ctl.keysize = sizeof(RelFileNode);
+        ctl.entrysize = sizeof(PendingRelSync);
+        ctl.hash = tag_hash;
+        pendingSyncs = hash_create("pending relation sync table", 5,
+                                   &ctl, HASH_ELEM | HASH_FUNCTION);
+    }
+
+    elog(DEBUG2, "getPendingSyncEntry: accessing hash for %d",
+         rel->rd_node.relNode);
+    pendsync_entry = (PendingRelSync *)
+        hash_search(pendingSyncs, (void *) &rel->rd_node,
+                    create ? HASH_ENTER : HASH_FIND, &found);
+
+    if (!pendsync_entry)
+    {
+        rel->no_pending_sync = true;
+        return NULL;
+    }
+
+    /* new entry created */
+    if (!found)
+    {
+        pendsync_entry->truncated_to = InvalidBlockNumber;
+        pendsync_entry->sync_above = InvalidBlockNumber;
+    }
+
+    /* hold shortcut in Relation */
+    rel->no_pending_sync = false;
+    rel->pending_sync = pendsync_entry;
+
+    return pendsync_entry;
+}
+
 /*
  *    smgrDoPendingDeletes() -- Take care of relation deletes at end of xact.
  *
@@ -367,6 +491,24 @@ smgrDoPendingDeletes(bool isCommit)
     }
 }
 
+/*
+ * RelationRemovePendingSync() -- remove pendingSync entry for a relation
+ */
+void
+RelationRemovePendingSync(Relation rel)
+{
+    bool found;
+
+    rel->pending_sync = NULL;
+    rel->no_pending_sync = true;
+    if (pendingSyncs)
+    {
+        elog(DEBUG2, "RelationRemovePendingSync: accessing hash");
+        hash_search(pendingSyncs, (void *) &rel->rd_node, HASH_REMOVE, &found);
+    }
+}
+
+
 /*
  * smgrGetPendingDeletes() -- Get a list of non-temp relations to be deleted.
  *
@@ -418,6 +560,139 @@ smgrGetPendingDeletes(bool forCommit, RelFileNode **ptr)
     return nrels;
 }
 
+
+/*
+ * Remember that the given relation needs to be sync'd at commit, because we
+ * are going to skip WAL-logging subsequent actions to it.
+ */
+void
+RecordPendingSync(Relation rel)
+{
+    BlockNumber nblocks;
+    PendingRelSync *pending_sync;
+
+    Assert(RelationNeedsWAL(rel));
+
+    /* get the pending sync entry; create it if there is none yet */
+    pending_sync = getPendingSyncEntry(rel, true);
+
+    nblocks = RelationGetNumberOfBlocks(rel);
+
+    if (pending_sync->sync_above != InvalidBlockNumber)
+    {
+        elog(DEBUG2,
+             "pending sync for rel %u/%u/%u was already registered at block %u (new %u)",
+             rel->rd_node.spcNode, rel->rd_node.dbNode, rel->rd_node.relNode,
+             pending_sync->sync_above, nblocks);
+
+        return;
+    }
+
+    elog(DEBUG2,
+         "registering new pending sync for rel %u/%u/%u at block %u",
+         rel->rd_node.spcNode, rel->rd_node.dbNode, rel->rd_node.relNode,
+         nblocks);
+    pending_sync->sync_above = nblocks;
+}
+
+/*
+ * Do changes to given heap page need to be WAL-logged?
+ *
+ * This takes into account any previous RecordPendingSync() requests.
+ *
+ * Note that it is required to check this before creating any WAL records for
+ * heap pages - it is not merely an optimization! WAL-logging a record, when
+ * we have already skipped a previous WAL record for the same page could lead
+ * to failure at WAL replay, as the "before" state expected by the record
+ * might not match what's on disk. Also, if the heap was truncated earlier, we
+ * must WAL-log any changes to the once-truncated blocks, because replaying
+ * the truncation record will destroy them.
+ */
+bool
+BufferNeedsWAL(Relation rel, Buffer buf)
+{
+    BlockNumber        blkno = InvalidBlockNumber;
+    PendingRelSync *pending_sync;
+
+    if (!RelationNeedsWAL(rel))
+        return false;
+
+    /* fetch existing pending sync entry */
+    pending_sync = getPendingSyncEntry(rel, false);
+
+    /*
+     * There is no point in doing further work if we know that the relation
+     * has no pending sync.
+     */
+    if (!pending_sync)
+        return true;
+
+    Assert(BufferIsValid(buf));
+
+    blkno = BufferGetBlockNumber(buf);
+
+    /* we don't skip WAL-logging for blocks that have already been WAL-logged */
+    if (pending_sync->sync_above == InvalidBlockNumber ||
+        pending_sync->sync_above > blkno)
+    {
+        elog(DEBUG2, "not skipping WAL-logging for rel %u/%u/%u block %u, because sync_above is %u",
+             rel->rd_node.spcNode, rel->rd_node.dbNode, rel->rd_node.relNode,
+             blkno, pending_sync->sync_above);
+        return true;
+    }
+
+    /*
+     * We have emitted a truncation record for this block.
+     */
+    if (pending_sync->truncated_to != InvalidBlockNumber &&
+        pending_sync->truncated_to <= blkno)
+    {
+        elog(DEBUG2, "not skipping WAL-logging for rel %u/%u/%u block %u, because it was truncated earlier in the same
xact",
+             rel->rd_node.spcNode, rel->rd_node.dbNode, rel->rd_node.relNode,
+             blkno);
+        return true;
+    }
+
+    elog(DEBUG2, "skipping WAL-logging for rel %u/%u/%u block %u",
+         rel->rd_node.spcNode, rel->rd_node.dbNode, rel->rd_node.relNode,
+         blkno);
+
+    return false;
+}
+
+/*
+ * Sync to disk any relations that we skipped WAL-logging for earlier.
+ */
+void
+smgrDoPendingSyncs(bool isCommit)
+{
+    if (!pendingSyncs)
+        return;
+
+    if (isCommit)
+    {
+        HASH_SEQ_STATUS status;
+        PendingRelSync *pending;
+
+        hash_seq_init(&status, pendingSyncs);
+
+        while ((pending = hash_seq_search(&status)) != NULL)
+        {
+            if (pending->sync_above != InvalidBlockNumber)
+            {
+                FlushRelationBuffersWithoutRelCache(pending->relnode, false);
+                smgrimmedsync(smgropen(pending->relnode, InvalidBackendId), MAIN_FORKNUM);
+
+                elog(DEBUG2, "syncing rel %u/%u/%u", pending->relnode.spcNode,
+                     pending->relnode.dbNode, pending->relnode.relNode);
+            }
+        }
+    }
+
+    hash_destroy(pendingSyncs);
+    pendingSyncs = NULL;
+}
+
 /*
  *    PostPrepare_smgr -- Clean up after a successful PREPARE
  *
diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c
index 434be403fe..e15296e373 100644
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -11387,11 +11387,12 @@ ATExecSetTableSpace(Oid tableOid, Oid newTableSpace, LOCKMODE lockmode)
 
     /*
      * Create and copy all forks of the relation, and schedule unlinking of
-     * old physical files.
+     * old physical files. Pending syncs for the old node are no longer needed.
      *
      * NOTE: any conflict in relfilenode value will be caught in
      * RelationCreateStorage().
      */
+    RelationRemovePendingSync(rel);
     RelationCreateStorage(newrnode, rel->rd_rel->relpersistence);
 
     /* copy main fork */
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 273e2f385f..a9741f138c 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -451,6 +451,7 @@ static BufferDesc *BufferAlloc(SMgrRelation smgr,
             BufferAccessStrategy strategy,
             bool *foundPtr);
 static void FlushBuffer(BufferDesc *buf, SMgrRelation reln);
+static void FlushRelationBuffers_common(SMgrRelation smgr, bool islocal);
 static void AtProcExit_Buffers(int code, Datum arg);
 static void CheckForBufferLeaks(void);
 static int    rnode_comparator(const void *p1, const void *p2);
@@ -3153,20 +3154,41 @@ PrintPinnedBufs(void)
 void
 FlushRelationBuffers(Relation rel)
 {
-    int            i;
-    BufferDesc *bufHdr;
-
     /* Open rel at the smgr level if not already done */
     RelationOpenSmgr(rel);
 
-    if (RelationUsesLocalBuffers(rel))
+    FlushRelationBuffers_common(rel->rd_smgr, RelationUsesLocalBuffers(rel));
+}
+
+/*
+ * Like FlushRelationBuffers(), but the relation is specified by a
+ * RelFileNode
+ */
+void
+FlushRelationBuffersWithoutRelCache(RelFileNode rnode, bool islocal)
+{
+    FlushRelationBuffers_common(smgropen(rnode, InvalidBackendId), islocal);
+}
+
+/*
+ * Code shared between functions FlushRelationBuffers() and
+ * FlushRelationBuffersWithoutRelCache().
+ */
+static void
+FlushRelationBuffers_common(SMgrRelation smgr, bool islocal)
+{
+    RelFileNode rnode = smgr->smgr_rnode.node;
+    int            i;
+    BufferDesc *bufHdr;
+
+    if (islocal)
     {
         for (i = 0; i < NLocBuffer; i++)
         {
             uint32        buf_state;
 
             bufHdr = GetLocalBufferDescriptor(i);
-            if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
+            if (RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
                 ((buf_state = pg_atomic_read_u32(&bufHdr->state)) &
                  (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
             {
@@ -3183,7 +3205,7 @@ FlushRelationBuffers(Relation rel)
 
                 PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
 
-                smgrwrite(rel->rd_smgr,
+                smgrwrite(smgr,
                           bufHdr->tag.forkNum,
                           bufHdr->tag.blockNum,
                           localpage,
@@ -3213,18 +3235,18 @@ FlushRelationBuffers(Relation rel)
          * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
          * and saves some cycles.
          */
-        if (!RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node))
+        if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode))
             continue;
 
         ReservePrivateRefCountEntry();
 
         buf_state = LockBufHdr(bufHdr);
-        if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
+        if (RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
             (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
         {
             PinBuffer_Locked(bufHdr);
             LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
-            FlushBuffer(bufHdr, rel->rd_smgr);
+            FlushBuffer(bufHdr, smgr);
             LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
             UnpinBuffer(bufHdr, true);
         }
diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c
index af96a03338..66e7d5a301 100644
--- a/src/backend/utils/cache/relcache.c
+++ b/src/backend/utils/cache/relcache.c
@@ -75,6 +75,7 @@
 #include "partitioning/partbounds.h"
 #include "rewrite/rewriteDefine.h"
 #include "rewrite/rowsecurity.h"
+#include "storage/bufmgr.h"
 #include "storage/lmgr.h"
 #include "storage/smgr.h"
 #include "utils/array.h"
@@ -414,6 +415,10 @@ AllocateRelationDesc(Form_pg_class relp)
     /* which we mark as a reference-counted tupdesc */
     relation->rd_att->tdrefcount = 1;
 
+    /* We don't know yet whether a pending sync exists for this relation */
+    relation->pending_sync = NULL;
+    relation->no_pending_sync = false;
+
     MemoryContextSwitchTo(oldcxt);
 
     return relation;
@@ -1869,6 +1874,10 @@ formrdesc(const char *relationName, Oid relationReltype,
         relation->rd_rel->relhasindex = true;
     }
 
+    /* We don't know yet whether a pending sync exists for this relation */
+    relation->pending_sync = NULL;
+    relation->no_pending_sync = false;
+
     /*
      * add new reldesc to relcache
      */
@@ -3263,6 +3272,10 @@ RelationBuildLocalRelation(const char *relname,
     else
         rel->rd_rel->relfilenode = relfilenode;
 
+    /* newly built relation has no pending sync */
+    rel->no_pending_sync = true;
+    rel->pending_sync = NULL;
+
     RelationInitLockInfo(rel);    /* see lmgr.c */
 
     RelationInitPhysicalAddr(rel);
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index ab0879138f..fab5052868 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -163,6 +163,7 @@ extern void simple_heap_delete(Relation relation, ItemPointer tid);
 extern void simple_heap_update(Relation relation, ItemPointer otid,
                    HeapTuple tup);
 
+extern void heap_register_sync(Relation relation);
 extern void heap_sync(Relation relation);
 extern void heap_update_snapshot(HeapScanDesc scan, Snapshot snapshot);
 
diff --git a/src/include/catalog/storage.h b/src/include/catalog/storage.h
index 9f638be924..95d7898e25 100644
--- a/src/include/catalog/storage.h
+++ b/src/include/catalog/storage.h
@@ -22,13 +22,16 @@ extern void RelationCreateStorage(RelFileNode rnode, char relpersistence);
 extern void RelationDropStorage(Relation rel);
 extern void RelationPreserveStorage(RelFileNode rnode, bool atCommit);
 extern void RelationTruncate(Relation rel, BlockNumber nblocks);
-
+extern void RelationRemovePendingSync(Relation rel);
 /*
  * These functions used to be in storage/smgr/smgr.c, which explains the
  * naming
  */
 extern void smgrDoPendingDeletes(bool isCommit);
 extern int    smgrGetPendingDeletes(bool forCommit, RelFileNode **ptr);
+extern void smgrDoPendingSyncs(bool isCommit);
+extern void RecordPendingSync(Relation rel);
+extern bool BufferNeedsWAL(Relation rel, Buffer buf);
 extern void AtSubCommit_smgr(void);
 extern void AtSubAbort_smgr(void);
 extern void PostPrepare_smgr(void);
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index c5826f691d..8a9ea041dd 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -189,6 +189,8 @@ extern BlockNumber RelationGetNumberOfBlocksInFork(Relation relation,
                                 ForkNumber forkNum);
 extern void FlushOneBuffer(Buffer buffer);
 extern void FlushRelationBuffers(Relation rel);
+extern void FlushRelationBuffersWithoutRelCache(RelFileNode rnode,
+                                    bool islocal);
 extern void FlushDatabaseBuffers(Oid dbid);
 extern void DropRelFileNodeBuffers(RelFileNodeBackend rnode,
                        ForkNumber forkNum, BlockNumber firstDelBlock);
diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h
index 1d05465303..0f39f209d3 100644
--- a/src/include/utils/rel.h
+++ b/src/include/utils/rel.h
@@ -185,6 +185,14 @@ typedef struct RelationData
 
     /* use "struct" here to avoid needing to include pgstat.h: */
     struct PgStat_TableStatus *pgstat_info; /* statistics collection area */
+
+    /*
+     * no_pending_sync is true if this relation is known not to have a
+     * pending sync.  Otherwise, a lookup for a registered sync is required
+     * when pending_sync is NULL.
+     */
+    bool                   no_pending_sync;
+    struct PendingRelSync *pending_sync;
 } RelationData;
 
 
-- 
2.16.3

From 7b52c9dd2d6bb76f0264bfd0f17d034001351b6f Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <horiguchi.kyotaro@lab.ntt.co.jp>
Date: Thu, 11 Oct 2018 16:18:04 +0900
Subject: [PATCH 4/4] Fix WAL skipping feature.

This patch replaces the WAL-skipping mechanism based on
HEAP_INSERT_SKIP_WAL with the pending-sync tracking infrastructure.
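
The caller-side conversion is mechanical. Taking copy.c as the example (the
same pattern appears in createas.c, matview.c and tablecmds.c below), the old

    if (!XLogIsNeeded())
        hi_options |= HEAP_INSERT_SKIP_WAL;
    ...
    if (hi_options & HEAP_INSERT_SKIP_WAL)
        heap_sync(cstate->rel);

becomes

    if (!XLogIsNeeded())
        heap_register_sync(cstate->rel);
    /* no explicit heap_sync(); the relation is synced at commit */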
---
 src/backend/access/heap/heapam.c        | 70 ++++++++++++++++++++++-----------
 src/backend/access/heap/pruneheap.c     |  3 +-
 src/backend/access/heap/rewriteheap.c   |  3 --
 src/backend/access/heap/vacuumlazy.c    |  6 +--
 src/backend/access/heap/visibilitymap.c |  3 +-
 src/backend/commands/copy.c             | 13 +++---
 src/backend/commands/createas.c         |  9 ++---
 src/backend/commands/matview.c          |  6 +--
 src/backend/commands/tablecmds.c        |  5 +--
 src/include/access/heapam.h             |  9 ++---
 10 files changed, 72 insertions(+), 55 deletions(-)

diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 5972e9d190..a2d8aefa28 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -28,6 +28,28 @@
  *      the POSTGRES heap access method used for all POSTGRES
  *      relations.
  *
+ * WAL CONSIDERATIONS
+ *      All heap operations are normally WAL-logged, but there are a few
+ *      exceptions. Temporary and unlogged relations never need to be
+ *      WAL-logged, but we can also skip WAL-logging for a table that was
+ *      created in the same transaction, if we don't need WAL for PITR or
+ *      WAL archival purposes (i.e. if wal_level=minimal), and we fsync()
+ *      the file to disk at COMMIT instead.
+ *
+ *      The same-relation optimization is not employed automatically on all
+ *      updates to a table that was created in the same transaction, because
+ *      for a small number of changes, it's cheaper to just create the WAL
+ *      records than fsync()ing the whole relation at COMMIT. It is only
+ *      worthwhile for (presumably) large operations like COPY, CLUSTER,
+ *      or VACUUM FULL. Use heap_register_sync() to initiate such an
+ *      operation; it will cause any subsequent updates to the table to skip
+ *      WAL-logging, if possible, and cause the heap to be synced to disk at
+ *      COMMIT.
+ *
+ *      To make that work, all modifications to heap must use
+ *      BufferNeedsWAL() to check if WAL-logging is needed in this transaction
+ *      for the given block.
+ *
  *-------------------------------------------------------------------------
  */
 #include "postgres.h"
@@ -2127,12 +2149,6 @@ ReleaseBulkInsertStatePin(BulkInsertState bistate)
  * The new tuple is stamped with current transaction ID and the specified
  * command ID.
  *
- * If the HEAP_INSERT_SKIP_WAL option is specified, the new tuple is not
- * logged in WAL, even for a non-temp relation.  Safe usage of this behavior
- * requires that we arrange that all new tuples go into new pages not
- * containing any tuples from other transactions, and that the relation gets
- * fsync'd before commit.  (See also heap_sync() comments)
- *
  * The HEAP_INSERT_SKIP_FSM option is passed directly to
  * RelationGetBufferForTuple, which see for more info.
  *
@@ -2239,7 +2255,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid,
     MarkBufferDirty(buffer);
 
     /* XLOG stuff */
-    if (!(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation))
+    if (BufferNeedsWAL(relation, buffer))
     {
         xl_heap_insert xlrec;
         xl_heap_header xlhdr;
@@ -2414,7 +2430,6 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples,
     int            ndone;
     PGAlignedBlock scratch;
     Page        page;
-    bool        needwal;
     Size        saveFreeSpace;
     bool        need_tuple_data = RelationIsLogicallyLogged(relation);
     bool        need_cids = RelationIsAccessibleInLogicalDecoding(relation);
@@ -2422,7 +2437,6 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples,
     /* currently not needed (thus unsupported) for heap_multi_insert() */
     AssertArg(!(options & HEAP_INSERT_NO_LOGICAL));
 
-    needwal = !(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation);
     saveFreeSpace = RelationGetTargetPageFreeSpace(relation,
                                                    HEAP_DEFAULT_FILLFACTOR);
 
@@ -2464,6 +2478,7 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples,
         Buffer        vmbuffer = InvalidBuffer;
         bool        all_visible_cleared = false;
         int            nthispage;
+        bool        needwal;
 
         CHECK_FOR_INTERRUPTS();
 
@@ -2475,6 +2490,7 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples,
                                            InvalidBuffer, options, bistate,
                                            &vmbuffer, NULL);
         page = BufferGetPage(buffer);
+        needwal = BufferNeedsWAL(relation, buffer);
 
         /* NO EREPORT(ERROR) from here till changes are logged */
         START_CRIT_SECTION();
@@ -3037,7 +3053,7 @@ l1:
      * NB: heap_abort_speculative() uses the same xlog record and replay
      * routines.
      */
-    if (RelationNeedsWAL(relation))
+    if (BufferNeedsWAL(relation, buffer))
     {
         xl_heap_delete xlrec;
         XLogRecPtr    recptr;
@@ -3777,7 +3793,7 @@ l2:
 
         MarkBufferDirty(buffer);
 
-        if (RelationNeedsWAL(relation))
+        if (BufferNeedsWAL(relation, buffer))
         {
             xl_heap_lock xlrec;
             XLogRecPtr    recptr;
@@ -3992,7 +4008,8 @@ l2:
     MarkBufferDirty(buffer);
 
     /* XLOG stuff */
-    if (RelationNeedsWAL(relation))
+    if (BufferNeedsWAL(relation, buffer) ||
+        BufferNeedsWAL(relation, newbuf))
     {
         XLogRecPtr    recptr;
 
@@ -4882,7 +4899,7 @@ failed:
      * (Also, in a PITR log-shipping or 2PC environment, we have to have XLOG
      * entries for everything anyway.)
      */
-    if (RelationNeedsWAL(relation))
+    if (BufferNeedsWAL(relation, *buffer))
     {
         xl_heap_lock xlrec;
         XLogRecPtr    recptr;
@@ -5626,7 +5643,7 @@ l4:
         MarkBufferDirty(buf);
 
         /* XLOG stuff */
-        if (RelationNeedsWAL(rel))
+        if (BufferNeedsWAL(rel, buf))
         {
             xl_heap_lock_updated xlrec;
             XLogRecPtr    recptr;
@@ -5786,7 +5803,7 @@ heap_finish_speculative(Relation relation, HeapTuple tuple)
     htup->t_ctid = tuple->t_self;
 
     /* XLOG stuff */
-    if (RelationNeedsWAL(relation))
+    if (BufferNeedsWAL(relation, buffer))
     {
         xl_heap_confirm xlrec;
         XLogRecPtr    recptr;
@@ -5919,7 +5936,7 @@ heap_abort_speculative(Relation relation, HeapTuple tuple)
      * The WAL records generated here match heap_delete().  The same recovery
      * routines are used.
      */
-    if (RelationNeedsWAL(relation))
+    if (BufferNeedsWAL(relation, buffer))
     {
         xl_heap_delete xlrec;
         XLogRecPtr    recptr;
@@ -6028,7 +6045,7 @@ heap_inplace_update(Relation relation, HeapTuple tuple)
     MarkBufferDirty(buffer);
 
     /* XLOG stuff */
-    if (RelationNeedsWAL(relation))
+    if (BufferNeedsWAL(relation, buffer))
     {
         xl_heap_inplace xlrec;
         XLogRecPtr    recptr;
@@ -7225,7 +7242,7 @@ log_heap_clean(Relation reln, Buffer buffer,
     XLogRecPtr    recptr;
 
     /* Caller should not call me on a non-WAL-logged relation */
-    Assert(RelationNeedsWAL(reln));
+    Assert(BufferNeedsWAL(reln, buffer));
 
     xlrec.latestRemovedXid = latestRemovedXid;
     xlrec.nredirected = nredirected;
@@ -7273,7 +7290,7 @@ log_heap_freeze(Relation reln, Buffer buffer, TransactionId cutoff_xid,
     XLogRecPtr    recptr;
 
     /* Caller should not call me on a non-WAL-logged relation */
-    Assert(RelationNeedsWAL(reln));
+    Assert(BufferNeedsWAL(reln, buffer));
     /* nor when there are no tuples to freeze */
     Assert(ntuples > 0);
 
@@ -7358,7 +7375,7 @@ log_heap_update(Relation reln, Buffer oldbuf,
     int            bufflags;
 
     /* Caller should not call me on a non-WAL-logged relation */
-    Assert(RelationNeedsWAL(reln));
+    Assert(BufferNeedsWAL(reln, newbuf) || BufferNeedsWAL(reln, oldbuf));
 
     XLogBeginInsert();
 
@@ -8962,9 +8979,16 @@ heap2_redo(XLogReaderState *record)
  *    heap_sync        - sync a heap, for use when no WAL has been written
  *
  * This forces the heap contents (including TOAST heap if any) down to disk.
- * If we skipped using WAL, and WAL is otherwise needed, we must force the
- * relation down to disk before it's safe to commit the transaction.  This
- * requires writing out any dirty buffers and then doing a forced fsync.
+ * If we made any changes to the heap bypassing the buffer manager, we must
+ * force the relation down to disk before it's safe to commit the
+ * transaction, because the direct modifications will not be flushed by
+ * the next checkpoint.
+ *
+ * We used to also use this after batch operations like COPY and CLUSTER,
+ * if we skipped using WAL and WAL is otherwise needed, but there were
+ * corner-cases involving other WAL-logged operations to the same
+ * relation, where that was not enough. heap_register_sync() should be
+ * used for that purpose instead.
  *
  * Indexes are not touched.  (Currently, index operations associated with
  * the commands that use this are WAL-logged and so do not need fsync.
diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c
index a3e51922d8..a05659b168 100644
--- a/src/backend/access/heap/pruneheap.c
+++ b/src/backend/access/heap/pruneheap.c
@@ -20,6 +20,7 @@
 #include "access/htup_details.h"
 #include "access/xlog.h"
 #include "catalog/catalog.h"
+#include "catalog/storage.h"
 #include "miscadmin.h"
 #include "pgstat.h"
 #include "storage/bufmgr.h"
@@ -258,7 +259,7 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin,
         /*
          * Emit a WAL HEAP_CLEAN record showing what we did
          */
-        if (RelationNeedsWAL(relation))
+        if (BufferNeedsWAL(relation, buffer))
         {
             XLogRecPtr    recptr;
 
diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c
index f5cf9ffc9c..1e9c07c9b2 100644
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -654,9 +654,6 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
     {
         int options = HEAP_INSERT_SKIP_FSM;
 
-        if (!state->rs_use_wal)
-            options |= HEAP_INSERT_SKIP_WAL;
-
         /*
          * While rewriting the heap for VACUUM FULL / CLUSTER, make sure data
          * for the TOAST table are not logically decoded.  The main heap is
diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c
index 37aa484ec3..3309c93bce 100644
--- a/src/backend/access/heap/vacuumlazy.c
+++ b/src/backend/access/heap/vacuumlazy.c
@@ -923,7 +923,7 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats,
                  * page has been previously WAL-logged, and if not, do that
                  * now.
                  */
-                if (RelationNeedsWAL(onerel) &&
+                if (BufferNeedsWAL(onerel, buf) &&
                     PageGetLSN(page) == InvalidXLogRecPtr)
                     log_newpage_buffer(buf, true);
 
@@ -1187,7 +1187,7 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats,
             }
 
             /* Now WAL-log freezing if necessary */
-            if (RelationNeedsWAL(onerel))
+            if (BufferNeedsWAL(onerel, buf))
             {
                 XLogRecPtr    recptr;
 
@@ -1568,7 +1568,7 @@ lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
     MarkBufferDirty(buffer);
 
     /* XLOG stuff */
-    if (RelationNeedsWAL(onerel))
+    if (BufferNeedsWAL(onerel, buffer))
     {
         XLogRecPtr    recptr;
 
diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c
index 931ae81fd6..53da0da68f 100644
--- a/src/backend/access/heap/visibilitymap.c
+++ b/src/backend/access/heap/visibilitymap.c
@@ -88,6 +88,7 @@
 #include "access/heapam_xlog.h"
 #include "access/visibilitymap.h"
 #include "access/xlog.h"
+#include "catalog/storage.h"
 #include "miscadmin.h"
 #include "storage/bufmgr.h"
 #include "storage/lmgr.h"
@@ -307,7 +308,7 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf,
         map[mapByte] |= (flags << mapOffset);
         MarkBufferDirty(vmBuf);
 
-        if (RelationNeedsWAL(rel))
+        if (BufferNeedsWAL(rel, heapBuf))
         {
             if (XLogRecPtrIsInvalid(recptr))
             {
diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c
index dbb06397e6..b42bfbfd47 100644
--- a/src/backend/commands/copy.c
+++ b/src/backend/commands/copy.c
@@ -2390,8 +2390,7 @@ CopyFrom(CopyState cstate)
      *    - data is being written to relfilenode created in this transaction
      * then we can skip writing WAL.  It's safe because if the transaction
      * doesn't commit, we'll discard the table (or the new relfilenode file).
-     * If it does commit, we'll have done the heap_sync at the bottom of this
-     * routine first.
+     * If it does commit, the relation is synced at commit.
      *
      * As mentioned in comments in utils/rel.h, the in-same-transaction test
      * is not always set correctly, since in rare cases rd_newRelfilenodeSubid
@@ -2437,7 +2436,7 @@ CopyFrom(CopyState cstate)
     {
         hi_options |= HEAP_INSERT_SKIP_FSM;
         if (!XLogIsNeeded())
-            hi_options |= HEAP_INSERT_SKIP_WAL;
+            heap_register_sync(cstate->rel);
     }
 
     /*
@@ -3092,11 +3091,11 @@ CopyFrom(CopyState cstate)
     FreeExecutorState(estate);
 
     /*
-     * If we skipped writing WAL, then we need to sync the heap (but not
-     * indexes since those use WAL anyway)
+     * If we skipped writing WAL, then the heap will be synced at the end of
+     * the transaction. (We used to do it here, but it was later found that,
+     * to be safe, we must also avoid WAL-logging any subsequent actions on
+     * the pages we skipped WAL for.) Indexes always use WAL.
      */
-    if (hi_options & HEAP_INSERT_SKIP_WAL)
-        heap_sync(cstate->rel);
 
     return processed;
 }
diff --git a/src/backend/commands/createas.c b/src/backend/commands/createas.c
index 2bc8f928ea..5eb45a4a65 100644
--- a/src/backend/commands/createas.c
+++ b/src/backend/commands/createas.c
@@ -556,8 +556,9 @@ intorel_startup(DestReceiver *self, int operation, TupleDesc typeinfo)
      * We can skip WAL-logging the insertions, unless PITR or streaming
      * replication is in use. We can skip the FSM in any case.
      */
-    myState->hi_options = HEAP_INSERT_SKIP_FSM |
-        (XLogIsNeeded() ? 0 : HEAP_INSERT_SKIP_WAL);
+    if (!XLogIsNeeded())
+        heap_register_sync(intoRelationDesc);
+    myState->hi_options = HEAP_INSERT_SKIP_FSM;
     myState->bistate = GetBulkInsertState();
 
     /* Not using WAL requires smgr_targblock be initially invalid */
@@ -600,9 +601,7 @@ intorel_shutdown(DestReceiver *self)
 
     FreeBulkInsertState(myState->bistate);
 
-    /* If we skipped using WAL, must heap_sync before commit */
-    if (myState->hi_options & HEAP_INSERT_SKIP_WAL)
-        heap_sync(myState->rel);
+    /* If we skipped using WAL, we will sync the relation at commit */
 
     /* close rel, but keep lock until commit */
     table_close(myState->rel, NoLock);
diff --git a/src/backend/commands/matview.c b/src/backend/commands/matview.c
index 5a47be4b33..5f447c6d94 100644
--- a/src/backend/commands/matview.c
+++ b/src/backend/commands/matview.c
@@ -463,7 +463,7 @@ transientrel_startup(DestReceiver *self, int operation, TupleDesc typeinfo)
      */
     myState->hi_options = HEAP_INSERT_SKIP_FSM | HEAP_INSERT_FROZEN;
     if (!XLogIsNeeded())
-        myState->hi_options |= HEAP_INSERT_SKIP_WAL;
+        heap_register_sync(transientrel);
     myState->bistate = GetBulkInsertState();
 
     /* Not using WAL requires smgr_targblock be initially invalid */
@@ -509,9 +509,7 @@ transientrel_shutdown(DestReceiver *self)
 
     FreeBulkInsertState(myState->bistate);
 
-    /* If we skipped using WAL, must heap_sync before commit */
-    if (myState->hi_options & HEAP_INSERT_SKIP_WAL)
-        heap_sync(myState->transientrel);
+    /* If we skipped using WAL, we will sync the relation at commit */
 
     /* close transientrel, but keep lock until commit */
     table_close(myState->transientrel, NoLock);
diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c
index e15296e373..65be3c2869 100644
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -4616,8 +4616,9 @@ ATRewriteTable(AlteredTableInfo *tab, Oid OIDNewHeap, LOCKMODE lockmode)
         bistate = GetBulkInsertState();
 
         hi_options = HEAP_INSERT_SKIP_FSM;
+
         if (!XLogIsNeeded())
-            hi_options |= HEAP_INSERT_SKIP_WAL;
+            heap_register_sync(newrel);
     }
     else
     {
@@ -4882,8 +4883,6 @@ ATRewriteTable(AlteredTableInfo *tab, Oid OIDNewHeap, LOCKMODE lockmode)
         FreeBulkInsertState(bistate);

         /* If we skipped writing WAL, then we need to sync the heap. */
-        if (hi_options & HEAP_INSERT_SKIP_WAL)
-            heap_sync(newrel);
 
         table_close(newrel, NoLock);
     }
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index fab5052868..32a365021a 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -27,11 +27,10 @@
 
 
 /* "options" flag bits for heap_insert */
-#define HEAP_INSERT_SKIP_WAL    0x0001
-#define HEAP_INSERT_SKIP_FSM    0x0002
-#define HEAP_INSERT_FROZEN        0x0004
-#define HEAP_INSERT_SPECULATIVE 0x0008
-#define HEAP_INSERT_NO_LOGICAL    0x0010
+#define HEAP_INSERT_SKIP_FSM    0x0001
+#define HEAP_INSERT_FROZEN        0x0002
+#define HEAP_INSERT_SPECULATIVE 0x0004
+#define HEAP_INSERT_NO_LOGICAL    0x0008
 
 typedef struct BulkInsertStateData *BulkInsertState;
 
-- 
2.16.3

