Re: [HACKERS] WAL logging problem in 9.4.3?

From: Kyotaro HORIGUCHI
Subject: Re: [HACKERS] WAL logging problem in 9.4.3?
Date:
Msg-id: 20190521.212948.34357392.horiguchi.kyotaro@lab.ntt.co.jp
In reply to: Re: [HACKERS] WAL logging problem in 9.4.3?  (Kyotaro HORIGUCHI <horiguchi.kyotaro@lab.ntt.co.jp>)
Responses: Re: [HACKERS] WAL logging problem in 9.4.3?  (Kyotaro HORIGUCHI <horiguchi.kyotaro@lab.ntt.co.jp>)
Re: [HACKERS] WAL logging problem in 9.4.3?  (Noah Misch <noah@leadboat.com>)
List: pgsql-hackers
Hello.

At Mon, 20 May 2019 15:54:30 +0900 (Tokyo Standard Time), Kyotaro HORIGUCHI <horiguchi.kyotaro@lab.ntt.co.jp> wrote in
<20190520.155430.215084510.horiguchi.kyotaro@lab.ntt.co.jp>
> > I suspect the design in the https://postgr.es/m/559FA0BA.3080808@iki.fi last
> > paragraph will be simpler, not more complex.  In the implementation I'm
> > envisioning, smgrDoPendingDeletes() would change name, perhaps to
> > AtEOXact_Storage().  For every relfilenode it does not delete, it would ensure
> > durability by syncing (for large nodes) or by WAL-logging each page (for small
> > nodes).  RelationNeedsWAL() would return false whenever the applicable
> > relfilenode appears in pendingDeletes.  Access methods would remove their
> > smgrimmedsync() calls, but they would otherwise not change.  Would anyone like
> > to try implementing that?
> 
> Following this direction, the attached PoC works *at least for*
> the wal_optimization TAP tests, but doing pending flush not in
> smgr but in relcache. This is extending skip-wal feature to
> indexes. And makes the old 0002 patch on nbtree useless.

This is a tidier version of the patch.

- Passes regression tests including 018_wal_optimize.pl

- Moved the substantial work to table/index AMs.

  Each AM can decide whether to support WAL skipping.
  Currently heap and nbtree support it.

- The timing of the sync is moved from AtEOXact to PreCommit, because
  heap_sync() requires the transaction state to be INPROGRESS.  (A
  condensed sketch of the resulting commit-time flow follows this list.)

- matview and cluster are still broken, since swapping to the new
  relfilenode doesn't change rd_newRelfilenodeSubid. I'll address that.
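
To make the intended commit-time flow easier to follow, here is a
condensed sketch of what PreCommit_RelationSync() boils down to for a
single relation.  It is not part of the attached patches, and the wrapper
name sync_wal_skipped_rel() is only illustrative, but the macro and the
AM entry points are the ones added by the second patch:

    #include "postgres.h"

    #include "access/genam.h"       /* index_at_commit_sync() */
    #include "access/tableam.h"     /* table_at_commit_sync() */
    #include "utils/rel.h"          /* RelationNeedsAtCommitSync() */

    /*
     * Illustrative only: PreCommit_RelationSync() walks the relcache and
     * effectively does this for every relation found there, while the
     * transaction is still INPROGRESS.
     */
    static void
    sync_wal_skipped_rel(Relation rel)
    {
        /* Only relations whose WAL-logging was skipped need a sync. */
        if (!RelationNeedsAtCommitSync(rel))
            return;

        if (rel->rd_tableam != NULL)
            table_at_commit_sync(rel);  /* e.g. heapam_at_commit_sync() */
        else
            index_at_commit_sync(rel);  /* e.g. btatcommitsync() */
    }

CommitTransaction() and PrepareTransaction() call PreCommit_RelationSync()
before the transaction leaves the INPROGRESS state, which is why this step
cannot simply live in AtEOXact_RelationCache().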

regards.

-- 
Kyotaro Horiguchi
NTT Open Source Software Center
From 680462288cb82da23c19a02239787fc1ea08cdde Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <horiguchi.kyotaro@lab.ntt.co.jp>
Date: Thu, 11 Oct 2018 10:03:21 +0900
Subject: [PATCH 1/2] TAP test for copy-truncation optimization.

---
 src/test/recovery/t/018_wal_optimize.pl | 291 ++++++++++++++++++++++++++++++++
 1 file changed, 291 insertions(+)
 create mode 100644 src/test/recovery/t/018_wal_optimize.pl

diff --git a/src/test/recovery/t/018_wal_optimize.pl b/src/test/recovery/t/018_wal_optimize.pl
new file mode 100644
index 0000000000..4fa8be728e
--- /dev/null
+++ b/src/test/recovery/t/018_wal_optimize.pl
@@ -0,0 +1,291 @@
+# Test WAL replay for optimized TRUNCATE and COPY records
+#
+# WAL generation is optimized in some cases for TRUNCATE and COPY, and
+# those optimizations can interact with each other depending on the
+# wal_level setting, particularly "minimal" and "replica".  Whether an
+# optimization applies depends on the scenario exercised here, but in no
+# case should it result in any kind of failure or data loss after crash
+# recovery.
+use strict;
+use warnings;
+
+use PostgresNode;
+use TestLib;
+use Test::More tests => 24;
+
+sub check_orphan_relfilenodes
+{
+    my($node, $test_name) = @_;
+
+    my $db_oid = $node->safe_psql('postgres',
+       "SELECT oid FROM pg_database WHERE datname = 'postgres'");
+    my $prefix = "base/$db_oid/";
+    my $filepaths_referenced = $node->safe_psql('postgres', "
+       SELECT pg_relation_filepath(oid) FROM pg_class
+       WHERE reltablespace = 0 and relpersistence <> 't' and
+       pg_relation_filepath(oid) IS NOT NULL;");
+    is_deeply([sort(map { "$prefix$_" }
+                    grep(/^[0-9]+$/,
+                         slurp_dir($node->data_dir . "/$prefix")))],
+              [sort split /\n/, $filepaths_referenced],
+              $test_name);
+    return;
+}
+
+# Wrapper routine tunable for wal_level.
+sub run_wal_optimize
+{
+    my $wal_level = shift;
+
+    # The primary runs with the wal_level given by the caller.
+    my $node = get_new_node("node_$wal_level");
+    $node->init;
+    $node->append_conf('postgresql.conf', qq(
+wal_level = $wal_level
+));
+    $node->start;
+
+    # Setup
+    my $tablespace_dir = $node->basedir . '/tablespace_other';
+    mkdir ($tablespace_dir);
+    $tablespace_dir = TestLib::real_dir($tablespace_dir);
+    $node->safe_psql('postgres',
+       "CREATE TABLESPACE other LOCATION '$tablespace_dir';");
+
+    # Test direct truncation optimization.  No tuples
+    $node->safe_psql('postgres', "
+        BEGIN;
+        CREATE TABLE test1 (id serial PRIMARY KEY);
+        TRUNCATE test1;
+        COMMIT;");
+
+    $node->stop('immediate');
+    $node->start;
+
+    my $result = $node->safe_psql('postgres', "SELECT count(*) FROM test1;");
+    is($result, qq(0),
+       "wal_level = $wal_level, optimized truncation with empty table");
+
+    # Test truncation with inserted tuples within the same transaction.
+    # Tuples inserted after the truncation should be seen.
+    $node->safe_psql('postgres', "
+        BEGIN;
+        CREATE TABLE test2 (id serial PRIMARY KEY);
+        INSERT INTO test2 VALUES (DEFAULT);
+        TRUNCATE test2;
+        INSERT INTO test2 VALUES (DEFAULT);
+        COMMIT;");
+
+    $node->stop('immediate');
+    $node->start;
+
+    $result = $node->safe_psql('postgres', "SELECT count(*) FROM test2;");
+    is($result, qq(1),
+       "wal_level = $wal_level, optimized truncation with inserted table");
+
+    # Data file for COPY query in follow-up tests.
+    my $basedir = $node->basedir;
+    my $copy_file = "$basedir/copy_data.txt";
+    TestLib::append_to_file($copy_file, qq(20000,30000
+20001,30001
+20002,30002));
+
+    # Test truncation with inserted tuples using COPY.  Tuples copied after the
+    # truncation should be seen.
+    $node->safe_psql('postgres', "
+        BEGIN;
+        CREATE TABLE test3 (id serial PRIMARY KEY, id2 int);
+        INSERT INTO test3 (id, id2) VALUES (DEFAULT, generate_series(1,3000));
+        TRUNCATE test3;
+        COPY test3 FROM '$copy_file' DELIMITER ',';
+        COMMIT;");
+    $node->stop('immediate');
+    $node->start;
+    $result = $node->safe_psql('postgres', "SELECT count(*) FROM test3;");
+    is($result, qq(3),
+       "wal_level = $wal_level, optimized truncation with copied table");
+
+    # Like previous test, but rollback SET TABLESPACE in a subtransaction.
+    $node->safe_psql('postgres', "
+        BEGIN;
+        CREATE TABLE test3a (id serial PRIMARY KEY, id2 int);
+        INSERT INTO test3a (id, id2) VALUES (DEFAULT, generate_series(1,3000));
+        TRUNCATE test3a;
+        SAVEPOINT s; ALTER TABLE test3a SET TABLESPACE other; ROLLBACK TO s;
+        COPY test3a FROM '$copy_file' DELIMITER ',';
+        COMMIT;");
+    $node->stop('immediate');
+    $node->start;
+    $result = $node->safe_psql('postgres', "SELECT count(*) FROM test3a;");
+    is($result, qq(3),
+       "wal_level = $wal_level, SET TABLESPACE in subtransaction");
+
+    # Same test, but with a RELEASEd (instead of rolled back) subtransaction.
+    $node->safe_psql('postgres', "
+        BEGIN;
+        CREATE TABLE test3a2 (id serial PRIMARY KEY, id2 int);
+        INSERT INTO test3a2 (id, id2) VALUES (DEFAULT, generate_series(1,3000));
+        TRUNCATE test3a2;
+        SAVEPOINT s; ALTER TABLE test3a2 SET TABLESPACE other; RELEASE s;
+        COPY test3a2 FROM '$copy_file' DELIMITER ',';
+        COMMIT;");
+    $node->stop('immediate');
+    $node->start;
+    $result = $node->safe_psql('postgres', "SELECT count(*) FROM test3a2;");
+    is($result, qq(3),
+       "wal_level = $wal_level, SET TABLESPACE in released subtransaction");
+
+    $node->safe_psql('postgres', "
+        BEGIN;
+        CREATE TABLE test3a3 (id serial PRIMARY KEY, id2 int);
+        INSERT INTO test3a3 (id, id2) VALUES (DEFAULT, generate_series(1,3000));
+        TRUNCATE test3a3;
+        SAVEPOINT s;
+            ALTER TABLE test3a3 SET TABLESPACE other;
+            SAVEPOINT s2;
+                ALTER TABLE test3a3 SET TABLESPACE pg_default;
+            ROLLBACK TO s2;
+            SAVEPOINT s2;
+                ALTER TABLE test3a3 SET TABLESPACE pg_default;
+            RELEASE s2;
+        ROLLBACK TO s;
+        COPY test3a3 FROM '$copy_file' DELIMITER ',';
+        COMMIT;");
+    $node->stop('immediate');
+    $node->start;
+    $result = $node->safe_psql('postgres', "SELECT count(*) FROM test3a3;");
+    is($result, qq(3),
+       "wal_level = $wal_level, SET TABLESPACE in nested subtransactions");
+
+    # UPDATE touches two buffers; one needs WAL-logging, the other does not.
+    $node->safe_psql('postgres', "
+        BEGIN;
+        CREATE TABLE test3b (id serial PRIMARY KEY, id2 int);
+        INSERT INTO test3b (id, id2) VALUES (DEFAULT, generate_series(1,10000));
+        COPY test3b FROM '$copy_file' DELIMITER ',';  -- WAL-skipped
+        UPDATE test3b SET id2 = id2 + 1;
+        DELETE FROM test3b;
+        COMMIT;");
+    $node->stop('immediate');
+    $node->start;
+    $result = $node->safe_psql('postgres', "SELECT count(*) FROM test3b;");
+    is($result, qq(0),
+       "wal_level = $wal_level, UPDATE of logged page extends relation");
+
+    # Test truncation with inserted tuples using both INSERT and COPY. Tuples
+    # inserted after the truncation should be seen.
+    $node->safe_psql('postgres', "
+        BEGIN;
+        CREATE TABLE test4 (id serial PRIMARY KEY, id2 int);
+        INSERT INTO test4 (id, id2) VALUES (DEFAULT, generate_series(1,10000));
+        TRUNCATE test4;
+        INSERT INTO test4 (id, id2) VALUES (DEFAULT, 10000);
+        COPY test4 FROM '$copy_file' DELIMITER ',';
+        INSERT INTO test4 (id, id2) VALUES (DEFAULT, 10000);
+        COMMIT;");
+
+    $node->stop('immediate');
+    $node->start;
+    $result = $node->safe_psql('postgres', "SELECT count(*) FROM test4;");
+    is($result, qq(5),
+       "wal_level = $wal_level, optimized truncation with inserted/copied table");
+
+    # Test consistency of COPY with INSERT for table created in the same
+    # transaction.
+    $node->safe_psql('postgres', "
+        BEGIN;
+        CREATE TABLE test5 (id serial PRIMARY KEY, id2 int);
+        INSERT INTO test5 VALUES (DEFAULT, 1);
+        COPY test5 FROM '$copy_file' DELIMITER ',';
+        COMMIT;");
+    $node->stop('immediate');
+    $node->start;
+    $result = $node->safe_psql('postgres', "SELECT count(*) FROM test5;");
+    is($result, qq(4),
+       "wal_level = $wal_level, replay of optimized copy with inserted table");
+
+    # Test consistency of COPY that inserts more rows into the same table via
+    # triggers.  If the INSERTs from the trigger go to the same block the data
+    # is copied to, and the INSERTs are WAL-logged while the COPY is not, WAL
+    # replay will fail when it applies the WAL record, because the "before"
+    # image of the block does not match: not all changes were WAL-logged.
+    $node->safe_psql('postgres', "
+        BEGIN;
+        CREATE TABLE test6 (id serial PRIMARY KEY, id2 text);
+        CREATE FUNCTION test6_before_row_trig() RETURNS trigger
+          LANGUAGE plpgsql as \$\$
+          BEGIN
+            IF new.id2 NOT LIKE 'triggered%' THEN
+              INSERT INTO test6 VALUES (DEFAULT, 'triggered row before' || NEW.id2);
+            END IF;
+            RETURN NEW;
+          END; \$\$;
+        CREATE FUNCTION test6_after_row_trig() RETURNS trigger
+          LANGUAGE plpgsql as \$\$
+          BEGIN
+            IF new.id2 NOT LIKE 'triggered%' THEN
+              INSERT INTO test6 VALUES (DEFAULT, 'triggered row after' || OLD.id2);
+            END IF;
+            RETURN NEW;
+          END; \$\$;
+        CREATE TRIGGER test6_before_row_insert
+          BEFORE INSERT ON test6
+          FOR EACH ROW EXECUTE PROCEDURE test6_before_row_trig();
+        CREATE TRIGGER test6_after_row_insert
+          AFTER INSERT ON test6
+          FOR EACH ROW EXECUTE PROCEDURE test6_after_row_trig();
+        COPY test6 FROM '$copy_file' DELIMITER ',';
+        COMMIT;");
+    $node->stop('immediate');
+    $node->start;
+    $result = $node->safe_psql('postgres', "SELECT count(*) FROM test6;");
+    is($result, qq(9),
+       "wal_level = $wal_level, replay of optimized copy with before trigger");
+
+    # Test consistency of INSERT, COPY and TRUNCATE in same transaction block
+    # with TRUNCATE triggers.
+    $node->safe_psql('postgres', "
+        BEGIN;
+        CREATE TABLE test7 (id serial PRIMARY KEY, id2 text);
+        CREATE FUNCTION test7_before_stat_trig() RETURNS trigger
+          LANGUAGE plpgsql as \$\$
+          BEGIN
+            INSERT INTO test7 VALUES (DEFAULT, 'triggered stat before');
+            RETURN NULL;
+          END; \$\$;
+        CREATE FUNCTION test7_after_stat_trig() RETURNS trigger
+          LANGUAGE plpgsql as \$\$
+          BEGIN
+            INSERT INTO test7 VALUES (DEFAULT, 'triggered stat after');
+            RETURN NULL;
+          END; \$\$;
+        CREATE TRIGGER test7_before_stat_truncate
+          BEFORE TRUNCATE ON test7
+          FOR EACH STATEMENT EXECUTE PROCEDURE test7_before_stat_trig();
+        CREATE TRIGGER test7_after_stat_truncate
+          AFTER TRUNCATE ON test7
+          FOR EACH STATEMENT EXECUTE PROCEDURE test7_after_stat_trig();
+        INSERT INTO test7 VALUES (DEFAULT, 1);
+        TRUNCATE test7;
+        COPY test7 FROM '$copy_file' DELIMITER ',';
+        COMMIT;");
+    $node->stop('immediate');
+    $node->start;
+    $result = $node->safe_psql('postgres', "SELECT count(*) FROM test7;");
+    is($result, qq(4),
+       "wal_level = $wal_level, replay of optimized copy with before trigger");
+
+    # Test redo of temp table creation.
+    $node->safe_psql('postgres', "
+        CREATE TEMP TABLE test8 (id serial PRIMARY KEY, id2 text);");
+    $node->stop('immediate');
+    $node->start;
+
+    check_orphan_relfilenodes($node, "wal_level = $wal_level, no orphan relfilenode remains");
+
+    return;
+}
+
+# Run same test suite for multiple wal_level values.
+run_wal_optimize("minimal");
+run_wal_optimize("replica");
-- 
2.16.3

From 75b90a8020275af6ee5e6ee5a4433c5582bd9148 Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <horiguchi.kyotaro@lab.ntt.co.jp>
Date: Mon, 20 May 2019 15:38:59 +0900
Subject: [PATCH 2/2] Fix WAL skipping feature

WAL-skipping operations mixed with WAL-logged operations can lead to
database corruption after a crash.  This patch changes the WAL-skipping
feature so that no data modification on such relations is WAL-logged at
all; instead, the relations are synced to disk at commit.
---
 src/backend/access/brin/brin.c           |  2 +
 src/backend/access/gin/ginutil.c         |  2 +
 src/backend/access/gist/gist.c           |  2 +
 src/backend/access/hash/hash.c           |  2 +
 src/backend/access/heap/heapam.c         |  8 +--
 src/backend/access/heap/heapam_handler.c | 15 +++---
 src/backend/access/heap/rewriteheap.c    |  3 --
 src/backend/access/index/indexam.c       | 16 ++++++
 src/backend/access/nbtree/nbtree.c       | 13 +++++
 src/backend/access/transam/xact.c        |  6 +++
 src/backend/commands/copy.c              |  6 ---
 src/backend/commands/createas.c          |  5 +-
 src/backend/commands/matview.c           |  4 --
 src/backend/commands/tablecmds.c         |  4 --
 src/backend/utils/cache/relcache.c       | 87 ++++++++++++++++++++++++++++++++
 src/include/access/amapi.h               |  8 +++
 src/include/access/genam.h               |  1 +
 src/include/access/heapam.h              |  1 -
 src/include/access/nbtree.h              |  1 +
 src/include/access/tableam.h             | 36 +++++++------
 src/include/utils/rel.h                  | 21 +++++++-
 src/include/utils/relcache.h             |  1 +
 22 files changed, 188 insertions(+), 56 deletions(-)

diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c
index aba234c0af..681520852f 100644
--- a/src/backend/access/brin/brin.c
+++ b/src/backend/access/brin/brin.c
@@ -125,6 +125,8 @@ brinhandler(PG_FUNCTION_ARGS)
     amroutine->aminitparallelscan = NULL;
     amroutine->amparallelrescan = NULL;
 
+    amroutine->amatcommitsync = NULL;
+
     PG_RETURN_POINTER(amroutine);
 }
 
diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c
index cf9699ad18..f4f0eebec5 100644
--- a/src/backend/access/gin/ginutil.c
+++ b/src/backend/access/gin/ginutil.c
@@ -77,6 +77,8 @@ ginhandler(PG_FUNCTION_ARGS)
     amroutine->aminitparallelscan = NULL;
     amroutine->amparallelrescan = NULL;
 
+    amroutine->amatcommitsync = NULL;
+
     PG_RETURN_POINTER(amroutine);
 }
 
diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index d70a138f54..3a23e7c4b2 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -99,6 +99,8 @@ gisthandler(PG_FUNCTION_ARGS)
     amroutine->aminitparallelscan = NULL;
     amroutine->amparallelrescan = NULL;
 
+    amroutine->amatcommitsync = NULL;
+
     PG_RETURN_POINTER(amroutine);
 }
 
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index 048e40e46f..3fa8262319 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -98,6 +98,8 @@ hashhandler(PG_FUNCTION_ARGS)
     amroutine->aminitparallelscan = NULL;
     amroutine->amparallelrescan = NULL;
 
+    amroutine->amatcommitsync = NULL;
+
     PG_RETURN_POINTER(amroutine);
 }
 
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 19d2c529d8..7f78122b81 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -1950,7 +1950,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid,
     MarkBufferDirty(buffer);
 
     /* XLOG stuff */
-    if (!(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation))
+    if (RelationNeedsWAL(relation))
     {
         xl_heap_insert xlrec;
         xl_heap_header xlhdr;
@@ -2133,7 +2133,7 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples,
     /* currently not needed (thus unsupported) for heap_multi_insert() */
     AssertArg(!(options & HEAP_INSERT_NO_LOGICAL));
 
-    needwal = !(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation);
+    needwal = RelationNeedsWAL(relation);
     saveFreeSpace = RelationGetTargetPageFreeSpace(relation,
                                                    HEAP_DEFAULT_FILLFACTOR);
 
@@ -8906,10 +8906,6 @@ heap2_redo(XLogReaderState *record)
 void
 heap_sync(Relation rel)
 {
-    /* non-WAL-logged tables never need fsync */
-    if (!RelationNeedsWAL(rel))
-        return;
-
     /* main heap */
     FlushRelationBuffers(rel);
     /* FlushRelationBuffers will have opened rd_smgr */
diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index 8d8161fd97..a2e1464845 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -557,15 +557,14 @@ tuple_lock_retry:
     return result;
 }
 
+/* ------------------------------------------------------------------------
+ * WAL-skipping related routine
+ * ------------------------------------------------------------------------
+ */
 static void
-heapam_finish_bulk_insert(Relation relation, int options)
+heapam_at_commit_sync(Relation relation)
 {
-    /*
-     * If we skipped writing WAL, then we need to sync the heap (but not
-     * indexes since those use WAL anyway / don't go through tableam)
-     */
-    if (options & HEAP_INSERT_SKIP_WAL)
-        heap_sync(relation);
+    heap_sync(relation);
 }
 
 
@@ -2573,7 +2572,7 @@ static const TableAmRoutine heapam_methods = {
     .tuple_delete = heapam_tuple_delete,
     .tuple_update = heapam_tuple_update,
     .tuple_lock = heapam_tuple_lock,
-    .finish_bulk_insert = heapam_finish_bulk_insert,
+    .at_commit_sync = heapam_at_commit_sync,
 
     .tuple_fetch_row_version = heapam_fetch_row_version,
     .tuple_get_latest_tid = heap_get_latest_tid,
diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c
index bce4274362..1ac77f7c14 100644
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -654,9 +654,6 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
     {
         int options = HEAP_INSERT_SKIP_FSM;
 
-        if (!state->rs_use_wal)
-            options |= HEAP_INSERT_SKIP_WAL;
-
         /*
          * While rewriting the heap for VACUUM FULL / CLUSTER, make sure data
          * for the TOAST table are not logically decoded.  The main heap is
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index 0fc9139bad..1d089603b7 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -33,6 +33,7 @@
  *        index_can_return    - does index support index-only scans?
  *        index_getprocid - get a support procedure OID
  *        index_getprocinfo - get a support procedure's lookup info
+ *        index_at_commit_sync - perform at_commit_sync
  *
  * NOTES
  *        This file contains the index_ routines which used
@@ -837,6 +838,21 @@ index_getprocinfo(Relation irel,
     return locinfo;
 }
 
+/* ----------------
+ *        index_at_commit_sync
+ *
+ *        This routine performs an at-commit sync of index storage.  It is called
+ *        when a permanent index created in the current transaction is committed.
+ * ----------------
+ */
+void
+index_at_commit_sync(Relation irel)
+{
+    Assert(irel->rd_indam != NULL && irel->rd_indam->amatcommitsync != NULL);
+
+    irel->rd_indam->amatcommitsync(irel);
+}
+
 /* ----------------
  *        index_store_float8_orderby_distances
  *
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 02fb352b94..39377f35eb 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -147,6 +147,8 @@ bthandler(PG_FUNCTION_ARGS)
     amroutine->aminitparallelscan = btinitparallelscan;
     amroutine->amparallelrescan = btparallelrescan;
 
+    amroutine->amatcommitsync = btatcommitsync;
+
     PG_RETURN_POINTER(amroutine);
 }
 
@@ -1385,3 +1387,14 @@ btcanreturn(Relation index, int attno)
 {
     return true;
 }
+
+/*
+ *    btatcommitsync() -- Perform at-commit sync of WAL-skipped index
+ */
+void
+btatcommitsync(Relation index)
+{
+    FlushRelationBuffers(index);
+    smgrimmedsync(index->rd_smgr, MAIN_FORKNUM);
+}
+
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index 20feeec327..bc38a53195 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -2120,6 +2120,9 @@ CommitTransaction(void)
     if (!is_parallel_worker)
         PreCommit_CheckForSerializationFailure();
 
+    /* Sync WAL-skipped relations */
+    PreCommit_RelationSync();
+
     /*
      * Insert notifications sent by NOTIFY commands into the queue.  This
      * should be late in the pre-commit sequence to minimize time spent
@@ -2395,6 +2398,9 @@ PrepareTransaction(void)
                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                  errmsg("cannot PREPARE a transaction that has manipulated logical replication workers")));
 
+    /* Sync WAL-skipped relations */
+    PreCommit_RelationSync();
+
     /* Prevent cancel/die interrupt while cleaning up */
     HOLD_INTERRUPTS();
 
diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c
index 5f81aa57d4..a25c82438e 100644
--- a/src/backend/commands/copy.c
+++ b/src/backend/commands/copy.c
@@ -2761,11 +2761,7 @@ CopyFrom(CopyState cstate)
     if (RELKIND_HAS_STORAGE(cstate->rel->rd_rel->relkind) &&
         (cstate->rel->rd_createSubid != InvalidSubTransactionId ||
          cstate->rel->rd_newRelfilenodeSubid != InvalidSubTransactionId))
-    {
         ti_options |= TABLE_INSERT_SKIP_FSM;
-        if (!XLogIsNeeded())
-            ti_options |= TABLE_INSERT_SKIP_WAL;
-    }
 
     /*
      * Optimize if new relfilenode was created in this subxact or one of its
@@ -3364,8 +3360,6 @@ CopyFrom(CopyState cstate)
 
     FreeExecutorState(estate);
 
-    table_finish_bulk_insert(cstate->rel, ti_options);
-
     return processed;
 }
 
diff --git a/src/backend/commands/createas.c b/src/backend/commands/createas.c
index 43c2fa9124..859b869b0d 100644
--- a/src/backend/commands/createas.c
+++ b/src/backend/commands/createas.c
@@ -558,8 +558,7 @@ intorel_startup(DestReceiver *self, int operation, TupleDesc typeinfo)
      * We can skip WAL-logging the insertions, unless PITR or streaming
      * replication is in use. We can skip the FSM in any case.
      */
-    myState->ti_options = TABLE_INSERT_SKIP_FSM |
-        (XLogIsNeeded() ? 0 : TABLE_INSERT_SKIP_WAL);
+    myState->ti_options = TABLE_INSERT_SKIP_FSM;
     myState->bistate = GetBulkInsertState();
 
     /* Not using WAL requires smgr_targblock be initially invalid */
@@ -604,8 +603,6 @@ intorel_shutdown(DestReceiver *self)
 
     FreeBulkInsertState(myState->bistate);
 
-    table_finish_bulk_insert(myState->rel, myState->ti_options);
-
     /* close rel, but keep lock until commit */
     table_close(myState->rel, NoLock);
     myState->rel = NULL;
diff --git a/src/backend/commands/matview.c b/src/backend/commands/matview.c
index 99bf3c29f2..c84edd0db0 100644
--- a/src/backend/commands/matview.c
+++ b/src/backend/commands/matview.c
@@ -463,8 +463,6 @@ transientrel_startup(DestReceiver *self, int operation, TupleDesc typeinfo)
      * replication is in use. We can skip the FSM in any case.
      */
     myState->ti_options = TABLE_INSERT_SKIP_FSM | TABLE_INSERT_FROZEN;
-    if (!XLogIsNeeded())
-        myState->ti_options |= TABLE_INSERT_SKIP_WAL;
     myState->bistate = GetBulkInsertState();
 
     /* Not using WAL requires smgr_targblock be initially invalid */
@@ -509,8 +507,6 @@ transientrel_shutdown(DestReceiver *self)
 
     FreeBulkInsertState(myState->bistate);
 
-    table_finish_bulk_insert(myState->transientrel, myState->ti_options);
-
     /* close transientrel, but keep lock until commit */
     table_close(myState->transientrel, NoLock);
     myState->transientrel = NULL;
diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c
index bfcf9472d7..75f11a327d 100644
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -4741,8 +4741,6 @@ ATRewriteTable(AlteredTableInfo *tab, Oid OIDNewHeap, LOCKMODE lockmode)
         bistate = GetBulkInsertState();
 
         ti_options = TABLE_INSERT_SKIP_FSM;
-        if (!XLogIsNeeded())
-            ti_options |= TABLE_INSERT_SKIP_WAL;
     }
     else
     {
@@ -5026,8 +5024,6 @@ ATRewriteTable(AlteredTableInfo *tab, Oid OIDNewHeap, LOCKMODE lockmode)
     {
         FreeBulkInsertState(bistate);
 
-        table_finish_bulk_insert(newrel, ti_options);
-
         table_close(newrel, NoLock);
     }
 }
diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c
index d0f6f715e6..4bffbfff5d 100644
--- a/src/backend/utils/cache/relcache.c
+++ b/src/backend/utils/cache/relcache.c
@@ -1512,6 +1512,9 @@ RelationInitIndexAccessInfo(Relation relation)
     relation->rd_exclprocs = NULL;
     relation->rd_exclstrats = NULL;
     relation->rd_amcache = NULL;
+
+    if (relation->rd_indam->amatcommitsync != NULL)
+        relation->rd_can_skipwal = true;
 }
 
 /*
@@ -1781,6 +1784,9 @@ RelationInitTableAccessMethod(Relation relation)
      * Now we can fetch the table AM's API struct
      */
     InitTableAmRoutine(relation);
+
+    if (relation->rd_tableam && relation->rd_tableam->at_commit_sync)
+        relation->rd_can_skipwal = true;
 }
 
 /*
@@ -2913,6 +2919,73 @@ RememberToFreeTupleDescAtEOX(TupleDesc td)
     EOXactTupleDescArray[NextEOXactTupleDescNum++] = td;
 }
 
+/*
+ * PreCommit_RelationSync
+ *
+ *    Sync relations that were WAL-skipped in this transaction.
+ *
+ * AMs may have skipped WAL-logging for relations created in the current
+ * transaction.  This routine syncs such relations.  It can only be done
+ * while the transaction status is INPROGRESS, so it is kept separate from
+ * AtEOXact_RelationCache.
+ */
+void
+PreCommit_RelationSync(void)
+{
+    HASH_SEQ_STATUS status;
+    RelIdCacheEnt *idhentry;
+    int            i;
+
+    /* See AtEOXact_RelationCache for details on eoxact_list */
+    if (eoxact_list_overflowed)
+    {
+        hash_seq_init(&status, RelationIdCache);
+        while ((idhentry = (RelIdCacheEnt *) hash_seq_search(&status)) != NULL)
+        {
+            Relation rel = idhentry->reldesc;
+
+            if (!RelationNeedsAtCommitSync(rel))
+                continue;
+
+            if (rel->rd_tableam != NULL)
+                table_at_commit_sync(rel);
+            else
+            {
+                Assert(rel->rd_indam != NULL);
+                index_at_commit_sync(rel);
+            }
+        }
+    }
+    else
+    {
+        for (i = 0; i < eoxact_list_len; i++)
+        {
+            Relation rel;
+
+            idhentry = (RelIdCacheEnt *) hash_search(RelationIdCache,
+                                                     (void *) &eoxact_list[i],
+                                                     HASH_FIND,
+                                                     NULL);
+
+            if (idhentry == NULL)
+                continue;
+
+            rel = idhentry->reldesc;
+
+            if (!RelationNeedsAtCommitSync(rel))
+                continue;
+
+            if (rel->rd_tableam != NULL)
+                table_at_commit_sync(rel);
+            else
+            {
+                Assert(rel->rd_indam != NULL);
+                index_at_commit_sync(rel);
+            }
+        }
+    }
+}
+
 /*
  * AtEOXact_RelationCache
  *
@@ -3032,7 +3105,21 @@ AtEOXact_cleanup(Relation relation, bool isCommit)
     if (relation->rd_createSubid != InvalidSubTransactionId)
     {
         if (isCommit)
+        {
+            /*
+             * With wal_level = minimal, we have skipped WAL-logging for
+             * persistent relations created in this transaction.  Sync those
+             * tables out before they become publicly accessible.
+             */
+            if (!XLogIsNeeded() && relation->rd_smgr &&
+                relation->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT)
+            {
+                FlushRelationBuffers(relation);
+                smgrimmedsync(relation->rd_smgr, MAIN_FORKNUM);
+            }
+
             relation->rd_createSubid = InvalidSubTransactionId;
+        }
         else if (RelationHasReferenceCountZero(relation))
         {
             RelationClearRelation(relation, false);
diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h
index 09a7404267..fc6981d98a 100644
--- a/src/include/access/amapi.h
+++ b/src/include/access/amapi.h
@@ -156,6 +156,11 @@ typedef void (*aminitparallelscan_function) (void *target);
 /* (re)start parallel index scan */
 typedef void (*amparallelrescan_function) (IndexScanDesc scan);
 
+/*
+ * Sync relation at commit.  Interface function to support the WAL-skipping
+ * feature; can be NULL.
+ */
+typedef void (*amatcommitsync_function) (Relation indexRelation);
 /*
  * API struct for an index AM.  Note this must be stored in a single palloc'd
  * chunk of memory.
@@ -230,6 +235,9 @@ typedef struct IndexAmRoutine
     amestimateparallelscan_function amestimateparallelscan; /* can be NULL */
     aminitparallelscan_function aminitparallelscan; /* can be NULL */
     amparallelrescan_function amparallelrescan; /* can be NULL */
+
+    /* interface function to support WAL-skipping feature */
+    amatcommitsync_function amatcommitsync; /* can be NULL */
 } IndexAmRoutine;
 
 
diff --git a/src/include/access/genam.h b/src/include/access/genam.h
index 9717183ef2..b225fd622e 100644
--- a/src/include/access/genam.h
+++ b/src/include/access/genam.h
@@ -177,6 +177,7 @@ extern RegProcedure index_getprocid(Relation irel, AttrNumber attnum,
                 uint16 procnum);
 extern FmgrInfo *index_getprocinfo(Relation irel, AttrNumber attnum,
                   uint16 procnum);
+extern void index_at_commit_sync(Relation irel);
 extern void index_store_float8_orderby_distances(IndexScanDesc scan,
                                      Oid *orderByTypes, double *distances,
                                      bool recheckOrderBy);
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index 62aaa08eff..0fb7d86bf2 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -29,7 +29,6 @@
 
 
 /* "options" flag bits for heap_insert */
-#define HEAP_INSERT_SKIP_WAL    TABLE_INSERT_SKIP_WAL
 #define HEAP_INSERT_SKIP_FSM    TABLE_INSERT_SKIP_FSM
 #define HEAP_INSERT_FROZEN        TABLE_INSERT_FROZEN
 #define HEAP_INSERT_NO_LOGICAL    TABLE_INSERT_NO_LOGICAL
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 6c1acd4855..1d042e89b5 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -717,6 +717,7 @@ extern IndexBulkDeleteResult *btbulkdelete(IndexVacuumInfo *info,
 extern IndexBulkDeleteResult *btvacuumcleanup(IndexVacuumInfo *info,
                 IndexBulkDeleteResult *stats);
 extern bool btcanreturn(Relation index, int attno);
+extern void btatcommitsync(Relation index);
 
 /*
  * prototypes for internal functions in nbtree.c
diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h
index 06eae2337a..90254cb278 100644
--- a/src/include/access/tableam.h
+++ b/src/include/access/tableam.h
@@ -409,19 +409,15 @@ typedef struct TableAmRoutine
                                TM_FailureData *tmfd);
 
     /*
-     * Perform operations necessary to complete insertions made via
-     * tuple_insert and multi_insert with a BulkInsertState specified. This
-     * may for example be used to flush the relation, when the
-     * TABLE_INSERT_SKIP_WAL option was used.
+     * Sync the relation at commit time if needed.
      *
-     * Typically callers of tuple_insert and multi_insert will just pass all
-     * the flags that apply to them, and each AM has to decide which of them
-     * make sense for it, and then only take actions in finish_bulk_insert for
-     * those flags, and ignore others.
+     * A table AM may skip WAL-logging for relations created in the current
+     * transaction.  This routine is called at commit time, and the table AM
+     * must flush buffers and sync the underlying storage.
      *
      * Optional callback.
      */
-    void        (*finish_bulk_insert) (Relation rel, int options);
+    void        (*at_commit_sync) (Relation rel);
 
 
     /* ------------------------------------------------------------------------
@@ -1104,8 +1100,7 @@ table_compute_xid_horizon_for_tuples(Relation rel,
  *
  *
  * The BulkInsertState object (if any; bistate can be NULL for default
- * behavior) is also just passed through to RelationGetBufferForTuple. If
- * `bistate` is provided, table_finish_bulk_insert() needs to be called.
+ * behavior) is also just passed through to RelationGetBufferForTuple.
  *
  * On return the slot's tts_tid and tts_tableOid are updated to reflect the
  * insertion. But note that any toasting of fields within the slot is NOT
@@ -1300,20 +1295,23 @@ table_lock_tuple(Relation rel, ItemPointer tid, Snapshot snapshot,
 }
 
 /*
- * Perform operations necessary to complete insertions made via
- * tuple_insert and multi_insert with a BulkInsertState specified. This
- * e.g. may e.g. used to flush the relation when inserting with
- * TABLE_INSERT_SKIP_WAL specified.
+ * Sync the relation at commit time if needed.
+ *
+ * A table AM that defines this interface allows relations created in the
+ * current transaction to skip WAL-logging.  This routine is called at
+ * commit time, and the table AM must flush buffers and sync the underlying
+ * storage.
+ *
+ * Optional callback.
  */
 static inline void
-table_finish_bulk_insert(Relation rel, int options)
+table_at_commit_sync(Relation rel)
 {
     /* optional callback */
-    if (rel->rd_tableam && rel->rd_tableam->finish_bulk_insert)
-        rel->rd_tableam->finish_bulk_insert(rel, options);
+    if (rel->rd_tableam && rel->rd_tableam->at_commit_sync)
+        rel->rd_tableam->at_commit_sync(rel);
 }
 
-
 /* ------------------------------------------------------------------------
  * DDL related functionality.
  * ------------------------------------------------------------------------
diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h
index d7f33abce3..c09fd84a1c 100644
--- a/src/include/utils/rel.h
+++ b/src/include/utils/rel.h
@@ -64,6 +64,9 @@ typedef struct RelationData
                                  * rd_replidindex) */
     bool        rd_statvalid;    /* is rd_statlist valid? */
 
+    /* Some relations can omit WAL-logging under certain conditions. */
+    bool        rd_can_skipwal; /* can skip WAL-logging? */
+
     /*
      * rd_createSubid is the ID of the highest subtransaction the rel has
      * survived into; or zero if the rel was not created in the current top
@@ -512,9 +515,25 @@ typedef struct ViewOptions
 /*
  * RelationNeedsWAL
  *        True if relation needs WAL.
+ *
+ * If the underlying AM provides the at_commit_sync interface, this returns false
+ * when wal_level is minimal and the relation was created in the current transaction.
  */
 #define RelationNeedsWAL(relation) \
-    ((relation)->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT)
+    ((relation)->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT && \
+     (!relation->rd_can_skipwal ||                                       \
+      !(RELATION_IS_LOCAL(relation) && !XLogIsNeeded())))
+
+/*
+ * RelationNeedsAtCommitSync
+ *      True if relation needs an at-commit sync
+ */
+#define RelationNeedsAtCommitSync(relation) \
+    ((relation)->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT &&    \
+     relation->rd_can_skipwal &&                                        \
+     (RELATION_IS_LOCAL(relation) ||                                    \
+      relation->rd_newRelfilenodeSubid != InvalidSubTransactionId) &&    \
+     !XLogIsNeeded())
 
 /*
  * RelationUsesLocalBuffers
diff --git a/src/include/utils/relcache.h b/src/include/utils/relcache.h
index 364495a5f0..07c4cfa565 100644
--- a/src/include/utils/relcache.h
+++ b/src/include/utils/relcache.h
@@ -120,6 +120,7 @@ extern void RelationCacheInvalidate(void);
 
 extern void RelationCloseSmgrByOid(Oid relationId);
 
+extern void PreCommit_RelationSync(void);
 extern void AtEOXact_RelationCache(bool isCommit);
 extern void AtEOSubXact_RelationCache(bool isCommit, SubTransactionId mySubid,
                           SubTransactionId parentSubid);
-- 
2.16.3

