Re: [HACKERS] More stats about skipped vacuums

Поиск
Список
Период
Сортировка
От Kyotaro HORIGUCHI
Тема Re: [HACKERS] More stats about skipped vacuums
Дата
Msg-id 20171211.201523.24172046.horiguchi.kyotaro@lab.ntt.co.jp
обсуждение исходный текст
Ответ на Re: [HACKERS] More stats about skipped vacuums  (Robert Haas <robertmhaas@gmail.com>)
Ответы Re: [HACKERS] More stats about skipped vacuums  (Masahiko Sawada <sawada.mshk@gmail.com>)
Список pgsql-hackers
At Mon, 27 Nov 2017 13:51:22 -0500, Robert Haas <robertmhaas@gmail.com> wrote in
<CA+Tgmob2tuqvEZfHV2kLC-xobsZxDWGdc1WmjLg5+iOPLa0NHg@mail.gmail.com>
> On Mon, Nov 27, 2017 at 1:49 AM, Kyotaro HORIGUCHI
> <horiguchi.kyotaro@lab.ntt.co.jp> wrote:
> > Hmmm. Okay, we must make stats collector more effeicient if we
> > want to have additional counters with smaller significance in the
> > table stats. Currently sizeof(PgStat_StatTabEntry) is 168
> > bytes. The whole of the patchset increases it to 232 bytes. Thus
> > the size of a stat file for a database with 10000 tables
> > increases from about 1.7MB to 2.4MB.  DSM and shared dynahash is
> > not dynamically expandable so placing stats on shared hash
> > doesn't seem effective. Stats as a regular table could work but
> > it seems too-much.
> 
> dshash, which is already committed, is both DSM-based and dynamically
> expandable.

Yes, I forgot about that. We can just copy memory blocks to take
a snapshot of stats.

> > Is it acceptable that adding a new section containing this new
> > counters, which is just loaded as a byte sequence and parsing
> > (and filling the corresponding hash) is postponed until a counter
> > in the section is really requested?  The new counters need to be
> > shown in a separate stats view (maybe named pg_stat_vacuum).
> 
> Still makes the stats file bigger.

I considered dshash for pgstat.c and the attached is a *PoC*
patch, which is not full-fledged and just working under a not so
concurrent situation.

- Made stats collector an auxiliary process. A crash of stats
  collector leads to a whole-server restarting.

- dshash lacks the capability of sequential scan, so I added it.

- Also added a snapshot function to dshash. It just copies the
  underlying DSA segments into local memory, but currently it
  doesn't acquire dshash-level locks at all. I tried the same
  thing with resize but it leads to very quick exhaustion of
  LWLocks. An LWLock for the whole dshash would be required (and
  it would also be useful for resize() and sequential scan).

- The current dshash doesn't shrink at all. Such a feature will
  also be required. (A server restart causes a shrink of hashes
  in the same way as before, but a bloated dshash requires copying
  more memory than necessary when taking a snapshot.)

The size of a DSA is about 1MB at minimum. Copying entry-by-entry
into a (non-ds) hash might be better than copying the underlying DSA as
a whole, and the DSA/DSHASH snapshot approach feels somewhat dirty.


Could anyone give me opinions or suggestions?

regards,

-- 
Kyotaro Horiguchi
NTT Open Source Software Center
From f5c9c45384ec43734c0890dd875101defe6590bc Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <horiguchi.kyotaro@lab.ntt.co.jp>
Date: Fri, 1 Dec 2017 14:34:47 +0900
Subject: [PATCH 1/4] Simple implementation of local snapshot of dshash.

Add a snapshot feature to DSHASH. This makes a palloc'ed copy of the
underlying DSA and returns an unmodifiable DSHASH using the copied DSA.
---
 src/backend/lib/dshash.c     | 74 +++++++++++++++++++++++++++++++++++++++-----
 src/backend/utils/mmgr/dsa.c | 57 +++++++++++++++++++++++++++++++++-
 src/include/lib/dshash.h     |  1 +
 src/include/utils/dsa.h      |  1 +
 4 files changed, 124 insertions(+), 9 deletions(-)

diff --git a/src/backend/lib/dshash.c b/src/backend/lib/dshash.c
index dd87573..973a826 100644
--- a/src/backend/lib/dshash.c
+++ b/src/backend/lib/dshash.c
@@ -112,6 +112,7 @@ struct dshash_table
     size_t        size_log2;        /* log2(number of buckets) */
     bool        find_locked;    /* Is any partition lock held by 'find'? */
     bool        find_exclusively_locked;    /* ... exclusively? */
+    bool        is_snapshot;    /* Is this hash a local snapshot? */
 };
 
 /* Given a pointer to an item, find the entry (user data) it holds. */
@@ -228,6 +229,7 @@ dshash_create(dsa_area *area, const dshash_parameters *params, void *arg)
 
     hash_table->find_locked = false;
     hash_table->find_exclusively_locked = false;
+    hash_table->is_snapshot = false;
 
     /*
      * Set up the initial array of buckets.  Our initial size is the same as
@@ -279,6 +281,7 @@ dshash_attach(dsa_area *area, const dshash_parameters *params,
     hash_table->control = dsa_get_address(area, control);
     hash_table->find_locked = false;
     hash_table->find_exclusively_locked = false;
+    hash_table->is_snapshot = false;
     Assert(hash_table->control->magic == DSHASH_MAGIC);
 
     /*
@@ -321,6 +324,15 @@ dshash_destroy(dshash_table *hash_table)
     size_t        i;
 
     Assert(hash_table->control->magic == DSHASH_MAGIC);
+
+    /* this is a local copy */
+    if (hash_table->is_snapshot)
+    {
+        pfree(hash_table->area);
+        pfree(hash_table);
+        return;
+    }
+
     ensure_valid_bucket_pointers(hash_table);
 
     /* Free all the entries. */
@@ -355,6 +367,29 @@ dshash_destroy(dshash_table *hash_table)
 }
 
 /*
+ * take a local snapshot of a dshash table
+ */
+dshash_table *
+dshash_take_snapshot(dshash_table *org_table, dsa_area *new_area)
+{
+    dshash_table *new_table;
+
+    if (org_table->is_snapshot)
+        elog(ERROR, "cannot make local copy of local dshash");
+
+    new_table = palloc(sizeof(dshash_table));
+
+    new_table->area = new_area;
+    new_table->params = org_table->params;
+    new_table->control = dsa_get_address(new_table->area,
+                                         org_table->control->handle);
+    /* mark this dshash as a local copy */
+    new_table->is_snapshot = true;
+
+    return new_table;
+}
+
+/*
  * Get a handle that can be used by other processes to attach to this hash
  * table.
  */
@@ -392,15 +427,22 @@ dshash_find(dshash_table *hash_table, const void *key, bool exclusive)
     partition = PARTITION_FOR_HASH(hash);
 
     Assert(hash_table->control->magic == DSHASH_MAGIC);
-    Assert(!hash_table->find_locked);
 
-    LWLockAcquire(PARTITION_LOCK(hash_table, partition),
-                  exclusive ? LW_EXCLUSIVE : LW_SHARED);
+    if (!hash_table->is_snapshot)
+    {
+        Assert(!hash_table->find_locked);
+        LWLockAcquire(PARTITION_LOCK(hash_table, partition),
+                      exclusive ? LW_EXCLUSIVE : LW_SHARED);
+    }
     ensure_valid_bucket_pointers(hash_table);
 
     /* Search the active bucket. */
     item = find_in_bucket(hash_table, key, BUCKET_FOR_HASH(hash_table, hash));
 
+    /* don't lock if this is a local copy */
+    if (hash_table->is_snapshot)
+        return item ? ENTRY_FROM_ITEM(item) : NULL;
+
     if (!item)
     {
         /* Not found. */
@@ -436,6 +478,9 @@ dshash_find_or_insert(dshash_table *hash_table,
     dshash_partition *partition;
     dshash_table_item *item;
 
+    if (hash_table->is_snapshot)
+        elog(ERROR, "cannot insert into local dshash");
+
     hash = hash_key(hash_table, key);
     partition_index = PARTITION_FOR_HASH(hash);
     partition = &hash_table->control->partitions[partition_index];
@@ -505,6 +550,9 @@ dshash_delete_key(dshash_table *hash_table, const void *key)
     size_t        partition;
     bool        found;
 
+    if (hash_table->is_snapshot)
+        elog(ERROR, "cannot delete from a snapshot");
+
     Assert(hash_table->control->magic == DSHASH_MAGIC);
     Assert(!hash_table->find_locked);
 
@@ -545,6 +593,7 @@ dshash_delete_entry(dshash_table *hash_table, void *entry)
     Assert(hash_table->control->magic == DSHASH_MAGIC);
     Assert(hash_table->find_locked);
     Assert(hash_table->find_exclusively_locked);
+    Assert(!hash_table->is_snapshot);
     Assert(LWLockHeldByMeInMode(PARTITION_LOCK(hash_table, partition),
                                 LW_EXCLUSIVE));
 
@@ -563,6 +612,9 @@ dshash_release_lock(dshash_table *hash_table, void *entry)
     dshash_table_item *item = ITEM_FROM_ENTRY(entry);
     size_t        partition_index = PARTITION_FOR_HASH(item->hash);
 
+    if (hash_table->is_snapshot)
+        return;
+
     Assert(hash_table->control->magic == DSHASH_MAGIC);
     Assert(hash_table->find_locked);
     Assert(LWLockHeldByMeInMode(PARTITION_LOCK(hash_table, partition_index),
@@ -605,10 +657,13 @@ dshash_dump(dshash_table *hash_table)
     Assert(hash_table->control->magic == DSHASH_MAGIC);
     Assert(!hash_table->find_locked);
 
-    for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
+    if (!hash_table->is_snapshot)
     {
-        Assert(!LWLockHeldByMe(PARTITION_LOCK(hash_table, i)));
-        LWLockAcquire(PARTITION_LOCK(hash_table, i), LW_SHARED);
+        for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
+        {
+            Assert(!LWLockHeldByMe(PARTITION_LOCK(hash_table, i)));
+            LWLockAcquire(PARTITION_LOCK(hash_table, i), LW_SHARED);
+        }
     }
 
     ensure_valid_bucket_pointers(hash_table);
@@ -643,8 +698,11 @@ dshash_dump(dshash_table *hash_table)
         }
     }
 
-    for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
-        LWLockRelease(PARTITION_LOCK(hash_table, i));
+    if (!hash_table->is_snapshot)
+    {
+        for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
+            LWLockRelease(PARTITION_LOCK(hash_table, i));
+    }
 }
 
 /*
diff --git a/src/backend/utils/mmgr/dsa.c b/src/backend/utils/mmgr/dsa.c
index fe62788..dd02147 100644
--- a/src/backend/utils/mmgr/dsa.c
+++ b/src/backend/utils/mmgr/dsa.c
@@ -319,6 +319,7 @@ typedef struct
     bool        pinned;
     /* The number of times that segments have been freed. */
     Size        freed_segment_counter;
+    bool        is_snapshot;
     /* The LWLock tranche ID. */
     int            lwlock_tranche_id;
     /* The general lock (protects everything except object pools). */
@@ -931,7 +932,8 @@ dsa_get_address(dsa_area *area, dsa_pointer dp)
         return NULL;
 
     /* Process any requests to detach from freed segments. */
-    check_for_freed_segments(area);
+    if (!area->control->is_snapshot)
+        check_for_freed_segments(area);
 
     /* Break the dsa_pointer into its components. */
     index = DSA_EXTRACT_SEGMENT_NUMBER(dp);
@@ -1232,6 +1234,7 @@ create_internal(void *place, size_t size,
     control->high_segment_index = 0;
     control->refcnt = 1;
     control->freed_segment_counter = 0;
+    control->is_snapshot = false;
     control->lwlock_tranche_id = tranche_id;
 
     /*
@@ -2239,3 +2242,55 @@ check_for_freed_segments(dsa_area *area)
         area->freed_segment_counter = freed_segment_counter;
     }
 }
+
+/*
+ * Make a static local copy of this dsa area.
+ */
+dsa_area *
+dsa_take_snapshot(dsa_area *source_area)
+{
+    dsa_area   *area;
+    Size        size;
+    int i;
+    char        *mem;
+
+    /* allocate required size of memory */
+    size = sizeof(dsa_area);
+    size += sizeof(dsa_area_control);
+
+    LWLockAcquire(DSA_AREA_LOCK(source_area), LW_SHARED);
+    for (i = 0 ; i <= source_area->high_segment_index ; i++)
+        size += source_area->segment_maps[i].header->size;
+    mem = palloc(size);
+
+    area = (dsa_area *)mem;
+    mem += sizeof(dsa_area);
+    area->control = (dsa_area_control *)mem;
+    mem += sizeof(dsa_area_control);
+    memcpy(area->control, source_area->control, sizeof(dsa_area_control));
+    area->control->is_snapshot = true;
+    area->mapping_pinned = false;
+
+    /* Copy and connect the all segments */
+    for (i = 0 ; i <= source_area->high_segment_index ; i++)
+    {
+        dsa_segment_map *smap = &source_area->segment_maps[i];
+        dsa_segment_map *dmap = &area->segment_maps[i];
+
+        dmap->mapped_address = mem;
+        memcpy(dmap->mapped_address, smap->mapped_address, smap->header->size);
+        mem += smap->header->size;
+        dmap->header = (dsa_segment_header*) dmap->mapped_address;
+        dmap->header->magic = 0;
+        dmap->fpm = NULL;
+        dmap->pagemap = (dsa_pointer *)
+            (dmap->mapped_address + MAXALIGN(sizeof(dsa_area_control)) +
+             MAXALIGN(sizeof(FreePageManager)));
+    }
+
+    area->high_segment_index = source_area->high_segment_index;
+    LWLockRelease(DSA_AREA_LOCK(source_area));
+
+    elog(LOG, "dsa_take_snapshot copied %lu bytes", size);
+    return area;
+}
diff --git a/src/include/lib/dshash.h b/src/include/lib/dshash.h
index 220553c..d8f48ed 100644
--- a/src/include/lib/dshash.h
+++ b/src/include/lib/dshash.h
@@ -70,6 +70,7 @@ extern dshash_table *dshash_attach(dsa_area *area,
 extern void dshash_detach(dshash_table *hash_table);
 extern dshash_table_handle dshash_get_hash_table_handle(dshash_table *hash_table);
 extern void dshash_destroy(dshash_table *hash_table);
+extern dshash_table *dshash_take_snapshot(dshash_table *org_table, dsa_area *new_area);
 
 /* Finding, creating, deleting entries. */
 extern void *dshash_find(dshash_table *hash_table,
diff --git a/src/include/utils/dsa.h b/src/include/utils/dsa.h
index 516ef61..c10fb4a 100644
--- a/src/include/utils/dsa.h
+++ b/src/include/utils/dsa.h
@@ -121,5 +121,6 @@ extern void dsa_free(dsa_area *area, dsa_pointer dp);
 extern void *dsa_get_address(dsa_area *area, dsa_pointer dp);
 extern void dsa_trim(dsa_area *area);
 extern void dsa_dump(dsa_area *area);
+extern dsa_area *dsa_take_snapshot(dsa_area *source_area);
 
 #endif                            /* DSA_H */
-- 
2.9.2

From 89f575876812fb73724a5cc64dd605d4fd52a47b Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <horiguchi.kyotaro@oss.ntt.co.jp>
Date: Fri, 8 Dec 2017 21:53:36 +0900
Subject: [PATCH 2/4] Add seqscan on dshash

A WIP implementation of seqscan support for dshash.  This doesn't take any
lock, so the hash can become corrupted for any reason.
---
 src/backend/lib/dshash.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++
 src/include/lib/dshash.h | 14 +++++++++++
 2 files changed, 77 insertions(+)

diff --git a/src/backend/lib/dshash.c b/src/backend/lib/dshash.c
index 973a826..adc2131 100644
--- a/src/backend/lib/dshash.c
+++ b/src/backend/lib/dshash.c
@@ -644,6 +644,69 @@ dshash_memhash(const void *v, size_t size, void *arg)
     return tag_hash(v, size);
 }
 
+void
+dshash_seq_init(dshash_seq_status *status, dshash_table *hash_table)
+{
+    status->hash_table = hash_table;
+    status->curbucket = 0;
+    status->nbuckets = ((size_t) 1) << hash_table->control->size_log2;
+    status->curitem = NULL;
+
+    ensure_valid_bucket_pointers(hash_table);
+}
+
+void *
+dshash_seq_next(dshash_seq_status *status)
+{
+    dsa_pointer next_item_pointer;
+
+    if (status->curitem == NULL)
+    {
+        Assert (status->curbucket == 0);
+        next_item_pointer = status->hash_table->buckets[status->curbucket];
+    }
+    else
+        next_item_pointer = status->curitem->next;
+
+    while (!DsaPointerIsValid(next_item_pointer))
+    {
+        if (++status->curbucket >= status->nbuckets)
+        {
+            dshash_seq_release(status);
+            return NULL;
+        }
+        next_item_pointer = status->hash_table->buckets[status->curbucket];
+    }
+
+    status->curitem =
+        dsa_get_address(status->hash_table->area, next_item_pointer);
+    return ENTRY_FROM_ITEM(status->curitem);
+}
+
+int
+dshash_get_num_entries(dshash_table *hash_table)
+{
+    /* a shortcut implementation; should be improved */
+    dshash_seq_status s;
+    void *p;
+    int n = 0;
+
+    dshash_seq_init(&s, hash_table);
+    while ((p = dshash_seq_next(&s)) != NULL)
+    {
+        dshash_release_lock(hash_table, p);
+        n++;
+    }
+
+    return n;
+}
+
+void
+dshash_seq_release(dshash_seq_status *status)
+{
+    /* nothing to do so far..*/
+}
+
 /*
  * Print debugging information about the internal state of the hash table to
  * stderr.  The caller must hold no partition locks.
diff --git a/src/include/lib/dshash.h b/src/include/lib/dshash.h
index d8f48ed..460364c 100644
--- a/src/include/lib/dshash.h
+++ b/src/include/lib/dshash.h
@@ -59,6 +59,15 @@ typedef struct dshash_parameters
 struct dshash_table_item;
 typedef struct dshash_table_item dshash_table_item;
 
+struct dshash_seq_status
+{
+    dshash_table       *hash_table;
+    int                    curbucket;
+    int                    nbuckets;
+    dshash_table_item  *curitem;
+};
+typedef struct dshash_seq_status dshash_seq_status;
+
 /* Creating, sharing and destroying from hash tables. */
 extern dshash_table *dshash_create(dsa_area *area,
               const dshash_parameters *params,
@@ -81,6 +90,11 @@ extern bool dshash_delete_key(dshash_table *hash_table, const void *key);
 extern void dshash_delete_entry(dshash_table *hash_table, void *entry);
 extern void dshash_release_lock(dshash_table *hash_table, void *entry);
 
+/* seq scan support */
+extern void dshash_seq_init(dshash_seq_status *status, dshash_table *hash_table);
+extern void *dshash_seq_next(dshash_seq_status *status);
+extern void dshash_seq_release(dshash_seq_status *status);
+extern int dshash_get_num_entries(dshash_table *hash_table);
 /* Convenience hash and compare functions wrapping memcmp and tag_hash. */
 extern int    dshash_memcmp(const void *a, const void *b, size_t size, void *arg);
 extern dshash_hash dshash_memhash(const void *v, size_t size, void *arg);
-- 
2.9.2

From f8daa8726ed4774a58b3c5d775a1ec89182a5325 Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <horiguchi.kyotaro@oss.ntt.co.jp>
Date: Fri, 8 Dec 2017 22:18:29 +0900
Subject: [PATCH 3/4] Change stats collector to an auxiliary process.

Shared memory and LWLocks are required to let stats collector use
dshash. This patch makes stats collector an auxiliary process.
---
 src/backend/bootstrap/bootstrap.c   |  8 +++++
 src/backend/postmaster/pgstat.c     | 58 +++++++++++++++++++++++++------------
 src/backend/postmaster/postmaster.c | 24 +++++++++------
 src/include/miscadmin.h             |  2 +-
 src/include/pgstat.h                |  4 ++-
 5 files changed, 67 insertions(+), 29 deletions(-)

diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c
index 8287de9..374a917 100644
--- a/src/backend/bootstrap/bootstrap.c
+++ b/src/backend/bootstrap/bootstrap.c
@@ -335,6 +335,9 @@ AuxiliaryProcessMain(int argc, char *argv[])
             case WalReceiverProcess:
                 statmsg = pgstat_get_backend_desc(B_WAL_RECEIVER);
                 break;
+            case StatsCollectorProcess:
+                statmsg = pgstat_get_backend_desc(B_STATS_COLLECTOR);
+                break;
             default:
                 statmsg = "??? process";
                 break;
@@ -460,6 +463,11 @@ AuxiliaryProcessMain(int argc, char *argv[])
             WalReceiverMain();
             proc_exit(1);        /* should never return */
 
+        case StatsCollectorProcess:
+            /* don't set signals, stats collector has its own agenda */
+            PgstatCollectorMain();
+            proc_exit(1);        /* should never return */
+
         default:
             elog(PANIC, "unrecognized process type: %d", (int) MyAuxProcType);
             proc_exit(1);
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 5c256ff..4ee9890 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -267,6 +267,7 @@ static List *pending_write_requests = NIL;
 /* Signal handler flags */
 static volatile bool need_exit = false;
 static volatile bool got_SIGHUP = false;
+static volatile bool got_SIGTERM = false;
 
 /*
  * Total time charged to functions so far in the current backend.
@@ -284,8 +285,8 @@ static instr_time total_func_time;
 static pid_t pgstat_forkexec(void);
 #endif
 
-NON_EXEC_STATIC void PgstatCollectorMain(int argc, char *argv[]) pg_attribute_noreturn();
-static void pgstat_exit(SIGNAL_ARGS);
+static void pgstat_shutdown_handler(SIGNAL_ARGS);
+static void pgstat_quickdie_handler(SIGNAL_ARGS);
 static void pgstat_beshutdown_hook(int code, Datum arg);
 static void pgstat_sighup_handler(SIGNAL_ARGS);
 
@@ -770,11 +771,7 @@ pgstat_start(void)
             /* Close the postmaster's sockets */
             ClosePostmasterPorts(false);
 
-            /* Drop our connection to postmaster's shared memory, as well */
-            dsm_detach_all();
-            PGSharedMemoryDetach();
-
-            PgstatCollectorMain(0, NULL);
+            PgstatCollectorMain();
             break;
 #endif
 
@@ -2870,6 +2867,9 @@ pgstat_bestart(void)
             case WalReceiverProcess:
                 beentry->st_backendType = B_WAL_RECEIVER;
                 break;
+            case StatsCollectorProcess:
+                beentry->st_backendType = B_STATS_COLLECTOR;
+                break;
             default:
                 elog(FATAL, "unrecognized process type: %d",
                      (int) MyAuxProcType);
@@ -4077,6 +4077,9 @@ pgstat_get_backend_desc(BackendType backendType)
         case B_WAL_WRITER:
             backendDesc = "walwriter";
             break;
+        case B_STATS_COLLECTOR:
+            backendDesc = "stats collector";
+            break;
     }
 
     return backendDesc;
@@ -4194,8 +4197,8 @@ pgstat_send_bgwriter(void)
  *    The argc/argv parameters are valid only in EXEC_BACKEND case.
  * ----------
  */
-NON_EXEC_STATIC void
-PgstatCollectorMain(int argc, char *argv[])
+void
+PgstatCollectorMain(void)
 {
     int            len;
     PgStat_Msg    msg;
@@ -4208,8 +4211,8 @@ PgstatCollectorMain(int argc, char *argv[])
      */
     pqsignal(SIGHUP, pgstat_sighup_handler);
     pqsignal(SIGINT, SIG_IGN);
-    pqsignal(SIGTERM, SIG_IGN);
-    pqsignal(SIGQUIT, pgstat_exit);
+    pqsignal(SIGTERM, pgstat_shutdown_handler);
+    pqsignal(SIGQUIT, pgstat_quickdie_handler);
     pqsignal(SIGALRM, SIG_IGN);
     pqsignal(SIGPIPE, SIG_IGN);
     pqsignal(SIGUSR1, SIG_IGN);
@@ -4254,14 +4257,14 @@ PgstatCollectorMain(int argc, char *argv[])
         /*
          * Quit if we get SIGQUIT from the postmaster.
          */
-        if (need_exit)
+        if (got_SIGTERM)
             break;
 
         /*
          * Inner loop iterates as long as we keep getting messages, or until
          * need_exit becomes set.
          */
-        while (!need_exit)
+        while (!got_SIGTERM)
         {
             /*
              * Reload configuration if we got SIGHUP from the postmaster.
@@ -4449,14 +4452,21 @@ PgstatCollectorMain(int argc, char *argv[])
 
 /* SIGQUIT signal handler for collector process */
 static void
-pgstat_exit(SIGNAL_ARGS)
+pgstat_quickdie_handler(SIGNAL_ARGS)
 {
-    int            save_errno = errno;
+    PG_SETMASK(&BlockSig);
 
-    need_exit = true;
-    SetLatch(MyLatch);
+    /*
+     * We DO NOT want to run proc_exit() callbacks -- we're here because
+     * shared memory may be corrupted, so we don't want to try to clean up our
+     * transaction.  Just nail the windows shut and get out of town.  Now that
+     * there's an atexit callback to prevent third-party code from breaking
+     * things by calling exit() directly, we have to reset the callbacks
+     * explicitly to make this work as intended.
+     */
+    on_exit_reset();
 
-    errno = save_errno;
+    exit(2);
 }
 
 /* SIGHUP handler for collector process */
@@ -4471,6 +4481,18 @@ pgstat_sighup_handler(SIGNAL_ARGS)
     errno = save_errno;
 }
 
+static void
+pgstat_shutdown_handler(SIGNAL_ARGS)
+{
+    int save_errno = errno;
+
+    got_SIGTERM = true;
+
+    SetLatch(MyLatch);
+
+    errno = save_errno;
+}
+
 /*
  * Subroutine to clear stats in a database entry
  *
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index 17c7f7e..d5fda5d 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -144,7 +144,8 @@
 #define BACKEND_TYPE_AUTOVAC    0x0002    /* autovacuum worker process */
 #define BACKEND_TYPE_WALSND        0x0004    /* walsender process */
 #define BACKEND_TYPE_BGWORKER    0x0008    /* bgworker process */
-#define BACKEND_TYPE_ALL        0x000F    /* OR of all the above */
+#define BACKEND_TYPE_STATS        0x0010    /* bgworker process */
+#define BACKEND_TYPE_ALL        0x001F    /* OR of all the above */
 
 #define BACKEND_TYPE_WORKER        (BACKEND_TYPE_AUTOVAC | BACKEND_TYPE_BGWORKER)
 
@@ -550,6 +551,7 @@ static void ShmemBackendArrayRemove(Backend *bn);
 #define StartCheckpointer()        StartChildProcess(CheckpointerProcess)
 #define StartWalWriter()        StartChildProcess(WalWriterProcess)
 #define StartWalReceiver()        StartChildProcess(WalReceiverProcess)
+#define StartStatsCollector()    StartChildProcess(StatsCollectorProcess)
 
 /* Macros to check exit status of a child process */
 #define EXIT_STATUS_0(st)  ((st) == 0)
@@ -1811,7 +1813,7 @@ ServerLoop(void)
         /* If we have lost the stats collector, try to start a new one */
         if (PgStatPID == 0 &&
             (pmState == PM_RUN || pmState == PM_HOT_STANDBY))
-            PgStatPID = pgstat_start();
+            PgStatPID = StartStatsCollector();
 
         /* If we have lost the archiver, try to start a new one. */
         if (PgArchPID == 0 && PgArchStartupAllowed())
@@ -2929,7 +2931,7 @@ reaper(SIGNAL_ARGS)
             if (PgArchStartupAllowed() && PgArchPID == 0)
                 PgArchPID = pgarch_start();
             if (PgStatPID == 0)
-                PgStatPID = pgstat_start();
+                PgStatPID = StartStatsCollector();
 
             /* workers may be scheduled to start now */
             maybe_start_bgworkers();
@@ -3002,7 +3004,7 @@ reaper(SIGNAL_ARGS)
                  * nothing left for it to do.
                  */
                 if (PgStatPID != 0)
-                    signal_child(PgStatPID, SIGQUIT);
+                    signal_child(PgStatPID, SIGTERM);
             }
             else
             {
@@ -3088,10 +3090,10 @@ reaper(SIGNAL_ARGS)
         {
             PgStatPID = 0;
             if (!EXIT_STATUS_0(exitstatus))
-                LogChildExit(LOG, _("statistics collector process"),
-                             pid, exitstatus);
+                HandleChildCrash(pid, exitstatus,
+                                 _("statistics collector process"));
             if (pmState == PM_RUN || pmState == PM_HOT_STANDBY)
-                PgStatPID = pgstat_start();
+                PgStatPID = StartStatsCollector();
             continue;
         }
 
@@ -3321,7 +3323,7 @@ CleanupBackend(int pid,
 
 /*
  * HandleChildCrash -- cleanup after failed backend, bgwriter, checkpointer,
- * walwriter, autovacuum, or background worker.
+ * walwriter, autovacuum, stats collector or background worker.
  *
  * The objectives here are to clean up our local state about the child
  * process, and to signal all other remaining children to quickdie.
@@ -5114,7 +5116,7 @@ sigusr1_handler(SIGNAL_ARGS)
          * Likewise, start other special children as needed.
          */
         Assert(PgStatPID == 0);
-        PgStatPID = pgstat_start();
+        PgStatPID = StartStatsCollector();
 
         ereport(LOG,
                 (errmsg("database system is ready to accept read only connections")));
@@ -5404,6 +5406,10 @@ StartChildProcess(AuxProcType type)
                 ereport(LOG,
                         (errmsg("could not fork WAL receiver process: %m")));
                 break;
+            case StatsCollectorProcess:
+                ereport(LOG,
+                        (errmsg("could not fork stats collector process: %m")));
+                break;
             default:
                 ereport(LOG,
                         (errmsg("could not fork process: %m")));
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index 59da7a6..b054dab 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -395,7 +395,7 @@ typedef enum
     CheckpointerProcess,
     WalWriterProcess,
     WalReceiverProcess,
-
+    StatsCollectorProcess,
     NUM_AUXPROCTYPES            /* Must be last! */
 } AuxProcType;
 
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index 089b7c3..e2a1e21 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -710,7 +710,8 @@ typedef enum BackendType
     B_STARTUP,
     B_WAL_RECEIVER,
     B_WAL_SENDER,
-    B_WAL_WRITER
+    B_WAL_WRITER,
+    B_STATS_COLLECTOR
 } BackendType;
 
 
@@ -1327,6 +1328,7 @@ extern void pgstat_send_bgwriter(void);
  * generate the pgstat* views.
  * ----------
  */
+extern void PgstatCollectorMain(void);
 extern PgStat_StatDBEntry *pgstat_fetch_stat_dbentry(Oid dbid);
 extern PgStat_StatTabEntry *pgstat_fetch_stat_tabentry(Oid relid);
 extern PgBackendStatus *pgstat_fetch_stat_beentry(int beid);
-- 
2.9.2

From 9fe2da9442033ee3409592e108bd1a17a3392909 Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <horiguchi.kyotaro@oss.ntt.co.jp>
Date: Sat, 9 Dec 2017 01:40:55 +0900
Subject: [PATCH 4/4] Change stats sharing method

Stats collector no longer uses files to distribute stats numbers. They
are now stored in dynamic shared hash.
---
 src/backend/lib/dshash.c                      |    2 +-
 src/backend/postmaster/autovacuum.c           |    6 +-
 src/backend/postmaster/pgstat.c               | 1250 +++++++++++--------------
 src/backend/replication/basebackup.c          |   36 -
 src/backend/storage/ipc/ipci.c                |    2 +
 src/backend/storage/lmgr/lwlock.c             |    3 +
 src/backend/storage/lmgr/lwlocknames.txt      |    1 +
 src/backend/utils/misc/guc.c                  |   41 -
 src/backend/utils/misc/postgresql.conf.sample |    1 -
 src/bin/initdb/initdb.c                       |    1 -
 src/bin/pg_basebackup/t/010_pg_basebackup.pl  |    2 +-
 src/include/miscadmin.h                       |    1 +
 src/include/pgstat.h                          |   47 +-
 src/include/storage/lwlock.h                  |    3 +
 14 files changed, 562 insertions(+), 834 deletions(-)

diff --git a/src/backend/lib/dshash.c b/src/backend/lib/dshash.c
index adc2131..9d72f28 100644
--- a/src/backend/lib/dshash.c
+++ b/src/backend/lib/dshash.c
@@ -304,7 +304,7 @@ dshash_attach(dsa_area *area, const dshash_parameters *params,
 void
 dshash_detach(dshash_table *hash_table)
 {
-    Assert(!hash_table->find_locked);
+    Assert(!hash_table->find_locked || hash_table->is_snapshot);
 
     /* The hash table may have been destroyed.  Just free local memory. */
     pfree(hash_table);
diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c
index 48765bb..770a0ec 100644
--- a/src/backend/postmaster/autovacuum.c
+++ b/src/backend/postmaster/autovacuum.c
@@ -2734,12 +2734,10 @@ get_pgstat_tabentry_relid(Oid relid, bool isshared, PgStat_StatDBEntry *shared,
     if (isshared)
     {
         if (PointerIsValid(shared))
-            tabentry = hash_search(shared->tables, &relid,
-                                   HASH_FIND, NULL);
+            tabentry = backend_get_tab_entry(shared, relid);
     }
     else if (PointerIsValid(dbentry))
-        tabentry = hash_search(dbentry->tables, &relid,
-                               HASH_FIND, NULL);
+        tabentry = backend_get_tab_entry(dbentry, relid);
 
     return tabentry;
 }
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 4ee9890..cccf4b6 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -28,6 +28,7 @@
 #include <arpa/inet.h>
 #include <signal.h>
 #include <time.h>
+#include <utils/dsa.h>
 #ifdef HAVE_SYS_SELECT_H
 #include <sys/select.h>
 #endif
@@ -77,22 +78,10 @@
 #define PGSTAT_STAT_INTERVAL    500 /* Minimum time between stats file
                                      * updates; in milliseconds. */
 
-#define PGSTAT_RETRY_DELAY        10    /* How long to wait between checks for a
-                                     * new file; in milliseconds. */
-
-#define PGSTAT_MAX_WAIT_TIME    10000    /* Maximum time to wait for a stats
-                                         * file update; in milliseconds. */
-
-#define PGSTAT_INQ_INTERVAL        640 /* How often to ping the collector for a
-                                     * new file; in milliseconds. */
-
 #define PGSTAT_RESTART_INTERVAL 60    /* How often to attempt to restart a
                                      * failed statistics collector; in
                                      * seconds. */
 
-#define PGSTAT_POLL_LOOP_COUNT    (PGSTAT_MAX_WAIT_TIME / PGSTAT_RETRY_DELAY)
-#define PGSTAT_INQ_LOOP_COUNT    (PGSTAT_INQ_INTERVAL / PGSTAT_RETRY_DELAY)
-
 /* Minimum receive buffer size for the collector's socket. */
 #define PGSTAT_MIN_RCVBUF        (100 * 1024)
 
@@ -101,7 +90,6 @@
  * The initial size hints for the hash tables used in the collector.
  * ----------
  */
-#define PGSTAT_DB_HASH_SIZE        16
 #define PGSTAT_TAB_HASH_SIZE    512
 #define PGSTAT_FUNCTION_HASH_SIZE    512
 
@@ -131,7 +119,6 @@ int            pgstat_track_activity_query_size = 1024;
  * Built from GUC parameter
  * ----------
  */
-char       *pgstat_stat_directory = NULL;
 char       *pgstat_stat_filename = NULL;
 char       *pgstat_stat_tmpname = NULL;
 
@@ -154,6 +141,42 @@ static time_t last_pgstat_start_time;
 
 static bool pgStatRunningInCollector = false;
 
+/* Shared stats bootstrap infomation */
+typedef struct StatsShmemStruct {
+    dsa_handle stats_dsa_handle;
+    dshash_table_handle db_stats_handle;
+    dsa_pointer    global_stats;
+    dsa_pointer    archiver_stats;
+} StatsShmemStruct;
+
+static StatsShmemStruct * StatsShmem = NULL;
+static dsa_area *area = NULL;
+static dshash_table *db_stats;
+static dshash_table *local_db_stats;
+
+/* dshash parameter for each type of table */
+static const dshash_parameters dsh_dbparams = {
+    sizeof(Oid),
+    sizeof(PgStat_StatDBEntry),
+    dshash_memcmp,
+    dshash_memhash,
+    LWTRANCHE_STATS_DB
+};
+static const dshash_parameters dsh_tblparams = {
+    sizeof(Oid),
+    sizeof(PgStat_StatTabEntry),
+    dshash_memcmp,
+    dshash_memhash,
+    LWTRANCHE_STATS_FUNC_TABLE
+};
+static const dshash_parameters dsh_funcparams = {
+    sizeof(Oid),
+    sizeof(PgStat_StatFuncEntry),
+    dshash_memcmp,
+    dshash_memhash,
+    LWTRANCHE_STATS_FUNC_TABLE
+};
+
 /*
  * Structures in which backends store per-table info that's waiting to be
  * sent to the collector.
@@ -250,12 +273,16 @@ static LocalPgBackendStatus *localBackendStatusTable = NULL;
 static int    localNumBackends = 0;
 
 /*
- * Cluster wide statistics, kept in the stats collector.
- * Contains statistics that are not collected per database
- * or per table.
+ * Cluster wide statistics.
+ * Contains statistics that are not collected per database or per table.
+ * shared_* are the statistics maintained by pgstats and snapshot_* are the
+ * snapshot only taken on reader-side backends.
  */
-static PgStat_ArchiverStats archiverStats;
-static PgStat_GlobalStats globalStats;
+static PgStat_ArchiverStats *shared_archiverStats;
+static PgStat_ArchiverStats *snapshot_archiverStats;
+static PgStat_GlobalStats *shared_globalStats;
+static PgStat_GlobalStats *snapshot_globalStats;
+
 
 /*
  * List of OIDs of databases we need to write out.  If an entry is InvalidOid,
@@ -285,24 +312,23 @@ static instr_time total_func_time;
 static pid_t pgstat_forkexec(void);
 #endif
 
+/* functions used in stats collector */
 static void pgstat_shutdown_handler(SIGNAL_ARGS);
 static void pgstat_quickdie_handler(SIGNAL_ARGS);
 static void pgstat_beshutdown_hook(int code, Datum arg);
 static void pgstat_sighup_handler(SIGNAL_ARGS);
 
 static PgStat_StatDBEntry *pgstat_get_db_entry(Oid databaseid, bool create);
-static PgStat_StatTabEntry *pgstat_get_tab_entry(PgStat_StatDBEntry *dbentry,
-                     Oid tableoid, bool create);
-static void pgstat_write_statsfiles(bool permanent, bool allDbs);
-static void pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent);
-static HTAB *pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep);
-static void pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash, bool permanent);
-static void backend_read_statsfile(void);
+static PgStat_StatTabEntry *pgstat_get_tab_entry(dshash_table *table, Oid tableoid, bool create);
+static void pgstat_write_statsfiles(void);
+static void pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry);
+static void pgstat_read_statsfiles(void);
+static void pgstat_read_db_statsfile(Oid databaseid, dshash_table *tabhash, dshash_table *funchash);
+
+/* functions used in backends */
+static bool backend_take_stats_snapshot(void);
 static void pgstat_read_current_status(void);
 
-static bool pgstat_write_statsfile_needed(void);
-static bool pgstat_db_requested(Oid databaseid);
-
 static void pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg);
 static void pgstat_send_funcstats(void);
 static HTAB *pgstat_collect_oids(Oid catalogid);
@@ -320,7 +346,6 @@ static const char *pgstat_get_wait_io(WaitEventIO w);
 static void pgstat_setheader(PgStat_MsgHdr *hdr, StatMsgType mtype);
 static void pgstat_send(void *msg, int len);
 
-static void pgstat_recv_inquiry(PgStat_MsgInquiry *msg, int len);
 static void pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len);
 static void pgstat_recv_tabpurge(PgStat_MsgTabpurge *msg, int len);
 static void pgstat_recv_dropdb(PgStat_MsgDropdb *msg, int len);
@@ -685,7 +710,6 @@ pgstat_reset_remove_files(const char *directory)
 void
 pgstat_reset_all(void)
 {
-    pgstat_reset_remove_files(pgstat_stat_directory);
     pgstat_reset_remove_files(PGSTAT_STAT_PERMANENT_DIRECTORY);
 }
 
@@ -1010,6 +1034,95 @@ pgstat_send_funcstats(void)
 
 
 /* ----------
+ * pgstat_attach_shared_stats() -
+ *
+ *    attach existing shared stats memory
+ * ----------
+ */
+static bool
+pgstat_attach_shared_stats(void)
+{
+    MemoryContext oldcontext;
+
+    LWLockAcquire(StatsLock, LW_EXCLUSIVE);
+    if (StatsShmem->stats_dsa_handle == DSM_HANDLE_INVALID || area != NULL)
+    {
+        LWLockRelease(StatsLock);
+        return area != NULL;
+    }
+
+    /* this lives till the end of the process */
+    oldcontext = MemoryContextSwitchTo(TopMemoryContext);
+    area = dsa_attach(StatsShmem->stats_dsa_handle);
+    dsa_pin_mapping(area);
+    db_stats = dshash_attach(area, &dsh_dbparams,
+                             StatsShmem->db_stats_handle, 0);
+    local_db_stats = NULL;
+    shared_globalStats = (PgStat_GlobalStats *)
+        dsa_get_address(area, StatsShmem->global_stats);
+    shared_archiverStats =    (PgStat_ArchiverStats *)
+        dsa_get_address(area, StatsShmem->archiver_stats);
+    MemoryContextSwitchTo(oldcontext);
+    LWLockRelease(StatsLock);
+
+    return true;
+}
+
+/* ----------
+ * pgstat_create_shared_stats() -
+ *
+ *    create shared stats memory
+ * ----------
+ */
+static void
+pgstat_create_shared_stats(void)
+{
+    MemoryContext oldcontext;
+
+    LWLockAcquire(StatsLock, LW_EXCLUSIVE);
+    Assert(StatsShmem->stats_dsa_handle == DSM_HANDLE_INVALID);
+
+    /* this lives till the end of the process */
+    oldcontext = MemoryContextSwitchTo(TopMemoryContext);
+    area = dsa_create(LWTRANCHE_STATS_DSA);
+    dsa_pin_mapping(area);
+
+    db_stats = dshash_create(area, &dsh_dbparams, 0);
+
+    /* create shared area and write bootstrap information */
+    StatsShmem->stats_dsa_handle = dsa_get_handle(area);
+    StatsShmem->global_stats =
+        dsa_allocate0(area, sizeof(PgStat_GlobalStats));
+    StatsShmem->archiver_stats =
+        dsa_allocate0(area, sizeof(PgStat_ArchiverStats));
+    StatsShmem->db_stats_handle =
+        dshash_get_hash_table_handle(db_stats);
+
+    /* locally connect to the memory */
+    local_db_stats = NULL;
+    shared_globalStats = (PgStat_GlobalStats *)
+        dsa_get_address(area, StatsShmem->global_stats);
+    shared_archiverStats = (PgStat_ArchiverStats *)
+        dsa_get_address(area, StatsShmem->archiver_stats);
+    MemoryContextSwitchTo(oldcontext);
+    LWLockRelease(StatsLock);
+}
+
+/* ----------
+ * backend_get_tab_entry() -
+ *
+ *    Find database stats entry on backends. This assumes that snapshot is
+ *    created.
+ * ----------
+ */
+PgStat_StatTabEntry *
+backend_get_tab_entry(PgStat_StatDBEntry *dbent, Oid relid)
+{
+    Assert(dbent->snapshot_tables);
+    return dshash_find(dbent->snapshot_tables, &relid, false);
+}
+
+/* ----------
  * pgstat_vacuum_stat() -
  *
  *    Will tell the collector about objects he can get rid of.
@@ -1021,7 +1134,7 @@ pgstat_vacuum_stat(void)
     HTAB       *htab;
     PgStat_MsgTabpurge msg;
     PgStat_MsgFuncpurge f_msg;
-    HASH_SEQ_STATUS hstat;
+    dshash_seq_status hstat;
     PgStat_StatDBEntry *dbentry;
     PgStat_StatTabEntry *tabentry;
     PgStat_StatFuncEntry *funcentry;
@@ -1030,11 +1143,8 @@ pgstat_vacuum_stat(void)
     if (pgStatSock == PGINVALID_SOCKET)
         return;
 
-    /*
-     * If not done for this transaction, read the statistics collector stats
-     * file into some hash tables.
-     */
-    backend_read_statsfile();
+    if (!backend_take_stats_snapshot())
+        return;
 
     /*
      * Read pg_database and make a list of OIDs of all existing databases
@@ -1045,8 +1155,8 @@ pgstat_vacuum_stat(void)
      * Search the database hash table for dead databases and tell the
      * collector to drop them.
      */
-    hash_seq_init(&hstat, pgStatDBHash);
-    while ((dbentry = (PgStat_StatDBEntry *) hash_seq_search(&hstat)) != NULL)
+    dshash_seq_init(&hstat, local_db_stats);
+    while ((dbentry = (PgStat_StatDBEntry *) dshash_seq_next(&hstat)) != NULL)
     {
         Oid            dbid = dbentry->databaseid;
 
@@ -1064,11 +1174,15 @@ pgstat_vacuum_stat(void)
     /*
      * Lookup our own database entry; if not found, nothing more to do.
      */
-    dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
+    dbentry = (PgStat_StatDBEntry *) dshash_find(local_db_stats,
                                                  (void *) &MyDatabaseId,
-                                                 HASH_FIND, NULL);
-    if (dbentry == NULL || dbentry->tables == NULL)
+                                                 false);
+    if (dbentry == NULL || dbentry->tables == DSM_HANDLE_INVALID)
+    {
+        if (dbentry)
+            dshash_release_lock(local_db_stats, dbentry);
         return;
+    }
 
     /*
      * Similarly to above, make a list of all known relations in this DB.
@@ -1083,8 +1197,8 @@ pgstat_vacuum_stat(void)
     /*
      * Check for all tables listed in stats hashtable if they still exist.
      */
-    hash_seq_init(&hstat, dbentry->tables);
-    while ((tabentry = (PgStat_StatTabEntry *) hash_seq_search(&hstat)) != NULL)
+    dshash_seq_init(&hstat, dbentry->snapshot_tables);
+    while ((tabentry = (PgStat_StatTabEntry *) dshash_seq_next(&hstat)) != NULL)
     {
         Oid            tabid = tabentry->tableid;
 
@@ -1134,8 +1248,8 @@ pgstat_vacuum_stat(void)
      * Now repeat the above steps for functions.  However, we needn't bother
      * in the common case where no function stats are being collected.
      */
-    if (dbentry->functions != NULL &&
-        hash_get_num_entries(dbentry->functions) > 0)
+    if (dbentry->snapshot_functions != NULL &&
+        dshash_get_num_entries(dbentry->snapshot_functions) > 0)
     {
         htab = pgstat_collect_oids(ProcedureRelationId);
 
@@ -1143,8 +1257,8 @@ pgstat_vacuum_stat(void)
         f_msg.m_databaseid = MyDatabaseId;
         f_msg.m_nentries = 0;
 
-        hash_seq_init(&hstat, dbentry->functions);
-        while ((funcentry = (PgStat_StatFuncEntry *) hash_seq_search(&hstat)) != NULL)
+        dshash_seq_init(&hstat, dbentry->snapshot_functions);
+        while ((funcentry = (PgStat_StatFuncEntry *) dshash_seq_next(&hstat)) != NULL)
         {
             Oid            funcid = funcentry->functionid;
 
@@ -1551,24 +1665,6 @@ pgstat_ping(void)
     pgstat_send(&msg, sizeof(msg));
 }
 
-/* ----------
- * pgstat_send_inquiry() -
- *
- *    Notify collector that we need fresh data.
- * ----------
- */
-static void
-pgstat_send_inquiry(TimestampTz clock_time, TimestampTz cutoff_time, Oid databaseid)
-{
-    PgStat_MsgInquiry msg;
-
-    pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_INQUIRY);
-    msg.clock_time = clock_time;
-    msg.cutoff_time = cutoff_time;
-    msg.databaseid = databaseid;
-    pgstat_send(&msg, sizeof(msg));
-}
-
 
 /*
  * Initialize function call usage data.
@@ -2384,17 +2480,16 @@ PgStat_StatDBEntry *
 pgstat_fetch_stat_dbentry(Oid dbid)
 {
     /*
-     * If not done for this transaction, read the statistics collector stats
-     * file into some hash tables.
+     * If not done for this transaction, take a stats snapshot
      */
-    backend_read_statsfile();
+    if (!backend_take_stats_snapshot())
+        return NULL;
 
     /*
      * Lookup the requested database; return NULL if not found
      */
-    return (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
-                                              (void *) &dbid,
-                                              HASH_FIND, NULL);
+    return (PgStat_StatDBEntry *) dshash_find(local_db_stats,
+                                              (void *) &dbid, false);
 }
 
 
@@ -2415,23 +2510,22 @@ pgstat_fetch_stat_tabentry(Oid relid)
     PgStat_StatTabEntry *tabentry;
 
     /*
-     * If not done for this transaction, read the statistics collector stats
-     * file into some hash tables.
+     * If not done for this transaction, take a stats snapshot
      */
-    backend_read_statsfile();
+    if (!backend_take_stats_snapshot())
+        return NULL;
 
     /*
      * Lookup our database, then look in its table hash table.
      */
     dbid = MyDatabaseId;
-    dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
-                                                 (void *) &dbid,
-                                                 HASH_FIND, NULL);
-    if (dbentry != NULL && dbentry->tables != NULL)
+    dbentry =
+        (PgStat_StatDBEntry *) dshash_find(local_db_stats, (void *)&dbid, false);
+    if (dbentry != NULL && dbentry->snapshot_tables != NULL)
     {
-        tabentry = (PgStat_StatTabEntry *) hash_search(dbentry->tables,
-                                                       (void *) &relid,
-                                                       HASH_FIND, NULL);
+        tabentry = (PgStat_StatTabEntry *)
+            dshash_find(dbentry->snapshot_tables, (void *)&relid, false);
+
         if (tabentry)
             return tabentry;
     }
@@ -2440,14 +2534,13 @@ pgstat_fetch_stat_tabentry(Oid relid)
      * If we didn't find it, maybe it's a shared table.
      */
     dbid = InvalidOid;
-    dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
-                                                 (void *) &dbid,
-                                                 HASH_FIND, NULL);
-    if (dbentry != NULL && dbentry->tables != NULL)
+    dbentry = (PgStat_StatDBEntry *) dshash_find(local_db_stats,
+                                                 (void *) &dbid, false);
+    if (dbentry != NULL && dbentry->snapshot_tables != NULL)
     {
-        tabentry = (PgStat_StatTabEntry *) hash_search(dbentry->tables,
-                                                       (void *) &relid,
-                                                       HASH_FIND, NULL);
+        tabentry = (PgStat_StatTabEntry *)
+            dshash_find(dbentry->snapshot_tables, (void *) &relid, false);
+
         if (tabentry)
             return tabentry;
     }
@@ -2469,18 +2562,19 @@ pgstat_fetch_stat_funcentry(Oid func_id)
     PgStat_StatDBEntry *dbentry;
     PgStat_StatFuncEntry *funcentry = NULL;
 
-    /* load the stats file if needed */
-    backend_read_statsfile();
+    /*
+     * If not done for this transaction, take a stats snapshot
+     */
+    if (!backend_take_stats_snapshot())
+        return NULL;
 
-    /* Lookup our database, then find the requested function.  */
+    /*
+     * Lookup our database, then find the requested function
+     */
     dbentry = pgstat_fetch_stat_dbentry(MyDatabaseId);
-    if (dbentry != NULL && dbentry->functions != NULL)
-    {
-        funcentry = (PgStat_StatFuncEntry *) hash_search(dbentry->functions,
-                                                         (void *) &func_id,
-                                                         HASH_FIND, NULL);
-    }
-
+    if (dbentry != NULL && dbentry->snapshot_functions != NULL)
+        funcentry = dshash_find(dbentry->snapshot_functions,
+                                  (void *) &func_id, false);
     return funcentry;
 }
 
@@ -2555,9 +2649,10 @@ pgstat_fetch_stat_numbackends(void)
 PgStat_ArchiverStats *
 pgstat_fetch_stat_archiver(void)
 {
-    backend_read_statsfile();
+    if (!backend_take_stats_snapshot())
+        return NULL;
 
-    return &archiverStats;
+    return snapshot_archiverStats;
 }
 
 
@@ -2572,9 +2667,10 @@ pgstat_fetch_stat_archiver(void)
 PgStat_GlobalStats *
 pgstat_fetch_global(void)
 {
-    backend_read_statsfile();
+    if (!backend_take_stats_snapshot())
+        return NULL;
 
-    return &globalStats;
+    return snapshot_globalStats;
 }
 
 
@@ -4222,18 +4318,14 @@ PgstatCollectorMain(void)
     pqsignal(SIGTTOU, SIG_DFL);
     pqsignal(SIGCONT, SIG_DFL);
     pqsignal(SIGWINCH, SIG_DFL);
-    PG_SETMASK(&UnBlockSig);
 
-    /*
-     * Identify myself via ps
-     */
-    init_ps_display("stats collector", "", "", "");
+    PG_SETMASK(&UnBlockSig);
 
     /*
      * Read in existing stats files or initialize the stats to zero.
      */
     pgStatRunningInCollector = true;
-    pgStatDBHash = pgstat_read_statsfiles(InvalidOid, true, true);
+    pgstat_read_statsfiles();
 
     /*
      * Loop to process messages until we get SIGQUIT or detect ungraceful
@@ -4276,13 +4368,6 @@ PgstatCollectorMain(void)
             }
 
             /*
-             * Write the stats file(s) if a new request has arrived that is
-             * not satisfied by existing file(s).
-             */
-            if (pgstat_write_statsfile_needed())
-                pgstat_write_statsfiles(false, false);
-
-            /*
              * Try to receive and process a message.  This will not block,
              * since the socket is set to non-blocking mode.
              *
@@ -4330,10 +4415,6 @@ PgstatCollectorMain(void)
                 case PGSTAT_MTYPE_DUMMY:
                     break;
 
-                case PGSTAT_MTYPE_INQUIRY:
-                    pgstat_recv_inquiry((PgStat_MsgInquiry *) &msg, len);
-                    break;
-
                 case PGSTAT_MTYPE_TABSTAT:
                     pgstat_recv_tabstat((PgStat_MsgTabstat *) &msg, len);
                     break;
@@ -4424,7 +4505,7 @@ PgstatCollectorMain(void)
          * happening there, this is the best we can do.  The two-second
          * timeout matches our pre-9.2 behavior, and needs to be short enough
          * to not provoke "using stale statistics" complaints from
-         * backend_read_statsfile.
+         * backend_take_stats_snapshot.
          */
         wr = WaitLatchOrSocket(MyLatch,
                                WL_LATCH_SET | WL_POSTMASTER_DEATH | WL_SOCKET_READABLE | WL_TIMEOUT,
@@ -4444,7 +4525,7 @@ PgstatCollectorMain(void)
     /*
      * Save the final stats to reuse at next startup.
      */
-    pgstat_write_statsfiles(true, true);
+    pgstat_write_statsfiles();
 
     exit(0);
 }
@@ -4494,14 +4575,14 @@ pgstat_shutdown_handler(SIGNAL_ARGS)
 }
 
 /*
- * Subroutine to clear stats in a database entry
+ * Subroutine to reset stats in a shared database entry
  *
  * Tables and functions hashes are initialized to empty.
  */
 static void
 reset_dbentry_counters(PgStat_StatDBEntry *dbentry)
 {
-    HASHCTL        hash_ctl;
+    dshash_table *tbl;
 
     dbentry->n_xact_commit = 0;
     dbentry->n_xact_rollback = 0;
@@ -4527,20 +4608,14 @@ reset_dbentry_counters(PgStat_StatDBEntry *dbentry)
     dbentry->stat_reset_timestamp = GetCurrentTimestamp();
     dbentry->stats_timestamp = 0;
 
-    memset(&hash_ctl, 0, sizeof(hash_ctl));
-    hash_ctl.keysize = sizeof(Oid);
-    hash_ctl.entrysize = sizeof(PgStat_StatTabEntry);
-    dbentry->tables = hash_create("Per-database table",
-                                  PGSTAT_TAB_HASH_SIZE,
-                                  &hash_ctl,
-                                  HASH_ELEM | HASH_BLOBS);
 
-    hash_ctl.keysize = sizeof(Oid);
-    hash_ctl.entrysize = sizeof(PgStat_StatFuncEntry);
-    dbentry->functions = hash_create("Per-database function",
-                                     PGSTAT_FUNCTION_HASH_SIZE,
-                                     &hash_ctl,
-                                     HASH_ELEM | HASH_BLOBS);
+    tbl = dshash_create(area, &dsh_tblparams, 0);
+    dbentry->tables = dshash_get_hash_table_handle(tbl);
+    dshash_detach(tbl);
+
+    tbl = dshash_create(area, &dsh_funcparams, 0);
+    dbentry->functions = dshash_get_hash_table_handle(tbl);
+    dshash_detach(tbl);
 }
 
 /*
@@ -4553,15 +4628,18 @@ pgstat_get_db_entry(Oid databaseid, bool create)
 {
     PgStat_StatDBEntry *result;
     bool        found;
-    HASHACTION    action = (create ? HASH_ENTER : HASH_FIND);
+
+    Assert(pgStatRunningInCollector);
 
     /* Lookup or create the hash table entry for this database */
-    result = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
-                                                &databaseid,
-                                                action, &found);
+    if (create)
+        result = (PgStat_StatDBEntry *)
+            dshash_find_or_insert(db_stats,    &databaseid, &found);
+    else
+        result = (PgStat_StatDBEntry *)    dshash_find(db_stats, &databaseid, true);
 
-    if (!create && !found)
-        return NULL;
+    if (!create)
+        return result;
 
     /*
      * If not found, initialize the new one.  This creates empty hash tables
@@ -4573,23 +4651,23 @@ pgstat_get_db_entry(Oid databaseid, bool create)
     return result;
 }
 
-
 /*
  * Lookup the hash table entry for the specified table. If no hash
  * table entry exists, initialize it, if the create parameter is true.
  * Else, return NULL.
  */
 static PgStat_StatTabEntry *
-pgstat_get_tab_entry(PgStat_StatDBEntry *dbentry, Oid tableoid, bool create)
+pgstat_get_tab_entry(dshash_table *table, Oid tableoid, bool create)
 {
     PgStat_StatTabEntry *result;
     bool        found;
-    HASHACTION    action = (create ? HASH_ENTER : HASH_FIND);
 
     /* Lookup or create the hash table entry for this table */
-    result = (PgStat_StatTabEntry *) hash_search(dbentry->tables,
-                                                 &tableoid,
-                                                 action, &found);
+    if (create)
+        result = (PgStat_StatTabEntry *)
+            dshash_find_or_insert(table, &tableoid, &found);
+    else
+        result = (PgStat_StatTabEntry *) dshash_find(table, &tableoid, false);
 
-    if (!create && !found)
-        return NULL;
+    if (!create)
+        return result;
@@ -4638,14 +4716,14 @@ pgstat_get_tab_entry(PgStat_StatDBEntry *dbentry, Oid tableoid, bool create)
  * ----------
  */
 static void
-pgstat_write_statsfiles(bool permanent, bool allDbs)
+pgstat_write_statsfiles(void)
 {
-    HASH_SEQ_STATUS hstat;
+    dshash_seq_status hstat;
     PgStat_StatDBEntry *dbentry;
     FILE       *fpout;
     int32        format_id;
-    const char *tmpfile = permanent ? PGSTAT_STAT_PERMANENT_TMPFILE : pgstat_stat_tmpname;
-    const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename;
+    const char *tmpfile = PGSTAT_STAT_PERMANENT_TMPFILE;
+    const char *statfile = PGSTAT_STAT_PERMANENT_FILENAME;
     int            rc;
 
     elog(DEBUG2, "writing stats file \"%s\"", statfile);
@@ -4666,7 +4744,7 @@ pgstat_write_statsfiles(bool permanent, bool allDbs)
     /*
      * Set the timestamp of the stats file.
      */
-    globalStats.stats_timestamp = GetCurrentTimestamp();
+    shared_globalStats->stats_timestamp = GetCurrentTimestamp();
 
     /*
      * Write the file header --- currently just a format ID.
@@ -4678,32 +4756,29 @@ pgstat_write_statsfiles(bool permanent, bool allDbs)
     /*
      * Write global stats struct
      */
-    rc = fwrite(&globalStats, sizeof(globalStats), 1, fpout);
+    rc = fwrite(shared_globalStats, sizeof(*shared_globalStats), 1, fpout);
     (void) rc;                    /* we'll check for error with ferror */
 
     /*
      * Write archiver stats struct
      */
-    rc = fwrite(&archiverStats, sizeof(archiverStats), 1, fpout);
+    rc = fwrite(shared_archiverStats, sizeof(*shared_archiverStats), 1, fpout);
     (void) rc;                    /* we'll check for error with ferror */
 
     /*
      * Walk through the database table.
      */
-    hash_seq_init(&hstat, pgStatDBHash);
-    while ((dbentry = (PgStat_StatDBEntry *) hash_seq_search(&hstat)) != NULL)
+    dshash_seq_init(&hstat, db_stats);
+    while ((dbentry = (PgStat_StatDBEntry *) dshash_seq_next(&hstat)) != NULL)
     {
         /*
          * Write out the table and function stats for this DB into the
          * appropriate per-DB stat file, if required.
          */
-        if (allDbs || pgstat_db_requested(dbentry->databaseid))
-        {
-            /* Make DB's timestamp consistent with the global stats */
-            dbentry->stats_timestamp = globalStats.stats_timestamp;
+        /* Make DB's timestamp consistent with the global stats */
+        dbentry->stats_timestamp = shared_globalStats->stats_timestamp;
 
-            pgstat_write_db_statsfile(dbentry, permanent);
-        }
+        pgstat_write_db_statsfile(dbentry);
 
         /*
          * Write out the DB entry. We don't write the tables or functions
@@ -4747,8 +4822,7 @@ pgstat_write_statsfiles(bool permanent, bool allDbs)
         unlink(tmpfile);
     }
 
-    if (permanent)
-        unlink(pgstat_stat_filename);
+    unlink(pgstat_stat_filename);
 
     /*
      * Now throw away the list of requests.  Note that requests sent after we
@@ -4763,15 +4837,14 @@ pgstat_write_statsfiles(bool permanent, bool allDbs)
  * of length len.
  */
 static void
-get_dbstat_filename(bool permanent, bool tempname, Oid databaseid,
+get_dbstat_filename(bool tempname, Oid databaseid,
                     char *filename, int len)
 {
     int            printed;
 
     /* NB -- pgstat_reset_remove_files knows about the pattern this uses */
     printed = snprintf(filename, len, "%s/db_%u.%s",
-                       permanent ? PGSTAT_STAT_PERMANENT_DIRECTORY :
-                       pgstat_stat_directory,
+                       PGSTAT_STAT_PERMANENT_DIRECTORY,
                        databaseid,
                        tempname ? "tmp" : "stat");
     if (printed > len)
@@ -4789,10 +4862,10 @@ get_dbstat_filename(bool permanent, bool tempname, Oid databaseid,
  * ----------
  */
 static void
-pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent)
+pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry)
 {
-    HASH_SEQ_STATUS tstat;
-    HASH_SEQ_STATUS fstat;
+    dshash_seq_status tstat;
+    dshash_seq_status fstat;
     PgStat_StatTabEntry *tabentry;
     PgStat_StatFuncEntry *funcentry;
     FILE       *fpout;
@@ -4801,9 +4874,10 @@ pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent)
     int            rc;
     char        tmpfile[MAXPGPATH];
     char        statfile[MAXPGPATH];
+    dshash_table *tbl;
 
-    get_dbstat_filename(permanent, true, dbid, tmpfile, MAXPGPATH);
-    get_dbstat_filename(permanent, false, dbid, statfile, MAXPGPATH);
+    get_dbstat_filename(true, dbid, tmpfile, MAXPGPATH);
+    get_dbstat_filename(false, dbid, statfile, MAXPGPATH);
 
     elog(DEBUG2, "writing stats file \"%s\"", statfile);
 
@@ -4830,24 +4904,28 @@ pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent)
     /*
      * Walk through the database's access stats per table.
      */
-    hash_seq_init(&tstat, dbentry->tables);
-    while ((tabentry = (PgStat_StatTabEntry *) hash_seq_search(&tstat)) != NULL)
+    tbl = dshash_attach(area, &dsh_tblparams, dbentry->tables, 0);
+    dshash_seq_init(&tstat, tbl);
+    while ((tabentry = (PgStat_StatTabEntry *) dshash_seq_next(&tstat)) != NULL)
     {
         fputc('T', fpout);
         rc = fwrite(tabentry, sizeof(PgStat_StatTabEntry), 1, fpout);
         (void) rc;                /* we'll check for error with ferror */
     }
+    dshash_detach(tbl);
 
     /*
      * Walk through the database's function stats table.
      */
-    hash_seq_init(&fstat, dbentry->functions);
-    while ((funcentry = (PgStat_StatFuncEntry *) hash_seq_search(&fstat)) != NULL)
+    tbl = dshash_attach(area, &dsh_funcparams, dbentry->functions, 0);
+    dshash_seq_init(&fstat, tbl);
+    while ((funcentry = (PgStat_StatFuncEntry *) dshash_seq_next(&fstat)) != NULL)
     {
         fputc('F', fpout);
         rc = fwrite(funcentry, sizeof(PgStat_StatFuncEntry), 1, fpout);
         (void) rc;                /* we'll check for error with ferror */
     }
+    dshash_detach(tbl);
 
     /*
      * No more output to be done. Close the temp file and replace the old
@@ -4881,14 +4959,6 @@ pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent)
                         tmpfile, statfile)));
         unlink(tmpfile);
     }
-
-    if (permanent)
-    {
-        get_dbstat_filename(false, false, dbid, statfile, MAXPGPATH);
-
-        elog(DEBUG2, "removing temporary stats file \"%s\"", statfile);
-        unlink(statfile);
-    }
 }
 
 /* ----------
@@ -4911,46 +4981,35 @@ pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent)
  *    the table/function hash tables remain empty.
  * ----------
  */
-static HTAB *
-pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
+static void
+pgstat_read_statsfiles(void)
 {
     PgStat_StatDBEntry *dbentry;
     PgStat_StatDBEntry dbbuf;
-    HASHCTL        hash_ctl;
-    HTAB       *dbhash;
     FILE       *fpin;
     int32        format_id;
     bool        found;
-    const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename;
+    const char *statfile = PGSTAT_STAT_PERMANENT_FILENAME;
+    dshash_table *tblstats = NULL;
+    dshash_table *funcstats = NULL;
 
+    Assert(pgStatRunningInCollector);
     /*
      * The tables will live in pgStatLocalContext.
      */
     pgstat_setup_memcxt();
 
     /*
-     * Create the DB hashtable
+     * Create the DB hashtable and global stats area
      */
-    memset(&hash_ctl, 0, sizeof(hash_ctl));
-    hash_ctl.keysize = sizeof(Oid);
-    hash_ctl.entrysize = sizeof(PgStat_StatDBEntry);
-    hash_ctl.hcxt = pgStatLocalContext;
-    dbhash = hash_create("Databases hash", PGSTAT_DB_HASH_SIZE, &hash_ctl,
-                         HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
-
-    /*
-     * Clear out global and archiver statistics so they start from zero in
-     * case we can't load an existing statsfile.
-     */
-    memset(&globalStats, 0, sizeof(globalStats));
-    memset(&archiverStats, 0, sizeof(archiverStats));
+    pgstat_create_shared_stats();
 
     /*
      * Set the current timestamp (will be kept only in case we can't load an
      * existing statsfile).
      */
-    globalStats.stat_reset_timestamp = GetCurrentTimestamp();
-    archiverStats.stat_reset_timestamp = globalStats.stat_reset_timestamp;
+    shared_globalStats->stat_reset_timestamp = GetCurrentTimestamp();
+    shared_archiverStats->stat_reset_timestamp = shared_globalStats->stat_reset_timestamp;
 
     /*
      * Try to open the stats file. If it doesn't exist, the backends simply
@@ -4968,7 +5027,7 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
                     (errcode_for_file_access(),
                      errmsg("could not open statistics file \"%s\": %m",
                             statfile)));
-        return dbhash;
+        return;
     }
 
     /*
@@ -4985,11 +5044,11 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
     /*
      * Read global stats struct
      */
-    if (fread(&globalStats, 1, sizeof(globalStats), fpin) != sizeof(globalStats))
+    if (fread(shared_globalStats, 1, sizeof(*shared_globalStats), fpin) != sizeof(*shared_globalStats))
     {
         ereport(pgStatRunningInCollector ? LOG : WARNING,
                 (errmsg("corrupted statistics file \"%s\"", statfile)));
-        memset(&globalStats, 0, sizeof(globalStats));
+        memset(shared_globalStats, 0, sizeof(*shared_globalStats));
         goto done;
     }
 
@@ -5000,17 +5059,16 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
      * file's timestamp is less than PGSTAT_STAT_INTERVAL ago, but that's not
      * an unusual scenario.
      */
-    if (pgStatRunningInCollector)
-        globalStats.stats_timestamp = 0;
+    shared_globalStats->stats_timestamp = 0;
 
     /*
      * Read archiver stats struct
      */
-    if (fread(&archiverStats, 1, sizeof(archiverStats), fpin) != sizeof(archiverStats))
+    if (fread(shared_archiverStats, 1, sizeof(*shared_archiverStats), fpin) != sizeof(*shared_archiverStats))
     {
         ereport(pgStatRunningInCollector ? LOG : WARNING,
                 (errmsg("corrupted statistics file \"%s\"", statfile)));
-        memset(&archiverStats, 0, sizeof(archiverStats));
+        memset(shared_archiverStats, 0, sizeof(*shared_archiverStats));
         goto done;
     }
 
@@ -5039,12 +5097,12 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
                 /*
                  * Add to the DB hash
                  */
-                dbentry = (PgStat_StatDBEntry *) hash_search(dbhash,
-                                                             (void *) &dbbuf.databaseid,
-                                                             HASH_ENTER,
-                                                             &found);
+                dbentry = (PgStat_StatDBEntry *)
+                    dshash_find_or_insert(db_stats, (void *) &dbbuf.databaseid,
+                                          &found);
                 if (found)
                 {
+                    dshash_release_lock(db_stats, dbentry);
                     ereport(pgStatRunningInCollector ? LOG : WARNING,
                             (errmsg("corrupted statistics file \"%s\"",
                                     statfile)));
@@ -5052,8 +5110,8 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
                 }
 
                 memcpy(dbentry, &dbbuf, sizeof(PgStat_StatDBEntry));
-                dbentry->tables = NULL;
-                dbentry->functions = NULL;
+                dbentry->tables = DSM_HANDLE_INVALID;
+                dbentry->functions = DSM_HANDLE_INVALID;
 
                 /*
                  * In the collector, disregard the timestamp we read from the
@@ -5061,47 +5119,23 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
                  * stats file immediately upon the first request from any
                  * backend.
                  */
-                if (pgStatRunningInCollector)
-                    dbentry->stats_timestamp = 0;
-
-                /*
-                 * Don't create tables/functions hashtables for uninteresting
-                 * databases.
-                 */
-                if (onlydb != InvalidOid)
-                {
-                    if (dbbuf.databaseid != onlydb &&
-                        dbbuf.databaseid != InvalidOid)
-                        break;
-                }
-
-                memset(&hash_ctl, 0, sizeof(hash_ctl));
-                hash_ctl.keysize = sizeof(Oid);
-                hash_ctl.entrysize = sizeof(PgStat_StatTabEntry);
-                hash_ctl.hcxt = pgStatLocalContext;
-                dbentry->tables = hash_create("Per-database table",
-                                              PGSTAT_TAB_HASH_SIZE,
-                                              &hash_ctl,
-                                              HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
-
-                hash_ctl.keysize = sizeof(Oid);
-                hash_ctl.entrysize = sizeof(PgStat_StatFuncEntry);
-                hash_ctl.hcxt = pgStatLocalContext;
-                dbentry->functions = hash_create("Per-database function",
-                                                 PGSTAT_FUNCTION_HASH_SIZE,
-                                                 &hash_ctl,
-                                                 HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+                Assert(pgStatRunningInCollector);
+                dbentry->stats_timestamp = 0;
 
                 /*
                  * If requested, read the data from the database-specific
                  * file.  Otherwise we just leave the hashtables empty.
                  */
-                if (deep)
-                    pgstat_read_db_statsfile(dbentry->databaseid,
-                                             dbentry->tables,
-                                             dbentry->functions,
-                                             permanent);
-
+                tblstats = dshash_create(area, &dsh_tblparams, 0);
+                dbentry->tables = dshash_get_hash_table_handle(tblstats);
+                funcstats = dshash_create(area, &dsh_funcparams, 0);
+                dbentry->functions =
+                    dshash_get_hash_table_handle(funcstats);
+                dshash_release_lock(db_stats, dbentry);
+                pgstat_read_db_statsfile(dbentry->databaseid,
+                                         tblstats, funcstats);
+                dshash_detach(tblstats);
+                dshash_detach(funcstats);
                 break;
 
             case 'E':
@@ -5118,34 +5152,47 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
 done:
     FreeFile(fpin);
 
-    /* If requested to read the permanent file, also get rid of it. */
-    if (permanent)
-    {
-        elog(DEBUG2, "removing permanent stats file \"%s\"", statfile);
-        unlink(statfile);
-    }
+    elog(DEBUG2, "removing permanent stats file \"%s\"", statfile);
+    unlink(statfile);
 
-    return dbhash;
+    return;
 }
 
 
+Size
+StatsShmemSize(void)
+{
+    return sizeof(dsa_handle);
+}
+
+void
+StatsShmemInit(void)
+{
+    bool    found;
+
+    StatsShmem = (StatsShmemStruct *)
+        ShmemInitStruct("Stats area", StatsShmemSize(),
+                        &found);
+    if (!IsUnderPostmaster)
+    {
+        Assert(!found);
+
+        StatsShmem->stats_dsa_handle = DSM_HANDLE_INVALID;
+    }
+    else
+        Assert(found);
+}
+
 /* ----------
  * pgstat_read_db_statsfile() -
  *
- *    Reads in the existing statistics collector file for the given database,
- *    filling the passed-in tables and functions hash tables.
- *
- *    As in pgstat_read_statsfiles, if the permanent file is requested, it is
- *    removed after reading.
- *
- *    Note: this code has the ability to skip storing per-table or per-function
- *    data, if NULL is passed for the corresponding hashtable.  That's not used
- *    at the moment though.
+ *    Reads in the permanent statistics collector file and creates shared
+ *    statistics tables. The file is removed after reading.
  * ----------
  */
 static void
-pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash,
-                         bool permanent)
+pgstat_read_db_statsfile(Oid databaseid,
+                         dshash_table *tabhash, dshash_table *funchash)
 {
     PgStat_StatTabEntry *tabentry;
     PgStat_StatTabEntry tabbuf;
@@ -5156,7 +5203,8 @@ pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash,
     bool        found;
     char        statfile[MAXPGPATH];
 
-    get_dbstat_filename(permanent, false, databaseid, statfile, MAXPGPATH);
+    Assert(pgStatRunningInCollector);
+    get_dbstat_filename(false, databaseid, statfile, MAXPGPATH);
 
     /*
      * Try to open the stats file. If it doesn't exist, the backends simply
@@ -5215,12 +5263,13 @@ pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash,
                 if (tabhash == NULL)
                     break;
 
-                tabentry = (PgStat_StatTabEntry *) hash_search(tabhash,
-                                                               (void *) &tabbuf.tableid,
-                                                               HASH_ENTER, &found);
+                tabentry = (PgStat_StatTabEntry *)
+                    dshash_find_or_insert(tabhash,
+                                          (void *) &tabbuf.tableid, &found);
 
                 if (found)
                 {
+                    dshash_release_lock(tabhash, tabentry);
                     ereport(pgStatRunningInCollector ? LOG : WARNING,
                             (errmsg("corrupted statistics file \"%s\"",
                                     statfile)));
@@ -5228,6 +5277,7 @@ pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash,
                 }
 
                 memcpy(tabentry, &tabbuf, sizeof(tabbuf));
+                dshash_release_lock(tabhash, tabentry);
                 break;
 
                 /*
@@ -5249,9 +5299,9 @@ pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash,
                 if (funchash == NULL)
                     break;
 
-                funcentry = (PgStat_StatFuncEntry *) hash_search(funchash,
-                                                                 (void *) &funcbuf.functionid,
-                                                                 HASH_ENTER, &found);
+                funcentry = (PgStat_StatFuncEntry *)
+                    dshash_find_or_insert(funchash,
+                                          (void *) &funcbuf.functionid, &found);
 
                 if (found)
                 {
@@ -5262,6 +5312,7 @@ pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash,
                 }
 
                 memcpy(funcentry, &funcbuf, sizeof(funcbuf));
+                dshash_release_lock(funchash, funcentry);
                 break;
 
                 /*
@@ -5281,142 +5332,50 @@ pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash,
 done:
     FreeFile(fpin);
 
-    if (permanent)
-    {
-        elog(DEBUG2, "removing permanent stats file \"%s\"", statfile);
-        unlink(statfile);
-    }
+    elog(DEBUG2, "removing permanent stats file \"%s\"", statfile);
+    unlink(statfile);
 }
 
 /* ----------
- * pgstat_read_db_statsfile_timestamp() -
- *
- *    Attempt to determine the timestamp of the last db statfile write.
- *    Returns true if successful; the timestamp is stored in *ts.
+ * backend_clean_snapshot_callback() -
  *
- *    This needs to be careful about handling databases for which no stats file
- *    exists, such as databases without a stat entry or those not yet written:
- *
- *    - if there's a database entry in the global file, return the corresponding
- *    stats_timestamp value.
- *
- *    - if there's no db stat entry (e.g. for a new or inactive database),
- *    there's no stats_timestamp value, but also nothing to write so we return
- *    the timestamp of the global statfile.
+ *    This is usually called with arg = NULL when the memory context where the
+ *  current snapshot has been taken is reset. Don't bother to release memory
+ *  in that case.
  * ----------
  */
-static bool
-pgstat_read_db_statsfile_timestamp(Oid databaseid, bool permanent,
-                                   TimestampTz *ts)
+static void
+backend_clean_snapshot_callback(void *arg)
 {
-    PgStat_StatDBEntry dbentry;
-    PgStat_GlobalStats myGlobalStats;
-    PgStat_ArchiverStats myArchiverStats;
-    FILE       *fpin;
-    int32        format_id;
-    const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename;
-
-    /*
-     * Try to open the stats file.  As above, anything but ENOENT is worthy of
-     * complaining about.
-     */
-    if ((fpin = AllocateFile(statfile, PG_BINARY_R)) == NULL)
-    {
-        if (errno != ENOENT)
-            ereport(pgStatRunningInCollector ? LOG : WARNING,
-                    (errcode_for_file_access(),
-                     errmsg("could not open statistics file \"%s\": %m",
-                            statfile)));
-        return false;
-    }
-
-    /*
-     * Verify it's of the expected format.
-     */
-    if (fread(&format_id, 1, sizeof(format_id), fpin) != sizeof(format_id) ||
-        format_id != PGSTAT_FILE_FORMAT_ID)
+    if (arg != NULL)
     {
-        ereport(pgStatRunningInCollector ? LOG : WARNING,
-                (errmsg("corrupted statistics file \"%s\"", statfile)));
-        FreeFile(fpin);
-        return false;
-    }
+        /* explicitly called, so explicitly free resources */
+        if (snapshot_globalStats)
+            pfree(snapshot_globalStats);
 
-    /*
-     * Read global stats struct
-     */
-    if (fread(&myGlobalStats, 1, sizeof(myGlobalStats),
-              fpin) != sizeof(myGlobalStats))
-    {
-        ereport(pgStatRunningInCollector ? LOG : WARNING,
-                (errmsg("corrupted statistics file \"%s\"", statfile)));
-        FreeFile(fpin);
-        return false;
-    }
+        if (snapshot_archiverStats)
+            pfree(snapshot_archiverStats);
 
-    /*
-     * Read archiver stats struct
-     */
-    if (fread(&myArchiverStats, 1, sizeof(myArchiverStats),
-              fpin) != sizeof(myArchiverStats))
-    {
-        ereport(pgStatRunningInCollector ? LOG : WARNING,
-                (errmsg("corrupted statistics file \"%s\"", statfile)));
-        FreeFile(fpin);
-        return false;
-    }
-
-    /* By default, we're going to return the timestamp of the global file. */
-    *ts = myGlobalStats.stats_timestamp;
-
-    /*
-     * We found an existing collector stats file.  Read it and look for a
-     * record for the requested database.  If found, use its timestamp.
-     */
-    for (;;)
-    {
-        switch (fgetc(fpin))
+        if (local_db_stats)
         {
-                /*
-                 * 'D'    A PgStat_StatDBEntry struct describing a database
-                 * follows.
-                 */
-            case 'D':
-                if (fread(&dbentry, 1, offsetof(PgStat_StatDBEntry, tables),
-                          fpin) != offsetof(PgStat_StatDBEntry, tables))
-                {
-                    ereport(pgStatRunningInCollector ? LOG : WARNING,
-                            (errmsg("corrupted statistics file \"%s\"",
-                                    statfile)));
-                    goto done;
-                }
+            dshash_seq_status seq;
+            PgStat_StatDBEntry *dbent;
 
-                /*
-                 * If this is the DB we're looking for, save its timestamp and
-                 * we're done.
-                 */
-                if (dbentry.databaseid == databaseid)
-                {
-                    *ts = dbentry.stats_timestamp;
-                    goto done;
-                }
-
-                break;
-
-            case 'E':
-                goto done;
-
-            default:
-                ereport(pgStatRunningInCollector ? LOG : WARNING,
-                        (errmsg("corrupted statistics file \"%s\"",
-                                statfile)));
-                goto done;
+            dshash_seq_init(&seq, local_db_stats);
+            while ((dbent = dshash_seq_next(&seq)) != NULL)
+            {
+                if (dbent->snapshot_tables)
+                    dshash_detach(dbent->snapshot_tables);
+                if (dbent->snapshot_functions)
+                    dshash_detach(dbent->snapshot_functions);
+            }
+            dshash_destroy(local_db_stats);
         }
     }
 
-done:
-    FreeFile(fpin);
-    return true;
+    snapshot_globalStats = NULL;
+    snapshot_archiverStats = NULL;
+    local_db_stats = NULL;
 }
 
 /*
@@ -5424,131 +5383,77 @@ done:
  * some hash tables.  The results will be kept until pgstat_clear_snapshot()
  * is called (typically, at end of transaction).
  */
-static void
-backend_read_statsfile(void)
+static bool
+backend_take_stats_snapshot(void)
 {
-    TimestampTz min_ts = 0;
-    TimestampTz ref_ts = 0;
-    Oid            inquiry_db;
-    int            count;
+    PgStat_StatDBEntry  *dbent;
+    dsa_area            *new_area;
+    dshash_seq_status seq;
+    MemoryContext oldcontext;
+    MemoryContextCallback *mcxt_cb;
 
-    /* already read it? */
-    if (pgStatDBHash)
-        return;
     Assert(!pgStatRunningInCollector);
 
-    /*
-     * In a normal backend, we check staleness of the data for our own DB, and
-     * so we send MyDatabaseId in inquiry messages.  In the autovac launcher,
-     * check staleness of the shared-catalog data, and send InvalidOid in
-     * inquiry messages so as not to force writing unnecessary data.
-     */
-    if (IsAutoVacuumLauncherProcess())
-        inquiry_db = InvalidOid;
-    else
-        inquiry_db = MyDatabaseId;
-
-    /*
-     * Loop until fresh enough stats file is available or we ran out of time.
-     * The stats inquiry message is sent repeatedly in case collector drops
-     * it; but not every single time, as that just swamps the collector.
-     */
-    for (count = 0; count < PGSTAT_POLL_LOOP_COUNT; count++)
+    oldcontext = MemoryContextSwitchTo(TopMemoryContext);
+    if (!pgstat_attach_shared_stats())
     {
-        bool        ok;
-        TimestampTz file_ts = 0;
-        TimestampTz cur_ts;
-
-        CHECK_FOR_INTERRUPTS();
-
-        ok = pgstat_read_db_statsfile_timestamp(inquiry_db, false, &file_ts);
+        MemoryContextSwitchTo(oldcontext);
+        return false;
+    }
+    MemoryContextSwitchTo(oldcontext);
 
-        cur_ts = GetCurrentTimestamp();
-        /* Calculate min acceptable timestamp, if we didn't already */
-        if (count == 0 || cur_ts < ref_ts)
-        {
-            /*
-             * We set the minimum acceptable timestamp to PGSTAT_STAT_INTERVAL
-             * msec before now.  This indirectly ensures that the collector
-             * needn't write the file more often than PGSTAT_STAT_INTERVAL. In
-             * an autovacuum worker, however, we want a lower delay to avoid
-             * using stale data, so we use PGSTAT_RETRY_DELAY (since the
-             * number of workers is low, this shouldn't be a problem).
-             *
-             * We don't recompute min_ts after sleeping, except in the
-             * unlikely case that cur_ts went backwards.  So we might end up
-             * accepting a file a bit older than PGSTAT_STAT_INTERVAL.  In
-             * practice that shouldn't happen, though, as long as the sleep
-             * time is less than PGSTAT_STAT_INTERVAL; and we don't want to
-             * tell the collector that our cutoff time is less than what we'd
-             * actually accept.
-             */
-            ref_ts = cur_ts;
-            if (IsAutoVacuumWorkerProcess())
-                min_ts = TimestampTzPlusMilliseconds(ref_ts,
-                                                     -PGSTAT_RETRY_DELAY);
-            else
-                min_ts = TimestampTzPlusMilliseconds(ref_ts,
-                                                     -PGSTAT_STAT_INTERVAL);
-        }
+    if (snapshot_globalStats)
+        return true;
 
-        /*
-         * If the file timestamp is actually newer than cur_ts, we must have
-         * had a clock glitch (system time went backwards) or there is clock
-         * skew between our processor and the stats collector's processor.
-         * Accept the file, but send an inquiry message anyway to make
-         * pgstat_recv_inquiry do a sanity check on the collector's time.
-         */
-        if (ok && file_ts > cur_ts)
-        {
-            /*
-             * A small amount of clock skew between processors isn't terribly
-             * surprising, but a large difference is worth logging.  We
-             * arbitrarily define "large" as 1000 msec.
-             */
-            if (file_ts >= TimestampTzPlusMilliseconds(cur_ts, 1000))
-            {
-                char       *filetime;
-                char       *mytime;
-
-                /* Copy because timestamptz_to_str returns a static buffer */
-                filetime = pstrdup(timestamptz_to_str(file_ts));
-                mytime = pstrdup(timestamptz_to_str(cur_ts));
-                elog(LOG, "stats collector's time %s is later than backend local time %s",
-                     filetime, mytime);
-                pfree(filetime);
-                pfree(mytime);
-            }
+    Assert(snapshot_archiverStats == NULL);
+    Assert(local_db_stats == NULL);
 
-            pgstat_send_inquiry(cur_ts, min_ts, inquiry_db);
-            break;
-        }
+    /*
+     * The snapshot lives for the duration of the current transaction if any,
+     * or for the current memory context's lifetime otherwise.
+     */
+    if (IsTransactionState())
+        MemoryContextSwitchTo(TopTransactionContext);
 
-        /* Normal acceptance case: file is not older than cutoff time */
-        if (ok && file_ts >= min_ts)
-            break;
+    /* global stats can be just copied  */
+    snapshot_globalStats = palloc(sizeof(PgStat_GlobalStats));
+    memcpy(snapshot_globalStats, shared_globalStats,
+           sizeof(PgStat_GlobalStats));
 
-        /* Not there or too old, so kick the collector and wait a bit */
-        if ((count % PGSTAT_INQ_LOOP_COUNT) == 0)
-            pgstat_send_inquiry(cur_ts, min_ts, inquiry_db);
+    snapshot_archiverStats = palloc(sizeof(PgStat_ArchiverStats));
+    memcpy(snapshot_archiverStats, shared_archiverStats,
+           sizeof(PgStat_ArchiverStats));
 
-        pg_usleep(PGSTAT_RETRY_DELAY * 1000L);
+    /*
+     * Take a local snapshot of every dshash. It's OK if the snapshots are
+     * not strictly consistent with each other.
+     */
+    new_area = dsa_take_snapshot(area);
+    local_db_stats = dshash_take_snapshot(db_stats, new_area);
+    dshash_seq_init(&seq, local_db_stats);
+    while ((dbent = (PgStat_StatDBEntry *) dshash_seq_next(&seq)) != NULL)
+    {
+        dshash_table *t;
+
+        t = dshash_attach(area, &dsh_tblparams, dbent->tables, 0);
+        dbent->snapshot_tables = dshash_take_snapshot(t, new_area);
+        dshash_detach(t);
+        t = dshash_attach(area, &dsh_funcparams, dbent->functions, 0);
+        dbent->snapshot_functions = dshash_take_snapshot(t, new_area);
+        dshash_detach(t);
     }
 
-    if (count >= PGSTAT_POLL_LOOP_COUNT)
-        ereport(LOG,
-                (errmsg("using stale statistics instead of current ones "
-                        "because stats collector is not responding")));
+    /* set the timestamp of taking this snapshot */
+    snapshot_globalStats->stats_timestamp = GetCurrentTimestamp();
 
-    /*
-     * Autovacuum launcher wants stats about all databases, but a shallow read
-     * is sufficient.  Regular backends want a deep read for just the tables
-     * they can see (MyDatabaseId + shared catalogs).
-     */
-    if (IsAutoVacuumLauncherProcess())
-        pgStatDBHash = pgstat_read_statsfiles(InvalidOid, false, false);
-    else
-        pgStatDBHash = pgstat_read_statsfiles(MyDatabaseId, false, true);
+    /* register callback to clear snapshot */
+    mcxt_cb = (MemoryContextCallback *)palloc(sizeof(MemoryContextCallback));
+    mcxt_cb->func = backend_clean_snapshot_callback;
+    mcxt_cb->arg = NULL;
+    MemoryContextRegisterResetCallback(CurrentMemoryContext, mcxt_cb);
+    MemoryContextSwitchTo(oldcontext);
+
+    return true;
 }
 
 
@@ -5581,6 +5486,8 @@ pgstat_setup_memcxt(void)
 void
 pgstat_clear_snapshot(void)
 {
+    int param = 0;    /* only the address is significant */
+
     /* Release memory, if any was allocated */
     if (pgStatLocalContext)
         MemoryContextDelete(pgStatLocalContext);
@@ -5590,99 +5497,12 @@ pgstat_clear_snapshot(void)
     pgStatDBHash = NULL;
     localBackendStatusTable = NULL;
     localNumBackends = 0;
-}
-
-
-/* ----------
- * pgstat_recv_inquiry() -
- *
- *    Process stat inquiry requests.
- * ----------
- */
-static void
-pgstat_recv_inquiry(PgStat_MsgInquiry *msg, int len)
-{
-    PgStat_StatDBEntry *dbentry;
-
-    elog(DEBUG2, "received inquiry for database %u", msg->databaseid);
-
-    /*
-     * If there's already a write request for this DB, there's nothing to do.
-     *
-     * Note that if a request is found, we return early and skip the below
-     * check for clock skew.  This is okay, since the only way for a DB
-     * request to be present in the list is that we have been here since the
-     * last write round.  It seems sufficient to check for clock skew once per
-     * write round.
-     */
-    if (list_member_oid(pending_write_requests, msg->databaseid))
-        return;
-
-    /*
-     * Check to see if we last wrote this database at a time >= the requested
-     * cutoff time.  If so, this is a stale request that was generated before
-     * we updated the DB file, and we don't need to do so again.
-     *
-     * If the requestor's local clock time is older than stats_timestamp, we
-     * should suspect a clock glitch, ie system time going backwards; though
-     * the more likely explanation is just delayed message receipt.  It is
-     * worth expending a GetCurrentTimestamp call to be sure, since a large
-     * retreat in the system clock reading could otherwise cause us to neglect
-     * to update the stats file for a long time.
-     */
-    dbentry = pgstat_get_db_entry(msg->databaseid, false);
-    if (dbentry == NULL)
-    {
-        /*
-         * We have no data for this DB.  Enter a write request anyway so that
-         * the global stats will get updated.  This is needed to prevent
-         * backend_read_statsfile from waiting for data that we cannot supply,
-         * in the case of a new DB that nobody has yet reported any stats for.
-         * See the behavior of pgstat_read_db_statsfile_timestamp.
-         */
-    }
-    else if (msg->clock_time < dbentry->stats_timestamp)
-    {
-        TimestampTz cur_ts = GetCurrentTimestamp();
-
-        if (cur_ts < dbentry->stats_timestamp)
-        {
-            /*
-             * Sure enough, time went backwards.  Force a new stats file write
-             * to get back in sync; but first, log a complaint.
-             */
-            char       *writetime;
-            char       *mytime;
-
-            /* Copy because timestamptz_to_str returns a static buffer */
-            writetime = pstrdup(timestamptz_to_str(dbentry->stats_timestamp));
-            mytime = pstrdup(timestamptz_to_str(cur_ts));
-            elog(LOG,
-                 "stats_timestamp %s is later than collector's time %s for database %u",
-                 writetime, mytime, dbentry->databaseid);
-            pfree(writetime);
-            pfree(mytime);
-        }
-        else
-        {
-            /*
-             * Nope, it's just an old request.  Assuming msg's clock_time is
-             * >= its cutoff_time, it must be stale, so we can ignore it.
-             */
-            return;
-        }
-    }
-    else if (msg->cutoff_time <= dbentry->stats_timestamp)
-    {
-        /* Stale request, ignore it */
-        return;
-    }
 
     /*
-     * We need to write this DB, so create a request.
+     * The parameter informs the function that it is not called from a
+     * MemoryContextCallback.
      */
-    pending_write_requests = lappend_oid(pending_write_requests,
-                                         msg->databaseid);
+    backend_clean_snapshot_callback(&param);
 }
 
 
@@ -5695,6 +5515,7 @@ pgstat_recv_inquiry(PgStat_MsgInquiry *msg, int len)
 static void
 pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len)
 {
+    dshash_table *tabhash;
     PgStat_StatDBEntry *dbentry;
     PgStat_StatTabEntry *tabentry;
     int            i;
@@ -5710,6 +5531,7 @@ pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len)
     dbentry->n_block_read_time += msg->m_block_read_time;
     dbentry->n_block_write_time += msg->m_block_write_time;
 
+    tabhash = dshash_attach(area, &dsh_tblparams, dbentry->tables, 0);
     /*
      * Process all table entries in the message.
      */
@@ -5717,9 +5539,8 @@ pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len)
     {
         PgStat_TableEntry *tabmsg = &(msg->m_entry[i]);
 
-        tabentry = (PgStat_StatTabEntry *) hash_search(dbentry->tables,
-                                                       (void *) &(tabmsg->t_id),
-                                                       HASH_ENTER, &found);
+        tabentry = (PgStat_StatTabEntry *)
+            dshash_find_or_insert(tabhash, (void *) &(tabmsg->t_id), &found);
 
         if (!found)
         {
@@ -5778,6 +5599,7 @@ pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len)
         tabentry->n_live_tuples = Max(tabentry->n_live_tuples, 0);
         /* Likewise for n_dead_tuples */
         tabentry->n_dead_tuples = Max(tabentry->n_dead_tuples, 0);
+        dshash_release_lock(tabhash, tabentry);
 
         /*
          * Add per-table stats to the per-database entry, too.
@@ -5790,6 +5612,8 @@ pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len)
         dbentry->n_blocks_fetched += tabmsg->t_counts.t_blocks_fetched;
         dbentry->n_blocks_hit += tabmsg->t_counts.t_blocks_hit;
     }
+
+    dshash_release_lock(db_stats, dbentry);
 }
 
 
@@ -5802,27 +5626,33 @@ pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len)
 static void
 pgstat_recv_tabpurge(PgStat_MsgTabpurge *msg, int len)
 {
+    dshash_table *tbl;
     PgStat_StatDBEntry *dbentry;
     int            i;
 
     dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
-
     /*
      * No need to purge if we don't even know the database.
      */
-    if (!dbentry || !dbentry->tables)
+    if (!dbentry || dbentry->tables == DSM_HANDLE_INVALID)
+    {
+        if (dbentry)
+            dshash_release_lock(db_stats, dbentry);
         return;
+    }
 
+    tbl = dshash_attach(area, &dsh_tblparams, dbentry->tables, 0);
     /*
      * Process all table entries in the message.
      */
     for (i = 0; i < msg->m_nentries; i++)
     {
         /* Remove from hashtable if present; we don't care if it's not. */
-        (void) hash_search(dbentry->tables,
-                           (void *) &(msg->m_tableid[i]),
-                           HASH_REMOVE, NULL);
+        (void) dshash_delete_key(tbl, (void *) &(msg->m_tableid[i]));
     }
+
+    dshash_release_lock(db_stats, dbentry);
+
 }
 
 
@@ -5848,23 +5678,20 @@ pgstat_recv_dropdb(PgStat_MsgDropdb *msg, int len)
      */
     if (dbentry)
     {
-        char        statfile[MAXPGPATH];
-
-        get_dbstat_filename(false, false, dbid, statfile, MAXPGPATH);
-
-        elog(DEBUG2, "removing stats file \"%s\"", statfile);
-        unlink(statfile);
-
-        if (dbentry->tables != NULL)
-            hash_destroy(dbentry->tables);
-        if (dbentry->functions != NULL)
-            hash_destroy(dbentry->functions);
+        if (dbentry->tables != DSM_HANDLE_INVALID)
+        {
+            dshash_table *tbl =
+                dshash_attach(area, &dsh_tblparams, dbentry->tables, 0);
+            dshash_destroy(tbl);
+        }
+        if (dbentry->functions != DSM_HANDLE_INVALID)
+        {
+            dshash_table *tbl =
+                dshash_attach(area, &dsh_funcparams, dbentry->functions, 0);
+            dshash_destroy(tbl);
+        }
 
-        if (hash_search(pgStatDBHash,
-                        (void *) &dbid,
-                        HASH_REMOVE, NULL) == NULL)
-            ereport(ERROR,
-                    (errmsg("database hash table corrupted during cleanup --- abort")));
+        dshash_delete_entry(db_stats, (void *)dbentry);
     }
 }
 
@@ -5892,19 +5719,28 @@ pgstat_recv_resetcounter(PgStat_MsgResetcounter *msg, int len)
      * We simply throw away all the database's table entries by recreating a
      * new hash table for them.
      */
-    if (dbentry->tables != NULL)
-        hash_destroy(dbentry->tables);
-    if (dbentry->functions != NULL)
-        hash_destroy(dbentry->functions);
-
-    dbentry->tables = NULL;
-    dbentry->functions = NULL;
+    if (dbentry->tables != DSM_HANDLE_INVALID)
+    {
+        dshash_table *t =
+            dshash_attach(area, &dsh_tblparams, dbentry->tables, 0);
+        dshash_destroy(t);
+        dbentry->tables = DSM_HANDLE_INVALID;
+    }
+    if (dbentry->functions != DSM_HANDLE_INVALID)
+    {
+        dshash_table *t =
+            dshash_attach(area, &dsh_funcparams, dbentry->functions, 0);
+        dshash_destroy(t);
+        dbentry->functions = DSM_HANDLE_INVALID;
+    }
 
     /*
      * Reset database-level stats, too.  This creates empty hash tables for
      * tables and functions.
      */
     reset_dbentry_counters(dbentry);
+
+    dshash_release_lock(db_stats, dbentry);
 }
 
 /* ----------
@@ -5919,14 +5755,14 @@ pgstat_recv_resetsharedcounter(PgStat_MsgResetsharedcounter *msg, int len)
     if (msg->m_resettarget == RESET_BGWRITER)
     {
         /* Reset the global background writer statistics for the cluster. */
-        memset(&globalStats, 0, sizeof(globalStats));
-        globalStats.stat_reset_timestamp = GetCurrentTimestamp();
+        memset(shared_globalStats, 0, sizeof(*shared_globalStats));
+        shared_globalStats->stat_reset_timestamp = GetCurrentTimestamp();
     }
     else if (msg->m_resettarget == RESET_ARCHIVER)
     {
         /* Reset the archiver statistics for the cluster. */
-        memset(&archiverStats, 0, sizeof(archiverStats));
-        archiverStats.stat_reset_timestamp = GetCurrentTimestamp();
+        memset(shared_archiverStats, 0, sizeof(*shared_archiverStats));
+        shared_archiverStats->stat_reset_timestamp = GetCurrentTimestamp();
     }
 
     /*
@@ -5956,11 +5792,19 @@ pgstat_recv_resetsinglecounter(PgStat_MsgResetsinglecounter *msg, int len)
 
     /* Remove object if it exists, ignore it if not */
     if (msg->m_resettype == RESET_TABLE)
-        (void) hash_search(dbentry->tables, (void *) &(msg->m_objectid),
-                           HASH_REMOVE, NULL);
+    {
+        dshash_table *t =
+            dshash_attach(area, &dsh_tblparams, dbentry->tables, 0);
+        dshash_delete_key(t, (void *) &(msg->m_objectid));
+    }
     else if (msg->m_resettype == RESET_FUNCTION)
-        (void) hash_search(dbentry->functions, (void *) &(msg->m_objectid),
-                           HASH_REMOVE, NULL);
+    {
+        dshash_table *t =
+            dshash_attach(area, &dsh_funcparams, dbentry->functions, 0);
+        dshash_delete_key(t, (void *) &(msg->m_objectid));
+    }
+
+    dshash_release_lock(db_stats, dbentry);
 }
 
 /* ----------
@@ -5980,6 +5824,8 @@ pgstat_recv_autovac(PgStat_MsgAutovacStart *msg, int len)
     dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
 
     dbentry->last_autovac_time = msg->m_start_time;
+
+    dshash_release_lock(db_stats, dbentry);
 }
 
 /* ----------
@@ -5993,13 +5839,13 @@ pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len)
 {
     PgStat_StatDBEntry *dbentry;
     PgStat_StatTabEntry *tabentry;
-
+    dshash_table *table;
     /*
      * Store the data in the table's hashtable entry.
      */
     dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
-
-    tabentry = pgstat_get_tab_entry(dbentry, msg->m_tableoid, true);
+    table = dshash_attach(area, &dsh_tblparams, dbentry->tables, 0);
+    tabentry = pgstat_get_tab_entry(table, msg->m_tableoid, true);
 
     tabentry->n_live_tuples = msg->m_live_tuples;
     tabentry->n_dead_tuples = msg->m_dead_tuples;
@@ -6014,6 +5860,9 @@ pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len)
         tabentry->vacuum_timestamp = msg->m_vacuumtime;
         tabentry->vacuum_count++;
     }
+    dshash_release_lock(table, tabentry);
+    dshash_detach(table);
+    dshash_release_lock(db_stats, dbentry);
 }
 
 /* ----------
@@ -6027,13 +5876,15 @@ pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len)
 {
     PgStat_StatDBEntry *dbentry;
     PgStat_StatTabEntry *tabentry;
+    dshash_table *table;
 
     /*
      * Store the data in the table's hashtable entry.
      */
     dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
 
-    tabentry = pgstat_get_tab_entry(dbentry, msg->m_tableoid, true);
+    table = dshash_attach(area, &dsh_tblparams, dbentry->tables, 0);
+    tabentry = pgstat_get_tab_entry(table, msg->m_tableoid, true);
 
     tabentry->n_live_tuples = msg->m_live_tuples;
     tabentry->n_dead_tuples = msg->m_dead_tuples;
@@ -6056,6 +5907,9 @@ pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len)
         tabentry->analyze_timestamp = msg->m_analyzetime;
         tabentry->analyze_count++;
     }
+    dshash_release_lock(table, tabentry);
+    dshash_detach(table);
+    dshash_release_lock(db_stats, dbentry);
 }
 
 
@@ -6071,18 +5925,18 @@ pgstat_recv_archiver(PgStat_MsgArchiver *msg, int len)
     if (msg->m_failed)
     {
         /* Failed archival attempt */
-        ++archiverStats.failed_count;
-        memcpy(archiverStats.last_failed_wal, msg->m_xlog,
-               sizeof(archiverStats.last_failed_wal));
-        archiverStats.last_failed_timestamp = msg->m_timestamp;
+        ++shared_archiverStats->failed_count;
+        memcpy(shared_archiverStats->last_failed_wal, msg->m_xlog,
+               sizeof(shared_archiverStats->last_failed_wal));
+        shared_archiverStats->last_failed_timestamp = msg->m_timestamp;
     }
     else
     {
         /* Successful archival operation */
-        ++archiverStats.archived_count;
-        memcpy(archiverStats.last_archived_wal, msg->m_xlog,
-               sizeof(archiverStats.last_archived_wal));
-        archiverStats.last_archived_timestamp = msg->m_timestamp;
+        ++shared_archiverStats->archived_count;
+        memcpy(shared_archiverStats->last_archived_wal, msg->m_xlog,
+               sizeof(shared_archiverStats->last_archived_wal));
+        shared_archiverStats->last_archived_timestamp = msg->m_timestamp;
     }
 }
 
@@ -6095,16 +5949,16 @@ pgstat_recv_archiver(PgStat_MsgArchiver *msg, int len)
 static void
 pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len)
 {
-    globalStats.timed_checkpoints += msg->m_timed_checkpoints;
-    globalStats.requested_checkpoints += msg->m_requested_checkpoints;
-    globalStats.checkpoint_write_time += msg->m_checkpoint_write_time;
-    globalStats.checkpoint_sync_time += msg->m_checkpoint_sync_time;
-    globalStats.buf_written_checkpoints += msg->m_buf_written_checkpoints;
-    globalStats.buf_written_clean += msg->m_buf_written_clean;
-    globalStats.maxwritten_clean += msg->m_maxwritten_clean;
-    globalStats.buf_written_backend += msg->m_buf_written_backend;
-    globalStats.buf_fsync_backend += msg->m_buf_fsync_backend;
-    globalStats.buf_alloc += msg->m_buf_alloc;
+    shared_globalStats->timed_checkpoints += msg->m_timed_checkpoints;
+    shared_globalStats->requested_checkpoints += msg->m_requested_checkpoints;
+    shared_globalStats->checkpoint_write_time += msg->m_checkpoint_write_time;
+    shared_globalStats->checkpoint_sync_time += msg->m_checkpoint_sync_time;
+    shared_globalStats->buf_written_checkpoints += msg->m_buf_written_checkpoints;
+    shared_globalStats->buf_written_clean += msg->m_buf_written_clean;
+    shared_globalStats->maxwritten_clean += msg->m_maxwritten_clean;
+    shared_globalStats->buf_written_backend += msg->m_buf_written_backend;
+    shared_globalStats->buf_fsync_backend += msg->m_buf_fsync_backend;
+    shared_globalStats->buf_alloc += msg->m_buf_alloc;
 }
 
 /* ----------
@@ -6145,6 +5999,8 @@ pgstat_recv_recoveryconflict(PgStat_MsgRecoveryConflict *msg, int len)
             dbentry->n_conflict_startup_deadlock++;
             break;
     }
+
+    dshash_release_lock(db_stats, dbentry);
 }
 
 /* ----------
@@ -6161,6 +6017,8 @@ pgstat_recv_deadlock(PgStat_MsgDeadlock *msg, int len)
     dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
 
     dbentry->n_deadlocks++;
+
+    dshash_release_lock(db_stats, dbentry);
 }
 
 /* ----------
@@ -6178,6 +6036,8 @@ pgstat_recv_tempfile(PgStat_MsgTempFile *msg, int len)
 
     dbentry->n_temp_bytes += msg->m_filesize;
     dbentry->n_temp_files += 1;
+
+    dshash_release_lock(db_stats, dbentry);
 }
 
 /* ----------
@@ -6189,6 +6049,7 @@ pgstat_recv_tempfile(PgStat_MsgTempFile *msg, int len)
 static void
 pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len)
 {
+    dshash_table *t;
     PgStat_FunctionEntry *funcmsg = &(msg->m_entry[0]);
     PgStat_StatDBEntry *dbentry;
     PgStat_StatFuncEntry *funcentry;
@@ -6197,14 +6058,14 @@ pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len)
 
     dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
 
+    t = dshash_attach(area, &dsh_funcparams, dbentry->functions, 0);
     /*
      * Process all function entries in the message.
      */
     for (i = 0; i < msg->m_nentries; i++, funcmsg++)
     {
-        funcentry = (PgStat_StatFuncEntry *) hash_search(dbentry->functions,
-                                                         (void *) &(funcmsg->f_id),
-                                                         HASH_ENTER, &found);
+        funcentry = (PgStat_StatFuncEntry *)
+            dshash_find_or_insert(t, (void *) &(funcmsg->f_id), &found);
 
         if (!found)
         {
@@ -6225,7 +6086,11 @@ pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len)
             funcentry->f_total_time += funcmsg->f_total_time;
             funcentry->f_self_time += funcmsg->f_self_time;
         }
+        dshash_release_lock(t, funcentry);
     }
+
+    dshash_detach(t);
+    dshash_release_lock(db_stats, dbentry);
 }
 
 /* ----------
@@ -6237,6 +6102,7 @@ pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len)
 static void
 pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len)
 {
+    dshash_table *t;
     PgStat_StatDBEntry *dbentry;
     int            i;
 
@@ -6245,60 +6111,20 @@ pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len)
     /*
      * No need to purge if we don't even know the database.
      */
-    if (!dbentry || !dbentry->functions)
+    if (!dbentry || dbentry->functions == DSM_HANDLE_INVALID)
         return;
 
+    t = dshash_attach(area, &dsh_funcparams, dbentry->functions, 0);
     /*
      * Process all function entries in the message.
      */
     for (i = 0; i < msg->m_nentries; i++)
     {
         /* Remove from hashtable if present; we don't care if it's not. */
-        (void) hash_search(dbentry->functions,
-                           (void *) &(msg->m_functionid[i]),
-                           HASH_REMOVE, NULL);
+        dshash_delete_key(t, (void *) &(msg->m_functionid[i]));
     }
-}
-
-/* ----------
- * pgstat_write_statsfile_needed() -
- *
- *    Do we need to write out any stats files?
- * ----------
- */
-static bool
-pgstat_write_statsfile_needed(void)
-{
-    if (pending_write_requests != NIL)
-        return true;
-
-    /* Everything was written recently */
-    return false;
-}
-
-/* ----------
- * pgstat_db_requested() -
- *
- *    Checks whether stats for a particular DB need to be written to a file.
- * ----------
- */
-static bool
-pgstat_db_requested(Oid databaseid)
-{
-    /*
-     * If any requests are outstanding at all, we should write the stats for
-     * shared catalogs (the "database" with OID 0).  This ensures that
-     * backends will see up-to-date stats for shared catalogs, even though
-     * they send inquiry messages mentioning only their own DB.
-     */
-    if (databaseid == InvalidOid && pending_write_requests != NIL)
-        return true;
-
-    /* Search to see if there's an open request to write this database. */
-    if (list_member_oid(pending_write_requests, databaseid))
-        return true;
-
-    return false;
+    dshash_detach(t);
+    dshash_release_lock(db_stats, dbentry);
 }
 
 /*
diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c
index cd7d391..c8a08b7 100644
--- a/src/backend/replication/basebackup.c
+++ b/src/backend/replication/basebackup.c
@@ -73,9 +73,6 @@ static void throttle(size_t increment);
 /* Was the backup currently in-progress initiated in recovery mode? */
 static bool backup_started_in_recovery = false;
 
-/* Relative path of temporary statistics directory */
-static char *statrelpath = NULL;
-
 /*
  * Size of each block sent into the tar stream for larger files.
  */
@@ -106,13 +103,6 @@ static TimestampTz throttled_last;
 static const char *excludeDirContents[] =
 {
     /*
-     * Skip temporary statistics files. PG_STAT_TMP_DIR must be skipped even
-     * when stats_temp_directory is set because PGSS_TEXT_FILE is always
-     * created there.
-     */
-    PG_STAT_TMP_DIR,
-
-    /*
      * It is generally not useful to backup the contents of this directory
      * even if the intention is to restore to another master. See backup.sgml
      * for a more detailed description.
@@ -196,11 +186,8 @@ perform_base_backup(basebackup_options *opt)
     TimeLineID    endtli;
     StringInfo    labelfile;
     StringInfo    tblspc_map_file = NULL;
-    int            datadirpathlen;
     List       *tablespaces = NIL;
 
-    datadirpathlen = strlen(DataDir);
-
     backup_started_in_recovery = RecoveryInProgress();
 
     labelfile = makeStringInfo();
@@ -225,18 +212,6 @@ perform_base_backup(basebackup_options *opt)
 
         SendXlogRecPtrResult(startptr, starttli);
 
-        /*
-         * Calculate the relative path of temporary statistics directory in
-         * order to skip the files which are located in that directory later.
-         */
-        if (is_absolute_path(pgstat_stat_directory) &&
-            strncmp(pgstat_stat_directory, DataDir, datadirpathlen) == 0)
-            statrelpath = psprintf("./%s", pgstat_stat_directory + datadirpathlen + 1);
-        else if (strncmp(pgstat_stat_directory, "./", 2) != 0)
-            statrelpath = psprintf("./%s", pgstat_stat_directory);
-        else
-            statrelpath = pgstat_stat_directory;
-
         /* Add a node for the base directory at the end */
         ti = palloc0(sizeof(tablespaceinfo));
         ti->size = opt->progress ? sendDir(".", 1, true, tablespaces, true) : -1;
@@ -1042,17 +1017,6 @@ sendDir(const char *path, int basepathlen, bool sizeonly, List *tablespaces,
             continue;
 
         /*
-         * Exclude contents of directory specified by statrelpath if not set
-         * to the default (pg_stat_tmp) which is caught in the loop above.
-         */
-        if (statrelpath != NULL && strcmp(pathbuf, statrelpath) == 0)
-        {
-            elog(DEBUG1, "contents of directory \"%s\" excluded from backup", statrelpath);
-            size += _tarWriteDir(pathbuf, basepathlen, &statbuf, sizeonly);
-            continue;
-        }
-
-        /*
          * We can skip pg_wal, the WAL segments need to be fetched from the
          * WAL archive anyway. But include it as an empty directory anyway, so
          * we get permissions right.
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index 2d1ed14..16270ff 100644
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -150,6 +150,7 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int port)
         size = add_size(size, SyncScanShmemSize());
         size = add_size(size, AsyncShmemSize());
         size = add_size(size, BackendRandomShmemSize());
+        size = add_size(size, StatsShmemSize());
 #ifdef EXEC_BACKEND
         size = add_size(size, ShmemBackendArraySize());
 #endif
@@ -270,6 +271,7 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int port)
     SyncScanShmemInit();
     AsyncShmemInit();
     BackendRandomShmemInit();
+    StatsShmemInit();
 
 #ifdef EXEC_BACKEND
 
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index 46f5c42..8ffab2f 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -518,6 +518,9 @@ RegisterLWLockTranches(void)
                           "session_typmod_table");
     LWLockRegisterTranche(LWTRANCHE_TBM, "tbm");
     LWLockRegisterTranche(LWTRANCHE_PARALLEL_APPEND, "parallel_append");
+    LWLockRegisterTranche(LWTRANCHE_STATS_DSA, "stats table dsa");
+    LWLockRegisterTranche(LWTRANCHE_STATS_DB, "db stats");
+    LWLockRegisterTranche(LWTRANCHE_STATS_FUNC_TABLE, "table/func stats");
 
     /* Register named tranches. */
     for (i = 0; i < NamedLWLockTrancheRequests; i++)
diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt
index e6025ec..798af9f 100644
--- a/src/backend/storage/lmgr/lwlocknames.txt
+++ b/src/backend/storage/lmgr/lwlocknames.txt
@@ -50,3 +50,4 @@ OldSnapshotTimeMapLock                42
 BackendRandomLock                    43
 LogicalRepWorkerLock                44
 CLogTruncationLock                    45
+StatsLock                            46
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 0f7a96d..e2a6fb2 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -185,7 +185,6 @@ static bool check_autovacuum_max_workers(int *newval, void **extra, GucSource so
 static bool check_autovacuum_work_mem(int *newval, void **extra, GucSource source);
 static bool check_effective_io_concurrency(int *newval, void **extra, GucSource source);
 static void assign_effective_io_concurrency(int newval, void *extra);
-static void assign_pgstat_temp_directory(const char *newval, void *extra);
 static bool check_application_name(char **newval, void **extra, GucSource source);
 static void assign_application_name(const char *newval, void *extra);
 static bool check_cluster_name(char **newval, void **extra, GucSource source);
@@ -3562,17 +3561,6 @@ static struct config_string ConfigureNamesString[] =
     },
 
     {
-        {"stats_temp_directory", PGC_SIGHUP, STATS_COLLECTOR,
-            gettext_noop("Writes temporary statistics files to the specified directory."),
-            NULL,
-            GUC_SUPERUSER_ONLY
-        },
-        &pgstat_temp_directory,
-        PG_STAT_TMP_DIR,
-        check_canonical_path, assign_pgstat_temp_directory, NULL
-    },
-
-    {
         {"synchronous_standby_names", PGC_SIGHUP, REPLICATION_MASTER,
             gettext_noop("Number of synchronous standbys and list of names of potential synchronous ones."),
             NULL,
@@ -10438,35 +10426,6 @@ assign_effective_io_concurrency(int newval, void *extra)
 #endif                            /* USE_PREFETCH */
 }
 
-static void
-assign_pgstat_temp_directory(const char *newval, void *extra)
-{
-    /* check_canonical_path already canonicalized newval for us */
-    char       *dname;
-    char       *tname;
-    char       *fname;
-
-    /* directory */
-    dname = guc_malloc(ERROR, strlen(newval) + 1);    /* runtime dir */
-    sprintf(dname, "%s", newval);
-
-    /* global stats */
-    tname = guc_malloc(ERROR, strlen(newval) + 12); /* /global.tmp */
-    sprintf(tname, "%s/global.tmp", newval);
-    fname = guc_malloc(ERROR, strlen(newval) + 13); /* /global.stat */
-    sprintf(fname, "%s/global.stat", newval);
-
-    if (pgstat_stat_directory)
-        free(pgstat_stat_directory);
-    pgstat_stat_directory = dname;
-    if (pgstat_stat_tmpname)
-        free(pgstat_stat_tmpname);
-    pgstat_stat_tmpname = tname;
-    if (pgstat_stat_filename)
-        free(pgstat_stat_filename);
-    pgstat_stat_filename = fname;
-}
-
 static bool
 check_application_name(char **newval, void **extra, GucSource source)
 {
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 842cf36..529c093 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -492,7 +492,6 @@
 #track_io_timing = off
 #track_functions = none            # none, pl, all
 #track_activity_query_size = 1024    # (change requires restart)
-#stats_temp_directory = 'pg_stat_tmp'
 
 
 # - Monitoring -
diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c
index ddc850d..0e0511f 100644
--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@@ -216,7 +216,6 @@ static const char *const subdirs[] = {
     "pg_replslot",
     "pg_tblspc",
     "pg_stat",
-    "pg_stat_tmp",
     "pg_xact",
     "pg_logical",
     "pg_logical/snapshots",
diff --git a/src/bin/pg_basebackup/t/010_pg_basebackup.pl b/src/bin/pg_basebackup/t/010_pg_basebackup.pl
index cdf4f5b..a25de6d 100644
--- a/src/bin/pg_basebackup/t/010_pg_basebackup.pl
+++ b/src/bin/pg_basebackup/t/010_pg_basebackup.pl
@@ -78,7 +78,7 @@ is_deeply(
 
 # Contents of these directories should not be copied.
 foreach my $dirname (
-    qw(pg_dynshmem pg_notify pg_replslot pg_serial pg_snapshots pg_stat_tmp pg_subtrans)
+    qw(pg_dynshmem pg_notify pg_replslot pg_serial pg_snapshots pg_subtrans)
   )
 {
     is_deeply(
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index b054dab..8a197bf 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -407,6 +407,7 @@ extern AuxProcType MyAuxProcType;
 #define AmCheckpointerProcess()        (MyAuxProcType == CheckpointerProcess)
 #define AmWalWriterProcess()        (MyAuxProcType == WalWriterProcess)
 #define AmWalReceiverProcess()        (MyAuxProcType == WalReceiverProcess)
+#define AmStatsCollectorProcess()    (MyAuxProcType == StatsCollectorProcess)
 
 
 /*****************************************************************************
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index e2a1e21..b48741f 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -13,6 +13,7 @@
 
 #include "datatype/timestamp.h"
 #include "fmgr.h"
+#include "lib/dshash.h"
 #include "libpq/pqcomm.h"
 #include "port/atomics.h"
 #include "portability/instr_time.h"
@@ -30,9 +31,6 @@
 #define PGSTAT_STAT_PERMANENT_FILENAME        "pg_stat/global.stat"
 #define PGSTAT_STAT_PERMANENT_TMPFILE        "pg_stat/global.tmp"
 
-/* Default directory to store temporary statistics data in */
-#define PG_STAT_TMP_DIR        "pg_stat_tmp"
-
 /* Values for track_functions GUC variable --- order is significant! */
 typedef enum TrackFunctionsLevel
 {
@@ -48,7 +46,6 @@ typedef enum TrackFunctionsLevel
 typedef enum StatMsgType
 {
     PGSTAT_MTYPE_DUMMY,
-    PGSTAT_MTYPE_INQUIRY,
     PGSTAT_MTYPE_TABSTAT,
     PGSTAT_MTYPE_TABPURGE,
     PGSTAT_MTYPE_DROPDB,
@@ -216,35 +213,6 @@ typedef struct PgStat_MsgDummy
     PgStat_MsgHdr m_hdr;
 } PgStat_MsgDummy;
 
-
-/* ----------
- * PgStat_MsgInquiry            Sent by a backend to ask the collector
- *                                to write the stats file(s).
- *
- * Ordinarily, an inquiry message prompts writing of the global stats file,
- * the stats file for shared catalogs, and the stats file for the specified
- * database.  If databaseid is InvalidOid, only the first two are written.
- *
- * New file(s) will be written only if the existing file has a timestamp
- * older than the specified cutoff_time; this prevents duplicated effort
- * when multiple requests arrive at nearly the same time, assuming that
- * backends send requests with cutoff_times a little bit in the past.
- *
- * clock_time should be the requestor's current local time; the collector
- * uses this to check for the system clock going backward, but it has no
- * effect unless that occurs.  We assume clock_time >= cutoff_time, though.
- * ----------
- */
-
-typedef struct PgStat_MsgInquiry
-{
-    PgStat_MsgHdr m_hdr;
-    TimestampTz clock_time;        /* observed local clock time */
-    TimestampTz cutoff_time;    /* minimum acceptable file timestamp */
-    Oid            databaseid;        /* requested DB (InvalidOid => shared only) */
-} PgStat_MsgInquiry;
-
-
 /* ----------
  * PgStat_TableEntry            Per-table info in a MsgTabstat
  * ----------
@@ -539,7 +507,6 @@ typedef union PgStat_Msg
 {
     PgStat_MsgHdr msg_hdr;
     PgStat_MsgDummy msg_dummy;
-    PgStat_MsgInquiry msg_inquiry;
     PgStat_MsgTabstat msg_tabstat;
     PgStat_MsgTabpurge msg_tabpurge;
     PgStat_MsgDropdb msg_dropdb;
@@ -601,10 +568,13 @@ typedef struct PgStat_StatDBEntry
 
     /*
      * tables and functions must be last in the struct, because we don't write
-     * the pointers out to the stats file.
+     * the handles and pointers out to the stats file.
      */
-    HTAB       *tables;
-    HTAB       *functions;
+    dshash_table_handle tables;
+    dshash_table_handle functions;
+    /* for snapshot tables */
+    dshash_table *snapshot_tables;
+    dshash_table *snapshot_functions;
 } PgStat_StatDBEntry;
 
 
@@ -1201,6 +1171,7 @@ extern PgStat_BackendFunctionEntry *find_funcstat_entry(Oid func_id);
 extern void pgstat_initstats(Relation rel);
 
 extern char *pgstat_clip_activity(const char *raw_activity);
+extern PgStat_StatTabEntry *backend_get_tab_entry(PgStat_StatDBEntry *dbent, Oid relid);
 
 /* ----------
  * pgstat_report_wait_start() -
@@ -1328,6 +1299,8 @@ extern void pgstat_send_bgwriter(void);
  * generate the pgstat* views.
  * ----------
  */
+extern Size StatsShmemSize(void);
+extern void StatsShmemInit(void);
 extern void PgstatCollectorMain(void);
 extern PgStat_StatDBEntry *pgstat_fetch_stat_dbentry(Oid dbid);
 extern PgStat_StatTabEntry *pgstat_fetch_stat_tabentry(Oid relid);
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index 460843d..0b17d3e 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -217,6 +217,9 @@ typedef enum BuiltinTrancheIds
     LWTRANCHE_SESSION_TYPMOD_TABLE,
     LWTRANCHE_TBM,
     LWTRANCHE_PARALLEL_APPEND,
+    LWTRANCHE_STATS_DSA,
+    LWTRANCHE_STATS_DB,
+    LWTRANCHE_STATS_FUNC_TABLE,
     LWTRANCHE_FIRST_USER_DEFINED
 }            BuiltinTrancheIds;
 
-- 
2.9.2


В списке pgsql-hackers по дате отправления:

Предыдущее
От: Kyotaro HORIGUCHI
Дата:
Сообщение: Re: [HACKERS] asynchronous execution
Следующее
От: Ashutosh Bapat
Дата:
Сообщение: Re: Mention ordered datums in PartitionBoundInfoData comment