Better client reporting for "immediate stop" shutdowns

Поиск
Список
Период
Сортировка
От Tom Lane
Тема Better client reporting for "immediate stop" shutdowns
Дата
Msg-id 559291.1608587013@sss.pgh.pa.us
обсуждение исходный текст
Ответы Re: Better client reporting for "immediate stop" shutdowns  (Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com>)
Re: Better client reporting for "immediate stop" shutdowns  (Andres Freund <andres@anarazel.de>)
Список pgsql-hackers
Up to now, if you shut down the database with "pg_ctl stop -m immediate"
then clients get a scary message claiming that something has crashed,
because backends can't tell whether the SIGQUIT they got was sent for
a crash-and-restart situation or because of an immediate-stop command.

This isn't great from a fit-and-finish perspective, and it occurs to me
that it's really easy to do better: the postmaster can stick a flag
into shared memory explaining the reason for SIGQUIT.  While we don't
like the postmaster touching shared memory, there doesn't seem to be
any need for interlocking or anything like that, so there is no risk
involved that's greater than those already taken by pmsignal.c.

So, here's a very simple proposed patch.  Some issues for possible
bikeshedding:

* Up to now, pmsignal.c has only been for child-to-postmaster
communication, so maybe there is some better place to put the
support code.  I can't think of one though.

* I chose to report the same message for immediate shutdown as we
already use for SIGTERM (fast shutdown or pg_terminate_backend()).
Should it be different, and if so what?

            regards, tom lane

diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index 5d09822c81..3dd0a022f7 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -2887,6 +2887,8 @@ pmdie(SIGNAL_ARGS)
             sd_notify(0, "STOPPING=1");
 #endif

+            /* tell children to shut down ASAP */
+            SetQuitSignalReason(PMQUIT_FOR_STOP);
             TerminateChildren(SIGQUIT);
             pmState = PM_WAIT_BACKENDS;

@@ -3464,6 +3466,7 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
         LogChildExit(LOG, procname, pid, exitstatus);
         ereport(LOG,
                 (errmsg("terminating any other active server processes")));
+        SetQuitSignalReason(PMQUIT_FOR_CRASH);
     }

     /* Process background workers. */
diff --git a/src/backend/storage/ipc/pmsignal.c b/src/backend/storage/ipc/pmsignal.c
index 94c65877c1..8ef3f6da4a 100644
--- a/src/backend/storage/ipc/pmsignal.c
+++ b/src/backend/storage/ipc/pmsignal.c
@@ -1,7 +1,7 @@
 /*-------------------------------------------------------------------------
  *
  * pmsignal.c
- *      routines for signaling the postmaster from its child processes
+ *      routines for signaling between the postmaster and its child processes
  *
  *
  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
@@ -55,6 +55,10 @@
  * but carries the extra information that the child is a WAL sender.
  * WAL senders too start in ACTIVE state, but switch to WALSENDER once they
  * start streaming the WAL (and they never go back to ACTIVE after that).
+ *
+ * We also have a shared-memory field that is used for communication in
+ * the opposite direction, from postmaster to children: it tells why the
+ * postmaster has broadcasted SIGQUIT signals, if indeed it has done so.
  */

 #define PM_CHILD_UNUSED        0    /* these values must fit in sig_atomic_t */
@@ -65,8 +69,10 @@
 /* "typedef struct PMSignalData PMSignalData" appears in pmsignal.h */
 struct PMSignalData
 {
-    /* per-reason flags */
+    /* per-reason flags for signaling the postmaster */
     sig_atomic_t PMSignalFlags[NUM_PMSIGNALS];
+    /* global flags for signals from postmaster to children */
+    QuitSignalReason sigquit_reason;    /* why SIGQUIT was sent */
     /* per-child-process flags */
     int            num_child_flags;    /* # of entries in PMChildFlags[] */
     int            next_child_flag;    /* next slot to try to assign */
@@ -134,6 +140,7 @@ PMSignalShmemInit(void)

     if (!found)
     {
+        /* initialize all flags to zeroes */
         MemSet(unvolatize(PMSignalData *, PMSignalState), 0, PMSignalShmemSize());
         PMSignalState->num_child_flags = MaxLivePostmasterChildren();
     }
@@ -171,6 +178,34 @@ CheckPostmasterSignal(PMSignalReason reason)
     return false;
 }

+/*
+ * SetQuitSignalReason - broadcast the reason for a system shutdown.
+ * Should be called by postmaster before sending SIGQUIT to children.
+ *
+ * Note: in a crash-and-restart scenario, the "reason" field gets cleared
+ * as a part of rebuilding shared memory; the postmaster need not do it
+ * explicitly.
+ */
+void
+SetQuitSignalReason(QuitSignalReason reason)
+{
+    PMSignalState->sigquit_reason = reason;
+}
+
+/*
+ * GetQuitSignalReason - obtain the reason for a system shutdown.
+ * Called by child processes when they receive SIGQUIT.
+ * If the postmaster hasn't actually sent SIGQUIT, will return PMQUIT_NOT_SENT.
+ */
+QuitSignalReason
+GetQuitSignalReason(void)
+{
+    /* This is called in signal handlers, so be extra paranoid. */
+    if (!IsUnderPostmaster || PMSignalState == NULL)
+        return PMQUIT_NOT_SENT;
+    return PMSignalState->sigquit_reason;
+}
+

 /*
  * AssignPostmasterChildSlot - select an unused slot for a new postmaster
diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index 3679799e50..c3c0a0c01d 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -67,6 +67,7 @@
 #include "rewrite/rewriteHandler.h"
 #include "storage/bufmgr.h"
 #include "storage/ipc.h"
+#include "storage/pmsignal.h"
 #include "storage/proc.h"
 #include "storage/procsignal.h"
 #include "storage/sinval.h"
@@ -2752,8 +2753,8 @@ drop_unnamed_stmt(void)
 /*
  * quickdie() occurs when signaled SIGQUIT by the postmaster.
  *
- * Some backend has bought the farm,
- * so we need to stop what we're doing and exit.
+ * Either some backend has bought the farm, or we've been told to shut down
+ * "immediately"; so we need to stop what we're doing and exit.
  */
 void
 quickdie(SIGNAL_ARGS)
@@ -2788,18 +2789,36 @@ quickdie(SIGNAL_ARGS)
      * wrong, so there's not much to lose.  Assuming the postmaster is still
      * running, it will SIGKILL us soon if we get stuck for some reason.
      *
-     * Ideally this should be ereport(FATAL), but then we'd not get control
-     * back...
+     * Ideally these should be ereport(FATAL), but then we'd not get control
+     * back to force the correct type of process exit.
      */
-    ereport(WARNING,
-            (errcode(ERRCODE_CRASH_SHUTDOWN),
-             errmsg("terminating connection because of crash of another server process"),
-             errdetail("The postmaster has commanded this server process to roll back"
-                       " the current transaction and exit, because another"
-                       " server process exited abnormally and possibly corrupted"
-                       " shared memory."),
-             errhint("In a moment you should be able to reconnect to the"
-                     " database and repeat your command.")));
+    switch (GetQuitSignalReason())
+    {
+        case PMQUIT_NOT_SENT:
+            /* Hmm, SIGQUIT arrived out of the blue */
+            ereport(WARNING,
+                    (errcode(ERRCODE_ADMIN_SHUTDOWN),
+                     errmsg("terminating connection because of unexpected SIGQUIT signal")));
+            break;
+        case PMQUIT_FOR_CRASH:
+            /* A crash-and-restart cycle is in progress */
+            ereport(WARNING,
+                    (errcode(ERRCODE_CRASH_SHUTDOWN),
+                     errmsg("terminating connection because of crash of another server process"),
+                     errdetail("The postmaster has commanded this server process to roll back"
+                               " the current transaction and exit, because another"
+                               " server process exited abnormally and possibly corrupted"
+                               " shared memory."),
+                     errhint("In a moment you should be able to reconnect to the"
+                             " database and repeat your command.")));
+            break;
+        case PMQUIT_FOR_STOP:
+            /* Immediate-mode stop */
+            ereport(WARNING,
+                    (errcode(ERRCODE_ADMIN_SHUTDOWN),
+                     errmsg("terminating connection due to administrator command")));
+            break;
+    }

     /*
      * We DO NOT want to run proc_exit() or atexit() callbacks -- we're here
diff --git a/src/include/storage/pmsignal.h b/src/include/storage/pmsignal.h
index 56c5ec4481..b6aa158160 100644
--- a/src/include/storage/pmsignal.h
+++ b/src/include/storage/pmsignal.h
@@ -1,7 +1,7 @@
 /*-------------------------------------------------------------------------
  *
  * pmsignal.h
- *      routines for signaling the postmaster from its child processes
+ *      routines for signaling between the postmaster and its child processes
  *
  *
  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
@@ -45,6 +45,16 @@ typedef enum
     NUM_PMSIGNALS                /* Must be last value of enum! */
 } PMSignalReason;

+/*
+ * Reasons why the postmaster would send SIGQUIT to its children.
+ */
+typedef enum
+{
+    PMQUIT_NOT_SENT = 0,        /* postmaster hasn't sent SIGQUIT */
+    PMQUIT_FOR_CRASH,            /* some other backend bought the farm */
+    PMQUIT_FOR_STOP                /* immediate stop was commanded */
+} QuitSignalReason;
+
 /* PMSignalData is an opaque struct, details known only within pmsignal.c */
 typedef struct PMSignalData PMSignalData;

@@ -55,6 +65,8 @@ extern Size PMSignalShmemSize(void);
 extern void PMSignalShmemInit(void);
 extern void SendPostmasterSignal(PMSignalReason reason);
 extern bool CheckPostmasterSignal(PMSignalReason reason);
+extern void SetQuitSignalReason(QuitSignalReason reason);
+extern QuitSignalReason GetQuitSignalReason(void);
 extern int    AssignPostmasterChildSlot(void);
 extern bool ReleasePostmasterChildSlot(int slot);
 extern bool IsPostmasterChildWalSender(int slot);

В списке pgsql-hackers по дате отправления:

Предыдущее
От: Bruce Momjian
Дата:
Сообщение: Re: Proposed patch for key managment
Следующее
От: Bruce Momjian
Дата:
Сообщение: Re: Proposed patch for key managment