diff -rcN postgresql/doc/src/sgml/backup.sgml postgresql_with_patch/doc/src/sgml/backup.sgml
*** postgresql/doc/src/sgml/backup.sgml 2011-09-12 05:19:14.000000000 +0900
--- postgresql_with_patch/doc/src/sgml/backup.sgml 2011-09-12 05:24:42.000000000 +0900
***************
*** 724,734 ****
Making a Base Backup
! The procedure for making a base backup is relatively simple:
! Ensure that WAL archiving is enabled and working.
--- 724,736 ----
Making a Base Backup
! The procedure for making a base backup is relatively simple. This can
! also run on hot standby, the procedure is a little different:
! Ensure that WAL archiving is enabled and working. On hot standby then
! this does not need to ensure since there is no WAL archiving originally.
***************
*** 780,785 ****
--- 782,795 ----
+ Copy pg_control> file to the backup taken by above-procedure.
+ This needs on hot standby. This is performed to recovery with Minimum
+ recovery ending location in pg_control> instead of backup end
+ record.
+
+
+
+
Again connect to the database as a superuser, and issue the command:
SELECT pg_stop_backup();
***************
*** 788,793 ****
--- 798,805 ----
the next WAL segment. The reason for the switch is to arrange for
the last WAL segment file written during the backup interval to be
ready to archive.
+ On hot standby then this terminates the backup mode but does not perform
+ an automatic switch.
***************
*** 808,813 ****
--- 820,827 ----
If you wish to place a time limit on the execution of
pg_stop_backup>, set an appropriate
statement_timeout value.
+ On hot standby then this does not perform. If WAL archiving is used,
+ ensure to complete archiving as far as pg_stop_backup> result.
***************
*** 933,938 ****
--- 947,964 ----
backup dump is which and how far back the associated WAL files go.
It is generally better to follow the continuous archiving procedure above.
+
+
+ pg_stop_backup> result on hot standby is may be incorrect. But
+ this value is greater than the correct value. If this value is used in
+ recovery then a phenomenon that WAL is not enough does not happen.
+
+
+
+ When you run in hotstandby pg_start_backup>, and, if promoted
+ to master when you run the pg_stop_backup>,
+ pg_stop_backup> will be failed. Retake the backup then.
+
diff -rcN postgresql/src/backend/access/transam/xlog.c postgresql_with_patch/src/backend/access/transam/xlog.c
*** postgresql/src/backend/access/transam/xlog.c 2011-09-12 05:19:14.000000000 +0900
--- postgresql_with_patch/src/backend/access/transam/xlog.c 2011-09-12 05:24:42.000000000 +0900
***************
*** 664,670 ****
#endif
static void pg_start_backup_callback(int code, Datum arg);
static bool read_backup_label(XLogRecPtr *checkPointLoc,
! bool *backupEndRequired);
static void rm_redo_error_callback(void *arg);
static int get_sync_bit(int method);
--- 664,670 ----
#endif
static void pg_start_backup_callback(int code, Datum arg);
static bool read_backup_label(XLogRecPtr *checkPointLoc,
! bool *backupEndRequired, char *backupfromstr);
static void rm_redo_error_callback(void *arg);
static int get_sync_bit(int method);
***************
*** 6023,6028 ****
--- 6023,6029 ----
uint32 freespace;
TransactionId oldestActiveXID;
bool backupEndRequired = false;
+ char backupfromstr[10];
/*
* Read control file and check XLOG status looks valid.
***************
*** 6061,6067 ****
(errmsg("database system was interrupted while in recovery at log time %s",
str_time(ControlFile->checkPointCopy.time)),
errhint("If this has occurred more than once some data might be corrupted"
! " and you might need to choose an earlier recovery target.")));
else if (ControlFile->state == DB_IN_PRODUCTION)
ereport(LOG,
(errmsg("database system was interrupted; last known up at %s",
--- 6062,6069 ----
(errmsg("database system was interrupted while in recovery at log time %s",
str_time(ControlFile->checkPointCopy.time)),
errhint("If this has occurred more than once some data might be corrupted"
! " and does not take online backup from hot standby"
! " then you might need to choose an earlier recovery target.")));
else if (ControlFile->state == DB_IN_PRODUCTION)
ereport(LOG,
(errmsg("database system was interrupted; last known up at %s",
***************
*** 6156,6162 ****
if (StandbyMode)
OwnLatch(&XLogCtl->recoveryWakeupLatch);
! if (read_backup_label(&checkPointLoc, &backupEndRequired))
{
/*
* When a backup_label file is present, we want to roll forward from
--- 6158,6164 ----
if (StandbyMode)
OwnLatch(&XLogCtl->recoveryWakeupLatch);
! if (read_backup_label(&checkPointLoc, &backupEndRequired, backupfromstr))
{
/*
* When a backup_label file is present, we want to roll forward from
***************
*** 6307,6312 ****
--- 6309,6328 ----
volatile XLogCtlData *xlogctl = XLogCtl;
/*
+ * set backupEndPoint at the start if we take online backup from
+ * hot standby. backupEndPoint is used to distinguish whether the
+ * backup is taken from master or hot stanby. If backupStartPoint
+ * and backupEndPoint is invalid then this is meaning the first
+ * recovery.
+ */
+ if (XLogRecPtrIsInvalid(ControlFile->backupStartPoint) &&
+ XLogRecPtrIsInvalid(ControlFile->backupEndPoint))
+ {
+ if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint))
+ ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
+ }
+
+ /*
* Update pg_control to show that we are recovering and to show the
* selected checkpoint as the place we are starting from. We also mark
* pg_control with any minimum recovery stop point obtained from a
***************
*** 6618,6623 ****
--- 6634,6660 ----
error_context_stack = errcontext.previous;
/*
+ * Check whether redo reaches minRecoveryPoint if we take online
+ * backup from hot standby. Because we can not write backup end
+ * record when we execute pg_stop_backup under the situation.
+ */
+ if (!XLogRecPtrIsInvalid(ControlFile->backupEndPoint))
+ {
+ if (XLByteLE(ControlFile->backupEndPoint, EndRecPtr))
+ {
+ elog(DEBUG1, "end of backup reached in the control file");
+
+ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+
+ MemSet(&ControlFile->backupStartPoint, 0, sizeof(XLogRecPtr));
+ MemSet(&ControlFile->backupEndPoint, 0, sizeof(XLogRecPtr));
+ UpdateControlFile();
+
+ LWLockRelease(ControlFileLock);
+ }
+ }
+
+ /*
* Update shared recoveryLastRecPtr after this record has been
* replayed.
*/
***************
*** 8414,8423 ****
/*
* If we see a shutdown checkpoint while waiting for an end-of-backup
* record, the backup was canceled and the end-of-backup record will
! * never arrive.
*/
if (InArchiveRecovery &&
! !XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
ereport(ERROR,
(errmsg("online backup was canceled, recovery cannot continue")));
--- 8451,8463 ----
/*
* If we see a shutdown checkpoint while waiting for an end-of-backup
* record, the backup was canceled and the end-of-backup record will
! * never arrive. If the backup is taken from hot standby then this
! * error is not output because there is a case of shutdown on master
! * during taking online backup from hot standby.
*/
if (InArchiveRecovery &&
! !XLogRecPtrIsInvalid(ControlFile->backupStartPoint) &&
! XLogRecPtrIsInvalid(ControlFile->backupEndPoint))
ereport(ERROR,
(errmsg("online backup was canceled, recovery cannot continue")));
***************
*** 8880,8885 ****
--- 8920,8926 ----
do_pg_start_backup(const char *backupidstr, bool fast, char **labelfile)
{
bool exclusive = (labelfile == NULL);
+ bool duringrecovery = false;
XLogRecPtr checkpointloc;
XLogRecPtr startpoint;
pg_time_t stamp_time;
***************
*** 8890,8908 ****
struct stat stat_buf;
FILE *fp;
StringInfoData labelfbuf;
if (!superuser() && !is_authenticated_user_replication_role())
ereport(ERROR,
(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
errmsg("must be superuser or replication role to run a backup")));
if (RecoveryInProgress())
! ereport(ERROR,
! (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
! errmsg("recovery is in progress"),
! errhint("WAL control functions cannot be executed during recovery.")));
! if (!XLogIsNeeded())
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("WAL level not sufficient for making an online backup"),
--- 8931,8956 ----
struct stat stat_buf;
FILE *fp;
StringInfoData labelfbuf;
+ char *backupfromstr = BACKUP_FROM_MASTER;
if (!superuser() && !is_authenticated_user_replication_role())
ereport(ERROR,
(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
errmsg("must be superuser or replication role to run a backup")));
+ /*
+ * check whether during recovery, and determine a string on backup_label.
+ * If duringrecovery is true here then the subsequent process on WAL (check
+ * wal_level, RequestXLogSwitch, forcePageWrites and gotUniqueStartpoint
+ * by RequestCheckpoint) is skipped because hot standby can not write a wal.
+ */
if (RecoveryInProgress())
! {
! duringrecovery = true;
! backupfromstr = BACKUP_FROM_SLAVE;
! }
! if (!XLogIsNeeded() && !duringrecovery)
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("WAL level not sufficient for making an online backup"),
***************
*** 8925,8931 ****
* directory was not included in the base backup and the WAL archive was
* cleared too before starting the backup.
*/
! RequestXLogSwitch();
/*
* Mark backup active in shared memory. We must do full-page WAL writes
--- 8973,8980 ----
* directory was not included in the base backup and the WAL archive was
* cleared too before starting the backup.
*/
! if (!duringrecovery)
! RequestXLogSwitch();
/*
* Mark backup active in shared memory. We must do full-page WAL writes
***************
*** 8959,8965 ****
}
else
XLogCtl->Insert.nonExclusiveBackups++;
! XLogCtl->Insert.forcePageWrites = true;
LWLockRelease(WALInsertLock);
/* Ensure we release forcePageWrites if fail below */
--- 9008,9015 ----
}
else
XLogCtl->Insert.nonExclusiveBackups++;
! if (!duringrecovery)
! XLogCtl->Insert.forcePageWrites = true;
LWLockRelease(WALInsertLock);
/* Ensure we release forcePageWrites if fail below */
***************
*** 9010,9016 ****
gotUniqueStartpoint = true;
}
LWLockRelease(WALInsertLock);
! } while (!gotUniqueStartpoint);
XLByteToSeg(startpoint, _logId, _logSeg);
XLogFileName(xlogfilename, ThisTimeLineID, _logId, _logSeg);
--- 9060,9066 ----
gotUniqueStartpoint = true;
}
LWLockRelease(WALInsertLock);
! } while (!gotUniqueStartpoint && !duringrecovery);
XLByteToSeg(startpoint, _logId, _logSeg);
XLogFileName(xlogfilename, ThisTimeLineID, _logId, _logSeg);
***************
*** 9033,9038 ****
--- 9083,9089 ----
exclusive ? "pg_start_backup" : "streamed");
appendStringInfo(&labelfbuf, "START TIME: %s\n", strfbuf);
appendStringInfo(&labelfbuf, "LABEL: %s\n", backupidstr);
+ appendStringInfo(&labelfbuf, "FROM: %s\n", backupfromstr);
/*
* Okay, write the file, or return its contents to caller.
***************
*** 9105,9111 ****
}
if (!XLogCtl->Insert.exclusiveBackup &&
! XLogCtl->Insert.nonExclusiveBackups == 0)
{
XLogCtl->Insert.forcePageWrites = false;
}
--- 9156,9163 ----
}
if (!XLogCtl->Insert.exclusiveBackup &&
! XLogCtl->Insert.nonExclusiveBackups == 0 &&
! !RecoveryInProgress())
{
XLogCtl->Insert.forcePageWrites = false;
}
***************
*** 9149,9154 ****
--- 9201,9207 ----
do_pg_stop_backup(char *labelfile, bool waitforarchive)
{
bool exclusive = (labelfile == NULL);
+ bool duringrecovery = false;
XLogRecPtr startpoint;
XLogRecPtr stoppoint;
XLogRecData rdata;
***************
*** 9168,9192 ****
int waits = 0;
bool reported_waiting = false;
char *remaining;
if (!superuser() && !is_authenticated_user_replication_role())
ereport(ERROR,
(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
(errmsg("must be superuser or replication role to run a backup"))));
if (RecoveryInProgress())
! ereport(ERROR,
! (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
! errmsg("recovery is in progress"),
! errhint("WAL control functions cannot be executed during recovery.")));
! if (!XLogIsNeeded())
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("WAL level not sufficient for making an online backup"),
errhint("wal_level must be set to \"archive\" or \"hot_standby\" at server start.")));
/*
* OK to update backup counters and forcePageWrites
*/
LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
--- 9221,9273 ----
int waits = 0;
bool reported_waiting = false;
char *remaining;
+ XLogRecPtr checkPointLoc;
+ bool backupEndRequired = false;
+ char backupfromstr[10];
if (!superuser() && !is_authenticated_user_replication_role())
ereport(ERROR,
(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
(errmsg("must be superuser or replication role to run a backup"))));
+ /*
+ * check whether during recovery. If duringrecovery is true here then the
+ * subsequent process on WAL (check wal_level, forcePageWrites, XLogInsert,
+ * RequestXLogSwitch, write the backup history file and XLogArchivingActive)
+ * is skipped because hot standby can not write a wal.
+ */
if (RecoveryInProgress())
! duringrecovery = true;
! if (!XLogIsNeeded() && !duringrecovery)
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("WAL level not sufficient for making an online backup"),
errhint("wal_level must be set to \"archive\" or \"hot_standby\" at server start.")));
/*
+ * backupfromstr is taken from backup_label, which is where we
+ * execute pg_start_backup on. check whether this state is equals it.
+ * If read_backup_label function returns error, this function is
+ * failed by error handling after this.
+ */
+ if (read_backup_label(&checkPointLoc, &backupEndRequired, backupfromstr))
+ {
+ if (duringrecovery == false &&
+ strcmp(backupfromstr, BACKUP_FROM_MASTER) != 0 &&
+ ControlFile->state != DB_IN_PRODUCTION)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("different state than when pg_start_backup")));
+ if (duringrecovery == true &&
+ strcmp(backupfromstr, BACKUP_FROM_SLAVE) != 0 &&
+ ControlFile->state != DB_IN_ARCHIVE_RECOVERY)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("different state than when pg_start_backup")));
+ }
+
+ /*
* OK to update backup counters and forcePageWrites
*/
LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
***************
*** 9205,9211 ****
}
if (!XLogCtl->Insert.exclusiveBackup &&
! XLogCtl->Insert.nonExclusiveBackups == 0)
{
XLogCtl->Insert.forcePageWrites = false;
}
--- 9286,9293 ----
}
if (!XLogCtl->Insert.exclusiveBackup &&
! XLogCtl->Insert.nonExclusiveBackups == 0 &&
! !duringrecovery)
{
XLogCtl->Insert.forcePageWrites = false;
}
***************
*** 9259,9264 ****
--- 9341,9353 ----
}
/*
+ * When pg_stop_backup is excuted on hot standby server, the result
+ * is minRecoveryPoint in the control file.
+ */
+ if (duringrecovery)
+ return ControlFile->minRecoveryPoint;
+
+ /*
* Read and parse the START WAL LOCATION line (this code is pretty crude,
* but we are not expecting any variability in the file format).
*/
***************
*** 9416,9422 ****
XLogCtl->Insert.nonExclusiveBackups--;
if (!XLogCtl->Insert.exclusiveBackup &&
! XLogCtl->Insert.nonExclusiveBackups == 0)
{
XLogCtl->Insert.forcePageWrites = false;
}
--- 9505,9512 ----
XLogCtl->Insert.nonExclusiveBackups--;
if (!XLogCtl->Insert.exclusiveBackup &&
! XLogCtl->Insert.nonExclusiveBackups == 0 &&
! !RecoveryInProgress())
{
XLogCtl->Insert.forcePageWrites = false;
}
***************
*** 9790,9802 ****
* streamed backup, *backupEndRequired is set to TRUE.
*/
static bool
! read_backup_label(XLogRecPtr *checkPointLoc, bool *backupEndRequired)
{
char startxlogfilename[MAXFNAMELEN];
TimeLineID tli;
FILE *lfp;
char ch;
char backuptype[20];
*backupEndRequired = false;
--- 9880,9893 ----
* streamed backup, *backupEndRequired is set to TRUE.
*/
static bool
! read_backup_label(XLogRecPtr *checkPointLoc, bool *backupEndRequired, char *backupfromstr)
{
char startxlogfilename[MAXFNAMELEN];
TimeLineID tli;
FILE *lfp;
char ch;
char backuptype[20];
+ char strbuff[256];
*backupEndRequired = false;
***************
*** 9842,9847 ****
--- 9933,9952 ----
*backupEndRequired = true;
}
+ fgets(strbuff, sizeof(strbuff), lfp); /* skip one line */
+ fgets(strbuff, sizeof(strbuff), lfp); /* skip one line */
+
+ /*
+ * Read and parse the FROM line. If not read then output WARNING message
+ * and set BACKUP_FROM_MASTER.
+ *
+ */
+ strcpy(backupfromstr, BACKUP_FROM_MASTER);
+ if (fscanf(lfp, "FROM: %s%c", backupfromstr, &ch) != 2 || ch != '\n')
+ ereport(WARNING,
+ (errmsg("loaded old file \"%s\", set backup from \"%s\"",
+ BACKUP_LABEL_FILE, BACKUP_FROM_MASTER)));
+
if (ferror(lfp) || FreeFile(lfp))
ereport(FATAL,
(errcode_for_file_access(),
diff -rcN postgresql/src/backend/postmaster/postmaster.c postgresql_with_patch/src/backend/postmaster/postmaster.c
*** postgresql/src/backend/postmaster/postmaster.c 2011-09-12 05:19:14.000000000 +0900
--- postgresql_with_patch/src/backend/postmaster/postmaster.c 2011-09-12 05:24:42.000000000 +0900
***************
*** 287,292 ****
--- 287,295 ----
static PMState pmState = PM_INIT;
static bool ReachedNormalRunning = false; /* T if we've reached PM_RUN */
+ static bool ReachedHotStandbyRunning = false; /* T if we've reached PM_HOT_STANDBY */
+ static bool WaitBackupForHotStandby = false; /* T if we've moved from PM_WAIT_READONLY
+ * to PM_WAIT_BACKUP */
bool ClientAuthInProgress = false; /* T during new-client
* authentication */
***************
*** 2825,2831 ****
* PM_WAIT_BACKUP state ends when online backup mode is not active.
*/
if (!BackupInProgress())
! pmState = PM_WAIT_BACKENDS;
}
if (pmState == PM_WAIT_READONLY)
--- 2828,2845 ----
* PM_WAIT_BACKUP state ends when online backup mode is not active.
*/
if (!BackupInProgress())
! {
! /*
! * WaitBackupForHotStandby, this flag is if we execute
! * smart shutdown during takeing online backup from hot standby.
! * If the flag is true then we need to change PM_WAIT_BACKENDS
! * at the root of PM_WAIT_READONLY.
! */
! if (!WaitBackupForHotStandby)
! pmState = PM_WAIT_BACKENDS;
! else
! pmState = PM_WAIT_READONLY;
! }
}
if (pmState == PM_WAIT_READONLY)
***************
*** 2840,2850 ****
*/
if (CountChildren(BACKEND_TYPE_NORMAL) == 0)
{
! if (StartupPID != 0)
! signal_child(StartupPID, SIGTERM);
! if (WalReceiverPID != 0)
! signal_child(WalReceiverPID, SIGTERM);
! pmState = PM_WAIT_BACKENDS;
}
}
--- 2854,2878 ----
*/
if (CountChildren(BACKEND_TYPE_NORMAL) == 0)
{
! if (!BackupInProgress())
! {
! if (StartupPID != 0)
! signal_child(StartupPID, SIGTERM);
! if (WalReceiverPID != 0)
! signal_child(WalReceiverPID, SIGTERM);
! pmState = PM_WAIT_BACKENDS;
! }
! else
! {
! /*
! * This is meaning that we execute smart shutdown during
! * online backup from hot standby. we need to allow the
! * connection to the backend by changing PM_WAIT_BACKUP
! * to execute pg_stop_backup.
! */
! WaitBackupForHotStandby = true;
! pmState = PM_WAIT_BACKUP;
! }
}
}
***************
*** 2993,3006 ****
{
/*
* Terminate backup mode to avoid recovery after a clean fast
! * shutdown. Since a backup can only be taken during normal
! * running (and not, for example, while running under Hot Standby)
! * it only makes sense to do this if we reached normal running. If
! * we're still in recovery, the backup file is one we're
! * recovering *from*, and we must keep it around so that recovery
! * restarts from the right place.
*/
! if (ReachedNormalRunning)
CancelBackup();
/* Normal exit from the postmaster is here */
--- 3021,3029 ----
{
/*
* Terminate backup mode to avoid recovery after a clean fast
! * shutdown if we reached normal running or hot standby.
*/
! if (ReachedNormalRunning || ReachedHotStandbyRunning)
CancelBackup();
/* Normal exit from the postmaster is here */
***************
*** 4157,4162 ****
--- 4180,4186 ----
ereport(LOG,
(errmsg("database system is ready to accept read only connections")));
+ ReachedHotStandbyRunning = true;
pmState = PM_HOT_STANDBY;
}
diff -rcN postgresql/src/backend/postmaster/postmaster.c.orig postgresql_with_patch/src/backend/postmaster/postmaster.c.orig
*** postgresql/src/backend/postmaster/postmaster.c.orig 1970-01-01 09:00:00.000000000 +0900
--- postgresql_with_patch/src/backend/postmaster/postmaster.c.orig 2011-09-12 05:23:29.000000000 +0900
***************
*** 0 ****
--- 1,5055 ----
+ /*-------------------------------------------------------------------------
+ *
+ * postmaster.c
+ * This program acts as a clearing house for requests to the
+ * POSTGRES system. Frontend programs send a startup message
+ * to the Postmaster and the postmaster uses the info in the
+ * message to setup a backend process.
+ *
+ * The postmaster also manages system-wide operations such as
+ * startup and shutdown. The postmaster itself doesn't do those
+ * operations, mind you --- it just forks off a subprocess to do them
+ * at the right times. It also takes care of resetting the system
+ * if a backend crashes.
+ *
+ * The postmaster process creates the shared memory and semaphore
+ * pools during startup, but as a rule does not touch them itself.
+ * In particular, it is not a member of the PGPROC array of backends
+ * and so it cannot participate in lock-manager operations. Keeping
+ * the postmaster away from shared memory operations makes it simpler
+ * and more reliable. The postmaster is almost always able to recover
+ * from crashes of individual backends by resetting shared memory;
+ * if it did much with shared memory then it would be prone to crashing
+ * along with the backends.
+ *
+ * When a request message is received, we now fork() immediately.
+ * The child process performs authentication of the request, and
+ * then becomes a backend if successful. This allows the auth code
+ * to be written in a simple single-threaded style (as opposed to the
+ * crufty "poor man's multitasking" code that used to be needed).
+ * More importantly, it ensures that blockages in non-multithreaded
+ * libraries like SSL or PAM cannot cause denial of service to other
+ * clients.
+ *
+ *
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/postmaster/postmaster.c
+ *
+ * NOTES
+ *
+ * Initialization:
+ * The Postmaster sets up shared memory data structures
+ * for the backends.
+ *
+ * Synchronization:
+ * The Postmaster shares memory with the backends but should avoid
+ * touching shared memory, so as not to become stuck if a crashing
+ * backend screws up locks or shared memory. Likewise, the Postmaster
+ * should never block on messages from frontend clients.
+ *
+ * Garbage Collection:
+ * The Postmaster cleans up after backends if they have an emergency
+ * exit and/or core dump.
+ *
+ * Error Reporting:
+ * Use write_stderr() only for reporting "interactive" errors
+ * (essentially, bogus arguments on the command line). Once the
+ * postmaster is launched, use ereport().
+ *
+ *-------------------------------------------------------------------------
+ */
+
+ #include "postgres.h"
+
+ #include
+ #include
+ #include
+ #include
+ #include
+ #include
+ #include
+ #include
+ #include
+ #include
+ #include
+ #include
+ #include
+
+ #ifdef HAVE_SYS_SELECT_H
+ #include
+ #endif
+
+ #ifdef HAVE_GETOPT_H
+ #include
+ #endif
+
+ #ifdef USE_BONJOUR
+ #include
+ #endif
+
+ #include "access/transam.h"
+ #include "access/xlog.h"
+ #include "bootstrap/bootstrap.h"
+ #include "catalog/pg_control.h"
+ #include "lib/dllist.h"
+ #include "libpq/auth.h"
+ #include "libpq/ip.h"
+ #include "libpq/libpq.h"
+ #include "libpq/pqsignal.h"
+ #include "miscadmin.h"
+ #include "pgstat.h"
+ #include "postmaster/autovacuum.h"
+ #include "postmaster/fork_process.h"
+ #include "postmaster/pgarch.h"
+ #include "postmaster/postmaster.h"
+ #include "postmaster/syslogger.h"
+ #include "replication/walsender.h"
+ #include "storage/fd.h"
+ #include "storage/ipc.h"
+ #include "storage/pg_shmem.h"
+ #include "storage/pmsignal.h"
+ #include "storage/proc.h"
+ #include "tcop/tcopprot.h"
+ #include "utils/builtins.h"
+ #include "utils/datetime.h"
+ #include "utils/memutils.h"
+ #include "utils/ps_status.h"
+
+ #ifdef EXEC_BACKEND
+ #include "storage/spin.h"
+ #endif
+
+
+ /*
+ * List of active backends (or child processes anyway; we don't actually
+ * know whether a given child has become a backend or is still in the
+ * authorization phase). This is used mainly to keep track of how many
+ * children we have and send them appropriate signals when necessary.
+ *
+ * "Special" children such as the startup, bgwriter and autovacuum launcher
+ * tasks are not in this list. Autovacuum worker and walsender processes are
+ * in it. Also, "dead_end" children are in it: these are children launched just
+ * for the purpose of sending a friendly rejection message to a would-be
+ * client. We must track them because they are attached to shared memory,
+ * but we know they will never become live backends. dead_end children are
+ * not assigned a PMChildSlot.
+ */
+ typedef struct bkend
+ {
+ pid_t pid; /* process id of backend */
+ long cancel_key; /* cancel key for cancels for this backend */
+ int child_slot; /* PMChildSlot for this backend, if any */
+ bool is_autovacuum; /* is it an autovacuum process? */
+ bool dead_end; /* is it going to send an error and quit? */
+ Dlelem elem; /* list link in BackendList */
+ } Backend;
+
+ static Dllist *BackendList;
+
+ #ifdef EXEC_BACKEND
+ static Backend *ShmemBackendArray;
+ #endif
+
+ /* The socket number we are listening for connections on */
+ int PostPortNumber;
+ char *UnixSocketDir;
+ char *ListenAddresses;
+
+ /*
+ * ReservedBackends is the number of backends reserved for superuser use.
+ * This number is taken out of the pool size given by MaxBackends so
+ * number of backend slots available to non-superusers is
+ * (MaxBackends - ReservedBackends). Note what this really means is
+ * "if there are <= ReservedBackends connections available, only superusers
+ * can make new connections" --- pre-existing superuser connections don't
+ * count against the limit.
+ */
+ int ReservedBackends;
+
+ /* The socket(s) we're listening to. */
+ #define MAXLISTEN 64
+ static pgsocket ListenSocket[MAXLISTEN];
+
+ /*
+ * Set by the -o option
+ */
+ static char ExtraOptions[MAXPGPATH];
+
+ /*
+ * These globals control the behavior of the postmaster in case some
+ * backend dumps core. Normally, it kills all peers of the dead backend
+ * and reinitializes shared memory. By specifying -s or -n, we can have
+ * the postmaster stop (rather than kill) peers and not reinitialize
+ * shared data structures. (Reinit is currently dead code, though.)
+ */
+ static bool Reinit = true;
+ static int SendStop = false;
+
+ /* still more option variables */
+ bool EnableSSL = false;
+
+ int PreAuthDelay = 0;
+ int AuthenticationTimeout = 60;
+
+ bool log_hostname; /* for ps display and logging */
+ bool Log_connections = false;
+ bool Db_user_namespace = false;
+
+ bool enable_bonjour = false;
+ char *bonjour_name;
+ bool restart_after_crash = true;
+
+ /* PIDs of special child processes; 0 when not running */
+ static pid_t StartupPID = 0,
+ BgWriterPID = 0,
+ WalWriterPID = 0,
+ WalReceiverPID = 0,
+ AutoVacPID = 0,
+ PgArchPID = 0,
+ PgStatPID = 0,
+ SysLoggerPID = 0;
+
+ /* Startup/shutdown state */
+ #define NoShutdown 0
+ #define SmartShutdown 1
+ #define FastShutdown 2
+
+ static int Shutdown = NoShutdown;
+
+ static bool FatalError = false; /* T if recovering from backend crash */
+ static bool RecoveryError = false; /* T if WAL recovery failed */
+
+ /*
+ * We use a simple state machine to control startup, shutdown, and
+ * crash recovery (which is rather like shutdown followed by startup).
+ *
+ * After doing all the postmaster initialization work, we enter PM_STARTUP
+ * state and the startup process is launched. The startup process begins by
+ * reading the control file and other preliminary initialization steps.
+ * In a normal startup, or after crash recovery, the startup process exits
+ * with exit code 0 and we switch to PM_RUN state. However, archive recovery
+ * is handled specially since it takes much longer and we would like to support
+ * hot standby during archive recovery.
+ *
+ * When the startup process is ready to start archive recovery, it signals the
+ * postmaster, and we switch to PM_RECOVERY state. The background writer is
+ * launched, while the startup process continues applying WAL. If Hot Standby
+ * is enabled, then, after reaching a consistent point in WAL redo, startup
+ * process signals us again, and we switch to PM_HOT_STANDBY state and
+ * begin accepting connections to perform read-only queries. When archive
+ * recovery is finished, the startup process exits with exit code 0 and we
+ * switch to PM_RUN state.
+ *
+ * Normal child backends can only be launched when we are in PM_RUN or
+ * PM_HOT_STANDBY state. (We also allow launch of normal
+ * child backends in PM_WAIT_BACKUP state, but only for superusers.)
+ * In other states we handle connection requests by launching "dead_end"
+ * child processes, which will simply send the client an error message and
+ * quit. (We track these in the BackendList so that we can know when they
+ * are all gone; this is important because they're still connected to shared
+ * memory, and would interfere with an attempt to destroy the shmem segment,
+ * possibly leading to SHMALL failure when we try to make a new one.)
+ * In PM_WAIT_DEAD_END state we are waiting for all the dead_end children
+ * to drain out of the system, and therefore stop accepting connection
+ * requests at all until the last existing child has quit (which hopefully
+ * will not be very long).
+ *
+ * Notice that this state variable does not distinguish *why* we entered
+ * states later than PM_RUN --- Shutdown and FatalError must be consulted
+ * to find that out. FatalError is never true in PM_RECOVERY_* or PM_RUN
+ * states, nor in PM_SHUTDOWN states (because we don't enter those states
+ * when trying to recover from a crash). It can be true in PM_STARTUP state,
+ * because we don't clear it until we've successfully started WAL redo.
+ * Similarly, RecoveryError means that we have crashed during recovery, and
+ * should not try to restart.
+ */
+ typedef enum
+ {
+ PM_INIT, /* postmaster starting */
+ PM_STARTUP, /* waiting for startup subprocess */
+ PM_RECOVERY, /* in archive recovery mode */
+ PM_HOT_STANDBY, /* in hot standby mode */
+ PM_RUN, /* normal "database is alive" state */
+ PM_WAIT_BACKUP, /* waiting for online backup mode to end */
+ PM_WAIT_READONLY, /* waiting for read only backends to exit */
+ PM_WAIT_BACKENDS, /* waiting for live backends to exit */
+ PM_SHUTDOWN, /* waiting for bgwriter to do shutdown ckpt */
+ PM_SHUTDOWN_2, /* waiting for archiver and walsenders to
+ * finish */
+ PM_WAIT_DEAD_END, /* waiting for dead_end children to exit */
+ PM_NO_CHILDREN /* all important children have exited */
+ } PMState;
+
+ static PMState pmState = PM_INIT;
+
+ static bool ReachedNormalRunning = false; /* T if we've reached PM_RUN */
+
+ bool ClientAuthInProgress = false; /* T during new-client
+ * authentication */
+
+ bool redirection_done = false; /* stderr redirected for syslogger? */
+
+ /* received START_AUTOVAC_LAUNCHER signal */
+ static volatile sig_atomic_t start_autovac_launcher = false;
+
+ /* the launcher needs to be signalled to communicate some condition */
+ static volatile bool avlauncher_needs_signal = false;
+
+ /*
+ * State for assigning random salts and cancel keys.
+ * Also, the global MyCancelKey passes the cancel key assigned to a given
+ * backend from the postmaster to that backend (via fork).
+ */
+ static unsigned int random_seed = 0;
+ static struct timeval random_start_time;
+
+ extern char *optarg;
+ extern int optind,
+ opterr;
+
+ #ifdef HAVE_INT_OPTRESET
+ extern int optreset; /* might not be declared by system headers */
+ #endif
+
+ #ifdef USE_BONJOUR
+ static DNSServiceRef bonjour_sdref = NULL;
+ #endif
+
+ /*
+ * postmaster.c - function prototypes
+ */
+ static void getInstallationPaths(const char *argv0);
+ static void checkDataDir(void);
+ static Port *ConnCreate(int serverFd);
+ static void ConnFree(Port *port);
+ static void reset_shared(int port);
+ static void SIGHUP_handler(SIGNAL_ARGS);
+ static void pmdie(SIGNAL_ARGS);
+ static void reaper(SIGNAL_ARGS);
+ static void sigusr1_handler(SIGNAL_ARGS);
+ static void startup_die(SIGNAL_ARGS);
+ static void dummy_handler(SIGNAL_ARGS);
+ static void CleanupBackend(int pid, int exitstatus);
+ static void HandleChildCrash(int pid, int exitstatus, const char *procname);
+ static void LogChildExit(int lev, const char *procname,
+ int pid, int exitstatus);
+ static void PostmasterStateMachine(void);
+ static void BackendInitialize(Port *port);
+ static int BackendRun(Port *port);
+ static void ExitPostmaster(int status);
+ static int ServerLoop(void);
+ static int BackendStartup(Port *port);
+ static int ProcessStartupPacket(Port *port, bool SSLdone);
+ static void processCancelRequest(Port *port, void *pkt);
+ static int initMasks(fd_set *rmask);
+ static void report_fork_failure_to_client(Port *port, int errnum);
+ static CAC_state canAcceptConnections(void);
+ static long PostmasterRandom(void);
+ static void RandomSalt(char *md5Salt);
+ static void signal_child(pid_t pid, int signal);
+ static bool SignalSomeChildren(int signal, int targets);
+
+ #define SignalChildren(sig) SignalSomeChildren(sig, BACKEND_TYPE_ALL)
+
+ /*
+ * Possible types of a backend. These are OR-able request flag bits
+ * for SignalSomeChildren() and CountChildren().
+ */
+ #define BACKEND_TYPE_NORMAL 0x0001 /* normal backend */
+ #define BACKEND_TYPE_AUTOVAC 0x0002 /* autovacuum worker process */
+ #define BACKEND_TYPE_WALSND 0x0004 /* walsender process */
+ #define BACKEND_TYPE_ALL 0x0007 /* OR of all the above */
+
+ static int CountChildren(int target);
+ static bool CreateOptsFile(int argc, char *argv[], char *fullprogname);
+ static pid_t StartChildProcess(AuxProcType type);
+ static void StartAutovacuumWorker(void);
+ static void InitPostmasterDeathWatchHandle(void);
+
+ #ifdef EXEC_BACKEND
+
+ #ifdef WIN32
+ static pid_t win32_waitpid(int *exitstatus);
+ static void WINAPI pgwin32_deadchild_callback(PVOID lpParameter, BOOLEAN TimerOrWaitFired);
+
+ static HANDLE win32ChildQueue;
+
+ typedef struct
+ {
+ HANDLE waitHandle;
+ HANDLE procHandle;
+ DWORD procId;
+ } win32_deadchild_waitinfo;
+ #endif
+
+ static pid_t backend_forkexec(Port *port);
+ static pid_t internal_forkexec(int argc, char *argv[], Port *port);
+
+ /* Type for a socket that can be inherited to a client process */
+ #ifdef WIN32
+ typedef struct
+ {
+ SOCKET origsocket; /* Original socket value, or PGINVALID_SOCKET
+ * if not a socket */
+ WSAPROTOCOL_INFO wsainfo;
+ } InheritableSocket;
+ #else
+ typedef int InheritableSocket;
+ #endif
+
+ typedef struct LWLock LWLock; /* ugly kluge */
+
+ /*
+ * Structure contains all variables passed to exec:ed backends
+ */
+ typedef struct
+ {
+ Port port;
+ InheritableSocket portsocket;
+ char DataDir[MAXPGPATH];
+ pgsocket ListenSocket[MAXLISTEN];
+ long MyCancelKey;
+ int MyPMChildSlot;
+ #ifndef WIN32
+ unsigned long UsedShmemSegID;
+ #else
+ HANDLE UsedShmemSegID;
+ #endif
+ void *UsedShmemSegAddr;
+ slock_t *ShmemLock;
+ VariableCache ShmemVariableCache;
+ Backend *ShmemBackendArray;
+ LWLock *LWLockArray;
+ slock_t *ProcStructLock;
+ PROC_HDR *ProcGlobal;
+ PGPROC *AuxiliaryProcs;
+ PMSignalData *PMSignalState;
+ InheritableSocket pgStatSock;
+ pid_t PostmasterPid;
+ TimestampTz PgStartTime;
+ TimestampTz PgReloadTime;
+ bool redirection_done;
+ #ifdef WIN32
+ HANDLE PostmasterHandle;
+ HANDLE initial_signal_pipe;
+ HANDLE syslogPipe[2];
+ #else
+ int postmaster_alive_fds[2];
+ int syslogPipe[2];
+ #endif
+ char my_exec_path[MAXPGPATH];
+ char pkglib_path[MAXPGPATH];
+ char ExtraOptions[MAXPGPATH];
+ } BackendParameters;
+
+ static void read_backend_variables(char *id, Port *port);
+ static void restore_backend_variables(BackendParameters *param, Port *port);
+
+ #ifndef WIN32
+ static bool save_backend_variables(BackendParameters *param, Port *port);
+ #else
+ static bool save_backend_variables(BackendParameters *param, Port *port,
+ HANDLE childProcess, pid_t childPid);
+ #endif
+
+ static void ShmemBackendArrayAdd(Backend *bn);
+ static void ShmemBackendArrayRemove(Backend *bn);
+ #endif /* EXEC_BACKEND */
+
+ #define StartupDataBase() StartChildProcess(StartupProcess)
+ #define StartBackgroundWriter() StartChildProcess(BgWriterProcess)
+ #define StartWalWriter() StartChildProcess(WalWriterProcess)
+ #define StartWalReceiver() StartChildProcess(WalReceiverProcess)
+
+ /* Macros to check exit status of a child process */
+ #define EXIT_STATUS_0(st) ((st) == 0)
+ #define EXIT_STATUS_1(st) (WIFEXITED(st) && WEXITSTATUS(st) == 1)
+
+ #ifndef WIN32
+ /*
+ * File descriptors for pipe used to monitor if postmaster is alive.
+ * First is POSTMASTER_FD_WATCH, second is POSTMASTER_FD_OWN.
+ */
+ int postmaster_alive_fds[2] = { -1, -1 };
+ #else
+ /* Process handle of postmaster used for the same purpose on Windows */
+ HANDLE PostmasterHandle;
+ #endif
+
+ /*
+ * Postmaster main entry point
+ */
+ int
+ PostmasterMain(int argc, char *argv[])
+ {
+ int opt;
+ int status;
+ char *userDoption = NULL;
+ bool listen_addr_saved = false;
+ int i;
+
+ MyProcPid = PostmasterPid = getpid();
+
+ MyStartTime = time(NULL);
+
+ IsPostmasterEnvironment = true;
+
+ /*
+ * for security, no dir or file created can be group or other accessible
+ */
+ umask(S_IRWXG | S_IRWXO);
+
+ /*
+ * Fire up essential subsystems: memory management
+ */
+ MemoryContextInit();
+
+ /*
+ * By default, palloc() requests in the postmaster will be allocated in
+ * the PostmasterContext, which is space that can be recycled by backends.
+ * Allocated data that needs to be available to backends should be
+ * allocated in TopMemoryContext.
+ */
+ PostmasterContext = AllocSetContextCreate(TopMemoryContext,
+ "Postmaster",
+ ALLOCSET_DEFAULT_MINSIZE,
+ ALLOCSET_DEFAULT_INITSIZE,
+ ALLOCSET_DEFAULT_MAXSIZE);
+ MemoryContextSwitchTo(PostmasterContext);
+
+ /* Initialize paths to installation files */
+ getInstallationPaths(argv[0]);
+
+ /*
+ * Options setup
+ */
+ InitializeGUCOptions();
+
+ opterr = 1;
+
+ /*
+ * Parse command-line options. CAUTION: keep this in sync with
+ * tcop/postgres.c (the option sets should not conflict) and with the
+ * common help() function in main/main.c.
+ */
+ while ((opt = getopt(argc, argv, "A:B:bc:D:d:EeFf:h:ijk:lN:nOo:Pp:r:S:sTt:W:-:")) != -1)
+ {
+ switch (opt)
+ {
+ case 'A':
+ SetConfigOption("debug_assertions", optarg, PGC_POSTMASTER, PGC_S_ARGV);
+ break;
+
+ case 'B':
+ SetConfigOption("shared_buffers", optarg, PGC_POSTMASTER, PGC_S_ARGV);
+ break;
+
+ case 'b':
+ /* Undocumented flag used for binary upgrades */
+ IsBinaryUpgrade = true;
+ break;
+
+ case 'D':
+ userDoption = optarg;
+ break;
+
+ case 'd':
+ set_debug_options(atoi(optarg), PGC_POSTMASTER, PGC_S_ARGV);
+ break;
+
+ case 'E':
+ SetConfigOption("log_statement", "all", PGC_POSTMASTER, PGC_S_ARGV);
+ break;
+
+ case 'e':
+ SetConfigOption("datestyle", "euro", PGC_POSTMASTER, PGC_S_ARGV);
+ break;
+
+ case 'F':
+ SetConfigOption("fsync", "false", PGC_POSTMASTER, PGC_S_ARGV);
+ break;
+
+ case 'f':
+ if (!set_plan_disabling_options(optarg, PGC_POSTMASTER, PGC_S_ARGV))
+ {
+ write_stderr("%s: invalid argument for option -f: \"%s\"\n",
+ progname, optarg);
+ ExitPostmaster(1);
+ }
+ break;
+
+ case 'h':
+ SetConfigOption("listen_addresses", optarg, PGC_POSTMASTER, PGC_S_ARGV);
+ break;
+
+ case 'i':
+ SetConfigOption("listen_addresses", "*", PGC_POSTMASTER, PGC_S_ARGV);
+ break;
+
+ case 'j':
+ /* only used by interactive backend */
+ break;
+
+ case 'k':
+ SetConfigOption("unix_socket_directory", optarg, PGC_POSTMASTER, PGC_S_ARGV);
+ break;
+
+ case 'l':
+ SetConfigOption("ssl", "true", PGC_POSTMASTER, PGC_S_ARGV);
+ break;
+
+ case 'N':
+ SetConfigOption("max_connections", optarg, PGC_POSTMASTER, PGC_S_ARGV);
+ break;
+
+ case 'n':
+ /* Don't reinit shared mem after abnormal exit */
+ Reinit = false;
+ break;
+
+ case 'O':
+ SetConfigOption("allow_system_table_mods", "true", PGC_POSTMASTER, PGC_S_ARGV);
+ break;
+
+ case 'o':
+ /* Other options to pass to the backend on the command line */
+ snprintf(ExtraOptions + strlen(ExtraOptions),
+ sizeof(ExtraOptions) - strlen(ExtraOptions),
+ " %s", optarg);
+ break;
+
+ case 'P':
+ SetConfigOption("ignore_system_indexes", "true", PGC_POSTMASTER, PGC_S_ARGV);
+ break;
+
+ case 'p':
+ SetConfigOption("port", optarg, PGC_POSTMASTER, PGC_S_ARGV);
+ break;
+
+ case 'r':
+ /* only used by single-user backend */
+ break;
+
+ case 'S':
+ SetConfigOption("work_mem", optarg, PGC_POSTMASTER, PGC_S_ARGV);
+ break;
+
+ case 's':
+ SetConfigOption("log_statement_stats", "true", PGC_POSTMASTER, PGC_S_ARGV);
+ break;
+
+ case 'T':
+
+ /*
+ * In the event that some backend dumps core, send SIGSTOP,
+ * rather than SIGQUIT, to all its peers. This lets the wily
+ * post_hacker collect core dumps from everyone.
+ */
+ SendStop = true;
+ break;
+
+ case 't':
+ {
+ const char *tmp = get_stats_option_name(optarg);
+
+ if (tmp)
+ {
+ SetConfigOption(tmp, "true", PGC_POSTMASTER, PGC_S_ARGV);
+ }
+ else
+ {
+ write_stderr("%s: invalid argument for option -t: \"%s\"\n",
+ progname, optarg);
+ ExitPostmaster(1);
+ }
+ break;
+ }
+
+ case 'W':
+ SetConfigOption("post_auth_delay", optarg, PGC_POSTMASTER, PGC_S_ARGV);
+ break;
+
+ case 'c':
+ case '-':
+ {
+ char *name,
+ *value;
+
+ ParseLongOption(optarg, &name, &value);
+ if (!value)
+ {
+ if (opt == '-')
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("--%s requires a value",
+ optarg)));
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("-c %s requires a value",
+ optarg)));
+ }
+
+ SetConfigOption(name, value, PGC_POSTMASTER, PGC_S_ARGV);
+ free(name);
+ if (value)
+ free(value);
+ break;
+ }
+
+ default:
+ write_stderr("Try \"%s --help\" for more information.\n",
+ progname);
+ ExitPostmaster(1);
+ }
+ }
+
+ /*
+ * Postmaster accepts no non-option switch arguments.
+ */
+ if (optind < argc)
+ {
+ write_stderr("%s: invalid argument: \"%s\"\n",
+ progname, argv[optind]);
+ write_stderr("Try \"%s --help\" for more information.\n",
+ progname);
+ ExitPostmaster(1);
+ }
+
+ /*
+ * Locate the proper configuration files and data directory, and read
+ * postgresql.conf for the first time.
+ */
+ if (!SelectConfigFiles(userDoption, progname))
+ ExitPostmaster(2);
+
+ /* Verify that DataDir looks reasonable */
+ checkDataDir();
+
+ /* And switch working directory into it */
+ ChangeToDataDir();
+
+ /*
+ * Check for invalid combinations of GUC settings.
+ */
+ if (ReservedBackends >= MaxBackends)
+ {
+ write_stderr("%s: superuser_reserved_connections must be less than max_connections\n", progname);
+ ExitPostmaster(1);
+ }
+ if (XLogArchiveMode && wal_level == WAL_LEVEL_MINIMAL)
+ ereport(ERROR,
+ (errmsg("WAL archival (archive_mode=on) requires wal_level \"archive\" or \"hot_standby\"")));
+ if (max_wal_senders > 0 && wal_level == WAL_LEVEL_MINIMAL)
+ ereport(ERROR,
+ (errmsg("WAL streaming (max_wal_senders > 0) requires wal_level \"archive\" or \"hot_standby\"")));
+
+ /*
+ * Other one-time internal sanity checks can go here, if they are fast.
+ * (Put any slow processing further down, after postmaster.pid creation.)
+ */
+ if (!CheckDateTokenTables())
+ {
+ write_stderr("%s: invalid datetoken tables, please fix\n", progname);
+ ExitPostmaster(1);
+ }
+
+ /*
+ * Now that we are done processing the postmaster arguments, reset
+ * getopt(3) library so that it will work correctly in subprocesses.
+ */
+ optind = 1;
+ #ifdef HAVE_INT_OPTRESET
+ optreset = 1; /* some systems need this too */
+ #endif
+
+ /* For debugging: display postmaster environment */
+ {
+ extern char **environ;
+ char **p;
+
+ ereport(DEBUG3,
+ (errmsg_internal("%s: PostmasterMain: initial environment dump:",
+ progname)));
+ ereport(DEBUG3,
+ (errmsg_internal("-----------------------------------------")));
+ for (p = environ; *p; ++p)
+ ereport(DEBUG3,
+ (errmsg_internal("\t%s", *p)));
+ ereport(DEBUG3,
+ (errmsg_internal("-----------------------------------------")));
+ }
+
+ /*
+ * Create lockfile for data directory.
+ *
+ * We want to do this before we try to grab the input sockets, because the
+ * data directory interlock is more reliable than the socket-file
+ * interlock (thanks to whoever decided to put socket files in /tmp :-().
+ * For the same reason, it's best to grab the TCP socket(s) before the
+ * Unix socket.
+ */
+ CreateDataDirLockFile(true);
+
+ /*
+ * Initialize SSL library, if specified.
+ */
+ #ifdef USE_SSL
+ if (EnableSSL)
+ secure_initialize();
+ #endif
+
+ /*
+ * process any libraries that should be preloaded at postmaster start
+ */
+ process_shared_preload_libraries();
+
+ /*
+ * Remove old temporary files. At this point there can be no other
+ * Postgres processes running in this directory, so this should be safe.
+ */
+ RemovePgTempFiles();
+
+ /*
+ * Establish input sockets.
+ */
+ for (i = 0; i < MAXLISTEN; i++)
+ ListenSocket[i] = PGINVALID_SOCKET;
+
+ if (ListenAddresses)
+ {
+ char *rawstring;
+ List *elemlist;
+ ListCell *l;
+ int success = 0;
+
+ /* Need a modifiable copy of ListenAddresses */
+ rawstring = pstrdup(ListenAddresses);
+
+ /* Parse string into list of identifiers */
+ if (!SplitIdentifierString(rawstring, ',', &elemlist))
+ {
+ /* syntax error in list */
+ ereport(FATAL,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("invalid list syntax for \"listen_addresses\"")));
+ }
+
+ foreach(l, elemlist)
+ {
+ char *curhost = (char *) lfirst(l);
+
+ if (strcmp(curhost, "*") == 0)
+ status = StreamServerPort(AF_UNSPEC, NULL,
+ (unsigned short) PostPortNumber,
+ UnixSocketDir,
+ ListenSocket, MAXLISTEN);
+ else
+ status = StreamServerPort(AF_UNSPEC, curhost,
+ (unsigned short) PostPortNumber,
+ UnixSocketDir,
+ ListenSocket, MAXLISTEN);
+
+ if (status == STATUS_OK)
+ {
+ success++;
+ /* record the first successful host addr in lockfile */
+ if (!listen_addr_saved)
+ {
+ AddToDataDirLockFile(LOCK_FILE_LINE_LISTEN_ADDR, curhost);
+ listen_addr_saved = true;
+ }
+ }
+ else
+ ereport(WARNING,
+ (errmsg("could not create listen socket for \"%s\"",
+ curhost)));
+ }
+
+ if (!success && list_length(elemlist))
+ ereport(FATAL,
+ (errmsg("could not create any TCP/IP sockets")));
+
+ list_free(elemlist);
+ pfree(rawstring);
+ }
+
+ #ifdef USE_BONJOUR
+ /* Register for Bonjour only if we opened TCP socket(s) */
+ if (enable_bonjour && ListenSocket[0] != PGINVALID_SOCKET)
+ {
+ DNSServiceErrorType err;
+
+ /*
+ * We pass 0 for interface_index, which will result in registering on
+ * all "applicable" interfaces. It's not entirely clear from the
+ * DNS-SD docs whether this would be appropriate if we have bound to
+ * just a subset of the available network interfaces.
+ */
+ err = DNSServiceRegister(&bonjour_sdref,
+ 0,
+ 0,
+ bonjour_name,
+ "_postgresql._tcp.",
+ NULL,
+ NULL,
+ htons(PostPortNumber),
+ 0,
+ NULL,
+ NULL,
+ NULL);
+ if (err != kDNSServiceErr_NoError)
+ elog(LOG, "DNSServiceRegister() failed: error code %ld",
+ (long) err);
+
+ /*
+ * We don't bother to read the mDNS daemon's reply, and we expect that
+ * it will automatically terminate our registration when the socket is
+ * closed at postmaster termination. So there's nothing more to be
+ * done here. However, the bonjour_sdref is kept around so that
+ * forked children can close their copies of the socket.
+ */
+ }
+ #endif
+
+ #ifdef HAVE_UNIX_SOCKETS
+ status = StreamServerPort(AF_UNIX, NULL,
+ (unsigned short) PostPortNumber,
+ UnixSocketDir,
+ ListenSocket, MAXLISTEN);
+ if (status != STATUS_OK)
+ ereport(WARNING,
+ (errmsg("could not create Unix-domain socket")));
+ #endif
+
+ /*
+ * check that we have some socket to listen on
+ */
+ if (ListenSocket[0] == PGINVALID_SOCKET)
+ ereport(FATAL,
+ (errmsg("no socket created for listening")));
+
+ /*
+ * If no valid TCP ports, write an empty line for listen address,
+ * indicating the Unix socket must be used. Note that this line is not
+ * added to the lock file until there is a socket backing it.
+ */
+ if (!listen_addr_saved)
+ AddToDataDirLockFile(LOCK_FILE_LINE_LISTEN_ADDR, "");
+
+ /*
+ * Set up shared memory and semaphores.
+ */
+ reset_shared(PostPortNumber);
+
+ /*
+ * Estimate number of openable files. This must happen after setting up
+ * semaphores, because on some platforms semaphores count as open files.
+ */
+ set_max_safe_fds();
+
+ /*
+ * Initialize the list of active backends.
+ */
+ BackendList = DLNewList();
+
+ /*
+ * Initialize pipe (or process handle on Windows) that allows children to
+ * wake up from sleep on postmaster death.
+ */
+ InitPostmasterDeathWatchHandle();
+
+ #ifdef WIN32
+ /*
+ * Initialize I/O completion port used to deliver list of dead children.
+ */
+ win32ChildQueue = CreateIoCompletionPort(INVALID_HANDLE_VALUE, NULL, 0, 1);
+ if (win32ChildQueue == NULL)
+ ereport(FATAL,
+ (errmsg("could not create I/O completion port for child queue")));
+ #endif
+
+ /*
+ * Record postmaster options. We delay this till now to avoid recording
+ * bogus options (eg, NBuffers too high for available memory).
+ */
+ if (!CreateOptsFile(argc, argv, my_exec_path))
+ ExitPostmaster(1);
+
+ #ifdef EXEC_BACKEND
+ /* Write out nondefault GUC settings for child processes to use */
+ write_nondefault_variables(PGC_POSTMASTER);
+ #endif
+
+ /*
+ * Write the external PID file if requested
+ */
+ if (external_pid_file)
+ {
+ FILE *fpidfile = fopen(external_pid_file, "w");
+
+ if (fpidfile)
+ {
+ fprintf(fpidfile, "%d\n", MyProcPid);
+ fclose(fpidfile);
+ /* Should we remove the pid file on postmaster exit? */
+
+ /* Make PID file world readable */
+ if (chmod(external_pid_file, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH) != 0)
+ write_stderr("%s: could not change permissions of external PID file \"%s\": %s\n",
+ progname, external_pid_file, strerror(errno));
+ }
+ else
+ write_stderr("%s: could not write external PID file \"%s\": %s\n",
+ progname, external_pid_file, strerror(errno));
+ }
+
+ /*
+ * Set up signal handlers for the postmaster process.
+ *
+ * CAUTION: when changing this list, check for side-effects on the signal
+ * handling setup of child processes. See tcop/postgres.c,
+ * bootstrap/bootstrap.c, postmaster/bgwriter.c, postmaster/walwriter.c,
+ * postmaster/autovacuum.c, postmaster/pgarch.c, postmaster/pgstat.c, and
+ * postmaster/syslogger.c.
+ */
+ pqinitmask();
+ PG_SETMASK(&BlockSig);
+
+ pqsignal(SIGHUP, SIGHUP_handler); /* reread config file and have
+ * children do same */
+ pqsignal(SIGINT, pmdie); /* send SIGTERM and shut down */
+ pqsignal(SIGQUIT, pmdie); /* send SIGQUIT and die */
+ pqsignal(SIGTERM, pmdie); /* wait for children and shut down */
+ pqsignal(SIGALRM, SIG_IGN); /* ignored */
+ pqsignal(SIGPIPE, SIG_IGN); /* ignored */
+ pqsignal(SIGUSR1, sigusr1_handler); /* message from child process */
+ pqsignal(SIGUSR2, dummy_handler); /* unused, reserve for children */
+ pqsignal(SIGCHLD, reaper); /* handle child termination */
+ pqsignal(SIGTTIN, SIG_IGN); /* ignored */
+ pqsignal(SIGTTOU, SIG_IGN); /* ignored */
+ /* ignore SIGXFSZ, so that ulimit violations work like disk full */
+ #ifdef SIGXFSZ
+ pqsignal(SIGXFSZ, SIG_IGN); /* ignored */
+ #endif
+
+ /*
+ * If enabled, start up syslogger collection subprocess
+ */
+ SysLoggerPID = SysLogger_Start();
+
+ /*
+ * Reset whereToSendOutput from DestDebug (its starting state) to
+ * DestNone. This stops ereport from sending log messages to stderr unless
+ * Log_destination permits. We don't do this until the postmaster is
+ * fully launched, since startup failures may as well be reported to
+ * stderr.
+ */
+ whereToSendOutput = DestNone;
+
+ /*
+ * Initialize stats collection subsystem (this does NOT start the
+ * collector process!)
+ */
+ pgstat_init();
+
+ /*
+ * Initialize the autovacuum subsystem (again, no process start yet)
+ */
+ autovac_init();
+
+ /*
+ * Load configuration files for client authentication.
+ */
+ if (!load_hba())
+ {
+ /*
+ * It makes no sense to continue if we fail to load the HBA file,
+ * since there is no way to connect to the database in this case.
+ */
+ ereport(FATAL,
+ (errmsg("could not load pg_hba.conf")));
+ }
+ load_ident();
+
+ /*
+ * Remember postmaster startup time
+ */
+ PgStartTime = GetCurrentTimestamp();
+ /* PostmasterRandom wants its own copy */
+ gettimeofday(&random_start_time, NULL);
+
+ /*
+ * We're ready to rock and roll...
+ */
+ StartupPID = StartupDataBase();
+ Assert(StartupPID != 0);
+ pmState = PM_STARTUP;
+
+ status = ServerLoop();
+
+ /*
+ * ServerLoop probably shouldn't ever return, but if it does, close down.
+ */
+ ExitPostmaster(status != STATUS_OK);
+
+ return 0; /* not reached */
+ }
+
+
+ /*
+ * Compute and check the directory paths to files that are part of the
+ * installation (as deduced from the postgres executable's own location)
+ */
+ static void
+ getInstallationPaths(const char *argv0)
+ {
+ DIR *pdir;
+
+ /* Locate the postgres executable itself */
+ if (find_my_exec(argv0, my_exec_path) < 0)
+ elog(FATAL, "%s: could not locate my own executable path", argv0);
+
+ #ifdef EXEC_BACKEND
+ /* Locate executable backend before we change working directory */
+ if (find_other_exec(argv0, "postgres", PG_BACKEND_VERSIONSTR,
+ postgres_exec_path) < 0)
+ ereport(FATAL,
+ (errmsg("%s: could not locate matching postgres executable",
+ argv0)));
+ #endif
+
+ /*
+ * Locate the pkglib directory --- this has to be set early in case we try
+ * to load any modules from it in response to postgresql.conf entries.
+ */
+ get_pkglib_path(my_exec_path, pkglib_path);
+
+ /*
+ * Verify that there's a readable directory there; otherwise the Postgres
+ * installation is incomplete or corrupt. (A typical cause of this
+ * failure is that the postgres executable has been moved or hardlinked to
+ * some directory that's not a sibling of the installation lib/
+ * directory.)
+ */
+ pdir = AllocateDir(pkglib_path);
+ if (pdir == NULL)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not open directory \"%s\": %m",
+ pkglib_path),
+ errhint("This may indicate an incomplete PostgreSQL installation, or that the file \"%s\" has been moved away from its proper location.",
+ my_exec_path)));
+ FreeDir(pdir);
+
+ /*
+ * XXX is it worth similarly checking the share/ directory? If the lib/
+ * directory is there, then share/ probably is too.
+ */
+ }
+
+
+ /*
+ * Validate the proposed data directory
+ */
+ static void
+ checkDataDir(void)
+ {
+ char path[MAXPGPATH];
+ FILE *fp;
+ struct stat stat_buf;
+
+ Assert(DataDir);
+
+ if (stat(DataDir, &stat_buf) != 0)
+ {
+ if (errno == ENOENT)
+ ereport(FATAL,
+ (errcode_for_file_access(),
+ errmsg("data directory \"%s\" does not exist",
+ DataDir)));
+ else
+ ereport(FATAL,
+ (errcode_for_file_access(),
+ errmsg("could not read permissions of directory \"%s\": %m",
+ DataDir)));
+ }
+
+ /* eventual chdir would fail anyway, but let's test ... */
+ if (!S_ISDIR(stat_buf.st_mode))
+ ereport(FATAL,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("specified data directory \"%s\" is not a directory",
+ DataDir)));
+
+ /*
+ * Check that the directory belongs to my userid; if not, reject.
+ *
+ * This check is an essential part of the interlock that prevents two
+ * postmasters from starting in the same directory (see CreateLockFile()).
+ * Do not remove or weaken it.
+ *
+ * XXX can we safely enable this check on Windows?
+ */
+ #if !defined(WIN32) && !defined(__CYGWIN__)
+ if (stat_buf.st_uid != geteuid())
+ ereport(FATAL,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("data directory \"%s\" has wrong ownership",
+ DataDir),
+ errhint("The server must be started by the user that owns the data directory.")));
+ #endif
+
+ /*
+ * Check if the directory has group or world access. If so, reject.
+ *
+ * It would be possible to allow weaker constraints (for example, allow
+ * group access) but we cannot make a general assumption that that is
+ * okay; for example there are platforms where nearly all users
+ * customarily belong to the same group. Perhaps this test should be
+ * configurable.
+ *
+ * XXX temporarily suppress check when on Windows, because there may not
+ * be proper support for Unix-y file permissions. Need to think of a
+ * reasonable check to apply on Windows.
+ */
+ #if !defined(WIN32) && !defined(__CYGWIN__)
+ if (stat_buf.st_mode & (S_IRWXG | S_IRWXO))
+ ereport(FATAL,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("data directory \"%s\" has group or world access",
+ DataDir),
+ errdetail("Permissions should be u=rwx (0700).")));
+ #endif
+
+ /* Look for PG_VERSION before looking for pg_control */
+ ValidatePgVersion(DataDir);
+
+ snprintf(path, sizeof(path), "%s/global/pg_control", DataDir);
+
+ fp = AllocateFile(path, PG_BINARY_R);
+ if (fp == NULL)
+ {
+ write_stderr("%s: could not find the database system\n"
+ "Expected to find it in the directory \"%s\",\n"
+ "but could not open file \"%s\": %s\n",
+ progname, DataDir, path, strerror(errno));
+ ExitPostmaster(2);
+ }
+ FreeFile(fp);
+ }
+
+ /*
+ * Main idle loop of postmaster
+ */
+ static int
+ ServerLoop(void)
+ {
+ fd_set readmask;
+ int nSockets;
+ time_t now,
+ last_touch_time;
+
+ last_touch_time = time(NULL);
+
+ nSockets = initMasks(&readmask);
+
+ for (;;)
+ {
+ fd_set rmask;
+ int selres;
+
+ /*
+ * Wait for a connection request to arrive.
+ *
+ * We wait at most one minute, to ensure that the other background
+ * tasks handled below get done even when no requests are arriving.
+ *
+ * If we are in PM_WAIT_DEAD_END state, then we don't want to accept
+ * any new connections, so we don't call select() at all; just sleep
+ * for a little bit with signals unblocked.
+ */
+ memcpy((char *) &rmask, (char *) &readmask, sizeof(fd_set));
+
+ PG_SETMASK(&UnBlockSig);
+
+ if (pmState == PM_WAIT_DEAD_END)
+ {
+ pg_usleep(100000L); /* 100 msec seems reasonable */
+ selres = 0;
+ }
+ else
+ {
+ /* must set timeout each time; some OSes change it! */
+ struct timeval timeout;
+
+ timeout.tv_sec = 60;
+ timeout.tv_usec = 0;
+
+ selres = select(nSockets, &rmask, NULL, NULL, &timeout);
+ }
+
+ /*
+ * Block all signals until we wait again. (This makes it safe for our
+ * signal handlers to do nontrivial work.)
+ */
+ PG_SETMASK(&BlockSig);
+
+ /* Now check the select() result */
+ if (selres < 0)
+ {
+ if (errno != EINTR && errno != EWOULDBLOCK)
+ {
+ ereport(LOG,
+ (errcode_for_socket_access(),
+ errmsg("select() failed in postmaster: %m")));
+ return STATUS_ERROR;
+ }
+ }
+
+ /*
+ * New connection pending on any of our sockets? If so, fork a child
+ * process to deal with it.
+ */
+ if (selres > 0)
+ {
+ int i;
+
+ for (i = 0; i < MAXLISTEN; i++)
+ {
+ if (ListenSocket[i] == PGINVALID_SOCKET)
+ break;
+ if (FD_ISSET(ListenSocket[i], &rmask))
+ {
+ Port *port;
+
+ port = ConnCreate(ListenSocket[i]);
+ if (port)
+ {
+ BackendStartup(port);
+
+ /*
+ * We no longer need the open socket or port structure
+ * in this process
+ */
+ StreamClose(port->sock);
+ ConnFree(port);
+ }
+ }
+ }
+ }
+
+ /* If we have lost the log collector, try to start a new one */
+ if (SysLoggerPID == 0 && Logging_collector)
+ SysLoggerPID = SysLogger_Start();
+
+ /*
+ * If no background writer process is running, and we are not in a
+ * state that prevents it, start one. It doesn't matter if this
+ * fails, we'll just try again later.
+ */
+ if (BgWriterPID == 0 &&
+ (pmState == PM_RUN || pmState == PM_RECOVERY ||
+ pmState == PM_HOT_STANDBY))
+ BgWriterPID = StartBackgroundWriter();
+
+ /*
+ * Likewise, if we have lost the walwriter process, try to start a new
+ * one.
+ */
+ if (WalWriterPID == 0 && pmState == PM_RUN)
+ WalWriterPID = StartWalWriter();
+
+ /*
+ * If we have lost the autovacuum launcher, try to start a new one. We
+ * don't want autovacuum to run in binary upgrade mode because
+ * autovacuum might update relfrozenxid for empty tables before the
+ * physical files are put in place.
+ */
+ if (!IsBinaryUpgrade && AutoVacPID == 0 &&
+ (AutoVacuumingActive() || start_autovac_launcher) &&
+ pmState == PM_RUN)
+ {
+ AutoVacPID = StartAutoVacLauncher();
+ if (AutoVacPID != 0)
+ start_autovac_launcher = false; /* signal processed */
+ }
+
+ /* If we have lost the archiver, try to start a new one */
+ if (XLogArchivingActive() && PgArchPID == 0 && pmState == PM_RUN)
+ PgArchPID = pgarch_start();
+
+ /* If we have lost the stats collector, try to start a new one */
+ if (PgStatPID == 0 && pmState == PM_RUN)
+ PgStatPID = pgstat_start();
+
+ /* If we need to signal the autovacuum launcher, do so now */
+ if (avlauncher_needs_signal)
+ {
+ avlauncher_needs_signal = false;
+ if (AutoVacPID != 0)
+ kill(AutoVacPID, SIGUSR2);
+ }
+
+ /*
+ * Touch the socket and lock file every 58 minutes, to ensure that
+ * they are not removed by overzealous /tmp-cleaning tasks. We assume
+ * no one runs cleaners with cutoff times of less than an hour ...
+ */
+ now = time(NULL);
+ if (now - last_touch_time >= 58 * SECS_PER_MINUTE)
+ {
+ TouchSocketFile();
+ TouchSocketLockFile();
+ last_touch_time = now;
+ }
+ }
+ }
+
+
+ /*
+ * Initialise the masks for select() for the ports we are listening on.
+ * Return the number of sockets to listen on.
+ */
+ static int
+ initMasks(fd_set *rmask)
+ {
+ int maxsock = -1;
+ int i;
+
+ FD_ZERO(rmask);
+
+ for (i = 0; i < MAXLISTEN; i++)
+ {
+ int fd = ListenSocket[i];
+
+ if (fd == PGINVALID_SOCKET)
+ break;
+ FD_SET(fd, rmask);
+
+ if (fd > maxsock)
+ maxsock = fd;
+ }
+
+ return maxsock + 1;
+ }
+
+
+ /*
+ * Read a client's startup packet and do something according to it.
+ *
+ * Returns STATUS_OK or STATUS_ERROR, or might call ereport(FATAL) and
+ * not return at all.
+ *
+ * (Note that ereport(FATAL) stuff is sent to the client, so only use it
+ * if that's what you want. Return STATUS_ERROR if you don't want to
+ * send anything to the client, which would typically be appropriate
+ * if we detect a communications failure.)
+ */
+ static int
+ ProcessStartupPacket(Port *port, bool SSLdone)
+ {
+ int32 len;
+ void *buf;
+ ProtocolVersion proto;
+ MemoryContext oldcontext;
+
+ if (pq_getbytes((char *) &len, 4) == EOF)
+ {
+ /*
+ * EOF after SSLdone probably means the client didn't like our
+ * response to NEGOTIATE_SSL_CODE. That's not an error condition, so
+ * don't clutter the log with a complaint.
+ */
+ if (!SSLdone)
+ ereport(COMMERROR,
+ (errcode(ERRCODE_PROTOCOL_VIOLATION),
+ errmsg("incomplete startup packet")));
+ return STATUS_ERROR;
+ }
+
+ len = ntohl(len);
+ len -= 4;
+
+ if (len < (int32) sizeof(ProtocolVersion) ||
+ len > MAX_STARTUP_PACKET_LENGTH)
+ {
+ ereport(COMMERROR,
+ (errcode(ERRCODE_PROTOCOL_VIOLATION),
+ errmsg("invalid length of startup packet")));
+ return STATUS_ERROR;
+ }
+
+ /*
+ * Allocate at least the size of an old-style startup packet, plus one
+ * extra byte, and make sure all are zeroes. This ensures we will have
+ * null termination of all strings, in both fixed- and variable-length
+ * packet layouts.
+ */
+ if (len <= (int32) sizeof(StartupPacket))
+ buf = palloc0(sizeof(StartupPacket) + 1);
+ else
+ buf = palloc0(len + 1);
+
+ if (pq_getbytes(buf, len) == EOF)
+ {
+ ereport(COMMERROR,
+ (errcode(ERRCODE_PROTOCOL_VIOLATION),
+ errmsg("incomplete startup packet")));
+ return STATUS_ERROR;
+ }
+
+ /*
+ * The first field is either a protocol version number or a special
+ * request code.
+ */
+ port->proto = proto = ntohl(*((ProtocolVersion *) buf));
+
+ if (proto == CANCEL_REQUEST_CODE)
+ {
+ processCancelRequest(port, buf);
+ /* Not really an error, but we don't want to proceed further */
+ return STATUS_ERROR;
+ }
+
+ if (proto == NEGOTIATE_SSL_CODE && !SSLdone)
+ {
+ char SSLok;
+
+ #ifdef USE_SSL
+ /* No SSL when disabled or on Unix sockets */
+ if (!EnableSSL || IS_AF_UNIX(port->laddr.addr.ss_family))
+ SSLok = 'N';
+ else
+ SSLok = 'S'; /* Support for SSL */
+ #else
+ SSLok = 'N'; /* No support for SSL */
+ #endif
+
+ retry1:
+ if (send(port->sock, &SSLok, 1, 0) != 1)
+ {
+ if (errno == EINTR)
+ goto retry1; /* if interrupted, just retry */
+ ereport(COMMERROR,
+ (errcode_for_socket_access(),
+ errmsg("failed to send SSL negotiation response: %m")));
+ return STATUS_ERROR; /* close the connection */
+ }
+
+ #ifdef USE_SSL
+ if (SSLok == 'S' && secure_open_server(port) == -1)
+ return STATUS_ERROR;
+ #endif
+ /* regular startup packet, cancel, etc packet should follow... */
+ /* but not another SSL negotiation request */
+ return ProcessStartupPacket(port, true);
+ }
+
+ /* Could add additional special packet types here */
+
+ /*
+ * Set FrontendProtocol now so that ereport() knows what format to send if
+ * we fail during startup.
+ */
+ FrontendProtocol = proto;
+
+ /* Check we can handle the protocol the frontend is using. */
+
+ if (PG_PROTOCOL_MAJOR(proto) < PG_PROTOCOL_MAJOR(PG_PROTOCOL_EARLIEST) ||
+ PG_PROTOCOL_MAJOR(proto) > PG_PROTOCOL_MAJOR(PG_PROTOCOL_LATEST) ||
+ (PG_PROTOCOL_MAJOR(proto) == PG_PROTOCOL_MAJOR(PG_PROTOCOL_LATEST) &&
+ PG_PROTOCOL_MINOR(proto) > PG_PROTOCOL_MINOR(PG_PROTOCOL_LATEST)))
+ ereport(FATAL,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("unsupported frontend protocol %u.%u: server supports %u.0 to %u.%u",
+ PG_PROTOCOL_MAJOR(proto), PG_PROTOCOL_MINOR(proto),
+ PG_PROTOCOL_MAJOR(PG_PROTOCOL_EARLIEST),
+ PG_PROTOCOL_MAJOR(PG_PROTOCOL_LATEST),
+ PG_PROTOCOL_MINOR(PG_PROTOCOL_LATEST))));
+
+ /*
+ * Now fetch parameters out of startup packet and save them into the Port
+ * structure. All data structures attached to the Port struct must be
+ * allocated in TopMemoryContext so that they will remain available in a
+ * running backend (even after PostmasterContext is destroyed). We need
+ * not worry about leaking this storage on failure, since we aren't in the
+ * postmaster process anymore.
+ */
+ oldcontext = MemoryContextSwitchTo(TopMemoryContext);
+
+ if (PG_PROTOCOL_MAJOR(proto) >= 3)
+ {
+ int32 offset = sizeof(ProtocolVersion);
+
+ /*
+ * Scan packet body for name/option pairs. We can assume any string
+ * beginning within the packet body is null-terminated, thanks to
+ * zeroing extra byte above.
+ */
+ port->guc_options = NIL;
+
+ while (offset < len)
+ {
+ char *nameptr = ((char *) buf) + offset;
+ int32 valoffset;
+ char *valptr;
+
+ if (*nameptr == '\0')
+ break; /* found packet terminator */
+ valoffset = offset + strlen(nameptr) + 1;
+ if (valoffset >= len)
+ break; /* missing value, will complain below */
+ valptr = ((char *) buf) + valoffset;
+
+ if (strcmp(nameptr, "database") == 0)
+ port->database_name = pstrdup(valptr);
+ else if (strcmp(nameptr, "user") == 0)
+ port->user_name = pstrdup(valptr);
+ else if (strcmp(nameptr, "options") == 0)
+ port->cmdline_options = pstrdup(valptr);
+ else if (strcmp(nameptr, "replication") == 0)
+ {
+ if (!parse_bool(valptr, &am_walsender))
+ ereport(FATAL,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("invalid value for boolean option \"replication\"")));
+ }
+ else
+ {
+ /* Assume it's a generic GUC option */
+ port->guc_options = lappend(port->guc_options,
+ pstrdup(nameptr));
+ port->guc_options = lappend(port->guc_options,
+ pstrdup(valptr));
+ }
+ offset = valoffset + strlen(valptr) + 1;
+ }
+
+ /*
+ * If we didn't find a packet terminator exactly at the end of the
+ * given packet length, complain.
+ */
+ if (offset != len - 1)
+ ereport(FATAL,
+ (errcode(ERRCODE_PROTOCOL_VIOLATION),
+ errmsg("invalid startup packet layout: expected terminator as last byte")));
+ }
+ else
+ {
+ /*
+ * Get the parameters from the old-style, fixed-width-fields startup
+ * packet as C strings. The packet destination was cleared first so a
+ * short packet has zeros silently added. We have to be prepared to
+ * truncate the pstrdup result for oversize fields, though.
+ */
+ StartupPacket *packet = (StartupPacket *) buf;
+
+ port->database_name = pstrdup(packet->database);
+ if (strlen(port->database_name) > sizeof(packet->database))
+ port->database_name[sizeof(packet->database)] = '\0';
+ port->user_name = pstrdup(packet->user);
+ if (strlen(port->user_name) > sizeof(packet->user))
+ port->user_name[sizeof(packet->user)] = '\0';
+ port->cmdline_options = pstrdup(packet->options);
+ if (strlen(port->cmdline_options) > sizeof(packet->options))
+ port->cmdline_options[sizeof(packet->options)] = '\0';
+ port->guc_options = NIL;
+ }
+
+ /* Check a user name was given. */
+ if (port->user_name == NULL || port->user_name[0] == '\0')
+ ereport(FATAL,
+ (errcode(ERRCODE_INVALID_AUTHORIZATION_SPECIFICATION),
+ errmsg("no PostgreSQL user name specified in startup packet")));
+
+ /* The database defaults to the user name. */
+ if (port->database_name == NULL || port->database_name[0] == '\0')
+ port->database_name = pstrdup(port->user_name);
+
+ if (Db_user_namespace)
+ {
+ /*
+ * If user@, it is a global user, remove '@'. We only want to do this
+ * if there is an '@' at the end and no earlier in the user string or
+ * they may fake as a local user of another database attaching to this
+ * database.
+ */
+ if (strchr(port->user_name, '@') ==
+ port->user_name + strlen(port->user_name) - 1)
+ *strchr(port->user_name, '@') = '\0';
+ else
+ {
+ /* Append '@' and dbname */
+ char *db_user;
+
+ db_user = palloc(strlen(port->user_name) +
+ strlen(port->database_name) + 2);
+ sprintf(db_user, "%s@%s", port->user_name, port->database_name);
+ port->user_name = db_user;
+ }
+ }
+
+ /*
+ * Truncate given database and user names to length of a Postgres name.
+ * This avoids lookup failures when overlength names are given.
+ */
+ if (strlen(port->database_name) >= NAMEDATALEN)
+ port->database_name[NAMEDATALEN - 1] = '\0';
+ if (strlen(port->user_name) >= NAMEDATALEN)
+ port->user_name[NAMEDATALEN - 1] = '\0';
+
+ /* Walsender is not related to a particular database */
+ if (am_walsender)
+ port->database_name[0] = '\0';
+
+ /*
+ * Done putting stuff in TopMemoryContext.
+ */
+ MemoryContextSwitchTo(oldcontext);
+
+ /*
+ * If we're going to reject the connection due to database state, say so
+ * now instead of wasting cycles on an authentication exchange. (This also
+ * allows a pg_ping utility to be written.)
+ */
+ switch (port->canAcceptConnections)
+ {
+ case CAC_STARTUP:
+ ereport(FATAL,
+ (errcode(ERRCODE_CANNOT_CONNECT_NOW),
+ errmsg("the database system is starting up")));
+ break;
+ case CAC_SHUTDOWN:
+ ereport(FATAL,
+ (errcode(ERRCODE_CANNOT_CONNECT_NOW),
+ errmsg("the database system is shutting down")));
+ break;
+ case CAC_RECOVERY:
+ ereport(FATAL,
+ (errcode(ERRCODE_CANNOT_CONNECT_NOW),
+ errmsg("the database system is in recovery mode")));
+ break;
+ case CAC_TOOMANY:
+ ereport(FATAL,
+ (errcode(ERRCODE_TOO_MANY_CONNECTIONS),
+ errmsg("sorry, too many clients already")));
+ break;
+ case CAC_WAITBACKUP:
+ /* OK for now, will check in InitPostgres */
+ break;
+ case CAC_OK:
+ break;
+ }
+
+ return STATUS_OK;
+ }
+
+
+ /*
+ * The client has sent a cancel request packet, not a normal
+ * start-a-new-connection packet. Perform the necessary processing.
+ * Nothing is sent back to the client.
+ */
+ static void
+ processCancelRequest(Port *port, void *pkt)
+ {
+ CancelRequestPacket *canc = (CancelRequestPacket *) pkt;
+ int backendPID;
+ long cancelAuthCode;
+ Backend *bp;
+
+ #ifndef EXEC_BACKEND
+ Dlelem *curr;
+ #else
+ int i;
+ #endif
+
+ backendPID = (int) ntohl(canc->backendPID);
+ cancelAuthCode = (long) ntohl(canc->cancelAuthCode);
+
+ /*
+ * See if we have a matching backend. In the EXEC_BACKEND case, we can no
+ * longer access the postmaster's own backend list, and must rely on the
+ * duplicate array in shared memory.
+ */
+ #ifndef EXEC_BACKEND
+ for (curr = DLGetHead(BackendList); curr; curr = DLGetSucc(curr))
+ {
+ bp = (Backend *) DLE_VAL(curr);
+ #else
+ for (i = MaxLivePostmasterChildren() - 1; i >= 0; i--)
+ {
+ bp = (Backend *) &ShmemBackendArray[i];
+ #endif
+ if (bp->pid == backendPID)
+ {
+ if (bp->cancel_key == cancelAuthCode)
+ {
+ /* Found a match; signal that backend to cancel current op */
+ ereport(DEBUG2,
+ (errmsg_internal("processing cancel request: sending SIGINT to process %d",
+ backendPID)));
+ signal_child(bp->pid, SIGINT);
+ }
+ else
+ /* Right PID, wrong key: no way, Jose */
+ ereport(LOG,
+ (errmsg("wrong key in cancel request for process %d",
+ backendPID)));
+ return;
+ }
+ }
+
+ /* No matching backend */
+ ereport(LOG,
+ (errmsg("PID %d in cancel request did not match any process",
+ backendPID)));
+ }
+
+ /*
+ * canAcceptConnections --- check to see if database state allows connections.
+ */
+ static CAC_state
+ canAcceptConnections(void)
+ {
+ CAC_state result = CAC_OK;
+
+ /*
+ * Can't start backends when in startup/shutdown/inconsistent recovery
+ * state.
+ *
+ * In state PM_WAIT_BACKUP only superusers can connect (this must be
+ * allowed so that a superuser can end online backup mode); we return
+ * CAC_WAITBACKUP code to indicate that this must be checked later. Note
+ * that neither CAC_OK nor CAC_WAITBACKUP can safely be returned until we
+ * have checked for too many children.
+ */
+ if (pmState != PM_RUN)
+ {
+ if (pmState == PM_WAIT_BACKUP)
+ result = CAC_WAITBACKUP; /* allow superusers only */
+ else if (Shutdown > NoShutdown)
+ return CAC_SHUTDOWN; /* shutdown is pending */
+ else if (!FatalError &&
+ (pmState == PM_STARTUP ||
+ pmState == PM_RECOVERY))
+ return CAC_STARTUP; /* normal startup */
+ else if (!FatalError &&
+ pmState == PM_HOT_STANDBY)
+ result = CAC_OK; /* connection OK during hot standby */
+ else
+ return CAC_RECOVERY; /* else must be crash recovery */
+ }
+
+ /*
+ * Don't start too many children.
+ *
+ * We allow more connections than we can have backends here because some
+ * might still be authenticating; they might fail auth, or some existing
+ * backend might exit before the auth cycle is completed. The exact
+ * MaxBackends limit is enforced when a new backend tries to join the
+ * shared-inval backend array.
+ *
+ * The limit here must match the sizes of the per-child-process arrays;
+ * see comments for MaxLivePostmasterChildren().
+ */
+ if (CountChildren(BACKEND_TYPE_ALL) >= MaxLivePostmasterChildren())
+ result = CAC_TOOMANY;
+
+ return result;
+ }
+
+
+ /*
+ * ConnCreate -- create a local connection data structure
+ *
+ * Returns NULL on failure, other than out-of-memory which is fatal.
+ */
+ static Port *
+ ConnCreate(int serverFd)
+ {
+ Port *port;
+
+ if (!(port = (Port *) calloc(1, sizeof(Port))))
+ {
+ ereport(LOG,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+ ExitPostmaster(1);
+ }
+
+ if (StreamConnection(serverFd, port) != STATUS_OK)
+ {
+ if (port->sock >= 0)
+ StreamClose(port->sock);
+ ConnFree(port);
+ return NULL;
+ }
+
+ /*
+ * Precompute password salt values to use for this connection. It's
+ * slightly annoying to do this long in advance of knowing whether we'll
+ * need 'em or not, but we must do the random() calls before we fork, not
+ * after. Else the postmaster's random sequence won't get advanced, and
+ * all backends would end up using the same salt...
+ */
+ RandomSalt(port->md5Salt);
+
+ /*
+ * Allocate GSSAPI specific state struct
+ */
+ #ifndef EXEC_BACKEND
+ #if defined(ENABLE_GSS) || defined(ENABLE_SSPI)
+ port->gss = (pg_gssinfo *) calloc(1, sizeof(pg_gssinfo));
+ if (!port->gss)
+ {
+ ereport(LOG,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+ ExitPostmaster(1);
+ }
+ #endif
+ #endif
+
+ return port;
+ }
+
+
+ /*
+ * ConnFree -- free a local connection data structure
+ */
+ static void
+ ConnFree(Port *conn)
+ {
+ #ifdef USE_SSL
+ secure_close(conn);
+ #endif
+ if (conn->gss)
+ free(conn->gss);
+ free(conn);
+ }
+
+
+ /*
+ * ClosePostmasterPorts -- close all the postmaster's open sockets
+ *
+ * This is called during child process startup to release file descriptors
+ * that are not needed by that child process. The postmaster still has
+ * them open, of course.
+ *
+ * Note: we pass am_syslogger as a boolean because we don't want to set
+ * the global variable yet when this is called.
+ */
+ void
+ ClosePostmasterPorts(bool am_syslogger)
+ {
+ int i;
+
+ #ifndef WIN32
+ /*
+ * Close the write end of postmaster death watch pipe. It's important to
+ * do this as early as possible, so that if postmaster dies, others won't
+ * think that it's still running because we're holding the pipe open.
+ */
+ if (close(postmaster_alive_fds[POSTMASTER_FD_OWN]))
+ ereport(FATAL,
+ (errcode_for_file_access(),
+ errmsg_internal("could not close postmaster death monitoring pipe in child process: %m")));
+ postmaster_alive_fds[POSTMASTER_FD_OWN] = -1;
+ #endif
+
+ /* Close the listen sockets */
+ for (i = 0; i < MAXLISTEN; i++)
+ {
+ if (ListenSocket[i] != PGINVALID_SOCKET)
+ {
+ StreamClose(ListenSocket[i]);
+ ListenSocket[i] = PGINVALID_SOCKET;
+ }
+ }
+
+ /* If using syslogger, close the read side of the pipe */
+ if (!am_syslogger)
+ {
+ #ifndef WIN32
+ if (syslogPipe[0] >= 0)
+ close(syslogPipe[0]);
+ syslogPipe[0] = -1;
+ #else
+ if (syslogPipe[0])
+ CloseHandle(syslogPipe[0]);
+ syslogPipe[0] = 0;
+ #endif
+ }
+
+ #ifdef USE_BONJOUR
+ /* If using Bonjour, close the connection to the mDNS daemon */
+ if (bonjour_sdref)
+ close(DNSServiceRefSockFD(bonjour_sdref));
+ #endif
+ }
+
+
+ /*
+ * reset_shared -- reset shared memory and semaphores
+ */
+ static void
+ reset_shared(int port)
+ {
+ /*
+ * Create or re-create shared memory and semaphores.
+ *
+ * Note: in each "cycle of life" we will normally assign the same IPC keys
+ * (if using SysV shmem and/or semas), since the port number is used to
+ * determine IPC keys. This helps ensure that we will clean up dead IPC
+ * objects if the postmaster crashes and is restarted.
+ */
+ CreateSharedMemoryAndSemaphores(false, port);
+ }
+
+
+ /*
+ * SIGHUP -- reread config files, and tell children to do same
+ */
+ static void
+ SIGHUP_handler(SIGNAL_ARGS)
+ {
+ int save_errno = errno;
+
+ PG_SETMASK(&BlockSig);
+
+ if (Shutdown <= SmartShutdown)
+ {
+ ereport(LOG,
+ (errmsg("received SIGHUP, reloading configuration files")));
+ ProcessConfigFile(PGC_SIGHUP);
+ SignalChildren(SIGHUP);
+ if (StartupPID != 0)
+ signal_child(StartupPID, SIGHUP);
+ if (BgWriterPID != 0)
+ signal_child(BgWriterPID, SIGHUP);
+ if (WalWriterPID != 0)
+ signal_child(WalWriterPID, SIGHUP);
+ if (WalReceiverPID != 0)
+ signal_child(WalReceiverPID, SIGHUP);
+ if (AutoVacPID != 0)
+ signal_child(AutoVacPID, SIGHUP);
+ if (PgArchPID != 0)
+ signal_child(PgArchPID, SIGHUP);
+ if (SysLoggerPID != 0)
+ signal_child(SysLoggerPID, SIGHUP);
+ if (PgStatPID != 0)
+ signal_child(PgStatPID, SIGHUP);
+
+ /* Reload authentication config files too */
+ if (!load_hba())
+ ereport(WARNING,
+ (errmsg("pg_hba.conf not reloaded")));
+
+ load_ident();
+
+ #ifdef EXEC_BACKEND
+ /* Update the starting-point file for future children */
+ write_nondefault_variables(PGC_SIGHUP);
+ #endif
+ }
+
+ PG_SETMASK(&UnBlockSig);
+
+ errno = save_errno;
+ }
+
+
+ /*
+ * pmdie -- signal handler for processing various postmaster signals.
+ */
+ static void
+ pmdie(SIGNAL_ARGS)
+ {
+ int save_errno = errno;
+
+ PG_SETMASK(&BlockSig);
+
+ ereport(DEBUG2,
+ (errmsg_internal("postmaster received signal %d",
+ postgres_signal_arg)));
+
+ switch (postgres_signal_arg)
+ {
+ case SIGTERM:
+
+ /*
+ * Smart Shutdown:
+ *
+ * Wait for children to end their work, then shut down.
+ */
+ if (Shutdown >= SmartShutdown)
+ break;
+ Shutdown = SmartShutdown;
+ ereport(LOG,
+ (errmsg("received smart shutdown request")));
+
+ if (pmState == PM_RUN || pmState == PM_RECOVERY ||
+ pmState == PM_HOT_STANDBY || pmState == PM_STARTUP)
+ {
+ /* autovacuum workers are told to shut down immediately */
+ SignalSomeChildren(SIGTERM, BACKEND_TYPE_AUTOVAC);
+ /* and the autovac launcher too */
+ if (AutoVacPID != 0)
+ signal_child(AutoVacPID, SIGTERM);
+ /* and the walwriter too */
+ if (WalWriterPID != 0)
+ signal_child(WalWriterPID, SIGTERM);
+
+ /*
+ * If we're in recovery, we can't kill the startup process
+ * right away, because at present doing so does not release
+ * its locks. We might want to change this in a future
+ * release. For the time being, the PM_WAIT_READONLY state
+ * indicates that we're waiting for the regular (read only)
+ * backends to die off; once they do, we'll kill the startup
+ * and walreceiver processes.
+ */
+ pmState = (pmState == PM_RUN) ?
+ PM_WAIT_BACKUP : PM_WAIT_READONLY;
+ }
+
+ /*
+ * Now wait for online backup mode to end and backends to exit. If
+ * that is already the case, PostmasterStateMachine will take the
+ * next step.
+ */
+ PostmasterStateMachine();
+ break;
+
+ case SIGINT:
+
+ /*
+ * Fast Shutdown:
+ *
+ * Abort all children with SIGTERM (rollback active transactions
+ * and exit) and shut down when they are gone.
+ */
+ if (Shutdown >= FastShutdown)
+ break;
+ Shutdown = FastShutdown;
+ ereport(LOG,
+ (errmsg("received fast shutdown request")));
+
+ if (StartupPID != 0)
+ signal_child(StartupPID, SIGTERM);
+ if (WalReceiverPID != 0)
+ signal_child(WalReceiverPID, SIGTERM);
+ if (pmState == PM_RECOVERY)
+ {
+ /* only bgwriter is active in this state */
+ pmState = PM_WAIT_BACKENDS;
+ }
+ else if (pmState == PM_RUN ||
+ pmState == PM_WAIT_BACKUP ||
+ pmState == PM_WAIT_READONLY ||
+ pmState == PM_WAIT_BACKENDS ||
+ pmState == PM_HOT_STANDBY)
+ {
+ ereport(LOG,
+ (errmsg("aborting any active transactions")));
+ /* shut down all backends and autovac workers */
+ SignalSomeChildren(SIGTERM,
+ BACKEND_TYPE_NORMAL | BACKEND_TYPE_AUTOVAC);
+ /* and the autovac launcher too */
+ if (AutoVacPID != 0)
+ signal_child(AutoVacPID, SIGTERM);
+ /* and the walwriter too */
+ if (WalWriterPID != 0)
+ signal_child(WalWriterPID, SIGTERM);
+ pmState = PM_WAIT_BACKENDS;
+ }
+
+ /*
+ * Now wait for backends to exit. If there are none,
+ * PostmasterStateMachine will take the next step.
+ */
+ PostmasterStateMachine();
+ break;
+
+ case SIGQUIT:
+
+ /*
+ * Immediate Shutdown:
+ *
+ * abort all children with SIGQUIT and exit without attempt to
+ * properly shut down data base system.
+ */
+ ereport(LOG,
+ (errmsg("received immediate shutdown request")));
+ SignalChildren(SIGQUIT);
+ if (StartupPID != 0)
+ signal_child(StartupPID, SIGQUIT);
+ if (BgWriterPID != 0)
+ signal_child(BgWriterPID, SIGQUIT);
+ if (WalWriterPID != 0)
+ signal_child(WalWriterPID, SIGQUIT);
+ if (WalReceiverPID != 0)
+ signal_child(WalReceiverPID, SIGQUIT);
+ if (AutoVacPID != 0)
+ signal_child(AutoVacPID, SIGQUIT);
+ if (PgArchPID != 0)
+ signal_child(PgArchPID, SIGQUIT);
+ if (PgStatPID != 0)
+ signal_child(PgStatPID, SIGQUIT);
+ ExitPostmaster(0);
+ break;
+ }
+
+ PG_SETMASK(&UnBlockSig);
+
+ errno = save_errno;
+ }
+
+ /*
+ * Reaper -- signal handler to cleanup after a child process dies.
+ */
+ static void
+ reaper(SIGNAL_ARGS)
+ {
+ int save_errno = errno;
+ int pid; /* process id of dead child process */
+ int exitstatus; /* its exit status */
+
+ /* These macros hide platform variations in getting child status */
+ #ifdef HAVE_WAITPID
+ int status; /* child exit status */
+
+ #define LOOPTEST() ((pid = waitpid(-1, &status, WNOHANG)) > 0)
+ #define LOOPHEADER() (exitstatus = status)
+ #else /* !HAVE_WAITPID */
+ #ifndef WIN32
+ union wait status; /* child exit status */
+
+ #define LOOPTEST() ((pid = wait3(&status, WNOHANG, NULL)) > 0)
+ #define LOOPHEADER() (exitstatus = status.w_status)
+ #else /* WIN32 */
+ #define LOOPTEST() ((pid = win32_waitpid(&exitstatus)) > 0)
+ #define LOOPHEADER()
+ #endif /* WIN32 */
+ #endif /* HAVE_WAITPID */
+
+ PG_SETMASK(&BlockSig);
+
+ ereport(DEBUG4,
+ (errmsg_internal("reaping dead processes")));
+
+ while (LOOPTEST())
+ {
+ LOOPHEADER();
+
+ /*
+ * Check if this child was a startup process.
+ */
+ if (pid == StartupPID)
+ {
+ StartupPID = 0;
+
+ /*
+ * Unexpected exit of startup process (including FATAL exit)
+ * during PM_STARTUP is treated as catastrophic. There are no
+ * other processes running yet, so we can just exit.
+ */
+ if (pmState == PM_STARTUP && !EXIT_STATUS_0(exitstatus))
+ {
+ LogChildExit(LOG, _("startup process"),
+ pid, exitstatus);
+ ereport(LOG,
+ (errmsg("aborting startup due to startup process failure")));
+ ExitPostmaster(1);
+ }
+
+ /*
+ * Startup process exited in response to a shutdown request (or it
+ * completed normally regardless of the shutdown request).
+ */
+ if (Shutdown > NoShutdown &&
+ (EXIT_STATUS_0(exitstatus) || EXIT_STATUS_1(exitstatus)))
+ {
+ pmState = PM_WAIT_BACKENDS;
+ /* PostmasterStateMachine logic does the rest */
+ continue;
+ }
+
+ /*
+ * Any unexpected exit (including FATAL exit) of the startup
+ * process is treated as a crash, except that we don't want to
+ * reinitialize.
+ */
+ if (!EXIT_STATUS_0(exitstatus))
+ {
+ RecoveryError = true;
+ HandleChildCrash(pid, exitstatus,
+ _("startup process"));
+ continue;
+ }
+
+ /*
+ * Startup succeeded, commence normal operations
+ */
+ FatalError = false;
+ ReachedNormalRunning = true;
+ pmState = PM_RUN;
+
+ /*
+ * Kill any walsenders to force the downstream standby(s) to
+ * reread the timeline history file, adjust their timelines and
+ * establish replication connections again. This is required
+ * because the timeline of cascading standby is not consistent
+ * with that of cascaded one just after failover. We LOG this
+ * message since we need to leave a record to explain this
+ * disconnection.
+ *
+ * XXX should avoid the need for disconnection. When we do,
+ * am_cascading_walsender should be replaced with RecoveryInProgress()
+ */
+ if (max_wal_senders > 0 && CountChildren(BACKEND_TYPE_WALSND) > 0)
+ {
+ ereport(LOG,
+ (errmsg("terminating all walsender processes to force cascaded "
+ "standby(s) to update timeline and reconnect")));
+ SignalSomeChildren(SIGUSR2, BACKEND_TYPE_WALSND);
+ }
+
+ /*
+ * Crank up the background writer, if we didn't do that already
+ * when we entered consistent recovery state. It doesn't matter
+ * if this fails, we'll just try again later.
+ */
+ if (BgWriterPID == 0)
+ BgWriterPID = StartBackgroundWriter();
+
+ /*
+ * Likewise, start other special children as needed. In a restart
+ * situation, some of them may be alive already.
+ */
+ if (WalWriterPID == 0)
+ WalWriterPID = StartWalWriter();
+ if (!IsBinaryUpgrade && AutoVacuumingActive() && AutoVacPID == 0)
+ AutoVacPID = StartAutoVacLauncher();
+ if (XLogArchivingActive() && PgArchPID == 0)
+ PgArchPID = pgarch_start();
+ if (PgStatPID == 0)
+ PgStatPID = pgstat_start();
+
+ /* at this point we are really open for business */
+ ereport(LOG,
+ (errmsg("database system is ready to accept connections")));
+
+ continue;
+ }
+
+ /*
+ * Was it the bgwriter?
+ */
+ if (pid == BgWriterPID)
+ {
+ BgWriterPID = 0;
+ if (EXIT_STATUS_0(exitstatus) && pmState == PM_SHUTDOWN)
+ {
+ /*
+ * OK, we saw normal exit of the bgwriter after it's been told
+ * to shut down. We expect that it wrote a shutdown
+ * checkpoint. (If for some reason it didn't, recovery will
+ * occur on next postmaster start.)
+ *
+ * At this point we should have no normal backend children
+ * left (else we'd not be in PM_SHUTDOWN state) but we might
+ * have dead_end children to wait for.
+ *
+ * If we have an archiver subprocess, tell it to do a last
+ * archive cycle and quit. Likewise, if we have walsender
+ * processes, tell them to send any remaining WAL and quit.
+ */
+ Assert(Shutdown > NoShutdown);
+
+ /* Waken archiver for the last time */
+ if (PgArchPID != 0)
+ signal_child(PgArchPID, SIGUSR2);
+
+ /*
+ * Waken walsenders for the last time. No regular backends
+ * should be around anymore.
+ */
+ SignalChildren(SIGUSR2);
+
+ pmState = PM_SHUTDOWN_2;
+
+ /*
+ * We can also shut down the stats collector now; there's
+ * nothing left for it to do.
+ */
+ if (PgStatPID != 0)
+ signal_child(PgStatPID, SIGQUIT);
+ }
+ else
+ {
+ /*
+ * Any unexpected exit of the bgwriter (including FATAL exit)
+ * is treated as a crash.
+ */
+ HandleChildCrash(pid, exitstatus,
+ _("background writer process"));
+ }
+
+ continue;
+ }
+
+ /*
+ * Was it the wal writer? Normal exit can be ignored; we'll start a
+ * new one at the next iteration of the postmaster's main loop, if
+ * necessary. Any other exit condition is treated as a crash.
+ */
+ if (pid == WalWriterPID)
+ {
+ WalWriterPID = 0;
+ if (!EXIT_STATUS_0(exitstatus))
+ HandleChildCrash(pid, exitstatus,
+ _("WAL writer process"));
+ continue;
+ }
+
+ /*
+ * Was it the wal receiver? If exit status is zero (normal) or one
+ * (FATAL exit), we assume everything is all right just like normal
+ * backends.
+ */
+ if (pid == WalReceiverPID)
+ {
+ WalReceiverPID = 0;
+ if (!EXIT_STATUS_0(exitstatus) && !EXIT_STATUS_1(exitstatus))
+ HandleChildCrash(pid, exitstatus,
+ _("WAL receiver process"));
+ continue;
+ }
+
+ /*
+ * Was it the autovacuum launcher? Normal exit can be ignored; we'll
+ * start a new one at the next iteration of the postmaster's main
+ * loop, if necessary. Any other exit condition is treated as a
+ * crash.
+ */
+ if (pid == AutoVacPID)
+ {
+ AutoVacPID = 0;
+ if (!EXIT_STATUS_0(exitstatus))
+ HandleChildCrash(pid, exitstatus,
+ _("autovacuum launcher process"));
+ continue;
+ }
+
+ /*
+ * Was it the archiver? If so, just try to start a new one; no need
+ * to force reset of the rest of the system. (If fail, we'll try
+ * again in future cycles of the main loop.). Unless we were waiting
+ * for it to shut down; don't restart it in that case, and and
+ * PostmasterStateMachine() will advance to the next shutdown step.
+ */
+ if (pid == PgArchPID)
+ {
+ PgArchPID = 0;
+ if (!EXIT_STATUS_0(exitstatus))
+ LogChildExit(LOG, _("archiver process"),
+ pid, exitstatus);
+ if (XLogArchivingActive() && pmState == PM_RUN)
+ PgArchPID = pgarch_start();
+ continue;
+ }
+
+ /*
+ * Was it the statistics collector? If so, just try to start a new
+ * one; no need to force reset of the rest of the system. (If fail,
+ * we'll try again in future cycles of the main loop.)
+ */
+ if (pid == PgStatPID)
+ {
+ PgStatPID = 0;
+ if (!EXIT_STATUS_0(exitstatus))
+ LogChildExit(LOG, _("statistics collector process"),
+ pid, exitstatus);
+ if (pmState == PM_RUN)
+ PgStatPID = pgstat_start();
+ continue;
+ }
+
+ /* Was it the system logger? If so, try to start a new one */
+ if (pid == SysLoggerPID)
+ {
+ SysLoggerPID = 0;
+ /* for safety's sake, launch new logger *first* */
+ SysLoggerPID = SysLogger_Start();
+ if (!EXIT_STATUS_0(exitstatus))
+ LogChildExit(LOG, _("system logger process"),
+ pid, exitstatus);
+ continue;
+ }
+
+ /*
+ * Else do standard backend child cleanup.
+ */
+ CleanupBackend(pid, exitstatus);
+ } /* loop over pending child-death reports */
+
+ /*
+ * After cleaning out the SIGCHLD queue, see if we have any state changes
+ * or actions to make.
+ */
+ PostmasterStateMachine();
+
+ /* Done with signal handler */
+ PG_SETMASK(&UnBlockSig);
+
+ errno = save_errno;
+ }
+
+
+ /*
+ * CleanupBackend -- cleanup after terminated backend.
+ *
+ * Remove all local state associated with backend.
+ */
+ static void
+ CleanupBackend(int pid,
+ int exitstatus) /* child's exit status. */
+ {
+ Dlelem *curr;
+
+ LogChildExit(DEBUG2, _("server process"), pid, exitstatus);
+
+ /*
+ * If a backend dies in an ugly way then we must signal all other backends
+ * to quickdie. If exit status is zero (normal) or one (FATAL exit), we
+ * assume everything is all right and proceed to remove the backend from
+ * the active backend list.
+ */
+ #ifdef WIN32
+
+ /*
+ * On win32, also treat ERROR_WAIT_NO_CHILDREN (128) as nonfatal case,
+ * since that sometimes happens under load when the process fails to start
+ * properly (long before it starts using shared memory). Microsoft reports
+ * it is related to mutex failure:
+ * http://archives.postgresql.org/pgsql-hackers/2010-09/msg00790.php
+ */
+ if (exitstatus == ERROR_WAIT_NO_CHILDREN)
+ {
+ LogChildExit(LOG, _("server process"), pid, exitstatus);
+ exitstatus = 0;
+ }
+ #endif
+
+ if (!EXIT_STATUS_0(exitstatus) && !EXIT_STATUS_1(exitstatus))
+ {
+ HandleChildCrash(pid, exitstatus, _("server process"));
+ return;
+ }
+
+ for (curr = DLGetHead(BackendList); curr; curr = DLGetSucc(curr))
+ {
+ Backend *bp = (Backend *) DLE_VAL(curr);
+
+ if (bp->pid == pid)
+ {
+ if (!bp->dead_end)
+ {
+ if (!ReleasePostmasterChildSlot(bp->child_slot))
+ {
+ /*
+ * Uh-oh, the child failed to clean itself up. Treat as a
+ * crash after all.
+ */
+ HandleChildCrash(pid, exitstatus, _("server process"));
+ return;
+ }
+ #ifdef EXEC_BACKEND
+ ShmemBackendArrayRemove(bp);
+ #endif
+ }
+ DLRemove(curr);
+ free(bp);
+ break;
+ }
+ }
+ }
+
+ /*
+ * HandleChildCrash -- cleanup after failed backend, bgwriter, walwriter,
+ * or autovacuum.
+ *
+ * The objectives here are to clean up our local state about the child
+ * process, and to signal all other remaining children to quickdie.
+ */
+ static void
+ HandleChildCrash(int pid, int exitstatus, const char *procname)
+ {
+ Dlelem *curr,
+ *next;
+ Backend *bp;
+
+ /*
+ * Make log entry unless there was a previous crash (if so, nonzero exit
+ * status is to be expected in SIGQUIT response; don't clutter log)
+ */
+ if (!FatalError)
+ {
+ LogChildExit(LOG, procname, pid, exitstatus);
+ ereport(LOG,
+ (errmsg("terminating any other active server processes")));
+ }
+
+ /* Process regular backends */
+ for (curr = DLGetHead(BackendList); curr; curr = next)
+ {
+ next = DLGetSucc(curr);
+ bp = (Backend *) DLE_VAL(curr);
+ if (bp->pid == pid)
+ {
+ /*
+ * Found entry for freshly-dead backend, so remove it.
+ */
+ if (!bp->dead_end)
+ {
+ (void) ReleasePostmasterChildSlot(bp->child_slot);
+ #ifdef EXEC_BACKEND
+ ShmemBackendArrayRemove(bp);
+ #endif
+ }
+ DLRemove(curr);
+ free(bp);
+ /* Keep looping so we can signal remaining backends */
+ }
+ else
+ {
+ /*
+ * This backend is still alive. Unless we did so already, tell it
+ * to commit hara-kiri.
+ *
+ * SIGQUIT is the special signal that says exit without proc_exit
+ * and let the user know what's going on. But if SendStop is set
+ * (-s on command line), then we send SIGSTOP instead, so that we
+ * can get core dumps from all backends by hand.
+ *
+ * We could exclude dead_end children here, but at least in the
+ * SIGSTOP case it seems better to include them.
+ */
+ if (!FatalError)
+ {
+ ereport(DEBUG2,
+ (errmsg_internal("sending %s to process %d",
+ (SendStop ? "SIGSTOP" : "SIGQUIT"),
+ (int) bp->pid)));
+ signal_child(bp->pid, (SendStop ? SIGSTOP : SIGQUIT));
+ }
+ }
+ }
+
+ /* Take care of the startup process too */
+ if (pid == StartupPID)
+ StartupPID = 0;
+ else if (StartupPID != 0 && !FatalError)
+ {
+ ereport(DEBUG2,
+ (errmsg_internal("sending %s to process %d",
+ (SendStop ? "SIGSTOP" : "SIGQUIT"),
+ (int) StartupPID)));
+ signal_child(StartupPID, (SendStop ? SIGSTOP : SIGQUIT));
+ }
+
+ /* Take care of the bgwriter too */
+ if (pid == BgWriterPID)
+ BgWriterPID = 0;
+ else if (BgWriterPID != 0 && !FatalError)
+ {
+ ereport(DEBUG2,
+ (errmsg_internal("sending %s to process %d",
+ (SendStop ? "SIGSTOP" : "SIGQUIT"),
+ (int) BgWriterPID)));
+ signal_child(BgWriterPID, (SendStop ? SIGSTOP : SIGQUIT));
+ }
+
+ /* Take care of the walwriter too */
+ if (pid == WalWriterPID)
+ WalWriterPID = 0;
+ else if (WalWriterPID != 0 && !FatalError)
+ {
+ ereport(DEBUG2,
+ (errmsg_internal("sending %s to process %d",
+ (SendStop ? "SIGSTOP" : "SIGQUIT"),
+ (int) WalWriterPID)));
+ signal_child(WalWriterPID, (SendStop ? SIGSTOP : SIGQUIT));
+ }
+
+ /* Take care of the walreceiver too */
+ if (pid == WalReceiverPID)
+ WalReceiverPID = 0;
+ else if (WalReceiverPID != 0 && !FatalError)
+ {
+ ereport(DEBUG2,
+ (errmsg_internal("sending %s to process %d",
+ (SendStop ? "SIGSTOP" : "SIGQUIT"),
+ (int) WalReceiverPID)));
+ signal_child(WalReceiverPID, (SendStop ? SIGSTOP : SIGQUIT));
+ }
+
+ /* Take care of the autovacuum launcher too */
+ if (pid == AutoVacPID)
+ AutoVacPID = 0;
+ else if (AutoVacPID != 0 && !FatalError)
+ {
+ ereport(DEBUG2,
+ (errmsg_internal("sending %s to process %d",
+ (SendStop ? "SIGSTOP" : "SIGQUIT"),
+ (int) AutoVacPID)));
+ signal_child(AutoVacPID, (SendStop ? SIGSTOP : SIGQUIT));
+ }
+
+ /*
+ * Force a power-cycle of the pgarch process too. (This isn't absolutely
+ * necessary, but it seems like a good idea for robustness, and it
+ * simplifies the state-machine logic in the case where a shutdown request
+ * arrives during crash processing.)
+ */
+ if (PgArchPID != 0 && !FatalError)
+ {
+ ereport(DEBUG2,
+ (errmsg_internal("sending %s to process %d",
+ "SIGQUIT",
+ (int) PgArchPID)));
+ signal_child(PgArchPID, SIGQUIT);
+ }
+
+ /*
+ * Force a power-cycle of the pgstat process too. (This isn't absolutely
+ * necessary, but it seems like a good idea for robustness, and it
+ * simplifies the state-machine logic in the case where a shutdown request
+ * arrives during crash processing.)
+ */
+ if (PgStatPID != 0 && !FatalError)
+ {
+ ereport(DEBUG2,
+ (errmsg_internal("sending %s to process %d",
+ "SIGQUIT",
+ (int) PgStatPID)));
+ signal_child(PgStatPID, SIGQUIT);
+ allow_immediate_pgstat_restart();
+ }
+
+ /* We do NOT restart the syslogger */
+
+ FatalError = true;
+ /* We now transit into a state of waiting for children to die */
+ if (pmState == PM_RECOVERY ||
+ pmState == PM_HOT_STANDBY ||
+ pmState == PM_RUN ||
+ pmState == PM_WAIT_BACKUP ||
+ pmState == PM_WAIT_READONLY ||
+ pmState == PM_SHUTDOWN)
+ pmState = PM_WAIT_BACKENDS;
+ }
+
+ /*
+ * Log the death of a child process.
+ */
+ static void
+ LogChildExit(int lev, const char *procname, int pid, int exitstatus)
+ {
+ if (WIFEXITED(exitstatus))
+ ereport(lev,
+
+ /*------
+ translator: %s is a noun phrase describing a child process, such as
+ "server process" */
+ (errmsg("%s (PID %d) exited with exit code %d",
+ procname, pid, WEXITSTATUS(exitstatus))));
+ else if (WIFSIGNALED(exitstatus))
+ #if defined(WIN32)
+ ereport(lev,
+
+ /*------
+ translator: %s is a noun phrase describing a child process, such as
+ "server process" */
+ (errmsg("%s (PID %d) was terminated by exception 0x%X",
+ procname, pid, WTERMSIG(exitstatus)),
+ errhint("See C include file \"ntstatus.h\" for a description of the hexadecimal value.")));
+ #elif defined(HAVE_DECL_SYS_SIGLIST) && HAVE_DECL_SYS_SIGLIST
+ ereport(lev,
+
+ /*------
+ translator: %s is a noun phrase describing a child process, such as
+ "server process" */
+ (errmsg("%s (PID %d) was terminated by signal %d: %s",
+ procname, pid, WTERMSIG(exitstatus),
+ WTERMSIG(exitstatus) < NSIG ?
+ sys_siglist[WTERMSIG(exitstatus)] : "(unknown)")));
+ #else
+ ereport(lev,
+
+ /*------
+ translator: %s is a noun phrase describing a child process, such as
+ "server process" */
+ (errmsg("%s (PID %d) was terminated by signal %d",
+ procname, pid, WTERMSIG(exitstatus))));
+ #endif
+ else
+ ereport(lev,
+
+ /*------
+ translator: %s is a noun phrase describing a child process, such as
+ "server process" */
+ (errmsg("%s (PID %d) exited with unrecognized status %d",
+ procname, pid, exitstatus)));
+ }
+
+ /*
+ * Advance the postmaster's state machine and take actions as appropriate
+ *
+ * This is common code for pmdie(), reaper() and sigusr1_handler(), which
+ * receive the signals that might mean we need to change state.
+ */
+ static void
+ PostmasterStateMachine(void)
+ {
+ if (pmState == PM_WAIT_BACKUP)
+ {
+ /*
+ * PM_WAIT_BACKUP state ends when online backup mode is not active.
+ */
+ if (!BackupInProgress())
+ pmState = PM_WAIT_BACKENDS;
+ }
+
+ if (pmState == PM_WAIT_READONLY)
+ {
+ /*
+ * PM_WAIT_READONLY state ends when we have no regular backends that
+ * have been started during recovery. We kill the startup and
+ * walreceiver processes and transition to PM_WAIT_BACKENDS. Ideally,
+ * we might like to kill these processes first and then wait for
+ * backends to die off, but that doesn't work at present because
+ * killing the startup process doesn't release its locks.
+ */
+ if (CountChildren(BACKEND_TYPE_NORMAL) == 0)
+ {
+ if (StartupPID != 0)
+ signal_child(StartupPID, SIGTERM);
+ if (WalReceiverPID != 0)
+ signal_child(WalReceiverPID, SIGTERM);
+ pmState = PM_WAIT_BACKENDS;
+ }
+ }
+
+ /*
+ * If we are in a state-machine state that implies waiting for backends to
+ * exit, see if they're all gone, and change state if so.
+ */
+ if (pmState == PM_WAIT_BACKENDS)
+ {
+ /*
+ * PM_WAIT_BACKENDS state ends when we have no regular backends
+ * (including autovac workers) and no walwriter or autovac launcher.
+ * If we are doing crash recovery then we expect the bgwriter to exit
+ * too, otherwise not. The archiver, stats, and syslogger processes
+ * are disregarded since they are not connected to shared memory; we
+ * also disregard dead_end children here. Walsenders are also
+ * disregarded, they will be terminated later after writing the
+ * checkpoint record, like the archiver process.
+ */
+ if (CountChildren(BACKEND_TYPE_NORMAL | BACKEND_TYPE_AUTOVAC) == 0 &&
+ StartupPID == 0 &&
+ WalReceiverPID == 0 &&
+ (BgWriterPID == 0 || !FatalError) &&
+ WalWriterPID == 0 &&
+ AutoVacPID == 0)
+ {
+ if (FatalError)
+ {
+ /*
+ * Start waiting for dead_end children to die. This state
+ * change causes ServerLoop to stop creating new ones.
+ */
+ pmState = PM_WAIT_DEAD_END;
+
+ /*
+ * We already SIGQUIT'd the archiver and stats processes, if
+ * any, when we entered FatalError state.
+ */
+ }
+ else
+ {
+ /*
+ * If we get here, we are proceeding with normal shutdown. All
+ * the regular children are gone, and it's time to tell the
+ * bgwriter to do a shutdown checkpoint.
+ */
+ Assert(Shutdown > NoShutdown);
+ /* Start the bgwriter if not running */
+ if (BgWriterPID == 0)
+ BgWriterPID = StartBackgroundWriter();
+ /* And tell it to shut down */
+ if (BgWriterPID != 0)
+ {
+ signal_child(BgWriterPID, SIGUSR2);
+ pmState = PM_SHUTDOWN;
+ }
+ else
+ {
+ /*
+ * If we failed to fork a bgwriter, just shut down. Any
+ * required cleanup will happen at next restart. We set
+ * FatalError so that an "abnormal shutdown" message gets
+ * logged when we exit.
+ */
+ FatalError = true;
+ pmState = PM_WAIT_DEAD_END;
+
+ /* Kill the walsenders, archiver and stats collector too */
+ SignalChildren(SIGQUIT);
+ if (PgArchPID != 0)
+ signal_child(PgArchPID, SIGQUIT);
+ if (PgStatPID != 0)
+ signal_child(PgStatPID, SIGQUIT);
+ }
+ }
+ }
+ }
+
+ if (pmState == PM_SHUTDOWN_2)
+ {
+ /*
+ * PM_SHUTDOWN_2 state ends when there's no other children than
+ * dead_end children left. There shouldn't be any regular backends
+ * left by now anyway; what we're really waiting for is walsenders and
+ * archiver.
+ *
+ * Walreceiver should normally be dead by now, but not when a fast
+ * shutdown is performed during recovery.
+ */
+ if (PgArchPID == 0 && CountChildren(BACKEND_TYPE_ALL) == 0 &&
+ WalReceiverPID == 0)
+ {
+ pmState = PM_WAIT_DEAD_END;
+ }
+ }
+
+ if (pmState == PM_WAIT_DEAD_END)
+ {
+ /*
+ * PM_WAIT_DEAD_END state ends when the BackendList is entirely empty
+ * (ie, no dead_end children remain), and the archiver and stats
+ * collector are gone too.
+ *
+ * The reason we wait for those two is to protect them against a new
+ * postmaster starting conflicting subprocesses; this isn't an
+ * ironclad protection, but it at least helps in the
+ * shutdown-and-immediately-restart scenario. Note that they have
+ * already been sent appropriate shutdown signals, either during a
+ * normal state transition leading up to PM_WAIT_DEAD_END, or during
+ * FatalError processing.
+ */
+ if (DLGetHead(BackendList) == NULL &&
+ PgArchPID == 0 && PgStatPID == 0)
+ {
+ /* These other guys should be dead already */
+ Assert(StartupPID == 0);
+ Assert(WalReceiverPID == 0);
+ Assert(BgWriterPID == 0);
+ Assert(WalWriterPID == 0);
+ Assert(AutoVacPID == 0);
+ /* syslogger is not considered here */
+ pmState = PM_NO_CHILDREN;
+ }
+ }
+
+ /*
+ * If we've been told to shut down, we exit as soon as there are no
+ * remaining children. If there was a crash, cleanup will occur at the
+ * next startup. (Before PostgreSQL 8.3, we tried to recover from the
+ * crash before exiting, but that seems unwise if we are quitting because
+ * we got SIGTERM from init --- there may well not be time for recovery
+ * before init decides to SIGKILL us.)
+ *
+ * Note that the syslogger continues to run. It will exit when it sees
+ * EOF on its input pipe, which happens when there are no more upstream
+ * processes.
+ */
+ if (Shutdown > NoShutdown && pmState == PM_NO_CHILDREN)
+ {
+ if (FatalError)
+ {
+ ereport(LOG, (errmsg("abnormal database system shutdown")));
+ ExitPostmaster(1);
+ }
+ else
+ {
+ /*
+ * Terminate backup mode to avoid recovery after a clean fast
+ * shutdown. Since a backup can only be taken during normal
+ * running (and not, for example, while running under Hot Standby)
+ * it only makes sense to do this if we reached normal running. If
+ * we're still in recovery, the backup file is one we're
+ * recovering *from*, and we must keep it around so that recovery
+ * restarts from the right place.
+ */
+ if (ReachedNormalRunning)
+ CancelBackup();
+
+ /* Normal exit from the postmaster is here */
+ ExitPostmaster(0);
+ }
+ }
+
+ /*
+ * If recovery failed, or the user does not want an automatic restart
+ * after backend crashes, wait for all non-syslogger children to exit, and
+ * then exit postmaster. We don't try to reinitialize when recovery fails,
+ * because more than likely it will just fail again and we will keep
+ * trying forever.
+ */
+ if (pmState == PM_NO_CHILDREN && (RecoveryError || !restart_after_crash))
+ ExitPostmaster(1);
+
+ /*
+ * If we need to recover from a crash, wait for all non-syslogger children
+ * to exit, then reset shmem and StartupDataBase.
+ */
+ if (FatalError && pmState == PM_NO_CHILDREN)
+ {
+ ereport(LOG,
+ (errmsg("all server processes terminated; reinitializing")));
+
+ shmem_exit(1);
+ reset_shared(PostPortNumber);
+
+ StartupPID = StartupDataBase();
+ Assert(StartupPID != 0);
+ pmState = PM_STARTUP;
+ }
+ }
+
+
+ /*
+ * Send a signal to a postmaster child process
+ *
+ * On systems that have setsid(), each child process sets itself up as a
+ * process group leader. For signals that are generally interpreted in the
+ * appropriate fashion, we signal the entire process group not just the
+ * direct child process. This allows us to, for example, SIGQUIT a blocked
+ * archive_recovery script, or SIGINT a script being run by a backend via
+ * system().
+ *
+ * There is a race condition for recently-forked children: they might not
+ * have executed setsid() yet. So we signal the child directly as well as
+ * the group. We assume such a child will handle the signal before trying
+ * to spawn any grandchild processes. We also assume that signaling the
+ * child twice will not cause any problems.
+ */
+ static void
+ signal_child(pid_t pid, int signal)
+ {
+ if (kill(pid, signal) < 0)
+ elog(DEBUG3, "kill(%ld,%d) failed: %m", (long) pid, signal);
+ #ifdef HAVE_SETSID
+ switch (signal)
+ {
+ case SIGINT:
+ case SIGTERM:
+ case SIGQUIT:
+ case SIGSTOP:
+ if (kill(-pid, signal) < 0)
+ elog(DEBUG3, "kill(%ld,%d) failed: %m", (long) (-pid), signal);
+ break;
+ default:
+ break;
+ }
+ #endif
+ }
+
+ /*
+ * Send a signal to the targeted children (but NOT special children;
+ * dead_end children are never signaled, either).
+ */
+ static bool
+ SignalSomeChildren(int signal, int target)
+ {
+ Dlelem *curr;
+ bool signaled = false;
+
+ for (curr = DLGetHead(BackendList); curr; curr = DLGetSucc(curr))
+ {
+ Backend *bp = (Backend *) DLE_VAL(curr);
+
+ if (bp->dead_end)
+ continue;
+
+ /*
+ * Since target == BACKEND_TYPE_ALL is the most common case, we test
+ * it first and avoid touching shared memory for every child.
+ */
+ if (target != BACKEND_TYPE_ALL)
+ {
+ int child;
+
+ if (bp->is_autovacuum)
+ child = BACKEND_TYPE_AUTOVAC;
+ else if (IsPostmasterChildWalSender(bp->child_slot))
+ child = BACKEND_TYPE_WALSND;
+ else
+ child = BACKEND_TYPE_NORMAL;
+ if (!(target & child))
+ continue;
+ }
+
+ ereport(DEBUG4,
+ (errmsg_internal("sending signal %d to process %d",
+ signal, (int) bp->pid)));
+ signal_child(bp->pid, signal);
+ signaled = true;
+ }
+ return signaled;
+ }
+
+ /*
+ * BackendStartup -- start backend process
+ *
+ * returns: STATUS_ERROR if the fork failed, STATUS_OK otherwise.
+ *
+ * Note: if you change this code, also consider StartAutovacuumWorker.
+ */
+ static int
+ BackendStartup(Port *port)
+ {
+ Backend *bn; /* for backend cleanup */
+ pid_t pid;
+
+ /*
+ * Create backend data structure. Better before the fork() so we can
+ * handle failure cleanly.
+ */
+ bn = (Backend *) malloc(sizeof(Backend));
+ if (!bn)
+ {
+ ereport(LOG,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+ return STATUS_ERROR;
+ }
+
+ /*
+ * Compute the cancel key that will be assigned to this backend. The
+ * backend will have its own copy in the forked-off process' value of
+ * MyCancelKey, so that it can transmit the key to the frontend.
+ */
+ MyCancelKey = PostmasterRandom();
+ bn->cancel_key = MyCancelKey;
+
+ /* Pass down canAcceptConnections state */
+ port->canAcceptConnections = canAcceptConnections();
+ bn->dead_end = (port->canAcceptConnections != CAC_OK &&
+ port->canAcceptConnections != CAC_WAITBACKUP);
+
+ /*
+ * Unless it's a dead_end child, assign it a child slot number
+ */
+ if (!bn->dead_end)
+ bn->child_slot = MyPMChildSlot = AssignPostmasterChildSlot();
+ else
+ bn->child_slot = 0;
+
+ #ifdef EXEC_BACKEND
+ pid = backend_forkexec(port);
+ #else /* !EXEC_BACKEND */
+ pid = fork_process();
+ if (pid == 0) /* child */
+ {
+ free(bn);
+
+ /*
+ * Let's clean up ourselves as the postmaster child, and close the
+ * postmaster's listen sockets. (In EXEC_BACKEND case this is all
+ * done in SubPostmasterMain.)
+ */
+ IsUnderPostmaster = true; /* we are a postmaster subprocess now */
+
+ MyProcPid = getpid(); /* reset MyProcPid */
+
+ MyStartTime = time(NULL);
+
+ /* We don't want the postmaster's proc_exit() handlers */
+ on_exit_reset();
+
+ /* Close the postmaster's sockets */
+ ClosePostmasterPorts(false);
+
+ /* Perform additional initialization and collect startup packet */
+ BackendInitialize(port);
+
+ /* And run the backend */
+ proc_exit(BackendRun(port));
+ }
+ #endif /* EXEC_BACKEND */
+
+ if (pid < 0)
+ {
+ /* in parent, fork failed */
+ int save_errno = errno;
+
+ if (!bn->dead_end)
+ (void) ReleasePostmasterChildSlot(bn->child_slot);
+ free(bn);
+ errno = save_errno;
+ ereport(LOG,
+ (errmsg("could not fork new process for connection: %m")));
+ report_fork_failure_to_client(port, save_errno);
+ return STATUS_ERROR;
+ }
+
+ /* in parent, successful fork */
+ ereport(DEBUG2,
+ (errmsg_internal("forked new backend, pid=%d socket=%d",
+ (int) pid, (int) port->sock)));
+
+ /*
+ * Everything's been successful, it's safe to add this backend to our list
+ * of backends.
+ */
+ bn->pid = pid;
+ bn->is_autovacuum = false;
+ DLInitElem(&bn->elem, bn);
+ DLAddHead(BackendList, &bn->elem);
+ #ifdef EXEC_BACKEND
+ if (!bn->dead_end)
+ ShmemBackendArrayAdd(bn);
+ #endif
+
+ return STATUS_OK;
+ }
+
+ /*
+ * Try to report backend fork() failure to client before we close the
+ * connection. Since we do not care to risk blocking the postmaster on
+ * this connection, we set the connection to non-blocking and try only once.
+ *
+ * This is grungy special-purpose code; we cannot use backend libpq since
+ * it's not up and running.
+ */
+ static void
+ report_fork_failure_to_client(Port *port, int errnum)
+ {
+ char buffer[1000];
+ int rc;
+
+ /* Format the error message packet (always V2 protocol) */
+ snprintf(buffer, sizeof(buffer), "E%s%s\n",
+ _("could not fork new process for connection: "),
+ strerror(errnum));
+
+ /* Set port to non-blocking. Don't do send() if this fails */
+ if (!pg_set_noblock(port->sock))
+ return;
+
+ /* We'll retry after EINTR, but ignore all other failures */
+ do
+ {
+ rc = send(port->sock, buffer, strlen(buffer) + 1, 0);
+ } while (rc < 0 && errno == EINTR);
+ }
+
+
+ /*
+ * BackendInitialize -- initialize an interactive (postmaster-child)
+ * backend process, and collect the client's startup packet.
+ *
+ * returns: nothing. Will not return at all if there's any failure.
+ *
+ * Note: this code does not depend on having any access to shared memory.
+ * In the EXEC_BACKEND case, we are physically attached to shared memory
+ * but have not yet set up most of our local pointers to shmem structures.
+ */
+ static void
+ BackendInitialize(Port *port)
+ {
+ int status;
+ char remote_host[NI_MAXHOST];
+ char remote_port[NI_MAXSERV];
+ char remote_ps_data[NI_MAXHOST];
+
+ /* Save port etc. for ps status */
+ MyProcPort = port;
+
+ /*
+ * PreAuthDelay is a debugging aid for investigating problems in the
+ * authentication cycle: it can be set in postgresql.conf to allow time to
+ * attach to the newly-forked backend with a debugger. (See also
+ * PostAuthDelay, which we allow clients to pass through PGOPTIONS, but it
+ * is not honored until after authentication.)
+ */
+ if (PreAuthDelay > 0)
+ pg_usleep(PreAuthDelay * 1000000L);
+
+ /* This flag will remain set until InitPostgres finishes authentication */
+ ClientAuthInProgress = true; /* limit visibility of log messages */
+
+ /* save process start time */
+ port->SessionStartTime = GetCurrentTimestamp();
+ MyStartTime = timestamptz_to_time_t(port->SessionStartTime);
+
+ /* set these to empty in case they are needed before we set them up */
+ port->remote_host = "";
+ port->remote_port = "";
+
+ /*
+ * Initialize libpq and enable reporting of ereport errors to the client.
+ * Must do this now because authentication uses libpq to send messages.
+ */
+ pq_init(); /* initialize libpq to talk to client */
+ whereToSendOutput = DestRemote; /* now safe to ereport to client */
+
+ /*
+ * If possible, make this process a group leader, so that the postmaster
+ * can signal any child processes too. (We do this now on the off chance
+ * that something might spawn a child process during authentication.)
+ */
+ #ifdef HAVE_SETSID
+ if (setsid() < 0)
+ elog(FATAL, "setsid() failed: %m");
+ #endif
+
+ /*
+ * We arrange for a simple exit(1) if we receive SIGTERM or SIGQUIT or
+ * timeout while trying to collect the startup packet. Otherwise the
+ * postmaster cannot shutdown the database FAST or IMMED cleanly if a
+ * buggy client fails to send the packet promptly.
+ */
+ pqsignal(SIGTERM, startup_die);
+ pqsignal(SIGQUIT, startup_die);
+ pqsignal(SIGALRM, startup_die);
+ PG_SETMASK(&StartupBlockSig);
+
+ /*
+ * Get the remote host name and port for logging and status display.
+ */
+ remote_host[0] = '\0';
+ remote_port[0] = '\0';
+ if (pg_getnameinfo_all(&port->raddr.addr, port->raddr.salen,
+ remote_host, sizeof(remote_host),
+ remote_port, sizeof(remote_port),
+ (log_hostname ? 0 : NI_NUMERICHOST) | NI_NUMERICSERV) != 0)
+ {
+ int ret = pg_getnameinfo_all(&port->raddr.addr, port->raddr.salen,
+ remote_host, sizeof(remote_host),
+ remote_port, sizeof(remote_port),
+ NI_NUMERICHOST | NI_NUMERICSERV);
+
+ if (ret != 0)
+ ereport(WARNING,
+ (errmsg_internal("pg_getnameinfo_all() failed: %s",
+ gai_strerror(ret))));
+ }
+ if (remote_port[0] == '\0')
+ snprintf(remote_ps_data, sizeof(remote_ps_data), "%s", remote_host);
+ else
+ snprintf(remote_ps_data, sizeof(remote_ps_data), "%s(%s)", remote_host, remote_port);
+
+ if (Log_connections)
+ {
+ if (remote_port[0])
+ ereport(LOG,
+ (errmsg("connection received: host=%s port=%s",
+ remote_host,
+ remote_port)));
+ else
+ ereport(LOG,
+ (errmsg("connection received: host=%s",
+ remote_host)));
+ }
+
+ /*
+ * save remote_host and remote_port in port structure
+ */
+ port->remote_host = strdup(remote_host);
+ port->remote_port = strdup(remote_port);
+ if (log_hostname)
+ port->remote_hostname = port->remote_host;
+
+ /*
+ * Ready to begin client interaction. We will give up and exit(1) after a
+ * time delay, so that a broken client can't hog a connection
+ * indefinitely. PreAuthDelay and any DNS interactions above don't count
+ * against the time limit.
+ */
+ if (!enable_sig_alarm(AuthenticationTimeout * 1000, false))
+ elog(FATAL, "could not set timer for startup packet timeout");
+
+ /*
+ * Receive the startup packet (which might turn out to be a cancel request
+ * packet).
+ */
+ status = ProcessStartupPacket(port, false);
+
+ /*
+ * Stop here if it was bad or a cancel packet. ProcessStartupPacket
+ * already did any appropriate error reporting.
+ */
+ if (status != STATUS_OK)
+ proc_exit(0);
+
+ /*
+ * Now that we have the user and database name, we can set the process
+ * title for ps. It's good to do this as early as possible in startup.
+ *
+ * For a walsender, the ps display is set in the following form:
+ *
+ * postgres: wal sender process
+ *
+ * To achieve that, we pass "wal sender process" as username and username
+ * as dbname to init_ps_display(). XXX: should add a new variant of
+ * init_ps_display() to avoid abusing the parameters like this.
+ */
+ if (am_walsender)
+ init_ps_display("wal sender process", port->user_name, remote_ps_data,
+ update_process_title ? "authentication" : "");
+ else
+ init_ps_display(port->user_name, port->database_name, remote_ps_data,
+ update_process_title ? "authentication" : "");
+
+ /*
+ * Disable the timeout, and prevent SIGTERM/SIGQUIT again.
+ */
+ if (!disable_sig_alarm(false))
+ elog(FATAL, "could not disable timer for startup packet timeout");
+ PG_SETMASK(&BlockSig);
+ }
+
+
+ /*
+ * BackendRun -- set up the backend's argument list and invoke PostgresMain()
+ *
+ * returns:
+ * Shouldn't return at all.
+ * If PostgresMain() fails, return status.
+ */
+ static int
+ BackendRun(Port *port)
+ {
+ char **av;
+ int maxac;
+ int ac;
+ long secs;
+ int usecs;
+ int i;
+
+ /*
+ * Don't want backend to be able to see the postmaster random number
+ * generator state. We have to clobber the static random_seed *and* start
+ * a new random sequence in the random() library function.
+ */
+ random_seed = 0;
+ random_start_time.tv_usec = 0;
+ /* slightly hacky way to get integer microseconds part of timestamptz */
+ TimestampDifference(0, port->SessionStartTime, &secs, &usecs);
+ srandom((unsigned int) (MyProcPid ^ usecs));
+
+ /*
+ * Now, build the argv vector that will be given to PostgresMain.
+ *
+ * The maximum possible number of commandline arguments that could come
+ * from ExtraOptions is (strlen(ExtraOptions) + 1) / 2; see
+ * pg_split_opts().
+ */
+ maxac = 5; /* for fixed args supplied below */
+ maxac += (strlen(ExtraOptions) + 1) / 2;
+
+ av = (char **) MemoryContextAlloc(TopMemoryContext,
+ maxac * sizeof(char *));
+ ac = 0;
+
+ av[ac++] = "postgres";
+
+ /*
+ * Pass any backend switches specified with -o on the postmaster's own
+ * command line. We assume these are secure. (It's OK to mangle
+ * ExtraOptions now, since we're safely inside a subprocess.)
+ */
+ pg_split_opts(av, &ac, ExtraOptions);
+
+ /*
+ * Tell the backend which database to use.
+ */
+ av[ac++] = port->database_name;
+
+ av[ac] = NULL;
+
+ Assert(ac < maxac);
+
+ /*
+ * Debug: print arguments being passed to backend
+ */
+ ereport(DEBUG3,
+ (errmsg_internal("%s child[%d]: starting with (",
+ progname, (int) getpid())));
+ for (i = 0; i < ac; ++i)
+ ereport(DEBUG3,
+ (errmsg_internal("\t%s", av[i])));
+ ereport(DEBUG3,
+ (errmsg_internal(")")));
+
+ /*
+ * Make sure we aren't in PostmasterContext anymore. (We can't delete it
+ * just yet, though, because InitPostgres will need the HBA data.)
+ */
+ MemoryContextSwitchTo(TopMemoryContext);
+
+ return (PostgresMain(ac, av, port->user_name));
+ }
+
+
+ #ifdef EXEC_BACKEND
+
+ /*
+ * postmaster_forkexec -- fork and exec a postmaster subprocess
+ *
+ * The caller must have set up the argv array already, except for argv[2]
+ * which will be filled with the name of the temp variable file.
+ *
+ * Returns the child process PID, or -1 on fork failure (a suitable error
+ * message has been logged on failure).
+ *
+ * All uses of this routine will dispatch to SubPostmasterMain in the
+ * child process.
+ */
+ pid_t
+ postmaster_forkexec(int argc, char *argv[])
+ {
+ Port port;
+
+ /* This entry point passes dummy values for the Port variables */
+ memset(&port, 0, sizeof(port));
+ return internal_forkexec(argc, argv, &port);
+ }
+
+ /*
+ * backend_forkexec -- fork/exec off a backend process
+ *
+ * Some operating systems (WIN32) don't have fork() so we have to simulate
+ * it by storing parameters that need to be passed to the child and
+ * then create a new child process.
+ *
+ * returns the pid of the fork/exec'd process, or -1 on failure
+ */
+ static pid_t
+ backend_forkexec(Port *port)
+ {
+ char *av[4];
+ int ac = 0;
+
+ av[ac++] = "postgres";
+ av[ac++] = "--forkbackend";
+ av[ac++] = NULL; /* filled in by internal_forkexec */
+
+ av[ac] = NULL;
+ Assert(ac < lengthof(av));
+
+ return internal_forkexec(ac, av, port);
+ }
+
+ #ifndef WIN32
+
+ /*
+ * internal_forkexec non-win32 implementation
+ *
+ * - writes out backend variables to the parameter file
+ * - fork():s, and then exec():s the child process
+ */
+ static pid_t
+ internal_forkexec(int argc, char *argv[], Port *port)
+ {
+ static unsigned long tmpBackendFileNum = 0;
+ pid_t pid;
+ char tmpfilename[MAXPGPATH];
+ BackendParameters param;
+ FILE *fp;
+
+ if (!save_backend_variables(¶m, port))
+ return -1; /* log made by save_backend_variables */
+
+ /* Calculate name for temp file */
+ snprintf(tmpfilename, MAXPGPATH, "%s/%s.backend_var.%d.%lu",
+ PG_TEMP_FILES_DIR, PG_TEMP_FILE_PREFIX,
+ MyProcPid, ++tmpBackendFileNum);
+
+ /* Open file */
+ fp = AllocateFile(tmpfilename, PG_BINARY_W);
+ if (!fp)
+ {
+ /* As in OpenTemporaryFile, try to make the temp-file directory */
+ mkdir(PG_TEMP_FILES_DIR, S_IRWXU);
+
+ fp = AllocateFile(tmpfilename, PG_BINARY_W);
+ if (!fp)
+ {
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not create file \"%s\": %m",
+ tmpfilename)));
+ return -1;
+ }
+ }
+
+ if (fwrite(¶m, sizeof(param), 1, fp) != 1)
+ {
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not write to file \"%s\": %m", tmpfilename)));
+ FreeFile(fp);
+ return -1;
+ }
+
+ /* Release file */
+ if (FreeFile(fp))
+ {
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not write to file \"%s\": %m", tmpfilename)));
+ return -1;
+ }
+
+ /* Make sure caller set up argv properly */
+ Assert(argc >= 3);
+ Assert(argv[argc] == NULL);
+ Assert(strncmp(argv[1], "--fork", 6) == 0);
+ Assert(argv[2] == NULL);
+
+ /* Insert temp file name after --fork argument */
+ argv[2] = tmpfilename;
+
+ /* Fire off execv in child */
+ if ((pid = fork_process()) == 0)
+ {
+ if (execv(postgres_exec_path, argv) < 0)
+ {
+ ereport(LOG,
+ (errmsg("could not execute server process \"%s\": %m",
+ postgres_exec_path)));
+ /* We're already in the child process here, can't return */
+ exit(1);
+ }
+ }
+
+ return pid; /* Parent returns pid, or -1 on fork failure */
+ }
+ #else /* WIN32 */
+
+ /*
+ * internal_forkexec win32 implementation
+ *
+ * - starts backend using CreateProcess(), in suspended state
+ * - writes out backend variables to the parameter file
+ * - during this, duplicates handles and sockets required for
+ * inheritance into the new process
+ * - resumes execution of the new process once the backend parameter
+ * file is complete.
+ */
+ static pid_t
+ internal_forkexec(int argc, char *argv[], Port *port)
+ {
+ STARTUPINFO si;
+ PROCESS_INFORMATION pi;
+ int i;
+ int j;
+ char cmdLine[MAXPGPATH * 2];
+ HANDLE paramHandle;
+ BackendParameters *param;
+ SECURITY_ATTRIBUTES sa;
+ char paramHandleStr[32];
+ win32_deadchild_waitinfo *childinfo;
+
+ /* Make sure caller set up argv properly */
+ Assert(argc >= 3);
+ Assert(argv[argc] == NULL);
+ Assert(strncmp(argv[1], "--fork", 6) == 0);
+ Assert(argv[2] == NULL);
+
+ /* Set up shared memory for parameter passing */
+ ZeroMemory(&sa, sizeof(sa));
+ sa.nLength = sizeof(sa);
+ sa.bInheritHandle = TRUE;
+ paramHandle = CreateFileMapping(INVALID_HANDLE_VALUE,
+ &sa,
+ PAGE_READWRITE,
+ 0,
+ sizeof(BackendParameters),
+ NULL);
+ if (paramHandle == INVALID_HANDLE_VALUE)
+ {
+ elog(LOG, "could not create backend parameter file mapping: error code %lu",
+ GetLastError());
+ return -1;
+ }
+
+ param = MapViewOfFile(paramHandle, FILE_MAP_WRITE, 0, 0, sizeof(BackendParameters));
+ if (!param)
+ {
+ elog(LOG, "could not map backend parameter memory: error code %lu",
+ GetLastError());
+ CloseHandle(paramHandle);
+ return -1;
+ }
+
+ /* Insert temp file name after --fork argument */
+ #ifdef _WIN64
+ sprintf(paramHandleStr, "%llu", (LONG_PTR) paramHandle);
+ #else
+ sprintf(paramHandleStr, "%lu", (DWORD) paramHandle);
+ #endif
+ argv[2] = paramHandleStr;
+
+ /* Format the cmd line */
+ cmdLine[sizeof(cmdLine) - 1] = '\0';
+ cmdLine[sizeof(cmdLine) - 2] = '\0';
+ snprintf(cmdLine, sizeof(cmdLine) - 1, "\"%s\"", postgres_exec_path);
+ i = 0;
+ while (argv[++i] != NULL)
+ {
+ j = strlen(cmdLine);
+ snprintf(cmdLine + j, sizeof(cmdLine) - 1 - j, " \"%s\"", argv[i]);
+ }
+ if (cmdLine[sizeof(cmdLine) - 2] != '\0')
+ {
+ elog(LOG, "subprocess command line too long");
+ return -1;
+ }
+
+ memset(&pi, 0, sizeof(pi));
+ memset(&si, 0, sizeof(si));
+ si.cb = sizeof(si);
+
+ /*
+ * Create the subprocess in a suspended state. This will be resumed later,
+ * once we have written out the parameter file.
+ */
+ if (!CreateProcess(NULL, cmdLine, NULL, NULL, TRUE, CREATE_SUSPENDED,
+ NULL, NULL, &si, &pi))
+ {
+ elog(LOG, "CreateProcess call failed: %m (error code %lu)",
+ GetLastError());
+ return -1;
+ }
+
+ if (!save_backend_variables(param, port, pi.hProcess, pi.dwProcessId))
+ {
+ /*
+ * log made by save_backend_variables, but we have to clean up the
+ * mess with the half-started process
+ */
+ if (!TerminateProcess(pi.hProcess, 255))
+ ereport(LOG,
+ (errmsg_internal("could not terminate unstarted process: error code %lu",
+ GetLastError())));
+ CloseHandle(pi.hProcess);
+ CloseHandle(pi.hThread);
+ return -1; /* log made by save_backend_variables */
+ }
+
+ /* Drop the parameter shared memory that is now inherited to the backend */
+ if (!UnmapViewOfFile(param))
+ elog(LOG, "could not unmap view of backend parameter file: error code %lu",
+ GetLastError());
+ if (!CloseHandle(paramHandle))
+ elog(LOG, "could not close handle to backend parameter file: error code %lu",
+ GetLastError());
+
+ /*
+ * Reserve the memory region used by our main shared memory segment before
+ * we resume the child process.
+ */
+ if (!pgwin32_ReserveSharedMemoryRegion(pi.hProcess))
+ {
+ /*
+ * Failed to reserve the memory, so terminate the newly created
+ * process and give up.
+ */
+ if (!TerminateProcess(pi.hProcess, 255))
+ ereport(LOG,
+ (errmsg_internal("could not terminate process that failed to reserve memory: error code %lu",
+ GetLastError())));
+ CloseHandle(pi.hProcess);
+ CloseHandle(pi.hThread);
+ return -1; /* logging done made by
+ * pgwin32_ReserveSharedMemoryRegion() */
+ }
+
+ /*
+ * Now that the backend variables are written out, we start the child
+ * thread so it can start initializing while we set up the rest of the
+ * parent state.
+ */
+ if (ResumeThread(pi.hThread) == -1)
+ {
+ if (!TerminateProcess(pi.hProcess, 255))
+ {
+ ereport(LOG,
+ (errmsg_internal("could not terminate unstartable process: error code %lu",
+ GetLastError())));
+ CloseHandle(pi.hProcess);
+ CloseHandle(pi.hThread);
+ return -1;
+ }
+ CloseHandle(pi.hProcess);
+ CloseHandle(pi.hThread);
+ ereport(LOG,
+ (errmsg_internal("could not resume thread of unstarted process: error code %lu",
+ GetLastError())));
+ return -1;
+ }
+
+ /*
+ * Queue a waiter for to signal when this child dies. The wait will be
+ * handled automatically by an operating system thread pool.
+ *
+ * Note: use malloc instead of palloc, since it needs to be thread-safe.
+ * Struct will be free():d from the callback function that runs on a
+ * different thread.
+ */
+ childinfo = malloc(sizeof(win32_deadchild_waitinfo));
+ if (!childinfo)
+ ereport(FATAL,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+
+ childinfo->procHandle = pi.hProcess;
+ childinfo->procId = pi.dwProcessId;
+
+ if (!RegisterWaitForSingleObject(&childinfo->waitHandle,
+ pi.hProcess,
+ pgwin32_deadchild_callback,
+ childinfo,
+ INFINITE,
+ WT_EXECUTEONLYONCE | WT_EXECUTEINWAITTHREAD))
+ ereport(FATAL,
+ (errmsg_internal("could not register process for wait: error code %lu",
+ GetLastError())));
+
+ /* Don't close pi.hProcess here - the wait thread needs access to it */
+
+ CloseHandle(pi.hThread);
+
+ return pi.dwProcessId;
+ }
+ #endif /* WIN32 */
+
+
+ /*
+ * SubPostmasterMain -- Get the fork/exec'd process into a state equivalent
+ * to what it would be if we'd simply forked on Unix, and then
+ * dispatch to the appropriate place.
+ *
+ * The first two command line arguments are expected to be "--forkFOO"
+ * (where FOO indicates which postmaster child we are to become), and
+ * the name of a variables file that we can read to load data that would
+ * have been inherited by fork() on Unix. Remaining arguments go to the
+ * subprocess FooMain() routine.
+ */
+ int
+ SubPostmasterMain(int argc, char *argv[])
+ {
+ Port port;
+
+ /* Do this sooner rather than later... */
+ IsUnderPostmaster = true; /* we are a postmaster subprocess now */
+
+ MyProcPid = getpid(); /* reset MyProcPid */
+
+ MyStartTime = time(NULL);
+
+ /*
+ * make sure stderr is in binary mode before anything can possibly be
+ * written to it, in case it's actually the syslogger pipe, so the pipe
+ * chunking protocol isn't disturbed. Non-logpipe data gets translated on
+ * redirection (e.g. via pg_ctl -l) anyway.
+ */
+ #ifdef WIN32
+ _setmode(fileno(stderr), _O_BINARY);
+ #endif
+
+ /* Lose the postmaster's on-exit routines (really a no-op) */
+ on_exit_reset();
+
+ /* In EXEC_BACKEND case we will not have inherited these settings */
+ IsPostmasterEnvironment = true;
+ whereToSendOutput = DestNone;
+
+ /* Setup essential subsystems (to ensure elog() behaves sanely) */
+ MemoryContextInit();
+ InitializeGUCOptions();
+
+ /* Read in the variables file */
+ memset(&port, 0, sizeof(Port));
+ read_backend_variables(argv[2], &port);
+
+ /*
+ * Set up memory area for GSS information. Mirrors the code in ConnCreate
+ * for the non-exec case.
+ */
+ #if defined(ENABLE_GSS) || defined(ENABLE_SSPI)
+ port.gss = (pg_gssinfo *) calloc(1, sizeof(pg_gssinfo));
+ if (!port.gss)
+ ereport(FATAL,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+ #endif
+
+ /* Check we got appropriate args */
+ if (argc < 3)
+ elog(FATAL, "invalid subpostmaster invocation");
+
+ /*
+ * If appropriate, physically re-attach to shared memory segment. We want
+ * to do this before going any further to ensure that we can attach at the
+ * same address the postmaster used.
+ */
+ if (strcmp(argv[1], "--forkbackend") == 0 ||
+ strcmp(argv[1], "--forkavlauncher") == 0 ||
+ strcmp(argv[1], "--forkavworker") == 0 ||
+ strcmp(argv[1], "--forkboot") == 0)
+ PGSharedMemoryReAttach();
+
+ /* autovacuum needs this set before calling InitProcess */
+ if (strcmp(argv[1], "--forkavlauncher") == 0)
+ AutovacuumLauncherIAm();
+ if (strcmp(argv[1], "--forkavworker") == 0)
+ AutovacuumWorkerIAm();
+
+ /*
+ * Start our win32 signal implementation. This has to be done after we
+ * read the backend variables, because we need to pick up the signal pipe
+ * from the parent process.
+ */
+ #ifdef WIN32
+ pgwin32_signal_initialize();
+ #endif
+
+ /* In EXEC_BACKEND case we will not have inherited these settings */
+ pqinitmask();
+ PG_SETMASK(&BlockSig);
+
+ /* Read in remaining GUC variables */
+ read_nondefault_variables();
+
+ /*
+ * Reload any libraries that were preloaded by the postmaster. Since we
+ * exec'd this process, those libraries didn't come along with us; but we
+ * should load them into all child processes to be consistent with the
+ * non-EXEC_BACKEND behavior.
+ */
+ process_shared_preload_libraries();
+
+ /* Run backend or appropriate child */
+ if (strcmp(argv[1], "--forkbackend") == 0)
+ {
+ Assert(argc == 3); /* shouldn't be any more args */
+
+ /* Close the postmaster's sockets */
+ ClosePostmasterPorts(false);
+
+ /*
+ * Need to reinitialize the SSL library in the backend, since the
+ * context structures contain function pointers and cannot be passed
+ * through the parameter file.
+ *
+ * XXX should we do this in all child processes? For the moment it's
+ * enough to do it in backend children.
+ */
+ #ifdef USE_SSL
+ if (EnableSSL)
+ secure_initialize();
+ #endif
+
+ /*
+ * Perform additional initialization and collect startup packet.
+ *
+ * We want to do this before InitProcess() for a couple of reasons: 1.
+ * so that we aren't eating up a PGPROC slot while waiting on the
+ * client. 2. so that if InitProcess() fails due to being out of
+ * PGPROC slots, we have already initialized libpq and are able to
+ * report the error to the client.
+ */
+ BackendInitialize(&port);
+
+ /* Restore basic shared memory pointers */
+ InitShmemAccess(UsedShmemSegAddr);
+
+ /* Need a PGPROC to run CreateSharedMemoryAndSemaphores */
+ InitProcess();
+
+ /*
+ * Attach process to shared data structures. If testing EXEC_BACKEND
+ * on Linux, you must run this as root before starting the postmaster:
+ *
+ * echo 0 >/proc/sys/kernel/randomize_va_space
+ *
+ * This prevents a randomized stack base address that causes child
+ * shared memory to be at a different address than the parent, making
+ * it impossible to attached to shared memory. Return the value to
+ * '1' when finished.
+ */
+ CreateSharedMemoryAndSemaphores(false, 0);
+
+ /* And run the backend */
+ proc_exit(BackendRun(&port));
+ }
+ if (strcmp(argv[1], "--forkboot") == 0)
+ {
+ /* Close the postmaster's sockets */
+ ClosePostmasterPorts(false);
+
+ /* Restore basic shared memory pointers */
+ InitShmemAccess(UsedShmemSegAddr);
+
+ /* Need a PGPROC to run CreateSharedMemoryAndSemaphores */
+ InitAuxiliaryProcess();
+
+ /* Attach process to shared data structures */
+ CreateSharedMemoryAndSemaphores(false, 0);
+
+ AuxiliaryProcessMain(argc - 2, argv + 2);
+ proc_exit(0);
+ }
+ if (strcmp(argv[1], "--forkavlauncher") == 0)
+ {
+ /* Close the postmaster's sockets */
+ ClosePostmasterPorts(false);
+
+ /* Restore basic shared memory pointers */
+ InitShmemAccess(UsedShmemSegAddr);
+
+ /* Need a PGPROC to run CreateSharedMemoryAndSemaphores */
+ InitProcess();
+
+ /* Attach process to shared data structures */
+ CreateSharedMemoryAndSemaphores(false, 0);
+
+ AutoVacLauncherMain(argc - 2, argv + 2);
+ proc_exit(0);
+ }
+ if (strcmp(argv[1], "--forkavworker") == 0)
+ {
+ /* Close the postmaster's sockets */
+ ClosePostmasterPorts(false);
+
+ /* Restore basic shared memory pointers */
+ InitShmemAccess(UsedShmemSegAddr);
+
+ /* Need a PGPROC to run CreateSharedMemoryAndSemaphores */
+ InitProcess();
+
+ /* Attach process to shared data structures */
+ CreateSharedMemoryAndSemaphores(false, 0);
+
+ AutoVacWorkerMain(argc - 2, argv + 2);
+ proc_exit(0);
+ }
+ if (strcmp(argv[1], "--forkarch") == 0)
+ {
+ /* Close the postmaster's sockets */
+ ClosePostmasterPorts(false);
+
+ /* Do not want to attach to shared memory */
+
+ PgArchiverMain(argc, argv);
+ proc_exit(0);
+ }
+ if (strcmp(argv[1], "--forkcol") == 0)
+ {
+ /* Close the postmaster's sockets */
+ ClosePostmasterPorts(false);
+
+ /* Do not want to attach to shared memory */
+
+ PgstatCollectorMain(argc, argv);
+ proc_exit(0);
+ }
+ if (strcmp(argv[1], "--forklog") == 0)
+ {
+ /* Close the postmaster's sockets */
+ ClosePostmasterPorts(true);
+
+ /* Do not want to attach to shared memory */
+
+ SysLoggerMain(argc, argv);
+ proc_exit(0);
+ }
+
+ return 1; /* shouldn't get here */
+ }
+ #endif /* EXEC_BACKEND */
+
+
+ /*
+ * ExitPostmaster -- cleanup
+ *
+ * Do NOT call exit() directly --- always go through here!
+ */
+ static void
+ ExitPostmaster(int status)
+ {
+ /* should cleanup shared memory and kill all backends */
+
+ /*
+ * Not sure of the semantics here. When the Postmaster dies, should the
+ * backends all be killed? probably not.
+ *
+ * MUST -- vadim 05-10-1999
+ */
+
+ proc_exit(status);
+ }
+
+ /*
+ * sigusr1_handler - handle signal conditions from child processes
+ */
+ static void
+ sigusr1_handler(SIGNAL_ARGS)
+ {
+ int save_errno = errno;
+
+ PG_SETMASK(&BlockSig);
+
+ /*
+ * RECOVERY_STARTED and BEGIN_HOT_STANDBY signals are ignored in
+ * unexpected states. If the startup process quickly starts up, completes
+ * recovery, exits, we might process the death of the startup process
+ * first. We don't want to go back to recovery in that case.
+ */
+ if (CheckPostmasterSignal(PMSIGNAL_RECOVERY_STARTED) &&
+ pmState == PM_STARTUP)
+ {
+ /* WAL redo has started. We're out of reinitialization. */
+ FatalError = false;
+
+ /*
+ * Crank up the background writer. It doesn't matter if this fails,
+ * we'll just try again later.
+ */
+ Assert(BgWriterPID == 0);
+ BgWriterPID = StartBackgroundWriter();
+
+ pmState = PM_RECOVERY;
+ }
+ if (CheckPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY) &&
+ pmState == PM_RECOVERY)
+ {
+ /*
+ * Likewise, start other special children as needed.
+ */
+ Assert(PgStatPID == 0);
+ PgStatPID = pgstat_start();
+
+ ereport(LOG,
+ (errmsg("database system is ready to accept read only connections")));
+
+ pmState = PM_HOT_STANDBY;
+ }
+
+ if (CheckPostmasterSignal(PMSIGNAL_WAKEN_ARCHIVER) &&
+ PgArchPID != 0)
+ {
+ /*
+ * Send SIGUSR1 to archiver process, to wake it up and begin archiving
+ * next transaction log file.
+ */
+ signal_child(PgArchPID, SIGUSR1);
+ }
+
+ if (CheckPostmasterSignal(PMSIGNAL_ROTATE_LOGFILE) &&
+ SysLoggerPID != 0)
+ {
+ /* Tell syslogger to rotate logfile */
+ signal_child(SysLoggerPID, SIGUSR1);
+ }
+
+ if (CheckPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER))
+ {
+ /*
+ * Start one iteration of the autovacuum daemon, even if autovacuuming
+ * is nominally not enabled. This is so we can have an active defense
+ * against transaction ID wraparound. We set a flag for the main loop
+ * to do it rather than trying to do it here --- this is because the
+ * autovac process itself may send the signal, and we want to handle
+ * that by launching another iteration as soon as the current one
+ * completes.
+ */
+ start_autovac_launcher = true;
+ }
+
+ if (CheckPostmasterSignal(PMSIGNAL_START_AUTOVAC_WORKER))
+ {
+ /* The autovacuum launcher wants us to start a worker process. */
+ StartAutovacuumWorker();
+ }
+
+ if (CheckPostmasterSignal(PMSIGNAL_START_WALRECEIVER) &&
+ WalReceiverPID == 0 &&
+ (pmState == PM_STARTUP || pmState == PM_RECOVERY ||
+ pmState == PM_HOT_STANDBY || pmState == PM_WAIT_READONLY))
+ {
+ /* Startup Process wants us to start the walreceiver process. */
+ WalReceiverPID = StartWalReceiver();
+ }
+
+ if (CheckPostmasterSignal(PMSIGNAL_ADVANCE_STATE_MACHINE) &&
+ (pmState == PM_WAIT_BACKUP || pmState == PM_WAIT_BACKENDS))
+ {
+ /* Advance postmaster's state machine */
+ PostmasterStateMachine();
+ }
+
+ if (CheckPromoteSignal() && StartupPID != 0 &&
+ (pmState == PM_STARTUP || pmState == PM_RECOVERY ||
+ pmState == PM_HOT_STANDBY || pmState == PM_WAIT_READONLY))
+ {
+ /* Tell startup process to finish recovery */
+ signal_child(StartupPID, SIGUSR2);
+ }
+
+ PG_SETMASK(&UnBlockSig);
+
+ errno = save_errno;
+ }
+
+ /*
+ * Timeout or shutdown signal from postmaster while processing startup packet.
+ * Cleanup and exit(1).
+ *
+ * XXX: possible future improvement: try to send a message indicating
+ * why we are disconnecting. Problem is to be sure we don't block while
+ * doing so, nor mess up SSL initialization. In practice, if the client
+ * has wedged here, it probably couldn't do anything with the message anyway.
+ */
+ static void
+ startup_die(SIGNAL_ARGS)
+ {
+ proc_exit(1);
+ }
+
+ /*
+ * Dummy signal handler
+ *
+ * We use this for signals that we don't actually use in the postmaster,
+ * but we do use in backends. If we were to SIG_IGN such signals in the
+ * postmaster, then a newly started backend might drop a signal that arrives
+ * before it's able to reconfigure its signal processing. (See notes in
+ * tcop/postgres.c.)
+ */
+ static void
+ dummy_handler(SIGNAL_ARGS)
+ {
+ }
+
+ /*
+ * RandomSalt
+ */
+ static void
+ RandomSalt(char *md5Salt)
+ {
+ long rand;
+
+ /*
+ * We use % 255, sacrificing one possible byte value, so as to ensure that
+ * all bits of the random() value participate in the result. While at it,
+ * add one to avoid generating any null bytes.
+ */
+ rand = PostmasterRandom();
+ md5Salt[0] = (rand % 255) + 1;
+ rand = PostmasterRandom();
+ md5Salt[1] = (rand % 255) + 1;
+ rand = PostmasterRandom();
+ md5Salt[2] = (rand % 255) + 1;
+ rand = PostmasterRandom();
+ md5Salt[3] = (rand % 255) + 1;
+ }
+
+ /*
+ * PostmasterRandom
+ */
+ static long
+ PostmasterRandom(void)
+ {
+ /*
+ * Select a random seed at the time of first receiving a request.
+ */
+ if (random_seed == 0)
+ {
+ do
+ {
+ struct timeval random_stop_time;
+
+ gettimeofday(&random_stop_time, NULL);
+
+ /*
+ * We are not sure how much precision is in tv_usec, so we swap
+ * the high and low 16 bits of 'random_stop_time' and XOR them
+ * with 'random_start_time'. On the off chance that the result is
+ * 0, we loop until it isn't.
+ */
+ random_seed = random_start_time.tv_usec ^
+ ((random_stop_time.tv_usec << 16) |
+ ((random_stop_time.tv_usec >> 16) & 0xffff));
+ }
+ while (random_seed == 0);
+
+ srandom(random_seed);
+ }
+
+ return random();
+ }
+
+ /*
+ * Count up number of child processes of specified types (dead_end chidren
+ * are always excluded).
+ */
+ static int
+ CountChildren(int target)
+ {
+ Dlelem *curr;
+ int cnt = 0;
+
+ for (curr = DLGetHead(BackendList); curr; curr = DLGetSucc(curr))
+ {
+ Backend *bp = (Backend *) DLE_VAL(curr);
+
+ if (bp->dead_end)
+ continue;
+
+ /*
+ * Since target == BACKEND_TYPE_ALL is the most common case, we test
+ * it first and avoid touching shared memory for every child.
+ */
+ if (target != BACKEND_TYPE_ALL)
+ {
+ int child;
+
+ if (bp->is_autovacuum)
+ child = BACKEND_TYPE_AUTOVAC;
+ else if (IsPostmasterChildWalSender(bp->child_slot))
+ child = BACKEND_TYPE_WALSND;
+ else
+ child = BACKEND_TYPE_NORMAL;
+ if (!(target & child))
+ continue;
+ }
+
+ cnt++;
+ }
+ return cnt;
+ }
+
+
+ /*
+ * StartChildProcess -- start an auxiliary process for the postmaster
+ *
+ * xlop determines what kind of child will be started. All child types
+ * initially go to AuxiliaryProcessMain, which will handle common setup.
+ *
+ * Return value of StartChildProcess is subprocess' PID, or 0 if failed
+ * to start subprocess.
+ */
+ static pid_t
+ StartChildProcess(AuxProcType type)
+ {
+ pid_t pid;
+ char *av[10];
+ int ac = 0;
+ char typebuf[32];
+
+ /*
+ * Set up command-line arguments for subprocess
+ */
+ av[ac++] = "postgres";
+
+ #ifdef EXEC_BACKEND
+ av[ac++] = "--forkboot";
+ av[ac++] = NULL; /* filled in by postmaster_forkexec */
+ #endif
+
+ snprintf(typebuf, sizeof(typebuf), "-x%d", type);
+ av[ac++] = typebuf;
+
+ av[ac] = NULL;
+ Assert(ac < lengthof(av));
+
+ #ifdef EXEC_BACKEND
+ pid = postmaster_forkexec(ac, av);
+ #else /* !EXEC_BACKEND */
+ pid = fork_process();
+
+ if (pid == 0) /* child */
+ {
+ IsUnderPostmaster = true; /* we are a postmaster subprocess now */
+
+ /* Close the postmaster's sockets */
+ ClosePostmasterPorts(false);
+
+ /* Lose the postmaster's on-exit routines and port connections */
+ on_exit_reset();
+
+ /* Release postmaster's working memory context */
+ MemoryContextSwitchTo(TopMemoryContext);
+ MemoryContextDelete(PostmasterContext);
+ PostmasterContext = NULL;
+
+ AuxiliaryProcessMain(ac, av);
+ ExitPostmaster(0);
+ }
+ #endif /* EXEC_BACKEND */
+
+ if (pid < 0)
+ {
+ /* in parent, fork failed */
+ int save_errno = errno;
+
+ errno = save_errno;
+ switch (type)
+ {
+ case StartupProcess:
+ ereport(LOG,
+ (errmsg("could not fork startup process: %m")));
+ break;
+ case BgWriterProcess:
+ ereport(LOG,
+ (errmsg("could not fork background writer process: %m")));
+ break;
+ case WalWriterProcess:
+ ereport(LOG,
+ (errmsg("could not fork WAL writer process: %m")));
+ break;
+ case WalReceiverProcess:
+ ereport(LOG,
+ (errmsg("could not fork WAL receiver process: %m")));
+ break;
+ default:
+ ereport(LOG,
+ (errmsg("could not fork process: %m")));
+ break;
+ }
+
+ /*
+ * fork failure is fatal during startup, but there's no need to choke
+ * immediately if starting other child types fails.
+ */
+ if (type == StartupProcess)
+ ExitPostmaster(1);
+ return 0;
+ }
+
+ /*
+ * in parent, successful fork
+ */
+ return pid;
+ }
+
+ /*
+ * StartAutovacuumWorker
+ * Start an autovac worker process.
+ *
+ * This function is here because it enters the resulting PID into the
+ * postmaster's private backends list.
+ *
+ * NB -- this code very roughly matches BackendStartup.
+ */
+ static void
+ StartAutovacuumWorker(void)
+ {
+ Backend *bn;
+
+ /*
+ * If not in condition to run a process, don't try, but handle it like a
+ * fork failure. This does not normally happen, since the signal is only
+ * supposed to be sent by autovacuum launcher when it's OK to do it, but
+ * we have to check to avoid race-condition problems during DB state
+ * changes.
+ */
+ if (canAcceptConnections() == CAC_OK)
+ {
+ bn = (Backend *) malloc(sizeof(Backend));
+ if (bn)
+ {
+ /*
+ * Compute the cancel key that will be assigned to this session.
+ * We probably don't need cancel keys for autovac workers, but
+ * we'd better have something random in the field to prevent
+ * unfriendly people from sending cancels to them.
+ */
+ MyCancelKey = PostmasterRandom();
+ bn->cancel_key = MyCancelKey;
+
+ /* Autovac workers are not dead_end and need a child slot */
+ bn->dead_end = false;
+ bn->child_slot = MyPMChildSlot = AssignPostmasterChildSlot();
+
+ bn->pid = StartAutoVacWorker();
+ if (bn->pid > 0)
+ {
+ bn->is_autovacuum = true;
+ DLInitElem(&bn->elem, bn);
+ DLAddHead(BackendList, &bn->elem);
+ #ifdef EXEC_BACKEND
+ ShmemBackendArrayAdd(bn);
+ #endif
+ /* all OK */
+ return;
+ }
+
+ /*
+ * fork failed, fall through to report -- actual error message was
+ * logged by StartAutoVacWorker
+ */
+ (void) ReleasePostmasterChildSlot(bn->child_slot);
+ free(bn);
+ }
+ else
+ ereport(LOG,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+ }
+
+ /*
+ * Report the failure to the launcher, if it's running. (If it's not, we
+ * might not even be connected to shared memory, so don't try to call
+ * AutoVacWorkerFailed.) Note that we also need to signal it so that it
+ * responds to the condition, but we don't do that here, instead waiting
+ * for ServerLoop to do it. This way we avoid a ping-pong signalling in
+ * quick succession between the autovac launcher and postmaster in case
+ * things get ugly.
+ */
+ if (AutoVacPID != 0)
+ {
+ AutoVacWorkerFailed();
+ avlauncher_needs_signal = true;
+ }
+ }
+
+ /*
+ * Create the opts file
+ */
+ static bool
+ CreateOptsFile(int argc, char *argv[], char *fullprogname)
+ {
+ FILE *fp;
+ int i;
+
+ #define OPTS_FILE "postmaster.opts"
+
+ if ((fp = fopen(OPTS_FILE, "w")) == NULL)
+ {
+ elog(LOG, "could not create file \"%s\": %m", OPTS_FILE);
+ return false;
+ }
+
+ fprintf(fp, "%s", fullprogname);
+ for (i = 1; i < argc; i++)
+ fprintf(fp, " \"%s\"", argv[i]);
+ fputs("\n", fp);
+
+ if (fclose(fp))
+ {
+ elog(LOG, "could not write file \"%s\": %m", OPTS_FILE);
+ return false;
+ }
+
+ return true;
+ }
+
+
+ /*
+ * MaxLivePostmasterChildren
+ *
+ * This reports the number of entries needed in per-child-process arrays
+ * (the PMChildFlags array, and if EXEC_BACKEND the ShmemBackendArray).
+ * These arrays include regular backends, autovac workers and walsenders,
+ * but not special children nor dead_end children. This allows the arrays
+ * to have a fixed maximum size, to wit the same too-many-children limit
+ * enforced by canAcceptConnections(). The exact value isn't too critical
+ * as long as it's more than MaxBackends.
+ */
+ int
+ MaxLivePostmasterChildren(void)
+ {
+ return 2 * MaxBackends;
+ }
+
+
+ #ifdef EXEC_BACKEND
+
+ /*
+ * The following need to be available to the save/restore_backend_variables
+ * functions
+ */
+ extern slock_t *ShmemLock;
+ extern LWLock *LWLockArray;
+ extern slock_t *ProcStructLock;
+ extern PGPROC *AuxiliaryProcs;
+ extern PMSignalData *PMSignalState;
+ extern pgsocket pgStatSock;
+
+ #ifndef WIN32
+ #define write_inheritable_socket(dest, src, childpid) ((*(dest) = (src)), true)
+ #define read_inheritable_socket(dest, src) (*(dest) = *(src))
+ #else
+ static bool write_duplicated_handle(HANDLE *dest, HANDLE src, HANDLE child);
+ static bool write_inheritable_socket(InheritableSocket *dest, SOCKET src,
+ pid_t childPid);
+ static void read_inheritable_socket(SOCKET *dest, InheritableSocket *src);
+ #endif
+
+
+ /* Save critical backend variables into the BackendParameters struct */
+ #ifndef WIN32
+ static bool
+ save_backend_variables(BackendParameters *param, Port *port)
+ #else
+ static bool
+ save_backend_variables(BackendParameters *param, Port *port,
+ HANDLE childProcess, pid_t childPid)
+ #endif
+ {
+ memcpy(¶m->port, port, sizeof(Port));
+ if (!write_inheritable_socket(¶m->portsocket, port->sock, childPid))
+ return false;
+
+ strlcpy(param->DataDir, DataDir, MAXPGPATH);
+
+ memcpy(¶m->ListenSocket, &ListenSocket, sizeof(ListenSocket));
+
+ param->MyCancelKey = MyCancelKey;
+ param->MyPMChildSlot = MyPMChildSlot;
+
+ param->UsedShmemSegID = UsedShmemSegID;
+ param->UsedShmemSegAddr = UsedShmemSegAddr;
+
+ param->ShmemLock = ShmemLock;
+ param->ShmemVariableCache = ShmemVariableCache;
+ param->ShmemBackendArray = ShmemBackendArray;
+
+ param->LWLockArray = LWLockArray;
+ param->ProcStructLock = ProcStructLock;
+ param->ProcGlobal = ProcGlobal;
+ param->AuxiliaryProcs = AuxiliaryProcs;
+ param->PMSignalState = PMSignalState;
+ if (!write_inheritable_socket(¶m->pgStatSock, pgStatSock, childPid))
+ return false;
+
+ param->PostmasterPid = PostmasterPid;
+ param->PgStartTime = PgStartTime;
+ param->PgReloadTime = PgReloadTime;
+
+ param->redirection_done = redirection_done;
+
+ #ifdef WIN32
+ param->PostmasterHandle = PostmasterHandle;
+ if (!write_duplicated_handle(¶m->initial_signal_pipe,
+ pgwin32_create_signal_listener(childPid),
+ childProcess))
+ return false;
+ #else
+ memcpy(¶m->postmaster_alive_fds, &postmaster_alive_fds,
+ sizeof(postmaster_alive_fds));
+ #endif
+
+ memcpy(¶m->syslogPipe, &syslogPipe, sizeof(syslogPipe));
+
+ strlcpy(param->my_exec_path, my_exec_path, MAXPGPATH);
+
+ strlcpy(param->pkglib_path, pkglib_path, MAXPGPATH);
+
+ strlcpy(param->ExtraOptions, ExtraOptions, MAXPGPATH);
+
+ return true;
+ }
+
+
+ #ifdef WIN32
+ /*
+ * Duplicate a handle for usage in a child process, and write the child
+ * process instance of the handle to the parameter file.
+ */
+ static bool
+ write_duplicated_handle(HANDLE *dest, HANDLE src, HANDLE childProcess)
+ {
+ HANDLE hChild = INVALID_HANDLE_VALUE;
+
+ if (!DuplicateHandle(GetCurrentProcess(),
+ src,
+ childProcess,
+ &hChild,
+ 0,
+ TRUE,
+ DUPLICATE_CLOSE_SOURCE | DUPLICATE_SAME_ACCESS))
+ {
+ ereport(LOG,
+ (errmsg_internal("could not duplicate handle to be written to backend parameter file: error code %lu",
+ GetLastError())));
+ return false;
+ }
+
+ *dest = hChild;
+ return true;
+ }
+
+ /*
+ * Duplicate a socket for usage in a child process, and write the resulting
+ * structure to the parameter file.
+ * This is required because a number of LSPs (Layered Service Providers) very
+ * common on Windows (antivirus, firewalls, download managers etc) break
+ * straight socket inheritance.
+ */
+ static bool
+ write_inheritable_socket(InheritableSocket *dest, SOCKET src, pid_t childpid)
+ {
+ dest->origsocket = src;
+ if (src != 0 && src != PGINVALID_SOCKET)
+ {
+ /* Actual socket */
+ if (WSADuplicateSocket(src, childpid, &dest->wsainfo) != 0)
+ {
+ ereport(LOG,
+ (errmsg("could not duplicate socket %d for use in backend: error code %d",
+ (int) src, WSAGetLastError())));
+ return false;
+ }
+ }
+ return true;
+ }
+
+ /*
+ * Read a duplicate socket structure back, and get the socket descriptor.
+ */
+ static void
+ read_inheritable_socket(SOCKET *dest, InheritableSocket *src)
+ {
+ SOCKET s;
+
+ if (src->origsocket == PGINVALID_SOCKET || src->origsocket == 0)
+ {
+ /* Not a real socket! */
+ *dest = src->origsocket;
+ }
+ else
+ {
+ /* Actual socket, so create from structure */
+ s = WSASocket(FROM_PROTOCOL_INFO,
+ FROM_PROTOCOL_INFO,
+ FROM_PROTOCOL_INFO,
+ &src->wsainfo,
+ 0,
+ 0);
+ if (s == INVALID_SOCKET)
+ {
+ write_stderr("could not create inherited socket: error code %d\n",
+ WSAGetLastError());
+ exit(1);
+ }
+ *dest = s;
+
+ /*
+ * To make sure we don't get two references to the same socket, close
+ * the original one. (This would happen when inheritance actually
+ * works..
+ */
+ closesocket(src->origsocket);
+ }
+ }
+ #endif
+
+ static void
+ read_backend_variables(char *id, Port *port)
+ {
+ BackendParameters param;
+
+ #ifndef WIN32
+ /* Non-win32 implementation reads from file */
+ FILE *fp;
+
+ /* Open file */
+ fp = AllocateFile(id, PG_BINARY_R);
+ if (!fp)
+ {
+ write_stderr("could not read from backend variables file \"%s\": %s\n",
+ id, strerror(errno));
+ exit(1);
+ }
+
+ if (fread(¶m, sizeof(param), 1, fp) != 1)
+ {
+ write_stderr("could not read from backend variables file \"%s\": %s\n",
+ id, strerror(errno));
+ exit(1);
+ }
+
+ /* Release file */
+ FreeFile(fp);
+ if (unlink(id) != 0)
+ {
+ write_stderr("could not remove file \"%s\": %s\n",
+ id, strerror(errno));
+ exit(1);
+ }
+ #else
+ /* Win32 version uses mapped file */
+ HANDLE paramHandle;
+ BackendParameters *paramp;
+
+ #ifdef _WIN64
+ paramHandle = (HANDLE) _atoi64(id);
+ #else
+ paramHandle = (HANDLE) atol(id);
+ #endif
+ paramp = MapViewOfFile(paramHandle, FILE_MAP_READ, 0, 0, 0);
+ if (!paramp)
+ {
+ write_stderr("could not map view of backend variables: error code %lu\n",
+ GetLastError());
+ exit(1);
+ }
+
+ memcpy(¶m, paramp, sizeof(BackendParameters));
+
+ if (!UnmapViewOfFile(paramp))
+ {
+ write_stderr("could not unmap view of backend variables: error code %lu\n",
+ GetLastError());
+ exit(1);
+ }
+
+ if (!CloseHandle(paramHandle))
+ {
+ write_stderr("could not close handle to backend parameter variables: error code %lu\n",
+ GetLastError());
+ exit(1);
+ }
+ #endif
+
+ restore_backend_variables(¶m, port);
+ }
+
+ /* Restore critical backend variables from the BackendParameters struct */
+ static void
+ restore_backend_variables(BackendParameters *param, Port *port)
+ {
+ memcpy(port, ¶m->port, sizeof(Port));
+ read_inheritable_socket(&port->sock, ¶m->portsocket);
+
+ SetDataDir(param->DataDir);
+
+ memcpy(&ListenSocket, ¶m->ListenSocket, sizeof(ListenSocket));
+
+ MyCancelKey = param->MyCancelKey;
+ MyPMChildSlot = param->MyPMChildSlot;
+
+ UsedShmemSegID = param->UsedShmemSegID;
+ UsedShmemSegAddr = param->UsedShmemSegAddr;
+
+ ShmemLock = param->ShmemLock;
+ ShmemVariableCache = param->ShmemVariableCache;
+ ShmemBackendArray = param->ShmemBackendArray;
+
+ LWLockArray = param->LWLockArray;
+ ProcStructLock = param->ProcStructLock;
+ ProcGlobal = param->ProcGlobal;
+ AuxiliaryProcs = param->AuxiliaryProcs;
+ PMSignalState = param->PMSignalState;
+ read_inheritable_socket(&pgStatSock, ¶m->pgStatSock);
+
+ PostmasterPid = param->PostmasterPid;
+ PgStartTime = param->PgStartTime;
+ PgReloadTime = param->PgReloadTime;
+
+ redirection_done = param->redirection_done;
+
+ #ifdef WIN32
+ PostmasterHandle = param->PostmasterHandle;
+ pgwin32_initial_signal_pipe = param->initial_signal_pipe;
+ #else
+ memcpy(&postmaster_alive_fds, ¶m->postmaster_alive_fds,
+ sizeof(postmaster_alive_fds));
+ #endif
+
+ memcpy(&syslogPipe, ¶m->syslogPipe, sizeof(syslogPipe));
+
+ strlcpy(my_exec_path, param->my_exec_path, MAXPGPATH);
+
+ strlcpy(pkglib_path, param->pkglib_path, MAXPGPATH);
+
+ strlcpy(ExtraOptions, param->ExtraOptions, MAXPGPATH);
+ }
+
+
+ Size
+ ShmemBackendArraySize(void)
+ {
+ return mul_size(MaxLivePostmasterChildren(), sizeof(Backend));
+ }
+
+ void
+ ShmemBackendArrayAllocation(void)
+ {
+ Size size = ShmemBackendArraySize();
+
+ ShmemBackendArray = (Backend *) ShmemAlloc(size);
+ /* Mark all slots as empty */
+ memset(ShmemBackendArray, 0, size);
+ }
+
+ static void
+ ShmemBackendArrayAdd(Backend *bn)
+ {
+ /* The array slot corresponding to my PMChildSlot should be free */
+ int i = bn->child_slot - 1;
+
+ Assert(ShmemBackendArray[i].pid == 0);
+ ShmemBackendArray[i] = *bn;
+ }
+
+ static void
+ ShmemBackendArrayRemove(Backend *bn)
+ {
+ int i = bn->child_slot - 1;
+
+ Assert(ShmemBackendArray[i].pid == bn->pid);
+ /* Mark the slot as empty */
+ ShmemBackendArray[i].pid = 0;
+ }
+ #endif /* EXEC_BACKEND */
+
+
+ #ifdef WIN32
+
+ static pid_t
+ win32_waitpid(int *exitstatus)
+ {
+ DWORD dwd;
+ ULONG_PTR key;
+ OVERLAPPED *ovl;
+
+ /*
+ * Check if there are any dead children. If there are, return the pid of
+ * the first one that died.
+ */
+ if (GetQueuedCompletionStatus(win32ChildQueue, &dwd, &key, &ovl, 0))
+ {
+ *exitstatus = (int) key;
+ return dwd;
+ }
+
+ return -1;
+ }
+
+ /*
+ * Note! Code below executes on a thread pool! All operations must
+ * be thread safe! Note that elog() and friends must *not* be used.
+ */
+ static void WINAPI
+ pgwin32_deadchild_callback(PVOID lpParameter, BOOLEAN TimerOrWaitFired)
+ {
+ win32_deadchild_waitinfo *childinfo = (win32_deadchild_waitinfo *) lpParameter;
+ DWORD exitcode;
+
+ if (TimerOrWaitFired)
+ return; /* timeout. Should never happen, since we use
+ * INFINITE as timeout value. */
+
+ /*
+ * Remove handle from wait - required even though it's set to wait only
+ * once
+ */
+ UnregisterWaitEx(childinfo->waitHandle, NULL);
+
+ if (!GetExitCodeProcess(childinfo->procHandle, &exitcode))
+ {
+ /*
+ * Should never happen. Inform user and set a fixed exitcode.
+ */
+ write_stderr("could not read exit code for process\n");
+ exitcode = 255;
+ }
+
+ if (!PostQueuedCompletionStatus(win32ChildQueue, childinfo->procId, (ULONG_PTR) exitcode, NULL))
+ write_stderr("could not post child completion status\n");
+
+ /*
+ * Handle is per-process, so we close it here instead of in the
+ * originating thread
+ */
+ CloseHandle(childinfo->procHandle);
+
+ /*
+ * Free struct that was allocated before the call to
+ * RegisterWaitForSingleObject()
+ */
+ free(childinfo);
+
+ /* Queue SIGCHLD signal */
+ pg_queue_signal(SIGCHLD);
+ }
+
+ #endif /* WIN32 */
+
+ /*
+ * Initialize one and only handle for monitoring postmaster death.
+ *
+ * Called once in the postmaster, so that child processes can subsequently
+ * monitor if their parent is dead.
+ */
+ static void
+ InitPostmasterDeathWatchHandle(void)
+ {
+ #ifndef WIN32
+ /*
+ * Create a pipe. Postmaster holds the write end of the pipe open
+ * (POSTMASTER_FD_OWN), and children hold the read end. Children can
+ * pass the read file descriptor to select() to wake up in case postmaster
+ * dies, or check for postmaster death with a (read() == 0). Children must
+ * close the write end as soon as possible after forking, because EOF
+ * won't be signaled in the read end until all processes have closed the
+ * write fd. That is taken care of in ClosePostmasterPorts().
+ */
+ Assert(MyProcPid == PostmasterPid);
+ if (pipe(postmaster_alive_fds))
+ ereport(FATAL,
+ (errcode_for_file_access(),
+ errmsg_internal("could not create pipe to monitor postmaster death: %m")));
+
+ /*
+ * Set O_NONBLOCK to allow testing for the fd's presence with a read()
+ * call.
+ */
+ if (fcntl(postmaster_alive_fds[POSTMASTER_FD_WATCH], F_SETFL, O_NONBLOCK))
+ ereport(FATAL,
+ (errcode_for_socket_access(),
+ errmsg_internal("could not set postmaster death monitoring pipe to non-blocking mode: %m")));
+
+ #else
+ /*
+ * On Windows, we use a process handle for the same purpose.
+ */
+ if (DuplicateHandle(GetCurrentProcess(),
+ GetCurrentProcess(),
+ GetCurrentProcess(),
+ &PostmasterHandle,
+ 0,
+ TRUE,
+ DUPLICATE_SAME_ACCESS) == 0)
+ ereport(FATAL,
+ (errmsg_internal("could not duplicate postmaster handle: error code %lu",
+ GetLastError())));
+ #endif /* WIN32 */
+ }
diff -rcN postgresql/src/bin/pg_controldata/pg_controldata.c postgresql_with_patch/src/bin/pg_controldata/pg_controldata.c
*** postgresql/src/bin/pg_controldata/pg_controldata.c 2011-09-12 05:19:14.000000000 +0900
--- postgresql_with_patch/src/bin/pg_controldata/pg_controldata.c 2011-09-12 05:24:42.000000000 +0900
***************
*** 234,239 ****
--- 234,242 ----
ControlFile.backupStartPoint.xrecoff);
printf(_("End-of-backup record required: %s\n"),
ControlFile.backupEndRequired ? _("yes") : _("no"));
+ printf(_("Backup end location: %X/%X\n"),
+ ControlFile.backupEndPoint.xlogid,
+ ControlFile.backupEndPoint.xrecoff);
printf(_("Current wal_level setting: %s\n"),
wal_level_str(ControlFile.wal_level));
printf(_("Current max_connections setting: %d\n"),
diff -rcN postgresql/src/bin/pg_ctl/pg_ctl.c postgresql_with_patch/src/bin/pg_ctl/pg_ctl.c
*** postgresql/src/bin/pg_ctl/pg_ctl.c 2011-09-12 05:19:14.000000000 +0900
--- postgresql_with_patch/src/bin/pg_ctl/pg_ctl.c 2011-09-12 05:24:42.000000000 +0900
***************
*** 882,894 ****
{
/*
* If backup_label exists, an online backup is running. Warn the user
! * that smart shutdown will wait for it to finish. However, if
! * recovery.conf is also present, we're recovering from an online
! * backup instead of performing one.
*/
if (shutdown_mode == SMART_MODE &&
! stat(backup_file, &statbuf) == 0 &&
! stat(recovery_file, &statbuf) != 0)
{
print_msg(_("WARNING: online backup mode is active\n"
"Shutdown will not complete until pg_stop_backup() is called.\n\n"));
--- 882,891 ----
{
/*
* If backup_label exists, an online backup is running. Warn the user
! * that smart shutdown will wait for it to finish.
*/
if (shutdown_mode == SMART_MODE &&
! stat(backup_file, &statbuf) == 0)
{
print_msg(_("WARNING: online backup mode is active\n"
"Shutdown will not complete until pg_stop_backup() is called.\n\n"));
diff -rcN postgresql/src/bin/pg_resetxlog/pg_resetxlog.c postgresql_with_patch/src/bin/pg_resetxlog/pg_resetxlog.c
*** postgresql/src/bin/pg_resetxlog/pg_resetxlog.c 2011-09-12 05:19:14.000000000 +0900
--- postgresql_with_patch/src/bin/pg_resetxlog/pg_resetxlog.c 2011-09-12 05:24:42.000000000 +0900
***************
*** 503,509 ****
ControlFile.time = (pg_time_t) time(NULL);
ControlFile.checkPoint = ControlFile.checkPointCopy.redo;
! /* minRecoveryPoint and backupStartPoint can be left zero */
ControlFile.wal_level = WAL_LEVEL_MINIMAL;
ControlFile.MaxConnections = 100;
--- 503,509 ----
ControlFile.time = (pg_time_t) time(NULL);
ControlFile.checkPoint = ControlFile.checkPointCopy.redo;
! /* minRecoveryPoint, backupStartPoint and backupEndPoint can be left zero */
ControlFile.wal_level = WAL_LEVEL_MINIMAL;
ControlFile.MaxConnections = 100;
***************
*** 638,643 ****
--- 638,645 ----
ControlFile.backupStartPoint.xlogid = 0;
ControlFile.backupStartPoint.xrecoff = 0;
ControlFile.backupEndRequired = false;
+ ControlFile.backupEndPoint.xlogid = 0;
+ ControlFile.backupEndPoint.xrecoff = 0;
/*
* Force the defaults for max_* settings. The values don't really matter
diff -rcN postgresql/src/include/access/xlog.h postgresql_with_patch/src/include/access/xlog.h
*** postgresql/src/include/access/xlog.h 2011-09-12 05:19:14.000000000 +0900
--- postgresql_with_patch/src/include/access/xlog.h 2011-09-12 05:24:42.000000000 +0900
***************
*** 328,331 ****
--- 328,335 ----
#define BACKUP_LABEL_FILE "backup_label"
#define BACKUP_LABEL_OLD "backup_label.old"
+ /* These are written in backup label file */
+ #define BACKUP_FROM_MASTER "master"
+ #define BACKUP_FROM_SLAVE "slave"
+
#endif /* XLOG_H */
diff -rcN postgresql/src/include/catalog/pg_control.h postgresql_with_patch/src/include/catalog/pg_control.h
*** postgresql/src/include/catalog/pg_control.h 2011-09-12 05:19:14.000000000 +0900
--- postgresql_with_patch/src/include/catalog/pg_control.h 2011-09-12 05:24:42.000000000 +0900
***************
*** 143,152 ****
--- 143,158 ----
* start up. If it's false, but backupStartPoint is set, a backup_label
* file was found at startup but it may have been a leftover from a stray
* pg_start_backup() call, not accompanied by pg_stop_backup().
+ *
+ * backupEndPoint is set a same value as minRecoveryPoint at the first
+ * recovery if the backup is taken from hot standby. It is unset if we
+ * reach the end of backup. It is not set if the backup is taken from
+ * normal running.
*/
XLogRecPtr minRecoveryPoint;
XLogRecPtr backupStartPoint;
bool backupEndRequired;
+ XLogRecPtr backupEndPoint;
/*
* Parameter settings that determine if the WAL can be used for archival