Re: Idea for improving buildfarm robustness
| От | Tom Lane |
|---|---|
| Тема | Re: Idea for improving buildfarm robustness |
| Дата | |
| Msg-id | 9063.1443738966@sss.pgh.pa.us обсуждение исходный текст |
| Ответ на | Re: Idea for improving buildfarm robustness (Tom Lane <tgl@sss.pgh.pa.us>) |
| Ответы |
Re: Idea for improving buildfarm robustness
|
| Список | pgsql-hackers |
I wrote:
> It strikes me that a different approach that might be of value would
> be to re-read postmaster.pid and make sure that (a) it's still there
> and (b) it still contains the current postmaster's PID. This would
> be morally equivalent to what Jim suggests above, and it would dodge
> the checkpointer-destroys-the-evidence problem, and it would have the
> additional advantage that we'd notice when a brain-dead DBA decides
> to manually remove postmaster.pid so he can start a new postmaster.
> (It's probably far too late to avoid data corruption at that point,
> but better late than never.)
> This is still not bulletproof against all overwrite-with-a-backup
> scenarios, but it seems like a noticeable improvement over what we
> discussed yesterday.
Here's a rewritten patch that looks at postmaster.pid instead of
pg_control. It should be effectively the same as the prior patch in terms
of response to directory-removal cases, and it should also catch many
overwrite cases.
regards, tom lane
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index baa43b2..498ebfa 100644
*** a/src/backend/postmaster/postmaster.c
--- b/src/backend/postmaster/postmaster.c
*************** ServerLoop(void)
*** 1602,1610 ****
fd_set readmask;
int nSockets;
time_t now,
last_touch_time;
! last_touch_time = time(NULL);
nSockets = initMasks(&readmask);
--- 1602,1611 ----
fd_set readmask;
int nSockets;
time_t now,
+ last_lockfile_recheck_time,
last_touch_time;
! last_lockfile_recheck_time = last_touch_time = time(NULL);
nSockets = initMasks(&readmask);
*************** ServerLoop(void)
*** 1754,1772 ****
if (StartWorkerNeeded || HaveCrashedWorker)
maybe_start_bgworker();
- /*
- * Touch Unix socket and lock files every 58 minutes, to ensure that
- * they are not removed by overzealous /tmp-cleaning tasks. We assume
- * no one runs cleaners with cutoff times of less than an hour ...
- */
- now = time(NULL);
- if (now - last_touch_time >= 58 * SECS_PER_MINUTE)
- {
- TouchSocketFiles();
- TouchSocketLockFiles();
- last_touch_time = now;
- }
-
#ifdef HAVE_PTHREAD_IS_THREADED_NP
/*
--- 1755,1760 ----
*************** ServerLoop(void)
*** 1793,1798 ****
--- 1781,1828 ----
/* reset flag so we don't SIGKILL again */
AbortStartTime = 0;
}
+
+ /*
+ * Lastly, check to see if it's time to do some things that we don't
+ * want to do every single time through the loop, because they're a
+ * bit expensive. Note that there's up to a minute of slop in when
+ * these tasks will be performed, since DetermineSleepTime() will let
+ * us sleep at most that long.
+ */
+ now = time(NULL);
+
+ /*
+ * Once a minute, verify that postmaster.pid hasn't been removed or
+ * overwritten. If it has, we commit hara-kiri. This avoids having
+ * postmasters and child processes hanging around after their database
+ * is gone, and maybe causing problems if a new database cluster is
+ * created in the same place. It also provides some protection
+ * against a DBA foolishly removing postmaster.pid and manually
+ * starting a new postmaster. Data corruption is likely to ensue from
+ * that anyway, but we can minimize the damage by aborting ASAP.
+ */
+ if (now - last_lockfile_recheck_time >= 1 * SECS_PER_MINUTE)
+ {
+ if (!RecheckDataDirLockFile())
+ {
+ ereport(LOG,
+ (errmsg("performing immediate shutdown because data directory lock file is invalid")));
+ kill(MyProcPid, SIGQUIT);
+ }
+ last_lockfile_recheck_time = now;
+ }
+
+ /*
+ * Touch Unix socket and lock files every 58 minutes, to ensure that
+ * they are not removed by overzealous /tmp-cleaning tasks. We assume
+ * no one runs cleaners with cutoff times of less than an hour ...
+ */
+ if (now - last_touch_time >= 58 * SECS_PER_MINUTE)
+ {
+ TouchSocketFiles();
+ TouchSocketLockFiles();
+ last_touch_time = now;
+ }
}
}
diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c
index f0099d3..b6270e1 100644
*** a/src/backend/utils/init/miscinit.c
--- b/src/backend/utils/init/miscinit.c
*************** AddToDataDirLockFile(int target_line, co
*** 1202,1207 ****
--- 1202,1277 ----
}
+ /*
+ * Recheck that the data directory lock file still exists with expected
+ * content. Return TRUE if the lock file appears OK, FALSE if it isn't.
+ *
+ * We call this periodically in the postmaster. The idea is that if the
+ * lock file has been removed or replaced by another postmaster, we should
+ * do a panic database shutdown. Therefore, we should return TRUE if there
+ * is any doubt: we do not want to cause a panic shutdown unnecessarily.
+ * Transient failures like EINTR or ENFILE should not cause us to fail.
+ * (If there really is something wrong, we'll detect it on a future recheck.)
+ */
+ bool
+ RecheckDataDirLockFile(void)
+ {
+ int fd;
+ int len;
+ long file_pid;
+ char buffer[BLCKSZ];
+
+ fd = open(DIRECTORY_LOCK_FILE, O_RDWR | PG_BINARY, 0);
+ if (fd < 0)
+ {
+ /*
+ * There are many foreseeable false-positive error conditions. For
+ * safety, fail only on enumerated clearly-something-is-wrong
+ * conditions.
+ */
+ switch (errno)
+ {
+ case ENOENT:
+ case ENOTDIR:
+ /* disaster */
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\": %m",
+ DIRECTORY_LOCK_FILE)));
+ return false;
+ default:
+ /* non-fatal, at least for now */
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\": %m; continuing anyway",
+ DIRECTORY_LOCK_FILE)));
+ return true;
+ }
+ }
+ len = read(fd, buffer, sizeof(buffer) - 1);
+ if (len < 0)
+ {
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not read from file \"%s\": %m",
+ DIRECTORY_LOCK_FILE)));
+ close(fd);
+ return true; /* treat read failure as nonfatal */
+ }
+ buffer[len] = '\0';
+ close(fd);
+ file_pid = atol(buffer);
+ if (file_pid == getpid())
+ return true; /* all is well */
+
+ /* Trouble: someone's overwritten the lock file */
+ ereport(LOG,
+ (errmsg("lock file \"%s\" contains wrong PID: %ld instead of %ld",
+ DIRECTORY_LOCK_FILE, file_pid, (long) getpid())));
+ return false;
+ }
+
+
/*-------------------------------------------------------------------------
* Version checking support
*-------------------------------------------------------------------------
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index 80ac732..87133bd 100644
*** a/src/include/miscadmin.h
--- b/src/include/miscadmin.h
*************** extern void CreateSocketLockFile(const c
*** 450,455 ****
--- 450,456 ----
const char *socketDir);
extern void TouchSocketLockFiles(void);
extern void AddToDataDirLockFile(int target_line, const char *str);
+ extern bool RecheckDataDirLockFile(void);
extern void ValidatePgVersion(const char *path);
extern void process_shared_preload_libraries(void);
extern void process_session_preload_libraries(void);
В списке pgsql-hackers по дате отправления: