diff --git a/src/backend/replication/logical/slotsync.c b/src/backend/replication/logical/slotsync.c index 66fb46ac27..72a775b09c 100644 --- a/src/backend/replication/logical/slotsync.c +++ b/src/backend/replication/logical/slotsync.c @@ -79,10 +79,11 @@ * and also sets stopSignaled=true to handle the race condition when the * postmaster has not noticed the promotion yet and thus may end up restarting * the slot sync worker. If stopSignaled is set, the worker will exit in such a - * case. Note that we don't need to reset this variable as after promotion the - * slot sync worker won't be restarted because the pmState changes to PM_RUN from - * PM_HOT_STANDBY and we don't support demoting primary without restarting the - * server. See MaybeStartSlotSyncWorker. + * case. The SQL function pg_sync_replication_slots() will also error out if + * this flag is set. Note that we don't need to reset this variable as after + * promotion the slot sync worker won't be restarted because the pmState + * changes to PM_RUN from PM_HOT_STANDBY and we don't support demoting + * primary without restarting the server. See MaybeStartSlotSyncWorker. * * The 'syncing' flag is needed to prevent concurrent slot syncs to avoid slot * overwrites. @@ -1246,12 +1247,11 @@ wait_for_slot_activity(bool some_slot_updated) } /* - * Check stopSignaled and syncing flags. Emit error if promotion has - * already set stopSignaled or if it is concurrent sync call. Otherwise, - * set 'syncing' flag and pid info. + * Emit an error if a promotion or a concurrent sync call is in progress. + * Otherwise, advertise that a sync is in progress. */ static void -check_flags_and_set_sync_info(pid_t worker_pid) +check_and_set_sync_info(pid_t worker_pid) { SpinLockAcquire(&SlotSyncCtx->mutex); @@ -1259,9 +1259,8 @@ check_flags_and_set_sync_info(pid_t worker_pid) Assert(worker_pid == InvalidPid || SlotSyncCtx->pid == InvalidPid); /* - * Startup process signaled the slot sync machinery to stop, so if - * meanwhile postmaster ended up starting the worker again or user has - * invoked pg_sync_replication_slots(), error out. + * Emit an error if startup process signaled the slot sync machinery to + * stop. See comments atop SlotSyncCtxStruct. */ if (SlotSyncCtx->stopSignaled) { @@ -1281,7 +1280,10 @@ check_flags_and_set_sync_info(pid_t worker_pid) SlotSyncCtx->syncing = true; - /* Advertise our PID so that the startup process can kill us on promotion */ + /* + * Advertise the required PID so that the startup process can kill the slot + * sync worker on promotion. + */ SlotSyncCtx->pid = worker_pid; SpinLockRelease(&SlotSyncCtx->mutex); @@ -1333,13 +1335,13 @@ ReplSlotSyncWorkerMain(char *startup_data, size_t startup_data_len) /* * Register slotsync_worker_onexit() before we register - * ReplicationSlotShmemExit() in BaseInit(), to ensure that during exit of - * slot sync worker, ReplicationSlotShmemExit() is called first, followed - * by slotsync_worker_onexit(). Startup process during promotion calls - * ShutDownSlotSync() which waits for slot sync to finish and it does that - * by checking the 'syncing' flag. Thus it is important that worker should - * be done with slots' release and cleanup before it actually marks itself - * as finished syncing. + * ReplicationSlotShmemExit() in BaseInit(), to ensure that during the exit + * of the slot sync worker, ReplicationSlotShmemExit() is called first, + * followed by slotsync_worker_onexit(). The startup process during + * promotion invokes ShutDownSlotSync() which waits for slot sync to finish + * and it does that by checking the 'syncing' flag. Thus worker must be + * done with the slots' release and cleanup before it marks itself as + * finished syncing. */ before_shmem_exit(slotsync_worker_onexit, (Datum) 0); @@ -1391,7 +1393,7 @@ ReplSlotSyncWorkerMain(char *startup_data, size_t startup_data_len) pqsignal(SIGPIPE, SIG_IGN); pqsignal(SIGCHLD, SIG_DFL); - check_flags_and_set_sync_info(MyProcPid); + check_and_set_sync_info(MyProcPid); ereport(LOG, errmsg("slot sync worker started")); @@ -1544,9 +1546,9 @@ update_synced_slots_inactive_since(void) /* * Shut down the slot sync worker. * - * It sends signal to shutdown slot sync worker. It also waits till - * the slot sync worker has exited and pg_sync_replication_slots() - * has finished. + * This function sends signal to shutdown slot sync worker, if required. It + * also waits till the slot sync worker has exited or + * pg_sync_replication_slots() has finished. */ void ShutDownSlotSync(void) @@ -1593,10 +1595,7 @@ ShutDownSlotSync(void) SpinLockAcquire(&SlotSyncCtx->mutex); - /* - * Confirm that both the worker and the function - * pg_sync_replication_slots() are done. - */ + /* Ensure that no process is syncing the slots. */ if (!SlotSyncCtx->syncing) break; @@ -1685,12 +1684,13 @@ slotsync_failure_callback(int code, Datum arg) /* * We need to do slots cleanup here just like WalSndErrorCleanup() does. * - * Startup process during promotion calls ShutDownSlotSync() which waits - * for slot sync to finish and it does that by checking the 'syncing' - * flag. Thus it is important that SQL function should be done with slots' - * release and cleanup before it actually marks itself as finished - * syncing. + * The startup process during promotion invokes ShutDownSlotSync() which + * waits for slot sync to finish and it does that by checking the 'syncing' + * flag. Thus the SQL function must be done with slots' release and cleanup + * to avoid any dangling temporary slots or active slots before it marks + * itself as finished syncing. */ + /* Make sure active replication slots are released */ if (MyReplicationSlot != NULL) ReplicationSlotRelease(); @@ -1699,9 +1699,9 @@ slotsync_failure_callback(int code, Datum arg) ReplicationSlotCleanup(); /* - * If syncing_slots is true, it indicates that the process errored out - * without resetting the flag. So, we need to clean up shared memory and - * reset the flag here. + * The set syncing_slots indicates that the process errored out without + * resetting the flag. So, we need to clean up shared memory and reset the + * flag here. */ if (syncing_slots) reset_syncing_flag(); @@ -1718,7 +1718,7 @@ SyncReplicationSlots(WalReceiverConn *wrconn) { PG_ENSURE_ERROR_CLEANUP(slotsync_failure_callback, PointerGetDatum(wrconn)); { - check_flags_and_set_sync_info(InvalidPid); + check_and_set_sync_info(InvalidPid); validate_remote_info(wrconn);