"wal receiver" process hang in syslog() while exiting after receivingSIGTERM while the postgres has been promoted.

Поиск
Список
Период
Сортировка
От Chen, Yan-Jack (NSB - CN/Hangzhou)
Тема "wal receiver" process hang in syslog() while exiting after receivingSIGTERM while the postgres has been promoted.
Дата
Msg-id HE1PR0702MB36260727D800D630F3DBFB52F8760@HE1PR0702MB3626.eurprd07.prod.outlook.com
обсуждение исходный текст
Ответы Re: "wal receiver" process hang in syslog() while exiting afterreceiving SIGTERM while the postgres has been promoted.  (Robert Haas <robertmhaas@gmail.com>)
Список pgsql-hackers
Hi Hackers,
  We encounter one problem happened while we try to promote standby postgres(version 9.6.9) server to active. From the trace(we triggered the process abort). We can see the process was hang in syslog() while handling SIGTERM. According to below article. Looks it is risky to write syslog in signal handling. Any idea to avoid it?
 
 
 
[New LWP 7741]
b[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
Core was generated by `postgres: wal receiver process                     '.
Program terminated with signal SIGABRT, Aborted.
#0  0x00007ff25122d8db in __lll_lock_wait_private () from /lib64/libc.so.6
Missing separate debuginfos, use: dnf debuginfo-install postgresql-server-9.6.9-3.wf28.x86_64
(gdb) bt full
 
#0  0x00007ff25122d8db in __lll_lock_wait_private () from /lib64/libc.so.6
No symbol table info available.
#1  0x00007ff25121aa7b in __vsyslog_chk () from /lib64/libc.so.6
No symbol table info available.
#2  0x00007ff25121ae5c in syslog () from /lib64/libc.so.6
No symbol table info available.
#3  0x0000000000705e88 in write_syslog (level=3, line=0x2b19cd8 "FATAL:  terminating walreceiver process due to administrator command\n") at elog.c:2033
        buf = "FATAL:  terminating walreceiver process due to administrator command\000\200\310\a\000\000\000\000\000U\211*Q\362\177\000\000\000\000\000\000\000\000\000\000P\317\033\373\377\177", '\000' <repeats 42 times>, "\020\272\261\002\000\000\000\000\020", '\000' <repeats 11 times>, "\001", '\000' <repeats 35 times>, "\240\311\033"...
        buflen = 68
        i = <optimized out>
        chunk_nr = <optimized out>
        seq = 4
        len = 69
        nlpos = 0x2b19d1c "\n"
#4  0x0000000000707105 in send_message_to_server_log (edata=0xb4ded8 <errordata+184>) at elog.c:3003
        syslog_level = <optimized out>
        buf = {data = 0x2b19cd8 "FATAL:  terminating walreceiver process due to administrator command\n", len = 69, maxlen = 1024, cursor = 0}
        buf = <optimized out>
        syslog_level = <optimized out>
#5  EmitErrorReport () at elog.c:1479
        edata = 0xb4ded8 <errordata+184>
        oldcontext = <optimized out>
#6  0x0000000000707d8f in errfinish (dummy=<optimized out>) at elog.c:483
        edata = 0xb4ded8 <errordata+184>
        elevel = 21
        oldcontext = <optimized out>
        econtext = 0x0
        __func__ = "errfinish"
#7  0x0000000000622e58 in ProcessWalRcvInterrupts () at walreceiver.c:171
No locals.
#8  0x0000000000622e9f in WalRcvShutdownHandler (postgres_signal_arg=<optimized out>) at walreceiver.c:821
        save_errno = 0
#9  <signal handler called>
No symbol table info available.
#10 0x00007ff2512218fb in send () from /lib64/libc.so.6
No symbol table info available.
#11 0x00007ff25121ab72 in __vsyslog_chk () from /lib64/libc.so.6
No symbol table info available.
#12 0x00007ff25121ae5c in syslog () from /lib64/libc.so.6
No symbol table info available.
#13 0x0000000000705e88 in write_syslog (level=3, line=0x2b19929 "\t\tIs the server running on host \"192.168.1.8\" and accepting\n\t\tTCP/IP connections on port 5432?\n\t\n") at elog.c:2033
        buf = "\t\tIs the server running on host \"192.168.1.8\" and accepting\000onnect to server: Connection refused\000\377\177\000\000x\376\263\002", '\000' <repeats 12 times>, "\003\000\000\000\000\000\000\000\000\300\rR\362\177\000\000\377\377\377\377\000\000\000\000\343\030bm\225k\330\001\270\267\337E\362\177\000\000\360\372\263\002\000\000\000\000@\324\033\373\377\177\000\000\000\267\261\030\275\036y=`\324\033\373\377\177\000\000`\324\033\373\377\177\000\000\220\325\033"...
        buflen = 59
        i = <optimized out>
        chunk_nr = <optimized out>
        seq = 4
        len = 97
        nlpos = 0x2b19964 "\n\t\tTCP/IP connections on port 5432?\n\t\n"
#14 0x0000000000707105 in send_message_to_server_log (edata=0xb4de20 <errordata>) at elog.c:3003
        syslog_level = <optimized out>
        buf = {data = 0x2b198c8 "terminating walreceiver process due to administrator command", len = 194, maxlen = 1024, cursor = 0}
        buf = <optimized out>
        syslog_level = <optimized out>
#15 EmitErrorReport () at elog.c:1479
        edata = 0xb4de20 <errordata>
        oldcontext = <optimized out>
#16 0x0000000000707d8f in errfinish (dummy=<optimized out>) at elog.c:483
        edata = 0xb4de20 <errordata>
        elevel = 21
        oldcontext = <optimized out>
        econtext = 0x0
        __func__ = "errfinish"
#17 0x00007ff245dfcc90 in libpqrcv_connect () from /usr/lib/libpqwalreceiver.so
No symbol table info available.
#18 0x000000000062352b in WalReceiverMain () at walreceiver.c:314
        conninfo = "host=192.168.1.8 port=5432 user=_nokfssystestpostgres\000ION\000\354Q\362\177\000\000\001", '\000' <repeats 16 times>, "[\fR\362\177\000\000\250o\023Q\362\177\000\000\060\335\033\373\377\177\000\000:\274\354Q\362\177\000\000x", '\000' <repeats 16 times>, "\020NQ\362\177\000\000x\000\000\000\000\000\000\000\222\335\033\373\377\177\000\000=\245*Q\362\177\000\000\222\335\033\373\377\177\000\000:\274\354Q\362\177\000\000=\036", '\000' <repeats 14 times>, "\001\000\000\000\000\000\000\000"...
        tmp_conninfo = <optimized out>
        slotname = "\000\000\000\000\000\000\000\000?\002", '\000' <repeats 12 times>, " \264MQ\362\177\000\000\t\000\000\000\000\000\000\000\260\335\033\373\377\177\000\000\060?\002\000\000\000\000\316l\031Q\362\177\000"
        startpoint = 578813952
        startpointTLI = 55
        primaryTLI = 1360676704
        first_stream = <optimized out>
        walrcv = 0x7ff24fc0a500
        last_recv_timestamp = <optimized out>
        now = 582800572427736
        ping_sent = <optimized out>
        __func__ = "WalReceiverMain"
#19 0x00000000004d56f7 in AuxiliaryProcessMain (argc=argc@entry=2, argv=argv@entry=0x7ffffb1bddb0) at bootstrap.c:440
        progname = 0x73f59e "postgres"
        flag = -1
        userDoption = 0x0
        __func__ = "AuxiliaryProcessMain"
#20 0x00000000006096ea in StartChildProcess (type=WalReceiverProcess) at postmaster.c:5300
        pid = <optimized out>
        av = {0x73f59e "postgres", 0x7ffffb1bdd90 "-x6", 0x0, 0x1dc0 <error: Cannot access memory at address 0x1dc0>, 0x2b3c850 "\220\061\264\002", 0x7ff2520c5b00 "", 0x4 <error: Cannot access memory at address 0x4>, 0x0,
          0x2b3c830 "\220\061\264\002", 0x60a781 <reaper+1490> "E\211,$H\203\304\020[]A\\A]A^?\005\310DW"}
        ac = 2
        typebuf = "-x6", '\000' <repeats 21 times>, "\306jp\000\000\000\000"
        save_errno = <optimized out>
#21 0x0000000000609e3f in MaybeStartWalReceiver () at postmaster.c:5447
No locals.
#22 0x000000000060aae3 in sigusr1_handler (postgres_signal_arg=<optimized out>) at postmaster.c:5084
        save_errno = 4
        __func__ = "sigusr1_handler"
#23 <signal handler called>
No symbol table info available.
#24 0x00007ff251217f57 in select () from /lib64/libc.so.6
No symbol table info available.
#25 0x000000000046ec0a in ServerLoop () at postmaster.c:1685
        timeout = {tv_sec = 58, tv_usec = 886165}
        rmask = {fds_bits = {40, 0 <repeats 15 times>}}
        selres = <optimized out>
        now = <optimized out>
        readmask = {fds_bits = {40, 0 <repeats 15 times>}}
        nSockets = <optimized out>
        last_lockfile_recheck_time = 1529485360
        last_touch_time = 1529485360
        __func__ = "ServerLoop"
#26 0x000000000060b94b in PostmasterMain (argc=argc@entry=3, argv=argv@entry=0x2b176f0) at postmaster.c:1329
        opt = <optimized out>
        status = <optimized out>
        userDoption = <optimized out>
        listen_addr_saved = <optimized out>
        i = <optimized out>
        output_config_variable = <optimized out>
        __func__ = "PostmasterMain"
#27 0x000000000047001e in main (argc=3, argv=0x2b176f0) at main.c:228
No locals.
 
Br. J
Chen Yan(Jack)
 
 
 

В списке pgsql-hackers по дате отправления:

Предыдущее
От: Craig Ringer
Дата:
Сообщение: Re: PATCH: backtraces for error messages
Следующее
От: Michael Paquier
Дата:
Сообщение: Re: PATCH: backtraces for error messages