parallel queries hang

Поиск
Список
Период
Сортировка
От Euler Taveira
Тема parallel queries hang
Дата
Msg-id CAHE3wggOccFrWsNTnMQqPDDc1C=J=oqkoupDdJmqFyiHy58npg@mail.gmail.com
обсуждение исходный текст
Ответы Re: parallel queries hang  (Sergei Kornilov <sk@zsrv.org>)
Список pgsql-bugs
Hi,

Parallel queries hang and can't terminate backends (SIGTERM). It seems
the same (or related) case as reported in [1]. It is a postgres 10.4
on RHEL 6.9. It is not the first time it happens (but it is the first
time I got stack traces). I have two cases today.

The first case is waiting on IPC (wait_event_type) for BtreePage (wait_event).

postgres  15785  0.0  0.0 6670688 733652 ?      Ss   Oct17   0:02  \_
postgres: usuario prod 10.0.0.2(38065) SELECT
postgres  16664  0.0  0.0 6667696 747816 ?      Ss   Oct17   0:02  \_
postgres: bgworker: parallel worker for PID 15785

The stack trace for PID 15785:

(gdb) bt
#0  0x00000030ac6e91a3 in __epoll_wait_nocancel () from /lib64/libc.so.6
#1  0x00000000006f6b65 in WaitEventSetWaitBlock (set=0xdcc680,
timeout=-1, occurred_events=0x7ffd0ceed320, nevents=1,
    wait_event_info=<value optimized out>) at latch.c:1048
#2  WaitEventSetWait (set=0xdcc680, timeout=-1,
occurred_events=0x7ffd0ceed320, nevents=1, wait_event_info=<value
optimized out>) at latch.c:1000
#3  0x000000000070f207 in ConditionVariableSleep (cv=<value optimized
out>, wait_event_info=134217730) at condition_variable.c:149
#4  0x00000000004bffe1 in _bt_parallel_seize (scan=<value optimized
out>, pageno=0x7ffd0ceed3ec) at nbtree.c:811
#5  0x00000000004c1ffe in _bt_readnextpage (scan=0xfe2da8, blkno=0,
dir=ForwardScanDirection) at nbtsearch.c:1498
#6  0x00000000004c21fe in _bt_steppage (scan=0xfe2da8,
dir=ForwardScanDirection) at nbtsearch.c:1429
#7  0x00000000004c2384 in _bt_next (scan=0xfe2da8, dir=<value
optimized out>) at nbtsearch.c:1150
#8  0x00000000004c06fc in btgettuple (scan=0xfe2da8,
dir=ForwardScanDirection) at nbtree.c:399
#9  0x00000000004b9c9a in index_getnext_tid (scan=0xfe2da8,
direction=<value optimized out>) at indexam.c:541
#10 0x00000000004b9d4a in index_getnext (scan=0xfe2da8,
direction=ForwardScanDirection) at indexam.c:679
#11 0x0000000000606b89 in IndexNext (node=0xf07f58) at nodeIndexscan.c:134
#12 0x00000000005f5ee7 in ExecScanFetch (node=0xf07f58,
accessMtd=0x606b20 <IndexNext>, recheckMtd=0x606ac0 <IndexRecheck>) at
execScan.c:97
#13 ExecScan (node=0xf07f58, accessMtd=0x606b20 <IndexNext>,
recheckMtd=0x606ac0 <IndexRecheck>) at execScan.c:147
#14 0x000000000060eccc in ExecProcNode (pstate=0xf07d98) at
../../../src/include/executor/executor.h:250
#15 ExecNestLoop (pstate=0xf07d98) at nodeNestloop.c:109
#16 0x000000000060eccc in ExecProcNode (pstate=0xf07bd8) at
../../../src/include/executor/executor.h:250
#17 ExecNestLoop (pstate=0xf07bd8) at nodeNestloop.c:109
#18 0x0000000000602cff in ExecProcNode (pstate=0xf07858) at
../../../src/include/executor/executor.h:250
#19 gather_getnext (pstate=0xf07858) at nodeGather.c:286
#20 ExecGather (pstate=0xf07858) at nodeGather.c:215
#21 0x00000000005f333f in ExecProcNode (queryDesc=0xe1d578,
direction=<value optimized out>, count=0, execute_once=88 'X')
    at ../../../src/include/executor/executor.h:250
#22 ExecutePlan (queryDesc=0xe1d578, direction=<value optimized out>,
count=0, execute_once=88 'X') at execMain.c:1722
#23 standard_ExecutorRun (queryDesc=0xe1d578, direction=<value
optimized out>, count=0, execute_once=88 'X') at execMain.c:363
#24 0x000000000071a77b in PortalRunSelect (portal=0xe320d8,
forward=<value optimized out>, count=0, dest=<value optimized out>) at
pquery.c:932
#25 0x000000000071b8e1 in PortalRun (portal=0xe320d8,
count=9223372036854775807, isTopLevel=1 '\001', run_once=1 '\001',
dest=0xf746f0,
    altdest=0xf746f0, completionTag=0x7ffd0ceed950 "") at pquery.c:773
#26 0x0000000000718097 in exec_simple_query (
    query_string=0xdb0c18 "query goes here"...) at postgres.c:1099
#27 0x0000000000718fd9 in PostgresMain (argc=<value optimized out>,
argv=<value optimized out>, dbname=0xd076d8 "prod",
    username=<value optimized out>) at postgres.c:4088
#28 0x00000000006ae04d in BackendRun (argc=<value optimized out>,
argv=<value optimized out>) at postmaster.c:4405
#29 BackendStartup (argc=<value optimized out>, argv=<value optimized
out>) at postmaster.c:4077
#30 ServerLoop (argc=<value optimized out>, argv=<value optimized
out>) at postmaster.c:1755
#31 PostmasterMain (argc=<value optimized out>, argv=<value optimized
out>) at postmaster.c:1363
#32 0x000000000062f050 in main (argc=3, argv=0xd05540) at main.c:228

The stack trace for PID 16664:

(gdb) bt
#0  0x00000030ac6e91a3 in __epoll_wait_nocancel () from /lib64/libc.so.6
#1  0x00000000006f6b65 in WaitEventSetWaitBlock (set=0xde4598,
timeout=-1, occurred_events=0x7ffd0ceecd90, nevents=1,
    wait_event_info=<value optimized out>) at latch.c:1048
#2  WaitEventSetWait (set=0xde4598, timeout=-1,
occurred_events=0x7ffd0ceecd90, nevents=1, wait_event_info=<value
optimized out>) at latch.c:1000
#3  0x000000000070f207 in ConditionVariableSleep (cv=<value optimized
out>, wait_event_info=134217730) at condition_variable.c:149
#4  0x00000000004bffe1 in _bt_parallel_seize (scan=<value optimized
out>, pageno=0x7ffd0ceece5c) at nbtree.c:811
#5  0x00000000004c1ffe in _bt_readnextpage (scan=0xe0c8c0, blkno=0,
dir=ForwardScanDirection) at nbtsearch.c:1498
#6  0x00000000004c21fe in _bt_steppage (scan=0xe0c8c0,
dir=ForwardScanDirection) at nbtsearch.c:1429
#7  0x00000000004c2384 in _bt_next (scan=0xe0c8c0, dir=<value
optimized out>) at nbtsearch.c:1150
#8  0x00000000004c06fc in btgettuple (scan=0xe0c8c0,
dir=ForwardScanDirection) at nbtree.c:399
#9  0x00000000004b9c9a in index_getnext_tid (scan=0xe0c8c0,
direction=<value optimized out>) at indexam.c:541
#10 0x00000000004b9d4a in index_getnext (scan=0xe0c8c0,
direction=ForwardScanDirection) at indexam.c:679
#11 0x0000000000606b89 in IndexNext (node=0xddefe8) at nodeIndexscan.c:134
#12 0x00000000005f5ee7 in ExecScanFetch (node=0xddefe8,
accessMtd=0x606b20 <IndexNext>, recheckMtd=0x606ac0 <IndexRecheck>) at
execScan.c:97
#13 ExecScan (node=0xddefe8, accessMtd=0x606b20 <IndexNext>,
recheckMtd=0x606ac0 <IndexRecheck>) at execScan.c:147
#14 0x000000000060eccc in ExecProcNode (pstate=0xdded28) at
../../../src/include/executor/executor.h:250
#15 ExecNestLoop (pstate=0xdded28) at nodeNestloop.c:109
#16 0x000000000060eccc in ExecProcNode (pstate=0xddee98) at
../../../src/include/executor/executor.h:250
#17 ExecNestLoop (pstate=0xddee98) at nodeNestloop.c:109
#18 0x00000000005f333f in ExecProcNode (queryDesc=0xe0c7a0,
direction=<value optimized out>, count=0, execute_once=-104 '\230')
    at ../../../src/include/executor/executor.h:250
#19 ExecutePlan (queryDesc=0xe0c7a0, direction=<value optimized out>,
count=0, execute_once=-104 '\230') at execMain.c:1722
#20 standard_ExecutorRun (queryDesc=0xe0c7a0, direction=<value
optimized out>, count=0, execute_once=-104 '\230') at execMain.c:363
#21 0x00000000005f3809 in ParallelQueryMain (seg=<value optimized
out>, toc=0x7f80a66e3000) at execParallel.c:997
#22 0x00000000004dac49 in ParallelWorkerMain (main_arg=<value
optimized out>) at parallel.c:1203
#23 0x000000000069f8f0 in StartBackgroundWorker () at bgworker.c:835
#24 0x00000000006a8aec in do_start_bgworker () at postmaster.c:5728
#25 maybe_start_bgworkers () at postmaster.c:5941
#26 0x00000000006ac115 in sigusr1_handler (postgres_signal_arg=<value
optimized out>) at postmaster.c:5121
#27 <signal handler called>
#28 0x00000030ac6e1603 in __select_nocancel () from /lib64/libc.so.6
#29 0x00000000006ad65d in ServerLoop (argc=<value optimized out>,
argv=<value optimized out>) at postmaster.c:1719
#30 PostmasterMain (argc=<value optimized out>, argv=<value optimized
out>) at postmaster.c:1363
#31 0x000000000062f050 in main (argc=3, argv=0xd05540) at main.c:228


The second case is waiting on IPC (wait_event_type) for
BgWorkerShutdown (wait_event):

postgres  16811  0.0  0.0 6671376 493104 ?      Ss   Oct17   0:01  \_
postgres: ana rdnet 10.0.32.200(38793) SELECT
postgres  17877  0.0  0.0 6667696 311972 ?      Ss   Oct17   0:00  \_
postgres: bgworker: parallel worker for PID 16811

The stack trace for PID 16811:

(gdb) bt
#0  0x00000030ac6e91a3 in __epoll_wait_nocancel () from /lib64/libc.so.6
#1  0x00000000006f6b65 in WaitEventSetWaitBlock (set=0xda2fa8,
timeout=-1, occurred_events=0x7ffd0ceed8e0, nevents=1,
    wait_event_info=<value optimized out>) at latch.c:1048
#2  WaitEventSetWait (set=0xda2fa8, timeout=-1,
occurred_events=0x7ffd0ceed8e0, nevents=1, wait_event_info=<value
optimized out>) at latch.c:1000
#3  0x00000000006f73c9 in WaitLatchOrSocket (latch=0x7f82325ab0b4,
wakeEvents=<value optimized out>, sock=-1, timeout=-1,
wait_event_info=134217728)
    at latch.c:385
#4  0x000000000069eff8 in WaitForBackgroundWorkerShutdown
(handle=0xd24f70) at bgworker.c:1154
#5  0x00000000004db11e in WaitForParallelWorkersToExit (pcxt=0xd24c78)
at parallel.c:655
#6  0x00000000004db248 in DestroyParallelContext (pcxt=0xd24c78) at
parallel.c:737
#7  0x00000000004db2e3 in AtEOXact_Parallel (isCommit=0 '\000') at
parallel.c:1006
#8  0x00000000004e4537 in AbortTransaction () at xact.c:2538
#9  0x00000000004e4c05 in AbortCurrentTransaction () at xact.c:3046
#10 0x000000000071971d in PostgresMain (argc=1, argv=<value optimized
out>, dbname=0xd076d8 "rdnet", username=<value optimized out>)
    at postgres.c:3879
#11 0x00000000006ae04d in BackendRun (argc=<value optimized out>,
argv=<value optimized out>) at postmaster.c:4405
#12 BackendStartup (argc=<value optimized out>, argv=<value optimized
out>) at postmaster.c:4077
#13 ServerLoop (argc=<value optimized out>, argv=<value optimized
out>) at postmaster.c:1755
#14 PostmasterMain (argc=<value optimized out>, argv=<value optimized
out>) at postmaster.c:1363
#15 0x000000000062f050 in main (argc=3, argv=0xd05540) at main.c:228

The stack trace for PID 17877:

(gdb) bt
#0  0x00000030ac6e91a3 in __epoll_wait_nocancel () from /lib64/libc.so.6
#1  0x00000000006f6b65 in WaitEventSetWaitBlock (set=0xde4968,
timeout=-1, occurred_events=0x7ffd0ceecd90, nevents=1,
    wait_event_info=<value optimized out>) at latch.c:1048
#2  WaitEventSetWait (set=0xde4968, timeout=-1,
occurred_events=0x7ffd0ceecd90, nevents=1, wait_event_info=<value
optimized out>) at latch.c:1000
#3  0x000000000070f207 in ConditionVariableSleep (cv=<value optimized
out>, wait_event_info=134217730) at condition_variable.c:149
#4  0x00000000004bffe1 in _bt_parallel_seize (scan=<value optimized
out>, pageno=0x7ffd0ceece5c) at nbtree.c:811
#5  0x00000000004c1ffe in _bt_readnextpage (scan=0xe0cc90, blkno=0,
dir=ForwardScanDirection) at nbtsearch.c:1498
#6  0x00000000004c21fe in _bt_steppage (scan=0xe0cc90,
dir=ForwardScanDirection) at nbtsearch.c:1429
#7  0x00000000004c2384 in _bt_next (scan=0xe0cc90, dir=<value
optimized out>) at nbtsearch.c:1150
#8  0x00000000004c06fc in btgettuple (scan=0xe0cc90,
dir=ForwardScanDirection) at nbtree.c:399
#9  0x00000000004b9c9a in index_getnext_tid (scan=0xe0cc90,
direction=<value optimized out>) at indexam.c:541
#10 0x00000000004b9d4a in index_getnext (scan=0xe0cc90,
direction=ForwardScanDirection) at indexam.c:679
#11 0x0000000000606b89 in IndexNext (node=0xddf3b8) at nodeIndexscan.c:134
#12 0x00000000005f5ee7 in ExecScanFetch (node=0xddf3b8,
accessMtd=0x606b20 <IndexNext>, recheckMtd=0x606ac0 <IndexRecheck>) at
execScan.c:97
#13 ExecScan (node=0xddf3b8, accessMtd=0x606b20 <IndexNext>,
recheckMtd=0x606ac0 <IndexRecheck>) at execScan.c:147
#14 0x000000000060eccc in ExecProcNode (pstate=0xddf0f8) at
../../../src/include/executor/executor.h:250
#15 ExecNestLoop (pstate=0xddf0f8) at nodeNestloop.c:109
#16 0x000000000060eccc in ExecProcNode (pstate=0xddf268) at
../../../src/include/executor/executor.h:250
#17 ExecNestLoop (pstate=0xddf268) at nodeNestloop.c:109
#18 0x00000000005f333f in ExecProcNode (queryDesc=0xe0cb70,
direction=<value optimized out>, count=0, execute_once=104 'h')
    at ../../../src/include/executor/executor.h:250
#19 ExecutePlan (queryDesc=0xe0cb70, direction=<value optimized out>,
count=0, execute_once=104 'h') at execMain.c:1722
#20 standard_ExecutorRun (queryDesc=0xe0cb70, direction=<value
optimized out>, count=0, execute_once=104 'h') at execMain.c:363
#21 0x00000000005f3809 in ParallelQueryMain (seg=<value optimized
out>, toc=0x7f80a66e3000) at execParallel.c:997
#22 0x00000000004dac49 in ParallelWorkerMain (main_arg=<value
optimized out>) at parallel.c:1203
#23 0x000000000069f8f0 in StartBackgroundWorker () at bgworker.c:835
#24 0x00000000006a8aec in do_start_bgworker () at postmaster.c:5728
#25 maybe_start_bgworkers () at postmaster.c:5941
#26 0x00000000006ac115 in sigusr1_handler (postgres_signal_arg=<value
optimized out>) at postmaster.c:5121
#27 <signal handler called>
#28 0x00000030ac6e1603 in __select_nocancel () from /lib64/libc.so.6
#29 0x00000000006ad65d in ServerLoop (argc=<value optimized out>,
argv=<value optimized out>) at postmaster.c:1719
#30 PostmasterMain (argc=<value optimized out>, argv=<value optimized
out>) at postmaster.c:1363
#31 0x000000000062f050 in main (argc=3, argv=0xd05540) at main.c:228

The non-cancellable queries is not an expected behavior because you
have to shutdown (immediate) postgres.


[1] https://www.postgresql.org/message-id/151724453314.1238.409882538067070269%40wrigleys.postgresql.org


-- 
   Euler Taveira                                   Timbira -
http://www.timbira.com.br/
   PostgreSQL: Consultoria, Desenvolvimento, Suporte 24x7 e Treinamento


В списке pgsql-bugs по дате отправления:

Предыдущее
От: PG Bug reporting form
Дата:
Сообщение: BUG #15437: Segfault during insert into declarative partitioned tablewith a trigger creating partition
Следующее
От: Sergei Kornilov
Дата:
Сообщение: Re: parallel queries hang