Обсуждение: BUG #3883: Autovacuum deadlock with truncate?

Поиск
Список
Период
Сортировка

BUG #3883: Autovacuum deadlock with truncate?

От
"Steven Flatt"
Дата:
The following bug has been logged online:

Bug reference:      3883
Logged by:          Steven Flatt
Email address:      steven.flatt@gmail.com
PostgreSQL version: 8.2.4
Operating system:   FreeBSD 6.1
Description:        Autovacuum deadlock with truncate?
Details:

This isn't a postgres deadlock per se, but the end result is that two
postgres backends are stuck, each waiting on a PGSemaphoreLock that the
other presumably has.  The processes have been stuck for hours.

First process is the postgres autovacuum.
Second process is a PLpgSQL function which is intended to "clean tables".
It goes through a list of tables and truncates them if they have any data.
I realize that it generally doesn't make a great deal of sense to truncate
data so soon after being modified that the data hasn't even been
vacuumed/analyzed, but this happened in a test environment and we'd like to
understand the root cause.

pg_locks info and gdb backtraces follow:

-------------------------------------------------------

pid 35775 (autovacuum)

=# select pid,relation,mode,granted from pg_locks where pid = 35775;
  pid  | relation |           mode           | granted
-------+----------+--------------------------+---------
 35775 |    16783 | ShareUpdateExclusiveLock | t
 35775 |    16788 | RowExclusiveLock         | t
 35775 |    16790 | RowExclusiveLock         | t
 35775 |    16791 | RowExclusiveLock         | t
 35775 |          | ExclusiveLock            | t
(5 rows)

(gdb) bt
#0  0x6854a5b7 in semop () from /lib/libc.so.6
#1  0x081c7b4f in PGSemaphoreLock (sema=0x86adf888, interruptOK=1 '\001')
    at pg_sema.c:411
#2  0x081f50f1 in ProcWaitForSignal () at proc.c:1075
#3  0x081e78e3 in LockBufferForCleanup (buffer=14408) at bufmgr.c:1926
#4  0x081541c2 in lazy_vacuum_heap (onerel=0x86a8c08,
vacrelstats=0x8668170)
    at vacuumlazy.c:552
#5  0x08153fa0 in lazy_scan_heap (onerel=0x86a8c08, vacrelstats=0x8668170,
    Irel=0x8668158, nindexes=3) at vacuumlazy.c:482
#6  0x08153722 in lazy_vacuum_rel (onerel=0x86a8c08, vacstmt=0x86c6be0)
    at vacuumlazy.c:164
#7  0x0814f623 in vacuum_rel (relid=16783, vacstmt=0x86c6be0,
    expected_relkind=114 'r') at vacuum.c:1098
#8  0x0814eb10 in vacuum (vacstmt=0x86c6be0, relids=0x86c6d18) at
vacuum.c:397
#9  0x081c9d57 in autovacuum_do_vac_analyze (relid=16783, dovacuum=1
'\001',
    doanalyze=1 '\001', freeze_min_age=100000000) at autovacuum.c:912
#10 0x081c97e4 in do_autovacuum (dbentry=0x840dc08) at autovacuum.c:659
#11 0x081c92c1 in AutoVacMain (argc=0, argv=0x0) at autovacuum.c:433
#12 0x081c8f3a in autovac_start () at autovacuum.c:178
#13 0x081cf01a in ServerLoop () at postmaster.c:1249
#14 0x081ce916 in PostmasterMain (argc=3, argv=0x9fbfed40) at
postmaster.c:963
#15 0x0817fef0 in main (argc=3, argv=0x9fbfed40) at main.c:188

-------------------------------------------------------

pid 6869 (function to clean tables)

=# select pid,relation,mode,granted from pg_locks where pid = 6869 and not
granted;
 pid  | relation |        mode         | granted
------+----------+---------------------+---------
 6869 |    16783 | AccessExclusiveLock | f

(gdb) bt
#0  0x6854a5b7 in semop () from /lib/libc.so.6
#1  0x081c7b4f in PGSemaphoreLock (sema=0x86ae0890, interruptOK=1 '\001')
    at pg_sema.c:411
#2  0x081f4e29 in ProcSleep (locallock=0x8f5e160,
lockMethodTable=0x8332324)
    at proc.c:829
#3  0x081f2660 in WaitOnLock (locallock=0x8f5e160, owner=0x9965db8)
    at lock.c:1140
#4  0x081f2120 in LockAcquire (locktag=0x9fbfdc70, lockmode=8,
    sessionLock=0 '\0', dontWait=0 '\0') at lock.c:792
#5  0x081f0eb4 in LockRelationOid (relid=16783, lockmode=8) at lmgr.c:81
#6  0x08091b6e in relation_open (relationId=16783, lockmode=8) at
heapam.c:694
#7  0x08091db2 in relation_openrv (relation=0xbdf9110, lockmode=8)
    at heapam.c:821
#8  0x08091ef4 in heap_openrv (relation=0xbdf9110, lockmode=8) at
heapam.c:891
#9  0x08135610 in ExecuteTruncate (stmt=0xbdf9160) at tablecmds.c:549
#10 0x08202da1 in ProcessUtility (parsetree=0xbdf9160, params=0x0,
    dest=0x835b4c0, completionTag=0x0) at utility.c:626
#11 0x08175868 in _SPI_execute_plan (plan=0x9fbfde80, Values=0x0,
Nulls=0x0,
    snapshot=0x0, crosscheck_snapshot=0x0, read_only=0 '\0', tcount=0)
    at spi.c:1496
#12 0x0817374d in SPI_execute (
    src=0x8a1db18 "TRUNCATE foo", read_only=0 '\0',
    tcount=0) at spi.c:316
#13 0x87d23df1 in exec_stmt_dynexecute (estate=0x9fbfe270, stmt=0x84e01b0)
    at pl_exec.c:2561
#14 0x87d21ba2 in exec_stmt (estate=0x9fbfe270, stmt=0x84e01b0)
    at pl_exec.c:1212
#15 0x87d2196d in exec_stmts (estate=0x9fbfe270, stmts=0x84e0038)
    at pl_exec.c:1123
#16 0x87d2450d in exec_stmt_dynfors (estate=0x9fbfe270, stmt=0x84dfde0)
    at pl_exec.c:2793
#17 0x87d21bbb in exec_stmt (estate=0x9fbfe270, stmt=0x84dfde0)
    at pl_exec.c:1216
#18 0x87d2196d in exec_stmts (estate=0x9fbfe270, stmts=0x84dffb0)
    at pl_exec.c:1123
#19 0x87d226e6 in exec_stmt_fors (estate=0x9fbfe270, stmt=0x84df9e0)
    at pl_exec.c:1694
#20 0x87d21b0c in exec_stmt (estate=0x9fbfe270, stmt=0x84df9e0)
    at pl_exec.c:1188
#21 0x87d2196d in exec_stmts (estate=0x9fbfe270, stmts=0x84dfed0)
    at pl_exec.c:1123
#22 0x87d21e85 in exec_stmt_if (estate=0x9fbfe270, stmt=0x84e0760)
    at pl_exec.c:1346
#23 0x87d21aa8 in exec_stmt (estate=0x9fbfe270, stmt=0x84e0760)
    at pl_exec.c:1172
#24 0x87d2196d in exec_stmts (estate=0x9fbfe270, stmts=0x84dff78)
    at pl_exec.c:1123
#25 0x87d217ec in exec_stmt_block (estate=0x9fbfe270, block=0x84e1db0)
    at pl_exec.c:1068
#26 0x87d20167 in plpgsql_exec_function (func=0x84c22f8, fcinfo=0x9fbfe3b0)
    at pl_exec.c:286
#27 0x87d1c2c3 in plpgsql_call_handler (fcinfo=0x9fbfe3b0) at
pl_handler.c:95
#28 0x0815d00a in ExecMakeFunctionResult (fcache=0x87982b8,
    econtext=0x8798230, isNull=0x8798840 "", isDone=0x8798898)
    at execQual.c:1269
#29 0x0815d860 in ExecEvalFunc (fcache=0x87982b8, econtext=0x8798230,
    isNull=0x8798840 "", isDone=0x8798898) at execQual.c:1671
#30 0x08161ad2 in ExecTargetList (targetlist=0x8798710, econtext=0x8798230,
    values=0x8798830, isnull=0x8798840 "", itemIsDone=0x8798898,
    isDone=0x9fbfe694) at execQual.c:4119
#31 0x08161ec4 in ExecProject (projInfo=0x8798850, isDone=0x9fbfe694)
    at execQual.c:4320
#32 0x0816e944 in ExecResult (node=0x87981a8) at nodeResult.c:157
#33 0x0815aef9 in ExecProcNode (node=0x87981a8) at execProcnode.c:334
#34 0x08158f26 in ExecutePlan (estate=0x8798018, planstate=0x87981a8,
    operation=CMD_SELECT, numberTuples=0, direction=ForwardScanDirection,
    dest=0x84524d8) at execMain.c:1082
#35 0x081580e7 in ExecutorRun (queryDesc=0x8a35040,
    direction=ForwardScanDirection, count=0) at execMain.c:241
#36 0x08201509 in PortalRunSelect (portal=0x8497018, forward=1 '\001',
    count=0, dest=0x84524d8) at pquery.c:831
#37 0x08201247 in PortalRun (portal=0x8497018, count=2147483647,
    dest=0x84524d8, altdest=0x84524d8, completionTag=0x9fbfe900 "")
    at pquery.c:656
#38 0x081fc69d in exec_simple_query (
    query_string=0x8451018 "SELECT fn_clean_tables()")
    at postgres.c:939
#39 0x081ffe18 in PostgresMain (argc=4, argv=0x83b3448,
    username=0x83b3428 "pgsql") at postgres.c:3424
#40 0x081d1ace in BackendRun (port=0x83af000) at postmaster.c:2931
#41 0x081d1098 in BackendStartup (port=0x83af000) at postmaster.c:2558
#42 0x081cef3b in ServerLoop () at postmaster.c:1211
#43 0x081ce916 in PostmasterMain (argc=3, argv=0x9fbfed40) at
postmaster.c:963
#44 0x0817fef0 in main (argc=3, argv=0x9fbfed40) at main.c:188

Re: BUG #3883: Autovacuum deadlock with truncate?

От
Alvaro Herrera
Дата:
Steven Flatt escribió:

> This isn't a postgres deadlock per se, but the end result is that two
> postgres backends are stuck, each waiting on a PGSemaphoreLock that the
> other presumably has.  The processes have been stuck for hours.

Can you reproduce this problem at will?

Are the processes still running (sleeping, rather) so that we can try to
examine them more deeply?

--
Alvaro Herrera                                http://www.CommandPrompt.com/
PostgreSQL Replication, Consulting, Custom Development, 24x7 support

Re: BUG #3883: Autovacuum deadlock with truncate?

От
Tom Lane
Дата:
"Steven Flatt" <steven.flatt@gmail.com> writes:
> This isn't a postgres deadlock per se, but the end result is that two
> postgres backends are stuck, each waiting on a PGSemaphoreLock that the
> other presumably has.  The processes have been stuck for hours.

No, that's not what the backtraces say.  The autovac process is trying
to get super-exclusive lock on a buffer (apparently in relation 16783
--- what is that?).  There's no evidence in the stack trace that the
TRUNCATE process has any conflicting buffer lock.

What I think might be happening is a three-way deadlock involving these
two and a third process that has the desired buffer lock.  Have you got
anything else that seems to be stuck?

            regards, tom lane

Re: BUG #3883: Autovacuum deadlock with truncate?

От
"Steven Flatt"
Дата:
On 1/17/08, Alvaro Herrera <alvherre@commandprompt.com> wrote:
> > This isn't a postgres deadlock per se, but the end result is that two
> > postgres backends are stuck, each waiting on a PGSemaphoreLock that the
> > other presumably has.  The processes have been stuck for hours.
>
> Can you reproduce this problem at will?

Well we have a bunch of test runs (at least a few dozen) that do a
similar thing, but only this one seems to be stuck.  Still, I imagine
I can reproduce by doing enough updates/deletes to interest the
autovacuumer immediately followed by executing the fn_clean_tables()
function -- however not sure of the precise timing to get into this
state.

> Are the processes still running (sleeping, rather) so that we can try to
> examine them more deeply?

Yes, the processes are still "sleeping" in the same state.  Just did
another bt a few minutes ago and they're the exact same.  Don't
imagine the processes are going to be moving much...

Steve

Re: BUG #3883: Autovacuum deadlock with truncate?

От
"Steven Flatt"
Дата:
On 1/17/08, Tom Lane <tgl@sss.pgh.pa.us> wrote:
> No, that's not what the backtraces say.  The autovac process is trying
> to get super-exclusive lock on a buffer (apparently in relation 16783
> --- what is that?).  There's no evidence in the stack trace that the
> TRUNCATE process has any conflicting buffer lock.

Relation 16783 is a regular table, nothing special about it, except
perhaps that it's inherited?  (It's a partition.)  It's got an integer
primary key column whose default value is the nextval of a sequence,
another integer column, two varchar columns, and five timestamptz
columns.  It's got three indexes and a check constraint.

Data would have been inserted into this table, then many rows updated
as part of a test.  Immediately following the test, the data would
have been truncated by the fn_clean_tables() function.  Apparently the
autovacuumer wanted to work on the table at the same time.

> What I think might be happening is a three-way deadlock involving these
> two and a third process that has the desired buffer lock.  Have you got
> anything else that seems to be stuck?

Don't see a third process that is stuck...

=# select * from pg_stat_activity where current_query not like '%IDLE%';
 datid | datname | procpid | usesysid | usename |                  current_query
                  | waiting |         query_start          |         backend_sta
rt         | client_addr | client_port
-------+---------+---------+----------+---------+-------------------------------
------------------+---------+------------------------------+--------------------
-----------+-------------+-------------
 16384 | datname |   35775 |       10 | pgsql   | VACUUM ANALYZE
public.foo | f       |                              | 2008-01-17
01:31:54.932049-05 |             |
 16384 | datname |    6869 |       10 | pgsql   | SELECT
fn_clean_tables()            | t       | 2008-01-17 01:31:51.68996-05
| 2008-01-16 22:34:40.914391-05 | 0.0.0.0 |       51451
(2 rows)

(I've obfuscated some of the names.)

Steve

Re: BUG #3883: Autovacuum deadlock with truncate?

От
Tom Lane
Дата:
"Steven Flatt" <steven.flatt@gmail.com> writes:
> On 1/17/08, Tom Lane <tgl@sss.pgh.pa.us> wrote:
>> What I think might be happening is a three-way deadlock involving these
>> two and a third process that has the desired buffer lock.  Have you got
>> anything else that seems to be stuck?

> Don't see a third process that is stuck...

Hmm.  The only other theory I can think of is that some process forgot
about a buffer pin it was holding, or the wakeup signal to release the
autovac process somehow got lost; either of which would be significant
bugs.

Do you still have the hung processes available?  It would be really
useful to take a look at the buffer header that the autovac process's
LockBufferForCleanup() is working on.  (In gdb, "f 3" then "p *bufHdr")

            regards, tom lane

Re: BUG #3883: Autovacuum deadlock with truncate?

От
"Steven Flatt"
Дата:
On 1/17/08, Tom Lane <tgl@sss.pgh.pa.us> wrote:
> Do you still have the hung processes available?  It would be really
> useful to take a look at the buffer header that the autovac process's
> LockBufferForCleanup() is working on.  (In gdb, "f 3" then "p *bufHdr")

Well I just lost the hung processes a few minutes ago thanks to a
cleanup process... but I do still have cores from yesterday:

(gdb) p *bufHdr
$1 = {tag = {rnode = {spcNode = 1663, dbNode = 16384, relNode = 85707},
    blockNum = 0}, flags = 70, usage_count = 5, refcount = 2,
  wait_backend_pid = 35775, buf_hdr_lock = 0 '\0', buf_id = 14407,
  freeNext = -2, io_in_progress_lock = 28933, content_lock = 28934}

Let me know what other info I can provide.

Steve

Re: BUG #3883: Autovacuum deadlock with truncate?

От
Tom Lane
Дата:
"Steven Flatt" <steven.flatt@gmail.com> writes:
> (gdb) p *bufHdr
> $1 = {tag = {rnode = {spcNode = 1663, dbNode = 16384, relNode = 85707},
>     blockNum = 0}, flags = 70, usage_count = 5, refcount = 2,
>   wait_backend_pid = 35775, buf_hdr_lock = 0 '\0', buf_id = 14407,
>   freeNext = -2, io_in_progress_lock = 28933, content_lock = 28934}

Hm, PIN_COUNT_WAITER flag is still set, and refcount = 2 saying there is
still someone else pinning the buffer, so nothing evidently wrong here.

Could you check PrivateRefCount[14407] in both cores?

            regards, tom lane

Re: BUG #3883: Autovacuum deadlock with truncate?

От
Tom Lane
Дата:
"Steven Flatt" <steven.flatt@gmail.com> writes:
> (gdb) f 3
> (gdb) p *bufHdr
> $1 = {tag = {rnode = {spcNode = 1663, dbNode = 16384, relNode = 16648},
>     blockNum = 3}, flags = 70, usage_count = 0, refcount = 2,
>   wait_backend_pid = 23965, buf_hdr_lock = 0 '\0', buf_id = 501,
>   freeNext = -2, io_in_progress_lock = 1121, content_lock = 1122}

> (gdb) p PrivateRefCount[501]
> $2 = 1

> pid 7908 (function to clean tables)

> (gdb) p PrivateRefCount[501]
> $1 = 2

Well, there's our problem: for some reason PID 7908 has this buffer
pinned, which is blocking the vacuum.  That seems pretty darn odd for
a process that is about to (try to) truncate the table.  The only way
I can imagine is that the function has an open cursor scanning the table
... which is a case that we probably ought to error out on, though I
think TRUNCATE is not checking for it now.  Or you've managed to tickle
some previously-unknown bug that leaks buffer pins.

Could we see that whole function?  Also, what has been done previously
in the transaction that's calling it?

            regards, tom lane

Re: BUG #3883: Autovacuum deadlock with truncate?

От
"Steven Flatt"
Дата:
On Jan 18, 2008 10:58 AM, Tom Lane <tgl@sss.pgh.pa.us> wrote:

> Hm, PIN_COUNT_WAITER flag is still set, and refcount = 2 saying there is
> still someone else pinning the buffer, so nothing evidently wrong here.
>
> Could you check PrivateRefCount[14407] in both cores?
>
Okay, got two new hung processes and two new cores.  Looks the same as last
time, but happens on a slightly different table (a different partition of
the same base table).  Processes have been hung for almost 24 hours.
Requested info follows:

-------------------------------------------------------------------------------

pid 23965 (autovacuum)

=# select pid,relation,mode,granted from pg_locks where pid = 23965;
  pid  | relation |           mode           | granted
-------+----------+--------------------------+---------
 23965 |          | ExclusiveLock            | t
 23965 |    16648 | ShareUpdateExclusiveLock | t
 23965 |    16655 | RowExclusiveLock         | t
 23965 |    16656 | RowExclusiveLock         | t
 23965 |    16653 | RowExclusiveLock         | t
(5 rows)

(gdb) bt
#0  0x6854a5b7 in semop () from /lib/libc.so.6
#1  0x081c7b4f in PGSemaphoreLock (sema=0x695bae58, interruptOK=1 '\001')
    at pg_sema.c:411
#2  0x081f50f1 in ProcWaitForSignal () at proc.c:1075
#3  0x081e78e3 in LockBufferForCleanup (buffer=502) at bufmgr.c:1926
#4  0x081541c2 in lazy_vacuum_heap (onerel=0x866a860, vacrelstats=0x8629170)
    at vacuumlazy.c:552
#5  0x08153fa0 in lazy_scan_heap (onerel=0x866a860, vacrelstats=0x8629170,
    Irel=0x8629158, nindexes=3) at vacuumlazy.c:482
#6  0x08153722 in lazy_vacuum_rel (onerel=0x866a860, vacstmt=0x867f720)
    at vacuumlazy.c:164
#7  0x0814f623 in vacuum_rel (relid=16648, vacstmt=0x867f720,
    expected_relkind=114 'r') at vacuum.c:1098
#8  0x0814eb10 in vacuum (vacstmt=0x867f720, relids=0x867f780) at vacuum.c
:397
#9  0x081c9d57 in autovacuum_do_vac_analyze (relid=16648, dovacuum=1 '\001',
    doanalyze=1 '\001', freeze_min_age=100000000) at autovacuum.c:912
#10 0x081c97e4 in do_autovacuum (dbentry=0x83cdc08) at autovacuum.c:659
#11 0x081c92c1 in AutoVacMain (argc=0, argv=0x0) at autovacuum.c:433
#12 0x081c8f3a in autovac_start () at autovacuum.c:178
#13 0x081cf01a in ServerLoop () at postmaster.c:1249
#14 0x081ce916 in PostmasterMain (argc=3, argv=0x9fbfed40) at postmaster.c
:963
#15 0x0817fef0 in main (argc=3, argv=0x9fbfed40) at main.c:188

(gdb) f 3
(gdb) p *bufHdr
$1 = {tag = {rnode = {spcNode = 1663, dbNode = 16384, relNode = 16648},
    blockNum = 3}, flags = 70, usage_count = 0, refcount = 2,
  wait_backend_pid = 23965, buf_hdr_lock = 0 '\0', buf_id = 501,
  freeNext = -2, io_in_progress_lock = 1121, content_lock = 1122}

(gdb) p PrivateRefCount[501]
$2 = 1

-------------------------------------------------------------------------------

pid 7908 (function to clean tables)

=# select pid,relation,mode,granted from pg_locks where pid = 7908 and not
granted;
 pid  | relation |        mode         | granted
------+----------+---------------------+---------
 7908 |    16648 | AccessExclusiveLock | f
(1 row)

(gdb) bt
#0  0x6854a5b7 in semop () from /lib/libc.so.6
#1  0x081c7b4f in PGSemaphoreLock (sema=0x695bc580, interruptOK=1 '\001')
    at pg_sema.c:411
#2  0x081f4e29 in ProcSleep (locallock=0xafbbf30, lockMethodTable=0x8332324)
    at proc.c:829
#3  0x081f2660 in WaitOnLock (locallock=0xafbbf30, owner=0xbc456e0)
    at lock.c:1140
#4  0x081f2120 in LockAcquire (locktag=0x9fbfdc70, lockmode=8,
    sessionLock=0 '\0', dontWait=0 '\0') at lock.c:792
#5  0x081f0eb4 in LockRelationOid (relid=16648, lockmode=8) at lmgr.c:81
#6  0x08091b6e in relation_open (relationId=16648, lockmode=8) at heapam.c
:694
#7  0x08091db2 in relation_openrv (relation=0x851e110, lockmode=8)
    at heapam.c:821
#8  0x08091ef4 in heap_openrv (relation=0x851e110, lockmode=8) at heapam.c
:891
#9  0x08135610 in ExecuteTruncate (stmt=0x851e160) at tablecmds.c:549
#10 0x08202da1 in ProcessUtility (parsetree=0x851e160, params=0x0,
    dest=0x835b4c0, completionTag=0x0) at utility.c:626
#11 0x08175868 in _SPI_execute_plan (plan=0x9fbfde80, Values=0x0, Nulls=0x0,
    snapshot=0x0, crosscheck_snapshot=0x0, read_only=0 '\0', tcount=0)
    at spi.c:1496
#12 0x0817374d in SPI_execute (
    src=0x84a3908 "TRUNCATE foo", read_only=0 '\0',
    tcount=0) at spi.c:316
#13 0x69f0fdf1 in exec_stmt_dynexecute (estate=0x9fbfe270, stmt=0x84981b0)
    at pl_exec.c:2561
#14 0x69f0dba2 in exec_stmt (estate=0x9fbfe270, stmt=0x84981b0)
    at pl_exec.c:1212
#15 0x69f0d96d in exec_stmts (estate=0x9fbfe270, stmts=0x8498038)
    at pl_exec.c:1123
#16 0x69f1050d in exec_stmt_dynfors (estate=0x9fbfe270, stmt=0x8497de0)
    at pl_exec.c:2793
#17 0x69f0dbbb in exec_stmt (estate=0x9fbfe270, stmt=0x8497de0)
    at pl_exec.c:1216
#18 0x69f0d96d in exec_stmts (estate=0x9fbfe270, stmts=0x8497fb0)
    at pl_exec.c:1123
#19 0x69f0e6e6 in exec_stmt_fors (estate=0x9fbfe270, stmt=0x84979e0)
    at pl_exec.c:1694
#20 0x69f0db0c in exec_stmt (estate=0x9fbfe270, stmt=0x84979e0)
    at pl_exec.c:1188
#21 0x69f0d96d in exec_stmts (estate=0x9fbfe270, stmts=0x8497ed0)
    at pl_exec.c:1123
#22 0x69f0de85 in exec_stmt_if (estate=0x9fbfe270, stmt=0x8498760)
    at pl_exec.c:1346
#23 0x69f0daa8 in exec_stmt (estate=0x9fbfe270, stmt=0x8498760)
    at pl_exec.c:1172
#24 0x69f0d96d in exec_stmts (estate=0x9fbfe270, stmts=0x8497f78)
    at pl_exec.c:1123
#25 0x69f0d7ec in exec_stmt_block (estate=0x9fbfe270, block=0x8499db0)
    at pl_exec.c:1068
#26 0x69f0c167 in plpgsql_exec_function (func=0x847c338, fcinfo=0x9fbfe3b0)
    at pl_exec.c:286
#27 0x69f082c3 in plpgsql_call_handler (fcinfo=0x9fbfe3b0) at
pl_handler.c:95
#28 0x0815d00a in ExecMakeFunctionResult (fcache=0x84312b8,
    econtext=0x8431230, isNull=0x8431998 "", isDone=0x84319a8)
    at execQual.c:1269
#29 0x0815d860 in ExecEvalFunc (fcache=0x84312b8, econtext=0x8431230,
    isNull=0x8431998 "", isDone=0x84319a8) at execQual.c:1671
#30 0x08161ad2 in ExecTargetList (targetlist=0x8431710, econtext=0x8431230,
    values=0x8431988, isnull=0x8431998 "", itemIsDone=0x84319a8,
    isDone=0x9fbfe694) at execQual.c:4119
#31 0x08161ec4 in ExecProject (projInfo=0x84318d0, isDone=0x9fbfe694)
    at execQual.c:4320
#32 0x0816e944 in ExecResult (node=0x84311a8) at nodeResult.c:157
#33 0x0815aef9 in ExecProcNode (node=0x84311a8) at execProcnode.c:334
#34 0x08158f26 in ExecutePlan (estate=0x8431018, planstate=0x84311a8,
    operation=CMD_SELECT, numberTuples=0, direction=ForwardScanDirection,
    dest=0x84129c0) at execMain.c:1082
#35 0x081580e7 in ExecutorRun (queryDesc=0x842cc40,
    direction=ForwardScanDirection, count=0) at execMain.c:241
#36 0x08201509 in PortalRunSelect (portal=0x8457018, forward=1 '\001',
    count=0, dest=0x84129c0) at pquery.c:831
#37 0x08201247 in PortalRun (portal=0x8457018, count=2147483647,
    dest=0x84129c0, altdest=0x84129c0, completionTag=0x9fbfe900 "")
    at pquery.c:656
#38 0x081fc69d in exec_simple_query (
    query_string=0x8412018 "SELECT fn_clean_tables()")
    at postgres.c:939
#39 0x081ffe18 in PostgresMain (argc=4, argv=0x83b3448,
    username=0x83b3428 "pgsql") at postgres.c:3424
#40 0x081d1ace in BackendRun (port=0x83af000) at postmaster.c:2931
#41 0x081d1098 in BackendStartup (port=0x83af000) at postmaster.c:2558
#42 0x081cef3b in ServerLoop () at postmaster.c:1211
#43 0x081ce916 in PostmasterMain (argc=3, argv=0x9fbfed40) at postmaster.c
:963
#44 0x0817fef0 in main (argc=3, argv=0x9fbfed40) at main.c:188

(gdb) p PrivateRefCount[501]
$1 = 2
Steve

Re: BUG #3883: Autovacuum deadlock with truncate?

От
"Steven Flatt"
Дата:
On Jan 21, 2008 1:24 PM, Tom Lane <tgl@sss.pgh.pa.us> wrote:

> Well, there's our problem: for some reason PID 7908 has this buffer
> pinned, which is blocking the vacuum.  That seems pretty darn odd for
> a process that is about to (try to) truncate the table.  The only way
> I can imagine is that the function has an open cursor scanning the table
> ... which is a case that we probably ought to error out on, though I
> think TRUNCATE is not checking for it now.  Or you've managed to tickle
> some previously-unknown bug that leaks buffer pins.
>
> Could we see that whole function?  Also, what has been done previously
> in the transaction that's calling it?
>

Here's the function.  I've removed some of our schema-specific details but
the important stuff is there.  The part about "checking if a table has any
rows" before truncating it was recently added to speed up our tests -- and
this no doubt introduced the problem.  Before we were unconditionally
truncating a few thousand tables, most of which were empty, which took close
to 5 minutes.  By checking first and only truncating tables with data,
execution time was reduced to <10 seconds.  If possible, I'd still like a
way to check the tables but avoid the "hanging" problem.

By the way, nothing else is done in this transaction before calling the
function.

CREATE OR REPLACE FUNCTION fn_clean_tables()
    RETURNS VOID
    AS '
        DECLARE
            p_table RECORD;
            r RECORD;

            -- A list of the dimension tables which are cleaned.
            tables varchar[] := ''{
                -- table names removed
                }'';

            -- A list of the sequences on dimension tables which
            -- need to be reset.
            sequences varchar[] := ''{
                -- sequence names removed
                }'';

            -- A list of the sequences on partitioned tables which need to
            -- be reset.
            p_sequences varchar[] := ''{
                -- sequence names removed
                }'';

            i int;
        BEGIN
            -- Clean all partitioned tables, those which match a regex.
            FOR p_table IN SELECT tablename FROM pg_tables
                           WHERE tablename ~ ''regex''
            LOOP
                -- Check if table has any rows.
                FOR r IN EXECUTE
                        ''SELECT 1 FROM '' || p_table.tablename ||
                        '' LIMIT 1''
                LOOP
                    EXECUTE ''TRUNCATE '' || p_table.tablename;
                END LOOP;
            END LOOP;

            -- Reset all sequences of cleaned tables to start at 1.
            FOR i IN array_lower(p_sequences, 1) ..
                     array_upper(p_sequences, 1)
            LOOP
                EXECUTE ''SELECT setval('''''' || p_sequences[i] ||
                        '''''', 1, false)'';
            END LOOP;

            -- Clean all dimension tables.
            -- The order in which the tables are cleaned is important.
            -- Note that we cannot simply truncate the tables due to the
            -- foreign key relationships between tables.
            FOR i IN array_lower(tables, 1) .. array_upper(tables, 1)
            LOOP
                -- Check if table has any rows.
                FOR r IN EXECUTE
                        ''SELECT 1 FROM '' || tables[i] ||
                        '' LIMIT 1''
                LOOP
                    EXECUTE ''DELETE FROM '' || tables[i];
                END LOOP;
            END LOOP;

            -- Reset all sequences of cleaned tables to start at 1.
            FOR i IN array_lower(sequences, 1) ..
                     array_upper(sequences, 1)
            LOOP
                EXECUTE ''SELECT setval('''''' || sequences[i] ||
                        '''''', 1, false)'';
            END LOOP;
        END;'
    LANGUAGE PLpgSQL
    VOLATILE;
COMMENT ON FUNCTION fn_clean_tables()
    IS 'Cleans tables and resets sequences';
Steve

Re: BUG #3883: Autovacuum deadlock with truncate?

От
Tom Lane
Дата:
"Steven Flatt" <steven.flatt@gmail.com> writes:
> On Jan 21, 2008 1:24 PM, Tom Lane <tgl@sss.pgh.pa.us> wrote:
>> Well, there's our problem: for some reason PID 7908 has this buffer
>> pinned, which is blocking the vacuum.  That seems pretty darn odd for
>> a process that is about to (try to) truncate the table.

> Here's the function.  I've removed some of our schema-specific details but
> the important stuff is there.  The part about "checking if a table has any
> rows" before truncating it was recently added to speed up our tests -- and
> this no doubt introduced the problem.

Yeah, you've got this:

>                 -- Check if table has any rows.
>                 FOR r IN EXECUTE
>                         ''SELECT 1 FROM '' || p_table.tablename ||
>                         '' LIMIT 1''
>                 LOOP
>                     EXECUTE ''TRUNCATE '' || p_table.tablename;
>                 END LOOP;

which means that at the instant you try to do the TRUNCATE, the
FOR-loop's cursor still has the table open.  We really should error
out on that (in fact, I'm a bit surprised it doesn't crash).  So
instead you should do something like

        EXECUTE 'SELECT 1 FROM ' || p_table.tablename || ' LIMIT 1'
            INTO x;
        IF x IS NOT NULL THEN
            EXECUTE 'TRUNCATE ' || p_table.tablename;
        END IF;

I think we need to make some fixes here, but the fixes would mainly
consist of complaining about the first approach ;-).  The second one
is a much safer way to do it.

            regards, tom lane

Re: BUG #3883: Autovacuum deadlock with truncate?

От
"Steven Flatt"
Дата:
On Jan 21, 2008 3:33 PM, Tom Lane <tgl@sss.pgh.pa.us> wrote:

> I think we need to make some fixes here, but the fixes would mainly
> consist of complaining about the first approach ;-).  The second one
> is a much safer way to do it.
>

Second approach looks good.  Thanks for all your help!

Steve

Re: BUG #3883: Autovacuum deadlock with truncate?

От
Bruce Momjian
Дата:
Added to TODO:

* Detect deadlocks involving LockBufferForCleanup()

  http://archives.postgresql.org/pgsql-hackers/2008-01/msg00873.php


---------------------------------------------------------------------------

Steven Flatt wrote:
> On Jan 18, 2008 10:58 AM, Tom Lane <tgl@sss.pgh.pa.us> wrote:
>
> > Hm, PIN_COUNT_WAITER flag is still set, and refcount = 2 saying there is
> > still someone else pinning the buffer, so nothing evidently wrong here.
> >
> > Could you check PrivateRefCount[14407] in both cores?
> >
> Okay, got two new hung processes and two new cores.  Looks the same as last
> time, but happens on a slightly different table (a different partition of
> the same base table).  Processes have been hung for almost 24 hours.
> Requested info follows:
>
> -------------------------------------------------------------------------------
>
> pid 23965 (autovacuum)
>
> =# select pid,relation,mode,granted from pg_locks where pid = 23965;
>   pid  | relation |           mode           | granted
> -------+----------+--------------------------+---------
>  23965 |          | ExclusiveLock            | t
>  23965 |    16648 | ShareUpdateExclusiveLock | t
>  23965 |    16655 | RowExclusiveLock         | t
>  23965 |    16656 | RowExclusiveLock         | t
>  23965 |    16653 | RowExclusiveLock         | t
> (5 rows)
>
> (gdb) bt
> #0  0x6854a5b7 in semop () from /lib/libc.so.6
> #1  0x081c7b4f in PGSemaphoreLock (sema=0x695bae58, interruptOK=1 '\001')
>     at pg_sema.c:411
> #2  0x081f50f1 in ProcWaitForSignal () at proc.c:1075
> #3  0x081e78e3 in LockBufferForCleanup (buffer=502) at bufmgr.c:1926
> #4  0x081541c2 in lazy_vacuum_heap (onerel=0x866a860, vacrelstats=0x8629170)
>     at vacuumlazy.c:552
> #5  0x08153fa0 in lazy_scan_heap (onerel=0x866a860, vacrelstats=0x8629170,
>     Irel=0x8629158, nindexes=3) at vacuumlazy.c:482
> #6  0x08153722 in lazy_vacuum_rel (onerel=0x866a860, vacstmt=0x867f720)
>     at vacuumlazy.c:164
> #7  0x0814f623 in vacuum_rel (relid=16648, vacstmt=0x867f720,
>     expected_relkind=114 'r') at vacuum.c:1098
> #8  0x0814eb10 in vacuum (vacstmt=0x867f720, relids=0x867f780) at vacuum.c
> :397
> #9  0x081c9d57 in autovacuum_do_vac_analyze (relid=16648, dovacuum=1 '\001',
>     doanalyze=1 '\001', freeze_min_age=100000000) at autovacuum.c:912
> #10 0x081c97e4 in do_autovacuum (dbentry=0x83cdc08) at autovacuum.c:659
> #11 0x081c92c1 in AutoVacMain (argc=0, argv=0x0) at autovacuum.c:433
> #12 0x081c8f3a in autovac_start () at autovacuum.c:178
> #13 0x081cf01a in ServerLoop () at postmaster.c:1249
> #14 0x081ce916 in PostmasterMain (argc=3, argv=0x9fbfed40) at postmaster.c
> :963
> #15 0x0817fef0 in main (argc=3, argv=0x9fbfed40) at main.c:188
>
> (gdb) f 3
> (gdb) p *bufHdr
> $1 = {tag = {rnode = {spcNode = 1663, dbNode = 16384, relNode = 16648},
>     blockNum = 3}, flags = 70, usage_count = 0, refcount = 2,
>   wait_backend_pid = 23965, buf_hdr_lock = 0 '\0', buf_id = 501,
>   freeNext = -2, io_in_progress_lock = 1121, content_lock = 1122}
>
> (gdb) p PrivateRefCount[501]
> $2 = 1
>
> -------------------------------------------------------------------------------
>
> pid 7908 (function to clean tables)
>
> =# select pid,relation,mode,granted from pg_locks where pid = 7908 and not
> granted;
>  pid  | relation |        mode         | granted
> ------+----------+---------------------+---------
>  7908 |    16648 | AccessExclusiveLock | f
> (1 row)
>
> (gdb) bt
> #0  0x6854a5b7 in semop () from /lib/libc.so.6
> #1  0x081c7b4f in PGSemaphoreLock (sema=0x695bc580, interruptOK=1 '\001')
>     at pg_sema.c:411
> #2  0x081f4e29 in ProcSleep (locallock=0xafbbf30, lockMethodTable=0x8332324)
>     at proc.c:829
> #3  0x081f2660 in WaitOnLock (locallock=0xafbbf30, owner=0xbc456e0)
>     at lock.c:1140
> #4  0x081f2120 in LockAcquire (locktag=0x9fbfdc70, lockmode=8,
>     sessionLock=0 '\0', dontWait=0 '\0') at lock.c:792
> #5  0x081f0eb4 in LockRelationOid (relid=16648, lockmode=8) at lmgr.c:81
> #6  0x08091b6e in relation_open (relationId=16648, lockmode=8) at heapam.c
> :694
> #7  0x08091db2 in relation_openrv (relation=0x851e110, lockmode=8)
>     at heapam.c:821
> #8  0x08091ef4 in heap_openrv (relation=0x851e110, lockmode=8) at heapam.c
> :891
> #9  0x08135610 in ExecuteTruncate (stmt=0x851e160) at tablecmds.c:549
> #10 0x08202da1 in ProcessUtility (parsetree=0x851e160, params=0x0,
>     dest=0x835b4c0, completionTag=0x0) at utility.c:626
> #11 0x08175868 in _SPI_execute_plan (plan=0x9fbfde80, Values=0x0, Nulls=0x0,
>     snapshot=0x0, crosscheck_snapshot=0x0, read_only=0 '\0', tcount=0)
>     at spi.c:1496
> #12 0x0817374d in SPI_execute (
>     src=0x84a3908 "TRUNCATE foo", read_only=0 '\0',
>     tcount=0) at spi.c:316
> #13 0x69f0fdf1 in exec_stmt_dynexecute (estate=0x9fbfe270, stmt=0x84981b0)
>     at pl_exec.c:2561
> #14 0x69f0dba2 in exec_stmt (estate=0x9fbfe270, stmt=0x84981b0)
>     at pl_exec.c:1212
> #15 0x69f0d96d in exec_stmts (estate=0x9fbfe270, stmts=0x8498038)
>     at pl_exec.c:1123
> #16 0x69f1050d in exec_stmt_dynfors (estate=0x9fbfe270, stmt=0x8497de0)
>     at pl_exec.c:2793
> #17 0x69f0dbbb in exec_stmt (estate=0x9fbfe270, stmt=0x8497de0)
>     at pl_exec.c:1216
> #18 0x69f0d96d in exec_stmts (estate=0x9fbfe270, stmts=0x8497fb0)
>     at pl_exec.c:1123
> #19 0x69f0e6e6 in exec_stmt_fors (estate=0x9fbfe270, stmt=0x84979e0)
>     at pl_exec.c:1694
> #20 0x69f0db0c in exec_stmt (estate=0x9fbfe270, stmt=0x84979e0)
>     at pl_exec.c:1188
> #21 0x69f0d96d in exec_stmts (estate=0x9fbfe270, stmts=0x8497ed0)
>     at pl_exec.c:1123
> #22 0x69f0de85 in exec_stmt_if (estate=0x9fbfe270, stmt=0x8498760)
>     at pl_exec.c:1346
> #23 0x69f0daa8 in exec_stmt (estate=0x9fbfe270, stmt=0x8498760)
>     at pl_exec.c:1172
> #24 0x69f0d96d in exec_stmts (estate=0x9fbfe270, stmts=0x8497f78)
>     at pl_exec.c:1123
> #25 0x69f0d7ec in exec_stmt_block (estate=0x9fbfe270, block=0x8499db0)
>     at pl_exec.c:1068
> #26 0x69f0c167 in plpgsql_exec_function (func=0x847c338, fcinfo=0x9fbfe3b0)
>     at pl_exec.c:286
> #27 0x69f082c3 in plpgsql_call_handler (fcinfo=0x9fbfe3b0) at
> pl_handler.c:95
> #28 0x0815d00a in ExecMakeFunctionResult (fcache=0x84312b8,
>     econtext=0x8431230, isNull=0x8431998 "", isDone=0x84319a8)
>     at execQual.c:1269
> #29 0x0815d860 in ExecEvalFunc (fcache=0x84312b8, econtext=0x8431230,
>     isNull=0x8431998 "", isDone=0x84319a8) at execQual.c:1671
> #30 0x08161ad2 in ExecTargetList (targetlist=0x8431710, econtext=0x8431230,
>     values=0x8431988, isnull=0x8431998 "", itemIsDone=0x84319a8,
>     isDone=0x9fbfe694) at execQual.c:4119
> #31 0x08161ec4 in ExecProject (projInfo=0x84318d0, isDone=0x9fbfe694)
>     at execQual.c:4320
> #32 0x0816e944 in ExecResult (node=0x84311a8) at nodeResult.c:157
> #33 0x0815aef9 in ExecProcNode (node=0x84311a8) at execProcnode.c:334
> #34 0x08158f26 in ExecutePlan (estate=0x8431018, planstate=0x84311a8,
>     operation=CMD_SELECT, numberTuples=0, direction=ForwardScanDirection,
>     dest=0x84129c0) at execMain.c:1082
> #35 0x081580e7 in ExecutorRun (queryDesc=0x842cc40,
>     direction=ForwardScanDirection, count=0) at execMain.c:241
> #36 0x08201509 in PortalRunSelect (portal=0x8457018, forward=1 '\001',
>     count=0, dest=0x84129c0) at pquery.c:831
> #37 0x08201247 in PortalRun (portal=0x8457018, count=2147483647,
>     dest=0x84129c0, altdest=0x84129c0, completionTag=0x9fbfe900 "")
>     at pquery.c:656
> #38 0x081fc69d in exec_simple_query (
>     query_string=0x8412018 "SELECT fn_clean_tables()")
>     at postgres.c:939
> #39 0x081ffe18 in PostgresMain (argc=4, argv=0x83b3448,
>     username=0x83b3428 "pgsql") at postgres.c:3424
> #40 0x081d1ace in BackendRun (port=0x83af000) at postmaster.c:2931
> #41 0x081d1098 in BackendStartup (port=0x83af000) at postmaster.c:2558
> #42 0x081cef3b in ServerLoop () at postmaster.c:1211
> #43 0x081ce916 in PostmasterMain (argc=3, argv=0x9fbfed40) at postmaster.c
> :963
> #44 0x0817fef0 in main (argc=3, argv=0x9fbfed40) at main.c:188
>
> (gdb) p PrivateRefCount[501]
> $1 = 2
> Steve

--
  Bruce Momjian  <bruce@momjian.us>        http://momjian.us
  EnterpriseDB                             http://postgres.enterprisedb.com

  + If your life is a hard drive, Christ can be your backup. +