Segmentation fault when calling BlessTupleDesc in a C function in parallel on PostgreSQL-(12.6, 12.7, 13.2, 13.3)

Поиск
Список
Период
Сортировка
От Eric Thinnes
Тема Segmentation fault when calling BlessTupleDesc in a C function in parallel on PostgreSQL-(12.6, 12.7, 13.2, 13.3)
Дата
Msg-id 99aaa2eb-e194-bf07-c29a-1a76b4f2bcf9@gmx.de
обсуждение исходный текст
Ответы Re: Segmentation fault when calling BlessTupleDesc in a C function in parallel on PostgreSQL-(12.6, 12.7, 13.2, 13.3)  (Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com>)
Список pgsql-bugs
Hello Postgres team,

i get a segmentation fault when calling BlessTupleDesc in a C function
in parallel.

System Configuration
---------------------
   Architecture        : Intel Pentium
   Operating System    : Ubuntu versions: 16.04.7 LTS, 18.04.5 LTS,
20.04.2 LTS
   PostgreSQL version  : PostgreSQL-(12.6, 12.7, 13.2, 13.3)
   Compiler used       : gcc (Ubuntu 9.3.0-17ubuntu1~20.04) 9.3.0

Please enter a FULL description of your problem:
------------------------------------------------

In the version with the configure option --enable-cassert, the process
is aborted
before the segmentation fault occurs.

It is possible that the error does not occur the first time and that
the call has to be repeated several times.
That probably indicates a timing problem with concurrency.


postgres:5433:et@test=# SET max_parallel_workers_per_gather TO 4;
SET
Zeit: 0,558 ms
postgres:5433:et@test=#* SELECT sum(sign*Betrag) FROM kpos_test,
setof_kpos(kpos);
INFO:  pid: 9132, outTypeId: 40962
INFO:  outTypRelId: 40960, typlen: -1, typbyval: 0, typalign: d
INFO:  pid: 9131, outTypeId: 40962
INFO:  outTypRelId: 40960, typlen: -1, typbyval: 0, typalign: d
INFO:  pid: 9130, outTypeId: 40962
INFO:  outTypRelId: 40960, typlen: -1, typbyval: 0, typalign: d
INFO:  pid: 9134, outTypeId: 40962
INFO:  outTypRelId: 40960, typlen: -1, typbyval: 0, typalign: d
INFO:  pid: 9133, outTypeId: 40962
INFO:  outTypRelId: 40960, typlen: -1, typbyval: 0, typalign: d
WARNING:  terminating connection because of crash of another server process
DETAIL:  The postmaster has commanded this server process to roll back
the current transaction and exit, because another server process exited
abnormally and possibly corrupted shared memory.
TIP:  In a moment you should be able to reconnect to the database and
repeat your command.
SSL-SYSCALL-Fehler: Dateiende entdeckt
Die Verbindung zum Server wurde verloren.  Versuche Reset: Fehlgeschlagen.
Zeit: 421,098 ms
::@!>? \q

If I call the c function once with an empty argument before the
parallel scan, the error does not occur.


SET max_parallel_workers_per_gather TO 4;
SET
Zeit: 0,390 ms
postgres:5433:et@test=#* SELECT setof_kpos('');
INFO:  pid: 12396, outTypeId: 40962
INFO:  outTypRelId: 40960, typlen: -1, typbyval: 0, typalign: d
  setof_kpos
------------
(0 Zeilen)

Zeit: 1,253 ms
postgres:5433:et@test=#* SELECT sum(sign*Betrag) FROM kpos_test,
setof_kpos(kpos);
INFO:  pid: 12400, outTypeId: 40962
INFO:  outTypRelId: 40960, typlen: -1, typbyval: 0, typalign: d
INFO:  pid: 12398, outTypeId: 40962
INFO:  outTypRelId: 40960, typlen: -1, typbyval: 0, typalign: d
INFO:  pid: 12399, outTypeId: 40962
INFO:  outTypRelId: 40960, typlen: -1, typbyval: 0, typalign: d
INFO:  pid: 12397, outTypeId: 40962
INFO:  outTypRelId: 40960, typlen: -1, typbyval: 0, typalign: d
  sum
-----
    0
(1 Zeile)

I don't know how to fix this.

In the appendix I have added the complete sources for the test and a
core dump including postgres logs and configs.


The complete test SQL.

DROP TYPE t_kpos CASCADE;
CREATE TYPE t_kpos AS (
     Pos         smallint
   , Count       smallint
   , Konto       integer
   , Sign        smallint
   , BTyp        smallint
   , Betrag      bigint
);

DROP FUNCTION IF EXISTS setof_kpos;
CREATE OR REPLACE FUNCTION setof_kpos(kontopositionen text) RETURNS
SETOF t_kpos
   AS '/var/lib/postgresql/lib/13/test', 'set_of_kpos'
   LANGUAGE C IMMUTABLE STRICT
   PARALLEL SAFE
;
COMMIT;
SELECT * FROM
setof_kpos('11.40,11052,-1,328;9.58,4736,1,226;1.82,3806,1,314');

DROP TABLE kpos_test;
CREATE TABLE kpos_test (id int, kpos text);
INSERT INTO kpos_test SELECT generate_series(1,10000000) AS id,
'11.40,11052,-1,328;9.58,4736,1,226;1.82,3806,1,314' AS kpos;
COMMIT;

-- psql -h postgres.vmware -p 5433 test
SET max_parallel_workers_per_gather TO 4;
SELECT sum(sign*Betrag) FROM kpos_test, setof_kpos(kpos);


Short Version of C-function (complete version in attechment)

....
int create_result_from_kpos( Datum result[], const t_TypInfo &tinfo,
const char txt[] )
{
   t_KPos  kpos   [MAX_KPOS];
   short   allBtyp[MAX_KPOS];
   bool    nulls  [N_ATTS]; memcpy( nulls, tinfo.nulls, sizeof(nulls) );
   Datum   values [N_ATTS];

   int       count     = split_kpos( kpos, allBtyp, MAX_KPOS, txt );
   TupleDesc tupleDesc = CreateTemplateTupleDesc( N_ATTS );

   for ( int n = 0; n < N_ATTS; n++ )
     TupleDescInitEntry( tupleDesc, n+1, tinfo.attNames[n],
tinfo.attOids[n], -1, 0 );
   tupleDesc = BlessTupleDesc( tupleDesc );

   for ( int i = 0; i < count; i++ ) {
//     elog(INFO, "%2i: %8i %5i %5i %10lli", i+1, kpos[i].Konto,
kpos[i].Sign, kpos[i].BTyp, kpos[i].Betrag );

     values[0] = Int16GetDatum( (short)i+1      );
     values[1] = Int16GetDatum( (short)count    );
     values[2] = Int32GetDatum( kpos[i].Konto   );
     values[3] = Int16GetDatum( kpos[i].Sign    );
     values[4] = Int16GetDatum( kpos[i].BTyp    );
     values[5] = Int64GetDatum( kpos[i].Betrag  );

     result[i] = HeapTupleGetDatum( heap_form_tuple( tupleDesc, values,
nulls ) );
   }

   return count;
}

Datum set_of_kpos(PG_FUNCTION_ARGS)
{
   typedef struct {
     int     nextelem, numelems;
     Datum   result[MAX_KPOS];
   } t_user_ctx;

   FuncCallContext *funcctx;
   t_user_ctx      *userctx;

   // stuff done only on the first call of the function
   if ( SRF_IS_FIRSTCALL() ) {
     // create a function context for cross-call persistence
     funcctx = SRF_FIRSTCALL_INIT();

     // switch to memory context appropriate for multiple function calls
     MemoryContext oldcontext  = MemoryContextSwitchTo(
funcctx->multi_call_memory_ctx );

     const t_TypInfo  &tinfo   = init_tinfo( fcinfo );
     const char       *txt     = TextDatumGetCString( PG_GETARG_TEXT_PP(
0 ) );

     // allocate memory for user context
     funcctx->user_fctx = userctx = (t_user_ctx*)
palloc(sizeof(t_user_ctx));

     userctx->nextelem = 0;
     userctx->numelems = create_result_from_kpos( userctx->result,
tinfo, txt );

     MemoryContextSwitchTo( oldcontext );
   }

   // stuff done on every call of the function
   funcctx  = SRF_PERCALL_SETUP();
   userctx  = (t_user_ctx*) funcctx->user_fctx;

   if ( userctx->nextelem < userctx->numelems ) {
     SRF_RETURN_NEXT( funcctx, userctx->result[ userctx->nextelem++ ] );
   } else {
     SRF_RETURN_DONE( funcctx );
   }
}



2021-05-14 10:38:18.462 CEST     9014 00000 LOG:  redo done at 0/D1F496C8
2021-05-14 10:38:18.473 CEST     2188 00000 LOG:  database system is
ready to accept connections
TRAP: FailedAssertion("result->tdrefcount == -1", File: "typcache.c",
Line: 2711)
postgres: 13/main13: parallel worker for PID 9080
(ExceptionalCondition+0x7c)[0x5566dd7ef4ec]
postgres: 13/main13: parallel worker for PID 9080
(+0x58d2b4)[0x5566dd7ed2b4]
postgres: 13/main13: parallel worker for PID 9080
(assign_record_type_typmod+0xa7)[0x5566dd7eeb77]
postgres: 13/main13: parallel worker for PID 9080
(BlessTupleDesc+0x2d)[0x5566dd52ca9d]
/var/lib/postgresql/lib/13/test.so(_Z23create_result_from_kposPmRK9s_TypInfoPKc+0xa7)[0x7f034364e7d7]
/var/lib/postgresql/lib/13/test.so(set_of_kpos+0x107)[0x7f034364e9a7]
postgres: 13/main13: parallel worker for PID 9080
(ExecMakeTableFunctionResult+0x37f)[0x5566dd5281ef]
postgres: 13/main13: parallel worker for PID 9080
(+0x2dbbec)[0x5566dd53bbec]
postgres: 13/main13: parallel worker for PID 9080
(ExecScan+0x371)[0x5566dd528ea1]
postgres: 13/main13: parallel worker for PID 9080
(+0x2f0cb0)[0x5566dd550cb0]
postgres: 13/main13: parallel worker for PID 9080
(+0x2d18c1)[0x5566dd5318c1]
postgres: 13/main13: parallel worker for PID 9080
(+0x2d4a88)[0x5566dd534a88]
postgres: 13/main13: parallel worker for PID 9080
(standard_ExecutorRun+0x113)[0x5566dd51dbb3]
/usr/local/pgsql/13/lib/pg_stat_statements.so(+0x4335)[0x7f035053c335]
postgres: 13/main13: parallel worker for PID 9080
(ParallelQueryMain+0x214)[0x5566dd522d54]
postgres: 13/main13: parallel worker for PID 9080
(ParallelWorkerMain+0x498)[0x5566dd3cfef8]
postgres: 13/main13: parallel worker for PID 9080
(StartBackgroundWorker+0x2e5)[0x5566dd619265]
postgres: 13/main13: parallel worker for PID 9080
(+0x3c7123)[0x5566dd627123]
postgres: 13/main13: parallel worker for PID 9080
(+0x3c7b92)[0x5566dd627b92]
/lib/x86_64-linux-gnu/libpthread.so.0(+0x153c0)[0x7f03540733c0]
/lib/x86_64-linux-gnu/libc.so.6(__select+0x1a)[0x7f03537e80da]
postgres: 13/main13: parallel worker for PID 9080
(+0x3c83ce)[0x5566dd6283ce]
postgres: 13/main13: parallel worker for PID 9080
(PostmasterMain+0xe54)[0x5566dd62a124]
postgres: 13/main13: parallel worker for PID 9080
(main+0x793)[0x5566dd322b03]
/lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf3)[0x7f03536f70b3]
postgres: 13/main13: parallel worker for PID 9080
(_start+0x2e)[0x5566dd322bae]
2021-05-14 10:38:47.819 CEST     2188 00000 LOG:  background worker
"parallel worker" (PID 9082) was terminated by signal 6: Aborted
2021-05-14 10:38:47.819 CEST     2188 00000 DETAIL:  Failed process was
running: SELECT sum(sign*Betrag) FROM kpos_test, setof_kpos(kpos);
2021-05-14 10:38:47.819 CEST     2188 00000 LOG:  terminating any other
active server processes
2021-05-14 10:38:47.819 CEST et test 192.168.150.1(33280) SELECT 9080
57P02 WARNING:  terminating connection because of crash of another
server process
2021-05-14 10:38:47.819 CEST et test 192.168.150.1(33280) SELECT 9080
57P02 DETAIL:  The postmaster has commanded this server process to roll
back the current transaction and exit, because another server process
exited abnormally and possibly corrupted shared memory.
2021-05-14 10:38:47.819 CEST et test 192.168.150.1(33280) SELECT 9080
57P02 HINT:  In a moment you should be able to reconnect to the database
and repeat your command.
2021-05-14 10:38:47.819 CEST     9019 57P02 WARNING:  terminating
connection because of crash of another server process
2021-05-14 10:38:47.819 CEST     9019 57P02 DETAIL:  The postmaster has
commanded this server process to roll back the current transaction and
exit, because another server process exited abnormally and possibly
corrupted shared memory.
2021-05-14 10:38:47.819 CEST     9019 57P02 HINT:  In a moment you
should be able to reconnect to the database and repeat your command.
2021-05-14 10:38:47.821 CEST     2188 00000 LOG:  archiver process (PID
9020) exited with exit code 2
2021-05-14 10:38:47.828 CEST et test 192.168.150.1(33282)  9086 57P03
FATAL:  the database system is in recovery mode
2021-05-14 10:38:47.830 CEST     2188 00000 LOG:  all server processes
terminated; reinitializing
2021-05-14 10:38:47.900 CEST     9088 00000 LOG:  database system was
interrupted; last known up at 2021-05-14 10:38:18 CEST
2021-05-14 10:38:47.900 CEST et test 192.168.150.1(33284)  9089 57P03
FATAL:  the database system is in recovery mode
2021-05-14 10:38:47.933 CEST     9088 00000 LOG:  database system was
not properly shut down; automatic recovery in progress
2021-05-14 10:38:47.935 CEST     9088 00000 LOG:  redo starts at 0/D1F49778
2021-05-14 10:38:47.935 CEST     9088 00000 LOG:  invalid record length
at 0/D1F497B0: wanted 24, got 0
2021-05-14 10:38:47.935 CEST     9088 00000 LOG:  redo done at 0/D1F49778
2021-05-14 10:38:47.946 CEST     2188 00000 LOG:  database system is
ready to accept connections
TRAP: FailedAssertion("result->tdrefcount == -1", File: "typcache.c",
Line: 2711)
postgres: 13/main13: parallel worker for PID 9130
(ExceptionalCondition+0x7c)[0x5566dd7ef4ec]
postgres: 13/main13: parallel worker for PID 9130
(+0x58d2b4)[0x5566dd7ed2b4]
postgres: 13/main13: parallel worker for PID 9130
(assign_record_type_typmod+0xa7)[0x5566dd7eeb77]
postgres: 13/main13: parallel worker for PID 9130
(BlessTupleDesc+0x2d)[0x5566dd52ca9d]
/var/lib/postgresql/lib/13/test.so(_Z23create_result_from_kposPmRK9s_TypInfoPKc+0xa7)[0x7f034364e7d7]
/var/lib/postgresql/lib/13/test.so(set_of_kpos+0x107)[0x7f034364e9a7]
postgres: 13/main13: parallel worker for PID 9130
(ExecMakeTableFunctionResult+0x37f)[0x5566dd5281ef]
postgres: 13/main13: parallel worker for PID 9130
(+0x2dbbec)[0x5566dd53bbec]
postgres: 13/main13: parallel worker for PID 9130
(ExecScan+0x371)[0x5566dd528ea1]
postgres: 13/main13: parallel worker for PID 9130
(+0x2f0cb0)[0x5566dd550cb0]
postgres: 13/main13: parallel worker for PID 9130
(+0x2d18c1)[0x5566dd5318c1]
postgres: 13/main13: parallel worker for PID 9130
(+0x2d4a88)[0x5566dd534a88]
postgres: 13/main13: parallel worker for PID 9130
(standard_ExecutorRun+0x113)[0x5566dd51dbb3]
/usr/local/pgsql/13/lib/pg_stat_statements.so(+0x4335)[0x7f035053c335]
postgres: 13/main13: parallel worker for PID 9130
(ParallelQueryMain+0x214)[0x5566dd522d54]
postgres: 13/main13: parallel worker for PID 9130
(ParallelWorkerMain+0x498)[0x5566dd3cfef8]
postgres: 13/main13: parallel worker for PID 9130
(StartBackgroundWorker+0x2e5)[0x5566dd619265]
postgres: 13/main13: parallel worker for PID 9130
(+0x3c7123)[0x5566dd627123]
postgres: 13/main13: parallel worker for PID 9130
(+0x3c7b92)[0x5566dd627b92]
/lib/x86_64-linux-gnu/libpthread.so.0(+0x153c0)[0x7f03540733c0]
/lib/x86_64-linux-gnu/libc.so.6(__select+0x1a)[0x7f03537e80da]
postgres: 13/main13: parallel worker for PID 9130
(+0x3c83ce)[0x5566dd6283ce]
postgres: 13/main13: parallel worker for PID 9130
(PostmasterMain+0xe54)[0x5566dd62a124]
postgres: 13/main13: parallel worker for PID 9130
(main+0x793)[0x5566dd322b03]
/lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf3)[0x7f03536f70b3]
postgres: 13/main13: parallel worker for PID 9130
(_start+0x2e)[0x5566dd322bae]
2021-05-14 10:39:04.103 CEST     2188 00000 LOG:  background worker
"parallel worker" (PID 9132) was terminated by signal 6: Aborted
2021-05-14 10:39:04.103 CEST     2188 00000 DETAIL:  Failed process was
running: SELECT sum(sign*Betrag) FROM kpos_test, setof_kpos(kpos);
2021-05-14 10:39:04.103 CEST     2188 00000 LOG:  terminating any other
active server processes

(gdb) bt
#0  __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:50
#1  0x00007f03536f5859 in __GI_abort () at abort.c:79
#2  0x00005566dd7ef50b in ExceptionalCondition
(conditionName=conditionName@entry=0x5566dd99826a "result->tdrefcount ==
-1",
     errorType=errorType@entry=0x5566dd84b01d "FailedAssertion",
fileName=fileName@entry=0x5566dd998237 "typcache.c",
lineNumber=lineNumber@entry=2711) at assert.c:67
#3  0x00005566dd7ed2b4 in find_or_make_matching_shared_tupledesc
(tupdesc=<optimized out>) at typcache.c:2711
#4  0x00005566dd7eeb77 in assign_record_type_typmod (tupDesc=<optimized
out>, tupDesc@entry=0x5566e02087b8) at typcache.c:1873
#5  0x00005566dd52ca9d in BlessTupleDesc (tupdesc=0x5566e02087b8) at
execTuples.c:2086
#6  BlessTupleDesc (tupdesc=tupdesc@entry=0x5566e02087b8) at
execTuples.c:2082
#7  0x00007f034364e7d7 in create_result_from_kpos
(result=result@entry=0x5566e0208728, tinfo=...,
     txt=txt@entry=0x5566df3ee200
"11.40,11052,-1,328;9.58,4736,1,226;1.82,3806,1,314") at test.c:125
#8  0x00007f034364e9a7 in set_of_kpos (fcinfo=<optimized out>) at test.c:168
#9  0x00005566dd5281ef in ExecMakeTableFunctionResult
(setexpr=0x5566df435368, econtext=0x5566df4346f0, argContext=<optimized
out>, expectedDesc=0x5566df457600,
     randomAccess=false) at execSRF.c:234
#10 0x00005566dd53bbec in FunctionNext (node=node@entry=0x5566df4345d8)
at nodeFunctionscan.c:94
#11 0x00005566dd528ea1 in ExecScanFetch (recheckMtd=0x5566dd53b8e0
<FunctionRecheck>, accessMtd=0x5566dd53b8f0 <FunctionNext>,
node=0x5566df4345d8) at execScan.c:133
#12 ExecScan (node=0x5566df4345d8, accessMtd=0x5566dd53b8f0
<FunctionNext>, recheckMtd=0x5566dd53b8e0 <FunctionRecheck>) at
execScan.c:182
#13 0x00005566dd550cb0 in ExecProcNode (node=0x5566df4345d8) at
../../../src/include/executor/executor.h:248
#14 ExecNestLoop (pstate=0x5566df434118) at nodeNestloop.c:160
#15 0x00005566dd5318c1 in ExecProcNode (node=0x5566df434118) at
../../../src/include/executor/executor.h:248
#16 fetch_input_tuple (aggstate=aggstate@entry=0x5566df433bd0) at
nodeAgg.c:589
#17 0x00005566dd534a88 in agg_retrieve_direct (aggstate=<optimized out>)
at nodeAgg.c:2368
#18 ExecAgg (pstate=<optimized out>) at nodeAgg.c:2183
#19 0x00005566dd51dbb3 in ExecProcNode (node=0x5566df433bd0) at
../../../src/include/executor/executor.h:248
#20 ExecutePlan (execute_once=<optimized out>, dest=0x5566df392598,
direction=<optimized out>, numberTuples=0, sendTuples=<optimized out>,
operation=CMD_SELECT,
     use_parallel_mode=<optimized out>, planstate=0x5566df433bd0,
estate=0x5566df4336f0) at execMain.c:1632
#21 standard_ExecutorRun (queryDesc=0x5566df394230, direction=<optimized
out>, count=0, execute_once=<optimized out>) at execMain.c:350
#22 0x00007f035053c335 in pgss_ExecutorRun (queryDesc=0x5566df394230,
direction=ForwardScanDirection, count=0, execute_once=<optimized out>)
     at pg_stat_statements.c:1045
#23 0x00005566dd522d54 in ParallelQueryMain
(seg=seg@entry=0x5566df305938, toc=toc@entry=0x7f03435ca000) at
execParallel.c:1448
#24 0x00005566dd3cfef8 in ParallelWorkerMain (main_arg=<optimized out>)
at parallel.c:1470
#25 0x00005566dd619265 in StartBackgroundWorker () at bgworker.c:879
#26 0x00005566dd627123 in do_start_bgworker (rw=<optimized out>) at
postmaster.c:5870
#27 maybe_start_bgworkers () at postmaster.c:6095
#28 0x00005566dd627b92 in sigusr1_handler
(postgres_signal_arg=<optimized out>) at postmaster.c:5255
#29 <signal handler called>
#30 0x00007f03537e80da in __GI___select (nfds=nfds@entry=8,
readfds=readfds@entry=0x7ffe861f5d30, writefds=writefds@entry=0x0,
exceptfds=exceptfds@entry=0x0,
     timeout=timeout@entry=0x7ffe861f5c90) at
../sysdeps/unix/sysv/linux/select.c:41
#31 0x00005566dd6283ce in ServerLoop () at postmaster.c:1703
#32 0x00005566dd62a124 in PostmasterMain (argc=3, argv=<optimized out>)
at postmaster.c:1412
#33 0x00005566dd322b03 in main (argc=3, argv=0x5566df303920) at main.c:210



Вложения

В списке pgsql-bugs по дате отправления:

Предыдущее
От: David Rowley
Дата:
Сообщение: Re: BUG #17009: create temporary table with like option using same name as persistent table does not create indexes
Следующее
От: Нагель Михаил
Дата:
Сообщение: Re: BUG #17009: create temporary table with like option using same name as persistent table does not create indexes