Discussion: Panic during xlog building with big values

Panic during xlog building with big values

From:
"Maksim.Melnikov"
Date:
Hello,
during testing we found the following issue when updating records with big values.

2025-07-07 14:40:30.434 MSK [125435] PANIC:  oversized WAL record
2025-07-07 14:40:30.434 MSK [125435] DETAIL:  WAL record would be
1073742015 bytes (of maximum 1069547520 bytes); rmid 10 flags 64.

tested commit: 62a17a92833d1eaa60d8ea372663290942a1e8eb

Test description:

set wal_level = logical in postgresql.conf

CREATE DATABASE regression_big_values WITH TEMPLATE = template0 ENCODING
= 'UTF8';
\c regression_big_values
CREATE TABLE big_text_test (i int, c1 text, c2 text);
-- Mark columns as toastable, but don't try to compress
ALTER TABLE big_text_test ALTER c1 SET STORAGE EXTERNAL;
ALTER TABLE big_text_test ALTER c2 SET STORAGE EXTERNAL;
ALTER TABLE big_text_test REPLICA IDENTITY FULL;
INSERT INTO big_text_test (i, c1, c2) VALUES (1, repeat('a',
1073741737), NULL);
UPDATE big_text_test SET c2 = repeat('b', 1073741717);


(gdb) bt
#0  __pthread_kill_implementation (no_tid=0, signo=6,
threadid=<optimized out>) at ./nptl/pthread_kill.c:44
#1  __pthread_kill_internal (signo=6, threadid=<optimized out>) at
./nptl/pthread_kill.c:78
#2  __GI___pthread_kill (threadid=<optimized out>, signo=signo@entry=6)
at ./nptl/pthread_kill.c:89
#3  0x000073665c64527e in __GI_raise (sig=sig@entry=6) at
../sysdeps/posix/raise.c:26
#4  0x000073665c6288ff in __GI_abort () at ./stdlib/abort.c:79
#5  0x0000632bc0ff9b78 in errfinish (filename=0x632bc10b7778
"xloginsert.c", lineno=916, funcname=0x632bc10b7d50 <__func__.2>
"XLogRecordAssemble") at elog.c:600
#6  0x0000632bc08e41ca in XLogRecordAssemble (rmid=10 '\n', info=64 '@',
RedoRecPtr=2252407976, doPageWrites=true, fpw_lsn=0x7ffd23d44bb0,
num_fpi=0x7ffd23d44ba4,
     topxid_included=0x7ffd23d44ba3) at xloginsert.c:916
#7  0x0000632bc08e3851 in XLogInsert (rmid=10 '\n', info=64 '@') at
xloginsert.c:520
#8  0x0000632bc083f052 in log_heap_update (reln=0x73665d191c38,
oldbuf=6989, newbuf=6989, oldtup=0x7ffd23d44da0, newtup=0x632bfa9d89c0,
old_key_tuple=0x7364d09fe048,
     all_visible_cleared=false, new_all_visible_cleared=false) at
heapam.c:9042
#9  0x0000632bc08372ff in heap_update (relation=0x73665d191c38,
otid=0x7ffd23d45082, newtup=0x736610a13048, cid=0, crosscheck=0x0,
wait=true, tmfd=0x7ffd23d45120,
     lockmode=0x7ffd23d45034, update_indexes=0x7ffd23d45030) at
heapam.c:4132
#10 0x0000632bc0840bd4 in heapam_tuple_update (relation=0x73665d191c38,
otid=0x7ffd23d45082, slot=0x632bfa9d7fb8, cid=0,
snapshot=0x632bfa979400, crosscheck=0x0, wait=true,
     tmfd=0x7ffd23d45120, lockmode=0x7ffd23d45034,
update_indexes=0x7ffd23d45030) at heapam_handler.c:330
#11 0x0000632bc0b33f21 in table_tuple_update (rel=0x73665d191c38,
otid=0x7ffd23d45082, slot=0x632bfa9d7fb8, cid=0,
snapshot=0x632bfa979400, crosscheck=0x0, wait=true,
     tmfd=0x7ffd23d45120, lockmode=0x7ffd23d45034,
update_indexes=0x7ffd23d45030) at ../../../src/include/access/tableam.h:1500
#12 0x0000632bc0b37a46 in ExecUpdateAct (context=0x7ffd23d45100,
resultRelInfo=0x632bfa9d50e8, tupleid=0x7ffd23d45082, oldtuple=0x0,
slot=0x632bfa9d7fb8, canSetTag=true,
     updateCxt=0x7ffd23d4502c) at nodeModifyTable.c:2301
#13 0x0000632bc0b37fdc in ExecUpdate (context=0x7ffd23d45100,
resultRelInfo=0x632bfa9d50e8, tupleid=0x7ffd23d45082, oldtuple=0x0,
oldSlot=0x632bfa9d7ea8, slot=0x632bfa9d7fb8,
     canSetTag=true) at nodeModifyTable.c:2525
#14 0x0000632bc0b3b9bc in ExecModifyTable (pstate=0x632bfa9d4ed8) at
nodeModifyTable.c:4507
#15 0x0000632bc0af5585 in ExecProcNodeFirst (node=0x632bfa9d4ed8) at
execProcnode.c:469
#16 0x0000632bc0ae7c82 in ExecProcNode (node=0x632bfa9d4ed8) at
../../../src/include/executor/executor.h:313
#17 0x0000632bc0aeab37 in ExecutePlan (queryDesc=0x632bfa8b79d0,
operation=CMD_UPDATE, sendTuples=false, numberTuples=0,
direction=ForwardScanDirection, dest=0x632bfa940888)
     at execMain.c:1679
#18 0x0000632bc0ae8345 in standard_ExecutorRun
(queryDesc=0x632bfa8b79d0, direction=ForwardScanDirection, count=0) at
execMain.c:367
#19 0x0000632bc0ae81a3 in ExecutorRun (queryDesc=0x632bfa8b79d0,
direction=ForwardScanDirection, count=0) at execMain.c:304
#20 0x0000632bc0deac67 in ProcessQuery (plan=0x632bfa93f750,
sourceText=0x632bfa8e31a0 "UPDATE big_text_test SET c2 = repeat('b',
1073741717) || 'бвг';", params=0x0, queryEnv=0x0,
     dest=0x632bfa940888, qc=0x7ffd23d45550) at pquery.c:161
#21 0x0000632bc0dec79a in PortalRunMulti (portal=0x632bfa964e30,
isTopLevel=true, setHoldSnapshot=false, dest=0x632bfa940888,
altdest=0x632bfa940888, qc=0x7ffd23d45550)
     at pquery.c:1272
#22 0x0000632bc0debca6 in PortalRun (portal=0x632bfa964e30,
count=9223372036854775807, isTopLevel=true, dest=0x632bfa940888,
altdest=0x632bfa940888, qc=0x7ffd23d45550) at pquery.c:788
#23 0x0000632bc0de432a in exec_simple_query (query_string=0x632bfa8e31a0
"UPDATE big_text_test SET c2 = repeat('b', 1073741717) || 'бвг';") at
postgres.c:1273
#24 0x0000632bc0de9b1b in PostgresMain (dbname=0x632bfa91e510
"regression_big_values", username=0x632bfa91e4f8 "maxim") at postgres.c:4766
#25 0x0000632bc0ddf84e in BackendMain (startup_data=0x7ffd23d45800,
startup_data_len=24) at backend_startup.c:124
#26 0x0000632bc0cded62 in postmaster_child_launch (child_type=B_BACKEND,
child_slot=2, startup_data=0x7ffd23d45800, startup_data_len=24,
client_sock=0x7ffd23d45860)
     at launch_backend.c:290
#27 0x0000632bc0ce5854 in BackendStartup (client_sock=0x7ffd23d45860) at
postmaster.c:3580
#28 0x0000632bc0ce2d23 in ServerLoop () at postmaster.c:1702
#29 0x0000632bc0ce2612 in PostmasterMain (argc=1, argv=0x632bfa89b9c0)
at postmaster.c:1400
#30 0x0000632bc0b7eeab in main (argc=1, argv=0x632bfa89b9c0) at main.c:227

The reason is "if (total_len > XLogRecordMaxSize)" check in
XLogRecordAssemble() function in xloginsert.c file. So we oversized
xlog record max size and log error in critical section. I found thread
where this problem was partially discussed:
https://www.postgresql.org/message-id/flat/CAEze2WgGiw%2BLZt%2BvHf8tWqB_6VxeLsMeoAuod0N%3Dij1q17n5pw%40mail.gmail.com.
Some ideas from that thread:
"I think the big issue with the patch as it stands is that it will typically
cause PANICs on failure, because the record-too-large ERROR will be in a
critical section. That's still better than generating a record that can't be
replayed, but it's not good."

In my opinion we can avoid the PANIC in the critical section, so it is
better to check the xlog record size before entering it, and I have some
ideas on how to do that. I've attached a patch that checks for an
oversized xlog record before the critical section. On the one hand it
doesn't complicate the codebase, and on the other hand it solves the
problem described above.
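
Roughly, the idea is the sketch below (a minimal, hypothetical
illustration; the helper name and message are only for this email and are
not taken from the attached patch):

/*
 * Hypothetical pre-check: estimate the record size and raise a plain
 * ERROR before START_CRIT_SECTION(), instead of hitting the PANIC path
 * inside XLogRecordAssemble().  XLogRecordMaxSize comes from xlogrecord.h.
 */
static void
xlog_size_precheck(uint64 approx_len)
{
    if (approx_len > XLogRecordMaxSize)
        ereport(ERROR,
                (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                 errmsg("WAL record would be oversized (%llu bytes)",
                        (unsigned long long) approx_len)));
}

The interesting part is choosing a reasonable estimate for approx_len
before the record is actually assembled, which is what the attached patch
tries to do.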

Best regards,
Maksim Melnikov

Attachments

Re: Panic during xlog building with big values

From:
"Maksim.Melnikov"
Date:
Hi, I forgot to run pgindent on the patch, so I'm attaching a new version
with the indentation fixed.

Best regards,

Maksim Melnikov
Attachments

Re: Panic during xlog building with big values

From:
Andy Pogrebnoi
Date:
Hi,

> On Jul 7, 2025, at 17:05, Maksim.Melnikov <m.melnikov@postgrespro.ru> wrote:
>
> Hello,
> during testing we found the following issue when updating records with big values.
>
> 2025-07-07 14:40:30.434 MSK [125435] PANIC:  oversized WAL record
> 2025-07-07 14:40:30.434 MSK [125435] DETAIL:  WAL record would be 1073742015 bytes (of maximum 1069547520 bytes); rmid 10 flags 64.
>
> tested commit: 62a17a92833d1eaa60d8ea372663290942a1e8eb
>
> Test description:
>
> set wal_level = logical in postgresql.conf
>
> CREATE DATABASE regression_big_values WITH TEMPLATE = template0 ENCODING = 'UTF8';
> \c regression_big_values
> CREATE TABLE big_text_test (i int, c1 text, c2 text);
> -- Mark columns as toastable, but don't try to compress
> ALTER TABLE big_text_test ALTER c1 SET STORAGE EXTERNAL;
> ALTER TABLE big_text_test ALTER c2 SET STORAGE EXTERNAL;
> ALTER TABLE big_text_test REPLICA IDENTITY FULL;
> INSERT INTO big_text_test (i, c1, c2) VALUES (1, repeat('a', 1073741737), NULL);
> UPDATE big_text_test SET c2 = repeat('b', 1073741717);
>

I tried the patch and it fixes the test case. Now it produces an ERROR instead of PANIC.

I’m wondering, though, if there are other places that can produce huge
records besides ExtractReplicaIdentity? Andres Freund has also suggested
changes in RecordTransactionCommit(), for example [1].


> @@ -9043,6 +9044,33 @@ log_heap_update(Relation reln, Buffer oldbuf,
> return recptr;
> }
>
> +/*
> + * Pre-check potential XLogRecord oversize. XLogRecord will be created
> + * later, and it size will be checked, but it will occur in critical
> + * section and in case of failure core dump will be generated.
> + * It seems not good, so to avoid this, we can calculate approximate
> + * xlog record size here and check it.
> + *
> + * Size prediction is based on xlog update and xlog delete logic and can
> + * be revised in case of it changing, now buf size is limited by
> + * UINT16_MAX(Assert(regbuf->rdata_len <= UINT16_MAX) in xloginsert).
> + *
> + * Anyway to accommodate some overhead, 1M is substract from predicted
> + * value. It seems now it is quite enough.
> + */

I also suggest tidying up grammar and syntax a bit in the comment above. My variant would be:

/*
* Pre-check potential XLogRecord oversize. XLogRecord will be created
* later, and its size will be checked. However, this operation will
* occur within a critical section, and in the event of failure, a core
* dump will be generated.
* It does not seem good, so to avoid this, we can calculate the approximate
* xlog record size here and check it.
*
* Size prediction is based on xlog update and xlog delete logic, and can
* be revised if it changes. For now, the buf size is limited by
* UINT16_MAX (Assert(regbuf->rdata_len <= UINT16_MAX) in xloginsert).
*
* Anyway, to accommodate some overhead, 1M is subtracted from the predicted
* value. It seems like that's enough for now.
*/


Cheers,
Andy

[1] https://www.postgresql.org/message-id/20221202165717.wtdd5ijoqawrdt75%40awork3.anarazel.de





Re: Panic during xlog building with big values

From:
"Maksim.Melnikov"
Date:
Hi, Andy, thanks for your review!

I've checked RecordTransactionCommit too, but I don't think it can raise a
similar error. The problem described above occurred because we used
external column storage without compression, together with REPLICA
IDENTITY FULL.
To be honest, it's a degenerate case that can only occur on tuple
update/delete, when we need the full old row to identify the
updated/deleted value; more info can be found in the docs [1].

I've fixed the comments per your remarks, thanks. The patch is attached.

Also rebased the patch on commit d3ba50db48e66be8804b9edf093b0f921d625425.

[1] 
https://www.postgresql.org/docs/current/logical-replication-publication.html

Best regards,
Maksim Melnikov

Attachments

Re: Panic during xlog building with big values

From:
Michael Paquier
Date:
On Tue, Oct 14, 2025 at 10:08:12AM +0300, Maksim.Melnikov wrote:
> I've checked RecordTransactionCommit too, but I don't think it can raise a
> similar error. The problem described above occurred because we used external
> column storage without compression, together with REPLICA IDENTITY FULL.
> To be honest, it's a degenerate case that can only occur on tuple
> update/delete, when we need the full old row to identify the updated/deleted
> value; more info can be found in the docs [1].

"Degenerate" sounds like a pretty good term to define your test case.
So the issue is that the uncompressed TOAST blobs get so large that
the mainrdata_len computed with a single call of XLogRegisterData()
triggers the size restriction.  The protections added in XLogInsert()
are doing their job here: the record generated by the UPDATE cannot be
replayed, failing on an allocation failure in the standby if one lifts
the size restriction in XLogInsert().  What's pretty "good" about your
case is that the first INSERT is large, but still small enough that the
palloc() for the initial insertion does not fail, so it succeeds.  Only
the second UPDATE becomes large enough; still, you are able to bypass
the allocation limits with a combination of the old and new tuple data
that needs to be replicated because of the full replica identity.  Fun
case, I'd say.

> I've fixed comments with yours remarks, thanks. Patch is attached.

I see what you are doing in your patch.  ExtractReplicaIdentity() has
only two callers: heap_update() and heap_delete().  Both document that
this stuff happens before entering a critical section to avoid a PANIC
on allocation, but that does not account for the overhead required by a
WAL record, because we don't know yet how large the record will be
(well, most of it is going to be the old tuple key anyway), as we may
have pages, some of them compressed, or with holes.  Then your patch
adds an extra check depending on the size of the "old" key generated.

+static void
+log_heap_precheck(Relation reln, HeapTuple tp)
+{
+#define XLogRecordMaxOverhead ((uint32) (1024 * 1024))
+
+    if (tp && RelationIsLogicallyLogged(reln))
+    {
+        uint32        data_len = tp->t_len - SizeofHeapTupleHeader;
+
+        XLogPreCheckSize(data_len + XLogRecordMaxOverhead);
+    }
+}

This adds a size prediction of XLogRecordMaxOverhead on top of the
existing XLogRecordMaxSize, which is itself an estimation with a 4MB
allocation overhead allowed, so you are adding a second estimation
layer on top of the existing one based on how much the XLogReader
needs when processing a record.  This is not optimal, and we cannot
have a precise number until we have computed all the elements that
build a WAL record.

Some numbers I've grabbed on the way, while looking at your case, for
reference:
- size of allocation at replay: 1073750016
- number of repeat values in the UPDATE: 1073741717
- size registered in XLogRegisterData(): 1073741746
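
For reference, putting those figures next to the limits involved (a
back-of-the-envelope cross-check, assuming the stock definitions of
XLogRecordMaxSize in xlogrecord.h and MaxAllocSize in memutils.h):

  XLogRecordMaxSize = 1020 * 1024 * 1024 = 1069547520 bytes
  MaxAllocSize      = 0x3fffffff         = 1073741823 bytes
  registered data   = 1073741746 bytes  (> XLogRecordMaxSize, so the
                                          insert-side check fires)
  replay allocation = 1073750016 bytes  (> MaxAllocSize, so replay would
                                          fail even without that check)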

A different way to think about the problem would be to rework the way
we flatten the tuple when an old tuple is extracted in full.  For
example, if some attributes are external but not compressed, we could
also take the route to force some compression in the key extracted to
make it shorter and able to fit in a record all the time.  External
but uncompressed data is not a very common case, so this may not
justify the extra implementation cost and complications in the tuple
flattening routines.

Perhaps the best answer is just to do nothing here.
--
Michael

Attachments