Re: Another regexp performance improvement: skip useless paren-captures
| От | Tom Lane |
|---|---|
| Тема | Re: Another regexp performance improvement: skip useless paren-captures |
| Дата | |
| Msg-id | 3643732.1628551901@sss.pgh.pa.us обсуждение исходный текст |
| Ответ на | Re: Another regexp performance improvement: skip useless paren-captures (Mark Dilger <mark.dilger@enterprisedb.com>) |
| Ответы |
Re: Another regexp performance improvement: skip useless paren-captures
Re: Another regexp performance improvement: skip useless paren-captures |
| Список | pgsql-hackers |
Mark Dilger <mark.dilger@enterprisedb.com> writes:
> +select regexp_split_to_array('', '(?:((?:q+))){0}(\1){0,0}?*[^]');
> +server closed the connection unexpectedly
Here's a quick draft patch for this. Basically it moves the
responsibility for clearing v->subs[] pointers into the freesubre()
recursion, so that it will happen for contained capturing parens
not only the top level.
There is a potentially interesting definitional question:
what exactly ought this regexp do?
((.)){0}\2
Because the capturing paren sets are zero-quantified, they will
never be matched to any characters, so the backref can never
have any defined referent. I suspect that study of the POSIX
spec would lead to the conclusion that this is a legal regexp
but it will never match anything. Implementing that would be
tedious though, and what's more it seems very unlikely that
the user wanted any such behavior. So I think throwing an
error is an appropriate response. The existing code will
throw such an error for
((.)){0}\1
so I guess Spencer did think about this to some extent -- he
just forgot about the possibility of nested parens.
This patch should work OK in HEAD and v14, but it will need
a bit of fooling-about for older branches I think, given that
they fill v->subs[] a little differently.
regards, tom lane
diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c
index ae3a7b6a38..ae1ff670f9 100644
--- a/src/backend/regex/regcomp.c
+++ b/src/backend/regex/regcomp.c
@@ -550,8 +550,6 @@ freev(struct vars *v,
{
if (v->re != NULL)
rfree(v->re);
- if (v->subs != v->sub10)
- FREE(v->subs);
if (v->nfa != NULL)
freenfa(v->nfa);
if (v->tree != NULL)
@@ -562,6 +560,8 @@ freev(struct vars *v,
freecvec(v->cv);
if (v->cv2 != NULL)
freecvec(v->cv2);
+ if (v->subs != v->sub10)
+ FREE(v->subs);
if (v->lacons != NULL)
freelacons(v->lacons, v->nlacons);
ERR(err); /* nop if err==0 */
@@ -1091,8 +1091,8 @@ parseqatom(struct vars *v,
{
if (atom != NULL)
freesubre(v, atom);
- if (atomtype == '(')
- v->subs[subno] = NULL;
+ /* freesubre() will clean up any v->subs[] pointers into the atom */
+ assert(atomtype != '(' || v->subs[subno] == NULL);
delsub(v->nfa, lp, rp);
EMPTYARC(lp, rp);
return top;
@@ -2127,9 +2127,24 @@ freesrnode(struct vars *v, /* might be NULL */
if (sr == NULL)
return;
+ /*
+ * If the node is referenced in v->subs[], clear that to avoid leaving a
+ * dangling pointer. This should only ever matter when parseqatom() is
+ * throwing away a {0,0}-quantified atom that contains capturing parens.
+ * Doing this will cause later backrefs to such parens to fail, which is
+ * maybe not strictly POSIX but seems more helpful than allowing a backref
+ * that can never have a defined referent.
+ */
+ if (sr->capno > 0 && v != NULL)
+ {
+ assert(v->subs[sr->capno] == sr);
+ v->subs[sr->capno] = NULL;
+ }
+
if (!NULLCNFA(sr->cnfa))
freecnfa(&sr->cnfa);
sr->flags = 0; /* in particular, not INUSE */
+ sr->capno = 0;
sr->child = sr->sibling = NULL;
sr->begin = sr->end = NULL;
diff --git a/src/test/modules/test_regex/expected/test_regex.out b/src/test/modules/test_regex/expected/test_regex.out
index 5a6cdf47c2..850e0c8ef5 100644
--- a/src/test/modules/test_regex/expected/test_regex.out
+++ b/src/test/modules/test_regex/expected/test_regex.out
@@ -3506,6 +3506,10 @@ select * from test_regex('((.))(\2)', 'xyy', 'oRP');
{yy,NULL,NULL,NULL}
(2 rows)
+-- # throwing an error for this is arguably not per POSIX
+-- expectError 21.39 - {((.)){0}(\2){0}} ESUBREG
+select * from test_regex('((.)){0}(\2){0}', '', '-');
+ERROR: invalid regular expression: invalid backreference number
-- doing 22 "multicharacter collating elements"
-- # again ugh
-- MCCEs are not implemented in Postgres, so we skip all these tests
diff --git a/src/test/modules/test_regex/sql/test_regex.sql b/src/test/modules/test_regex/sql/test_regex.sql
index 3419564203..378f22b67b 100644
--- a/src/test/modules/test_regex/sql/test_regex.sql
+++ b/src/test/modules/test_regex/sql/test_regex.sql
@@ -1020,6 +1020,9 @@ select * from test_regex('((.))(\2){0}', 'xy', 'RPQ');
select * from test_regex('((.))(\2)', 'xyy', 'RP');
-- expectMatch 21.38 oRP ((.))(\2) xyy yy {} {} {}
select * from test_regex('((.))(\2)', 'xyy', 'oRP');
+-- # throwing an error for this is arguably not per POSIX
+-- expectError 21.39 - {((.)){0}(\2){0}} ESUBREG
+select * from test_regex('((.)){0}(\2){0}', '', '-');
-- doing 22 "multicharacter collating elements"
-- # again ugh
В списке pgsql-hackers по дате отправления: