Fix quadratic performance of regexp match/split functions

Поиск
Список
Период
Сортировка
От Andrew Gierth
Тема Fix quadratic performance of regexp match/split functions
Дата
Msg-id 87pnyn55qh.fsf@news-spur.riddles.org.uk
обсуждение исходный текст
Ответы Re: Fix quadratic performance of regexp match/split functions
Список pgsql-hackers
While poking at that xml issue I tried to compare the memory usage of
the xml queries with vaguely comparable regexp_split_to_table queries,
and found that I could not do so; the regexp queries simply did not
complete in any sensible timeframe.

Investigation showed that in UTF8 (or other multibyte encodings), the
performance of regexp_matches and regexp_split_to_* is subject to
quadratic slowdowns thanks to the use of character offsets and calls to
text_substr, which must scan the string from the start each time to
count characters.

This patch changes that by keeping the wide-char copy of the string
available rather than freeing it, and converting the required substrings
back to the db encoding via a pre-allocated conversion buffer.

This gives noticable speedups on even very small strings (all timings
below are with -O2 --disable-cassert):

select count(*)
  from (select 'aaaaa,aaaaa,aaaaa'::text as content
          from generate_series(1,1000000) i offset 0) s,
       regexp_split_to_table(content, ',') r;

-- ~10% faster with patch: 2.8 s -> 2.5 s

but on strings of even modest size (~1kb) the improvement is vast:

select count(*)
  from (select repeat(repeat('a',10) || ',', 100+(i*0))::text
                 as content -- 1100 bytes, 101 matches
          from generate_series(1,100000) i offset 0) s,
       regexp_split_to_table(content, ',') r;

-- over 8 times faster: 51.8 sec -> 6.3 sec

and it only gets bigger:

select count(*)
  from regexp_split_to_table(repeat('aaa,',10000), ',') r;  -- 40KB

-- 270 times faster: 1628ms -> 6ms

This patch passes regression but I haven't yet tested it on complex
multibyte data (or non-UTF8 encodings but that shouldn't matter).

Comments?

-- 
Andrew (irc:RhodiumToad)

diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c
index 5025a449fb..8839fb4ce2 100644
--- a/src/backend/utils/adt/regexp.c
+++ b/src/backend/utils/adt/regexp.c
@@ -61,6 +61,9 @@ typedef struct regexp_matches_ctx
     /* workspace for build_regexp_match_result() */
     Datum       *elems;            /* has npatterns elements */
     bool       *nulls;            /* has npatterns elements */
+    pg_wchar   *wide_str;        /* wide-char version of original string */
+    char       *conv_buf;        /* conversion buffer */
+    size_t        conv_bufsiz;    /* size thereof */
 } regexp_matches_ctx;
 
 /*
@@ -111,7 +114,8 @@ static regexp_matches_ctx *setup_regexp_matches(text *orig_str, text *pattern,
                      pg_re_flags *flags,
                      Oid collation,
                      bool use_subpatterns,
-                     bool ignore_degenerate);
+                     bool ignore_degenerate,
+                     bool fetching_unmatched);
 static void cleanup_regexp_matches(regexp_matches_ctx *matchctx);
 static ArrayType *build_regexp_match_result(regexp_matches_ctx *matchctx);
 static Datum build_regexp_split_result(regexp_matches_ctx *splitctx);
@@ -863,7 +867,7 @@ regexp_match(PG_FUNCTION_ARGS)
                  errhint("Use the regexp_matches function instead.")));
 
     matchctx = setup_regexp_matches(orig_str, pattern, &re_flags,
-                                    PG_GET_COLLATION(), true, false);
+                                    PG_GET_COLLATION(), true, false, false);
 
     if (matchctx->nmatches == 0)
         PG_RETURN_NULL();
@@ -911,7 +915,7 @@ regexp_matches(PG_FUNCTION_ARGS)
         matchctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
                                         &re_flags,
                                         PG_GET_COLLATION(),
-                                        true, false);
+                                        true, false, false);
 
         /* Pre-create workspace that build_regexp_match_result needs */
         matchctx->elems = (Datum *) palloc(sizeof(Datum) * matchctx->npatterns);
@@ -954,7 +958,7 @@ regexp_matches_no_flags(PG_FUNCTION_ARGS)
  * all the matching in one swoop.  The returned regexp_matches_ctx contains
  * the locations of all the substrings matching the pattern.
  *
- * The two bool parameters have only two patterns (one for matching, one for
+ * The three bool parameters have only two patterns (one for matching, one for
  * splitting) but it seems clearer to distinguish the functionality this way
  * than to key it all off one "is_split" flag.
  */
@@ -962,9 +966,11 @@ static regexp_matches_ctx *
 setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
                      Oid collation,
                      bool use_subpatterns,
-                     bool ignore_degenerate)
+                     bool ignore_degenerate,
+                     bool fetching_unmatched)
 {
     regexp_matches_ctx *matchctx = palloc0(sizeof(regexp_matches_ctx));
+    int            eml = pg_database_encoding_max_length();
     int            orig_len;
     pg_wchar   *wide_str;
     int            wide_len;
@@ -975,6 +981,7 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
     int            array_idx;
     int            prev_match_end;
     int            start_search;
+    int            maxlen = 0;        /* largest fetch length in characters */
 
     /* save original string --- we'll extract result substrings from it */
     matchctx->orig_str = orig_str;
@@ -1024,7 +1031,7 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
              pmatch[0].rm_eo > prev_match_end))
         {
             /* enlarge output space if needed */
-            while (array_idx + matchctx->npatterns * 2 > array_len)
+            while (array_idx + matchctx->npatterns * 2 + 1 > array_len)
             {
                 array_len *= 2;
                 matchctx->match_locs = (int *) repalloc(matchctx->match_locs,
@@ -1038,16 +1045,29 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
 
                 for (i = 1; i <= matchctx->npatterns; i++)
                 {
-                    matchctx->match_locs[array_idx++] = pmatch[i].rm_so;
-                    matchctx->match_locs[array_idx++] = pmatch[i].rm_eo;
+                    int        so = pmatch[i].rm_so;
+                    int        eo = pmatch[i].rm_eo;
+                    matchctx->match_locs[array_idx++] = so;
+                    matchctx->match_locs[array_idx++] = eo;
+                    if (so >= 0 && eo >= 0 && (eo - so) > maxlen)
+                        maxlen = (eo - so);
                 }
             }
             else
             {
-                matchctx->match_locs[array_idx++] = pmatch[0].rm_so;
-                matchctx->match_locs[array_idx++] = pmatch[0].rm_eo;
+                int        so = pmatch[0].rm_so;
+                int        eo = pmatch[0].rm_eo;
+                matchctx->match_locs[array_idx++] = so;
+                matchctx->match_locs[array_idx++] = eo;
+                if (so >= 0 && eo >= 0 && (eo - so) > maxlen)
+                    maxlen = (eo - so);
             }
             matchctx->nmatches++;
+
+            if (fetching_unmatched &&
+                pmatch[0].rm_so >= 0 &&
+                (pmatch[0].rm_so - prev_match_end) > maxlen)
+                maxlen = (pmatch[0].rm_so - prev_match_end);
         }
         prev_match_end = pmatch[0].rm_eo;
 
@@ -1068,8 +1088,45 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
             break;
     }
 
+    if (fetching_unmatched &&
+        (wide_len - prev_match_end) > maxlen)
+        maxlen = (wide_len - prev_match_end);
+
+    /*
+     * Keep a note of the end position of the string for the benefit of
+     * splitting code.
+     */
+    matchctx->match_locs[array_idx] = wide_len;
+
+    if (eml > 1)
+    {
+        int64        maxsiz = (int64) maxlen * eml;
+        size_t        conv_bufsiz;
+
+        /*
+         * worst case: assume we need the maximum size (maxlen*eml), but take
+         * advantage of the fact that the original string length in bytes is an
+         * upper bound on the byte length of any fetched substring (and we know
+         * that len+1 is safe to allocate because the varlena header is longer
+         * than 1 byte).
+         */
+        if (maxsiz > orig_len)
+            conv_bufsiz = (size_t) orig_len + 1;
+        else
+            conv_bufsiz = (size_t) maxsiz + 1;    /* safe since maxsiz < 2^30 */
+        matchctx->conv_buf = palloc(conv_bufsiz);
+        matchctx->conv_bufsiz = conv_bufsiz;
+        matchctx->wide_str = wide_str;
+    }
+    else
+    {
+        pfree(wide_str);
+        matchctx->wide_str = NULL;
+        matchctx->conv_buf = NULL;
+        matchctx->conv_bufsiz = 0;
+    }
+
     /* Clean up temp storage */
-    pfree(wide_str);
     pfree(pmatch);
 
     return matchctx;
@@ -1082,6 +1139,10 @@ static void
 cleanup_regexp_matches(regexp_matches_ctx *matchctx)
 {
     pfree(matchctx->orig_str);
+    if (matchctx->wide_str)
+        pfree(matchctx->wide_str);
+    if (matchctx->conv_buf)
+        pfree(matchctx->conv_buf);
     pfree(matchctx->match_locs);
     if (matchctx->elems)
         pfree(matchctx->elems);
@@ -1096,6 +1157,7 @@ cleanup_regexp_matches(regexp_matches_ctx *matchctx)
 static ArrayType *
 build_regexp_match_result(regexp_matches_ctx *matchctx)
 {
+    int            eml = pg_database_encoding_max_length();
     Datum       *elems = matchctx->elems;
     bool       *nulls = matchctx->nulls;
     int            dims[1];
@@ -1115,6 +1177,17 @@ build_regexp_match_result(regexp_matches_ctx *matchctx)
             elems[i] = (Datum) 0;
             nulls[i] = true;
         }
+        else if (eml > 1)
+        {
+            size_t     bufsiz PG_USED_FOR_ASSERTS_ONLY = matchctx->conv_bufsiz;
+            char   *buf = matchctx->conv_buf;
+            int        len = pg_wchar2mb_with_len(matchctx->wide_str + so,
+                                               buf,
+                                               eo - so);
+            Assert(len < bufsiz);
+            elems[i] = PointerGetDatum(cstring_to_text_with_len(buf, len));
+            nulls[i] = false;
+        }
         else
         {
             elems[i] = DirectFunctionCall3(text_substr,
@@ -1168,7 +1241,7 @@ regexp_split_to_table(PG_FUNCTION_ARGS)
         splitctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
                                         &re_flags,
                                         PG_GET_COLLATION(),
-                                        false, true);
+                                        false, true, true);
 
         MemoryContextSwitchTo(oldcontext);
         funcctx->user_fctx = (void *) splitctx;
@@ -1224,7 +1297,7 @@ regexp_split_to_array(PG_FUNCTION_ARGS)
                                     PG_GETARG_TEXT_PP(1),
                                     &re_flags,
                                     PG_GET_COLLATION(),
-                                    false, true);
+                                    false, true, true);
 
     while (splitctx->next_match <= splitctx->nmatches)
     {
@@ -1261,6 +1334,7 @@ regexp_split_to_array_no_flags(PG_FUNCTION_ARGS)
 static Datum
 build_regexp_split_result(regexp_matches_ctx *splitctx)
 {
+    int            eml = pg_database_encoding_max_length();
     int            startpos;
     int            endpos;
 
@@ -1271,7 +1345,22 @@ build_regexp_split_result(regexp_matches_ctx *splitctx)
     if (startpos < 0)
         elog(ERROR, "invalid match ending position");
 
-    if (splitctx->next_match < splitctx->nmatches)
+    if (eml > 1)
+    {
+        size_t    bufsiz PG_USED_FOR_ASSERTS_ONLY = splitctx->conv_bufsiz;
+        char   *buf = splitctx->conv_buf;
+        int        len;
+
+        endpos = splitctx->match_locs[splitctx->next_match * 2];
+        if (endpos < startpos)
+            elog(ERROR, "invalid match starting position");
+        len = pg_wchar2mb_with_len(splitctx->wide_str + startpos,
+                                   buf,
+                                   endpos-startpos);
+        Assert(len < bufsiz);
+        return PointerGetDatum(cstring_to_text_with_len(buf, len));
+    }
+    else if (splitctx->next_match < splitctx->nmatches)
     {
         endpos = splitctx->match_locs[splitctx->next_match * 2];
         if (endpos < startpos)

В списке pgsql-hackers по дате отправления:

Предыдущее
От: Robert Haas
Дата:
Сообщение: Re: Allowing printf("%m") only where it actually works
Следующее
От: Amit Kapila
Дата:
Сообщение: Re: Explain buffers wrong counter with parallel plans