Re: Notes about fixing regexes and UTF-8 (yet again)

Поиск
Список
Период
Сортировка
От	Tom Lane
Тема	Re: Notes about fixing regexes and UTF-8 (yet again)
Дата	17 февраля 2012 г. 22:17:48
Msg-id	12169.1329531447@sss.pgh.pa.us обсуждение
Ответ на	Re: Notes about fixing regexes and UTF-8 (yet again) (Tom Lane <tgl@sss.pgh.pa.us>)
Ответы	Re: Notes about fixing regexes and UTF-8 (yet again)
Список	pgsql-hackers
Дерево обсуждения
I wrote:
> The answer, on a reasonably new desktop machine (2.0GHz Xeon E5503)
> running Fedora 16 in en_US.utf8 locale, is that 64K iterations of
> pg_wc_isalpha or sibling functions requires a shade under 2ms.
> So this definitely justifies caching the values to avoid computing
> them more than once per session, but I'm not convinced there are
> grounds for trying harder than that.

And here's a poorly-tested draft patch for that.

            regards, tom lane


diff --git a/src/backend/regex/regc_cvec.c b/src/backend/regex/regc_cvec.c
index fb6f06b5243f50bfad2cefa5c016d4e842791a3d..98f3c597678b492dd59afcd956e5cdfecdba4f86 100644
*** a/src/backend/regex/regc_cvec.c
--- b/src/backend/regex/regc_cvec.c
*************** static void
*** 77,82 ****
--- 77,83 ----
  addchr(struct cvec * cv,        /* character vector */
         chr c)                    /* character to add */
  {
+     assert(cv->nchrs < cv->chrspace);
      cv->chrs[cv->nchrs++] = (chr) c;
  }

diff --git a/src/backend/regex/regc_locale.c b/src/backend/regex/regc_locale.c
index 6cf27958b1545a61fba01e76dc4d37aca32789dc..44ce582bdad1a7d830d4122cada45a39c188981c 100644
*** a/src/backend/regex/regc_locale.c
--- b/src/backend/regex/regc_locale.c
*************** static const struct cname
*** 351,356 ****
--- 351,366 ----


  /*
+  * We do not use the hard-wired Unicode classification tables that Tcl does.
+  * This is because (a) we need to deal with other encodings besides Unicode,
+  * and (b) we want to track the behavior of the libc locale routines as
+  * closely as possible.  For example, it wouldn't be unreasonable for a
+  * locale to not consider every Unicode letter as a letter.  So we build
+  * character classification cvecs by asking libc, even for Unicode.
+  */
+
+
+ /*
   * element - map collating-element name to celt
   */
  static celt
*************** cclass(struct vars * v,            /* context */
*** 498,503 ****
--- 508,514 ----
         int cases)                /* case-independent? */
  {
      size_t        len;
+     const struct cvec *ccv = NULL;
      struct cvec *cv = NULL;
      const char * const *namePtr;
      int            i,
*************** cclass(struct vars * v,            /* context */
*** 549,626 ****

      /*
       * Now compute the character class contents.
-      *
-      * For the moment, assume that only char codes < 256 can be in these
-      * classes.
       */

      switch ((enum classes) index)
      {
          case CC_PRINT:
!             cv = getcvec(v, UCHAR_MAX, 0);
!             if (cv)
!             {
!                 for (i = 0; i <= UCHAR_MAX; i++)
!                 {
!                     if (pg_wc_isprint((chr) i))
!                         addchr(cv, (chr) i);
!                 }
!             }
              break;
          case CC_ALNUM:
!             cv = getcvec(v, UCHAR_MAX, 0);
!             if (cv)
!             {
!                 for (i = 0; i <= UCHAR_MAX; i++)
!                 {
!                     if (pg_wc_isalnum((chr) i))
!                         addchr(cv, (chr) i);
!                 }
!             }
              break;
          case CC_ALPHA:
!             cv = getcvec(v, UCHAR_MAX, 0);
!             if (cv)
!             {
!                 for (i = 0; i <= UCHAR_MAX; i++)
!                 {
!                     if (pg_wc_isalpha((chr) i))
!                         addchr(cv, (chr) i);
!                 }
!             }
              break;
          case CC_ASCII:
              cv = getcvec(v, 0, 1);
              if (cv)
                  addrange(cv, 0, 0x7f);
              break;
          case CC_BLANK:
              cv = getcvec(v, 2, 0);
              addchr(cv, '\t');
              addchr(cv, ' ');
              break;
          case CC_CNTRL:
              cv = getcvec(v, 0, 2);
              addrange(cv, 0x0, 0x1f);
              addrange(cv, 0x7f, 0x9f);
              break;
          case CC_DIGIT:
!             cv = getcvec(v, 0, 1);
!             if (cv)
!                 addrange(cv, (chr) '0', (chr) '9');
              break;
          case CC_PUNCT:
!             cv = getcvec(v, UCHAR_MAX, 0);
!             if (cv)
!             {
!                 for (i = 0; i <= UCHAR_MAX; i++)
!                 {
!                     if (pg_wc_ispunct((chr) i))
!                         addchr(cv, (chr) i);
!                 }
!             }
              break;
          case CC_XDIGIT:
              cv = getcvec(v, 0, 3);
              if (cv)
              {
--- 560,608 ----

      /*
       * Now compute the character class contents.
       */

      switch ((enum classes) index)
      {
          case CC_PRINT:
!             ccv = pg_ctype_get_cache(pg_wc_isprint);
              break;
          case CC_ALNUM:
!             ccv = pg_ctype_get_cache(pg_wc_isalnum);
              break;
          case CC_ALPHA:
!             ccv = pg_ctype_get_cache(pg_wc_isalpha);
              break;
          case CC_ASCII:
+             /* hard-wired meaning */
              cv = getcvec(v, 0, 1);
              if (cv)
                  addrange(cv, 0, 0x7f);
              break;
          case CC_BLANK:
+             /* hard-wired meaning */
              cv = getcvec(v, 2, 0);
              addchr(cv, '\t');
              addchr(cv, ' ');
              break;
          case CC_CNTRL:
+             /* hard-wired meaning */
              cv = getcvec(v, 0, 2);
              addrange(cv, 0x0, 0x1f);
              addrange(cv, 0x7f, 0x9f);
              break;
          case CC_DIGIT:
!             ccv = pg_ctype_get_cache(pg_wc_isdigit);
              break;
          case CC_PUNCT:
!             ccv = pg_ctype_get_cache(pg_wc_ispunct);
              break;
          case CC_XDIGIT:
+             /*
+              * It's not clear how to define this in non-western locales, and
+              * even less clear that there's any particular use in trying.
+              * So just hard-wire the meaning.
+              */
              cv = getcvec(v, 0, 3);
              if (cv)
              {
*************** cclass(struct vars * v,            /* context */
*** 630,679 ****
              }
              break;
          case CC_SPACE:
!             cv = getcvec(v, UCHAR_MAX, 0);
!             if (cv)
!             {
!                 for (i = 0; i <= UCHAR_MAX; i++)
!                 {
!                     if (pg_wc_isspace((chr) i))
!                         addchr(cv, (chr) i);
!                 }
!             }
              break;
          case CC_LOWER:
!             cv = getcvec(v, UCHAR_MAX, 0);
!             if (cv)
!             {
!                 for (i = 0; i <= UCHAR_MAX; i++)
!                 {
!                     if (pg_wc_islower((chr) i))
!                         addchr(cv, (chr) i);
!                 }
!             }
              break;
          case CC_UPPER:
!             cv = getcvec(v, UCHAR_MAX, 0);
!             if (cv)
!             {
!                 for (i = 0; i <= UCHAR_MAX; i++)
!                 {
!                     if (pg_wc_isupper((chr) i))
!                         addchr(cv, (chr) i);
!                 }
!             }
              break;
          case CC_GRAPH:
!             cv = getcvec(v, UCHAR_MAX, 0);
!             if (cv)
!             {
!                 for (i = 0; i <= UCHAR_MAX; i++)
!                 {
!                     if (pg_wc_isgraph((chr) i))
!                         addchr(cv, (chr) i);
!                 }
!             }
              break;
      }
      if (cv == NULL)
          ERR(REG_ESPACE);
      return cv;
--- 612,643 ----
              }
              break;
          case CC_SPACE:
!             ccv = pg_ctype_get_cache(pg_wc_isspace);
              break;
          case CC_LOWER:
!             ccv = pg_ctype_get_cache(pg_wc_islower);
              break;
          case CC_UPPER:
!             ccv = pg_ctype_get_cache(pg_wc_isupper);
              break;
          case CC_GRAPH:
!             ccv = pg_ctype_get_cache(pg_wc_isgraph);
              break;
      }
+
+     /* If we got a cached cvec, copy it to a freeable one. */
+     if (ccv != NULL)
+     {
+         cv = getcvec(v, ccv->nchrs, ccv->nranges);
+         if (cv)
+         {
+             cv->nchrs = ccv->nchrs;
+             memcpy(cv->chrs, ccv->chrs, cv->nchrs * sizeof(chr));
+             cv->nranges = ccv->nranges;
+             memcpy(cv->ranges, ccv->ranges, cv->nranges * sizeof(chr) * 2);
+         }
+     }
+
      if (cv == NULL)
          ERR(REG_ESPACE);
      return cv;
diff --git a/src/backend/regex/regc_pg_locale.c b/src/backend/regex/regc_pg_locale.c
index 7c010e372858065bedbc47382a84cb3650e03efa..9ea7a81b176de59ed989238b1849672108374d97 100644
*** a/src/backend/regex/regc_pg_locale.c
--- b/src/backend/regex/regc_pg_locale.c
***************
*** 1,7 ****
  /*-------------------------------------------------------------------------
   *
   * regc_pg_locale.c
!  *      ctype functions adapted to work on pg_wchar (a/k/a chr)
   *
   * This file is #included by regcomp.c; it's not meant to compile standalone.
   *
--- 1,8 ----
  /*-------------------------------------------------------------------------
   *
   * regc_pg_locale.c
!  *      ctype functions adapted to work on pg_wchar (a/k/a chr),
!  *      and functions to cache the results of wholesale ctype probing.
   *
   * This file is #included by regcomp.c; it's not meant to compile standalone.
   *
*************** typedef enum
*** 72,77 ****
--- 73,79 ----

  static PG_Locale_Strategy pg_regex_strategy;
  static pg_locale_t pg_regex_locale;
+ static Oid    pg_regex_collation;

  /*
   * Hard-wired character properties for C locale
*************** pg_set_regex_collation(Oid collation)
*** 233,238 ****
--- 235,241 ----
          /* C/POSIX collations use this path regardless of database encoding */
          pg_regex_strategy = PG_REGEX_LOCALE_C;
          pg_regex_locale = 0;
+         pg_regex_collation = C_COLLATION_OID;
      }
      else
      {
*************** pg_set_regex_collation(Oid collation)
*** 275,280 ****
--- 278,285 ----
              else
                  pg_regex_strategy = PG_REGEX_LOCALE_1BYTE;
          }
+
+         pg_regex_collation = collation;
      }
  }

*************** pg_wc_tolower(pg_wchar c)
*** 656,658 ****
--- 661,873 ----
      }
      return 0;                    /* can't get here, but keep compiler quiet */
  }
+
+
+ /*
+  * These functions cache the results of probing libc's ctype behavior for
+  * all character codes of interest in a given encoding/collation.  The
+  * result is provided as a "struct cvec", but notice that the representation
+  * is a touch different from a cvec created by regc_cvec.c: we allocate the
+  * chrs[] and ranges[] arrays separately from the struct so that we can
+  * realloc them larger at need.  This is okay since the cvecs made here
+  * should never be freed by freecvec().
+  *
+  * We use malloc not palloc since we mustn't lose control on out-of-memory;
+  * the main regex code expects us to return a failure indication instead.
+  */
+
+ typedef int (*pg_wc_probefunc) (pg_wchar c);
+
+ typedef struct pg_ctype_cache
+ {
+     pg_wc_probefunc probefunc;        /* pg_wc_isalpha or a sibling */
+     Oid            collation;            /* collation this entry is for */
+     struct cvec cv;                    /* cache entry contents */
+     struct pg_ctype_cache *next;    /* chain link */
+ } pg_ctype_cache;
+
+ static pg_ctype_cache *pg_ctype_cache_list = NULL;
+
+ /*
+  * Add a chr or range to pcc->cv; return false if run out of memory
+  */
+ static bool
+ store_match(pg_ctype_cache *pcc, pg_wchar chr1, int nchrs)
+ {
+     chr           *newchrs;
+
+     if (nchrs > 1)
+     {
+         if (pcc->cv.nranges >= pcc->cv.rangespace)
+         {
+             pcc->cv.rangespace *= 2;
+             newchrs = (chr *) realloc(pcc->cv.ranges,
+                                       pcc->cv.rangespace * sizeof(chr) * 2);
+             if (newchrs == NULL)
+                 return false;
+             pcc->cv.ranges = newchrs;
+         }
+         pcc->cv.ranges[pcc->cv.nranges * 2] = chr1;
+         pcc->cv.ranges[pcc->cv.nranges * 2 + 1] = chr1 + nchrs - 1;
+         pcc->cv.nranges++;
+     }
+     else
+     {
+         assert(nchrs == 1);
+         if (pcc->cv.nchrs >= pcc->cv.chrspace)
+         {
+             pcc->cv.chrspace *= 2;
+             newchrs = (chr *) realloc(pcc->cv.chrs,
+                                       pcc->cv.chrspace * sizeof(chr));
+             if (newchrs == NULL)
+                 return false;
+             pcc->cv.chrs = newchrs;
+         }
+         pcc->cv.chrs[pcc->cv.nchrs++] = chr1;
+     }
+     return true;
+ }
+
+ /*
+  * Given a probe function (e.g., pg_wc_isalpha) get a struct cvec for all
+  * chrs satisfying the probe function.  The active collation is the one
+  * previously set by pg_set_regex_collation.  Returns NULL if out of memory.
+  *
+  * Note that the result must NOT be freed or modified by caller.
+  */
+ static const struct cvec *
+ pg_ctype_get_cache(pg_wc_probefunc probefunc)
+ {
+     pg_ctype_cache *pcc;
+     pg_wchar    max_chr;
+     pg_wchar    cur_chr;
+     int            seq;
+     chr           *newchrs;
+
+     /*
+      * Do we already have the answer cached?
+      */
+     for (pcc = pg_ctype_cache_list; pcc != NULL; pcc = pcc->next)
+     {
+         if (pcc->probefunc == probefunc &&
+             pcc->collation == pg_regex_collation)
+             return &pcc->cv;
+     }
+
+     /*
+      * Nope, so initialize some workspace ...
+      */
+     pcc = (pg_ctype_cache *) malloc(sizeof(pg_ctype_cache));
+     if (pcc == NULL)
+         return NULL;
+     pcc->probefunc = probefunc;
+     pcc->collation = pg_regex_collation;
+     pcc->cv.nchrs = 0;
+     pcc->cv.chrspace = 256;
+     pcc->cv.chrs = (chr *) malloc(pcc->cv.chrspace * sizeof(chr));
+     pcc->cv.nranges = 0;
+     pcc->cv.rangespace = 256;
+     pcc->cv.ranges = (chr *) malloc(pcc->cv.rangespace * sizeof(chr) * 2);
+     if (pcc->cv.chrs == NULL || pcc->cv.ranges == NULL)
+         goto out_of_memory;
+
+     /*
+      * Decide how many character codes we ought to look through.  For C locale
+      * there's no need to go further than 127.  Otherwise, if the encoding is
+      * UTF8 go up to 0xFFFF (the end of the Basic Multilingual Plane).
+      * Otherwise, go up to 255.  These limits are interrelated with
+      * restrictions discussed at the head of this file.
+      */
+     switch (pg_regex_strategy)
+     {
+         case PG_REGEX_LOCALE_C:
+             max_chr = (pg_wchar) 127;
+             break;
+         case PG_REGEX_LOCALE_WIDE:
+         case PG_REGEX_LOCALE_WIDE_L:
+             max_chr = (pg_wchar) 0xFFFF;
+             break;
+         case PG_REGEX_LOCALE_1BYTE:
+         case PG_REGEX_LOCALE_1BYTE_L:
+             max_chr = (pg_wchar) UCHAR_MAX;
+             break;
+         default:
+             max_chr = 0;        /* can't get here, but keep compiler quiet */
+             break;
+     }
+
+     /*
+      * And scan 'em ...
+      */
+     seq = 0;                    /* number of consecutive matches */
+
+     for (cur_chr = 0; cur_chr <= max_chr; cur_chr++)
+     {
+         if ((*probefunc) (cur_chr))
+             seq++;
+         else if (seq > 0)
+         {
+             if (!store_match(pcc, cur_chr - seq, seq))
+                 goto out_of_memory;
+             seq = 0;
+         }
+     }
+
+     if (seq > 0)
+         if (!store_match(pcc, cur_chr - seq, seq))
+             goto out_of_memory;
+
+     /*
+      * We might have allocated more memory than needed, if so free it
+      */
+     if (pcc->cv.nchrs == 0)
+     {
+         free(pcc->cv.chrs);
+         pcc->cv.chrs = NULL;
+         pcc->cv.chrspace = 0;
+     }
+     else if (pcc->cv.nchrs < pcc->cv.chrspace)
+     {
+         newchrs = (chr *) realloc(pcc->cv.chrs,
+                                   pcc->cv.nchrs * sizeof(chr));
+         if (newchrs == NULL)
+             goto out_of_memory;
+         pcc->cv.chrs = newchrs;
+         pcc->cv.chrspace = pcc->cv.nchrs;
+     }
+     if (pcc->cv.nranges == 0)
+     {
+         free(pcc->cv.ranges);
+         pcc->cv.ranges = NULL;
+         pcc->cv.rangespace = 0;
+     }
+     else if (pcc->cv.nranges < pcc->cv.rangespace)
+     {
+         newchrs = (chr *) realloc(pcc->cv.ranges,
+                                   pcc->cv.nranges * sizeof(chr) * 2);
+         if (newchrs == NULL)
+             goto out_of_memory;
+         pcc->cv.ranges = newchrs;
+         pcc->cv.rangespace = pcc->cv.nranges;
+     }
+
+     /*
+      * Success, link it into cache chain
+      */
+     pcc->next = pg_ctype_cache_list;
+     pg_ctype_cache_list = pcc;
+
+     return &pcc->cv;
+
+     /*
+      * Failure, clean up
+      */
+ out_of_memory:
+     if (pcc->cv.chrs)
+         free(pcc->cv.chrs);
+     if (pcc->cv.ranges)
+         free(pcc->cv.ranges);
+     free(pcc);
+
+     return NULL;
+ }
В списке pgsql-hackers по дате отправления:
Вход в личный кабинет

Восстановление пароля

Подтверждение аккаунта

Изменение пароля

Re: Notes about fixing regexes and UTF-8 (yet again)