Re: reducing the footprint of ScanKeyword (was Re: Large writable variables)

Поиск
Список
Период
Сортировка
От Tom Lane
Тема Re: reducing the footprint of ScanKeyword (was Re: Large writable variables)
Дата
Msg-id 10021.1546907818@sss.pgh.pa.us
обсуждение исходный текст
Ответ на Re: reducing the footprint of ScanKeyword (was Re: Large writable variables)  (Tom Lane <tgl@sss.pgh.pa.us>)
Список pgsql-hackers
I wrote:
> Probably there's a lot to be criticized about the Perl style below;
> anybody feel a need to rewrite it?

Here's a somewhat better version.  I realized that I was being too
slavishly tied to the data structures used in the C version; in Perl
it's easier to manage the lists of edges as hashes.  I can't see any
need to distinguish left and right edge sets, either, so this just
has one such hash per vertex.

Also, it seems to me that we *can* make intelligent use of unused
hashtable entries to exit early on many non-keyword inputs.  The reason
the existing code fails to do so is that it computes the sums and
differences of hashtable entries in unsigned modulo arithmetic; but if
we make the hashtable entries signed, we can set them up as exact
differences and drop the final modulo operation in the hash function.
Then, any out-of-range sum must indicate an input that is not a keyword
(because it is touching a pair of hashtable entries that didn't go
together) and we can exit early from the caller.  This in turn lets us
mark unused hashtable entries with large values to ensure that sums
involving them will be out of range.

A weak spot in that argument is that it's not entirely clear how
large the differences can get --- with an unlucky series of
collisions, maybe they could get large enough to overflow int16?
I don't think that's likely for the size of problem this script
is going to encounter, so I just put in an error check for it.
But it could do with closer analysis before deciding that this
is a general-purpose solution.

            regards, tom lane

diff --git a/src/common/kwlookup.c b/src/common/kwlookup.c
index d72842e..9dc1fee 100644
*** a/src/common/kwlookup.c
--- b/src/common/kwlookup.c
***************
*** 35,94 ****
   * receive a different case-normalization mapping.
   */
  int
! ScanKeywordLookup(const char *text,
                    const ScanKeywordList *keywords)
  {
!     int            len,
!                 i;
!     char        word[NAMEDATALEN];
!     const char *kw_string;
!     const uint16 *kw_offsets;
!     const uint16 *low;
!     const uint16 *high;
!
!     len = strlen(text);

      if (len > keywords->max_kw_len)
!         return -1;                /* too long to be any keyword */
!
!     /* We assume all keywords are shorter than NAMEDATALEN. */
!     Assert(len < NAMEDATALEN);

      /*
!      * Apply an ASCII-only downcasing.  We must not use tolower() since it may
!      * produce the wrong translation in some locales (eg, Turkish).
       */
!     for (i = 0; i < len; i++)
!     {
!         char        ch = text[i];

!         if (ch >= 'A' && ch <= 'Z')
!             ch += 'a' - 'A';
!         word[i] = ch;
!     }
!     word[len] = '\0';

      /*
!      * Now do a binary search using plain strcmp() comparison.
       */
!     kw_string = keywords->kw_string;
!     kw_offsets = keywords->kw_offsets;
!     low = kw_offsets;
!     high = kw_offsets + (keywords->num_keywords - 1);
!     while (low <= high)
      {
!         const uint16 *middle;
!         int            difference;

!         middle = low + (high - low) / 2;
!         difference = strcmp(kw_string + *middle, word);
!         if (difference == 0)
!             return middle - kw_offsets;
!         else if (difference < 0)
!             low = middle + 1;
!         else
!             high = middle - 1;
      }

!     return -1;
  }
--- 35,89 ----
   * receive a different case-normalization mapping.
   */
  int
! ScanKeywordLookup(const char *str,
                    const ScanKeywordList *keywords)
  {
!     size_t        len;
!     int            h;
!     const char *kw;

+     /*
+      * Reject immediately if too long to be any keyword.  This saves useless
+      * hashing and downcasing work on long strings.
+      */
+     len = strlen(str);
      if (len > keywords->max_kw_len)
!         return -1;

      /*
!      * Compute the hash function.  We assume it was generated to produce
!      * case-insensitive results.  Since it's a perfect hash, we need only
!      * match to the specific keyword it identifies.
       */
!     h = keywords->hash(str, len);

!     /*
!      * An out-of-range result implies no match.  (This can happen for
!      * non-keyword inputs because the hash function will sum two unrelated
!      * hashtable entries.)
!      */
!     if (h < 0 || h >= keywords->num_keywords)
!         return -1;

      /*
!      * Compare character-by-character to see if we have a match, applying an
!      * ASCII-only downcasing to the input characters.  We must not use
!      * tolower() since it may produce the wrong translation in some locales
!      * (eg, Turkish).
       */
!     kw = GetScanKeyword(h, keywords);
!     while (*str != '\0')
      {
!         char        ch = *str++;

!         if (ch >= 'A' && ch <= 'Z')
!             ch += 'a' - 'A';
!         if (ch != *kw++)
!             return -1;
      }
+     if (*kw != '\0')
+         return -1;

!     /* Success! */
!     return h;
  }
diff --git a/src/include/common/kwlookup.h b/src/include/common/kwlookup.h
index 39efb35..dbff367 100644
*** a/src/include/common/kwlookup.h
--- b/src/include/common/kwlookup.h
***************
*** 14,19 ****
--- 14,22 ----
  #ifndef KWLOOKUP_H
  #define KWLOOKUP_H

+ /* Hash function used by ScanKeywordLookup */
+ typedef int (*ScanKeywordHashFunc) (const void *key, size_t keylen);
+
  /*
   * This struct contains the data needed by ScanKeywordLookup to perform a
   * search within a set of keywords.  The contents are typically generated by
*************** typedef struct ScanKeywordList
*** 23,28 ****
--- 26,32 ----
  {
      const char *kw_string;        /* all keywords in order, separated by \0 */
      const uint16 *kw_offsets;    /* offsets to the start of each keyword */
+     ScanKeywordHashFunc hash;    /* perfect hash function for keywords */
      int            num_keywords;    /* number of keywords */
      int            max_kw_len;        /* length of longest keyword */
  } ScanKeywordList;
diff --git a/src/interfaces/ecpg/preproc/Makefile b/src/interfaces/ecpg/preproc/Makefile
index b5b74a3..abfe3cc 100644
*** a/src/interfaces/ecpg/preproc/Makefile
--- b/src/interfaces/ecpg/preproc/Makefile
*************** preproc.y: ../../../backend/parser/gram.
*** 57,63 ****

  # generate keyword headers
  c_kwlist_d.h: c_kwlist.h $(GEN_KEYWORDLIST)
!     $(PERL) $(GEN_KEYWORDLIST) --varname ScanCKeywords $<

  ecpg_kwlist_d.h: ecpg_kwlist.h $(GEN_KEYWORDLIST)
      $(PERL) $(GEN_KEYWORDLIST) --varname ScanECPGKeywords $<
--- 57,63 ----

  # generate keyword headers
  c_kwlist_d.h: c_kwlist.h $(GEN_KEYWORDLIST)
!     $(PERL) $(GEN_KEYWORDLIST) --varname ScanCKeywords --case $<

  ecpg_kwlist_d.h: ecpg_kwlist.h $(GEN_KEYWORDLIST)
      $(PERL) $(GEN_KEYWORDLIST) --varname ScanECPGKeywords $<
diff --git a/src/interfaces/ecpg/preproc/c_keywords.c b/src/interfaces/ecpg/preproc/c_keywords.c
index 38ddf6f..387298b 100644
*** a/src/interfaces/ecpg/preproc/c_keywords.c
--- b/src/interfaces/ecpg/preproc/c_keywords.c
***************
*** 9,16 ****
   */
  #include "postgres_fe.h"

- #include <ctype.h>
-
  #include "preproc_extern.h"
  #include "preproc.h"

--- 9,14 ----
*************** static const uint16 ScanCKeywordTokens[]
*** 32,70 ****
   *
   * Returns the token value of the keyword, or -1 if no match.
   *
!  * Do a binary search using plain strcmp() comparison.  This is much like
   * ScanKeywordLookup(), except we want case-sensitive matching.
   */
  int
! ScanCKeywordLookup(const char *text)
  {
!     const char *kw_string;
!     const uint16 *kw_offsets;
!     const uint16 *low;
!     const uint16 *high;

!     if (strlen(text) > ScanCKeywords.max_kw_len)
!         return -1;                /* too long to be any keyword */

!     kw_string = ScanCKeywords.kw_string;
!     kw_offsets = ScanCKeywords.kw_offsets;
!     low = kw_offsets;
!     high = kw_offsets + (ScanCKeywords.num_keywords - 1);

!     while (low <= high)
!     {
!         const uint16 *middle;
!         int            difference;

!         middle = low + (high - low) / 2;
!         difference = strcmp(kw_string + *middle, text);
!         if (difference == 0)
!             return ScanCKeywordTokens[middle - kw_offsets];
!         else if (difference < 0)
!             low = middle + 1;
!         else
!             high = middle - 1;
!     }

      return -1;
  }
--- 30,71 ----
   *
   * Returns the token value of the keyword, or -1 if no match.
   *
!  * Do a hash search using plain strcmp() comparison.  This is much like
   * ScanKeywordLookup(), except we want case-sensitive matching.
   */
  int
! ScanCKeywordLookup(const char *str)
  {
!     size_t        len;
!     int            h;
!     const char *kw;

!     /*
!      * Reject immediately if too long to be any keyword.  This saves useless
!      * hashing work on long strings.
!      */
!     len = strlen(str);
!     if (len > ScanCKeywords.max_kw_len)
!         return -1;

!     /*
!      * Compute the hash function.  Since it's a perfect hash, we need only
!      * match to the specific keyword it identifies.
!      */
!     h = ScanCKeywords_hash_func(str, len);

!     /*
!      * An out-of-range result implies no match.  (This can happen for
!      * non-keyword inputs because the hash function will sum two unrelated
!      * hashtable entries.)
!      */
!     if (h < 0 || h >= ScanCKeywords.num_keywords)
!         return -1;

!     kw = GetScanKeyword(h, &ScanCKeywords);
!
!     if (strcmp(kw, str) == 0)
!         return ScanCKeywordTokens[h];

      return -1;
  }
diff --git a/src/tools/gen_keywordlist.pl b/src/tools/gen_keywordlist.pl
index d764aff..f15afc1 100644
*** a/src/tools/gen_keywordlist.pl
--- b/src/tools/gen_keywordlist.pl
***************
*** 14,19 ****
--- 14,25 ----
  # variable named according to the -v switch ("ScanKeywords" by default).
  # The variable is marked "static" unless the -e switch is given.
  #
+ # ScanKeywordList uses hash-based lookup, so this script also selects
+ # a minimal perfect hash function for the keyword set, and emits a
+ # static hash function that is referenced in the ScanKeywordList struct.
+ # The hash function is case-insensitive unless --case is specified.
+ # Note that case insensitivity assumes all-ASCII keywords!
+ #
  #
  # Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
  # Portions Copyright (c) 1994, Regents of the University of California
*************** use Getopt::Long;
*** 28,39 ****

  my $output_path = '';
  my $extern = 0;
  my $varname = 'ScanKeywords';

  GetOptions(
!     'output:s' => \$output_path,
!     'extern'   => \$extern,
!     'varname:s' => \$varname) || usage();

  my $kw_input_file = shift @ARGV || die "No input file.\n";

--- 34,47 ----

  my $output_path = '';
  my $extern = 0;
+ my $case_sensitive = 0;
  my $varname = 'ScanKeywords';

  GetOptions(
!     'output:s'       => \$output_path,
!     'extern'         => \$extern,
!     'case-sensitive' => \$case_sensitive,
!     'varname:s'      => \$varname) || usage();

  my $kw_input_file = shift @ARGV || die "No input file.\n";

*************** while (<$kif>)
*** 87,93 ****
--- 95,116 ----
      }
  }

+ # When being case-insensitive, insist that the input be all-lower-case.
+ if (!$case_sensitive)
+ {
+     foreach my $kw (@keywords)
+     {
+         die qq|The keyword "$kw" is not lower-case in $kw_input_file\n|
+           if ($kw ne lc $kw);
+     }
+ }
+
  # Error out if the keyword names are not in ASCII order.
+ #
+ # While this isn't really necessary with hash-based lookup, it's still
+ # helpful because it provides a cheap way to reject duplicate keywords.
+ # Also, insisting on sorted order ensures that code that scans the keyword
+ # table linearly will see the keywords in a canonical order.
  for my $i (0..$#keywords - 1)
  {
      die qq|The keyword "$keywords[$i + 1]" is out of order in $kw_input_file\n|
*************** print $kwdef "};\n\n";
*** 128,139 ****
--- 151,167 ----

  printf $kwdef "#define %s_NUM_KEYWORDS %d\n\n", uc $varname, scalar @keywords;

+ # Emit the definition of the hash function.
+
+ construct_hash_function();
+
  # Emit the struct that wraps all this lookup info into one variable.

  print $kwdef "static " if !$extern;
  printf $kwdef "const ScanKeywordList %s = {\n", $varname;
  printf $kwdef qq|\t%s_kw_string,\n|, $varname;
  printf $kwdef qq|\t%s_kw_offsets,\n|, $varname;
+ printf $kwdef qq|\t%s_hash_func,\n|, $varname;
  printf $kwdef qq|\t%s_NUM_KEYWORDS,\n|, uc $varname;
  printf $kwdef qq|\t%d\n|, $max_len;
  print $kwdef "};\n\n";
*************** print $kwdef "};\n\n";
*** 141,146 ****
--- 169,392 ----
  printf $kwdef "#endif\t\t\t\t\t\t\t/* %s_H */\n", uc $base_filename;


+ # This code constructs a minimal perfect hash function for the given
+ # keyword set, using an algorithm described in
+ # "An optimal algorithm for generating minimal perfect hash functions"
+ # by Czech, Havas and Majewski in Information Processing Letters,
+ # 43(5):256-264, October 1992.
+ # This implementation is loosely based on NetBSD's "nbperf",
+ # which was written by Joerg Sonnenberger.
+
+ # At runtime, we'll compute two simple hash functions of the input word,
+ # and use them to index into a mapping table.  The hash functions are just
+ # multiply-and-add in uint32 arithmetic, with different multipliers but
+ # the same initial seed.
+
+ # Calculate a hash function as the run-time code will do.
+ # If we are making a case-insensitive hash function, we implement that
+ # by OR'ing 0x20 into each byte of the key.  This correctly transforms
+ # upper-case ASCII into lower-case ASCII, while not changing digits or
+ # dollar signs.  (It does change '_', else we could just skip adjusting
+ # $cn here at all, for typical keyword strings.)
+ sub calc_hash
+ {
+     my ($keyword, $mult, $seed) = @_;
+
+     my $result = $seed;
+     for my $c (split //, $keyword)
+     {
+         my $cn = ord($c);
+         $cn |= 0x20 if !$case_sensitive;
+         $result = ($result * $mult + $cn) % 4294967296;
+     }
+     return $result;
+ }
+
+ # Attempt to construct a mapping table for the minimal perfect hash function.
+ # Returns a nonempty integer array if successful, else an empty array.
+ sub construct_hash_table
+ {
+     # Parameters of the hash functions are passed in.
+     my ($hash_mult1, $hash_mult2, $hash_seed) = @_;
+
+     # This algorithm is based on a graph whose edges correspond to the
+     # keywords and whose vertices correspond to entries of the mapping table.
+     # A keyword edge links the two vertices whose indexes are the outputs of
+     # the two hash functions for that keyword.  For K keywords, the mapping
+     # table must have at least 2*K+1 entries, guaranteeing that there's at
+     # least one unused entry.
+     my $nedges = scalar @keywords;    # number of edges
+     my $nverts = 2 * $nedges + 1;     # number of vertices
+
+     # Initialize the array of edges.
+     my @E = ();
+     foreach my $kw (@keywords)
+     {
+         # Calculate hashes for this keyword.
+         # The hashes are immediately reduced modulo the mapping table size.
+         my $hash1 = calc_hash($kw, $hash_mult1, $hash_seed) % $nverts;
+         my $hash2 = calc_hash($kw, $hash_mult2, $hash_seed) % $nverts;
+
+         # If the two hashes are the same for any keyword, we have to fail
+         # since this edge would itself form a cycle in the graph.
+         return () if $hash1 == $hash2;
+
+         # Add the edge for this keyword.
+         push @E, { left => $hash1, right => $hash2 };
+     }
+
+     # Initialize the array of vertices, giving them all empty lists
+     # of associated edges.  (The lists will be hashes of edge numbers.)
+     my @V = ();
+     for (my $v = 0; $v < $nverts; $v++)
+     {
+         push @V, { edges => {} };
+     }
+
+     # Insert each edge in the lists of edges using its vertices.
+     for (my $e = 0; $e < $nedges; $e++)
+     {
+         my $v = $E[$e]{left};
+         $V[$v]{edges}->{$e} = 1;
+
+         $v = $E[$e]{right};
+         $V[$v]{edges}->{$e} = 1;
+     }
+
+     # Now we attempt to prove the graph acyclic.
+     # A cycle-free graph is either empty or has some vertex of degree 1.
+     # Removing the edge attached to that vertex doesn't change this property,
+     # so doing that repeatedly will reduce the size of the graph.
+     # If the graph is empty at the end of the process, it was acyclic.
+     # We track the order of edge removal so that the next phase can process
+     # them in reverse order of removal.
+     my @output_order = ();
+
+     # Consider each vertex as a possible starting point for edge-removal.
+     for (my $startv = 0; $startv < $nverts; $startv++)
+     {
+         my $v = $startv;
+
+         # If vertex v is of degree 1 (i.e. exactly 1 edge connects to it),
+         # remove that edge, and then consider the edge's other vertex to see
+         # if it is now of degree 1.  The inner loop repeats until reaching a
+         # vertex not of degree 1.
+         while (scalar(keys(%{ $V[$v]{edges} })) == 1)
+         {
+             # Unlink its only edge.
+             my $e = (keys(%{ $V[$v]{edges} }))[0];
+             delete($V[$v]{edges}->{$e});
+
+             # Unlink the edge from its other vertex, too.
+             my $v2 = $E[$e]{left};
+             $v2 = $E[$e]{right} if ($v2 == $v);
+             delete($V[$v2]{edges}->{$e});
+
+             # Push e onto the front of the output-order list.
+             unshift @output_order, $e;
+
+             # Consider v2 on next iteration of inner loop.
+             $v = $v2;
+         }
+     }
+
+     # We succeeded only if all edges were removed from the graph.
+     return () if (scalar(@output_order) != $nedges);
+
+     # OK, build the hash table of size $nverts.
+     my @hashtab = (0) x $nverts;
+     # We need a "visited" flag array in this step, too.
+     my @visited = (0) x $nverts;
+
+     # The idea is that for any keyword, the sum of the hash table entries
+     # for its first and second hash values is the desired output (i.e., the
+     # keyword number).  By assigning hash table values in the selected edge
+     # order, we can guarantee that that's true.
+     foreach my $e (@output_order)
+     {
+         my $l = $E[$e]{left};
+         my $r = $E[$e]{right};
+         if (!$visited[$l])
+         {
+             # $hashtab[$r] might be zero, or some previously assigned value.
+             $hashtab[$l] = $e - $hashtab[$r];
+         }
+         else
+         {
+             die "oops, doubly used hashtab entry" if $visited[$r];
+             # $hashtab[$l] might be zero, or some previously assigned value.
+             $hashtab[$r] = $e - $hashtab[$l];
+         }
+         # Now freeze both of these hashtab entries.
+         $visited[$l] = 1;
+         $visited[$r] = 1;
+     }
+
+     # Check that the results fit in int16.  (With very large keyword sets, we
+     # might need to allow wider hashtable entries; but that day is far away.)
+     # Then set any unused hash table entries to 0x7FFF.  For reasonable
+     # keyword counts, that will ensure that any hash sum involving such an
+     # entry will be out-of-range, allowing the caller to exit early.
+     for (my $v = 0; $v < $nverts; $v++)
+     {
+         die "oops, hashtab entry overflow"
+           if $hashtab[$v] < -32767 || $hashtab[$v] > 32767;
+         $hashtab[$v] = 0x7FFF if !$visited[$v];
+     }
+
+     return @hashtab;
+ }
+
+ sub construct_hash_function
+ {
+     # Try different hash function parameters until we find a set that works
+     # for these keywords.  In principle we might need to change multipliers,
+     # but these two multipliers are chosen to be cheap to calculate via
+     # shift-and-add, so don't change them except at great need.
+     my $hash_mult1 = 31;
+     my $hash_mult2 = 37;
+
+     # We just try successive hash seed values until we find one that works.
+     # (Commonly, random seeds are tried, but we want reproducible results
+     # from this program so we don't do that.)
+     my $hash_seed;
+     my @hashtab;
+     for ($hash_seed = 0;; $hash_seed++)
+     {
+         @hashtab = construct_hash_table($hash_mult1, $hash_mult2, $hash_seed);
+         last if @hashtab;
+     }
+     my $nhash = scalar(@hashtab);
+
+     # OK, emit the hash function definition including the hash table.
+     printf $kwdef "static int\n";
+     printf $kwdef "%s_hash_func(const void *key, size_t keylen)\n{\n",
+       $varname;
+     printf $kwdef "\tstatic const int16 h[%d] = {\n", $nhash;
+     for (my $i = 0; $i < $nhash; $i++)
+     {
+         printf $kwdef "%s%6d,%s",
+           ($i % 8 == 0 ? "\t\t" : " "),
+           $hashtab[$i],
+           ($i % 8 == 7 ? "\n" : "");
+     }
+     printf $kwdef "\n" if ($nhash % 8 != 0);
+     printf $kwdef "\t};\n\n";
+     printf $kwdef "\tconst unsigned char *k = key;\n";
+     printf $kwdef "\tuint32\t\ta = %d;\n",   $hash_seed;
+     printf $kwdef "\tuint32\t\tb = %d;\n\n", $hash_seed;
+     printf $kwdef "\twhile (keylen--)\n\t{\n";
+     printf $kwdef "\t\tunsigned char c = *k++";
+     printf $kwdef " | 0x20" if !$case_sensitive;
+     printf $kwdef ";\n\n";
+     printf $kwdef "\t\ta = a * %d + c;\n", $hash_mult1;
+     printf $kwdef "\t\tb = b * %d + c;\n", $hash_mult2;
+     printf $kwdef "\t}\n";
+     printf $kwdef "\treturn h[a %% %d] + h[b %% %d];\n", $nhash, $nhash;
+     printf $kwdef "}\n\n";
+ }
+
+
  sub usage
  {
      die <<EOM;
*************** Usage: gen_keywordlist.pl [--output/-o <
*** 148,153 ****
--- 394,400 ----
      --output   Output directory (default '.')
      --varname  Name for ScanKeywordList variable (default 'ScanKeywords')
      --extern   Allow the ScanKeywordList variable to be globally visible
+     --case     Keyword matching is to be case-sensitive

  gen_keywordlist.pl transforms a list of keywords into a ScanKeywordList.
  The output filename is derived from the input file by inserting _d,
diff --git a/src/tools/msvc/Solution.pm b/src/tools/msvc/Solution.pm
index 937bf18..ed603fc 100644
*** a/src/tools/msvc/Solution.pm
--- b/src/tools/msvc/Solution.pm
*************** sub GenerateFiles
*** 440,446 ****
      {
          print "Generating c_kwlist_d.h and ecpg_kwlist_d.h...\n";
          chdir('src/interfaces/ecpg/preproc');
!         system('perl ../../../tools/gen_keywordlist.pl --varname ScanCKeywords c_kwlist.h');
          system('perl ../../../tools/gen_keywordlist.pl --varname ScanECPGKeywords ecpg_kwlist.h');
          chdir('../../../..');
      }
--- 440,446 ----
      {
          print "Generating c_kwlist_d.h and ecpg_kwlist_d.h...\n";
          chdir('src/interfaces/ecpg/preproc');
!         system('perl ../../../tools/gen_keywordlist.pl --varname ScanCKeywords --case c_kwlist.h');
          system('perl ../../../tools/gen_keywordlist.pl --varname ScanECPGKeywords ecpg_kwlist.h');
          chdir('../../../..');
      }

В списке pgsql-hackers по дате отправления:

Предыдущее
От: Andres Freund
Дата:
Сообщение: Re: Displaying and dumping of table access methods
Следующее
От: Tom Lane
Дата:
Сообщение: Re: reducing the footprint of ScanKeyword (was Re: Large writable variables)