Configurable Penalty Costs for Levenshtein

Поиск
Список
Период
Сортировка
Hi,

Following patch implements configurable penalty costs for levenshtein
distance metric in fuzzystrmatch contrib module.

It doesn't introduce any compatibility issues. Just implements

  levenshtein(text,text,int,int,int)

function on top of fuzzystrmatch.c:levenshtein_internal(). At the
same time, levenshtein(text,text) becomes just a wrapper for
levenshtein(text,text,1,1,1) call.

BTW, in current CVS tip

  if (cols/rows == 0) ...

checks in fuzzyztrmatch.c:levenshtein() never fail because of

  cols/rows = strlen(...) + 1;

FYI.


Regards.
? contrib/fuzzystrmatch/fuzzystrmatch.sql
? contrib/fuzzystrmatch/libfuzzystrmatch.so.0.0
Index: contrib/fuzzystrmatch/README.fuzzystrmatch
===================================================================
RCS file: /projects/cvsroot/pgsql/contrib/fuzzystrmatch/README.fuzzystrmatch,v
retrieving revision 1.10
diff -c -r1.10 README.fuzzystrmatch
*** contrib/fuzzystrmatch/README.fuzzystrmatch    5 Jan 2007 22:19:17 -0000    1.10
--- contrib/fuzzystrmatch/README.fuzzystrmatch    17 Oct 2007 13:58:09 -0000
***************
*** 14,19 ****
--- 14,20 ----
   * found at http://www.merriampark.com/ld.htm
   * Also looked at levenshtein.c in the PHP 4.0.6 distribution for
   * inspiration.
+  * Configurable penalty costs extension is introduced by Volkan YAZICI.
   *
   * metaphone()
   * -----------
***************
*** 116,121 ****
--- 117,158 ----
  ==================================================================
  Name

+ levenshtein -- calculates levenshtein distance with respect
+                to specified costs.
+
+ Synopsis
+
+ levenshtein(text source, text target,
+             int insert_cost, int delete_cost, int substitution_cost)
+
+ Inputs
+
+   source
+     any text string, 255 characters max, NOT NULL
+
+   target
+     any text string, 255 characters max, NOT NULL
+
+   insert_cost
+     cost of extra inserted characters
+
+   delete_cost
+     cost of missing characters
+
+   substitution_cost
+     cost of character substitutions
+
+ Outputs
+
+   Returns int
+
+ Example usage
+
+   select levenshtein('GUMBO','GAMBOL', 1, 1, 3);
+
+ ==================================================================
+ Name
+
  metaphone -- calculates the metaphone code of an input string

  Synopsis
***************
*** 140,144 ****
    select metaphone('GUMBO',4);

  ==================================================================
! -- Joe Conway
!
--- 177,180 ----
    select metaphone('GUMBO',4);

  ==================================================================
! -- Joe Conway
\ No newline at end of file
Index: contrib/fuzzystrmatch/fuzzystrmatch.c
===================================================================
RCS file: /projects/cvsroot/pgsql/contrib/fuzzystrmatch/fuzzystrmatch.c,v
retrieving revision 1.24
diff -c -r1.24 fuzzystrmatch.c
*** contrib/fuzzystrmatch/fuzzystrmatch.c    13 Feb 2007 18:00:35 -0000    1.24
--- contrib/fuzzystrmatch/fuzzystrmatch.c    17 Oct 2007 13:58:09 -0000
***************
*** 15,20 ****
--- 15,21 ----
   * found at http://www.merriampark.com/ld.htm
   * Also looked at levenshtein.c in the PHP 4.0.6 distribution for
   * inspiration.
+  * Configurable penalty consts extension is introduced by Volkan YAZICI.
   *
   * metaphone()
   * -----------
***************
*** 47,201 ****

  PG_MODULE_MAGIC;

  /*
!  * Calculates Levenshtein Distance between two strings.
!  * Uses simplest and fastest cost model only, i.e. assumes a cost of 1 for
!  * each deletion, substitution, or insertion.
   */
! PG_FUNCTION_INFO_V1(levenshtein);
! Datum
! levenshtein(PG_FUNCTION_ARGS)
  {
!     char       *str_s;
!     char       *str_s0;
!     char       *str_t;
!     int            cols = 0;
!     int            rows = 0;
!     int           *u_cells;
!     int           *l_cells;
!     int           *tmp;
!     int            i;
!     int            j;
!
!     /*
!      * Fetch the arguments. str_s is referred to as the "source" cols = length
!      * of source + 1 to allow for the initialization column str_t is referred
!      * to as the "target", rows = length of target + 1 rows = length of target
!      * + 1 to allow for the initialization row
!      */
!     str_s = DatumGetCString(DirectFunctionCall1(textout, PointerGetDatum(PG_GETARG_TEXT_P(0))));
!     str_t = DatumGetCString(DirectFunctionCall1(textout, PointerGetDatum(PG_GETARG_TEXT_P(1))));
!
!     cols = strlen(str_s) + 1;
!     rows = strlen(str_t) + 1;

      /*
!      * Restrict the length of the strings being compared to something
!      * reasonable because we will have to perform rows * cols calculations. If
!      * longer strings need to be compared, increase MAX_LEVENSHTEIN_STRLEN to
!      * suit (but within your tolerance for speed and memory usage).
       */
!     if ((cols > MAX_LEVENSHTEIN_STRLEN + 1) || (rows > MAX_LEVENSHTEIN_STRLEN + 1))
          ereport(ERROR,
                  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                   errmsg("argument exceeds the maximum length of %d bytes",
                          MAX_LEVENSHTEIN_STRLEN)));

!     /*
!      * If either rows or cols is 0, the answer is the other value. This makes
!      * sense since it would take that many insertions the build a matching
!      * string
!      */
!
!     if (cols == 0)
!         PG_RETURN_INT32(rows);
!
!     if (rows == 0)
!         PG_RETURN_INT32(cols);

      /*
!      * Allocate two vectors of integers. One will be used for the "upper" row,
!      * the other for the "lower" row. Initialize the "upper" row to 0..cols
       */
!     u_cells = palloc(sizeof(int) * cols);
!     for (i = 0; i < cols; i++)
!         u_cells[i] = i;
!
!     l_cells = palloc(sizeof(int) * cols);

!     /*
!      * Use str_s0 to "rewind" the pointer to str_s in the nested for loop
!      * below
!      */
!     str_s0 = str_s;

!     /*
!      * Loop through the rows, starting at row 1. Row 0 is used for the initial
!      * "upper" row.
!      */
!     for (j = 1; j < rows; j++)
      {
          /*
!          * We'll always start with col 1, and initialize lower row col 0 to j
           */
!         l_cells[0] = j;
!
!         for (i = 1; i < cols; i++)
          {
!             int            c = 0;
!             int            c1 = 0;
!             int            c2 = 0;
!             int            c3 = 0;
!
!             /*
!              * The "cost" value is 0 if the character at the current col
!              * position in the source string, matches the character at the
!              * current row position in the target string; cost is 1 otherwise.
!              */
!             c = (*str_s != *str_t);

!             /*
!              * c1 is upper right cell plus 1
!              */
!             c1 = u_cells[i] + 1;

!             /*
!              * c2 is lower left cell plus 1
!              */
!             c2 = l_cells[i - 1] + 1;

-             /*
-              * c3 is cell diagonally above to the left plus "cost"
-              */
-             c3 = u_cells[i - 1] + c;

!             /*
!              * The lower right cell is set to the minimum of c1, c2, c3
!              */
!             l_cells[i] = (c1 < c2 ? c1 : c2) < c3 ? (c1 < c2 ? c1 : c2) : c3;

!             /*
!              * Increment the pointer to str_s
!              */
!             str_s++;
!         }

-         /*
-          * Lower row now becomes the upper row, and the upper row gets reused
-          * as the new lower row.
-          */
-         tmp = u_cells;
-         u_cells = l_cells;
-         l_cells = tmp;

!         /*
!          * Increment the pointer to str_t
!          */
!         str_t++;

!         /*
!          * Rewind the pointer to str_s
!          */
!         str_s = str_s0;
!     }

!     /*
!      * Because the final value (at position row, col) was swapped from the
!      * lower row to the upper row, that's where we'll find it.
!      */
!     PG_RETURN_INT32(u_cells[cols - 1]);
  }

  /*
   * Calculates the metaphone of an input string.
   * Returns number of characters requested
--- 48,178 ----

  PG_MODULE_MAGIC;

+
  /*
!  * levenshtein_internal - Calculates Levenshtein distance metric
!  *                        between supplied strings. Generally
!  *                        (1, 1, 1) penalty costs suffices common
!  *                        cases, but your mileage may vary.
   */
! static int
! levenshtein_internal(char *s, char *t, int ins_c, int del_c, int sub_c)
  {
!     int         m, n;
!     int        *prev;
!     int        *curr;
!     int         i, j;
!     char    *x;
!     char    *y;
!
!     m = strlen(s);
!     n = strlen(t);
!
!     if (!m)
!         return n;
!     if (!n)
!         return m;

      /*
!      * For security concerns, restrict excessive CPU+RAM usage. (This
!      * implementation uses O(m+n) memory and has O(mn) complexity.)
       */
!     if ((m + n) > (2 * MAX_LEVENSHTEIN_STRLEN))
          ereport(ERROR,
                  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                   errmsg("argument exceeds the maximum length of %d bytes",
                          MAX_LEVENSHTEIN_STRLEN)));

!     /* One more cell for initial segments. */
!     ++m;
!     ++n;

      /*
!      * Instead of building an (m+1)x(n+1) array, we'll use two
!      * different arrays of size m+1 and n+1 for storing accumulated
!      * values in previous calls.
       */
!     prev = palloc((m + n) * sizeof(char));
!     curr = prev + (m * sizeof(char));

!     for (i = 0; i < m; i++)
!         prev[i] = i;

!     for (y = t, j = 1; j < n; y++, j++)
      {
+         int *temp;
+
          /*
!          * First cell must increment sequentially, as we're on the
!          * j'th column of the (m+1)x(n+1) array.
           */
!         curr[0] = j;
!
!         for (x = s, i = 1; i < m; x++, i++)
          {
!             int    ins;
!             int    del;
!             int    sub;
!
!             /* Calculate costs for probable operations. */
!             ins = prev[i] + ins_c;                        /* Insertion    */
!             del = curr[i-1] + del_c;                    /* Deletion     */
!             sub = prev[i-1] + ((*x == *y) ? 0 : sub_c);    /* Substitution */
!
!             /* Take the one with minimum cost. */
!             curr[i] = ins < del ? ins : del;
!             curr[i] = curr[i] < sub ? curr[i] : sub;
!         }

!         /* Swap current row with previous row. */
!         temp = curr;
!         curr = prev;
!         prev = temp;
!     }

!     /*
!      * Because the final value was swapped from the previous row to
!      * the current row, that's where we'll find it.
!      */
!     return prev[m-1];
! }


! PG_FUNCTION_INFO_V1(levenshtein_with_costs);
! Datum
! levenshtein_with_costs(PG_FUNCTION_ARGS)
! {
!     char    *src;
!     char    *dst;
!     int         ins_c;
!     int         del_c;
!     int         sub_c;
!
!     src = _textout(PG_GETARG_DATUM(0));
!     dst = _textout(PG_GETARG_DATUM(1));
!
!     ins_c = PG_GETARG_INT32(2);
!     del_c = PG_GETARG_INT32(3);
!     sub_c = PG_GETARG_INT32(4);

!     PG_RETURN_INT32(levenshtein_internal(src, dst, ins_c, del_c, sub_c));
! }


! PG_FUNCTION_INFO_V1(levenshtein);
! Datum
! levenshtein(PG_FUNCTION_ARGS)
! {
!     char    *src;
!     char    *dst;

!     src = _textout(PG_GETARG_DATUM(0));
!     dst = _textout(PG_GETARG_DATUM(1));

!     PG_RETURN_INT32(levenshtein_internal(src, dst, 1, 1, 1));
  }

+
  /*
   * Calculates the metaphone of an input string.
   * Returns number of characters requested
Index: contrib/fuzzystrmatch/fuzzystrmatch.h
===================================================================
RCS file: /projects/cvsroot/pgsql/contrib/fuzzystrmatch/fuzzystrmatch.h,v
retrieving revision 1.15
diff -c -r1.15 fuzzystrmatch.h
*** contrib/fuzzystrmatch/fuzzystrmatch.h    5 Jan 2007 22:19:18 -0000    1.15
--- contrib/fuzzystrmatch/fuzzystrmatch.h    17 Oct 2007 13:58:09 -0000
***************
*** 58,63 ****
--- 58,64 ----
  /*
   * External declarations
   */
+ extern Datum levenshtein_with_costs(PG_FUNCTION_ARGS);
  extern Datum levenshtein(PG_FUNCTION_ARGS);
  extern Datum metaphone(PG_FUNCTION_ARGS);
  extern Datum soundex(PG_FUNCTION_ARGS);
***************
*** 82,87 ****
--- 83,89 ----
   * Levenshtein
   */
  #define MAX_LEVENSHTEIN_STRLEN        255
+ static int    levenshtein_internal(char *s, char *t, int ins_c, int del_c, int sub_c);


  /*
Index: contrib/fuzzystrmatch/fuzzystrmatch.sql.in
===================================================================
RCS file: /projects/cvsroot/pgsql/contrib/fuzzystrmatch/fuzzystrmatch.sql.in,v
retrieving revision 1.7
diff -c -r1.7 fuzzystrmatch.sql.in
*** contrib/fuzzystrmatch/fuzzystrmatch.sql.in    26 Jan 2005 08:04:04 -0000    1.7
--- contrib/fuzzystrmatch/fuzzystrmatch.sql.in    17 Oct 2007 13:58:09 -0000
***************
*** 1,6 ****
--- 1,10 ----
  -- Adjust this setting to control where the objects get created.
  SET search_path = public;

+ CREATE FUNCTION levenshtein (text,text,int,int,int) RETURNS int
+ AS 'MODULE_PATHNAME','levenshtein_with_costs'
+ LANGUAGE C IMMUTABLE STRICT;
+
  CREATE FUNCTION levenshtein (text,text) RETURNS int
  AS 'MODULE_PATHNAME','levenshtein'
  LANGUAGE C IMMUTABLE STRICT;

В списке pgsql-patches по дате отправления:

Предыдущее
От: "Marko Kreen"
Дата:
Сообщение: Re: [RFC] extended txid docs
Следующее
От: Tom Lane
Дата:
Сообщение: Re: [RFC] extended txid docs