Hi,
Following patch implements configurable penalty costs for levenshtein
distance metric in fuzzystrmatch contrib module.
It doesn't introduce any compatibility issues. Just implements
levenshtein(text,text,int,int,int)
function on top of fuzzystrmatch.c:levenshtein_internal(). At the
same time, levenshtein(text,text) becomes just a wrapper for
levenshtein(text,text,1,1,1) call.
BTW, in current CVS tip
if (cols/rows == 0) ...
checks in fuzzyztrmatch.c:levenshtein() never fail because of
cols/rows = strlen(...) + 1;
FYI.
Regards.
? contrib/fuzzystrmatch/fuzzystrmatch.sql
? contrib/fuzzystrmatch/libfuzzystrmatch.so.0.0
Index: contrib/fuzzystrmatch/README.fuzzystrmatch
===================================================================
RCS file: /projects/cvsroot/pgsql/contrib/fuzzystrmatch/README.fuzzystrmatch,v
retrieving revision 1.10
diff -c -r1.10 README.fuzzystrmatch
*** contrib/fuzzystrmatch/README.fuzzystrmatch 5 Jan 2007 22:19:17 -0000 1.10
--- contrib/fuzzystrmatch/README.fuzzystrmatch 17 Oct 2007 13:58:09 -0000
***************
*** 14,19 ****
--- 14,20 ----
* found at http://www.merriampark.com/ld.htm
* Also looked at levenshtein.c in the PHP 4.0.6 distribution for
* inspiration.
+ * Configurable penalty costs extension is introduced by Volkan YAZICI.
*
* metaphone()
* -----------
***************
*** 116,121 ****
--- 117,158 ----
==================================================================
Name
+ levenshtein -- calculates levenshtein distance with respect
+ to specified costs.
+
+ Synopsis
+
+ levenshtein(text source, text target,
+ int insert_cost, int delete_cost, int substitution_cost)
+
+ Inputs
+
+ source
+ any text string, 255 characters max, NOT NULL
+
+ target
+ any text string, 255 characters max, NOT NULL
+
+ insert_cost
+ cost of extra inserted characters
+
+ delete_cost
+ cost of missing characters
+
+ substitution_cost
+ cost of character substitutions
+
+ Outputs
+
+ Returns int
+
+ Example usage
+
+ select levenshtein('GUMBO','GAMBOL', 1, 1, 3);
+
+ ==================================================================
+ Name
+
metaphone -- calculates the metaphone code of an input string
Synopsis
***************
*** 140,144 ****
select metaphone('GUMBO',4);
==================================================================
! -- Joe Conway
!
--- 177,180 ----
select metaphone('GUMBO',4);
==================================================================
! -- Joe Conway
\ No newline at end of file
Index: contrib/fuzzystrmatch/fuzzystrmatch.c
===================================================================
RCS file: /projects/cvsroot/pgsql/contrib/fuzzystrmatch/fuzzystrmatch.c,v
retrieving revision 1.24
diff -c -r1.24 fuzzystrmatch.c
*** contrib/fuzzystrmatch/fuzzystrmatch.c 13 Feb 2007 18:00:35 -0000 1.24
--- contrib/fuzzystrmatch/fuzzystrmatch.c 17 Oct 2007 13:58:09 -0000
***************
*** 15,20 ****
--- 15,21 ----
* found at http://www.merriampark.com/ld.htm
* Also looked at levenshtein.c in the PHP 4.0.6 distribution for
* inspiration.
+ * Configurable penalty consts extension is introduced by Volkan YAZICI.
*
* metaphone()
* -----------
***************
*** 47,201 ****
PG_MODULE_MAGIC;
/*
! * Calculates Levenshtein Distance between two strings.
! * Uses simplest and fastest cost model only, i.e. assumes a cost of 1 for
! * each deletion, substitution, or insertion.
*/
! PG_FUNCTION_INFO_V1(levenshtein);
! Datum
! levenshtein(PG_FUNCTION_ARGS)
{
! char *str_s;
! char *str_s0;
! char *str_t;
! int cols = 0;
! int rows = 0;
! int *u_cells;
! int *l_cells;
! int *tmp;
! int i;
! int j;
!
! /*
! * Fetch the arguments. str_s is referred to as the "source" cols = length
! * of source + 1 to allow for the initialization column str_t is referred
! * to as the "target", rows = length of target + 1 rows = length of target
! * + 1 to allow for the initialization row
! */
! str_s = DatumGetCString(DirectFunctionCall1(textout, PointerGetDatum(PG_GETARG_TEXT_P(0))));
! str_t = DatumGetCString(DirectFunctionCall1(textout, PointerGetDatum(PG_GETARG_TEXT_P(1))));
!
! cols = strlen(str_s) + 1;
! rows = strlen(str_t) + 1;
/*
! * Restrict the length of the strings being compared to something
! * reasonable because we will have to perform rows * cols calculations. If
! * longer strings need to be compared, increase MAX_LEVENSHTEIN_STRLEN to
! * suit (but within your tolerance for speed and memory usage).
*/
! if ((cols > MAX_LEVENSHTEIN_STRLEN + 1) || (rows > MAX_LEVENSHTEIN_STRLEN + 1))
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("argument exceeds the maximum length of %d bytes",
MAX_LEVENSHTEIN_STRLEN)));
! /*
! * If either rows or cols is 0, the answer is the other value. This makes
! * sense since it would take that many insertions the build a matching
! * string
! */
!
! if (cols == 0)
! PG_RETURN_INT32(rows);
!
! if (rows == 0)
! PG_RETURN_INT32(cols);
/*
! * Allocate two vectors of integers. One will be used for the "upper" row,
! * the other for the "lower" row. Initialize the "upper" row to 0..cols
*/
! u_cells = palloc(sizeof(int) * cols);
! for (i = 0; i < cols; i++)
! u_cells[i] = i;
!
! l_cells = palloc(sizeof(int) * cols);
! /*
! * Use str_s0 to "rewind" the pointer to str_s in the nested for loop
! * below
! */
! str_s0 = str_s;
! /*
! * Loop through the rows, starting at row 1. Row 0 is used for the initial
! * "upper" row.
! */
! for (j = 1; j < rows; j++)
{
/*
! * We'll always start with col 1, and initialize lower row col 0 to j
*/
! l_cells[0] = j;
!
! for (i = 1; i < cols; i++)
{
! int c = 0;
! int c1 = 0;
! int c2 = 0;
! int c3 = 0;
!
! /*
! * The "cost" value is 0 if the character at the current col
! * position in the source string, matches the character at the
! * current row position in the target string; cost is 1 otherwise.
! */
! c = (*str_s != *str_t);
! /*
! * c1 is upper right cell plus 1
! */
! c1 = u_cells[i] + 1;
! /*
! * c2 is lower left cell plus 1
! */
! c2 = l_cells[i - 1] + 1;
- /*
- * c3 is cell diagonally above to the left plus "cost"
- */
- c3 = u_cells[i - 1] + c;
! /*
! * The lower right cell is set to the minimum of c1, c2, c3
! */
! l_cells[i] = (c1 < c2 ? c1 : c2) < c3 ? (c1 < c2 ? c1 : c2) : c3;
! /*
! * Increment the pointer to str_s
! */
! str_s++;
! }
- /*
- * Lower row now becomes the upper row, and the upper row gets reused
- * as the new lower row.
- */
- tmp = u_cells;
- u_cells = l_cells;
- l_cells = tmp;
! /*
! * Increment the pointer to str_t
! */
! str_t++;
! /*
! * Rewind the pointer to str_s
! */
! str_s = str_s0;
! }
! /*
! * Because the final value (at position row, col) was swapped from the
! * lower row to the upper row, that's where we'll find it.
! */
! PG_RETURN_INT32(u_cells[cols - 1]);
}
/*
* Calculates the metaphone of an input string.
* Returns number of characters requested
--- 48,178 ----
PG_MODULE_MAGIC;
+
/*
! * levenshtein_internal - Calculates Levenshtein distance metric
! * between supplied strings. Generally
! * (1, 1, 1) penalty costs suffices common
! * cases, but your mileage may vary.
*/
! static int
! levenshtein_internal(char *s, char *t, int ins_c, int del_c, int sub_c)
{
! int m, n;
! int *prev;
! int *curr;
! int i, j;
! char *x;
! char *y;
!
! m = strlen(s);
! n = strlen(t);
!
! if (!m)
! return n;
! if (!n)
! return m;
/*
! * For security concerns, restrict excessive CPU+RAM usage. (This
! * implementation uses O(m+n) memory and has O(mn) complexity.)
*/
! if ((m + n) > (2 * MAX_LEVENSHTEIN_STRLEN))
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("argument exceeds the maximum length of %d bytes",
MAX_LEVENSHTEIN_STRLEN)));
! /* One more cell for initial segments. */
! ++m;
! ++n;
/*
! * Instead of building an (m+1)x(n+1) array, we'll use two
! * different arrays of size m+1 and n+1 for storing accumulated
! * values in previous calls.
*/
! prev = palloc((m + n) * sizeof(char));
! curr = prev + (m * sizeof(char));
! for (i = 0; i < m; i++)
! prev[i] = i;
! for (y = t, j = 1; j < n; y++, j++)
{
+ int *temp;
+
/*
! * First cell must increment sequentially, as we're on the
! * j'th column of the (m+1)x(n+1) array.
*/
! curr[0] = j;
!
! for (x = s, i = 1; i < m; x++, i++)
{
! int ins;
! int del;
! int sub;
!
! /* Calculate costs for probable operations. */
! ins = prev[i] + ins_c; /* Insertion */
! del = curr[i-1] + del_c; /* Deletion */
! sub = prev[i-1] + ((*x == *y) ? 0 : sub_c); /* Substitution */
!
! /* Take the one with minimum cost. */
! curr[i] = ins < del ? ins : del;
! curr[i] = curr[i] < sub ? curr[i] : sub;
! }
! /* Swap current row with previous row. */
! temp = curr;
! curr = prev;
! prev = temp;
! }
! /*
! * Because the final value was swapped from the previous row to
! * the current row, that's where we'll find it.
! */
! return prev[m-1];
! }
! PG_FUNCTION_INFO_V1(levenshtein_with_costs);
! Datum
! levenshtein_with_costs(PG_FUNCTION_ARGS)
! {
! char *src;
! char *dst;
! int ins_c;
! int del_c;
! int sub_c;
!
! src = _textout(PG_GETARG_DATUM(0));
! dst = _textout(PG_GETARG_DATUM(1));
!
! ins_c = PG_GETARG_INT32(2);
! del_c = PG_GETARG_INT32(3);
! sub_c = PG_GETARG_INT32(4);
! PG_RETURN_INT32(levenshtein_internal(src, dst, ins_c, del_c, sub_c));
! }
! PG_FUNCTION_INFO_V1(levenshtein);
! Datum
! levenshtein(PG_FUNCTION_ARGS)
! {
! char *src;
! char *dst;
! src = _textout(PG_GETARG_DATUM(0));
! dst = _textout(PG_GETARG_DATUM(1));
! PG_RETURN_INT32(levenshtein_internal(src, dst, 1, 1, 1));
}
+
/*
* Calculates the metaphone of an input string.
* Returns number of characters requested
Index: contrib/fuzzystrmatch/fuzzystrmatch.h
===================================================================
RCS file: /projects/cvsroot/pgsql/contrib/fuzzystrmatch/fuzzystrmatch.h,v
retrieving revision 1.15
diff -c -r1.15 fuzzystrmatch.h
*** contrib/fuzzystrmatch/fuzzystrmatch.h 5 Jan 2007 22:19:18 -0000 1.15
--- contrib/fuzzystrmatch/fuzzystrmatch.h 17 Oct 2007 13:58:09 -0000
***************
*** 58,63 ****
--- 58,64 ----
/*
* External declarations
*/
+ extern Datum levenshtein_with_costs(PG_FUNCTION_ARGS);
extern Datum levenshtein(PG_FUNCTION_ARGS);
extern Datum metaphone(PG_FUNCTION_ARGS);
extern Datum soundex(PG_FUNCTION_ARGS);
***************
*** 82,87 ****
--- 83,89 ----
* Levenshtein
*/
#define MAX_LEVENSHTEIN_STRLEN 255
+ static int levenshtein_internal(char *s, char *t, int ins_c, int del_c, int sub_c);
/*
Index: contrib/fuzzystrmatch/fuzzystrmatch.sql.in
===================================================================
RCS file: /projects/cvsroot/pgsql/contrib/fuzzystrmatch/fuzzystrmatch.sql.in,v
retrieving revision 1.7
diff -c -r1.7 fuzzystrmatch.sql.in
*** contrib/fuzzystrmatch/fuzzystrmatch.sql.in 26 Jan 2005 08:04:04 -0000 1.7
--- contrib/fuzzystrmatch/fuzzystrmatch.sql.in 17 Oct 2007 13:58:09 -0000
***************
*** 1,6 ****
--- 1,10 ----
-- Adjust this setting to control where the objects get created.
SET search_path = public;
+ CREATE FUNCTION levenshtein (text,text,int,int,int) RETURNS int
+ AS 'MODULE_PATHNAME','levenshtein_with_costs'
+ LANGUAGE C IMMUTABLE STRICT;
+
CREATE FUNCTION levenshtein (text,text) RETURNS int
AS 'MODULE_PATHNAME','levenshtein'
LANGUAGE C IMMUTABLE STRICT;