Re: Fuzzy matching?

Поиск
Список
Период
Сортировка
От Timothy H. Keitt
Тема Re: Fuzzy matching?
Дата
Msg-id 3B676905.90208@keittlab.bio.sunysb.edu
обсуждение исходный текст
Ответ на Fuzzy matching?  ("Josh Berkus" <josh@agliodbs.com>)
Список pgsql-sql
I posted this many moons ago to pgsql-hackers.  'Guess nobody noticed.

Tim

Josh Berkus wrote:

>Folks,
>
>For many of my programs, it would be extremely useful to have some form
>of "fuzzy matching" for VARCHAR fields.  There are two kinds of fuzzy
>matching for words that I know of:
>
>1. Phonetic matching, which would be nice but will have to wait for
>someone's $100,000 project;
>
>2. Textual mathcing, which I will outline below.
>
>The way textual fuzzy matching should work is as follows:
>The developer supplies two VARCHARs to match and a number/percent of
>character mis-match that is acceptable:
>
>Fuzzy_match('Thornton','Tornton',1)
>
>And the fuzzy_match should return True if the two phrases are no more
>than that number of characters different.  Thus, we should get:
>
>fuzzy_match('Thornton','Tornton',1) = TRUE
>fuzzy_match('Thornton','Torntin',1) = FALSE
>fuzzy_match('Thornton','Torntin',2) = TRUE
>
>Unfortunately, I cannot think of a way to make this happen in a function
>without cycling through all the possible permutations of characters for
>both words or doing some character-by-character comparison with
>elaborate logic for placement.  Either of these approaches would be very
>slow, and completely unsuitable for column comparisons on large tables.
>
>Can anyone suggest some shortcuts here?  Perhaps using pl/perl or
>something similar?
>
>Grazie!
>
>-Josh Berkus
>
>______AGLIO DATABASE SOLUTIONS___________________________
>                                       Josh Berkus
>  Complete information technology      josh@agliodbs.com
>   and data management solutions       (415) 565-7293
>  for law firms, small businesses        fax 621-2533
>    and non-profit organizations.      San Francisco
>
>
>------------------------------------------------------------------------
>
>
>
>------------------------------------------------------------------------
>
>
>
>------------------------------------------------------------------------
>
>
>
>------------------------------------------------------------------------
>
>
>---------------------------(end of broadcast)---------------------------
>TIP 2: you can get off all lists at once with the unregister command
>    (send "unregister YourEmailAddressHere" to majordomo@postgresql.org)
>
> Part 1.2
>
> Content-Type:
>
> text/plain
> Content-Encoding:
>
> base64
>
>
> ------------------------------------------------------------------------
> Part 1.3
>
> Content-Type:
>
> text/plain
> Content-Encoding:
>
> base64
>
>
> ------------------------------------------------------------------------
> Part 1.4
>
> Content-Type:
>
> text/plain
> Content-Encoding:
>
> base64
>
>
> ------------------------------------------------------------------------
> Part 1.5
>
> Content-Type:
>
> text/plain
> Content-Encoding:
>
> binary
>
>

--
Timothy H. Keitt
Department of Ecology and Evolution
State University of New York at Stony Brook
Stony Brook, New York 11794 USA
Phone: 631-632-1101, FAX: 631-632-7626
http://life.bio.sunysb.edu/ee/keitt/


/* Copyright (c) 2000 Timothy H. Keitt */
/* Licence: GPL version 2 or higher (see http://www.gnu.org/) */

#include "postgres.h"

#define STATIC_SIZE 32

/* This must be changed if STATIC_SIZE is changed */
static int4 static_array[STATIC_SIZE][STATIC_SIZE] = {
  {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
   19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
  {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
  {2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
  {3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
  {4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
  {5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
  {6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
  {7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
  {8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
  {9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
  {10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
  {11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
  {12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
  {13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
  {14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
  {15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
  {16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
  {17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
  {18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
  {19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
  {20, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
  {21, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
  {22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
  {23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
  {24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
  {25, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
  {26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
  {27, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
  {28, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
  {29, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
  {30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
  {31, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0}};

/* Fast version for strings up to STATIC_SIZE characters */
int4
levenshtein_distance(text *s1, text *s2) {
  register int i, j, l, m, n, add, rows, columns;

  columns = VARSIZE(s1) - VARHDRSZ + 1;
  rows = VARSIZE(s2) - VARHDRSZ + 1;

  /* Use slower dynamically allocated version for larger strings */
  if (columns > STATIC_SIZE || rows > STATIC_SIZE)
    return levenshtein_distance_dynamic(s1, s2);

  for (j=1; j<rows; ++j) {
    for (i=1; i<columns; ++i) {

      if (VARDATA(s1)[i-1] == VARDATA(s2)[j-1])    add=0; else add=1;

      m = 1 + static_array[j-1][i];
      l = 1 + static_array[j][i-1];
      n = add + static_array[j-1][i-1];

      static_array[j][i] = (m < l ? (m < n ? m : n): (l < n ? l : n));

    } /* next column (i) */
  } /* next row (j) */

  return static_array[rows-1][columns-1];

} /* end levenshtein_distance(text *s1, text *s2) */

int4
levenshtein_distance_dynamic(text *s1, text *s2) {
  register int i, j, l, m, n, add, rows, columns, out;
  int4 *dynamic_array;

  columns = VARSIZE(s1) - VARHDRSZ + 1;
  rows = VARSIZE(s2) - VARHDRSZ + 1;

  dynamic_array = (int4 *) palloc(rows * columns * sizeof(int4));
  if (dynamic_array == NULL) return -1;

  for (i=0; i<columns; ++i) dynamic_array[i] = i;
  for (j=0; j<rows; ++j) dynamic_array[j*columns] = j;

  for (j=1; j<rows; ++j) {
    for (i=1; i<columns; ++i) {

      if (VARDATA(s1)[i-1] == VARDATA(s2)[j-1])    add=0; else add=1;

      m = 1 + dynamic_array[(j-1)*columns+i];
      l = 1 + dynamic_array[j*columns+i-1];
      n = add + dynamic_array[(j-1)*columns+i-1];

      dynamic_array[j*columns+i] =
    (m < l ? (m < n ? m : n): (l < n ? l : n));

    } /* next column (i) */
  } /* next row (j) */

  out = dynamic_array[(rows-1)*columns+columns-1];

  pfree(dynamic_array);

  return out;

} /* end levenshtein_distance_dynamic(text *s1, text *s2) */



В списке pgsql-sql по дате отправления:

Предыдущее
От: Christopher Sawtell
Дата:
Сообщение: Re: Fuzzy matching?
Следующее
От: "Timothy H. Keitt"
Дата:
Сообщение: Re: Fuzzy matching?