Re: like/ilike improvements
От | Andrew Dunstan |
---|---|
Тема | Re: like/ilike improvements |
Дата | |
Msg-id | 46F278BC.1080508@dunslane.net обсуждение исходный текст |
Ответ на | Re: like/ilike improvements ("Guillaume Smet" <guillaume.smet@gmail.com>) |
Ответы |
Re: like/ilike improvements
(Andrew Dunstan <andrew@dunslane.net>)
Re: like/ilike improvements ("Guillaume Smet" <guillaume.smet@gmail.com>) |
Список | pgsql-hackers |
Guillaume Smet wrote:
> On 9/20/07, Andrew Dunstan <andrew@dunslane.net> wrote:
>
>> Can you retry both sets of tests but this time in C locale? The lower()
>> code works differently in C locale, and it might be that we need to look
>> at tweaking just one case.
>>
>
>

Please try the attached patch, which goes back to using a special case for single-byte ILIKE. I want to make sure that at the very least we don't cause a performance regression with the code done this release. I can't see an obvious way around the problem for multi-byte case - lower() then requires converting to and from wchar, and I don't see a way of avoiding calling lower(). If this is a major blocker I would suggest you look at an alternative to using ILIKE for your UTF8 data.

cheers

andrew

Index: src/backend/utils/adt/like.c =================================================================== RCS file: /cvsroot/pgsql/src/backend/utils/adt/like.c,v retrieving revision 1.69 diff -c -r1.69 like.c *** src/backend/utils/adt/like.c 2 Jun 2007 02:03:42 -0000 1.69 --- src/backend/utils/adt/like.c 20 Sep 2007 13:12:39 -0000 *************** *** 36,41 **** --- 36,43 ---- static int UTF8_MatchText(char *t, int tlen, char *p, int plen); + static int SB_IMatchText(char *t, int tlen, char *p, int plen); + static int GenericMatchText(char *s, int slen, char* p, int plen); static int Generic_Text_IC_like(text *str, text *pat); *************** *** 104,109 **** --- 106,117 ---- #include "like_match.c" + /* setup to compile like_match.c for single byte case insensitive matches */ + #define MATCH_LOWER + #define NextChar(p, plen) NextByte((p), (plen)) + #define MatchText SB_IMatchText + + #include "like_match.c" /* setup to compile like_match.c for UTF8 encoding, using fast NextChar */ *************** *** 132,146 **** int slen, plen; ! /* Force inputs to lower case to achieve case insensitivity */ ! str = DatumGetTextP(DirectFunctionCall1(lower, PointerGetDatum(str))); ! 
pat = DatumGetTextP(DirectFunctionCall1(lower, PointerGetDatum(pat))); ! s = VARDATA(str); ! slen = (VARSIZE(str) - VARHDRSZ); ! p = VARDATA(pat); ! plen = (VARSIZE(pat) - VARHDRSZ); ! return GenericMatchText(s, slen, p, plen); } /* --- 140,171 ---- int slen, plen; ! /* For efficiency reasons, in the single byte case we don't call ! * lower() on the pattern and text, but instead call to_lower on each ! * character. In the multi-byte case we don't have much choice :-( ! */ ! if (pg_database_encoding_max_length() > 1) ! { ! pat = DatumGetTextP(DirectFunctionCall1(lower, PointerGetDatum(pat))); ! p = VARDATA(pat); ! plen = (VARSIZE(pat) - VARHDRSZ); ! str = DatumGetTextP(DirectFunctionCall1(lower, PointerGetDatum(str))); ! s = VARDATA(str); ! slen = (VARSIZE(str) - VARHDRSZ); ! if (GetDatabaseEncoding() == PG_UTF8) ! return UTF8_MatchText(s, slen, p, plen); ! else ! return MB_MatchText(s, slen, p, plen); ! } ! else ! { ! p = VARDATA(pat); ! plen = (VARSIZE(pat) - VARHDRSZ); ! s = VARDATA(str); ! slen = (VARSIZE(str) - VARHDRSZ); ! return SB_IMatchText(s, slen, p, plen); ! 
} } /* Index: src/backend/utils/adt/like_match.c =================================================================== RCS file: /cvsroot/pgsql/src/backend/utils/adt/like_match.c,v retrieving revision 1.16 diff -c -r1.16 like_match.c *** src/backend/utils/adt/like_match.c 2 Jun 2007 02:03:42 -0000 1.16 --- src/backend/utils/adt/like_match.c 20 Sep 2007 13:12:39 -0000 *************** *** 13,18 **** --- 13,19 ---- * NextChar * MatchText - to name of function wanted * do_like_escape - name of function if wanted - needs CHAREQ and CopyAdvChar + * MATCH_LOWER - define iff using to_lower on text chars * * Copyright (c) 1996-2007, PostgreSQL Global Development Group * *************** *** 68,73 **** --- 69,80 ---- *-------------------- */ + #ifdef MATCH_LOWER + #define TCHAR(t) tolower((t)) + #else + #define TCHAR(t) (t) + #endif + static int MatchText(char *t, int tlen, char *p, int plen) { *************** *** 143,155 **** else { ! char firstpat = *p ; if (*p == '\\') { if (plen < 2) return LIKE_FALSE; ! firstpat = p[1]; } while (tlen > 0) --- 150,162 ---- else { ! char firstpat = TCHAR(*p) ; if (*p == '\\') { if (plen < 2) return LIKE_FALSE; ! firstpat = TCHAR(p[1]); } while (tlen > 0) *************** *** 158,164 **** * Optimization to prevent most recursion: don't recurse * unless first pattern byte matches first text byte. */ ! if (*t == firstpat) { int matched = MatchText(t, tlen, p, plen); --- 165,171 ---- * Optimization to prevent most recursion: don't recurse * unless first pattern byte matches first text byte. */ ! if (TCHAR(*t) == firstpat) { int matched = MatchText(t, tlen, p, plen); *************** *** 183,189 **** NextByte(p, plen); continue; } ! else if (*t != *p) { /* * Not the single-character wildcard and no explicit match? Then --- 190,196 ---- NextByte(p, plen); continue; } ! else if (TCHAR(*t) != TCHAR(*p)) { /* * Not the single-character wildcard and no explicit match? 
Then *************** *** 338,340 **** --- 345,352 ---- #undef do_like_escape #endif + #undef TCHAR + + #ifdef MATCH_LOWER + #undef MATCH_LOWER + #endif
В списке pgsql-hackers по дате отправления: