Re: Request for review: tsearch2 patch

Поиск
Список
Период
Сортировка
От Teodor Sigaev
Тема Re: Request for review: tsearch2 patch
Дата
Msg-id 45A7A623.5010700@sigaev.ru
обсуждение исходный текст
Ответ на Re: Request for review: tsearch2 patch  (Tatsuo Ishii <ishii@sraoss.co.jp>)
Ответы Re: Request for review: tsearch2 patch  (Tatsuo Ishii <ishii@postgresql.org>)
Список pgsql-hackers
> Yeah, it's a workaround. Since there's no concept other than
> alpha/numeric/latin in tsearch2, Asian characters have to fall into
> one of them.

Ok, I see.

Please test the attached patch. If it works, I'll commit it on Monday to the
HEAD and 8.2 branches.

PS. Magnus, may I ask you to test under Windows? Thank you.

--
Teodor Sigaev                                   E-mail: teodor@sigaev.ru
                                                    WWW: http://www.sigaev.ru/
diff -c -r -N ../tsearch2.orig/ts_locale.c ./ts_locale.c
*** ../tsearch2.orig/ts_locale.c    Fri Jan 12 10:53:11 2007
--- ./ts_locale.c    Fri Jan 12 18:10:27 2007
***************
*** 12,24 ****
  size_t
  wchar2char(char *to, const wchar_t *from, size_t len)
  {
      if (GetDatabaseEncoding() == PG_UTF8)
      {
          int            r;

-         if (len == 0)
-             return 0;
-
          r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, len,
                                  NULL, NULL);

--- 12,24 ----
  size_t
  wchar2char(char *to, const wchar_t *from, size_t len)
  {
+     if (len == 0)
+         return 0;
+
      if (GetDatabaseEncoding() == PG_UTF8)
      {
          int            r;

          r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, len,
                                  NULL, NULL);

***************
*** 34,50 ****

      return wcstombs(to, from, len);
  }

  size_t
  char2wchar(wchar_t *to, const char *from, size_t len)
  {
      if (GetDatabaseEncoding() == PG_UTF8)
      {
          int            r;

-         if (len == 0)
-             return 0;
-
          r = MultiByteToWideChar(CP_UTF8, 0, from, len, to, len);

          if (!r)
--- 34,52 ----

      return wcstombs(to, from, len);
  }
+ #endif   /* WIN32 */

  size_t
  char2wchar(wchar_t *to, const char *from, size_t len)
  {
+     if (len == 0)
+         return 0;
+
+ #ifdef WIN32
      if (GetDatabaseEncoding() == PG_UTF8)
      {
          int            r;

          r = MultiByteToWideChar(CP_UTF8, 0, from, len, to, len);

          if (!r)
***************
*** 60,88 ****

          return r;
      }

      return mbstowcs(to, from, len);
  }
- #endif   /* WIN32 */

  int
  _t_isalpha(const char *ptr)
  {
!     wchar_t        character;

!     char2wchar(&character, ptr, 1);

!     return iswalpha((wint_t) character);
  }

  int
  _t_isprint(const char *ptr)
  {
!     wchar_t        character;

!     char2wchar(&character, ptr, 1);

!     return iswprint((wint_t) character);
  }
  #endif   /* TS_USE_WIDE */

--- 62,105 ----

          return r;
      }
+     else
+ #endif /* WIN32 */
+     if ( lc_ctype_is_c() )
+     {
+         /*
+          * pg_mb2wchar_with_len always adds trailing '\0', so
+          * 'to' should be allocated with sufficient space
+          */
+         return pg_mb2wchar_with_len(from, (pg_wchar *)to, len);
+     }

      return mbstowcs(to, from, len);
  }

  int
  _t_isalpha(const char *ptr)
  {
!     wchar_t        character[2];
!
!     if (lc_ctype_is_c())
!         return isalpha(TOUCHAR(ptr));

!     char2wchar(character, ptr, 1);

!     return iswalpha((wint_t) *character);
  }

  int
  _t_isprint(const char *ptr)
  {
!     wchar_t        character[2];
!
!     if (lc_ctype_is_c())
!         return isprint(TOUCHAR(ptr));

!     char2wchar(character, ptr, 1);

!     return iswprint((wint_t) *character);
  }
  #endif   /* TS_USE_WIDE */

***************
*** 126,132 ****
          if ( wlen < 0 )
              ereport(ERROR,
                      (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
!                      errmsg("transalation failed from server encoding to wchar_t")));

          Assert(wlen<=len);
          wstr[wlen] = 0;
--- 143,149 ----
          if ( wlen < 0 )
              ereport(ERROR,
                      (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
!                      errmsg("translation failed from server encoding to wchar_t")));

          Assert(wlen<=len);
          wstr[wlen] = 0;
***************
*** 152,158 ****
          if ( wlen < 0 )
              ereport(ERROR,
                      (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
!                      errmsg("transalation failed from wchar_t to server encoding %d", errno)));
          Assert(wlen<=len);
          out[wlen]='\0';
      }
--- 169,175 ----
          if ( wlen < 0 )
              ereport(ERROR,
                      (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
!                      errmsg("translation failed from wchar_t to server encoding %d", errno)));
          Assert(wlen<=len);
          out[wlen]='\0';
      }
diff -c -r -N ../tsearch2.orig/ts_locale.h ./ts_locale.h
*** ../tsearch2.orig/ts_locale.h    Fri Jan 12 10:53:11 2007
--- ./ts_locale.h    Fri Jan 12 18:10:19 2007
***************
*** 30,45 ****
  #define TOUCHAR(x)    (*((unsigned char*)(x)))

  #ifdef TS_USE_WIDE

  #ifdef WIN32

  size_t        wchar2char(char *to, const wchar_t *from, size_t len);
! size_t        char2wchar(wchar_t *to, const char *from, size_t len);
  #else                            /* WIN32 */

! /* correct mbstowcs */
! #define char2wchar mbstowcs
  #define wchar2char wcstombs
  #endif   /* WIN32 */

  #define t_isdigit(x)    ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
--- 30,46 ----
  #define TOUCHAR(x)    (*((unsigned char*)(x)))

  #ifdef TS_USE_WIDE
+ size_t        char2wchar(wchar_t *to, const char *from, size_t len);

  #ifdef WIN32

  size_t        wchar2char(char *to, const wchar_t *from, size_t len);
!
  #else                            /* WIN32 */

! /* correct wcstombs */
  #define wchar2char wcstombs
+
  #endif   /* WIN32 */

  #define t_isdigit(x)    ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
***************
*** 55,64 ****
   */
  #define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false )

! #define COPYCHAR(d,s)    do {                \
!     int lll = pg_mblen( s );            \
!                             \
!     while( lll-- )                    \
          TOUCHAR((d)+lll) = TOUCHAR((s)+lll);    \
  } while(0)

--- 56,65 ----
   */
  #define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false )

! #define COPYCHAR(d,s)    do {                    \
!     int lll = pg_mblen( s );                    \
!                                                 \
!     while( lll-- )                                \
          TOUCHAR((d)+lll) = TOUCHAR((s)+lll);    \
  } while(0)

diff -c -r -N ../tsearch2.orig/tsearch2.patch ./tsearch2.patch
*** ../tsearch2.orig/tsearch2.patch    Thu Jan  1 03:00:00 1970
--- ./tsearch2.patch    Fri Jan 12 18:12:30 2007
***************
*** 0 ****
--- 1,243 ----
+ diff -c -r -N ../tsearch2.orig/ts_locale.c ./ts_locale.c
+ *** ../tsearch2.orig/ts_locale.c    Fri Jan 12 10:53:11 2007
+ --- ./ts_locale.c    Fri Jan 12 18:10:27 2007
+ ***************
+ *** 12,24 ****
+   size_t
+   wchar2char(char *to, const wchar_t *from, size_t len)
+   {
+       if (GetDatabaseEncoding() == PG_UTF8)
+       {
+           int            r;
+
+ -         if (len == 0)
+ -             return 0;
+ -
+           r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, len,
+                                   NULL, NULL);
+
+ --- 12,24 ----
+   size_t
+   wchar2char(char *to, const wchar_t *from, size_t len)
+   {
+ +     if (len == 0)
+ +         return 0;
+ +
+       if (GetDatabaseEncoding() == PG_UTF8)
+       {
+           int            r;
+
+           r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, len,
+                                   NULL, NULL);
+
+ ***************
+ *** 34,50 ****
+
+       return wcstombs(to, from, len);
+   }
+
+   size_t
+   char2wchar(wchar_t *to, const char *from, size_t len)
+   {
+       if (GetDatabaseEncoding() == PG_UTF8)
+       {
+           int            r;
+
+ -         if (len == 0)
+ -             return 0;
+ -
+           r = MultiByteToWideChar(CP_UTF8, 0, from, len, to, len);
+
+           if (!r)
+ --- 34,52 ----
+
+       return wcstombs(to, from, len);
+   }
+ + #endif   /* WIN32 */
+
+   size_t
+   char2wchar(wchar_t *to, const char *from, size_t len)
+   {
+ +     if (len == 0)
+ +         return 0;
+ +
+ + #ifdef WIN32
+       if (GetDatabaseEncoding() == PG_UTF8)
+       {
+           int            r;
+
+           r = MultiByteToWideChar(CP_UTF8, 0, from, len, to, len);
+
+           if (!r)
+ ***************
+ *** 60,88 ****
+
+           return r;
+       }
+
+       return mbstowcs(to, from, len);
+   }
+ - #endif   /* WIN32 */
+
+   int
+   _t_isalpha(const char *ptr)
+   {
+ !     wchar_t        character;
+
+ !     char2wchar(&character, ptr, 1);
+
+ !     return iswalpha((wint_t) character);
+   }
+
+   int
+   _t_isprint(const char *ptr)
+   {
+ !     wchar_t        character;
+
+ !     char2wchar(&character, ptr, 1);
+
+ !     return iswprint((wint_t) character);
+   }
+   #endif   /* TS_USE_WIDE */
+
+ --- 62,105 ----
+
+           return r;
+       }
+ +     else
+ + #endif /* WIN32 */
+ +     if ( lc_ctype_is_c() )
+ +     {
+ +         /*
+ +          * pg_mb2wchar_with_len always adds trailing '\0', so
+ +          * 'to' should be allocated with sufficient space
+ +          */
+ +         return pg_mb2wchar_with_len(from, (pg_wchar *)to, len);
+ +     }
+
+       return mbstowcs(to, from, len);
+   }
+
+   int
+   _t_isalpha(const char *ptr)
+   {
+ !     wchar_t        character[2];
+ !
+ !     if (lc_ctype_is_c())
+ !         return isalpha(TOUCHAR(ptr));
+
+ !     char2wchar(character, ptr, 1);
+
+ !     return iswalpha((wint_t) *character);
+   }
+
+   int
+   _t_isprint(const char *ptr)
+   {
+ !     wchar_t        character[2];
+ !
+ !     if (lc_ctype_is_c())
+ !         return isprint(TOUCHAR(ptr));
+
+ !     char2wchar(character, ptr, 1);
+
+ !     return iswprint((wint_t) *character);
+   }
+   #endif   /* TS_USE_WIDE */
+
+ ***************
+ *** 126,132 ****
+           if ( wlen < 0 )
+               ereport(ERROR,
+                       (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+ !                      errmsg("transalation failed from server encoding to wchar_t")));
+
+           Assert(wlen<=len);
+           wstr[wlen] = 0;
+ --- 143,149 ----
+           if ( wlen < 0 )
+               ereport(ERROR,
+                       (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+ !                      errmsg("translation failed from server encoding to wchar_t")));
+
+           Assert(wlen<=len);
+           wstr[wlen] = 0;
+ ***************
+ *** 152,158 ****
+           if ( wlen < 0 )
+               ereport(ERROR,
+                       (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+ !                      errmsg("transalation failed from wchar_t to server encoding %d", errno)));
+           Assert(wlen<=len);
+           out[wlen]='\0';
+       }
+ --- 169,175 ----
+           if ( wlen < 0 )
+               ereport(ERROR,
+                       (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+ !                      errmsg("translation failed from wchar_t to server encoding %d", errno)));
+           Assert(wlen<=len);
+           out[wlen]='\0';
+       }
+ diff -c -r -N ../tsearch2.orig/ts_locale.h ./ts_locale.h
+ *** ../tsearch2.orig/ts_locale.h    Fri Jan 12 10:53:11 2007
+ --- ./ts_locale.h    Fri Jan 12 18:10:19 2007
+ ***************
+ *** 30,45 ****
+   #define TOUCHAR(x)    (*((unsigned char*)(x)))
+
+   #ifdef TS_USE_WIDE
+
+   #ifdef WIN32
+
+   size_t        wchar2char(char *to, const wchar_t *from, size_t len);
+ ! size_t        char2wchar(wchar_t *to, const char *from, size_t len);
+   #else                            /* WIN32 */
+
+ ! /* correct mbstowcs */
+ ! #define char2wchar mbstowcs
+   #define wchar2char wcstombs
+   #endif   /* WIN32 */
+
+   #define t_isdigit(x)    ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
+ --- 30,46 ----
+   #define TOUCHAR(x)    (*((unsigned char*)(x)))
+
+   #ifdef TS_USE_WIDE
+ + size_t        char2wchar(wchar_t *to, const char *from, size_t len);
+
+   #ifdef WIN32
+
+   size_t        wchar2char(char *to, const wchar_t *from, size_t len);
+ !
+   #else                            /* WIN32 */
+
+ ! /* correct wcstombs */
+   #define wchar2char wcstombs
+ +
+   #endif   /* WIN32 */
+
+   #define t_isdigit(x)    ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
+ ***************
+ *** 55,64 ****
+    */
+   #define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false )
+
+ ! #define COPYCHAR(d,s)    do {                \
+ !     int lll = pg_mblen( s );            \
+ !                             \
+ !     while( lll-- )                    \
+           TOUCHAR((d)+lll) = TOUCHAR((s)+lll);    \
+   } while(0)
+
+ --- 56,65 ----
+    */
+   #define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false )
+
+ ! #define COPYCHAR(d,s)    do {                    \
+ !     int lll = pg_mblen( s );                    \
+ !                                                 \
+ !     while( lll-- )                                \
+           TOUCHAR((d)+lll) = TOUCHAR((s)+lll);    \
+   } while(0)
+
diff -c -r -N ../tsearch2.orig/wordparser/parser.c ./wordparser/parser.c
*** ../tsearch2.orig/wordparser/parser.c    Fri Jan 12 10:53:11 2007
--- ./wordparser/parser.c    Fri Jan 12 18:10:38 2007
***************
*** 40,55 ****
  #ifdef TS_USE_WIDE

      /*
!      * Use wide char code only when max encoding length > 1 and ctype != C.
!      * Some operating systems fail with multi-byte encodings and a C locale.
!      * Also, for a C locale there is no need to process as multibyte. From
!      * backend/utils/adt/oracle_compat.c Teodor
       */

!     if (prs->charmaxlen > 1 && !lc_ctype_is_c())
      {
          prs->usewide = true;
!         prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr);
          prs->lenwstr = char2wchar(prs->wstr, prs->str, prs->lenstr);
      }
      else
--- 40,52 ----
  #ifdef TS_USE_WIDE

      /*
!      * Use wide char code only when max encoding length > 1.
       */

!     if (prs->charmaxlen > 1)
      {
          prs->usewide = true;
!         prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr+1));
          prs->lenwstr = char2wchar(prs->wstr, prs->str, prs->lenstr);
      }
      else
***************
*** 83,107 ****

  /*
   * defining support function, equvalent is* macroses, but
!  * working with any possible encodings and locales
   */

  #ifdef TS_USE_WIDE

! #define p_iswhat(type)                                        \
! static int                                            \
! p_is##type(TParser *prs) {                                    \
!     Assert( prs->state );                                    \
!     return ( ( prs->usewide ) ? isw##type( (wint_t)*( prs->wstr + prs->state->poschar ) ) : \
!         is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ) );        \
! }    \
!                                                 \
! static int                                            \
! p_isnot##type(TParser *prs) {                                    \
!     return !p_is##type(prs);                                \
  }



  /* p_iseq should be used only for ascii symbols */

--- 80,178 ----

  /*
   * defining support function, equvalent is* macroses, but
!  * working with any possible encodings and locales. Note,
!  * that with multibyte encoding and C-locale isw* function may fail
!  * or give wrong result. Note 2: multibyte encoding and C-locale
!  * often are used for Asian languages.
   */

  #ifdef TS_USE_WIDE

! #define p_iswhat(type)                                                        \
! static int                                                                    \
! p_is##type(TParser *prs) {                                                    \
!     Assert( prs->state );                                                    \
!     if ( prs->usewide )                                                        \
!     {                                                                        \
!         if ( lc_ctype_is_c() )                                                \
!             return is##type( 0xff & *( prs->wstr + prs->state->poschar) );    \
!                                                                             \
!         return isw##type( *(wint_t*)( prs->wstr + prs->state->poschar ) );    \
!     }                                                                        \
!                                                                             \
!     return is##type( *(unsigned char*)( prs->str + prs->state->posbyte ) );    \
! }                                                                            \
!                                                                             \
! static int                                                                    \
! p_isnot##type(TParser *prs) {                                                \
!     return !p_is##type(prs);                                                \
  }

+ static int
+ p_isalnum(TParser *prs)
+ {
+     Assert( prs->state );
+
+     if (prs->usewide)
+     {
+         if (lc_ctype_is_c())
+         {
+             unsigned int c = *(unsigned int*)(prs->wstr + prs->state->poschar);
+
+             /*
+              * any non-ascii symbol with multibyte encoding
+              * with C-locale is an alpha character
+              */
+             if ( c > 0x7f )
+                 return 1;
+
+             return isalnum(0xff & c);
+         }
+
+         return iswalnum( (wint_t)*( prs->wstr + prs->state->poschar));
+     }
+
+     return isalnum( *(unsigned char*)( prs->str + prs->state->posbyte ));
+ }

+ static int
+ p_isnotalnum(TParser *prs)
+ {
+     return !p_isalnum(prs);
+ }
+
+ static int
+ p_isalpha(TParser *prs)
+ {
+     Assert( prs->state );
+
+     if (prs->usewide)
+     {
+         if (lc_ctype_is_c())
+         {
+             unsigned int c = *(prs->wstr + prs->state->poschar);
+
+             /*
+              * any non-ascii symbol with multibyte encoding
+              * with C-locale is an alpha character
+              */
+             if ( c > 0x7f )
+                 return 1;
+
+             return isalpha(0xff & c);
+         }
+
+         return iswalpha( (wint_t)*( prs->wstr + prs->state->poschar));
+     }
+
+     return isalpha( *(unsigned char*)( prs->str + prs->state->posbyte ));
+ }
+
+ static int
+ p_isnotalpha(TParser *prs)
+ {
+     return !p_isalpha(prs);
+ }

  /* p_iseq should be used only for ascii symbols */

***************
*** 111,128 ****
      Assert(prs->state);
      return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
  }
  #else                            /* TS_USE_WIDE */

! #define p_iswhat(type)                                        \
! static int                                            \
! p_is##type(TParser *prs) {                                    \
!     Assert( prs->state );                                    \
!     return is##type( (unsigned char)*( prs->str + prs->state->posbyte ) );            \
! }    \
!                                                 \
! static int                                            \
! p_isnot##type(TParser *prs) {                                    \
!     return !p_is##type(prs);                                \
  }


--- 182,200 ----
      Assert(prs->state);
      return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
  }
+
  #else                            /* TS_USE_WIDE */

! #define p_iswhat(type)                                                        \
! static int                                                                    \
! p_is##type(TParser *prs) {                                                    \
!     Assert( prs->state );                                                    \
!     return is##type( (unsigned char)*( prs->str + prs->state->posbyte ) );    \
! }                                                                            \
!                                                                             \
! static int                                                                    \
! p_isnot##type(TParser *prs) {                                                \
!     return !p_is##type(prs);                                                \
  }


***************
*** 132,141 ****
      Assert(prs->state);
      return (*(prs->str + prs->state->posbyte) == c) ? 1 : 0;
  }
- #endif   /* TS_USE_WIDE */

  p_iswhat(alnum)
  p_iswhat(alpha)
  p_iswhat(digit)
  p_iswhat(lower)
  p_iswhat(print)
--- 204,215 ----
      Assert(prs->state);
      return (*(prs->str + prs->state->posbyte) == c) ? 1 : 0;
  }

  p_iswhat(alnum)
  p_iswhat(alpha)
+
+ #endif   /* TS_USE_WIDE */
+
  p_iswhat(digit)
  p_iswhat(lower)
  p_iswhat(print)

В списке pgsql-hackers по дате отправления:

Предыдущее
От: Magnus Hagander
Дата:
Сообщение: Re: [GENERAL] Checkpoint request failed on version 8.2.1.
Следующее
От: Tom Lane
Дата:
Сообщение: Re: [GENERAL] Checkpoint request failed on version 8.2.1.