Обсуждение: Request for review: tsearch2 patch
Hi,
Here are patches against tsearch2 with CVS head. Currently tsearch2
does not work with multibyte encoding which uses C locale. These
patches are intended to solve the problem by using PostgreSQL in-house
multibyte function instead of mbstowcs which does not work with C
locale. Also iswalpha etc. will not be called in case of C locale
since they are not working with it. Tested with the EUC_JP encoding
(should be working with any multibyte encodings). Existing single-byte
encodings should not be broken by the patches, though I did not test that.
--
Tatsuo Ishii
SRA OSS, Inc. Japan
Index: ts_locale.c
===================================================================
RCS file: /cvsroot/pgsql/contrib/tsearch2/ts_locale.c,v
retrieving revision 1.7
diff -c -r1.7 ts_locale.c
*** ts_locale.c 20 Nov 2006 14:03:30 -0000 1.7
--- ts_locale.c 1 Jan 2007 12:22:50 -0000
***************
*** 63,68 ****
--- 63,101 ---- return mbstowcs(to, from, len); }
+
+ #else /* WIN32 */
+
+ size_t
+ char2wchar(wchar_t *to, const char *from, size_t len)
+ {
+ wchar_t *result;
+ size_t n;
+
+ if (to == NULL)
+ return 0;
+
+ if (lc_ctype_is_c)
+ {
+ /* allocate neccesary memory for "to" including NULL terminate */
+ result = (wchar_t *)palloc((len+1)*sizeof(wchar_t));
+
+ /* do the conversion */
+ n = (size_t)pg_mb2wchar_with_len(from, (pg_wchar *)result, len);
+ if (n > 0)
+ {
+ /* store the result */
+ if (n > len)
+ n = len;
+ memcpy(to, result, n*sizeof(wchar_t));
+ pfree(result);
+ *(to + n) = '\0';
+ }
+ return n;
+ }
+ return mbstowcs(to, from, len);
+ }
+ #endif /* WIN32 */ int
***************
*** 70,75 ****
--- 103,113 ---- { wchar_t character;
+ if (lc_ctype_is_c)
+ {
+ return isalpha(TOUCHAR(ptr));
+ }
+ char2wchar(&character, ptr, 1); return iswalpha((wint_t) character);
***************
*** 80,85 ****
--- 118,128 ---- { wchar_t character;
+ if (lc_ctype_is_c)
+ {
+ return isprint(TOUCHAR(ptr));
+ }
+ char2wchar(&character, ptr, 1); return iswprint((wint_t) character);
***************
*** 126,132 **** if ( wlen < 0 ) ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! errmsg("transalation failed from server encoding to wchar_t"))); Assert(wlen<=len);
wstr[wlen] = 0;
--- 169,175 ---- if ( wlen < 0 ) ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! errmsg("translation failed from server encoding to wchar_t"))); Assert(wlen<=len);
wstr[wlen] = 0;
***************
*** 152,158 **** if ( wlen < 0 ) ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! errmsg("transalation failed from wchar_t to server encoding %d", errno)));
Assert(wlen<=len); out[wlen]='\0'; }
--- 195,201 ---- if ( wlen < 0 ) ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! errmsg("translation failed from wchar_t to server encoding %d", errno)));
Assert(wlen<=len); out[wlen]='\0'; }
Index: ts_locale.h
===================================================================
RCS file: /cvsroot/pgsql/contrib/tsearch2/ts_locale.h,v
retrieving revision 1.7
diff -c -r1.7 ts_locale.h
*** ts_locale.h 4 Oct 2006 00:29:47 -0000 1.7
--- ts_locale.h 1 Jan 2007 12:22:50 -0000
***************
*** 38,45 **** #else /* WIN32 */ /* correct mbstowcs */
- #define char2wchar mbstowcs #define wchar2char wcstombs #endif /* WIN32 */ #define t_isdigit(x) (
pg_mblen(x)==1&& isdigit( TOUCHAR(x) ) )
--- 38,46 ---- #else /* WIN32 */ /* correct mbstowcs */ #define wchar2char wcstombs
+ size_t char2wchar(wchar_t *to, const char *from, size_t len);
+ #endif /* WIN32 */ #define t_isdigit(x) ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
***************
*** 54,59 ****
--- 55,61 ---- * t_iseq() should be called only for ASCII symbols */ #define t_iseq(x,c) ( (pg_mblen(x)==1) ? (
TOUCHAR(x)== ((unsigned char)(c)) ) : false )
+ /*#define t_iseq(x,c) ( TOUCHAR(x) == ((unsigned char)(c)))*/ #define COPYCHAR(d,s) do { \ int
lll= pg_mblen( s ); \
Index: wordparser/parser.c
===================================================================
RCS file: /cvsroot/pgsql/contrib/tsearch2/wordparser/parser.c,v
retrieving revision 1.11
diff -c -r1.11 parser.c
*** wordparser/parser.c 4 Oct 2006 00:29:47 -0000 1.11
--- wordparser/parser.c 1 Jan 2007 12:22:51 -0000
***************
*** 44,52 **** * Some operating systems fail with multi-byte encodings and a C locale. * Also, for a C locale
thereis no need to process as multibyte. From * backend/utils/adt/oracle_compat.c Teodor */
! if (prs->charmaxlen > 1 && !lc_ctype_is_c()) { prs->usewide = true; prs->wstr = (wchar_t *)
palloc(sizeof(wchar_t)* prs->lenstr);
--- 44,54 ---- * Some operating systems fail with multi-byte encodings and a C locale. * Also, for a C locale
thereis no need to process as multibyte. From * backend/utils/adt/oracle_compat.c Teodor
+ *
+ * This is wrong assumption. even if locale is C, multibyte is necceary. */
! if (prs->charmaxlen > 1) { prs->usewide = true; prs->wstr = (wchar_t *)
palloc(sizeof(wchar_t)* prs->lenstr);
***************
*** 92,98 **** static int \ p_is##type(TParser *prs) {
\ Assert( prs->state ); \
! return ( ( prs->usewide ) ? isw##type( (wint_t)*( prs->wstr + prs->state->poschar ) ) : \ is##type(
(unsignedchar)*( prs->str + prs->state->posbyte ) ) ); \ } \
\
--- 94,102 ---- static int \ p_is##type(TParser *prs) {
\ Assert( prs->state ); \
! return ( ( prs->usewide ) ? \
! (lc_ctype_is_c? is##type( 0xff & *( prs->wstr + prs->state->poschar)): \
! isw##type( (wint_t)*( prs->wstr + prs->state->poschar))): \ is##type( (unsigned char)*(
prs->str+ prs->state->posbyte ) ) ); \ } \ \
***************
*** 134,141 **** } #endif /* TS_USE_WIDE */
! p_iswhat(alnum)
! p_iswhat(alpha) p_iswhat(digit) p_iswhat(lower) p_iswhat(print)
--- 138,197 ---- } #endif /* TS_USE_WIDE */
! static int p_isalnum(TParser *prs) {
! Assert( prs->state );
!
! if (prs->usewide)
! {
! unsigned int c;
!
! c = *(prs->wstr + prs->state->poschar);
!
! if (lc_ctype_is_c)
! {
! if (c > 0x7f)
! return 1;
! return isalnum(0xff & c);
! }
! else
! return iswalnum( (wint_t)*( prs->wstr + prs->state->poschar));
! }
! else
! return isalnum( (unsigned char)*( prs->str + prs->state->posbyte ));
! }
!
! static int p_isnotalnum(TParser *prs)
! {
! return !p_isalnum(prs);
! }
!
! static int p_isalpha(TParser *prs) {
! Assert( prs->state );
!
! if (prs->usewide)
! {
! unsigned int c;
!
! c = *(prs->wstr + prs->state->poschar);
!
! if (lc_ctype_is_c)
! {
! if (c > 0x7f)
! return 1;
! return isalpha(0xff & c);
! }
! else
! return iswalpha( (wint_t)*( prs->wstr + prs->state->poschar));
! }
! else
! return isalpha( (unsigned char)*( prs->str + prs->state->posbyte ));
! }
!
! static int p_isnotalpha(TParser *prs)
! {
! return !p_isalpha(prs);
! }
! p_iswhat(digit) p_iswhat(lower) p_iswhat(print)
I have tested with a locale-enabled environment and found a bug. Included
is the new version of the patches.
Teodor, Oleg, what do you think about these patches?
If ok, shall I commit to CVS head?
--
Tatsuo Ishii
SRA OSS, Inc. Japan
> Hi,
>
> Here are patches against tsearch2 with CVS head. Currently tsearch2
> does not work with multibyte encoding which uses C locale. These
> patches are intended to solve the problem by using PostgreSQL in-house
> multibyte function instead of mbstowcs which does not work with C
> locale. Also iswalpha etc. will not be called in case of C locale
> since they are not working with it. Tested with the EUC_JP encoding
> (should be working with any multibye encodings). Existing single byte
> encodings should not be broken by the patches, I did not test though.
> --
> Tatsuo Ishii
> SRA OSS, Inc. Japan
Index: ts_locale.c
===================================================================
RCS file: /cvsroot/pgsql/contrib/tsearch2/ts_locale.c,v
retrieving revision 1.7
diff -c -r1.7 ts_locale.c
*** ts_locale.c 20 Nov 2006 14:03:30 -0000 1.7
--- ts_locale.c 4 Jan 2007 12:16:00 -0000
***************
*** 63,68 ****
--- 63,101 ---- return mbstowcs(to, from, len); }
+
+ #else /* WIN32 */
+
+ size_t
+ char2wchar(wchar_t *to, const char *from, size_t len)
+ {
+ wchar_t *result;
+ size_t n;
+
+ if (to == NULL)
+ return 0;
+
+ if (lc_ctype_is_c())
+ {
+ /* allocate neccesary memory for "to" including NULL terminate */
+ result = (wchar_t *)palloc((len+1)*sizeof(wchar_t));
+
+ /* do the conversion */
+ n = (size_t)pg_mb2wchar_with_len(from, (pg_wchar *)result, len);
+ if (n > 0)
+ {
+ /* store the result */
+ if (n > len)
+ n = len;
+ memcpy(to, result, n*sizeof(wchar_t));
+ pfree(result);
+ *(to + n) = '\0';
+ }
+ return n;
+ }
+ return mbstowcs(to, from, len);
+ }
+ #endif /* WIN32 */ int
***************
*** 70,75 ****
--- 103,113 ---- { wchar_t character;
+ if (lc_ctype_is_c())
+ {
+ return isalpha(TOUCHAR(ptr));
+ }
+ char2wchar(&character, ptr, 1); return iswalpha((wint_t) character);
***************
*** 80,85 ****
--- 118,128 ---- { wchar_t character;
+ if (lc_ctype_is_c())
+ {
+ return isprint(TOUCHAR(ptr));
+ }
+ char2wchar(&character, ptr, 1); return iswprint((wint_t) character);
***************
*** 126,132 **** if ( wlen < 0 ) ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! errmsg("transalation failed from server encoding to wchar_t"))); Assert(wlen<=len);
wstr[wlen] = 0;
--- 169,175 ---- if ( wlen < 0 ) ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! errmsg("translation failed from server encoding to wchar_t"))); Assert(wlen<=len);
wstr[wlen] = 0;
***************
*** 152,158 **** if ( wlen < 0 ) ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! errmsg("transalation failed from wchar_t to server encoding %d", errno)));
Assert(wlen<=len); out[wlen]='\0'; }
--- 195,201 ---- if ( wlen < 0 ) ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! errmsg("translation failed from wchar_t to server encoding %d", errno)));
Assert(wlen<=len); out[wlen]='\0'; }
Index: ts_locale.h
===================================================================
RCS file: /cvsroot/pgsql/contrib/tsearch2/ts_locale.h,v
retrieving revision 1.7
diff -c -r1.7 ts_locale.h
*** ts_locale.h 4 Oct 2006 00:29:47 -0000 1.7
--- ts_locale.h 4 Jan 2007 12:16:00 -0000
***************
*** 38,45 **** #else /* WIN32 */ /* correct mbstowcs */
- #define char2wchar mbstowcs #define wchar2char wcstombs #endif /* WIN32 */ #define t_isdigit(x) (
pg_mblen(x)==1&& isdigit( TOUCHAR(x) ) )
--- 38,46 ---- #else /* WIN32 */ /* correct mbstowcs */ #define wchar2char wcstombs
+ size_t char2wchar(wchar_t *to, const char *from, size_t len);
+ #endif /* WIN32 */ #define t_isdigit(x) ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
***************
*** 54,59 ****
--- 55,61 ---- * t_iseq() should be called only for ASCII symbols */ #define t_iseq(x,c) ( (pg_mblen(x)==1) ? (
TOUCHAR(x)== ((unsigned char)(c)) ) : false )
+ /*#define t_iseq(x,c) ( TOUCHAR(x) == ((unsigned char)(c)))*/ #define COPYCHAR(d,s) do { \ int
lll= pg_mblen( s ); \
Index: wordparser/parser.c
===================================================================
RCS file: /cvsroot/pgsql/contrib/tsearch2/wordparser/parser.c,v
retrieving revision 1.11
diff -c -r1.11 parser.c
*** wordparser/parser.c 4 Oct 2006 00:29:47 -0000 1.11
--- wordparser/parser.c 4 Jan 2007 12:16:01 -0000
***************
*** 44,52 **** * Some operating systems fail with multi-byte encodings and a C locale. * Also, for a C locale
thereis no need to process as multibyte. From * backend/utils/adt/oracle_compat.c Teodor */
! if (prs->charmaxlen > 1 && !lc_ctype_is_c()) { prs->usewide = true; prs->wstr = (wchar_t *)
palloc(sizeof(wchar_t)* prs->lenstr);
--- 44,54 ---- * Some operating systems fail with multi-byte encodings and a C locale. * Also, for a C locale
thereis no need to process as multibyte. From * backend/utils/adt/oracle_compat.c Teodor
+ *
+ * This is wrong assumption. even if locale is C, multibyte is necceary. */
! if (prs->charmaxlen > 1) { prs->usewide = true; prs->wstr = (wchar_t *)
palloc(sizeof(wchar_t)* prs->lenstr);
***************
*** 92,98 **** static int \ p_is##type(TParser *prs) {
\ Assert( prs->state ); \
! return ( ( prs->usewide ) ? isw##type( (wint_t)*( prs->wstr + prs->state->poschar ) ) : \ is##type(
(unsignedchar)*( prs->str + prs->state->posbyte ) ) ); \ } \
\
--- 94,102 ---- static int \ p_is##type(TParser *prs) {
\ Assert( prs->state ); \
! return ( ( prs->usewide ) ? \
! (lc_ctype_is_c()? is##type( 0xff & *( prs->wstr + prs->state->poschar)): \
! isw##type( (wint_t)*( prs->wstr + prs->state->poschar))): \ is##type( (unsigned char)*(
prs->str+ prs->state->posbyte ) ) ); \ } \ \
***************
*** 134,141 **** } #endif /* TS_USE_WIDE */
! p_iswhat(alnum)
! p_iswhat(alpha) p_iswhat(digit) p_iswhat(lower) p_iswhat(print)
--- 138,197 ---- } #endif /* TS_USE_WIDE */
! static int p_isalnum(TParser *prs) {
! Assert( prs->state );
!
! if (prs->usewide)
! {
! unsigned int c;
!
! c = *(prs->wstr + prs->state->poschar);
!
! if (lc_ctype_is_c())
! {
! if (c > 0x7f)
! return 1;
! return isalnum(0xff & c);
! }
! else
! return iswalnum( (wint_t)*( prs->wstr + prs->state->poschar));
! }
! else
! return isalnum( (unsigned char)*( prs->str + prs->state->posbyte ));
! }
!
! static int p_isnotalnum(TParser *prs)
! {
! return !p_isalnum(prs);
! }
!
! static int p_isalpha(TParser *prs) {
! Assert( prs->state );
!
! if (prs->usewide)
! {
! unsigned int c;
!
! c = *(prs->wstr + prs->state->poschar);
!
! if (lc_ctype_is_c())
! {
! if (c > 0x7f)
! return 1;
! return isalpha(0xff & c);
! }
! else
! return iswalpha( (wint_t)*( prs->wstr + prs->state->poschar));
! }
! else
! return isalpha( (unsigned char)*( prs->str + prs->state->posbyte ));
! }
!
! static int p_isnotalpha(TParser *prs)
! {
! return !p_isalpha(prs);
! }
! p_iswhat(digit) p_iswhat(lower) p_iswhat(print)
Sorry for delay, I was on holidays :)
Did you test patch on Windows platform?
Tatsuo Ishii wrote:
> I have tested with local-enabled environment and found a bug. Included
> is the new version of patches.
>
> Teodor, Oleg, what do you think about these patches?
> If ok, shall I commit to CVS head?
> --
> Tatsuo Ishii
> SRA OSS, Inc. Japan
>
>> Hi,
>>
>> Here are patches against tsearch2 with CVS head. Currently tsearch2
>> does not work with multibyte encoding which uses C locale. These
>> patches are intended to solve the problem by using PostgreSQL in-house
>> multibyte function instead of mbstowcs which does not work with C
>> locale. Also iswalpha etc. will not be called in case of C locale
>> since they are not working with it. Tested with the EUC_JP encoding
>> (should be working with any multibye encodings). Existing single byte
>> encodings should not be broken by the patches, I did not test though.
>> --
>> Tatsuo Ishii
>> SRA OSS, Inc. Japan
>>
>> ------------------------------------------------------------------------
>>
>> Index: ts_locale.c
>> ===================================================================
>> RCS file: /cvsroot/pgsql/contrib/tsearch2/ts_locale.c,v
>> retrieving revision 1.7
>> diff -c -r1.7 ts_locale.c
>> *** ts_locale.c 20 Nov 2006 14:03:30 -0000 1.7
>> --- ts_locale.c 4 Jan 2007 12:16:00 -0000
>> ***************
>> *** 63,68 ****
>> --- 63,101 ----
>>
>> return mbstowcs(to, from, len);
>> }
>> +
>> + #else /* WIN32 */
>> +
>> + size_t
>> + char2wchar(wchar_t *to, const char *from, size_t len)
>> + {
>> + wchar_t *result;
>> + size_t n;
>> +
>> + if (to == NULL)
>> + return 0;
>> +
>> + if (lc_ctype_is_c())
>> + {
>> + /* allocate neccesary memory for "to" including NULL terminate */
>> + result = (wchar_t *)palloc((len+1)*sizeof(wchar_t));
>> +
>> + /* do the conversion */
>> + n = (size_t)pg_mb2wchar_with_len(from, (pg_wchar *)result, len);
>> + if (n > 0)
>> + {
>> + /* store the result */
>> + if (n > len)
>> + n = len;
>> + memcpy(to, result, n*sizeof(wchar_t));
>> + pfree(result);
>> + *(to + n) = '\0';
>> + }
>> + return n;
>> + }
>> + return mbstowcs(to, from, len);
>> + }
>> +
>> #endif /* WIN32 */
>>
>> int
>> ***************
>> *** 70,75 ****
>> --- 103,113 ----
>> {
>> wchar_t character;
>>
>> + if (lc_ctype_is_c())
>> + {
>> + return isalpha(TOUCHAR(ptr));
>> + }
>> +
>> char2wchar(&character, ptr, 1);
>>
>> return iswalpha((wint_t) character);
>> ***************
>> *** 80,85 ****
>> --- 118,128 ----
>> {
>> wchar_t character;
>>
>> + if (lc_ctype_is_c())
>> + {
>> + return isprint(TOUCHAR(ptr));
>> + }
>> +
>> char2wchar(&character, ptr, 1);
>>
>> return iswprint((wint_t) character);
>> ***************
>> *** 126,132 ****
>> if ( wlen < 0 )
>> ereport(ERROR,
>> (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
>> ! errmsg("transalation failed from server encoding to wchar_t")));
>>
>> Assert(wlen<=len);
>> wstr[wlen] = 0;
>> --- 169,175 ----
>> if ( wlen < 0 )
>> ereport(ERROR,
>> (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
>> ! errmsg("translation failed from server encoding to wchar_t")));
>>
>> Assert(wlen<=len);
>> wstr[wlen] = 0;
>> ***************
>> *** 152,158 ****
>> if ( wlen < 0 )
>> ereport(ERROR,
>> (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
>> ! errmsg("transalation failed from wchar_t to server encoding %d", errno)));
>> Assert(wlen<=len);
>> out[wlen]='\0';
>> }
>> --- 195,201 ----
>> if ( wlen < 0 )
>> ereport(ERROR,
>> (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
>> ! errmsg("translation failed from wchar_t to server encoding %d", errno)));
>> Assert(wlen<=len);
>> out[wlen]='\0';
>> }
>> Index: ts_locale.h
>> ===================================================================
>> RCS file: /cvsroot/pgsql/contrib/tsearch2/ts_locale.h,v
>> retrieving revision 1.7
>> diff -c -r1.7 ts_locale.h
>> *** ts_locale.h 4 Oct 2006 00:29:47 -0000 1.7
>> --- ts_locale.h 4 Jan 2007 12:16:00 -0000
>> ***************
>> *** 38,45 ****
>> #else /* WIN32 */
>>
>> /* correct mbstowcs */
>> - #define char2wchar mbstowcs
>> #define wchar2char wcstombs
>> #endif /* WIN32 */
>>
>> #define t_isdigit(x) ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
>> --- 38,46 ----
>> #else /* WIN32 */
>>
>> /* correct mbstowcs */
>> #define wchar2char wcstombs
>> + size_t char2wchar(wchar_t *to, const char *from, size_t len);
>> +
>> #endif /* WIN32 */
>>
>> #define t_isdigit(x) ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
>> ***************
>> *** 54,59 ****
>> --- 55,61 ----
>> * t_iseq() should be called only for ASCII symbols
>> */
>> #define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false )
>> + /*#define t_iseq(x,c) ( TOUCHAR(x) == ((unsigned char)(c)))*/
>>
>> #define COPYCHAR(d,s) do { \
>> int lll = pg_mblen( s ); \
>> Index: wordparser/parser.c
>> ===================================================================
>> RCS file: /cvsroot/pgsql/contrib/tsearch2/wordparser/parser.c,v
>> retrieving revision 1.11
>> diff -c -r1.11 parser.c
>> *** wordparser/parser.c 4 Oct 2006 00:29:47 -0000 1.11
>> --- wordparser/parser.c 4 Jan 2007 12:16:01 -0000
>> ***************
>> *** 44,52 ****
>> * Some operating systems fail with multi-byte encodings and a C locale.
>> * Also, for a C locale there is no need to process as multibyte. From
>> * backend/utils/adt/oracle_compat.c Teodor
>> */
>>
>> ! if (prs->charmaxlen > 1 && !lc_ctype_is_c())
>> {
>> prs->usewide = true;
>> prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr);
>> --- 44,54 ----
>> * Some operating systems fail with multi-byte encodings and a C locale.
>> * Also, for a C locale there is no need to process as multibyte. From
>> * backend/utils/adt/oracle_compat.c Teodor
>> + *
>> + * This is wrong assumption. even if locale is C, multibyte is necceary.
>> */
>>
>> ! if (prs->charmaxlen > 1)
>> {
>> prs->usewide = true;
>> prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr);
>> ***************
>> *** 92,98 ****
>> static int \
>> p_is##type(TParser *prs) { \
>> Assert( prs->state ); \
>> ! return ( ( prs->usewide ) ? isw##type( (wint_t)*( prs->wstr + prs->state->poschar ) ) : \
>> is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ) ); \
>> } \
>> \
>> --- 94,102 ----
>> static int \
>> p_is##type(TParser *prs) { \
>> Assert( prs->state ); \
>> ! return ( ( prs->usewide ) ? \
>> ! (lc_ctype_is_c()? is##type( 0xff & *( prs->wstr + prs->state->poschar)): \
>> ! isw##type( (wint_t)*( prs->wstr + prs->state->poschar))): \
>> is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ) ); \
>> } \
>> \
>> ***************
>> *** 134,141 ****
>> }
>> #endif /* TS_USE_WIDE */
>>
>> ! p_iswhat(alnum)
>> ! p_iswhat(alpha)
>> p_iswhat(digit)
>> p_iswhat(lower)
>> p_iswhat(print)
>> --- 138,197 ----
>> }
>> #endif /* TS_USE_WIDE */
>>
>> ! static int p_isalnum(TParser *prs) {
>> ! Assert( prs->state );
>> !
>> ! if (prs->usewide)
>> ! {
>> ! unsigned int c;
>> !
>> ! c = *(prs->wstr + prs->state->poschar);
>> !
>> ! if (lc_ctype_is_c())
>> ! {
>> ! if (c > 0x7f)
>> ! return 1;
>> ! return isalnum(0xff & c);
>> ! }
>> ! else
>> ! return iswalnum( (wint_t)*( prs->wstr + prs->state->poschar));
>> ! }
>> ! else
>> ! return isalnum( (unsigned char)*( prs->str + prs->state->posbyte ));
>> ! }
>> !
>> ! static int p_isnotalnum(TParser *prs)
>> ! {
>> ! return !p_isalnum(prs);
>> ! }
>> !
>> ! static int p_isalpha(TParser *prs) {
>> ! Assert( prs->state );
>> !
>> ! if (prs->usewide)
>> ! {
>> ! unsigned int c;
>> !
>> ! c = *(prs->wstr + prs->state->poschar);
>> !
>> ! if (lc_ctype_is_c())
>> ! {
>> ! if (c > 0x7f)
>> ! return 1;
>> ! return isalpha(0xff & c);
>> ! }
>> ! else
>> ! return iswalpha( (wint_t)*( prs->wstr + prs->state->poschar));
>> ! }
>> ! else
>> ! return isalpha( (unsigned char)*( prs->str + prs->state->posbyte ));
>> ! }
>> !
>> ! static int p_isnotalpha(TParser *prs)
>> ! {
>> ! return !p_isalpha(prs);
>> ! }
>> !
>> p_iswhat(digit)
>> p_iswhat(lower)
>> p_iswhat(print)
>>
>> ------------------------------------------------------------------------
>>
>>
>> ---------------------------(end of broadcast)---------------------------
>> TIP 9: In versions below 8.0, the planner will ignore your desire to
>> choose an index scan if your joining column's datatypes do not
>> match
--
Teodor Sigaev E-mail: teodor@sigaev.ru
WWW: http://www.sigaev.ru/
> Sorry for delay, I was on holidays :)
>
> Did you test patch on Windows platform?
No. I myself do not use the Windows platform.
Do you have any concerns about Windows regarding my patches?
--
Tatsuo Ishii
SRA OSS, Inc. Japan
> Tatsuo Ishii wrote:
> > I have tested with local-enabled environment and found a bug. Included
> > is the new version of patches.
> >
> > Teodor, Oleg, what do you think about these patches?
> > If ok, shall I commit to CVS head?
> > --
> > Tatsuo Ishii
> > SRA OSS, Inc. Japan
> >
> >> Hi,
> >>
> >> Here are patches against tsearch2 with CVS head. Currently tsearch2
> >> does not work with multibyte encoding which uses C locale. These
> >> patches are intended to solve the problem by using PostgreSQL in-house
> >> multibyte function instead of mbstowcs which does not work with C
> >> locale. Also iswalpha etc. will not be called in case of C locale
> >> since they are not working with it. Tested with the EUC_JP encoding
> >> (should be working with any multibye encodings). Existing single byte
> >> encodings should not be broken by the patches, I did not test though.
> >> --
> >> Tatsuo Ishii
> >> SRA OSS, Inc. Japan
> >>
> >> ------------------------------------------------------------------------
> >>
> >> Index: ts_locale.c
> >> ===================================================================
> >> RCS file: /cvsroot/pgsql/contrib/tsearch2/ts_locale.c,v
> >> retrieving revision 1.7
> >> diff -c -r1.7 ts_locale.c
> >> *** ts_locale.c 20 Nov 2006 14:03:30 -0000 1.7
> >> --- ts_locale.c 4 Jan 2007 12:16:00 -0000
> >> ***************
> >> *** 63,68 ****
> >> --- 63,101 ----
> >>
> >> return mbstowcs(to, from, len);
> >> }
> >> +
> >> + #else /* WIN32 */
> >> +
> >> + size_t
> >> + char2wchar(wchar_t *to, const char *from, size_t len)
> >> + {
> >> + wchar_t *result;
> >> + size_t n;
> >> +
> >> + if (to == NULL)
> >> + return 0;
> >> +
> >> + if (lc_ctype_is_c())
> >> + {
> >> + /* allocate neccesary memory for "to" including NULL terminate */
> >> + result = (wchar_t *)palloc((len+1)*sizeof(wchar_t));
> >> +
> >> + /* do the conversion */
> >> + n = (size_t)pg_mb2wchar_with_len(from, (pg_wchar *)result, len);
> >> + if (n > 0)
> >> + {
> >> + /* store the result */
> >> + if (n > len)
> >> + n = len;
> >> + memcpy(to, result, n*sizeof(wchar_t));
> >> + pfree(result);
> >> + *(to + n) = '\0';
> >> + }
> >> + return n;
> >> + }
> >> + return mbstowcs(to, from, len);
> >> + }
> >> +
> >> #endif /* WIN32 */
> >>
> >> int
> >> ***************
> >> *** 70,75 ****
> >> --- 103,113 ----
> >> {
> >> wchar_t character;
> >>
> >> + if (lc_ctype_is_c())
> >> + {
> >> + return isalpha(TOUCHAR(ptr));
> >> + }
> >> +
> >> char2wchar(&character, ptr, 1);
> >>
> >> return iswalpha((wint_t) character);
> >> ***************
> >> *** 80,85 ****
> >> --- 118,128 ----
> >> {
> >> wchar_t character;
> >>
> >> + if (lc_ctype_is_c())
> >> + {
> >> + return isprint(TOUCHAR(ptr));
> >> + }
> >> +
> >> char2wchar(&character, ptr, 1);
> >>
> >> return iswprint((wint_t) character);
> >> ***************
> >> *** 126,132 ****
> >> if ( wlen < 0 )
> >> ereport(ERROR,
> >> (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
> >> ! errmsg("transalation failed from server encoding to wchar_t")));
> >>
> >> Assert(wlen<=len);
> >> wstr[wlen] = 0;
> >> --- 169,175 ----
> >> if ( wlen < 0 )
> >> ereport(ERROR,
> >> (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
> >> ! errmsg("translation failed from server encoding to wchar_t")));
> >>
> >> Assert(wlen<=len);
> >> wstr[wlen] = 0;
> >> ***************
> >> *** 152,158 ****
> >> if ( wlen < 0 )
> >> ereport(ERROR,
> >> (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
> >> ! errmsg("transalation failed from wchar_t to server encoding %d", errno)));
> >> Assert(wlen<=len);
> >> out[wlen]='\0';
> >> }
> >> --- 195,201 ----
> >> if ( wlen < 0 )
> >> ereport(ERROR,
> >> (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
> >> ! errmsg("translation failed from wchar_t to server encoding %d", errno)));
> >> Assert(wlen<=len);
> >> out[wlen]='\0';
> >> }
> >> Index: ts_locale.h
> >> ===================================================================
> >> RCS file: /cvsroot/pgsql/contrib/tsearch2/ts_locale.h,v
> >> retrieving revision 1.7
> >> diff -c -r1.7 ts_locale.h
> >> *** ts_locale.h 4 Oct 2006 00:29:47 -0000 1.7
> >> --- ts_locale.h 4 Jan 2007 12:16:00 -0000
> >> ***************
> >> *** 38,45 ****
> >> #else /* WIN32 */
> >>
> >> /* correct mbstowcs */
> >> - #define char2wchar mbstowcs
> >> #define wchar2char wcstombs
> >> #endif /* WIN32 */
> >>
> >> #define t_isdigit(x) ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
> >> --- 38,46 ----
> >> #else /* WIN32 */
> >>
> >> /* correct mbstowcs */
> >> #define wchar2char wcstombs
> >> + size_t char2wchar(wchar_t *to, const char *from, size_t len);
> >> +
> >> #endif /* WIN32 */
> >>
> >> #define t_isdigit(x) ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
> >> ***************
> >> *** 54,59 ****
> >> --- 55,61 ----
> >> * t_iseq() should be called only for ASCII symbols
> >> */
> >> #define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false )
> >> + /*#define t_iseq(x,c) ( TOUCHAR(x) == ((unsigned char)(c)))*/
> >>
> >> #define COPYCHAR(d,s) do { \
> >> int lll = pg_mblen( s ); \
> >> Index: wordparser/parser.c
> >> ===================================================================
> >> RCS file: /cvsroot/pgsql/contrib/tsearch2/wordparser/parser.c,v
> >> retrieving revision 1.11
> >> diff -c -r1.11 parser.c
> >> *** wordparser/parser.c 4 Oct 2006 00:29:47 -0000 1.11
> >> --- wordparser/parser.c 4 Jan 2007 12:16:01 -0000
> >> ***************
> >> *** 44,52 ****
> >> * Some operating systems fail with multi-byte encodings and a C locale.
> >> * Also, for a C locale there is no need to process as multibyte. From
> >> * backend/utils/adt/oracle_compat.c Teodor
> >> */
> >>
> >> ! if (prs->charmaxlen > 1 && !lc_ctype_is_c())
> >> {
> >> prs->usewide = true;
> >> prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr);
> >> --- 44,54 ----
> >> * Some operating systems fail with multi-byte encodings and a C locale.
> >> * Also, for a C locale there is no need to process as multibyte. From
> >> * backend/utils/adt/oracle_compat.c Teodor
> >> + *
> >> + * This is wrong assumption. even if locale is C, multibyte is necceary.
> >> */
> >>
> >> ! if (prs->charmaxlen > 1)
> >> {
> >> prs->usewide = true;
> >> prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr);
> >> ***************
> >> *** 92,98 ****
> >> static int \
> >> p_is##type(TParser *prs) { \
> >> Assert( prs->state ); \
> >> ! return ( ( prs->usewide ) ? isw##type( (wint_t)*( prs->wstr + prs->state->poschar ) ) : \
> >> is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ) ); \
> >> } \
> >> \
> >> --- 94,102 ----
> >> static int \
> >> p_is##type(TParser *prs) { \
> >> Assert( prs->state ); \
> >> ! return ( ( prs->usewide ) ? \
> >> ! (lc_ctype_is_c()? is##type( 0xff & *( prs->wstr + prs->state->poschar)): \
> >> ! isw##type( (wint_t)*( prs->wstr + prs->state->poschar))): \
> >> is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ) ); \
> >> } \
> >> \
> >> ***************
> >> *** 134,141 ****
> >> }
> >> #endif /* TS_USE_WIDE */
> >>
> >> ! p_iswhat(alnum)
> >> ! p_iswhat(alpha)
> >> p_iswhat(digit)
> >> p_iswhat(lower)
> >> p_iswhat(print)
> >> --- 138,197 ----
> >> }
> >> #endif /* TS_USE_WIDE */
> >>
> >> ! static int p_isalnum(TParser *prs) {
> >> ! Assert( prs->state );
> >> !
> >> ! if (prs->usewide)
> >> ! {
> >> ! unsigned int c;
> >> !
> >> ! c = *(prs->wstr + prs->state->poschar);
> >> !
> >> ! if (lc_ctype_is_c())
> >> ! {
> >> ! if (c > 0x7f)
> >> ! return 1;
> >> ! return isalnum(0xff & c);
> >> ! }
> >> ! else
> >> ! return iswalnum( (wint_t)*( prs->wstr + prs->state->poschar));
> >> ! }
> >> ! else
> >> ! return isalnum( (unsigned char)*( prs->str + prs->state->posbyte ));
> >> ! }
> >> !
> >> ! static int p_isnotalnum(TParser *prs)
> >> ! {
> >> ! return !p_isalnum(prs);
> >> ! }
> >> !
> >> ! static int p_isalpha(TParser *prs) {
> >> ! Assert( prs->state );
> >> !
> >> ! if (prs->usewide)
> >> ! {
> >> ! unsigned int c;
> >> !
> >> ! c = *(prs->wstr + prs->state->poschar);
> >> !
> >> ! if (lc_ctype_is_c())
> >> ! {
> >> ! if (c > 0x7f)
> >> ! return 1;
> >> ! return isalpha(0xff & c);
> >> ! }
> >> ! else
> >> ! return iswalpha( (wint_t)*( prs->wstr + prs->state->poschar));
> >> ! }
> >> ! else
> >> ! return isalpha( (unsigned char)*( prs->str + prs->state->posbyte ));
> >> ! }
> >> !
> >> ! static int p_isnotalpha(TParser *prs)
> >> ! {
> >> ! return !p_isalpha(prs);
> >> ! }
> >> !
> >> p_iswhat(digit)
> >> p_iswhat(lower)
> >> p_iswhat(print)
> >>
> >> ------------------------------------------------------------------------
> >>
> >>
> >> ---------------------------(end of broadcast)---------------------------
> >> TIP 9: In versions below 8.0, the planner will ignore your desire to
> >> choose an index scan if your joining column's datatypes do not
> >> match
>
> --
> Teodor Sigaev E-mail: teodor@sigaev.ru
> WWW: http://www.sigaev.ru/
>
> I have tested with local-enabled environment and found a bug. Included
> is the new version of patches.
Your patch causes a crash in tsearch2's installcheck with 'initdb -E UTF8 --locale
C'. A simple way to reproduce:
# select to_tsquery('default', '''New York''');
server closed the connection unexpectedly
        This probably means the server terminated abnormally
        before or while processing the request.
The connection to the server was lost. Attempting reset: Failed.
>> ! static int p_isalnum(TParser *prs) {
...
>> ! if (lc_ctype_is_c())
>> ! {
>> ! if (c > 0x7f)
>> ! return 1;
I have some doubts that any character greater than 0x7f is an alpha symbol.
Is it a simple assumption or a workaround?
--
Teodor Sigaev E-mail: teodor@sigaev.ru
WWW: http://www.sigaev.ru/
From: Teodor Sigaev <teodor@sigaev.ru>
Subject: Re: [HACKERS] Request for review: tsearch2 patch
Date: Wed, 10 Jan 2007 18:50:44 +0300
Message-ID: <45A50B54.6090608@sigaev.ru>
> > I have tested with local-enabled environment and found a bug. Included
> > is the new version of patches.
> Your patch causes crash on tsearch2's installcheck with 'initdb -E UTF8 --locale
> C', simple way to reproduce:
> # select to_tsquery('default', '''New York''');
> server closed the connection unexpectedly
> This probably means the server terminated abnormally
> before or while processing the request.
> The connection to the server was lost. Attempting reset: Failed.
It seems it's a bug in the original tsearch2. Here is the patch.
------------------------------------------------------------------
*** wordparser/parser.c~ 2007-01-07 09:54:39.000000000 +0900
--- wordparser/parser.c 2007-01-11 10:33:41.000000000 +0900
***************
*** 51,57 **** if (prs->charmaxlen > 1) { prs->usewide = true;
! prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr); prs->lenwstr = char2wchar(prs->wstr,
prs->str,prs->lenstr); } else
--- 51,57 ---- if (prs->charmaxlen > 1) { prs->usewide = true;
! prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr+1)); prs->lenwstr =
char2wchar(prs->wstr,prs->str, prs->lenstr); } else
------------------------------------------------------------------
> >> ! static int p_isalnum(TParser *prs) {
> ...
> >> ! if (lc_ctype_is_c())
> >> ! {
> >> ! if (c > 0x7f)
> >> ! return 1;
>
> I have some some doubts that any character greater than 0x7f is an alpha symbol.
> Is it simple assumption or workaround?
Yeah, it's a workaround. Since there's no concept other than
alpha/numeric/latin in tsearch2, Asian characters have to fall into
one of them.
--
Tatsuo Ishii
SRA OSS, Inc. Japan
> > I have tested with local-enabled environment and found a bug. Included
> > is the new version of patches.
> Your patch causes crash on tsearch2's installcheck with 'initdb -E UTF8 --locale
> C', simple way to reproduce:
> # select to_tsquery('default', '''New York''');
> server closed the connection unexpectedly
> This probably means the server terminated abnormally
> before or while processing the request.
> The connection to the server was lost. Attempting reset: Failed.
It seems it's a bug in the original tsearch2. Here is the patch.
------------------------------------------------------------------
*** wordparser/parser.c~ 2007-01-07 09:54:39.000000000 +0900
--- wordparser/parser.c 2007-01-11 10:33:41.000000000 +0900
***************
*** 51,57 **** if (prs->charmaxlen > 1) { prs->usewide = true;
! prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr); prs->lenwstr = char2wchar(prs->wstr,
prs->str,prs->lenstr); } else
--- 51,57 ---- if (prs->charmaxlen > 1) { prs->usewide = true;
! prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr+1)); prs->lenwstr =
char2wchar(prs->wstr,prs->str, prs->lenstr); } else
------------------------------------------------------------------
> >> ! static int p_isalnum(TParser *prs) {
> ...
> >> ! if (lc_ctype_is_c())
> >> ! {
> >> ! if (c > 0x7f)
> >> ! return 1;
>
> I have some some doubts that any character greater than 0x7f is an alpha symbol.
> Is it simple assumption or workaround?
Yeah, it's a workaround. Since there's no concept other than
alpha/numeric/latin in tsearch2, Asian characters have to fall into
one of them.
--
Tatsuo Ishii
SRA OSS, Inc. Japan
> Yeah, it's a workaround. Since there's no concept other than
> alpha/numeric/latin in tsearch2, Asian characters have to be fall in
> one of them.
Ok, I see.
Please test the attached patch - if it is good then I'll commit it on Monday to the HEAD
and 8.2 branches.
PS. Magnus, may I ask you to test under Windows? Thank you.
--
Teodor Sigaev E-mail: teodor@sigaev.ru
WWW: http://www.sigaev.ru/
diff -c -r -N ../tsearch2.orig/ts_locale.c ./ts_locale.c
*** ../tsearch2.orig/ts_locale.c Fri Jan 12 10:53:11 2007
--- ./ts_locale.c Fri Jan 12 18:10:27 2007
***************
*** 12,24 ****
size_t
wchar2char(char *to, const wchar_t *from, size_t len)
{
if (GetDatabaseEncoding() == PG_UTF8)
{
int r;
- if (len == 0)
- return 0;
-
r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, len,
NULL, NULL);
--- 12,24 ----
size_t
wchar2char(char *to, const wchar_t *from, size_t len)
{
+ if (len == 0)
+ return 0;
+
if (GetDatabaseEncoding() == PG_UTF8)
{
int r;
r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, len,
NULL, NULL);
***************
*** 34,50 ****
return wcstombs(to, from, len);
}
size_t
char2wchar(wchar_t *to, const char *from, size_t len)
{
if (GetDatabaseEncoding() == PG_UTF8)
{
int r;
- if (len == 0)
- return 0;
-
r = MultiByteToWideChar(CP_UTF8, 0, from, len, to, len);
if (!r)
--- 34,52 ----
return wcstombs(to, from, len);
}
+ #endif /* WIN32 */
size_t
char2wchar(wchar_t *to, const char *from, size_t len)
{
+ if (len == 0)
+ return 0;
+
+ #ifdef WIN32
if (GetDatabaseEncoding() == PG_UTF8)
{
int r;
r = MultiByteToWideChar(CP_UTF8, 0, from, len, to, len);
if (!r)
***************
*** 60,88 ****
return r;
}
return mbstowcs(to, from, len);
}
- #endif /* WIN32 */
int
_t_isalpha(const char *ptr)
{
! wchar_t character;
! char2wchar(&character, ptr, 1);
! return iswalpha((wint_t) character);
}
int
_t_isprint(const char *ptr)
{
! wchar_t character;
! char2wchar(&character, ptr, 1);
! return iswprint((wint_t) character);
}
#endif /* TS_USE_WIDE */
--- 62,105 ----
return r;
}
+ else
+ #endif /* WIN32 */
+ if ( lc_ctype_is_c() )
+ {
+ /*
+ * pg_mb2wchar_with_len always adds trailing '\0', so
+ * 'to' should be allocated with sufficient space
+ */
+ return pg_mb2wchar_with_len(from, (pg_wchar *)to, len);
+ }
return mbstowcs(to, from, len);
}
int
_t_isalpha(const char *ptr)
{
! wchar_t character[2];
!
! if (lc_ctype_is_c())
! return isalpha(TOUCHAR(ptr));
! char2wchar(character, ptr, 1);
! return iswalpha((wint_t) *character);
}
int
_t_isprint(const char *ptr)
{
! wchar_t character[2];
!
! if (lc_ctype_is_c())
! return isprint(TOUCHAR(ptr));
! char2wchar(character, ptr, 1);
! return iswprint((wint_t) *character);
}
#endif /* TS_USE_WIDE */
***************
*** 126,132 ****
if ( wlen < 0 )
ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! errmsg("transalation failed from server encoding to wchar_t")));
Assert(wlen<=len);
wstr[wlen] = 0;
--- 143,149 ----
if ( wlen < 0 )
ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! errmsg("translation failed from server encoding to wchar_t")));
Assert(wlen<=len);
wstr[wlen] = 0;
***************
*** 152,158 ****
if ( wlen < 0 )
ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! errmsg("transalation failed from wchar_t to server encoding %d", errno)));
Assert(wlen<=len);
out[wlen]='\0';
}
--- 169,175 ----
if ( wlen < 0 )
ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! errmsg("translation failed from wchar_t to server encoding %d", errno)));
Assert(wlen<=len);
out[wlen]='\0';
}
diff -c -r -N ../tsearch2.orig/ts_locale.h ./ts_locale.h
*** ../tsearch2.orig/ts_locale.h Fri Jan 12 10:53:11 2007
--- ./ts_locale.h Fri Jan 12 18:10:19 2007
***************
*** 30,45 ****
#define TOUCHAR(x) (*((unsigned char*)(x)))
#ifdef TS_USE_WIDE
#ifdef WIN32
size_t wchar2char(char *to, const wchar_t *from, size_t len);
! size_t char2wchar(wchar_t *to, const char *from, size_t len);
#else /* WIN32 */
! /* correct mbstowcs */
! #define char2wchar mbstowcs
#define wchar2char wcstombs
#endif /* WIN32 */
#define t_isdigit(x) ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
--- 30,46 ----
#define TOUCHAR(x) (*((unsigned char*)(x)))
#ifdef TS_USE_WIDE
+ size_t char2wchar(wchar_t *to, const char *from, size_t len);
#ifdef WIN32
size_t wchar2char(char *to, const wchar_t *from, size_t len);
!
#else /* WIN32 */
! /* correct wcstombs */
#define wchar2char wcstombs
+
#endif /* WIN32 */
#define t_isdigit(x) ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
***************
*** 55,64 ****
*/
#define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false )
! #define COPYCHAR(d,s) do { \
! int lll = pg_mblen( s ); \
! \
! while( lll-- ) \
TOUCHAR((d)+lll) = TOUCHAR((s)+lll); \
} while(0)
--- 56,65 ----
*/
#define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false )
! #define COPYCHAR(d,s) do { \
! int lll = pg_mblen( s ); \
! \
! while( lll-- ) \
TOUCHAR((d)+lll) = TOUCHAR((s)+lll); \
} while(0)
diff -c -r -N ../tsearch2.orig/tsearch2.patch ./tsearch2.patch
*** ../tsearch2.orig/tsearch2.patch Thu Jan 1 03:00:00 1970
--- ./tsearch2.patch Fri Jan 12 18:12:30 2007
***************
*** 0 ****
--- 1,243 ----
+ diff -c -r -N ../tsearch2.orig/ts_locale.c ./ts_locale.c
+ *** ../tsearch2.orig/ts_locale.c Fri Jan 12 10:53:11 2007
+ --- ./ts_locale.c Fri Jan 12 18:10:27 2007
+ ***************
+ *** 12,24 ****
+ size_t
+ wchar2char(char *to, const wchar_t *from, size_t len)
+ {
+ if (GetDatabaseEncoding() == PG_UTF8)
+ {
+ int r;
+
+ - if (len == 0)
+ - return 0;
+ -
+ r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, len,
+ NULL, NULL);
+
+ --- 12,24 ----
+ size_t
+ wchar2char(char *to, const wchar_t *from, size_t len)
+ {
+ + if (len == 0)
+ + return 0;
+ +
+ if (GetDatabaseEncoding() == PG_UTF8)
+ {
+ int r;
+
+ r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, len,
+ NULL, NULL);
+
+ ***************
+ *** 34,50 ****
+
+ return wcstombs(to, from, len);
+ }
+
+ size_t
+ char2wchar(wchar_t *to, const char *from, size_t len)
+ {
+ if (GetDatabaseEncoding() == PG_UTF8)
+ {
+ int r;
+
+ - if (len == 0)
+ - return 0;
+ -
+ r = MultiByteToWideChar(CP_UTF8, 0, from, len, to, len);
+
+ if (!r)
+ --- 34,52 ----
+
+ return wcstombs(to, from, len);
+ }
+ + #endif /* WIN32 */
+
+ size_t
+ char2wchar(wchar_t *to, const char *from, size_t len)
+ {
+ + if (len == 0)
+ + return 0;
+ +
+ + #ifdef WIN32
+ if (GetDatabaseEncoding() == PG_UTF8)
+ {
+ int r;
+
+ r = MultiByteToWideChar(CP_UTF8, 0, from, len, to, len);
+
+ if (!r)
+ ***************
+ *** 60,88 ****
+
+ return r;
+ }
+
+ return mbstowcs(to, from, len);
+ }
+ - #endif /* WIN32 */
+
+ int
+ _t_isalpha(const char *ptr)
+ {
+ ! wchar_t character;
+
+ ! char2wchar(&character, ptr, 1);
+
+ ! return iswalpha((wint_t) character);
+ }
+
+ int
+ _t_isprint(const char *ptr)
+ {
+ ! wchar_t character;
+
+ ! char2wchar(&character, ptr, 1);
+
+ ! return iswprint((wint_t) character);
+ }
+ #endif /* TS_USE_WIDE */
+
+ --- 62,105 ----
+
+ return r;
+ }
+ + else
+ + #endif /* WIN32 */
+ + if ( lc_ctype_is_c() )
+ + {
+ + /*
+ + * pg_mb2wchar_with_len always adds trailing '\0', so
+ + * 'to' should be allocated with sufficient space
+ + */
+ + return pg_mb2wchar_with_len(from, (pg_wchar *)to, len);
+ + }
+
+ return mbstowcs(to, from, len);
+ }
+
+ int
+ _t_isalpha(const char *ptr)
+ {
+ ! wchar_t character[2];
+ !
+ ! if (lc_ctype_is_c())
+ ! return isalpha(TOUCHAR(ptr));
+
+ ! char2wchar(character, ptr, 1);
+
+ ! return iswalpha((wint_t) *character);
+ }
+
+ int
+ _t_isprint(const char *ptr)
+ {
+ ! wchar_t character[2];
+ !
+ ! if (lc_ctype_is_c())
+ ! return isprint(TOUCHAR(ptr));
+
+ ! char2wchar(character, ptr, 1);
+
+ ! return iswprint((wint_t) *character);
+ }
+ #endif /* TS_USE_WIDE */
+
+ ***************
+ *** 126,132 ****
+ if ( wlen < 0 )
+ ereport(ERROR,
+ (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+ ! errmsg("transalation failed from server encoding to wchar_t")));
+
+ Assert(wlen<=len);
+ wstr[wlen] = 0;
+ --- 143,149 ----
+ if ( wlen < 0 )
+ ereport(ERROR,
+ (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+ ! errmsg("translation failed from server encoding to wchar_t")));
+
+ Assert(wlen<=len);
+ wstr[wlen] = 0;
+ ***************
+ *** 152,158 ****
+ if ( wlen < 0 )
+ ereport(ERROR,
+ (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+ ! errmsg("transalation failed from wchar_t to server encoding %d", errno)));
+ Assert(wlen<=len);
+ out[wlen]='\0';
+ }
+ --- 169,175 ----
+ if ( wlen < 0 )
+ ereport(ERROR,
+ (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+ ! errmsg("translation failed from wchar_t to server encoding %d", errno)));
+ Assert(wlen<=len);
+ out[wlen]='\0';
+ }
+ diff -c -r -N ../tsearch2.orig/ts_locale.h ./ts_locale.h
+ *** ../tsearch2.orig/ts_locale.h Fri Jan 12 10:53:11 2007
+ --- ./ts_locale.h Fri Jan 12 18:10:19 2007
+ ***************
+ *** 30,45 ****
+ #define TOUCHAR(x) (*((unsigned char*)(x)))
+
+ #ifdef TS_USE_WIDE
+
+ #ifdef WIN32
+
+ size_t wchar2char(char *to, const wchar_t *from, size_t len);
+ ! size_t char2wchar(wchar_t *to, const char *from, size_t len);
+ #else /* WIN32 */
+
+ ! /* correct mbstowcs */
+ ! #define char2wchar mbstowcs
+ #define wchar2char wcstombs
+ #endif /* WIN32 */
+
+ #define t_isdigit(x) ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
+ --- 30,46 ----
+ #define TOUCHAR(x) (*((unsigned char*)(x)))
+
+ #ifdef TS_USE_WIDE
+ + size_t char2wchar(wchar_t *to, const char *from, size_t len);
+
+ #ifdef WIN32
+
+ size_t wchar2char(char *to, const wchar_t *from, size_t len);
+ !
+ #else /* WIN32 */
+
+ ! /* correct wcstombs */
+ #define wchar2char wcstombs
+ +
+ #endif /* WIN32 */
+
+ #define t_isdigit(x) ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
+ ***************
+ *** 55,64 ****
+ */
+ #define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false )
+
+ ! #define COPYCHAR(d,s) do { \
+ ! int lll = pg_mblen( s ); \
+ ! \
+ ! while( lll-- ) \
+ TOUCHAR((d)+lll) = TOUCHAR((s)+lll); \
+ } while(0)
+
+ --- 56,65 ----
+ */
+ #define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false )
+
+ ! #define COPYCHAR(d,s) do { \
+ ! int lll = pg_mblen( s ); \
+ ! \
+ ! while( lll-- ) \
+ TOUCHAR((d)+lll) = TOUCHAR((s)+lll); \
+ } while(0)
+
diff -c -r -N ../tsearch2.orig/wordparser/parser.c ./wordparser/parser.c
*** ../tsearch2.orig/wordparser/parser.c Fri Jan 12 10:53:11 2007
--- ./wordparser/parser.c Fri Jan 12 18:10:38 2007
***************
*** 40,55 ****
#ifdef TS_USE_WIDE
/*
! * Use wide char code only when max encoding length > 1 and ctype != C.
! * Some operating systems fail with multi-byte encodings and a C locale.
! * Also, for a C locale there is no need to process as multibyte. From
! * backend/utils/adt/oracle_compat.c Teodor
*/
! if (prs->charmaxlen > 1 && !lc_ctype_is_c())
{
prs->usewide = true;
! prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr);
prs->lenwstr = char2wchar(prs->wstr, prs->str, prs->lenstr);
}
else
--- 40,52 ----
#ifdef TS_USE_WIDE
/*
! * Use wide char code only when max encoding length > 1.
*/
! if (prs->charmaxlen > 1)
{
prs->usewide = true;
! prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr+1));
prs->lenwstr = char2wchar(prs->wstr, prs->str, prs->lenstr);
}
else
***************
*** 83,107 ****
/*
* defining support function, equvalent is* macroses, but
! * working with any possible encodings and locales
*/
#ifdef TS_USE_WIDE
! #define p_iswhat(type) \
! static int \
! p_is##type(TParser *prs) { \
! Assert( prs->state ); \
! return ( ( prs->usewide ) ? isw##type( (wint_t)*( prs->wstr + prs->state->poschar ) ) : \
! is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ) ); \
! } \
! \
! static int \
! p_isnot##type(TParser *prs) { \
! return !p_is##type(prs); \
}
/* p_iseq should be used only for ascii symbols */
--- 80,178 ----
/*
* defining support function, equvalent is* macroses, but
! * working with any possible encodings and locales. Note,
! * that with multibyte encoding and C-locale isw* function may fail
! * or give wrong result. Note 2: multibyte encoding and C-locale
! * often are used for Asian languages.
*/
#ifdef TS_USE_WIDE
! #define p_iswhat(type) \
! static int \
! p_is##type(TParser *prs) { \
! Assert( prs->state ); \
! if ( prs->usewide ) \
! { \
! if ( lc_ctype_is_c() ) \
! return is##type( 0xff & *( prs->wstr + prs->state->poschar) ); \
! \
! return isw##type( *(wint_t*)( prs->wstr + prs->state->poschar ) ); \
! } \
! \
! return is##type( *(unsigned char*)( prs->str + prs->state->posbyte ) ); \
! } \
! \
! static int \
! p_isnot##type(TParser *prs) { \
! return !p_is##type(prs); \
}
+ static int
+ p_isalnum(TParser *prs)
+ {
+ Assert( prs->state );
+
+ if (prs->usewide)
+ {
+ if (lc_ctype_is_c())
+ {
+ unsigned int c = *(unsigned int*)(prs->wstr + prs->state->poschar);
+
+ /*
+ * any non-ascii symbol with multibyte encoding
+ * with C-locale is an alpha character
+ */
+ if ( c > 0x7f )
+ return 1;
+
+ return isalnum(0xff & c);
+ }
+
+ return iswalnum( (wint_t)*( prs->wstr + prs->state->poschar));
+ }
+
+ return isalnum( *(unsigned char*)( prs->str + prs->state->posbyte ));
+ }
+ static int
+ p_isnotalnum(TParser *prs)
+ {
+ return !p_isalnum(prs);
+ }
+
+ static int
+ p_isalpha(TParser *prs)
+ {
+ Assert( prs->state );
+
+ if (prs->usewide)
+ {
+ if (lc_ctype_is_c())
+ {
+ unsigned int c = *(prs->wstr + prs->state->poschar);
+
+ /*
+ * any non-ascii symbol with multibyte encoding
+ * with C-locale is an alpha character
+ */
+ if ( c > 0x7f )
+ return 1;
+
+ return isalpha(0xff & c);
+ }
+
+ return iswalpha( (wint_t)*( prs->wstr + prs->state->poschar));
+ }
+
+ return isalpha( *(unsigned char*)( prs->str + prs->state->posbyte ));
+ }
+
+ static int
+ p_isnotalpha(TParser *prs)
+ {
+ return !p_isalpha(prs);
+ }
/* p_iseq should be used only for ascii symbols */
***************
*** 111,128 ****
Assert(prs->state);
return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
}
#else /* TS_USE_WIDE */
! #define p_iswhat(type) \
! static int \
! p_is##type(TParser *prs) { \
! Assert( prs->state ); \
! return is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ); \
! } \
! \
! static int \
! p_isnot##type(TParser *prs) { \
! return !p_is##type(prs); \
}
--- 182,200 ----
Assert(prs->state);
return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
}
+
#else /* TS_USE_WIDE */
! #define p_iswhat(type) \
! static int \
! p_is##type(TParser *prs) { \
! Assert( prs->state ); \
! return is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ); \
! } \
! \
! static int \
! p_isnot##type(TParser *prs) { \
! return !p_is##type(prs); \
}
***************
*** 132,141 ****
Assert(prs->state);
return (*(prs->str + prs->state->posbyte) == c) ? 1 : 0;
}
- #endif /* TS_USE_WIDE */
p_iswhat(alnum)
p_iswhat(alpha)
p_iswhat(digit)
p_iswhat(lower)
p_iswhat(print)
--- 204,215 ----
Assert(prs->state);
return (*(prs->str + prs->state->posbyte) == c) ? 1 : 0;
}
p_iswhat(alnum)
p_iswhat(alpha)
+
+ #endif /* TS_USE_WIDE */
+
p_iswhat(digit)
p_iswhat(lower)
p_iswhat(print)
> > Yeah, it's a workaround. Since there's no concept other than
> > alpha/numeric/latin in tsearch2, Asian characters have to be fall in
> > one of them.
>
> Ok, I see.
>
> Pls, test attached patch - if it is good then I'll commit it at Monday to HEAD
> and 8.2 branches.
I have tested on a Linux box running PostgreSQL 8.2.1 (C locale,
EUC_JP encoding), and it worked great! BTW, is your patch supposed to
work with PostgreSQL 8.1?
--
Tatsuo Ishii
SRA OSS, Inc. Japan
> PS. Magnus, may I ask you to test under Windows? Thank you.
>
> --
> Teodor Sigaev E-mail: teodor@sigaev.ru
> WWW: http://www.sigaev.ru/