Обсуждение: BUG #8970: ts_parse incorrectly split numbers in digit token

Поиск
Список
Период
Сортировка

BUG #8970: ts_parse incorrectly split numbers in digit token

От
marco.atzeri@gmail.com
Дата:
The following bug has been logged on the website:

Bug reference:      8970
Logged by:          Marco Atzeri
Email address:      marco.atzeri@gmail.com
PostgreSQL version: 9.3.2
Operating system:   Cygwin
Description:

Extract from failure comparison.

=====================================
---
/pub/devel/postgresql/postgresql-9.3.2-1/src/postgresql-9.3.2/src/test/regress/expected/tsearch.out
2013-12-02 21:57:48.000000000 +0100
+++
/pub/devel/postgresql/postgresql-9.3.2-1/build/src/test/regress/results/tsearch.out
2014-01-24 19:36:27.625466800 +0100
@@ -267,10 +267,11 @@
 SELECT * FROM ts_parse('default', '345 qwe@efd.r '' http://www.com/
http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr
http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr
http://5aew.werc.ewr:8100/?  ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw
7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455
5.005 teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1>
ewri2 <a href="qwe<qwe>">
 /usr/local/fff /awdf/dwqe/4325 rewt/ewr wefjn /wqe-324/ewr gist.h gist.h.c
gist.c. readline 4.2 4.2. 4.2, readline-4.2 readline-4.2. 234
 <i <b> wow  < jqw <> qwerty');
- tokid |                token
--------+--------------------------------------
-    22 | 345
-    12 |
+ tokid |          token
+-------+-------------------------
+    12 | 3
+    12 | 4
+    12 | 5
      1 | qwe
     12 | @
     19 | efd.r
@@ -283,53 +284,97 @@
      6 | aew.werc.ewr
     18 | /?ad=qwe&dw
     12 |
-     5 | 1aew.werc.ewr/?ad=qwe&dw
-     6 | 1aew.werc.ewr
+    12 | 1
+     5 | aew.werc.ewr/?ad=qwe&dw
+     6 | aew.werc.ewr
     18 | /?ad=qwe&dw
=======================================

current build dependencies
  bash-4.1.10-4
  crypt-1.2-1
  cygwin-1.7.27-2
  gettext-0.18.1.1-2
  libgcc1-4.8.2-2
  libintl8-0.18.1.1-2
  libopenldap2_4_2-2.4.35-2
  libopenssl100-1.0.1f-1
  libreadline7-6.1.2-3
  openldap-devel-2.4.35-2
  openssl-devel-1.0.1f-1
  perl-5.14.2-3
  python-2.7.3-1
  zlib0-1.2.8-1

####################################
extracts from config.log

/pub/devel/postgresql/postgresql-9.3.2-2/src/postgresql-9.3.2/configure
--srcdir=/pub/devel/postgresql/postgresql-9.3.2-2/src/postgresql-9.3.2
--prefix=/usr --exec-prefix=/usr --bindir=/usr/bin --sbindir=/usr/sbin
--libexecdir=/usr/libexec --datadir=/usr/share --localstatedir=/var
--sysconfdir=/etc --libdir=/usr/lib --datarootdir=/usr/share
--docdir=/usr/share/doc/postgresql --htmldir=/usr/share/doc/postgresql/html
-C LDFLAGS=-Wl,-no-undefined --enable-nls --with-openssl --with-perl
--with-python --with-ldap

[cut]

## ----------- ##
## confdefs.h. ##
## ----------- ##

#define PACKAGE_NAME "PostgreSQL"
#define PACKAGE_TARNAME "postgresql"
#define PACKAGE_VERSION "9.3.2"
#define PACKAGE_STRING "PostgreSQL 9.3.2"
#define PACKAGE_BUGREPORT "pgsql-bugs@postgresql.org"
#define PG_VERSION "9.3.2"
#define PG_MAJORVERSION "9.3"
#define USE_INTEGER_DATETIMES 1
#define ENABLE_NLS 1
#define DEF_PGPORT 5432
#define DEF_PGPORT_STR "5432"
#define BLCKSZ 8192
#define RELSEG_SIZE 131072
#define XLOG_BLCKSZ 8192
#define XLOG_SEG_SIZE (16 * 1024 * 1024)
#define ENABLE_THREAD_SAFETY 1
#define PG_KRB_SRVNAM "postgres"
#define USE_LDAP 1
#define USE_SSL 1
#define HAVE_LIBM 1
#define HAVE_LIBREADLINE 1
#define HAVE_LIBZ 1
#define HAVE_SPINLOCKS 1
#define HAVE_LIBCRYPTO 1
#define HAVE_LIBSSL 1
#define STDC_HEADERS 1
#define HAVE_SYS_TYPES_H 1
#define HAVE_SYS_STAT_H 1
#define HAVE_STDLIB_H 1
#define HAVE_STRING_H 1
#define HAVE_MEMORY_H 1
#define HAVE_STRINGS_H 1
#define HAVE_INTTYPES_H 1
#define HAVE_STDINT_H 1
#define HAVE_UNISTD_H 1
#define HAVE_CRYPT_H 1
#define HAVE_GETOPT_H 1
#define HAVE_IEEEFP_H 1
#define HAVE_IFADDRS_H 1
#define HAVE_LANGINFO_H 1
#define HAVE_POLL_H 1
#define HAVE_PWD_H 1
#define HAVE_SYS_IOCTL_H 1
#define HAVE_SYS_IPC_H 1
#define HAVE_SYS_POLL_H 1
#define HAVE_SYS_RESOURCE_H 1
#define HAVE_SYS_SELECT_H 1
#define HAVE_SYS_SEM_H 1
#define HAVE_SYS_SHM_H 1
#define HAVE_SYS_SOCKET_H 1
#define HAVE_SYS_TIME_H 1
#define HAVE_SYS_UN_H 1
#define HAVE_TERMIOS_H 1
#define HAVE_UTIME_H 1
#define HAVE_WCHAR_H 1
#define HAVE_WCTYPE_H 1
#define HAVE_NET_IF_H 1
#define HAVE_NETINET_IN_H 1
#define HAVE_NETINET_TCP_H 1
#define HAVE_READLINE_READLINE_H 1
#define HAVE_READLINE_HISTORY_H 1
#define HAVE_LDAP_H 1
#define PG_USE_INLINE 1
#define HAVE_STRINGIZE 1
#define FLEXIBLE_ARRAY_MEMBER /**/
#define HAVE_FUNCNAME__FUNC 1
#define HAVE__STATIC_ASSERT 1
#define HAVE__BUILTIN_TYPES_COMPATIBLE_P 1
#define HAVE__BUILTIN_CONSTANT_P 1
#define HAVE__BUILTIN_UNREACHABLE 1
#define HAVE__VA_ARGS 1
#define HAVE_TZNAME 1
#define HAVE_UNIX_SOCKETS 1
#define HAVE_STRUCT_SOCKADDR_STORAGE 1
#define HAVE_STRUCT_SOCKADDR_STORAGE_SS_FAMILY 1
#define HAVE_STRUCT_ADDRINFO 1
#define HAVE_INTPTR_T 1
#define HAVE_UINTPTR_T 1
#define HAVE_LONG_LONG_INT 1
#define HAVE_STRUCT_OPTION 1
#define SIZEOF_OFF_T 8
#define HAVE_INT_TIMEZONE /**/
#define ACCEPT_TYPE_RETURN int
#define ACCEPT_TYPE_ARG1 int
#define ACCEPT_TYPE_ARG2 struct sockaddr *
#define ACCEPT_TYPE_ARG3 int
#define HAVE_CBRT 1
#define HAVE_DLOPEN 1
#define HAVE_FDATASYNC 1
#define HAVE_GETIFADDRS 1
#define HAVE_GETRLIMIT 1
#define HAVE_MEMMOVE 1
#define HAVE_POLL 1
#define HAVE_READLINK 1
#define HAVE_SETSID 1
#define HAVE_SIGPROCMASK 1
#define HAVE_SYMLINK 1
#define HAVE_TOWLOWER 1
#define HAVE_UTIME 1
#define HAVE_UTIMES 1
#define HAVE_WCSTOMBS 1
#define HAVE_FSEEKO 1
#define HAVE_FSEEKO 1
#define HAVE_POSIX_FADVISE 1
#define HAVE_DECL_POSIX_FADVISE 1
#define HAVE_DECL_FDATASYNC 1
#define HAVE_DECL_STRLCAT 1
#define HAVE_DECL_STRLCPY 1
#define HAVE_DECL_F_FULLFSYNC 0
#define HAVE_IPV6 1
#define HAVE_SNPRINTF 1
#define HAVE_VSNPRINTF 1
#define HAVE_DECL_SNPRINTF 1
#define HAVE_DECL_VSNPRINTF 1
#define HAVE_ISINF 1
#define HAVE_CRYPT 1
#define HAVE_GETOPT 1
#define HAVE_GETRUSAGE 1
#define HAVE_INET_ATON 1
#define HAVE_RANDOM 1
#define HAVE_RINT 1
#define HAVE_SRANDOM 1
#define HAVE_STRERROR 1
#define HAVE_STRLCAT 1
#define HAVE_STRLCPY 1
#define HAVE_UNSETENV 1
#define HAVE_GETPEEREID 1
#define HAVE_GETADDRINFO 1
#define HAVE_GETOPT_LONG 1
#define HAVE_SIGSETJMP 1
#define HAVE_DECL_SYS_SIGLIST 1
#define HAVE_SYSLOG 1
#define HAVE_INT_OPTERR 1
#define HAVE_INT_OPTRESET 1
#define HAVE_STRTOLL 1
#define HAVE_STRTOULL 1
#define HAVE_GCC_INT_ATOMICS 1
#define HAVE_RL_COMPLETION_APPEND_CHARACTER 1
#define HAVE_RL_COMPLETION_MATCHES 1
#define HAVE_RL_FILENAME_COMPLETION_FUNCTION 1
#define HAVE_APPEND_HISTORY 1
#define HAVE_HISTORY_TRUNCATE_FILE 1
#define HAVE_STRERROR_R 1
#define HAVE_GETPWUID_R 1
#define GETPWUID_R_5ARG /**/
#define STRERROR_R_INT /**/
#define HAVE_LIBLDAP 1
#define HAVE_LIBLDAP_R 1
#define HAVE_LONG_LONG_INT_64 1
#define PG_INT64_TYPE long long int
#define HAVE_LL_CONSTANTS 1
#define INT64_FORMAT "%lld"
#define UINT64_FORMAT "%llu"
#define SIZEOF_VOID_P 4
#define SIZEOF_SIZE_T 4
#define SIZEOF_LONG 4
#define USE_FLOAT4_BYVAL 1
#define FLOAT4PASSBYVAL true
#define FLOAT8PASSBYVAL false
#define ALIGNOF_SHORT 2
#define ALIGNOF_INT 4
#define ALIGNOF_LONG 4
#define ALIGNOF_LONG_LONG_INT 8
#define ALIGNOF_DOUBLE 8
#define MAXIMUM_ALIGNOF 8
#define HAVE_SIG_ATOMIC_T 1
#define HAVE_POSIX_SIGNALS /**/
#define USE_SYSV_SEMAPHORES 1
#define USE_SYSV_SHARED_MEMORY 1
#define MEMSET_LOOP_LIMIT 1024
#define PG_VERSION_STR "PostgreSQL 9.3.2 on i686-pc-cygwin, compiled by gcc
(GCC) 4.8.2, 32-bit"
#define PG_VERSION_NUM 90302

Re: BUG #8970: ts_parse incorrectly split numbers in digit token

От
Bruce Momjian
Дата:
On Sat, Jan 25, 2014 at 06:50:24AM +0000, marco.atzeri@gmail.com wrote:
> The following bug has been logged on the website:
>
> Bug reference:      8970
> Logged by:          Marco Atzeri
> Email address:      marco.atzeri@gmail.com
> PostgreSQL version: 9.3.2
> Operating system:   Cygwin
> Description:
>
> Extract from failure comparison.
>
> =====================================
> ---
> /pub/devel/postgresql/postgresql-9.3.2-1/src/postgresql-9.3.2/src/test/regress/expected/tsearch.out
> 2013-12-02 21:57:48.000000000 +0100
> +++
> /pub/devel/postgresql/postgresql-9.3.2-1/build/src/test/regress/results/tsearch.out
> 2014-01-24 19:36:27.625466800 +0100
> @@ -267,10 +267,11 @@
>  SELECT * FROM ts_parse('default', '345 qwe@efd.r '' http://www.com/
> http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr
> http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr
> http://5aew.werc.ewr:8100/?  ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw
> 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455
> 5.005 teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1>
> ewri2 <a href="qwe<qwe>">
>  /usr/local/fff /awdf/dwqe/4325 rewt/ewr wefjn /wqe-324/ewr gist.h gist.h.c
> gist.c. readline 4.2 4.2. 4.2, readline-4.2 readline-4.2. 234
>  <i <b> wow  < jqw <> qwerty');
> - tokid |                token
> --------+--------------------------------------
> -    22 | 345
> -    12 |
> + tokid |          token
> +-------+-------------------------
> +    12 | 3
> +    12 | 4
> +    12 | 5

Uh, there are some known problems with Cygwin, particularly with text
search.  I don't think anyone knows the cause, but I think it is a problem
in the Cygwin platform.

Here is a thread that mentions the issue:

    http://www.postgresql.org/message-id/51B59794.3000500@gmail.com

and here is a comment about the failures:

    http://www.postgresql.org/message-id/52E1EBD0.4090304@dunslane.net

    AFAICT the regression is in Cygwin. The buildfarm passes because it's
    using an oldish Cygwin release, 1.7.7 rather than the current 1.7.27. I
    have brought the regression to the attention of the Cygwin people in
    the past, but without response.

--
  Bruce Momjian  <bruce@momjian.us>        http://momjian.us
  EnterpriseDB                             http://enterprisedb.com

  + Everyone has their own god. +

Re: BUG #8970: ts_parse incorrectly split numbers in digit token

От
Bruce Momjian
Дата:
On Sat, Jan 25, 2014 at 10:39:53AM -0500, Bruce Momjian wrote:
> Uh, there are some known problems with Cygwin, particularly with text
> search.  I don't think anyone knows the cause, but I think it is a problem
> in the Cygwin platform.
>
> Here is a thread that mentions the issue:
>
>     http://www.postgresql.org/message-id/51B59794.3000500@gmail.com
>
> and here is a comment about the failures:
>
>     http://www.postgresql.org/message-id/52E1EBD0.4090304@dunslane.net
>
>     AFAICT the regression is in Cygwin. The buildfarm passes because it's
>     using an oldish Cygwin release, 1.7.7 rather than the current 1.7.27. I
>     have brought the regression to the attention of the Cygwin people in
>     the past, but without response.

Oops, I just replied to you with your own thread.  :-)  I thought it was
odd we had all these Cygwin reports all of a sudden.  LOL

Anyway, are you saying these are not platform bugs as Andrew suggested?

--
  Bruce Momjian  <bruce@momjian.us>        http://momjian.us
  EnterpriseDB                             http://enterprisedb.com

  + Everyone has their own god. +

Re: BUG #8970: ts_parse incorrectly split numbers in digit token

От
Marco Atzeri
Дата:
On 25/01/2014 16:43, Bruce Momjian wrote:
> On Sat, Jan 25, 2014 at 10:39:53AM -0500, Bruce Momjian wrote:
>> Uh, there are some known problems with Cygwin, particularly with text
>> search.  I don't think anyone knows the cause, but I think it is a problem
>> in the Cygwin platform.
>>
>> Here is a thread that mentions the issue:
>>
>>     http://www.postgresql.org/message-id/51B59794.3000500@gmail.com
>>
>> and here is a comment about the failures:
>>
>>     http://www.postgresql.org/message-id/52E1EBD0.4090304@dunslane.net
>>
>>     AFAICT the regression is in Cygwin. The buildfarm passes because it's
>>     using an oldish Cygwin release, 1.7.7 rather than the current 1.7.27. I
>>     have brought the regression to the attention of the Cygwin people in
>>     the past, but without response.

Bruce,
I took over the binary package responsibilities in Jan 2013 with 9.2.2
http://cygwin.com/ml/cygwin-announce/2013-01/msg00032.html

Before that, the previous package release was 8.2.11
http://cygwin.com/ml/cygwin-announce/2009-01/msg00010.html

Can we move forward?
Testing on 1.7.7 will not help me package a fully functional binary
deployed through our setup, as our users expect.

> Oops, I just replied to you with your own thread.  :-)  I thought it was
> odd we had all these Cygwin reports all of a sudden.  LOL

Always me ;-)

> Anyway, are you saying these are not platform bugs as Andrew suggested?

What do you mean by platform bugs?

I have no clue of how "ts_parse" works, so from my point of view
it could be any of:

1) wrong expectation of postgres about cygwin behaviour
  (I noticed that the code is full of "#if defined(platform)"
   that could break if the platform evolves)
2) a bug in the latest cygwin library (cygwin core or newlib)
3) a bug in one of the several other libraries that postgres is using
4) a bug in the latest version of the C compiler
In the past, with other software, I have seen each of these happen.

So where should I look in the postgres code to understand why "123"
is split into 3 tokens?

If (2) or (3), I need to pass a "simple" test case to the other
core and package maintainers that shows the incorrect behaviour
and allows them to fix the bug or report it to the upstream library developers.

If (1), we need to amend the postgres code, and for (4)
report a test case to the gcc developers and look for a temporary workaround.

Regards
Marco

Re: BUG #8970: ts_parse incorrectly split numbers in digit token

От
Alvaro Herrera
Дата:
marco.atzeri@gmail.com escribió:

>  SELECT * FROM ts_parse('default', '345 qwe@efd.r '' http://www.com/
> http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr
> http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr
> http://5aew.werc.ewr:8100/?  ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw

To trace this, I would look at src/backend/tsearch/wparser_def.c;
probably try compiling that file with WPARSER_TRACE defined, and compare
the output of ts_parse() in something simple such as '345' in a working
port with the failing one.  That might give you clues as to what is
causing the failure.

--
Álvaro Herrera                http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

Re: BUG #8970: ts_parse incorrectly split numbers in digit token

От
Marco Atzeri
Дата:
On 26/01/2014 03:25, Alvaro Herrera wrote:
> marco.atzeri@gmail.com escribió:
>

>
> To trace this, I would look at src/backend/tsearch/wparser_def.c;
> probably try compiling that file with WPARSER_TRACE defined, and compare
> the output of ts_parse() in something simple such as '345' in a working
> port with the failing one.  That might give you clues as to what is
> causing the failure.
>

database created with LANG=en_US.UTF-8

postgres=# SELECT * FROM ts_parse('default', '345');
;
  tokid | token
-------+-------
     12 | 3
     12 | 4
     22 | 5
(3 rows)


parsing "345"
state TPS_Base at 3 matched rule 12 flags tostate TPS_InSpace
state TPS_InSpace at 4 matched rule 8 flags BINGO tostate TPS_Base type
blank
state TPS_Base at 4 matched rule 12 flags tostate TPS_InSpace
state TPS_InSpace at 5 matched rule 8 flags BINGO tostate TPS_Base type
blank
state TPS_Base at 5 matched rule 5 flags tostate TPS_InUnsignedInt
state TPS_InUnsignedInt at EOF matched rule 0 flags BINGO tostate
TPS_Base type uint

database created with LANG=C

postgres=# SELECT * FROM ts_parse('default', '345');
;
  tokid | token
-------+-------
     22 | 345
(1 row)

parsing "345"
state TPS_Base at 3 matched rule 5 flags tostate TPS_InUnsignedInt
state TPS_InUnsignedInt at 4 matched rule 1 flags
state TPS_InUnsignedInt at 5 matched rule 1 flags
state TPS_InUnsignedInt at EOF matched rule 0 flags BINGO tostate
TPS_Base type uint

Re: BUG #8970: ts_parse incorrectly split numbers in digit token

От
Tom Lane
Дата:
Marco Atzeri <marco.atzeri@gmail.com> writes:
> On 26/01/2014 03:25, Alvaro Herrera wrote:
>> To trace this, I would look at src/backend/tsearch/wparser_def.c;
>> probably try compiling that file with WPARSER_TRACE defined, and compare
>> the output of ts_parse() in something simple such as '345' in a working
>> port with the failing one.  That might give you clues as to what is
>> causing the failure.

> [ trace ]

As was suspected upthread, this shows that p_isdigit() is failing to
recognize "3" as a digit.  So you've got broken locale support somewhere.

There are two different implementations of p_isdigit in wparser_def.c,
depending on whether USE_WIDE_UPPER_LOWER is defined.  It should be, in
a Windows build, but maybe this is tracing back to a configure problem?

            regards, tom lane

Re: BUG #8970: ts_parse incorrectly split numbers in digit token

От
Marco Atzeri
Дата:
On 26/01/2014 18:27, Tom Lane wrote:
> Marco Atzeri <marco.atzeri@gmail.com> writes:
>> On 26/01/2014 03:25, Alvaro Herrera wrote:
>>> To trace this, I would look at src/backend/tsearch/wparser_def.c;
>>> probably try compiling that file with WPARSER_TRACE defined, and compare
>>> the output of ts_parse() in something simple such as '345' in a working
>>> port with the failing one.  That might give you clues as to what is
>>> causing the failure.
>
>> [ trace ]
>
> As was suspected upthread, this shows that p_isdigit() is failing to
> recognize "3" as a digit.  So you've got broken locale support somewhere.
>
> There are two different implementations of p_isdigit in wparser_def.c,
> depending on whether USE_WIDE_UPPER_LOWER is defined.  It should be, in
> a Windows build, but maybe this is tracing back to a configure problem?
>
>             regards, tom lane
>

Debugging a bit, I think that it is not a broken locale.

The first two times, the character also contains a portion of the
next digit, so the result is always false.

Was it perhaps assumed that the size of a wide char is always 32 bits?

"Unlike Windows UTF-16 2-byte wide chars, wchar_t on Linux and OS X is 4
bytes UTF-32 (gcc/g++ and XCode). On cygwin it is 2 (cygwin uses Windows
APIs)."

testing with "SELECT * FROM ts_parse('default', '345');"

--------------------------------------------------------------
Breakpoint 1, p_isdigit (prs=0x80100930)
     at
/pub/devel/postgresql/postgresql-9.3.2-2/src/postgresql-9.3.2/src/backend/tsearch/wparser_def.c:560
560     p_iswhat(digit)
(gdb) step
0x007036d8 in iswdigit ()
(gdb) step
Single stepping until exit from function iswdigit,
which has no line number information.
iswdigit (c=3407923)
     at /usr/src/debug/cygwin-1.7.27-2/newlib/libc/ctype/iswdigit.c:35
35        return (c >= (wint_t)'0' && c <= (wint_t)'9');
(gdb) p/x c
$77 = 0x340033
(gdb) finish
Run till exit from #0  iswdigit (c=3407923)
     at /usr/src/debug/cygwin-1.7.27-2/newlib/libc/ctype/iswdigit.c:35
0x0060c510 in TParserGet (prs=0x80100930)
     at
/pub/devel/postgresql/postgresql-9.3.2-2/src/postgresql-9.3.2/src/backend/tsearch/wparser_def.c:1834
1834                            if (item->isclass(prs) != 0)
Value returned is $78 = 0

Breakpoint 1, p_isdigit (prs=0x80100930)
     at
/pub/devel/postgresql/postgresql-9.3.2-2/src/postgresql-9.3.2/src/backend/tsearch/wparser_def.c:560
560     p_iswhat(digit)
(gdb) step
0x007036d8 in iswdigit ()
(gdb) step
Single stepping until exit from function iswdigit,
which has no line number information.
iswdigit (c=3473460)
     at /usr/src/debug/cygwin-1.7.27-2/newlib/libc/ctype/iswdigit.c:35
35        return (c >= (wint_t)'0' && c <= (wint_t)'9');
(gdb) p/x c
$79 = 0x350034
(gdb) finish
Run till exit from #0  iswdigit (c=3473460)
     at /usr/src/debug/cygwin-1.7.27-2/newlib/libc/ctype/iswdigit.c:35
0x0060c510 in TParserGet (prs=0x80100930)
     at
/pub/devel/postgresql/postgresql-9.3.2-2/src/postgresql-9.3.2/src/backend/tsearch/wparser_def.c:1834
1834                            if (item->isclass(prs) != 0)
Value returned is $80 = 0

Breakpoint 1, p_isdigit (prs=0x80100930)
     at
/pub/devel/postgresql/postgresql-9.3.2-2/src/postgresql-9.3.2/src/backend/tsearch/wparser_def.c:560
560     p_iswhat(digit)
(gdb) step
0x007036d8 in iswdigit ()
(gdb) step
Single stepping until exit from function iswdigit,
which has no line number information.
iswdigit (c=53)
     at /usr/src/debug/cygwin-1.7.27-2/newlib/libc/ctype/iswdigit.c:35
35        return (c >= (wint_t)'0' && c <= (wint_t)'9');
(gdb) p/x c
$81 = 0x35
(gdb) finish
Run till exit from #0  iswdigit (c=53)
     at /usr/src/debug/cygwin-1.7.27-2/newlib/libc/ctype/iswdigit.c:35
0x0060c510 in TParserGet (prs=0x80100930)
     at
/pub/devel/postgresql/postgresql-9.3.2-2/src/postgresql-9.3.2/src/backend/tsearch/wparser_def.c:1834
1834                            if (item->isclass(prs) != 0)
Value returned is $82 = 1
-------------------------------------------------------------------------

Re: BUG #8970: ts_parse incorrectly split numbers in digit token

От
Tom Lane
Дата:
Marco Atzeri <marco.atzeri@gmail.com> writes:
> Eventually it was assumed that size of a wide char is always 32 bit ?

Hm.  It looks like there's an entirely unnecessary assumption that wchar_t
and wint_t are the same width.  Does the attached patch make things better
for you?

            regards, tom lane

diff --git a/src/backend/tsearch/wparser_def.c b/src/backend/tsearch/wparser_def.c
index fa73dff..6728212 100644
*** a/src/backend/tsearch/wparser_def.c
--- b/src/backend/tsearch/wparser_def.c
*************** TParserCopyClose(TParser *prs)
*** 432,438 ****
   *      or give wrong result.
   *    - multibyte encoding and C-locale often are used for
   *      Asian languages.
!  *    - if locale is C the we use pgwstr instead of wstr
   */

  #ifdef USE_WIDE_UPPER_LOWER
--- 432,438 ----
   *      or give wrong result.
   *    - multibyte encoding and C-locale often are used for
   *      Asian languages.
!  *    - if locale is C then we use pgwstr instead of wstr.
   */

  #ifdef USE_WIDE_UPPER_LOWER
*************** p_is##type(TParser *prs) {                                                    \
*** 444,452 ****
      if ( prs->usewide )                                                        \
      {                                                                        \
          if ( prs->pgwstr )                                                    \
!             return is##type( 0xff & *( prs->pgwstr + prs->state->poschar) );\
!                                                                             \
!         return isw##type( *(wint_t*)( prs->wstr + prs->state->poschar ) );    \
      }                                                                        \
                                                                              \
      return is##type( *(unsigned char*)( prs->str + prs->state->posbyte ) ); \
--- 444,456 ----
      if ( prs->usewide )                                                        \
      {                                                                        \
          if ( prs->pgwstr )                                                    \
!         {                                                                    \
!             unsigned int c = *(prs->pgwstr + prs->state->poschar);            \
!             if ( c > 0x7f )                                                    \
!                 return 0;                                                    \
!             return is##type( c );                                            \
!         }                                                                    \
!         return isw##type( *( prs->wstr + prs->state->poschar ) );            \
      }                                                                        \
                                                                              \
      return is##type( *(unsigned char*)( prs->str + prs->state->posbyte ) ); \
*************** p_isalnum(TParser *prs)
*** 475,484 ****
              if (c > 0x7f)
                  return 1;

!             return isalnum(0xff & c);
          }

!         return iswalnum((wint_t) *(prs->wstr + prs->state->poschar));
      }

      return isalnum(*(unsigned char *) (prs->str + prs->state->posbyte));
--- 479,488 ----
              if (c > 0x7f)
                  return 1;

!             return isalnum(c);
          }

!         return iswalnum(*(prs->wstr + prs->state->poschar));
      }

      return isalnum(*(unsigned char *) (prs->str + prs->state->posbyte));
*************** p_isalpha(TParser *prs)
*** 507,516 ****
              if (c > 0x7f)
                  return 1;

!             return isalpha(0xff & c);
          }

!         return iswalpha((wint_t) *(prs->wstr + prs->state->poschar));
      }

      return isalpha(*(unsigned char *) (prs->str + prs->state->posbyte));
--- 511,520 ----
              if (c > 0x7f)
                  return 1;

!             return isalpha(c);
          }

!         return iswalpha(*(prs->wstr + prs->state->poschar));
      }

      return isalpha(*(unsigned char *) (prs->str + prs->state->posbyte));

Re: BUG #8970: ts_parse incorrectly split numbers in digit token

От
Marco Atzeri
Дата:
On 01/02/2014 23:27, Tom Lane wrote:
> Marco Atzeri <marco.atzeri@gmail.com> writes:
>> Eventually it was assumed that size of a wide char is always 32 bit ?
>
> Hm.  It looks like there's an entirely unnecessary assumption that wchar_t
> and wint_t are the same width.  Does the attached patch make things better
> for you?
>
>             regards, tom lane
>

tested on 9.3.2 with LANG=en_US.UTF-8

      tsearch                  ... ok


Thanks
Marco

Re: BUG #8970: ts_parse incorrectly split numbers in digit token

От
Tom Lane
Дата:
Marco Atzeri <marco.atzeri@gmail.com> writes:
> On 01/02/2014 23:27, Tom Lane wrote:
>> Hm.  It looks like there's an entirely unnecessary assumption that wchar_t
>> and wint_t are the same width.  Does the attached patch make things better
>> for you?

> tested on 9.3.2 with LANG=en_US.UTF-8
>       tsearch                  ... ok

Excellent, I'll go commit this.

            regards, tom lane