Обсуждение: ...

Поиск
Список
Период
Сортировка

...

От
Angelos Karageorgiou
Дата:
I am using postgres 6.4.2 on BSD/OS 3.1 with a Greek locale that I
have developed. I knew that regexes with postgres would not work because
of something I did, but a posting from another fellow from Sweden gave me a
clue that the problem must be with the regex package and not the locale.

So I investigated the code and found out that pg_isdigit(int ch),
pg_isalpha(int ch) and the associated functions do a comparison of
characters as ints. I changed a few crucial points with a cast to
(unsigned char) and voilà, regexes in Greek with full locale support. My
guess is that an int != unsigned char when comparing; the sign bit is
probably the culprit.

Please test the patch on some other language too; Swedish or Finnish
would be a nice touch.

Patch follows, but it is trivial really.
---------------------------------------------------------------------------------
*** regcomp.c    Tue Sep  1 07:31:25 1998
--- regcomp.c.patched    Wed Feb 10 19:57:11 1999
***************
*** 1038,1046 ****
  {
      assert(pg_isalpha(ch));
      if (pg_isupper(ch))
!         return tolower(ch);
      else if (pg_islower(ch))
!         return toupper(ch);
      else
  /* peculiar, but could happen */
          return ch;
--- 1038,1046 ----
  {
      assert(pg_isalpha(ch));
      if (pg_isupper(ch))
!         return tolower((unsigned char)ch);
      else if (pg_islower(ch))
!         return toupper((unsigned char)ch);
      else
  /* peculiar, but could happen */
          return ch;
***************
*** 1055,1067 ****
  static void
  bothcases(p, ch)
  struct parse *p;
! int            ch;
  {
      pg_wchar   *oldnext = p->next;
      pg_wchar   *oldend = p->end;
      pg_wchar    bracket[3];
  
!     assert(othercase(ch) != ch);/* p_bracket() would recurse */
      p->next = bracket;
      p->end = bracket + 2;
      bracket[0] = ch;
--- 1055,1067 ----
  static void
  bothcases(p, ch)
  struct parse *p;
! int        ch;
  {
      pg_wchar   *oldnext = p->next;
      pg_wchar   *oldend = p->end;
      pg_wchar    bracket[3];
  
!     assert(othercase(ch) != (unsigned char)ch);/* p_bracket() would recurse */
      p->next = bracket;
      p->end = bracket + 2;
      bracket[0] = ch;
***************
*** 1084,1090 ****
  {
      cat_t       *cap = p->g->categories;
  
!     if ((p->g->cflags & REG_ICASE) && pg_isalpha(ch) && othercase(ch) != ch)
          bothcases(p, ch);
      else
      {
--- 1084,1090 ----
  {
      cat_t       *cap = p->g->categories;
  
!     if ((p->g->cflags & REG_ICASE) && pg_isalpha(ch) && othercase(ch) != (unsigned char)ch)
          bothcases(p, ch);
      else
      {
***************
*** 1862,1868 ****
  #ifdef MULTIBYTE
      return (c >= 0 && c <= UCHAR_MAX && isdigit(c));
  #else
!     return (isdigit(c));
  #endif
  }
  
--- 1862,1868 ----
  #ifdef MULTIBYTE
      return (c >= 0 && c <= UCHAR_MAX && isdigit(c));
  #else
!     return (isdigit((unsigned char)c));
  #endif
  }
  
***************
*** 1872,1878 ****
  #ifdef MULTIBYTE
      return (c >= 0 && c <= UCHAR_MAX && isalpha(c));
  #else
!     return (isalpha(c));
  #endif
  }
  
--- 1872,1878 ----
  #ifdef MULTIBYTE
      return (c >= 0 && c <= UCHAR_MAX && isalpha(c));
  #else
!     return (isalpha((unsigned char)c));
  #endif
  }
  
***************
*** 1882,1888 ****
  #ifdef MULTIBYTE
      return (c >= 0 && c <= UCHAR_MAX && isupper(c));
  #else
!     return (isupper(c));
  #endif
  }
  
--- 1882,1888 ----
  #ifdef MULTIBYTE
      return (c >= 0 && c <= UCHAR_MAX && isupper(c));
  #else
!     return (isupper((unsigned char)c));
  #endif
  }
  
***************
*** 1892,1897 ****
  #ifdef MULTIBYTE
      return (c >= 0 && c <= UCHAR_MAX && islower(c));
  #else
!     return (islower(c));
  #endif
  }
--- 1892,1897 ----
  #ifdef MULTIBYTE
      return (c >= 0 && c <= UCHAR_MAX && islower(c));
  #else
!     return (islower((unsigned char)c));
  #endif
  }


Re: your mail

От
Oleg Broytmann
Дата:
Hello!
  Next time you send a patch, could you use the tools in .../src/tools/make_diff?
  I've applied the patch to 6.4.2 on Debian 2.0 and ran the locale test on
the koi8-r locale. The locale test passed before the patch and passed after
the patch as well. I didn't notice any difference. What difference did you expect?
  Please supply data for the locale test (look into .../src/test/locale). This
is not related to your patch; we're just collecting test data.

On Wed, 10 Feb 1999, Angelos Karageorgiou wrote:

> I am using postgres 6.4.2 on BSD/OS 3.1 with  a Greek locale  that I
> have developed. I knew that regexes with postgress would not work because
> of something I did but a posting from another follow from Sweden gave me a
> clue that the problem must be with the regex package and not the locale.
> 
> So I investigated the code and found out the pg_isdigit(int ch),
> pg_isalpha(int ch) and the associated functions do a comparison of
> characters as ints. I changed a few crucial points with a cast to
> (unsigned char) and voila , regexs in Greek with full locale support. My
> guess is that an int != unsigned char when comparing, the sign bit is
> probably the culprit.
> 
> Please test the patch on some other language too, Swedish or Finish
> would be a nice touch.
> 
> Patch follows, but it is trivial really.
> ---------------------------------------------------------------------------------
> *** regcomp.c    Tue Sep  1 07:31:25 1998
> --- regcomp.c.patched    Wed Feb 10 19:57:11 1999
> ***************
> *** 1038,1046 ****
>   {
>       assert(pg_isalpha(ch));
>       if (pg_isupper(ch))
> !         return tolower(ch);
>       else if (pg_islower(ch))
> !         return toupper(ch);
>       else
>   /* peculiar, but could happen */
>           return ch;
> --- 1038,1046 ----
>   {
>       assert(pg_isalpha(ch));
>       if (pg_isupper(ch))
> !         return tolower((unsigned char)ch);
>       else if (pg_islower(ch))
> !         return toupper((unsigned char)ch);
>       else
>   /* peculiar, but could happen */
>           return ch;
> ***************
> *** 1055,1067 ****
>   static void
>   bothcases(p, ch)
>   struct parse *p;
> ! int            ch;
>   {
>       pg_wchar   *oldnext = p->next;
>       pg_wchar   *oldend = p->end;
>       pg_wchar    bracket[3];
>   
> !     assert(othercase(ch) != ch);/* p_bracket() would recurse */
>       p->next = bracket;
>       p->end = bracket + 2;
>       bracket[0] = ch;
> --- 1055,1067 ----
>   static void
>   bothcases(p, ch)
>   struct parse *p;
> ! int        ch;
>   {
>       pg_wchar   *oldnext = p->next;
>       pg_wchar   *oldend = p->end;
>       pg_wchar    bracket[3];
>   
> !     assert(othercase(ch) != (unsigned char)ch);/* p_bracket() would recurse */
>       p->next = bracket;
>       p->end = bracket + 2;
>       bracket[0] = ch;
> ***************
> *** 1084,1090 ****
>   {
>       cat_t       *cap = p->g->categories;
>   
> !     if ((p->g->cflags & REG_ICASE) && pg_isalpha(ch) && othercase(ch) != ch)
>           bothcases(p, ch);
>       else
>       {
> --- 1084,1090 ----
>   {
>       cat_t       *cap = p->g->categories;
>   
> !     if ((p->g->cflags & REG_ICASE) && pg_isalpha(ch) && othercase(ch) != (unsigned char)ch)
>           bothcases(p, ch);
>       else
>       {
> ***************
> *** 1862,1868 ****
>   #ifdef MULTIBYTE
>       return (c >= 0 && c <= UCHAR_MAX && isdigit(c));
>   #else
> !     return (isdigit(c));
>   #endif
>   }
>   
> --- 1862,1868 ----
>   #ifdef MULTIBYTE
>       return (c >= 0 && c <= UCHAR_MAX && isdigit(c));
>   #else
> !     return (isdigit((unsigned char)c));
>   #endif
>   }
>   
> ***************
> *** 1872,1878 ****
>   #ifdef MULTIBYTE
>       return (c >= 0 && c <= UCHAR_MAX && isalpha(c));
>   #else
> !     return (isalpha(c));
>   #endif
>   }
>   
> --- 1872,1878 ----
>   #ifdef MULTIBYTE
>       return (c >= 0 && c <= UCHAR_MAX && isalpha(c));
>   #else
> !     return (isalpha((unsigned char)c));
>   #endif
>   }
>   
> ***************
> *** 1882,1888 ****
>   #ifdef MULTIBYTE
>       return (c >= 0 && c <= UCHAR_MAX && isupper(c));
>   #else
> !     return (isupper(c));
>   #endif
>   }
>   
> --- 1882,1888 ----
>   #ifdef MULTIBYTE
>       return (c >= 0 && c <= UCHAR_MAX && isupper(c));
>   #else
> !     return (isupper((unsigned char)c));
>   #endif
>   }
>   
> ***************
> *** 1892,1897 ****
>   #ifdef MULTIBYTE
>       return (c >= 0 && c <= UCHAR_MAX && islower(c));
>   #else
> !     return (islower(c));
>   #endif
>   }
> --- 1892,1897 ----
>   #ifdef MULTIBYTE
>       return (c >= 0 && c <= UCHAR_MAX && islower(c));
>   #else
> !     return (islower((unsigned char)c));
>   #endif
>   }
> 

Oleg.
----
    Oleg Broytmann     http://members.xoom.com/phd2/     phd2@earthling.net
           Programmers don't die, they just GOSUB without RETURN.
 



Re: your mail

От
Bruce Momjian
Дата:
Did we reject this 'unsigned' patch, folks?  I seem to remember someone
objecting to it.


> I am using postgres 6.4.2 on BSD/OS 3.1 with  a Greek locale  that I
> have developed. I knew that regexes with postgress would not work because
> of something I did but a posting from another follow from Sweden gave me a
> clue that the problem must be with the regex package and not the locale.
> 
> So I investigated the code and found out the pg_isdigit(int ch),
> pg_isalpha(int ch) and the associated functions do a comparison of
> characters as ints. I changed a few crucial points with a cast to
> (unsigned char) and voila , regexs in Greek with full locale support. My
> guess is that an int != unsigned char when comparing, the sign bit is
> probably the culprit.
> 
> Please test the patch on some other language too, Swedish or Finish
> would be a nice touch.
> 
> Patch follows, but it is trivial really.
> ---------------------------------------------------------------------------------
> *** regcomp.c    Tue Sep  1 07:31:25 1998
> --- regcomp.c.patched    Wed Feb 10 19:57:11 1999
> ***************
> *** 1038,1046 ****
>   {
>       assert(pg_isalpha(ch));
>       if (pg_isupper(ch))
> !         return tolower(ch);
>       else if (pg_islower(ch))
> !         return toupper(ch);
>       else
>   /* peculiar, but could happen */
>           return ch;
> --- 1038,1046 ----
>   {
>       assert(pg_isalpha(ch));
>       if (pg_isupper(ch))
> !         return tolower((unsigned char)ch);
>       else if (pg_islower(ch))
> !         return toupper((unsigned char)ch);
>       else
>   /* peculiar, but could happen */
>           return ch;
> ***************
> *** 1055,1067 ****
>   static void
>   bothcases(p, ch)
>   struct parse *p;
> ! int            ch;
>   {
>       pg_wchar   *oldnext = p->next;
>       pg_wchar   *oldend = p->end;
>       pg_wchar    bracket[3];
>   
> !     assert(othercase(ch) != ch);/* p_bracket() would recurse */
>       p->next = bracket;
>       p->end = bracket + 2;
>       bracket[0] = ch;
> --- 1055,1067 ----
>   static void
>   bothcases(p, ch)
>   struct parse *p;
> ! int        ch;
>   {
>       pg_wchar   *oldnext = p->next;
>       pg_wchar   *oldend = p->end;
>       pg_wchar    bracket[3];
>   
> !     assert(othercase(ch) != (unsigned char)ch);/* p_bracket() would recurse */
>       p->next = bracket;
>       p->end = bracket + 2;
>       bracket[0] = ch;
> ***************
> *** 1084,1090 ****
>   {
>       cat_t       *cap = p->g->categories;
>   
> !     if ((p->g->cflags & REG_ICASE) && pg_isalpha(ch) && othercase(ch) != ch)
>           bothcases(p, ch);
>       else
>       {
> --- 1084,1090 ----
>   {
>       cat_t       *cap = p->g->categories;
>   
> !     if ((p->g->cflags & REG_ICASE) && pg_isalpha(ch) && othercase(ch) != (unsigned char)ch)
>           bothcases(p, ch);
>       else
>       {
> ***************
> *** 1862,1868 ****
>   #ifdef MULTIBYTE
>       return (c >= 0 && c <= UCHAR_MAX && isdigit(c));
>   #else
> !     return (isdigit(c));
>   #endif
>   }
>   
> --- 1862,1868 ----
>   #ifdef MULTIBYTE
>       return (c >= 0 && c <= UCHAR_MAX && isdigit(c));
>   #else
> !     return (isdigit((unsigned char)c));
>   #endif
>   }
>   
> ***************
> *** 1872,1878 ****
>   #ifdef MULTIBYTE
>       return (c >= 0 && c <= UCHAR_MAX && isalpha(c));
>   #else
> !     return (isalpha(c));
>   #endif
>   }
>   
> --- 1872,1878 ----
>   #ifdef MULTIBYTE
>       return (c >= 0 && c <= UCHAR_MAX && isalpha(c));
>   #else
> !     return (isalpha((unsigned char)c));
>   #endif
>   }
>   
> ***************
> *** 1882,1888 ****
>   #ifdef MULTIBYTE
>       return (c >= 0 && c <= UCHAR_MAX && isupper(c));
>   #else
> !     return (isupper(c));
>   #endif
>   }
>   
> --- 1882,1888 ----
>   #ifdef MULTIBYTE
>       return (c >= 0 && c <= UCHAR_MAX && isupper(c));
>   #else
> !     return (isupper((unsigned char)c));
>   #endif
>   }
>   
> ***************
> *** 1892,1897 ****
>   #ifdef MULTIBYTE
>       return (c >= 0 && c <= UCHAR_MAX && islower(c));
>   #else
> !     return (islower(c));
>   #endif
>   }
> --- 1892,1897 ----
>   #ifdef MULTIBYTE
>       return (c >= 0 && c <= UCHAR_MAX && islower(c));
>   #else
> !     return (islower((unsigned char)c));
>   #endif
>   }
> 
> 


--
  Bruce Momjian                        |  http://www.op.net/~candle
  maillist@candle.pha.pa.us            |  (610) 853-3000
  +  If your life is a hard drive,     |  830 Blythe Avenue
  +  Christ can be your backup.        |  Drexel Hill, Pennsylvania 19026
 


Re: [HACKERS] Re: your mail

От
Tatsuo Ishii
Дата:
>Did we reject this 'unsigned' patch, folks?  I seem to remember someone
>objecting to it.
[snip]
>> ***************
>> *** 1862,1868 ****
>>   #ifdef MULTIBYTE
>>       return (c >= 0 && c <= UCHAR_MAX && isdigit(c));
>>   #else
>> !     return (isdigit(c));
>>   #endif
>>   }
>>   
>> --- 1862,1868 ----
>>   #ifdef MULTIBYTE
>>       return (c >= 0 && c <= UCHAR_MAX && isdigit(c));
>>   #else
>> !     return (isdigit((unsigned char)c));
>>   #endif
>>   }

According to the ANSI C standard, the argument to isdigit (and its
friends) must have the value of either an unsigned char or
*EOF*. That's why the argument is typed as int, I guess. This patch
seems to break the rule?

BTW, I would like to propose yet another patch for the problem. This
seems to work on FreeBSD and Linux. Angelos, can you test it on your
platform (is it BSD/OS?)?
--
Tatsuo Ishii

*** regcomp.c~    Tue Sep  1 13:31:25 1998
--- regcomp.c    Thu Mar 11 16:51:28 1999
***************
*** 95,101 ****
      static void p_b_eclass(struct parse * p, cset *cs);
      static pg_wchar p_b_symbol(struct parse * p);
      static char p_b_coll_elem(struct parse * p, int endc);
!     static char othercase(int ch);
      static void bothcases(struct parse * p, int ch);
      static void ordinary(struct parse * p, int ch);
      static void nonnewline(struct parse * p);
--- 95,101 ----
      static void p_b_eclass(struct parse * p, cset *cs);
      static pg_wchar p_b_symbol(struct parse * p);
      static char p_b_coll_elem(struct parse * p, int endc);
!     static unsigned char othercase(int ch);
      static void bothcases(struct parse * p, int ch);
      static void ordinary(struct parse * p, int ch);
      static void nonnewline(struct parse * p);
***************
*** 1032,1049 ****
   - othercase - return the case counterpart of an alphabetic
   == static char othercase(int ch);
   */
! static char                        /* if no counterpart, return ch */
  othercase(ch)
  int            ch;
  {
      assert(pg_isalpha(ch));
      if (pg_isupper(ch))
!         return tolower(ch);
      else if (pg_islower(ch))
!         return toupper(ch);
      else
  /* peculiar, but could happen */
!         return ch;
  }
  
  /*
--- 1032,1049 ----
   - othercase - return the case counterpart of an alphabetic
   == static char othercase(int ch);
   */
! static unsigned char        /* if no counterpart, return ch */
  othercase(ch)
  int            ch;
  {
      assert(pg_isalpha(ch));
      if (pg_isupper(ch))
!         return (unsigned char)tolower(ch);
      else if (pg_islower(ch))
!         return (unsigned char)toupper(ch);
      else
  /* peculiar, but could happen */
!         return (unsigned char)ch;
  }
  
  /*


Re: [HACKERS] Re: your mail

От
Bruce Momjian
Дата:
I think we decided against this, right?

> >Did we reject this 'unsigned' patch, folks?  I seem to remember someone
> >objecting to it.
> [snip]
> >> ***************
> >> *** 1862,1868 ****
> >>   #ifdef MULTIBYTE
> >>       return (c >= 0 && c <= UCHAR_MAX && isdigit(c));
> >>   #else
> >> !     return (isdigit(c));
> >>   #endif
> >>   }
> >>   
> >> --- 1862,1868 ----
> >>   #ifdef MULTIBYTE
> >>       return (c >= 0 && c <= UCHAR_MAX && isdigit(c));
> >>   #else
> >> !     return (isdigit((unsigned char)c));
> >>   #endif
> >>   }
> 
> According to the ANSI/C standard the argument to isdigit (or some
> other friends) must have the value of either an unsigned char or
> *EOF*. That's why the argument is typed to int, I guess. This patch
> seems to break the rule?
> 
> BTW, I would like to propose yet another patches for the problem. This 
> seems to work on FreeBSD and Linux. Angelos, can you test it on your
> platform (is it a BSD/OS?)?
> --
> Tatsuo Ishii
> 
> *** regcomp.c~    Tue Sep  1 13:31:25 1998
> --- regcomp.c    Thu Mar 11 16:51:28 1999
> ***************
> *** 95,101 ****
>       static void p_b_eclass(struct parse * p, cset *cs);
>       static pg_wchar p_b_symbol(struct parse * p);
>       static char p_b_coll_elem(struct parse * p, int endc);
> !     static char othercase(int ch);
>       static void bothcases(struct parse * p, int ch);
>       static void ordinary(struct parse * p, int ch);
>       static void nonnewline(struct parse * p);
> --- 95,101 ----
>       static void p_b_eclass(struct parse * p, cset *cs);
>       static pg_wchar p_b_symbol(struct parse * p);
>       static char p_b_coll_elem(struct parse * p, int endc);
> !     static unsigned char othercase(int ch);
>       static void bothcases(struct parse * p, int ch);
>       static void ordinary(struct parse * p, int ch);
>       static void nonnewline(struct parse * p);
> ***************
> *** 1032,1049 ****
>    - othercase - return the case counterpart of an alphabetic
>    == static char othercase(int ch);
>    */
> ! static char                        /* if no counterpart, return ch */
>   othercase(ch)
>   int            ch;
>   {
>       assert(pg_isalpha(ch));
>       if (pg_isupper(ch))
> !         return tolower(ch);
>       else if (pg_islower(ch))
> !         return toupper(ch);
>       else
>   /* peculiar, but could happen */
> !         return ch;
>   }
>   
>   /*
> --- 1032,1049 ----
>    - othercase - return the case counterpart of an alphabetic
>    == static char othercase(int ch);
>    */
> ! static unsigned char        /* if no counterpart, return ch */
>   othercase(ch)
>   int            ch;
>   {
>       assert(pg_isalpha(ch));
>       if (pg_isupper(ch))
> !         return (unsigned char)tolower(ch);
>       else if (pg_islower(ch))
> !         return (unsigned char)toupper(ch);
>       else
>   /* peculiar, but could happen */
> !         return (unsigned char)ch;
>   }
>   
>   /*
> 


--
  Bruce Momjian                        |  http://www.op.net/~candle
  maillist@candle.pha.pa.us            |  (610) 853-3000
  +  If your life is a hard drive,     |  830 Blythe Avenue
  +  Christ can be your backup.        |  Drexel Hill, Pennsylvania 19026