Обсуждение: ...
I am using postgres 6.4.2 on BSD/OS 3.1 with a Greek locale that I
have developed. I knew that regexes with postgres would not work because
of something I did, but a posting from another fellow from Sweden gave me a
clue that the problem must be with the regex package and not the locale.
So I investigated the code and found out that pg_isdigit(int ch),
pg_isalpha(int ch) and the associated functions do a comparison of
characters as ints. I changed a few crucial points with a cast to
(unsigned char) and voila, regexes in Greek with full locale support. My
guess is that an int != unsigned char when comparing; the sign bit is
probably the culprit.
Please test the patch on some other language too; Swedish or Finnish
would be a nice touch.
Patch follows, but it is trivial really.
---------------------------------------------------------------------------------
*** regcomp.c Tue Sep 1 07:31:25 1998
--- regcomp.c.patched Wed Feb 10 19:57:11 1999
***************
*** 1038,1046 ****
{
assert(pg_isalpha(ch));
if (pg_isupper(ch))
! return tolower(ch);
else if (pg_islower(ch))
! return toupper(ch);
else
/* peculiar, but could happen */
return ch;
--- 1038,1046 ----
{
assert(pg_isalpha(ch));
if (pg_isupper(ch))
! return tolower((unsigned char)ch);
else if (pg_islower(ch))
! return toupper((unsigned char)ch);
else
/* peculiar, but could happen */
return ch;
***************
*** 1055,1067 ****
static void
bothcases(p, ch)
struct parse *p;
! int ch;
{
pg_wchar *oldnext = p->next;
pg_wchar *oldend = p->end;
pg_wchar bracket[3];

! assert(othercase(ch) != ch);/* p_bracket() would recurse */
p->next = bracket;
p->end = bracket + 2;
bracket[0] = ch;
--- 1055,1067 ----
static void
bothcases(p, ch)
struct parse *p;
! int ch;
{
pg_wchar *oldnext = p->next;
pg_wchar *oldend = p->end;
pg_wchar bracket[3];

! assert(othercase(ch) != (unsigned char)ch);/* p_bracket() would recurse */
p->next = bracket;
p->end = bracket + 2;
bracket[0] = ch;
***************
*** 1084,1090 ****
{
cat_t *cap = p->g->categories;

! if ((p->g->cflags & REG_ICASE) && pg_isalpha(ch) && othercase(ch) != ch)
bothcases(p, ch);
else
{
--- 1084,1090 ----
{
cat_t *cap = p->g->categories;

! if ((p->g->cflags & REG_ICASE) && pg_isalpha(ch) && othercase(ch) != (unsigned char)ch)
bothcases(p, ch);
else
{
***************
*** 1862,1868 ****
#ifdef MULTIBYTE
return (c >= 0 && c <= UCHAR_MAX && isdigit(c));
#else
! return (isdigit(c));
#endif
}

--- 1862,1868 ----
#ifdef MULTIBYTE
return (c >= 0 && c <= UCHAR_MAX && isdigit(c));
#else
! return (isdigit((unsigned char)c));
#endif
}

***************
*** 1872,1878 ****
#ifdef MULTIBYTE
return (c >= 0 && c <= UCHAR_MAX && isalpha(c));
#else
! return (isalpha(c));
#endif
}

--- 1872,1878 ----
#ifdef MULTIBYTE
return (c >= 0 && c <= UCHAR_MAX && isalpha(c));
#else
! return (isalpha((unsigned char)c));
#endif
}

***************
*** 1882,1888 ****
#ifdef MULTIBYTE
return (c >= 0 && c <= UCHAR_MAX && isupper(c));
#else
! return (isupper(c));
#endif
}

--- 1882,1888 ----
#ifdef MULTIBYTE
return (c >= 0 && c <= UCHAR_MAX && isupper(c));
#else
! return (isupper((unsigned char)c));
#endif
}

***************
*** 1892,1897 ****
#ifdef MULTIBYTE
return (c >= 0 && c <= UCHAR_MAX && islower(c));
#else
! return (islower(c));
#endif
}
--- 1892,1897 ----
#ifdef MULTIBYTE
return (c >= 0 && c <= UCHAR_MAX && islower(c));
#else
! return (islower((unsigned char)c));
#endif
}
Hello!
Next time you send a patch, could you use the tools in .../src/tools/make_diff?
I've applied the patch to 6.4.2 on Debian 2.0 and ran the locale test on the
koi8-r locale. The locale test passed before the patch, and it passed after the
patch as well. I didn't notice any difference. What difference did you expect?
Please supply data for the locale test (look into .../src/test/locale). This
is not related to your patch; we're just collecting test data.
On Wed, 10 Feb 1999, Angelos Karageorgiou wrote:
> I am using postgres 6.4.2 on BSD/OS 3.1 with a Greek locale that I
> have developed. I knew that regexes with postgress would not work because
> of something I did but a posting from another follow from Sweden gave me a
> clue that the problem must be with the regex package and not the locale.
>
> So I investigated the code and found out the pg_isdigit(int ch),
> pg_isalpha(int ch) and the associated functions do a comparison of
> characters as ints. I changed a few crucial points with a cast to
> (unsigned char) and voila , regexs in Greek with full locale support. My
> guess is that an int != unsigned char when comparing, the sign bit is
> probably the culprit.
>
> Please test the patch on some other language too, Swedish or Finish
> would be a nice touch.
>
> Patch follows, but it is trivial really.
> ---------------------------------------------------------------------------------
> *** regcomp.c Tue Sep 1 07:31:25 1998
> --- regcomp.c.patched Wed Feb 10 19:57:11 1999
> ***************
> *** 1038,1046 ****
> {
> assert(pg_isalpha(ch));
> if (pg_isupper(ch))
> ! return tolower(ch);
> else if (pg_islower(ch))
> ! return toupper(ch);
> else
> /* peculiar, but could happen */
> return ch;
> --- 1038,1046 ----
> {
> assert(pg_isalpha(ch));
> if (pg_isupper(ch))
> ! return tolower((unsigned char)ch);
> else if (pg_islower(ch))
> ! return toupper((unsigned char)ch);
> else
> /* peculiar, but could happen */
> return ch;
> ***************
> *** 1055,1067 ****
> static void
> bothcases(p, ch)
> struct parse *p;
> ! int ch;
> {
> pg_wchar *oldnext = p->next;
> pg_wchar *oldend = p->end;
> pg_wchar bracket[3];
>
> ! assert(othercase(ch) != ch);/* p_bracket() would recurse */
> p->next = bracket;
> p->end = bracket + 2;
> bracket[0] = ch;
> --- 1055,1067 ----
> static void
> bothcases(p, ch)
> struct parse *p;
> ! int ch;
> {
> pg_wchar *oldnext = p->next;
> pg_wchar *oldend = p->end;
> pg_wchar bracket[3];
>
> ! assert(othercase(ch) != (unsigned char)ch);/* p_bracket() would recurse */
> p->next = bracket;
> p->end = bracket + 2;
> bracket[0] = ch;
> ***************
> *** 1084,1090 ****
> {
> cat_t *cap = p->g->categories;
>
> ! if ((p->g->cflags & REG_ICASE) && pg_isalpha(ch) && othercase(ch) != ch)
> bothcases(p, ch);
> else
> {
> --- 1084,1090 ----
> {
> cat_t *cap = p->g->categories;
>
> ! if ((p->g->cflags & REG_ICASE) && pg_isalpha(ch) && othercase(ch) != (unsigned char)ch)
> bothcases(p, ch);
> else
> {
> ***************
> *** 1862,1868 ****
> #ifdef MULTIBYTE
> return (c >= 0 && c <= UCHAR_MAX && isdigit(c));
> #else
> ! return (isdigit(c));
> #endif
> }
>
> --- 1862,1868 ----
> #ifdef MULTIBYTE
> return (c >= 0 && c <= UCHAR_MAX && isdigit(c));
> #else
> ! return (isdigit((unsigned char)c));
> #endif
> }
>
> ***************
> *** 1872,1878 ****
> #ifdef MULTIBYTE
> return (c >= 0 && c <= UCHAR_MAX && isalpha(c));
> #else
> ! return (isalpha(c));
> #endif
> }
>
> --- 1872,1878 ----
> #ifdef MULTIBYTE
> return (c >= 0 && c <= UCHAR_MAX && isalpha(c));
> #else
> ! return (isalpha((unsigned char)c));
> #endif
> }
>
> ***************
> *** 1882,1888 ****
> #ifdef MULTIBYTE
> return (c >= 0 && c <= UCHAR_MAX && isupper(c));
> #else
> ! return (isupper(c));
> #endif
> }
>
> --- 1882,1888 ----
> #ifdef MULTIBYTE
> return (c >= 0 && c <= UCHAR_MAX && isupper(c));
> #else
> ! return (isupper((unsigned char)c));
> #endif
> }
>
> ***************
> *** 1892,1897 ****
> #ifdef MULTIBYTE
> return (c >= 0 && c <= UCHAR_MAX && islower(c));
> #else
> ! return (islower(c));
> #endif
> }
> --- 1892,1897 ----
> #ifdef MULTIBYTE
> return (c >= 0 && c <= UCHAR_MAX && islower(c));
> #else
> ! return (islower((unsigned char)c));
> #endif
> }
>
Oleg.
---- Oleg Broytmann http://members.xoom.com/phd2/ phd2@earthling.net
Programmers don't die, they just GOSUB without RETURN.
Did we reject this 'unsigned' patch, folks? I seem to remember someone
objecting to it.
> I am using postgres 6.4.2 on BSD/OS 3.1 with a Greek locale that I
> have developed. I knew that regexes with postgress would not work because
> of something I did but a posting from another follow from Sweden gave me a
> clue that the problem must be with the regex package and not the locale.
>
> So I investigated the code and found out the pg_isdigit(int ch),
> pg_isalpha(int ch) and the associated functions do a comparison of
> characters as ints. I changed a few crucial points with a cast to
> (unsigned char) and voila , regexs in Greek with full locale support. My
> guess is that an int != unsigned char when comparing, the sign bit is
> probably the culprit.
>
> Please test the patch on some other language too, Swedish or Finish
> would be a nice touch.
>
> Patch follows, but it is trivial really.
> ---------------------------------------------------------------------------------
> *** regcomp.c Tue Sep 1 07:31:25 1998
> --- regcomp.c.patched Wed Feb 10 19:57:11 1999
> ***************
> *** 1038,1046 ****
> {
> assert(pg_isalpha(ch));
> if (pg_isupper(ch))
> ! return tolower(ch);
> else if (pg_islower(ch))
> ! return toupper(ch);
> else
> /* peculiar, but could happen */
> return ch;
> --- 1038,1046 ----
> {
> assert(pg_isalpha(ch));
> if (pg_isupper(ch))
> ! return tolower((unsigned char)ch);
> else if (pg_islower(ch))
> ! return toupper((unsigned char)ch);
> else
> /* peculiar, but could happen */
> return ch;
> ***************
> *** 1055,1067 ****
> static void
> bothcases(p, ch)
> struct parse *p;
> ! int ch;
> {
> pg_wchar *oldnext = p->next;
> pg_wchar *oldend = p->end;
> pg_wchar bracket[3];
>
> ! assert(othercase(ch) != ch);/* p_bracket() would recurse */
> p->next = bracket;
> p->end = bracket + 2;
> bracket[0] = ch;
> --- 1055,1067 ----
> static void
> bothcases(p, ch)
> struct parse *p;
> ! int ch;
> {
> pg_wchar *oldnext = p->next;
> pg_wchar *oldend = p->end;
> pg_wchar bracket[3];
>
> ! assert(othercase(ch) != (unsigned char)ch);/* p_bracket() would recurse */
> p->next = bracket;
> p->end = bracket + 2;
> bracket[0] = ch;
> ***************
> *** 1084,1090 ****
> {
> cat_t *cap = p->g->categories;
>
> ! if ((p->g->cflags & REG_ICASE) && pg_isalpha(ch) && othercase(ch) != ch)
> bothcases(p, ch);
> else
> {
> --- 1084,1090 ----
> {
> cat_t *cap = p->g->categories;
>
> ! if ((p->g->cflags & REG_ICASE) && pg_isalpha(ch) && othercase(ch) != (unsigned char)ch)
> bothcases(p, ch);
> else
> {
> ***************
> *** 1862,1868 ****
> #ifdef MULTIBYTE
> return (c >= 0 && c <= UCHAR_MAX && isdigit(c));
> #else
> ! return (isdigit(c));
> #endif
> }
>
> --- 1862,1868 ----
> #ifdef MULTIBYTE
> return (c >= 0 && c <= UCHAR_MAX && isdigit(c));
> #else
> ! return (isdigit((unsigned char)c));
> #endif
> }
>
> ***************
> *** 1872,1878 ****
> #ifdef MULTIBYTE
> return (c >= 0 && c <= UCHAR_MAX && isalpha(c));
> #else
> ! return (isalpha(c));
> #endif
> }
>
> --- 1872,1878 ----
> #ifdef MULTIBYTE
> return (c >= 0 && c <= UCHAR_MAX && isalpha(c));
> #else
> ! return (isalpha((unsigned char)c));
> #endif
> }
>
> ***************
> *** 1882,1888 ****
> #ifdef MULTIBYTE
> return (c >= 0 && c <= UCHAR_MAX && isupper(c));
> #else
> ! return (isupper(c));
> #endif
> }
>
> --- 1882,1888 ----
> #ifdef MULTIBYTE
> return (c >= 0 && c <= UCHAR_MAX && isupper(c));
> #else
> ! return (isupper((unsigned char)c));
> #endif
> }
>
> ***************
> *** 1892,1897 ****
> #ifdef MULTIBYTE
> return (c >= 0 && c <= UCHAR_MAX && islower(c));
> #else
> ! return (islower(c));
> #endif
> }
> --- 1892,1897 ----
> #ifdef MULTIBYTE
> return (c >= 0 && c <= UCHAR_MAX && islower(c));
> #else
> ! return (islower((unsigned char)c));
> #endif
> }
>
>
--
  Bruce Momjian                        |  http://www.op.net/~candle
  maillist@candle.pha.pa.us            |  (610) 853-3000
  +  If your life is a hard drive,     |  830 Blythe Avenue
  +  Christ can be your backup.        |  Drexel Hill, Pennsylvania 19026
>Did we reject this 'unsigned' patch, folks? I seem to remember someone
>objecting to it.
[snip]
>> ***************
>> *** 1862,1868 ****
>> #ifdef MULTIBYTE
>> return (c >= 0 && c <= UCHAR_MAX && isdigit(c));
>> #else
>> ! return (isdigit(c));
>> #endif
>> }
>>
>> --- 1862,1868 ----
>> #ifdef MULTIBYTE
>> return (c >= 0 && c <= UCHAR_MAX && isdigit(c));
>> #else
>> ! return (isdigit((unsigned char)c));
>> #endif
>> }
According to the ANSI C standard, the argument to isdigit (and its
friends) must have the value of either an unsigned char or
*EOF*. That's why the argument is typed as int, I guess. This patch
seems to break the rule?
BTW, I would like to propose yet another patch for the problem. This
seems to work on FreeBSD and Linux. Angelos, can you test it on your
platform (is it BSD/OS?)?
--
Tatsuo Ishii
*** regcomp.c~ Tue Sep 1 13:31:25 1998
--- regcomp.c Thu Mar 11 16:51:28 1999
***************
*** 95,101 ****
static void p_b_eclass(struct parse * p, cset *cs);
static pg_wchar p_b_symbol(struct parse * p);
static char p_b_coll_elem(struct parse * p, int endc);
! static char othercase(int ch);
static void bothcases(struct parse * p, int ch);
static void ordinary(struct parse * p, int ch);
static void nonnewline(struct parse * p);
--- 95,101 ----
static void p_b_eclass(struct parse * p, cset *cs);
static pg_wchar p_b_symbol(struct parse * p);
static char p_b_coll_elem(struct parse * p, int endc);
! static unsigned char othercase(int ch);
static void bothcases(struct parse * p, int ch);
static void ordinary(struct parse * p, int ch);
static void nonnewline(struct parse * p);
***************
*** 1032,1049 ****
- othercase - return the case counterpart of an alphabetic
== static char othercase(int ch);
*/
! static char /* if no counterpart, return ch */
othercase(ch)
int ch;
{
assert(pg_isalpha(ch));
if (pg_isupper(ch))
! return tolower(ch);
else if (pg_islower(ch))
! return toupper(ch);
else
/* peculiar, but could happen */
! return ch;
}

/*
--- 1032,1049 ----
- othercase - return the case counterpart of an alphabetic
== static char othercase(int ch);
*/
! static unsigned char /* if no counterpart, return ch */
othercase(ch)
int ch;
{
assert(pg_isalpha(ch));
if (pg_isupper(ch))
! return (unsigned char)tolower(ch);
else if (pg_islower(ch))
! return (unsigned char)toupper(ch);
else
/* peculiar, but could happen */
! return (unsigned char)ch;
}

/*
I think we decided against this, right?
> >Did we reject this 'unsigned' patch, folks? I seem to remember someone
> >objecting to it.
> [snip]
> >> ***************
> >> *** 1862,1868 ****
> >> #ifdef MULTIBYTE
> >> return (c >= 0 && c <= UCHAR_MAX && isdigit(c));
> >> #else
> >> ! return (isdigit(c));
> >> #endif
> >> }
> >>
> >> --- 1862,1868 ----
> >> #ifdef MULTIBYTE
> >> return (c >= 0 && c <= UCHAR_MAX && isdigit(c));
> >> #else
> >> ! return (isdigit((unsigned char)c));
> >> #endif
> >> }
>
> According to the ANSI/C standard the argument to isdigit (or some
> other friends) must have the value of either an unsigned char or
> *EOF*. That's why the argument is typed to int, I guess. This patch
> seems to break the rule?
>
> BTW, I would like to propose yet another patches for the problem. This
> seems to work on FreeBSD and Linux. Angelos, can you test it on your
> platform (is it a BSD/OS?)?
> --
> Tatsuo Ishii
>
> *** regcomp.c~ Tue Sep 1 13:31:25 1998
> --- regcomp.c Thu Mar 11 16:51:28 1999
> ***************
> *** 95,101 ****
> static void p_b_eclass(struct parse * p, cset *cs);
> static pg_wchar p_b_symbol(struct parse * p);
> static char p_b_coll_elem(struct parse * p, int endc);
> ! static char othercase(int ch);
> static void bothcases(struct parse * p, int ch);
> static void ordinary(struct parse * p, int ch);
> static void nonnewline(struct parse * p);
> --- 95,101 ----
> static void p_b_eclass(struct parse * p, cset *cs);
> static pg_wchar p_b_symbol(struct parse * p);
> static char p_b_coll_elem(struct parse * p, int endc);
> ! static unsigned char othercase(int ch);
> static void bothcases(struct parse * p, int ch);
> static void ordinary(struct parse * p, int ch);
> static void nonnewline(struct parse * p);
> ***************
> *** 1032,1049 ****
> - othercase - return the case counterpart of an alphabetic
> == static char othercase(int ch);
> */
> ! static char /* if no counterpart, return ch */
> othercase(ch)
> int ch;
> {
> assert(pg_isalpha(ch));
> if (pg_isupper(ch))
> ! return tolower(ch);
> else if (pg_islower(ch))
> ! return toupper(ch);
> else
> /* peculiar, but could happen */
> ! return ch;
> }
>
> /*
> --- 1032,1049 ----
> - othercase - return the case counterpart of an alphabetic
> == static char othercase(int ch);
> */
> ! static unsigned char /* if no counterpart, return ch */
> othercase(ch)
> int ch;
> {
> assert(pg_isalpha(ch));
> if (pg_isupper(ch))
> ! return (unsigned char)tolower(ch);
> else if (pg_islower(ch))
> ! return (unsigned char)toupper(ch);
> else
> /* peculiar, but could happen */
> ! return (unsigned char)ch;
> }
>
> /*
>
--
  Bruce Momjian                        |  http://www.op.net/~candle
  maillist@candle.pha.pa.us            |  (610) 853-3000
  +  If your life is a hard drive,     |  830 Blythe Avenue
  +  Christ can be your backup.        |  Drexel Hill, Pennsylvania 19026