Обсуждение: ...
I am using postgres 6.4.2 on BSD/OS 3.1 with a Greek locale that I have developed. I knew that regexes with postgres would not work because of something I did, but a posting from another fellow from Sweden gave me a clue that the problem must be with the regex package and not the locale. So I investigated the code and found out that pg_isdigit(int ch), pg_isalpha(int ch) and the associated functions do a comparison of characters as ints. I changed a few crucial points with a cast to (unsigned char) and voilà, regexes in Greek with full locale support. My guess is that an int != unsigned char when comparing, the sign bit is probably the culprit. Please test the patch on some other language too, Swedish or Finnish would be a nice touch. Patch follows, but it is trivial really. --------------------------------------------------------------------------------- *** regcomp.c Tue Sep 1 07:31:25 1998 --- regcomp.c.patched Wed Feb 10 19:57:11 1999 *************** *** 1038,1046 **** { assert(pg_isalpha(ch)); if (pg_isupper(ch)) ! return tolower(ch); else if (pg_islower(ch)) ! return toupper(ch); else /* peculiar, but could happen */ return ch; --- 1038,1046 ---- { assert(pg_isalpha(ch)); if (pg_isupper(ch)) ! return tolower((unsigned char)ch); else if (pg_islower(ch)) ! return toupper((unsigned char)ch); else /* peculiar, but could happen */ return ch; *************** *** 1055,1067 **** static void bothcases(p, ch) struct parse *p; ! int ch; { pg_wchar *oldnext = p->next; pg_wchar *oldend = p->end; pg_wchar bracket[3]; ! assert(othercase(ch) != ch);/* p_bracket() would recurse */ p->next = bracket; p->end = bracket + 2; bracket[0] = ch; --- 1055,1067 ---- static void bothcases(p, ch) struct parse *p; ! int ch; { pg_wchar *oldnext = p->next; pg_wchar *oldend = p->end; pg_wchar bracket[3]; ! 
assert(othercase(ch) != (unsigned char)ch);/* p_bracket() would recurse */ p->next = bracket; p->end = bracket + 2; bracket[0] = ch; *************** *** 1084,1090 **** { cat_t *cap = p->g->categories; ! if ((p->g->cflags & REG_ICASE) && pg_isalpha(ch) && othercase(ch) != ch) bothcases(p, ch); else { --- 1084,1090 ---- { cat_t *cap = p->g->categories; ! if ((p->g->cflags & REG_ICASE) && pg_isalpha(ch) && othercase(ch) != (unsigned char)ch) bothcases(p, ch); else { *************** *** 1862,1868 **** #ifdef MULTIBYTE return (c >= 0 && c <= UCHAR_MAX && isdigit(c)); #else ! return (isdigit(c)); #endif } --- 1862,1868 ---- #ifdef MULTIBYTE return (c >= 0 && c <= UCHAR_MAX && isdigit(c)); #else ! return (isdigit((unsigned char)c)); #endif } *************** *** 1872,1878 **** #ifdef MULTIBYTE return (c >= 0 && c <= UCHAR_MAX && isalpha(c)); #else ! return (isalpha(c)); #endif } --- 1872,1878 ---- #ifdef MULTIBYTE return (c >= 0 && c <= UCHAR_MAX && isalpha(c)); #else ! return (isalpha((unsigned char)c)); #endif } *************** *** 1882,1888 **** #ifdef MULTIBYTE return (c >= 0 && c <= UCHAR_MAX && isupper(c)); #else ! return (isupper(c)); #endif } --- 1882,1888 ---- #ifdef MULTIBYTE return (c >= 0 && c <= UCHAR_MAX && isupper(c)); #else ! return (isupper((unsigned char)c)); #endif } *************** *** 1892,1897 **** #ifdef MULTIBYTE return (c >= 0 && c <= UCHAR_MAX && islower(c)); #else ! return (islower(c)); #endif } --- 1892,1897 ---- #ifdef MULTIBYTE return (c >= 0 && c <= UCHAR_MAX && islower(c)); #else ! return (islower((unsigned char)c)); #endif }
Hello! Next time you send a patch, could you use the tools in .../src/tools/make_diff? I've applied the patch to 6.4.2 on Debian 2.0 and ran the locale test on the koi8-r locale. The locale test passed before the patch and passed after the patch as well. I didn't notice any difference. What difference did you expect? Please supply data for the locale test (look into .../src/test/locale). This is not related to your patch, we're just collecting test data. On Wed, 10 Feb 1999, Angelos Karageorgiou wrote: > I am using postgres 6.4.2 on BSD/OS 3.1 with a Greek locale that I > have developed. I knew that regexes with postgress would not work because > of something I did but a posting from another follow from Sweden gave me a > clue that the problem must be with the regex package and not the locale. > > So I investigated the code and found out the pg_isdigit(int ch), > pg_isalpha(int ch) and the associated functions do a comparison of > characters as ints. I changed a few crucial points with a cast to > (unsigned char) and voila , regexs in Greek with full locale support. My > guess is that an int != unsigned char when comparing, the sign bit is > probably the culprit. > > Please test the patch on some other language too, Swedish or Finish > would be a nice touch. > > Patch follows, but it is trivial really. > --------------------------------------------------------------------------------- > *** regcomp.c Tue Sep 1 07:31:25 1998 > --- regcomp.c.patched Wed Feb 10 19:57:11 1999 > *************** > *** 1038,1046 **** > { > assert(pg_isalpha(ch)); > if (pg_isupper(ch)) > ! return tolower(ch); > else if (pg_islower(ch)) > ! return toupper(ch); > else > /* peculiar, but could happen */ > return ch; > --- 1038,1046 ---- > { > assert(pg_isalpha(ch)); > if (pg_isupper(ch)) > ! return tolower((unsigned char)ch); > else if (pg_islower(ch)) > ! 
return toupper((unsigned char)ch); > else > /* peculiar, but could happen */ > return ch; > *************** > *** 1055,1067 **** > static void > bothcases(p, ch) > struct parse *p; > ! int ch; > { > pg_wchar *oldnext = p->next; > pg_wchar *oldend = p->end; > pg_wchar bracket[3]; > > ! assert(othercase(ch) != ch);/* p_bracket() would recurse */ > p->next = bracket; > p->end = bracket + 2; > bracket[0] = ch; > --- 1055,1067 ---- > static void > bothcases(p, ch) > struct parse *p; > ! int ch; > { > pg_wchar *oldnext = p->next; > pg_wchar *oldend = p->end; > pg_wchar bracket[3]; > > ! assert(othercase(ch) != (unsigned char)ch);/* p_bracket() would recurse */ > p->next = bracket; > p->end = bracket + 2; > bracket[0] = ch; > *************** > *** 1084,1090 **** > { > cat_t *cap = p->g->categories; > > ! if ((p->g->cflags & REG_ICASE) && pg_isalpha(ch) && othercase(ch) != ch) > bothcases(p, ch); > else > { > --- 1084,1090 ---- > { > cat_t *cap = p->g->categories; > > ! if ((p->g->cflags & REG_ICASE) && pg_isalpha(ch) && othercase(ch) != (unsigned char)ch) > bothcases(p, ch); > else > { > *************** > *** 1862,1868 **** > #ifdef MULTIBYTE > return (c >= 0 && c <= UCHAR_MAX && isdigit(c)); > #else > ! return (isdigit(c)); > #endif > } > > --- 1862,1868 ---- > #ifdef MULTIBYTE > return (c >= 0 && c <= UCHAR_MAX && isdigit(c)); > #else > ! return (isdigit((unsigned char)c)); > #endif > } > > *************** > *** 1872,1878 **** > #ifdef MULTIBYTE > return (c >= 0 && c <= UCHAR_MAX && isalpha(c)); > #else > ! return (isalpha(c)); > #endif > } > > --- 1872,1878 ---- > #ifdef MULTIBYTE > return (c >= 0 && c <= UCHAR_MAX && isalpha(c)); > #else > ! return (isalpha((unsigned char)c)); > #endif > } > > *************** > *** 1882,1888 **** > #ifdef MULTIBYTE > return (c >= 0 && c <= UCHAR_MAX && isupper(c)); > #else > ! return (isupper(c)); > #endif > } > > --- 1882,1888 ---- > #ifdef MULTIBYTE > return (c >= 0 && c <= UCHAR_MAX && isupper(c)); > #else > ! 
return (isupper((unsigned char)c)); > #endif > } > > *************** > *** 1892,1897 **** > #ifdef MULTIBYTE > return (c >= 0 && c <= UCHAR_MAX && islower(c)); > #else > ! return (islower(c)); > #endif > } > --- 1892,1897 ---- > #ifdef MULTIBYTE > return (c >= 0 && c <= UCHAR_MAX && islower(c)); > #else > ! return (islower((unsigned char)c)); > #endif > } > Oleg. ---- Oleg Broytmann http://members.xoom.com/phd2/ phd2@earthling.net Programmers don't die, they just GOSUB without RETURN.
Did we reject this 'unsigned' patch, folks? I seem to remember someone objecting to it. > I am using postgres 6.4.2 on BSD/OS 3.1 with a Greek locale that I > have developed. I knew that regexes with postgress would not work because > of something I did but a posting from another follow from Sweden gave me a > clue that the problem must be with the regex package and not the locale. > > So I investigated the code and found out the pg_isdigit(int ch), > pg_isalpha(int ch) and the associated functions do a comparison of > characters as ints. I changed a few crucial points with a cast to > (unsigned char) and voila , regexs in Greek with full locale support. My > guess is that an int != unsigned char when comparing, the sign bit is > probably the culprit. > > Please test the patch on some other language too, Swedish or Finish > would be a nice touch. > > Patch follows, but it is trivial really. > --------------------------------------------------------------------------------- > *** regcomp.c Tue Sep 1 07:31:25 1998 > --- regcomp.c.patched Wed Feb 10 19:57:11 1999 > *************** > *** 1038,1046 **** > { > assert(pg_isalpha(ch)); > if (pg_isupper(ch)) > ! return tolower(ch); > else if (pg_islower(ch)) > ! return toupper(ch); > else > /* peculiar, but could happen */ > return ch; > --- 1038,1046 ---- > { > assert(pg_isalpha(ch)); > if (pg_isupper(ch)) > ! return tolower((unsigned char)ch); > else if (pg_islower(ch)) > ! return toupper((unsigned char)ch); > else > /* peculiar, but could happen */ > return ch; > *************** > *** 1055,1067 **** > static void > bothcases(p, ch) > struct parse *p; > ! int ch; > { > pg_wchar *oldnext = p->next; > pg_wchar *oldend = p->end; > pg_wchar bracket[3]; > > ! assert(othercase(ch) != ch);/* p_bracket() would recurse */ > p->next = bracket; > p->end = bracket + 2; > bracket[0] = ch; > --- 1055,1067 ---- > static void > bothcases(p, ch) > struct parse *p; > ! 
int ch; > { > pg_wchar *oldnext = p->next; > pg_wchar *oldend = p->end; > pg_wchar bracket[3]; > > ! assert(othercase(ch) != (unsigned char)ch);/* p_bracket() would recurse */ > p->next = bracket; > p->end = bracket + 2; > bracket[0] = ch; > *************** > *** 1084,1090 **** > { > cat_t *cap = p->g->categories; > > ! if ((p->g->cflags & REG_ICASE) && pg_isalpha(ch) && othercase(ch) != ch) > bothcases(p, ch); > else > { > --- 1084,1090 ---- > { > cat_t *cap = p->g->categories; > > ! if ((p->g->cflags & REG_ICASE) && pg_isalpha(ch) && othercase(ch) != (unsigned char)ch) > bothcases(p, ch); > else > { > *************** > *** 1862,1868 **** > #ifdef MULTIBYTE > return (c >= 0 && c <= UCHAR_MAX && isdigit(c)); > #else > ! return (isdigit(c)); > #endif > } > > --- 1862,1868 ---- > #ifdef MULTIBYTE > return (c >= 0 && c <= UCHAR_MAX && isdigit(c)); > #else > ! return (isdigit((unsigned char)c)); > #endif > } > > *************** > *** 1872,1878 **** > #ifdef MULTIBYTE > return (c >= 0 && c <= UCHAR_MAX && isalpha(c)); > #else > ! return (isalpha(c)); > #endif > } > > --- 1872,1878 ---- > #ifdef MULTIBYTE > return (c >= 0 && c <= UCHAR_MAX && isalpha(c)); > #else > ! return (isalpha((unsigned char)c)); > #endif > } > > *************** > *** 1882,1888 **** > #ifdef MULTIBYTE > return (c >= 0 && c <= UCHAR_MAX && isupper(c)); > #else > ! return (isupper(c)); > #endif > } > > --- 1882,1888 ---- > #ifdef MULTIBYTE > return (c >= 0 && c <= UCHAR_MAX && isupper(c)); > #else > ! return (isupper((unsigned char)c)); > #endif > } > > *************** > *** 1892,1897 **** > #ifdef MULTIBYTE > return (c >= 0 && c <= UCHAR_MAX && islower(c)); > #else > ! return (islower(c)); > #endif > } > --- 1892,1897 ---- > #ifdef MULTIBYTE > return (c >= 0 && c <= UCHAR_MAX && islower(c)); > #else > ! 
return (islower((unsigned char)c)); > #endif > } > > -- Bruce Momjian | http://www.op.net/~candle maillist@candle.pha.pa.us | (610) 853-3000+ If your life is a hard drive, | 830 Blythe Avenue + Christ can be your backup. | Drexel Hill, Pennsylvania 19026
>Did we reject this 'unsigned' patch, folks? I seem to remember someone >objecting to it. [snip] >> *************** >> *** 1862,1868 **** >> #ifdef MULTIBYTE >> return (c >= 0 && c <= UCHAR_MAX && isdigit(c)); >> #else >> ! return (isdigit(c)); >> #endif >> } >> >> --- 1862,1868 ---- >> #ifdef MULTIBYTE >> return (c >= 0 && c <= UCHAR_MAX && isdigit(c)); >> #else >> ! return (isdigit((unsigned char)c)); >> #endif >> } According to the ANSI/C standard the argument to isdigit (or some other friends) must have the value of either an unsigned char or *EOF*. That's why the argument is typed to int, I guess. This patch seems to break the rule? BTW, I would like to propose yet another patch for the problem. This seems to work on FreeBSD and Linux. Angelos, can you test it on your platform (is it a BSD/OS?)? -- Tatsuo Ishii *** regcomp.c~ Tue Sep 1 13:31:25 1998 --- regcomp.c Thu Mar 11 16:51:28 1999 *************** *** 95,101 **** static void p_b_eclass(struct parse * p, cset *cs); static pg_wchar p_b_symbol(struct parse * p); static char p_b_coll_elem(struct parse * p, int endc); ! static char othercase(int ch); static void bothcases(struct parse * p, int ch); static void ordinary(struct parse * p, int ch); static void nonnewline(struct parse * p); --- 95,101 ---- static void p_b_eclass(struct parse * p, cset *cs); static pg_wchar p_b_symbol(struct parse * p); static char p_b_coll_elem(struct parse * p, int endc); ! static unsigned char othercase(int ch); static void bothcases(struct parse * p, int ch); static void ordinary(struct parse * p, int ch); static void nonnewline(struct parse * p); *************** *** 1032,1049 **** - othercase - return the case counterpart of an alphabetic == static char othercase(int ch); */ ! static char /* if no counterpart, return ch */ othercase(ch) int ch; { assert(pg_isalpha(ch)); if (pg_isupper(ch)) ! return tolower(ch); else if (pg_islower(ch)) ! return toupper(ch); else /* peculiar, but could happen */ ! 
return ch; } /* --- 1032,1049 ---- - othercase - return the case counterpart of an alphabetic == static char othercase(int ch); */ ! static unsigned char /* if no counterpart, return ch */ othercase(ch) int ch; { assert(pg_isalpha(ch)); if (pg_isupper(ch)) ! return (unsigned char)tolower(ch); else if (pg_islower(ch)) ! return (unsigned char)toupper(ch); else /* peculiar, but could happen */ ! return (unsigned char)ch; } /*
I think we decided against this, right? > >Did we reject this 'unsigned' patch, folks? I seem to remember someone > >objecting to it. > [snip] > >> *************** > >> *** 1862,1868 **** > >> #ifdef MULTIBYTE > >> return (c >= 0 && c <= UCHAR_MAX && isdigit(c)); > >> #else > >> ! return (isdigit(c)); > >> #endif > >> } > >> > >> --- 1862,1868 ---- > >> #ifdef MULTIBYTE > >> return (c >= 0 && c <= UCHAR_MAX && isdigit(c)); > >> #else > >> ! return (isdigit((unsigned char)c)); > >> #endif > >> } > > According to the ANSI/C standard the argument to isdigit (or some > other friends) must have the value of either an unsigned char or > *EOF*. That's why the argument is typed to int, I guess. This patch > seems to break the rule? > > BTW, I would like to propose yet another patches for the problem. This > seems to work on FreeBSD and Linux. Angelos, can you test it on your > platform (is it a BSD/OS?)? > -- > Tatsuo Ishii > > *** regcomp.c~ Tue Sep 1 13:31:25 1998 > --- regcomp.c Thu Mar 11 16:51:28 1999 > *************** > *** 95,101 **** > static void p_b_eclass(struct parse * p, cset *cs); > static pg_wchar p_b_symbol(struct parse * p); > static char p_b_coll_elem(struct parse * p, int endc); > ! static char othercase(int ch); > static void bothcases(struct parse * p, int ch); > static void ordinary(struct parse * p, int ch); > static void nonnewline(struct parse * p); > --- 95,101 ---- > static void p_b_eclass(struct parse * p, cset *cs); > static pg_wchar p_b_symbol(struct parse * p); > static char p_b_coll_elem(struct parse * p, int endc); > ! static unsigned char othercase(int ch); > static void bothcases(struct parse * p, int ch); > static void ordinary(struct parse * p, int ch); > static void nonnewline(struct parse * p); > *************** > *** 1032,1049 **** > - othercase - return the case counterpart of an alphabetic > == static char othercase(int ch); > */ > ! 
static char /* if no counterpart, return ch */ > othercase(ch) > int ch; > { > assert(pg_isalpha(ch)); > if (pg_isupper(ch)) > ! return tolower(ch); > else if (pg_islower(ch)) > ! return toupper(ch); > else > /* peculiar, but could happen */ > ! return ch; > } > > /* > --- 1032,1049 ---- > - othercase - return the case counterpart of an alphabetic > == static char othercase(int ch); > */ > ! static unsigned char /* if no counterpart, return ch */ > othercase(ch) > int ch; > { > assert(pg_isalpha(ch)); > if (pg_isupper(ch)) > ! return (unsigned char)tolower(ch); > else if (pg_islower(ch)) > ! return (unsigned char)toupper(ch); > else > /* peculiar, but could happen */ > ! return (unsigned char)ch; > } > > /* > -- Bruce Momjian | http://www.op.net/~candle maillist@candle.pha.pa.us | (610) 853-3000+ If your life is a hard drive, | 830 Blythe Avenue + Christ can be your backup. | Drexel Hill, Pennsylvania 19026