Re: [v9.2] make_greater_string() does not return a string in some cases
От | Kyotaro HORIGUCHI |
---|---|
Тема | Re: [v9.2] make_greater_string() does not return a string in some cases |
Дата | |
Msg-id | 20111007.132246.187325868.horiguchi.kyotaro@oss.ntt.co.jp обсуждение исходный текст |
Ответ на | Re: [v9.2] make_greater_string() does not return a string in some cases (Robert Haas <robertmhaas@gmail.com>) |
Ответы |
Re: [v9.2] make_greater_string() does not return a string
in some cases
(Robert Haas <robertmhaas@gmail.com>)
|
Список | pgsql-hackers |
Thank you for reviewing. The new version of this patch is attached to this message. > > But it seems to me that if the datatype is BYTEAOID then > > there's no need to restore anything at all, because we're not > > going to call pg_mbcliplen() in that case anyway. So I think > > the logic here can be simplified. I agree with you. The original code does not restore the changed byte. I removed the lastchar preservation from make_greater_string(). > > Also, you haven't completely fixed the style issues. Function > > definitions should look like this: .. > > Opening curly braces should be on a line by themselves, not at the end > > of the preceding if, while, etc. line. > > > > "finnaly" is spelled incorrectly. I'm very sorry for leaving a lot of mixed, different styles. I think I have now corrected the style of the function definitions and braces. The misspelled word was removed along with the whole sentence, because the re-checking just before return has been removed. > Oh, and there's this: > > wchar.c: In function ‘pg_utf8_increment’: > wchar.c:1376: warning: unused variable ‘success’ > wchar.c: In function ‘pg_eucjp_increment’: > wchar.c:1433: warning: unused variable ‘success’ Oops... I have rebased the patch and removed all warnings. make check passed with everything OK. I confirmed that the planner decides to use an index with proper boundaries for LIKE expressions with certain characters at problematic code points, with both the UTF-8 and EUC-JP database encodings, and with the database locale set to C as well as to ja_JP.UTF8. I also confirmed this for bytea values ending with 0xff, 0x00, and 0x01. This is the third version of the patch. 
Regards, -- Kyotaro Horiguchi NTT Open Source Software Center diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 8ceea82..59f8c37 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -5664,6 +5664,19 @@ pattern_selectivity(Const *patt, Pattern_Type ptype)/* + * This function is "character increment" function for bytea used in + * make_greater_string() that has same interface with pg_wchar_tbl.charinc. + */ +static bool +byte_increment(unsigned char *ptr, int len) +{ + if (*ptr >= 255) return false; + + (*ptr)++; + return true; +} + +/* * Try to generate a string greater than the given string or any * string it is a prefix of. If successful, return apalloc'd string * in the form of a Const node; else return NULL. @@ -5702,6 +5715,7 @@ make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation) int len; Datum cmpstr; text *cmptxt = NULL; + character_incrementer charincfunc; /* * Get a modifiable copy of the prefix string in C-string format, and set @@ -5763,29 +5777,33 @@ make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation) } } + if (datatype == BYTEAOID) + charincfunc = &byte_increment; + else + charincfunc = pg_database_encoding_character_incrementer(); + while (len > 0) { - unsigned char *lastchar = (unsigned char *) (workstr + len - 1); - unsigned char savelastchar = *lastchar; + int charlen = 1; + unsigned char *lastchar; + Const *workstr_const; + + if (datatype != BYTEAOID) + charlen = len - pg_mbcliplen(workstr, len, len - 1); + + lastchar = (unsigned char *) (workstr + len - charlen); /* - * Try to generate a larger string by incrementing the last byte. + * Try to generate a larger string by incrementing the last byte or + * character. 
*/ - while (*lastchar < (unsigned char) 255) - { - Const *workstr_const; - - (*lastchar)++; - if (datatype != BYTEAOID) - { - /* do not generate invalid encoding sequences */ - if (!pg_verifymbstr(workstr, len, true)) - continue; - workstr_const = string_to_const(workstr, datatype); - } - else + if (charincfunc(lastchar, charlen)) + { + if (datatype == BYTEAOID) workstr_const = string_to_bytea_const(workstr, len); + else + workstr_const = string_to_const(workstr, datatype); if (DatumGetBool(FunctionCall2Coll(ltproc, collation, @@ -5804,20 +5822,10 @@ make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation) pfree(workstr_const); } - /* restore last byte so we don't confuse pg_mbcliplen */ - *lastchar = savelastchar; - /* - * Truncate off the last character, which might be more than 1 byte, - * depending on the character encoding. + * Truncate off the last character or byte. */ - if (datatype != BYTEAOID && pg_database_encoding_max_length() > 1) - len = pg_mbcliplen(workstr, len, len - 1); - else - len -= 1; - - if (datatype != BYTEAOID) - workstr[len] = '\0'; + len -= charlen; } /* Failed... */ diff --git a/src/backend/utils/mb/wchar.c b/src/backend/utils/mb/wchar.c index f23732f..e8a1bc8 100644 --- a/src/backend/utils/mb/wchar.c +++ b/src/backend/utils/mb/wchar.c @@ -1336,6 +1336,194 @@ pg_utf8_islegal(const unsigned char *source, int length)/* *------------------------------------------------------------------- + * character incrementer + * + * These functions accept "charptr", a pointer to the first byte of a + * maybe-multibyte character. Try `increment' the character and return + * true if successed. If these functions returns false, the character + * should be untouched. These functions must be implemented in + * correspondence with verifiers, in other words, the rewrited + * character by this function must pass the check by pg_*_verifier() + * if returns true. 
+ * ------------------------------------------------------------------- + */ + +#ifndef FRONTEND +static bool +pg_generic_charinc(unsigned char *charptr, int len) +{ + unsigned char *lastchar = (unsigned char *) (charptr + len - 1); + unsigned char savelastchar = *lastchar; + const char *const_charptr = (const char *)charptr; + + while (*lastchar < (unsigned char) 255) + { + (*lastchar)++; + if (!pg_verifymbstr(const_charptr, len, true)) + continue; + return true; + } + + *lastchar = savelastchar; + return false; +} + +static bool +pg_utf8_increment(unsigned char *charptr, int length) +{ + unsigned char a; + unsigned char bak[4]; + + switch (length) + { + default: + /* reject lengths 5 and 6 for now */ + return false; + case 4: + bak[3] = charptr[3]; + a = charptr[3]; + if (a < 0xBF) + { + charptr[3]++; + break; + } + charptr[3] = 0x80; + /* FALL THRU */ + case 3: + bak[2] = charptr[2]; + a = charptr[2]; + if (a < 0xBF) + { + charptr[2]++; + break; + } + charptr[2] = 0x80; + /* FALL THRU */ + case 2: + bak[1] = charptr[1]; + a = charptr[1]; + if ((*charptr == 0xed && a < 0x9F) || a < 0xBF) + { + charptr[1]++; + break; + } + charptr[1] = 0x80; + /* FALL THRU */ + case 1: + bak[0] = *charptr; + a = *charptr; + if (a == 0x7F || a == 0xDF || a == 0xEF || a == 0xF7) + { + /* Rewinding modified bytes and return fail. length is + * confirmed to be between 1 and 4 here. 
*/ + memcpy(charptr, bak, length); + return false; + } + charptr[0]++; + break; + } + + return true; +} + +static bool +pg_eucjp_increment(unsigned char *charptr, int length) +{ + unsigned char bak[3]; + unsigned char c1, c2; + signed int i; + + c1 = *charptr; + + switch (c1) + { + case SS2: /* JIS X 0201 */ + if (length != 2) return false; + + c2 = charptr[1]; + + if (c2 > 0xde) + charptr[0] = charptr[1] = 0xa1; + else if (c2 < 0xa1) + charptr[1] = 0xa1; + else + charptr[1]++; + + break; + + case SS3: /* JIS X 0212 */ + if (length != 3) return false; + + for (i = 2 ; i > 0 ; i--) + { + bak[i] = charptr[i]; + c2 = charptr[i]; + if (c2 < 0xa1) + { + charptr[i] = 0xa1; + return true; + } + else if (c2 < 0xfe) + { + charptr[i]++; + break; + } + charptr[i] = 0xa1; + } + + + if (i == 0) /* Out of 3-byte code region */ + { + charptr[1] = bak[1]; + charptr[2] = bak[2]; + return false; + } + + break; + + default: + if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */ + { + if (length != 2) return false; + + for (i = 1 ; i >= 0 ; i--) /* i must be signed */ + { + bak[i] = charptr[i]; + c2 = charptr[i]; + if (c2 < 0xa1) + { + charptr[i] = 0xa1; + return true; + } + else if (c2 < 0xfe) + { + charptr[i]++; + break; + } + charptr[i] = 0xa1; + } + + if (i < 0) /* Out of 2 byte code region */ + { + charptr[0] = bak[0]; + charptr[1] = bak[1]; + return false; + } + } + else + { /* ASCII, single byte */ + if (c1 > 0x7e) + return false; + (*charptr)++; + } + } + + return true; +} +#endif + +/* + *------------------------------------------------------------------- * encoding info table * XXX must be sorted by thesame order as enum pg_enc (in mb/pg_wchar.h) *------------------------------------------------------------------- @@ -1459,6 +1647,25 @@ pg_database_encoding_max_length(void)}/* + * give the character incrementer for the encoding for the current database + */ +character_incrementer +pg_database_encoding_character_incrementer(void) +{ + switch (GetDatabaseEncoding()) + { + case PG_UTF8: + 
return pg_utf8_increment; + + case PG_EUC_JP: + return pg_eucjp_increment; + + default: + return pg_generic_charinc; + } +} + +/* * Verify mbstr to make sure that it is validly encoded in the current * database encoding. Otherwise same as pg_verify_mbstr().*/ diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h index 826c7af..728175c 100644 --- a/src/include/mb/pg_wchar.h +++ b/src/include/mb/pg_wchar.h @@ -284,6 +284,8 @@ typedef int (*mblen_converter) (const unsigned char *mbstr);typedef int (*mbdisplaylen_converter) (constunsigned char *mbstr); +typedef bool (*character_incrementer) (unsigned char *mbstr, int len); +typedef int (*mbverifier) (const unsigned char *mbstr, int len);typedef struct @@ -389,6 +391,7 @@ extern int pg_encoding_mbcliplen(int encoding, const char *mbstr,extern int pg_mbcharcliplen(constchar *mbstr, int len, int imit);extern int pg_encoding_max_length(int encoding);extern int pg_database_encoding_max_length(void); +extern character_incrementer pg_database_encoding_character_incrementer(void);extern int PrepareClientEncoding(int encoding);externint SetClientEncoding(int encoding);
В списке pgsql-hackers по дате отправления: