diff options
| author | Peter Eisentraut <peter_e@gmx.net> | 2017-03-23 15:25:34 -0400 |
|---|---|---|
| committer | Peter Eisentraut <peter_e@gmx.net> | 2017-03-23 15:28:48 -0400 |
| commit | eccfef81e1f73ee41f1d8bfe4fa4e80576945048 (patch) | |
| tree | 52bd1b2468bcf9682b356cf5b5f6199ae9d80ee4 /src/backend/utils | |
| parent | ea42cc18c35381f639d45628d792e790ff39e271 (diff) | |
| download | postgresql-eccfef81e1f73ee41f1d8bfe4fa4e80576945048.tar.gz | |
ICU support
Add a column collprovider to pg_collation that determines which library
provides the collation data. The existing choices are default and libc,
and this adds an icu choice, which uses the ICU4C library.
The pg_locale_t type is changed to point to a struct that records the
provider and contains a union of the provider-specific locale handles.
Users of locale information are changed to look into that struct for the
appropriate handle to use.
Also add a collversion column that records the version of the collation
when it is created, and check at run time whether it is still the same.
This detects potentially incompatible library upgrades that can corrupt
indexes and other structures. This is currently only supported by
ICU-provided collations.
initdb initializes the default collation set as before from the
`locale -a` output, but also adds all available ICU locales with a
"-x-icu" suffix appended.
Currently, ICU-provided collations can only be explicitly named
collations. The global database locales are still always libc-provided.
ICU support is enabled by configure --with-icu.
Reviewed-by: Thomas Munro <thomas.munro@enterprisedb.com>
Reviewed-by: Andreas Karlsson <andreas@proxel.se>
Diffstat (limited to 'src/backend/utils')
| -rw-r--r-- | src/backend/utils/adt/formatting.c | 453 | ||||
| -rw-r--r-- | src/backend/utils/adt/like.c | 53 | ||||
| -rw-r--r-- | src/backend/utils/adt/pg_locale.c | 266 | ||||
| -rw-r--r-- | src/backend/utils/adt/selfuncs.c | 8 | ||||
| -rw-r--r-- | src/backend/utils/adt/varlena.c | 179 | ||||
| -rw-r--r-- | src/backend/utils/mb/encnames.c | 76 |
6 files changed, 725 insertions, 310 deletions
diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c index c16bfbca93..0566abd314 100644 --- a/src/backend/utils/adt/formatting.c +++ b/src/backend/utils/adt/formatting.c @@ -82,6 +82,10 @@ #include <wctype.h> #endif +#ifdef USE_ICU +#include <unicode/ustring.h> +#endif + #include "catalog/pg_collation.h" #include "mb/pg_wchar.h" #include "utils/builtins.h" @@ -1443,6 +1447,42 @@ str_numth(char *dest, char *num, int type) * upper/lower/initcap functions *****************************************************************************/ +#ifdef USE_ICU +static int32_t +icu_convert_case(int32_t (*func)(UChar *, int32_t, const UChar *, int32_t, const char *, UErrorCode *), + pg_locale_t mylocale, UChar **buff_dest, UChar *buff_source, int32_t len_source) +{ + UErrorCode status; + int32_t len_dest; + + len_dest = len_source; /* try first with same length */ + *buff_dest = palloc(len_dest * sizeof(**buff_dest)); + status = U_ZERO_ERROR; + len_dest = func(*buff_dest, len_dest, buff_source, len_source, mylocale->info.icu.locale, &status); + if (status == U_BUFFER_OVERFLOW_ERROR) + { + /* try again with adjusted length */ + pfree(buff_dest); + buff_dest = palloc(len_dest * sizeof(**buff_dest)); + status = U_ZERO_ERROR; + len_dest = func(*buff_dest, len_dest, buff_source, len_source, mylocale->info.icu.locale, &status); + } + if (U_FAILURE(status)) + ereport(ERROR, + (errmsg("case conversion failed: %s", u_errorName(status)))); + return len_dest; +} + +static int32_t +u_strToTitle_default_BI(UChar *dest, int32_t destCapacity, + const UChar *src, int32_t srcLength, + const char *locale, + UErrorCode *pErrorCode) +{ + return u_strToTitle(dest, destCapacity, src, srcLength, NULL, locale, pErrorCode); +} +#endif + /* * If the system provides the needed functions for wide-character manipulation * (which are all standardized by C99), then we implement upper/lower/initcap @@ -1479,12 +1519,9 @@ str_tolower(const char *buff, size_t nbytes, Oid collid) 
result = asc_tolower(buff, nbytes); } #ifdef USE_WIDE_UPPER_LOWER - else if (pg_database_encoding_max_length() > 1) + else { pg_locale_t mylocale = 0; - wchar_t *workspace; - size_t curr_char; - size_t result_size; if (collid != DEFAULT_COLLATION_OID) { @@ -1502,77 +1539,79 @@ str_tolower(const char *buff, size_t nbytes, Oid collid) mylocale = pg_newlocale_from_collation(collid); } - /* Overflow paranoia */ - if ((nbytes + 1) > (INT_MAX / sizeof(wchar_t))) - ereport(ERROR, - (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of memory"))); +#ifdef USE_ICU + if (mylocale && mylocale->provider == COLLPROVIDER_ICU) + { + int32_t len_uchar; + int32_t len_conv; + UChar *buff_uchar; + UChar *buff_conv; + + len_uchar = icu_to_uchar(&buff_uchar, buff, nbytes); + len_conv = icu_convert_case(u_strToLower, mylocale, &buff_conv, buff_uchar, len_uchar); + icu_from_uchar(&result, buff_conv, len_conv); + } + else +#endif + { + if (pg_database_encoding_max_length() > 1) + { + wchar_t *workspace; + size_t curr_char; + size_t result_size; - /* Output workspace cannot have more codes than input bytes */ - workspace = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t)); + /* Overflow paranoia */ + if ((nbytes + 1) > (INT_MAX / sizeof(wchar_t))) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); - char2wchar(workspace, nbytes + 1, buff, nbytes, mylocale); + /* Output workspace cannot have more codes than input bytes */ + workspace = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t)); - for (curr_char = 0; workspace[curr_char] != 0; curr_char++) - { + char2wchar(workspace, nbytes + 1, buff, nbytes, mylocale); + + for (curr_char = 0; workspace[curr_char] != 0; curr_char++) + { #ifdef HAVE_LOCALE_T - if (mylocale) - workspace[curr_char] = towlower_l(workspace[curr_char], mylocale); - else + if (mylocale) + workspace[curr_char] = towlower_l(workspace[curr_char], mylocale->info.lt); + else #endif - workspace[curr_char] = towlower(workspace[curr_char]); - } + 
workspace[curr_char] = towlower(workspace[curr_char]); + } - /* Make result large enough; case change might change number of bytes */ - result_size = curr_char * pg_database_encoding_max_length() + 1; - result = palloc(result_size); + /* Make result large enough; case change might change number of bytes */ + result_size = curr_char * pg_database_encoding_max_length() + 1; + result = palloc(result_size); - wchar2char(result, workspace, result_size, mylocale); - pfree(workspace); - } + wchar2char(result, workspace, result_size, mylocale); + pfree(workspace); + } #endif /* USE_WIDE_UPPER_LOWER */ - else - { -#ifdef HAVE_LOCALE_T - pg_locale_t mylocale = 0; -#endif - char *p; - - if (collid != DEFAULT_COLLATION_OID) - { - if (!OidIsValid(collid)) + else { - /* - * This typically means that the parser could not resolve a - * conflict of implicit collations, so report it that way. - */ - ereport(ERROR, - (errcode(ERRCODE_INDETERMINATE_COLLATION), - errmsg("could not determine which collation to use for lower() function"), - errhint("Use the COLLATE clause to set the collation explicitly."))); - } -#ifdef HAVE_LOCALE_T - mylocale = pg_newlocale_from_collation(collid); -#endif - } + char *p; - result = pnstrdup(buff, nbytes); + result = pnstrdup(buff, nbytes); - /* - * Note: we assume that tolower_l() will not be so broken as to need - * an isupper_l() guard test. When using the default collation, we - * apply the traditional Postgres behavior that forces ASCII-style - * treatment of I/i, but in non-default collations you get exactly - * what the collation says. - */ - for (p = result; *p; p++) - { + /* + * Note: we assume that tolower_l() will not be so broken as to need + * an isupper_l() guard test. When using the default collation, we + * apply the traditional Postgres behavior that forces ASCII-style + * treatment of I/i, but in non-default collations you get exactly + * what the collation says. 
+ */ + for (p = result; *p; p++) + { #ifdef HAVE_LOCALE_T - if (mylocale) - *p = tolower_l((unsigned char) *p, mylocale); - else + if (mylocale) + *p = tolower_l((unsigned char) *p, mylocale->info.lt); + else #endif - *p = pg_tolower((unsigned char) *p); + *p = pg_tolower((unsigned char) *p); + } + } } } @@ -1599,12 +1638,9 @@ str_toupper(const char *buff, size_t nbytes, Oid collid) result = asc_toupper(buff, nbytes); } #ifdef USE_WIDE_UPPER_LOWER - else if (pg_database_encoding_max_length() > 1) + else { pg_locale_t mylocale = 0; - wchar_t *workspace; - size_t curr_char; - size_t result_size; if (collid != DEFAULT_COLLATION_OID) { @@ -1622,77 +1658,78 @@ str_toupper(const char *buff, size_t nbytes, Oid collid) mylocale = pg_newlocale_from_collation(collid); } - /* Overflow paranoia */ - if ((nbytes + 1) > (INT_MAX / sizeof(wchar_t))) - ereport(ERROR, - (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of memory"))); +#ifdef USE_ICU + if (mylocale && mylocale->provider == COLLPROVIDER_ICU) + { + int32_t len_uchar, len_conv; + UChar *buff_uchar; + UChar *buff_conv; - /* Output workspace cannot have more codes than input bytes */ - workspace = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t)); + len_uchar = icu_to_uchar(&buff_uchar, buff, nbytes); + len_conv = icu_convert_case(u_strToUpper, mylocale, &buff_conv, buff_uchar, len_uchar); + icu_from_uchar(&result, buff_conv, len_conv); + } + else +#endif + { + if (pg_database_encoding_max_length() > 1) + { + wchar_t *workspace; + size_t curr_char; + size_t result_size; - char2wchar(workspace, nbytes + 1, buff, nbytes, mylocale); + /* Overflow paranoia */ + if ((nbytes + 1) > (INT_MAX / sizeof(wchar_t))) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); - for (curr_char = 0; workspace[curr_char] != 0; curr_char++) - { -#ifdef HAVE_LOCALE_T - if (mylocale) - workspace[curr_char] = towupper_l(workspace[curr_char], mylocale); - else -#endif - workspace[curr_char] = 
towupper(workspace[curr_char]); - } + /* Output workspace cannot have more codes than input bytes */ + workspace = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t)); - /* Make result large enough; case change might change number of bytes */ - result_size = curr_char * pg_database_encoding_max_length() + 1; - result = palloc(result_size); + char2wchar(workspace, nbytes + 1, buff, nbytes, mylocale); - wchar2char(result, workspace, result_size, mylocale); - pfree(workspace); - } -#endif /* USE_WIDE_UPPER_LOWER */ - else - { + for (curr_char = 0; workspace[curr_char] != 0; curr_char++) + { #ifdef HAVE_LOCALE_T - pg_locale_t mylocale = 0; + if (mylocale) + workspace[curr_char] = towupper_l(workspace[curr_char], mylocale->info.lt); + else #endif - char *p; + workspace[curr_char] = towupper(workspace[curr_char]); + } - if (collid != DEFAULT_COLLATION_OID) - { - if (!OidIsValid(collid)) - { - /* - * This typically means that the parser could not resolve a - * conflict of implicit collations, so report it that way. - */ - ereport(ERROR, - (errcode(ERRCODE_INDETERMINATE_COLLATION), - errmsg("could not determine which collation to use for upper() function"), - errhint("Use the COLLATE clause to set the collation explicitly."))); + /* Make result large enough; case change might change number of bytes */ + result_size = curr_char * pg_database_encoding_max_length() + 1; + result = palloc(result_size); + + wchar2char(result, workspace, result_size, mylocale); + pfree(workspace); } -#ifdef HAVE_LOCALE_T - mylocale = pg_newlocale_from_collation(collid); -#endif - } +#endif /* USE_WIDE_UPPER_LOWER */ + else + { + char *p; - result = pnstrdup(buff, nbytes); + result = pnstrdup(buff, nbytes); - /* - * Note: we assume that toupper_l() will not be so broken as to need - * an islower_l() guard test. 
When using the default collation, we - * apply the traditional Postgres behavior that forces ASCII-style - * treatment of I/i, but in non-default collations you get exactly - * what the collation says. - */ - for (p = result; *p; p++) - { + /* + * Note: we assume that toupper_l() will not be so broken as to need + * an islower_l() guard test. When using the default collation, we + * apply the traditional Postgres behavior that forces ASCII-style + * treatment of I/i, but in non-default collations you get exactly + * what the collation says. + */ + for (p = result; *p; p++) + { #ifdef HAVE_LOCALE_T - if (mylocale) - *p = toupper_l((unsigned char) *p, mylocale); - else + if (mylocale) + *p = toupper_l((unsigned char) *p, mylocale->info.lt); + else #endif - *p = pg_toupper((unsigned char) *p); + *p = pg_toupper((unsigned char) *p); + } + } } } @@ -1720,12 +1757,9 @@ str_initcap(const char *buff, size_t nbytes, Oid collid) result = asc_initcap(buff, nbytes); } #ifdef USE_WIDE_UPPER_LOWER - else if (pg_database_encoding_max_length() > 1) + else { pg_locale_t mylocale = 0; - wchar_t *workspace; - size_t curr_char; - size_t result_size; if (collid != DEFAULT_COLLATION_OID) { @@ -1743,100 +1777,101 @@ str_initcap(const char *buff, size_t nbytes, Oid collid) mylocale = pg_newlocale_from_collation(collid); } - /* Overflow paranoia */ - if ((nbytes + 1) > (INT_MAX / sizeof(wchar_t))) - ereport(ERROR, - (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of memory"))); - - /* Output workspace cannot have more codes than input bytes */ - workspace = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t)); - - char2wchar(workspace, nbytes + 1, buff, nbytes, mylocale); - - for (curr_char = 0; workspace[curr_char] != 0; curr_char++) +#ifdef USE_ICU + if (mylocale && mylocale->provider == COLLPROVIDER_ICU) { -#ifdef HAVE_LOCALE_T - if (mylocale) - { - if (wasalnum) - workspace[curr_char] = towlower_l(workspace[curr_char], mylocale); - else - workspace[curr_char] = 
towupper_l(workspace[curr_char], mylocale); - wasalnum = iswalnum_l(workspace[curr_char], mylocale); - } - else + int32_t len_uchar, len_conv; + UChar *buff_uchar; + UChar *buff_conv; + + len_uchar = icu_to_uchar(&buff_uchar, buff, nbytes); + len_conv = icu_convert_case(u_strToTitle_default_BI, mylocale, &buff_conv, buff_uchar, len_uchar); + icu_from_uchar(&result, buff_conv, len_conv); + } + else #endif + { + if (pg_database_encoding_max_length() > 1) { - if (wasalnum) - workspace[curr_char] = towlower(workspace[curr_char]); - else - workspace[curr_char] = towupper(workspace[curr_char]); - wasalnum = iswalnum(workspace[curr_char]); - } - } + wchar_t *workspace; + size_t curr_char; + size_t result_size; - /* Make result large enough; case change might change number of bytes */ - result_size = curr_char * pg_database_encoding_max_length() + 1; - result = palloc(result_size); + /* Overflow paranoia */ + if ((nbytes + 1) > (INT_MAX / sizeof(wchar_t))) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); - wchar2char(result, workspace, result_size, mylocale); - pfree(workspace); - } -#endif /* USE_WIDE_UPPER_LOWER */ - else - { -#ifdef HAVE_LOCALE_T - pg_locale_t mylocale = 0; -#endif - char *p; + /* Output workspace cannot have more codes than input bytes */ + workspace = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t)); - if (collid != DEFAULT_COLLATION_OID) - { - if (!OidIsValid(collid)) - { - /* - * This typically means that the parser could not resolve a - * conflict of implicit collations, so report it that way. 
- */ - ereport(ERROR, - (errcode(ERRCODE_INDETERMINATE_COLLATION), - errmsg("could not determine which collation to use for initcap() function"), - errhint("Use the COLLATE clause to set the collation explicitly."))); - } + char2wchar(workspace, nbytes + 1, buff, nbytes, mylocale); + + for (curr_char = 0; workspace[curr_char] != 0; curr_char++) + { #ifdef HAVE_LOCALE_T - mylocale = pg_newlocale_from_collation(collid); + if (mylocale) + { + if (wasalnum) + workspace[curr_char] = towlower_l(workspace[curr_char], mylocale->info.lt); + else + workspace[curr_char] = towupper_l(workspace[curr_char], mylocale->info.lt); + wasalnum = iswalnum_l(workspace[curr_char], mylocale->info.lt); + } + else #endif - } + { + if (wasalnum) + workspace[curr_char] = towlower(workspace[curr_char]); + else + workspace[curr_char] = towupper(workspace[curr_char]); + wasalnum = iswalnum(workspace[curr_char]); + } + } - result = pnstrdup(buff, nbytes); + /* Make result large enough; case change might change number of bytes */ + result_size = curr_char * pg_database_encoding_max_length() + 1; + result = palloc(result_size); - /* - * Note: we assume that toupper_l()/tolower_l() will not be so broken - * as to need guard tests. When using the default collation, we apply - * the traditional Postgres behavior that forces ASCII-style treatment - * of I/i, but in non-default collations you get exactly what the - * collation says. 
- */ - for (p = result; *p; p++) - { -#ifdef HAVE_LOCALE_T - if (mylocale) - { - if (wasalnum) - *p = tolower_l((unsigned char) *p, mylocale); - else - *p = toupper_l((unsigned char) *p, mylocale); - wasalnum = isalnum_l((unsigned char) *p, mylocale); + wchar2char(result, workspace, result_size, mylocale); + pfree(workspace); } +#endif /* USE_WIDE_UPPER_LOWER */ else -#endif { - if (wasalnum) - *p = pg_tolower((unsigned char) *p); - else - *p = pg_toupper((unsigned char) *p); - wasalnum = isalnum((unsigned char) *p); + char *p; + + result = pnstrdup(buff, nbytes); + + /* + * Note: we assume that toupper_l()/tolower_l() will not be so broken + * as to need guard tests. When using the default collation, we apply + * the traditional Postgres behavior that forces ASCII-style treatment + * of I/i, but in non-default collations you get exactly what the + * collation says. + */ + for (p = result; *p; p++) + { +#ifdef HAVE_LOCALE_T + if (mylocale) + { + if (wasalnum) + *p = tolower_l((unsigned char) *p, mylocale->info.lt); + else + *p = toupper_l((unsigned char) *p, mylocale->info.lt); + wasalnum = isalnum_l((unsigned char) *p, mylocale->info.lt); + } + else +#endif + { + if (wasalnum) + *p = pg_tolower((unsigned char) *p); + else + *p = pg_toupper((unsigned char) *p); + wasalnum = isalnum((unsigned char) *p); + } + } } } } diff --git a/src/backend/utils/adt/like.c b/src/backend/utils/adt/like.c index 8d9d285fb5..1f683ccd0f 100644 --- a/src/backend/utils/adt/like.c +++ b/src/backend/utils/adt/like.c @@ -96,7 +96,7 @@ SB_lower_char(unsigned char c, pg_locale_t locale, bool locale_is_c) return pg_ascii_tolower(c); #ifdef HAVE_LOCALE_T else if (locale) - return tolower_l(c, locale); + return tolower_l(c, locale->info.lt); #endif else return pg_tolower(c); @@ -165,14 +165,36 @@ Generic_Text_IC_like(text *str, text *pat, Oid collation) *p; int slen, plen; + pg_locale_t locale = 0; + bool locale_is_c = false; + + if (lc_ctype_is_c(collation)) + locale_is_c = true; + else if 
(collation != DEFAULT_COLLATION_OID) + { + if (!OidIsValid(collation)) + { + /* + * This typically means that the parser could not resolve a + * conflict of implicit collations, so report it that way. + */ + ereport(ERROR, + (errcode(ERRCODE_INDETERMINATE_COLLATION), + errmsg("could not determine which collation to use for ILIKE"), + errhint("Use the COLLATE clause to set the collation explicitly."))); + } + locale = pg_newlocale_from_collation(collation); + } /* * For efficiency reasons, in the single byte case we don't call lower() * on the pattern and text, but instead call SB_lower_char on each - * character. In the multi-byte case we don't have much choice :-( + * character. In the multi-byte case we don't have much choice :-(. + * Also, ICU does not support single-character case folding, so we go the + * long way. */ - if (pg_database_encoding_max_length() > 1) + if (pg_database_encoding_max_length() > 1 || locale->provider == COLLPROVIDER_ICU) { /* lower's result is never packed, so OK to use old macros here */ pat = DatumGetTextPP(DirectFunctionCall1Coll(lower, collation, @@ -190,31 +212,6 @@ Generic_Text_IC_like(text *str, text *pat, Oid collation) } else { - /* - * Here we need to prepare locale information for SB_lower_char. This - * should match the methods used in str_tolower(). - */ - pg_locale_t locale = 0; - bool locale_is_c = false; - - if (lc_ctype_is_c(collation)) - locale_is_c = true; - else if (collation != DEFAULT_COLLATION_OID) - { - if (!OidIsValid(collation)) - { - /* - * This typically means that the parser could not resolve a - * conflict of implicit collations, so report it that way. 
- */ - ereport(ERROR, - (errcode(ERRCODE_INDETERMINATE_COLLATION), - errmsg("could not determine which collation to use for ILIKE"), - errhint("Use the COLLATE clause to set the collation explicitly."))); - } - locale = pg_newlocale_from_collation(collation); - } - p = VARDATA_ANY(pat); plen = VARSIZE_ANY_EXHDR(pat); s = VARDATA_ANY(str); diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index ab197025f8..2a2c9bc504 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -57,11 +57,17 @@ #include "catalog/pg_collation.h" #include "catalog/pg_control.h" #include "mb/pg_wchar.h" +#include "utils/builtins.h" #include "utils/hsearch.h" +#include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/pg_locale.h" #include "utils/syscache.h" +#ifdef USE_ICU +#include <unicode/ucnv.h> +#endif + #ifdef WIN32 /* * This Windows file defines StrNCpy. We don't need it here, so we undefine @@ -1272,12 +1278,13 @@ pg_newlocale_from_collation(Oid collid) if (cache_entry->locale == 0) { /* We haven't computed this yet in this session, so do it */ -#ifdef HAVE_LOCALE_T HeapTuple tp; Form_pg_collation collform; const char *collcollate; - const char *collctype; - locale_t result; + const char *collctype pg_attribute_unused(); + pg_locale_t result; + Datum collversion; + bool isnull; tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collid)); if (!HeapTupleIsValid(tp)) @@ -1287,61 +1294,230 @@ pg_newlocale_from_collation(Oid collid) collcollate = NameStr(collform->collcollate); collctype = NameStr(collform->collctype); - if (strcmp(collcollate, collctype) == 0) + result = malloc(sizeof(* result)); + memset(result, 0, sizeof(* result)); + result->provider = collform->collprovider; + + if (collform->collprovider == COLLPROVIDER_LIBC) { - /* Normal case where they're the same */ +#ifdef HAVE_LOCALE_T + locale_t loc; + + if (strcmp(collcollate, collctype) == 0) + { + /* Normal case where they're the same */ #ifndef 
WIN32 - result = newlocale(LC_COLLATE_MASK | LC_CTYPE_MASK, collcollate, - NULL); + loc = newlocale(LC_COLLATE_MASK | LC_CTYPE_MASK, collcollate, + NULL); #else - result = _create_locale(LC_ALL, collcollate); + loc = _create_locale(LC_ALL, collcollate); #endif - if (!result) - report_newlocale_failure(collcollate); - } - else - { + if (!loc) + report_newlocale_failure(collcollate); + } + else + { #ifndef WIN32 - /* We need two newlocale() steps */ - locale_t loc1; - - loc1 = newlocale(LC_COLLATE_MASK, collcollate, NULL); - if (!loc1) - report_newlocale_failure(collcollate); - result = newlocale(LC_CTYPE_MASK, collctype, loc1); - if (!result) - report_newlocale_failure(collctype); + /* We need two newlocale() steps */ + locale_t loc1; + + loc1 = newlocale(LC_COLLATE_MASK, collcollate, NULL); + if (!loc1) + report_newlocale_failure(collcollate); + loc = newlocale(LC_CTYPE_MASK, collctype, loc1); + if (!loc) + report_newlocale_failure(collctype); #else - /* - * XXX The _create_locale() API doesn't appear to support this. - * Could perhaps be worked around by changing pg_locale_t to - * contain two separate fields. - */ + /* + * XXX The _create_locale() API doesn't appear to support this. + * Could perhaps be worked around by changing pg_locale_t to + * contain two separate fields. 
+ */ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("collations with different collate and ctype values are not supported on this platform"))); +#endif + } + + result->info.lt = loc; +#else /* not HAVE_LOCALE_T */ + /* platform that doesn't support locale_t */ ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("collations with different collate and ctype values are not supported on this platform"))); -#endif + errmsg("collation provider LIBC is not supported on this platform"))); +#endif /* not HAVE_LOCALE_T */ + } + else if (collform->collprovider == COLLPROVIDER_ICU) + { +#ifdef USE_ICU + UCollator *collator; + UErrorCode status; + + status = U_ZERO_ERROR; + collator = ucol_open(collcollate, &status); + if (U_FAILURE(status)) + ereport(ERROR, + (errmsg("could not open collator for locale \"%s\": %s", + collcollate, u_errorName(status)))); + + result->info.icu.locale = strdup(collcollate); + result->info.icu.ucol = collator; +#else /* not USE_ICU */ + /* could get here if a collation was created by a build with ICU */ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("ICU is not supported in this build"), \ + errhint("You need to rebuild PostgreSQL using --with-icu."))); +#endif /* not USE_ICU */ } - cache_entry->locale = result; + collversion = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collversion, + &isnull); + if (!isnull) + { + char *actual_versionstr; + char *collversionstr; + + actual_versionstr = get_collation_actual_version(collform->collprovider, collcollate); + if (!actual_versionstr) + /* This could happen when specifying a version in CREATE + * COLLATION for a libc locale, or manually creating a mess + * in the catalogs. 
*/ + ereport(ERROR, + (errmsg("collation \"%s\" has no actual version, but a version was specified", + NameStr(collform->collname)))); + collversionstr = TextDatumGetCString(collversion); + + if (strcmp(actual_versionstr, collversionstr) != 0) + ereport(WARNING, + (errmsg("collation \"%s\" has version mismatch", + NameStr(collform->collname)), + errdetail("The collation in the database was created using version %s, " + "but the operating system provides version %s.", + collversionstr, actual_versionstr), + errhint("Rebuild all objects affected by this collation and run " + "ALTER COLLATION %s REFRESH VERSION, " + "or build PostgreSQL with the right library version.", + quote_qualified_identifier(get_namespace_name(collform->collnamespace), + NameStr(collform->collname))))); + } ReleaseSysCache(tp); -#else /* not HAVE_LOCALE_T */ - /* - * For platforms that don't support locale_t, we can't do anything - * with non-default collations. - */ - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("nondefault collations are not supported on this platform"))); -#endif /* not HAVE_LOCALE_T */ + cache_entry->locale = result; } return cache_entry->locale; } +/* + * Get provider-specific collation version string for the given collation from + * the operating system/library. + * + * A particular provider must always either return a non-NULL string or return + * NULL (if it doesn't support versions). It must not return NULL for some + * collcollate and not NULL for others. 
+ */ +char * +get_collation_actual_version(char collprovider, const char *collcollate) +{ + char *collversion; + +#ifdef USE_ICU + if (collprovider == COLLPROVIDER_ICU) + { + UCollator *collator; + UErrorCode status; + UVersionInfo versioninfo; + char buf[U_MAX_VERSION_STRING_LENGTH]; + + status = U_ZERO_ERROR; + collator = ucol_open(collcollate, &status); + if (U_FAILURE(status)) + ereport(ERROR, + (errmsg("could not open collator for locale \"%s\": %s", + collcollate, u_errorName(status)))); + ucol_getVersion(collator, versioninfo); + ucol_close(collator); + + u_versionToString(versioninfo, buf); + collversion = pstrdup(buf); + } + else +#endif + collversion = NULL; + + return collversion; +} + + +#ifdef USE_ICU +/* + * Converter object for converting between ICU's UChar strings and C strings + * in database encoding. Since the database encoding doesn't change, we only + * need one of these per session. + */ +static UConverter *icu_converter = NULL; + +static void +init_icu_converter(void) +{ + const char *icu_encoding_name; + UErrorCode status; + UConverter *conv; + + if (icu_converter) + return; + + icu_encoding_name = get_encoding_name_for_icu(GetDatabaseEncoding()); + + status = U_ZERO_ERROR; + conv = ucnv_open(icu_encoding_name, &status); + if (U_FAILURE(status)) + ereport(ERROR, + (errmsg("could not open ICU converter for encoding \"%s\": %s", + icu_encoding_name, u_errorName(status)))); + + icu_converter = conv; +} + +int32_t +icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes) +{ + UErrorCode status; + int32_t len_uchar; + + init_icu_converter(); + + len_uchar = 2 * nbytes; /* max length per docs */ + *buff_uchar = palloc(len_uchar * sizeof(**buff_uchar)); + status = U_ZERO_ERROR; + len_uchar = ucnv_toUChars(icu_converter, *buff_uchar, len_uchar, buff, nbytes, &status); + if (U_FAILURE(status)) + ereport(ERROR, + (errmsg("ucnv_toUChars failed: %s", u_errorName(status)))); + return len_uchar; +} + +int32_t +icu_from_uchar(char **result, UChar 
*buff_uchar, int32_t len_uchar) +{ + UErrorCode status; + int32_t len_result; + + init_icu_converter(); + + len_result = UCNV_GET_MAX_BYTES_FOR_STRING(len_uchar, ucnv_getMaxCharSize(icu_converter)); + *result = palloc(len_result + 1); + status = U_ZERO_ERROR; + ucnv_fromUChars(icu_converter, *result, len_result, buff_uchar, len_uchar, &status); + if (U_FAILURE(status)) + ereport(ERROR, + (errmsg("ucnv_fromUChars failed: %s", u_errorName(status)))); + return len_result; +} +#endif /* * These functions convert from/to libc's wchar_t, *not* pg_wchar_t. @@ -1362,6 +1538,8 @@ wchar2char(char *to, const wchar_t *from, size_t tolen, pg_locale_t locale) { size_t result; + Assert(!locale || locale->provider == COLLPROVIDER_LIBC); + if (tolen == 0) return 0; @@ -1398,10 +1576,10 @@ wchar2char(char *to, const wchar_t *from, size_t tolen, pg_locale_t locale) #ifdef HAVE_LOCALE_T #ifdef HAVE_WCSTOMBS_L /* Use wcstombs_l for nondefault locales */ - result = wcstombs_l(to, from, tolen, locale); + result = wcstombs_l(to, from, tolen, locale->info.lt); #else /* !HAVE_WCSTOMBS_L */ /* We have to temporarily set the locale as current ... ugh */ - locale_t save_locale = uselocale(locale); + locale_t save_locale = uselocale(locale->info.lt); result = wcstombs(to, from, tolen); @@ -1432,6 +1610,8 @@ char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen, { size_t result; + Assert(!locale || locale->provider == COLLPROVIDER_LIBC); + if (tolen == 0) return 0; @@ -1473,10 +1653,10 @@ char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen, #ifdef HAVE_LOCALE_T #ifdef HAVE_MBSTOWCS_L /* Use mbstowcs_l for nondefault locales */ - result = mbstowcs_l(to, str, tolen, locale); + result = mbstowcs_l(to, str, tolen, locale->info.lt); #else /* !HAVE_MBSTOWCS_L */ /* We have to temporarily set the locale as current ... 
ugh */ - locale_t save_locale = uselocale(locale); + locale_t save_locale = uselocale(locale->info.lt); result = mbstowcs(to, str, tolen); diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index bb9a544686..f8b28fe0e6 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -5259,7 +5259,7 @@ find_join_input_rel(PlannerInfo *root, Relids relids) /* * Check whether char is a letter (and, hence, subject to case-folding) * - * In multibyte character sets, we can't use isalpha, and it does not seem + * In multibyte character sets or with ICU, we can't use isalpha, and it does not seem * worth trying to convert to wchar_t to use iswalpha. Instead, just assume * any multibyte char is potentially case-varying. */ @@ -5271,9 +5271,11 @@ pattern_char_isalpha(char c, bool is_multibyte, return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); else if (is_multibyte && IS_HIGHBIT_SET(c)) return true; + else if (locale && locale->provider == COLLPROVIDER_ICU) + return IS_HIGHBIT_SET(c) ? 
true : false; #ifdef HAVE_LOCALE_T - else if (locale) - return isalpha_l((unsigned char) c, locale); + else if (locale && locale->provider == COLLPROVIDER_LIBC) + return isalpha_l((unsigned char) c, locale->info.lt); #endif else return isalpha((unsigned char) c); diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index cd036afc00..aa556aa5de 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -73,9 +73,7 @@ typedef struct hyperLogLogState abbr_card; /* Abbreviated key cardinality state */ hyperLogLogState full_card; /* Full key cardinality state */ double prop_card; /* Required cardinality proportion */ -#ifdef HAVE_LOCALE_T pg_locale_t locale; -#endif } VarStringSortSupport; /* @@ -1403,10 +1401,7 @@ varstr_cmp(char *arg1, int len1, char *arg2, int len2, Oid collid) char a2buf[TEXTBUFLEN]; char *a1p, *a2p; - -#ifdef HAVE_LOCALE_T pg_locale_t mylocale = 0; -#endif if (collid != DEFAULT_COLLATION_OID) { @@ -1421,9 +1416,7 @@ varstr_cmp(char *arg1, int len1, char *arg2, int len2, Oid collid) errmsg("could not determine which collation to use for string comparison"), errhint("Use the COLLATE clause to set the collation explicitly."))); } -#ifdef HAVE_LOCALE_T mylocale = pg_newlocale_from_collation(collid); -#endif } /* @@ -1542,11 +1535,54 @@ varstr_cmp(char *arg1, int len1, char *arg2, int len2, Oid collid) memcpy(a2p, arg2, len2); a2p[len2] = '\0'; -#ifdef HAVE_LOCALE_T if (mylocale) - result = strcoll_l(a1p, a2p, mylocale); - else + { + if (mylocale->provider == COLLPROVIDER_ICU) + { +#ifdef USE_ICU +#ifdef HAVE_UCOL_STRCOLLUTF8 + if (GetDatabaseEncoding() == PG_UTF8) + { + UErrorCode status; + + status = U_ZERO_ERROR; + result = ucol_strcollUTF8(mylocale->info.icu.ucol, + arg1, len1, + arg2, len2, + &status); + if (U_FAILURE(status)) + ereport(ERROR, + (errmsg("collation failed: %s", u_errorName(status)))); + } + else +#endif + { + int32_t ulen1, ulen2; + UChar *uchar1, *uchar2; + + ulen1 = 
icu_to_uchar(&uchar1, arg1, len1); + ulen2 = icu_to_uchar(&uchar2, arg2, len2); + + result = ucol_strcoll(mylocale->info.icu.ucol, + uchar1, ulen1, + uchar2, ulen2); + } +#else /* not USE_ICU */ + /* shouldn't happen */ + elog(ERROR, "unsupported collprovider: %c", mylocale->provider); +#endif /* not USE_ICU */ + } + else + { +#ifdef HAVE_LOCALE_T + result = strcoll_l(a1p, a2p, mylocale->info.lt); +#else + /* shouldn't happen */ + elog(ERROR, "unsupported collprovider: %c", mylocale->provider); #endif + } + } + else result = strcoll(a1p, a2p); /* @@ -1768,10 +1804,7 @@ varstr_sortsupport(SortSupport ssup, Oid collid, bool bpchar) bool abbreviate = ssup->abbreviate; bool collate_c = false; VarStringSortSupport *sss; - -#ifdef HAVE_LOCALE_T pg_locale_t locale = 0; -#endif /* * If possible, set ssup->comparator to a function which can be used to @@ -1826,9 +1859,7 @@ varstr_sortsupport(SortSupport ssup, Oid collid, bool bpchar) errmsg("could not determine which collation to use for string comparison"), errhint("Use the COLLATE clause to set the collation explicitly."))); } -#ifdef HAVE_LOCALE_T locale = pg_newlocale_from_collation(collid); -#endif } } @@ -1854,7 +1885,7 @@ varstr_sortsupport(SortSupport ssup, Oid collid, bool bpchar) * platforms. 
*/ #ifndef TRUST_STRXFRM - if (!collate_c) + if (!collate_c && !(locale && locale->provider == COLLPROVIDER_ICU)) abbreviate = false; #endif @@ -1877,9 +1908,7 @@ varstr_sortsupport(SortSupport ssup, Oid collid, bool bpchar) sss->last_len2 = -1; /* Initialize */ sss->last_returned = 0; -#ifdef HAVE_LOCALE_T sss->locale = locale; -#endif /* * To avoid somehow confusing a strxfrm() blob and an original string, @@ -2090,11 +2119,54 @@ varstrfastcmp_locale(Datum x, Datum y, SortSupport ssup) goto done; } -#ifdef HAVE_LOCALE_T if (sss->locale) - result = strcoll_l(sss->buf1, sss->buf2, sss->locale); - else + { + if (sss->locale->provider == COLLPROVIDER_ICU) + { +#ifdef USE_ICU +#ifdef HAVE_UCOL_STRCOLLUTF8 + if (GetDatabaseEncoding() == PG_UTF8) + { + UErrorCode status; + + status = U_ZERO_ERROR; + result = ucol_strcollUTF8(sss->locale->info.icu.ucol, + a1p, len1, + a2p, len2, + &status); + if (U_FAILURE(status)) + ereport(ERROR, + (errmsg("collation failed: %s", u_errorName(status)))); + } + else #endif + { + int32_t ulen1, ulen2; + UChar *uchar1, *uchar2; + + ulen1 = icu_to_uchar(&uchar1, a1p, len1); + ulen2 = icu_to_uchar(&uchar2, a2p, len2); + + result = ucol_strcoll(sss->locale->info.icu.ucol, + uchar1, ulen1, + uchar2, ulen2); + } +#else /* not USE_ICU */ + /* shouldn't happen */ + elog(ERROR, "unsupported collprovider: %c", sss->locale->provider); +#endif /* not USE_ICU */ + } + else + { +#ifdef HAVE_LOCALE_T + result = strcoll_l(sss->buf1, sss->buf2, sss->locale->info.lt); +#else + /* shouldn't happen */ + elog(ERROR, "unsupported collprovider: %c", sss->locale->provider); +#endif + } + } + else result = strcoll(sss->buf1, sss->buf2); /* @@ -2200,9 +2272,14 @@ varstr_abbrev_convert(Datum original, SortSupport ssup) else { Size bsize; +#ifdef USE_ICU + int32_t ulen = -1; + UChar *uchar; +#endif /* - * We're not using the C collation, so fall back on strxfrm. + * We're not using the C collation, so fall back on strxfrm or ICU + * analogs. 
*/ /* By convention, we use buffer 1 to store and NUL-terminate */ @@ -2222,17 +2299,66 @@ varstr_abbrev_convert(Datum original, SortSupport ssup) goto done; } - /* Just like strcoll(), strxfrm() expects a NUL-terminated string */ memcpy(sss->buf1, authoritative_data, len); + /* Just like strcoll(), strxfrm() expects a NUL-terminated string. + * Not necessary for ICU, but doesn't hurt. */ sss->buf1[len] = '\0'; sss->last_len1 = len; +#ifdef USE_ICU + /* When using ICU and not UTF8, convert string to UChar. */ + if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU && + GetDatabaseEncoding() != PG_UTF8) + ulen = icu_to_uchar(&uchar, sss->buf1, len); +#endif + + /* + * Loop: Call strxfrm() or ucol_getSortKey(), possibly enlarge buffer, + * and try again. Both of these functions have the result buffer + * content undefined if the result did not fit, so we need to retry + * until everything fits, even though we only need the first few bytes + * in the end. When using ucol_nextSortKeyPart(), however, we only + * ask for as many bytes as we actually need. + */ for (;;) { +#ifdef USE_ICU + if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU) + { + /* + * When using UTF8, use the iteration interface so we only + * need to produce as many bytes as we actually need. 
+ */ + if (GetDatabaseEncoding() == PG_UTF8) + { + UCharIterator iter; + uint32_t state[2]; + UErrorCode status; + + uiter_setUTF8(&iter, sss->buf1, len); + state[0] = state[1] = 0; /* won't need that again */ + status = U_ZERO_ERROR; + bsize = ucol_nextSortKeyPart(sss->locale->info.icu.ucol, + &iter, + state, + (uint8_t *) sss->buf2, + Min(sizeof(Datum), sss->buflen2), + &status); + if (U_FAILURE(status)) + ereport(ERROR, + (errmsg("sort key generation failed: %s", u_errorName(status)))); + } + else + bsize = ucol_getSortKey(sss->locale->info.icu.ucol, + uchar, ulen, + (uint8_t *) sss->buf2, sss->buflen2); + } + else +#endif #ifdef HAVE_LOCALE_T - if (sss->locale) + if (sss->locale && sss->locale->provider == COLLPROVIDER_LIBC) bsize = strxfrm_l(sss->buf2, sss->buf1, - sss->buflen2, sss->locale); + sss->buflen2, sss->locale->info.lt); else #endif bsize = strxfrm(sss->buf2, sss->buf1, sss->buflen2); @@ -2242,8 +2368,7 @@ varstr_abbrev_convert(Datum original, SortSupport ssup) break; /* - * The C standard states that the contents of the buffer is now - * unspecified. Grow buffer, and retry. + * Grow buffer and retry. */ pfree(sss->buf2); sss->buflen2 = Max(bsize + 1, diff --git a/src/backend/utils/mb/encnames.c b/src/backend/utils/mb/encnames.c index 11099b844f..444eec25b5 100644 --- a/src/backend/utils/mb/encnames.c +++ b/src/backend/utils/mb/encnames.c @@ -403,6 +403,82 @@ const pg_enc2gettext pg_enc2gettext_tbl[] = }; +#ifndef FRONTEND + +/* + * Table of encoding names for ICU + * + * Reference: <https://ssl.icu-project.org/icu-bin/convexp> + * + * NULL entries are not supported by ICU, or their mapping is unclear. 
+ */ +static const char * const pg_enc2icu_tbl[] = +{ + NULL, /* PG_SQL_ASCII */ + "EUC-JP", /* PG_EUC_JP */ + "EUC-CN", /* PG_EUC_CN */ + "EUC-KR", /* PG_EUC_KR */ + "EUC-TW", /* PG_EUC_TW */ + NULL, /* PG_EUC_JIS_2004 */ + "UTF-8", /* PG_UTF8 */ + NULL, /* PG_MULE_INTERNAL */ + "ISO-8859-1", /* PG_LATIN1 */ + "ISO-8859-2", /* PG_LATIN2 */ + "ISO-8859-3", /* PG_LATIN3 */ + "ISO-8859-4", /* PG_LATIN4 */ + "ISO-8859-9", /* PG_LATIN5 */ + "ISO-8859-10", /* PG_LATIN6 */ + "ISO-8859-13", /* PG_LATIN7 */ + "ISO-8859-14", /* PG_LATIN8 */ + "ISO-8859-15", /* PG_LATIN9 */ + NULL, /* PG_LATIN10 */ + "CP1256", /* PG_WIN1256 */ + "CP1258", /* PG_WIN1258 */ + "CP866", /* PG_WIN866 */ + NULL, /* PG_WIN874 */ + "KOI8-R", /* PG_KOI8R */ + "CP1251", /* PG_WIN1251 */ + "CP1252", /* PG_WIN1252 */ + "ISO-8859-5", /* PG_ISO_8859_5 */ + "ISO-8859-6", /* PG_ISO_8859_6 */ + "ISO-8859-7", /* PG_ISO_8859_7 */ + "ISO-8859-8", /* PG_ISO_8859_8 */ + "CP1250", /* PG_WIN1250 */ + "CP1253", /* PG_WIN1253 */ + "CP1254", /* PG_WIN1254 */ + "CP1255", /* PG_WIN1255 */ + "CP1257", /* PG_WIN1257 */ + "KOI8-U", /* PG_KOI8U */ +}; + +bool +is_encoding_supported_by_icu(int encoding) +{ + return (pg_enc2icu_tbl[encoding] != NULL); +} + +const char * +get_encoding_name_for_icu(int encoding) +{ + const char *icu_encoding_name; + + StaticAssertStmt(lengthof(pg_enc2icu_tbl) == PG_ENCODING_BE_LAST + 1, + "pg_enc2icu_tbl incomplete"); + + icu_encoding_name = pg_enc2icu_tbl[encoding]; + + if (!icu_encoding_name) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("encoding \"%s\" not supported by ICU", + pg_encoding_to_char(encoding)))); + + return icu_encoding_name; +} + +#endif /* not FRONTEND */ + + /* ---------- * Encoding checks, for error returns -1 else encoding id * ---------- |
