diff options
Diffstat (limited to 'src/backend/utils')
| -rw-r--r-- | src/backend/utils/adt/jsonpath_scan.l | 45 | ||||
| -rw-r--r-- | src/backend/utils/adt/xml.c | 24 | ||||
| -rw-r--r-- | src/backend/utils/mb/mbutils.c | 105 |
3 files changed, 117 insertions, 57 deletions
diff --git a/src/backend/utils/adt/jsonpath_scan.l b/src/backend/utils/adt/jsonpath_scan.l index 70681b789d..be0a2cfa2f 100644 --- a/src/backend/utils/adt/jsonpath_scan.l +++ b/src/backend/utils/adt/jsonpath_scan.l @@ -486,13 +486,6 @@ hexval(char c) static void addUnicodeChar(int ch) { - /* - * For UTF8, replace the escape sequence by the actual - * utf8 character in lex->strval. Do this also for other - * encodings if the escape designates an ASCII character, - * otherwise raise an error. - */ - if (ch == 0) { /* We can't allow this, since our TEXT type doesn't */ @@ -501,40 +494,20 @@ addUnicodeChar(int ch) errmsg("unsupported Unicode escape sequence"), errdetail("\\u0000 cannot be converted to text."))); } - else if (GetDatabaseEncoding() == PG_UTF8) - { - char utf8str[5]; - int utf8len; - - unicode_to_utf8(ch, (unsigned char *) utf8str); - utf8len = pg_utf_mblen((unsigned char *) utf8str); - addstring(false, utf8str, utf8len); - } - else if (ch <= 0x007f) - { - /* - * This is the only way to designate things like a - * form feed character in JSON, so it's useful in all - * encodings. - */ - addchar(false, (char) ch); - } else { - ereport(ERROR, - (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("invalid input syntax for type %s", "jsonpath"), - errdetail("Unicode escape values cannot be used for code " - "point values above 007F when the server encoding " - "is not UTF8."))); + char cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1]; + + pg_unicode_to_server(ch, (unsigned char *) cbuf); + addstring(false, cbuf, strlen(cbuf)); } } -/* Add unicode character and process its hi surrogate */ +/* Add unicode character, processing any surrogate pairs */ static void addUnicode(int ch, int *hi_surrogate) { - if (ch >= 0xd800 && ch <= 0xdbff) + if (is_utf16_surrogate_first(ch)) { if (*hi_surrogate != -1) ereport(ERROR, @@ -542,10 +515,10 @@ addUnicode(int ch, int *hi_surrogate) errmsg("invalid input syntax for type %s", "jsonpath"), errdetail("Unicode high surrogate must not follow " "a high surrogate."))); - *hi_surrogate = (ch & 0x3ff) << 10; + *hi_surrogate = ch; return; } - else if (ch >= 0xdc00 && ch <= 0xdfff) + else if (is_utf16_surrogate_second(ch)) { if (*hi_surrogate == -1) ereport(ERROR, @@ -553,7 +526,7 @@ addUnicode(int ch, int *hi_surrogate) errmsg("invalid input syntax for type %s", "jsonpath"), errdetail("Unicode low surrogate must follow a high " "surrogate."))); - ch = 0x10000 + *hi_surrogate + (ch & 0x3ff); + ch = surrogate_pair_to_codepoint(*hi_surrogate, ch); *hi_surrogate = -1; } else if (*hi_surrogate != -1) diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c index c7ae1eded8..4c299057a6 100644 --- a/src/backend/utils/adt/xml.c +++ b/src/backend/utils/adt/xml.c @@ -2086,26 +2086,6 @@ map_sql_identifier_to_xml_name(const char *ident, bool fully_escaped, /* - * Map a Unicode codepoint into the current server encoding. - */ -static char * -unicode_to_sqlchar(pg_wchar c) -{ - char utf8string[8]; /* need room for trailing zero */ - char *result; - - memset(utf8string, 0, sizeof(utf8string)); - unicode_to_utf8(c, (unsigned char *) utf8string); - - result = pg_any_to_server(utf8string, strlen(utf8string), PG_UTF8); - /* if pg_any_to_server didn't strdup, we must */ - if (result == utf8string) - result = pstrdup(result); - return result; -} - - -/* * Map XML name to SQL identifier; see SQL/XML:2008 section 9.3. */ char * @@ -2125,10 +2105,12 @@ map_xml_name_to_sql_identifier(const char *name) && isxdigit((unsigned char) *(p + 5)) && *(p + 6) == '_') { + char cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1]; unsigned int u; sscanf(p + 2, "%X", &u); - appendStringInfoString(&buf, unicode_to_sqlchar(u)); + pg_unicode_to_server(u, (unsigned char *) cbuf); + appendStringInfoString(&buf, cbuf); p += 6; } else diff --git a/src/backend/utils/mb/mbutils.c b/src/backend/utils/mb/mbutils.c index 86787bcb31..a8e13cacfd 100644 --- a/src/backend/utils/mb/mbutils.c +++ b/src/backend/utils/mb/mbutils.c @@ -68,6 +68,13 @@ static FmgrInfo *ToServerConvProc = NULL; static FmgrInfo *ToClientConvProc = NULL; /* + * This variable stores the conversion function to convert from UTF-8 + * to the server encoding. It's NULL if the server encoding *is* UTF-8, + * or if we lack a conversion function for this. + */ +static FmgrInfo *Utf8ToServerConvProc = NULL; + +/* * These variables track the currently-selected encodings. */ static const pg_enc2name *ClientEncoding = &pg_enc2name_tbl[PG_SQL_ASCII]; @@ -273,6 +280,8 @@ SetClientEncoding(int encoding) void InitializeClientEncoding(void) { + int current_server_encoding; + Assert(!backend_startup_complete); backend_startup_complete = true; @@ -289,6 +298,35 @@ InitializeClientEncoding(void) pg_enc2name_tbl[pending_client_encoding].name, GetDatabaseEncodingName()))); } + + /* + * Also look up the UTF8-to-server conversion function if needed. Since + * the server encoding is fixed within any one backend process, we don't + * have to do this more than once. + */ + current_server_encoding = GetDatabaseEncoding(); + if (current_server_encoding != PG_UTF8 && + current_server_encoding != PG_SQL_ASCII) + { + Oid utf8_to_server_proc; + + Assert(IsTransactionState()); + utf8_to_server_proc = + FindDefaultConversionProc(PG_UTF8, + current_server_encoding); + /* If there's no such conversion, just leave the pointer as NULL */ + if (OidIsValid(utf8_to_server_proc)) + { + FmgrInfo *finfo; + + finfo = (FmgrInfo *) MemoryContextAlloc(TopMemoryContext, + sizeof(FmgrInfo)); + fmgr_info_cxt(utf8_to_server_proc, finfo, + TopMemoryContext); + /* Set Utf8ToServerConvProc only after data is fully valid */ + Utf8ToServerConvProc = finfo; + } + } } /* @@ -752,6 +790,73 @@ perform_default_encoding_conversion(const char *src, int len, return result; } +/* + * Convert a single Unicode code point into a string in the server encoding. + * + * The code point given by "c" is converted and stored at *s, which must + * have at least MAX_UNICODE_EQUIVALENT_STRING+1 bytes available. + * The output will have a trailing '\0'. Throws error if the conversion + * cannot be performed. + * + * Note that this relies on having previously looked up any required + * conversion function. That's partly for speed but mostly because the parser + * may call this outside any transaction, or in an aborted transaction. + */ +void +pg_unicode_to_server(pg_wchar c, unsigned char *s) +{ + unsigned char c_as_utf8[MAX_MULTIBYTE_CHAR_LEN + 1]; + int c_as_utf8_len; + int server_encoding; + + /* + * Complain if invalid Unicode code point. The choice of errcode here is + * debatable, but really our caller should have checked this anyway. + */ + if (!is_valid_unicode_codepoint(c)) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("invalid Unicode code point"))); + + /* Otherwise, if it's in ASCII range, conversion is trivial */ + if (c <= 0x7F) + { + s[0] = (unsigned char) c; + s[1] = '\0'; + return; + } + + /* If the server encoding is UTF-8, we just need to reformat the code */ + server_encoding = GetDatabaseEncoding(); + if (server_encoding == PG_UTF8) + { + unicode_to_utf8(c, s); + s[pg_utf_mblen(s)] = '\0'; + return; + } + + /* For all other cases, we must have a conversion function available */ + if (Utf8ToServerConvProc == NULL) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("conversion between %s and %s is not supported", + pg_enc2name_tbl[PG_UTF8].name, + GetDatabaseEncodingName()))); + + /* Construct UTF-8 source string */ + unicode_to_utf8(c, c_as_utf8); + c_as_utf8_len = pg_utf_mblen(c_as_utf8); + c_as_utf8[c_as_utf8_len] = '\0'; + + /* Convert, or throw error if we can't */ + FunctionCall5(Utf8ToServerConvProc, + Int32GetDatum(PG_UTF8), + Int32GetDatum(server_encoding), + CStringGetDatum(c_as_utf8), + CStringGetDatum(s), + Int32GetDatum(c_as_utf8_len)); +} + /* convert a multibyte string to a wchar */ int |
