diff options
author | Lorry Tar Creator <lorry-tar-importer@lorry> | 2017-08-05 16:22:51 +0000 |
---|---|---|
committer | Lorry Tar Creator <lorry-tar-importer@lorry> | 2017-08-05 16:22:51 +0000 |
commit | cf46733632c7279a9fd0fe6ce26f9185a4ae82a9 (patch) | |
tree | da27775a2161723ef342e91af41a8b51fedef405 /subversion/libsvn_subr/utf.c | |
parent | bb0ef45f7c46b0ae221b26265ef98a768c33f820 (diff) | |
download | subversion-tarball-master.tar.gz |
subversion-1.9.7 HEAD subversion-1.9.7 master
Diffstat (limited to 'subversion/libsvn_subr/utf.c')
-rw-r--r-- | subversion/libsvn_subr/utf.c | 338 |
1 files changed, 255 insertions, 83 deletions
diff --git a/subversion/libsvn_subr/utf.c b/subversion/libsvn_subr/utf.c index 4f9102d..7d20d24 100644 --- a/subversion/libsvn_subr/utf.c +++ b/subversion/libsvn_subr/utf.c @@ -59,6 +59,12 @@ static const char *SVN_APR_UTF8_CHARSET = "UTF-8"; static svn_mutex__t *xlate_handle_mutex = NULL; static svn_boolean_t assume_native_charset_is_utf8 = FALSE; +#if defined(WIN32) +typedef svn_subr__win32_xlate_t xlate_handle_t; +#else +typedef apr_xlate_t xlate_handle_t; +#endif + /* The xlate handle cache is a global hash table with linked lists of xlate * handles. In multi-threaded environments, a thread "borrows" an xlate * handle from the cache during a translation and puts it back afterwards. @@ -69,7 +75,7 @@ static svn_boolean_t assume_native_charset_is_utf8 = FALSE; * is the number of simultanous handles in use for that key. */ typedef struct xlate_handle_node_t { - apr_xlate_t *handle; + xlate_handle_t *handle; /* FALSE if the handle is not valid, since its pool is being destroyed. */ svn_boolean_t valid; @@ -172,7 +178,7 @@ get_xlate_key(const char *topage, topage = "APR_DEFAULT_CHARSET"; return apr_pstrcat(pool, "svn-utf-", frompage, "to", topage, - "-xlate-handle", (char *)NULL); + "-xlate-handle", SVN_VA_NULL); } /* Atomically replace the content in *MEM with NEW_VALUE and return @@ -184,16 +190,10 @@ static APR_INLINE void* atomic_swap(void * volatile * mem, void *new_value) { #if APR_HAS_THREADS -#if APR_VERSION_AT_LEAST(1,3,0) /* Cast is necessary because of APR bug: https://issues.apache.org/bugzilla/show_bug.cgi?id=50731 */ return apr_atomic_xchgptr((volatile void **)mem, new_value); #else - /* old APRs don't support atomic swaps. Simply return the - * input to the caller for further proccessing. */ - return new_value; -#endif -#else /* no threads - no sync. 
necessary */ void *old_value = (void*)*mem; *mem = new_value; @@ -211,7 +211,7 @@ xlate_alloc_handle(xlate_handle_node_t **ret, apr_pool_t *pool) { apr_status_t apr_err; - apr_xlate_t *handle; + xlate_handle_t *handle; const char *name; /* The error handling doesn't support the following cases, since we don't @@ -223,7 +223,7 @@ xlate_alloc_handle(xlate_handle_node_t **ret, /* Try to create a handle. */ #if defined(WIN32) - apr_err = svn_subr__win32_xlate_open((win32_xlate_t **)&handle, topage, + apr_err = svn_subr__win32_xlate_open(&handle, topage, frompage, pool); name = "win32-xlate: "; #else @@ -257,7 +257,7 @@ xlate_alloc_handle(xlate_handle_node_t **ret, later. APR_STRERR will be in the local encoding, not in UTF-8, though. */ svn_strerror(apr_err, apr_strerr, sizeof(apr_strerr)); - return svn_error_createf(SVN_ERR_PLUGIN_LOAD_FAILURE, + return svn_error_createf(SVN_ERR_PLUGIN_LOAD_FAILURE, svn_error_create(apr_err, NULL, apr_strerr), "%s%s", name, errstr); } @@ -480,58 +480,6 @@ get_uton_xlate_handle_node(xlate_handle_node_t **ret, apr_pool_t *pool) } -/* Copy LEN bytes of SRC, converting non-ASCII and zero bytes to ?\nnn - sequences, allocating the result in POOL. */ -static const char * -fuzzy_escape(const char *src, apr_size_t len, apr_pool_t *pool) -{ - const char *src_orig = src, *src_end = src + len; - apr_size_t new_len = 0; - char *new; - const char *new_orig; - - /* First count how big a dest string we'll need. */ - while (src < src_end) - { - if (! svn_ctype_isascii(*src) || *src == '\0') - new_len += 5; /* 5 slots, for "?\XXX" */ - else - new_len += 1; /* one slot for the 7-bit char */ - - src++; - } - - /* Allocate that amount, plus one slot for '\0' character. */ - new = apr_palloc(pool, new_len + 1); - - new_orig = new; - - /* And fill it up. */ - while (src_orig < src_end) - { - if (! 
svn_ctype_isascii(*src_orig) || src_orig == '\0') - { - /* This is the same format as svn_xml_fuzzy_escape uses, but that - function escapes different characters. Please keep in sync! - ### If we add another fuzzy escape somewhere, we should abstract - ### this out to a common function. */ - apr_snprintf(new, 6, "?\\%03u", (unsigned char) *src_orig); - new += 5; - } - else - { - *new = *src_orig; - new += 1; - } - - src_orig++; - } - - *new = '\0'; - - return new_orig; -} - /* Convert SRC_LENGTH bytes of SRC_DATA in NODE->handle, store the result in *DEST, which is allocated in POOL. */ static svn_error_t * @@ -544,9 +492,8 @@ convert_to_stringbuf(xlate_handle_node_t *node, #ifdef WIN32 apr_status_t apr_err; - apr_err = svn_subr__win32_xlate_to_stringbuf((win32_xlate_t *) node->handle, - src_data, src_length, - dest, pool); + apr_err = svn_subr__win32_xlate_to_stringbuf(node->handle, src_data, + src_length, dest, pool); #else apr_size_t buflen = src_length * 2; apr_status_t apr_err; @@ -609,8 +556,8 @@ convert_to_stringbuf(xlate_handle_node_t *node, (pool, _("Can't convert string from '%s' to '%s':"), node->frompage, node->topage); - err = svn_error_create(apr_err, NULL, fuzzy_escape(src_data, - src_length, pool)); + err = svn_error_create( + apr_err, NULL, svn_utf__fuzzy_escape(src_data, src_length, pool)); return svn_error_create(apr_err, err, errstr); } /* Else, exited due to success. 
Trim the result buffer down to the @@ -691,7 +638,7 @@ invalid_utf8(const char *data, apr_size_t len, apr_pool_t *pool) valid_txt = apr_pstrcat(pool, valid_txt, apr_psprintf(pool, " %02x", (unsigned char)last[i-valid]), - (char *)NULL); + SVN_VA_NULL); /* 4 invalid octets will guarantee that the faulty octet is displayed */ invalid = data + len - last; @@ -701,7 +648,7 @@ invalid_utf8(const char *data, apr_size_t len, apr_pool_t *pool) invalid_txt = apr_pstrcat(pool, invalid_txt, apr_psprintf(pool, " %02x", (unsigned char)last[i]), - (char *)NULL); + SVN_VA_NULL); return svn_error_createf(APR_EINVAL, NULL, _("Valid UTF-8 data\n(hex:%s)\n" @@ -986,18 +933,6 @@ svn_utf_cstring_from_utf8_ex2(const char **dest, return err; } - -svn_error_t * -svn_utf_cstring_from_utf8_ex(const char **dest, - const char *src, - const char *topage, - const char *convset_key, - apr_pool_t *pool) -{ - return svn_utf_cstring_from_utf8_ex2(dest, src, topage, pool); -} - - const char * svn_utf__cstring_from_utf8_fuzzy(const char *src, apr_pool_t *pool, @@ -1007,7 +942,7 @@ svn_utf__cstring_from_utf8_fuzzy(const char *src, const char *escaped, *converted; svn_error_t *err; - escaped = fuzzy_escape(src, strlen(src), pool); + escaped = svn_utf__fuzzy_escape(src, strlen(src), pool); /* Okay, now we have a *new* UTF-8 string, one that's guaranteed to contain only 7-bit bytes :-). Recode to native... */ @@ -1084,3 +1019,240 @@ svn_utf_cstring_from_utf8_string(const char **dest, return err; } + + +/* Insert the given UCS-4 VALUE into BUF at the given OFFSET. */ +static void +membuf_insert_ucs4(svn_membuf_t *buf, apr_size_t offset, apr_int32_t value) +{ + svn_membuf__resize(buf, (offset + 1) * sizeof(value)); + ((apr_int32_t*)buf->data)[offset] = value; +} + +/* TODO: Use compiler intrinsics for byte swaps. 
*/ +#define SWAP_SHORT(x) ((((x) & 0xff) << 8) | (((x) >> 8) & 0xff)) +#define SWAP_LONG(x) ((((x) & 0xff) << 24) | (((x) & 0xff00) << 8) \ + | (((x) >> 8) & 0xff00) | (((x) >> 24) & 0xff)) + +#define IS_UTF16_LEAD_SURROGATE(c) ((c) >= 0xd800 && (c) <= 0xdbff) +#define IS_UTF16_TRAIL_SURROGATE(c) ((c) >= 0xdc00 && (c) <= 0xdfff) + +svn_error_t * +svn_utf__utf16_to_utf8(const svn_string_t **result, + const apr_uint16_t *utf16str, + apr_size_t utf16len, + svn_boolean_t big_endian, + apr_pool_t *result_pool, + apr_pool_t *scratch_pool) +{ + static const apr_uint16_t endiancheck = 0xa55a; + const svn_boolean_t arch_big_endian = + (((const char*)&endiancheck)[sizeof(endiancheck) - 1] == '\x5a'); + const svn_boolean_t swap_order = (!big_endian != !arch_big_endian); + + apr_uint16_t lead_surrogate; + apr_size_t length; + apr_size_t offset; + svn_membuf_t ucs4buf; + svn_membuf_t resultbuf; + svn_string_t *res; + + if (utf16len == SVN_UTF__UNKNOWN_LENGTH) + { + const apr_uint16_t *endp = utf16str; + while (*endp++) + ; + utf16len = (endp - utf16str); + } + + svn_membuf__create(&ucs4buf, utf16len * sizeof(apr_int32_t), scratch_pool); + + for (lead_surrogate = 0, length = 0, offset = 0; + offset < utf16len; ++offset) + { + const apr_uint16_t code = + (swap_order ? SWAP_SHORT(utf16str[offset]) : utf16str[offset]); + + if (lead_surrogate) + { + if (IS_UTF16_TRAIL_SURROGATE(code)) + { + /* Combine the lead and trail surrogates into a 32-bit code. */ + membuf_insert_ucs4(&ucs4buf, length++, + (0x010000 + + (((lead_surrogate & 0x03ff) << 10) + | (code & 0x03ff)))); + lead_surrogate = 0; + continue; + } + else + { + /* If we didn't find a surrogate pair, just dump the + lead surrogate into the stream. */ + membuf_insert_ucs4(&ucs4buf, length++, lead_surrogate); + lead_surrogate = 0; + } + } + + if ((offset + 1) < utf16len && IS_UTF16_LEAD_SURROGATE(code)) + { + /* Store a lead surrogate that is followed by at least one + code for the next iteration. 
*/ + lead_surrogate = code; + continue; + } + else + membuf_insert_ucs4(&ucs4buf, length++, code); + } + + /* Convert the UCS-4 buffer to UTF-8, assuming an average of 2 bytes + per code point for encoding. The buffer will grow as + necessary. */ + svn_membuf__create(&resultbuf, length * 2, result_pool); + SVN_ERR(svn_utf__encode_ucs4_string( + &resultbuf, ucs4buf.data, length, &length)); + + res = apr_palloc(result_pool, sizeof(*res)); + res->data = resultbuf.data; + res->len = length; + *result = res; + return SVN_NO_ERROR; +} + + +svn_error_t * +svn_utf__utf32_to_utf8(const svn_string_t **result, + const apr_int32_t *utf32str, + apr_size_t utf32len, + svn_boolean_t big_endian, + apr_pool_t *result_pool, + apr_pool_t *scratch_pool) +{ + static const apr_int32_t endiancheck = 0xa5cbbc5a; + const svn_boolean_t arch_big_endian = + (((const char*)&endiancheck)[sizeof(endiancheck) - 1] == '\x5a'); + const svn_boolean_t swap_order = (!big_endian != !arch_big_endian); + + apr_size_t length; + svn_membuf_t resultbuf; + svn_string_t *res; + + if (utf32len == SVN_UTF__UNKNOWN_LENGTH) + { + const apr_int32_t *endp = utf32str; + while (*endp++) + ; + utf32len = (endp - utf32str); + } + + if (swap_order) + { + apr_size_t offset; + svn_membuf_t ucs4buf; + + svn_membuf__create(&ucs4buf, utf32len * sizeof(apr_int32_t), + scratch_pool); + + for (offset = 0; offset < utf32len; ++offset) + { + const apr_int32_t code = SWAP_LONG(utf32str[offset]); + membuf_insert_ucs4(&ucs4buf, offset, code); + } + utf32str = ucs4buf.data; + } + + /* Convert the UCS-4 buffer to UTF-8, assuming an average of 2 bytes + per code point for encoding. The buffer will grow as + necessary. 
*/ + svn_membuf__create(&resultbuf, utf32len * 2, result_pool); + SVN_ERR(svn_utf__encode_ucs4_string( + &resultbuf, utf32str, utf32len, &length)); + + res = apr_palloc(result_pool, sizeof(*res)); + res->data = resultbuf.data; + res->len = length; + *result = res; + return SVN_NO_ERROR; +} + + +#ifdef WIN32 + + +svn_error_t * +svn_utf__win32_utf8_to_utf16(const WCHAR **result, + const char *src, + const WCHAR *prefix, + apr_pool_t *result_pool) +{ + const int utf8_count = strlen(src); + const int prefix_len = (prefix ? lstrlenW(prefix) : 0); + WCHAR *wide_str; + int wide_count; + + if (0 == prefix_len + utf8_count) + { + *result = L""; + return SVN_NO_ERROR; + } + + wide_count = MultiByteToWideChar(CP_UTF8, 0, src, utf8_count, NULL, 0); + if (wide_count == 0) + return svn_error_wrap_apr(apr_get_os_error(), + _("Conversion to UTF-16 failed")); + + wide_str = apr_palloc(result_pool, + (prefix_len + wide_count + 1) * sizeof(*wide_str)); + if (prefix_len) + memcpy(wide_str, prefix, prefix_len * sizeof(*wide_str)); + if (0 == MultiByteToWideChar(CP_UTF8, 0, src, utf8_count, + wide_str + prefix_len, wide_count)) + return svn_error_wrap_apr(apr_get_os_error(), + _("Conversion to UTF-16 failed")); + + wide_str[prefix_len + wide_count] = 0; + *result = wide_str; + + return SVN_NO_ERROR; +} + +svn_error_t * +svn_utf__win32_utf16_to_utf8(const char **result, + const WCHAR *src, + const char *prefix, + apr_pool_t *result_pool) +{ + const int wide_count = lstrlenW(src); + const int prefix_len = (prefix ? 
strlen(prefix) : 0); + char *utf8_str; + int utf8_count; + + if (0 == prefix_len + wide_count) + { + *result = ""; + return SVN_NO_ERROR; + } + + utf8_count = WideCharToMultiByte(CP_UTF8, 0, src, wide_count, + NULL, 0, NULL, FALSE); + if (utf8_count == 0) + return svn_error_wrap_apr(apr_get_os_error(), + _("Conversion from UTF-16 failed")); + + utf8_str = apr_palloc(result_pool, + (prefix_len + utf8_count + 1) * sizeof(*utf8_str)); + if (prefix_len) + memcpy(utf8_str, prefix, prefix_len * sizeof(*utf8_str)); + if (0 == WideCharToMultiByte(CP_UTF8, 0, src, wide_count, + utf8_str + prefix_len, utf8_count, + NULL, FALSE)) + return svn_error_wrap_apr(apr_get_os_error(), + _("Conversion from UTF-16 failed")); + + utf8_str[prefix_len + utf8_count] = 0; + *result = utf8_str; + + return SVN_NO_ERROR; +} + +#endif /* WIN32 */ |