diff options
author | Lorry Tar Creator <lorry-tar-importer@lorry> | 2017-08-05 16:22:51 +0000 |
---|---|---|
committer | Lorry Tar Creator <lorry-tar-importer@lorry> | 2017-08-05 16:22:51 +0000 |
commit | cf46733632c7279a9fd0fe6ce26f9185a4ae82a9 (patch) | |
tree | da27775a2161723ef342e91af41a8b51fedef405 /subversion/libsvn_subr/utf8proc.c | |
parent | bb0ef45f7c46b0ae221b26265ef98a768c33f820 (diff) | |
download | subversion-tarball-master.tar.gz |
subversion-1.9.7HEADsubversion-1.9.7master
Diffstat (limited to 'subversion/libsvn_subr/utf8proc.c')
-rw-r--r-- | subversion/libsvn_subr/utf8proc.c | 530 |
1 files changed, 530 insertions, 0 deletions
diff --git a/subversion/libsvn_subr/utf8proc.c b/subversion/libsvn_subr/utf8proc.c new file mode 100644 index 0000000..1e705f5 --- /dev/null +++ b/subversion/libsvn_subr/utf8proc.c @@ -0,0 +1,530 @@ +/* + * utf8proc.c: Wrappers for the utf8proc library + * + * ==================================================================== + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * ==================================================================== + */ + + + +#include <apr_fnmatch.h> + +#include "private/svn_string_private.h" +#include "private/svn_utf_private.h" +#include "svn_private_config.h" + +#define UTF8PROC_INLINE +/* Somehow utf8proc thinks it is nice to use strlen as an argument name, + while this function is already defined via apr.h */ +#define strlen svn__strlen_var +#include "utf8proc/utf8proc.c" +#undef strlen + + + +const char * +svn_utf__utf8proc_compiled_version(void) +{ + static const char utf8proc_version[] = + APR_STRINGIFY(UTF8PROC_VERSION_MAJOR) "." + APR_STRINGIFY(UTF8PROC_VERSION_MINOR) "." + APR_STRINGIFY(UTF8PROC_VERSION_PATCH); + return utf8proc_version; +} + +const char * +svn_utf__utf8proc_runtime_version(void) +{ + /* Unused static function warning removal hack. */ + SVN_UNUSED(utf8proc_NFD); + SVN_UNUSED(utf8proc_NFC); + SVN_UNUSED(utf8proc_NFKD); + SVN_UNUSED(utf8proc_NFKC); + + return utf8proc_version(); +} + + + +/* Fill the given BUFFER with decomposed UCS-4 representation of the + * UTF-8 STRING. If LENGTH is SVN_UTF__UNKNOWN_LENGTH, assume STRING + * is NUL-terminated; otherwise look only at the first LENGTH bytes in + * STRING. Upon return, BUFFER->data points at an array of UCS-4 + * characters, and return the length of the array. TRANSFORM_FLAGS + * define exactly how the decomposition is performed. + * + * A negative return value is an utf8proc error code and may indicate + * that STRING contains invalid UTF-8 or was so long that an overflow + * occurred. + */ +static ssize_t +unicode_decomposition(int transform_flags, + const char *string, apr_size_t length, + svn_membuf_t *buffer) +{ + const int nullterm = (length == SVN_UTF__UNKNOWN_LENGTH + ? UTF8PROC_NULLTERM : 0); + + for (;;) + { + apr_int32_t *const ucs4buf = buffer->data; + const ssize_t ucs4len = buffer->size / sizeof(*ucs4buf); + const ssize_t result = + utf8proc_decompose((const void*) string, length, ucs4buf, ucs4len, + UTF8PROC_DECOMPOSE | UTF8PROC_STABLE + | transform_flags | nullterm); + + if (result < 0 || result <= ucs4len) + return result; + + /* Increase the decomposition buffer size and retry */ + svn_membuf__ensure(buffer, result * sizeof(*ucs4buf)); + } +} + +/* Fill the given BUFFER with an NFD UCS-4 representation of the UTF-8 + * STRING. If LENGTH is SVN_UTF__UNKNOWN_LENGTH, assume STRING is + * NUL-terminated; otherwise look only at the first LENGTH bytes in + * STRING. Upon return, BUFFER->data points at an array of UCS-4 + * characters and *RESULT_LENGTH contains the length of the array. + * + * A returned error may indicate that STRING contains invalid UTF-8 or + * invalid Unicode codepoints. Any error message comes from utf8proc. + */ +static svn_error_t * +decompose_normalized(apr_size_t *result_length, + const char *string, apr_size_t length, + svn_membuf_t *buffer) +{ + ssize_t result = unicode_decomposition(0, string, length, buffer); + if (result < 0) + return svn_error_create(SVN_ERR_UTF8PROC_ERROR, NULL, + gettext(utf8proc_errmsg(result))); + *result_length = result; + return SVN_NO_ERROR; +} + +/* Fill the given BUFFER with an NFC UTF-8 representation of the UTF-8 + * STRING. If LENGTH is SVN_UTF__UNKNOWN_LENGTH, assume STRING is + * NUL-terminated; otherwise look only at the first LENGTH bytes in + * STRING. Upon return, BUFFER->data points at a NUL-terminated string + * of UTF-8 characters. + * + * A returned error may indicate that STRING contains invalid UTF-8 or + * invalid Unicode codepoints. Any error message comes from utf8proc. + */ +static svn_error_t * +normalize_cstring(apr_size_t *result_length, + const char *string, apr_size_t length, + svn_membuf_t *buffer) +{ + ssize_t result = unicode_decomposition(0, string, length, buffer); + if (result >= 0) + { + svn_membuf__resize(buffer, result * sizeof(apr_int32_t) + 1); + result = utf8proc_reencode(buffer->data, result, + UTF8PROC_COMPOSE | UTF8PROC_STABLE); + } + if (result < 0) + return svn_error_create(SVN_ERR_UTF8PROC_ERROR, NULL, + gettext(utf8proc_errmsg(result))); + *result_length = result; + return SVN_NO_ERROR; +} + +/* Compare two arrays of UCS-4 codes, BUFA of length LENA and BUFB of + * length LENB. Return 0 if they're equal, a negative value if BUFA is + * less than BUFB, otherwise a positive value. + * + * Yes, this is strcmp for known-length UCS-4 strings. + */ +static int +ucs4cmp(const apr_int32_t *bufa, apr_size_t lena, + const apr_int32_t *bufb, apr_size_t lenb) +{ + const apr_size_t len = (lena < lenb ? lena : lenb); + apr_size_t i; + + for (i = 0; i < len; ++i) + { + const int diff = bufa[i] - bufb[i]; + if (diff) + return diff; + } + return (lena == lenb ? 0 : (lena < lenb ? -1 : 1)); +} + +svn_error_t * +svn_utf__normcmp(int *result, + const char *str1, apr_size_t len1, + const char *str2, apr_size_t len2, + svn_membuf_t *buf1, svn_membuf_t *buf2) +{ + apr_size_t buflen1; + apr_size_t buflen2; + + /* Shortcut-circuit the decision if at least one of the strings is empty. */ + const svn_boolean_t empty1 = + (0 == len1 || (len1 == SVN_UTF__UNKNOWN_LENGTH && !*str1)); + const svn_boolean_t empty2 = + (0 == len2 || (len2 == SVN_UTF__UNKNOWN_LENGTH && !*str2)); + if (empty1 || empty2) + { + *result = (empty1 == empty2 ? 0 : (empty1 ? -1 : 1)); + return SVN_NO_ERROR; + } + + SVN_ERR(decompose_normalized(&buflen1, str1, len1, buf1)); + SVN_ERR(decompose_normalized(&buflen2, str2, len2, buf2)); + *result = ucs4cmp(buf1->data, buflen1, buf2->data, buflen2); + return SVN_NO_ERROR; +} + +svn_error_t* +svn_utf__normalize(const char **result, + const char *str, apr_size_t len, + svn_membuf_t *buf) +{ + apr_size_t result_length; + SVN_ERR(normalize_cstring(&result_length, str, len, buf)); + *result = (const char*)(buf->data); + return SVN_NO_ERROR; +} + +/* Decode a single UCS-4 code point to UTF-8, appending the result to BUFFER. + * Assume BUFFER is already filled to *LENGTH and return the new size there. + * This function does *not* nul-terminate the stringbuf! + * + * A returned error indicates that the codepoint is invalid. + */ +static svn_error_t * +encode_ucs4(svn_membuf_t *buffer, apr_int32_t ucs4chr, apr_size_t *length) +{ + apr_size_t utf8len; + + if (buffer->size - *length < 4) + svn_membuf__resize(buffer, buffer->size + 4); + + utf8len = utf8proc_encode_char(ucs4chr, ((uint8_t*)buffer->data + *length)); + if (!utf8len) + return svn_error_createf(SVN_ERR_UTF8PROC_ERROR, NULL, + _("Invalid Unicode character U+%04lX"), + (long)ucs4chr); + *length += utf8len; + return SVN_NO_ERROR; +} + +svn_error_t * +svn_utf__encode_ucs4_string(svn_membuf_t *buffer, + const apr_int32_t *ucs4str, + apr_size_t length, + apr_size_t *result_length) +{ + *result_length = 0; + while (length-- > 0) + SVN_ERR(encode_ucs4(buffer, *ucs4str++, result_length)); + svn_membuf__resize(buffer, *result_length + 1); + ((char*)buffer->data)[*result_length] = '\0'; + return SVN_NO_ERROR; +} + + +svn_error_t * +svn_utf__glob(svn_boolean_t *match, + const char *pattern, apr_size_t pattern_len, + const char *string, apr_size_t string_len, + const char *escape, apr_size_t escape_len, + svn_boolean_t sql_like, + svn_membuf_t *pattern_buf, + svn_membuf_t *string_buf, + svn_membuf_t *temp_buf) +{ + apr_size_t patternbuf_len; + apr_size_t tempbuf_len; + + /* If we're in GLOB mode, we don't do custom escape chars. */ + if (escape && !sql_like) + return svn_error_create(SVN_ERR_UTF8_GLOB, NULL, + _("Cannot use a custom escape token" + " in glob matching mode")); + + /* Convert the patern to NFD UTF-8. We can't use the UCS-4 result + because apr_fnmatch can't handle it.*/ + SVN_ERR(decompose_normalized(&tempbuf_len, pattern, pattern_len, temp_buf)); + if (!sql_like) + SVN_ERR(svn_utf__encode_ucs4_string(pattern_buf, temp_buf->data, + tempbuf_len, &patternbuf_len)); + else + { + /* Convert a LIKE pattern to a GLOB pattern that apr_fnmatch can use. */ + const apr_int32_t *like = temp_buf->data; + apr_int32_t ucs4esc; + svn_boolean_t escaped; + apr_size_t i; + + if (!escape) + ucs4esc = -1; /* Definitely an invalid UCS-4 character. */ + else + { + const int nullterm = (escape_len == SVN_UTF__UNKNOWN_LENGTH + ? UTF8PROC_NULLTERM : 0); + ssize_t result = + utf8proc_decompose((const void*) escape, escape_len, &ucs4esc, 1, + UTF8PROC_DECOMPOSE | UTF8PROC_STABLE | nullterm); + if (result < 0) + return svn_error_create(SVN_ERR_UTF8PROC_ERROR, NULL, + gettext(utf8proc_errmsg(result))); + if (result == 0 || result > 1) + return svn_error_create(SVN_ERR_UTF8_GLOB, NULL, + _("Escape token must be one character")); + if ((ucs4esc & 0xFF) != ucs4esc) + return svn_error_createf(SVN_ERR_UTF8_GLOB, NULL, + _("Invalid escape character U+%04lX"), + (long)ucs4esc); + } + + patternbuf_len = 0; + svn_membuf__ensure(pattern_buf, tempbuf_len + 1); + for (i = 0, escaped = FALSE; i < tempbuf_len; ++i, ++like) + { + if (*like == ucs4esc && !escaped) + { + svn_membuf__resize(pattern_buf, patternbuf_len + 1); + ((char*)pattern_buf->data)[patternbuf_len++] = '\\'; + escaped = TRUE; + } + else if (escaped) + { + SVN_ERR(encode_ucs4(pattern_buf, *like, &patternbuf_len)); + escaped = FALSE; + } + else + { + if ((*like == '[' || *like == '\\') && !escaped) + { + /* Escape brackets and backslashes which are always + literals in LIKE patterns. */ + svn_membuf__resize(pattern_buf, patternbuf_len + 1); + ((char*)pattern_buf->data)[patternbuf_len++] = '\\'; + escaped = TRUE; + --i; --like; + continue; + } + + /* Replace LIKE wildcards with their GLOB equivalents. */ + if (*like == '%' || *like == '_') + { + const char wildcard = (*like == '%' ? '*' : '?'); + svn_membuf__resize(pattern_buf, patternbuf_len + 1); + ((char*)pattern_buf->data)[patternbuf_len++] = wildcard; + } + else + SVN_ERR(encode_ucs4(pattern_buf, *like, &patternbuf_len)); + } + } + svn_membuf__resize(pattern_buf, patternbuf_len + 1); + ((char*)pattern_buf->data)[patternbuf_len] = '\0'; + } + + /* Now normalize the string */ + SVN_ERR(decompose_normalized(&tempbuf_len, string, string_len, temp_buf)); + SVN_ERR(svn_utf__encode_ucs4_string(string_buf, temp_buf->data, + tempbuf_len, &tempbuf_len)); + + *match = !apr_fnmatch(pattern_buf->data, string_buf->data, 0); + return SVN_NO_ERROR; +} + +svn_boolean_t +svn_utf__is_normalized(const char *string, apr_pool_t *scratch_pool) +{ + svn_error_t *err; + svn_membuf_t buffer; + apr_size_t result_length; + const apr_size_t length = strlen(string); + svn_membuf__create(&buffer, length * sizeof(apr_int32_t), scratch_pool); + err = normalize_cstring(&result_length, string, length, &buffer); + if (err) + { + svn_error_clear(err); + return FALSE; + } + return (length == result_length && 0 == strcmp(string, buffer.data)); +} + +const char * +svn_utf__fuzzy_escape(const char *src, apr_size_t length, apr_pool_t *pool) +{ + /* Hexadecimal digits for code conversion. */ + static const char digits[] = "0123456789ABCDEF"; + + /* Flags used for Unicode decomposition. */ + static const int decomp_flags = ( + UTF8PROC_COMPAT | UTF8PROC_STABLE | UTF8PROC_LUMP + | UTF8PROC_NLF2LF | UTF8PROC_STRIPCC | UTF8PROC_STRIPMARK); + + svn_stringbuf_t *result; + svn_membuf_t buffer; + ssize_t decomp_length; + ssize_t len; + + /* Decompose to a non-reversible compatibility format. */ + svn_membuf__create(&buffer, length * sizeof(apr_int32_t), pool); + decomp_length = unicode_decomposition(decomp_flags, src, length, &buffer); + if (decomp_length < 0) + { + svn_membuf_t part; + apr_size_t done, prev; + + /* The only other error we can receive here indicates an integer + overflow due to the length of the input string. Not very + likely, but we certainly shouldn't continue in that case. */ + SVN_ERR_ASSERT_NO_RETURN(decomp_length == UTF8PROC_ERROR_INVALIDUTF8); + + /* Break the decomposition into parts that are valid UTF-8, and + bytes that are not. Represent the invalid bytes in the target + erray by their negative value. This works because utf8proc + will not generate Unicode code points with values larger than + U+10FFFF. */ + svn_membuf__create(&part, sizeof(apr_int32_t), pool); + decomp_length = 0; + done = prev = 0; + while (done < length) + { + apr_int32_t uc; + + while (done < length) + { + len = utf8proc_iterate((uint8_t*)src + done, length - done, &uc); + if (len < 0) + break; + done += len; + } + + /* Decompose the valid part */ + if (done > prev) + { + len = unicode_decomposition( + decomp_flags, src + prev, done - prev, &part); + SVN_ERR_ASSERT_NO_RETURN(len > 0); + svn_membuf__resize( + &buffer, (decomp_length + len) * sizeof(apr_int32_t)); + memcpy((apr_int32_t*)buffer.data + decomp_length, + part.data, len * sizeof(apr_int32_t)); + decomp_length += len; + prev = done; + } + + /* What follows could be a valid UTF-8 sequence, but not + a valid Unicode character. */ + if (done < length) + { + const char *last; + + /* Determine the length of the UTF-8 sequence */ + const char *const p = src + done; + len = utf8proc_utf8class[(uint8_t)*p]; + + /* Check if the multi-byte sequence is valid UTF-8. */ + if (len > 1 && len <= (apr_ssize_t)(length - done)) + last = svn_utf__last_valid(p, len); + else + last = NULL; + + /* Might not be a valid UTF-8 sequence at all */ + if (!last || (last && last - p < len)) + { + uc = -((apr_int32_t)(*p & 0xff)); + len = 1; + } + else + { + switch (len) + { + /* Decode the UTF-8 sequence without validation. */ + case 2: + uc = ((p[0] & 0x1f) << 6) + (p[1] & 0x3f); + break; + case 3: + uc = (((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + + (p[2] & 0x3f)); + break; + case 4: + uc = (((p[0] & 0x07) << 18) + ((p[1] & 0x3f) << 12) + + ((p[2] & 0x3f) << 6) + (p[3] & 0x3f)); + break; + default: + SVN_ERR_ASSERT_NO_RETURN( + !"Unexpected invalid UTF-8 byte"); + } + + } + + svn_membuf__resize( + &buffer, (decomp_length + 1) * sizeof(apr_int32_t)); + ((apr_int32_t*)buffer.data)[decomp_length++] = uc; + done += len; + prev = done; + } + } + } + + /* Scan the result and deleting any combining diacriticals and + inserting placeholders where any non-ascii characters remain. */ + result = svn_stringbuf_create_ensure(decomp_length, pool); + for (len = 0; len < decomp_length; ++len) + { + const apr_int32_t cp = ((apr_int32_t*)buffer.data)[len]; + if (cp > 0 && cp < 127) + svn_stringbuf_appendbyte(result, (char)cp); + else if (cp == 0) + svn_stringbuf_appendcstr(result, "\\0"); + else if (cp < 0) + { + const apr_int32_t rcp = ((-cp) & 0xff); + svn_stringbuf_appendcstr(result, "?\\"); + svn_stringbuf_appendbyte(result, digits[(rcp & 0x00f0) >> 4]); + svn_stringbuf_appendbyte(result, digits[(rcp & 0x000f)]); + } + else + { + if (utf8proc_codepoint_valid(cp)) + { + const utf8proc_property_t *prop = utf8proc_get_property(cp); + if (prop->combining_class != 0) + continue; /* Combining mark; ignore */ + svn_stringbuf_appendcstr(result, "{U+"); + } + else + svn_stringbuf_appendcstr(result, "{U?"); + if (cp > 0xffff) + { + svn_stringbuf_appendbyte(result, digits[(cp & 0xf00000) >> 20]); + svn_stringbuf_appendbyte(result, digits[(cp & 0x0f0000) >> 16]); + } + svn_stringbuf_appendbyte(result, digits[(cp & 0xf000) >> 12]); + svn_stringbuf_appendbyte(result, digits[(cp & 0x0f00) >> 8]); + svn_stringbuf_appendbyte(result, digits[(cp & 0x00f0) >> 4]); + svn_stringbuf_appendbyte(result, digits[(cp & 0x000f)]); + svn_stringbuf_appendbyte(result, '}'); + } + } + + return result->data; +} |