summaryrefslogtreecommitdiff
path: root/ext/mbstring/php_unicode.c
diff options
context:
space:
mode:
authorNikita Popov <nikita.ppv@gmail.com>2017-07-20 15:33:24 +0200
committerNikita Popov <nikita.ppv@gmail.com>2017-07-20 15:33:24 +0200
commitc098304e172cdabf8c0c685e5e843fff772615ce (patch)
treedb6b3bd5153f267281ae2b4dddeefce23fec05f2 /ext/mbstring/php_unicode.c
parent17da862b51caf0875d47e87040336cf3fcd429d1 (diff)
downloadphp-git-c098304e172cdabf8c0c685e5e843fff772615ce.tar.gz
Reduce number of encoding conversions in case conversion
Don't indirect through UCS4BE, instead directly work on wchars using a custom filter. This replaces the pipeline utf8 -> wchar -> ucs4be -> wchar -case-> wchar -> ucs4be -> wchar -> utf8 with utf8 -> wchar -case-> -> wchar -> utf8
Diffstat (limited to 'ext/mbstring/php_unicode.c')
-rw-r--r--ext/mbstring/php_unicode.c133
1 files changed, 85 insertions, 48 deletions
diff --git a/ext/mbstring/php_unicode.c b/ext/mbstring/php_unicode.c
index 47607ac331..f6f0e3f0e0 100644
--- a/ext/mbstring/php_unicode.c
+++ b/ext/mbstring/php_unicode.c
@@ -257,68 +257,105 @@ MBSTRING_API unsigned long php_unicode_totitle(unsigned long code, enum mbfl_no_
((unsigned char*)(ptr))[3] = (v ) & 0xff;\
}
-MBSTRING_API char *php_unicode_convert_case(
- int case_mode, const char *srcstr, size_t srclen, size_t *ret_len,
- const mbfl_encoding *src_encoding)
+struct convert_case_data {
+ mbfl_convert_filter *next_filter;
+ enum mbfl_no_encoding no_encoding;
+ int case_mode;
+ int title_mode;
+};
+
+static int convert_case_filter(int c, void *void_data)
{
- char *unicode, *newstr;
- size_t unicode_len;
- unsigned char *unicode_ptr;
- size_t i;
- enum mbfl_no_encoding src_no_encoding = src_encoding->no_encoding;
-
- unicode = php_mb_convert_encoding_ex(srcstr, srclen, &mbfl_encoding_ucs4be, src_encoding, &unicode_len);
- if (unicode == NULL)
- return NULL;
-
- unicode_ptr = (unsigned char *)unicode;
-
- switch(case_mode) {
+ struct convert_case_data *data = (struct convert_case_data *) void_data;
+ switch (data->case_mode) {
case PHP_UNICODE_CASE_UPPER:
- for (i = 0; i < unicode_len; i+=4) {
- UINT32_TO_BE_ARY(&unicode_ptr[i],
- php_unicode_toupper(BE_ARY_TO_UINT32(&unicode_ptr[i]), src_no_encoding));
- }
+ c = php_unicode_toupper(c, data->no_encoding);
break;
case PHP_UNICODE_CASE_LOWER:
- for (i = 0; i < unicode_len; i+=4) {
- UINT32_TO_BE_ARY(&unicode_ptr[i],
- php_unicode_tolower(BE_ARY_TO_UINT32(&unicode_ptr[i]), src_no_encoding));
- }
+ c = php_unicode_tolower(c, data->no_encoding);
break;
- case PHP_UNICODE_CASE_TITLE: {
- int mode = 0;
-
- for (i = 0; i < unicode_len; i+=4) {
- int res = php_unicode_is_prop(
- BE_ARY_TO_UINT32(&unicode_ptr[i]),
- UC_MN, UC_ME, UC_CF, UC_LM, UC_SK, UC_LU, UC_LL, UC_LT, UC_PO, UC_OS, -1);
- if (mode) {
- if (res) {
- UINT32_TO_BE_ARY(&unicode_ptr[i],
- php_unicode_tolower(BE_ARY_TO_UINT32(&unicode_ptr[i]), src_no_encoding));
- } else {
- mode = 0;
- }
+ case PHP_UNICODE_CASE_TITLE:
+ {
+ int res = php_unicode_is_prop(c,
+ UC_MN, UC_ME, UC_CF, UC_LM, UC_SK, UC_LU, UC_LL, UC_LT, UC_PO, UC_OS, -1);
+ if (data->title_mode) {
+ if (res) {
+ c = php_unicode_tolower(c, data->no_encoding);
} else {
- if (res) {
- mode = 1;
- UINT32_TO_BE_ARY(&unicode_ptr[i],
- php_unicode_totitle(BE_ARY_TO_UINT32(&unicode_ptr[i]), src_no_encoding));
- }
+ data->title_mode = 0;
+ }
+ } else {
+ if (res) {
+ data->title_mode = 1;
+ c = php_unicode_totitle(c, data->no_encoding);
}
}
- } break;
+ break;
+ }
+ }
+ return (*data->next_filter->filter_function)(c, data->next_filter);
+}
+
+MBSTRING_API char *php_unicode_convert_case(
+ int case_mode, const char *srcstr, size_t srclen, size_t *ret_len,
+ const mbfl_encoding *src_encoding)
+{
+ struct convert_case_data data;
+ mbfl_convert_filter *from_wchar, *to_wchar;
+ mbfl_string result, *result_ptr;
+
+ mbfl_memory_device device;
+ mbfl_memory_device_init(&device, srclen + 1, 0);
+
+ /* encoding -> wchar filter */
+ to_wchar = mbfl_convert_filter_new(src_encoding->no_encoding,
+ mbfl_no_encoding_wchar, convert_case_filter, NULL, &data);
+ if (to_wchar == NULL) {
+ mbfl_memory_device_clear(&device);
+ return NULL;
+ }
+
+ /* wchar -> encoding filter */
+ from_wchar = mbfl_convert_filter_new(
+ mbfl_no_encoding_wchar, src_encoding->no_encoding,
+ mbfl_memory_device_output, NULL, &device);
+ if (from_wchar == NULL) {
+ mbfl_convert_filter_delete(to_wchar);
+ mbfl_memory_device_clear(&device);
+ return NULL;
+ }
+ data.next_filter = from_wchar;
+ data.no_encoding = src_encoding->no_encoding;
+ data.case_mode = case_mode;
+ data.title_mode = 0;
+
+ {
+ /* feed data */
+ const unsigned char *p = (const unsigned char *) srcstr;
+ size_t n = srclen;
+ while (n > 0) {
+ if ((*to_wchar->filter_function)(*p++, to_wchar) < 0) {
+ break;
+ }
+ n--;
+ }
}
- newstr = php_mb_convert_encoding_ex(
- unicode, unicode_len, src_encoding, &mbfl_encoding_ucs4be, ret_len);
- efree(unicode);
+ mbfl_convert_filter_flush(to_wchar);
+ mbfl_convert_filter_flush(from_wchar);
+ result_ptr = mbfl_memory_device_result(&device, &result);
+ mbfl_convert_filter_delete(to_wchar);
+ mbfl_convert_filter_delete(from_wchar);
+
+ if (!result_ptr) {
+ return NULL;
+ }
- return newstr;
+ *ret_len = result.len;
+ return result.val;
}