diff options
Diffstat (limited to 'ext/intl/grapheme/grapheme_util.c')
-rw-r--r-- | ext/intl/grapheme/grapheme_util.c | 642 |
1 files changed, 642 insertions, 0 deletions
diff --git a/ext/intl/grapheme/grapheme_util.c b/ext/intl/grapheme/grapheme_util.c new file mode 100644 index 0000000..9200855 --- /dev/null +++ b/ext/intl/grapheme/grapheme_util.c @@ -0,0 +1,642 @@ +/* + +----------------------------------------------------------------------+ + | PHP Version 5 | + +----------------------------------------------------------------------+ + | This source file is subject to version 3.01 of the PHP license, | + | that is bundled with this package in the file LICENSE, and is | + | available through the world-wide-web at the following url: | + | http://www.php.net/license/3_01.txt | + | If you did not receive a copy of the PHP license and are unable to | + | obtain it through the world-wide-web, please send a note to | + | license@php.net so we can mail you a copy immediately. | + +----------------------------------------------------------------------+ + | Author: Ed Batutis <ed@batutis.com> | + +----------------------------------------------------------------------+ + */ + +/* {{{ includes */ +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <php.h> +#include "grapheme.h" +#include "grapheme_util.h" +#include "intl_common.h" + +#include <unicode/utypes.h> +#include <unicode/ucol.h> +#include <unicode/ustring.h> +#include <unicode/ubrk.h> + +#include "ext/standard/php_string.h" + +ZEND_EXTERN_MODULE_GLOBALS( intl ) + +/* }}} */ + +/* {{{ grapheme_close_global_iterator - clean up */ +void +grapheme_close_global_iterator( TSRMLS_D ) +{ + UBreakIterator *global_break_iterator = INTL_G( grapheme_iterator ); + + if ( NULL != global_break_iterator ) { + ubrk_close(global_break_iterator); + } +} +/* }}} */ + +/* {{{ grapheme_intl_case_fold: convert string to lowercase */ +void +grapheme_intl_case_fold(UChar** ptr_to_free, UChar **str, int32_t *str_len, UErrorCode *pstatus ) +{ + UChar *dest; + int32_t dest_len, size_required; + + /* allocate a destination string that is a bit larger than the src, hoping that is enough */ + dest_len = (*str_len) + ( *str_len / 10 ); + dest = (UChar*) eumalloc(dest_len); + + *pstatus = U_ZERO_ERROR; + size_required = u_strFoldCase(dest, dest_len, *str, *str_len, U_FOLD_CASE_DEFAULT, pstatus); + + dest_len = size_required; + + if ( U_BUFFER_OVERFLOW_ERROR == *pstatus ) { + + dest = (UChar*) eurealloc(dest, dest_len); + + *pstatus = U_ZERO_ERROR; + size_required = u_strFoldCase(dest, dest_len, *str, *str_len, U_FOLD_CASE_DEFAULT, pstatus); + } + + if ( U_FAILURE(*pstatus) ) { + return; + } + + if ( NULL != ptr_to_free) { + efree(*ptr_to_free); + *ptr_to_free = dest; + } + + *str = dest; + *str_len = dest_len; + + return; +} +/* }}} */ + +/* {{{ grapheme_substr_ascii f='from' - starting point, l='length' */ +void +grapheme_substr_ascii(char *str, int str_len, int f, int l, int argc, char **sub_str, int *sub_str_len) +{ + *sub_str = NULL; + + if (argc > 2) { + if ((l < 0 && -l > str_len)) { + return; + } else if (l > str_len) { + l = str_len; + } + } else { + l = str_len; + } + + if (f > str_len || (f < 0 && -f > str_len)) { + return; + } + + if (l < 0 && (l + str_len - f) < 0) { + return; + } + + /* if "from" position is negative, count start position from the end + * of the string + */ + if (f < 0) { + f = str_len + f; + if (f < 0) { + f = 0; + } + } + + + /* if "length" position is negative, set it to the length + * needed to stop that many chars from the end of the string + */ + if (l < 0) { + l = (str_len - f) + l; + if (l < 0) { + l = 0; + } + } + + if (f >= str_len) { + return; + } + + if ((f + l) > str_len) { + l = str_len - f; + } + + *sub_str = str + f; + *sub_str_len = l; + + return; +} +/* }}} */ + +/* {{{ grapheme_strrpos_utf16 - strrpos using utf16 */ +int +grapheme_strrpos_utf16(unsigned char *haystack, int32_t haystack_len, unsigned char*needle, int32_t needle_len, int32_t offset, int f_ignore_case TSRMLS_DC) +{ + UChar *uhaystack, *puhaystack, *uhaystack_end, *uneedle; + int32_t uhaystack_len, uneedle_len; + UErrorCode status; + unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE]; + UBreakIterator* bi = NULL; + int ret_pos, pos; + + /* convert the strings to UTF-16. */ + uhaystack = NULL; + uhaystack_len = 0; + status = U_ZERO_ERROR; + intl_convert_utf8_to_utf16(&uhaystack, &uhaystack_len, (char *) haystack, haystack_len, &status ); + + if ( U_FAILURE( status ) ) { + /* Set global error code. */ + intl_error_set_code( NULL, status TSRMLS_CC ); + + /* Set error messages. */ + intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC ); + if (uhaystack) { + efree( uhaystack ); + } + return -1; + } + + if ( f_ignore_case ) { + grapheme_intl_case_fold(&uhaystack, &uhaystack, &uhaystack_len, &status ); + } + + /* get a pointer to the haystack taking into account the offset */ + bi = NULL; + status = U_ZERO_ERROR; + bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status TSRMLS_CC ); + + puhaystack = grapheme_get_haystack_offset(bi, uhaystack, uhaystack_len, offset); + + if ( NULL == puhaystack ) { + intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC ); + if (uhaystack) { + efree( uhaystack ); + } + ubrk_close (bi); + return -1; + } + + uneedle = NULL; + uneedle_len = 0; + status = U_ZERO_ERROR; + intl_convert_utf8_to_utf16(&uneedle, &uneedle_len, (char *) needle, needle_len, &status ); + + if ( U_FAILURE( status ) ) { + /* Set global error code. */ + intl_error_set_code( NULL, status TSRMLS_CC ); + + /* Set error messages. */ + intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC ); + if (uhaystack) { + efree( uhaystack ); + } + if (uneedle) { + efree( uneedle ); + } + ubrk_close (bi); + return -1; + } + + if ( f_ignore_case ) { + grapheme_intl_case_fold(&uneedle, &uneedle, &uneedle_len, &status ); + } + + ret_pos = -1; /* -1 represents 'not found' */ + + /* back up until there's needle_len characters to compare */ + + uhaystack_end = uhaystack + uhaystack_len; + pos = ubrk_last(bi); + puhaystack = uhaystack + pos; + + while ( uhaystack_end - puhaystack < uneedle_len ) { + + pos = ubrk_previous(bi); + + if ( UBRK_DONE == pos ) { + break; + } + + puhaystack = uhaystack + pos; + } + + /* is there enough haystack left to hold the needle? */ + if ( ( uhaystack_end - puhaystack ) < uneedle_len ) { + /* not enough, not found */ + goto exit; + } + + while ( UBRK_DONE != pos ) { + + if (!u_memcmp(uneedle, puhaystack, uneedle_len)) { /* needle_len - 1 in zend memnstr? */ + + /* does the grapheme in the haystack end at the same place as the last grapheme in the needle? */ + + if ( ubrk_isBoundary(bi, pos + uneedle_len) ) { + + /* found it, get grapheme count offset */ + ret_pos = grapheme_count_graphemes(bi, uhaystack, pos); + break; + } + + /* set position back */ + ubrk_isBoundary(bi, pos); + } + + pos = ubrk_previous(bi); + puhaystack = uhaystack + pos; + } + +exit: + if (uhaystack) { + efree( uhaystack ); + } + if (uneedle) { + efree( uneedle ); + } + ubrk_close (bi); + + return ret_pos; +} + +/* }}} */ + +/* {{{ grapheme_strpos_utf16 - strrpos using utf16*/ +int +grapheme_strpos_utf16(unsigned char *haystack, int32_t haystack_len, unsigned char*needle, int32_t needle_len, int32_t offset, int32_t *puchar_pos, int f_ignore_case TSRMLS_DC) +{ + UChar *uhaystack, *puhaystack, *uneedle; + int32_t uhaystack_len, uneedle_len; + int ret_pos; + unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE]; + UBreakIterator* bi; + UErrorCode status; + + *puchar_pos = -1; + + /* convert the strings to UTF-16. */ + + uhaystack = NULL; + uhaystack_len = 0; + status = U_ZERO_ERROR; + intl_convert_utf8_to_utf16(&uhaystack, &uhaystack_len, (char *) haystack, haystack_len, &status ); + + if ( U_FAILURE( status ) ) { + /* Set global error code. */ + intl_error_set_code( NULL, status TSRMLS_CC ); + + /* Set error messages. */ + intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC ); + if (uhaystack) { + efree( uhaystack ); + } + return -1; + } + + /* get a pointer to the haystack taking into account the offset */ + bi = NULL; + status = U_ZERO_ERROR; + bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status TSRMLS_CC ); + + puhaystack = grapheme_get_haystack_offset(bi, uhaystack, uhaystack_len, offset); + uhaystack_len = (uhaystack_len - ( puhaystack - uhaystack)); + + if ( NULL == puhaystack ) { + + intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC ); + if (uhaystack) { + efree( uhaystack ); + } + ubrk_close (bi); + + return -1; + } + + if ( f_ignore_case ) { + grapheme_intl_case_fold(&uhaystack, &puhaystack, &uhaystack_len, &status ); + } + + uneedle = NULL; + uneedle_len = 0; + status = U_ZERO_ERROR; + intl_convert_utf8_to_utf16(&uneedle, &uneedle_len, (char *) needle, needle_len, &status ); + + if ( U_FAILURE( status ) ) { + /* Set global error code. */ + intl_error_set_code( NULL, status TSRMLS_CC ); + + /* Set error messages. */ + intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC ); + if (uhaystack) { + efree( uhaystack ); + } + if (uneedle) { + efree( uneedle ); + } + ubrk_close (bi); + + return -1; + } + + if ( f_ignore_case ) { + grapheme_intl_case_fold(&uneedle, &uneedle, &uneedle_len, &status ); + } + + ret_pos = grapheme_memnstr_grapheme(bi, puhaystack, uneedle, uneedle_len, puhaystack + uhaystack_len ); + + *puchar_pos = ubrk_current(bi); + + if (uhaystack) { + efree( uhaystack ); + } + if (uneedle) { + efree( uneedle ); + } + ubrk_close (bi); + + return ret_pos; +} + +/* }}} */ + +/* {{{ grapheme_ascii_check: ASCII check */ +int grapheme_ascii_check(const unsigned char *day, int32_t len) +{ + int ret_len = len; + while ( len-- ) { + if ( *day++ > 0x7f ) + return -1; + } + + return ret_len; +} + +/* }}} */ + +/* {{{ grapheme_split_string: find and optionally return grapheme boundaries */ +int grapheme_split_string(const UChar *text, int32_t text_length, int boundary_array[], int boundary_array_len TSRMLS_DC ) +{ + unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE]; + UErrorCode status = U_ZERO_ERROR; + int ret_len, pos; + UBreakIterator* bi; + + bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &status TSRMLS_CC ); + + if( U_FAILURE(status) ) { + return -1; + } + + ubrk_setText(bi, text, text_length, &status); + + pos = 0; + + for ( ret_len = 0; pos != UBRK_DONE; ) { + + pos = ubrk_next(bi); + + if ( pos != UBRK_DONE ) { + + if ( NULL != boundary_array && ret_len < boundary_array_len ) { + boundary_array[ret_len] = pos; + } + + ret_len++; + } + } + + ubrk_close(bi); + + return ret_len; +} +/* }}} */ + +/* {{{ grapheme_count_graphemes */ +int32_t +grapheme_count_graphemes(UBreakIterator *bi, UChar *string, int32_t string_len) +{ + int ret_len = 0; + int pos = 0; + UErrorCode status = U_ZERO_ERROR; + + ubrk_setText(bi, string, string_len, &status); + + do { + + pos = ubrk_next(bi); + + if ( UBRK_DONE != pos ) { + ret_len++; + } + + } while ( UBRK_DONE != pos ); + + return ret_len; +} +/* }}} */ + +/* {{{ grapheme_memnstr_grapheme: find needle in haystack using grapheme boundaries */ +int32_t +grapheme_memnstr_grapheme(UBreakIterator *bi, UChar *haystack, UChar *needle, int32_t needle_len, UChar *end) +{ + UChar *p = haystack; + UChar ne = needle[needle_len-1]; + UErrorCode status; + int32_t grapheme_offset; + + end -= needle_len; + + while (p <= end) { + + if ((p = u_memchr(p, *needle, (end-p+1))) && ne == p[needle_len-1]) { + + if (!u_memcmp(needle, p, needle_len - 1)) { /* needle_len - 1 works because if needle_len is 1, we've already tested the char */ + + /* does the grapheme end here? */ + + status = U_ZERO_ERROR; + ubrk_setText (bi, haystack, (end - haystack) + needle_len, &status); + + if ( ubrk_isBoundary (bi, (p - haystack) + needle_len) ) { + + /* found it, get grapheme count offset */ + grapheme_offset = grapheme_count_graphemes(bi, haystack, (p - haystack)); + + return grapheme_offset; + } + } + } + + if (p == NULL) { + return -1; + } + + p++; + } + + return -1; +} + +/* }}} */ + +/* {{{ grapheme_memrstr_grapheme: reverse find needle in haystack using grapheme boundaries */ +inline void *grapheme_memrchr_grapheme(const void *s, int c, int32_t n) +{ + register unsigned char *e; + + if (n <= 0) { + return NULL; + } + + for (e = (unsigned char *)s + n - 1; e >= (unsigned char *)s; e--) { + if (*e == (unsigned char)c) { + return (void *)e; + } + } + + return NULL; +} +/* }}} */ + +/* {{{ grapheme_get_haystack_offset - bump the haystack pointer based on the grapheme count offset */ +UChar * +grapheme_get_haystack_offset(UBreakIterator* bi, UChar *uhaystack, int32_t uhaystack_len, int32_t offset) +{ + UErrorCode status; + int32_t pos; + int32_t (*iter_op)(UBreakIterator* bi); + int iter_incr; + + if ( NULL != bi ) { + status = U_ZERO_ERROR; + ubrk_setText (bi, uhaystack, uhaystack_len, &status); + } + + if ( 0 == offset ) { + return uhaystack; + } + + if ( offset < 0 ) { + iter_op = ubrk_previous; + ubrk_last(bi); /* one past the end */ + iter_incr = 1; + } + else { + iter_op = ubrk_next; + iter_incr = -1; + } + + pos = 0; + + while ( pos != UBRK_DONE && offset != 0 ) { + + pos = iter_op(bi); + + if ( UBRK_DONE != pos ) { + offset += iter_incr; + } + } + + if ( offset != 0 ) { + return NULL; + } + + return uhaystack + pos; +} +/* }}} */ + +/* {{{ grapheme_strrpos_ascii: borrowed from the php ext/standard/string.c */ + int32_t +grapheme_strrpos_ascii(unsigned char *haystack, int32_t haystack_len, unsigned char *needle, int32_t needle_len, int32_t offset) +{ + unsigned char *p, *e; + + if (offset >= 0) { + p = haystack + offset; + e = haystack + haystack_len - needle_len; + } else { + p = haystack; + if (needle_len > -offset) { + e = haystack + haystack_len - needle_len; + } else { + e = haystack + haystack_len + offset; + } + } + + if (needle_len == 1) { + /* Single character search can shortcut memcmps */ + while (e >= p) { + if (*e == *needle) { + return (e - p + (offset > 0 ? offset : 0)); + } + e--; + } + return -1; + } + + while (e >= p) { + if (memcmp(e, needle, needle_len) == 0) { + return (e - p + (offset > 0 ? offset : 0)); + } + e--; + } + + return -1; +} + +/* }}} */ + +/* {{{ grapheme_get_break_iterator: get a clone of the global character break iterator */ +UBreakIterator* +grapheme_get_break_iterator(void *stack_buffer, UErrorCode *status TSRMLS_DC ) +{ + int32_t buffer_size; + + UBreakIterator *global_break_iterator = INTL_G( grapheme_iterator ); + + if ( NULL == global_break_iterator ) { + + global_break_iterator = ubrk_open(UBRK_CHARACTER, + NULL, /* icu default locale - locale has no effect on this iterator */ + NULL, /* text not set in global iterator */ + 0, /* text length = 0 */ + status); + + INTL_G(grapheme_iterator) = global_break_iterator; + } + + buffer_size = U_BRK_SAFECLONE_BUFFERSIZE; + + return ubrk_safeClone(global_break_iterator, stack_buffer, &buffer_size, status); +} +/* }}} */ + +/* + * Local variables: + * tab-width: 4 + * c-basic-offset: 4 + * End: + * vim600: fdm=marker + * vim: noet sw=4 ts=4 + */ + |