diff options
-rwxr-xr-x | ext/standard/basic_functions.stub.php | 2 | ||||
-rwxr-xr-x | ext/standard/basic_functions_arginfo.h | 8 | ||||
-rw-r--r-- | ext/standard/levenshtein.c | 89 | ||||
-rw-r--r-- | ext/standard/tests/strings/levenshtein.phpt | 123 | ||||
-rw-r--r-- | ext/standard/tests/strings/levenshtein_bug_16473.phpt | 16 | ||||
-rw-r--r-- | ext/standard/tests/strings/levenshtein_bug_6562.phpt | 16 | ||||
-rw-r--r-- | ext/standard/tests/strings/levenshtein_bug_7368.phpt | 12 | ||||
-rw-r--r-- | ext/standard/tests/strings/levenshtein_error_conditions.phpt | 31 |
8 files changed, 163 insertions, 134 deletions
diff --git a/ext/standard/basic_functions.stub.php b/ext/standard/basic_functions.stub.php index 2b6e7709e6..f871a7633a 100755 --- a/ext/standard/basic_functions.stub.php +++ b/ext/standard/basic_functions.stub.php @@ -985,7 +985,7 @@ function iptcparse(string $iptcblock): array|false {} /* levenshtein.c */ -function levenshtein(string $str1, string $str2, $cost_ins = UNKNOWN, int $cost_rep = UNKNOWN, int $cost_del = UNKNOWN): int {} +function levenshtein(string $str1, string $str2, int $cost_ins = 1, int $cost_rep = 1, int $cost_del = 1): int {} /* link.c */ diff --git a/ext/standard/basic_functions_arginfo.h b/ext/standard/basic_functions_arginfo.h index e6f5b00771..a4b70b23bb 100755 --- a/ext/standard/basic_functions_arginfo.h +++ b/ext/standard/basic_functions_arginfo.h @@ -1,5 +1,5 @@ /* This is a generated file, edit the .stub.php file instead. - * Stub hash: 9cf2c691c081d9aaefaa9d22337a9e00efb0af77 */ + * Stub hash: 05b740207a70a9d272b4c327882fd0b52016a0af */ ZEND_BEGIN_ARG_WITH_RETURN_TYPE_INFO_EX(arginfo_set_time_limit, 0, 1, _IS_BOOL, 0) ZEND_ARG_TYPE_INFO(0, seconds, IS_LONG, 0) @@ -1568,9 +1568,9 @@ ZEND_END_ARG_INFO() ZEND_BEGIN_ARG_WITH_RETURN_TYPE_INFO_EX(arginfo_levenshtein, 0, 2, IS_LONG, 0) ZEND_ARG_TYPE_INFO(0, str1, IS_STRING, 0) ZEND_ARG_TYPE_INFO(0, str2, IS_STRING, 0) - ZEND_ARG_INFO(0, cost_ins) - ZEND_ARG_TYPE_INFO(0, cost_rep, IS_LONG, 0) - ZEND_ARG_TYPE_INFO(0, cost_del, IS_LONG, 0) + ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, cost_ins, IS_LONG, 0, "1") + ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, cost_rep, IS_LONG, 0, "1") + ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, cost_del, IS_LONG, 0, "1") ZEND_END_ARG_INFO() #if defined(HAVE_SYMLINK) || defined(PHP_WIN32) diff --git a/ext/standard/levenshtein.c b/ext/standard/levenshtein.c index 75f317fc61..181674f4d2 100644 --- a/ext/standard/levenshtein.c +++ b/ext/standard/levenshtein.c @@ -15,42 +15,36 @@ */ #include "php.h" -#include <stdlib.h> -#include <errno.h> -#include <ctype.h> #include "php_string.h" #define LEVENSHTEIN_MAX_LENGTH 255 /* {{{ reference_levdist * reference implementation, only optimized for memory usage, not speed */ -static zend_long reference_levdist(const char *s1, size_t l1, const char *s2, size_t l2, zend_long cost_ins, zend_long cost_rep, zend_long cost_del ) +static zend_long reference_levdist(const zend_string *string1, const zend_string *string2, zend_long cost_ins, zend_long cost_rep, zend_long cost_del ) { zend_long *p1, *p2, *tmp; zend_long c0, c1, c2; size_t i1, i2; - if (l1 == 0) { - return l2 * cost_ins; + if (ZSTR_LEN(string1) == 0) { + return ZSTR_LEN(string2) * cost_ins; } - if (l2 == 0) { - return l1 * cost_del; + if (ZSTR_LEN(string2) == 0) { + return ZSTR_LEN(string1) * cost_del; } - if ((l1 > LEVENSHTEIN_MAX_LENGTH) || (l2 > LEVENSHTEIN_MAX_LENGTH)) { - return -1; - } - p1 = safe_emalloc((l2 + 1), sizeof(zend_long), 0); - p2 = safe_emalloc((l2 + 1), sizeof(zend_long), 0); + p1 = safe_emalloc((ZSTR_LEN(string2) + 1), sizeof(zend_long), 0); + p2 = safe_emalloc((ZSTR_LEN(string2) + 1), sizeof(zend_long), 0); - for (i2 = 0; i2 <= l2; i2++) { + for (i2 = 0; i2 <= ZSTR_LEN(string2); i2++) { p1[i2] = i2 * cost_ins; } - for (i1 = 0; i1 < l1 ; i1++) { + for (i1 = 0; i1 < ZSTR_LEN(string1) ; i1++) { p2[0] = p1[0] + cost_del; - for (i2 = 0; i2 < l2; i2++) { - c0 = p1[i2] + ((s1[i1] == s2[i2]) ? 0 : cost_rep); + for (i2 = 0; i2 < ZSTR_LEN(string2); i2++) { + c0 = p1[i2] + ((ZSTR_VAL(string1)[i1] == ZSTR_VAL(string2)[i2]) ? 0 : cost_rep); c1 = p1[i2 + 1] + cost_del; if (c1 < c0) { c0 = c1; @@ -65,7 +59,7 @@ static zend_long reference_levdist(const char *s1, size_t l1, const char *s2, si p1 = p2; p2 = tmp; } - c0 = p1[l2]; + c0 = p1[ZSTR_LEN(string2)]; efree(p1); efree(p2); @@ -74,56 +68,31 @@ static zend_long reference_levdist(const char *s1, size_t l1, const char *s2, si } /* }}} */ -/* {{{ custom_levdist */ -static int custom_levdist(char *str1, char *str2, char *callback_name) -{ - php_error_docref(NULL, E_WARNING, "The general Levenshtein support is not there yet"); - /* not there yet */ - - return -1; -} -/* }}} */ - /* {{{ Calculate Levenshtein distance between two strings */ PHP_FUNCTION(levenshtein) { - int argc = ZEND_NUM_ARGS(); - char *str1, *str2; - char *callback_name; - size_t str1_len, str2_len, callback_len; - zend_long cost_ins, cost_rep, cost_del; - zend_long distance = -1; - - switch (argc) { - case 2: /* just two strings: use maximum performance version */ - if (zend_parse_parameters(2, "ss", &str1, &str1_len, &str2, &str2_len) == FAILURE) { - RETURN_THROWS(); - } - distance = reference_levdist(str1, str1_len, str2, str2_len, 1, 1, 1); - break; - - case 5: /* more general version: calc cost by ins/rep/del weights */ - if (zend_parse_parameters(5, "sslll", &str1, &str1_len, &str2, &str2_len, &cost_ins, &cost_rep, &cost_del) == FAILURE) { - RETURN_THROWS(); - } - distance = reference_levdist(str1, str1_len, str2, str2_len, cost_ins, cost_rep, cost_del); - break; - - case 3: /* most general version: calc cost by user-supplied function */ - if (zend_parse_parameters(3, "sss", &str1, &str1_len, &str2, &str2_len, &callback_name, &callback_len) == FAILURE) { - RETURN_THROWS(); - } - distance = custom_levdist(str1, str2, callback_name); - break; + zend_string *string1, *string2; + zend_long cost_ins = 1; + zend_long cost_rep = 1; + zend_long cost_del = 1; + zend_long distance = 0; + + if (zend_parse_parameters(ZEND_NUM_ARGS(), "SS|lll", &string1, &string2, &cost_ins, &cost_rep, &cost_del) == FAILURE) { + RETURN_THROWS(); + } - default: - WRONG_PARAM_COUNT; + if (ZSTR_LEN(string1) > LEVENSHTEIN_MAX_LENGTH) { + zend_argument_value_error(1, "must be less than %d characters", LEVENSHTEIN_MAX_LENGTH + 1); + RETURN_THROWS(); } - if (distance < 0 && /* TODO */ ZEND_NUM_ARGS() != 3) { - php_error_docref(NULL, E_WARNING, "Argument string(s) too long"); + if (ZSTR_LEN(string2) > LEVENSHTEIN_MAX_LENGTH) { + zend_argument_value_error(2, "must be less than %d characters", LEVENSHTEIN_MAX_LENGTH + 1); + RETURN_THROWS(); } + distance = reference_levdist(string1, string2, cost_ins, cost_rep, cost_del); + RETURN_LONG(distance); } /* }}} */ diff --git a/ext/standard/tests/strings/levenshtein.phpt b/ext/standard/tests/strings/levenshtein.phpt index b4b7c03e7f..41b559f8cb 100644 --- a/ext/standard/tests/strings/levenshtein.phpt +++ b/ext/standard/tests/strings/levenshtein.phpt @@ -3,76 +3,61 @@ levenshtein() function test --FILE-- <?php -function test_me($title,$expect,$text1,$text2,$cost1="",$cost2="",$cost3="") { - - if ($cost1=="") { - $result=levenshtein($text1,$text2); - } - elseif ($cost2=="") { - $result=levenshtein($text1,$text2,$cost1); - } - else { - $result=levenshtein($text1,$text2,$cost1,$cost2,$cost3); - } - if($result==$expect) return 0; - - echo "$title: result is $result instead of $expect "; - echo "for '$text1'/'$text2' "; - if($cost1) echo "($cost1:$cost2:$cost3)"; - echo "\n"; - - return 1; -} - -$n=0; - -$n += test_me("equal" , 0, "12345", "12345"); -$n += test_me("1st empty" , 3, "", "xzy"); -$n += test_me("2nd empty" , 3, "xzy", ""); -$n += test_me("both empty" , 0, "", ""); -$n += test_me("1 char" , 1, "1", "2"); -$n += test_me("2 char swap", 2, "12", "21"); - -$n += test_me("inexpensive delete", 2, "2121", "11", 2, 1, 1); -$n += test_me("expensive delete" , 10, "2121", "11", 2, 1, 5); -$n += test_me("inexpensive insert", 2, "11", "2121", 1, 1, 1); -$n += test_me("expensive insert" , 10, "11", "2121", 5, 1, 1); - -$n += test_me("expensive replace" , 3, "111", "121", 2, 3, 2); -$n += test_me("very expensive replace", 4, "111", "121", 2, 9, 2); - -$n += test_me("bug #7368", 2, "13458", "12345"); -$n += test_me("bug #7368", 2, "1345", "1234"); - -$n += test_me("bug #6562", 1, "debugg", "debug"); -$n += test_me("bug #6562", 1, "ddebug", "debug"); -$n += test_me("bug #6562", 2, "debbbug", "debug"); -$n += test_me("bug #6562", 1, "debugging", "debuging"); - -$n += test_me("bug #16473", 2, "a", "bc"); -$n += test_me("bug #16473", 2, "xa", "xbc"); -$n += test_me("bug #16473", 2, "xax", "xbcx"); -$n += test_me("bug #16473", 2, "ax", "bcx"); - -$n += test_me("custom", -1, "111", "121", "my_levcode"); -$n += test_me("lt maxlength1", 254, "AbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsu", "A"); -$n += test_me("gt maxlength1", -1, "AbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuv", "A"); - -$n += test_me("lt maxlength2", 254, "A", "AbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsu"); -$n += test_me("gt maxlength2", -1, "A", "AbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuv"); - -echo ($n==0)?"all passed\n":"$n failed\n"; - -var_dump(levenshtein("", "", -1, -1, -1)); -var_dump(levenshtein("", "", 10, 10, 10)); +echo '--- Equal ---' . \PHP_EOL; +var_dump(levenshtein('12345', '12345')); + +echo '--- First string empty ---' . \PHP_EOL; +var_dump(levenshtein('', 'xyz')); +echo '--- Second string empty ---' . \PHP_EOL; +var_dump(levenshtein('xyz', '')); +echo '--- Both empty ---' . \PHP_EOL; +var_dump(levenshtein('', '')); +var_dump(levenshtein('', '', 10, 10, 10)); + +echo '--- 1 character ---' . \PHP_EOL; +var_dump(levenshtein('1', '2')); +echo '--- 2 character swapped ---' . \PHP_EOL; +var_dump(levenshtein('12', '21')); + +echo '--- Inexpensive deletion ---' . \PHP_EOL; +var_dump(levenshtein('2121', '11', 2)); +echo '--- Expensive deletion ---' . \PHP_EOL; +var_dump(levenshtein('2121', '11', 2, 1, 5)); + +echo '--- Inexpensive insertion ---' . \PHP_EOL; +var_dump(levenshtein('11', '2121')); +echo '--- Expensive insertion ---' . \PHP_EOL; +var_dump(levenshtein('11', '2121', 5)); + +echo '--- Expensive replacement ---' . \PHP_EOL; +var_dump(levenshtein('111', '121', 2, 3, 2)); +echo '--- Very expensive replacement ---' . \PHP_EOL; +var_dump(levenshtein('111', '121', 2, 9, 2)); ?> ---EXPECTF-- -Warning: levenshtein(): The general Levenshtein support is not there yet in %s on line %d - -Warning: levenshtein(): Argument string(s) too long in %s on line %d - -Warning: levenshtein(): Argument string(s) too long in %s on line %d -all passed +--EXPECT-- +--- Equal --- +int(0) +--- First string empty --- +int(3) +--- Second string empty --- +int(3) +--- Both empty --- int(0) int(0) +--- 1 character --- +int(1) +--- 2 character swapped --- +int(2) +--- Inexpensive deletion --- +int(2) +--- Expensive deletion --- +int(10) +--- Inexpensive insertion --- +int(2) +--- Expensive insertion --- +int(10) +--- Expensive replacement --- +int(3) +--- Very expensive replacement --- +int(4) diff --git a/ext/standard/tests/strings/levenshtein_bug_16473.phpt b/ext/standard/tests/strings/levenshtein_bug_16473.phpt new file mode 100644 index 0000000000..94c2650891 --- /dev/null +++ b/ext/standard/tests/strings/levenshtein_bug_16473.phpt @@ -0,0 +1,16 @@ +--TEST-- +levenshtein() bug 16473 +--FILE-- +<?php + +var_dump(levenshtein('a', 'bc')); +var_dump(levenshtein('xa', 'xbc')); +var_dump(levenshtein('xax', 'xbcx')); +var_dump(levenshtein('ax', 'bcx')); + +?> +--EXPECT-- +int(2) +int(2) +int(2) +int(2) diff --git a/ext/standard/tests/strings/levenshtein_bug_6562.phpt b/ext/standard/tests/strings/levenshtein_bug_6562.phpt new file mode 100644 index 0000000000..31f07f4780 --- /dev/null +++ b/ext/standard/tests/strings/levenshtein_bug_6562.phpt @@ -0,0 +1,16 @@ +--TEST-- +levenshtein() bug 6562 +--FILE-- +<?php + +var_dump(levenshtein('debugg', 'debug')); +var_dump(levenshtein('ddebug', 'debug')); +var_dump(levenshtein('debbbug', 'debug')); +var_dump(levenshtein('debugging', 'debuging')); + +?> +--EXPECT-- +int(1) +int(1) +int(2) +int(1) diff --git a/ext/standard/tests/strings/levenshtein_bug_7368.phpt b/ext/standard/tests/strings/levenshtein_bug_7368.phpt new file mode 100644 index 0000000000..ffbb7cd61f --- /dev/null +++ b/ext/standard/tests/strings/levenshtein_bug_7368.phpt @@ -0,0 +1,12 @@ +--TEST-- +levenshtein() bug 7368 +--FILE-- +<?php + +var_dump(levenshtein('13458', '12345')); +var_dump(levenshtein('1345', '1234')); + +?> +--EXPECT-- +int(2) +int(2) diff --git a/ext/standard/tests/strings/levenshtein_error_conditions.phpt b/ext/standard/tests/strings/levenshtein_error_conditions.phpt new file mode 100644 index 0000000000..bd1b80742b --- /dev/null +++ b/ext/standard/tests/strings/levenshtein_error_conditions.phpt @@ -0,0 +1,31 @@ +--TEST-- +levenshtein() error conditions +--FILE-- +<?php + +echo '--- String 1 ---' . \PHP_EOL; +var_dump(levenshtein('AbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsu', 'A')); +try { + var_dump(levenshtein('AbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuv', 'A')); +} catch (\ValueError $e) { + echo $e->getMessage() . \PHP_EOL; +} +echo '--- String 2 ---' . \PHP_EOL; +var_dump(levenshtein('A', 'AbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsu')); +try { + var_dump(levenshtein('A', 'AbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuv')); +} catch (\ValueError $e) { + echo $e->getMessage() . \PHP_EOL; +} + +// TODO ValueError for negative costs? +// var_dump(levenshtein("", "", -1, -1, -1)); + +?> +--EXPECT-- +--- String 1 --- +int(254) +levenshtein(): Argument #1 ($str1) must be less than 256 characters +--- String 2 --- +int(254) +levenshtein(): Argument #2 ($str2) must be less than 256 characters |