diff options
author | Richard Maw <richard.maw@codethink.co.uk> | 2012-01-17 14:43:55 +0000 |
---|---|---|
committer | Richard Maw <richard.maw@codethink.co.uk> | 2012-01-17 14:43:55 +0000 |
commit | 2de9abc5c9d40b3c716307d67d16146f823fd554 (patch) | |
tree | 6979db67934ddc8b564150b465846a383b428ff8 /gnulib/lib/fstrcmp.c | |
parent | 33cc1c6fda6e72a7bae1401e9b2cec495a4d3ff1 (diff) | |
download | patch-baserock/bootstrap.tar.gz |
add the output of bootstrapbaserock/bootstrap-pass2baserock/bootstrap
Diffstat (limited to 'gnulib/lib/fstrcmp.c')
m--------- | gnulib | 0 | ||||
-rw-r--r-- | gnulib/lib/fstrcmp.c | 271 |
2 files changed, 271 insertions, 0 deletions
diff --git a/gnulib b/gnulib deleted file mode 160000 -Subproject 443bc5ffcf7429e557f4a371b0661abe98ddbc1 diff --git a/gnulib/lib/fstrcmp.c b/gnulib/lib/fstrcmp.c new file mode 100644 index 0000000..595c1f3 --- /dev/null +++ b/gnulib/lib/fstrcmp.c @@ -0,0 +1,271 @@ +/* Functions to make fuzzy comparisons between strings + Copyright (C) 1988-1989, 1992-1993, 1995, 2001-2003, 2006, 2008-2011 Free + Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + + + Derived from GNU diff 2.7, analyze.c et al. + + The basic idea is to consider two vectors as similar if, when + transforming the first vector into the second vector through a + sequence of edits (inserts and deletes of one element each), + this sequence is short - or equivalently, if the ordered list + of elements that are untouched by these edits is long. For a + good introduction to the subject, read about the "Levenshtein + distance" in Wikipedia. + + The basic algorithm is described in: + "An O(ND) Difference Algorithm and its Variations", Eugene Myers, + Algorithmica Vol. 1 No. 2, 1986, pp. 251-266; + see especially section 4.2, which describes the variation used below. + + The basic algorithm was independently discovered as described in: + "Algorithms for Approximate String Matching", E. Ukkonen, + Information and Control Vol. 64, 1985, pp. 100-118. + + Unless the 'find_minimal' flag is set, this code uses the TOO_EXPENSIVE + heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N) + at the price of producing suboptimal output for large inputs with + many differences. */ + +#include <config.h> + +/* Specification. */ +#include "fstrcmp.h" + +#include <string.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <limits.h> + +#include "glthread/lock.h" +#include "glthread/tls.h" +#include "minmax.h" +#include "xalloc.h" + +#ifndef uintptr_t +# define uintptr_t unsigned long +#endif + + +#define ELEMENT char +#define EQUAL(x,y) ((x) == (y)) +#define OFFSET int +#define EXTRA_CONTEXT_FIELDS \ + /* The number of edits beyond which the computation can be aborted. */ \ + int edit_count_limit; \ + /* The number of edits (= number of elements inserted, plus the number of \ + elements deleted), temporarily minus edit_count_limit. */ \ + int edit_count; +#define NOTE_DELETE(ctxt, xoff) ctxt->edit_count++ +#define NOTE_INSERT(ctxt, yoff) ctxt->edit_count++ +#define EARLY_ABORT(ctxt) ctxt->edit_count > 0 +/* We don't need USE_HEURISTIC, since it is unlikely in typical uses of + fstrcmp(). */ +#include "diffseq.h" + + +/* Because fstrcmp is typically called multiple times, attempt to minimize + the number of memory allocations performed. Thus, let a call reuse the + memory already allocated by the previous call, if it is sufficient. + To make it multithread-safe, without need for a lock that protects the + already allocated memory, store the allocated memory per thread. Free + it only when the thread exits. */ + +static gl_tls_key_t buffer_key; /* TLS key for a 'int *' */ +static gl_tls_key_t bufmax_key; /* TLS key for a 'size_t' */ + +static void +keys_init (void) +{ + gl_tls_key_init (buffer_key, free); + gl_tls_key_init (bufmax_key, NULL); + /* The per-thread initial values are NULL and 0, respectively. */ +} + +/* Ensure that keys_init is called once only. */ +gl_once_define(static, keys_init_once) + + +/* In the code below, branch probabilities were measured by Ralf Wildenhues, + by running "msgmerge LL.po coreutils.pot" with msgmerge 0.18 for many + values of LL. The probability indicates that the condition evaluates + to true; whether that leads to a branch or a non-branch in the code, + depends on the compiler's reordering of basic blocks. */ + + +double +fstrcmp_bounded (const char *string1, const char *string2, double lower_bound) +{ + struct context ctxt; + int xvec_length = strlen (string1); + int yvec_length = strlen (string2); + int i; + + size_t fdiag_len; + int *buffer; + size_t bufmax; + + /* short-circuit obvious comparisons */ + if (xvec_length == 0 || yvec_length == 0) /* Prob: 1% */ + return (xvec_length == 0 && yvec_length == 0 ? 1.0 : 0.0); + + if (lower_bound > 0) + { + /* Compute a quick upper bound. + Each edit is an insertion or deletion of an element, hence modifies + the length of the sequence by at most 1. + Therefore, when starting from a sequence X and ending at a sequence Y, + with N edits, | yvec_length - xvec_length | <= N. (Proof by + induction over N.) + So, at the end, we will have + edit_count >= | xvec_length - yvec_length |. + and hence + result + = (xvec_length + yvec_length - edit_count) + / (xvec_length + yvec_length) + <= (xvec_length + yvec_length - | yvec_length - xvec_length |) + / (xvec_length + yvec_length) + = 2 * min (xvec_length, yvec_length) / (xvec_length + yvec_length). + */ + volatile double upper_bound = + (double) (2 * MIN (xvec_length, yvec_length)) + / (xvec_length + yvec_length); + + if (upper_bound < lower_bound) /* Prob: 74% */ + /* Return an arbitrary value < LOWER_BOUND. */ + return 0.0; + +#if CHAR_BIT <= 8 + /* When X and Y are both small, avoid the overhead of setting up an + array of size 256. */ + if (xvec_length + yvec_length >= 20) /* Prob: 99% */ + { + /* Compute a less quick upper bound. + Each edit is an insertion or deletion of a character, hence + modifies the occurrence count of a character by 1 and leaves the + other occurrence counts unchanged. + Therefore, when starting from a sequence X and ending at a + sequence Y, and denoting the occurrence count of C in X with + OCC (X, C), with N edits, + sum_C | OCC (X, C) - OCC (Y, C) | <= N. + (Proof by induction over N.) + So, at the end, we will have + edit_count >= sum_C | OCC (X, C) - OCC (Y, C) |, + and hence + result + = (xvec_length + yvec_length - edit_count) + / (xvec_length + yvec_length) + <= (xvec_length + yvec_length - sum_C | OCC(X,C) - OCC(Y,C) |) + / (xvec_length + yvec_length). + */ + int occ_diff[UCHAR_MAX + 1]; /* array C -> OCC(X,C) - OCC(Y,C) */ + int sum; + + /* Determine the occurrence counts in X. */ + memset (occ_diff, 0, sizeof (occ_diff)); + for (i = xvec_length - 1; i >= 0; i--) + occ_diff[(unsigned char) string1[i]]++; + /* Subtract the occurrence counts in Y. */ + for (i = yvec_length - 1; i >= 0; i--) + occ_diff[(unsigned char) string2[i]]--; + /* Sum up the absolute values. */ + sum = 0; + for (i = 0; i <= UCHAR_MAX; i++) + { + int d = occ_diff[i]; + sum += (d >= 0 ? d : -d); + } + + upper_bound = 1.0 - (double) sum / (xvec_length + yvec_length); + + if (upper_bound < lower_bound) /* Prob: 66% */ + /* Return an arbitrary value < LOWER_BOUND. */ + return 0.0; + } +#endif + } + + /* set the info for each string. */ + ctxt.xvec = string1; + ctxt.yvec = string2; + + /* Set TOO_EXPENSIVE to be approximate square root of input size, + bounded below by 256. */ + ctxt.too_expensive = 1; + for (i = xvec_length + yvec_length; + i != 0; + i >>= 2) + ctxt.too_expensive <<= 1; + if (ctxt.too_expensive < 256) + ctxt.too_expensive = 256; + + /* Allocate memory for fdiag and bdiag from a thread-local pool. */ + fdiag_len = xvec_length + yvec_length + 3; + gl_once (keys_init_once, keys_init); + buffer = (int *) gl_tls_get (buffer_key); + bufmax = (size_t) (uintptr_t) gl_tls_get (bufmax_key); + if (fdiag_len > bufmax) + { + /* Need more memory. */ + bufmax = 2 * bufmax; + if (fdiag_len > bufmax) + bufmax = fdiag_len; + /* Calling xrealloc would be a waste: buffer's contents does not need + to be preserved. */ + if (buffer != NULL) + free (buffer); + buffer = (int *) xnmalloc (bufmax, 2 * sizeof (int)); + gl_tls_set (buffer_key, buffer); + gl_tls_set (bufmax_key, (void *) (uintptr_t) bufmax); + } + ctxt.fdiag = buffer + yvec_length + 1; + ctxt.bdiag = ctxt.fdiag + fdiag_len; + + /* The edit_count is only ever increased. The computation can be aborted + when + (xvec_length + yvec_length - edit_count) / (xvec_length + yvec_length) + < lower_bound, + or equivalently + edit_count > (xvec_length + yvec_length) * (1 - lower_bound) + or equivalently + edit_count > floor((xvec_length + yvec_length) * (1 - lower_bound)). + We need to add an epsilon inside the floor(...) argument, to neutralize + rounding errors. */ + ctxt.edit_count_limit = + (lower_bound < 1.0 + ? (int) ((xvec_length + yvec_length) * (1.0 - lower_bound + 0.000001)) + : 0); + + /* Now do the main comparison algorithm */ + ctxt.edit_count = - ctxt.edit_count_limit; + if (compareseq (0, xvec_length, 0, yvec_length, 0, &ctxt)) /* Prob: 98% */ + /* The edit_count passed the limit. Hence the result would be + < lower_bound. We can return any value < lower_bound instead. */ + return 0.0; + ctxt.edit_count += ctxt.edit_count_limit; + + /* The result is + ((number of chars in common) / (average length of the strings)). + The numerator is + = xvec_length - (number of calls to NOTE_DELETE) + = yvec_length - (number of calls to NOTE_INSERT) + = 1/2 * (xvec_length + yvec_length - (number of edits)). + This is admittedly biased towards finding that the strings are + similar, however it does produce meaningful results. */ + return ((double) (xvec_length + yvec_length - ctxt.edit_count) + / (xvec_length + yvec_length)); +} |