diff options
| author | Greg Price <gnprice@gmail.com> | 2019-09-03 19:45:44 -0700 | 
|---|---|---|
| committer | Benjamin Peterson <benjamin@python.org> | 2019-09-03 19:45:44 -0700 | 
| commit | 2f09413947d1ce0043de62ed2346f9a2b4e5880b (patch) | |
| tree | 24a1f3b3e19d89925abd013531dc0f253606ed11 /Modules/unicodedata.c | |
| parent | 580bdb0ece681537eadb360f0c796123ead7a559 (diff) | |
| download | cpython-git-2f09413947d1ce0043de62ed2346f9a2b4e5880b.tar.gz | |
closes bpo-37966: Fully implement the UAX #15 quick-check algorithm. (GH-15558)
The purpose of the `unicodedata.is_normalized` function is to answer
the question `str == unicodedata.normalized(form, str)` more
efficiently than writing just that, by using the "quick check"
optimization described in the Unicode standard in UAX #15.
However, it turns out the code doesn't implement the full algorithm
from the standard, and as a result we often miss the optimization and
end up having to compute the whole normalized string after all.
Implement the standard's algorithm.  This greatly speeds up
`unicodedata.is_normalized` in many cases where our partial variant
of quick-check had been returning MAYBE and the standard algorithm
returns NO.
At a quick test on my desktop, the existing code takes about 4.4 ms/MB
(so 4.4 ns per byte) when the partial quick-check returns MAYBE and it
has to do the slow normalize-and-compare:
  $ build.base/python -m timeit -s 'import unicodedata; s = "\uf900"*500000' \
      -- 'unicodedata.is_normalized("NFD", s)'
  50 loops, best of 5: 4.39 msec per loop
With this patch, it gets the answer instantly (58 ns) on the same 1 MB
string:
  $ build.dev/python -m timeit -s 'import unicodedata; s = "\uf900"*500000' \
      -- 'unicodedata.is_normalized("NFD", s)'
  5000000 loops, best of 5: 58.2 nsec per loop
This restores a small optimization that the original version of this
code had for the `unicodedata.normalize` use case.
With this, that case is actually faster than in master!
$ build.base/python -m timeit -s 'import unicodedata; s = "\u0338"*500000' \
    -- 'unicodedata.normalize("NFD", s)'
500 loops, best of 5: 561 usec per loop
$ build.dev/python -m timeit -s 'import unicodedata; s = "\u0338"*500000' \
    -- 'unicodedata.normalize("NFD", s)'
500 loops, best of 5: 512 usec per loop
Diffstat (limited to 'Modules/unicodedata.c')
| -rw-r--r-- | Modules/unicodedata.c | 75 | 
1 files changed, 51 insertions, 24 deletions
diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c index ae0d4e46f9..5e8ba602d6 100644 --- a/Modules/unicodedata.c +++ b/Modules/unicodedata.c @@ -19,6 +19,8 @@  #include "ucnhash.h"  #include "structmember.h" +#include <stdbool.h> +  _Py_IDENTIFIER(NFC);  _Py_IDENTIFIER(NFD);  _Py_IDENTIFIER(NFKC); @@ -775,25 +777,40 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)      return result;  } -typedef enum {YES, NO, MAYBE} NormalMode; - -/* Return YES if the input is certainly normalized, NO or MAYBE if it might not be. */ -static NormalMode -is_normalized(PyObject *self, PyObject *input, int nfc, int k) +// This needs to match the logic in makeunicodedata.py +// which constructs the quickcheck data. +typedef enum {YES = 0, MAYBE = 1, NO = 2} QuickcheckResult; + +/* Run the Unicode normalization "quickcheck" algorithm. + * + * Return YES or NO if quickcheck determines the input is certainly + * normalized or certainly not, and MAYBE if quickcheck is unable to + * tell. + * + * If `yes_only` is true, then return MAYBE as soon as we determine + * the answer is not YES. + * + * For background and details on the algorithm, see UAX #15: + *   https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms + */ +static QuickcheckResult +is_normalized_quickcheck(PyObject *self, PyObject *input, +                         int nfc, int k, bool yes_only)  { -    Py_ssize_t i, len; -    int kind; -    void *data; -    unsigned char prev_combining = 0, quickcheck_mask; -      /* An older version of the database is requested, quickchecks must be         disabled. */      if (self && UCD_Check(self))          return NO; -    /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No, -       as described in http://unicode.org/reports/tr15/#Annex8. */ -    quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0)); +    Py_ssize_t i, len; +    int kind; +    void *data; +    unsigned char prev_combining = 0; + +    /* The two quickcheck bits at this shift have type QuickcheckResult. */ +    int quickcheck_shift = (nfc ? 4 : 0) + (k ? 2 : 0); + +    QuickcheckResult result = YES; /* certainly normalized, unless we find something */      i = 0;      kind = PyUnicode_KIND(input); @@ -802,16 +819,26 @@ is_normalized(PyObject *self, PyObject *input, int nfc, int k)      while (i < len) {          Py_UCS4 ch = PyUnicode_READ(kind, data, i++);          const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch); -        unsigned char combining = record->combining; -        unsigned char quickcheck = record->normalization_quick_check; -        if (quickcheck & quickcheck_mask) -            return MAYBE; /* this string might need normalization */ +        unsigned char combining = record->combining;          if (combining && prev_combining > combining)              return NO; /* non-canonical sort order, not normalized */          prev_combining = combining; + +        unsigned char quickcheck_whole = record->normalization_quick_check; +        if (yes_only) { +            if (quickcheck_whole & (3 << quickcheck_shift)) +                return MAYBE; +        } else { +            switch ((quickcheck_whole >> quickcheck_shift) & 3) { +            case NO: +              return NO; +            case MAYBE: +              result = MAYBE; /* this string might need normalization */ +            } +        }      } -    return YES; /* certainly normalized */ +    return result;  }  /*[clinic input] @@ -844,7 +871,7 @@ unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,      PyObject *result;      int nfc = 0;      int k = 0; -    NormalMode m; +    QuickcheckResult m;      PyObject *cmp;      int match = 0; @@ -867,7 +894,7 @@ unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,          return NULL;      } -    m = is_normalized(self, input, nfc, k); +    m = is_normalized_quickcheck(self, input, nfc, k, false);      if (m == MAYBE) {          cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k); @@ -913,28 +940,28 @@ unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,      }      if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) { -        if (is_normalized(self, input, 1, 0) == YES) { +        if (is_normalized_quickcheck(self, input, 1, 0, true) == YES) {              Py_INCREF(input);              return input;          }          return nfc_nfkc(self, input, 0);      }      if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) { -        if (is_normalized(self, input, 1, 1) == YES) { +        if (is_normalized_quickcheck(self, input, 1, 1, true) == YES) {              Py_INCREF(input);              return input;          }          return nfc_nfkc(self, input, 1);      }      if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) { -        if (is_normalized(self, input, 0, 0) == YES) { +        if (is_normalized_quickcheck(self, input, 0, 0, true) == YES) {              Py_INCREF(input);              return input;          }          return nfd_nfkd(self, input, 0);      }      if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) { -        if (is_normalized(self, input, 0, 1) == YES) { +        if (is_normalized_quickcheck(self, input, 0, 1, true) == YES) {              Py_INCREF(input);              return input;          }  | 
