Implement a dedicated int/float parser for XML (schema) values in lxml.objectify.xml_int_float_parsing

This disables support for "_" in numbers, which are allowed by Python but not by XMLSchema. Wee keep a few additional literals, such as "+NaN", simply because they shouldn't hurt. See https://mail.python.org/archives/list/lxml@python.org/thread/6F7VIDKWZTJ6LB6VOX6IJNNWICYHFPNR/
author: Stefan Behnel <stefan_ml@behnel.de> 2021-08-12 16:58:41 +0200
committer: Stefan Behnel <stefan_ml@behnel.de> 2021-08-12 16:59:26 +0200
commit: 83e6c031994d553b74991501c6cd85e3517fadd8 (patch)
tree: f455881a911562a009f0de2f4dcc783d181af8aa /src/lxml/objectify.pyx
parent: e23a807e816373e9eae9d45b5cecdd85ed2fa76a (diff)
download: python-lxml-xml_int_float_parsing.tar.gz
1 files changed, 117 insertions, 2 deletions
diff --git a/src/lxml/objectify.pyx b/src/lxml/objectify.pyx
index e587e4f2..cacbe806 100644
--- a/src/lxml/objectify.pyx
+++ b/src/lxml/objectify.pyx
@@ -943,6 +943,121 @@ cdef object _parseNumber(NumberElement element):
     return element._parse_value(textOf(element._c_node))
 
 
+cdef enum NumberParserState:
+    NPS_SPACE_PRE = 0
+    NPS_SIGN = 1
+    NPS_DIGITS = 2
+    NPS_POINT_LEAD = 3
+    NPS_POINT = 4
+    NPS_FRACTION = 5
+    NPS_EXP = 6
+    NPS_EXP_SIGN = 7
+    NPS_DIGITS_EXP = 8
+    NPS_SPACE_TAIL = 9
+    NPS_INF1 = 20
+    NPS_INF2 = 21
+    NPS_INF3 = 22
+    NPS_NAN1 = 23
+    NPS_NAN2 = 24
+    NPS_NAN3 = 25
+    NPS_ERROR = 99
+
+
+ctypedef fused bytes_unicode:
+    bytes
+    unicode
+
+
+cdef _checkNumber(bytes_unicode s, bint allow_float):
+    cdef Py_UCS4 c
+    cdef NumberParserState state = NPS_SPACE_PRE
+
+    for c in s:
+        if c.isdigit() if (bytes_unicode is unicode) else c in b'0123456789':
+            if state in (NPS_DIGITS, NPS_FRACTION, NPS_DIGITS_EXP):
+                pass
+            elif state in (NPS_SPACE_PRE, NPS_SIGN):
+                state = NPS_DIGITS
+            elif state in (NPS_POINT_LEAD, NPS_POINT):
+                state = NPS_FRACTION
+            elif state in (NPS_EXP, NPS_EXP_SIGN):
+                state = NPS_DIGITS_EXP
+            else:
+                state = NPS_ERROR
+        else:
+            if c == u'.':
+                if state in (NPS_SPACE_PRE, NPS_SIGN):
+                    state = NPS_POINT_LEAD
+                elif state == NPS_DIGITS:
+                    state = NPS_POINT
+                else:
+                    state = NPS_ERROR
+                if not allow_float:
+                    state = NPS_ERROR
+            elif c in u'-+':
+                if state == NPS_SPACE_PRE:
+                    state = NPS_SIGN
+                elif state == NPS_EXP:
+                    state = NPS_EXP_SIGN
+                else:
+                    state = NPS_ERROR
+            elif c == u'E':
+                if state in (NPS_DIGITS, NPS_POINT, NPS_FRACTION):
+                    state = NPS_EXP
+                else:
+                    state = NPS_ERROR
+                if not allow_float:
+                    state = NPS_ERROR
+            # Allow INF and NaN. XMLSchema requires case, we don't, like Python.
+            elif c in u'iI':
+                state = NPS_INF1 if allow_float and state in (NPS_SPACE_PRE, NPS_SIGN) else NPS_ERROR
+            elif c in u'fF':
+                state = NPS_INF3 if state == NPS_INF2 else NPS_ERROR
+            elif c in u'aA':
+                state = NPS_NAN2 if state == NPS_NAN1 else NPS_ERROR
+            elif c in u'nN':
+                # Python also allows [+-]NaN, so let's accept that.
+                if state in (NPS_SPACE_PRE, NPS_SIGN):
+                    state = NPS_NAN1 if allow_float else NPS_ERROR
+                elif state == NPS_NAN2:
+                    state = NPS_NAN3
+                elif state == NPS_INF1:
+                    state = NPS_INF2
+                else:
+                    state = NPS_ERROR
+            # Allow spaces around text values.
+            else:
+                if c.isspace() if (bytes_unicode is unicode) else c in b'\x09\x0a\x0b\x0c\x0d\x20':
+                    if state in (NPS_SPACE_PRE, NPS_SPACE_TAIL):
+                        pass
+                    elif state in (NPS_DIGITS, NPS_POINT, NPS_FRACTION, NPS_DIGITS_EXP, NPS_INF3, NPS_NAN3):
+                        state = NPS_SPACE_TAIL
+                    else:
+                        state = NPS_ERROR
+                else:
+                    state = NPS_ERROR
+
+            if state == NPS_ERROR:
+                break
+
+    if state not in (NPS_DIGITS, NPS_FRACTION, NPS_POINT, NPS_DIGITS_EXP, NPS_INF3, NPS_NAN3, NPS_SPACE_TAIL):
+        raise ValueError
+
+
+cdef _checkInt(s):
+    if python.IS_PYTHON2 and type(s) is bytes:
+        return _checkNumber(<bytes>s, allow_float=False)
+    else:
+        return _checkNumber(<unicode>s, allow_float=False)
+
+
+cdef _checkFloat(s):
+    if python.IS_PYTHON2 and type(s) is bytes:
+        return _checkNumber(<bytes>s, allow_float=True)
+    else:
+        return _checkNumber(<unicode>s, allow_float=True)
+
+
 cdef object _strValueOf(obj):
     if python._isString(obj):
         return obj
@@ -1104,7 +1219,7 @@ def pytypename(obj):
     return _pytypename(obj)
 
 cdef _registerPyTypes():
-    pytype = PyType(u'int', int, IntElement)
+    pytype = PyType(u'int', _checkInt, IntElement)  # wraps functions for Python
     pytype.xmlSchemaTypes = (u"integer", u"int", u"short", u"byte", u"unsignedShort",
                              u"unsignedByte", u"nonPositiveInteger",
                              u"negativeInteger", u"long", u"nonNegativeInteger",
@@ -1115,7 +1230,7 @@ cdef _registerPyTypes():
     pytype = PyType(u'long', None, IntElement)
     pytype.register()
 
-    pytype = PyType(u'float', float, FloatElement, repr)
+    pytype = PyType(u'float', _checkFloat, FloatElement, repr)  # wraps _parseFloat for Python
     pytype.xmlSchemaTypes = (u"double", u"float")
     pytype.register()
author	Stefan Behnel <stefan_ml@behnel.de>	2021-08-12 16:58:41 +0200
committer	Stefan Behnel <stefan_ml@behnel.de>	2021-08-12 16:59:26 +0200
commit	83e6c031994d553b74991501c6cd85e3517fadd8 (patch)
tree	f455881a911562a009f0de2f4dcc783d181af8aa /src/lxml/objectify.pyx
parent	e23a807e816373e9eae9d45b5cecdd85ed2fa76a (diff)
download	python-lxml-xml_int_float_parsing.tar.gz