# XPath evaluation

# Raised when an XPath expression cannot be compiled (see
# _XPathEvaluatorBase._raise_parse_error()).
class XPathSyntaxError(LxmlSyntaxError, XPathError):
    pass

################################################################################
# XPath

# libxml2 error types that _raise_parse_error() / _raise_eval_error() treat as
# syntax problems when filtering the evaluator's error log.
cdef object _XPATH_SYNTAX_ERRORS
_XPATH_SYNTAX_ERRORS = (
    xmlerror.XML_XPATH_NUMBER_ERROR,
    xmlerror.XML_XPATH_UNFINISHED_LITERAL_ERROR,
    xmlerror.XML_XPATH_VARIABLE_REF_ERROR,
    xmlerror.XML_XPATH_INVALID_PREDICATE_ERROR,
    xmlerror.XML_XPATH_UNCLOSED_ERROR,
    xmlerror.XML_XPATH_INVALID_CHAR_ERROR
)

# libxml2 error types that _raise_eval_error() looks for first when building
# the message of an XPathEvalError.
cdef object _XPATH_EVAL_ERRORS
_XPATH_EVAL_ERRORS = (
    xmlerror.XML_XPATH_UNDEF_VARIABLE_ERROR,
    xmlerror.XML_XPATH_UNDEF_PREFIX_ERROR,
    xmlerror.XML_XPATH_UNKNOWN_FUNC_ERROR,
    xmlerror.XML_XPATH_INVALID_OPERAND,
    xmlerror.XML_XPATH_INVALID_TYPE,
    xmlerror.XML_XPATH_INVALID_ARITY,
    xmlerror.XML_XPATH_INVALID_CTXT_SIZE,
    xmlerror.XML_XPATH_INVALID_CTXT_POSITION
)

# Register the generic extension function entry point under the given
# (UTF-8 encoded) name, optionally qualified by a namespace URI.
# Returns the status code of the underlying libxml2 call.
# NOTE(review): this copy of the source shows no explicit pointer casts
# (e.g. void* -> xpath.xmlXPathContext*); confirm against the canonical
# source before compiling.
cdef int _register_xpath_function(void* ctxt, name_utf, ns_utf):
    if ns_utf is None:
        return xpath.xmlXPathRegisterFunc(
            ctxt, _cstr(name_utf),
            _xpath_function_call)
    else:
        return xpath.xmlXPathRegisterFuncNS(
            ctxt, _cstr(name_utf), _cstr(ns_utf),
            _xpath_function_call)

# Counterpart of _register_xpath_function(): registers NULL for the same
# name / namespace (presumably this removes the function from the context).
cdef int _unregister_xpath_function(void* ctxt, name_utf, ns_utf):
    if ns_utf is None:
        return xpath.xmlXPathRegisterFunc(
            ctxt, _cstr(name_utf), NULL)
    else:
        return xpath.xmlXPathRegisterFuncNS(
            ctxt, _cstr(name_utf), _cstr(ns_utf), NULL)


@cython.final
@cython.internal
cdef class _XPathContext(_BaseContext):
    # XPath variables passed at construction time (a mapping or None); they
    # are registered on every register_context() call.
    cdef object _variables
    def __init__(self, namespaces, extensions, error_log, enable_regexp, variables,
                 build_smart_strings):
        self._variables = variables
        _BaseContext.__init__(self, namespaces, extensions, error_log, enable_regexp,
                              build_smart_strings)

    # Bind this context to a libxml2 XPath context and register the local
    # namespaces and functions on it.
    cdef set_context(self, xpath.xmlXPathContext* xpathCtxt):
        self._set_xpath_context(xpathCtxt)
        self._setupDict(xpathCtxt)
        self.registerLocalNamespaces()
        self.registerLocalFunctions(xpathCtxt, _register_xpath_function)

    # Per-evaluation setup: register global namespaces and functions, the
    # EXSLT functions and the construction-time variables (if any).
    cdef register_context(self, _Document doc):
        self._register_context(doc)
        self.registerGlobalNamespaces()
        self.registerGlobalFunctions(self._xpathCtxt, _register_xpath_function)
        self.registerExsltFunctions()
        if self._variables is not None:
            self.registerVariables(self._variables)

    # Per-evaluation teardown: undo register_context() and drop all
    # registered variables.
    cdef unregister_context(self):
        self.unregisterGlobalFunctions(
            self._xpathCtxt, _unregister_xpath_function)
        self.unregisterGlobalNamespaces()
        xpath.xmlXPathRegisteredVariablesCleanup(self._xpathCtxt)
        self._cleanup_context()

    # Scan the namespace hash of the XPath context and let
    # _registerExsltFunctionsForNamespaces() handle each entry.
    cdef void registerExsltFunctions(self):
        if xslt.LIBXSLT_VERSION < 10125:
            # we'd only execute dummy functions anyway
            return
        tree.xmlHashScan(
            self._xpathCtxt.nsHash,
            _registerExsltFunctionsForNamespaces,
            self._xpathCtxt)

    # Register every name -> value pair of a mapping as an XPath variable.
    cdef registerVariables(self, variable_dict):
        for name, value in variable_dict.items():
            name_utf = self._to_utf(name)
            xpath.xmlXPathRegisterVariable(
                self._xpathCtxt, _cstr(name_utf), _wrapXPathObject(value, None, None))

    # Register a single XPath variable; the name is converted by _to_utf().
    cdef registerVariable(self, name, value):
        name_utf = self._to_utf(name)
        xpath.xmlXPathRegisterVariable(
            self._xpathCtxt, _cstr(name_utf), _wrapXPathObject(value, None, None))

    # Same as registerVariable() for a name that is already UTF-8 encoded.
    # Being a 'cdef void' function without an except clause, it cannot
    # propagate Python exceptions to its caller.
    cdef void _registerVariable(self, name_utf, value):
        xpath.xmlXPathRegisterVariable(
            self._xpathCtxt, _cstr(name_utf), _wrapXPathObject(value, None, None))

    cdef void _setupDict(self, xpath.xmlXPathContext* xpathCtxt):
        __GLOBAL_PARSER_CONTEXT.initXPathParserDict(xpathCtxt)

# Hash scanner callback used by _XPathContext.registerExsltFunctions():
# if the namespace URI is one of the EXSLT date/sets/math/strings namespaces,
# register the matching EXSLT function set under the given prefix.
# NOTE(review): the void* arguments are assigned to typed pointers without
# visible casts in this copy; confirm against the canonical source.
cdef void _registerExsltFunctionsForNamespaces(
        void* _c_href, void* _ctxt, char* c_prefix):
    cdef char* c_href = _c_href
    cdef xpath.xmlXPathContext* ctxt = _ctxt

    if cstring_h.strcmp(c_href, xslt.EXSLT_DATE_NAMESPACE) == 0:
        xslt.exsltDateXpathCtxtRegister(ctxt, c_prefix)
    elif cstring_h.strcmp(c_href, xslt.EXSLT_SETS_NAMESPACE) == 0:
        xslt.exsltSetsXpathCtxtRegister(ctxt, c_prefix)
    elif cstring_h.strcmp(c_href, xslt.EXSLT_MATH_NAMESPACE) == 0:
        xslt.exsltMathXpathCtxtRegister(ctxt, c_prefix)
    elif cstring_h.strcmp(c_href, xslt.EXSLT_STRINGS_NAMESPACE) == 0:
        xslt.exsltStrXpathCtxtRegister(ctxt, c_prefix)

# Set only for the libxml2 version with integer code 20627; the first
# evaluator that gets initialised then emits a warning and clears the flag.
cdef bint _XPATH_VERSION_WARNING_REQUIRED
if _LIBXML_VERSION_INT == 20627:
    _XPATH_VERSION_WARNING_REQUIRED = 1
else:
    _XPATH_VERSION_WARNING_REQUIRED = 0

cdef class _XPathEvaluatorBase:
    # libxml2 XPath context owned by this evaluator (freed in __dealloc__)
    cdef xpath.xmlXPathContext* _xpathCtxt
    # Python-level context holding namespaces, extensions and variables
    cdef _XPathContext _context
    # lock taken by _lock() / released by _unlock() around each evaluation
    cdef python.PyThread_type_lock _eval_lock
    # error log handed to the _XPathContext; copies are exposed as 'error_log'
    cdef _ErrorLog _error_log
    def __cinit__(self):
        self._xpathCtxt = NULL
        if config.ENABLE_THREADING:
            self._eval_lock = python.PyThread_allocate_lock()
            if self._eval_lock is NULL:
                raise MemoryError()
        self._error_log = _ErrorLog()

    def __init__(self, namespaces, extensions, enable_regexp,
                 smart_strings):
        global _XPATH_VERSION_WARNING_REQUIRED
        if _XPATH_VERSION_WARNING_REQUIRED:
            # warn only once per process
            _XPATH_VERSION_WARNING_REQUIRED = 0
            import warnings
            warnings.warn(u"This version of libxml2 has a known XPath bug. "
                          u"Use it at your own risk.")
        self._context = _XPathContext(namespaces, extensions, self._error_log,
                                      enable_regexp, None, smart_strings)

    property error_log:
        # Returns a copy of the evaluator's error log.
        def __get__(self):
            assert self._error_log is not None, "XPath evaluator not initialised"
            return self._error_log.copy()

    def __dealloc__(self):
        if self._xpathCtxt is not NULL:
            xpath.xmlXPathFreeContext(self._xpathCtxt)
        if config.ENABLE_THREADING:
            if self._eval_lock is not NULL:
                python.PyThread_free_lock(self._eval_lock)

    # Store the libxml2 context and bind the Python-level context to it.
    cdef set_context(self, xpath.xmlXPathContext* xpathCtxt):
        self._xpathCtxt = xpathCtxt
        self._context.set_context(xpathCtxt)

    def evaluate(self, _eval_arg, **_variables):
        u"""evaluate(self, _eval_arg, **_variables)

        Evaluate an XPath expression.

        Instead of calling this method, you can also call the evaluator object
        itself.

        Variables may be provided as keyword arguments.  Note that namespaces
        are currently not supported for variables.

        :deprecated: call the object, not its method.
        """
        return self(_eval_arg, **_variables)

    # Return true if the first character after any leading spaces / tabs
    # is '/'; return false for a NULL path.
    cdef bint _checkAbsolutePath(self, char* path):
        cdef char c
        if path is NULL:
            return 0
        c = path[0]
        while c == c' ' or c == c'\t':
            path = path + 1
            c = path[0]
        return c == c'/'

    # Acquire the evaluation lock (with the GIL released while waiting).
    # Does nothing when threading is disabled or no lock was allocated.
    # Raises XPathError if the lock could not be acquired.
    @cython.final
    cdef int _lock(self) except -1:
        cdef int result
        if config.ENABLE_THREADING and self._eval_lock != NULL:
            with nogil:
                result = python.PyThread_acquire_lock(
                    self._eval_lock, python.WAIT_LOCK)
            if result == 0:
                raise XPathError, u"XPath evaluator locking failed"
        return 0

    # Release the evaluation lock taken by _lock().
    @cython.final
    cdef void _unlock(self):
        if config.ENABLE_THREADING and self._eval_lock != NULL:
            python.PyThread_release_lock(self._eval_lock)

    # Raise XPathSyntaxError, preferring a message built from the syntax
    # error entries of the error log and falling back to a generic message.
    cdef _raise_parse_error(self):
        cdef _BaseErrorLog entries
        entries = self._error_log.filter_types(_XPATH_SYNTAX_ERRORS)
        if entries:
            message = entries._buildExceptionMessage(None)
            if message is not None:
                raise XPathSyntaxError(message, self._error_log)
        raise XPathSyntaxError(self._error_log._buildExceptionMessage(
                u"Error in xpath expression"),
                               self._error_log)

    # Raise XPathEvalError, preferring a message built from evaluation error
    # entries, then syntax error entries, then a generic message.
    cdef _raise_eval_error(self):
        cdef _BaseErrorLog entries
        entries = self._error_log.filter_types(_XPATH_EVAL_ERRORS)
        if not entries:
            entries = self._error_log.filter_types(_XPATH_SYNTAX_ERRORS)
        if entries:
            message = entries._buildExceptionMessage(None)
            if message is not None:
                raise XPathEvalError(message, self._error_log)
        raise XPathEvalError(self._error_log._buildExceptionMessage(
                u"Error in xpath expression"),
                             self._error_log)

    # Convert a libxml2 XPath result into a Python object.
    #
    # The result object is always freed and the temporary references of the
    # context are always released, whichever path is taken:
    # - an exception stored in the context is re-raised,
    # - a NULL result raises XPathEvalError,
    # - otherwise the unwrapped result is returned.
    cdef object _handle_result(self, xpath.xmlXPathObject* xpathObj, _Document doc):
        if self._context._exc._has_raised():
            if xpathObj is not NULL:
                _freeXPathObject(xpathObj)
                xpathObj = NULL
            self._context._release_temp_refs()
            self._context._exc._raise_if_stored()

        if xpathObj is NULL:
            self._context._release_temp_refs()
            self._raise_eval_error()

        try:
            result = _unwrapXPathObject(xpathObj, doc, self._context)
        finally:
            _freeXPathObject(xpathObj)
            self._context._release_temp_refs()

        return result


cdef class XPathElementEvaluator(_XPathEvaluatorBase):
    u"""XPathElementEvaluator(self, element, namespaces=None, extensions=None, regexp=True, smart_strings=True)
    Create an XPath evaluator for an element.

    Absolute XPath expressions (starting with '/') will be evaluated against
    the ElementTree as returned by getroottree().

    Additional namespace declarations can be passed with the
    'namespaces' keyword argument.  EXSLT regular expression support
    can be disabled with the 'regexp' boolean keyword (defaults to
    True).  Smart strings will be returned for string results unless
    you pass ``smart_strings=False``.
    """
    # context element that expressions are evaluated against
    cdef _Element _element
    def __init__(self, _Element element not None, *, namespaces=None,
                 extensions=None, regexp=True, smart_strings=True):
        cdef xpath.xmlXPathContext* xpathCtxt
        cdef _Document doc
        _assertValidNode(element)
        _assertValidDoc(element._doc)
        self._element = element
        doc = element._doc
        _XPathEvaluatorBase.__init__(self, namespaces, extensions,
                                     regexp, smart_strings)
        xpathCtxt = xpath.xmlXPathNewContext(doc._c_doc)
        if xpathCtxt is NULL:
            raise MemoryError()
        self.set_context(xpathCtxt)

    def register_namespace(self, prefix, uri):
        u"""Register a namespace with the XPath context.
        """
        assert self._xpathCtxt is not NULL, "XPath context not initialised"
        self._context.addNamespace(prefix, uri)

    def register_namespaces(self, namespaces):
        u"""Register a prefix -> uri dict.
        """
        assert self._xpathCtxt is not NULL, "XPath context not initialised"
        for prefix, uri in namespaces.items():
            self._context.addNamespace(prefix, uri)

    def __call__(self, _path, **_variables):
        u"""__call__(self, _path, **_variables)

        Evaluate an XPath expression on the document.

        Variables may be provided as keyword arguments.  Note that namespaces
        are currently not supported for variables.

        Absolute XPath expressions (starting with '/') will be evaluated
        against the ElementTree as returned by getroottree().
        """
        cdef xpath.xmlXPathObject* xpathObj
        cdef _Document doc
        cdef char* c_path
        assert self._xpathCtxt is not NULL, "XPath context not initialised"
        path = _utf8(_path)
        doc = self._element._doc

        self._lock()
        self._xpathCtxt.node = self._element._c_node
        try:
            self._context.register_context(doc)
            self._context.registerVariables(_variables)
            c_path = _cstr(path)
            # the actual evaluation runs with the GIL released
            with nogil:
                xpathObj = xpath.xmlXPathEvalExpression(
                    c_path, self._xpathCtxt)
            result = self._handle_result(xpathObj, doc)
        finally:
            # always undo the registration before giving up the lock
            self._context.unregister_context()
            self._unlock()

        return result


cdef class XPathDocumentEvaluator(XPathElementEvaluator):
    u"""XPathDocumentEvaluator(self, etree, namespaces=None, extensions=None, regexp=True, smart_strings=True)
    Create an XPath evaluator for an ElementTree.

    Additional namespace declarations can be passed with the
    'namespaces' keyword argument.  EXSLT regular expression support
    can be disabled with the 'regexp' boolean keyword (defaults to
    True).  Smart strings will be returned for string results unless
    you pass ``smart_strings=False``.
    """
    def __init__(self, _ElementTree etree not None, *, namespaces=None,
                 extensions=None, regexp=True, smart_strings=True):
        XPathElementEvaluator.__init__(
            self, etree._context_node, namespaces=namespaces,
            extensions=extensions, regexp=regexp,
            smart_strings=smart_strings)

    def __call__(self, _path, **_variables):
        u"""__call__(self, _path, **_variables)

        Evaluate an XPath expression on the document.

        Variables may be provided as keyword arguments.  Note that namespaces
        are currently not supported for variables.
        """
        cdef xpath.xmlXPathObject* xpathObj
        cdef xmlDoc* c_doc
        cdef _Document doc
        cdef char* c_path
        assert self._xpathCtxt is not NULL, "XPath context not initialised"
        path = _utf8(_path)
        doc = self._element._doc

        self._lock()
        try:
            self._context.register_context(doc)
            # evaluate against a temporary document obtained from
            # _fakeRootDoc() for the context node; it is destroyed again
            # in the inner 'finally' block
            c_doc = _fakeRootDoc(doc._c_doc, self._element._c_node)
            try:
                self._context.registerVariables(_variables)
                c_path = _cstr(path)
                with nogil:
                    self._xpathCtxt.doc = c_doc
                    self._xpathCtxt.node = tree.xmlDocGetRootElement(c_doc)
                    xpathObj = xpath.xmlXPathEvalExpression(
                        c_path, self._xpathCtxt)
                result = self._handle_result(xpathObj, doc)
            finally:
                _destroyFakeDoc(doc._c_doc, c_doc)
                self._context.unregister_context()
        finally:
            self._unlock()

        return result


def XPathEvaluator(etree_or_element, *, namespaces=None, extensions=None,
                   regexp=True, smart_strings=True):
    u"""XPathEvaluator(etree_or_element, namespaces=None, extensions=None, regexp=True, smart_strings=True)

    Creates an XPath evaluator for an ElementTree or an Element.

    The resulting object can be called with an XPath expression as argument
    and XPath variables provided as keyword arguments.

    Additional namespace declarations can be passed with the
    'namespaces' keyword argument.  EXSLT regular expression support
    can be disabled with the 'regexp' boolean keyword (defaults to
    True).  Smart strings will be returned for string results unless
    you pass ``smart_strings=False``.
    """
    if isinstance(etree_or_element, _ElementTree):
        return XPathDocumentEvaluator(
            etree_or_element, namespaces=namespaces,
            extensions=extensions, regexp=regexp, smart_strings=smart_strings)
    else:
        return XPathElementEvaluator(
            etree_or_element, namespaces=namespaces,
            extensions=extensions, regexp=regexp, smart_strings=smart_strings)


cdef class XPath(_XPathEvaluatorBase):
    u"""XPath(self, path, namespaces=None, extensions=None, regexp=True, smart_strings=True)
    A compiled XPath expression that can be called on Elements and ElementTrees.

    Besides the XPath expression, you can pass prefix-namespace
    mappings and extension functions to the constructor through the
    keyword arguments ``namespaces`` and ``extensions``.  EXSLT
    regular expression support can be disabled with the 'regexp'
    boolean keyword (defaults to True).  Smart strings will be
    returned for string results unless you pass
    ``smart_strings=False``.
    """
    # compiled expression (freed in __dealloc__)
    cdef xpath.xmlXPathCompExpr* _xpath
    # UTF-8 encoded source of the expression
    cdef bytes _path
    def __cinit__(self):
        self._xpath = NULL

    def __init__(self, path, *, namespaces=None, extensions=None,
                 regexp=True, smart_strings=True):
        cdef xpath.xmlXPathContext* xpathCtxt
        _XPathEvaluatorBase.__init__(self, namespaces, extensions,
                                     regexp, smart_strings)
        self._path = _utf8(path)
        xpathCtxt = xpath.xmlXPathNewContext(NULL)
        if xpathCtxt is NULL:
            raise MemoryError()
        self.set_context(xpathCtxt)
        self._xpath = xpath.xmlXPathCtxtCompile(xpathCtxt, _cstr(self._path))
        if self._xpath is NULL:
            # compilation failed => XPathSyntaxError
            self._raise_parse_error()

    def __call__(self, _etree_or_element, **_variables):
        u"__call__(self, _etree_or_element, **_variables)"
        cdef xpath.xmlXPathObject* xpathObj
        cdef _Document document
        cdef _Element element

        assert self._xpathCtxt is not NULL, "XPath context not initialised"
        document = _documentOrRaise(_etree_or_element)
        element = _rootNodeOrRaise(_etree_or_element)

        self._lock()
        self._xpathCtxt.doc = document._c_doc
        self._xpathCtxt.node = element._c_node

        try:
            self._context.register_context(document)
            self._context.registerVariables(_variables)
            # the actual evaluation runs with the GIL released
            with nogil:
                xpathObj = xpath.xmlXPathCompiledEval(
                    self._xpath, self._xpathCtxt)
            result = self._handle_result(xpathObj, document)
        finally:
            # always undo the registration before giving up the lock
            self._context.unregister_context()
            self._unlock()
        return result

    property path:
        u"""The literal XPath expression.
        """
        def __get__(self):
            return self._path.decode(u'UTF-8')

    def __dealloc__(self):
        if self._xpath is not NULL:
            xpath.xmlXPathFreeCompExpr(self._xpath)

    def __repr__(self):
        return self.path


cdef object _replace_strings
cdef object _find_namespaces
cdef object _split_strings
# removes (substitutes) quoted XPath string literals
_replace_strings = re.compile(b'("[^"]*")|(\'[^\']*\')').sub
# finds all '{uri}' namespace markers
_find_namespaces = re.compile(b'({[^}]+})').findall
# splits a path at quoted string literals; thanks to the single capturing
# group the literals are kept in the result, so even indexes hold the text
# outside of literals and odd indexes hold the literals themselves
_split_strings = re.compile(b'("[^"]*"|\'[^\']*\')').split

cdef class ETXPath(XPath):
    u"""ETXPath(self, path, extensions=None, regexp=True, smart_strings=True)
    Special XPath class that supports the ElementTree {uri} notation for namespaces.

    Note that this class does not accept the ``namespaces`` keyword
    argument. All namespaces must be passed as part of the path
    string.  Smart strings will be returned for string results unless
    you pass ``smart_strings=False``.
    """
    def __init__(self, path, *, extensions=None, regexp=True,
                 smart_strings=True):
        path, namespaces = self._nsextract_path(path)
        XPath.__init__(self, path, namespaces=namespaces,
                       extensions=extensions, regexp=regexp,
                       smart_strings=smart_strings)

    # Replace each distinct '{uri}' marker of the path by a generated prefix
    # ('__xpp01:', '__xpp02:', ...) and return the rewritten path together
    # with the prefix -> uri mapping (both as unicode strings).
    #
    # Markers are only looked up and replaced outside of quoted string
    # literals, so a literal such as "{uri}text" keeps its value.
    cdef _nsextract_path(self, path):
        # replace {namespaces} by new prefixes
        cdef dict namespaces = {}
        cdef list namespace_defs = []
        cdef list chunks
        cdef Py_ssize_t pos
        cdef int i
        path_utf = _utf8(path)
        stripped_path = _replace_strings(b'', path_utf) # remove string literals
        chunks = _split_strings(path_utf)
        i = 1
        for namespace_def in _find_namespaces(stripped_path):
            if namespace_def not in namespace_defs:
                prefix = python.PyBytes_FromFormat("__xpp%02d", i)
                i += 1
                namespace_defs.append(namespace_def)
                namespace = namespace_def[1:-1] # remove '{}'
                namespace = python.PyUnicode_FromEncodedObject(
                    namespace, 'UTF-8', 'strict')
                namespaces[
                    python.PyUnicode_FromEncodedObject(prefix, 'UTF-8', 'strict')
                    ] = namespace
                prefix_str = prefix + b':'
                # only touch the chunks outside of string literals
                for pos in range(0, len(chunks), 2):
                    chunks[pos] = chunks[pos].replace(namespace_def, prefix_str)
        path = python.PyUnicode_FromEncodedObject(
            b''.join(chunks), 'UTF-8', 'strict')
        return path, namespaces