diff options
| author | Nicholas Car <nicholas.car@surroundaustralia.com> | 2020-08-27 13:13:45 +1000 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2020-08-27 13:13:45 +1000 |
| commit | 3afffcd19d3a5d240e83b3a59b53e3ee1120c165 (patch) | |
| tree | 42ba0191f0a8f645cbc5b60aefd8a3cbfc383a8b | |
| parent | 3e42f5eea742563cdeab7d655fe55f7d0e25ea16 (diff) | |
| parent | 94295389204175783c2f369c2826f0ba55a2d42c (diff) | |
| download | rdflib-improve_graph_parse.tar.gz | |
Merge branch 'master' into improve_graph_parse (branch: improve_graph_parse)
| -rw-r--r-- | docs/plugin_parsers.rst | 2 | ||||
| -rw-r--r-- | docs/sphinx-requirements.txt | 2 | ||||
| -rw-r--r-- | rdflib/__init__.py | 6 | ||||
| -rw-r--r-- | rdflib/compare.py | 2 | ||||
| -rw-r--r-- | rdflib/graph.py | 12 | ||||
| -rw-r--r-- | rdflib/parser.py | 104 | ||||
| -rw-r--r-- | rdflib/plugin.py | 10 | ||||
| -rwxr-xr-x | rdflib/plugins/parsers/notation3.py | 102 | ||||
| -rw-r--r-- | rdflib/plugins/parsers/nquads.py | 11 | ||||
| -rw-r--r-- | rdflib/plugins/parsers/nt.py | 33 | ||||
| -rw-r--r-- | rdflib/plugins/parsers/ntriples.py | 122 | ||||
| -rw-r--r-- | rdflib/plugins/parsers/trig.py | 12 | ||||
| -rw-r--r-- | test/test_nt_misc.py | 62 |
13 files changed, 294 insertions, 186 deletions
diff --git a/docs/plugin_parsers.rst b/docs/plugin_parsers.rst index e114958d..81ab7ae6 100644 --- a/docs/plugin_parsers.rst +++ b/docs/plugin_parsers.rst @@ -26,7 +26,7 @@ mdata :class:`~rdflib.plugins.parsers.structureddata.MicrodataParser` microdata :class:`~rdflib.plugins.parsers.structureddata.MicrodataParser` n3 :class:`~rdflib.plugins.parsers.notation3.N3Parser` nquads :class:`~rdflib.plugins.parsers.nquads.NQuadsParser` -nt :class:`~rdflib.plugins.parsers.nt.NTParser` +nt :class:`~rdflib.plugins.parsers.ntriples.NTParser` rdfa :class:`~rdflib.plugins.parsers.structureddata.RDFaParser` rdfa1.0 :class:`~rdflib.plugins.parsers.structureddata.RDFa10Parser` rdfa1.1 :class:`~rdflib.plugins.parsers.structureddata.RDFaParser` diff --git a/docs/sphinx-requirements.txt b/docs/sphinx-requirements.txt index c6dfb112..175ef14e 100644 --- a/docs/sphinx-requirements.txt +++ b/docs/sphinx-requirements.txt @@ -1,3 +1,3 @@ -sphinx==3.2.0 +sphinx==3.2.1 sphinxcontrib-apidoc git+https://github.com/gniezen/n3pygments.git diff --git a/rdflib/__init__.py b/rdflib/__init__.py index bce8204f..06b1c2eb 100644 --- a/rdflib/__init__.py +++ b/rdflib/__init__.py @@ -92,7 +92,11 @@ _interactive_mode = False try: import __main__ - if not hasattr(__main__, "__file__") and sys.stdout is not None and sys.stderr.isatty(): + if ( + not hasattr(__main__, "__file__") + and sys.stdout is not None + and sys.stderr.isatty() + ): # show log messages in interactive mode _interactive_mode = True logger.setLevel(logging.INFO) diff --git a/rdflib/compare.py b/rdflib/compare.py index 839cfbb0..c1665b66 100644 --- a/rdflib/compare.py +++ b/rdflib/compare.py @@ -335,7 +335,7 @@ class _TripleCanonicalizer(object): coloring.extend(colors) try: si = sequence.index(c) - sequence = sequence[:si] + colors + sequence[si + 1:] + sequence = sequence[:si] + colors + sequence[si + 1 :] except ValueError: sequence = colors[1:] + sequence combined_colors = [] diff --git a/rdflib/graph.py b/rdflib/graph.py index 
786f193c..49ebcda2 100644 --- a/rdflib/graph.py +++ b/rdflib/graph.py @@ -779,13 +779,17 @@ class Graph(Node): # setup the language filtering if lang is not None: if lang == "": # we only want not language-tagged literals + def langfilter(l_): return l_.language is None + else: + def langfilter(l_): return l_.language == lang else: # we don't care about language tags + def langfilter(l_): return True @@ -1079,9 +1083,11 @@ class Graph(Node): format = source.content_type could_not_guess_format = False if format is None: - if (hasattr(source, "file") - and getattr(source.file, "name", None) - and isinstance(source.file.name, str)): + if ( + hasattr(source, "file") + and getattr(source.file, "name", None) + and isinstance(source.file.name, str) + ): format = rdflib.util.guess_format(source.file.name) if format is None: format = "turtle" diff --git a/rdflib/parser.py b/rdflib/parser.py index 9e501c03..73ce2ba7 100644 --- a/rdflib/parser.py +++ b/rdflib/parser.py @@ -9,15 +9,11 @@ can plugin to rdflib. If you are wanting to invoke a parser you likely want to do so through the Graph class parse method. 
""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - +import codecs import os import sys -from io import BytesIO - +from io import BytesIO, TextIOBase, TextIOWrapper, StringIO, BufferedIOBase from urllib.request import pathname2url from urllib.request import Request @@ -41,6 +37,8 @@ __all__ = [ class Parser(object): + __slots__ = set() + def __init__(self): pass @@ -48,6 +46,37 @@ class Parser(object): pass +class BytesIOWrapper(BufferedIOBase): + __slots__ = ("wrapped", "encoded", "encoding") + + def __init__(self, wrapped: str, encoding="utf-8"): + super(BytesIOWrapper, self).__init__() + self.wrapped = wrapped + self.encoding = encoding + self.encoded = None + + def read(self, *args, **kwargs): + if self.encoded is None: + b, blen = codecs.getencoder(self.encoding)(self.wrapped) + self.encoded = BytesIO(b) + return self.encoded.read(*args, **kwargs) + + def read1(self, *args, **kwargs): + if self.encoded is None: + b = codecs.getencoder(self.encoding)(self.wrapped) + self.encoded = BytesIO(b) + return self.encoded.read1(*args, **kwargs) + + def readinto(self, *args, **kwargs): + raise NotImplementedError() + + def readinto1(self, *args, **kwargs): + raise NotImplementedError() + + def write(self, *args, **kwargs): + raise NotImplementedError() + + class InputSource(xmlreader.InputSource, object): """ TODO: @@ -59,23 +88,39 @@ class InputSource(xmlreader.InputSource, object): self.auto_close = False # see Graph.parse(), true if opened by us def close(self): + c = self.getCharacterStream() + if c and hasattr(c, "close"): + try: + c.close() + except Exception: + pass f = self.getByteStream() if f and hasattr(f, "close"): - f.close() + try: + f.close() + except Exception: + pass class StringInputSource(InputSource): """ - TODO: + Constructs an RDFLib Parser InputSource from a Python String or Bytes """ - def __init__(self, value, system_id=None): + def __init__(self, value, encoding="utf-8", 
system_id=None): super(StringInputSource, self).__init__(system_id) - stream = BytesIO(value) - self.setByteStream(stream) - # TODO: - # encoding = value.encoding - # self.setEncoding(encoding) + if isinstance(value, str): + stream = StringIO(value) + self.setCharacterStream(stream) + self.setEncoding(encoding) + b_stream = BytesIOWrapper(value, encoding) + self.setByteStream(b_stream) + else: + stream = BytesIO(value) + self.setByteStream(stream) + c_stream = TextIOWrapper(stream, encoding) + self.setCharacterStream(c_stream) + self.setEncoding(c_stream.encoding) headers = { @@ -134,8 +179,18 @@ class FileInputSource(InputSource): system_id = URIRef(urljoin("file:", pathname2url(file.name)), base=base) super(FileInputSource, self).__init__(system_id) self.file = file - self.setByteStream(file) - # TODO: self.setEncoding(encoding) + if isinstance(file, TextIOBase): # Python3 unicode fp + self.setCharacterStream(file) + self.setEncoding(file.encoding) + try: + b = file.buffer + self.setByteStream(b) + except (AttributeError, LookupError): + self.setByteStream(file) + else: + self.setByteStream(file) + # We cannot set characterStream here because + # we do not know the Raw Bytes File encoding. 
def __repr__(self): return repr(self.file) @@ -171,10 +226,21 @@ def create_input_source( else: if isinstance(source, str): location = source + elif isinstance(source, bytes): + data = source elif hasattr(source, "read") and not isinstance(source, Namespace): f = source input_source = InputSource() - input_source.setByteStream(f) + if hasattr(source, "encoding"): + input_source.setCharacterStream(source) + input_source.setEncoding(source.encoding) + try: + b = file.buffer + input_source.setByteStream(b) + except (AttributeError, LookupError): + input_source.setByteStream(source) + else: + input_source.setByteStream(f) if f is sys.stdin: input_source.setSystemId("file:///dev/stdin") elif hasattr(f, "name"): @@ -206,8 +272,8 @@ def create_input_source( input_source = FileInputSource(file) if data is not None: - if isinstance(data, str): - data = data.encode("utf-8") + if not isinstance(data, (str, bytes, bytearray)): + raise RuntimeError("parse data can only str, or bytes.") input_source = StringInputSource(data) auto_close = True diff --git a/rdflib/plugin.py b/rdflib/plugin.py index cc5b6d35..baa2fb5e 100644 --- a/rdflib/plugin.py +++ b/rdflib/plugin.py @@ -11,7 +11,7 @@ following to your setup:: entry_points = { 'rdf.plugins.parser': [ - 'nt = rdf.plugins.parsers.nt:NTParser', + 'nt = rdf.plugins.parsers.ntriples:NTParser', ], 'rdf.plugins.serializer': [ 'nt = rdf.plugins.serializers.NTSerializer:NTSerializer', @@ -185,10 +185,10 @@ register("n3", Parser, "rdflib.plugins.parsers.notation3", "N3Parser") register("text/turtle", Parser, "rdflib.plugins.parsers.notation3", "TurtleParser") register("turtle", Parser, "rdflib.plugins.parsers.notation3", "TurtleParser") register("ttl", Parser, "rdflib.plugins.parsers.notation3", "TurtleParser") -register("application/n-triples", Parser, "rdflib.plugins.parsers.nt", "NTParser") -register("ntriples", Parser, "rdflib.plugins.parsers.nt", "NTParser") -register("nt", Parser, "rdflib.plugins.parsers.nt", "NTParser") 
-register("nt11", Parser, "rdflib.plugins.parsers.nt", "NTParser") +register("application/n-triples", Parser, "rdflib.plugins.parsers.ntriples", "NTParser") +register("ntriples", Parser, "rdflib.plugins.parsers.ntriples", "NTParser") +register("nt", Parser, "rdflib.plugins.parsers.ntriples", "NTParser") +register("nt11", Parser, "rdflib.plugins.parsers.ntriples", "NTParser") register("application/n-quads", Parser, "rdflib.plugins.parsers.nquads", "NQuadsParser") register("nquads", Parser, "rdflib.plugins.parsers.nquads", "NQuadsParser") register("application/trix", Parser, "rdflib.plugins.parsers.trix", "TriXParser") diff --git a/rdflib/plugins/parsers/notation3.py b/rdflib/plugins/parsers/notation3.py index c427f153..d866977d 100755 --- a/rdflib/plugins/parsers/notation3.py +++ b/rdflib/plugins/parsers/notation3.py @@ -139,10 +139,13 @@ def join(here, there): return here + frag # join('mid:foo@example', '../foo') bzzt - if here[bcolonl + 1: bcolonl + 2] != "/": - raise ValueError("Base <%s> has no slash after " "colon - with relative '%s'." % (here, there)) + if here[bcolonl + 1 : bcolonl + 2] != "/": + raise ValueError( + "Base <%s> has no slash after " + "colon - with relative '%s'." 
% (here, there) + ) - if here[bcolonl + 1: bcolonl + 3] == "//": + if here[bcolonl + 1 : bcolonl + 3] == "//": bpath = here.find("/", bcolonl + 3) else: bpath = bcolonl + 1 @@ -502,14 +505,14 @@ class SinkParser: """ assert tok[0] not in _notNameChars # not for punctuation - if argstr[i: i + 1] == "@": + if argstr[i : i + 1] == "@": i = i + 1 else: if tok not in self.keywords: return -1 # No, this has neither keywords declaration nor "@" if ( - argstr[i: i + len(tok)] == tok + argstr[i : i + len(tok)] == tok and (argstr[i + len(tok)] in _notKeywordsChars) or (colon and argstr[i + len(tok)] == ":") ): @@ -526,7 +529,7 @@ class SinkParser: assert tok[0] not in _notNameChars # not for punctuation - if argstr[i: i + len(tok)].lower() == tok.lower() and ( + if argstr[i : i + len(tok)].lower() == tok.lower() and ( argstr[i + len(tok)] in _notQNameChars ): i = i + len(tok) @@ -794,23 +797,23 @@ class SinkParser: res.append(("->", RDF_type)) return j - if argstr[i: i + 2] == "<=": + if argstr[i : i + 2] == "<=": if self.turtle: self.BadSyntax(argstr, i, "Found '<=' in Turtle mode. ") res.append(("<-", self._store.newSymbol(Logic_NS + "implies"))) return i + 2 - if argstr[i: i + 1] == "=": + if argstr[i : i + 1] == "=": if self.turtle: self.BadSyntax(argstr, i, "Found '=' in Turtle mode") - if argstr[i + 1: i + 2] == ">": + if argstr[i + 1 : i + 2] == ">": res.append(("->", self._store.newSymbol(Logic_NS + "implies"))) return i + 2 res.append(("->", DAML_sameAs)) return i + 1 - if argstr[i: i + 2] == ":=": + if argstr[i : i + 2] == ":=": if self.turtle: self.BadSyntax(argstr, i, "Found ':=' in Turtle mode") @@ -823,7 +826,7 @@ class SinkParser: res.append(("->", r[0])) return j - if argstr[i: i + 2] == ">-" or argstr[i: i + 2] == "<-": + if argstr[i : i + 2] == ">-" or argstr[i : i + 2] == "<-": self.BadSyntax(argstr, j, ">- ... 
-> syntax is obsolete.") return -1 @@ -844,8 +847,8 @@ class SinkParser: if j < 0: return j # nope - while argstr[j: j + 1] in "!^": # no spaces, must follow exactly (?) - ch = argstr[j: j + 1] + while argstr[j : j + 1] in "!^": # no spaces, must follow exactly (?) + ch = argstr[j : j + 1] subj = res.pop() obj = self.blankNode(uri=self.here(j)) j = self.node(argstr, j + 1, res) @@ -879,7 +882,7 @@ class SinkParser: if j < 0: return j # eof i = j - ch = argstr[i: i + 1] # Quick 1-character checks first: + ch = argstr[i : i + 1] # Quick 1-character checks first: if ch == "[": bnodeID = self.here(i) @@ -887,7 +890,7 @@ class SinkParser: if j < 0: self.BadSyntax(argstr, i, "EOF after '['") # Hack for "is" binding name to anon node - if argstr[j: j + 1] == "=": + if argstr[j : j + 1] == "=": if self.turtle: self.BadSyntax( argstr, j, "Found '[=' or '[ =' when in turtle mode." @@ -905,7 +908,7 @@ class SinkParser: self.BadSyntax( argstr, i, "EOF when objectList expected after [ = " ) - if argstr[j: j + 1] == ";": + if argstr[j : j + 1] == ";": j = j + 1 else: self.BadSyntax(argstr, i, "objectList expected after [= ") @@ -922,7 +925,7 @@ class SinkParser: self.BadSyntax( argstr, i, "EOF when ']' expected after [ <propertyList>" ) - if argstr[j: j + 1] != "]": + if argstr[j : j + 1] != "]": self.BadSyntax(argstr, j, "']' expected") res.append(subj) return j + 1 @@ -931,7 +934,7 @@ class SinkParser: # if self.turtle: # self.BadSyntax(argstr, i, # "found '{' while in Turtle mode, Formulas not supported!") - ch2 = argstr[i + 1: i + 2] + ch2 = argstr[i + 1 : i + 2] if ch2 == "$": # a set i += 1 @@ -942,12 +945,12 @@ class SinkParser: i = self.skipSpace(argstr, j) if i < 0: self.BadSyntax(argstr, i, "needed '$}', found end.") - if argstr[i: i + 2] == "$}": + if argstr[i : i + 2] == "$}": j = i + 2 break if not first_run: - if argstr[i: i + 1] == ",": + if argstr[i : i + 1] == ",": i += 1 else: self.BadSyntax(argstr, i, "expected: ','") @@ -982,7 +985,7 @@ class SinkParser: if i 
< 0: self.BadSyntax(argstr, i, "needed '}', found end.") - if argstr[i: i + 1] == "}": + if argstr[i : i + 1] == "}": j = i + 1 break @@ -1001,7 +1004,7 @@ class SinkParser: if ch == "(": thing_type = self._store.newList - ch2 = argstr[i + 1: i + 2] + ch2 = argstr[i + 1 : i + 2] if ch2 == "$": thing_type = self._store.newSet i += 1 @@ -1012,7 +1015,7 @@ class SinkParser: i = self.skipSpace(argstr, j) if i < 0: self.BadSyntax(argstr, i, "needed ')', found end.") - if argstr[i: i + 1] == ")": + if argstr[i : i + 1] == ")": j = i + 1 break @@ -1065,7 +1068,7 @@ class SinkParser: break i = j + 1 - if argstr[j: j + 2] == ":-": + if argstr[j : j + 2] == ":-": if self.turtle: self.BadSyntax(argstr, j, "Found in ':-' in Turtle mode") i = j + 2 @@ -1095,7 +1098,7 @@ class SinkParser: j = self.skipSpace(argstr, i) if j < 0: self.BadSyntax(argstr, j, "EOF found in list of objects") - if argstr[i: i + 1] != ";": + if argstr[i : i + 1] != ";": return i i = i + 1 # skip semicolon and continue @@ -1116,7 +1119,7 @@ class SinkParser: j = self.skipSpace(argstr, i) if j < 0: return j # eof - ch = argstr[j: j + 1] + ch = argstr[j : j + 1] if ch != ",": if ch != ".": return -1 @@ -1133,7 +1136,7 @@ class SinkParser: j = self.skipSpace(argstr, i) if j < 0: self.BadSyntax(argstr, j, "EOF found after object") - if argstr[j: j + 1] != ",": + if argstr[j : j + 1] != ",": return j # Found something else! i = self.object(argstr, j + 1, res) if i < 0: @@ -1143,11 +1146,11 @@ class SinkParser: j = self.skipSpace(argstr, i) if j < 0: return j # eof - if argstr[j: j + 1] == ".": + if argstr[j : j + 1] == ".": return j + 1 # skip - if argstr[j: j + 1] == "}": + if argstr[j : j + 1] == "}": return j # don't skip it - if argstr[j: j + 1] == "]": + if argstr[j : j + 1] == "]": return j self.BadSyntax(argstr, j, "expected '.' 
or '}' or ']' at end of statement") @@ -1212,7 +1215,7 @@ class SinkParser: assert ( ":" in uref ), "With no base URI, cannot deal with relative URIs" - if argstr[i - 1: i] == "#" and not uref[-1:] == "#": + if argstr[i - 1 : i] == "#" and not uref[-1:] == "#": uref = uref + "#" # She meant it! Weirdness in urlparse? symb = self._store.newSymbol(uref) if symb in self._variables: @@ -1261,7 +1264,7 @@ class SinkParser: if j < 0: return -1 - if argstr[j: j + 1] != "?": + if argstr[j : j + 1] != "?": return -1 j = j + 1 i = j @@ -1419,7 +1422,7 @@ class SinkParser: i = j if argstr[i] in self.string_delimiters: - if argstr[i: i + 3] == argstr[i] * 3: + if argstr[i : i + 3] == argstr[i] * 3: delim = argstr[i] * 3 else: delim = argstr[i] @@ -1467,7 +1470,7 @@ class SinkParser: # return -1 ## or fall through? if argstr[i] in self.string_delimiters: - if argstr[i: i + 3] == argstr[i] * 3: + if argstr[i : i + 3] == argstr[i] * 3: delim = argstr[i] * 3 else: delim = argstr[i] @@ -1476,7 +1479,7 @@ class SinkParser: dt = None j, s = self.strconst(argstr, i, delim) lang = None - if argstr[j: j + 1] == "@": # Language? + if argstr[j : j + 1] == "@": # Language? m = langcode.match(argstr, j + 1) if m is None: raise BadSyntax( @@ -1487,9 +1490,9 @@ class SinkParser: "Bad language code syntax on string " + "literal, after @", ) i = m.end() - lang = argstr[j + 1: i] + lang = argstr[j + 1 : i] j = i - if argstr[j: j + 2] == "^^": + if argstr[j : j + 2] == "^^": res2 = [] j = self.uri_ref2(argstr, j + 2, res2) # Read datatype URI dt = res2[0] @@ -1522,15 +1525,15 @@ class SinkParser: if ( delim == delim3 ): # done when delim is """ or ''' and, respectively ... - if argstr[j: j + 5] == delim5: # ... we have "" or '' before + if argstr[j : j + 5] == delim5: # ... we have "" or '' before i = j + 5 ustr = ustr + delim2 return i, ustr - if argstr[j: j + 4] == delim4: # ... we have " or ' before + if argstr[j : j + 4] == delim4: # ... 
we have " or ' before i = j + 4 ustr = ustr + delim1 return i, ustr - if argstr[j: j + 3] == delim3: # current " or ' is part of delim + if argstr[j : j + 3] == delim3: # current " or ' is part of delim i = j + 3 return i, ustr @@ -1542,8 +1545,8 @@ class SinkParser: m = interesting.search(argstr, j) # was argstr[j:]. # Note for pos param to work, MUST be compiled ... re bug? assert m, "Quote expected in string at ^ in %s^%s" % ( - argstr[j - 20: j], - argstr[j: j + 20], + argstr[j - 20 : j], + argstr[j : j + 20], ) # at least need a quote i = m.start() @@ -1589,7 +1592,7 @@ class SinkParser: elif ch == "\\": j = i + 1 - ch = argstr[j: j + 1] # Will be empty if string ends + ch = argstr[j : j + 1] # Will be empty if string ends if not ch: raise BadSyntax( self._thisDoc, @@ -1620,14 +1623,14 @@ class SinkParser: self._thisDoc, startline, argstr, i, "unterminated string literal(3)" ) try: - return i + n, reg.sub(unicodeExpand, "\\" + prefix + argstr[i: i + n]) + return i + n, reg.sub(unicodeExpand, "\\" + prefix + argstr[i : i + n]) except: raise BadSyntax( self._thisDoc, startline, argstr, i, - "bad string literal hex escape: " + argstr[i: i + n], + "bad string literal hex escape: " + argstr[i : i + n], ) def uEscape(self, argstr, i, startline): @@ -1672,7 +1675,7 @@ class BadSyntax(SyntaxError): self._why, pre, argstr[st:i], - argstr[i: i + 60], + argstr[i : i + 60], post, ) @@ -1896,8 +1899,11 @@ class TurtleParser(Parser): baseURI = graph.absolutize(source.getPublicId() or source.getSystemId() or "") p = SinkParser(sink, baseURI=baseURI, turtle=turtle) - - p.loadStream(source.getByteStream()) + # N3 parser prefers str stream + stream = source.getCharacterStream() + if not stream: + stream = source.getByteStream() + p.loadStream(stream) for prefix, namespace in p._bindings.items(): graph.bind(prefix, namespace) diff --git a/rdflib/plugins/parsers/nquads.py b/rdflib/plugins/parsers/nquads.py index a3bfbc6e..2a3a9136 100644 --- a/rdflib/plugins/parsers/nquads.py +++ 
b/rdflib/plugins/parsers/nquads.py @@ -31,7 +31,7 @@ from codecs import getreader from rdflib import ConjunctiveGraph # Build up from the NTriples parser: -from rdflib.plugins.parsers.ntriples import NTriplesParser +from rdflib.plugins.parsers.ntriples import W3CNTriplesParser from rdflib.plugins.parsers.ntriples import ParseError from rdflib.plugins.parsers.ntriples import r_tail from rdflib.plugins.parsers.ntriples import r_wspace @@ -39,7 +39,7 @@ from rdflib.plugins.parsers.ntriples import r_wspace __all__ = ["NQuadsParser"] -class NQuadsParser(NTriplesParser): +class NQuadsParser(W3CNTriplesParser): def parse(self, inputsource, sink, bnode_context=None, **kwargs): """ Parse inputsource as an N-Quads file. @@ -57,13 +57,14 @@ class NQuadsParser(NTriplesParser): ) self.sink = ConjunctiveGraph(store=sink.store, identifier=sink.identifier) - source = inputsource.getByteStream() + source = inputsource.getCharacterStream() + if not source: + source = inputsource.getByteStream() + source = getreader("utf-8")(source) if not hasattr(source, "read"): raise ParseError("Item to parse must be a file-like object.") - source = getreader("utf-8")(source) - self.file = source self.buffer = "" while True: diff --git a/rdflib/plugins/parsers/nt.py b/rdflib/plugins/parsers/nt.py deleted file mode 100644 index c37a1aa0..00000000 --- a/rdflib/plugins/parsers/nt.py +++ /dev/null @@ -1,33 +0,0 @@ -from rdflib.parser import Parser -from rdflib.plugins.parsers.ntriples import NTriplesParser - -__all__ = ["NTSink", "NTParser"] - - -class NTSink(object): - def __init__(self, graph): - self.graph = graph - - def triple(self, s, p, o): - self.graph.add((s, p, o)) - - -class NTParser(Parser): - """parser for the ntriples format, often stored with the .nt extension - - See http://www.w3.org/TR/rdf-testcases/#ntriples""" - - def parse(self, source, sink, **kwargs): - ''' - Parse the NT format - - :type source: `rdflib.parser.InputSource` - :param source: the source of NT-formatted data - 
:type sink: `rdflib.graph.Graph` - :param sink: where to send parsed triples - :param kwargs: Additional arguments to pass to `.NTriplesParser.parse` - ''' - f = source.getByteStream() # TODO getCharacterStream? - parser = NTriplesParser(NTSink(sink)) - parser.parse(f, **kwargs) - f.close() diff --git a/rdflib/plugins/parsers/ntriples.py b/rdflib/plugins/parsers/ntriples.py index 33a4a4e6..d43a240c 100644 --- a/rdflib/plugins/parsers/ntriples.py +++ b/rdflib/plugins/parsers/ntriples.py @@ -1,9 +1,6 @@ -#!/usr/bin/env python -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +#!/usr/bin/env python3 -__doc__ = """ +__doc__ = """\ N-Triples Parser License: GPL 2, W3C, BSD, or MIT Author: Sean B. Palmer, inamidst.com @@ -15,14 +12,13 @@ import codecs from rdflib.term import URIRef as URI from rdflib.term import BNode as bNode from rdflib.term import Literal - - -from rdflib.compat import cast_bytes from rdflib.compat import decodeUnicodeEscape +from rdflib.exceptions import ParserError as ParseError +from rdflib.parser import Parser -from io import BytesIO +from io import StringIO, TextIOBase, BytesIO -__all__ = ["unquote", "uriquote", "Sink", "NTriplesParser"] +__all__ = ["unquote", "uriquote", "W3CNTriplesParser", "NTGraphSink", "NTParser"] uriref = r'<([^:]+:[^\s"<>]*)>' literal = r'"([^"\\]*(?:\\.[^"\\]*)*)"' @@ -40,15 +36,7 @@ bufsiz = 2048 validate = False -class Node(str): - pass - - -class ParseError(Exception): - pass - - -class Sink(object): +class DummySink(object): def __init__(self): self.length = 0 @@ -78,7 +66,7 @@ def unquote(s): while s: m = r_safe.match(s) if m: - s = s[m.end():] + s = s[m.end() :] result.append(m.group(1)) continue @@ -90,7 +78,7 @@ def unquote(s): m = r_uniquot.match(s) if m: - s = s[m.end():] + s = s[m.end() :] u, U = m.groups() codepoint = int(u or U, 16) if codepoint > 0x10FFFF: @@ -113,11 +101,10 @@ def uriquote(uri): return r_hibyte.sub(lambda m: "%%%02X" % 
ord(m.group(1)), uri) -class NTriplesParser(object): +class W3CNTriplesParser(object): """An N-Triples Parser. - + This is a legacy-style Triples parser for NTriples provided by W3C Usage:: - p = NTriplesParser(sink=MySink()) sink = p.parse(f) # file; use parsestring for a string @@ -127,6 +114,8 @@ class NTriplesParser(object): `NTriplesParser`. """ + __slots__ = ("_bnode_ids", "sink", "buffer", "file", "line") + def __init__(self, sink=None, bnode_context=None): if bnode_context is not None: self._bnode_ids = bnode_context @@ -136,7 +125,11 @@ class NTriplesParser(object): if sink is not None: self.sink = sink else: - self.sink = Sink() + self.sink = DummySink() + + self.buffer = None + self.file = None + self.line = "" def parse(self, f, bnode_context=None): """ @@ -150,10 +143,13 @@ class NTriplesParser(object): passed in to define a distinct context for a given call to `parse`. """ + if not hasattr(f, "read"): raise ParseError("Item to parse must be a file-like object.") - # since N-Triples 1.1 files can and should be utf-8 encoded - f = codecs.getreader("utf-8")(f) + + if not hasattr(f, "encoding") and not hasattr(f, "charbuffer"): + # someone still using a bytestream here? 
+ f = codecs.getreader("utf-8")(f) self.file = f self.buffer = "" @@ -164,16 +160,17 @@ class NTriplesParser(object): try: self.parseline(bnode_context=bnode_context) except ParseError: - raise ParseError("Invalid line: %r" % self.line) + raise ParseError("Invalid line: {}".format(self.line)) return self.sink def parsestring(self, s, **kwargs): """Parse s as an N-Triples string.""" - if not isinstance(s, str): + if not isinstance(s, (str, bytes, bytearray)): raise ParseError("Item to parse must be a string instance.") - f = BytesIO() - f.write(cast_bytes(s)) - f.seek(0) + if isinstance(s, (bytes, bytearray)): + f = codecs.getreader("utf-8")(BytesIO(s)) + else: + f = StringIO(s) self.parse(f, **kwargs) def readline(self): @@ -189,7 +186,7 @@ class NTriplesParser(object): while True: m = r_line.match(self.buffer) if m: # the more likely prospect - self.buffer = self.buffer[m.end():] + self.buffer = self.buffer[m.end() :] return m.group(1) else: buffer = self.file.read(bufsiz) @@ -211,12 +208,12 @@ class NTriplesParser(object): predicate = self.predicate() self.eat(r_wspaces) - object = self.object(bnode_context) + object_ = self.object(bnode_context) self.eat(r_tail) if self.line: - raise ParseError("Trailing garbage") - self.sink.triple(subject, predicate, object) + raise ParseError("Trailing garbage: {}".format(self.line)) + self.sink.triple(subject, predicate, object_) def peek(self, token): return self.line.startswith(token) @@ -227,7 +224,7 @@ class NTriplesParser(object): # print(dir(pattern)) # print repr(self.line), type(self.line) raise ParseError("Failed to eat %s at %s" % (pattern.pattern, self.line)) - self.line = self.line[m.end():] + self.line = self.line[m.end() :] return m def subject(self, bnode_context=None): @@ -295,13 +292,44 @@ class NTriplesParser(object): return False -# # Obsolete, unused -# def parseURI(uri): -# import urllib -# parser = NTriplesParser() -# u = urllib.urlopen(uri) -# sink = parser.parse(u) -# u.close() -# # for triple in 
sink: -# # print triple -# print 'Length of input:', sink.length +class NTGraphSink(object): + __slots__ = ("g",) + + def __init__(self, graph): + self.g = graph + + def triple(self, s, p, o): + self.g.add((s, p, o)) + + +class NTParser(Parser): + """parser for the ntriples format, often stored with the .nt extension + + See http://www.w3.org/TR/rdf-testcases/#ntriples""" + + __slots__ = set() + + @classmethod + def parse(cls, source, sink, **kwargs): + """ + Parse the NT format + + :type source: `rdflib.parser.InputSource` + :param source: the source of NT-formatted data + :type sink: `rdflib.graph.Graph` + :param sink: where to send parsed triples + :param kwargs: Additional arguments to pass to `.NTriplesParser.parse` + """ + f = source.getCharacterStream() + if not f: + b = source.getByteStream() + # TextIOBase includes: StringIO and TextIOWrapper + if isinstance(b, TextIOBase): + # f is not really a ByteStream, but a CharacterStream + f = b + else: + # since N-Triples 1.1 files can and should be utf-8 encoded + f = codecs.getreader("utf-8")(b) + parser = W3CNTriplesParser(NTGraphSink(sink)) + parser.parse(f, **kwargs) + f.close() diff --git a/rdflib/plugins/parsers/trig.py b/rdflib/plugins/parsers/trig.py index 8f270de0..938fb259 100644 --- a/rdflib/plugins/parsers/trig.py +++ b/rdflib/plugins/parsers/trig.py @@ -82,7 +82,7 @@ class TrigSinkParser(SinkParser): if j < 0: self.BadSyntax(argstr, i, "EOF found when expected graph") - if argstr[j: j + 1] == "=": # optional = for legacy support + if argstr[j : j + 1] == "=": # optional = for legacy support i = self.skipSpace(argstr, j + 1) if i < 0: @@ -90,7 +90,7 @@ class TrigSinkParser(SinkParser): else: i = j - if argstr[i: i + 1] != "{": + if argstr[i : i + 1] != "{": return -1 # the node wasn't part of a graph j = i + 1 @@ -106,7 +106,7 @@ class TrigSinkParser(SinkParser): if i < 0: self.BadSyntax(argstr, i, "needed '}', found end.") - if argstr[i: i + 1] == "}": + if argstr[i : i + 1] == "}": j = i + 1 break 
@@ -153,7 +153,11 @@ class TrigParser(Parser): ) p = TrigSinkParser(sink, baseURI=baseURI, turtle=True) - p.loadStream(source.getByteStream()) + stream = source.getCharacterStream() # try to get str stream first + if not stream: + # fallback to get the bytes stream + stream = source.getByteStream() + p.loadStream(stream) for prefix, namespace in p._bindings.items(): conj_graph.bind(prefix, namespace) diff --git a/test/test_nt_misc.py b/test/test_nt_misc.py index af7049d8..2d25e742 100644 --- a/test/test_nt_misc.py +++ b/test/test_nt_misc.py @@ -34,8 +34,34 @@ class NTTestCase(unittest.TestCase): s = g.serialize(format="nt").strip() self.assertEqual(s, '<foo> <foo> "test\\n"@en .'.encode("latin-1")) + def testIssue1144_rdflib(self): + fname = "test/nt/lists-02.nt" + g1 = Graph() + with open(fname, "r") as f: + g1.parse(f, format='nt') + self.assertEqual(14, len(g1)) + g2 = Graph() + with open(fname, "rb") as fb: + g2.parse(fb, format='nt') + self.assertEqual(14, len(g2)) + + + def testIssue1144_w3c(self): + fname = "test/nt/lists-02.nt" + sink1 = ntriples.NTGraphSink(Graph()) + p1 = ntriples.W3CNTriplesParser(sink1) + with open(fname, "r") as f: + p1.parse(f) + self.assertEqual(14, len(sink1.g)) + sink2 = ntriples.NTGraphSink(Graph()) + p2 = ntriples.W3CNTriplesParser(sink2) + with open(fname, "rb") as f: + p2.parse(f) + self.assertEqual(14, len(sink2.g)) + + def test_sink(self): - s = ntriples.Sink() + s = ntriples.DummySink() self.assertTrue(s.length == 0) s.triple(None, None, None) self.assertTrue(s.length == 1) @@ -77,26 +103,26 @@ class NTTestCase(unittest.TestCase): ntriples.validate = False self.assertEqual(res, uniquot) - def test_NTriplesParser_fpath(self): + def test_W3CNTriplesParser_fpath(self): fpath = "test/nt/" + os.listdir("test/nt")[0] - p = ntriples.NTriplesParser() + p = ntriples.W3CNTriplesParser() self.assertRaises(ntriples.ParseError, p.parse, fpath) - def test_NTriplesParser_parsestring(self): - p = ntriples.NTriplesParser() + def 
test_W3CNTriplesParser_parsestring(self): + p = ntriples.W3CNTriplesParser() data = 3 self.assertRaises(ntriples.ParseError, p.parsestring, data) fname = "test/nt/lists-02.nt" with open(fname, "r") as f: data = f.read() - p = ntriples.NTriplesParser() + p = ntriples.W3CNTriplesParser() res = p.parsestring(data) self.assertTrue(res == None) def test_w3_ntriple_variants(self): uri = "file:///" + os.getcwd() + "/test/nt/test.ntriples" - parser = ntriples.NTriplesParser() + parser = ntriples.W3CNTriplesParser() u = urlopen(uri) sink = parser.parse(u) u.close() @@ -107,14 +133,14 @@ class NTTestCase(unittest.TestCase): data = ( """<http://example.org/resource32> 3 <http://example.org/datatype1> .\n""" ) - p = ntriples.NTriplesParser() + p = ntriples.W3CNTriplesParser() self.assertRaises(ntriples.ParseError, p.parsestring, data) def test_cover_eat(self): data = ( """<http://example.org/resource32> 3 <http://example.org/datatype1> .\n""" ) - p = ntriples.NTriplesParser() + p = ntriples.W3CNTriplesParser() p.line = data self.assertRaises( ntriples.ParseError, p.eat, re.compile("<http://example.org/datatype1>") @@ -122,7 +148,7 @@ class NTTestCase(unittest.TestCase): def test_cover_subjectobjectliteral(self): # data = '''<http://example.org/resource32> 3 <http://example.org/datatype1> .\n''' - p = ntriples.NTriplesParser() + p = ntriples.W3CNTriplesParser() p.line = "baz" self.assertRaises(ntriples.ParseError, p.subject) self.assertRaises(ntriples.ParseError, p.object) @@ -134,12 +160,12 @@ class BNodeContextTestCase(unittest.TestCase): def test_bnode_shared_across_instances(self): my_sink = FakeSink() bnode_context = dict() - p = ntriples.NTriplesParser(my_sink, bnode_context=bnode_context) + p = ntriples.W3CNTriplesParser(my_sink, bnode_context=bnode_context) p.parsestring(''' _:0 <http://purl.obolibrary.org/obo/RO_0002350> <http://www.gbif.org/species/0000001> . 
''') - q = ntriples.NTriplesParser(my_sink, bnode_context=bnode_context) + q = ntriples.W3CNTriplesParser(my_sink, bnode_context=bnode_context) q.parsestring(''' _:0 <http://purl.obolibrary.org/obo/RO_0002350> <http://www.gbif.org/species/0000002> . ''') @@ -148,12 +174,12 @@ class BNodeContextTestCase(unittest.TestCase): def test_bnode_distinct_across_instances(self): my_sink = FakeSink() - p = ntriples.NTriplesParser(my_sink) + p = ntriples.W3CNTriplesParser(my_sink) p.parsestring(''' _:0 <http://purl.obolibrary.org/obo/RO_0002350> <http://www.gbif.org/species/0000001> . ''') - q = ntriples.NTriplesParser(my_sink) + q = ntriples.W3CNTriplesParser(my_sink) q.parsestring(''' _:0 <http://purl.obolibrary.org/obo/RO_0002350> <http://www.gbif.org/species/0000002> . ''') @@ -162,7 +188,7 @@ class BNodeContextTestCase(unittest.TestCase): def test_bnode_distinct_across_parse(self): my_sink = FakeSink() - p = ntriples.NTriplesParser(my_sink) + p = ntriples.W3CNTriplesParser(my_sink) p.parsestring(''' _:0 <http://purl.obolibrary.org/obo/RO_0002350> <http://www.gbif.org/species/0000001> . @@ -176,7 +202,7 @@ class BNodeContextTestCase(unittest.TestCase): def test_bnode_shared_across_parse(self): my_sink = FakeSink() - p = ntriples.NTriplesParser(my_sink) + p = ntriples.W3CNTriplesParser(my_sink) p.parsestring(''' _:0 <http://purl.obolibrary.org/obo/RO_0002350> <http://www.gbif.org/species/0000001> . @@ -192,12 +218,12 @@ class BNodeContextTestCase(unittest.TestCase): my_sink = FakeSink() bnode_ctx = dict() - p = ntriples.NTriplesParser(my_sink) + p = ntriples.W3CNTriplesParser(my_sink) p.parsestring(''' _:0 <http://purl.obolibrary.org/obo/RO_0002350> <http://www.gbif.org/species/0000001> . ''', bnode_context=bnode_ctx) - q = ntriples.NTriplesParser(my_sink) + q = ntriples.W3CNTriplesParser(my_sink) q.parsestring(''' _:0 <http://purl.obolibrary.org/obo/RO_0002350> <http://www.gbif.org/species/0000002> . ''', bnode_context=bnode_ctx) |
