summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Nicholas Car <nicholas.car@surroundaustralia.com> 2020-08-27 13:13:45 +1000
committer GitHub <noreply@github.com> 2020-08-27 13:13:45 +1000
commit 3afffcd19d3a5d240e83b3a59b53e3ee1120c165 (patch)
tree 42ba0191f0a8f645cbc5b60aefd8a3cbfc383a8b
parent 3e42f5eea742563cdeab7d655fe55f7d0e25ea16 (diff)
parent 94295389204175783c2f369c2826f0ba55a2d42c (diff)
download rdflib-improve_graph_parse.tar.gz
Merge branch 'master' into improve_graph_parse [improve_graph_parse]
-rw-r--r-- docs/plugin_parsers.rst 2
-rw-r--r-- docs/sphinx-requirements.txt 2
-rw-r--r-- rdflib/__init__.py 6
-rw-r--r-- rdflib/compare.py 2
-rw-r--r-- rdflib/graph.py 12
-rw-r--r-- rdflib/parser.py 104
-rw-r--r-- rdflib/plugin.py 10
-rwxr-xr-x rdflib/plugins/parsers/notation3.py 102
-rw-r--r-- rdflib/plugins/parsers/nquads.py 11
-rw-r--r-- rdflib/plugins/parsers/nt.py 33
-rw-r--r-- rdflib/plugins/parsers/ntriples.py 122
-rw-r--r-- rdflib/plugins/parsers/trig.py 12
-rw-r--r-- test/test_nt_misc.py 62
13 files changed, 294 insertions, 186 deletions
diff --git a/docs/plugin_parsers.rst b/docs/plugin_parsers.rst
index e114958d..81ab7ae6 100644
--- a/docs/plugin_parsers.rst
+++ b/docs/plugin_parsers.rst
@@ -26,7 +26,7 @@ mdata :class:`~rdflib.plugins.parsers.structureddata.MicrodataParser`
microdata :class:`~rdflib.plugins.parsers.structureddata.MicrodataParser`
n3 :class:`~rdflib.plugins.parsers.notation3.N3Parser`
nquads :class:`~rdflib.plugins.parsers.nquads.NQuadsParser`
-nt :class:`~rdflib.plugins.parsers.nt.NTParser`
+nt :class:`~rdflib.plugins.parsers.ntriples.NTParser`
rdfa :class:`~rdflib.plugins.parsers.structureddata.RDFaParser`
rdfa1.0 :class:`~rdflib.plugins.parsers.structureddata.RDFa10Parser`
rdfa1.1 :class:`~rdflib.plugins.parsers.structureddata.RDFaParser`
diff --git a/docs/sphinx-requirements.txt b/docs/sphinx-requirements.txt
index c6dfb112..175ef14e 100644
--- a/docs/sphinx-requirements.txt
+++ b/docs/sphinx-requirements.txt
@@ -1,3 +1,3 @@
-sphinx==3.2.0
+sphinx==3.2.1
sphinxcontrib-apidoc
git+https://github.com/gniezen/n3pygments.git
diff --git a/rdflib/__init__.py b/rdflib/__init__.py
index bce8204f..06b1c2eb 100644
--- a/rdflib/__init__.py
+++ b/rdflib/__init__.py
@@ -92,7 +92,11 @@ _interactive_mode = False
try:
import __main__
- if not hasattr(__main__, "__file__") and sys.stdout is not None and sys.stderr.isatty():
+ if (
+ not hasattr(__main__, "__file__")
+ and sys.stdout is not None
+ and sys.stderr.isatty()
+ ):
# show log messages in interactive mode
_interactive_mode = True
logger.setLevel(logging.INFO)
diff --git a/rdflib/compare.py b/rdflib/compare.py
index 839cfbb0..c1665b66 100644
--- a/rdflib/compare.py
+++ b/rdflib/compare.py
@@ -335,7 +335,7 @@ class _TripleCanonicalizer(object):
coloring.extend(colors)
try:
si = sequence.index(c)
- sequence = sequence[:si] + colors + sequence[si + 1:]
+ sequence = sequence[:si] + colors + sequence[si + 1 :]
except ValueError:
sequence = colors[1:] + sequence
combined_colors = []
diff --git a/rdflib/graph.py b/rdflib/graph.py
index 786f193c..49ebcda2 100644
--- a/rdflib/graph.py
+++ b/rdflib/graph.py
@@ -779,13 +779,17 @@ class Graph(Node):
# setup the language filtering
if lang is not None:
if lang == "": # we only want not language-tagged literals
+
def langfilter(l_):
return l_.language is None
+
else:
+
def langfilter(l_):
return l_.language == lang
else: # we don't care about language tags
+
def langfilter(l_):
return True
@@ -1079,9 +1083,11 @@ class Graph(Node):
format = source.content_type
could_not_guess_format = False
if format is None:
- if (hasattr(source, "file")
- and getattr(source.file, "name", None)
- and isinstance(source.file.name, str)):
+ if (
+ hasattr(source, "file")
+ and getattr(source.file, "name", None)
+ and isinstance(source.file.name, str)
+ ):
format = rdflib.util.guess_format(source.file.name)
if format is None:
format = "turtle"
diff --git a/rdflib/parser.py b/rdflib/parser.py
index 9e501c03..73ce2ba7 100644
--- a/rdflib/parser.py
+++ b/rdflib/parser.py
@@ -9,15 +9,11 @@ can plugin to rdflib. If you are wanting to invoke a parser you likely
want to do so through the Graph class parse method.
"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
+import codecs
import os
import sys
-from io import BytesIO
-
+from io import BytesIO, TextIOBase, TextIOWrapper, StringIO, BufferedIOBase
from urllib.request import pathname2url
from urllib.request import Request
@@ -41,6 +37,8 @@ __all__ = [
class Parser(object):
+ __slots__ = set()
+
def __init__(self):
pass
@@ -48,6 +46,37 @@ class Parser(object):
pass
+class BytesIOWrapper(BufferedIOBase):  # read-only bytes view over a str, encoded lazily
+    __slots__ = ("wrapped", "encoded", "encoding")
+
+    def __init__(self, wrapped: str, encoding="utf-8"):
+        super(BytesIOWrapper, self).__init__()
+        self.wrapped = wrapped  # the text to expose as bytes
+        self.encoding = encoding  # codec name used when encoding `wrapped`
+        self.encoded = None  # BytesIO over the encoded text; built on first read
+
+    def read(self, *args, **kwargs):
+        if self.encoded is None:
+            b, blen = codecs.getencoder(self.encoding)(self.wrapped)
+            self.encoded = BytesIO(b)
+        return self.encoded.read(*args, **kwargs)
+
+    def read1(self, *args, **kwargs):
+        if self.encoded is None:
+            b, blen = codecs.getencoder(self.encoding)(self.wrapped)  # encoder returns (bytes, length)
+            self.encoded = BytesIO(b)
+        return self.encoded.read1(*args, **kwargs)
+
+    def readinto(self, *args, **kwargs):
+        raise NotImplementedError()
+
+    def readinto1(self, *args, **kwargs):
+        raise NotImplementedError()
+
+    def write(self, *args, **kwargs):
+        raise NotImplementedError()
+
+
+
+
class InputSource(xmlreader.InputSource, object):
"""
TODO:
@@ -59,23 +88,39 @@ class InputSource(xmlreader.InputSource, object):
self.auto_close = False # see Graph.parse(), true if opened by us
def close(self):
+ c = self.getCharacterStream()
+ if c and hasattr(c, "close"):
+ try:
+ c.close()
+ except Exception:
+ pass
f = self.getByteStream()
if f and hasattr(f, "close"):
- f.close()
+ try:
+ f.close()
+ except Exception:
+ pass
class StringInputSource(InputSource):
"""
- TODO:
+ Constructs an RDFLib Parser InputSource from a Python String or Bytes
"""
- def __init__(self, value, system_id=None):
+ def __init__(self, value, encoding="utf-8", system_id=None):
super(StringInputSource, self).__init__(system_id)
- stream = BytesIO(value)
- self.setByteStream(stream)
- # TODO:
- # encoding = value.encoding
- # self.setEncoding(encoding)
+ if isinstance(value, str):
+ stream = StringIO(value)
+ self.setCharacterStream(stream)
+ self.setEncoding(encoding)
+ b_stream = BytesIOWrapper(value, encoding)
+ self.setByteStream(b_stream)
+ else:
+ stream = BytesIO(value)
+ self.setByteStream(stream)
+ c_stream = TextIOWrapper(stream, encoding)
+ self.setCharacterStream(c_stream)
+ self.setEncoding(c_stream.encoding)
headers = {
@@ -134,8 +179,18 @@ class FileInputSource(InputSource):
system_id = URIRef(urljoin("file:", pathname2url(file.name)), base=base)
super(FileInputSource, self).__init__(system_id)
self.file = file
- self.setByteStream(file)
- # TODO: self.setEncoding(encoding)
+ if isinstance(file, TextIOBase): # Python3 unicode fp
+ self.setCharacterStream(file)
+ self.setEncoding(file.encoding)
+ try:
+ b = file.buffer
+ self.setByteStream(b)
+ except (AttributeError, LookupError):
+ self.setByteStream(file)
+ else:
+ self.setByteStream(file)
+ # We cannot set characterStream here because
+ # we do not know the Raw Bytes File encoding.
def __repr__(self):
return repr(self.file)
@@ -171,10 +226,21 @@ def create_input_source(
else:
if isinstance(source, str):
location = source
+ elif isinstance(source, bytes):
+ data = source
elif hasattr(source, "read") and not isinstance(source, Namespace):
f = source
input_source = InputSource()
- input_source.setByteStream(f)
+ if hasattr(source, "encoding"):
+ input_source.setCharacterStream(source)
+ input_source.setEncoding(source.encoding)
+ try:
+ b = source.buffer  # binary stream under the text source; `file` is not the stream in this branch
+ input_source.setByteStream(b)
+ except (AttributeError, LookupError):
+ input_source.setByteStream(source)
+ else:
+ input_source.setByteStream(f)
if f is sys.stdin:
input_source.setSystemId("file:///dev/stdin")
elif hasattr(f, "name"):
@@ -206,8 +272,8 @@ def create_input_source(
input_source = FileInputSource(file)
if data is not None:
- if isinstance(data, str):
- data = data.encode("utf-8")
+ if not isinstance(data, (str, bytes, bytearray)):
+ raise RuntimeError("parse data can only be str, bytes or bytearray.")
input_source = StringInputSource(data)
auto_close = True
diff --git a/rdflib/plugin.py b/rdflib/plugin.py
index cc5b6d35..baa2fb5e 100644
--- a/rdflib/plugin.py
+++ b/rdflib/plugin.py
@@ -11,7 +11,7 @@ following to your setup::
entry_points = {
'rdf.plugins.parser': [
- 'nt = rdf.plugins.parsers.nt:NTParser',
+ 'nt = rdf.plugins.parsers.ntriples:NTParser',
],
'rdf.plugins.serializer': [
'nt = rdf.plugins.serializers.NTSerializer:NTSerializer',
@@ -185,10 +185,10 @@ register("n3", Parser, "rdflib.plugins.parsers.notation3", "N3Parser")
register("text/turtle", Parser, "rdflib.plugins.parsers.notation3", "TurtleParser")
register("turtle", Parser, "rdflib.plugins.parsers.notation3", "TurtleParser")
register("ttl", Parser, "rdflib.plugins.parsers.notation3", "TurtleParser")
-register("application/n-triples", Parser, "rdflib.plugins.parsers.nt", "NTParser")
-register("ntriples", Parser, "rdflib.plugins.parsers.nt", "NTParser")
-register("nt", Parser, "rdflib.plugins.parsers.nt", "NTParser")
-register("nt11", Parser, "rdflib.plugins.parsers.nt", "NTParser")
+register("application/n-triples", Parser, "rdflib.plugins.parsers.ntriples", "NTParser")
+register("ntriples", Parser, "rdflib.plugins.parsers.ntriples", "NTParser")
+register("nt", Parser, "rdflib.plugins.parsers.ntriples", "NTParser")
+register("nt11", Parser, "rdflib.plugins.parsers.ntriples", "NTParser")
register("application/n-quads", Parser, "rdflib.plugins.parsers.nquads", "NQuadsParser")
register("nquads", Parser, "rdflib.plugins.parsers.nquads", "NQuadsParser")
register("application/trix", Parser, "rdflib.plugins.parsers.trix", "TriXParser")
diff --git a/rdflib/plugins/parsers/notation3.py b/rdflib/plugins/parsers/notation3.py
index c427f153..d866977d 100755
--- a/rdflib/plugins/parsers/notation3.py
+++ b/rdflib/plugins/parsers/notation3.py
@@ -139,10 +139,13 @@ def join(here, there):
return here + frag
# join('mid:foo@example', '../foo') bzzt
- if here[bcolonl + 1: bcolonl + 2] != "/":
- raise ValueError("Base <%s> has no slash after " "colon - with relative '%s'." % (here, there))
+ if here[bcolonl + 1 : bcolonl + 2] != "/":
+ raise ValueError(
+ "Base <%s> has no slash after "
+ "colon - with relative '%s'." % (here, there)
+ )
- if here[bcolonl + 1: bcolonl + 3] == "//":
+ if here[bcolonl + 1 : bcolonl + 3] == "//":
bpath = here.find("/", bcolonl + 3)
else:
bpath = bcolonl + 1
@@ -502,14 +505,14 @@ class SinkParser:
"""
assert tok[0] not in _notNameChars # not for punctuation
- if argstr[i: i + 1] == "@":
+ if argstr[i : i + 1] == "@":
i = i + 1
else:
if tok not in self.keywords:
return -1 # No, this has neither keywords declaration nor "@"
if (
- argstr[i: i + len(tok)] == tok
+ argstr[i : i + len(tok)] == tok
and (argstr[i + len(tok)] in _notKeywordsChars)
or (colon and argstr[i + len(tok)] == ":")
):
@@ -526,7 +529,7 @@ class SinkParser:
assert tok[0] not in _notNameChars # not for punctuation
- if argstr[i: i + len(tok)].lower() == tok.lower() and (
+ if argstr[i : i + len(tok)].lower() == tok.lower() and (
argstr[i + len(tok)] in _notQNameChars
):
i = i + len(tok)
@@ -794,23 +797,23 @@ class SinkParser:
res.append(("->", RDF_type))
return j
- if argstr[i: i + 2] == "<=":
+ if argstr[i : i + 2] == "<=":
if self.turtle:
self.BadSyntax(argstr, i, "Found '<=' in Turtle mode. ")
res.append(("<-", self._store.newSymbol(Logic_NS + "implies")))
return i + 2
- if argstr[i: i + 1] == "=":
+ if argstr[i : i + 1] == "=":
if self.turtle:
self.BadSyntax(argstr, i, "Found '=' in Turtle mode")
- if argstr[i + 1: i + 2] == ">":
+ if argstr[i + 1 : i + 2] == ">":
res.append(("->", self._store.newSymbol(Logic_NS + "implies")))
return i + 2
res.append(("->", DAML_sameAs))
return i + 1
- if argstr[i: i + 2] == ":=":
+ if argstr[i : i + 2] == ":=":
if self.turtle:
self.BadSyntax(argstr, i, "Found ':=' in Turtle mode")
@@ -823,7 +826,7 @@ class SinkParser:
res.append(("->", r[0]))
return j
- if argstr[i: i + 2] == ">-" or argstr[i: i + 2] == "<-":
+ if argstr[i : i + 2] == ">-" or argstr[i : i + 2] == "<-":
self.BadSyntax(argstr, j, ">- ... -> syntax is obsolete.")
return -1
@@ -844,8 +847,8 @@ class SinkParser:
if j < 0:
return j # nope
- while argstr[j: j + 1] in "!^": # no spaces, must follow exactly (?)
- ch = argstr[j: j + 1]
+ while argstr[j : j + 1] in "!^": # no spaces, must follow exactly (?)
+ ch = argstr[j : j + 1]
subj = res.pop()
obj = self.blankNode(uri=self.here(j))
j = self.node(argstr, j + 1, res)
@@ -879,7 +882,7 @@ class SinkParser:
if j < 0:
return j # eof
i = j
- ch = argstr[i: i + 1] # Quick 1-character checks first:
+ ch = argstr[i : i + 1] # Quick 1-character checks first:
if ch == "[":
bnodeID = self.here(i)
@@ -887,7 +890,7 @@ class SinkParser:
if j < 0:
self.BadSyntax(argstr, i, "EOF after '['")
# Hack for "is" binding name to anon node
- if argstr[j: j + 1] == "=":
+ if argstr[j : j + 1] == "=":
if self.turtle:
self.BadSyntax(
argstr, j, "Found '[=' or '[ =' when in turtle mode."
@@ -905,7 +908,7 @@ class SinkParser:
self.BadSyntax(
argstr, i, "EOF when objectList expected after [ = "
)
- if argstr[j: j + 1] == ";":
+ if argstr[j : j + 1] == ";":
j = j + 1
else:
self.BadSyntax(argstr, i, "objectList expected after [= ")
@@ -922,7 +925,7 @@ class SinkParser:
self.BadSyntax(
argstr, i, "EOF when ']' expected after [ <propertyList>"
)
- if argstr[j: j + 1] != "]":
+ if argstr[j : j + 1] != "]":
self.BadSyntax(argstr, j, "']' expected")
res.append(subj)
return j + 1
@@ -931,7 +934,7 @@ class SinkParser:
# if self.turtle:
# self.BadSyntax(argstr, i,
# "found '{' while in Turtle mode, Formulas not supported!")
- ch2 = argstr[i + 1: i + 2]
+ ch2 = argstr[i + 1 : i + 2]
if ch2 == "$":
# a set
i += 1
@@ -942,12 +945,12 @@ class SinkParser:
i = self.skipSpace(argstr, j)
if i < 0:
self.BadSyntax(argstr, i, "needed '$}', found end.")
- if argstr[i: i + 2] == "$}":
+ if argstr[i : i + 2] == "$}":
j = i + 2
break
if not first_run:
- if argstr[i: i + 1] == ",":
+ if argstr[i : i + 1] == ",":
i += 1
else:
self.BadSyntax(argstr, i, "expected: ','")
@@ -982,7 +985,7 @@ class SinkParser:
if i < 0:
self.BadSyntax(argstr, i, "needed '}', found end.")
- if argstr[i: i + 1] == "}":
+ if argstr[i : i + 1] == "}":
j = i + 1
break
@@ -1001,7 +1004,7 @@ class SinkParser:
if ch == "(":
thing_type = self._store.newList
- ch2 = argstr[i + 1: i + 2]
+ ch2 = argstr[i + 1 : i + 2]
if ch2 == "$":
thing_type = self._store.newSet
i += 1
@@ -1012,7 +1015,7 @@ class SinkParser:
i = self.skipSpace(argstr, j)
if i < 0:
self.BadSyntax(argstr, i, "needed ')', found end.")
- if argstr[i: i + 1] == ")":
+ if argstr[i : i + 1] == ")":
j = i + 1
break
@@ -1065,7 +1068,7 @@ class SinkParser:
break
i = j + 1
- if argstr[j: j + 2] == ":-":
+ if argstr[j : j + 2] == ":-":
if self.turtle:
self.BadSyntax(argstr, j, "Found in ':-' in Turtle mode")
i = j + 2
@@ -1095,7 +1098,7 @@ class SinkParser:
j = self.skipSpace(argstr, i)
if j < 0:
self.BadSyntax(argstr, j, "EOF found in list of objects")
- if argstr[i: i + 1] != ";":
+ if argstr[i : i + 1] != ";":
return i
i = i + 1 # skip semicolon and continue
@@ -1116,7 +1119,7 @@ class SinkParser:
j = self.skipSpace(argstr, i)
if j < 0:
return j # eof
- ch = argstr[j: j + 1]
+ ch = argstr[j : j + 1]
if ch != ",":
if ch != ".":
return -1
@@ -1133,7 +1136,7 @@ class SinkParser:
j = self.skipSpace(argstr, i)
if j < 0:
self.BadSyntax(argstr, j, "EOF found after object")
- if argstr[j: j + 1] != ",":
+ if argstr[j : j + 1] != ",":
return j # Found something else!
i = self.object(argstr, j + 1, res)
if i < 0:
@@ -1143,11 +1146,11 @@ class SinkParser:
j = self.skipSpace(argstr, i)
if j < 0:
return j # eof
- if argstr[j: j + 1] == ".":
+ if argstr[j : j + 1] == ".":
return j + 1 # skip
- if argstr[j: j + 1] == "}":
+ if argstr[j : j + 1] == "}":
return j # don't skip it
- if argstr[j: j + 1] == "]":
+ if argstr[j : j + 1] == "]":
return j
self.BadSyntax(argstr, j, "expected '.' or '}' or ']' at end of statement")
@@ -1212,7 +1215,7 @@ class SinkParser:
assert (
":" in uref
), "With no base URI, cannot deal with relative URIs"
- if argstr[i - 1: i] == "#" and not uref[-1:] == "#":
+ if argstr[i - 1 : i] == "#" and not uref[-1:] == "#":
uref = uref + "#" # She meant it! Weirdness in urlparse?
symb = self._store.newSymbol(uref)
if symb in self._variables:
@@ -1261,7 +1264,7 @@ class SinkParser:
if j < 0:
return -1
- if argstr[j: j + 1] != "?":
+ if argstr[j : j + 1] != "?":
return -1
j = j + 1
i = j
@@ -1419,7 +1422,7 @@ class SinkParser:
i = j
if argstr[i] in self.string_delimiters:
- if argstr[i: i + 3] == argstr[i] * 3:
+ if argstr[i : i + 3] == argstr[i] * 3:
delim = argstr[i] * 3
else:
delim = argstr[i]
@@ -1467,7 +1470,7 @@ class SinkParser:
# return -1 ## or fall through?
if argstr[i] in self.string_delimiters:
- if argstr[i: i + 3] == argstr[i] * 3:
+ if argstr[i : i + 3] == argstr[i] * 3:
delim = argstr[i] * 3
else:
delim = argstr[i]
@@ -1476,7 +1479,7 @@ class SinkParser:
dt = None
j, s = self.strconst(argstr, i, delim)
lang = None
- if argstr[j: j + 1] == "@": # Language?
+ if argstr[j : j + 1] == "@": # Language?
m = langcode.match(argstr, j + 1)
if m is None:
raise BadSyntax(
@@ -1487,9 +1490,9 @@ class SinkParser:
"Bad language code syntax on string " + "literal, after @",
)
i = m.end()
- lang = argstr[j + 1: i]
+ lang = argstr[j + 1 : i]
j = i
- if argstr[j: j + 2] == "^^":
+ if argstr[j : j + 2] == "^^":
res2 = []
j = self.uri_ref2(argstr, j + 2, res2) # Read datatype URI
dt = res2[0]
@@ -1522,15 +1525,15 @@ class SinkParser:
if (
delim == delim3
): # done when delim is """ or ''' and, respectively ...
- if argstr[j: j + 5] == delim5: # ... we have "" or '' before
+ if argstr[j : j + 5] == delim5: # ... we have "" or '' before
i = j + 5
ustr = ustr + delim2
return i, ustr
- if argstr[j: j + 4] == delim4: # ... we have " or ' before
+ if argstr[j : j + 4] == delim4: # ... we have " or ' before
i = j + 4
ustr = ustr + delim1
return i, ustr
- if argstr[j: j + 3] == delim3: # current " or ' is part of delim
+ if argstr[j : j + 3] == delim3: # current " or ' is part of delim
i = j + 3
return i, ustr
@@ -1542,8 +1545,8 @@ class SinkParser:
m = interesting.search(argstr, j) # was argstr[j:].
# Note for pos param to work, MUST be compiled ... re bug?
assert m, "Quote expected in string at ^ in %s^%s" % (
- argstr[j - 20: j],
- argstr[j: j + 20],
+ argstr[j - 20 : j],
+ argstr[j : j + 20],
) # at least need a quote
i = m.start()
@@ -1589,7 +1592,7 @@ class SinkParser:
elif ch == "\\":
j = i + 1
- ch = argstr[j: j + 1] # Will be empty if string ends
+ ch = argstr[j : j + 1] # Will be empty if string ends
if not ch:
raise BadSyntax(
self._thisDoc,
@@ -1620,14 +1623,14 @@ class SinkParser:
self._thisDoc, startline, argstr, i, "unterminated string literal(3)"
)
try:
- return i + n, reg.sub(unicodeExpand, "\\" + prefix + argstr[i: i + n])
+ return i + n, reg.sub(unicodeExpand, "\\" + prefix + argstr[i : i + n])
except:
raise BadSyntax(
self._thisDoc,
startline,
argstr,
i,
- "bad string literal hex escape: " + argstr[i: i + n],
+ "bad string literal hex escape: " + argstr[i : i + n],
)
def uEscape(self, argstr, i, startline):
@@ -1672,7 +1675,7 @@ class BadSyntax(SyntaxError):
self._why,
pre,
argstr[st:i],
- argstr[i: i + 60],
+ argstr[i : i + 60],
post,
)
@@ -1896,8 +1899,11 @@ class TurtleParser(Parser):
baseURI = graph.absolutize(source.getPublicId() or source.getSystemId() or "")
p = SinkParser(sink, baseURI=baseURI, turtle=turtle)
-
- p.loadStream(source.getByteStream())
+ # N3 parser prefers str stream
+ stream = source.getCharacterStream()
+ if not stream:
+ stream = source.getByteStream()
+ p.loadStream(stream)
for prefix, namespace in p._bindings.items():
graph.bind(prefix, namespace)
diff --git a/rdflib/plugins/parsers/nquads.py b/rdflib/plugins/parsers/nquads.py
index a3bfbc6e..2a3a9136 100644
--- a/rdflib/plugins/parsers/nquads.py
+++ b/rdflib/plugins/parsers/nquads.py
@@ -31,7 +31,7 @@ from codecs import getreader
from rdflib import ConjunctiveGraph
# Build up from the NTriples parser:
-from rdflib.plugins.parsers.ntriples import NTriplesParser
+from rdflib.plugins.parsers.ntriples import W3CNTriplesParser
from rdflib.plugins.parsers.ntriples import ParseError
from rdflib.plugins.parsers.ntriples import r_tail
from rdflib.plugins.parsers.ntriples import r_wspace
@@ -39,7 +39,7 @@ from rdflib.plugins.parsers.ntriples import r_wspace
__all__ = ["NQuadsParser"]
-class NQuadsParser(NTriplesParser):
+class NQuadsParser(W3CNTriplesParser):
def parse(self, inputsource, sink, bnode_context=None, **kwargs):
"""
Parse inputsource as an N-Quads file.
@@ -57,13 +57,14 @@ class NQuadsParser(NTriplesParser):
)
self.sink = ConjunctiveGraph(store=sink.store, identifier=sink.identifier)
- source = inputsource.getByteStream()
+ source = inputsource.getCharacterStream()
+ if not source:
+ source = inputsource.getByteStream()
+ source = getreader("utf-8")(source)
if not hasattr(source, "read"):
raise ParseError("Item to parse must be a file-like object.")
- source = getreader("utf-8")(source)
-
self.file = source
self.buffer = ""
while True:
diff --git a/rdflib/plugins/parsers/nt.py b/rdflib/plugins/parsers/nt.py
deleted file mode 100644
index c37a1aa0..00000000
--- a/rdflib/plugins/parsers/nt.py
+++ /dev/null
@@ -1,33 +0,0 @@
-from rdflib.parser import Parser
-from rdflib.plugins.parsers.ntriples import NTriplesParser
-
-__all__ = ["NTSink", "NTParser"]
-
-
-class NTSink(object):
- def __init__(self, graph):
- self.graph = graph
-
- def triple(self, s, p, o):
- self.graph.add((s, p, o))
-
-
-class NTParser(Parser):
- """parser for the ntriples format, often stored with the .nt extension
-
- See http://www.w3.org/TR/rdf-testcases/#ntriples"""
-
- def parse(self, source, sink, **kwargs):
- '''
- Parse the NT format
-
- :type source: `rdflib.parser.InputSource`
- :param source: the source of NT-formatted data
- :type sink: `rdflib.graph.Graph`
- :param sink: where to send parsed triples
- :param kwargs: Additional arguments to pass to `.NTriplesParser.parse`
- '''
- f = source.getByteStream() # TODO getCharacterStream?
- parser = NTriplesParser(NTSink(sink))
- parser.parse(f, **kwargs)
- f.close()
diff --git a/rdflib/plugins/parsers/ntriples.py b/rdflib/plugins/parsers/ntriples.py
index 33a4a4e6..d43a240c 100644
--- a/rdflib/plugins/parsers/ntriples.py
+++ b/rdflib/plugins/parsers/ntriples.py
@@ -1,9 +1,6 @@
-#!/usr/bin/env python
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
+#!/usr/bin/env python3
-__doc__ = """
+__doc__ = """\
N-Triples Parser
License: GPL 2, W3C, BSD, or MIT
Author: Sean B. Palmer, inamidst.com
@@ -15,14 +12,13 @@ import codecs
from rdflib.term import URIRef as URI
from rdflib.term import BNode as bNode
from rdflib.term import Literal
-
-
-from rdflib.compat import cast_bytes
from rdflib.compat import decodeUnicodeEscape
+from rdflib.exceptions import ParserError as ParseError
+from rdflib.parser import Parser
-from io import BytesIO
+from io import StringIO, TextIOBase, BytesIO
-__all__ = ["unquote", "uriquote", "Sink", "NTriplesParser"]
+__all__ = ["unquote", "uriquote", "W3CNTriplesParser", "NTGraphSink", "NTParser"]
uriref = r'<([^:]+:[^\s"<>]*)>'
literal = r'"([^"\\]*(?:\\.[^"\\]*)*)"'
@@ -40,15 +36,7 @@ bufsiz = 2048
validate = False
-class Node(str):
- pass
-
-
-class ParseError(Exception):
- pass
-
-
-class Sink(object):
+class DummySink(object):
def __init__(self):
self.length = 0
@@ -78,7 +66,7 @@ def unquote(s):
while s:
m = r_safe.match(s)
if m:
- s = s[m.end():]
+ s = s[m.end() :]
result.append(m.group(1))
continue
@@ -90,7 +78,7 @@ def unquote(s):
m = r_uniquot.match(s)
if m:
- s = s[m.end():]
+ s = s[m.end() :]
u, U = m.groups()
codepoint = int(u or U, 16)
if codepoint > 0x10FFFF:
@@ -113,11 +101,10 @@ def uriquote(uri):
return r_hibyte.sub(lambda m: "%%%02X" % ord(m.group(1)), uri)
-class NTriplesParser(object):
+class W3CNTriplesParser(object):
"""An N-Triples Parser.
-
+ This is a legacy-style Triples parser for NTriples provided by W3C
Usage::
-
p = NTriplesParser(sink=MySink())
sink = p.parse(f) # file; use parsestring for a string
@@ -127,6 +114,8 @@ class NTriplesParser(object):
`NTriplesParser`.
"""
+ __slots__ = ("_bnode_ids", "sink", "buffer", "file", "line")
+
def __init__(self, sink=None, bnode_context=None):
if bnode_context is not None:
self._bnode_ids = bnode_context
@@ -136,7 +125,11 @@ class NTriplesParser(object):
if sink is not None:
self.sink = sink
else:
- self.sink = Sink()
+ self.sink = DummySink()
+
+ self.buffer = None
+ self.file = None
+ self.line = ""
def parse(self, f, bnode_context=None):
"""
@@ -150,10 +143,13 @@ class NTriplesParser(object):
passed in to define a distinct context for a given call to
`parse`.
"""
+
if not hasattr(f, "read"):
raise ParseError("Item to parse must be a file-like object.")
- # since N-Triples 1.1 files can and should be utf-8 encoded
- f = codecs.getreader("utf-8")(f)
+
+ if not hasattr(f, "encoding") and not hasattr(f, "charbuffer"):
+ # someone still using a bytestream here?
+ f = codecs.getreader("utf-8")(f)
self.file = f
self.buffer = ""
@@ -164,16 +160,17 @@ class NTriplesParser(object):
try:
self.parseline(bnode_context=bnode_context)
except ParseError:
- raise ParseError("Invalid line: %r" % self.line)
+ raise ParseError("Invalid line: {}".format(self.line))
return self.sink
def parsestring(self, s, **kwargs):
"""Parse s as an N-Triples string."""
- if not isinstance(s, str):
+ if not isinstance(s, (str, bytes, bytearray)):
raise ParseError("Item to parse must be a string instance.")
- f = BytesIO()
- f.write(cast_bytes(s))
- f.seek(0)
+ if isinstance(s, (bytes, bytearray)):
+ f = codecs.getreader("utf-8")(BytesIO(s))
+ else:
+ f = StringIO(s)
self.parse(f, **kwargs)
def readline(self):
@@ -189,7 +186,7 @@ class NTriplesParser(object):
while True:
m = r_line.match(self.buffer)
if m: # the more likely prospect
- self.buffer = self.buffer[m.end():]
+ self.buffer = self.buffer[m.end() :]
return m.group(1)
else:
buffer = self.file.read(bufsiz)
@@ -211,12 +208,12 @@ class NTriplesParser(object):
predicate = self.predicate()
self.eat(r_wspaces)
- object = self.object(bnode_context)
+ object_ = self.object(bnode_context)
self.eat(r_tail)
if self.line:
- raise ParseError("Trailing garbage")
- self.sink.triple(subject, predicate, object)
+ raise ParseError("Trailing garbage: {}".format(self.line))
+ self.sink.triple(subject, predicate, object_)
def peek(self, token):
return self.line.startswith(token)
@@ -227,7 +224,7 @@ class NTriplesParser(object):
# print(dir(pattern))
# print repr(self.line), type(self.line)
raise ParseError("Failed to eat %s at %s" % (pattern.pattern, self.line))
- self.line = self.line[m.end():]
+ self.line = self.line[m.end() :]
return m
def subject(self, bnode_context=None):
@@ -295,13 +292,44 @@ class NTriplesParser(object):
return False
-# # Obsolete, unused
-# def parseURI(uri):
-# import urllib
-# parser = NTriplesParser()
-# u = urllib.urlopen(uri)
-# sink = parser.parse(u)
-# u.close()
-# # for triple in sink:
-# # print triple
-# print 'Length of input:', sink.length
+class NTGraphSink(object):
+ __slots__ = ("g",)
+
+ def __init__(self, graph):
+ self.g = graph
+
+ def triple(self, s, p, o):
+ self.g.add((s, p, o))
+
+
+class NTParser(Parser):
+ """Parser for the N-Triples format, often stored with the .nt extension
+
+ See http://www.w3.org/TR/rdf-testcases/#ntriples"""
+
+ __slots__ = set()
+
+ @classmethod
+ def parse(cls, source, sink, **kwargs):
+ """
+ Parse the NT format
+
+ :type source: `rdflib.parser.InputSource`
+ :param source: the source of NT-formatted data
+ :type sink: `rdflib.graph.Graph`
+ :param sink: where to send parsed triples
+ :param kwargs: Additional arguments to pass to `.W3CNTriplesParser.parse`
+ """
+ f = source.getCharacterStream()
+ if not f:
+ b = source.getByteStream()
+ # TextIOBase includes: StringIO and TextIOWrapper
+ if isinstance(b, TextIOBase):
+ # b is not really a ByteStream, but a CharacterStream
+ f = b
+ else:
+ # since N-Triples 1.1 files can and should be utf-8 encoded
+ f = codecs.getreader("utf-8")(b)
+ parser = W3CNTriplesParser(NTGraphSink(sink))
+ parser.parse(f, **kwargs)
+ f.close()
diff --git a/rdflib/plugins/parsers/trig.py b/rdflib/plugins/parsers/trig.py
index 8f270de0..938fb259 100644
--- a/rdflib/plugins/parsers/trig.py
+++ b/rdflib/plugins/parsers/trig.py
@@ -82,7 +82,7 @@ class TrigSinkParser(SinkParser):
if j < 0:
self.BadSyntax(argstr, i, "EOF found when expected graph")
- if argstr[j: j + 1] == "=": # optional = for legacy support
+ if argstr[j : j + 1] == "=": # optional = for legacy support
i = self.skipSpace(argstr, j + 1)
if i < 0:
@@ -90,7 +90,7 @@ class TrigSinkParser(SinkParser):
else:
i = j
- if argstr[i: i + 1] != "{":
+ if argstr[i : i + 1] != "{":
return -1 # the node wasn't part of a graph
j = i + 1
@@ -106,7 +106,7 @@ class TrigSinkParser(SinkParser):
if i < 0:
self.BadSyntax(argstr, i, "needed '}', found end.")
- if argstr[i: i + 1] == "}":
+ if argstr[i : i + 1] == "}":
j = i + 1
break
@@ -153,7 +153,11 @@ class TrigParser(Parser):
)
p = TrigSinkParser(sink, baseURI=baseURI, turtle=True)
- p.loadStream(source.getByteStream())
+ stream = source.getCharacterStream() # try to get str stream first
+ if not stream:
+ # fallback to get the bytes stream
+ stream = source.getByteStream()
+ p.loadStream(stream)
for prefix, namespace in p._bindings.items():
conj_graph.bind(prefix, namespace)
diff --git a/test/test_nt_misc.py b/test/test_nt_misc.py
index af7049d8..2d25e742 100644
--- a/test/test_nt_misc.py
+++ b/test/test_nt_misc.py
@@ -34,8 +34,34 @@ class NTTestCase(unittest.TestCase):
s = g.serialize(format="nt").strip()
self.assertEqual(s, '<foo> <foo> "test\\n"@en .'.encode("latin-1"))
+ def testIssue1144_rdflib(self):
+ fname = "test/nt/lists-02.nt"
+ g1 = Graph()
+ with open(fname, "r") as f:
+ g1.parse(f, format='nt')
+ self.assertEqual(14, len(g1))
+ g2 = Graph()
+ with open(fname, "rb") as fb:
+ g2.parse(fb, format='nt')
+ self.assertEqual(14, len(g2))
+
+
+ def testIssue1144_w3c(self):
+ fname = "test/nt/lists-02.nt"
+ sink1 = ntriples.NTGraphSink(Graph())
+ p1 = ntriples.W3CNTriplesParser(sink1)
+ with open(fname, "r") as f:
+ p1.parse(f)
+ self.assertEqual(14, len(sink1.g))
+ sink2 = ntriples.NTGraphSink(Graph())
+ p2 = ntriples.W3CNTriplesParser(sink2)
+ with open(fname, "rb") as f:
+ p2.parse(f)
+ self.assertEqual(14, len(sink2.g))
+
+
def test_sink(self):
- s = ntriples.Sink()
+ s = ntriples.DummySink()
self.assertTrue(s.length == 0)
s.triple(None, None, None)
self.assertTrue(s.length == 1)
@@ -77,26 +103,26 @@ class NTTestCase(unittest.TestCase):
ntriples.validate = False
self.assertEqual(res, uniquot)
- def test_NTriplesParser_fpath(self):
+ def test_W3CNTriplesParser_fpath(self):
fpath = "test/nt/" + os.listdir("test/nt")[0]
- p = ntriples.NTriplesParser()
+ p = ntriples.W3CNTriplesParser()
self.assertRaises(ntriples.ParseError, p.parse, fpath)
- def test_NTriplesParser_parsestring(self):
- p = ntriples.NTriplesParser()
+ def test_W3CNTriplesParser_parsestring(self):
+ p = ntriples.W3CNTriplesParser()
data = 3
self.assertRaises(ntriples.ParseError, p.parsestring, data)
fname = "test/nt/lists-02.nt"
with open(fname, "r") as f:
data = f.read()
- p = ntriples.NTriplesParser()
+ p = ntriples.W3CNTriplesParser()
res = p.parsestring(data)
self.assertTrue(res == None)
def test_w3_ntriple_variants(self):
uri = "file:///" + os.getcwd() + "/test/nt/test.ntriples"
- parser = ntriples.NTriplesParser()
+ parser = ntriples.W3CNTriplesParser()
u = urlopen(uri)
sink = parser.parse(u)
u.close()
@@ -107,14 +133,14 @@ class NTTestCase(unittest.TestCase):
data = (
"""<http://example.org/resource32> 3 <http://example.org/datatype1> .\n"""
)
- p = ntriples.NTriplesParser()
+ p = ntriples.W3CNTriplesParser()
self.assertRaises(ntriples.ParseError, p.parsestring, data)
def test_cover_eat(self):
data = (
"""<http://example.org/resource32> 3 <http://example.org/datatype1> .\n"""
)
- p = ntriples.NTriplesParser()
+ p = ntriples.W3CNTriplesParser()
p.line = data
self.assertRaises(
ntriples.ParseError, p.eat, re.compile("<http://example.org/datatype1>")
@@ -122,7 +148,7 @@ class NTTestCase(unittest.TestCase):
def test_cover_subjectobjectliteral(self):
# data = '''<http://example.org/resource32> 3 <http://example.org/datatype1> .\n'''
- p = ntriples.NTriplesParser()
+ p = ntriples.W3CNTriplesParser()
p.line = "baz"
self.assertRaises(ntriples.ParseError, p.subject)
self.assertRaises(ntriples.ParseError, p.object)
@@ -134,12 +160,12 @@ class BNodeContextTestCase(unittest.TestCase):
def test_bnode_shared_across_instances(self):
my_sink = FakeSink()
bnode_context = dict()
- p = ntriples.NTriplesParser(my_sink, bnode_context=bnode_context)
+ p = ntriples.W3CNTriplesParser(my_sink, bnode_context=bnode_context)
p.parsestring('''
_:0 <http://purl.obolibrary.org/obo/RO_0002350> <http://www.gbif.org/species/0000001> .
''')
- q = ntriples.NTriplesParser(my_sink, bnode_context=bnode_context)
+ q = ntriples.W3CNTriplesParser(my_sink, bnode_context=bnode_context)
q.parsestring('''
_:0 <http://purl.obolibrary.org/obo/RO_0002350> <http://www.gbif.org/species/0000002> .
''')
@@ -148,12 +174,12 @@ class BNodeContextTestCase(unittest.TestCase):
def test_bnode_distinct_across_instances(self):
my_sink = FakeSink()
- p = ntriples.NTriplesParser(my_sink)
+ p = ntriples.W3CNTriplesParser(my_sink)
p.parsestring('''
_:0 <http://purl.obolibrary.org/obo/RO_0002350> <http://www.gbif.org/species/0000001> .
''')
- q = ntriples.NTriplesParser(my_sink)
+ q = ntriples.W3CNTriplesParser(my_sink)
q.parsestring('''
_:0 <http://purl.obolibrary.org/obo/RO_0002350> <http://www.gbif.org/species/0000002> .
''')
@@ -162,7 +188,7 @@ class BNodeContextTestCase(unittest.TestCase):
def test_bnode_distinct_across_parse(self):
my_sink = FakeSink()
- p = ntriples.NTriplesParser(my_sink)
+ p = ntriples.W3CNTriplesParser(my_sink)
p.parsestring('''
_:0 <http://purl.obolibrary.org/obo/RO_0002350> <http://www.gbif.org/species/0000001> .
@@ -176,7 +202,7 @@ class BNodeContextTestCase(unittest.TestCase):
def test_bnode_shared_across_parse(self):
my_sink = FakeSink()
- p = ntriples.NTriplesParser(my_sink)
+ p = ntriples.W3CNTriplesParser(my_sink)
p.parsestring('''
_:0 <http://purl.obolibrary.org/obo/RO_0002350> <http://www.gbif.org/species/0000001> .
@@ -192,12 +218,12 @@ class BNodeContextTestCase(unittest.TestCase):
my_sink = FakeSink()
bnode_ctx = dict()
- p = ntriples.NTriplesParser(my_sink)
+ p = ntriples.W3CNTriplesParser(my_sink)
p.parsestring('''
_:0 <http://purl.obolibrary.org/obo/RO_0002350> <http://www.gbif.org/species/0000001> .
''', bnode_context=bnode_ctx)
- q = ntriples.NTriplesParser(my_sink)
+ q = ntriples.W3CNTriplesParser(my_sink)
q.parsestring('''
_:0 <http://purl.obolibrary.org/obo/RO_0002350> <http://www.gbif.org/species/0000002> .
''', bnode_context=bnode_ctx)