Merge branch 'master' into improve_graph_parse (branch: improve_graph_parse)

author: Nicholas Car <nicholas.car@surroundaustralia.com> 2020-08-27 13:13:45 +1000
committer: GitHub <noreply@github.com> 2020-08-27 13:13:45 +1000
commit: 3afffcd19d3a5d240e83b3a59b53e3ee1120c165 (patch)
tree: 42ba0191f0a8f645cbc5b60aefd8a3cbfc383a8b
parent: 3e42f5eea742563cdeab7d655fe55f7d0e25ea16 (diff)
parent: 94295389204175783c2f369c2826f0ba55a2d42c (diff)
download: rdflib-improve_graph_parse.tar.gz
13 files changed, 294 insertions, 186 deletions
diff --git a/docs/plugin_parsers.rst b/docs/plugin_parsers.rst
index e114958d..81ab7ae6 100644
--- a/docs/plugin_parsers.rst
+++ b/docs/plugin_parsers.rst
@@ -26,7 +26,7 @@ mdata     :class:`~rdflib.plugins.parsers.structureddata.MicrodataParser`
 microdata :class:`~rdflib.plugins.parsers.structureddata.MicrodataParser`
 n3        :class:`~rdflib.plugins.parsers.notation3.N3Parser`
 nquads    :class:`~rdflib.plugins.parsers.nquads.NQuadsParser`
-nt        :class:`~rdflib.plugins.parsers.nt.NTParser`
+nt        :class:`~rdflib.plugins.parsers.ntriples.NTParser`
 rdfa      :class:`~rdflib.plugins.parsers.structureddata.RDFaParser`
 rdfa1.0   :class:`~rdflib.plugins.parsers.structureddata.RDFa10Parser`
 rdfa1.1   :class:`~rdflib.plugins.parsers.structureddata.RDFaParser`
diff --git a/docs/sphinx-requirements.txt b/docs/sphinx-requirements.txt
index c6dfb112..175ef14e 100644
--- a/docs/sphinx-requirements.txt
+++ b/docs/sphinx-requirements.txt
@@ -1,3 +1,3 @@
-sphinx==3.2.0
+sphinx==3.2.1
 sphinxcontrib-apidoc
 git+https://github.com/gniezen/n3pygments.git
diff --git a/rdflib/__init__.py b/rdflib/__init__.py
index bce8204f..06b1c2eb 100644
--- a/rdflib/__init__.py
+++ b/rdflib/__init__.py
@@ -92,7 +92,11 @@ _interactive_mode = False
 try:
     import __main__
 
-    if not hasattr(__main__, "__file__") and sys.stdout is not None and sys.stderr.isatty():
+    if (
+        not hasattr(__main__, "__file__")
+        and sys.stdout is not None
+        and sys.stderr.isatty()
+    ):
         # show log messages in interactive mode
         _interactive_mode = True
         logger.setLevel(logging.INFO)
diff --git a/rdflib/compare.py b/rdflib/compare.py
index 839cfbb0..c1665b66 100644
--- a/rdflib/compare.py
+++ b/rdflib/compare.py
@@ -335,7 +335,7 @@ class _TripleCanonicalizer(object):
                     coloring.extend(colors)
                     try:
                         si = sequence.index(c)
-                        sequence = sequence[:si] + colors + sequence[si + 1:]
+                        sequence = sequence[:si] + colors + sequence[si + 1 :]
                     except ValueError:
                         sequence = colors[1:] + sequence
         combined_colors = []
diff --git a/rdflib/graph.py b/rdflib/graph.py
index 786f193c..49ebcda2 100644
--- a/rdflib/graph.py
+++ b/rdflib/graph.py
@@ -779,13 +779,17 @@ class Graph(Node):
         # setup the language filtering
         if lang is not None:
             if lang == "":  # we only want not language-tagged literals
+
                 def langfilter(l_):
                     return l_.language is None
+
             else:
+
                 def langfilter(l_):
                     return l_.language == lang
 
         else:  # we don't care about language tags
+
             def langfilter(l_):
                 return True
 
@@ -1079,9 +1083,11 @@ class Graph(Node):
             format = source.content_type
         could_not_guess_format = False
         if format is None:
-            if (hasattr(source, "file")
-                    and getattr(source.file, "name", None)
-                    and isinstance(source.file.name, str)):
+            if (
+                hasattr(source, "file")
+                and getattr(source.file, "name", None)
+                and isinstance(source.file.name, str)
+            ):
                 format = rdflib.util.guess_format(source.file.name)
             if format is None:
                 format = "turtle"
diff --git a/rdflib/parser.py b/rdflib/parser.py
index 9e501c03..73ce2ba7 100644
--- a/rdflib/parser.py
+++ b/rdflib/parser.py
@@ -9,15 +9,11 @@ can plugin to rdflib. If you are wanting to invoke a parser you likely
 want to do so through the Graph class parse method.
 
 """
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
+import codecs
 import os
 import sys
 
-from io import BytesIO
-
+from io import BytesIO, TextIOBase, TextIOWrapper, StringIO, BufferedIOBase
 
 from urllib.request import pathname2url
 from urllib.request import Request
@@ -41,6 +37,8 @@ __all__ = [
 
 
 class Parser(object):
+    __slots__ = set()
+
     def __init__(self):
         pass
 
@@ -48,6 +46,37 @@ class Parser(object):
         pass
 
 
+class BytesIOWrapper(BufferedIOBase):  # read-only byte view of a str, encoded lazily
+    __slots__ = ("wrapped", "encoded", "encoding")
+
+    def __init__(self, wrapped: str, encoding="utf-8"):
+        super(BytesIOWrapper, self).__init__()
+        self.wrapped = wrapped
+        self.encoding = encoding
+        self.encoded = None  # BytesIO over the encoded text; built on first read
+
+    def read(self, *args, **kwargs):
+        if self.encoded is None:
+            b, blen = codecs.getencoder(self.encoding)(self.wrapped)
+            self.encoded = BytesIO(b)
+        return self.encoded.read(*args, **kwargs)
+
+    def read1(self, *args, **kwargs):
+        if self.encoded is None:
+            b, blen = codecs.getencoder(self.encoding)(self.wrapped)  # (bytes, consumed)
+            self.encoded = BytesIO(b)
+        return self.encoded.read1(*args, **kwargs)
+
+    def readinto(self, *args, **kwargs):
+        raise NotImplementedError()
+
+    def readinto1(self, *args, **kwargs):
+        raise NotImplementedError()
+
+    def write(self, *args, **kwargs):
+        raise NotImplementedError()
+
+
 class InputSource(xmlreader.InputSource, object):
     """
     TODO:
@@ -59,23 +88,39 @@ class InputSource(xmlreader.InputSource, object):
         self.auto_close = False  # see Graph.parse(), true if opened by us
 
     def close(self):
+        c = self.getCharacterStream()
+        if c and hasattr(c, "close"):
+            try:
+                c.close()
+            except Exception:
+                pass
         f = self.getByteStream()
         if f and hasattr(f, "close"):
-            f.close()
+            try:
+                f.close()
+            except Exception:
+                pass
 
 
 class StringInputSource(InputSource):
     """
-    TODO:
+    Constructs an RDFLib Parser InputSource from a Python String or Bytes
     """
 
-    def __init__(self, value, system_id=None):
+    def __init__(self, value, encoding="utf-8", system_id=None):
         super(StringInputSource, self).__init__(system_id)
-        stream = BytesIO(value)
-        self.setByteStream(stream)
-        # TODO:
-        #   encoding = value.encoding
-        #   self.setEncoding(encoding)
+        if isinstance(value, str):
+            stream = StringIO(value)
+            self.setCharacterStream(stream)
+            self.setEncoding(encoding)
+            b_stream = BytesIOWrapper(value, encoding)
+            self.setByteStream(b_stream)
+        else:
+            stream = BytesIO(value)
+            self.setByteStream(stream)
+            c_stream = TextIOWrapper(stream, encoding)
+            self.setCharacterStream(c_stream)
+            self.setEncoding(c_stream.encoding)
 
 
 headers = {
@@ -134,8 +179,18 @@ class FileInputSource(InputSource):
         system_id = URIRef(urljoin("file:", pathname2url(file.name)), base=base)
         super(FileInputSource, self).__init__(system_id)
         self.file = file
-        self.setByteStream(file)
-        # TODO: self.setEncoding(encoding)
+        if isinstance(file, TextIOBase):  # Python3 unicode fp
+            self.setCharacterStream(file)
+            self.setEncoding(file.encoding)
+            try:
+                b = file.buffer
+                self.setByteStream(b)
+            except (AttributeError, LookupError):
+                self.setByteStream(file)
+        else:
+            self.setByteStream(file)
+            # We cannot set characterStream here because
+            # we do not know the Raw Bytes File encoding.
 
     def __repr__(self):
         return repr(self.file)
@@ -171,10 +226,21 @@ def create_input_source(
         else:
             if isinstance(source, str):
                 location = source
+            elif isinstance(source, bytes):
+                data = source
             elif hasattr(source, "read") and not isinstance(source, Namespace):
                 f = source
                 input_source = InputSource()
-                input_source.setByteStream(f)
+                if hasattr(source, "encoding"):
+                    input_source.setCharacterStream(source)
+                    input_source.setEncoding(source.encoding)
+                    try:
+                        b = file.buffer
+                        input_source.setByteStream(b)
+                    except (AttributeError, LookupError):
+                        input_source.setByteStream(source)
+                else:
+                    input_source.setByteStream(f)
                 if f is sys.stdin:
                     input_source.setSystemId("file:///dev/stdin")
                 elif hasattr(f, "name"):
@@ -206,8 +272,8 @@ def create_input_source(
         input_source = FileInputSource(file)
 
     if data is not None:
-        if isinstance(data, str):
-            data = data.encode("utf-8")
+        if not isinstance(data, (str, bytes, bytearray)):
+            raise RuntimeError("parse data can only str, or bytes.")
         input_source = StringInputSource(data)
         auto_close = True
 
diff --git a/rdflib/plugin.py b/rdflib/plugin.py
index cc5b6d35..baa2fb5e 100644
--- a/rdflib/plugin.py
+++ b/rdflib/plugin.py
@@ -11,7 +11,7 @@ following to your setup::
 
     entry_points = {
         'rdf.plugins.parser': [
-            'nt =     rdf.plugins.parsers.nt:NTParser',
+            'nt =     rdf.plugins.parsers.ntriples:NTParser',
             ],
         'rdf.plugins.serializer': [
             'nt =     rdf.plugins.serializers.NTSerializer:NTSerializer',
@@ -185,10 +185,10 @@ register("n3", Parser, "rdflib.plugins.parsers.notation3", "N3Parser")
 register("text/turtle", Parser, "rdflib.plugins.parsers.notation3", "TurtleParser")
 register("turtle", Parser, "rdflib.plugins.parsers.notation3", "TurtleParser")
 register("ttl", Parser, "rdflib.plugins.parsers.notation3", "TurtleParser")
-register("application/n-triples", Parser, "rdflib.plugins.parsers.nt", "NTParser")
-register("ntriples", Parser, "rdflib.plugins.parsers.nt", "NTParser")
-register("nt", Parser, "rdflib.plugins.parsers.nt", "NTParser")
-register("nt11", Parser, "rdflib.plugins.parsers.nt", "NTParser")
+register("application/n-triples", Parser, "rdflib.plugins.parsers.ntriples", "NTParser")
+register("ntriples", Parser, "rdflib.plugins.parsers.ntriples", "NTParser")
+register("nt", Parser, "rdflib.plugins.parsers.ntriples", "NTParser")
+register("nt11", Parser, "rdflib.plugins.parsers.ntriples", "NTParser")
 register("application/n-quads", Parser, "rdflib.plugins.parsers.nquads", "NQuadsParser")
 register("nquads", Parser, "rdflib.plugins.parsers.nquads", "NQuadsParser")
 register("application/trix", Parser, "rdflib.plugins.parsers.trix", "TriXParser")
diff --git a/rdflib/plugins/parsers/notation3.py b/rdflib/plugins/parsers/notation3.py
index c427f153..d866977d 100755
--- a/rdflib/plugins/parsers/notation3.py
+++ b/rdflib/plugins/parsers/notation3.py
@@ -139,10 +139,13 @@ def join(here, there):
         return here + frag
 
     # join('mid:foo@example', '../foo') bzzt
-    if here[bcolonl + 1: bcolonl + 2] != "/":
-        raise ValueError("Base <%s> has no slash after " "colon - with relative '%s'." % (here, there))
+    if here[bcolonl + 1 : bcolonl + 2] != "/":
+        raise ValueError(
+            "Base <%s> has no slash after "
+            "colon - with relative '%s'." % (here, there)
+        )
 
-    if here[bcolonl + 1: bcolonl + 3] == "//":
+    if here[bcolonl + 1 : bcolonl + 3] == "//":
         bpath = here.find("/", bcolonl + 3)
     else:
         bpath = bcolonl + 1
@@ -502,14 +505,14 @@ class SinkParser:
         """
 
         assert tok[0] not in _notNameChars  # not for punctuation
-        if argstr[i: i + 1] == "@":
+        if argstr[i : i + 1] == "@":
             i = i + 1
         else:
             if tok not in self.keywords:
                 return -1  # No, this has neither keywords declaration nor "@"
 
         if (
-            argstr[i: i + len(tok)] == tok
+            argstr[i : i + len(tok)] == tok
             and (argstr[i + len(tok)] in _notKeywordsChars)
             or (colon and argstr[i + len(tok)] == ":")
         ):
@@ -526,7 +529,7 @@ class SinkParser:
 
         assert tok[0] not in _notNameChars  # not for punctuation
 
-        if argstr[i: i + len(tok)].lower() == tok.lower() and (
+        if argstr[i : i + len(tok)].lower() == tok.lower() and (
             argstr[i + len(tok)] in _notQNameChars
         ):
             i = i + len(tok)
@@ -794,23 +797,23 @@ class SinkParser:
             res.append(("->", RDF_type))
             return j
 
-        if argstr[i: i + 2] == "<=":
+        if argstr[i : i + 2] == "<=":
             if self.turtle:
                 self.BadSyntax(argstr, i, "Found '<=' in Turtle mode. ")
 
             res.append(("<-", self._store.newSymbol(Logic_NS + "implies")))
             return i + 2
 
-        if argstr[i: i + 1] == "=":
+        if argstr[i : i + 1] == "=":
             if self.turtle:
                 self.BadSyntax(argstr, i, "Found '=' in Turtle mode")
-            if argstr[i + 1: i + 2] == ">":
+            if argstr[i + 1 : i + 2] == ">":
                 res.append(("->", self._store.newSymbol(Logic_NS + "implies")))
                 return i + 2
             res.append(("->", DAML_sameAs))
             return i + 1
 
-        if argstr[i: i + 2] == ":=":
+        if argstr[i : i + 2] == ":=":
             if self.turtle:
                 self.BadSyntax(argstr, i, "Found ':=' in Turtle mode")
 
@@ -823,7 +826,7 @@ class SinkParser:
             res.append(("->", r[0]))
             return j
 
-        if argstr[i: i + 2] == ">-" or argstr[i: i + 2] == "<-":
+        if argstr[i : i + 2] == ">-" or argstr[i : i + 2] == "<-":
             self.BadSyntax(argstr, j, ">- ... -> syntax is obsolete.")
 
         return -1
@@ -844,8 +847,8 @@ class SinkParser:
         if j < 0:
             return j  # nope
 
-        while argstr[j: j + 1] in "!^":  # no spaces, must follow exactly (?)
-            ch = argstr[j: j + 1]
+        while argstr[j : j + 1] in "!^":  # no spaces, must follow exactly (?)
+            ch = argstr[j : j + 1]
             subj = res.pop()
             obj = self.blankNode(uri=self.here(j))
             j = self.node(argstr, j + 1, res)
@@ -879,7 +882,7 @@ class SinkParser:
         if j < 0:
             return j  # eof
         i = j
-        ch = argstr[i: i + 1]  # Quick 1-character checks first:
+        ch = argstr[i : i + 1]  # Quick 1-character checks first:
 
         if ch == "[":
             bnodeID = self.here(i)
@@ -887,7 +890,7 @@ class SinkParser:
             if j < 0:
                 self.BadSyntax(argstr, i, "EOF after '['")
             # Hack for "is" binding name to anon node
-            if argstr[j: j + 1] == "=":
+            if argstr[j : j + 1] == "=":
                 if self.turtle:
                     self.BadSyntax(
                         argstr, j, "Found '[=' or '[ =' when in turtle mode."
@@ -905,7 +908,7 @@ class SinkParser:
                         self.BadSyntax(
                             argstr, i, "EOF when objectList expected after [ = "
                         )
-                    if argstr[j: j + 1] == ";":
+                    if argstr[j : j + 1] == ";":
                         j = j + 1
                 else:
                     self.BadSyntax(argstr, i, "objectList expected after [= ")
@@ -922,7 +925,7 @@ class SinkParser:
                 self.BadSyntax(
                     argstr, i, "EOF when ']' expected after [ <propertyList>"
                 )
-            if argstr[j: j + 1] != "]":
+            if argstr[j : j + 1] != "]":
                 self.BadSyntax(argstr, j, "']' expected")
             res.append(subj)
             return j + 1
@@ -931,7 +934,7 @@ class SinkParser:
             # if self.turtle:
             #     self.BadSyntax(argstr, i,
             #                     "found '{' while in Turtle mode, Formulas not supported!")
-            ch2 = argstr[i + 1: i + 2]
+            ch2 = argstr[i + 1 : i + 2]
             if ch2 == "$":
                 # a set
                 i += 1
@@ -942,12 +945,12 @@ class SinkParser:
                     i = self.skipSpace(argstr, j)
                     if i < 0:
                         self.BadSyntax(argstr, i, "needed '$}', found end.")
-                    if argstr[i: i + 2] == "$}":
+                    if argstr[i : i + 2] == "$}":
                         j = i + 2
                         break
 
                     if not first_run:
-                        if argstr[i: i + 1] == ",":
+                        if argstr[i : i + 1] == ",":
                             i += 1
                         else:
                             self.BadSyntax(argstr, i, "expected: ','")
@@ -982,7 +985,7 @@ class SinkParser:
                     if i < 0:
                         self.BadSyntax(argstr, i, "needed '}', found end.")
 
-                    if argstr[i: i + 1] == "}":
+                    if argstr[i : i + 1] == "}":
                         j = i + 1
                         break
 
@@ -1001,7 +1004,7 @@ class SinkParser:
 
         if ch == "(":
             thing_type = self._store.newList
-            ch2 = argstr[i + 1: i + 2]
+            ch2 = argstr[i + 1 : i + 2]
             if ch2 == "$":
                 thing_type = self._store.newSet
                 i += 1
@@ -1012,7 +1015,7 @@ class SinkParser:
                 i = self.skipSpace(argstr, j)
                 if i < 0:
                     self.BadSyntax(argstr, i, "needed ')', found end.")
-                if argstr[i: i + 1] == ")":
+                if argstr[i : i + 1] == ")":
                     j = i + 1
                     break
 
@@ -1065,7 +1068,7 @@ class SinkParser:
                     break
                 i = j + 1
 
-            if argstr[j: j + 2] == ":-":
+            if argstr[j : j + 2] == ":-":
                 if self.turtle:
                     self.BadSyntax(argstr, j, "Found in ':-' in Turtle mode")
                 i = j + 2
@@ -1095,7 +1098,7 @@ class SinkParser:
             j = self.skipSpace(argstr, i)
             if j < 0:
                 self.BadSyntax(argstr, j, "EOF found in list of objects")
-            if argstr[i: i + 1] != ";":
+            if argstr[i : i + 1] != ";":
                 return i
             i = i + 1  # skip semicolon and continue
 
@@ -1116,7 +1119,7 @@ class SinkParser:
             j = self.skipSpace(argstr, i)
             if j < 0:
                 return j  # eof
-            ch = argstr[j: j + 1]
+            ch = argstr[j : j + 1]
             if ch != ",":
                 if ch != ".":
                     return -1
@@ -1133,7 +1136,7 @@ class SinkParser:
             j = self.skipSpace(argstr, i)
             if j < 0:
                 self.BadSyntax(argstr, j, "EOF found after object")
-            if argstr[j: j + 1] != ",":
+            if argstr[j : j + 1] != ",":
                 return j  # Found something else!
             i = self.object(argstr, j + 1, res)
             if i < 0:
@@ -1143,11 +1146,11 @@ class SinkParser:
         j = self.skipSpace(argstr, i)
         if j < 0:
             return j  # eof
-        if argstr[j: j + 1] == ".":
+        if argstr[j : j + 1] == ".":
             return j + 1  # skip
-        if argstr[j: j + 1] == "}":
+        if argstr[j : j + 1] == "}":
             return j  # don't skip it
-        if argstr[j: j + 1] == "]":
+        if argstr[j : j + 1] == "]":
             return j
         self.BadSyntax(argstr, j, "expected '.' or '}' or ']' at end of statement")
 
@@ -1212,7 +1215,7 @@ class SinkParser:
                         assert (
                             ":" in uref
                         ), "With no base URI, cannot deal with relative URIs"
-                    if argstr[i - 1: i] == "#" and not uref[-1:] == "#":
+                    if argstr[i - 1 : i] == "#" and not uref[-1:] == "#":
                         uref = uref + "#"  # She meant it! Weirdness in urlparse?
                     symb = self._store.newSymbol(uref)
                     if symb in self._variables:
@@ -1261,7 +1264,7 @@ class SinkParser:
         if j < 0:
             return -1
 
-        if argstr[j: j + 1] != "?":
+        if argstr[j : j + 1] != "?":
             return -1
         j = j + 1
         i = j
@@ -1419,7 +1422,7 @@ class SinkParser:
                 i = j
 
             if argstr[i] in self.string_delimiters:
-                if argstr[i: i + 3] == argstr[i] * 3:
+                if argstr[i : i + 3] == argstr[i] * 3:
                     delim = argstr[i] * 3
                 else:
                     delim = argstr[i]
@@ -1467,7 +1470,7 @@ class SinkParser:
                 # return -1  ## or fall through?
 
             if argstr[i] in self.string_delimiters:
-                if argstr[i: i + 3] == argstr[i] * 3:
+                if argstr[i : i + 3] == argstr[i] * 3:
                     delim = argstr[i] * 3
                 else:
                     delim = argstr[i]
@@ -1476,7 +1479,7 @@ class SinkParser:
                 dt = None
                 j, s = self.strconst(argstr, i, delim)
                 lang = None
-                if argstr[j: j + 1] == "@":  # Language?
+                if argstr[j : j + 1] == "@":  # Language?
                     m = langcode.match(argstr, j + 1)
                     if m is None:
                         raise BadSyntax(
@@ -1487,9 +1490,9 @@ class SinkParser:
                             "Bad language code syntax on string " + "literal, after @",
                         )
                     i = m.end()
-                    lang = argstr[j + 1: i]
+                    lang = argstr[j + 1 : i]
                     j = i
-                if argstr[j: j + 2] == "^^":
+                if argstr[j : j + 2] == "^^":
                     res2 = []
                     j = self.uri_ref2(argstr, j + 2, res2)  # Read datatype URI
                     dt = res2[0]
@@ -1522,15 +1525,15 @@ class SinkParser:
                 if (
                     delim == delim3
                 ):  # done when delim is """ or ''' and, respectively ...
-                    if argstr[j: j + 5] == delim5:  # ... we have "" or '' before
+                    if argstr[j : j + 5] == delim5:  # ... we have "" or '' before
                         i = j + 5
                         ustr = ustr + delim2
                         return i, ustr
-                    if argstr[j: j + 4] == delim4:  # ... we have " or ' before
+                    if argstr[j : j + 4] == delim4:  # ... we have " or ' before
                         i = j + 4
                         ustr = ustr + delim1
                         return i, ustr
-                    if argstr[j: j + 3] == delim3:  # current " or ' is part of delim
+                    if argstr[j : j + 3] == delim3:  # current " or ' is part of delim
                         i = j + 3
                         return i, ustr
 
@@ -1542,8 +1545,8 @@ class SinkParser:
             m = interesting.search(argstr, j)  # was argstr[j:].
             # Note for pos param to work, MUST be compiled  ... re bug?
             assert m, "Quote expected in string at ^ in %s^%s" % (
-                argstr[j - 20: j],
-                argstr[j: j + 20],
+                argstr[j - 20 : j],
+                argstr[j : j + 20],
             )  # at least need a quote
 
             i = m.start()
@@ -1589,7 +1592,7 @@ class SinkParser:
 
             elif ch == "\\":
                 j = i + 1
-                ch = argstr[j: j + 1]  # Will be empty if string ends
+                ch = argstr[j : j + 1]  # Will be empty if string ends
                 if not ch:
                     raise BadSyntax(
                         self._thisDoc,
@@ -1620,14 +1623,14 @@ class SinkParser:
                 self._thisDoc, startline, argstr, i, "unterminated string literal(3)"
             )
         try:
-            return i + n, reg.sub(unicodeExpand, "\\" + prefix + argstr[i: i + n])
+            return i + n, reg.sub(unicodeExpand, "\\" + prefix + argstr[i : i + n])
         except:
             raise BadSyntax(
                 self._thisDoc,
                 startline,
                 argstr,
                 i,
-                "bad string literal hex escape: " + argstr[i: i + n],
+                "bad string literal hex escape: " + argstr[i : i + n],
             )
 
     def uEscape(self, argstr, i, startline):
@@ -1672,7 +1675,7 @@ class BadSyntax(SyntaxError):
             self._why,
             pre,
             argstr[st:i],
-            argstr[i: i + 60],
+            argstr[i : i + 60],
             post,
         )
 
@@ -1896,8 +1899,11 @@ class TurtleParser(Parser):
 
         baseURI = graph.absolutize(source.getPublicId() or source.getSystemId() or "")
         p = SinkParser(sink, baseURI=baseURI, turtle=turtle)
-
-        p.loadStream(source.getByteStream())
+        # N3 parser prefers str stream
+        stream = source.getCharacterStream()
+        if not stream:
+            stream = source.getByteStream()
+        p.loadStream(stream)
 
         for prefix, namespace in p._bindings.items():
             graph.bind(prefix, namespace)
diff --git a/rdflib/plugins/parsers/nquads.py b/rdflib/plugins/parsers/nquads.py
index a3bfbc6e..2a3a9136 100644
--- a/rdflib/plugins/parsers/nquads.py
+++ b/rdflib/plugins/parsers/nquads.py
@@ -31,7 +31,7 @@ from codecs import getreader
 from rdflib import ConjunctiveGraph
 
 # Build up from the NTriples parser:
-from rdflib.plugins.parsers.ntriples import NTriplesParser
+from rdflib.plugins.parsers.ntriples import W3CNTriplesParser
 from rdflib.plugins.parsers.ntriples import ParseError
 from rdflib.plugins.parsers.ntriples import r_tail
 from rdflib.plugins.parsers.ntriples import r_wspace
@@ -39,7 +39,7 @@ from rdflib.plugins.parsers.ntriples import r_wspace
 __all__ = ["NQuadsParser"]
 
 
-class NQuadsParser(NTriplesParser):
+class NQuadsParser(W3CNTriplesParser):
     def parse(self, inputsource, sink, bnode_context=None, **kwargs):
         """
         Parse inputsource as an N-Quads file.
@@ -57,13 +57,14 @@ class NQuadsParser(NTriplesParser):
         )
         self.sink = ConjunctiveGraph(store=sink.store, identifier=sink.identifier)
 
-        source = inputsource.getByteStream()
+        source = inputsource.getCharacterStream()
+        if not source:
+            source = inputsource.getByteStream()
+            source = getreader("utf-8")(source)
 
         if not hasattr(source, "read"):
             raise ParseError("Item to parse must be a file-like object.")
 
-        source = getreader("utf-8")(source)
-
         self.file = source
         self.buffer = ""
         while True:
diff --git a/rdflib/plugins/parsers/nt.py b/rdflib/plugins/parsers/nt.py
deleted file mode 100644
index c37a1aa0..00000000
--- a/rdflib/plugins/parsers/nt.py
+++ /dev/null
@@ -1,33 +0,0 @@
-from rdflib.parser import Parser
-from rdflib.plugins.parsers.ntriples import NTriplesParser
-
-__all__ = ["NTSink", "NTParser"]
-
-
-class NTSink(object):
-    def __init__(self, graph):
-        self.graph = graph
-
-    def triple(self, s, p, o):
-        self.graph.add((s, p, o))
-
-
-class NTParser(Parser):
-    """parser for the ntriples format, often stored with the .nt extension
-
-    See http://www.w3.org/TR/rdf-testcases/#ntriples"""
-
-    def parse(self, source, sink, **kwargs):
-        '''
-        Parse the NT format
-
-        :type source: `rdflib.parser.InputSource`
-        :param source: the source of NT-formatted data
-        :type sink: `rdflib.graph.Graph`
-        :param sink: where to send parsed triples
-        :param kwargs: Additional arguments to pass to `.NTriplesParser.parse`
-        '''
-        f = source.getByteStream()  # TODO getCharacterStream?
-        parser = NTriplesParser(NTSink(sink))
-        parser.parse(f, **kwargs)
-        f.close()
diff --git a/rdflib/plugins/parsers/ntriples.py b/rdflib/plugins/parsers/ntriples.py
index 33a4a4e6..d43a240c 100644
--- a/rdflib/plugins/parsers/ntriples.py
+++ b/rdflib/plugins/parsers/ntriples.py
@@ -1,9 +1,6 @@
-#!/usr/bin/env python
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
+#!/usr/bin/env python3
 
-__doc__ = """
+__doc__ = """\
 N-Triples Parser
 License: GPL 2, W3C, BSD, or MIT
 Author: Sean B. Palmer, inamidst.com
@@ -15,14 +12,13 @@ import codecs
 from rdflib.term import URIRef as URI
 from rdflib.term import BNode as bNode
 from rdflib.term import Literal
-
-
-from rdflib.compat import cast_bytes
 from rdflib.compat import decodeUnicodeEscape
+from rdflib.exceptions import ParserError as ParseError
+from rdflib.parser import Parser
 
-from io import BytesIO
+from io import StringIO, TextIOBase, BytesIO
 
-__all__ = ["unquote", "uriquote", "Sink", "NTriplesParser"]
+__all__ = ["unquote", "uriquote", "W3CNTriplesParser", "NTGraphSink", "NTParser"]
 
 uriref = r'<([^:]+:[^\s"<>]*)>'
 literal = r'"([^"\\]*(?:\\.[^"\\]*)*)"'
@@ -40,15 +36,7 @@ bufsiz = 2048
 validate = False
 
 
-class Node(str):
-    pass
-
-
-class ParseError(Exception):
-    pass
-
-
-class Sink(object):
+class DummySink(object):
     def __init__(self):
         self.length = 0
 
@@ -78,7 +66,7 @@ def unquote(s):
         while s:
             m = r_safe.match(s)
             if m:
-                s = s[m.end():]
+                s = s[m.end() :]
                 result.append(m.group(1))
                 continue
 
@@ -90,7 +78,7 @@ def unquote(s):
 
             m = r_uniquot.match(s)
             if m:
-                s = s[m.end():]
+                s = s[m.end() :]
                 u, U = m.groups()
                 codepoint = int(u or U, 16)
                 if codepoint > 0x10FFFF:
@@ -113,11 +101,10 @@ def uriquote(uri):
         return r_hibyte.sub(lambda m: "%%%02X" % ord(m.group(1)), uri)
 
 
-class NTriplesParser(object):
+class W3CNTriplesParser(object):
     """An N-Triples Parser.
-
+    This is a legacy-style Triples parser for NTriples provided by W3C
     Usage::
-
           p = NTriplesParser(sink=MySink())
           sink = p.parse(f) # file; use parsestring for a string
 
@@ -127,6 +114,8 @@ class NTriplesParser(object):
     `NTriplesParser`.
     """
 
+    __slots__ = ("_bnode_ids", "sink", "buffer", "file", "line")
+
     def __init__(self, sink=None, bnode_context=None):
         if bnode_context is not None:
             self._bnode_ids = bnode_context
@@ -136,7 +125,11 @@ class NTriplesParser(object):
         if sink is not None:
             self.sink = sink
         else:
-            self.sink = Sink()
+            self.sink = DummySink()
+
+        self.buffer = None
+        self.file = None
+        self.line = ""
 
     def parse(self, f, bnode_context=None):
         """
@@ -150,10 +143,13 @@ class NTriplesParser(object):
                               passed in to define a distinct context for a given call to
                               `parse`.
         """
+
         if not hasattr(f, "read"):
             raise ParseError("Item to parse must be a file-like object.")
-        # since N-Triples 1.1 files can and should be utf-8 encoded
-        f = codecs.getreader("utf-8")(f)
+
+        if not hasattr(f, "encoding") and not hasattr(f, "charbuffer"):
+            # someone still using a bytestream here?
+            f = codecs.getreader("utf-8")(f)
 
         self.file = f
         self.buffer = ""
@@ -164,16 +160,17 @@ class NTriplesParser(object):
             try:
                 self.parseline(bnode_context=bnode_context)
             except ParseError:
-                raise ParseError("Invalid line: %r" % self.line)
+                raise ParseError("Invalid line: {}".format(self.line))
         return self.sink
 
     def parsestring(self, s, **kwargs):
         """Parse s as an N-Triples string."""
-        if not isinstance(s, str):
+        if not isinstance(s, (str, bytes, bytearray)):
             raise ParseError("Item to parse must be a string instance.")
-        f = BytesIO()
-        f.write(cast_bytes(s))
-        f.seek(0)
+        if isinstance(s, (bytes, bytearray)):
+            f = codecs.getreader("utf-8")(BytesIO(s))
+        else:
+            f = StringIO(s)
         self.parse(f, **kwargs)
 
     def readline(self):
@@ -189,7 +186,7 @@ class NTriplesParser(object):
         while True:
             m = r_line.match(self.buffer)
             if m:  # the more likely prospect
-                self.buffer = self.buffer[m.end():]
+                self.buffer = self.buffer[m.end() :]
                 return m.group(1)
             else:
                 buffer = self.file.read(bufsiz)
@@ -211,12 +208,12 @@ class NTriplesParser(object):
         predicate = self.predicate()
         self.eat(r_wspaces)
 
-        object = self.object(bnode_context)
+        object_ = self.object(bnode_context)
         self.eat(r_tail)
 
         if self.line:
-            raise ParseError("Trailing garbage")
-        self.sink.triple(subject, predicate, object)
+            raise ParseError("Trailing garbage: {}".format(self.line))
+        self.sink.triple(subject, predicate, object_)
 
     def peek(self, token):
         return self.line.startswith(token)
@@ -227,7 +224,7 @@ class NTriplesParser(object):
             # print(dir(pattern))
             # print repr(self.line), type(self.line)
             raise ParseError("Failed to eat %s at %s" % (pattern.pattern, self.line))
-        self.line = self.line[m.end():]
+        self.line = self.line[m.end() :]
         return m
 
     def subject(self, bnode_context=None):
@@ -295,13 +292,44 @@ class NTriplesParser(object):
         return False
 
 
-# # Obsolete, unused
-# def parseURI(uri):
-#     import urllib
-#     parser = NTriplesParser()
-#     u = urllib.urlopen(uri)
-#     sink = parser.parse(u)
-#     u.close()
-#     # for triple in sink:
-#     #     print triple
-#     print 'Length of input:', sink.length
+class NTGraphSink(object):
+    __slots__ = ("g",)
+
+    def __init__(self, graph):
+        self.g = graph
+
+    def triple(self, s, p, o):
+        self.g.add((s, p, o))
+
+
+class NTParser(Parser):
+    """parser for the ntriples format, often stored with the .nt extension
+
+    See http://www.w3.org/TR/rdf-testcases/#ntriples"""
+
+    __slots__ = set()
+
+    @classmethod
+    def parse(cls, source, sink, **kwargs):
+        """
+        Parse the NT format
+
+        :type source: `rdflib.parser.InputSource`
+        :param source: the source of NT-formatted data
+        :type sink: `rdflib.graph.Graph`
+        :param sink: where to send parsed triples
+        :param kwargs: Additional arguments to pass to `.W3CNTriplesParser.parse`
+        """
+        f = source.getCharacterStream()
+        if not f:
+            b = source.getByteStream()
+            # TextIOBase includes: StringIO and TextIOWrapper
+            if isinstance(b, TextIOBase):
+                # b is not really a ByteStream, but a CharacterStream
+                f = b
+            else:
+                # since N-Triples 1.1 files can and should be utf-8 encoded
+                f = codecs.getreader("utf-8")(b)
+        parser = W3CNTriplesParser(NTGraphSink(sink))
+        parser.parse(f, **kwargs)
+        f.close()
diff --git a/rdflib/plugins/parsers/trig.py b/rdflib/plugins/parsers/trig.py
index 8f270de0..938fb259 100644
--- a/rdflib/plugins/parsers/trig.py
+++ b/rdflib/plugins/parsers/trig.py
@@ -82,7 +82,7 @@ class TrigSinkParser(SinkParser):
         if j < 0:
             self.BadSyntax(argstr, i, "EOF found when expected graph")
 
-        if argstr[j: j + 1] == "=":  # optional = for legacy support
+        if argstr[j : j + 1] == "=":  # optional = for legacy support
 
             i = self.skipSpace(argstr, j + 1)
             if i < 0:
@@ -90,7 +90,7 @@ class TrigSinkParser(SinkParser):
         else:
             i = j
 
-        if argstr[i: i + 1] != "{":
+        if argstr[i : i + 1] != "{":
             return -1  # the node wasn't part of a graph
 
         j = i + 1
@@ -106,7 +106,7 @@ class TrigSinkParser(SinkParser):
             if i < 0:
                 self.BadSyntax(argstr, i, "needed '}', found end.")
 
-            if argstr[i: i + 1] == "}":
+            if argstr[i : i + 1] == "}":
                 j = i + 1
                 break
 
@@ -153,7 +153,11 @@ class TrigParser(Parser):
         )
         p = TrigSinkParser(sink, baseURI=baseURI, turtle=True)
 
-        p.loadStream(source.getByteStream())
+        stream = source.getCharacterStream()  # try to get str stream first
+        if not stream:
+            # fallback to get the bytes stream
+            stream = source.getByteStream()
+        p.loadStream(stream)
 
         for prefix, namespace in p._bindings.items():
             conj_graph.bind(prefix, namespace)
diff --git a/test/test_nt_misc.py b/test/test_nt_misc.py
index af7049d8..2d25e742 100644
--- a/test/test_nt_misc.py
+++ b/test/test_nt_misc.py
@@ -34,8 +34,34 @@ class NTTestCase(unittest.TestCase):
         s = g.serialize(format="nt").strip()
         self.assertEqual(s, '<foo> <foo> "test\\n"@en .'.encode("latin-1"))
 
+    def testIssue1144_rdflib(self):
+        fname = "test/nt/lists-02.nt"
+        g1 = Graph()
+        with open(fname, "r") as f:
+            g1.parse(f, format='nt')
+        self.assertEqual(14, len(g1))
+        g2 = Graph()
+        with open(fname, "rb") as fb:
+            g2.parse(fb, format='nt')
+        self.assertEqual(14, len(g2))
+
+
+    def testIssue1144_w3c(self):
+        fname = "test/nt/lists-02.nt"
+        sink1 = ntriples.NTGraphSink(Graph())
+        p1 = ntriples.W3CNTriplesParser(sink1)
+        with open(fname, "r") as f:
+            p1.parse(f)
+        self.assertEqual(14, len(sink1.g))
+        sink2 = ntriples.NTGraphSink(Graph())
+        p2 = ntriples.W3CNTriplesParser(sink2)
+        with open(fname, "rb") as f:
+            p2.parse(f)
+        self.assertEqual(14, len(sink2.g))
+
+
     def test_sink(self):
-        s = ntriples.Sink()
+        s = ntriples.DummySink()
         self.assertTrue(s.length == 0)
         s.triple(None, None, None)
         self.assertTrue(s.length == 1)
@@ -77,26 +103,26 @@ class NTTestCase(unittest.TestCase):
         ntriples.validate = False
         self.assertEqual(res, uniquot)
 
-    def test_NTriplesParser_fpath(self):
+    def test_W3CNTriplesParser_fpath(self):
         fpath = "test/nt/" + os.listdir("test/nt")[0]
-        p = ntriples.NTriplesParser()
+        p = ntriples.W3CNTriplesParser()
         self.assertRaises(ntriples.ParseError, p.parse, fpath)
 
-    def test_NTriplesParser_parsestring(self):
-        p = ntriples.NTriplesParser()
+    def test_W3CNTriplesParser_parsestring(self):
+        p = ntriples.W3CNTriplesParser()
         data = 3
         self.assertRaises(ntriples.ParseError, p.parsestring, data)
         fname = "test/nt/lists-02.nt"
         with open(fname, "r") as f:
             data = f.read()
-        p = ntriples.NTriplesParser()
+        p = ntriples.W3CNTriplesParser()
         res = p.parsestring(data)
         self.assertTrue(res == None)
 
     def test_w3_ntriple_variants(self):
         uri = "file:///" + os.getcwd() + "/test/nt/test.ntriples"
 
-        parser = ntriples.NTriplesParser()
+        parser = ntriples.W3CNTriplesParser()
         u = urlopen(uri)
         sink = parser.parse(u)
         u.close()
@@ -107,14 +133,14 @@ class NTTestCase(unittest.TestCase):
         data = (
             """<http://example.org/resource32> 3 <http://example.org/datatype1> .\n"""
         )
-        p = ntriples.NTriplesParser()
+        p = ntriples.W3CNTriplesParser()
         self.assertRaises(ntriples.ParseError, p.parsestring, data)
 
     def test_cover_eat(self):
         data = (
             """<http://example.org/resource32> 3 <http://example.org/datatype1> .\n"""
         )
-        p = ntriples.NTriplesParser()
+        p = ntriples.W3CNTriplesParser()
         p.line = data
         self.assertRaises(
             ntriples.ParseError, p.eat, re.compile("<http://example.org/datatype1>")
@@ -122,7 +148,7 @@ class NTTestCase(unittest.TestCase):
 
     def test_cover_subjectobjectliteral(self):
         # data = '''<http://example.org/resource32> 3 <http://example.org/datatype1> .\n'''
-        p = ntriples.NTriplesParser()
+        p = ntriples.W3CNTriplesParser()
         p.line = "baz"
         self.assertRaises(ntriples.ParseError, p.subject)
         self.assertRaises(ntriples.ParseError, p.object)
@@ -134,12 +160,12 @@ class BNodeContextTestCase(unittest.TestCase):
     def test_bnode_shared_across_instances(self):
         my_sink = FakeSink()
         bnode_context = dict()
-        p = ntriples.NTriplesParser(my_sink, bnode_context=bnode_context)
+        p = ntriples.W3CNTriplesParser(my_sink, bnode_context=bnode_context)
         p.parsestring('''
         _:0 <http://purl.obolibrary.org/obo/RO_0002350> <http://www.gbif.org/species/0000001> .
         ''')
 
-        q = ntriples.NTriplesParser(my_sink, bnode_context=bnode_context)
+        q = ntriples.W3CNTriplesParser(my_sink, bnode_context=bnode_context)
         q.parsestring('''
         _:0 <http://purl.obolibrary.org/obo/RO_0002350> <http://www.gbif.org/species/0000002> .
         ''')
@@ -148,12 +174,12 @@ class BNodeContextTestCase(unittest.TestCase):
 
     def test_bnode_distinct_across_instances(self):
         my_sink = FakeSink()
-        p = ntriples.NTriplesParser(my_sink)
+        p = ntriples.W3CNTriplesParser(my_sink)
         p.parsestring('''
         _:0 <http://purl.obolibrary.org/obo/RO_0002350> <http://www.gbif.org/species/0000001> .
         ''')
 
-        q = ntriples.NTriplesParser(my_sink)
+        q = ntriples.W3CNTriplesParser(my_sink)
         q.parsestring('''
         _:0 <http://purl.obolibrary.org/obo/RO_0002350> <http://www.gbif.org/species/0000002> .
         ''')
@@ -162,7 +188,7 @@ class BNodeContextTestCase(unittest.TestCase):
 
     def test_bnode_distinct_across_parse(self):
         my_sink = FakeSink()
-        p = ntriples.NTriplesParser(my_sink)
+        p = ntriples.W3CNTriplesParser(my_sink)
 
         p.parsestring('''
         _:0 <http://purl.obolibrary.org/obo/RO_0002350> <http://www.gbif.org/species/0000001> .
@@ -176,7 +202,7 @@ class BNodeContextTestCase(unittest.TestCase):
 
     def test_bnode_shared_across_parse(self):
         my_sink = FakeSink()
-        p = ntriples.NTriplesParser(my_sink)
+        p = ntriples.W3CNTriplesParser(my_sink)
 
         p.parsestring('''
         _:0 <http://purl.obolibrary.org/obo/RO_0002350> <http://www.gbif.org/species/0000001> .
@@ -192,12 +218,12 @@ class BNodeContextTestCase(unittest.TestCase):
         my_sink = FakeSink()
         bnode_ctx = dict()
 
-        p = ntriples.NTriplesParser(my_sink)
+        p = ntriples.W3CNTriplesParser(my_sink)
         p.parsestring('''
         _:0 <http://purl.obolibrary.org/obo/RO_0002350> <http://www.gbif.org/species/0000001> .
         ''', bnode_context=bnode_ctx)
 
-        q = ntriples.NTriplesParser(my_sink)
+        q = ntriples.W3CNTriplesParser(my_sink)
         q.parsestring('''
         _:0 <http://purl.obolibrary.org/obo/RO_0002350> <http://www.gbif.org/species/0000002> .
         ''', bnode_context=bnode_ctx)
author	Nicholas Car <nicholas.car@surroundaustralia.com>	2020-08-27 13:13:45 +1000
committer	GitHub <noreply@github.com>	2020-08-27 13:13:45 +1000
commit	3afffcd19d3a5d240e83b3a59b53e3ee1120c165 (patch)
tree	42ba0191f0a8f645cbc5b60aefd8a3cbfc383a8b
parent	3e42f5eea742563cdeab7d655fe55f7d0e25ea16 (diff)
parent	94295389204175783c2f369c2826f0ba55a2d42c (diff)
download	rdflib-improve_graph_parse.tar.gz