ENH: Multiple comment tokens in loadtxt

author: Joseph Martinot-Lagarde <joseph.martinot-lagarde@onera.fr> 2015-04-22 11:53:58 +0200
committer: Joseph Martinot-Lagarde <joseph.martinot-lagarde@onera.fr> 2015-04-22 11:53:58 +0200
commit: 36dbfa5dfd62c559dfdc4aa49bb0192df8a33abd (patch)
tree: 15e04c4e42f976708c4fb5a21d75a3af598fdbae /numpy/lib
parent: 8b6effadd7836f7e80f0f3e7dd9dd43d20ad1590 (diff)
download: numpy-36dbfa5dfd62c559dfdc4aa49bb0192df8a33abd.tar.gz
2 files changed, 52 insertions, 10 deletions
diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py
index 5ebeae6c3..b56d7d5a9 100644
--- a/numpy/lib/npyio.py
+++ b/numpy/lib/npyio.py
@@ -717,8 +717,9 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
         each row will be interpreted as an element of the array.  In this
         case, the number of columns used must match the number of fields in
         the data-type.
-    comments : str, optional
-        The character used to indicate the start of a comment;
+    comments : str or sequence of str, optional
+        The string, or list of strings, any of which indicates the start of
+        a comment (a multi-character string is matched as a whole);
         default: '#'.
     delimiter : str, optional
         The string used to separate values.  By default, this is any
@@ -791,7 +792,14 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
     """
     # Type conversions for Py3 convenience
     if comments is not None:
-        comments = asbytes(comments)
+        if isinstance(comments, (basestring, bytes)):
+            comments = [asbytes(comments)]
+        else:
+            comments = [asbytes(comment) for comment in comments]
+
+        # Compile regex for comments beforehand
+        comments = (re.escape(comment) for comment in comments)
+        regex_comments = re.compile(asbytes('|').join(comments))
     user_converters = converters
     if delimiter is not None:
         delimiter = asbytes(delimiter)
@@ -869,10 +877,10 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
         returns bytes.
 
         """
-        if comments is None:
-            line = asbytes(line).strip(asbytes('\r\n'))
-        else:
-            line = asbytes(line).split(comments)[0].strip(asbytes('\r\n'))
+        line = asbytes(line)
+        if comments is not None:
+            line = regex_comments.split(asbytes(line), maxsplit=1)[0]
+        line = line.strip(asbytes('\r\n'))
         if line:
             return line.split(delimiter)
         else:
diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py
index 8437be14f..8a939f85e 100644
--- a/numpy/lib/tests/test_io.py
+++ b/numpy/lib/tests/test_io.py
@@ -15,7 +15,7 @@ import numpy as np
 import numpy.ma as ma
 from numpy.lib._iotools import (ConverterError, ConverterLockError,
                                 ConversionWarning)
-from numpy.compat import asbytes, asbytes_nested, bytes, asstr
+from numpy.compat import asbytes, bytes, unicode
 from nose import SkipTest
 from numpy.ma.testutils import (
     TestCase, assert_equal, assert_array_equal, assert_allclose,
@@ -553,15 +553,49 @@ class TestLoadTxt(TestCase):
         a = np.array([[2, -999], [7, 9]], int)
         assert_array_equal(x, a)
 
-    def test_comments(self):
+    def test_comments_unicode(self):
         c = TextIO()
         c.write('# comment\n1,2,3,5\n')
         c.seek(0)
         x = np.loadtxt(c, dtype=int, delimiter=',',
-                       comments='#')
+                       comments=unicode('#'))
+        a = np.array([1, 2, 3, 5], int)
+        assert_array_equal(x, a)
+
+    def test_comments_byte(self):
+        c = TextIO()
+        c.write('# comment\n1,2,3,5\n')
+        c.seek(0)
+        x = np.loadtxt(c, dtype=int, delimiter=',',
+                       comments=b'#')
+        a = np.array([1, 2, 3, 5], int)
+        assert_array_equal(x, a)
+
+    def test_comments_multiple(self):
+        c = TextIO()
+        c.write('# comment\n1,2,3\n@ comment2\n4,5,6 // comment3')
+        c.seek(0)
+        x = np.loadtxt(c, dtype=int, delimiter=',',
+                       comments=['#', '@', '//'])
+        a = np.array([[1, 2, 3], [4, 5, 6]], int)
+        assert_array_equal(x, a)
+
+    def test_comments_multi_chars(self):
+        c = TextIO()
+        c.write('/* comment\n1,2,3,5\n')
+        c.seek(0)
+        x = np.loadtxt(c, dtype=int, delimiter=',',
+                       comments='/*')
         a = np.array([1, 2, 3, 5], int)
         assert_array_equal(x, a)
 
+        # Check that '/*' is not transformed to ['/', '*']
+        c = TextIO()
+        c.write('*/ comment\n1,2,3,5\n')
+        c.seek(0)
+        assert_raises(ValueError, np.loadtxt, c, dtype=int, delimiter=',',
+                      comments='/*')
+
     def test_skiprows(self):
         c = TextIO()
         c.write('comment\n1,2,3,5\n')
author	Joseph Martinot-Lagarde <joseph.martinot-lagarde@onera.fr>	2015-04-22 11:53:58 +0200
committer	Joseph Martinot-Lagarde <joseph.martinot-lagarde@onera.fr>	2015-04-22 11:53:58 +0200
commit	36dbfa5dfd62c559dfdc4aa49bb0192df8a33abd (patch)
tree	15e04c4e42f976708c4fb5a21d75a3af598fdbae /numpy/lib
parent	8b6effadd7836f7e80f0f3e7dd9dd43d20ad1590 (diff)
download	numpy-36dbfa5dfd62c559dfdc4aa49bb0192df8a33abd.tar.gz