utils: pinned down unicode/bytes handling for ab64/b64s helpers

author: Eli Collins <elic@assurancetechnologies.com> 2016-06-22 18:30:44 -0400
committer: Eli Collins <elic@assurancetechnologies.com> 2016-06-22 18:30:44 -0400
commit: febffda5c48ce918258cd89e200c314e92ee5bb7 (patch)
tree: dc808043bf1ea1b6bcafb7ac736db6f096b590aa
parent: d94b6212236e433b515b009581f1c165ecf69059 (diff)
download: passlib-febffda5c48ce918258cd89e200c314e92ee5bb7.tar.gz
4 files changed, 115 insertions, 20 deletions
diff --git a/docs/modular_crypt_format.rst b/docs/modular_crypt_format.rst
index 7080897..86988a6 100644
--- a/docs/modular_crypt_format.rst
+++ b/docs/modular_crypt_format.rst
@@ -73,7 +73,7 @@ by the modular crypt format hashes found in Passlib:
    they may use the ``$`` character as an internal field separator.
 
    This is the least adhered-to of any modular crypt format convention.
-   Other characters (such as ``=,-``) are sometimes
+   Other characters (such as ``+=,-``) are sometimes
    used by various formats, though sparingly.
 
    The only hard and fast stricture
@@ -81,11 +81,13 @@ by the modular crypt format hashes found in Passlib:
    since this would interfere with parsing of the Unix shadow password file,
    where these hashes are typically stored.
 
-   Pretty much all modular-crypt-format hashes
+   Pretty much all older modular-crypt-format hashes
    use ascii letters, numbers, ``.``, and ``/``
    to provide base64 encoding of their raw data,
    though the exact character value assignments vary between hashes
    (see :data:`passlib.utils.h64`).
+   Many newer hashes use ``+`` instead of ``.``,
+   to adhere more closely to the base64 standard.
 
 4. Hash schemes should put their "digest" portion
    at the end of the hash, preferably separated
diff --git a/passlib/tests/test_utils.py b/passlib/tests/test_utils.py
index e6183fe..19761ea 100644
--- a/passlib/tests/test_utils.py
+++ b/passlib/tests/test_utils.py
@@ -11,7 +11,7 @@ import warnings
 # module
 from passlib.utils import is_ascii_safe
 from passlib.utils.compat import irange, PY3, u, unicode, join_bytes, PYPY
-from passlib.tests.utils import TestCase
+from passlib.tests.utils import TestCase, hb
 
 #=============================================================================
 # byte funcs
@@ -550,13 +550,94 @@ class Base64EngineTest(TestCase):
         # dup charmap letter
         self.assertRaises(ValueError, Base64Engine, AB64_CHARS[:-1] + "A")
 
-    def test_ab64(self):
+    def test_ab64_decode(self):
+        """ab64_decode()"""
         from passlib.utils import ab64_decode
-        # TODO: make ab64_decode (and a b64 variant) *much* stricter about
-        # padding chars, etc.
 
-        # 1 mod 4 not valid
-        self.assertRaises(ValueError, ab64_decode, "abcde")
+        # accept bytes or unicode
+        self.assertEqual(ab64_decode(b"abc"), hb("69b7"))
+        self.assertEqual(ab64_decode(u("abc")), hb("69b7"))
+
+        # reject non-ascii unicode
+        self.assertRaises(ValueError, ab64_decode, u("ab\xff"))
+
+        # underlying a2b_base64 treats non-base64 chars as "Incorrect padding"
+        self.assertRaises(TypeError, ab64_decode, b"ab\xff")
+        self.assertRaises(TypeError, ab64_decode, b"ab!")
+        self.assertRaises(TypeError, ab64_decode, u("ab!"))
+
+        # insert correct padding, handle dirty padding bits
+        self.assertEqual(ab64_decode(b"abcd"), hb("69b71d"))  # 0 mod 4
+        self.assertRaises(ValueError, ab64_decode, b"abcde")  # 1 mod 4
+        self.assertEqual(ab64_decode(b"abcdef"), hb("69b71d79"))  # 2 mod 4, dirty padding bits
+        self.assertEqual(ab64_decode(b"abcdeQ"), hb("69b71d79"))  # 2 mod 4, clean padding bits
+        self.assertEqual(ab64_decode(b"abcdefg"), hb("69b71d79f8"))  # 3 mod 4, clean padding bits
+
+        # support "./" or "+/" altchars
+        # (lets us transition to "+/" representation, merge w/ b64s_decode)
+        self.assertEqual(ab64_decode(b"ab+/"), hb("69bfbf"))
+        self.assertEqual(ab64_decode(b"ab./"), hb("69bfbf"))
+
+    def test_ab64_encode(self):
+        """ab64_encode()"""
+        from passlib.utils import ab64_encode
+
+        # accept bytes
+        self.assertEqual(ab64_encode(hb("69b7")), b"abc")
+
+        # reject unicode
+        self.assertRaises(TypeError if PY3 else UnicodeEncodeError,
+                          ab64_encode, hb("69b7").decode("latin-1"))
+
+        # omit trailing padding from encoded output
+        self.assertEqual(ab64_encode(hb("69b71d")), b"abcd")  # 0 mod 4
+        self.assertEqual(ab64_encode(hb("69b71d79")), b"abcdeQ")  # 2 mod 4
+        self.assertEqual(ab64_encode(hb("69b71d79f8")), b"abcdefg")  # 3 mod 4
+
+        # output "./" altchars
+        self.assertEqual(ab64_encode(hb("69bfbf")), b"ab./")
+
+    def test_b64s_decode(self):
+        """b64s_decode()"""
+        from passlib.utils import b64s_decode
+
+        # accept bytes or unicode
+        self.assertEqual(b64s_decode(b"abc"), hb("69b7"))
+        self.assertEqual(b64s_decode(u("abc")), hb("69b7"))
+
+        # reject non-ascii unicode
+        self.assertRaises(ValueError, b64s_decode, u("ab\xff"))
+
+        # underlying a2b_base64 treats non-base64 chars as "Incorrect padding"
+        self.assertRaises(TypeError, b64s_decode, b"ab\xff")
+        self.assertRaises(TypeError, b64s_decode, b"ab!")
+        self.assertRaises(TypeError, b64s_decode, u("ab!"))
+
+        # insert correct padding, handle dirty padding bits
+        self.assertEqual(b64s_decode(b"abcd"), hb("69b71d"))  # 0 mod 4
+        self.assertRaises(ValueError, b64s_decode, b"abcde")  # 1 mod 4
+        self.assertEqual(b64s_decode(b"abcdef"), hb("69b71d79"))  # 2 mod 4, dirty padding bits
+        self.assertEqual(b64s_decode(b"abcdeQ"), hb("69b71d79"))  # 2 mod 4, clean padding bits
+        self.assertEqual(b64s_decode(b"abcdefg"), hb("69b71d79f8"))  # 3 mod 4, clean padding bits
+
+    def test_b64s_encode(self):
+        """b64s_encode()"""
+        from passlib.utils import b64s_encode
+
+        # accept bytes
+        self.assertEqual(b64s_encode(hb("69b7")), b"abc")
+
+        # reject unicode
+        self.assertRaises(TypeError if PY3 else UnicodeEncodeError,
+                          b64s_encode, hb("69b7").decode("latin-1"))
+
+        # omit trailing padding from encoded output
+        self.assertEqual(b64s_encode(hb("69b71d")), b"abcd")  # 0 mod 4
+        self.assertEqual(b64s_encode(hb("69b71d79")), b"abcdeQ")  # 2 mod 4
+        self.assertEqual(b64s_encode(hb("69b71d79f8")), b"abcdefg")  # 3 mod 4
+
+        # output "+/" altchars
+        self.assertEqual(b64s_encode(hb("69bfbf")), b"ab+/")
 
 class _Base64Test(TestCase):
     """common tests for all Base64Engine instances"""
diff --git a/passlib/tests/utils.py b/passlib/tests/utils.py
index 4b23082..917cae4 100644
--- a/passlib/tests/utils.py
+++ b/passlib/tests/utils.py
@@ -232,10 +232,7 @@ def hb(source):
 
     usage: ``hb("deadbeef23")``
     """
-    source = re.sub("\s", "", source)
-    if PY3:
-        source = source.encode("ascii")
-    return unhexlify(source)
+    return unhexlify(re.sub("\s", "", source))
 
 def limit(value, lower, upper):
     if value < lower:
diff --git a/passlib/utils/__init__.py b/passlib/utils/__init__.py
index e9ddea1..e8be28c 100644
--- a/passlib/utils/__init__.py
+++ b/passlib/utils/__init__.py
@@ -1342,10 +1342,13 @@ _BASE64_STRIP = b"=\n"
 _BASE64_PAD1 = b"="
 _BASE64_PAD2 = b"=="
 
+# XXX: Passlib 1.8/1.9 -- deprecate everything that's using ab64_encode(),
+#      have it start outputting b64s_encode() instead? can use ab64_decode() to retain backwards compat.
+
 def ab64_encode(data):
     """
-    base64 encoder which omits trailing padding & whitespace.
-    uses ``.`` instead of ``+``, but otherwise the same as normal base64.
+    encode using shortened base64 format which omits padding & whitespace.
+    uses custom ``./`` altchars.
 
     it is primarily used by Passlib's custom pbkdf2 hashes.
     """
@@ -1353,25 +1356,37 @@ def ab64_encode(data):
 
 def ab64_decode(data):
     """
-    base64 decoder which omits trailing padding & whitespace.
-    uses ``.`` instead of ``+``, but otherwise the same as normal base64.
+    decode from shortened base64 format which omits padding & whitespace.
+    uses custom ``./`` altchars, but supports decoding normal ``+/`` altchars as well.
 
     it is primarily used by Passlib's custom pbkdf2 hashes.
     """
+    if isinstance(data, unicode):
+        # needs bytes for replace() call, but want to accept ascii-unicode ala a2b_base64()
+        try:
+            data = data.encode("ascii")
+        except UnicodeEncodeError:
+            raise suppress_cause(ValueError("string argument should contain only ASCII characters"))
     return b64s_decode(data.replace(b".", b"+"))
 
 def b64s_encode(data):
     """
-    base64 encoder which omits trailing padding & whitespace.
-    otherwise uses default ``+/`` altchars.
+    encode using shortened base64 format which omits padding & whitespace.
+    uses default ``+/`` altchars.
     """
     return b2a_base64(data).rstrip(_BASE64_STRIP)
 
 def b64s_decode(data):
     """
-    base64 decoder which omits trailing padding & whitespace.
-    otherwise uses default ``+/`` altchars.
+    decode from shortened base64 format which omits padding & whitespace.
+    uses default ``+/`` altchars.
     """
+    if isinstance(data, unicode):
+        # needs bytes for the length/padding handling below, but want to accept ascii-unicode ala a2b_base64()
+        try:
+            data = data.encode("ascii")
+        except UnicodeEncodeError:
+            raise suppress_cause(ValueError("string argument should contain only ASCII characters"))
     off = len(data) & 3
     if off == 0:
         pass
author	Eli Collins <elic@assurancetechnologies.com>	2016-06-22 18:30:44 -0400
committer	Eli Collins <elic@assurancetechnologies.com>	2016-06-22 18:30:44 -0400
commit	febffda5c48ce918258cd89e200c314e92ee5bb7 (patch)
tree	dc808043bf1ea1b6bcafb7ac736db6f096b590aa
parent	d94b6212236e433b515b009581f1c165ecf69059 (diff)
download	passlib-febffda5c48ce918258cd89e200c314e92ee5bb7.tar.gz