Fix Unicode object -> chararray conversion on narrow Python builds

author: mdroe <mdroe@localhost> 2009-10-14 15:01:41 +0000
committer: mdroe <mdroe@localhost> 2009-10-14 15:01:41 +0000
commit: fbbf05cfefe98fd284c08c2f3a78c7cf5503821a (patch)
tree: 87da915d7d0e3f96b5ba1b072563b44c31e6545a /numpy/core/defchararray.py
parent: c4db9cf34c643bd422060de6b636f53a57557c4d (diff)
download: numpy-fbbf05cfefe98fd284c08c2f3a78c7cf5503821a.tar.gz
1 files changed, 32 insertions, 0 deletions
diff --git a/numpy/core/defchararray.py b/numpy/core/defchararray.py
index 45b061d69..4ad5c59d3 100644
--- a/numpy/core/defchararray.py
+++ b/numpy/core/defchararray.py
@@ -1714,6 +1714,7 @@ class chararray(ndarray):
             self = ndarray.__new__(subtype, shape, (dtype, itemsize),
                                    order=order)
         else:
+            print shape, dtype, itemsize
             self = ndarray.__new__(subtype, shape, (dtype, itemsize),
                                    buffer=buffer,
                                    offset=offset, strides=strides,
@@ -2422,9 +2423,40 @@ def array(obj, itemsize=None, copy=True, unicode=None, order=None):
                 unicode = True
             else:
                 unicode = False
+
         if itemsize is None:
             itemsize = _len(obj)
         shape = _len(obj) / itemsize
+
+        if unicode:
+            if sys.maxunicode == 0xffff:
+                # On a narrow Python build, the buffer for Unicode
+                # strings is UCS2, which doesn't match the buffer for
+                # Numpy Unicode types, which is ALWAYS UCS4.
+                # Therefore, we need to convert the buffer.  On Python
+                # 2.6 and later, we can use the utf_32 codec.  Earlier
+                # versions don't have that codec, so we convert to a
+                # numerical array that matches the input buffer, and
+                # then use Numpy to convert it to UCS4.  All of this
+                # should happen in native endianness.
+                if sys.hexversion >= 0x2060000:
+                    obj = obj.encode('utf_32')
+                else:
+                    if isinstance(obj, str):
+                        ascii = numpy.frombuffer(obj, 'u1')
+                        ucs4 = numpy.array(ascii, 'u4')
+                        obj = ucs4.data
+                    else:
+                        ucs2 = numpy.frombuffer(obj, 'u2')
+                        ucs4 = numpy.array(ucs2, 'u4')
+                        obj = ucs4.data
+            else:
+                obj = unicode(obj)
+        else:
+            # Let the default Unicode -> string encoding (if any) take
+            # precedence.
+            obj = str(obj)
+
         return chararray(shape, itemsize=itemsize, unicode=unicode,
                          buffer=obj, order=order)
author	mdroe <mdroe@localhost>	2009-10-14 15:01:41 +0000
committer	mdroe <mdroe@localhost>	2009-10-14 15:01:41 +0000
commit	fbbf05cfefe98fd284c08c2f3a78c7cf5503821a (patch)
tree	87da915d7d0e3f96b5ba1b072563b44c31e6545a /numpy/core/defchararray.py
parent	c4db9cf34c643bd422060de6b636f53a57557c4d (diff)
download	numpy-fbbf05cfefe98fd284c08c2f3a78c7cf5503821a.tar.gz