diff options
author | mdroe <mdroe@localhost> | 2009-10-14 15:01:41 +0000 |
---|---|---|
committer | mdroe <mdroe@localhost> | 2009-10-14 15:01:41 +0000 |
commit | fbbf05cfefe98fd284c08c2f3a78c7cf5503821a (patch) | |
tree | 87da915d7d0e3f96b5ba1b072563b44c31e6545a /numpy/core/defchararray.py | |
parent | c4db9cf34c643bd422060de6b636f53a57557c4d (diff) | |
download | numpy-fbbf05cfefe98fd284c08c2f3a78c7cf5503821a.tar.gz |
Fix Unicode object -> chararray conversion on narrow Python builds
Diffstat (limited to 'numpy/core/defchararray.py')
-rw-r--r-- | numpy/core/defchararray.py | 32 |
1 files changed, 32 insertions, 0 deletions
```diff
diff --git a/numpy/core/defchararray.py b/numpy/core/defchararray.py
index 45b061d69..4ad5c59d3 100644
--- a/numpy/core/defchararray.py
+++ b/numpy/core/defchararray.py
@@ -1714,6 +1714,7 @@ class chararray(ndarray):
             self = ndarray.__new__(subtype, shape, (dtype, itemsize),
                                    order=order)
         else:
+            print shape, dtype, itemsize
             self = ndarray.__new__(subtype, shape, (dtype, itemsize),
                                    buffer=buffer,
                                    offset=offset, strides=strides,
@@ -2422,9 +2423,40 @@ def array(obj, itemsize=None, copy=True, unicode=None, order=None):
                 unicode = True
             else:
                 unicode = False
+
         if itemsize is None:
             itemsize = _len(obj)
         shape = _len(obj) / itemsize
+
+        if unicode:
+            if sys.maxunicode == 0xffff:
+                # On a narrow Python build, the buffer for Unicode
+                # strings is UCS2, which doesn't match the buffer for
+                # Numpy Unicode types, which is ALWAYS UCS4.
+                # Therefore, we need to convert the buffer. On Python
+                # 2.6 and later, we can use the utf_32 codec. Earlier
+                # versions don't have that codec, so we convert to a
+                # numerical array that matches the input buffer, and
+                # then use Numpy to convert it to UCS4. All of this
+                # should happen in native endianness.
+                if sys.hexversion >= 0x2060000:
+                    obj = obj.encode('utf_32')
+                else:
+                    if isinstance(obj, str):
+                        ascii = numpy.frombuffer(obj, 'u1')
+                        ucs4 = numpy.array(ascii, 'u4')
+                        obj = ucs4.data
+                    else:
+                        ucs2 = numpy.frombuffer(obj, 'u2')
+                        ucs4 = numpy.array(ucs2, 'u4')
+                        obj = ucs4.data
+            else:
+                obj = unicode(obj)
+        else:
+            # Let the default Unicode -> string encoding (if any) take
+            # precedence.
+            obj = str(obj)
+
         return chararray(shape, itemsize=itemsize, unicode=unicode,
                 buffer=obj, order=order)
 
```