summaryrefslogtreecommitdiff
path: root/numpy/core/defchararray.py
diff options
context:
space:
mode:
author: mdroe <mdroe@localhost> 2009-10-14 15:01:41 +0000
committer: mdroe <mdroe@localhost> 2009-10-14 15:01:41 +0000
commit: fbbf05cfefe98fd284c08c2f3a78c7cf5503821a (patch)
tree: 87da915d7d0e3f96b5ba1b072563b44c31e6545a /numpy/core/defchararray.py
parent: c4db9cf34c643bd422060de6b636f53a57557c4d (diff)
download: numpy-fbbf05cfefe98fd284c08c2f3a78c7cf5503821a.tar.gz
Fix Unicode object -> chararray conversion on narrow Python builds
Diffstat (limited to 'numpy/core/defchararray.py')
-rw-r--r-- numpy/core/defchararray.py | 32
1 file changed, 32 insertions, 0 deletions
diff --git a/numpy/core/defchararray.py b/numpy/core/defchararray.py
index 45b061d69..4ad5c59d3 100644
--- a/numpy/core/defchararray.py
+++ b/numpy/core/defchararray.py
@@ -1714,6 +1714,7 @@ class chararray(ndarray):
self = ndarray.__new__(subtype, shape, (dtype, itemsize),
order=order)
else:
+ print shape, dtype, itemsize
self = ndarray.__new__(subtype, shape, (dtype, itemsize),
buffer=buffer,
offset=offset, strides=strides,
@@ -2422,9 +2423,40 @@ def array(obj, itemsize=None, copy=True, unicode=None, order=None):
unicode = True
else:
unicode = False
+
if itemsize is None:
itemsize = _len(obj)
shape = _len(obj) / itemsize
+
+ if unicode:
+ if sys.maxunicode == 0xffff:
+ # On a narrow Python build, the buffer for Unicode
+ # strings is UCS2, which doesn't match the buffer for
+ # Numpy Unicode types, which is ALWAYS UCS4.
+ # Therefore, we need to convert the buffer. On Python
+ # 2.6 and later, we can use the utf_32 codec. Earlier
+ # versions don't have that codec, so we convert to a
+ # numerical array that matches the input buffer, and
+ # then use Numpy to convert it to UCS4. All of this
+ # should happen in native endianness.
+ if sys.hexversion >= 0x2060000:
+ obj = obj.encode('utf_32')
+ else:
+ if isinstance(obj, str):
+ ascii = numpy.frombuffer(obj, 'u1')
+ ucs4 = numpy.array(ascii, 'u4')
+ obj = ucs4.data
+ else:
+ ucs2 = numpy.frombuffer(obj, 'u2')
+ ucs4 = numpy.array(ucs2, 'u4')
+ obj = ucs4.data
+ else:
+ obj = unicode(obj)
+ else:
+ # Let the default Unicode -> string encoding (if any) take
+ # precedence.
+ obj = str(obj)
+
return chararray(shape, itemsize=itemsize, unicode=unicode,
buffer=obj, order=order)