diff options
Diffstat (limited to 'numpy/core/defchararray.py')
-rw-r--r-- | numpy/core/defchararray.py | 32 |
1 file changed, 32 insertions, 0 deletions
diff --git a/numpy/core/defchararray.py b/numpy/core/defchararray.py
index 45b061d69..4ad5c59d3 100644
--- a/numpy/core/defchararray.py
+++ b/numpy/core/defchararray.py
@@ -1714,6 +1714,7 @@ class chararray(ndarray):
             self = ndarray.__new__(subtype, shape, (dtype, itemsize),
                                    order=order)
         else:
+            print shape, dtype, itemsize
             self = ndarray.__new__(subtype, shape, (dtype, itemsize),
                                    buffer=buffer,
                                    offset=offset, strides=strides,
@@ -2422,9 +2423,40 @@ def array(obj, itemsize=None, copy=True, unicode=None, order=None):
                 unicode = True
             else:
                 unicode = False
+
         if itemsize is None:
             itemsize = _len(obj)
         shape = _len(obj) / itemsize
+
+        if unicode:
+            if sys.maxunicode == 0xffff:
+                # On a narrow Python build, the buffer for Unicode
+                # strings is UCS2, which doesn't match the buffer for
+                # Numpy Unicode types, which is ALWAYS UCS4.
+                # Therefore, we need to convert the buffer.  On Python
+                # 2.6 and later, we can use the utf_32 codec.  Earlier
+                # versions don't have that codec, so we convert to a
+                # numerical array that matches the input buffer, and
+                # then use Numpy to convert it to UCS4.  All of this
+                # should happen in native endianness.
+                if sys.hexversion >= 0x2060000:
+                    obj = obj.encode('utf_32')
+                else:
+                    if isinstance(obj, str):
+                        ascii = numpy.frombuffer(obj, 'u1')
+                        ucs4 = numpy.array(ascii, 'u4')
+                        obj = ucs4.data
+                    else:
+                        ucs2 = numpy.frombuffer(obj, 'u2')
+                        ucs4 = numpy.array(ucs2, 'u4')
+                        obj = ucs4.data
+            else:
+                obj = unicode(obj)
+        else:
+            # Let the default Unicode -> string encoding (if any) take
+            # precedence.
+            obj = str(obj)
+
         return chararray(shape, itemsize=itemsize, unicode=unicode,
                          buffer=obj, order=order)