summaryrefslogtreecommitdiff
path: root/numpy/core
diff options
context:
space:
mode:
authorTravis Oliphant <oliphant@enthought.com>2006-02-09 04:38:58 +0000
committerTravis Oliphant <oliphant@enthought.com>2006-02-09 04:38:58 +0000
commitf1bffaafc8e94cb5e1e94bd0527410108903669c (patch)
tree3971fa5d91cc728a70a9f7a3d9fa721db2e1368c /numpy/core
parent520b1c94e456dacf814b1a2d1ff13e8133294cb1 (diff)
downloadnumpy-f1bffaafc8e94cb5e1e94bd0527410108903669c.tar.gz
Created NumPy unicode as 32-bit.
Diffstat (limited to 'numpy/core')
-rw-r--r--numpy/core/include/numpy/arrayobject.h7
-rw-r--r--numpy/core/src/arrayobject.c51
-rw-r--r--numpy/core/src/arraytypes.inc.src79
-rw-r--r--numpy/core/src/multiarraymodule.c2
-rw-r--r--numpy/core/src/scalartypes.inc.src7
-rw-r--r--numpy/core/src/ucsnarrow.c72
6 files changed, 160 insertions, 58 deletions
diff --git a/numpy/core/include/numpy/arrayobject.h b/numpy/core/include/numpy/arrayobject.h
index 4f0d1b1ea..60f4123bc 100644
--- a/numpy/core/include/numpy/arrayobject.h
+++ b/numpy/core/include/numpy/arrayobject.h
@@ -303,6 +303,7 @@ typedef enum {
#define PyArray_UINT32 PyArray_ULONG
typedef long Int32;
typedef unsigned long UInt32;
+ typedef unsigned long PyArray_UCS4;
#define STRBITSOF_LONG "32"
#elif BITSOF_LONG == 64
#define PyArray_INT64 PyArray_LONG
@@ -346,6 +347,7 @@ typedef enum {
# define PyArray_UINT32 PyArray_ULONGLONG
typedef longlong Int32;
typedef ulonglong UInt32;
+ typedef ulonglong PyArray_UCS4;
# endif
# define MAX_LONGLONG MAX_INT32
# define MIN_LONGLONG MIN_INT32
@@ -406,6 +408,7 @@ typedef enum {
#define PyArray_UINT32 PyArray_UINT
typedef int Int32;
typedef unsigned int UInt32;
+ typedef unsigned int PyArray_UCS4;
#endif
#define STRBITSOF_INT "32"
#elif BITSOF_INT == 64
@@ -448,6 +451,7 @@ typedef enum {
#define PyArray_UINT32 PyArray_USHORT
typedef short Int32;
typedef unsigned short UInt32;
+ typedef unsigned short PyArray_UCS4;
#endif
#define STRBITSOF_SHORT "32"
#elif BITSOF_SHORT == 64
@@ -491,6 +495,7 @@ typedef enum {
#define PyArray_UINT32 PyArray_UBYTE
typedef signed char Int32;
typedef unsigned char UInt32;
+ typedef unsigned char PyArray_UCS4;
#endif
#define STRBITSOF_CHAR "32"
#elif BITSOF_CHAR == 64
@@ -733,8 +738,6 @@ typedef Py_uintptr_t uintp;
#define INTP_FMT "Ld"
#endif
-#define UInt32 PyArray_UCS4
-
#define ERR(str) fprintf(stderr, #str); fflush(stderr);
#define ERR2(str) fprintf(stderr, str); fflush(stderr);
diff --git a/numpy/core/src/arrayobject.c b/numpy/core/src/arrayobject.c
index 22f745ebf..48ce04c52 100644
--- a/numpy/core/src/arrayobject.c
+++ b/numpy/core/src/arrayobject.c
@@ -867,7 +867,11 @@ PyArray_Scalar(void *data, PyArray_Descr *descr, PyObject *base)
}
else if (type_num == PyArray_UNICODE) {
PyUnicodeObject *uni = (PyUnicodeObject*)obj;
- int length = itemsize / 4;
+ int length = itemsize >> 2;
+
+#ifndef Py_UNICODE_WIDE
+ length *= 2;
+#endif
/* Need an extra slot and need to use
Python memory manager */
uni->str = NULL;
@@ -883,10 +887,21 @@ PyArray_Scalar(void *data, PyArray_Descr *descr, PyObject *base)
uni->hash = -1;
uni->defenc = NULL;
#ifndef Py_UNICODE_WIDE
- /* Allocate enough for 2-characters per itemsize
- get the actual number of characters converted
- and reallocate when done.
+ /* Allocated enough for 2-characters per itemsize.
+ Now convert from the data-buffer
*/
+ if (!PyArray_ISNBO(descr->byteorder)) {
+ /* byteswap the data */
+ byte_swap_vector(data, itemsize >> 2, 4);
+ }
+ length = PyUCS2Buffer_FromUCS4(uni->str, (PyArray_UCS4 *)data,
+ itemsize >> 2);
+ /* Resize the unicode result */
+ if (MyPyUnicode_Resize(uni, length) < 0) {
+ Py_DECREF(obj);
+ return NULL;
+ }
+ return obj;
#endif
}
else {
@@ -5089,7 +5104,7 @@ _array_small_type(PyArray_Descr *chktype, PyArray_Descr* mintype)
because string itemsize is twice as large */
if (outtype->type_num == PyArray_UNICODE &&
mintype->type_num == PyArray_STRING) {
- testsize = MAX(chksize, 2*minsize);
+ testsize = MAX(chksize, 4*minsize);
}
else {
testsize = MAX(chksize, minsize);
@@ -5172,6 +5187,9 @@ _array_find_type(PyObject *op, PyArray_Descr *minitype, int max)
if (PyUnicode_Check(op)) {
chktype = PyArray_DescrNewFromType(PyArray_UNICODE);
chktype->elsize = PyUnicode_GET_DATA_SIZE(op);
+#ifndef Py_UNICODE_WIDE
+ chktype->elsize <<= 1;
+#endif
goto finish;
}
@@ -5541,10 +5559,10 @@ PyArray_CastToType(PyArrayObject *mp, PyArray_Descr *at, int fortran)
if (at == NULL) return NULL;
if (mpd->type_num == PyArray_STRING && \
at->type_num == PyArray_UNICODE)
- at->elsize = mpd->elsize*4;
+ at->elsize = mpd->elsize << 2;
if (mpd->type_num == PyArray_UNICODE &&
at->type_num == PyArray_STRING)
- at->elsize = mpd->elsize/4;
+ at->elsize = mpd->elsize >> 2;
if (at->type_num == PyArray_VOID)
at->elsize = mpd->elsize;
}
@@ -5848,7 +5866,7 @@ _array_typedescr_fromstr(char *str)
break;
case PyArray_UNICODELTR:
type_num = PyArray_UNICODE;
- size *= 4;
+ size <<= 2;
break;
case 'V':
type_num = PyArray_VOID;
@@ -6416,7 +6434,7 @@ PyArray_CanCastTo(PyArray_Descr *from, PyArray_Descr *to)
ret = (from->elsize <= to->elsize);
}
else if (totype == PyArray_UNICODE) {
- ret = (from->elsize * 4 \
+ ret = (from->elsize << 2 \
<= to->elsize);
}
}
@@ -8155,14 +8173,17 @@ arraydescr_protocol_typestr_get(PyArray_Descr *self)
{
char basic_=self->kind;
char endian = self->byteorder;
+ int size=self->elsize;
if (endian == '=') {
endian = '<';
if (!PyArray_IsNativeByteOrder(endian)) endian = '>';
}
-
- return PyString_FromFormat("%c%c%d", endian, basic_,
- self->elsize);
+
+ if (self->type_num == PyArray_UNICODE) {
+ size >>= 2;
+ }
+ return PyString_FromFormat("%c%c%d", endian, basic_, size);
}
static PyObject *
@@ -8393,7 +8414,11 @@ arraydescr_reduce(PyArray_Descr *self, PyObject *args)
Py_INCREF(obj);
}
else {
- obj = PyString_FromFormat("%c%d",self->kind, self->elsize);
+ elsize = self->elsize;
+ if (self->type_num == PyArray_UNICODE) {
+ elsize >>= 2;
+ }
+ obj = PyString_FromFormat("%c%d",self->kind, elsize);
}
PyTuple_SET_ITEM(ret, 1, Py_BuildValue("(Nii)", obj, 0, 1));
diff --git a/numpy/core/src/arraytypes.inc.src b/numpy/core/src/arraytypes.inc.src
index 6a81f7e09..e1b17a3a6 100644
--- a/numpy/core/src/arraytypes.inc.src
+++ b/numpy/core/src/arraytypes.inc.src
@@ -221,7 +221,7 @@ UNICODE_getitem(char *ip, PyArrayObject *ap)
int mysize;
PyArray_UCS4 *dptr;
- mysize = ap->descr->elsize / 4
+ mysize = ap->descr->elsize >> 2;
dptr = (PyArray_UCS4 *)ip + mysize-1;
while(mysize > 0 && *dptr-- == 0) mysize--;
if (!PyArray_ISNOTSWAPPED(ap) && (obj != NULL)) {
@@ -230,7 +230,16 @@ UNICODE_getitem(char *ip, PyArrayObject *ap)
#ifdef Py_UNICODE_WIDE
obj = PyUnicode_FromUnicode((const PyArray_UCS4 *)ip, mysize);
#else
- obj = MyPyUnicode_FromUCS4((const PyArray_UCS4 *)ip, mysize);
+ /* create new empty unicode object of length mysize*2 */
+ obj = MyPyUnicode_New(mysize*2);
+ if (obj == NULL) return obj;
+ mysize = PyUCS2Buffer_FromUCS4(((PyUnicodeObject *)obj)->str,
+ (PyArray_UCS4 *)ip, mysize);
+ /* reset length of unicode object to ucs2size */
+ if (MyPyUnicode_Resize((PyUnicodeObject *)obj, mysize) < 0) {
+ Py_DECREF(obj);
+ return NULL;
+ }
#endif
return obj;
@@ -240,9 +249,8 @@ static int
UNICODE_setitem(PyObject *op, char *ov, PyArrayObject *ap)
{
PyObject *temp;
- PyArray_UCS4 *ptr;
+ Py_UNICODE *ptr;
int datalen;
- size_t size = sizeof(PyArray_UCS4);
if ((temp=PyObject_Unicode(op)) == NULL) return -1;
ptr = PyUnicode_AS_UNICODE(temp);
@@ -255,16 +263,17 @@ UNICODE_setitem(PyObject *op, char *ov, PyArrayObject *ap)
#ifdef Py_UNICODE_WIDE
memcpy(ov, ptr, MIN(ap->descr->elsize, datalen));
#else
- MyPyUnicode_AsUCS4(ov, ptr, MIN(ap->descr->elsize, datalen));
+ datalen = PyUCS2Buffer_AsUCS4(ptr, (PyArray_UCS4 *)ov, datalen >> 1,
+ ap->descr->elsize >> 2);
+ datalen <<= 2;
#endif
/* Fill in the rest of the space with 0 */
if (ap->descr->elsize > datalen) {
memset(ov + datalen, 0, (ap->descr->elsize - datalen));
}
-
- if (!PyArray_ISNOTSWAPPED(ap))
- byte_swap_vector(ov, ap->descr->elsize / 4, 4)
+ if (!PyArray_ISNOTSWAPPED(ap))
+ byte_swap_vector(ov, ap->descr->elsize >> 2, 4);
Py_DECREF(temp);
return 0;
}
@@ -1177,20 +1186,17 @@ VOID_copyswapn (char *dst, char *src, intp n, int swap, int itemsize)
+/* Copy n UNICODE items of itemsize bytes each from src to dst (the copy is
+ skipped when src is NULL, so dst is then swapped in place) and, when swap
+ is non-zero, reverse the byte order of every 4-byte character in dst.
+ */
static void
UNICODE_copyswapn (char *dst, char *src, intp n, int swap, int itemsize)
{
- int size = sizeof(PyArray_UCS4);
if (src != NULL)
memcpy(dst, src, itemsize * n);
if (swap) {
register char *a, *b, c;
- int j, i = size / 2;
- for (a = (char *)dst; n>0; n--) {
- b = a + (size-1);
- for (j=0; j<i; j++) {
- c=*a; *a++ = *b; *b-- = c;
- }
- a += i / 2;
+ /* every item holds itemsize/4 characters and all of them need swapping */
+ n *= (itemsize >> 2);
+ for (a = (char *)dst; n>0; n--) {
+ b = a + 3;
+ c=*a; *a++ = *b; *b-- = c;
+ c=*a; *a++ = *b; *b-- = c;
+ /* the two swaps advanced a by 2 bytes; step over the rest of the word */
+ a += 2;
}
}
}
@@ -1222,19 +1228,16 @@ VOID_copyswap (char *dst, char *src, int swap, int itemsize)
+/* Copy one UNICODE item of itemsize bytes from src to dst (the copy is
+ skipped when src is NULL, so dst is then swapped in place) and, when swap
+ is non-zero, reverse the byte order of every 4-byte character of the item.
+ */
static void
UNICODE_copyswap (char *dst, char *src, int swap, int itemsize)
{
- int size = sizeof(PyArray_UCS4);
if (src != NULL)
memcpy(dst, src, itemsize);
if (swap) {
register char *a, *b, c;
- int j, i = size / 2;
- a = (char *)dst;
- b = a + (size-1);
- for (j=0; j<i; j++) {
- c=*a; *a++ = *b; *b-- = c;
- }
+ /* number of 4-byte characters stored in the item */
+ int nchars = itemsize >> 2;
+ for (a = (char *)dst; nchars > 0; nchars--) {
+ b = a + 3;
+ c=*a; *a++ = *b; *b-- = c;
+ c=*a; *a++ = *b; *b-- = c;
+ /* the two swaps advanced a by 2 bytes; step over the rest of the word */
+ a += 2;
+ }
}
}
@@ -1319,19 +1322,30 @@ STRING_nonzero (char *ip, PyArrayObject *ap)
#ifdef Py_UNICODE_WIDE
#define PyArray_UCS4_ISSPACE Py_UNICODE_ISSPACE
#else
-#define PyArray_UCS4_ISSPACE(x) FALSE
+#define PyArray_UCS4_ISSPACE(ch) Py_STRING_ISSPACE((char)ch)
#endif
-/* fixme: This must deal with unaligned and byte-swapped data
- and what-to do for UCS2-builds
- */
static Bool
UNICODE_nonzero (PyArray_UCS4 *ip, PyArrayObject *ap)
{
- int len = ap->descr->elsize >> 1;
+ int len = ap->descr->elsize >> 2;
int i;
Bool nonz = FALSE;
-
+ char *buffer=NULL;
+
+ if ((!PyArray_ISNOTSWAPPED(ap)) || \
+ (!PyArray_ISALIGNED(ap))) {
+ buffer = _pya_malloc(ap->descr->elsize);
+ if (buffer == NULL) {
+ return nonz;
+ }
+ memcpy(buffer, ip, ap->descr->elsize);
+ if (!PyArray_ISNOTSWAPPED(ap)) {
+ byte_swap_vector(buffer, len, 4);
+ }
+ ip = (PyArray_UCS4 *)buffer;
+ }
+
for (i=0; i<len; i++) {
if (!PyArray_UCS4_ISSPACE(*ip)) {
nonz = TRUE;
@@ -1339,6 +1353,7 @@ UNICODE_nonzero (PyArray_UCS4 *ip, PyArrayObject *ap)
}
ip++;
}
+ _pya_free(buffer);
return nonz;
}
@@ -1897,9 +1912,6 @@ set_typeinfo(PyObject *dict)
_letter_to_num[PyArray_@name@LTR] = PyArray_@name@;
/**end repeat**/
_letter_to_num[PyArray_STRINGLTR2] = PyArray_STRING;
- _letter_to_num[PyArray_UCS4LTR] = PyArray_UNICODE;
- _letter_to_num[PyArray_UCS2LTR] = PyArray_UNICODE;
-
/**begin repeat
#name=BOOL,BYTE,UBYTE,SHORT,USHORT,INT,UINT,LONG,ULONG,LONGLONG,ULONGLONG,FLOAT,DOUBLE,LONGDOUBLE,CFLOAT,CDOUBLE,CLONGDOUBLE,OBJECT,STRING,UNICODE,VOID#
@@ -1907,13 +1919,6 @@ set_typeinfo(PyObject *dict)
@name@_Descr.fields = Py_None;
/**end repeat**/
- if (sizeof(PyArray_UCS4) == 2) {
- UNICODE_Descr.kind = PyArray_UCS2LTR;
- }
- else if (sizeof(PyArray_UCS4) == 4) {
- UNICODE_Descr.kind = PyArray_UCS4LTR;
- }
-
/* Set a dictionary with type information */
infodict = PyDict_New();
if (infodict == NULL) return -1;
diff --git a/numpy/core/src/multiarraymodule.c b/numpy/core/src/multiarraymodule.c
index ddacb68c9..f65c74e88 100644
--- a/numpy/core/src/multiarraymodule.c
+++ b/numpy/core/src/multiarraymodule.c
@@ -3885,7 +3885,7 @@ PyArray_DescrConverter(PyObject *obj, PyArray_Descr **at)
the number of bytes.
*/
else if (check_num == PyArray_UNICODELTR) {
- elsize *= 4;
+ elsize <<= 2;
}
/* Support for generic processing
c4, i4, f8, etc...
diff --git a/numpy/core/src/scalartypes.inc.src b/numpy/core/src/scalartypes.inc.src
index 58f9bebb2..1ef8372f2 100644
--- a/numpy/core/src/scalartypes.inc.src
+++ b/numpy/core/src/scalartypes.inc.src
@@ -184,8 +184,11 @@ PyArray_FromScalar(PyObject *scalar, PyArray_Descr *outcode)
memptr = (char *)PyUnicode_AS_DATA(scalar);
#ifdef Py_UNICODE_WIDE
break;
-#else:
- PyUCS2Unicode_AsUCS4(PyArray_DATA(r), memptr, PyArray_ITEMSIZE(r));
+#else
+ PyUCS2Buffer_AsUCS4((Py_UNICODE *)memptr,
+ (PyArray_UCS4 *)PyArray_DATA(r),
+ PyUnicode_GET_SIZE(scalar),
+ PyArray_ITEMSIZE(r) >> 2);
goto finish;
#endif
default:
diff --git a/numpy/core/src/ucsnarrow.c b/numpy/core/src/ucsnarrow.c
index 8960026d5..6480303a9 100644
--- a/numpy/core/src/ucsnarrow.c
+++ b/numpy/core/src/ucsnarrow.c
@@ -1,3 +1,7 @@
+/* Functions only needed on narrow builds of Python
+ for converting back and forth between the NumPy Unicode data-type (always 4-byte)
+ and the Python Unicode scalar (2-bytes on a narrow build).
+ */
/* the ucs2 buffer must be large enough to hold 2*ucs4length characters
due to the use of surrogate pairs.
@@ -29,12 +33,74 @@ PyUCS2Buffer_FromUCS4(Py_UNICODE *ucs2, PyArray_UCS4 *ucs4, int ucs4length)
}
-/* This converts a UCS2 buffer (from a Python unicode object)
-*/
+/* Convert a UCS2 buffer of ucs2len code units into a UCS4 buffer.
+
+ ucs2 -- input buffer of ucs2len 2-byte code units
+ ucs4 -- output buffer with room for ucs4len 4-byte characters
+
+ Conversion stops when the input is exhausted or ucs4len characters
+ have been written. A high surrogate (0xd800-0xdbff) immediately
+ followed by a low surrogate (0xdc00-0xdfff) is combined into a single
+ UCS4 character; any other code unit, including an unpaired surrogate,
+ is copied through unchanged.
+
+ Returns the number of UCS4 characters written, which can be less than
+ ucs2len if there are surrogate pairs in ucs2. The return value is the
+ actual size of the used part of the ucs4 buffer.
+*/
static int
-PyUCS2Buffer_AsUCS4(Py_UNICODE *ucs2, PyArray_UCS4 *ucs4, int ucs4length, int ucs2length)
+PyUCS2Buffer_AsUCS4(Py_UNICODE *ucs2, PyArray_UCS4 *ucs4, int ucs2len, int ucs4len)
{
+ register int i;
+ register PyArray_UCS4 chr;
+ register Py_UNICODE ch;
+ register int numchars=0;
+
+ for (i=0; (i < ucs2len) && (numchars < ucs4len); i++) {
+ ch = *ucs2++;
+ /* only combine when a low surrogate really follows inside the buffer */
+ if ((ch >= 0xd800) && (ch <= 0xdbff) && (i+1 < ucs2len) &&
+ (*ucs2 >= 0xdc00) && (*ucs2 <= 0xdfff)) {
+ /* surrogate pair */
+ chr = ((PyArray_UCS4)(ch-0xd800)) << 10;
+ chr += *ucs2++ + 0x2400; /* -0xdc00 + 0x10000 */
+ i++;
+ }
+ else {
+ chr = (PyArray_UCS4) ch;
+ }
+ *ucs4++ = chr;
+ numchars++;
+ }
+ return numchars;
+}
+
+
+/* Create a new unicode object whose character buffer has room for length
+ characters plus a terminator. The first slot and the terminator slot are
+ set to 0; the slots in between are left for the caller to fill.
+ Returns NULL if the object cannot be created; if the buffer cannot be
+ allocated the object is released and PyErr_NoMemory() is returned.
+ */
+static PyObject *
+MyPyUnicode_New(int length)
+{
+ PyUnicodeObject *uni = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
+
+ if (uni == NULL) {
+ return NULL;
+ }
+ uni->str = PyMem_NEW(Py_UNICODE, length+1);
+ if (uni->str == NULL) {
+ /* undo the object creation before reporting the failure */
+ _Py_ForgetReference((PyObject *)uni);
+ PyObject_Del(uni);
+ return PyErr_NoMemory();
+ }
+ uni->length = length;
+ uni->hash = -1;
+ uni->defenc = NULL;
+ uni->str[0] = 0;
+ uni->str[length] = 0;
+ return (PyObject *)uni;
+}
+
+/* Resize the character buffer of uni to hold length characters plus a
+ terminator, then store the terminator and the new length.
+ Returns 0 on success. On allocation failure the previous buffer pointer
+ is put back, a memory error is set and -1 is returned.
+ */
+static int
+MyPyUnicode_Resize(PyUnicodeObject *uni, int length)
+{
+ void *previous = uni->str;
+
+ PyMem_RESIZE(uni->str, Py_UNICODE, length+1);
+ if (uni->str == NULL) {
+ /* keep the object usable: restore the buffer it had before */
+ uni->str = previous;
+ PyErr_NoMemory();
+ return -1;
+ }
+ uni->length = length;
+ uni->str[length] = 0;
+ return 0;
}