Created NumPy unicode as 32-bit.

author: Travis Oliphant <oliphant@enthought.com> 2006-02-09 04:38:58 +0000
committer: Travis Oliphant <oliphant@enthought.com> 2006-02-09 04:38:58 +0000
commit: f1bffaafc8e94cb5e1e94bd0527410108903669c (patch)
tree: 3971fa5d91cc728a70a9f7a3d9fa721db2e1368c /numpy/core
parent: 520b1c94e456dacf814b1a2d1ff13e8133294cb1 (diff)
download: numpy-f1bffaafc8e94cb5e1e94bd0527410108903669c.tar.gz
6 files changed, 160 insertions, 58 deletions
diff --git a/numpy/core/include/numpy/arrayobject.h b/numpy/core/include/numpy/arrayobject.h
index 4f0d1b1ea..60f4123bc 100644
--- a/numpy/core/include/numpy/arrayobject.h
+++ b/numpy/core/include/numpy/arrayobject.h
@@ -303,6 +303,7 @@ typedef enum {
 #define PyArray_UINT32 PyArray_ULONG
 	typedef long Int32;
 	typedef unsigned long UInt32;
+	typedef unsigned long PyArray_UCS4;
 #define STRBITSOF_LONG "32"
 #elif BITSOF_LONG == 64
 #define PyArray_INT64 PyArray_LONG
@@ -346,6 +347,7 @@ typedef enum {
 #    define PyArray_UINT32 PyArray_ULONGLONG
 	typedef longlong Int32;
 	typedef ulonglong UInt32;
+	typedef ulonglong PyArray_UCS4;
 #  endif
 #  define MAX_LONGLONG MAX_INT32
 #  define MIN_LONGLONG MIN_INT32
@@ -406,6 +408,7 @@ typedef enum {
 #define PyArray_UINT32 PyArray_UINT
 	typedef int Int32;
 	typedef unsigned int UInt32;
+	typedef unsigned int PyArray_UCS4;
 #endif
 #define STRBITSOF_INT "32"
 #elif BITSOF_INT == 64
@@ -448,6 +451,7 @@ typedef enum {
 #define PyArray_UINT32 PyArray_USHORT
 	typedef short Int32;
 	typedef unsigned short UInt32;
+	typedef unsigned short PyArray_UCS4;
 #endif
 #define STRBITSOF_SHORT "32"
 #elif BITSOF_SHORT == 64
@@ -491,6 +495,7 @@ typedef enum {
 #define PyArray_UINT32 PyArray_UBYTE
 	typedef signed char Int32;
 	typedef unsigned char UInt32;
+	typedef unsigned char PyArray_UCS4;
 #endif
 #define STRBITSOF_CHAR "32"
 #elif BITSOF_CHAR == 64
@@ -733,8 +738,6 @@ typedef Py_uintptr_t uintp;
         #define INTP_FMT "Ld"
 #endif
 
-#define UInt32 PyArray_UCS4
-
 #define ERR(str) fprintf(stderr, #str); fflush(stderr);
 #define ERR2(str) fprintf(stderr, str); fflush(stderr);
 
diff --git a/numpy/core/src/arrayobject.c b/numpy/core/src/arrayobject.c
index 22f745ebf..48ce04c52 100644
--- a/numpy/core/src/arrayobject.c
+++ b/numpy/core/src/arrayobject.c
@@ -867,7 +867,11 @@ PyArray_Scalar(void *data, PyArray_Descr *descr, PyObject *base)
 		}
 		else if (type_num == PyArray_UNICODE) {
 			PyUnicodeObject *uni = (PyUnicodeObject*)obj;
-			int length = itemsize / 4;
+			int length = itemsize >> 2;
+
+#ifndef Py_UNICODE_WIDE
+			length *= 2;
+#endif
 			/* Need an extra slot and need to use 
 			   Python memory manager */
 			uni->str = NULL;
@@ -883,10 +887,21 @@ PyArray_Scalar(void *data, PyArray_Descr *descr, PyObject *base)
 			uni->hash = -1;
 			uni->defenc = NULL;
 #ifndef Py_UNICODE_WIDE
-                        /* Allocate enough for 2-characters per itemsize
-                           get the actual number of characters converted 
-                           and reallocate when done.
+                        /* Allocated enough for 2-characters per itemsize.
+			   Now convert from the data-buffer
                          */
+			if (!PyArray_ISNBO(descr->byteorder)) {
+				/* byteswap the data */
+				byte_swap_vector(data, itemsize >> 2, 4);
+			}
+			length = PyUCS2Buffer_FromUCS4(uni->str, (PyArray_UCS4 *)data,
+						       itemsize >> 2);
+			/* Resize the unicode result */
+			if (MyPyUnicode_Resize(uni, length) < 0) {
+				Py_DECREF(obj);
+				return NULL;
+			}
+			return obj;
 #endif
 		}
 		else { 
@@ -5089,7 +5104,7 @@ _array_small_type(PyArray_Descr *chktype, PyArray_Descr* mintype)
 		   because string itemsize is twice as large */
 		if (outtype->type_num == PyArray_UNICODE && 
 		    mintype->type_num == PyArray_STRING) {
-			testsize = MAX(chksize, 2*minsize);
+			testsize = MAX(chksize, 4*minsize);
 		}
 		else {
 			testsize = MAX(chksize, minsize);
@@ -5172,6 +5187,9 @@ _array_find_type(PyObject *op, PyArray_Descr *minitype, int max)
 	if (PyUnicode_Check(op)) {
 		chktype = PyArray_DescrNewFromType(PyArray_UNICODE);
 		chktype->elsize = PyUnicode_GET_DATA_SIZE(op);
+#ifndef Py_UNICODE_WIDE
+		chktype->elsize <<= 1;
+#endif		
 		goto finish;
 	}
 
@@ -5541,10 +5559,10 @@ PyArray_CastToType(PyArrayObject *mp, PyArray_Descr *at, int fortran)
 		if (at == NULL) return NULL;
 		if (mpd->type_num == PyArray_STRING &&	\
 		    at->type_num == PyArray_UNICODE)
-			at->elsize = mpd->elsize*4;
+			at->elsize = mpd->elsize << 2;
 		if (mpd->type_num == PyArray_UNICODE &&
 		    at->type_num == PyArray_STRING) 
-			at->elsize = mpd->elsize/4;
+			at->elsize = mpd->elsize >> 2;
 		if (at->type_num == PyArray_VOID)
 			at->elsize = mpd->elsize;
 	}
@@ -5848,7 +5866,7 @@ _array_typedescr_fromstr(char *str)
 		break;
 	case PyArray_UNICODELTR:
 		type_num = PyArray_UNICODE;
-		size *= 4;
+		size <<= 2;
 		break;
 	case 'V':
 		type_num = PyArray_VOID;
@@ -6416,7 +6434,7 @@ PyArray_CanCastTo(PyArray_Descr *from, PyArray_Descr *to)
 				ret = (from->elsize <= to->elsize);
 			}
 			else if (totype == PyArray_UNICODE) {
-				ret = (from->elsize * 4 \
+				ret = (from->elsize << 2 \
 				       <= to->elsize);
 			}
 		}
@@ -8155,14 +8173,17 @@ arraydescr_protocol_typestr_get(PyArray_Descr *self)
 {
         char basic_=self->kind;
         char endian = self->byteorder;
+	int size=self->elsize;
 
         if (endian == '=') {
                 endian = '<';
                 if (!PyArray_IsNativeByteOrder(endian)) endian = '>';
         }
-       
-        return PyString_FromFormat("%c%c%d", endian, basic_,
-                                   self->elsize);
+   
+	if (self->type_num == PyArray_UNICODE) {
+		size >>= 2;
+	}
+        return PyString_FromFormat("%c%c%d", endian, basic_, size);
 }
 
 static PyObject *
@@ -8393,7 +8414,11 @@ arraydescr_reduce(PyArray_Descr *self, PyObject *args)
 		Py_INCREF(obj);
 	}
 	else {
-		obj = PyString_FromFormat("%c%d",self->kind, self->elsize);
+		elsize = self->elsize;
+		if (self->type_num == PyArray_UNICODE) {
+			elsize >>= 2;
+		}
+		obj = PyString_FromFormat("%c%d",self->kind, elsize);
 	}
 	PyTuple_SET_ITEM(ret, 1, Py_BuildValue("(Nii)", obj, 0, 1));
 	
diff --git a/numpy/core/src/arraytypes.inc.src b/numpy/core/src/arraytypes.inc.src
index 6a81f7e09..e1b17a3a6 100644
--- a/numpy/core/src/arraytypes.inc.src
+++ b/numpy/core/src/arraytypes.inc.src
@@ -221,7 +221,7 @@ UNICODE_getitem(char *ip, PyArrayObject *ap)
 	int mysize;
 	PyArray_UCS4 *dptr;
 
-	mysize = ap->descr->elsize / 4
+	mysize = ap->descr->elsize >> 2;
 	dptr = (PyArray_UCS4 *)ip + mysize-1;
 	while(mysize > 0 && *dptr-- == 0) mysize--;
 	if (!PyArray_ISNOTSWAPPED(ap) && (obj != NULL)) {
@@ -230,7 +230,16 @@ UNICODE_getitem(char *ip, PyArrayObject *ap)
 #ifdef Py_UNICODE_WIDE
 	obj = PyUnicode_FromUnicode((const PyArray_UCS4 *)ip, mysize);
 #else
-        obj = MyPyUnicode_FromUCS4((const PyArray_UCS4 *)ip, mysize);
+	/* create new empty unicode object of length mysize*2 */
+	obj = MyPyUnicode_New(mysize*2);
+	if (obj == NULL) return obj;
+	mysize = PyUCS2Buffer_FromUCS4(((PyUnicodeObject *)obj)->str, 
+				       (PyArray_UCS4 *)ip, mysize);
+	/* reset length of unicode object to ucs2size */
+	if (MyPyUnicode_Resize((PyUnicodeObject *)obj, mysize) < 0) {
+		Py_DECREF(obj);
+		return NULL;
+	}
 #endif
 
 	return obj;
@@ -240,9 +249,8 @@ static int
 UNICODE_setitem(PyObject *op, char *ov, PyArrayObject *ap) 
 {
         PyObject *temp;
-	PyArray_UCS4 *ptr;
+	Py_UNICODE *ptr;
 	int datalen;
-	size_t size = sizeof(PyArray_UCS4);
 
 	if ((temp=PyObject_Unicode(op)) == NULL) return -1;
 	ptr = PyUnicode_AS_UNICODE(temp);
@@ -255,16 +263,17 @@ UNICODE_setitem(PyObject *op, char *ov, PyArrayObject *ap)
 #ifdef Py_UNICODE_WIDE
 	memcpy(ov, ptr, MIN(ap->descr->elsize, datalen));
 #else
-        MyPyUnicode_AsUCS4(ov, ptr, MIN(ap->descr->elsize, datalen));
+        datalen = PyUCS2Buffer_AsUCS4(ptr, (PyArray_UCS4 *)ov, datalen >> 1,
+				      ap->descr->elsize >> 2);
+	datalen <<= 2;
 #endif
 	/* Fill in the rest of the space with 0 */
 	if (ap->descr->elsize > datalen) {
 		memset(ov + datalen, 0, (ap->descr->elsize - datalen));
 	}
-
-	if (!PyArray_ISNOTSWAPPED(ap)) 
-		byte_swap_vector(ov, ap->descr->elsize / 4, 4)
 	
+	if (!PyArray_ISNOTSWAPPED(ap)) 
+		byte_swap_vector(ov, ap->descr->elsize >> 2, 4);
 	Py_DECREF(temp);
 	return 0;
 }
@@ -1177,20 +1186,17 @@ VOID_copyswapn (char *dst, char *src, intp n, int swap, int itemsize)
 static void
 UNICODE_copyswapn (char *dst, char *src, intp n, int swap, int itemsize)
 {
-	int size = sizeof(PyArray_UCS4);
 
 	if (src != NULL) 
 		memcpy(dst, src, itemsize * n);
 	
 	if (swap) {
 		register char *a, *b, c;
-		int j, i = size / 2;
+		n *= (itemsize >> 2);	/* items -> count of 4-byte UCS4 chars */
 		for (a = (char *)dst; n>0; n--) {
-			b = a + (size-1);
-                        for (j=0; j<i; j++) {
-                                c=*a; *a++ = *b; *b-- = c;
-                        }
-			a += i / 2;
+			b = a + 3;	/* reverse one char: bytes 0<->3, then 1<->2 */
+			c=*a; *a++ = *b; *b-- = c;
+			c=*a; *a = *b; *b = c; a += 3;	/* a was at byte 1: go to next char */
 		}
 	}
 }
@@ -1222,19 +1228,16 @@ VOID_copyswap (char *dst, char *src, int swap, int itemsize)
 static void
 UNICODE_copyswap (char *dst, char *src, int swap, int itemsize)
 {
-	int size = sizeof(PyArray_UCS4);
 
 	if (src != NULL) 
 		memcpy(dst, src, itemsize);
 	
 	if (swap) {
 		register char *a, *b, c;
-		int j, i = size / 2;
                 a = (char *)dst;
-                b = a + (size-1);
-                for (j=0; j<i; j++) {
-                        c=*a; *a++ = *b; *b-- = c;
-                }
+		for (; itemsize >= 4; itemsize -= 4, a += 4) {	/* every UCS4 char in the item */
+			b = a + 3; c = *a; *a = *b; *b = c; c = a[1]; a[1] = a[2]; a[2] = c;
+		}
 	}
 }
 
@@ -1319,19 +1322,30 @@ STRING_nonzero (char *ip, PyArrayObject *ap)
 #ifdef Py_UNICODE_WIDE
 #define PyArray_UCS4_ISSPACE Py_UNICODE_ISSPACE
 #else
-#define PyArray_UCS4_ISSPACE(x) FALSE
+#define PyArray_UCS4_ISSPACE(ch) Py_STRING_ISSPACE((char)ch)
 #endif
 
-/* fixme:  This must deal with unaligned and byte-swapped data
-   and what-to do for UCS2-builds
- */
 static Bool
 UNICODE_nonzero (PyArray_UCS4 *ip, PyArrayObject *ap)
 {
-	int len = ap->descr->elsize >> 1;
+	int len = ap->descr->elsize >> 2;
 	int i;
 	Bool nonz = FALSE;
-	
+	char *buffer=NULL;	
+
+	if ((!PyArray_ISNOTSWAPPED(ap)) || \
+	    (!PyArray_ISALIGNED(ap))) {
+		buffer = _pya_malloc(ap->descr->elsize);
+		if (buffer == NULL) {
+			return nonz;
+		}
+		memcpy(buffer, ip, ap->descr->elsize);
+		if (!PyArray_ISNOTSWAPPED(ap)) {
+			byte_swap_vector(buffer, len, 4);
+		}
+		ip = (PyArray_UCS4 *)buffer;
+	}
+
 	for (i=0; i<len; i++) {
 		if (!PyArray_UCS4_ISSPACE(*ip)) {
 			nonz = TRUE;
@@ -1339,6 +1353,7 @@ UNICODE_nonzero (PyArray_UCS4 *ip, PyArrayObject *ap)
 		}
 		ip++;
 	}
+	_pya_free(buffer);
 	return nonz;
 }
 
@@ -1897,9 +1912,6 @@ set_typeinfo(PyObject *dict)
 	_letter_to_num[PyArray_@name@LTR] = PyArray_@name@;
 /**end repeat**/
 	_letter_to_num[PyArray_STRINGLTR2] = PyArray_STRING;
-	_letter_to_num[PyArray_UCS4LTR] = PyArray_UNICODE;
-	_letter_to_num[PyArray_UCS2LTR] = PyArray_UNICODE;
-
 
 /**begin repeat
 #name=BOOL,BYTE,UBYTE,SHORT,USHORT,INT,UINT,LONG,ULONG,LONGLONG,ULONGLONG,FLOAT,DOUBLE,LONGDOUBLE,CFLOAT,CDOUBLE,CLONGDOUBLE,OBJECT,STRING,UNICODE,VOID#
@@ -1907,13 +1919,6 @@ set_typeinfo(PyObject *dict)
 	@name@_Descr.fields = Py_None;
 /**end repeat**/
 	
-	if (sizeof(PyArray_UCS4) == 2) {
-		UNICODE_Descr.kind = PyArray_UCS2LTR;
-	}
-	else if (sizeof(PyArray_UCS4) == 4) {
-		UNICODE_Descr.kind = PyArray_UCS4LTR;
-	}
-
 	/* Set a dictionary with type information */
 	infodict = PyDict_New();
 	if (infodict == NULL) return -1;
diff --git a/numpy/core/src/multiarraymodule.c b/numpy/core/src/multiarraymodule.c
index ddacb68c9..f65c74e88 100644
--- a/numpy/core/src/multiarraymodule.c
+++ b/numpy/core/src/multiarraymodule.c
@@ -3885,7 +3885,7 @@ PyArray_DescrConverter(PyObject *obj, PyArray_Descr **at)
 			   the number of bytes.
 			*/
 			else if (check_num == PyArray_UNICODELTR) {
-				elsize *= 4;
+				elsize <<= 2;
 			}
 			/* Support for generic processing 
 			   c4, i4, f8, etc...
diff --git a/numpy/core/src/scalartypes.inc.src b/numpy/core/src/scalartypes.inc.src
index 58f9bebb2..1ef8372f2 100644
--- a/numpy/core/src/scalartypes.inc.src
+++ b/numpy/core/src/scalartypes.inc.src
@@ -184,8 +184,11 @@ PyArray_FromScalar(PyObject *scalar, PyArray_Descr *outcode)
 		memptr = (char *)PyUnicode_AS_DATA(scalar);
 #ifdef Py_UNICODE_WIDE
 		break;
-#else:
-                PyUCS2Unicode_AsUCS4(PyArray_DATA(r), memptr, PyArray_ITEMSIZE(r));
+#else
+		PyUCS2Buffer_AsUCS4((Py_UNICODE *)memptr, 
+				    (PyArray_UCS4 *)PyArray_DATA(r),
+				    PyUnicode_GET_SIZE(scalar),
+				    PyArray_ITEMSIZE(r) >> 2);
                 goto finish;
 #endif
 	default:
diff --git a/numpy/core/src/ucsnarrow.c b/numpy/core/src/ucsnarrow.c
index 8960026d5..6480303a9 100644
--- a/numpy/core/src/ucsnarrow.c
+++ b/numpy/core/src/ucsnarrow.c
@@ -1,3 +1,7 @@
+/* Functions only needed on narrow builds of Python 
+   for converting back and forth between the NumPy Unicode data-type (always 4-byte)
+   and the Python Unicode scalar (2-bytes on a narrow build). 
+ */
 
 /* the ucs2 buffer must be large enough to hold 2*ucs4length characters
    due to the use of surrogate pairs. 
@@ -29,12 +33,74 @@ PyUCS2Buffer_FromUCS4(Py_UNICODE *ucs2, PyArray_UCS4 *ucs4, int ucs4length)
 }
 
 
-/* This converts a UCS2 buffer (from a Python unicode object)
+/* This converts a UCS2 buffer of the given length to a UCS4 buffer.
+   It reads up to ucs2len UCS2 units and writes at most ucs4len characters.
 
-*/
+   It returns the number of characters converted, which can
+   be less than ucs2len if there are surrogate pairs in ucs2.
 
+   The return value is the actual size of the used part of the ucs4 buffer. 
+*/
 
 static int
-PyUCS2Buffer_AsUCS4(Py_UNICODE *ucs2, PyArray_UCS4 *ucs4, int ucs4length, int ucs2length)
+PyUCS2Buffer_AsUCS4(Py_UNICODE *ucs2, PyArray_UCS4 *ucs4, int ucs2len, int ucs4len)
 {
+	/* i counts UCS2 units consumed; numchars counts UCS4 chars written */
+	register int i;
+	register PyArray_UCS4 chr;
+	register Py_UNICODE ch;
+	register int numchars=0;
+	for (i=0; (i < ucs2len) && (numchars < ucs4len); i++) {
+		ch = *ucs2++;
+		if (ch >= 0xd800 && ch <= 0xdbff && (i+1 < ucs2len)) {
+			/* high surrogate with a unit after it: join the pair */
+			chr = ((PyArray_UCS4)(ch-0xd800)) << 10;
+			chr += *ucs2++ + 0x2400;  /* -0xdc00 + 0x10000 */
+			i++;
+		}
+		else {
+			chr = (PyArray_UCS4) ch;
+		}
+		*ucs4++ = chr;
+		numchars++;
+	}
+	return numchars;
+}
+
+
+static PyObject *
+MyPyUnicode_New(int length)
+{	/* Make a unicode object with room for length chars plus a terminator */
+	PyUnicodeObject *unicode;
+	unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
+	if (unicode == NULL) return NULL;
+	unicode->str = PyMem_NEW(Py_UNICODE, length+1);	/* +1 slot for the 0 at str[length] */
+	if (!unicode->str) {	/* buffer allocation failed: discard the object */
+		_Py_ForgetReference((PyObject *)unicode);
+		PyObject_Del(unicode);
+		return PyErr_NoMemory();
+	}
+	unicode->str[0] = 0;	/* only first and last slots are set; the rest is left unfilled */
+	unicode->str[length] = 0;
+	unicode->length = length;
+	unicode->hash = -1;
+	unicode->defenc = NULL;
+	return (PyObject *)unicode;
+}
+
+static int
+MyPyUnicode_Resize(PyUnicodeObject *uni, int length)
+{	/* Resize uni->str to length+1 slots; returns 0 on success, -1 on failure */
+	void *oldstr;
+	/* keep the old pointer: PyMem_RESIZE assigns its result to uni->str */
+	oldstr = uni->str;
+	PyMem_RESIZE(uni->str, Py_UNICODE, length+1);
+	if (!uni->str) {
+		uni->str = oldstr;	/* put the previous buffer pointer back */
+		PyErr_NoMemory();
+		return -1;
+	}
+	uni->str[length] = 0;	/* terminator after the last character */
+	uni->length = length;
+	return 0;
 }
author	Travis Oliphant <oliphant@enthought.com>	2006-02-09 04:38:58 +0000
committer	Travis Oliphant <oliphant@enthought.com>	2006-02-09 04:38:58 +0000
commit	f1bffaafc8e94cb5e1e94bd0527410108903669c (patch)
tree	3971fa5d91cc728a70a9f7a3d9fa721db2e1368c /numpy/core
parent	520b1c94e456dacf814b1a2d1ff13e8133294cb1 (diff)
download	numpy-f1bffaafc8e94cb5e1e94bd0527410108903669c.tar.gz