""" Module for character arrays. .. note:: The chararray module exists for backwards compatibility with Numarray, it is not recommended for new development. If one needs arrays of strings, use arrays of `dtype` object. The preferred alias for `defchararray` is `numpy.char`. """ import sys from numerictypes import string_, unicode_, integer, object_ from numeric import ndarray, broadcast, empty, compare_chararrays from numeric import array as narray __all__ = ['chararray'] _globalvar = 0 _unicode = unicode # special sub-class for character arrays (string_ and unicode_) # This adds + and * operations and methods of str and unicode types # which operate on an element-by-element basis # It also strips white-space on element retrieval and on # comparisons class chararray(ndarray): """ chararray(shape, itemsize=1, unicode=False, buffer=None, offset=0, strides=None, order=None) An array of fixed size (perhaps unicode) strings. .. note:: The chararray module exists for backwards compatibility with Numarray, it is not recommended for new development. If one needs arrays of strings, use arrays of `dtype` object. Create the array, using `buffer` (with `offset` and `strides`) if it is not ``None``. If `buffer` is ``None``, then construct a new array with `strides` in "C order," unless both ``len(shape) >= 2`` and ``order='Fortran'``, in which case `strides` is in "Fortran order." Parameters ---------- shape : tuple Shape of the array. itemsize : int_like, > 0, optional Length of each array element, in number of characters. Default is 1. unicode : {True, False}, optional Are the array elements of unicode-type (``True``) or string-type (``False``, the default). buffer : integer, > 0, optional Memory address of the start of the array data. If ``None`` (the default), a new array is created. offset : integer, >= 0, optional Fixed stride displacement from the beginning of an axis? Default is 0. strides : array_like(?), optional Strides for the array (see `numpy.ndarray.strides` for full description), default is ``None``. order : {'C', 'F'}, optional The order in which the array data is stored in memory: 'C' -> "row major" order (the default), 'F' -> "column major" (Fortran) order Examples -------- >>> charar = np.chararray((3, 3)) >>> charar[:,:] = 'abc' >>> charar chararray([['a', 'a', 'a'], ['a', 'a', 'a'], ['a', 'a', 'a']], dtype='|S1') >>> charar = np.chararray(charar.shape, itemsize=5) >>> charar[:,:] = 'abc' >>> charar chararray([['abc', 'abc', 'abc'], ['abc', 'abc', 'abc'], ['abc', 'abc', 'abc']], dtype='|S5') """ def __new__(subtype, shape, itemsize=1, unicode=False, buffer=None, offset=0, strides=None, order='C'): global _globalvar if unicode: dtype = unicode_ else: dtype = string_ _globalvar = 1 if buffer is None: self = ndarray.__new__(subtype, shape, (dtype, itemsize), order=order) else: self = ndarray.__new__(subtype, shape, (dtype, itemsize), buffer=buffer, offset=offset, strides=strides, order=order) _globalvar = 0 return self def __array_finalize__(self, obj): # The b is a special case because it is used for reconstructing. if not _globalvar and self.dtype.char not in 'SUbc': raise ValueError, "Can only create a chararray from string data." def __getitem__(self, obj): val = ndarray.__getitem__(self, obj) if isinstance(val, (string_, unicode_)): temp = val.rstrip() if len(temp) == 0: val = '' else: val = temp return val def __eq__(self, other): return compare_chararrays(self, other, '==', True) def __ne__(self, other): return compare_chararrays(self, other, '!=', True) def __ge__(self, other): return compare_chararrays(self, other, '>=', True) def __le__(self, other): return compare_chararrays(self, other, '<=', True) def __gt__(self, other): return compare_chararrays(self, other, '>', True) def __lt__(self, other): return compare_chararrays(self, other, '<', True) def __add__(self, other): b = broadcast(self, other) arr = b.iters[1].base outitem = self.itemsize + arr.itemsize result = chararray(b.shape, outitem, self.dtype is unicode_) res = result.flat for k, val in enumerate(b): res[k] = (val[0] + val[1]) return result def __radd__(self, other): b = broadcast(other, self) outitem = b.iters[0].base.itemsize + \ b.iters[1].base.itemsize result = chararray(b.shape, outitem, self.dtype is unicode_) res = result.flat for k, val in enumerate(b): res[k] = (val[0] + val[1]) return result def __mul__(self, other): b = broadcast(self, other) arr = b.iters[1].base if not issubclass(arr.dtype.type, integer): raise ValueError, "Can only multiply by integers" outitem = b.iters[0].base.itemsize * arr.max() result = chararray(b.shape, outitem, self.dtype is unicode_) res = result.flat for k, val in enumerate(b): res[k] = val[0]*val[1] return result def __rmul__(self, other): b = broadcast(self, other) arr = b.iters[1].base if not issubclass(arr.dtype.type, integer): raise ValueError, "Can only multiply by integers" outitem = b.iters[0].base.itemsize * arr.max() result = chararray(b.shape, outitem, self.dtype is unicode_) res = result.flat for k, val in enumerate(b): res[k] = val[0]*val[1] return result def __mod__(self, other): b = broadcast(self, other) res = [None]*b.size maxsize = -1 for k,val in enumerate(b): newval = val[0] % val[1] maxsize = max(len(newval), maxsize) res[k] = newval newarr = chararray(b.shape, maxsize, self.dtype is unicode_) newarr[:] = res return newarr def __rmod__(self, other): return NotImplemented def argsort(self, axis=-1, kind='quicksort', order=None): """ Return the indices that sort the array lexicographically. For full documentation see `numpy.argsort`, for which this method is in fact merely a "thin wrapper." Examples -------- >>> c = np.array(['a1b c', '1b ca', 'b ca1', 'Ca1b'], 'S5') >>> c = c.view(np.chararray); c chararray(['a1b c', '1b ca', 'b ca1', 'Ca1b'], dtype='|S5') >>> c[c.argsort()] chararray(['1b ca', 'Ca1b', 'a1b c', 'b ca1'], dtype='|S5') """ return self.__array__().argsort(axis, kind, order) def _generalmethod(self, name, myiter): res = [None]*myiter.size maxsize = -1 for k, val in enumerate(myiter): newval = [] for chk in val[1:]: if not chk or (chk.dtype is object_ and chk.item() is None): break newval.append(chk) newitem = getattr(val[0],name)(*newval) maxsize = max(len(newitem), maxsize) res[k] = newitem newarr = chararray(myiter.shape, maxsize, self.dtype is unicode_) newarr[:] = res return newarr def _typedmethod(self, name, myiter, dtype): result = empty(myiter.shape, dtype=dtype) res = result.flat for k, val in enumerate(myiter): newval = [] for chk in val[1:]: if not chk or (chk.dtype is object_ and chk.item() is None): break newval.append(chk) this_str = val[0].rstrip('\x00') newitem = getattr(this_str,name)(*newval) res[k] = newitem return result def _samemethod(self, name): result = self.copy() res = result.flat for k, val in enumerate(self.flat): res[k] = getattr(val, name)() return result def capitalize(self): """ Capitalize the first character of each array element. For each element of `self`, if the first character is a letter possessing both "upper-case" and "lower-case" forms, and it is presently in lower-case, change it to upper-case; otherwise, leave it untouched. Parameters ---------- None Returns ------- ret : chararray `self` with each element "title-cased." Examples -------- >>> c = np.array(['a1b2','1b2a','b2a1','2a1b'],'S4').view(np.chararray); c chararray(['a1b2', '1b2a', 'b2a1', '2a1b'], dtype='|S4') >>> c.capitalize() chararray(['A1b2', '1b2a', 'B2a1', '2a1b'], dtype='|S4') """ return self._samemethod('capitalize') if sys.version[:3] >= '2.4': def center(self, width, fillchar=' '): return self._generalmethod('center', broadcast(self, width, fillchar)) def ljust(self, width, fillchar=' '): return self._generalmethod('ljust', broadcast(self, width, fillchar)) def rjust(self, width, fillchar=' '): return self._generalmethod('rjust', broadcast(self, width, fillchar)) def rsplit(self, sep=None, maxsplit=None): return self._typedmethod('rsplit', broadcast(self, sep, maxsplit), object) else: def ljust(self, width): return self._generalmethod('ljust', broadcast(self, width)) def rjust(self, width): return self._generalmethod('rjust', broadcast(self, width)) def center(self, width): return self._generalmethod('center', broadcast(self, width)) def count(self, sub, start=None, end=None): """ Return the number of occurrences of a sub-string in each array element. Parameters ---------- sub : string The sub-string to count. start : int, optional The string index at which to start counting in each element. end : int, optional The string index at which to end counting in each element. Returns ------- ret : ndarray of ints Array whose elements are the number of occurrences of `sub` in each element of `self`. Examples -------- >>> c = np.array(['aAaAaA', ' aA ', 'abBABba']).view(np.chararray) >>> c chararray(['aAaAaA', ' aA', 'abBABba'], dtype='|S7') >>> c.count('A') array([3, 1, 1]) >>> c.count('aA') array([3, 1, 0]) >>> c.count('A', start=1, end=4) array([2, 1, 1]) >>> c.count('A', start=1, end=3) array([1, 0, 0]) """ return self._typedmethod('count', broadcast(self, sub, start, end), int) def decode(self,encoding=None,errors=None): """ Return elements decoded according to the value of `encoding`. Parameters ---------- encoding : string, optional The encoding to use; for a list of acceptable values, see the Python docstring for the package 'encodings' error : Python exception object?, optional The exception to raise if decoding fails? Returns ------- ret : chararray A view of `self`, suitably decoded. See Also -------- encode encodings (package) Examples -------- >>> c = np.array(['aAaAaA', ' aA ', 'abBABba']).view(np.chararray) >>> c chararray(['aAaAaA', ' aA', 'abBABba'], dtype='|S7') >>> c = c.encode(encoding='cp037'); c chararray(['\\x81\\xc1\\x81\\xc1\\x81\\xc1', '@@\\x81\\xc1@@', '\\x81\\x82\\xc2\\xc1\\xc2\\x82\\x81'], dtype='|S7') >>> c.decode(encoding='cp037') chararray(['aAaAaA', ' aA', 'abBABba'], dtype='|S7') """ return self._generalmethod('decode', broadcast(self, encoding, errors)) def encode(self,encoding=None,errors=None): """ Return elements encoded according to the value of `encoding`. Parameters ---------- encoding : string, optional The encoding to use; for a list of acceptable values, see the Python docstring for `encodings`. error : Python exception object, optional The exception to raise if encoding fails. Returns ------- ret : chararray A view of `self`, suitably encoded. See Also -------- decode Examples -------- >>> c = np.array(['aAaAaA', ' aA ', 'abBABba']).view(np.chararray) >>> c chararray(['aAaAaA', ' aA', 'abBABba'], dtype='|S7') >>> c.encode(encoding='cp037') chararray(['\\x81\\xc1\\x81\\xc1\\x81\\xc1', '@@\\x81\\xc1@@', '\\x81\\x82\\xc2\\xc1\\xc2\\x82\\x81'], dtype='|S7') """ return self._generalmethod('encode', broadcast(self, encoding, errors)) def endswith(self, suffix, start=None, end=None): """ Check whether elements end with specified suffix Given an array of strings, return a new bool array of same shape with the result of comparing suffix against each element; each element of bool array is ``True`` if element ends with specified suffix and ``False`` otherwise. Parameters ---------- suffix : string Compare each element in array to this. start : int, optional For each element, start searching from this position. end : int, optional For each element, stop comparing at this position. Returns ------- endswith : ndarray Output array of bools See Also -------- count find index startswith Examples -------- >>> s = chararray(3, itemsize=3) >>> s[0] = 'foo' >>> s[1] = 'bar' >>> s chararray(['foo', 'bar'], dtype='|S3') >>> s.endswith('ar') array([False, True], dtype=bool) >>> s.endswith('a', start=1, end=2) array([False, True], dtype=bool) """ return self._typedmethod('endswith', broadcast(self, suffix, start, end), bool) def expandtabs(self, tabsize=None): return self._generalmethod('endswith', broadcast(self, tabsize)) def find(self, sub, start=None, end=None): return self._typedmethod('find', broadcast(self, sub, start, end), int) def index(self, sub, start=None, end=None): return self._typedmethod('index', broadcast(self, sub, start, end), int) def _ismethod(self, name): result = empty(self.shape, dtype=bool) res = result.flat for k, val in enumerate(self.flat): item = val.rstrip('\x00') res[k] = getattr(item, name)() return result def isalnum(self): return self._ismethod('isalnum') def isalpha(self): return self._ismethod('isalpha') def isdigit(self): return self._ismethod('isdigit') def islower(self): return self._ismethod('islower') def isspace(self): return self._ismethod('isspace') def istitle(self): return self._ismethod('istitle') def isupper(self): return self._ismethod('isupper') def join(self, seq): return self._generalmethod('join', broadcast(self, seq)) def lower(self): """ Assure that every character of each array element is lower-case. For each character possessing both "upper-case" and "lower-case" forms, if it is in upper-case, change it to lower; otherwise, leave it unchanged. Parameters ---------- None Returns ------- ret : chararray `self` with all capital letters changed to lower-case. Examples -------- >>> c = np.array(['A1B C', '1BCA', 'BCA1']).view(np.chararray); c chararray(['A1B C', '1BCA', 'BCA1'], dtype='|S5') >>> c.lower() chararray(['a1b c', '1bca', 'bca1'], dtype='|S5') """ return self._samemethod('lower') def lstrip(self, chars): """ Remove leading characters from each element. Returns a view of ``self`` with `chars` stripped from the start of each element. Note: **No Default** - `chars` must be specified (but if it is explicitly ``None`` or the empty string '', leading whitespace is removed). Parameters ---------- chars : string_like or None Character(s) to strip; whitespace stripped if `chars` == ``None`` or `chars` == ''. Returns ------- ret : chararray View of ``self``, each element suitably stripped. Raises ------ TypeError: lstrip() takes exactly 2 arguments (1 given) If `chars` is not supplied. Examples -------- >>> c = np.array(['aAaAaA', ' aA ', 'abBABba']).view(np.chararray) >>> c chararray(['aAaAaA', ' aA', 'abBABba'], dtype='|S7') >>> c.lstrip('a') # 'a' unstripped from c[1] because whitespace leading chararray(['AaAaA', ' aA', 'bBABba'], dtype='|S6') >>> c.lstrip('A') # leaves c unchanged chararray(['aAaAaA', ' aA', 'abBABba'], dtype='|S7') >>> (c.lstrip(' ') == c.lstrip('')).all() True >>> (c.lstrip(' ') == c.lstrip(None)).all() True """ return self._generalmethod('lstrip', broadcast(self, chars)) def replace(self, old, new, count=None): return self._generalmethod('replace', broadcast(self, old, new, count)) def rfind(self, sub, start=None, end=None): return self._typedmethod('rfind', broadcast(self, sub, start, end), int) def rindex(self, sub, start=None, end=None): return self._typedmethod('rindex', broadcast(self, sub, start, end), int) def rstrip(self, chars=None): """ Remove trailing characters. Returns a view of ``self`` with `chars` stripped from the end of each element. Parameters ---------- chars : string_like, optional, default=None Character(s) to remove. Returns ------- ret : chararray View of ``self``, each element suitably stripped. Examples -------- >>> c = np.array(['aAaAaA', 'abBABba'], dtype='S7').view(np.chararray); c chararray(['aAaAaA', 'abBABba'], dtype='|S7') >>> c.rstrip('a') chararray(['aAaAaA', 'abBABb'], dtype='|S6') >>> c.rstrip('A') chararray(['aAaAa', 'abBABba'], dtype='|S7') """ return self._generalmethod('rstrip', broadcast(self, chars)) def split(self, sep=None, maxsplit=None): return self._typedmethod('split', broadcast(self, sep, maxsplit), object) def splitlines(self, keepends=None): return self._typedmethod('splitlines', broadcast(self, keepends), object) def startswith(self, prefix, start=None, end=None): return self._typedmethod('startswith', broadcast(self, prefix, start, end), bool) def strip(self, chars=None): """ Remove leading and trailing characters, whitespace by default. Returns a view of ``self`` with `chars` stripped from the start and end of each element; by default leading and trailing whitespace is removed. Parameters ---------- chars : string_like, optional, default=None Character(s) to strip; whitespace by default. Returns ------- ret : chararray View of ``self``, each element suitably stripped. Examples -------- >>> c = np.array(['aAaAaA', ' aA ', 'abBABba']).view(np.chararray) >>> c chararray(['aAaAaA', ' aA', 'abBABba'], dtype='|S7') >>> c.strip() chararray(['aAaAaA', 'aA', 'abBABba'], dtype='|S7') >>> c.strip('a') # 'a' unstripped from c[1] because whitespace leads chararray(['AaAaA', ' aA', 'bBABb'], dtype='|S6') >>> c.strip('A') # 'A' unstripped from c[1] because (unprinted) ws trails chararray(['aAaAa', ' aA', 'abBABba'], dtype='|S7') """ return self._generalmethod('strip', broadcast(self, chars)) def swapcase(self): """ Switch upper-case letters to lower-case, and vice-versa. Parameters ---------- None Returns ------- ret : chararray `self` with all lower-case letters capitalized and all upper-case changed to lower case. Examples -------- >>> c=np.array(['a1B c','1b Ca','b Ca1','cA1b'],'S5').view(np.chararray);c chararray(['a1B c', '1b Ca', 'b Ca1', 'cA1b'], dtype='|S5') >>> c.swapcase() chararray(['A1b C', '1B cA', 'B cA1', 'Ca1B'], dtype='|S5') """ return self._samemethod('swapcase') def title(self): """ Capitalize the first character of each array element. For each element of `self`, if the first character is a letter possessing both "upper-case" and "lower-case" forms, and it is presently in lower-case, change it to upper-case; otherwise, leave it untouched. Parameters ---------- None Returns ------- ret : chararray `self` with Examples -------- >>> c=np.array(['a1b c','1b ca','b ca1','ca1b'],'S5').view(np.chararray);c chararray(['a1b c', '1b ca', 'b ca1', 'ca1b'], dtype='|S5') >>> c.title() chararray(['A1B C', '1B Ca', 'B Ca1', 'Ca1B'], dtype='|S5') """ return self._samemethod('title') def translate(self, table, deletechars=None): if self.dtype is unicode_: return self._generalmethod('translate', broadcast(self, table)) else: return self._generalmethod('translate', broadcast(self, table, deletechars)) def upper(self): """ Capitalize every character of each array element. For each character possessing both "upper-case" and "lower-case" forms, if it is in lower-case, change it to upper; otherwise, leave it unchanged. Parameters ---------- None Returns ------- ret : chararray `self` with all characters capitalized. Examples -------- >>> c = np.array(['a1b c', '1bca', 'bca1']).view(np.chararray); c chararray(['a1b c', '1bca', 'bca1'], dtype='|S5') >>> c.upper() chararray(['A1B C', '1BCA', 'BCA1'], dtype='|S5') """ return self._samemethod('upper') def zfill(self, width): return self._generalmethod('zfill', broadcast(self, width)) def array(obj, itemsize=None, copy=True, unicode=False, order=None): if isinstance(obj, chararray): if itemsize is None: itemsize = obj.itemsize if copy or (itemsize != obj.itemsize) \ or (not unicode and obj.dtype == unicode_) \ or (unicode and obj.dtype == string_): return obj.astype("%s%d" % (obj.dtype.char, itemsize)) else: return obj if isinstance(obj, ndarray) and (obj.dtype in [unicode_, string_]): new = obj.view(chararray) if unicode and obj.dtype == string_: return new.astype((unicode_, obj.itemsize)) elif obj.dtype == unicode_: return new.astype((string_, obj.itemsize)) if copy: return new.copy() else: return new if unicode: dtype = "U" else: dtype = "S" if itemsize is not None: dtype += str(itemsize) if isinstance(obj, (str, _unicode)): if itemsize is None: itemsize = len(obj) shape = len(obj) / itemsize return chararray(shape, itemsize=itemsize, unicode=unicode, buffer=obj) # default val = narray(obj, dtype=dtype, order=order, subok=1) return val.view(chararray) def asarray(obj, itemsize=None, unicode=False, order=None): return array(obj, itemsize, copy=False, unicode=unicode, order=order)