summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRaymond Hettinger <python@rcn.com>2003-12-06 16:23:06 +0000
committerRaymond Hettinger <python@rcn.com>2003-12-06 16:23:06 +0000
commitd25c1c635164daa5c300342ac99c0810fd9b575c (patch)
treedf412ba3ffaa8fee35e2e12f96aab0beecdaaec0
parentb8d5f245b7077d869121835ed72656ac14962ef0 (diff)
downloadcpython-git-d25c1c635164daa5c300342ac99c0810fd9b575c.tar.gz
Implement itertools.groupby()
Original idea by Guido van Rossum. Idea for skipable inner iterators by Raymond Hettinger. Idea for argument order and identity function default by Alex Martelli. Implementation by Hye-Shik Chang (with tweaks by Raymond Hettinger).
-rw-r--r--Doc/lib/libitertools.tex60
-rw-r--r--Lib/test/test_itertools.py108
-rw-r--r--Misc/NEWS5
-rw-r--r--Modules/itertoolsmodule.c322
4 files changed, 493 insertions, 2 deletions
diff --git a/Doc/lib/libitertools.tex b/Doc/lib/libitertools.tex
index 6f9f5c6259..82912b0501 100644
--- a/Doc/lib/libitertools.tex
+++ b/Doc/lib/libitertools.tex
@@ -130,6 +130,54 @@ by functions or loops that truncate the stream.
\end{verbatim}
\end{funcdesc}
+\begin{funcdesc}{groupby}{iterable\optional{, key}}
+ Make an iterator that returns consecutive keys and groups from the
+ \var{iterable}. \var{key} is function computing a key value for each
+ element. If not specified or is \code{None}, \var{key} defaults to an
+ identity function (returning the element unchanged). Generally, the
+ iterable needs to already be sorted on the same key function.
+
+ The returned group is itself an iterator that shares the underlying
+ iterable with \function{groupby()}. Because the source is shared, when
+ the \function{groupby} object is advanced, the previous group is no
+ longer visible. So, if that data is needed later, it should be stored
+ as a list:
+
+ \begin{verbatim}
+ groups = []
+ uniquekeys = []
+ for k, g in groupby(data, keyfunc):
+ groups.append(list(g)) # Store group iterator as a list
+ uniquekeys.append(k)
+ \end{verbatim}
+
+ \function{groupby()} is equivalent to:
+
+ \begin{verbatim}
+ class groupby(object):
+ def __init__(self, iterable, key=None):
+ if key is None:
+ key = lambda x: x
+ self.keyfunc = key
+ self.it = iter(iterable)
+ self.tgtkey = self.currkey = self.currvalue = xrange(0)
+ def __iter__(self):
+ return self
+ def next(self):
+ while self.currkey == self.tgtkey:
+ self.currvalue = self.it.next() # Exit on StopIteration
+ self.currkey = self.keyfunc(self.currvalue)
+ self.tgtkey = self.currkey
+ return (self.currkey, self._grouper(self.tgtkey))
+ def _grouper(self, tgtkey):
+ while self.currkey == tgtkey:
+ yield self.currvalue
+ self.currvalue = self.it.next() # Exit on StopIteration
+ self.currkey = self.keyfunc(self.currvalue)
+ \end{verbatim}
+ \versionadded{2.4}
+\end{funcdesc}
+
\begin{funcdesc}{ifilter}{predicate, iterable}
Make an iterator that filters elements from iterable returning only
those for which the predicate is \code{True}.
@@ -346,6 +394,18 @@ Martin
Walter
Samuele
+# Show a dictionary sorted and grouped by value
+>>> from operator import itemgetter
+>>> d = dict(a=1, b=2, c=1, d=2, e=1, f=2, g=3)
+>>> di = list.sorted(d.iteritems(), key=itemgetter(1))
+>>> for k, g in groupby(di, key=itemgetter(1)):
+... print k, map(itemgetter(0), g)
+...
+1 ['a', 'c', 'e']
+2 ['b', 'd', 'f']
+3 ['g']
+
+
\end{verbatim}
This section shows how itertools can be combined to create other more
diff --git a/Lib/test/test_itertools.py b/Lib/test/test_itertools.py
index 543acc199c..b4c0a8bff2 100644
--- a/Lib/test/test_itertools.py
+++ b/Lib/test/test_itertools.py
@@ -61,6 +61,94 @@ class TestBasicOps(unittest.TestCase):
self.assertRaises(TypeError, cycle, 5)
self.assertEqual(list(islice(cycle(gen3()),10)), [0,1,2,0,1,2,0,1,2,0])
+ def test_groupby(self):
+ # Check whether it accepts arguments correctly
+ self.assertEqual([], list(groupby([])))
+ self.assertEqual([], list(groupby([], key=id)))
+ self.assertRaises(TypeError, list, groupby('abc', []))
+ self.assertRaises(TypeError, groupby, None)
+
+ # Check normal input
+ s = [(0, 10, 20), (0, 11,21), (0,12,21), (1,13,21), (1,14,22),
+ (2,15,22), (3,16,23), (3,17,23)]
+ dup = []
+ for k, g in groupby(s, lambda r:r[0]):
+ for elem in g:
+ self.assertEqual(k, elem[0])
+ dup.append(elem)
+ self.assertEqual(s, dup)
+
+ # Check nested case
+ dup = []
+ for k, g in groupby(s, lambda r:r[0]):
+ for ik, ig in groupby(g, lambda r:r[2]):
+ for elem in ig:
+ self.assertEqual(k, elem[0])
+ self.assertEqual(ik, elem[2])
+ dup.append(elem)
+ self.assertEqual(s, dup)
+
+ # Check case where inner iterator is not used
+ keys = [k for k, g in groupby(s, lambda r:r[0])]
+ expectedkeys = set([r[0] for r in s])
+ self.assertEqual(set(keys), expectedkeys)
+ self.assertEqual(len(keys), len(expectedkeys))
+
+ # Exercise pipes and filters style
+ s = 'abracadabra'
+ # sort s | uniq
+ r = [k for k, g in groupby(list.sorted(s))]
+ self.assertEqual(r, ['a', 'b', 'c', 'd', 'r'])
+ # sort s | uniq -d
+ r = [k for k, g in groupby(list.sorted(s)) if list(islice(g,1,2))]
+ self.assertEqual(r, ['a', 'b', 'r'])
+ # sort s | uniq -c
+ r = [(len(list(g)), k) for k, g in groupby(list.sorted(s))]
+ self.assertEqual(r, [(5, 'a'), (2, 'b'), (1, 'c'), (1, 'd'), (2, 'r')])
+ # sort s | uniq -c | sort -rn | head -3
+ r = list.sorted([(len(list(g)) , k) for k, g in groupby(list.sorted(s))], reverse=True)[:3]
+ self.assertEqual(r, [(5, 'a'), (2, 'r'), (2, 'b')])
+
+ # iter.next failure
+ class ExpectedError(Exception):
+ pass
+ def delayed_raise(n=0):
+ for i in range(n):
+ yield 'yo'
+ raise ExpectedError
+ def gulp(iterable, keyp=None, func=list):
+ return [func(g) for k, g in groupby(iterable, keyp)]
+
+ # iter.next failure on outer object
+ self.assertRaises(ExpectedError, gulp, delayed_raise(0))
+ # iter.next failure on inner object
+ self.assertRaises(ExpectedError, gulp, delayed_raise(1))
+
+ # __cmp__ failure
+ class DummyCmp:
+ def __cmp__(self, dst):
+ raise ExpectedError
+ s = [DummyCmp(), DummyCmp(), None]
+
+ # __cmp__ failure on outer object
+ self.assertRaises(ExpectedError, gulp, s, func=id)
+ # __cmp__ failure on inner object
+ self.assertRaises(ExpectedError, gulp, s)
+
+ # keyfunc failure
+ def keyfunc(obj):
+ if keyfunc.skip > 0:
+ keyfunc.skip -= 1
+ return obj
+ else:
+ raise ExpectedError
+
+ # keyfunc failure on outer object
+ keyfunc.skip = 0
+ self.assertRaises(ExpectedError, gulp, [None], keyfunc)
+ keyfunc.skip = 1
+ self.assertRaises(ExpectedError, gulp, [None, None], keyfunc)
+
def test_ifilter(self):
self.assertEqual(list(ifilter(isEven, range(6))), [0,2,4])
self.assertEqual(list(ifilter(None, [0,1,0,2,0])), [1,2])
@@ -268,7 +356,7 @@ class TestBasicOps(unittest.TestCase):
def test_StopIteration(self):
self.assertRaises(StopIteration, izip().next)
- for f in (chain, cycle, izip):
+ for f in (chain, cycle, izip, groupby):
self.assertRaises(StopIteration, f([]).next)
self.assertRaises(StopIteration, f(StopNow()).next)
@@ -426,6 +514,14 @@ class TestVariousIteratorArgs(unittest.TestCase):
self.assertRaises(TypeError, list, cycle(N(s)))
self.assertRaises(ZeroDivisionError, list, cycle(E(s)))
+ def test_groupby(self):
+ for s in (range(10), range(0), range(1000), (7,11), xrange(2000,2200,5)):
+ for g in (G, I, Ig, S, L, R):
+ self.assertEqual([k for k, sb in groupby(g(s))], list(g(s)))
+ self.assertRaises(TypeError, groupby, X(s))
+ self.assertRaises(TypeError, list, groupby(N(s)))
+ self.assertRaises(ZeroDivisionError, list, groupby(E(s)))
+
def test_ifilter(self):
for s in (range(10), range(0), range(1000), (7,11), xrange(2000,2200,5)):
for g in (G, I, Ig, S, L, R):
@@ -571,6 +667,16 @@ Martin
Walter
Samuele
+>>> from operator import itemgetter
+>>> d = dict(a=1, b=2, c=1, d=2, e=1, f=2, g=3)
+>>> di = list.sorted(d.iteritems(), key=itemgetter(1))
+>>> for k, g in groupby(di, itemgetter(1)):
+... print k, map(itemgetter(0), g)
+...
+1 ['a', 'c', 'e']
+2 ['b', 'd', 'f']
+3 ['g']
+
>>> def take(n, seq):
... return list(islice(seq, n))
diff --git a/Misc/NEWS b/Misc/NEWS
index af61c4ebb8..a28d0424c4 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -164,6 +164,11 @@ Extension modules
SF bug #812202). Generators that do not define genrandbits() now
issue a warning when randrange() is called with a range that large.
+- itertools has a new function, groupby() for aggregating iterables
+ into groups sharing the same key (as determined by a key function).
+ It offers some of functionality of SQL's groupby keyword and of
+ the Unix uniq filter.
+
- itertools now has a new function, tee() which produces two independent
iterators from a single iterable.
diff --git a/Modules/itertoolsmodule.c b/Modules/itertoolsmodule.c
index a341a6630a..387133c474 100644
--- a/Modules/itertoolsmodule.c
+++ b/Modules/itertoolsmodule.c
@@ -7,6 +7,323 @@
All rights reserved.
*/
+
+/* groupby object ***********************************************************/
+
+typedef struct {
+ PyObject_HEAD
+ PyObject *it;
+ PyObject *keyfunc;
+ PyObject *tgtkey;
+ PyObject *currkey;
+ PyObject *currvalue;
+} groupbyobject;
+
+static PyTypeObject groupby_type;
+static PyObject *_grouper_create(groupbyobject *, PyObject *);
+
+static PyObject *
+groupby_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
+{
+ static char *kwargs[] = {"iterable", "key", NULL};
+ groupbyobject *gbo;
+ PyObject *it, *keyfunc = Py_None;
+
+ if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|O:groupby", kwargs,
+ &it, &keyfunc))
+ return NULL;
+
+ gbo = (groupbyobject *)type->tp_alloc(type, 0);
+ if (gbo == NULL)
+ return NULL;
+ gbo->tgtkey = NULL;
+ gbo->currkey = NULL;
+ gbo->currvalue = NULL;
+ gbo->keyfunc = keyfunc;
+ Py_INCREF(keyfunc);
+ gbo->it = PyObject_GetIter(it);
+ if (gbo->it == NULL) {
+ Py_DECREF(gbo);
+ return NULL;
+ }
+ return (PyObject *)gbo;
+}
+
+static void
+groupby_dealloc(groupbyobject *gbo)
+{
+ PyObject_GC_UnTrack(gbo);
+ Py_XDECREF(gbo->it);
+ Py_XDECREF(gbo->keyfunc);
+ Py_XDECREF(gbo->tgtkey);
+ Py_XDECREF(gbo->currkey);
+ Py_XDECREF(gbo->currvalue);
+ gbo->ob_type->tp_free(gbo);
+}
+
+static int
+groupby_traverse(groupbyobject *gbo, visitproc visit, void *arg)
+{
+ int err;
+
+ if (gbo->it) {
+ err = visit(gbo->it, arg);
+ if (err)
+ return err;
+ }
+ if (gbo->keyfunc) {
+ err = visit(gbo->keyfunc, arg);
+ if (err)
+ return err;
+ }
+ if (gbo->tgtkey) {
+ err = visit(gbo->tgtkey, arg);
+ if (err)
+ return err;
+ }
+ if (gbo->currkey) {
+ err = visit(gbo->currkey, arg);
+ if (err)
+ return err;
+ }
+ if (gbo->currvalue) {
+ err = visit(gbo->currvalue, arg);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+static PyObject *
+groupby_next(groupbyobject *gbo)
+{
+ PyObject *newvalue, *newkey, *r, *grouper;
+
+ /* skip to next iteration group */
+ for (;;) {
+ if (gbo->currkey == NULL)
+ /* pass */;
+ else if (gbo->tgtkey == NULL)
+ break;
+ else {
+ int rcmp;
+
+ rcmp = PyObject_RichCompareBool(gbo->tgtkey,
+ gbo->currkey, Py_EQ);
+ if (rcmp == -1)
+ return NULL;
+ else if (rcmp == 0)
+ break;
+ }
+
+ newvalue = PyIter_Next(gbo->it);
+ if (newvalue == NULL)
+ return NULL;
+
+ if (gbo->keyfunc == Py_None) {
+ newkey = newvalue;
+ Py_INCREF(newvalue);
+ } else {
+ newkey = PyObject_CallFunctionObjArgs(gbo->keyfunc,
+ newvalue, NULL);
+ if (newkey == NULL) {
+ Py_DECREF(newvalue);
+ return NULL;
+ }
+ }
+
+ Py_XDECREF(gbo->currkey);
+ gbo->currkey = newkey;
+ Py_XDECREF(gbo->currvalue);
+ gbo->currvalue = newvalue;
+ }
+
+ Py_XDECREF(gbo->tgtkey);
+ gbo->tgtkey = gbo->currkey;
+ Py_INCREF(gbo->currkey);
+
+ grouper = _grouper_create(gbo, gbo->tgtkey);
+ if (grouper == NULL)
+ return NULL;
+
+ r = PyTuple_Pack(2, gbo->currkey, grouper);
+ Py_DECREF(grouper);
+ return r;
+}
+
+PyDoc_STRVAR(groupby_doc,
+"groupby(iterable[, keyfunc]) -> create an iterator which returns\n\
+(key, sub-iterator) grouped by each value of key(value).\n");
+
+static PyTypeObject groupby_type = {
+ PyObject_HEAD_INIT(NULL)
+ 0, /* ob_size */
+ "itertools.groupby", /* tp_name */
+ sizeof(groupbyobject), /* tp_basicsize */
+ 0, /* tp_itemsize */
+ /* methods */
+ (destructor)groupby_dealloc, /* tp_dealloc */
+ 0, /* tp_print */
+ 0, /* tp_getattr */
+ 0, /* tp_setattr */
+ 0, /* tp_compare */
+ 0, /* tp_repr */
+ 0, /* tp_as_number */
+ 0, /* tp_as_sequence */
+ 0, /* tp_as_mapping */
+ 0, /* tp_hash */
+ 0, /* tp_call */
+ 0, /* tp_str */
+ PyObject_GenericGetAttr, /* tp_getattro */
+ 0, /* tp_setattro */
+ 0, /* tp_as_buffer */
+ Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC |
+ Py_TPFLAGS_BASETYPE, /* tp_flags */
+ groupby_doc, /* tp_doc */
+ (traverseproc)groupby_traverse, /* tp_traverse */
+ 0, /* tp_clear */
+ 0, /* tp_richcompare */
+ 0, /* tp_weaklistoffset */
+ PyObject_SelfIter, /* tp_iter */
+ (iternextfunc)groupby_next, /* tp_iternext */
+ 0, /* tp_methods */
+ 0, /* tp_members */
+ 0, /* tp_getset */
+ 0, /* tp_base */
+ 0, /* tp_dict */
+ 0, /* tp_descr_get */
+ 0, /* tp_descr_set */
+ 0, /* tp_dictoffset */
+ 0, /* tp_init */
+ 0, /* tp_alloc */
+ groupby_new, /* tp_new */
+ PyObject_GC_Del, /* tp_free */
+};
+
+
+/* _grouper object (internal) ************************************************/
+
+typedef struct {
+ PyObject_HEAD
+ PyObject *parent;
+ PyObject *tgtkey;
+} _grouperobject;
+
+static PyTypeObject _grouper_type;
+
+static PyObject *
+_grouper_create(groupbyobject *parent, PyObject *tgtkey)
+{
+ _grouperobject *igo;
+
+ igo = PyObject_New(_grouperobject, &_grouper_type);
+ if (igo == NULL)
+ return NULL;
+ igo->parent = (PyObject *)parent;
+ Py_INCREF(parent);
+ igo->tgtkey = tgtkey;
+ Py_INCREF(tgtkey);
+
+ return (PyObject *)igo;
+}
+
+static void
+_grouper_dealloc(_grouperobject *igo)
+{
+ Py_DECREF(igo->parent);
+ Py_DECREF(igo->tgtkey);
+ PyObject_Del(igo);
+}
+
+static PyObject *
+_grouper_next(_grouperobject *igo)
+{
+ groupbyobject *gbo = (groupbyobject *)igo->parent;
+ PyObject *newvalue, *newkey, *r;
+ int rcmp;
+
+ if (gbo->currvalue == NULL) {
+ newvalue = PyIter_Next(gbo->it);
+ if (newvalue == NULL)
+ return NULL;
+
+ if (gbo->keyfunc == Py_None) {
+ newkey = newvalue;
+ Py_INCREF(newvalue);
+ } else {
+ newkey = PyObject_CallFunctionObjArgs(gbo->keyfunc,
+ newvalue, NULL);
+ if (newkey == NULL) {
+ Py_DECREF(newvalue);
+ return NULL;
+ }
+ }
+
+ assert(gbo->currkey == NULL);
+ gbo->currkey = newkey;
+ gbo->currvalue = newvalue;
+ }
+
+ assert(gbo->currkey != NULL);
+ rcmp = PyObject_RichCompareBool(igo->tgtkey, gbo->currkey, Py_EQ);
+ if (rcmp <= 0)
+ /* got any error or current group is end */
+ return NULL;
+
+ r = gbo->currvalue;
+ gbo->currvalue = NULL;
+ Py_DECREF(gbo->currkey);
+ gbo->currkey = NULL;
+
+ return r;
+}
+
+static PyTypeObject _grouper_type = {
+ PyObject_HEAD_INIT(NULL)
+ 0, /* ob_size */
+ "itertools._grouper", /* tp_name */
+ sizeof(_grouperobject), /* tp_basicsize */
+ 0, /* tp_itemsize */
+ /* methods */
+ (destructor)_grouper_dealloc, /* tp_dealloc */
+ 0, /* tp_print */
+ 0, /* tp_getattr */
+ 0, /* tp_setattr */
+ 0, /* tp_compare */
+ 0, /* tp_repr */
+ 0, /* tp_as_number */
+ 0, /* tp_as_sequence */
+ 0, /* tp_as_mapping */
+ 0, /* tp_hash */
+ 0, /* tp_call */
+ 0, /* tp_str */
+ PyObject_GenericGetAttr, /* tp_getattro */
+ 0, /* tp_setattro */
+ 0, /* tp_as_buffer */
+ Py_TPFLAGS_DEFAULT, /* tp_flags */
+ 0, /* tp_doc */
+ 0, /* tp_traverse */
+ 0, /* tp_clear */
+ 0, /* tp_richcompare */
+ 0, /* tp_weaklistoffset */
+ PyObject_SelfIter, /* tp_iter */
+ (iternextfunc)_grouper_next, /* tp_iternext */
+ 0, /* tp_methods */
+ 0, /* tp_members */
+ 0, /* tp_getset */
+ 0, /* tp_base */
+ 0, /* tp_dict */
+ 0, /* tp_descr_get */
+ 0, /* tp_descr_set */
+ 0, /* tp_dictoffset */
+ 0, /* tp_init */
+ 0, /* tp_alloc */
+ 0, /* tp_new */
+ PyObject_Del, /* tp_free */
+};
+
+
+
/* tee object and with supporting function and objects ***************/
/* The teedataobject pre-allocates space for LINKCELLS number of objects.
@@ -2103,6 +2420,7 @@ tee(it, n=2) --> (it1, it2 , ... itn) splits one iterator into n\n\
chain(p, q, ...) --> p0, p1, ... plast, q0, q1, ... \n\
takewhile(pred, seq) --> seq[0], seq[1], until pred fails\n\
dropwhile(pred, seq) --> seq[n], seq[n+1], starting when pred fails\n\
+groupby(iterable[, keyfunc]) --> sub-iterators grouped by value of keyfunc(v)\n\
");
@@ -2130,6 +2448,7 @@ inititertools(void)
&count_type,
&izip_type,
&repeat_type,
+ &groupby_type,
NULL
};
@@ -2148,5 +2467,6 @@ inititertools(void)
return;
if (PyType_Ready(&tee_type) < 0)
return;
-
+ if (PyType_Ready(&_grouper_type) < 0)
+ return;
}