numpy/lib/arraysetops.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206

"""
Set operations for 1D numeric arrays based on sort() function.

Contains:
  ediff1d,
  unique1d,
  intersect1d,
  intersect1d_nu,
  setxor1d,
  setmember1d,
  union1d,
  setdiff1d

Concerning the speed, test_unique1d_speed() reveals that up to 10000000
elements unique1d() is about 10 times faster than the standard dictionary-based
numpy.unique().

Limitations: Except unique1d, union1d and intersect1d_nu, all functions expect
inputs with unique elements. Speed could be gained in some operations by an
implementation of sort() that directly provides the permutation vectors, thus
avoiding calls to argsort().

To do: Optionally return indices analogously to unique1d for all functions.

Author: Robert Cimrman
"""
# Names exported by "from <module> import *".
# NOTE(review): ediff1d is listed in the module docstring but is not exported
# here -- confirm whether that omission is intentional.
__all__ = ['unique1d', 'intersect1d', 'intersect1d_nu', 'setxor1d',
           'setmember1d', 'union1d', 'setdiff1d']


# 02.11.2005, c
import time
import numpy

##
# 03.11.2005, c
def ediff1d(ar1, to_end=None, to_begin=None):
    """Return the differences between consecutive elements of ar1.

    Optionally a value is put in front of (to_begin) and/or behind (to_end)
    the differences.  Whenever such a value is attached, the result is a new
    array with the dtype of ar1.

    Note: the optional values are tested for truthiness, so a falsy value
    such as 0 is handled exactly like the default None ("not given").
    setxor1d relies on this when it calls ediff1d( flag, 0 ).
    """
    deltas = ar1[1:] - ar1[:-1]

    add_tail = bool(to_end)
    add_head = bool(to_begin)
    if not (add_tail or add_head):
        return deltas

    # One slot per difference plus one slot per attached value.
    out_len = ar1.shape[0] - 1 + add_head + add_tail
    out = numpy.empty((out_len,) + ar1.shape[1:], dtype=ar1.dtype)
    if add_head:
        out[0] = to_begin
    if add_tail:
        out[-1] = to_end
    out[int(add_head):out_len - int(add_tail)] = deltas
    return out


##
# 01.11.2005, c
# 02.11.2005
def unique1d(ar1, retindx=False):
    """Return the sorted unique elements of ar1, treated as a flat array.

    Parameters
    ----------
    ar1 : array_like
        Input values.  A flattened copy is sorted; the input itself is never
        modified.  Empty input is allowed.
    retindx : bool, optional
        When True, also return indices into the flattened input.

    Returns
    -------
    unique : ndarray
        The sorted unique values (returned alone when retindx is False).
    (indx, unique) : tuple of ndarray
        Returned when retindx is True.  indx selects one occurrence of each
        unique value, so numpy.array(ar1).flat[indx] equals unique.
    """
    ar = numpy.array(ar1).flatten()
    if retindx:
        perm = ar.argsort()
        aux = ar.take(perm)
    else:
        ar.sort()
        aux = ar

    # Keep the last element of every run of equal values.  Neighbours are
    # compared directly instead of testing their difference against zero, so
    # that empty and boolean arrays work and repeated infinities collapse
    # (inf - inf is nan, which never compares equal to 0).
    flag = numpy.ones(aux.shape, dtype=bool)
    flag[:-1] = aux[1:] != aux[:-1]

    if retindx:
        return perm.compress(flag), aux.compress(flag)
    return aux.compress(flag)

##
# 01.11.2005, c
def intersect1d(ar1, ar2):
    """Return the sorted intersection of two 1D arrays with unique elements.

    Each input must not contain repeated values; a value is reported when it
    occurs in both inputs.  Returns a new 1D array.
    """
    aux = numpy.concatenate((ar1, ar2))
    aux.sort()
    # After sorting, a value present in both inputs sits next to its twin.
    # Compare neighbours directly rather than via subtraction so that
    # infinities and boolean arrays are handled too.
    return aux[:-1][aux[1:] == aux[:-1]]

##
# 01.11.2005, c
def intersect1d_nu(ar1, ar2):
    """Return the sorted intersection of two 1D arrays with any elements.

    Unlike intersect1d, the inputs may contain repeated values; each input
    is reduced with unique1d before the two are compared.
    """
    # Might be faster than unique1d( intersect1d( ar1, ar2 ) )?
    aux = numpy.concatenate((unique1d(ar1), unique1d(ar2)))
    aux.sort()
    # Equal neighbours mark values present in both inputs.  Compare them
    # directly rather than via subtraction so that infinities and boolean
    # arrays are handled too.
    return aux[:-1][aux[1:] == aux[:-1]]

##
# 01.11.2005, c
def setxor1d(ar1, ar2):
    """Return the sorted set exclusive-or of two 1D arrays with unique elements.

    The result holds the values that occur in exactly one of the inputs.
    Each input must not contain repeated values.
    """
    aux = numpy.concatenate((ar1, ar2))
    aux.sort()
    if aux.size == 0:
        return aux

    # changed[i] tells whether aux[i] differs from aux[i-1]; both ends count
    # as a change.  Neighbours are compared directly: subtracting boolean
    # arrays is not supported, and inf - inf is nan.
    changed = numpy.concatenate(([True], aux[1:] != aux[:-1], [True]))
    # An element is kept when the flags on both of its sides agree, i.e. it
    # differs from its predecessor and from its successor.
    keep = changed[1:] == changed[:-1]
    return aux.compress(keep)

##
# 03.11.2005, c
# 05.01.2006
def setmember1d(ar1, ar2):
    """Test each element of ar1 for membership in ar2.

    Parameters
    ----------
    ar1, ar2 : array_like
        1D inputs, each expected to hold unique elements.  Either may be
        empty.

    Returns
    -------
    ndarray of bool
        1D array of length len(ar1); True where the corresponding element of
        ar1 also occurs in ar2, False otherwise.
    """
    ar1 = numpy.asarray(ar1)
    ar2 = numpy.asarray(ar2)
    ar = numpy.concatenate((ar1, ar2))

    # A stable sort keeps an element of ar1 in front of an equal element of
    # ar2, so a member of ar2 is always directly followed by its twin.
    order = ar.argsort(kind='mergesort')
    aux = ar.take(order)

    # equal_next[i] is True when aux[i] == aux[i+1]; the last entry has no
    # successor and stays False.  Neighbours are compared directly rather
    # than via subtraction (inf - inf is nan; booleans cannot be subtracted).
    equal_next = numpy.zeros(aux.shape, dtype=bool)
    equal_next[:-1] = aux[1:] == aux[:-1]

    # Scatter the flags back to the original (unsorted) positions.
    member = numpy.empty(ar.shape, dtype=bool)
    member[order] = equal_next
    return member[:len(ar1)].copy()

##
# 03.11.2005, c
def union1d(ar1, ar2):
    """Return the sorted union of two 1D arrays.

    The inputs are joined and passed through unique1d, so values that occur
    more than once collapse to a single entry.
    """
    joined = numpy.concatenate((ar1, ar2))
    return unique1d(joined)

##
# 03.11.2005, c
def setdiff1d(ar1, ar2):
    """Return the elements of ar1 that are not in ar2.

    Both inputs are 1D and expected to hold unique elements.  The result
    keeps the order in which the elements appear in ar1.
    """
    # Accept lists and tuples as the other set functions do; compress() below
    # needs an ndarray.
    ar1 = numpy.asarray(ar1)
    aux = setmember1d(ar1, ar2)
    return ar1.compress(aux == 0)

##
# 02.11.2005, c
def test_unique1d_speed(plot_results=False):
    """Benchmark unique1d() against numpy.unique() on random data.

    For a range of sample sizes the timings of both functions are printed
    together with their ratio, and the two results are asserted to be equal.
    When plot_results is True the timings are also plotted with pylab
    (imported only in that case).
    """
    # time.clock() was removed in Python 3.8; use perf_counter() if available.
    timer = getattr(time, 'perf_counter', None)
    if timer is None:
        timer = time.clock

    # Use numpy.linspace( 2, 7, 9 ) for a longer run.
    exponents = numpy.linspace(2, 6, 9)
    ratios = []
    nItems = []
    dt1s = []
    dt2s = []
    for ii in exponents:

        # The sample size has to be an integer for numpy.random.random().
        nItem = int(10 ** ii)
        print('using %d items:' % nItem)
        a = numpy.fix(nItem / 10.0 * numpy.random.random(nItem))

        print('dictionary:')
        tt = timer()
        b = numpy.unique(a)
        dt1 = timer() - tt
        print(dt1)

        print('array:')
        tt = timer()
        c = unique1d(a)
        dt2 = timer() - tt
        print(dt2)

        # Avoid dividing by a reference time that is too small to measure.
        if dt1 < 1e-8:
            ratio = 'ND'
        else:
            ratio = dt2 / dt1
        print('ratio: %s' % ratio)
        print('nUnique: %d == %d\n' % (len(b), len(c)))

        nItems.append(nItem)
        ratios.append(ratio)
        dt1s.append(dt1)
        dt2s.append(dt2)

        assert numpy.all(b == c)

    print(nItems)
    print(dt1s)
    print(dt2s)
    print(ratios)

    if plot_results:
        import pylab

        def plotMe(fig, fun, nItems, dt1s, dt2s):
            # Draw both timing curves into figure number fig using fun.
            pylab.figure(fig)
            fun(nItems, dt1s, 'g-o', linewidth=2, markersize=8)
            fun(nItems, dt2s, 'b-x', linewidth=2, markersize=8)
            pylab.legend(('dictionary', 'array'))
            pylab.xlabel('nItem')
            pylab.ylabel('time [s]')

        plotMe(1, pylab.loglog, nItems, dt1s, dt2s)
        plotMe(2, pylab.plot, nItems, dt1s, dt2s)
        pylab.show()

# Run the benchmark, including the plots, when executed as a script.
if __name__ == '__main__':
    test_unique1d_speed(plot_results=True)