15 files changed, 601 insertions, 254 deletions
diff --git a/numpy/lib/arraypad.py b/numpy/lib/arraypad.py
index 600301c56..e9ca9de4d 100644
--- a/numpy/lib/arraypad.py
+++ b/numpy/lib/arraypad.py
@@ -74,6 +74,35 @@ def _round_ifneeded(arr, dtype):
         arr.round(out=arr)
 
 
+def _slice_at_axis(shape, sl, axis):
+    """
+    Construct a slice tuple the length of shape, with sl at the specified axis
+    """
+    slice_tup = (slice(None),)
+    return slice_tup * axis + (sl,) + slice_tup * (len(shape) - axis - 1)
+
+
+def _slice_first(shape, n, axis):
+    """ Construct a slice tuple to take the first n elements along axis """
+    return _slice_at_axis(shape, slice(0, n), axis=axis)
+
+
+def _slice_last(shape, n, axis):
+    """ Construct a slice tuple to take the last n elements along axis """
+    dim = shape[axis]  # doing this explicitly makes n=0 work
+    return _slice_at_axis(shape, slice(dim - n, dim), axis=axis)
+
+
+def _do_prepend(arr, pad_chunk, axis):
+    return np.concatenate(
+        (pad_chunk.astype(arr.dtype, copy=False), arr), axis=axis)
+
+
+def _do_append(arr, pad_chunk, axis):
+    return np.concatenate(
+        (arr, pad_chunk.astype(arr.dtype, copy=False)), axis=axis)
+
+
 def _prepend_const(arr, pad_amt, val, axis=-1):
     """
     Prepend constant `val` along `axis` of `arr`.
@@ -100,8 +129,7 @@ def _prepend_const(arr, pad_amt, val, axis=-1):
         return arr
     padshape = tuple(x if i != axis else pad_amt
                      for (i, x) in enumerate(arr.shape))
-    return np.concatenate((np.full(padshape, val, dtype=arr.dtype), arr),
-                          axis=axis)
+    return _do_prepend(arr, np.full(padshape, val, dtype=arr.dtype), axis)
 
 
 def _append_const(arr, pad_amt, val, axis=-1):
@@ -130,8 +158,8 @@ def _append_const(arr, pad_amt, val, axis=-1):
         return arr
     padshape = tuple(x if i != axis else pad_amt
                      for (i, x) in enumerate(arr.shape))
-    return np.concatenate((arr, np.full(padshape, val, dtype=arr.dtype)),
-                          axis=axis)
+    return _do_append(arr, np.full(padshape, val, dtype=arr.dtype), axis)
+
 
 
 def _prepend_edge(arr, pad_amt, axis=-1):
@@ -156,11 +184,9 @@ def _prepend_edge(arr, pad_amt, axis=-1):
     if pad_amt == 0:
         return arr
 
-    edge_slice = tuple(slice(None) if i != axis else slice(0, 1)
-                       for (i, x) in enumerate(arr.shape))
+    edge_slice = _slice_first(arr.shape, 1, axis=axis)
     edge_arr = arr[edge_slice]
-    return np.concatenate((edge_arr.repeat(pad_amt, axis=axis), arr),
-                          axis=axis)
+    return _do_prepend(arr, edge_arr.repeat(pad_amt, axis=axis), axis)
 
 
 def _append_edge(arr, pad_amt, axis=-1):
@@ -186,11 +212,9 @@ def _append_edge(arr, pad_amt, axis=-1):
     if pad_amt == 0:
         return arr
 
-    edge_slice = tuple(slice(None) if i != axis else slice(x - 1, x)
-                       for (i, x) in enumerate(arr.shape))
+    edge_slice = _slice_last(arr.shape, 1, axis=axis)
     edge_arr = arr[edge_slice]
-    return np.concatenate((arr, edge_arr.repeat(pad_amt, axis=axis)),
-                          axis=axis)
+    return _do_append(arr, edge_arr.repeat(pad_amt, axis=axis), axis)
 
 
 def _prepend_ramp(arr, pad_amt, end, axis=-1):
@@ -228,8 +252,7 @@ def _prepend_ramp(arr, pad_amt, end, axis=-1):
                                reverse=True).astype(np.float64)
 
     # Appropriate slicing to extract n-dimensional edge along `axis`
-    edge_slice = tuple(slice(None) if i != axis else slice(0, 1)
-                       for (i, x) in enumerate(arr.shape))
+    edge_slice = _slice_first(arr.shape, 1, axis=axis)
 
     # Extract edge, and extend along `axis`
     edge_pad = arr[edge_slice].repeat(pad_amt, axis)
@@ -241,7 +264,7 @@ def _prepend_ramp(arr, pad_amt, end, axis=-1):
     _round_ifneeded(ramp_arr, arr.dtype)
 
     # Ramp values will most likely be float, cast them to the same type as arr
-    return np.concatenate((ramp_arr.astype(arr.dtype), arr), axis=axis)
+    return _do_prepend(arr, ramp_arr, axis)
 
 
 def _append_ramp(arr, pad_amt, end, axis=-1):
@@ -279,8 +302,7 @@ def _append_ramp(arr, pad_amt, end, axis=-1):
                                reverse=False).astype(np.float64)
 
     # Slice a chunk from the edge to calculate stats on
-    edge_slice = tuple(slice(None) if i != axis else slice(x - 1, x)
-                       for (i, x) in enumerate(arr.shape))
+    edge_slice = _slice_last(arr.shape, 1, axis=axis)
 
     # Extract edge, and extend along `axis`
     edge_pad = arr[edge_slice].repeat(pad_amt, axis)
@@ -292,7 +314,7 @@ def _append_ramp(arr, pad_amt, end, axis=-1):
     _round_ifneeded(ramp_arr, arr.dtype)
 
     # Ramp values will most likely be float, cast them to the same type as arr
-    return np.concatenate((arr, ramp_arr.astype(arr.dtype)), axis=axis)
+    return _do_append(arr, ramp_arr, axis)
 
 
 def _prepend_max(arr, pad_amt, num, axis=-1):
@@ -332,15 +354,13 @@ def _prepend_max(arr, pad_amt, num, axis=-1):
             num = None
 
     # Slice a chunk from the edge to calculate stats on
-    max_slice = tuple(slice(None) if i != axis else slice(num)
-                      for (i, x) in enumerate(arr.shape))
+    max_slice = _slice_first(arr.shape, num, axis=axis)
 
     # Extract slice, calculate max
     max_chunk = arr[max_slice].max(axis=axis, keepdims=True)
 
     # Concatenate `arr` with `max_chunk`, extended along `axis` by `pad_amt`
-    return np.concatenate((max_chunk.repeat(pad_amt, axis=axis), arr),
-                          axis=axis)
+    return _do_prepend(arr, max_chunk.repeat(pad_amt, axis=axis), axis)
 
 
 def _append_max(arr, pad_amt, num, axis=-1):
@@ -379,11 +399,8 @@ def _append_max(arr, pad_amt, num, axis=-1):
             num = None
 
     # Slice a chunk from the edge to calculate stats on
-    end = arr.shape[axis] - 1
     if num is not None:
-        max_slice = tuple(
-            slice(None) if i != axis else slice(end, end - num, -1)
-            for (i, x) in enumerate(arr.shape))
+        max_slice = _slice_last(arr.shape, num, axis=axis)
     else:
         max_slice = tuple(slice(None) for x in arr.shape)
 
@@ -391,8 +408,7 @@ def _append_max(arr, pad_amt, num, axis=-1):
     max_chunk = arr[max_slice].max(axis=axis, keepdims=True)
 
     # Concatenate `arr` with `max_chunk`, extended along `axis` by `pad_amt`
-    return np.concatenate((arr, max_chunk.repeat(pad_amt, axis=axis)),
-                          axis=axis)
+    return _do_append(arr, max_chunk.repeat(pad_amt, axis=axis), axis)
 
 
 def _prepend_mean(arr, pad_amt, num, axis=-1):
@@ -431,16 +447,14 @@ def _prepend_mean(arr, pad_amt, num, axis=-1):
             num = None
 
     # Slice a chunk from the edge to calculate stats on
-    mean_slice = tuple(slice(None) if i != axis else slice(num)
-                       for (i, x) in enumerate(arr.shape))
+    mean_slice = _slice_first(arr.shape, num, axis=axis)
 
     # Extract slice, calculate mean
     mean_chunk = arr[mean_slice].mean(axis, keepdims=True)
     _round_ifneeded(mean_chunk, arr.dtype)
 
     # Concatenate `arr` with `mean_chunk`, extended along `axis` by `pad_amt`
-    return np.concatenate((mean_chunk.repeat(pad_amt, axis).astype(arr.dtype),
-                           arr), axis=axis)
+    return _do_prepend(arr, mean_chunk.repeat(pad_amt, axis), axis=axis)
 
 
 def _append_mean(arr, pad_amt, num, axis=-1):
@@ -479,11 +493,8 @@ def _append_mean(arr, pad_amt, num, axis=-1):
             num = None
 
     # Slice a chunk from the edge to calculate stats on
-    end = arr.shape[axis] - 1
     if num is not None:
-        mean_slice = tuple(
-            slice(None) if i != axis else slice(end, end - num, -1)
-            for (i, x) in enumerate(arr.shape))
+        mean_slice = _slice_last(arr.shape, num, axis=axis)
     else:
         mean_slice = tuple(slice(None) for x in arr.shape)
 
@@ -492,8 +503,7 @@ def _append_mean(arr, pad_amt, num, axis=-1):
     _round_ifneeded(mean_chunk, arr.dtype)
 
     # Concatenate `arr` with `mean_chunk`, extended along `axis` by `pad_amt`
-    return np.concatenate(
-        (arr, mean_chunk.repeat(pad_amt, axis).astype(arr.dtype)), axis=axis)
+    return _do_append(arr, mean_chunk.repeat(pad_amt, axis), axis=axis)
 
 
 def _prepend_med(arr, pad_amt, num, axis=-1):
@@ -532,16 +542,14 @@ def _prepend_med(arr, pad_amt, num, axis=-1):
             num = None
 
     # Slice a chunk from the edge to calculate stats on
-    med_slice = tuple(slice(None) if i != axis else slice(num)
-                      for (i, x) in enumerate(arr.shape))
+    med_slice = _slice_first(arr.shape, num, axis=axis)
 
     # Extract slice, calculate median
     med_chunk = np.median(arr[med_slice], axis=axis, keepdims=True)
     _round_ifneeded(med_chunk, arr.dtype)
 
     # Concatenate `arr` with `med_chunk`, extended along `axis` by `pad_amt`
-    return np.concatenate(
-        (med_chunk.repeat(pad_amt, axis).astype(arr.dtype), arr), axis=axis)
+    return _do_prepend(arr, med_chunk.repeat(pad_amt, axis), axis=axis)
 
 
 def _append_med(arr, pad_amt, num, axis=-1):
@@ -580,11 +588,8 @@ def _append_med(arr, pad_amt, num, axis=-1):
             num = None
 
     # Slice a chunk from the edge to calculate stats on
-    end = arr.shape[axis] - 1
     if num is not None:
-        med_slice = tuple(
-            slice(None) if i != axis else slice(end, end - num, -1)
-            for (i, x) in enumerate(arr.shape))
+        med_slice = _slice_last(arr.shape, num, axis=axis)
     else:
         med_slice = tuple(slice(None) for x in arr.shape)
 
@@ -593,8 +598,7 @@ def _append_med(arr, pad_amt, num, axis=-1):
     _round_ifneeded(med_chunk, arr.dtype)
 
     # Concatenate `arr` with `med_chunk`, extended along `axis` by `pad_amt`
-    return np.concatenate(
-        (arr, med_chunk.repeat(pad_amt, axis).astype(arr.dtype)), axis=axis)
+    return _do_append(arr, med_chunk.repeat(pad_amt, axis), axis=axis)
 
 
 def _prepend_min(arr, pad_amt, num, axis=-1):
@@ -634,15 +638,13 @@ def _prepend_min(arr, pad_amt, num, axis=-1):
             num = None
 
     # Slice a chunk from the edge to calculate stats on
-    min_slice = tuple(slice(None) if i != axis else slice(num)
-                      for (i, x) in enumerate(arr.shape))
+    min_slice = _slice_first(arr.shape, num, axis=axis)
 
     # Extract slice, calculate min
     min_chunk = arr[min_slice].min(axis=axis, keepdims=True)
 
     # Concatenate `arr` with `min_chunk`, extended along `axis` by `pad_amt`
-    return np.concatenate((min_chunk.repeat(pad_amt, axis=axis), arr),
-                          axis=axis)
+    return _do_prepend(arr, min_chunk.repeat(pad_amt, axis), axis=axis)
 
 
 def _append_min(arr, pad_amt, num, axis=-1):
@@ -681,11 +683,8 @@ def _append_min(arr, pad_amt, num, axis=-1):
             num = None
 
     # Slice a chunk from the edge to calculate stats on
-    end = arr.shape[axis] - 1
     if num is not None:
-        min_slice = tuple(
-            slice(None) if i != axis else slice(end, end - num, -1)
-            for (i, x) in enumerate(arr.shape))
+        min_slice = _slice_last(arr.shape, num, axis=axis)
     else:
         min_slice = tuple(slice(None) for x in arr.shape)
 
@@ -693,8 +692,7 @@ def _append_min(arr, pad_amt, num, axis=-1):
     min_chunk = arr[min_slice].min(axis=axis, keepdims=True)
 
     # Concatenate `arr` with `min_chunk`, extended along `axis` by `pad_amt`
-    return np.concatenate((arr, min_chunk.repeat(pad_amt, axis=axis)),
-                          axis=axis)
+    return _do_append(arr, min_chunk.repeat(pad_amt, axis), axis=axis)
 
 
 def _pad_ref(arr, pad_amt, method, axis=-1):
@@ -737,15 +735,13 @@ def _pad_ref(arr, pad_amt, method, axis=-1):
     # Prepended region
 
     # Slice off a reverse indexed chunk from near edge to pad `arr` before
-    ref_slice = tuple(slice(None) if i != axis else slice(pad_amt[0], 0, -1)
-                      for (i, x) in enumerate(arr.shape))
+    ref_slice = _slice_at_axis(arr.shape, slice(pad_amt[0], 0, -1), axis=axis)
 
     ref_chunk1 = arr[ref_slice]
 
     # Memory/computationally more expensive, only do this if `method='odd'`
     if 'odd' in method and pad_amt[0] > 0:
-        edge_slice1 = tuple(slice(None) if i != axis else slice(0, 1)
-                            for (i, x) in enumerate(arr.shape))
+        edge_slice1 = _slice_first(arr.shape, 1, axis=axis)
         edge_chunk = arr[edge_slice1]
         ref_chunk1 = 2 * edge_chunk - ref_chunk1
         del edge_chunk
@@ -756,15 +752,12 @@ def _pad_ref(arr, pad_amt, method, axis=-1):
     # Slice off a reverse indexed chunk from far edge to pad `arr` after
     start = arr.shape[axis] - pad_amt[1] - 1
     end = arr.shape[axis] - 1
-    ref_slice = tuple(slice(None) if i != axis else slice(start, end)
-                      for (i, x) in enumerate(arr.shape))
-    rev_idx = tuple(slice(None) if i != axis else slice(None, None, -1)
-                    for (i, x) in enumerate(arr.shape))
+    ref_slice = _slice_at_axis(arr.shape, slice(start, end), axis=axis)
+    rev_idx = _slice_at_axis(arr.shape, slice(None, None, -1), axis=axis)
     ref_chunk2 = arr[ref_slice][rev_idx]
 
     if 'odd' in method:
-        edge_slice2 = tuple(slice(None) if i != axis else slice(x - 1, x)
-                            for (i, x) in enumerate(arr.shape))
+        edge_slice2 = _slice_last(arr.shape, 1, axis=axis)
         edge_chunk = arr[edge_slice2]
         ref_chunk2 = 2 * edge_chunk - ref_chunk2
         del edge_chunk
@@ -813,16 +806,13 @@ def _pad_sym(arr, pad_amt, method, axis=-1):
     # Prepended region
 
     # Slice off a reverse indexed chunk from near edge to pad `arr` before
-    sym_slice = tuple(slice(None) if i != axis else slice(0, pad_amt[0])
-                      for (i, x) in enumerate(arr.shape))
-    rev_idx = tuple(slice(None) if i != axis else slice(None, None, -1)
-                    for (i, x) in enumerate(arr.shape))
+    sym_slice = _slice_first(arr.shape, pad_amt[0], axis=axis)
+    rev_idx = _slice_at_axis(arr.shape, slice(None, None, -1), axis=axis)
     sym_chunk1 = arr[sym_slice][rev_idx]
 
     # Memory/computationally more expensive, only do this if `method='odd'`
     if 'odd' in method and pad_amt[0] > 0:
-        edge_slice1 = tuple(slice(None) if i != axis else slice(0, 1)
-                            for (i, x) in enumerate(arr.shape))
+        edge_slice1 = _slice_first(arr.shape, 1, axis=axis)
         edge_chunk = arr[edge_slice1]
         sym_chunk1 = 2 * edge_chunk - sym_chunk1
         del edge_chunk
@@ -831,15 +821,11 @@ def _pad_sym(arr, pad_amt, method, axis=-1):
     # Appended region
 
     # Slice off a reverse indexed chunk from far edge to pad `arr` after
-    start = arr.shape[axis] - pad_amt[1]
-    end = arr.shape[axis]
-    sym_slice = tuple(slice(None) if i != axis else slice(start, end)
-                      for (i, x) in enumerate(arr.shape))
+    sym_slice = _slice_last(arr.shape, pad_amt[1], axis=axis)
     sym_chunk2 = arr[sym_slice][rev_idx]
 
     if 'odd' in method:
-        edge_slice2 = tuple(slice(None) if i != axis else slice(x - 1, x)
-                            for (i, x) in enumerate(arr.shape))
+        edge_slice2 = _slice_last(arr.shape, 1, axis=axis)
         edge_chunk = arr[edge_slice2]
         sym_chunk2 = 2 * edge_chunk - sym_chunk2
         del edge_chunk
@@ -885,18 +871,14 @@ def _pad_wrap(arr, pad_amt, axis=-1):
     # Prepended region
 
     # Slice off a reverse indexed chunk from near edge to pad `arr` before
-    start = arr.shape[axis] - pad_amt[0]
-    end = arr.shape[axis]
-    wrap_slice = tuple(slice(None) if i != axis else slice(start, end)
-                       for (i, x) in enumerate(arr.shape))
+    wrap_slice = _slice_last(arr.shape, pad_amt[0], axis=axis)
     wrap_chunk1 = arr[wrap_slice]
 
     ##########################################################################
     # Appended region
 
     # Slice off a reverse indexed chunk from far edge to pad `arr` after
-    wrap_slice = tuple(slice(None) if i != axis else slice(0, pad_amt[1])
-                       for (i, x) in enumerate(arr.shape))
+    wrap_slice = _slice_first(arr.shape, pad_amt[1], axis=axis)
     wrap_chunk2 = arr[wrap_slice]
 
     # Concatenate `arr` with both chunks, extending along `axis`
diff --git a/numpy/lib/arraysetops.py b/numpy/lib/arraysetops.py
index e8eda297f..4d3f35183 100644
--- a/numpy/lib/arraysetops.py
+++ b/numpy/lib/arraysetops.py
@@ -298,7 +298,7 @@ def _unique1d(ar, return_index=False, return_inverse=False,
     return ret
 
 
-def intersect1d(ar1, ar2, assume_unique=False):
+def intersect1d(ar1, ar2, assume_unique=False, return_indices=False):
     """
     Find the intersection of two arrays.
 
@@ -307,15 +307,28 @@ def intersect1d(ar1, ar2, assume_unique=False):
     Parameters
     ----------
     ar1, ar2 : array_like
-        Input arrays.
+        Input arrays. Will be flattened if not already 1D.
     assume_unique : bool
         If True, the input arrays are both assumed to be unique, which
         can speed up the calculation.  Default is False.
-
+    return_indices : bool
+        If True, the indices which correspond to the intersection of the
+        two arrays are returned. The first instance of a value is used
+        if there are multiple. Default is False.
+
+        .. versionadded:: 1.15.0
+
     Returns
     -------
     intersect1d : ndarray
         Sorted 1D array of common and unique elements.
+    comm1 : ndarray
+        The indices of the first occurrences of the common values in `ar1`.
+        Only provided if `return_indices` is True.
+    comm2 : ndarray
+        The indices of the first occurrences of the common values in `ar2`. 
+        Only provided if `return_indices` is True.
+
 
     See Also
     --------
@@ -332,14 +345,49 @@ def intersect1d(ar1, ar2, assume_unique=False):
     >>> from functools import reduce
     >>> reduce(np.intersect1d, ([1, 3, 4, 3], [3, 1, 2, 1], [6, 3, 4, 2]))
     array([3])
+
+    To return the indices of the values common to the input arrays
+    along with the intersected values:
+
+    >>> x = np.array([1, 1, 2, 3, 4])
+    >>> y = np.array([2, 1, 4, 6])
+    >>> xy, x_ind, y_ind = np.intersect1d(x, y, return_indices=True)
+    >>> x_ind, y_ind
+    (array([0, 2, 4]), array([1, 0, 2]))
+    >>> xy, x[x_ind], y[y_ind]
+    (array([1, 2, 4]), array([1, 2, 4]), array([1, 2, 4]))
     """
     if not assume_unique:
-        # Might be faster than unique( intersect1d( ar1, ar2 ) )?
-        ar1 = unique(ar1)
-        ar2 = unique(ar2)
+        if return_indices:
+            ar1, ind1 = unique(ar1, return_index=True)
+            ar2, ind2 = unique(ar2, return_index=True)
+        else:
+            ar1 = unique(ar1)
+            ar2 = unique(ar2)
+    else:
+        ar1 = np.asanyarray(ar1).ravel()  # lists/tuples have no .ravel()
+        ar2 = np.asanyarray(ar2).ravel()
+
     aux = np.concatenate((ar1, ar2))
-    aux.sort()
-    return aux[:-1][aux[1:] == aux[:-1]]
+    if return_indices:
+        aux_sort_indices = np.argsort(aux, kind='mergesort')
+        aux = aux[aux_sort_indices]
+    else:
+        aux.sort()
+
+    mask = aux[1:] == aux[:-1]
+    int1d = aux[:-1][mask]
+
+    if return_indices:
+        ar1_indices = aux_sort_indices[:-1][mask]
+        ar2_indices = aux_sort_indices[1:][mask] - ar1.size
+        if not assume_unique:
+            ar1_indices = ind1[ar1_indices]
+            ar2_indices = ind2[ar2_indices]
+
+        return int1d, ar1_indices, ar2_indices
+    else:
+        return int1d
 
 def setxor1d(ar1, ar2, assume_unique=False):
     """
@@ -660,3 +708,4 @@ def setdiff1d(ar1, ar2, assume_unique=False):
         ar1 = unique(ar1)
         ar2 = unique(ar2)
     return ar1[in1d(ar1, ar2, assume_unique=True, invert=True)]
+
diff --git a/numpy/lib/format.py b/numpy/lib/format.py
index a0ec55f01..23eac7e7d 100644
--- a/numpy/lib/format.py
+++ b/numpy/lib/format.py
@@ -1,9 +1,8 @@
 """
+Binary serialization
 
-.. _binary-serialization:
-
-Binary Serialization
-====================
+NPY format
+==========
 
 A simple format for saving numpy arrays to disk with the full
 information about them.
diff --git a/numpy/lib/histograms.py b/numpy/lib/histograms.py
index 90e19769e..2922b3a86 100644
--- a/numpy/lib/histograms.py
+++ b/numpy/lib/histograms.py
@@ -877,12 +877,6 @@ def histogramdd(sample, bins=10, range=None, normed=False, weights=None):
         # bins is an integer
         bins = D*[bins]
 
-    # avoid rounding issues for comparisons when dealing with inexact types
-    if np.issubdtype(sample.dtype, np.inexact):
-        edge_dt = sample.dtype
-    else:
-        edge_dt = float
-
     # normalize the range argument
     if range is None:
         range = (None,) * D
@@ -896,13 +890,12 @@ def histogramdd(sample, bins=10, range=None, normed=False, weights=None):
                 raise ValueError(
                     '`bins[{}]` must be positive, when an integer'.format(i))
             smin, smax = _get_outer_edges(sample[:,i], range[i])
-            edges[i] = np.linspace(smin, smax, bins[i] + 1, dtype=edge_dt)
+            edges[i] = np.linspace(smin, smax, bins[i] + 1)
         elif np.ndim(bins[i]) == 1:
-            edges[i] = np.asarray(bins[i], edge_dt)
-            # not just monotonic, due to the use of mindiff below
-            if np.any(edges[i][:-1] >= edges[i][1:]):
+            edges[i] = np.asarray(bins[i])
+            if np.any(edges[i][:-1] > edges[i][1:]):
                 raise ValueError(
-                    '`bins[{}]` must be strictly increasing, when an array'
+                    '`bins[{}]` must be monotonically increasing, when an array'
                     .format(i))
         else:
             raise ValueError(
@@ -913,7 +906,8 @@ def histogramdd(sample, bins=10, range=None, normed=False, weights=None):
 
     # Compute the bin number each sample falls into.
     Ncount = tuple(
-        np.digitize(sample[:, i], edges[i])
+        # avoid np.digitize to work around gh-11022
+        np.searchsorted(edges[i], sample[:, i], side='right')
         for i in _range(D)
     )
 
@@ -921,16 +915,10 @@ def histogramdd(sample, bins=10, range=None, normed=False, weights=None):
     # For the rightmost bin, we want values equal to the right edge to be
     # counted in the last bin, and not as an outlier.
     for i in _range(D):
-        # Rounding precision
-        mindiff = dedges[i].min()
-        if not np.isinf(mindiff):
-            decimal = int(-np.log10(mindiff)) + 6
-            # Find which points are on the rightmost edge.
-            not_smaller_than_edge = (sample[:, i] >= edges[i][-1])
-            on_edge = (np.around(sample[:, i], decimal) ==
-                       np.around(edges[i][-1], decimal))
-            # Shift these points one bin to the left.
-            Ncount[i][on_edge & not_smaller_than_edge] -= 1
+        # Find which points are on the rightmost edge.
+        on_edge = (sample[:, i] == edges[i][-1])
+        # Shift these points one bin to the left.
+        Ncount[i][on_edge] -= 1
 
     # Compute the sample indices in the flattened histogram matrix.
     # This raises an error if the array is too large.
diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py
index 95804be7f..b109d65e1 100644
--- a/numpy/lib/npyio.py
+++ b/numpy/lib/npyio.py
@@ -475,7 +475,7 @@ def save(file, arr, allow_pickle=True, fix_imports=True):
 
     Notes
     -----
-    For a description of the ``.npy`` format, see :ref:`binary-serialization`.
+    For a description of the ``.npy`` format, see :py:mod:`numpy.lib.format`.
 
     Examples
     --------
@@ -559,7 +559,7 @@ def savez(file, *args, **kwds):
     The ``.npz`` file format is a zipped archive of files named after the
     variables they contain.  The archive is not compressed and each file
     in the archive contains one variable in ``.npy`` format. For a
-    description of the ``.npy`` format, see :ref:`binary-serialization`.
+    description of the ``.npy`` format, see :py:mod:`numpy.lib.format`.
 
     When opening the saved ``.npz`` file with `load` a `NpzFile` object is
     returned. This is a dictionary-like object which can be queried for
@@ -639,7 +639,7 @@ def savez_compressed(file, *args, **kwds):
     variables they contain.  The archive is compressed with
     ``zipfile.ZIP_DEFLATED`` and each file in the archive contains one variable
     in ``.npy`` format. For a description of the ``.npy`` format, see 
-    :ref:`binary-serialization`.
+    :py:mod:`numpy.lib.format`.
 
 
     When opening the saved ``.npz`` file with `load` a `NpzFile` object is
diff --git a/numpy/lib/polynomial.py b/numpy/lib/polynomial.py
index 41b5e2f64..078608bbb 100644
--- a/numpy/lib/polynomial.py
+++ b/numpy/lib/polynomial.py
@@ -113,11 +113,6 @@ def poly(seq_of_zeros):
     >>> np.poly(P)
     array([ 1.        ,  0.        ,  0.16666667])
 
-    Or a square matrix object:
-
-    >>> np.poly(np.matrix(P))
-    array([ 1.        ,  0.        ,  0.16666667])
-
     Note how in all cases the leading coefficient is always 1.
 
     """
diff --git a/numpy/lib/scimath.py b/numpy/lib/scimath.py
index e07caf805..f1838fee6 100644
--- a/numpy/lib/scimath.py
+++ b/numpy/lib/scimath.py
@@ -555,7 +555,7 @@ def arctanh(x):
     --------
     >>> np.set_printoptions(precision=4)
 
-    >>> np.emath.arctanh(np.matrix(np.eye(2)))
+    >>> np.emath.arctanh(np.eye(2))
     array([[ Inf,   0.],
            [  0.,  Inf]])
     >>> np.emath.arctanh([1j])
diff --git a/numpy/lib/shape_base.py b/numpy/lib/shape_base.py
index 41ef28ef3..65104115a 100644
--- a/numpy/lib/shape_base.py
+++ b/numpy/lib/shape_base.py
@@ -16,10 +16,235 @@ from numpy.matrixlib.defmatrix import matrix  # this raises all the right alarm
 __all__ = [
     'column_stack', 'row_stack', 'dstack', 'array_split', 'split',
     'hsplit', 'vsplit', 'dsplit', 'apply_over_axes', 'expand_dims',
-    'apply_along_axis', 'kron', 'tile', 'get_array_wrap'
+    'apply_along_axis', 'kron', 'tile', 'get_array_wrap', 'take_along_axis',
+    'put_along_axis'
     ]
 
 
+def _make_along_axis_idx(arr_shape, indices, axis):
+    # compute dimensions to iterate over
+    if not _nx.issubdtype(indices.dtype, _nx.integer):
+        raise IndexError('`indices` must be an integer array')
+    if len(arr_shape) != indices.ndim:
+        raise ValueError(
+            "`indices` and `arr` must have the same number of dimensions")
+    shape_ones = (1,) * indices.ndim
+    dest_dims = list(range(axis)) + [None] + list(range(axis+1, indices.ndim))
+
+    # build a fancy index, consisting of orthogonal aranges, with the
+    # requested index inserted at the right location
+    fancy_index = []
+    for dim, n in zip(dest_dims, arr_shape):
+        if dim is None:
+            fancy_index.append(indices)
+        else:
+            ind_shape = shape_ones[:dim] + (-1,) + shape_ones[dim+1:]
+            fancy_index.append(_nx.arange(n).reshape(ind_shape))
+
+    return tuple(fancy_index)
+
+
+def take_along_axis(arr, indices, axis):
+    """
+    Take values from the input array by matching 1d index and data slices.
+
+    This iterates over matching 1d slices oriented along the specified axis in
+    the index and data arrays, and uses the former to look up values in the
+    latter. These slices can be different lengths.
+
+    Functions returning an index along an axis, like `argsort` and
+    `argpartition`, produce suitable indices for this function.
+
+    .. versionadded:: 1.15.0
+
+    Parameters
+    ----------
+    arr : ndarray (Ni..., M, Nk...)
+        Source array
+    indices : ndarray (Ni..., J, Nk...)
+        Indices to take along each 1d slice of `arr`. This must match the
+        dimension of arr, but dimensions Ni and Nk only need to broadcast
+        against `arr`.
+    axis : int
+        The axis to take 1d slices along. If axis is None, the input array is
+        treated as if it had first been flattened to 1d, for consistency with
+        `sort` and `argsort`.
+
+    Returns
+    -------
+    out : ndarray (Ni..., J, Nk...)
+        The indexed result.
+
+    Notes
+    -----
+    This is equivalent to (but faster than) the following use of `ndindex` and
+    `s_`, which sets each of ``ii`` and ``kk`` to a tuple of indices::
+
+        Ni, M, Nk = a.shape[:axis], a.shape[axis], a.shape[axis+1:]
+        J = indices.shape[axis]  # Need not equal M
+        out = np.empty(Ni + (J,) + Nk)
+
+        for ii in ndindex(Ni):
+            for kk in ndindex(Nk):
+                a_1d       = a      [ii + s_[:,] + kk]
+                indices_1d = indices[ii + s_[:,] + kk]
+                out_1d     = out    [ii + s_[:,] + kk]
+                for j in range(J):
+                    out_1d[j] = a_1d[indices_1d[j]]
+
+    Equivalently, eliminating the inner loop, the last two lines would be::
+
+                out_1d[:] = a_1d[indices_1d]
+
+    See Also
+    --------
+    take : Take along an axis, using the same indices for every 1d slice
+    put_along_axis :
+        Put values into the destination array by matching 1d index and data slices
+
+    Examples
+    --------
+
+    For this sample array
+
+    >>> a = np.array([[10, 30, 20], [60, 40, 50]])
+
+    We can sort either by using sort directly, or argsort and this function
+
+    >>> np.sort(a, axis=1)
+    array([[10, 20, 30],
+           [40, 50, 60]])
+    >>> ai = np.argsort(a, axis=1); ai
+    array([[0, 2, 1],
+           [1, 2, 0]], dtype=int64)
+    >>> np.take_along_axis(a, ai, axis=1)
+    array([[10, 20, 30],
+           [40, 50, 60]])
+
+    The same works for max and min, if you expand the dimensions:
+
+    >>> np.expand_dims(np.max(a, axis=1), axis=1)
+    array([[30],
+           [60]])
+    >>> ai = np.expand_dims(np.argmax(a, axis=1), axis=1)
+    >>> ai
+    array([[1],
+           [0]], dtype=int64)
+    >>> np.take_along_axis(a, ai, axis=1)
+    array([[30],
+           [60]])
+
+    If we want to get the max and min at the same time, we can stack the
+    indices first
+
+    >>> ai_min = np.expand_dims(np.argmin(a, axis=1), axis=1)
+    >>> ai_max = np.expand_dims(np.argmax(a, axis=1), axis=1)
+    >>> ai = np.concatenate([ai_min, ai_max], axis=1)
+    >>> ai
+    array([[0, 1],
+           [1, 0]], dtype=int64)
+    >>> np.take_along_axis(a, ai, axis=1)
+    array([[10, 30],
+           [40, 60]])
+    """
+    # normalize inputs
+    if axis is None:
+        arr = arr.flat
+        arr_shape = (len(arr),)  # flatiter has no .shape
+        axis = 0
+    else:
+        axis = normalize_axis_index(axis, arr.ndim)
+        arr_shape = arr.shape
+
+    # use the fancy index
+    return arr[_make_along_axis_idx(arr_shape, indices, axis)]
+
+
+def put_along_axis(arr, indices, values, axis):
+    """
+    Put values into the destination array by matching 1d index and data slices.
+
+    This iterates over matching 1d slices oriented along the specified axis in
+    the index and data arrays, and uses the former to place values into the
+    latter. These slices can be different lengths.
+
+    Functions returning an index along an axis, like `argsort` and
+    `argpartition`, produce suitable indices for this function.
+
+    .. versionadded:: 1.15.0
+
+    Parameters
+    ----------
+    arr : ndarray (Ni..., M, Nk...)
+        Destination array.
+    indices : ndarray (Ni..., J, Nk...)
+        Indices to change along each 1d slice of `arr`. This must match the
+        dimension of arr, but dimensions in Ni and Nk may be 1 to broadcast
+        against `arr`.
+    values : array_like (Ni..., J, Nk...)
+        Values to insert at those indices. Its shape and dimension are
+        broadcast to match that of `indices`.
+    axis : int
+        The axis to take 1d slices along. If axis is None, the destination
+        array is treated as if a flattened 1d view had been created of it.
+
+    Notes
+    -----
+    This is equivalent to (but faster than) the following use of `ndindex` and
+    `s_`, which sets each of ``ii`` and ``kk`` to a tuple of indices::
+
+        Ni, M, Nk = a.shape[:axis], a.shape[axis], a.shape[axis+1:]
+        J = indices.shape[axis]  # Need not equal M
+
+        for ii in ndindex(Ni):
+            for kk in ndindex(Nk):
+                a_1d       = a      [ii + s_[:,] + kk]
+                indices_1d = indices[ii + s_[:,] + kk]
+                values_1d  = values [ii + s_[:,] + kk]
+                for j in range(J):
+                    a_1d[indices_1d[j]] = values_1d[j]
+
+    Equivalently, eliminating the inner loop, the last two lines would be::
+
+                a_1d[indices_1d] = values_1d
+
+    See Also
+    --------
+    take_along_axis :
+        Take values from the input array by matching 1d index and data slices
+
+    Examples
+    --------
+
+    For this sample array
+
+    >>> a = np.array([[10, 30, 20], [60, 40, 50]])
+
+    We can replace the maximum values with:
+
+    >>> ai = np.expand_dims(np.argmax(a, axis=1), axis=1)
+    >>> ai
+    array([[1],
+           [0]], dtype=int64)
+    >>> np.put_along_axis(a, ai, 99, axis=1)
+    >>> a
+    array([[10, 99, 20],
+           [99, 40, 50]])
+
+    """
+    # normalize inputs
+    if axis is None:
+        arr = arr.flat
+        axis = 0
+        arr_shape = (len(arr),)  # flatiter has no .shape
+    else:
+        axis = normalize_axis_index(axis, arr.ndim)
+        arr_shape = arr.shape
+
+    # use the fancy index
+    arr[_make_along_axis_idx(arr_shape, indices, axis)] = values
+
+
 def apply_along_axis(func1d, axis, arr, *args, **kwargs):
     """
     Apply a function to 1-D slices along the given axis.
diff --git a/numpy/lib/tests/test_arraysetops.py b/numpy/lib/tests/test_arraysetops.py
index 76c36c53e..dace5ade8 100644
--- a/numpy/lib/tests/test_arraysetops.py
+++ b/numpy/lib/tests/test_arraysetops.py
@@ -32,7 +32,46 @@ class TestSetOps(object):
         assert_array_equal(c, ed)
 
         assert_array_equal([], intersect1d([], []))
-
+        
+    def test_intersect1d_indices(self):
+        # unique inputs
+        a = np.array([1, 2, 3, 4]) 
+        b = np.array([2, 1, 4, 6])
+        c, i1, i2 = intersect1d(a, b, assume_unique=True, return_indices=True)
+        ee = np.array([1, 2, 4])
+        assert_array_equal(c, ee)
+        assert_array_equal(a[i1], ee)
+        assert_array_equal(b[i2], ee)
+        
+        # non-unique inputs
+        a = np.array([1, 2, 2, 3, 4, 3, 2])
+        b = np.array([1, 8, 4, 2, 2, 3, 2, 3])
+        c, i1, i2 = intersect1d(a, b, return_indices=True)
+        ef = np.array([1, 2, 3, 4])
+        assert_array_equal(c, ef)
+        assert_array_equal(a[i1], ef)
+        assert_array_equal(b[i2], ef)
+                
+        # non1d, unique inputs
+        a = np.array([[2, 4, 5, 6], [7, 8, 1, 15]])
+        b = np.array([[3, 2, 7, 6], [10, 12, 8, 9]])
+        c, i1, i2 = intersect1d(a, b, assume_unique=True, return_indices=True)
+        ui1 = np.unravel_index(i1, a.shape)
+        ui2 = np.unravel_index(i2, b.shape)
+        ea = np.array([2, 6, 7, 8])
+        assert_array_equal(ea, a[ui1])
+        assert_array_equal(ea, b[ui2])
+    
+        # non1d, not assumed to be unique inputs
+        a = np.array([[2, 4, 5, 6, 6], [4, 7, 8, 7, 2]])
+        b = np.array([[3, 2, 7, 7], [10, 12, 8, 7]])
+        c, i1, i2 = intersect1d(a, b, return_indices=True)
+        ui1 = np.unravel_index(i1, a.shape)
+        ui2 = np.unravel_index(i2, b.shape)
+        ea = np.array([2, 7, 8])
+        assert_array_equal(ea, a[ui1])
+        assert_array_equal(ea, b[ui2])
+        
     def test_setxor1d(self):
         a = np.array([5, 7, 1, 2])
         b = np.array([2, 4, 3, 1, 5])
@@ -74,8 +113,6 @@ class TestSetOps(object):
         assert_array_equal([1,7,8], ediff1d(two_elem, to_end=[7,8]))
         assert_array_equal([7,1], ediff1d(two_elem, to_begin=7))
         assert_array_equal([5,6,1], ediff1d(two_elem, to_begin=[5,6]))
-        assert(isinstance(ediff1d(np.matrix(1)), np.matrix))
-        assert(isinstance(ediff1d(np.matrix(1), to_begin=1), np.matrix))
 
     def test_isin(self):
         # the tests for in1d cover most of isin's behavior
diff --git a/numpy/lib/tests/test_function_base.py b/numpy/lib/tests/test_function_base.py
index 5dc96775b..4103a9eb3 100644
--- a/numpy/lib/tests/test_function_base.py
+++ b/numpy/lib/tests/test_function_base.py
@@ -287,9 +287,6 @@ class TestAverage(object):
         assert_almost_equal(y5.mean(0), average(y5, 0))
         assert_almost_equal(y5.mean(1), average(y5, 1))
 
-        y6 = np.matrix(rand(5, 5))
-        assert_array_equal(y6.mean(0), average(y6, 0))
-
     def test_weights(self):
         y = np.arange(10)
         w = np.arange(10)
@@ -357,14 +354,6 @@ class TestAverage(object):
         assert_equal(type(np.average(a)), subclass)
         assert_equal(type(np.average(a, weights=w)), subclass)
 
-        # also test matrices
-        a = np.matrix([[1,2],[3,4]])
-        w = np.matrix([[1,2],[3,4]])
-
-        r = np.average(a, axis=0, weights=w)
-        assert_equal(type(r), np.matrix)
-        assert_equal(r, [[2.5, 10.0/3]])
-
     def test_upcasting(self):
         types = [('i4', 'i4', 'f8'), ('i4', 'f4', 'f8'), ('f4', 'i4', 'f8'),
                  ('f4', 'f4', 'f4'), ('f4', 'f8', 'f8')]
@@ -1623,16 +1612,6 @@ class TestTrapz(object):
         xm = np.ma.array(x, mask=mask)
         assert_almost_equal(trapz(y, xm), r)
 
-    def test_matrix(self):
-        # Test to make sure matrices give the same answer as ndarrays
-        x = np.linspace(0, 5)
-        y = x * x
-        r = trapz(y, x)
-        mx = np.matrix(x)
-        my = np.matrix(y)
-        mr = trapz(my, mx)
-        assert_almost_equal(mr, r)
-
 
 class TestSinc(object):
 
diff --git a/numpy/lib/tests/test_histograms.py b/numpy/lib/tests/test_histograms.py
index 597b5b376..e16ae12c2 100644
--- a/numpy/lib/tests/test_histograms.py
+++ b/numpy/lib/tests/test_histograms.py
@@ -613,8 +613,6 @@ class TestHistogramdd(object):
         assert_raises(ValueError, np.histogramdd, x, bins=[-1, 2, 4, 5])
         assert_raises(ValueError, np.histogramdd, x, bins=[1, 0.99, 1, 1])
         assert_raises(
-            ValueError, np.histogramdd, x, bins=[1, 1, 1, [1, 2, 2, 3]])
-        assert_raises(
             ValueError, np.histogramdd, x, bins=[1, 1, 1, [1, 2, 3, -3]])
         assert_(np.histogramdd(x, bins=[1, 1, 1, [1, 2, 3, 4]]))
 
@@ -646,7 +644,7 @@ class TestHistogramdd(object):
         bins = [[0., 0.5, 1.0]]
         hist, _ = histogramdd(x, bins=bins)
         assert_(hist[0] == 0.0)
-        assert_(hist[1] == 1.)
+        assert_(hist[1] == 0.0)
         x = [1.0001]
         bins = [[0., 0.5, 1.0]]
         hist, _ = histogramdd(x, bins=bins)
@@ -660,3 +658,40 @@ class TestHistogramdd(object):
                       range=[[0.0, 1.0], [0.25, 0.75], [0.25, np.inf]])
         assert_raises(ValueError, histogramdd, vals,
                       range=[[0.0, 1.0], [np.nan, 0.75], [0.25, 0.5]])
+
+    def test_equal_edges(self):
+        """ Test that adjacent entries in an edge array can be equal """
+        x = np.array([0, 1, 2])
+        y = np.array([0, 1, 2])
+        x_edges = np.array([0, 2, 2])
+        y_edges = 1
+        hist, edges = histogramdd((x, y), bins=(x_edges, y_edges))
+
+        hist_expected = np.array([
+            [2.],
+            [1.],  # x == 2 falls in the final bin
+        ])
+        assert_equal(hist, hist_expected)
+
+    def test_edge_dtype(self):
+        """ Test that if an edge array is input, its type is preserved """
+        x = np.array([0, 10, 20])
+        y = x / 10
+        x_edges = np.array([0, 5, 15, 20])
+        y_edges = x_edges / 10
+        hist, edges = histogramdd((x, y), bins=(x_edges, y_edges))
+
+        assert_equal(edges[0].dtype, x_edges.dtype)
+        assert_equal(edges[1].dtype, y_edges.dtype)
+
+    def test_large_integers(self):
+        big = 2**60  # float64 cannot distinguish big - 1 from big + 1
+
+        x = np.array([0], np.int64)
+        x_edges = np.array([-1, +1], np.int64)
+        y = big + x
+        y_edges = big + x_edges
+
+        hist, edges = histogramdd((x, y), bins=(x_edges, y_edges))
+
+        assert_equal(hist[0, 0], 1)
diff --git a/numpy/lib/tests/test_index_tricks.py b/numpy/lib/tests/test_index_tricks.py
index f934e952a..089a7589a 100644
--- a/numpy/lib/tests/test_index_tricks.py
+++ b/numpy/lib/tests/test_index_tricks.py
@@ -184,37 +184,6 @@ class TestConcatenator(object):
         assert_array_equal(d[:5, :], b)
         assert_array_equal(d[5:, :], c)
 
-    def test_matrix(self):
-        a = [1, 2]
-        b = [3, 4]
-
-        ab_r = np.r_['r', a, b]
-        ab_c = np.r_['c', a, b]
-
-        assert_equal(type(ab_r), np.matrix)
-        assert_equal(type(ab_c), np.matrix)
-
-        assert_equal(np.array(ab_r), [[1,2,3,4]])
-        assert_equal(np.array(ab_c), [[1],[2],[3],[4]])
-
-        assert_raises(ValueError, lambda: np.r_['rc', a, b])
-
-    def test_matrix_scalar(self):
-        r = np.r_['r', [1, 2], 3]
-        assert_equal(type(r), np.matrix)
-        assert_equal(np.array(r), [[1,2,3]])
-
-    def test_matrix_builder(self):
-        a = np.array([1])
-        b = np.array([2])
-        c = np.array([3])
-        d = np.array([4])
-        actual = np.r_['a, b; c, d']
-        expected = np.bmat([[a, b], [c, d]])
-
-        assert_equal(actual, expected)
-        assert_equal(type(actual), type(expected))
-
     def test_0d(self):
         assert_equal(r_[0, np.array(1), 2], [0, 1, 2])
         assert_equal(r_[[0, 1, 2], np.array(3)], [0, 1, 2, 3])
diff --git a/numpy/lib/tests/test_nanfunctions.py b/numpy/lib/tests/test_nanfunctions.py
index 42e221506..504372faf 100644
--- a/numpy/lib/tests/test_nanfunctions.py
+++ b/numpy/lib/tests/test_nanfunctions.py
@@ -113,42 +113,46 @@ class TestNanFunctions_MinMax(object):
         for f in self.nanfuncs:
             assert_(f(0.) == 0.)
 
-    def test_matrices(self):
+    def test_subclass(self):
+        class MyNDArray(np.ndarray):
+            pass
+
         # Check that it works and that type and
         # shape are preserved
-        mat = np.matrix(np.eye(3))
+        mine = np.eye(3).view(MyNDArray)
         for f in self.nanfuncs:
-            res = f(mat, axis=0)
-            assert_(isinstance(res, np.matrix))
-            assert_(res.shape == (1, 3))
-            res = f(mat, axis=1)
-            assert_(isinstance(res, np.matrix))
-            assert_(res.shape == (3, 1))
-            res = f(mat)
-            assert_(np.isscalar(res))
+            res = f(mine, axis=0)
+            assert_(isinstance(res, MyNDArray))
+            assert_(res.shape == (3,))
+            res = f(mine, axis=1)
+            assert_(isinstance(res, MyNDArray))
+            assert_(res.shape == (3,))
+            res = f(mine)
+            assert_(res.shape == ())
+
         # check that rows of nan are dealt with for subclasses (#4628)
-        mat[1] = np.nan
+        mine[1] = np.nan
         for f in self.nanfuncs:
             with warnings.catch_warnings(record=True) as w:
                 warnings.simplefilter('always')
-                res = f(mat, axis=0)
-                assert_(isinstance(res, np.matrix))
+                res = f(mine, axis=0)
+                assert_(isinstance(res, MyNDArray))
                 assert_(not np.any(np.isnan(res)))
                 assert_(len(w) == 0)
 
             with warnings.catch_warnings(record=True) as w:
                 warnings.simplefilter('always')
-                res = f(mat, axis=1)
-                assert_(isinstance(res, np.matrix))
-                assert_(np.isnan(res[1, 0]) and not np.isnan(res[0, 0])
-                        and not np.isnan(res[2, 0]))
+                res = f(mine, axis=1)
+                assert_(isinstance(res, MyNDArray))
+                assert_(np.isnan(res[1]) and not np.isnan(res[0])
+                        and not np.isnan(res[2]))
                 assert_(len(w) == 1, 'no warning raised')
                 assert_(issubclass(w[0].category, RuntimeWarning))
 
             with warnings.catch_warnings(record=True) as w:
                 warnings.simplefilter('always')
-                res = f(mat)
-                assert_(np.isscalar(res))
+                res = f(mine)
+                assert_(res.shape == ())
                 assert_(res != np.nan)
                 assert_(len(w) == 0)
 
@@ -209,19 +213,22 @@ class TestNanFunctions_ArgminArgmax(object):
         for f in self.nanfuncs:
             assert_(f(0.) == 0.)
 
-    def test_matrices(self):
+    def test_subclass(self):
+        class MyNDArray(np.ndarray):
+            pass
+
         # Check that it works and that type and
         # shape are preserved
-        mat = np.matrix(np.eye(3))
+        mine = np.eye(3).view(MyNDArray)
         for f in self.nanfuncs:
-            res = f(mat, axis=0)
-            assert_(isinstance(res, np.matrix))
-            assert_(res.shape == (1, 3))
-            res = f(mat, axis=1)
-            assert_(isinstance(res, np.matrix))
-            assert_(res.shape == (3, 1))
-            res = f(mat)
-            assert_(np.isscalar(res))
+            res = f(mine, axis=0)
+            assert_(isinstance(res, MyNDArray))
+            assert_(res.shape == (3,))
+            res = f(mine, axis=1)
+            assert_(isinstance(res, MyNDArray))
+            assert_(res.shape == (3,))
+            res = f(mine)
+            assert_(res.shape == ())
 
 
 class TestNanFunctions_IntTypes(object):
@@ -381,19 +388,27 @@ class SharedNanFunctionsTestsMixin(object):
         for f in self.nanfuncs:
             assert_(f(0.) == 0.)
 
-    def test_matrices(self):
+    def test_subclass(self):
+        class MyNDArray(np.ndarray):
+            pass
+
         # Check that it works and that type and
         # shape are preserved
-        mat = np.matrix(np.eye(3))
+        array = np.eye(3)
+        mine = array.view(MyNDArray)
         for f in self.nanfuncs:
-            res = f(mat, axis=0)
-            assert_(isinstance(res, np.matrix))
-            assert_(res.shape == (1, 3))
-            res = f(mat, axis=1)
-            assert_(isinstance(res, np.matrix))
-            assert_(res.shape == (3, 1))
-            res = f(mat)
-            assert_(np.isscalar(res))
+            expected_shape = f(array, axis=0).shape
+            res = f(mine, axis=0)
+            assert_(isinstance(res, MyNDArray))
+            assert_(res.shape == expected_shape)
+            expected_shape = f(array, axis=1).shape
+            res = f(mine, axis=1)
+            assert_(isinstance(res, MyNDArray))
+            assert_(res.shape == expected_shape)
+            expected_shape = f(array).shape
+            res = f(mine)
+            assert_(isinstance(res, MyNDArray))
+            assert_(res.shape == expected_shape)
 
 
 class TestNanFunctions_SumProd(SharedNanFunctionsTestsMixin):
@@ -481,18 +496,6 @@ class TestNanFunctions_CumSumProd(SharedNanFunctionsTestsMixin):
                 res = f(d, axis=axis)
                 assert_equal(res.shape, (3, 5, 7, 11))
 
-    def test_matrices(self):
-        # Check that it works and that type and
-        # shape are preserved
-        mat = np.matrix(np.eye(3))
-        for f in self.nanfuncs:
-            for axis in np.arange(2):
-                res = f(mat, axis=axis)
-                assert_(isinstance(res, np.matrix))
-                assert_(res.shape == (3, 3))
-            res = f(mat)
-            assert_(res.shape == (1, 3*3))
-
     def test_result_values(self):
         for axis in (-2, -1, 0, 1, None):
             tgt = np.cumprod(_ndat_ones, axis=axis)
diff --git a/numpy/lib/tests/test_shape_base.py b/numpy/lib/tests/test_shape_base.py
index 0e86fbb19..c95894f94 100644
--- a/numpy/lib/tests/test_shape_base.py
+++ b/numpy/lib/tests/test_shape_base.py
@@ -2,16 +2,106 @@ from __future__ import division, absolute_import, print_function
 
 import numpy as np
 import warnings
+import functools
 
 from numpy.lib.shape_base import (
     apply_along_axis, apply_over_axes, array_split, split, hsplit, dsplit,
-    vsplit, dstack, column_stack, kron, tile, expand_dims,
+    vsplit, dstack, column_stack, kron, tile, expand_dims, take_along_axis,
+    put_along_axis
     )
 from numpy.testing import (
     assert_, assert_equal, assert_array_equal, assert_raises, assert_warns
     )
 
 
+def _add_keepdims(func):
+    """ hack keepdims behavior into a function taking an axis """
+    @functools.wraps(func)
+    def wrapped(a, axis, **kwargs):
+        res = func(a, axis=axis, **kwargs)
+        if axis is None:
+            axis = 0  # res is now a scalar, so we can insert this anywhere
+        return np.expand_dims(res, axis=axis)
+    return wrapped
+
+
+class TestTakeAlongAxis(object):
+    def test_argequivalent(self):
+        """ Test it translates from arg<func> to <func> """
+        from numpy.random import rand
+        a = rand(3, 4, 5)
+
+        funcs = [
+            (np.sort, np.argsort, dict()),
+            (_add_keepdims(np.min), _add_keepdims(np.argmin), dict()),
+            (_add_keepdims(np.max), _add_keepdims(np.argmax), dict()),
+            (np.partition, np.argpartition, dict(kth=2)),
+        ]
+
+        for func, argfunc, kwargs in funcs:
+            for axis in list(range(a.ndim)) + [None]:
+                a_func = func(a, axis=axis, **kwargs)
+                ai_func = argfunc(a, axis=axis, **kwargs)
+                assert_equal(a_func, take_along_axis(a, ai_func, axis=axis))
+
+    def test_invalid(self):
+        """ Test it errors when indices has too few dimensions """
+        a = np.ones((10, 10))
+        ai = np.ones((10, 2), dtype=np.intp)
+
+        # sanity check
+        take_along_axis(a, ai, axis=1)
+
+        # not enough indices
+        assert_raises(ValueError, take_along_axis, a, np.array(1), axis=1)
+        # bool arrays not allowed
+        assert_raises(IndexError, take_along_axis, a, ai.astype(bool), axis=1)
+        # float arrays not allowed
+        assert_raises(IndexError, take_along_axis, a, ai.astype(float), axis=1)
+        # invalid axis
+        assert_raises(np.AxisError, take_along_axis, a, ai, axis=10)
+
+    def test_empty(self):
+        """ Test everything is ok with empty results, even with inserted dims """
+        a  = np.ones((3, 4, 5))
+        ai = np.ones((3, 0, 5), dtype=np.intp)
+
+        actual = take_along_axis(a, ai, axis=1)
+        assert_equal(actual.shape, ai.shape)
+
+    def test_broadcast(self):
+        """ Test that non-indexing dimensions are broadcast in both directions """
+        a  = np.ones((3, 4, 1))
+        ai = np.ones((1, 2, 5), dtype=np.intp)
+        actual = take_along_axis(a, ai, axis=1)
+        assert_equal(actual.shape, (3, 2, 5))
+
+
+class TestPutAlongAxis(object):
+    def test_replace_max(self):
+        a_base = np.array([[10, 30, 20], [60, 40, 50]])
+
+        for axis in list(range(a_base.ndim)) + [None]:
+            # we mutate this in the loop
+            a = a_base.copy()
+
+            # replace the max with a small value
+            i_max = _add_keepdims(np.argmax)(a, axis=axis)
+            put_along_axis(a, i_max, -99, axis=axis)
+
+            # find the new minimum, which should be where the max was
+            i_min = _add_keepdims(np.argmin)(a, axis=axis)
+
+            assert_equal(i_min, i_max)
+
+    def test_broadcast(self):
+        """ Test that non-indexing dimensions are broadcast in both directions """
+        a  = np.ones((3, 4, 1))
+        ai = np.arange(10, dtype=np.intp).reshape((1, 2, 5)) % 4
+        put_along_axis(a, ai, 20, axis=1)
+        assert_equal(take_along_axis(a, ai, axis=1), 20)
+
+
 class TestApplyAlongAxis(object):
     def test_simple(self):
         a = np.ones((20, 10), 'd')
@@ -29,19 +119,21 @@ class TestApplyAlongAxis(object):
                            [[27, 30, 33], [36, 39, 42], [45, 48, 51]])
 
     def test_preserve_subclass(self):
-        # this test is particularly malicious because matrix
-        # refuses to become 1d
         def double(row):
             return row * 2
-        m = np.matrix([[0, 1], [2, 3]])
-        expected = np.matrix([[0, 2], [4, 6]])
+
+        class MyNDArray(np.ndarray):
+            pass
+
+        m = np.array([[0, 1], [2, 3]]).view(MyNDArray)
+        expected = np.array([[0, 2], [4, 6]]).view(MyNDArray)
 
         result = apply_along_axis(double, 0, m)
-        assert_(isinstance(result, np.matrix))
+        assert_(isinstance(result, MyNDArray))
         assert_array_equal(result, expected)
 
         result = apply_along_axis(double, 1, m)
-        assert_(isinstance(result, np.matrix))
+        assert_(isinstance(result, MyNDArray))
         assert_array_equal(result, expected)
 
     def test_subclass(self):
@@ -492,16 +584,10 @@ class TestSqueeze(object):
 
 class TestKron(object):
     def test_return_type(self):
-        a = np.ones([2, 2])
-        m = np.asmatrix(a)
-        assert_equal(type(kron(a, a)), np.ndarray)
-        assert_equal(type(kron(m, m)), np.matrix)
-        assert_equal(type(kron(a, m)), np.matrix)
-        assert_equal(type(kron(m, a)), np.matrix)
-
         class myarray(np.ndarray):
             __array_priority__ = 0.0
 
+        a = np.ones([2, 2])
         ma = myarray(a.shape, a.dtype, a.data)
         assert_equal(type(kron(a, a)), np.ndarray)
         assert_equal(type(kron(ma, ma)), myarray)
diff --git a/numpy/lib/twodim_base.py b/numpy/lib/twodim_base.py
index 402c18850..cca316e9a 100644
--- a/numpy/lib/twodim_base.py
+++ b/numpy/lib/twodim_base.py
@@ -650,7 +650,7 @@ def histogram2d(x, y, bins=10, range=None, normed=False, weights=None):
         N = 1
 
     if N != 1 and N != 2:
-        xedges = yedges = asarray(bins, float)
+        xedges = yedges = asarray(bins)
         bins = [xedges, yedges]
     hist, edges = histogramdd([x, y], bins, range, normed, weights)
     return hist, edges[0], edges[1]