Diffstat (limited to 'numpy')
-rw-r--r--  numpy/core/bscript                          6
-rw-r--r--  numpy/core/include/numpy/npy_math.h         2
-rw-r--r--  numpy/core/setup.py                         7
-rw-r--r--  numpy/core/src/umath/loops.c.src           80
-rw-r--r--  numpy/core/src/umath/simd.inc.src         173
-rw-r--r--  numpy/core/tests/test_blasdot.py           18
-rw-r--r--  numpy/core/tests/test_multiarray.py         2
-rw-r--r--  numpy/core/tests/test_umath.py             28
-rw-r--r--  numpy/linalg/linalg.py                    143
-rw-r--r--  numpy/linalg/tests/test_linalg.py         117
-rw-r--r--  numpy/ma/tests/test_old_ma.py               2
-rw-r--r--  numpy/ma/tests/test_subclassing.py          2
-rw-r--r--  numpy/testing/utils.py                      8
13 files changed, 455 insertions, 133 deletions
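The two user-visible changes here are SSE2-vectorized inner loops for the sqrt and absolute ufuncs (the new src/umath/simd.inc.src) and a new `axis` argument for numpy.linalg.norm. As orientation before the diff, a small usage sketch of the new keyword, mirroring the doctest examples added to linalg.py below:

    import numpy as np
    from numpy import linalg as LA

    c = np.array([[ 1, 2, 3],
                  [-1, 1, 4]])
    LA.norm(c, axis=0)       # vector norm of each column -> 3 values
    LA.norm(c, axis=1)       # vector norm of each row    -> 2 values

    m = np.arange(8).reshape(2, 2, 2)
    LA.norm(m, axis=(1, 2))  # Frobenius norm of each 2x2 matrix -> 2 values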
diff --git a/numpy/core/bscript b/numpy/core/bscript
index 44f7f14f0..ee5543354 100644
--- a/numpy/core/bscript
+++ b/numpy/core/bscript
@@ -492,8 +492,10 @@ def pre_build(context):
             pattern="ufunc_api",
             name="ufunc_api")
 
-    ufunc_templates = ["src/umath/loops.c.src",
-                       "src/umath/funcs.inc.src"]
+    ufunc_templates = [
+        "src/umath/loops.c.src",
+        "src/umath/funcs.inc.src",
+        "src/umath/simd.inc.src"]
     bld(target="ufunc_templates", source=ufunc_templates)
 
     bld(features="umath_gen",
diff --git a/numpy/core/include/numpy/npy_math.h b/numpy/core/include/numpy/npy_math.h
index 625999022..537a63ee4 100644
--- a/numpy/core/include/numpy/npy_math.h
+++ b/numpy/core/include/numpy/npy_math.h
@@ -6,7 +6,7 @@ extern "C" {
 #endif
 
 #include <math.h>
-#ifdef NPY_HAVE_CONFIG_H
+#ifdef HAVE_NPY_CONFIG_H
 #include <npy_config.h>
 #endif
 #include <numpy/npy_common.h>
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index a6ddfb843..926142b55 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -837,7 +837,9 @@ def configuration(parent_package='',top_path=None):
         subpath = join('src', 'umath')
         # NOTE: For manual template conversion of loops.h.src, read the note
         #       in that file.
-        sources = [join(local_dir, subpath, 'loops.c.src')]
+        sources = [
+            join(local_dir, subpath, 'loops.c.src'),
+            join(local_dir, subpath, 'simd.inc.src')]
 
         # numpy.distutils generate .c from .c.src in weird directories, we have
         # to add them there as they depend on the build_dir
@@ -864,12 +866,14 @@ def configuration(parent_package='',top_path=None):
             join('src', 'umath', 'umathmodule.c'),
             join('src', 'umath', 'reduction.c'),
             join('src', 'umath', 'funcs.inc.src'),
+            join('src', 'umath', 'simd.inc.src'),
             join('src', 'umath', 'loops.c.src'),
             join('src', 'umath', 'ufunc_object.c'),
             join('src', 'umath', 'ufunc_type_resolution.c')]
 
     umath_deps = [
             generate_umath_py,
+            join('src', 'umath', 'simd.inc.src'),
             join(codegen_dir,'generate_ufunc_api.py')]
 
     if not ENABLE_SEPARATE_COMPILATION:
@@ -877,6 +881,7 @@ def configuration(parent_package='',top_path=None):
         umath_src = [join('src', 'umath', 'umathmodule_onefile.c')]
         umath_src.append(generate_umath_templated_sources)
         umath_src.append(join('src', 'umath', 'funcs.inc.src'))
+        umath_src.append(join('src', 'umath', 'simd.inc.src'))
 
     config.add_extension('umath',
                          sources = umath_src +
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 5eae448ee..c0287b8c8 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -5,7 +5,6 @@
 
 #include "Python.h"
 
-#include "npy_config.h"
 #ifdef ENABLE_SEPARATE_COMPILATION
 #define PY_ARRAY_UNIQUE_SYMBOL _npy_umathmodule_ARRAY_API
 #define NO_IMPORT_ARRAY
@@ -20,11 +19,15 @@
 #include "npy_pycompat.h"
 
 #include "ufunc_object.h"
-#include <assert.h>
 
-#ifdef HAVE_EMMINTRIN_H
-#include <emmintrin.h>
-#endif
+
+/*
+ * include vectorized functions and dispatchers
+ * this file is safe to include also for generic builds
+ * platform specific instructions are either masked via the preprocessor or
+ * runtime detected
+ */
+#include "simd.inc"
 
 
 /*
@@ -38,15 +41,6 @@
     && (steps[0] == steps[2])\
     && (steps[0] == 0))
 
-/*
- * stride is equal to element size and input and destination are equal or
- * don't overlap within one register
- */
-#define IS_BLOCKABLE_UNARY(esize, vsize) \
-    (steps[0] == (esize) && steps[0] == steps[1] && \
-     (npy_is_aligned(args[0], esize) && npy_is_aligned(args[1], esize)) && \
-     ((abs(args[1] - args[0]) >= (vsize)) || ((abs(args[1] - args[0]) == 0))))
-
 #define OUTPUT_LOOP\
     char *op1 = args[1];\
     npy_intp os1 = steps[1];\
@@ -68,22 +62,6 @@
     npy_intp i;\
     for(i = 0; i < n; i++, ip1 += is1, op1 += os1, op2 += os2)
 
-/* align output to alignment
- * store alignment is usually more important than load alignment */
-#define UNARY_LOOP_BLOCK_ALIGN_OUT(type, alignment)\
-    type *ip1 = (type *)args[0], *op1 = (type *)args[1];\
-    npy_intp n = dimensions[0];\
-    npy_intp i, peel = npy_aligned_block_offset(op1, sizeof(type),\
-                                                alignment, n);\
-    for(i = 0; i < peel; i++)
-
-#define UNARY_LOOP_BLOCKED(type, vsize)\
-    for(; i < npy_blocked_end(peel, sizeof(type), vsize, n);\
-            i += (vsize / sizeof(type)))
-
-#define UNARY_LOOP_BLOCKED_END\
-    for (; i < n; i++)
-
 #define BINARY_LOOP\
     char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];\
     npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2];\
@@ -1298,48 +1276,25 @@ TIMEDELTA_mm_d_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *
  *****************************************************************************
  */
 
-
 /**begin repeat
+ * Float types
  * #type = npy_float, npy_double#
  * #TYPE = FLOAT, DOUBLE#
  * #scalarf = npy_sqrtf, npy_sqrt#
- * #vtype = __m128, __m128d#
- * #vsuf = ps, pd#
  */
+
 NPY_NO_EXPORT void
 @TYPE@_sqrt(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
 {
-#ifdef HAVE_EMMINTRIN_H
-    if (IS_BLOCKABLE_UNARY(sizeof(@type@), 16)) {
-        UNARY_LOOP_BLOCK_ALIGN_OUT(@type@, 16) {
-            op1[i] = @scalarf@(ip1[i]);
-        }
-        assert(npy_is_aligned(&op1[i], 16));
-        if (npy_is_aligned(&ip1[i], 16)) {
-            UNARY_LOOP_BLOCKED(@type@, 16) {
-                @vtype@ d = _mm_load_@vsuf@(&ip1[i]);
-                _mm_store_@vsuf@(&op1[i], _mm_sqrt_@vsuf@(d));
-            }
-        }
-        else {
-            UNARY_LOOP_BLOCKED(@type@, 16) {
-                @vtype@ d = _mm_loadu_@vsuf@(&ip1[i]);
-                _mm_store_@vsuf@(&op1[i], _mm_sqrt_@vsuf@(d));
-            }
-        }
-        UNARY_LOOP_BLOCKED_END {
-            op1[i] = @scalarf@(ip1[i]);
-        }
+    if (run_unary_simd_sqrt_@TYPE@(args, dimensions, steps)) {
+        return;
     }
-    else
-#endif
-    {
-        UNARY_LOOP {
-            const @type@ in1 = *(@type@ *)ip1;
-            *(@type@ *)op1 = @scalarf@(in1);
-        }
+    UNARY_LOOP {
+        const @type@ in1 = *(@type@ *)ip1;
+        *(@type@ *)op1 = @scalarf@(in1);
     }
 }
+
 /**end repeat**/
 
@@ -1568,6 +1523,9 @@ NPY_NO_EXPORT void
 NPY_NO_EXPORT void
 @TYPE@_absolute(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
 {
+    if (run_unary_simd_absolute_@TYPE@(args, dimensions, steps)) {
+        return;
+    }
     UNARY_LOOP {
         const @type@ in1 = *(@type@ *)ip1;
         const @type@ tmp = in1 > 0 ? in1 : -in1;
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
new file mode 100644
index 000000000..916473a0b
--- /dev/null
+++ b/numpy/core/src/umath/simd.inc.src
@@ -0,0 +1,173 @@
+/* -*- c -*- */
+
+/*
+ * This file is for the definitions of simd vectorized operations.
+ *
+ * Currently contains sse2 functions that are built on amd64, x32 or
+ * non-generic builds (CFLAGS=-march=...)
+ * In future it may contain other instruction sets like AVX or NEON detected
+ * at runtime in which case it needs to be included indirectly via a file
+ * compiled with special options (or use gcc target attributes) so the binary
+ * stays portable.
+ */
+
+
+#ifndef __NPY_SIMD_INC
+#define __NPY_SIMD_INC
+
+#include "lowlevel_strided_loops.h"
+#include "npy_config.h"
+#include <assert.h>
+#include <stdlib.h>
+
+/*
+ * stride is equal to element size and input and destination are equal or
+ * don't overlap within one register
+ */
+#define IS_BLOCKABLE_UNARY(esize, vsize) \
+    (steps[0] == (esize) && steps[0] == steps[1] && \
+     (npy_is_aligned(args[0], esize) && npy_is_aligned(args[1], esize)) && \
+     ((abs(args[1] - args[0]) >= (vsize)) || ((abs(args[1] - args[0]) == 0))))
+
+
+/* align var to alignment */
+#define UNARY_LOOP_BLOCK_ALIGN_VAR(var, type, alignment)\
+    npy_intp i, peel = npy_aligned_block_offset(var, sizeof(type),\
+                                                alignment, n);\
+    for(i = 0; i < peel; i++)
+
+#define UNARY_LOOP_BLOCKED(type, vsize)\
+    for(; i < npy_blocked_end(peel, sizeof(type), vsize, n);\
+            i += (vsize / sizeof(type)))
+
+#define UNARY_LOOP_BLOCKED_END\
+    for (; i < n; i++)
+
+
+/*
+ * Dispatcher functions
+ * decide whether the operation can be vectorized and run it
+ * if it was run returns true and false if nothing was done
+ */
+
+/**begin repeat
+ * Float types
+ *  #type = npy_float, npy_double, npy_longdouble#
+ *  #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
+ *  #vector = 1, 1, 0#
+ */
+
+/**begin repeat1
+ * #func = sqrt, absolute#
+ */
+
+#if @vector@
+
+/* prototypes */
+static void
+sse2_@func@_@TYPE@(@type@ * op, const @type@ * ip, const npy_intp n);
+
+#endif
+
+static NPY_INLINE int
+run_unary_simd_@func@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps)
+{
+#if @vector@ && defined HAVE_EMMINTRIN_H
+    if (IS_BLOCKABLE_UNARY(sizeof(@type@), 16)) {
+        sse2_@func@_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0]);
+        return 1;
+    }
+#endif
+    return 0;
+}
+
+/**end repeat1**/
+
+/**end repeat**/
+
+
+/*
+ * Vectorized operations
+ */
+
+/**begin repeat
+ *  #type = npy_float, npy_double#
+ *  #TYPE = FLOAT, DOUBLE#
+ *  #scalarf = npy_sqrtf, npy_sqrt#
+ *  #c = f, #
+ *  #vtype = __m128, __m128d#
+ *  #vpre = _mm, _mm#
+ *  #vsuf = ps, pd#
+ */
+
+#ifdef HAVE_EMMINTRIN_H
+#include <emmintrin.h>
+
+
+static void
+sse2_sqrt_@TYPE@(@type@ * op, const @type@ * ip, const npy_intp n)
+{
+    /* align output to 16 bytes */
+    UNARY_LOOP_BLOCK_ALIGN_VAR(op, @type@, 16) {
+        op[i] = @scalarf@(ip[i]);
+    }
+    assert(npy_is_aligned(&op[i], 16));
+    if (npy_is_aligned(&ip[i], 16)) {
+        UNARY_LOOP_BLOCKED(@type@, 16) {
+            @vtype@ d = @vpre@_load_@vsuf@(&ip[i]);
+            @vpre@_store_@vsuf@(&op[i], @vpre@_sqrt_@vsuf@(d));
+        }
+    }
+    else {
+        UNARY_LOOP_BLOCKED(@type@, 16) {
+            @vtype@ d = @vpre@_loadu_@vsuf@(&ip[i]);
+            @vpre@_store_@vsuf@(&op[i], @vpre@_sqrt_@vsuf@(d));
+        }
+    }
+    UNARY_LOOP_BLOCKED_END {
+        op[i] = @scalarf@(ip[i]);
+    }
+}
+
+
+static void
+sse2_absolute_@TYPE@(@type@ * op, const @type@ * ip, const npy_intp n)
+{
+    /*
+     * get 0x7FFFFFFF mask (everything but signbit set)
+     * float & ~mask will remove the sign
+     * this is equivalent to how the compiler implements fabs on amd64
+     */
+    const @vtype@ mask = @vpre@_set1_@vsuf@(-0.@c@);
+
+    /* align output to 16 bytes */
+    UNARY_LOOP_BLOCK_ALIGN_VAR(op, @type@, 16) {
+        const @type@ tmp = ip[i] > 0 ? ip[i]: -ip[i];
+        /* add 0 to clear -0.0 */
+        op[i] = tmp + 0;
+    }
+    assert(npy_is_aligned(&op[i], 16));
+    if (npy_is_aligned(&ip[i], 16)) {
+        UNARY_LOOP_BLOCKED(@type@, 16) {
+            @vtype@ a = @vpre@_load_@vsuf@(&ip[i]);
+            @vpre@_store_@vsuf@(&op[i], @vpre@_andnot_@vsuf@(mask, a));
+        }
+    }
+    else {
+        UNARY_LOOP_BLOCKED(@type@, 16) {
+            @vtype@ a = @vpre@_loadu_@vsuf@(&ip[i]);
+            @vpre@_store_@vsuf@(&op[i], @vpre@_andnot_@vsuf@(mask, a));
+        }
+    }
+    UNARY_LOOP_BLOCKED_END {
+        const @type@ tmp = ip[i] > 0 ? ip[i]: -ip[i];
+        /* add 0 to clear -0.0 */
+        op[i] = tmp + 0;
+    }
+}
+
+#endif
+
+
+/**end repeat**/
+
+#endif
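The three blocking macros above split every array into a scalar prologue (peel until the output pointer is 16-byte aligned), a vectorized body, and a scalar tail. A rough Python model of that index arithmetic (blocked_ranges is a hypothetical helper written for illustration; the real computation lives in npy_aligned_block_offset and npy_blocked_end from lowlevel_strided_loops.h):

    def blocked_ranges(addr, esize, vsize, n):
        # IS_BLOCKABLE_UNARY guarantees addr is already esize-aligned,
        # so peeling whole elements can reach vsize alignment
        peel = min(n, (-addr % vsize) // esize)
        # number of full vectors that fit after the peel
        nvec = (n - peel) // (vsize // esize)
        blocked_end = peel + nvec * (vsize // esize)
        return (range(0, peel),                            # scalar prologue
                range(peel, blocked_end, vsize // esize),  # vector body
                range(blocked_end, n))                     # scalar tail

    # a 10-element float64 output buffer starting 8 bytes past a 16-byte
    # boundary: one peeled element, four 2-element vectors, one tail element
    peel, body, tail = blocked_ranges(addr=8, esize=8, vsize=16, n=10)
    # list(peel) -> [0], list(body) -> [1, 3, 5, 7], list(tail) -> [9]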
diff --git a/numpy/core/tests/test_blasdot.py b/numpy/core/tests/test_blasdot.py
index fb215296f..2e99cf5b0 100644
--- a/numpy/core/tests/test_blasdot.py
+++ b/numpy/core/tests/test_blasdot.py
@@ -111,15 +111,15 @@ def test_dot_array_order():
                        dtype=arr_type, order=a_order)
         assert_array_equal(np.dot(a, a), a.dot(a))
         # (a.a)' = a'.a', note that mse~=1e-31 needs almost_equal
-        assert_almost_equal(a.dot(a), a.T.dot(a.T).T, decimal=30)
+        assert_almost_equal(a.dot(a), a.T.dot(a.T).T, decimal=prec)
 
         #
         # Check with making explicit copy
         #
         a_T = a.T.copy(order=a_order)
-        assert_array_equal(a_T.dot(a_T), a.T.dot(a.T))
-        assert_array_equal(a.dot(a_T), a.dot(a.T))
-        assert_array_equal(a_T.dot(a), a.T.dot(a))
+        assert_almost_equal(a_T.dot(a_T), a.T.dot(a.T), decimal=prec)
+        assert_almost_equal(a.dot(a_T), a.dot(a.T), decimal=prec)
+        assert_almost_equal(a_T.dot(a), a.T.dot(a), decimal=prec)
 
         #
         # Compare with multiarray dot
@@ -135,10 +135,10 @@ def test_dot_array_order():
             b = np.asarray(np.random.randn(a_dim, b_dim),
                            dtype=arr_type, order=b_order)
             b_T = b.T.copy(order=b_order)
-            assert_array_equal(a_T.dot(b), a.T.dot(b))
-            assert_array_equal(b_T.dot(a), b.T.dot(a))
+            assert_almost_equal(a_T.dot(b), a.T.dot(b), decimal=prec)
+            assert_almost_equal(b_T.dot(a), b.T.dot(a), decimal=prec)
             # (b'.a)' = a'.b
-            assert_almost_equal(b.T.dot(a), a.T.dot(b).T, decimal=30)
+            assert_almost_equal(b.T.dot(a), a.T.dot(b).T, decimal=prec)
             assert_almost_equal(a.dot(b), _dot(a, b), decimal=prec)
             assert_almost_equal(b.T.dot(a), _dot(b.T, a), decimal=prec)
 
@@ -147,7 +147,7 @@ def test_dot_array_order():
                 c = np.asarray(np.random.randn(b_dim, c_dim),
                                dtype=arr_type, order=c_order)
                 c_T = c.T.copy(order=c_order)
-                assert_array_equal(c.T.dot(b.T), c_T.dot(b_T))
-                assert_almost_equal(c.T.dot(b.T).T, b.dot(c), decimal=30)
+                assert_almost_equal(c.T.dot(b.T), c_T.dot(b_T), decimal=prec)
+                assert_almost_equal(c.T.dot(b.T).T, b.dot(c), decimal=prec)
                 assert_almost_equal(b.dot(c), _dot(b, c), decimal=prec)
                 assert_almost_equal(c.T.dot(b.T), _dot(c.T, b.T), decimal=prec)
diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py
index 2f6716c23..87903c28a 100644
--- a/numpy/core/tests/test_multiarray.py
+++ b/numpy/core/tests/test_multiarray.py
@@ -1207,7 +1207,7 @@ class TestFancyIndexing(TestCase):
         x[m] = 5
         assert_array_equal(x, array([1,5,3,4]))
 
-    def test_assign_mask(self):
+    def test_assign_mask2(self):
         xorig = array([[1,2,3,4],[5,6,7,8]])
         m = array([0,1],bool)
         m2 = array([[0,1],[1,0]],bool)
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index 1967db547..6bbb15e6b 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -348,7 +348,7 @@ class TestHypotSpecialValues(TestCase):
         assert_hypot_isnan(np.nan, np.nan)
         assert_hypot_isnan(np.nan, 1)
 
-    def test_nan_outputs(self):
+    def test_nan_outputs2(self):
         assert_hypot_isinf(np.nan, np.inf)
         assert_hypot_isinf(np.inf, np.nan)
         assert_hypot_isinf(np.inf, 0)
@@ -687,6 +687,32 @@ class TestSign(TestCase):
             np.seterr(**olderr)
 
 
+class TestAbsolute(TestCase):
+    def test_abs_blocked(self):
+        "simd tests on abs"
+        for dt in [np.float32, np.float64]:
+            for out, inp, msg in gen_alignment_data(dtype=dt, type='unary',
+                                                    max_size=17):
+                tgt = [ncu.absolute(i) for i in inp]
+                np.absolute(inp, out=out)
+                assert_equal(out, tgt, err_msg=msg)
+                self.assertTrue((out >= 0).all())
+
+                prev = np.geterr()
+                # will throw invalid flag depending on compiler optimizations
+                np.seterr(invalid='ignore')
+                for v in [np.nan, -np.inf, np.inf]:
+                    for i in range(inp.size):
+                        d = np.arange(inp.size, dtype=dt)
+                        inp[:] = -d
+                        inp[i] = v
+                        d[i] = -v if v == -np.inf else v
+                        assert_array_equal(np.abs(inp), d, err_msg=msg)
+                        np.abs(inp, out=out)
+                        assert_array_equal(out, d, err_msg=msg)
+                np.seterr(invalid=prev['invalid'])
+
+
 class TestSpecialMethods(TestCase):
     def test_wrap(self):
         class with_wrap(object):
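TestAbsolute above runs np.absolute over the alignment and size combinations produced by gen_alignment_data, so the peel, vector, and tail paths of the new SSE2 loop are all exercised. A comparable hand-rolled check for the vectorized sqrt loop could look like this (a sketch, not part of the commit):

    import numpy as np

    for dt in [np.float32, np.float64]:
        for size in range(1, 18):       # cover peel/body/tail splits
            for offset in range(4):     # cover misaligned starts
                inp = np.arange(offset + size, dtype=dt)[offset:]
                out = np.empty_like(inp)
                np.sqrt(inp, out=out)
                expected = np.array([x ** 0.5 for x in inp], dtype=dt)
                np.testing.assert_array_almost_equal(out, expected)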
diff --git a/numpy/linalg/linalg.py b/numpy/linalg/linalg.py
index ae0da3685..5d632fed7 100644
--- a/numpy/linalg/linalg.py
+++ b/numpy/linalg/linalg.py
@@ -20,10 +20,10 @@ import warnings
 
 from numpy.core import array, asarray, zeros, empty, transpose, \
         intc, single, double, csingle, cdouble, inexact, complexfloating, \
-        newaxis, ravel, all, Inf, dot, add, multiply, identity, sqrt, \
-        maximum, flatnonzero, diagonal, arange, fastCopyAndTranspose, sum, \
-        isfinite, size, finfo, absolute, log, exp, errstate, geterrobj
-from numpy.lib import triu
+        newaxis, ravel, all, Inf, dot, add, multiply, sqrt, maximum, \
+        fastCopyAndTranspose, sum, isfinite, size, finfo, errstate, \
+        geterrobj, longdouble, rollaxis, amin, amax
+from numpy.lib import triu, asfarray
 from numpy.linalg import lapack_lite, _umath_linalg
 from numpy.matrixlib.defmatrix import matrix_power
 from numpy.compat import asbytes
@@ -1865,7 +1865,38 @@ def lstsq(a, b, rcond=-1):
         st = s[:min(n, m)].copy().astype(result_real_t)
     return wrap(x), wrap(resids), results['rank'], st
 
-def norm(x, ord=None):
+
+def _multi_svd_norm(x, row_axis, col_axis, op):
+    """Compute the extreme singular values of the 2-D matrices in `x`.
+
+    This is a private utility function used by numpy.linalg.norm().
+
+    Parameters
+    ----------
+    x : ndarray
+    row_axis, col_axis : int
+        The axes of `x` that hold the 2-D matrices.
+    op : callable
+        This should be either numpy.amin or numpy.amax.
+
+    Returns
+    -------
+    result : float or ndarray
+        If `x` is 2-D, the return value is a float.
+        Otherwise, it is an array with ``x.ndim - 2`` dimensions.
+        The return values are either the minimum or maximum of the
+        singular values of the matrices, depending on whether `op`
+        is `numpy.amin` or `numpy.amax`.
+
+    """
+    if row_axis > col_axis:
+        row_axis -= 1
+    y = rollaxis(rollaxis(x, col_axis, x.ndim), row_axis, -1)
+    result = op(svd(y, compute_uv=0), axis=-1)
+    return result
+
+
+def norm(x, ord=None, axis=None):
     """
     Matrix or vector norm.
 
@@ -1875,16 +1906,22 @@ def norm(x, ord=None):
 
     Parameters
     ----------
-    x : {(M,), (M, N)} array_like
-        Input array.
+    x : array_like
+        Input array.  If `axis` is None, `x` must be 1-D or 2-D.
     ord : {non-zero int, inf, -inf, 'fro'}, optional
         Order of the norm (see table under ``Notes``). inf means numpy's
         `inf` object.
+    axis : {int, 2-tuple of ints, None}, optional
+        If `axis` is an integer, it specifies the axis of `x` along which to
+        compute the vector norms.  If `axis` is a 2-tuple, it specifies the
+        axes that hold 2-D matrices, and the matrix norms of these matrices
+        are computed.  If `axis` is None then either a vector norm (when `x`
+        is 1-D) or a matrix norm (when `x` is 2-D) is returned.
 
     Returns
     -------
-    n : float
-        Norm of the matrix or vector.
+    n : float or ndarray
+        Norm of the matrix or vector(s).
 
     Notes
     -----
@@ -1967,44 +2004,96 @@ def norm(x, ord=None):
     >>> LA.norm(a, -3)
     nan
 
+    Using the `axis` argument to compute vector norms:
+
+    >>> c = np.array([[ 1, 2, 3],
+    ...               [-1, 1, 4]])
+    >>> LA.norm(c, axis=0)
+    array([ 1.41421356,  2.23606798,  5.        ])
+    >>> LA.norm(c, axis=1)
+    array([ 3.74165739,  4.24264069])
+    >>> LA.norm(c, ord=1, axis=1)
+    array([6, 6])
+
+    Using the `axis` argument to compute matrix norms:
+
+    >>> m = np.arange(8).reshape(2,2,2)
+    >>> LA.norm(m, axis=(1,2))
+    array([  3.74165739,  11.22497216])
+    >>> LA.norm(m[0, :, :]), LA.norm(m[1, :, :])
+    (3.7416573867739413, 11.224972160321824)
+
     """
     x = asarray(x)
-    if ord is None: # check the default case first and handle it immediately
+
+    # Check the default case first and handle it immediately.
+    if ord is None and axis is None:
+        s = (x.conj() * x).real
         return sqrt(add.reduce((x.conj() * x).ravel().real))
 
+    # Normalize the `axis` argument to a tuple.
+    if axis is None:
+        axis = tuple(range(x.ndim))
+    elif not isinstance(axis, tuple):
+        axis = (axis,)
+
     nd = x.ndim
-    if nd == 1:
+    if len(axis) == 1:
         if ord == Inf:
-            return abs(x).max()
+            return abs(x).max(axis=axis)
         elif ord == -Inf:
-            return abs(x).min()
+            return abs(x).min(axis=axis)
         elif ord == 0:
-            return (x != 0).sum() # Zero norm
+            # Zero norm
+            return (x != 0).sum(axis=axis)
         elif ord == 1:
-            return abs(x).sum() # special case for speedup
-        elif ord == 2:
-            return sqrt(((x.conj()*x).real).sum()) # special case for speedup
+            # special case for speedup
+            return add.reduce(abs(x), axis=axis)
+        elif ord is None or ord == 2:
+            # special case for speedup
+            s = (x.conj() * x).real
+            return sqrt(add.reduce(s, axis=axis))
         else:
             try:
                 ord + 1
             except TypeError:
                 raise ValueError("Invalid norm order for vectors.")
-            return ((abs(x)**ord).sum())**(1.0/ord)
-    elif nd == 2:
+            if x.dtype.type is not longdouble:
+                # Convert to a float type, so integer arrays give
+                # float results.  Don't apply asfarray to longdouble arrays,
+                # because it will downcast to float64.
+                absx = asfarray(abs(x))
+            return add.reduce(absx**ord, axis=axis)**(1.0/ord)
+    elif len(axis) == 2:
+        row_axis, col_axis = axis
+        if not (-x.ndim <= row_axis < x.ndim and
+                -x.ndim <= col_axis < x.ndim):
+            raise ValueError('Invalid axis %r for an array with shape %r' %
+                             (axis, x.shape))
+        if row_axis % x.ndim == col_axis % x.ndim:
+            raise ValueError('Duplicate axes given.')
         if ord == 2:
-            return svd(x, compute_uv=0).max()
+            return _multi_svd_norm(x, row_axis, col_axis, amax)
         elif ord == -2:
-            return svd(x, compute_uv=0).min()
+            return _multi_svd_norm(x, row_axis, col_axis, amin)
         elif ord == 1:
-            return abs(x).sum(axis=0).max()
+            if col_axis > row_axis:
+                col_axis -= 1
+            return add.reduce(abs(x), axis=row_axis).max(axis=col_axis)
         elif ord == Inf:
-            return abs(x).sum(axis=1).max()
+            if row_axis > col_axis:
+                row_axis -= 1
+            return add.reduce(abs(x), axis=col_axis).max(axis=row_axis)
         elif ord == -1:
-            return abs(x).sum(axis=0).min()
+            if col_axis > row_axis:
+                col_axis -= 1
+            return add.reduce(abs(x), axis=row_axis).min(axis=col_axis)
         elif ord == -Inf:
-            return abs(x).sum(axis=1).min()
-        elif ord in ['fro','f']:
-            return sqrt(add.reduce((x.conj() * x).real.ravel()))
+            if row_axis > col_axis:
+                row_axis -= 1
+            return add.reduce(abs(x), axis=col_axis).min(axis=row_axis)
+        elif ord in [None, 'fro', 'f']:
+            return sqrt(add.reduce((x.conj() * x).real, axis=axis))
         else:
             raise ValueError("Invalid norm order for matrices.")
     else:
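For ord=2 and ord=-2 with a 2-tuple axis, _multi_svd_norm rolls the two matrix axes to the end and reduces the singular values of each slice along the last axis. The net effect, demonstrated through the public API (this mirrors the new test_axis test below):

    import numpy as np
    from numpy import linalg as LA

    B = np.arange(1., 25.).reshape(2, 3, 4)
    n = LA.norm(B, ord=2, axis=(1, 2))    # largest singular value per slice
    expected = [LA.norm(B[k], ord=2) for k in range(B.shape[0])]
    np.testing.assert_array_almost_equal(n, expected)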
diff --git a/numpy/linalg/tests/test_linalg.py b/numpy/linalg/tests/test_linalg.py
index 2dc55ab5e..881311c94 100644
--- a/numpy/linalg/tests/test_linalg.py
+++ b/numpy/linalg/tests/test_linalg.py
@@ -515,30 +515,37 @@ class TestEigh(HermitianTestCase, HermitianGeneralizedTestCase, TestCase):
 
 
 class _TestNorm(TestCase):
+
     dt = None
     dec = None
+
     def test_empty(self):
         assert_equal(norm([]), 0.0)
         assert_equal(norm(array([], dtype=self.dt)), 0.0)
         assert_equal(norm(atleast_2d(array([], dtype=self.dt))), 0.0)
 
     def test_vector(self):
-        a = [1.0,2.0,3.0,4.0]
-        b = [-1.0,-2.0,-3.0,-4.0]
-        c = [-1.0, 2.0,-3.0, 4.0]
+        a = [1, 2, 3, 4]
+        b = [-1, -2, -3, -4]
+        c = [-1, 2, -3, 4]
 
         def _test(v):
-            np.testing.assert_almost_equal(norm(v), 30**0.5, decimal=self.dec)
-            np.testing.assert_almost_equal(norm(v,inf), 4.0, decimal=self.dec)
-            np.testing.assert_almost_equal(norm(v,-inf), 1.0, decimal=self.dec)
-            np.testing.assert_almost_equal(norm(v,1), 10.0, decimal=self.dec)
-            np.testing.assert_almost_equal(norm(v,-1), 12.0/25,
-                                           decimal=self.dec)
-            np.testing.assert_almost_equal(norm(v,2), 30**0.5,
-                                           decimal=self.dec)
-            np.testing.assert_almost_equal(norm(v,-2), ((205./144)**-0.5),
-                                           decimal=self.dec)
-            np.testing.assert_almost_equal(norm(v,0), 4, decimal=self.dec)
+            np.testing.assert_almost_equal(norm(v), 30**0.5,
+                                           decimal=self.dec)
+            np.testing.assert_almost_equal(norm(v, inf), 4.0,
+                                           decimal=self.dec)
+            np.testing.assert_almost_equal(norm(v, -inf), 1.0,
+                                           decimal=self.dec)
+            np.testing.assert_almost_equal(norm(v, 1), 10.0,
+                                           decimal=self.dec)
+            np.testing.assert_almost_equal(norm(v, -1), 12.0/25,
+                                           decimal=self.dec)
+            np.testing.assert_almost_equal(norm(v, 2), 30**0.5,
+                                           decimal=self.dec)
+            np.testing.assert_almost_equal(norm(v, -2), ((205./144)**-0.5),
+                                           decimal=self.dec)
+            np.testing.assert_almost_equal(norm(v, 0), 4,
+                                           decimal=self.dec)
 
         for v in (a, b, c,):
             _test(v)
@@ -548,25 +555,82 @@ class _TestNorm(TestCase):
             _test(v)
 
     def test_matrix(self):
-        A = matrix([[1.,3.],[5.,7.]], dtype=self.dt)
-        A = matrix([[1.,3.],[5.,7.]], dtype=self.dt)
+        A = matrix([[1, 3], [5, 7]], dtype=self.dt)
         assert_almost_equal(norm(A), 84**0.5)
-        assert_almost_equal(norm(A,'fro'), 84**0.5)
-        assert_almost_equal(norm(A,inf), 12.0)
-        assert_almost_equal(norm(A,-inf), 4.0)
-        assert_almost_equal(norm(A,1), 10.0)
-        assert_almost_equal(norm(A,-1), 6.0)
-        assert_almost_equal(norm(A,2), 9.1231056256176615)
-        assert_almost_equal(norm(A,-2), 0.87689437438234041)
+        assert_almost_equal(norm(A, 'fro'), 84**0.5)
+        assert_almost_equal(norm(A, inf), 12.0)
+        assert_almost_equal(norm(A, -inf), 4.0)
+        assert_almost_equal(norm(A, 1), 10.0)
+        assert_almost_equal(norm(A, -1), 6.0)
+        assert_almost_equal(norm(A, 2), 9.1231056256176615)
+        assert_almost_equal(norm(A, -2), 0.87689437438234041)
 
         self.assertRaises(ValueError, norm, A, 'nofro')
         self.assertRaises(ValueError, norm, A, -3)
         self.assertRaises(ValueError, norm, A, 0)
 
+    def test_axis(self):
+        # Vector norms.
+        # Compare the use of `axis` with computing the norm of each row
+        # or column separately.
+        A = array([[1, 2, 3], [4, 5, 6]], dtype=self.dt)
+        for order in [None, -1, 0, 1, 2, 3, np.Inf, -np.Inf]:
+            expected0 = [norm(A[:,k], ord=order) for k in range(A.shape[1])]
+            assert_almost_equal(norm(A, ord=order, axis=0), expected0)
+            expected1 = [norm(A[k,:], ord=order) for k in range(A.shape[0])]
+            assert_almost_equal(norm(A, ord=order, axis=1), expected1)
+
+        # Matrix norms.
+        B = np.arange(1, 25, dtype=self.dt).reshape(2, 3, 4)
+
+        for order in [None, -2, 2, -1, 1, np.Inf, -np.Inf, 'fro']:
+            assert_almost_equal(norm(A, ord=order), norm(A, ord=order,
+                                                         axis=(0, 1)))
+
+            n = norm(B, ord=order, axis=(1, 2))
+            expected = [norm(B[k], ord=order) for k in range(B.shape[0])]
+            assert_almost_equal(n, expected)
+
+            n = norm(B, ord=order, axis=(2, 1))
+            expected = [norm(B[k].T, ord=order) for k in range(B.shape[0])]
+            assert_almost_equal(n, expected)
+
+            n = norm(B, ord=order, axis=(0, 2))
+            expected = [norm(B[:,k,:], ord=order) for k in range(B.shape[1])]
+            assert_almost_equal(n, expected)
+
+            n = norm(B, ord=order, axis=(0, 1))
+            expected = [norm(B[:,:,k], ord=order) for k in range(B.shape[2])]
+            assert_almost_equal(n, expected)
+
+    def test_bad_args(self):
+        # Check that bad arguments raise the appropriate exceptions.
+
+        A = array([[1, 2, 3], [4, 5, 6]], dtype=self.dt)
+        B = np.arange(1, 25, dtype=self.dt).reshape(2, 3, 4)
+
+        # Using `axis=<integer>` or passing in a 1-D array implies vector
+        # norms are being computed, so also using `ord='fro'` raises a
+        # ValueError.
+        self.assertRaises(ValueError, norm, A, 'fro', 0)
+        self.assertRaises(ValueError, norm, [3, 4], 'fro', None)
+
+        # Similarly, norm should raise an exception when ord is any finite
+        # number other than 1, 2, -1 or -2 when computing matrix norms.
+        for order in [0, 3]:
+            self.assertRaises(ValueError, norm, A, order, None)
+            self.assertRaises(ValueError, norm, A, order, (0, 1))
+            self.assertRaises(ValueError, norm, B, order, (1, 2))
+
+        # Invalid axis
+        self.assertRaises(ValueError, norm, B, None, 3)
+        self.assertRaises(ValueError, norm, B, None, (2, 3))
+        self.assertRaises(ValueError, norm, B, None, (0, 1, 2))
+
 
 class TestNormDouble(_TestNorm):
     dt = np.double
-    dec= 12
+    dec = 12
 
 
 class TestNormSingle(_TestNorm):
@@ -574,6 +638,11 @@ class TestNormSingle(_TestNorm):
     dec = 6
 
 
+class TestNormInt64(_TestNorm):
+    dt = np.int64
+    dec = 12
+
+
 class TestMatrixRank(object):
     def test_matrix_rank(self):
         # Full rank matrix
diff --git a/numpy/ma/tests/test_old_ma.py b/numpy/ma/tests/test_old_ma.py
index 208086f7b..baa6f69a1 100644
--- a/numpy/ma/tests/test_old_ma.py
+++ b/numpy/ma/tests/test_old_ma.py
@@ -423,7 +423,7 @@ class TestMa(TestCase):
         z = where(c, 1, masked)
         assert_(eq(z, [99, 1, 1, 99, 99, 99]))
 
-    def test_testMinMax(self):
+    def test_testMinMax2(self):
         "Test of minimum, maximum."
         assert_(eq(minimum([1, 2, 3], [4, 0, 9]), [1, 0, 3]))
         assert_(eq(maximum([1, 2, 3], [4, 0, 9]), [4, 2, 9]))
diff --git a/numpy/ma/tests/test_subclassing.py b/numpy/ma/tests/test_subclassing.py
index ab0caf96b..54b202684 100644
--- a/numpy/ma/tests/test_subclassing.py
+++ b/numpy/ma/tests/test_subclassing.py
@@ -116,7 +116,7 @@ class TestSubclassing(TestCase):
         self.assertTrue(isinstance(hypot(mx,mx), mmatrix))
         self.assertTrue(isinstance(hypot(mx,x), mmatrix))
 
-    def test_masked_binary_operations(self):
+    def test_masked_binary_operations2(self):
         "Tests domained_masked_binary_operation"
         (x, mx) = self.data
         xmx = masked_array(mx.data.__array__(), mask=mx.mask)
diff --git a/numpy/testing/utils.py b/numpy/testing/utils.py
index 7a3ea7a1c..2d8d57c5f 100644
--- a/numpy/testing/utils.py
+++ b/numpy/testing/utils.py
@@ -1580,13 +1580,13 @@ def gen_alignment_data(dtype=float32, type='binary', max_size=24):
                     (o, o, o, s, dtype, 'in place2')
                 yield out[1:], inp1()[:-1], inp2()[:-1], bfmt % \
                     (o + 1, o, o, s - 1, dtype, 'out of place')
-                yield out[-1:], inp1()[1:], inp2()[:-1], bfmt % \
+                yield out[:-1], inp1()[1:], inp2()[:-1], bfmt % \
                     (o, o + 1, o, s - 1, dtype, 'out of place')
-                yield out[-1:], inp1()[:-1], inp2()[1:], bfmt % \
+                yield out[:-1], inp1()[:-1], inp2()[1:], bfmt % \
                     (o, o, o + 1, s - 1, dtype, 'out of place')
                 yield inp1()[1:], inp1()[:-1], inp2()[:-1], bfmt % \
                     (o + 1, o, o, s - 1, dtype, 'aliased')
-                yield inp1()[-1:], inp1()[1:], inp2()[:-1], bfmt % \
+                yield inp1()[:-1], inp1()[1:], inp2()[:-1], bfmt % \
                     (o, o + 1, o, s - 1, dtype, 'aliased')
-                yield inp1()[-1:], inp1()[:-1], inp2()[1:], bfmt % \
+                yield inp1()[:-1], inp1()[:-1], inp2()[1:], bfmt % \
                     (o, o, o + 1, s - 1, dtype, 'aliased')