7 files changed, 1942 insertions, 0 deletions
diff --git a/numpy/f2py/doc/multiarray/array_from_pyobj.c b/numpy/f2py/doc/multiarray/array_from_pyobj.c
new file mode 100644
index 000000000..7e0de9a74
--- /dev/null
+++ b/numpy/f2py/doc/multiarray/array_from_pyobj.c
@@ -0,0 +1,323 @@
+/* 
+ * File: array_from_pyobj.c 
+ *
+ * Description:
+ * ------------ 
+ * Provides array_from_pyobj function that returns a contigious array
+ * object with the given dimensions and required storage order, either
+ * in row-major (C) or column-major (Fortran) order. The function
+ * array_from_pyobj is very flexible about its Python object argument
+ * that can be any number, list, tuple, or array.
+ * 
+ * array_from_pyobj is used in f2py generated Python extension
+ * modules.
+ *
+ * Author: Pearu Peterson <pearu@cens.ioc.ee>
+ * Created: 13-16 January 2002
+ * $Id: array_from_pyobj.c,v 1.1 2002/01/16 18:57:33 pearu Exp $
+ */
+
+
+#define ARR_IS_NULL(arr,mess) \
+if (arr==NULL) { \
+    fprintf(stderr,"array_from_pyobj:" mess); \
+    return NULL; \
+}
+
+#define CHECK_DIMS_DEFINED(rank,dims,mess) \
+if (count_nonpos(rank,dims)) { \
+  fprintf(stderr,"array_from_pyobj:" mess); \
+  return NULL; \
+}
+
+#define HAS_PROPER_ELSIZE(arr,type_num) \
+  ((PyArray_DescrFromType(type_num)->elsize) == (arr)->descr->elsize)
+
+/* static */
+/* void f2py_show_args(const int type_num, */
+/* 		    const int *dims, */
+/* 		    const int rank, */
+/* 		    const int intent) { */
+/*   int i; */
+/*   fprintf(stderr,"array_from_pyobj:\n\ttype_num=%d\n\trank=%d\n\tintent=%d\n",\ */
+/* 	  type_num,rank,intent); */
+/*   for (i=0;i<rank;++i) */
+/*     fprintf(stderr,"\tdims[%d]=%d\n",i,dims[i]); */
+/* } */
+
+static
+int count_nonpos(const int rank,
+		 const int *dims) {
+  int i=0,r=0;
+  while (i<rank) {
+    if (dims[i] <= 0) ++r;
+    ++i;
+  }
+  return r;
+}
+
+static void lazy_transpose(PyArrayObject* arr);
+static int check_and_fix_dimensions(const PyArrayObject* arr,
+				    const int rank,
+				    int *dims);
+static
+int array_has_column_major_storage(const PyArrayObject *ap);
+
+static
+PyArrayObject* array_from_pyobj(const int type_num,
+				int *dims,
+				const int rank,
+				const int intent,
+				PyObject *obj) {
+  /* Note about reference counting
+     -----------------------------
+     If the caller returns the array to Python, it must be done with
+     Py_BuildValue("N",arr).
+     Otherwise, if obj!=arr then the caller must call Py_DECREF(arr).
+  */
+
+/*   f2py_show_args(type_num,dims,rank,intent); */
+
+  if (intent & F2PY_INTENT_CACHE) {
+    /* Don't expect correct storage order or anything reasonable when
+       returning cache array. */ 
+    if ((intent & F2PY_INTENT_HIDE) 
+	|| (obj==Py_None)) {
+      PyArrayObject *arr = NULL;
+      CHECK_DIMS_DEFINED(rank,dims,"optional,intent(cache) must"
+			 " have defined dimensions.\n");
+      arr = (PyArrayObject *)PyArray_FromDims(rank,dims,type_num);
+      ARR_IS_NULL(arr,"FromDims failed: optional,intent(cache)\n");
+      if (intent & F2PY_INTENT_OUT)
+	Py_INCREF(arr);
+      return arr;
+    }
+    if (PyArray_Check(obj) 
+	&& ISCONTIGUOUS((PyArrayObject *)obj)
+	&& HAS_PROPER_ELSIZE((PyArrayObject *)obj,type_num)
+	) {
+      if (check_and_fix_dimensions((PyArrayObject *)obj,rank,dims))
+	return NULL; /*XXX: set exception */
+      if (intent & F2PY_INTENT_OUT)
+	Py_INCREF(obj);
+      return (PyArrayObject *)obj;
+    }
+    ARR_IS_NULL(NULL,"intent(cache) must be contiguous array with a proper elsize.\n");
+  }
+
+  if (intent & F2PY_INTENT_HIDE) {
+    PyArrayObject *arr = NULL;
+    CHECK_DIMS_DEFINED(rank,dims,"intent(hide) must have defined dimensions.\n");
+    arr = (PyArrayObject *)PyArray_FromDims(rank,dims,type_num);
+    ARR_IS_NULL(arr,"FromDims failed: intent(hide)\n");
+    if (intent & F2PY_INTENT_OUT) {
+      if ((!(intent & F2PY_INTENT_C)) && (rank>1)) {
+	lazy_transpose(arr);
+	arr->flags &= ~CONTIGUOUS;
+      }
+      Py_INCREF(arr);
+    }
+    return arr;
+  }
+
+  if (PyArray_Check(obj)) { /* here we have always intent(in) or
+			       intent(inout) */
+
+    PyArrayObject *arr = (PyArrayObject *)obj;
+    int is_cont = (intent & F2PY_INTENT_C) ? 
+      (ISCONTIGUOUS(arr)) : (array_has_column_major_storage(arr));
+
+    if (check_and_fix_dimensions(arr,rank,dims))
+      return NULL; /*XXX: set exception */
+
+    if ((intent & F2PY_INTENT_COPY)
+	|| (! (is_cont
+	       && HAS_PROPER_ELSIZE(arr,type_num)
+	       && PyArray_CanCastSafely(arr->descr->type_num,type_num)))) {
+      PyArrayObject *tmp_arr = NULL;
+      if (intent & F2PY_INTENT_INOUT) {
+	ARR_IS_NULL(NULL,"intent(inout) array must be contiguous and"
+		    " with a proper type and size.\n")
+	  }
+      if ((rank>1) && (! (intent & F2PY_INTENT_C)))
+	lazy_transpose(arr);
+      if (PyArray_CanCastSafely(arr->descr->type_num,type_num)) {
+	tmp_arr = (PyArrayObject *)PyArray_CopyFromObject(obj,type_num,0,0);
+	ARR_IS_NULL(arr,"CopyFromObject failed: array.\n");
+      } else {
+	tmp_arr = (PyArrayObject *)PyArray_FromDims(arr->nd,
+						    arr->dimensions,
+						    type_num);
+	ARR_IS_NULL(tmp_arr,"FromDims failed: array with unsafe cast.\n");
+	if (copy_ND_array(arr,tmp_arr))
+	  ARR_IS_NULL(NULL,"copy_ND_array failed: array with unsafe cast.\n");
+      }
+      if ((rank>1) && (! (intent & F2PY_INTENT_C))) {
+	lazy_transpose(arr);
+	lazy_transpose(tmp_arr);
+	tmp_arr->flags &= ~CONTIGUOUS;
+      }
+      arr = tmp_arr;
+    }
+    if (intent & F2PY_INTENT_OUT)
+      Py_INCREF(arr);
+      return arr;
+  }
+
+  if ((obj==Py_None) && (intent & F2PY_OPTIONAL)) {
+    PyArrayObject *arr = NULL;
+    CHECK_DIMS_DEFINED(rank,dims,"optional must have defined dimensions.\n");    
+    arr = (PyArrayObject *)PyArray_FromDims(rank,dims,type_num);
+    ARR_IS_NULL(arr,"FromDims failed: optional.\n");
+    if (intent & F2PY_INTENT_OUT) {
+      if ((!(intent & F2PY_INTENT_C)) && (rank>1)) {
+	lazy_transpose(arr);
+	arr->flags &= ~CONTIGUOUS;
+      }
+      Py_INCREF(arr);
+    }
+    return arr;
+  }
+
+  if (intent & F2PY_INTENT_INOUT) {
+    ARR_IS_NULL(NULL,"intent(inout) argument must be an array.\n");
+  }
+
+  {
+    PyArrayObject *arr = (PyArrayObject *) \
+      PyArray_ContiguousFromObject(obj,type_num,0,0);
+    ARR_IS_NULL(arr,"ContiguousFromObject failed: not a sequence.\n");
+    if (check_and_fix_dimensions(arr,rank,dims))
+      return NULL; /*XXX: set exception */
+    if ((rank>1) && (! (intent & F2PY_INTENT_C))) {
+      PyArrayObject *tmp_arr = NULL;
+      lazy_transpose(arr);
+      arr->flags &= ~CONTIGUOUS;
+      tmp_arr = (PyArrayObject *) PyArray_CopyFromObject((PyObject *)arr,type_num,0,0);
+      Py_DECREF(arr);
+      arr = tmp_arr;
+      ARR_IS_NULL(arr,"CopyFromObject(Array) failed: intent(fortran)\n");
+      lazy_transpose(arr);
+      arr->flags &= ~CONTIGUOUS;
+    }
+    if (intent & F2PY_INTENT_OUT)
+      Py_INCREF(arr);
+    return arr;
+  }
+
+}
+
+           /*****************************************/
+           /* Helper functions for array_from_pyobj */
+           /*****************************************/
+
+static
+int array_has_column_major_storage(const PyArrayObject *ap) {
+  /* array_has_column_major_storage(a) is equivalent to
+     transpose(a).iscontiguous() but more efficient.
+
+     This function can be used in order to decide whether to use a
+     Fortran or C version of a wrapped function. This is relevant, for
+     example, in choosing a clapack or flapack function depending on
+     the storage order of array arguments.
+  */
+  int sd;
+  int i;
+  sd = ap->descr->elsize;
+  for (i=0;i<ap->nd;++i) {
+    if (ap->dimensions[i] == 0) return 1;
+    if (ap->strides[i] != sd) return 0;
+    sd *= ap->dimensions[i];
+  }
+  return 1;
+}
+
+static
+void lazy_transpose(PyArrayObject* arr) {
+  /*
+    Changes the order of array strides and dimensions.  This
+    corresponds to the lazy transpose of a Numeric array in-situ.
+    Note that this function is assumed to be used even times for a
+    given array. Otherwise, the caller should set flags &= ~CONTIGUOUS.
+   */
+  int rank,i,s,j; 
+  rank = arr->nd; 
+  if (rank < 2) return;
+
+  for(i=0,j=rank-1;i<rank/2;++i,--j) {
+    s = arr->strides[i];
+    arr->strides[i] = arr->strides[j];
+    arr->strides[j] = s;
+    s = arr->dimensions[i];
+    arr->dimensions[i] = arr->dimensions[j];
+    arr->dimensions[j] = s;
+  }
+}
+
+static
+int check_and_fix_dimensions(const PyArrayObject* arr,const int rank,int *dims) {
+  /*
+    This function fills in blanks (that are -1's) in dims list using
+    the dimensions from arr. It also checks that non-blank dims will
+    match with the corresponding values in arr dimensions.
+   */
+  const int arr_size = (arr->nd)?PyArray_Size((PyObject *)arr):1;
+
+  if (rank > arr->nd) { /* [1,2] -> [[1],[2]]; 1 -> [[1]]  */
+    int new_size = 1;
+    int free_axe = -1;
+    int i;
+    /* Fill dims where -1 or 0; check dimensions; calc new_size; */
+    for(i=0;i<arr->nd;++i) { 
+      if (dims[i] >= 0) {
+	if (dims[i]!=arr->dimensions[i]) {
+	  fprintf(stderr,"%d-th dimension must be fixed to %d but got %d\n",
+		  i,dims[i],arr->dimensions[i]);
+	  return 1;
+	}
+	if (!dims[i]) dims[i] = 1;
+      } else {
+	dims[i] = arr->dimensions[i] ? arr->dimensions[i] : 1;
+      }
+      new_size *= dims[i];
+    }
+    for(i=arr->nd;i<rank;++i)
+      if (dims[i]>1) {
+	fprintf(stderr,"%d-th dimension must be %d but got 0 (not defined).\n",
+		i,dims[i]);
+	return 1;
+      } else if (free_axe<0)
+	free_axe = i;
+      else
+	dims[i] = 1;
+    if (free_axe>=0) {
+      dims[free_axe] = arr_size/new_size;
+      new_size *= dims[free_axe];
+    }
+    if (new_size != arr_size) {
+      fprintf(stderr,"confused: new_size=%d, arr_size=%d (maybe too many free"
+	      " indices)\n",new_size,arr_size);
+      return 1;
+    }
+  } else {
+    int i;
+    for (i=rank;i<arr->nd;++i)
+      if (arr->dimensions[i]>1) {
+	fprintf(stderr,"too many axes: %d, expected rank=%d\n",arr->nd,rank);
+	return 1;
+      }
+    for (i=0;i<rank;++i)
+      if (dims[i]>=0) {
+	if (arr->dimensions[i]!=dims[i]) {
+	  fprintf(stderr,"%d-th dimension must be fixed to %d but got %d\n",
+		  i,dims[i],arr->dimensions[i]);
+	  return 1;	  
+	}
+	if (!dims[i]) dims[i] = 1;	
+      } else
+	dims[i] = arr->dimensions[i];
+  }
+  return 0;
+}
+
+/* End of file: array_from_pyobj.c */
diff --git a/numpy/f2py/doc/multiarray/bar.c b/numpy/f2py/doc/multiarray/bar.c
new file mode 100644
index 000000000..350636ea6
--- /dev/null
+++ b/numpy/f2py/doc/multiarray/bar.c
@@ -0,0 +1,15 @@
+
+#include <stdio.h>
+
+void bar(int *a,int m,int n) {
+  int i,j;
+  printf("C:");
+  printf("m=%d, n=%d\n",m,n);
+  for (i=0;i<m;++i) {
+    printf("Row %d:\n",i+1);
+    for (j=0;j<n;++j)
+      printf("a(i=%d,j=%d)=%d\n",i,j,a[n*i+j]);
+  }
+  if (m*n)
+    a[0] = 7777;
+}
diff --git a/numpy/f2py/doc/multiarray/foo.f b/numpy/f2py/doc/multiarray/foo.f
new file mode 100644
index 000000000..f8c39c4d1
--- /dev/null
+++ b/numpy/f2py/doc/multiarray/foo.f
@@ -0,0 +1,13 @@
+      subroutine foo(a,m,n)
+      integer a(m,n), m,n,i,j
+      print*, "F77:"
+      print*, "m=",m,", n=",n
+      do 100,i=1,m
+         print*, "Row ",i,":"
+         do 50,j=1,n
+            print*, "a(i=",i,",j=",j,") = ",a(i,j)
+ 50      continue
+ 100  continue
+      if (m*n.gt.0) a(1,1) = 77777
+      end
+      
diff --git a/numpy/f2py/doc/multiarray/fortran_array_from_pyobj.txt b/numpy/f2py/doc/multiarray/fortran_array_from_pyobj.txt
new file mode 100644
index 000000000..c7b945c84
--- /dev/null
+++ b/numpy/f2py/doc/multiarray/fortran_array_from_pyobj.txt
@@ -0,0 +1,284 @@
+
+    _____________________________________________________________
+   /   Proposed internal structure for f2py generated extension  \
+  <    modules regarding the issues with different storage-orders >
+   \   of multi-dimensional matrices in Fortran and C.           /
+    =============================================================
+
+Author: Pearu Peterson
+Date:   14 January, 2001
+
+Definitions:
+============
+
+In the following I will use the following definitions:
+
+1) A matrix is a mathematical object that represents a collection of
+   objects (elements), usually visualized in a table form, and one can
+   define a set of various (algebraic,etc) operations for matrices.
+   One can think of a matrix as a defintion of a certain mapping:
+         (i) |--> A(i)
+   where i belongs to the set of indices (an index itself can be a
+   sequence of objects, for example, a sequence of integers) and A(i)
+   is an element from a specified set, for example, a set of fruits.
+   Symbol A then denotes a matrix of fruits.
+
+2) An array is a storage object that represents a collection of
+   objects stored in a certain systematic way, for example, as an
+   ordered sequence in computer memory.
+
+In order to manipulate matrices using computers, one must store matrix
+elements in computer memory. In the following, I will assume that the
+elements of a matrix is stored as an array. There is no unique way in
+which order one should save matrix elements in the array. However, in
+C and Fortran programming languages, two, unfortunately different,
+conventions are used.
+
+Aim:
+====
+
+The purpose of this writing is to work out an interface for Python
+language so that C and Fortran routines can be called without
+bothering about how multi-dimensional matrices are stored in memory.
+For example, accessing a matrix element A[i,j] in Python will be
+equivalent to accessing the same matrix in C, using A[i][j], or in
+Fortran, using A(i,j).
+
+External conditions:
+====================
+
+In C programming language, it is custom to think that matrices are
+stored in the so-called row-major order, that is, a matrix is stored
+row by row, each row is as a contiguous array in computer memory.
+
+In Fortran programming language, matrices are stored in the
+column-major order: each column is a contiguous array in computer
+memory.
+
+In Python programming language, matrices can be stored using Python
+Numeric array() function that uses internally C approach, that is,
+elements of matrices are stored in row-major order. For example,
+A = array([[1,2,3],[4,5,6]]) represents a 2-by-3 matrix
+
+             / 1   2   3 \
+             |           |
+             \ 4   5   6 /
+
+and its elements are stored in computer memory as the following array:
+
+         1  2  3  4  5  6
+
+The same matrix, if used in Fortran, would be stored in computer
+memory as the following array:
+
+         1  4  2  5  3  6
+
+Problem and solution:
+=====================
+
+A problem arises if one wants to use the same matrix both in C and in
+Fortran functions. Then the difference in storage order of a matrix
+elements must be taken into account. This technical detail can be very
+confusing even for an experienced programmer. This is because when
+passing a matrix to a Fortran subroutine, you must (mentally or
+programmically) transpose the matrix and when the subroutine returns,
+you must transpose it back.
+
+As will be discussed below, there is a way to overcome these
+difficulties in Python by creating an interface between Python and
+Fortran code layers that takes care of this transition internally. So
+that if you will read the manual pages of the Fortran codes, then you
+need not to think about how matrices are actually stored, the storage
+order will be the same, seemingly.
+
+Python / C / Fortran interface:
+===============================
+
+The interface between Python and Fortran codes will use the following
+Python Numeric feature: transposing a Numeric array does not involve
+copying of its data but just permuting the dimensions and strides of
+the array (the so-called lazy transpose).
+
+However, when passing a Numeric array data pointer to Fortran or C
+function, the data must be contiguous in memory. If it is not, then
+data is rearranged inplace. I don't think that it can be avoided.
+This is certainly a penalty hit to performance. However, one can
+easily avoid it by creating a Numeric array with the right storage
+order, so that after transposing, the array data will be contiguous in
+memory and the data pointer can safely passed on to the Fortran
+subroutine.  This lazy-transpose operation will be done within the
+interface and users need not to bother about this detail anymore (that
+is, after they initialize Numeric array with matrix elements using the
+proper order. Of course, the proper order depends on the target
+function: C or Fortran). The interface should be smart enough to
+minimize the need of real-transpose operations and the need to
+additional memory storage as well.
+
+Statement of the problem:
+=========================
+
+Consider a M-by-N matrix A of integers, where M and N are the number A
+rows and columns, respectively. 
+
+In Fortran language, the storing array of this matrix can be defined
+as follows:
+
+      integer A(M,N)
+
+in C:
+
+      int A[M][N];
+
+and in Python:
+
+      A = Numeric.zeros((M,N),'i')
+
+Consider also the corresponding Fortran and C functions that
+that use matrix arguments:
+
+Fortran:
+      subroutine FUN(A,M,N)
+      integer A(M,N)
+      ...
+      end
+C:
+      void cun(int *a,int m,int n) {
+      ...
+      }
+
+and the corresponding Python interface signatures:
+
+      def py_fun(a):
+          ...
+      def py_cun(a):
+          ...
+
+Main goal:
+==========
+
+Our goal is to generate Python C/API functions py_fun and py_cun such
+that their usage in Python would be identical. The cruical part of
+their implementation are in functions that take a PyObject and
+return a PyArrayObject such that it is contiguous and its data pointer
+is suitable for passing on to the arguments of C or Fortran functions.
+The prototypes of these functions are:
+
+PyArrayObject* fortran_array_from_pyobj(
+     int typecode,
+     int *dims,
+     int rank,
+     int intent,
+     PyObject *obj);
+
+and
+
+PyArrayObject* c_array_from_pyobj(
+     int typecode,
+     int *dims,
+     int rank,
+     int intent,
+     PyObject *obj);
+
+for wrapping Fortran and C functions, respectively.
+
+Pseudo-code for fortran_array_from_pyobj:
+=========================================
+
+if type(obj) is ArrayType:
+    #raise not check(len(ravel(obj)) >= dims[0]*dims[1]*...*dims[rank-1])
+    if obj.typecode is typecode:
+        if is_contiguous(obj):
+            transpose_data_inplace(obj) # real-transpose
+            set_transpose_strides(obj)  # lazy-transpose
+            Py_INCREF(obj);
+            return obj
+        set_transpose_strides(obj)       
+        if is_contiguous(obj):
+            set_transpose_strides(obj)
+            Py_INCREF(obj);
+            return obj
+        else:
+            tmp_obj = PyArray_ContiguousFromObject(obj,typecode,0,0)
+            swap_datapointer_and_typeinfo(obj,tmp_obj)
+            Py_DECREF(tmp_obj);
+            set_transpose_strides(obj)
+            Py_INCREF(obj);
+            return obj
+    else:
+        tmp_obj = PyArray_FromDims(rank,dims,typecode)
+        set_transpose_strides(tmp_obj)
+        if intent in [in,inout]:
+            copy_ND_array(obj,tmp_obj)
+        swap_datapointer_and_typeinfo(obj,tmp_obj)
+        Py_DECREF(tmp_obj);
+        Py_INCREF(obj);
+        return obj
+elif obj is None: # happens when only intent is 'hide'
+    tmp_obj = PyArray_FromDims(rank,dims,typecode)
+    if intent is out:
+        set_transpose_strides(tmp_obj)
+    # otherwise tmp_obj->data is used as a work array
+    Py_INCREF(tmp_obj)
+    return tmp_obj
+else:
+    tmp_obj = PyArray_ContiguousFromObject(obj,typecode,0,0)
+    #raise not check(len(ravel(obj)) >= dims[0]*dims[1]*...*dims[rank-1])
+    set_transpose_strides(tmp_obj)
+    transpose_data_inplace(tmp_obj)
+    Py_INCREF(tmp_obj)
+    return tmp_obj
+
+Notes:
+    1) CPU expensive tasks are in transpose_data_inplace and
+       copy_ND_array, PyArray_ContiguousFromObject.
+    2) Memory expensive tasks are in PyArray_FromDims,
+       PyArray_ContiguousFromObject
+    3) Side-effects are expected when set_transpose_strides and
+    transpose_data_inplace are used. For example:
+        >>> a = Numeric([[1,2,3],[4,5,6]],'d')
+        >>> a.is_contiguous()
+        1
+        >>> py_fun(a)
+        >>> a.typecode()
+        'i'
+        >>> a.is_contiguous()
+        0
+        >>> transpose(a).is_contiguous()
+        1
+
+Pseudo-code for c_array_from_pyobj:
+===================================
+
+if type(obj) is ArrayType:
+    #raise not check(len(ravel(obj)) >= dims[0]*dims[1]*...*dims[rank-1])
+    if obj.typecode is typecode:
+        if is_contiguous(obj):
+            Py_INCREF(obj);
+            return obj
+        else:
+            tmp_obj = PyArray_ContiguousFromObject(obj,typecode,0,0)
+            swap_datapointer_and_typeinfo(obj,tmp_obj)
+            Py_DECREF(tmp_obj);
+            Py_INCREF(obj);
+            return obj
+    else:
+        tmp_obj = PyArray_FromDims(rank,dims,typecode)
+        if intent in [in,inout]:
+            copy_ND_array(obj,tmp_obj)
+        swap_datapointer_and_typeinfo(obj,tmp_obj)
+        Py_DECREF(tmp_obj);
+        Py_INCREF(obj);
+        return obj
+elif obj is None: # happens when only intent is 'hide'
+    tmp_obj = PyArray_FromDims(rank,dims,typecode)
+    Py_INCREF(tmp_obj)
+    return tmp_obj
+else:
+    tmp_obj = PyArray_ContiguousFromObject(obj,typecode,0,0)
+    #raise not check(len(ravel(obj)) >= dims[0]*dims[1]*...*dims[rank-1])
+    Py_INCREF(tmp_obj)
+    return tmp_obj
+
+
+14 January, 2002
+Pearu Peterson <pearu@cens.ioc.ee>
+\ No newline at end of file
diff --git a/numpy/f2py/doc/multiarray/fun.pyf b/numpy/f2py/doc/multiarray/fun.pyf
new file mode 100644
index 000000000..ed5d1923f
--- /dev/null
+++ b/numpy/f2py/doc/multiarray/fun.pyf
@@ -0,0 +1,89 @@
+!%f90 -*- f90 -*-
+
+!  Example:
+!    Using f2py for wrapping multi-dimensional Fortran and C arrays
+!    [NEW APPROACH, use it with f2py higher than 2.8.x]
+!  $Id: fun.pyf,v 1.3 2002/01/18 10:06:50 pearu Exp $
+
+! Usage (with gcc compiler):
+!   f2py -c fun.pyf foo.f bar.c
+
+python module fun ! in 
+    interface  ! in :fun
+
+! >>> from Numeric import *
+! >>> import fun
+! >>> a=array([[1,2,3],[4,5,6]])
+
+        subroutine foo(a,m,n) ! in :fun:foo.f
+          integer dimension(m,n) :: a
+          intent(in,out,copy) :: a
+          integer optional,check(shape(a,0)==m),depend(a) :: m=shape(a,0)
+          integer optional,check(shape(a,1)==n),depend(a) :: n=shape(a,1)
+        end subroutine foo
+
+! >>> print fun.foo.__doc__
+! foo - Function signature:
+!   a = foo(a,[m,n])
+! Required arguments:
+!   a : input rank-2 array('i') with bounds (m,n)
+! Optional arguments:
+!   m := shape(a,0) input int
+!   n := shape(a,1) input int
+! Return objects:
+!   a : rank-2 array('i') with bounds (m,n)
+
+! >>> print fun.foo(a)
+!  F77:
+!  m= 2, n= 3
+!  Row  1:
+!  a(i= 1,j= 1) =  1
+!  a(i= 1,j= 2) =  2
+!  a(i= 1,j= 3) =  3
+!  Row  2:
+!  a(i= 2,j= 1) =  4
+!  a(i= 2,j= 2) =  5
+!  a(i= 2,j= 3) =  6
+! [[77777     2     3]
+!  [    4     5     6]]
+
+
+        subroutine bar(a,m,n)
+          intent(c)
+          intent(c) bar
+          integer dimension(m,n) :: a
+          intent(in,out) :: a
+          integer optional,check(shape(a,0)==m),depend(a) :: m=shape(a,0)
+          integer optional,check(shape(a,1)==n),depend(a) :: n=shape(a,1)
+          intent(in) m,n
+        end subroutine bar
+
+! >>> print fun.bar.__doc__
+! bar - Function signature:
+!   a = bar(a,[m,n])
+! Required arguments:
+!   a : input rank-2 array('i') with bounds (m,n)
+! Optional arguments:
+!   m := shape(a,0) input int
+!   n := shape(a,1) input int
+! Return objects:
+!   a : rank-2 array('i') with bounds (m,n)
+
+! >>> print fun.bar(a)
+! C:m=2, n=3
+! Row 1:
+! a(i=0,j=0)=1
+! a(i=0,j=1)=2
+! a(i=0,j=2)=3
+! Row 2:
+! a(i=1,j=0)=4
+! a(i=1,j=1)=5
+! a(i=1,j=2)=6
+! [[7777    2    3]
+!  [   4    5    6]]
+
+    end interface 
+end python module fun
+
+! This file was auto-generated with f2py (version:2.9.166).
+! See http://cens.ioc.ee/projects/f2py2e/
diff --git a/numpy/f2py/doc/multiarray/run.pyf b/numpy/f2py/doc/multiarray/run.pyf
new file mode 100644
index 000000000..bb12a439b
--- /dev/null
+++ b/numpy/f2py/doc/multiarray/run.pyf
@@ -0,0 +1,91 @@
+!%f90 -*- f90 -*-
+
+!  Example:
+!    Using f2py for wrapping multi-dimensional Fortran and C arrays
+!    [OLD APPROACH, do not use it with f2py higher than 2.8.x]
+!  $Id: run.pyf,v 1.1 2002/01/14 15:49:46 pearu Exp $
+
+! Usage (with gcc compiler):
+!   f2py -c run.pyf foo.f bar.c -DNO_APPEND_FORTRAN
+
+python module run ! in 
+    interface  ! in :run
+
+! >>> from Numeric import *
+! >>> import run
+! >>> a=array([[1,2,3],[4,5,6]],'i')
+
+        subroutine foo(a,m,n)
+          fortranname foo_
+          integer dimension(m,n) :: a
+          integer optional,check(shape(a,1)==m),depend(a) :: m=shape(a,1)
+          integer optional,check(shape(a,0)==n),depend(a) :: n=shape(a,0)
+        end subroutine foo
+
+! >>> print run.foo.__doc__
+! foo - Function signature:
+!   foo(a,[m,n])
+! Required arguments:
+!   a : input rank-2 array('i') with bounds (n,m)
+! Optional arguments:
+!   m := shape(a,1) input int
+!   n := shape(a,0) input int
+
+! >>> run.foo(a)
+!  F77:
+!  m= 3, n= 2
+!  Row  1:
+!  a(i= 1,j= 1) =  1
+!  a(i= 1,j= 2) =  4
+!  Row  2:
+!  a(i= 2,j= 1) =  2
+!  a(i= 2,j= 2) =  5
+!  Row  3:
+!  a(i= 3,j= 1) =  3
+!  a(i= 3,j= 2) =  6
+
+! >>> run.foo(transpose(a))
+!  F77:
+!  m= 2, n= 3
+!  Row  1:
+!  a(i= 1,j= 1) =  1
+!  a(i= 1,j= 2) =  2
+!  a(i= 1,j= 3) =  3
+!  Row  2:
+!  a(i= 2,j= 1) =  4
+!  a(i= 2,j= 2) =  5
+!  a(i= 2,j= 3) =  6
+
+        subroutine bar(a,m,n)
+          intent(c)
+          integer dimension(m,n) :: a
+          integer optional,check(shape(a,0)==m),depend(a) :: m=shape(a,0)
+          integer optional,check(shape(a,1)==n),depend(a) :: n=shape(a,1)
+        end subroutine bar
+
+! >>> print run.bar.__doc__
+! bar - Function signature:
+!   bar(a,[m,n])
+! Required arguments:
+!   a :  rank-2 array('i') with bounds (m,n)
+! Optional arguments:
+!   m := shape(a,0)  int
+!   n := shape(a,1)  int
+
+! >>> run.bar(a)
+! C:m=2, n=3
+! Row 1:
+! a(i=0,j=0)=1
+! a(i=0,j=1)=2
+! a(i=0,j=2)=3
+! Row 2:
+! a(i=1,j=0)=4
+! a(i=1,j=1)=5
+! a(i=1,j=2)=6
+
+
+    end interface 
+end python module run
+
+! This file was auto-generated with f2py (version:2.8.172).
+! See http://cens.ioc.ee/projects/f2py2e/
diff --git a/numpy/f2py/doc/multiarray/transpose.txt b/numpy/f2py/doc/multiarray/transpose.txt
new file mode 100644
index 000000000..a8d41e6df
--- /dev/null
+++ b/numpy/f2py/doc/multiarray/transpose.txt
@@ -0,0 +1,1127 @@
+From: Phil Garner (garner@signal.dra.hmg.gb)
+ Subject: In place matrix transpose 
+ Newsgroups: sci.math.num-analysis
+ Date: 1993-08-05 06:35:06 PST 
+
+
+Someone was talking about matrix transposes earlier on.  It's a
+curious subject.  I found that an in-place transpose is about 12 times
+slower than the trivial copying method.
+
+Here's somthing I nicked from netlib and translated into C to do the
+in-place one for those that are interested: (matrix must be in one
+block)
+
+
+typedef float scalar; /* float -> double for double precision */
+
+/*
+ * In Place Matrix Transpose
+ * From: Algorithm 380 collected algorithms from ACM.
+ * Converted to C by Phil Garner
+ *
+ * Algorithm appeared in comm. ACM, vol. 13, no. 05,
+ * p. 324.
+ */
+int trans(scalar *a, unsigned m, unsigned n, int *move, int iwrk)
+{
+  scalar b;
+  int i, j, k, i1, i2, ia, ib, ncount, kmi, Max, mn;
+
+  /*
+   * a is a one-dimensional array of length mn=m*n, which
+   * contains the m by n matrix to be transposed.
+   * move is a one-dimensional array of length iwrk
+   * used to store information to speed up the process. the
+   * value iwrk=(m+n)/2 is recommended. Return val indicates the
+   * success or failure of the routine.
+   * normal return = 0
+   * errors
+   * -2, iwrk negative or zero.
+   * ret > 0, (should never occur). in this case
+   * we set ret equal to the final value of i when the search
+   * is completed but some loops have not been moved.
+   * check arguments and initialise
+   */
+
+  /* Function Body */
+  if (n < 2 || m < 2)
+    return 0;
+  if (iwrk < 1)
+    return -2;
+
+  /* If matrix is square, exchange elements a(i,j) and a(j,i). */
+  if (n == m)
+    {
+      for (i = 0; i < m - 1; ++i)
+        for (j = i + 1; j < m; ++j)
+          {
+            i1 = i + j * m;
+            i2 = j + i * m;
+            b = a[i1];
+            a[i1] = a[i2];
+            a[i2] = b;
+          }   return 0;
+    }
+
+  /* Non square matrix */
+  ncount = 2;
+  for (i = 0; i < iwrk; ++i)
+    move[i] = 0;
+
+  if (n > 2)
+    /* Count number,ncount, of single points. */
+    for (ia = 1; ia < n - 1; ++ia)
+      {
+        ib = ia * (m - 1) / (n - 1);
+        if (ia * (m - 1) != ib * (n - 1))
+          continue;
+        ++ncount;
+        i = ia * m + ib;
+        if (i > iwrk)
+          continue;
+        move[i] = 1;
+      }
+
+  /* Set initial values for search. */
+  mn = m * n;
+  k = mn - 1;
+  kmi = k - 1;
+  Max = mn;
+  i = 1;
+
+  while (1)
+    {
+      /* Rearrange elements of a loop. */
+      /* At least one loop must be re-arranged. */
+      i1 = i;
+      while (1)
+        {
+          b = a[i1];
+          while (1)
+            {
+              i2 = n * i1 - k * (i1 / m);
+              if (i1 <= iwrk)
+                move[i1 - 1] = 2;
+              ++ncount;
+              if (i2 == i || i2 >= kmi)
+                {
+                  if (Max == kmi || i2 == i)
+                    break;
+                  Max = kmi;
+                }
+              a[i1] = a[i2];
+              i1 = i2;
+            }
+
+          /* Test for symmetric pair of loops. */
+          a[i1] = b;
+          if (ncount >= mn)
+            return 0;
+          if (i2 == Max || Max == kmi)
+            break;
+          Max = kmi;
+          i1 = Max;
+        }
+
+      /* Search for loops to be rearranged. */
+      while (1)
+        {
+          Max = k - i;
+          ++i;
+          kmi = k - i;
+          if (i > Max)
+            return i;
+          if (i <= iwrk)
+            {
+              if (move[i-1] < 1)
+                break;
+              continue;
+            }
+          if (i == n * i - k * (i / m))
+            continue;
+          i1 = i;
+          while (1)
+            {
+              i2 = n * i1 - k * (i1 / m);
+              if (i2 <= i || i2 >= Max)
+                break;
+              i1 = i2;
+            }
+          if (i2 == i)
+            break;
+        }
+    } /* End never reached */
+}
+
+--
+                       ,-----------------------------       ______
+ ____                  | Phil Garner.            \___|     |/   \ \   ____
+/__/ `--,   _L__L\_    | garner@signal.dra.hmg.gb    |    _|`---'  \_/__/ `--,
+`-0---0-'  `-0--0-'    `--OO-------------------O-----'     `---0---' `-0---0-'
+
+ From: Murray Dow (mld900@anusf.anu.edu.au)
+ Subject: Re: In place matrix transpose 
+ Newsgroups: sci.math.num-analysis
+ Date: 1993-08-09 19:45:57 PST 
+
+
+In article <23qmp3INN3gl@mentor.dra.hmg.gb>, garner@signal.dra.hmg.gb (Phil Garner) writes:
+|> Someone was talking about matrix transposes earlier on.  It's a
+|> curious subject.  I found that an in-place transpose is about 12 times
+|> slower than the trivial copying method.
+|> 
+
+Algorithm 380 from CACM is sloweer than ALG 467. Here are my times
+from a VP2200 vector computer. Note that the CACM algorithms are scalar.
+Times are in seconds, for a 900*904 matrix:
+
+380   NAG   467   disc copy
+1.03  1.14  .391  .177
+
+Compare two vector algortihms, one I wrote and the second a matrix
+copy:
+
+My Alg    Matrix copy
+.0095     .0097
+
+Conclusions: dont use Alg 380 from Netlib. If you have the available memory,
+do a matrix copy. If you don't have the memory, I will send you my algorithm
+when I have published it.
+-- 
+Murray Dow                         GPO Box 4 Canberra ACT 2601 Australia
+Supercomputer Facility             Phone: +61 6 2495028
+Australian National University     Fax:   +61 6 2473425
+mld900@anusf.anu.edu.au
+
+=============================================================================
+
+From: Mark Smotherman (mark@hubcap.clemson.edu)
+ Subject: Matrix transpose benchmark [was Re: MIPS R8000 == TFP?] 
+ Newsgroups: comp.arch, comp.benchmarks, comp.sys.super
+ Date: 1994-07-01 06:35:51 PST 
+
+
+mccalpin@perelandra.cms.udel.edu (John D. McCalpin) writes:
+
+> 
+>Of course, these results are all for the naive algorithm.  I would be 
+>interested to see what an efficient blocked algorithm looks like.  
+>Anyone care to offer one?  There is clearly a lot of performance
+>to be gained by the effort....
+
+Here is a matrix transpose benchmark generator.  Enter something like
+
+        10d10eij;
+
+and you get a benchmark program with tiles of size 10 for the i and j
+inner loops.  Please email code improvements and flames.
+
+Enjoy!
+
+
+/*---------------------------------------------------------------------------
+
+        Matrix Transpose Generator
+
+        Copyright 1993, Dept. of Computer Science, Clemson University
+ 
+        Permission to use, copy, modify, and distribute this software and
+        its documentation for any purpose and without fee is hereby granted,
+        provided that the above copyright notice appears in all copies.
+ 
+        Clemson University and its Dept. of Computer Science make no
+        representations about the suitability of this software for any
+        purpose.  It is provided "as is" without express or implied warranty.
+ 
+        Original author: Mark Smotherman
+
+  -------------------------------------------------------------------------*/
+
+
+/* tpgen.c version 1.0
+ *
+ * generate a matrix transpose loop nest, with tiling and unrolling
+ * (timing code using getrusage is included in the generated program)
+ *
+ * mark smotherman
+ * mark@cs.clemson.edu
+ * clemson university
+ * 9 july 1993
+ *
+ * a loop nest can be described by the order of its loop indices, so
+ * this program takes as input a simple language describing these indices:
+ *  <number>d  ==> generate tiling loop for index i with step size of <number>
+ *  <number>e  ==> generate tiling loop for index j with step size of <number>
+ *  <number>i  ==> generate loop for index i with unrolling factor of <number>
+ *  <number>j  ==> generate loop for index j with unrolling factor of <number>
+ *  ;          ==> input terminator (required)
+ * rules are:
+ *  i,j tokens must appear
+ *  if d appears, it must appear before i
+ *  if e appears, it must appear before j
+ *  ; must appear
+ * matrix size is controlled by #define N in this program.
+ *
+ * this code was adapted from mmgen.c v1.2 and extended to generate pre-
+ * condition loops for unrolling factors that do not evenly divide the
+ * matrix size (or the tiling step size for loop nests with a tiling loop).
+ * note that this program only provides a preconditioning loop for the
+ * innermost loop.  unrolling factors for non-innermost loops that do not
+ * evenly divide the matrix size (or step size) are not supported.
+ *
+ * my interest in this program generator is to hook it to a sentence
+ * generator and a minimum execution time finder, that is
+ *   while((sentence=sgen())!=NULL){
+ *     genprogram=tpgen(sentence);
+ *     system("cc -O4 genprogram.c");
+ *     system("a.out >> tpresults");
+ *   }
+ *   findmintime(tpresults);
+ * this will find the optimum algorithm for the host system via an
+ * exhaustive search.
+ *
+ * please report bugs and suggestions for enhancements to me.
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#define N 500
+
+#define ALLOC1 temp1=(struct line *)malloc(sizeof(struct line));\
+temp1->indentcnt=indentcnt;
+
+#define LINK1 temp1->next=insertbefore;\
+insertafter->next=temp1;\
+insertafter=temp1;
+
+#define INSERT1 temp1->next=start;\
+start=temp1;
+
+#define ALLOC2 temp1=(struct line *)malloc(sizeof(struct line));\
+temp2=(struct line *)malloc(sizeof(struct line));\
+temp1->indentcnt=indentcnt;\
+temp2->indentcnt=indentcnt++;
+
+#define LINK2 temp1->next=temp2;\
+temp2->next=insertbefore;\
+insertafter->next=temp1;\
+insertafter=temp1;\
+insertbefore=temp2;
+
+struct line{ int indentcnt; char line[256]; struct line *next; };
+
+int indentcnt;
+int iflag,jflag;
+int ijflag,jiflag;
+int dflag,eflag;
+int counter;
+int iistep,jjstep;
+int iunroll,junroll;
+int precond;
+
+char c;
+int i,ttp,nt;
+char *p0;
+char tptype[80];
+char number[10];
+
+struct line *start,*head,*insertafter,*insertbefore,*temp1,*temp2;
+
+void processloop();
+void processstmt();
+
+main(){
+
+  indentcnt=0;
+  iflag=jflag=0;
+  ijflag=jiflag=0;
+  dflag=eflag=0;
+  iunroll=junroll=0;
+  counter=1;
+  precond=0;
+  ttp=0;
+
+  start=NULL;
+  ALLOC2
+  sprintf(temp1->line,"/* begin */\nt_start=second();\n");
+  sprintf(temp2->line,"/* end */\nt_end = second();\n");
+  head=temp1; temp1->next=temp2; temp2->next=NULL;
+  insertafter=temp1; insertbefore=temp2;
+
+  while((c=getchar())!=';'){
+    tptype[ttp++]=c;
+    if(isdigit(c)){
+      nt=0;
+      while(isdigit(c)){
+        number[nt++]=c;
+        c=getchar();
+        if(c==';'){ fprintf(stderr,"unexpected ;!\n"); exit(1); }
+        tptype[ttp++]=c;
+      }
+      number[nt]='\0';
+      sscanf(number,"%d",&counter);
+    }
+    switch(c){
+      case 'd':
+        if(iflag){ fprintf(stderr,"d cannot appear after i!\n"); exit(1); }
+        dflag++;
+        ALLOC1
+        sprintf(temp1->line,"#define IISTEP %d\n",counter);
+        INSERT1
+        iistep=counter;
+        counter=1;
+        ALLOC2
+        sprintf(temp1->line,"for(ii=0;ii<%d;ii+=IISTEP){\n",N);
+        sprintf(temp2->line,"}\n",N);
+        LINK2
+        ALLOC1
+        sprintf(temp1->line,"it=min(ii+IISTEP,%d);\n",N);
+        LINK1
+        break;
+      case 'e':
+        if(jflag){ fprintf(stderr,"e cannot appear after j!\n"); exit(1); }
+        eflag++;
+        ALLOC1
+        sprintf(temp1->line,"#define JJSTEP %d\n",counter);
+        INSERT1
+        jjstep=counter;
+        counter=1;
+        ALLOC2
+        sprintf(temp1->line,"for(jj=0;jj<%d;jj+=JJSTEP){\n",N);
+        sprintf(temp2->line,"}\n",N);
+        LINK2
+        ALLOC1
+        sprintf(temp1->line,"jt=min(jj+JJSTEP,%d);\n",N);
+        LINK1
+        break;
+      case 'i':
+        iunroll=counter;
+        counter=1;
+        iflag++; if(jflag) jiflag++;
+        if(dflag) precond=iistep%iunroll; else precond=N%iunroll;
+        if(precond&&(jiflag==0)){
+          fprintf(stderr,"unrolling factor for outer loop i\n");
+          fprintf(stderr,"  does not evenly divide matrix/step size!\n");
+          exit(1);
+        }
+        if(dflag&&(iunroll>1)&&(N%iistep)){
+          fprintf(stderr,"with unrolling of i, step size for tiled loop ii\n");
+          fprintf(stderr,"  does not evenly divide matrix size!\n");
+          exit(1);
+        }
+        processloop('i',dflag,iunroll,precond,junroll);
+        break;
+      case 'j':
+        junroll=counter;
+        counter=1;
+        jflag++; if(iflag) ijflag++;
+        if(eflag) precond=jjstep%junroll; else precond=N%junroll;
+        if(precond&&(ijflag==0)){
+          fprintf(stderr,"unrolling factor for outer loop j\n");
+          fprintf(stderr,"  does not evenly divide matrix/step size!\n");
+          exit(1);
+        }
+        if(eflag&&(junroll>1)&&(N%jjstep)){
+          fprintf(stderr,"with unrolling of j, step size for tiled loop jj\n");
+          fprintf(stderr,"  does not evenly divide matrix size!\n");
+          exit(1);
+        }
+        processloop('j',eflag,junroll,precond,iunroll);
+        break;
+      default: break;
+    }
+  }
+  processstmt();
+
+  tptype[ttp++]=c;
+
+  if((iflag==0)||(jflag==0)){
+    fprintf(stderr,
+      "one of the loops (i,j) was not specified!\n");
+    exit(1);
+  }
+
+  temp1=start;
+  while(temp1!=NULL){
+    printf("%s",temp1->line);
+    temp1=temp1->next;
+  }
+  printf("#include <stdio.h>\n");
+  printf("#include <sys/time.h>\n");
+  printf("#include <sys/resource.h>\n");
+  if(dflag|eflag) printf("#define min(a,b) ((a)<=(b)?(a):(b))\n");
+  printf("double second();\n");
+  printf("double t_start,t_end,t_total;\n");
+  printf("int times;\n");
+  printf("\ndouble b[%d][%d],dummy[10000],bt[%d][%d];\n\nmain(){\n"
+    ,N,N,N,N);
+  if(precond) printf("  int i,j,n;\n"); else printf("  int i,j;\n");
+  if(dflag) printf("  int ii,it;\n");
+  if(eflag) printf("  int jj,jt;\n");
+  printf("/* set coefficients so that result matrix should have \n");
+  printf(" * column entries equal to column index\n");
+  printf(" */\n");
+  printf("  for (i=0;i<%d;i++){\n",N);
+  printf("    for (j=0;j<%d;j++){\n",N);
+  printf("      b[i][j] = (double) i;\n");
+  printf("    }\n");
+  printf("  }\n");
+  printf("\n  t_total=0.0;\n  for(times=0;times<10;times++){\n\n",N);
+  printf("/* try to flush cache */\n");
+  printf("  for(i=0;i<10000;i++){\n",N);
+  printf("    dummy[i] = 0.0;\n");
+  printf("  }\n");
+  printf("%s",head->line);
+  temp1=head->next;
+  while(temp1!=NULL){
+    for(i=0;i<temp1->indentcnt;i++) printf("  ");
+    while((p0=strstr(temp1->line,"+0"))!=NULL){
+      *p0++=' '; *p0=' ';
+    }
+    printf("%s",temp1->line);
+    temp1=temp1->next;
+  }
+  printf("\n  t_total+=t_end-t_start;\n  }\n");
+  printf("/* check result */\n");
+  printf("  for (j=0;j<%d;j++){\n",N);
+  printf("    for (i=0;i<%d;i++){\n",N);
+  printf("      if (bt[i][j]!=((double)j)){\n");
+  printf("        fprintf(stderr,\"error in bt[%cd][%cd]",'%','%');
+  printf("\\n\",i,j);\n");
+  printf("        fprintf(stderr,\" for %s\\n\");\n",tptype);
+  printf("        exit(1);\n");
+  printf("      }\n");
+  printf("    }\n");
+  printf("  }\n");
+  tptype[ttp]='\0';
+  printf("  printf(\"%c10.2f secs\",t_total);\n",'%');
+  printf("  printf(\" for 10 runs of %s\\n\");\n",tptype);
+  printf("}\n");
+  printf("double second(){\n");
+  printf("  void getrusage();\n");
+  printf("  struct rusage ru;\n");
+  printf("  double t;\n");
+  printf("  getrusage(RUSAGE_SELF,&ru);\n");
+  printf("  t = ((double)ru.ru_utime.tv_sec) +\n");
+  printf("    ((double)ru.ru_utime.tv_usec)/1.0e6;\n");
+  printf("  return t;\n");
+  printf("}\n");
+
+}
+
+void processloop(index,flag,unroll,precond,unroll2)
+char index;
+int flag,unroll,precond,unroll2;
+{
+  char build[80],temp[40];
+  int n;
+  if(precond){
+    ALLOC1
+    sprintf(temp1->line,"/* preconditioning loop for unrolling factor */\n");
+    LINK1
+    if(unroll2==1){
+      build[0]='\0';
+      if(flag){
+        if(index='i')
+          sprintf(temp,"n=IISTEP%c%d; ",'%',unroll);
+        else
+          sprintf(temp,"n=JJSTEP%c%d; ",'%',unroll);
+        strcat(build,temp);
+        sprintf(temp,"for(%c=%c%c;%c<%c%c+n;%c++) ",index,index,index,
+          index,index,index,index);
+        strcat(build,temp);
+      }else{
+        sprintf(temp,"n=%d%c%d; ",N,'%',unroll);
+        strcat(build,temp);
+        sprintf(temp,"for(%c=0;%c<n;%c++) ",index,index,index);
+        strcat(build,temp);
+      }
+      sprintf(temp,"bt[i][j]=b[j][i];\n");
+      strcat(build,temp);
+      ALLOC1
+      sprintf(temp1->line,"%s\n",build);
+      LINK1
+    }else{
+      if(flag){
+        ALLOC1
+        if(index=='i')
+          sprintf(temp1->line,"n=IISTEP%c%d;\n",'%',unroll);
+        else
+          sprintf(temp1->line,"n=JJSTEP%c%d;\n",'%',unroll);
+        LINK1
+        ALLOC1
+        sprintf(temp1->line,"for(%c=%c%c;%c<%c%c+n;%c++){\n",index,index,index,
+          index,index,index,index);
+        LINK1
+      }else{
+        ALLOC1
+        sprintf(temp1->line,"n=%d%c%d;\n",N,'%',unroll);
+        LINK1
+        ALLOC1
+        sprintf(temp1->line,"for(%c=0;%c<n;%c++){\n",index,index,index);
+        LINK1
+      }
+      if(index=='i'){
+        for(n=0;n<unroll2;n++){
+          ALLOC1
+          sprintf(temp1->line,"  bt[i][j+%d]=b[j+%d][i];\n",n,n);
+          LINK1
+        }
+      }else{
+        for(n=0;n<unroll2;n++){
+          ALLOC1
+          sprintf(temp1->line,"  bt[i+%d][j]=b[j][i+%d];\n",n,n);
+          LINK1
+        }
+      }
+      ALLOC1
+      sprintf(temp1->line,"}\n");
+      LINK1
+    }
+    ALLOC2
+    if(flag){
+      sprintf(temp1->line,"for(%c=%c%c+n;%c<%ct;%c+=%d){\n",index,index,index,
+        index,index,index,unroll);
+    }else{
+      sprintf(temp1->line,"for(%c=n;%c<%d;%c+=%d){\n",index,index,N,index,
+        unroll);
+    }
+    sprintf(temp2->line,"}\n",N);
+    LINK2
+  }else{
+    ALLOC2
+    if(unroll==1){
+      if(flag){
+        sprintf(temp1->line,"for(%c=%c%c;%c<%ct;%c++){\n",index,index,index,
+          index,index,index);
+      }else{
+        sprintf(temp1->line,"for(%c=0;%c<%d;%c++){\n",index,index,N,index);
+      }
+    }else{
+      if(flag){
+        sprintf(temp1->line,"for(%c=%c%c;%c<%ct;%c+=%d){\n",index,index,index,
+          index,index,index,unroll);
+      }else{
+        sprintf(temp1->line,"for(%c=0;%c<%d;%c+=%d){\n",index,index,N,index,
+          unroll);
+      }
+    }
+    sprintf(temp2->line,"}\n",N);
+    LINK2
+  }
+}
+
+void processstmt()
+{
+  int i,j;
+  for(i=0;i<iunroll;i++){
+    for(j=0;j<junroll;j++){
+      ALLOC1
+      sprintf(temp1->line,"bt[i+%d][j+%d]=b[j+%d][i+%d];\n",i,j,j,i);
+      LINK1
+    }
+  }
+}
+-- 
+Mark Smotherman, Computer Science Dept., Clemson University, Clemson, SC
+
+=======================================================================
+From: has (h.genceli@bre.com)
+ Subject: transpose of a nxm matrix stored in a vector !!! 
+ Newsgroups: sci.math.num-analysis
+ Date: 2000/07/25 
+
+
+If I have a matrix nrows x ncols, I can store it in a vector.
+so A(i,j) is really a[i*ncols+j]. So really TRANS of A
+(say B) is really is also a vector B where
+
+0<=i b[j*nrows+i] <nrows, 0<=j<ncols
+b[j*nrows+i] = a[i*ncols+j].
+
+Fine but I want to use only one array a to do this transformation.
+
+i.e a[j*nrows+i] = a[i*ncols+j]. this will itself
+erase some elements so each time a swap is necessary in a loop.
+
+temp = a[j*nrows+i]
+a[j*nrows+i] = a[i*ncols+j]
+a[i*ncols+j] = temp
+
+but still this will lose some info as it is, so indexing
+should have more intelligence in it ???? anybody
+can give me a lead here, thanks.
+
+Has
+
+ From: wei-choon ng (wng@ux8.cso.uiuc.edu)
+ Subject: Re: transpose of a nxm matrix stored in a vector !!! 
+ Newsgroups: sci.math.num-analysis
+ Date: 2000/07/25 
+
+
+has <h.genceli@bre.com> wrote:
+> If I have a matrix nrows x ncols, I can store it in a vector.
+> so A(i,j) is really a[i*ncols+j]. So really TRANS of A
+> (say B) is really is also a vector B where
+
+[snip]
+
+Hey, if you just want to do a transpose-matrix vector multiply, there is
+no need to explicitly store the transpose matrix in another array and
+doubling the storage!
+
+W.C.
+-- 
+
+ From: Robin Becker (robin@jessikat.fsnet.co.uk)
+ Subject: Re: transpose of a nxm matrix stored in a vector !!! 
+ Newsgroups: sci.math.num-analysis
+ Date: 2000/07/25 
+
+
+In article <snr532fo3j1180@corp.supernews.com>, has <h.genceli@bre.com>
+writes
+>If I have a matrix nrows x ncols, I can store it in a vector.
+>so A(i,j) is really a[i*ncols+j]. So really TRANS of A
+>(say B) is really is also a vector B where
+>
+>0<=i b[j*nrows+i] <nrows, 0<=j<ncols
+>b[j*nrows+i] = a[i*ncols+j].
+>
+>Fine but I want to use only one array a to do this transformation.
+>
+>i.e a[j*nrows+i] = a[i*ncols+j]. this will itself
+>erase some elements so each time a swap is necessary in a loop.
+>
+>temp = a[j*nrows+i]
+>a[j*nrows+i] = a[i*ncols+j]
+>a[i*ncols+j] = temp
+>
+>but still this will lose some info as it is, so indexing
+>should have more intelligence in it ???? anybody
+>can give me a lead here, thanks.
+>
+>Has
+>
+>
+>
+
+void dmx_transpose(unsigned n, unsigned m, double* a, double* b)
+{
+        unsigned        size = m*n;
+        if(b!=a){
+                real    *bmn, *aij, *anm;
+                bmn = b + size; /*b+n*m*/
+                anm = a + size;
+                while(b<bmn) for(aij=a++;aij<anm; aij+=n ) *b++ = *aij;
+                }
+        else if(size>3){
+                unsigned i,row,column,current;
+                for(i=1, size -= 2;i<size;i++){
+                        current = i;
+                        do      {
+                                /*current = row+n*column*/
+                                column = current/m;
+                                row = current%m;
+                                current = n*row +  column;
+                                } while(current < i);
+
+                        if (current >i) {
+                                real temp = a[i];
+                                a[i] = a[current];
+                                a[current] = temp;
+                                }
+                        }
+                }
+}
+-- 
+Robin Becker
+
+ From: E. Robert Tisdale (edwin@netwood.net)
+ Subject: Re: transpose of a nxm matrix stored in a vector !!! 
+ Newsgroups: sci.math.num-analysis
+ Date: 2000/07/25 
+
+
+Take a look at
+The C++ Scalar, Vector, Matrix and Tensor class library
+
+    http://www.netwood.net/~edwin/svmt/
+
+<Type><System>SubVector&
+        <Type><System>SubVector::transpose(Extent p, Extent q) {
+  <Type><System>SubVector&
+                v = *this;
+  if (1 < p && 1 < q) {
+    // A vector v of extent n = qp is viewed as a q by p matrix U and
+    // a p by q matrix V where U_{ij} = v_{p*i+j} and V_{ij} = v_{q*i+j}.
+    // The vector v is modified in-place so that V is the transpose of U.
+    // The algorithm searches for every sequence k_s of S indices
+    // such that a circular shift of elements v_{k_s} <-- v_{k_{s+1}}
+    // and v_{k_{S-1}} <-- v_{k_0} effects an in-place transpose.
+    Extent      n = q*p;
+    Extent      m = 0;                  // count up to n-2
+    Offset      l = 0;                  // 1 <= l <= n-2
+    while (++l < n-1 && m < n-2) {
+      Offset    k = l;
+      Offset    j = k;
+      while (l < (k = (j%p)*q + j/p)) { // Search backward for k < l.
+        j = k;
+        }
+      // If a sequence of indices beginning with l has any index k < l,
+      // it has already been transposed.  The sequence length S = 1
+      // and diagonal element v_k is its own transpose if k = j.
+      // Skip every index sequence that has already been transposed.
+      if (k == l) {                     // a new sequence
+        if (k < j) {                    // with 1 < S
+          TYPE  x = v[k];               // save v_{k_0}
+          do {
+            v[k] = v[j];                // v_{k_{s}} <-- v_{k_{s+1}}
+            k = j;
+            ++m;
+            } while (l < (j = (k%q)*p + k/q));
+          v[k] = x;                     // v_{k_{S-1}} <-- v_{k_0}
+          }
+        ++m;
+        }
+      }
+    } return v;
+  }
+
+
+
+<Type><System>SubVector&
+
+Read the rest of this message... (50 more lines)
+
+ From: Victor Eijkhout (eijkhout@disco.cs.utk.edu)
+ Subject: Re: transpose of a nxm matrix stored in a vector !!! 
+ Newsgroups: sci.math.num-analysis
+ Date: 2000/07/25 
+
+
+"Alan Miller" <amiller @ vic.bigpond.net.au> writes:
+
+> The attached routine does an in situ transpose. 
+> begin 666 Dtip.f90
+> M4U5"4D]55$E.12!D=&EP("AA+"!N,2P@;C(L(&YD:6TI#0HA("TM+2TM+2TM
+
+Hm. F90? You're not silently allocating a temporary I hope?
+
+(Why did you have to encode this? Now I have to save, this decode, ...
+and all for plain ascii?)
+
+-- 
+Victor Eijkhout
+"When I was coming up, [..] we knew exactly who the they were. It was us
+versus them, and it was clear who the them was were. Today, we are not 
+so sure who the they are, but we know they're there." [G.W. Bush]
+
+ From: Alan Miller (amiller_@_vic.bigpond.net.au)
+ Subject: Re: transpose of a nxm matrix stored in a vector !!! 
+ Newsgroups: sci.math.num-analysis
+ Date: 2000/07/25 
+
+
+Victor Eijkhout wrote in message ...
+>"Alan Miller" <amiller @ vic.bigpond.net.au> writes:
+>
+>> The attached routine does an in situ transpose. 
+>> begin 666 Dtip.f90
+>> M4U5"4D]55$E.12!D=&EP("AA+"!N,2P@;C(L(&YD:6TI#0HA("TM+2TM+2TM
+>
+>Hm. F90? You're not silently allocating a temporary I hope?
+>
+>(Why did you have to encode this? Now I have to save, this decode, ...
+>and all for plain ascii?)
+>
+
+I know the problem.
+I sometimes use a Unix system, and have to use decode64 to read
+attachments.   On the other hand, Windows wraps lines around,
+formats then and generally makes the code unreadable.
+
+The straight code for dtip (double transpose in place) is attached
+this time.
+
+>--
+>Victor Eijkhout
+
+
+--
+Alan Miller, Retired Scientist (Statistician)
+CSIRO Mathematical & Information Sciences
+Alan.Miller -at- vic.cmis.csiro.au
+http://www.ozemail.com.au/~milleraj
+http://users.bigpond.net.au/amiller/
+
+
+=================================================================
+
+From: Darran Edmundson (dedmunds@sfu.ca)
+ Subject: array reordering algorithm? 
+ Newsgroups: sci.math.num-analysis
+ Date: 1995/04/30 
+
+
+A code I've written refers to a complex array as two separate real arrays.  
+However, I have a canned subroutine which expects a single array where the 
+real and imaginary values alternate.  Essentially I have a case of mismatched
+data structures, yet for reasons that I'd rather not go into, I'm stuck with them.
+
+Assuming that the two real arrays A and B are sequential in memory, and
+that the single array of alternating real/imaginary values C shares the same
+space, what I need is a porting subroutine that remaps the data from one format
+to the other - using as little space as possible.
+
+I think of the problem as follows.  Imagine an array of dimension 10 containing
+the values 1,3,5,7,9,2,4,6,8,10 in this order.  
+
+ A(1) /  1   \  C(1)
+ A(2) |  3   |  C(2)
+ A(3) |  5   |  C(3)
+ A(4) |  7   |  C(4)
+ A(5) \  9   |  C(5)
+             |
+ B(1) /  2   |  C(6)
+ B(2) |  4   |  C(7)
+ B(3) |  6   |  C(8)
+ B(4) |  8   |  C(9)
+ B(5) \ 10   /  C(10)
+
+Given that I know this initial pattern, I want to sort the array C in-place *without 
+making comparisons*.  That is, the algorithm can only depend on the initial 
+knowledge of the pattern.  Do you see what a sort is going to do?  It will
+make the A and B arrays alternate, i.e. C(1)=A(1), C(2)=B(1), C(3)=A(2),
+C(4)=B(2), etc.  It's not a real sort though because I can't actually refer to the 
+values above (i.e. no comparisons) because A and B will be holding real data,
+not this contrived pattern.  The pattern above exists though - it's the
+natural ordering in memory of A and B.
+
+Either pair swapping only or a small amount of workspace can be used.  The 
+in-place is important - imagine scaling this problem up to an 
+array of 32 or 64 million double precision values and you can easily see how 
+duplicating the array is not a feasible solution.
+
+Any ideas?  I've been stumped on this for a day and a half now.
+
+Darran Edmundson
+dedmunds@sfu.ca
+
+ From: Roger Critchlow (rec@elf115.elf.org)
+ Subject: Re: array reordering algorithm? 
+ Newsgroups: sci.math.num-analysis
+ Date: 1995/04/30 
+
+
+   Any ideas?  I've been stumped on this for a day and a half now.
+
+Here's some code for in situ permutations of arrays that I wrote
+a few years ago.  It all started from the in situ transposition
+algorithms in the Collected Algorithms of the ACM, the references
+for which always get lost during the decryption from fortran.
+
+This is the minimum space algorithm.  All you need to supply is
+a function which computes the new order array index from the old
+order array index.
+
+If you can spare n*m bits to record the indexes of elements which
+have been permuted, then you can speed things up.
+
+-- rec --
+
+------------------------------------------------------------------------
+/*
+** Arbitrary in situ permutations of an m by n array of base type TYPE.
+** Copyright 1995 by Roger E Critchlow Jr, rec@elf.org, San Francisco, CA.
+** Fair use permitted, caveat emptor.
+*/
+typedef int TYPE;
+
+int transposition(int ij, int m, int n)         /* transposition about diagonal from upper left to lower right */
+{ return ((ij%m)*n+ (ij/m)); }
+
+int countertrans(int ij, int m, int n)          /* transposition about diagonal from upper right to lower left */
+{ return ((m-1-(ij%m))*n+ (n-1-(ij/m))); }
+
+int rotate90cw(int ij, int m, int n)            /* 90 degree clockwise rotation */
+{ return ((m-1-(ij%m))*n+ (ij/m)); }
+
+int rotate90ccw(int ij, int m, int n)           /* 90 degree counter clockwise rotation */
+{ return ((ij%m)*n+ (n-1-(ij/m))); }
+
+int rotate180(int ij, int m, int n)             /* 180 degree rotation */
+{ return ((m-1-(ij/n))*n+ (n-1-(ij%n))); }
+
+int reflecth(int ij, int m, int n)              /* reflection across horizontal plane */
+{ return ((m-1-(ij/n))*n+ (ij%n)); }
+
+int reflectv(int ij, int m, int n)              /* reflection across vertical plane */
+{ return ((ij/n)*n+ (n-1-(ij%n))); }
+
+int in_situ_permutation(TYPE a[], int m, int n, int (*origination)(int ij, int m, int n))
+{
+  int ij, oij, dij, n_to_do;
+  TYPE b;
+  n_to_do = m*n;
+  for (ij = 0; ij < m*n && n_to_do > 0; ij += 1) {
+    /* Test for previously permuted */
+    for (oij = origination(ij,m,n); oij > ij; oij = origination(oij,m,n))
+      ;
+    if (oij < ij)
+      continue;
+    /* Chase the cycle */
+    dij = ij;
+    b = a[ij];
+    for (oij = origination(dij,m,n); oij != ij; oij = origination(dij,m,n)) {
+      a[dij] = a[oij];
+      dij = oij;
+      n_to_do -= 1;
+    }
+    a[dij] = b;
+    n_to_do -= 1;
+  } return 0;
+}
+
+#define TESTING 1
+#if TESTING
+
+/* fill a matrix with sequential numbers, row major ordering */
+void fill_matrix_rows(a, m, n) TYPE *a; int m, n;
+{
+  int i, j;
+  for (i = 0; i < m; i += 1)
+    for (j = 0; j < n; j += 1)
+      a[i*n+j] = i*n+j;
+}
+
+/* fill a matrix with sequential numbers, column major ordering */
+void fill_matrix_cols(a, m, n) TYPE *a; int m, n;
+{
+  int i, j;
+  for (i = 0; i < m; i += 1)
+    for (j = 0; j < n; j += 1)
+      a[i*n+j] = j*m+i;
+}
+
+/* test a matrix for sequential numbers, row major ordering */
+int test_matrix_rows(a, m, n) TYPE *a; int m, n;
+{
+  int i, j, o;
+  for (o = i = 0; i < m; i += 1)
+    for (j = 0; j < n; j += 1)
+      o += a[i*n+j] != i*n+j;
+  return o;
+}
+
+/* test a matrix for sequential numbers, column major ordering */
+int test_matrix_cols(a, m, n) TYPE *a; int m, n;
+{
+  int i, j, o;
+  for (o = i = 0; i < m; i += 1)
+    for (j = 0; j < n; j += 1)
+      o += a[i*n+j] != j*m+i;
+  return o;
+}
+ 
+/* print a matrix */
+void print_matrix(a, m, n) TYPE *a; int m, n;
+{
+  char *format;
+  int i, j;
+  if (m*n < 10) format = "%2d";
+  if (m*n < 100) format = "%3d";
+  if (m*n < 1000) format = "%4d";
+  if (m*n < 10000) format = "%5d";
+  for (i = 0; i < m; i += 1) {
+    for (j = 0; j < n; j += 1)
+      printf(format, a[i*n+j]);
+    printf("\n");
+  }
+}
+
+#if TEST_TRANSPOSE
+#define MAXSIZE 1000
+
+main()
+{
+  int i, j, m, n, o;
+  TYPE a[MAXSIZE];
+  for (m = 1; m < sizeof(a)/sizeof(a[0]); m += 1)
+    for (n = 1; m*n < sizeof(a)/sizeof(a[0]); n += 1) {
+      fill_matrix_rows(a, m, n);                                /* {0 1} {2 3} */
+      if (o = transpose(a, m, n))
+        printf(">> transpose returned %d for a[%d][%d], row major\n", o, m, n);
+      if ((o = test_matrix_cols(a, n, m)) != 0)                 /* {0 2} {1 3} */
+        printf(">> transpose made %d mistakes for a[%d][%d], row major\n", o, m, n);
+      /* column major */
+      fill_matrix_rows(a, m, n);
+      if (o = transpose(a, m, n))
+        printf(">> transpose returned %d for a[%d][%d], column major\n", o, m, n);
+      if ((o = test_matrix_cols(a, n, m)) != 0)
+        printf(">> transpose made %d mistakes for a[%d][%d], column major\n", o, m, n);
+    } return 0;
+}
+#endif                                                          /* TEST_TRANSPOSE */
+
+
+#define TEST_DISPLAY 1
+#if TEST_DISPLAY
+main(argc, argv) int argc; char *argv[];
+{
+  TYPE *a;
+  int m = 5, n = 5;
+  extern void *malloc();
+  if (argc > 1) {
+    m = atoi(argv[1]);
+    if (argc > 2)
+      n = atoi(argv[2]);
+  }
+  a = malloc(m*n*sizeof(TYPE));
+
+  printf("matrix\n");
+  fill_matrix_rows(a, m, n);
+  print_matrix(a, m, n);
+  printf("transposition\n");
+  in_situ_permutation(a, m, n, transposition);
+  print_matrix(a, n, m);
+
+  printf("counter transposition\n");
+  fill_matrix_rows(a, m, n);
+  in_situ_permutation(a, m, n, countertrans);
+  print_matrix(a, n, m);
+
+  printf("rotate 90 degrees clockwise\n");
+  fill_matrix_rows(a, m, n);
+  in_situ_permutation(a, m, n, rotate90cw);
+  print_matrix(a, n, m);
+
+  printf("rotate 90 degrees counterclockwise\n");
+  fill_matrix_rows(a, m, n);
+  in_situ_permutation(a, m, n, rotate90ccw);
+  print_matrix(a, n, m);
+
+  printf("rotate 180 degrees\n");
+  fill_matrix_rows(a, m, n);
+  in_situ_permutation(a, m, n, rotate180);
+  print_matrix(a, m, n);
+
+  printf("reflect across horizontal\n");
+  fill_matrix_rows(a, m, n);
+  in_situ_permutation(a, m, n, reflecth);
+  print_matrix(a, m, n);
+
+  printf("reflect across vertical\n");
+  fill_matrix_rows(a, m, n);
+  in_situ_permutation(a, m, n, reflectv);
+  print_matrix(a, m, n);
+
+  return 0;
+}
+     
+#endif
+#endif
+