1 files changed, 210 insertions, 33 deletions
diff --git a/numpy/f2py/crackfortran.py b/numpy/f2py/crackfortran.py
index 515bdd787..cfd58dfed 100755
--- a/numpy/f2py/crackfortran.py
+++ b/numpy/f2py/crackfortran.py
@@ -84,7 +84,7 @@ Usage:
                        'optional','required', etc)
      K = D['kindselector'] = {['*','kind']} (only if D['typespec'] =
                          'complex' | 'integer' | 'logical' | 'real' )
-     C = D['charselector'] = {['*','len','kind']}
+     C = D['charselector'] = {['*','len','kind','f2py_len']}
                              (only if D['typespec']=='character')
      D['='] --- initialization expression string
      D['typename'] --- name of the type if D['typespec']=='type'
@@ -97,7 +97,7 @@ Usage:
      D['typespec>']*K['*']
      D['typespec'](kind=K['kind'])
      character*C['*']
-     character(len=C['len'],kind=C['kind'])
+     character(len=C['len'],kind=C['kind'], f2py_len=C['f2py_len'])
      (see also fortran type declaration statement formats below)
 
 Fortran 90 type declaration statement format (F77 is subset of F90)
@@ -146,6 +146,11 @@ import re
 import os
 import copy
 import platform
+import codecs
+try:
+    import chardet
+except ImportError:
+    chardet = None
 
 from . import __version__
 
@@ -301,12 +306,38 @@ _has_fix_header = re.compile(r'-\*-\s*fix\s*-\*-', re.I).search
 _free_f90_start = re.compile(r'[^c*]\s*[^\s\d\t]', re.I).match
 
 
+def openhook(filename, mode):
+    """Ensures that filename is opened with correct encoding parameter.
+
+    This function uses chardet package, when available, for
+    determining the encoding of the file to be opened. When chardet is
+    not available, the function detects only UTF encodings, otherwise,
+    ASCII encoding is used as fallback.
+    """
+    bytes = min(32, os.path.getsize(filename))
+    with open(filename, 'rb') as f:
+        raw = f.read(bytes)
+    if raw.startswith(codecs.BOM_UTF8):
+        encoding = 'UTF-8-SIG'
+    elif raw.startswith((codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE)):
+        encoding = 'UTF-32'
+    elif raw.startswith((codecs.BOM_LE, codecs.BOM_BE)):
+        encoding = 'UTF-16'
+    else:
+        if chardet is not None:
+            encoding = chardet.detect(raw)['encoding']
+        else:
+            # hint: install chardet to ensure correct encoding handling
+            encoding = 'ascii'
+    return open(filename, mode, encoding=encoding)
+
+
 def is_free_format(file):
     """Check if file is in free format Fortran."""
     # f90 allows both fixed and free format, assuming fixed unless
     # signs of free format are detected.
     result = 0
-    with open(file, 'r') as f:
+    with openhook(file, 'r') as f:
         line = f.readline()
         n = 15  # the number of non-comment lines to scan for hints
         if _has_f_header(line):
@@ -356,9 +387,16 @@ def readfortrancode(ffile, dowithline=show, istop=1):
     ll, l1 = '', ''
     spacedigits = [' '] + [str(_m) for _m in range(10)]
     filepositiontext = ''
-    fin = fileinput.FileInput(ffile)
+    fin = fileinput.FileInput(ffile, openhook=openhook)
     while True:
-        l = fin.readline()
+        try:
+            l = fin.readline()
+        except UnicodeDecodeError as msg:
+            raise Exception(
+                f'readfortrancode: reading {fin.filename()}#{fin.lineno()}'
+                f' failed with\n{msg}.\nIt is likely that installing chardet'
+                ' package will help f2py determine the input file encoding'
+                ' correctly.')
         if not l:
             break
         if fin.isfirstline():
@@ -1546,7 +1584,9 @@ kindselector = re.compile(
 charselector = re.compile(
     r'\s*(\((?P<lenkind>.*)\)|\*\s*(?P<charlen>.*))\s*\Z', re.I)
 lenkindpattern = re.compile(
-    r'\s*(kind\s*=\s*(?P<kind>.*?)\s*(@,@\s*len\s*=\s*(?P<len>.*)|)|(len\s*=\s*|)(?P<len2>.*?)\s*(@,@\s*(kind\s*=\s*|)(?P<kind2>.*)|))\s*\Z', re.I)
+    r'\s*(kind\s*=\s*(?P<kind>.*?)\s*(@,@\s*len\s*=\s*(?P<len>.*)|)'
+    r'|(len\s*=\s*|)(?P<len2>.*?)\s*(@,@\s*(kind\s*=\s*|)(?P<kind2>.*)'
+    r'|(f2py_len\s*=\s*(?P<f2py_len>.*))|))\s*\Z', re.I)
 lenarraypattern = re.compile(
     r'\s*(@\(@\s*(?!/)\s*(?P<array>.*?)\s*@\)@\s*\*\s*(?P<len>.*?)|(\*\s*(?P<len2>.*?)|)\s*(@\(@\s*(?!/)\s*(?P<array2>.*?)\s*@\)@|))\s*(=\s*(?P<init>.*?)|(@\(@|)/\s*(?P<init2>.*?)\s*/(@\)@|)|)\s*\Z', re.I)
 
@@ -1788,6 +1828,9 @@ def cracktypespec(typespec, selector):
                         lenkind[lk] = lenkind[lk + '2']
                     charselect[lk] = lenkind[lk]
                     del lenkind[lk + '2']
+                if lenkind['f2py_len'] is not None:
+                    # used to specify the length of assumed length strings
+                    charselect['f2py_len'] = lenkind['f2py_len']
             del charselect['lenkind']
             for k in list(charselect.keys()):
                 if not charselect[k]:
@@ -1857,6 +1900,7 @@ def setcharselector(decl, sel, force=0):
     if 'charselector' not in decl:
         decl['charselector'] = sel
         return decl
+
     for k in list(sel.keys()):
         if force or k not in decl['charselector']:
             decl['charselector'][k] = sel[k]
@@ -2465,6 +2509,7 @@ def _eval_scalar(value, params):
     if _is_kind_number(value):
         value = value.split('_')[0]
     try:
+        # TODO: use symbolic from PR #19805
         value = eval(value, {}, params)
         value = (repr if isinstance(value, str) else str)(value)
     except (NameError, SyntaxError, TypeError):
@@ -2534,7 +2579,6 @@ def analyzevars(block):
                 elif n in block['args']:
                     outmess('analyzevars: typespec of variable %s is not defined in routine %s.\n' % (
                         repr(n), block['name']))
-
         if 'charselector' in vars[n]:
             if 'len' in vars[n]['charselector']:
                 l = vars[n]['charselector']['len']
@@ -2667,26 +2711,6 @@ def analyzevars(block):
                         dimension_exprs[d] = solver_and_deps
                     vars[n]['dimension'].append(d)
 
-        if 'dimension' in vars[n]:
-            if isstringarray(vars[n]):
-                if 'charselector' in vars[n]:
-                    d = vars[n]['charselector']
-                    if '*' in d:
-                        d = d['*']
-                        errmess('analyzevars: character array "character*%s %s(%s)" is considered as "character %s(%s)"; "intent(c)" is forced.\n'
-                                % (d, n,
-                                   ','.join(vars[n]['dimension']),
-                                   n, ','.join(vars[n]['dimension'] + [d])))
-                        vars[n]['dimension'].append(d)
-                        del vars[n]['charselector']
-                        if 'intent' not in vars[n]:
-                            vars[n]['intent'] = []
-                        if 'c' not in vars[n]['intent']:
-                            vars[n]['intent'].append('c')
-                    else:
-                        errmess(
-                            "analyzevars: charselector=%r unhandled.\n" % (d))
-
         if 'check' not in vars[n] and 'args' in block and n in block['args']:
             # n is an argument that has no checks defined. Here we
             # generate some consistency checks for n, and when n is an
@@ -3220,6 +3244,13 @@ def vars2fortran(block, vars, args, tab='', as_interface=False):
         if 'attrspec' in vars[a]:
             attr = [l for l in vars[a]['attrspec']
                     if l not in ['external']]
+            if as_interface and 'intent(in)' in attr and 'intent(out)' in attr:
+                # In Fortran, intent(in, out) are conflicting while
+                # intent(in, out) can be specified only via
+                # `!f2py intent(out) ..`.
+                # So, for the Fortran interface, we'll drop
+                # intent(out) to resolve the conflict.
+                attr.remove('intent(out)')
             if attr:
                 vardef = '%s, %s' % (vardef, ','.join(attr))
                 c = ','
@@ -3255,14 +3286,23 @@ def vars2fortran(block, vars, args, tab='', as_interface=False):
 ######
 
 
+# We expose post_processing_hooks as global variable so that
+# user-libraries could register their own hooks to f2py.
+post_processing_hooks = []
+
+
 def crackfortran(files):
-    global usermodules
+    global usermodules, post_processing_hooks
 
     outmess('Reading fortran codes...\n', 0)
     readfortrancode(files, crackline)
     outmess('Post-processing...\n', 0)
     usermodules = []
     postlist = postcrack(grouplist[0])
+    outmess('Applying post-processing hooks...\n', 0)
+    for hook in post_processing_hooks:
+        outmess(f'  {hook.__name__}\n', 0)
+        postlist = traverse(postlist, hook)
     outmess('Post-processing (stage 2)...\n', 0)
     postlist = postcrack2(postlist)
     return usermodules + postlist
@@ -3282,6 +3322,142 @@ def crack2fortran(block):
 """ % (f2py_version)
     return header + pyf + footer
 
+
+def _is_visit_pair(obj):
+    return (isinstance(obj, tuple)
+            and len(obj) == 2
+            and isinstance(obj[0], (int, str)))
+
+
+def traverse(obj, visit, parents=[], result=None, *args, **kwargs):
+    '''Traverse f2py data structure with the following visit function:
+
+    def visit(item, parents, result, *args, **kwargs):
+        """
+
+        parents is a list of key-"f2py data structure" pairs from which
+        items are taken from.
+
+        result is a f2py data structure that is filled with the
+        return value of the visit function.
+
+        item is 2-tuple (index, value) if parents[-1][1] is a list
+        item is 2-tuple (key, value) if parents[-1][1] is a dict
+
+        The return value of visit must be None, or of the same kind as
+        item, that is, if parents[-1] is a list, the return value must
+        be 2-tuple (new_index, new_value), or if parents[-1] is a
+        dict, the return value must be 2-tuple (new_key, new_value).
+
+        If new_index or new_value is None, the return value of visit
+        is ignored, that is, it will not be added to the result.
+
+        If the return value is None, the content of obj will be
+        traversed, otherwise not.
+        """
+    '''
+
+    if _is_visit_pair(obj):
+        if obj[0] == 'parent_block':
+            # avoid infinite recursion
+            return obj
+        new_result = visit(obj, parents, result, *args, **kwargs)
+        if new_result is not None:
+            assert _is_visit_pair(new_result)
+            return new_result
+        parent = obj
+        result_key, obj = obj
+    else:
+        parent = (None, obj)
+        result_key = None
+
+    if isinstance(obj, list):
+        new_result = []
+        for index, value in enumerate(obj):
+            new_index, new_item = traverse((index, value), visit,
+                                           parents=parents + [parent],
+                                           result=result, *args, **kwargs)
+            if new_index is not None:
+                new_result.append(new_item)
+    elif isinstance(obj, dict):
+        new_result = dict()
+        for key, value in obj.items():
+            new_key, new_value = traverse((key, value), visit,
+                                          parents=parents + [parent],
+                                          result=result, *args, **kwargs)
+            if new_key is not None:
+                new_result[new_key] = new_value
+    else:
+        new_result = obj
+
+    if result_key is None:
+        return new_result
+    return result_key, new_result
+
+
+def character_backward_compatibility_hook(item, parents, result,
+                                          *args, **kwargs):
+    """Previously, Fortran character was incorrectly treated as
+    character*1. This hook fixes the usage of the corresponding
+    variables in `check`, `dimension`, `=`, and `callstatement`
+    expressions.
+
+    The usage of `char*` in `callprotoargument` expression can be left
+    unchanged because C `character` is C typedef of `char`, although,
+    new implementations should use `character*` in the corresponding
+    expressions.
+
+    See https://github.com/numpy/numpy/pull/19388 for more information.
+
+    """
+    parent_key, parent_value = parents[-1]
+    key, value = item
+
+    def fix_usage(varname, value):
+        value = re.sub(r'[*]\s*\b' + varname + r'\b', varname, value)
+        value = re.sub(r'\b' + varname + r'\b\s*[\[]\s*0\s*[\]]',
+                       varname, value)
+        return value
+
+    if parent_key in ['dimension', 'check']:
+        assert parents[-3][0] == 'vars'
+        vars_dict = parents[-3][1]
+    elif key == '=':
+        assert parents[-2][0] == 'vars'
+        vars_dict = parents[-2][1]
+    else:
+        vars_dict = None
+
+    new_value = None
+    if vars_dict is not None:
+        new_value = value
+        for varname, vd in vars_dict.items():
+            if ischaracter(vd):
+                new_value = fix_usage(varname, new_value)
+    elif key == 'callstatement':
+        vars_dict = parents[-2][1]['vars']
+        new_value = value
+        for varname, vd in vars_dict.items():
+            if ischaracter(vd):
+                # replace all occurrences of `<varname>` with
+                # `&<varname>` in argument passing
+                new_value = re.sub(
+                    r'(?<![&])\b' + varname + r'\b', '&' + varname, new_value)
+
+    if new_value is not None:
+        if new_value != value:
+            # We report the replacements here so that downstream
+            # software could update their source codes
+            # accordingly. However, such updates are recommended only
+            # when BC with numpy 1.21 or older is not required.
+            outmess(f'character_bc_hook[{parent_key}.{key}]:'
+                    f' replaced `{value}` -> `{new_value}`\n', 1)
+        return (key, new_value)
+
+
+post_processing_hooks.append(character_backward_compatibility_hook)
+
+
 if __name__ == "__main__":
     files = []
     funcs = []
@@ -3341,17 +3517,18 @@ if __name__ == "__main__":
             funcs.append(l)
     if not strictf77 and f77modulename and not skipemptyends:
         outmess("""\
-  Warning: You have specified module name for non Fortran 77 code
-  that should not need one (expect if you are scanning F90 code
-  for non module blocks but then you should use flag -skipemptyends
-  and also be sure that the files do not contain programs without program statement).
+  Warning: You have specified module name for non Fortran 77 code that
+  should not need one (expect if you are scanning F90 code for non
+  module blocks but then you should use flag -skipemptyends and also
+  be sure that the files do not contain programs without program
+  statement).
 """, 0)
 
     postlist = crackfortran(files)
     if pyffilename:
         outmess('Writing fortran code to file %s\n' % repr(pyffilename), 0)
         pyf = crack2fortran(postlist)
-        with open(pyffilename, 'w') as f: 
+        with open(pyffilename, 'w') as f:
             f.write(pyf)
     if showblocklist:
         show(postlist)