BUG: Use whole file for encoding checks with `charset_normalizer` [f2py] (#22872)

* BUG: Use whole file for encoding checks [f2py]
* DOC: Add a code comment

  Co-authored-by: melissawm <melissawm@gmail.com>
* TST: Add a conditional unicode f2py test
* MAINT: Add chardet as a test requirement
* ENH: Cleanup and switch f2py to charset_normalizer
* MAINT: Remove chardet for charset_normalizer
* TST: Simplify UTF-8 encoding [f2py]

  Co-authored-by: melissawm <melissawm@gmail.com>
author: Rohit Goswami <rgoswami@quansight.com> 2022-12-26 04:50:55 +0530
committer: GitHub <noreply@github.com> 2022-12-25 16:20:55 -0700
commit: fe73a8498417d762a2102d759fa4c88613b398ef (patch)
tree: 5ab14fe516125a7b5aedd1dcfa9840c2d0636419
parent: 235dbe1f9ea0955c0119f79a5c6614cd0268ef05 (diff)
download: numpy-fe73a8498417d762a2102d759fa4c88613b398ef.tar.gz
4 files changed, 45 insertions, 25 deletions
diff --git a/numpy/f2py/crackfortran.py b/numpy/f2py/crackfortran.py
index 27e257c48..84579fdcc 100755
--- a/numpy/f2py/crackfortran.py
+++ b/numpy/f2py/crackfortran.py
@@ -148,9 +148,9 @@ import copy
 import platform
 import codecs
 try:
-    import chardet
+    import charset_normalizer
 except ImportError:
-    chardet = None
+    charset_normalizer = None
 
 from . import __version__
 
@@ -309,26 +309,31 @@ _free_f90_start = re.compile(r'[^c*]\s*[^\s\d\t]', re.I).match
 def openhook(filename, mode):
     """Ensures that filename is opened with correct encoding parameter.
 
-    This function uses chardet package, when available, for
-    determining the encoding of the file to be opened. When chardet is
-    not available, the function detects only UTF encodings, otherwise,
-    ASCII encoding is used as fallback.
+    This function uses charset_normalizer package, when available, for
+    determining the encoding of the file to be opened. When charset_normalizer
+    is not available, the function detects only UTF encodings, otherwise, ASCII
+    encoding is used as fallback.
     """
-    bytes = min(32, os.path.getsize(filename))
-    with open(filename, 'rb') as f:
-        raw = f.read(bytes)
-    if raw.startswith(codecs.BOM_UTF8):
-        encoding = 'UTF-8-SIG'
-    elif raw.startswith((codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE)):
-        encoding = 'UTF-32'
-    elif raw.startswith((codecs.BOM_LE, codecs.BOM_BE)):
-        encoding = 'UTF-16'
+    # Reads in the entire file. Robust detection of encoding.
+    # Correctly handles comments or late stage unicode characters
+    # gh-22871
+    if charset_normalizer is not None:
+        encoding = charset_normalizer.from_path(filename).best().encoding
     else:
-        if chardet is not None:
-            encoding = chardet.detect(raw)['encoding']
-        else:
-            # hint: install chardet to ensure correct encoding handling
-            encoding = 'ascii'
+        # hint: install charset_normalizer for correct encoding handling
+        # No need to read the whole file for trying with startswith
+        nbytes = min(32, os.path.getsize(filename))
+        with open(filename, 'rb') as fhandle:
+            raw = fhandle.read(nbytes)
+            if raw.startswith(codecs.BOM_UTF8):
+                encoding = 'UTF-8-SIG'
+            elif raw.startswith((codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE)):
+                encoding = 'UTF-32'
+            elif raw.startswith((codecs.BOM_LE, codecs.BOM_BE)):
+                encoding = 'UTF-16'
+            else:
+                # Fallback, without charset_normalizer
+                encoding = 'ascii'
     return open(filename, mode, encoding=encoding)
 
 
@@ -394,7 +399,7 @@ def readfortrancode(ffile, dowithline=show, istop=1):
         except UnicodeDecodeError as msg:
             raise Exception(
                 f'readfortrancode: reading {fin.filename()}#{fin.lineno()}'
-                f' failed with\n{msg}.\nIt is likely that installing chardet'
+                f' failed with\n{msg}.\nIt is likely that installing charset_normalizer'
                 ' package will help f2py determine the input file encoding'
                 ' correctly.')
         if not l:
diff --git a/numpy/f2py/tests/src/crackfortran/unicode_comment.f90 b/numpy/f2py/tests/src/crackfortran/unicode_comment.f90
new file mode 100644
index 000000000..13515ce98
--- /dev/null
+++ b/numpy/f2py/tests/src/crackfortran/unicode_comment.f90
@@ -0,0 +1,4 @@
+subroutine foo(x)
+  real(8), intent(in) :: x
+  ! Écrit à l'écran la valeur de x
+end subroutine
diff --git a/numpy/f2py/tests/test_crackfortran.py b/numpy/f2py/tests/test_crackfortran.py
index dcf8760db..73ac4e276 100644
--- a/numpy/f2py/tests/test_crackfortran.py
+++ b/numpy/f2py/tests/test_crackfortran.py
@@ -1,4 +1,6 @@
+import importlib
 import codecs
+import unicodedata
 import pytest
 import numpy as np
 from numpy.f2py.crackfortran import markinnerspaces
@@ -257,13 +259,20 @@ class TestFortranReader(util.F2PyTest):
     def test_input_encoding(self, tmp_path, encoding):
         # gh-635
         f_path = tmp_path / f"input_with_{encoding}_encoding.f90"
-        # explicit BOM is required for UTF8
-        bom = {'utf-8': codecs.BOM_UTF8}.get(encoding, b'')
         with f_path.open('w', encoding=encoding) as ff:
-            ff.write(bom.decode(encoding) +
-                     """
+            ff.write("""
                      subroutine foo()
                      end subroutine foo
                      """)
         mod = crackfortran.crackfortran([str(f_path)])
         assert mod[0]['name'] == 'foo'
+
+class TestUnicodeComment(util.F2PyTest):
+    sources = [util.getpath("tests", "src", "crackfortran", "unicode_comment.f90")]
+
+    @pytest.mark.skipif(
+        (importlib.util.find_spec("charset_normalizer") is None),
+        reason="test requires charset_normalizer which is not installed",
+    )
+    def test_encoding_comment(self):
+        self.module.foo(3)
diff --git a/test_requirements.txt b/test_requirements.txt
index 3e7d3fef7..67b6a4866 100644
--- a/test_requirements.txt
+++ b/test_requirements.txt
@@ -12,3 +12,5 @@ cffi; python_version < '3.10'
 # NOTE: Keep mypy in sync with environment.yml
 mypy==0.981; platform_python_implementation != "PyPy"
 typing_extensions>=4.2.0
+# for optional f2py encoding detection
+charset-normalizer
author	Rohit Goswami <rgoswami@quansight.com>	2022-12-26 04:50:55 +0530
committer	GitHub <noreply@github.com>	2022-12-25 16:20:55 -0700
commit	fe73a8498417d762a2102d759fa4c88613b398ef (patch)
tree	5ab14fe516125a7b5aedd1dcfa9840c2d0636419
parent	235dbe1f9ea0955c0119f79a5c6614cd0268ef05 (diff)
download	numpy-fe73a8498417d762a2102d759fa4c88613b398ef.tar.gz