summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Rohit Goswami <rgoswami@quansight.com> 2022-12-26 04:50:55 +0530
committer GitHub <noreply@github.com> 2022-12-25 16:20:55 -0700
commit fe73a8498417d762a2102d759fa4c88613b398ef (patch)
tree 5ab14fe516125a7b5aedd1dcfa9840c2d0636419
parent 235dbe1f9ea0955c0119f79a5c6614cd0268ef05 (diff)
download numpy-fe73a8498417d762a2102d759fa4c88613b398ef.tar.gz
BUG: Use whole file for encoding checks with `charset_normalizer` [f2py] (#22872)
* BUG: Use whole file for encoding checks [f2py]
* DOC: Add a code comment
  Co-authored-by: melissawm <melissawm@gmail.com>
* TST: Add a conditional unicode f2py test
* MAINT: Add chardet as a test requirement
* ENH: Cleanup and switch f2py to charset_normalizer
* MAINT: Remove chardet for charset_normalizer
* TST: Simplify UTF-8 encoding [f2py]
  Co-authored-by: melissawm <melissawm@gmail.com>
-rwxr-xr-x numpy/f2py/crackfortran.py 47
-rw-r--r-- numpy/f2py/tests/src/crackfortran/unicode_comment.f90 4
-rw-r--r-- numpy/f2py/tests/test_crackfortran.py 17
-rw-r--r-- test_requirements.txt 2
4 files changed, 45 insertions, 25 deletions
diff --git a/numpy/f2py/crackfortran.py b/numpy/f2py/crackfortran.py
index 27e257c48..84579fdcc 100755
--- a/numpy/f2py/crackfortran.py
+++ b/numpy/f2py/crackfortran.py
@@ -148,9 +148,9 @@ import copy
import platform
import codecs
try:
- import chardet
+ import charset_normalizer
except ImportError:
- chardet = None
+ charset_normalizer = None
from . import __version__
@@ -309,26 +309,31 @@ _free_f90_start = re.compile(r'[^c*]\s*[^\s\d\t]', re.I).match
def openhook(filename, mode):
"""Ensures that filename is opened with correct encoding parameter.
- This function uses chardet package, when available, for
- determining the encoding of the file to be opened. When chardet is
- not available, the function detects only UTF encodings, otherwise,
- ASCII encoding is used as fallback.
+ This function uses charset_normalizer package, when available, for
+ determining the encoding of the file to be opened. When charset_normalizer
+ is not available, the function detects only UTF encodings, otherwise, ASCII
+ encoding is used as fallback.
"""
- bytes = min(32, os.path.getsize(filename))
- with open(filename, 'rb') as f:
- raw = f.read(bytes)
- if raw.startswith(codecs.BOM_UTF8):
- encoding = 'UTF-8-SIG'
- elif raw.startswith((codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE)):
- encoding = 'UTF-32'
- elif raw.startswith((codecs.BOM_LE, codecs.BOM_BE)):
- encoding = 'UTF-16'
+ # Reads in the entire file. Robust detection of encoding.
+ # Correctly handles comments or late stage unicode characters
+ # gh-22871
+ if charset_normalizer is not None:
+ encoding = charset_normalizer.from_path(filename).best().encoding
else:
- if chardet is not None:
- encoding = chardet.detect(raw)['encoding']
- else:
- # hint: install chardet to ensure correct encoding handling
- encoding = 'ascii'
+ # hint: install charset_normalizer for correct encoding handling
+ # No need to read the whole file for trying with startswith
+ nbytes = min(32, os.path.getsize(filename))
+ with open(filename, 'rb') as fhandle:
+ raw = fhandle.read(nbytes)
+ if raw.startswith(codecs.BOM_UTF8):
+ encoding = 'UTF-8-SIG'
+ elif raw.startswith((codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE)):
+ encoding = 'UTF-32'
+ elif raw.startswith((codecs.BOM_LE, codecs.BOM_BE)):
+ encoding = 'UTF-16'
+ else:
+ # Fallback, without charset_normalizer
+ encoding = 'ascii'
return open(filename, mode, encoding=encoding)
@@ -394,7 +399,7 @@ def readfortrancode(ffile, dowithline=show, istop=1):
except UnicodeDecodeError as msg:
raise Exception(
f'readfortrancode: reading {fin.filename()}#{fin.lineno()}'
- f' failed with\n{msg}.\nIt is likely that installing chardet'
+ f' failed with\n{msg}.\nIt is likely that installing charset_normalizer'
' package will help f2py determine the input file encoding'
' correctly.')
if not l:
diff --git a/numpy/f2py/tests/src/crackfortran/unicode_comment.f90 b/numpy/f2py/tests/src/crackfortran/unicode_comment.f90
new file mode 100644
index 000000000..13515ce98
--- /dev/null
+++ b/numpy/f2py/tests/src/crackfortran/unicode_comment.f90
@@ -0,0 +1,4 @@
+subroutine foo(x)
+ real(8), intent(in) :: x
+ ! Écrit à l'écran la valeur de x
+end subroutine
diff --git a/numpy/f2py/tests/test_crackfortran.py b/numpy/f2py/tests/test_crackfortran.py
index dcf8760db..73ac4e276 100644
--- a/numpy/f2py/tests/test_crackfortran.py
+++ b/numpy/f2py/tests/test_crackfortran.py
@@ -1,4 +1,6 @@
+import importlib
import codecs
+import unicodedata
import pytest
import numpy as np
from numpy.f2py.crackfortran import markinnerspaces
@@ -257,13 +259,20 @@ class TestFortranReader(util.F2PyTest):
def test_input_encoding(self, tmp_path, encoding):
# gh-635
f_path = tmp_path / f"input_with_{encoding}_encoding.f90"
- # explicit BOM is required for UTF8
- bom = {'utf-8': codecs.BOM_UTF8}.get(encoding, b'')
with f_path.open('w', encoding=encoding) as ff:
- ff.write(bom.decode(encoding) +
- """
+ ff.write("""
subroutine foo()
end subroutine foo
""")
mod = crackfortran.crackfortran([str(f_path)])
assert mod[0]['name'] == 'foo'
+
+class TestUnicodeComment(util.F2PyTest):
+ sources = [util.getpath("tests", "src", "crackfortran", "unicode_comment.f90")]
+
+ @pytest.mark.skipif(
+ (importlib.util.find_spec("charset_normalizer") is None),
+ reason="test requires charset_normalizer which is not installed",
+ )
+ def test_encoding_comment(self):
+ self.module.foo(3)
diff --git a/test_requirements.txt b/test_requirements.txt
index 3e7d3fef7..67b6a4866 100644
--- a/test_requirements.txt
+++ b/test_requirements.txt
@@ -12,3 +12,5 @@ cffi; python_version < '3.10'
# NOTE: Keep mypy in sync with environment.yml
mypy==0.981; platform_python_implementation != "PyPy"
typing_extensions>=4.2.0
+# for optional f2py encoding detection
+charset-normalizer