diff options
author | Rohit Goswami <rgoswami@quansight.com> | 2022-12-26 04:50:55 +0530 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-12-25 16:20:55 -0700 |
commit | fe73a8498417d762a2102d759fa4c88613b398ef (patch) | |
tree | 5ab14fe516125a7b5aedd1dcfa9840c2d0636419 | |
parent | 235dbe1f9ea0955c0119f79a5c6614cd0268ef05 (diff) | |
download | numpy-fe73a8498417d762a2102d759fa4c88613b398ef.tar.gz |
BUG: Use whole file for encoding checks with `charset_normalizer` [f2py] (#22872)
* BUG: Use whole file for encoding checks [f2py]
* DOC: Add a code comment
Co-authored-by: melissawm <melissawm@gmail.com>
* TST: Add a conditional unicode f2py test
* MAINT: Add chardet as a test requirement
* ENH: Cleanup and switch f2py to charset_normalizer
* MAINT: Remove chardet for charset_normalizer
* TST: Simplify UTF-8 encoding [f2py]
Co-authored-by: melissawm <melissawm@gmail.com>
-rwxr-xr-x | numpy/f2py/crackfortran.py | 47 | ||||
-rw-r--r-- | numpy/f2py/tests/src/crackfortran/unicode_comment.f90 | 4 | ||||
-rw-r--r-- | numpy/f2py/tests/test_crackfortran.py | 17 | ||||
-rw-r--r-- | test_requirements.txt | 2 |
4 files changed, 45 insertions, 25 deletions
```diff
diff --git a/numpy/f2py/crackfortran.py b/numpy/f2py/crackfortran.py
index 27e257c48..84579fdcc 100755
--- a/numpy/f2py/crackfortran.py
+++ b/numpy/f2py/crackfortran.py
@@ -148,9 +148,9 @@ import copy
 import platform
 import codecs
 try:
-    import chardet
+    import charset_normalizer
 except ImportError:
-    chardet = None
+    charset_normalizer = None

 from . import __version__

@@ -309,26 +309,31 @@ _free_f90_start = re.compile(r'[^c*]\s*[^\s\d\t]', re.I).match
 def openhook(filename, mode):
     """Ensures that filename is opened with correct encoding parameter.

-    This function uses chardet package, when available, for
-    determining the encoding of the file to be opened. When chardet is
-    not available, the function detects only UTF encodings, otherwise,
-    ASCII encoding is used as fallback.
+    This function uses charset_normalizer package, when available, for
+    determining the encoding of the file to be opened. When charset_normalizer
+    is not available, the function detects only UTF encodings, otherwise, ASCII
+    encoding is used as fallback.
     """
-    bytes = min(32, os.path.getsize(filename))
-    with open(filename, 'rb') as f:
-        raw = f.read(bytes)
-    if raw.startswith(codecs.BOM_UTF8):
-        encoding = 'UTF-8-SIG'
-    elif raw.startswith((codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE)):
-        encoding = 'UTF-32'
-    elif raw.startswith((codecs.BOM_LE, codecs.BOM_BE)):
-        encoding = 'UTF-16'
+    # Reads in the entire file. Robust detection of encoding.
+    # Correctly handles comments or late stage unicode characters
+    # gh-22871
+    if charset_normalizer is not None:
+        encoding = charset_normalizer.from_path(filename).best().encoding
     else:
-        if chardet is not None:
-            encoding = chardet.detect(raw)['encoding']
-        else:
-            # hint: install chardet to ensure correct encoding handling
-            encoding = 'ascii'
+        # hint: install charset_normalizer for correct encoding handling
+        # No need to read the whole file for trying with startswith
+        nbytes = min(32, os.path.getsize(filename))
+        with open(filename, 'rb') as fhandle:
+            raw = fhandle.read(nbytes)
+            if raw.startswith(codecs.BOM_UTF8):
+                encoding = 'UTF-8-SIG'
+            elif raw.startswith((codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE)):
+                encoding = 'UTF-32'
+            elif raw.startswith((codecs.BOM_LE, codecs.BOM_BE)):
+                encoding = 'UTF-16'
+            else:
+                # Fallback, without charset_normalizer
+                encoding = 'ascii'
     return open(filename, mode, encoding=encoding)


@@ -394,7 +399,7 @@ def readfortrancode(ffile, dowithline=show, istop=1):
         except UnicodeDecodeError as msg:
             raise Exception(
                 f'readfortrancode: reading {fin.filename()}#{fin.lineno()}'
-                f' failed with\n{msg}.\nIt is likely that installing chardet'
+                f' failed with\n{msg}.\nIt is likely that installing charset_normalizer'
                 ' package will help f2py determine the input file encoding'
                 ' correctly.')
         if not l:
diff --git a/numpy/f2py/tests/src/crackfortran/unicode_comment.f90 b/numpy/f2py/tests/src/crackfortran/unicode_comment.f90
new file mode 100644
index 000000000..13515ce98
--- /dev/null
+++ b/numpy/f2py/tests/src/crackfortran/unicode_comment.f90
@@ -0,0 +1,4 @@
+subroutine foo(x)
+  real(8), intent(in) :: x
+  ! Écrit à l'écran la valeur de x
+end subroutine
diff --git a/numpy/f2py/tests/test_crackfortran.py b/numpy/f2py/tests/test_crackfortran.py
index dcf8760db..73ac4e276 100644
--- a/numpy/f2py/tests/test_crackfortran.py
+++ b/numpy/f2py/tests/test_crackfortran.py
@@ -1,4 +1,6 @@
+import importlib
 import codecs
+import unicodedata
 import pytest
 import numpy as np
 from numpy.f2py.crackfortran import markinnerspaces
@@ -257,13 +259,20 @@ class TestFortranReader(util.F2PyTest):
     def test_input_encoding(self, tmp_path, encoding):
         # gh-635
         f_path = tmp_path / f"input_with_{encoding}_encoding.f90"
-        # explicit BOM is required for UTF8
-        bom = {'utf-8': codecs.BOM_UTF8}.get(encoding, b'')
         with f_path.open('w', encoding=encoding) as ff:
-            ff.write(bom.decode(encoding) +
-                     """
+            ff.write("""
                      subroutine foo()
                      end subroutine foo
                      """)
         mod = crackfortran.crackfortran([str(f_path)])
         assert mod[0]['name'] == 'foo'
+
+class TestUnicodeComment(util.F2PyTest):
+    sources = [util.getpath("tests", "src", "crackfortran", "unicode_comment.f90")]
+
+    @pytest.mark.skipif(
+        (importlib.util.find_spec("charset_normalizer") is None),
+        reason="test requires charset_normalizer which is not installed",
+    )
+    def test_encoding_comment(self):
+        self.module.foo(3)
diff --git a/test_requirements.txt b/test_requirements.txt
index 3e7d3fef7..67b6a4866 100644
--- a/test_requirements.txt
+++ b/test_requirements.txt
@@ -12,3 +12,5 @@ cffi; python_version < '3.10'
 # NOTE: Keep mypy in sync with environment.yml
 mypy==0.981; platform_python_implementation != "PyPy"
 typing_extensions>=4.2.0
+# for optional f2py encoding detection
+charset-normalizer
```