Skip to content

Commit

Permalink
WIP on #799; function to check encoding, and unit tests, complete, bu…
Browse files Browse the repository at this point in the history
…t determining which files to check not yet implemented.
  • Loading branch information
mjordan committed Sep 14, 2024
1 parent 7f7aafa commit ca745ac
Show file tree
Hide file tree
Showing 11 changed files with 63 additions and 3 deletions.
1 change: 1 addition & 0 deletions tests/assets/file_is_utf8_test/false_big5.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
�j���X
1 change: 1 addition & 0 deletions tests/assets/file_is_utf8_test/false_gb2312.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
���ұ�׼
1 change: 1 addition & 0 deletions tests/assets/file_is_utf8_test/false_iso8859-8.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
�?�?�?��
1 change: 1 addition & 0 deletions tests/assets/file_is_utf8_test/false_latin1.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
�, �, �
1 change: 1 addition & 0 deletions tests/assets/file_is_utf8_test/false_oem850.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
�sterreich
1 change: 1 addition & 0 deletions tests/assets/file_is_utf8_test/false_windows1256.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
�����
1 change: 1 addition & 0 deletions tests/assets/file_is_utf8_test/true_ascii.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Testing
1 change: 1 addition & 0 deletions tests/assets/file_is_utf8_test/true_utf8-bom.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
עִבְרִית
1 change: 1 addition & 0 deletions tests/assets/file_is_utf8_test/true_utf8.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is a utf-8 file (no BOM).
25 changes: 25 additions & 0 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import os
from ruamel.yaml import YAML
import collections
import glob
import tempfile
import unittest

Expand Down Expand Up @@ -1611,5 +1612,29 @@ def test_mimeypes_from_extensions_with_configs(self):
self.assertEqual(fixture["mime_type"], mimetype)


class TestFileIsUtf8(unittest.TestCase):
    """Exercise workbench_utils.file_is_utf8() against the fixture files in
    tests/assets/file_is_utf8_test.

    Fixtures whose names start with "true_" are expected to be reported as
    UTF-8; fixtures whose names start with "false_" are expected not to be.
    """

    def _assert_fixtures(self, pattern, expected):
        """Assert that every fixture file matching pattern yields expected.

        Parameters
        ----------
        pattern : str
            Glob pattern, relative to the fixture directory.
        expected : boolean
            The value file_is_utf8() must return for every matching file.
        """
        current_dir = os.path.dirname(os.path.abspath(__file__))
        input_files_dir = os.path.join(current_dir, "assets", "file_is_utf8_test")
        # Globbing on the joined path (rather than using root_dir=) works on
        # Python versions older than 3.10 and yields ready-to-open paths.
        files_to_test = sorted(glob.glob(os.path.join(input_files_dir, pattern)))

        # Without this check the loop below would run zero times and the test
        # would pass even if the fixture directory were missing or empty.
        self.assertTrue(
            files_to_test,
            f'No fixture files matching "{pattern}" found in "{input_files_dir}".',
        )

        for file_to_test in files_to_test:
            # subTest reports the offending fixture and keeps checking the rest.
            with self.subTest(file=os.path.basename(file_to_test)):
                is_utf8 = workbench_utils.file_is_utf8(file_to_test)
                self.assertEqual(is_utf8, expected)

    def test_file_is_utf8(self):
        self._assert_fixtures("true_*.txt", True)

    def test_file_is_not_utf8(self):
        self._assert_fixtures("false_*.txt", False)


if __name__ == "__main__":
unittest.main()
32 changes: 29 additions & 3 deletions workbench_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4724,7 +4724,7 @@ def execute_entity_post_task_script(
# # equivalents (at least the unidecode() equivalents). Also, while Requests requires filenames to be encoded
# # in latin-1, Drupal passes filenames through its validateUtf8() function. So ASCII is a low common denominator
# # of both requirements.
# ascii_only = is_ascii(filename)
# ascii_only = string_is_ascii(filename)
# if ascii_only is False:
# original_filename = copy.copy(filename)
# filename = unidecode(filename)
Expand Down Expand Up @@ -4840,7 +4840,7 @@ def create_file(config, filename, file_fieldname, node_csv_row, node_id):
# equivalents (at least the unidecode() equivalents). Also, while Requests requires filenames to be encoded
# in latin-1, Drupal passes filenames through its validateUtf8() function. So ASCII is a low common denominator
# of both requirements.
ascii_only = is_ascii(filename)
ascii_only = string_is_ascii(filename)
if ascii_only is False:
original_filename = copy.copy(filename)
filename = unidecode(filename)
Expand Down Expand Up @@ -10293,7 +10293,7 @@ def calculate_response_time_trend(config, response_time):
return average


def is_ascii(input):
def string_is_ascii(input):
"""Check if a string contains only ASCII characters."""
"""Parameters
----------
Expand All @@ -10308,6 +10308,32 @@ def is_ascii(input):
return all(ord(c) < 128 for c in input)


def file_is_utf8(file_path):
    """Check if a file is encoded as UTF-8, or backward-compatible encodings such as ASCII. BOM is ignored.

    Parameters
    ----------
    file_path : str
        The absolute or relative path to the file.
    Returns
    -------
    boolean
        True if file is encoded as UTF-8. False if not, if file cannot be found,
        or if the path cannot be read (e.g., it is a directory).
    """
    # Open first and handle failure, rather than testing os.path.exists() and
    # then opening: the file can vanish between the two calls, and a path that
    # exists but is unreadable (a directory, bad permissions) would otherwise
    # raise instead of returning False.
    try:
        with open(file_path, "rb") as f:
            raw_bytes = f.read()
    except FileNotFoundError:
        logging.error(
            f'File "{file_path}" not found; Workbench cannot determine if it is encoded as UTF-8.'
        )
        return False
    except (OSError, ValueError) as e:
        # ValueError covers paths that cannot be represented at the OS level
        # (e.g., embedded null bytes).
        logging.error(
            f'File "{file_path}" could not be read ({e}); Workbench cannot determine if it is encoded as UTF-8.'
        )
        return False

    # "utf-8-sig" accepts UTF-8 with or without a leading byte order mark.
    try:
        raw_bytes.decode("utf-8-sig")
    except UnicodeDecodeError:
        return False
    return True


def quick_delete_node(config, args):
logging.info("--quick_delete_node task started for " + args.quick_delete_node)

Expand Down

0 comments on commit ca745ac

Please sign in to comment.