From 0b6d1dffc9d845ac04344557e153b2e3e7457a40 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 16 Oct 2024 17:17:09 +0200 Subject: [PATCH 1/2] filter out non ascii character when found in product --- ppanggolin/annotate/annotate.py | 22 +++++++++++++++++++++- ppanggolin/utils.py | 25 +++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 1 deletion(-) diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py index 69420cff..05607d11 100644 --- a/ppanggolin/annotate/annotate.py +++ b/ppanggolin/annotate/annotate.py @@ -24,7 +24,7 @@ init_contig_counter, contig_counter) from ppanggolin.pangenome import Pangenome from ppanggolin.genome import Organism, Gene, RNA, Contig -from ppanggolin.utils import read_compressed_or_not, mk_file_name, detect_filetype, check_input_files +from ppanggolin.utils import read_compressed_or_not, mk_file_name, detect_filetype, check_input_files, has_non_ascii, replace_non_ascii from ppanggolin.formats import write_pangenome from ppanggolin.metadata import Metadata @@ -53,6 +53,8 @@ def check_annotate_args(args: argparse.Namespace): check_input_files(args.anno, True) + + def create_gene(org: Organism, contig: Contig, gene_counter: int, rna_counter: int, gene_id: str, dbxrefs: Set[str], coordinates: List[Tuple[int, int]], strand: str, gene_type: str, position: int = None, gene_name: str = "", product: str = "", genetic_code: int = 11, protein_id: str = "") -> Gene: @@ -74,6 +76,15 @@ def create_gene(org: Organism, contig: Contig, gene_counter: int, rna_counter: i :param genetic_code: Genetic code used :param protein_id: Protein identifier """ + # check for non ascii character in product field + if has_non_ascii(product): + + logging.getLogger("PPanGGOLiN").warning( + f"In genome '{org.name}', the 'product' field of gene '{gene_id}' contains non-ASCII characters: '{product}'. " + "These characters cannot be stored in the HDF5 file and will be replaced by underscores." + ) + product = replace_non_ascii(product) + start, stop = coordinates[0][0], coordinates[-1][1] @@ -889,6 +900,15 @@ def check_chevrons_in_start_and_stop(start: str, stop: str) -> Tuple[int, int, b is_partial = False product = attributes.pop('PRODUCT', "") + + if has_non_ascii(product): + + logging.getLogger("PPanGGOLiN").warning( + f"In genome '{organism}', the 'product' field of gene '{gene_id}' contains non-ASCII characters: '{product}'. " + "These characters cannot be stored in the HDF5 file and will be replaced by underscores." + ) + product = replace_non_ascii(product) + if contig is None or contig.name != fields_gff[gff_seqname]: # get the current contig diff --git a/ppanggolin/utils.py b/ppanggolin/utils.py index 7145fa71..d0ecc29d 100755 --- a/ppanggolin/utils.py +++ b/ppanggolin/utils.py @@ -1254,3 +1254,28 @@ def run_subprocess(cmd: List[str], output: Path = None, msg: str = "Subprocess f if output is not None: with open(output, 'w') as fout: fout.write(result.stdout) + + + +def has_non_ascii(string_to_test: str) -> bool: + """ + Check if a string contains any non-ASCII characters. + + :param string_to_test: The string to check for non-ASCII characters. + :return: True if the string contains non-ASCII characters, False otherwise. + """ + try: + string_to_test.encode('ascii') + except UnicodeEncodeError: + return True + return False + +def replace_non_ascii(string_with_ascii: str, replacement_string: str = "_") -> str: + """ + Replace all non-ASCII characters in a string with a specified replacement string. + + :param string_with_ascii: The string potentially containing non-ASCII characters. + :param replacement_string: The string to replace non-ASCII characters with (default is '_'). + :return: A new string where all non-ASCII characters have been replaced. + """ + return re.sub(r'[^\x00-\x7F]+', replacement_string, string_with_ascii) From fbe61326238639d53d555b2e6667c3e569e0b02e Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 16 Oct 2024 17:19:29 +0200 Subject: [PATCH 2/2] add some pytest to test ascii filtering fct --- tests/utils/test_utilities.py | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/tests/utils/test_utilities.py b/tests/utils/test_utilities.py index ceeef757..edb48d55 100644 --- a/tests/utils/test_utilities.py +++ b/tests/utils/test_utilities.py @@ -7,8 +7,7 @@ import zipfile from typing import Generator -from ppanggolin.utils import is_compressed, read_compressed_or_not, write_compressed_or_not - +from ppanggolin.utils import is_compressed, read_compressed_or_not, write_compressed_or_not, has_non_ascii, replace_non_ascii class TestCompressed: """ @@ -157,3 +156,27 @@ def test_write_uncompressed(self, plain_file_path: Path) -> None: f.write("Test data") with open(plain_file_path, 'r') as f: assert f.read() == "Test data" + + +# Test cases for has_non_ascii +@pytest.mark.parametrize("input_string, expected", [ + ("Escherichia_coli", False), # All ASCII characters + ("Escherichia_colí", True), # Contains non-ASCII character 'í' + ("simple_string", False), # Simple ASCII string + ("Ωmega", True), # Contains non-ASCII character 'Ω' + ("", False), # Empty string should return False +]) +def test_has_non_ascii(input_string, expected): + assert has_non_ascii(input_string) == expected + +# Test cases for replace_non_ascii +@pytest.mark.parametrize("input_string, replacement, expected", [ + ("Escherichia_coli", "_", "Escherichia_coli"), # All ASCII characters, no replacement needed + ("Escherichia_colí", "_", "Escherichia_col_"), # Replace 'í' with '_' + ("Ωmega", "-", "-mega"), # Replace 'Ω' with '-' + ("Escherichia_Ωcoli", "X", "Escherichia_Xcoli"),# Replace 'Ω' with 'X' + ("", "_", ""), # Empty string, no replacement +]) +def test_replace_non_ascii(input_string, replacement, expected): + assert replace_non_ascii(input_string, replacement) == expected +