From d4744c57cc94431746e09c6d20c47841abd5481b Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 18 Sep 2023 14:08:22 +0200 Subject: [PATCH] make input arg flexible accepting tsv or single fasta/gff/gbff --- .github/workflows/main.yml | 10 +- ppanggolin/main.py | 6 +- ppanggolin/projection/projection.py | 157 +++++++++++++++++----------- ppanggolin/utils.py | 4 +- 4 files changed, 108 insertions(+), 69 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index df85e287..3f07c28b 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -132,15 +132,15 @@ jobs: run: | cd testingDataset head organisms.gbff.list | sed 's/^/input_org_/g' > organisms.gbff.head.list - ppanggolin projection --pangenome stepbystep/pangenome.h5 -o projection_from_gbff \ - --anno organisms.gbff.head.list --fast + ppanggolin projection --pangenome stepbystep/pangenome.h5 -o projection_from_lisy_of_gbff \ + --anno organisms.gbff.head.list - ppanggolin projection --pangenome mybasicpangenome/pangenome.h5 -o projection_from_fasta \ - --organism_name chlam_A --single_fasta_file FASTA/GCF_002776845.1_ASM277684v1_genomic.fna.gz \ + ppanggolin projection --pangenome mybasicpangenome/pangenome.h5 -o projection_from_single_fasta \ + --organism_name chlam_A --fasta FASTA/GCF_002776845.1_ASM277684v1_genomic.fna.gz \ --spot_graph --graph_formats graphml --fast --keep_tmp -f - +Z diff --git a/ppanggolin/main.py b/ppanggolin/main.py index 017cdadd..4747bc7d 100644 --- a/ppanggolin/main.py +++ b/ppanggolin/main.py @@ -144,8 +144,10 @@ def cmd_line() -> argparse.Namespace: "using the --sequences argument, either through the command line or the config file.") if args.subcommand == "projection": - ppanggolin.projection.projection.check_projection_arguments(args, parser) - + # check argument correctness and determine input mode (single or multiple files) and add it to args. 
+ input_mode = ppanggolin.projection.projection.check_projection_arguments(args, parser) + setattr(args, "input_mode", input_mode) + return args diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index 1e3d9fcb..7fb39a32 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -28,7 +28,7 @@ from ppanggolin.annotate import subparser as annotate_subparser from ppanggolin.pangenome import Pangenome # from ppanggolin.genome import input_organism, Gene, RNA, Contig -from ppanggolin.utils import create_tmpdir, read_compressed_or_not, write_compressed_or_not, restricted_float, mk_outdir, get_config_args, parse_config_file, get_default_args, check_input_files +from ppanggolin.utils import detect_filetype, create_tmpdir, read_compressed_or_not, write_compressed_or_not, restricted_float, mk_outdir, get_config_args, parse_config_file, get_default_args, check_input_files from ppanggolin.align.alignOnPang import get_input_seq_to_family_with_rep,get_input_seq_to_family_with_all, project_and_write_partition from ppanggolin.formats.writeSequences import write_gene_sequences_from_annotations from ppanggolin.formats.readBinaries import check_pangenome_info @@ -110,29 +110,26 @@ def launch(args: argparse.Namespace): **{step: argparse.Namespace(**k_v) for step, k_v in pangenome.parameters.items()}) - if args.anno: - genome_name_to_annot_path = parse_input_paths_file(args.anno) + genome_name_to_fasta_path, genome_name_to_annot_path = None, None - elif args.single_annot_file: - circular_contigs = args.circular_contigs if args.circular_contigs else [] - genome_name_to_annot_path = {args.organism_name: {"path": args.single_annot_file, - "circular_contigs": circular_contigs} - } - else: - genome_name_to_annot_path = None - - if args.fasta: - genome_name_to_fasta_path = parse_input_paths_file(args.fasta) + if args.input_mode == "multiple": + if args.anno: + genome_name_to_annot_path = parse_input_paths_file(args.anno) 
+ - elif args.single_fasta_file: - circular_contigs = args.circular_contigs if args.circular_contigs else [] - genome_name_to_fasta_path = {args.organism_name: {"path": args.single_fasta_file, - "circular_contigs": circular_contigs} - } - else: - genome_name_to_fasta_path = None + if args.fasta: + genome_name_to_fasta_path = parse_input_paths_file(args.fasta) + else: # args.input_mode == "single": + circular_contigs = args.circular_contigs if args.circular_contigs else [] + if args.anno: + genome_name_to_annot_path = {args.organism_name: {"path": args.anno, + "circular_contigs": circular_contigs}} + + if args.fasta: + genome_name_to_fasta_path = {args.organism_name: {"path": args.fasta, + "circular_contigs": circular_contigs}} + if genome_name_to_annot_path: check_input_names(pangenome, genome_name_to_annot_path) @@ -149,9 +146,6 @@ def launch(args: argparse.Namespace): "FASTA sequences using the --fasta or --single_fasta_file options. Therefore, it is impossible to project the pangenome onto the input genomes. " f"The following organisms have no associated sequence data: {', '.join(o.name for o in organisms_with_no_fasta)}") - - - elif genome_name_to_fasta_path: annotate_param_names = ["norna", "kingdom", "allow_overlap", "prodigal_procedure"] @@ -1060,50 +1054,101 @@ def project_and_write_modules(pangenome: Pangenome, input_organisms: Iterable[Or return input_orgs_to_modules -def check_projection_arguments(args: argparse.Namespace, parser: argparse.ArgumentParser): +def determine_input_mode(input_file: Path, expected_types: list[str], parser: argparse.ArgumentParser) -> str: + """ + Determine the input mode based on the provided input file and expected file types. + + :param input_file: A Path object representing the input file. + :param expected_types: A list of expected file types (e.g., ['fasta', 'gff', 'gbff', 'tsv']). + + :return: A string indicating the input mode ('single' or 'multiple'). 
+ """ + if not input_file.exists(): + parser.error(f"The provided file {input_file} does not exist.") + + try: + filetype = detect_filetype(input_file) + except Exception: + parser.error("Based on its content, the provided file is not recognized as a valid input file. Please ensure it is in one of the supported formats (FASTA, GFF/GBFF, or TSV).") + + if filetype == "tsv": + logging.getLogger('PPanGGOLiN').debug(f"The provided file ({input_file}) is detected as a TSV file.") + mode = "multiple" + elif filetype in expected_types: + logging.getLogger('PPanGGOLiN').debug(f"The provided file ({input_file}) is detected as a single {'/'.join(expected_types)} file.") + mode = "single" + else: + logging.getLogger('PPanGGOLiN').error(f"The provided file {input_file} is not recognized as a valid {'/'.join(expected_types)} file or a TSV file listing names and {'/'.join(expected_types)} files of genomes to annotate.") + parser.error(f"The provided file {input_file} is not recognized as a valid {'/'.join(expected_types)} file or a TSV file listing names and files of genomes to annotate.") + + return mode + + +def check_projection_arguments(args: argparse.Namespace, parser: argparse.ArgumentParser ) -> str: """ Check the arguments provided for genome projection and raise errors if they are incompatible or missing. :param args: An argparse.Namespace object containing parsed command-line arguments. - :param parser: An argparse.ArgumentParser object used to raise errors. + :param parser : parser of the command + :return: A string indicating the input mode ('single' or 'multiple'). """ # Check if we annotate genomes from path files or only a single genome... - if args.fasta or args.anno: + if not args.anno and not args.fasta: + parser.error("Please provide either a FASTA file or a tab-separated file listing sequence files using the '--fasta' option, " + "or an annotation file or a tab-separated file listing annotation files using the '--anno' option. 
" + "You can specify these either through the command line or the configuration file.") + + mode_from_fasta, mode_from_anno = None, None + if args.fasta: + mode_from_fasta = determine_input_mode(args.fasta, ['fasta'], parser) + input_mode = mode_from_fasta + + if args.anno: + mode_from_anno = determine_input_mode(args.anno, ['gff', "gbff"], parser) + input_mode = mode_from_anno + + logging.getLogger('PPanGGOLiN').debug("") + + if mode_from_fasta and mode_from_anno and mode_from_fasta != mode_from_anno: + single_input, multiple_input = ("fasta", "anno") if mode_from_fasta == "single" else ("anno", "fasta") + + parser.error(f"You've provided both a single annotation/fasta file using the '--{single_input}' option and a list of files using " + f"the '--{multiple_input}' option. Please choose either a single file or a tab-separated file listing genome files, but not both.") + + + if input_mode == "multiple": # We are in paths file mode - incompatible_args = ["single_fasta_file", "single_annot_file", "organism_name", "circular_contigs"] + incompatible_args = ["organism_name", "circular_contigs"] for single_arg in incompatible_args: if getattr(args, single_arg) is not None: - parser.error(f"The single genome argument --{single_arg} is incompatible with multiple genomes arguments (--anno and/or --fasta).") - + parser.error("You provided a TSV file listing the files of genomes you wish to annotate. " + f"Therefore, the single genome argument '--{single_arg}' is incompatible with this multiple genomes file.") + if args.fasta: check_input_files(args.fasta, True) if args.anno: check_input_files(args.anno, True) - elif args.single_fasta_file or args.single_annot_file: + elif input_mode == "single": # We are in single file mode if args.organism_name is None: - parser.error("Please specify the name of the input organism you want to annotate. " + parser.error("You directly provided a single FASTA/GBFF/GFF file. Please specify the name of the input organism you want to annotate. 
" "You can use the --organism_name argument either through the command line or the config file.") - - else: - parser.error("Please provide either a sequence file using the '--single_fasta_file' or '--fasta' option, " - "or an annotation file using the '--single_annot_file' or '--anno' option. " - "You can specify these either through the command line or the config file.") - + + return input_mode def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: """ Subparser to launch PPanGGOLiN in Command line - :param sub_parser : sub_parser for align command + :param sub_parser : sub_parser for projection command - :return : parser arguments for align command + :return : parser arguments for projection command """ parser = sub_parser.add_parser( "projection", formatter_class=argparse.ArgumentDefaultsHelpFormatter) @@ -1121,35 +1166,25 @@ def parser_projection(parser: argparse.ArgumentParser): required.add_argument('-p', '--pangenome', required=False, type=Path, help="The pangenome.h5 file") - - required_multiple = parser.add_argument_group(title="Multiple genome arguments", - description="Arguments for annotating multiple genomes with the provided pangenome.") - required_multiple.add_argument('--fasta', required=False, type=Path, - help="A tab-separated file listing the organism names, and the fasta filepath of its genomic " - "sequence(s) (the fastas can be compressed with gzip). One line per organism.") + required.add_argument('--fasta', required=False, type=Path, + help="Specify a FASTA file containing the genomic sequences of the organism(s) you wish to annotate, " + "or provide a tab-separated file listing organism names alongside their respective FASTA filepaths, with one line per organism.") - required_multiple.add_argument('--anno', required=False, type=Path, - help="A tab-separated file listing the organism names, and the gff/gbff filepath of its " - "annotations (the files can be compressed with gzip). One line per organism. 
" - "If provided, those annotations will be used.") + required.add_argument('--anno', required=False, type=Path, + help="Specify an annotation file in GFF/GBFF format for the genome you wish to annotate. " + "Alternatively, you can provide a tab-separated file listing organism names alongside their respective annotation filepaths, " + "with one line per organism. If both an annotation file and a FASTA file are provided, the annotation file will take precedence.") - required_single = parser.add_argument_group(title="Single genome arguments", - description="Arguments for annotating a single genome with the provided pangenome.") + required_single = parser.add_argument_group(title="Single Genome Arguments", + description="Use these options when providing a single FASTA or annotation file:") required_single.add_argument("-n", '--organism_name', required=False, type=str, - help="Specify the name of the input organism whose genome you want to annotate with the provided pangenome.") - - required_single.add_argument('--single_fasta_file', required=False, type=Path, - help="Provide the file path to the genomic sequence(s) in FASTA format for the genome you wish to annotate. " - "(Fasta files can be compressed using gzip)") + help="Specify the name of the organism whose genome you want to annotate when providing a single FASTA or annotation file.") - required_single.add_argument('--single_annot_file', required=False, type=Path, - help="Provide the file path to the annotations in GFF/GBFF format for the genome you want to annotate. 
" - "(Annotation files can be compressed using gzip)") - required_single.add_argument('--circular_contigs', nargs="+", required=False, type=tuple, - help="Contigs of the input genome to consider as circular.") + help="Specify the contigs of the input genome that should be treated as circular when providing a single FASTA or annotation file.") + optional = parser.add_argument_group(title="Optional arguments") diff --git a/ppanggolin/utils.py b/ppanggolin/utils.py index fa9b3f40..1ac197b6 100755 --- a/ppanggolin/utils.py +++ b/ppanggolin/utils.py @@ -92,7 +92,7 @@ def check_tsv_sanity(tsv: Path): except IOError as ios_error: raise IOError(ios_error) except Exception as exception_error: - raise Exception(f"The following unexpected error happened when opening the list of pangenomes : " + raise Exception(f"The following unexpected error happened when opening the list of genomes path: " f"{exception_error}") else: name_set = set() @@ -319,6 +319,8 @@ def detect_filetype(filename: Path) -> str: return 'gff' elif first_line.startswith(">"): return 'fasta' + elif "\t" in first_line: + return "tsv" else: raise Exception("Filetype was not gff3 (file starts with '##gff-version 3') " "nor gbff/gbk (file starts with 'LOCUS '). "