Skip to content

Commit

Permalink
updated argschema to argparse
Browse files Browse the repository at this point in the history
  • Loading branch information
ens-ftricomi committed Nov 24, 2023
1 parent 4ba1bb5 commit 97dbb76
Show file tree
Hide file tree
Showing 11 changed files with 432 additions and 478 deletions.
91 changes: 45 additions & 46 deletions src/python/ensembl/tools/anno/protein_annotation/genblast.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
import signal
import subprocess
from typing import List
import argschema
import argparse

from ensembl.tools.anno.utils._utils import (
check_exe,
Expand Down Expand Up @@ -267,7 +267,7 @@ def _generate_genblast_gtf(genblast_dir: Path) -> None:
Collect output from genblast and create the final gtf file
genblast_dir: Working directory path.
"""
logging.info("AAAAA _generate_genblast_gtf")

output_file = genblast_dir / "annotation.gtf"
with open(output_file, "w+", encoding="utf8") as file_out:
genblast_extension = "_1.1c_2.3_s1_0_16_1"
Expand Down Expand Up @@ -441,70 +441,69 @@ def _set_genblast_attributes(attributes: str, feature_type: str) -> str:
return converted_attributes


class InputSchema(argschema.ArgSchema):
"""Input arguments expected to run TRF."""

masked_genome_file = argschema.fields.InputFile(
required=True, description="Masked genome file path"
)
output_dir = argschema.fields.OutputDir(
required=True, description="Output directory path"
def parse_args():
"""Parse command line arguments."""
parser = argparse.ArgumentParser(description="Genblast arguments")
parser.add_argument(
"--masked_genome_file", required=True, help="Masked genome file path"
)
protein_file = argschema.fields.String(
required=True, description="Path for the protein dataset"
parser.add_argument("--output_dir", required=True, help="Output directory path")
parser.add_argument("--protein_file", required=True, help="Path for the protein dataset")
parser.add_argument(
"--genblast_timeout_secs", type=int, default=10800, help="Genblast timeout period"
)
genblast_timeout_secs = argschema.fields.Integer(
required=False, default=10800, description="Genblast timeout period"
parser.add_argument(
"--max_intron_length", type=int, required=True, help="Maximum intron length"
)
max_intron_length = argschema.fields.Integer(
required=True, description="Maximum intron length"
)
genblast_bin = argschema.fields.String(
required=False,
parser.add_argument(
"--genblast_bin",
default="genblast",
description="Genblast executable path",
help="Genblast executable path",
)
convert2blastmask_bin = argschema.fields.String(
required=False,
parser.add_argument(
"--convert2blastmask_bin",
default="convert2blastmask",
description="convert2blastmask executable path",
)
makeblastdb_bin = argschema.fields.String(
required=False, default="makeblastdb", description="makeblastdb executable path"
help="convert2blastmask executable path",
)
num_threads = argschema.fields.Integer(
required=False, default=1, description="Number of threads"
parser.add_argument(
"--makeblastdb_bin",
default="makeblastdb",
help="makeblastdb executable path",
)
protein_set = argschema.fields.String(
parser.add_argument("--num_threads", type=int, default=1, help="Number of threads")
parser.add_argument(
"--protein_set",
required=True,
description="Protein set [uniprot,orthodb]",
validate=lambda x: x in ["uniprot", "orthodb"],
choices=["uniprot", "orthodb"],
help="Protein set [uniprot, orthodb]",
)
return parser.parse_args()


def main() -> None:
def main():
"""Genblast's entry-point."""
mod = argschema.ArgSchemaParser(schema_type=InputSchema)
log_file_path = create_dir(mod.args["output_dir"], "log") / "genblast.log"
args = parse_args()

log_file_path = create_dir(args.output_dir, "log") / "genblast.log"
loginipath = Path(__file__).parents[6] / "conf" / "logging.conf"

logging.config.fileConfig(
loginipath,
defaults={"logfilename": str(log_file_path)},
disable_existing_loggers=False,
)

run_genblast(
Path(mod.args["masked_genome_file"]),
Path(mod.args["output_dir"]),
Path(mod.args["protein_file"]),
mod.args["max_intron_length"],
mod.args["genblast_timeout_secs"],
Path(mod.args["genblast_bin"]),
Path(mod.args["convert2blastmask_bin"]),
Path(mod.args["makeblastdb_bin"]),
mod.args["num_threads"],
mod.args["protein_set"],
Path(args.masked_genome_file),
Path(args.output_dir),
Path(args.protein_file),
args.max_intron_length,
args.genblast_timeout_secs,
Path(args.genblast_bin),
Path(args.convert2blastmask_bin),
Path(args.makeblastdb_bin),
args.num_threads,
args.protein_set,
)


if __name__ == "__main__":
main()
42 changes: 24 additions & 18 deletions src/python/ensembl/tools/anno/repeat_annotation/dust.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
import subprocess
import tempfile
from typing import List
import argschema
import argparse


from ensembl.tools.anno.utils._utils import (
Expand Down Expand Up @@ -166,36 +166,42 @@ def _create_dust_gtf(
repeat_count += 1


class InputSchema(argschema.ArgSchema):
"""Input arguments expected to run DustMasker."""

genome_file = argschema.fields.InputFile(required=True, description="Genome file path")
output_dir = argschema.fields.OutputDir(required=True, description="Output directory path")
dust_bin = argschema.fields.String(
required=False,

def parse_args():
"""Parse command line arguments."""
parser = argparse.ArgumentParser(description="DustMasker arguments")
parser.add_argument("--genome_file", required=True, help="Genome file path")
parser.add_argument("--output_dir", required=True, help="Output directory path")
parser.add_argument(
"--dust_bin",
default="dustmasker",
description="Dust executable path",
help="Dust executable path",
)
num_threads = argschema.fields.Integer(required=False, default=1, description="Number of threads")

parser.add_argument(
"--num_threads", type=int, default=1, help="Number of threads"
)
return parser.parse_args()

def main() -> None:
def main():
"""Dust's entry-point."""
mod = argschema.ArgSchemaParser(schema_type=InputSchema)
log_file_path = create_dir(mod.args["output_dir"], "log") / "dust.log"
args = parse_args()

log_file_path = create_dir(args.output_dir, "log") / "dust.log"
loginipath = Path(__file__).parents[6] / "conf" / "logging.conf"

logging.config.fileConfig(
loginipath,
defaults={"logfilename": str(log_file_path)},
disable_existing_loggers=False,
)

run_dust(
mod.args["genome_file"],
mod.args["output_dir"],
mod.args["dust_bin"],
mod.args["num_threads"],
args.genome_file,
args.output_dir,
args.dust_bin,
args.num_threads,
)


if __name__ == "__main__":
main()
77 changes: 42 additions & 35 deletions src/python/ensembl/tools/anno/repeat_annotation/red.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,13 @@
"""
__all__ = ["run_red"]

import argparse
import logging
import logging.config
from os import PathLike
from pathlib import Path
import re
import subprocess
import argschema


from ensembl.tools.anno.utils._utils import (
check_exe,
Expand All @@ -37,7 +37,11 @@
logger = logging.getLogger(__name__)


def run_red(genome_file: Path, output_dir: Path, red_bin: Path = Path("Red"),) -> str:
def run_red(
genome_file: Path,
output_dir: Path,
red_bin: Path = Path("Red"),
) -> str:
"""
Run Red on genome file
:param genome_file: Genome file path.
Expand All @@ -46,7 +50,7 @@ def run_red(genome_file: Path, output_dir: Path, red_bin: Path = Path("Red"),) -
:type output_dir: Path
:param red_bin: Red software path.
:type red_bin: Path, default Red
:return: Masked genome file
:rtype: str
"""
Expand Down Expand Up @@ -86,20 +90,20 @@ def run_red(genome_file: Path, output_dir: Path, red_bin: Path = Path("Red"),) -
red_genome_file.symlink_to(genome_file)
try:
if red_genome_file.exists():
logger.info("Running Red")
subprocess.run(
[
red_bin,
"-gnm",
red_genome_dir,
"-msk",
red_mask_dir,
"-rpt",
red_repeat_dir,
],
check=True,
)
except:
logger.info("Running Red")
subprocess.run(
[
red_bin,
"-gnm",
red_genome_dir,
"-msk",
red_mask_dir,
"-rpt",
red_repeat_dir,
],
check=True,
)
except:#pylint:disable=bare-except
logger.error(
"Could not find the genome file in the Red genome dir or sym link \
to the original file. Path expected:\n%s",
Expand Down Expand Up @@ -128,38 +132,41 @@ def _create_red_gtf(repeat_coords_file: Path, output_file: Path):
start = int(result_match.group(2)) + 1
end = int(result_match.group(3)) + 1
gtf_line = (
f"{region_name}\tRed\trepeat\t{start}\t"
f'{end}\t.\t+\t.\trepeat_id "{repeat_id}";\n'
f"{region_name}\tRed\trepeat\t{start}\t" f'{end}\t.\t+\t.\trepeat_id "{repeat_id}";\n'
)
red_out.write(gtf_line)


class InputSchema(argschema.ArgSchema):
"""Input arguments expected to run Red."""

genome_file = argschema.fields.InputFile(
required=True, description="Genome file path"
)
output_dir = argschema.fields.OutputDir(
required=True, description="Output directory path"
)
red_bin = argschema.fields.String(
required=False, default="Red", description="Red executable path",
def parse_args() -> argparse.Namespace:
    """Parse the command line arguments for the Red entry-point.

    Recognised options:
        --genome_file: Genome file path (required).
        --output_dir: Output directory path (required).
        --red_bin: Red executable path (optional, defaults to ``Red``).

    :return: Namespace exposing ``genome_file``, ``output_dir`` and ``red_bin``
        as strings.
    :rtype: argparse.Namespace
    """
    parser = argparse.ArgumentParser(description="Red's arguments")
    parser.add_argument("--genome_file", required=True, help="Genome file path")
    parser.add_argument("--output_dir", required=True, help="Output directory path")
    parser.add_argument(
        "--red_bin",
        # Keep the same default as run_red() (Path("Red")); a lowercase "red"
        # would make the CLI and the library entry-point disagree.
        default="Red",
        help="Red executable path",
    )
    return parser.parse_args()


def main() -> None:
def main():
"""Red's entry-point."""
mod = argschema.ArgSchemaParser(schema_type=InputSchema)
log_file_path = create_dir(mod.args["output_dir"], "log") / "red.log"
args = parse_args()

log_file_path = create_dir(args.output_dir, "log") / "red.log"
loginipath = Path(__file__).parents[6] / "conf" / "logging.conf"

logging.config.fileConfig(
loginipath,
defaults={"logfilename": str(log_file_path)},
disable_existing_loggers=False,
)

run_red(
Path(mod.args["genome_file"]), mod.args["output_dir"], mod.args["red_bin"],
Path(args.genome_file),
args.output_dir,
args.red_bin,
)


Expand Down
Loading

0 comments on commit 97dbb76

Please sign in to comment.