diff --git a/docs/pipeline_combine.png b/docs/pipeline_combine.png index 6edad49..5c81307 100644 Binary files a/docs/pipeline_combine.png and b/docs/pipeline_combine.png differ diff --git a/docs/pipeline_combine.pu b/docs/pipeline_combine.pu index 6549857..b0ccd9e 100644 --- a/docs/pipeline_combine.pu +++ b/docs/pipeline_combine.pu @@ -12,7 +12,7 @@ split again end split #lightblue:start tradis pipeline compare; - +:normalise plot files; split : create count file for original plot and original embl; diff --git a/quatradis/comparison/essentiality_analysis.py b/quatradis/comparison/essentiality_analysis.py index 17174a0..9f97200 100644 --- a/quatradis/comparison/essentiality_analysis.py +++ b/quatradis/comparison/essentiality_analysis.py @@ -1,7 +1,6 @@ import csv import os from dataclasses import dataclass -from tempfile import mkstemp from quatradis.comparison.essentiality import GeneEssentiality from quatradis.util.file_handle_helpers import ensure_output_dir_exists @@ -33,7 +32,7 @@ def get_all_gene_names(control_files, condition_files): gene_names1 = [r[1] for r in reader if len(r) > 1 and r[1] != "gene_name"] all_gene_names = all_gene_names.union(set(gene_names1)) - for f in control_files: + for filename in control_files: with open(filename, "r") as fileh: reader = csv.reader(fileh, delimiter="\t", quotechar='"') gene_names2 = [r[1] for r in reader if len(r) > 1 and r[1] != "gene_name"] @@ -51,7 +50,7 @@ def all_gene_essentiality(input: EssentialityInput, analysis_type, verbose=False genes_ess = {g: GeneEssentiality() for g in all_gene_names} if analysis_type == "original": for f in input.only_ess_condition_files: - ess_gene_names = gene_names_from_essentiality_file(f, verbose) + ess_gene_names = gene_names_from_essentiality_file(f) if verbose: print("ess_gene_names condition: " + str(len(ess_gene_names))) print("genes_ess: " + str(len(genes_ess))) @@ -60,7 +59,7 @@ def all_gene_essentiality(input: EssentialityInput, analysis_type, verbose=False genes_ess[e].condition += 1 genes_ess[e].number_of_reps = len(input.only_ess_condition_files) for f in input.only_ess_control_files: - ess_gene_names = gene_names_from_essentiality_file(f, verbose) + ess_gene_names = gene_names_from_essentiality_file(f) if verbose: print("ess_gene_names control: " + str(len(ess_gene_names))) print("genes_ess: " + str(len(genes_ess))) diff --git a/quatradis/comparison/scatterplot.py b/quatradis/comparison/scatterplot.py index f4b959b..9a0a9eb 100644 --- a/quatradis/comparison/scatterplot.py +++ b/quatradis/comparison/scatterplot.py @@ -48,7 +48,7 @@ def __init__(self, conditions, controls, window_size, prefix, normalise=False, v self.normalise = normalise if normalise: - n = NormalisePlots(self.conditions + self.controls, 0.0000001, verbose=self.verbose) + n = NormalisePlots(self.conditions + self.controls, 0.0000001, output_dir="normalised", verbose=self.verbose) plotfiles, max_reads = n.create_normalised_files() self.conditions = plotfiles[0:len(self.conditions)] self.controls = plotfiles[len(self.conditions):] diff --git a/quatradis/comparison/split.py b/quatradis/comparison/split.py index d44a903..cafe6d7 100644 --- a/quatradis/comparison/split.py +++ b/quatradis/comparison/split.py @@ -27,7 +27,7 @@ def get_combined_file_path(self): return self._construct_file_path("combined") def _create_split_plot_file(self, forward, reverse, filename): - p = PlotFromValuesGenerator(forward, reverse, filename, self.gzipped) + p = PlotFromValuesGenerator(forward, reverse, filename) p.construct_file() def create_all_files(self): diff --git a/quatradis/comparison/tradis_comparison.R b/quatradis/comparison/tradis_comparison.R index d677a02..7b3ab2d 100755 --- a/quatradis/comparison/tradis_comparison.R +++ b/quatradis/comparison/tradis_comparison.R @@ -119,4 +119,4 @@ abline(v=2, col="red") #write results -write.table(diff,file=opt$output,append=FALSE, quote=TRUE, sep=",", row.names=FALSE, col.names=c("locus_tag","gene_name","function","logFC","logCPM","PValue","q.value")) +write.table(diff,file=opt$output,append=FALSE, quote=FALSE, sep=",", row.names=FALSE, col.names=c("locus_tag","gene_name","function","logFC","logCPM","PValue","q.value")) diff --git a/quatradis/pipelines/compare.smk b/quatradis/pipelines/compare.smk index 64cd049..d940219 100644 --- a/quatradis/pipelines/compare.smk +++ b/quatradis/pipelines/compare.smk @@ -1,21 +1,30 @@ import os import shutil +input_files=[] +norm_files=[] plotnames=[] controlnames=[] conditionnames=[] -plot_lut={} +norm_lut={} for p in config["condition_files"]: + input_files.append(p) plotname = os.path.basename(p).split('.')[0] plotnames.append(plotname) conditionnames.append(plotname) - plot_lut[plotname]=p + for p in config["control_files"]: + input_files.append(p) plotname = os.path.basename(p).split('.')[0] plotnames.append(plotname) controlnames.append(plotname) - plot_lut[plotname]=p + +for p in input_files: + plotname = os.path.basename(p).split('.')[0] + n=os.path.join(config["output_dir"], "analysis", plotname, "original.plot.gz") + norm_files.append(n) + norm_lut[plotname]=n ZCAT_CMD='gzcat' if not shutil.which('gzcat'): @@ -24,14 +33,27 @@ if not shutil.which('gzcat'): else: raise Error("Couldn't find gzcat or zcat on your system. Please install and try again.") +def make_no_normalise_cmd(): + cmds = [] + for i in range(len(input_files)): + out_dir = os.path.join(config["output_dir"], "analysis", plotname) + cmds.append("mkdir -p " + out_dir) + cmds.append("if [[ $(file " + input_files[i] + " | grep ASCII | wc -l | cut -f 1) > 0 ]]; then gzip -c " + input_files[i] + " > " + norm_files[i] + "; else cp " + input_files[i] + " " + norm_files[i] + "; fi") + + return "; ".join(cmds) + +def make_normalise_cmd(): + return "tradis plot normalise -o " + os.path.join(config["output_dir"], "analysis") + " -n original.plot.gz " + \ + ("--minimum_proportion_insertions=" + config["minimum_proportion_insertions"] if config["minimum_proportion_insertions"] else "") + \ + " " + " ".join(input_files) rule finish: input: expand(os.path.join(config["output_dir"], "gene_report.tsv")), expand(os.path.join(config["output_dir"], "comparison", "{type}", "plot_absscatter.png"), type=["original", "forward", "reverse", "combined"]), - #expand(os.path.join(config["output_dir"],"comparison","{type}","{type}.compare.csv"), type=["original", "forward", "reverse", "combined"]), - expand(os.path.join(config["output_dir"],"comparison","{type}","essentiality.csv"), type=["original", "forward", "reverse", "combined"]), - expand(os.path.join(config["output_dir"],"analysis","{plot}","{type}.count.tsv.essen.csv"), type=["original", "combined", "forward", "reverse"], plot=plotnames) + #expand(os.path.join(config["output_dir"], "comparison", "{type}", "{type}.compare.csv"), type=["original", "forward", "reverse", "combined"]), + expand(os.path.join(config["output_dir"], "comparison", "{type}", "essentiality.csv"), type=["original", "forward", "reverse", "combined"]), + expand(os.path.join(config["output_dir"], "analysis", "{plot}", "{type}.count.tsv.essen.csv"), type=["original", "combined", "forward", "reverse"], plot=plotnames) run: print("All done!") @@ -49,10 +71,21 @@ rule prepare_embl: prime_feature_size="--prime_feature_size=" + config["prime_feature_size"] if config["prime_feature_size"] else "" shell: "tradis compare prepare_embl --output={output} {params.minimum_threshold} {params.window_size} {params.window_interval} {params.prime_feature_size} --emblfile {input.embl} {input.plot}" +rule normalise: + input: + input_files + output: + norm_files + message: "Normalising plot files" + log: os.path.join(config["output_dir"], "analysis", "normalise.log") + params: + cmd=make_normalise_cmd() if config["normalise"] else make_no_normalise_cmd() + shell: "{params.cmd}" + rule split_plots: input: - p=lambda wildcards: plot_lut[wildcards.plot] + p=lambda wildcards: norm_lut[wildcards.plot] output: c=os.path.join(config["output_dir"], "analysis", "{plot}", "combined.plot.gz"), f=os.path.join(config["output_dir"], "analysis", "{plot}", "forward.plot.gz"), @@ -74,34 +107,9 @@ rule count_plots: params: output_dir=os.path.join(config["output_dir"], "analysis", "{plot}"), suffix="count.tsv" - wildcard_constraints: - type="(?!original).*" shell: "tradis plot count -o {params.output_dir} -s {params.suffix} {input.embl} {input.p}" -rule copy_original: - input: - lambda wildcards: plot_lut[wildcards.plot] - output: - os.path.join(config["output_dir"],"analysis","{plot}","original.plot.gz") - message: "Copying original plot file to target folder: {input}" - params: - output_dir=os.path.join(config["output_dir"],"analysis","{plot}") - shell: "if [[ $(file {input} | grep ASCII | wc -l | cut -f 1) > 0 ]]; then gzip -c {input} > {output}; else cp {input} {output}; fi" - - -rule count_original_plots: - input: - p=rules.copy_original.output, - embl=config["annotations"] - output: - os.path.join(config["output_dir"], "analysis", "{plot}", "original.count.tsv") - message: "Analysing original plot and embl file {input.p}, {input.embl}" - params: - output_dir=os.path.join(config["output_dir"], "analysis", "{plot}") - shell: "tradis plot count -o {params.output_dir} -s count.tsv {input.embl} {input.p}" - - rule essentiality: input: c=os.path.join(config["output_dir"], "analysis", "{plot}", "{type}.count.tsv"), diff --git a/quatradis/pipelines/pipeline.py b/quatradis/pipelines/pipeline.py index 2ef30dd..856dba9 100644 --- a/quatradis/pipelines/pipeline.py +++ b/quatradis/pipelines/pipeline.py @@ -8,42 +8,90 @@ def add_subparser(subparsers): - pipeline_parser_desc = "TraDIS pipelines that stitch together other tools in this package." + pipeline_parser_desc = ( + "TraDIS pipelines that stitch together other tools in this package." + ) pipeline_parser = subparsers.add_parser("pipeline", help=pipeline_parser_desc) pipeline_subparsers = pipeline_parser.add_subparsers(title=pipeline_parser_desc) - create_parser("create_plots", pipeline_subparsers, create_plots_pipeline, create_plots_options, - "Creates transponson insertion site plot files for multiple fastqs in parallel where possible using snakemake.", - description='''This pipeline uses snakemake, therefore it is possible to customise how this operates to distribute the workload - across a cluster using a snakemake config file.''', - usage="tradis pipeline create_plots [options] ") - - create_parser("compare", pipeline_subparsers, compare_pipeline, compare_options, - "Calculates gene essentiality for a set of transposon insertion site plots files ", - description='''This pipeline uses snakemake, therefore it is possible to customise how this operates to distribute the workload - across a cluster using a snakemake config file.''', - usage="tradis pipeline compare [options]") + create_parser( + "create_plots", + pipeline_subparsers, + create_plots_pipeline, + create_plots_options, + "Creates transponson insertion site plot files for multiple fastqs in parallel where possible using snakemake.", + description="""This pipeline uses snakemake, therefore it is possible to customise how this operates to distribute the workload + across a cluster using a snakemake config file.""", + usage="tradis pipeline create_plots [options] ", + ) + + create_parser( + "compare", + pipeline_subparsers, + compare_pipeline, + compare_options, + "Calculates gene essentiality for a set of transposon insertion site plots files ", + description="""This pipeline uses snakemake, therefore it is possible to customise how this operates to distribute the workload + across a cluster using a snakemake config file.""", + usage="tradis pipeline compare [options]", + ) def create_plots_options(parser): - parser.add_argument('fastqs', type=str, - help='Either a line separated text file containing a list of fastq formatted reads for processing or a single fastq file (can be gzipped).') - parser.add_argument('reference', type=str, - help='The fasta formatted reference for processing.') - parser.add_argument('-o', '--output_dir', default="results", - help='The output directory to use for all output files (default: results)') - parser.add_argument('-n', '--threads', type=int, default=1, - help='number of threads to use when mapping and sorting (default: 1)') - parser.add_argument('-a', '--aligner', default="bwa", - help='mapping tool to use (bwa, smalt, minimap2) (default: bwa)') - parser.add_argument('-m', '--mapping_score', type=int, default=30, - help='mapping quality must be greater than X (Default: 30)') - parser.add_argument('-t', '--tag', type=str, default="", - help='the tag to remove from fastq input (default: "")') - parser.add_argument('-mm', '--mismatch', type=int, default=0, - help='number of mismatches allowed when matching tag (default: 0)') - parser.add_argument('-sp', '--snakemake_profile', type=str, - help='If provided, pass this directory onto snakemake. Assumes there is a file called "config.yaml" in that directory.') + parser.add_argument( + "fastqs", + type=str, + help="Either a line separated text file containing a list of fastq formatted reads for processing or a single fastq file (can be gzipped).", + ) + parser.add_argument( + "reference", type=str, help="The fasta formatted reference for processing." + ) + parser.add_argument( + "-o", + "--output_dir", + default="results", + help="The output directory to use for all output files (default: results)", + ) + parser.add_argument( + "-n", + "--threads", + type=int, + default=1, + help="number of threads to use when mapping and sorting (default: 1)", + ) + parser.add_argument( + "-a", + "--aligner", + default="bwa", + help="mapping tool to use (bwa, smalt, minimap2) (default: bwa)", + ) + parser.add_argument( + "-m", + "--mapping_score", + type=int, + default=30, + help="mapping quality must be greater than X (Default: 30)", + ) + parser.add_argument( + "-t", + "--tag", + type=str, + default="", + help='the tag to remove from fastq input (default: "")', + ) + parser.add_argument( + "-mm", + "--mismatch", + type=int, + default=0, + help="number of mismatches allowed when matching tag (default: 0)", + ) + parser.add_argument( + "-sp", + "--snakemake_profile", + type=str, + help='If provided, pass this directory onto snakemake. Assumes there is a file called "config.yaml" in that directory.', + ) def create_plots_pipeline(args): @@ -55,15 +103,15 @@ def create_plots_pipeline(args): fastqs = [args.fastqs] if not fhh.is_fastq(args.fastqs): - with open(args.fastqs, 'r') as fql: + with open(args.fastqs, "r") as fql: fastqs = [x.strip() for x in fql.readlines() if x] snakemake_config = os.path.join(args.output_dir, "create_plots_config.yaml") fastq_dir, fq_fn = os.path.split(args.fastqs) - with open(snakemake_config, 'w') as ofql: + with open(snakemake_config, "w") as ofql: ofql.write(create_yaml_option("output_dir", args.output_dir)) ofql.write(create_yaml_option("reference", args.reference)) - ofql.write("fastq_dir: \"" + fastq_dir + "\"\n") + ofql.write('fastq_dir: "' + fastq_dir + '"\n') ofql.write("fastqs:\n") for x in fastqs: ofql.write("- " + x + "\n") @@ -75,30 +123,86 @@ def create_plots_pipeline(args): pipeline = find_pipeline_file("create_plots.smk") - start_snakemake(pipeline, snakemake_config, threads=args.threads, - snakemake_profile=args.snakemake_profile, verbose=args.verbose) + start_snakemake( + pipeline, + snakemake_config, + threads=args.threads, + snakemake_profile=args.snakemake_profile, + verbose=args.verbose, + ) def compare_options(parser): - parser.add_argument('--condition_files', type=str, nargs='+', - help='A set of condition plot files to process (can be gzipped)') - parser.add_argument('--control_files', type=str, nargs='+', - help='A set of control plot files to process (can be gzipped)') - parser.add_argument('-o', '--output_dir', default="results", - help='The output directory to use for all output files (default: results)') - parser.add_argument('-n', '--threads', type=int, default=1, - help='number of threads to use when processing (default: 1)') - parser.add_argument('--annotations', '-a', - help='If provided genes in this EMBL annotations file will expanded based on data in the plotfile.', - type=str, default=None) - parser.add_argument('--minimum_threshold', '-m', - help='Only include insert sites with this number or greater insertions', type=int, default=5) - parser.add_argument('--prime_feature_size', '-z', - help='Feature size when adding 5/3 prime block when --use_annotation', type=int, default=198) - parser.add_argument('--window_interval', '-l', help='Window interval', type=int, default=25) - parser.add_argument('--window_size', '-w', help='Window size', type=int, default=100) - parser.add_argument('-sp', '--snakemake_profile', type=str, - help='If provided, pass this directory onto snakemake. Assumes there is a file called "config.yaml" in that directory.') + parser.add_argument( + "--condition_files", + type=str, + nargs="+", + help="A set of condition plot files to process (can be gzipped)", + ) + parser.add_argument( + "--control_files", + type=str, + nargs="+", + help="A set of control plot files to process (can be gzipped)", + ) + parser.add_argument( + "-o", + "--output_dir", + default="results", + help="The output directory to use for all output files (default: results)", + ) + parser.add_argument( + "--disable_normalisation", + action='store_true', + default=False, + help="Don't normalise the plots prior to comparison (default: false)", + ) + parser.add_argument( + "-n", + "--threads", + type=int, + default=1, + help="number of threads to use when processing (default: 1)", + ) + parser.add_argument( + "--annotations", + "-a", + help="If provided genes in this EMBL annotations file will expanded based on data in the plotfile.", + type=str, + default=None, + ) + parser.add_argument( + "--minimum_threshold", + "-m", + help="Only include insert sites with this number or greater insertions", + type=int, + default=5, + ) + parser.add_argument( + "--minimum_proportion_insertions", + help="If the proportion of insertions is too low compared to control, dont call decreased insertions below this level", + type=float, + default=0.1, + ) + parser.add_argument( + "--prime_feature_size", + "-z", + help="Feature size when adding 5/3 prime block when --use_annotation", + type=int, + default=198, + ) + parser.add_argument( + "--window_interval", "-l", help="Window interval", type=int, default=25 + ) + parser.add_argument( + "--window_size", "-w", help="Window size", type=int, default=100 + ) + parser.add_argument( + "-sp", + "--snakemake_profile", + type=str, + help='If provided, pass this directory onto snakemake. Assumes there is a file called "config.yaml" in that directory.', + ) def compare_pipeline(args): @@ -110,12 +214,14 @@ def compare_pipeline(args): raise ValueError("Must have equal number of control and condition files") if len(args.control_files) <= 1: - raise ValueError("Must have 2 or more replicates of control and condition files") + raise ValueError( + "Must have 2 or more replicates of control and condition files" + ) fhh.ensure_output_dir_exists(args.output_dir) snakemake_config = os.path.join(args.output_dir, "analysis_config.yaml") - with open(snakemake_config, 'w') as ofql: + with open(snakemake_config, "w") as ofql: ofql.write(create_yaml_option("output_dir", args.output_dir)) ofql.write("condition_files:\n") for x in args.condition_files: @@ -123,29 +229,44 @@ def compare_pipeline(args): ofql.write("control_files:\n") for x in args.control_files: ofql.write("- " + x + "\n") + ofql.write(create_yaml_option("normalise", not args.disable_normalisation, bool=True)) ofql.write(create_yaml_option("threads", args.threads, num=True)) ofql.write(create_yaml_option("annotations", args.annotations)) ofql.write(create_yaml_option("minimum_threshold", args.minimum_threshold)) ofql.write(create_yaml_option("prime_feature_size", args.prime_feature_size)) ofql.write(create_yaml_option("window_interval", args.window_interval)) ofql.write(create_yaml_option("window_size", args.window_size)) + ofql.write( + create_yaml_option( + "minimum_proportion_insertions", args.minimum_proportion_insertions + ) + ) pipeline = find_pipeline_file("compare.smk") - start_snakemake(pipeline, snakemake_config, threads=args.threads, snakemake_profile=args.snakemake_profile, - verbose=args.verbose) + start_snakemake( + pipeline, + snakemake_config, + threads=args.threads, + snakemake_profile=args.snakemake_profile, + verbose=args.verbose, + ) -def start_snakemake(snakefile, snakemake_config, threads=1, snakemake_profile=None, verbose=False): +def start_snakemake( + snakefile, snakemake_config, threads=1, snakemake_profile=None, verbose=False +): if verbose: print("Using snakemake config files at: " + ", ".join([snakemake_config])) print("Starting snakemake pipeline") - cmd_list = ["snakemake", - "--snakefile=" + snakefile, - "--configfile=" + snakemake_config, - "--cores=" + str(threads), - "--printshellcmds"] + cmd_list = [ + "snakemake", + "--snakefile=" + snakefile, + "--configfile=" + snakemake_config, + "--cores=" + str(threads), + "--printshellcmds", + ] if snakemake_profile: cmd_list.append("--profile=" + snakemake_profile) @@ -187,11 +308,13 @@ def find_pipeline_file(pipeline_file): raise RuntimeError("Could not find nextflow pipeline file.") -def create_yaml_option(option, value, num=False): +def create_yaml_option(option, value, num=False, bool=False): opt = option + ": " if num: opt += str(value) + elif bool: + opt += "true" if value else "false" else: - opt += "\"" + str(value) + "\"" + opt += '"' + str(value) + '"' opt += "\n" return opt diff --git a/quatradis/tisp/cli.py b/quatradis/tisp/cli.py index 2ded00b..a547fbb 100644 --- a/quatradis/tisp/cli.py +++ b/quatradis/tisp/cli.py @@ -8,6 +8,7 @@ from quatradis.tisp.analyse import count_insert_sites from quatradis.tisp.normalise import NormalisePlots + def add_subparser(subparsers): plot_parser_desc = "TraDIS plot file tools" plot_parser = subparsers.add_parser("plot", help=plot_parser_desc) @@ -103,9 +104,13 @@ def normalise_plot_options(parser): parser.add_argument('--minimum_proportion_insertions', '-d', help='If the proportion of insertions is too low compared to control, dont call decreased insertions below this level', type=float, default=0.1) + parser.add_argument('-o', '--output_dir', type=str, default="", + help='The directory in which to put all output files (default: current working directory)') + parser.add_argument('-n', '--output_filename', type=str, default="normalised.plot.gz", + help='The name for the normalised output files. Normalised files will be distinguished by directory') def normalise_plots(args): - n = NormalisePlots(args.plot_files, args.minimum_proportion_insertions, output_temp_files=False) + n = NormalisePlots(args.plot_files, args.minimum_proportion_insertions, output_dir=args.output_dir, output_filename=args.output_filename, verbose=args.verbose) plotfiles, max_plot_reads = n.create_normalised_files() n.decreased_insertion_reporting(max_plot_reads) \ No newline at end of file diff --git a/quatradis/tisp/generator/from_values.py b/quatradis/tisp/generator/from_values.py index 897341a..7f497b4 100644 --- a/quatradis/tisp/generator/from_values.py +++ b/quatradis/tisp/generator/from_values.py @@ -1,6 +1,10 @@ +import os + import numpy import pyximport +from quatradis.util.file_handle_helpers import ensure_output_dir_exists + pyximport.install() from quatradis.tisp.write import write_plot_file, write_gzipped_plot_file @@ -10,12 +14,10 @@ class PlotFromValuesGenerator: """ Takes in arrays for forward and reverse integers and creates a new file """ - - def __init__(self, forward, reverse, filename, gzipped=False): + def __init__(self, forward, reverse, filename): self.forward = forward self.reverse = reverse self.filename = filename - self.gzipped = gzipped self.forward_length = len(self.forward) self.reverse_length = len(self.reverse) @@ -29,7 +31,7 @@ def construct_file(self): if len(self.reverse) == 0: self.reverse = numpy.zeros(p_len, dtype=float) - if self.gzipped: + if self.filename.endswith('.gz'): write_gzipped_plot_file(self.filename, self.forward, self.reverse, p_len) else: write_plot_file(self.filename, self.forward, self.reverse, p_len) diff --git a/quatradis/tisp/normalise.py b/quatradis/tisp/normalise.py index 4d2b780..01d097a 100644 --- a/quatradis/tisp/normalise.py +++ b/quatradis/tisp/normalise.py @@ -1,7 +1,6 @@ - -from tempfile import mkstemp +import os import numpy - +from quatradis.util.file_handle_helpers import ensure_output_dir_exists from quatradis.tisp.parser import PlotParser from quatradis.tisp.generator.from_values import PlotFromValuesGenerator @@ -9,9 +8,10 @@ class NormalisePlots: '''Given a set of plot files, find the one with the highest number of reads and normalise all the rest into new files''' - def __init__(self, plotfiles, minimum_proportion_insertions, output_temp_files=True, verbose=False): + def __init__(self, plotfiles, minimum_proportion_insertions, output_dir=".", output_filename="normalised.plot_gz", verbose=False): self.plotfiles = plotfiles - self.output_temp_files=output_temp_files + self.output_dir = output_dir + self.output_filename = output_filename self.verbose = verbose self.minimum_proportion_insertions = minimum_proportion_insertions self.plot_objs = self.read_plots() @@ -20,9 +20,10 @@ def create_normalised_files(self): plot_objs, max_plot_reads = self.normalise() output_files = [] for p in self.plotfiles: - output_filename = p + ".norm" - if self.output_temp_files: - fd, output_filename = mkstemp() + plotname = os.path.basename(p).split('.')[0] + out_dir = os.path.join(self.output_dir, plotname) + output_filename = os.path.join(out_dir, self.output_filename) + ensure_output_dir_exists(out_dir) pg = PlotFromValuesGenerator(plot_objs[p].forward, plot_objs[p].reverse, output_filename) pg.construct_file() output_files.append(output_filename) diff --git a/quatradis/tisp/parser.py b/quatradis/tisp/parser.py index ec5bf4f..a7a6f3c 100644 --- a/quatradis/tisp/parser.py +++ b/quatradis/tisp/parser.py @@ -176,7 +176,7 @@ def __init__(self, filename, minimum_threshold=0): self.split_lines() def split_lines(self): - insert_site_array = pandas.read_csv(self.filename, delim_whitespace=True, dtype=float, engine='c', + insert_site_array = pandas.read_csv(self.filename, sep='\s+', dtype=float, engine='c', header=None).values self.genome_length = len(insert_site_array) diff --git a/tests/py/comparison/scatterplot_test.py b/tests/py/comparison/scatterplot_test.py index 9a6ba1a..174a390 100644 --- a/tests/py/comparison/scatterplot_test.py +++ b/tests/py/comparison/scatterplot_test.py @@ -1,3 +1,4 @@ +import shutil import unittest import os from quatradis.comparison.scatterplot import ScatterPlot @@ -24,3 +25,4 @@ def test_valid(self): os.remove("scattertest_absscatter.png") os.remove("scattertest_linear.png") os.remove("scattertest_scatter.png") + shutil.rmtree("normalised") diff --git a/tests/py/tisp/normalise_test.py b/tests/py/tisp/normalise_test.py index 8284549..8243fde 100644 --- a/tests/py/tisp/normalise_test.py +++ b/tests/py/tisp/normalise_test.py @@ -1,40 +1,76 @@ import filecmp import os +import shutil import unittest from quatradis.tisp.normalise import NormalisePlots -class ErrorReadingFile(Exception): pass +class ErrorReadingFile(Exception): + pass -class InvalidFileFormat(Exception): pass +class InvalidFileFormat(Exception): + pass -data_dir = os.path.join('data', 'tisp', 'normalise') -plotparser_dir = os.path.join('data', 'tisp', 'parser') +data_dir = os.path.join("data", "tisp", "normalise") +plotparser_dir = os.path.join("data", "tisp", "parser") class TestNormalisePlots(unittest.TestCase): def test_big_differences(self): - p = NormalisePlots([os.path.join(data_dir, 'sample1'), os.path.join(data_dir, 'sample2')], 0.1) + p = NormalisePlots( + [os.path.join(data_dir, "sample1"), os.path.join(data_dir, "sample2")], + 0.1, + output_dir="normalised", + ) output_files, max_reads = p.create_normalised_files() self.assertEqual(2, len(output_files)) - self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'sample2'), output_files[1])) - self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'expected_sample1'), output_files[0])) + self.assertTrue(filecmp.cmp(os.path.join(data_dir, "sample2"), output_files[1])) + self.assertTrue( + filecmp.cmp(os.path.join(data_dir, "expected_sample1"), output_files[0]) + ) self.assertTrue(p.decreased_insertion_reporting()) + shutil.rmtree("normalised") def test_ignore_decreased_insertions(self): - p = NormalisePlots([os.path.join(data_dir, 'lowinsertions'), os.path.join(data_dir, 'highinsertions')], 0.1) + p = NormalisePlots( + [ + os.path.join(data_dir, "lowinsertions"), + os.path.join(data_dir, "highinsertions"), + ], + 0.1, + output_dir="normalised" + ) self.assertFalse(p.decreased_insertion_reporting()) def test_ignore_decreased_insertions_insert_sites(self): - p = NormalisePlots([os.path.join(data_dir, 'fewinsertions'), os.path.join(data_dir, 'manyinsertions')], 0.1) + p = NormalisePlots( + [ + os.path.join(data_dir, "fewinsertions"), + os.path.join(data_dir, "manyinsertions"), + ], + 0.1, + output_dir="normalised", + ) self.assertFalse(p.decreased_insertion_reporting()) def test_large_file_zipped(self): - p = NormalisePlots([os.path.join(plotparser_dir, 'Control2.out.CP009273.insert_site_plot.gz'), - os.path.join(plotparser_dir, 'Chloramrep2-MICpool.out.CP009273.insert_site_plot.gz')], 0.1) + p = NormalisePlots( + [ + os.path.join( + plotparser_dir, "Control2.out.CP009273.insert_site_plot.gz" + ), + os.path.join( + plotparser_dir, + "Chloramrep2-MICpool.out.CP009273.insert_site_plot.gz", + ), + ], + 0.1, + output_dir="normalised", + ) output_files, max_reads = p.create_normalised_files() self.assertTrue(p.decreased_insertion_reporting()) + shutil.rmtree("normalised") diff --git a/tests/scripts/plot_test.sh b/tests/scripts/plot_test.sh index 1c20b9f..f5f4c36 100755 --- a/tests/scripts/plot_test.sh +++ b/tests/scripts/plot_test.sh @@ -37,6 +37,6 @@ rm -r temp_test echo "ok" echo -n "Checking 'tradis plot normalise' ... " -./tradis plot normalise $DATA_DIR/normalise/fewinsertions $DATA_DIR/normalise/manyinsertions > /dev/null 2>&1 -rm $DATA_DIR/normalise/*.norm +./tradis plot normalise --output_dir temp_test $DATA_DIR/normalise/fewinsertions $DATA_DIR/normalise/manyinsertions > /dev/null 2>&1 +rm -r temp_test echo "ok" \ No newline at end of file