feat(normalisation): adding plot file normalisation to the compare pi…

…peline We add normalisation to the beginning of the plot compare pipeline. fix no normalisation issue
quadram-institute-bioscience · Feb 25, 2024 · 9559db5 · 9559db5
1 parent 8c7ac03
commit 9559db5
Show file tree

Hide file tree

Showing 15 changed files with 309 additions and 133 deletions.
diff --git a/docs/pipeline_combine.png b/docs/pipeline_combine.png
diff --git a/docs/pipeline_combine.pu b/docs/pipeline_combine.pu
@@ -12,7 +12,7 @@ split again
 end split
 
 #lightblue:start tradis pipeline compare;
-
+:normalise plot files;
 split
 : create count file for
 original plot and original embl;

diff --git a/quatradis/comparison/essentiality_analysis.py b/quatradis/comparison/essentiality_analysis.py
@@ -1,7 +1,6 @@
 import csv
 import os
 from dataclasses import dataclass
-from tempfile import mkstemp
 
 from quatradis.comparison.essentiality import GeneEssentiality
 from quatradis.util.file_handle_helpers import ensure_output_dir_exists
@@ -33,7 +32,7 @@ def get_all_gene_names(control_files, condition_files):
             gene_names1 = [r[1] for r in reader if len(r) > 1 and r[1] != "gene_name"]
             all_gene_names = all_gene_names.union(set(gene_names1))
 
-    for f in control_files:
+    for filename in control_files:
         with open(filename, "r") as fileh:
             reader = csv.reader(fileh, delimiter="\t", quotechar='"')
             gene_names2 = [r[1] for r in reader if len(r) > 1 and r[1] != "gene_name"]
@@ -51,7 +50,7 @@ def all_gene_essentiality(input: EssentialityInput, analysis_type, verbose=False
     genes_ess = {g: GeneEssentiality() for g in all_gene_names}
     if analysis_type == "original":
         for f in input.only_ess_condition_files:
-            ess_gene_names = gene_names_from_essentiality_file(f, verbose)
+            ess_gene_names = gene_names_from_essentiality_file(f)
             if verbose:
                 print("ess_gene_names condition: " + str(len(ess_gene_names)))
                 print("genes_ess: " + str(len(genes_ess)))
@@ -60,7 +59,7 @@ def all_gene_essentiality(input: EssentialityInput, analysis_type, verbose=False
                     genes_ess[e].condition += 1
                 genes_ess[e].number_of_reps = len(input.only_ess_condition_files)
         for f in input.only_ess_control_files:
-            ess_gene_names = gene_names_from_essentiality_file(f, verbose)
+            ess_gene_names = gene_names_from_essentiality_file(f)
             if verbose:
                 print("ess_gene_names control: " + str(len(ess_gene_names)))
                 print("genes_ess: " + str(len(genes_ess)))

diff --git a/quatradis/comparison/scatterplot.py b/quatradis/comparison/scatterplot.py
@@ -48,7 +48,7 @@ def __init__(self, conditions, controls, window_size, prefix, normalise=False, v
         self.normalise = normalise
 
         if normalise:
-            n = NormalisePlots(self.conditions + self.controls, 0.0000001, verbose=self.verbose)
+            n = NormalisePlots(self.conditions + self.controls, 0.0000001, output_dir="normalised", verbose=self.verbose)
             plotfiles, max_reads = n.create_normalised_files()
             self.conditions = plotfiles[0:len(self.conditions)]
             self.controls = plotfiles[len(self.conditions):]

diff --git a/quatradis/comparison/split.py b/quatradis/comparison/split.py
@@ -27,7 +27,7 @@ def get_combined_file_path(self):
         return self._construct_file_path("combined")
 
     def _create_split_plot_file(self, forward, reverse, filename):
-        p = PlotFromValuesGenerator(forward, reverse, filename, self.gzipped)
+        p = PlotFromValuesGenerator(forward, reverse, filename)
         p.construct_file()
 
     def create_all_files(self):

diff --git a/quatradis/comparison/tradis_comparison.R b/quatradis/comparison/tradis_comparison.R
@@ -119,4 +119,4 @@ abline(v=2, col="red")
 
 
 #write results
-write.table(diff,file=opt$output,append=FALSE, quote=TRUE, sep=",", row.names=FALSE, col.names=c("locus_tag","gene_name","function","logFC","logCPM","PValue","q.value"))
+write.table(diff,file=opt$output,append=FALSE, quote=FALSE, sep=",", row.names=FALSE, col.names=c("locus_tag","gene_name","function","logFC","logCPM","PValue","q.value"))
diff --git a/quatradis/pipelines/compare.smk b/quatradis/pipelines/compare.smk
@@ -1,21 +1,30 @@
 import os
 import shutil
 
+input_files=[]
+norm_files=[]
 plotnames=[]
 controlnames=[]
 conditionnames=[]
-plot_lut={}
+norm_lut={}
 for p in config["condition_files"]:
+    input_files.append(p)
     plotname = os.path.basename(p).split('.')[0]
     plotnames.append(plotname)
     conditionnames.append(plotname)
-    plot_lut[plotname]=p
+
 
 for p in config["control_files"]:
+    input_files.append(p)
     plotname = os.path.basename(p).split('.')[0]
     plotnames.append(plotname)
     controlnames.append(plotname)
-    plot_lut[plotname]=p
+
+for p in input_files:
+    plotname = os.path.basename(p).split('.')[0]
+    n=os.path.join(config["output_dir"], "analysis", plotname, "original.plot.gz")
+    norm_files.append(n)
+    norm_lut[plotname]=n
 
 ZCAT_CMD='gzcat'
 if not shutil.which('gzcat'):
@@ -24,14 +33,27 @@ if not shutil.which('gzcat'):
 	else:
 		raise Error("Couldn't find gzcat or zcat on your system.  Please install and try again.")
 
+def make_no_normalise_cmd():
+	cmds = []
+	for i in range(len(input_files)):
+		out_dir = os.path.join(config["output_dir"], "analysis", plotname)
+		cmds.append("mkdir -p " + out_dir)
+		cmds.append("if [[ $(file " + input_files[i] + " | grep ASCII | wc -l | cut -f 1) > 0 ]]; then gzip -c " + input_files[i] + " > " + norm_files[i] + "; else cp " + input_files[i] + " " + norm_files[i] + "; fi")
+
+	return "; ".join(cmds)
+
+def make_normalise_cmd():
+    return "tradis plot normalise -o " + os.path.join(config["output_dir"], "analysis") + " -n original.plot.gz " + \
+        ("--minimum_proportion_insertions=" + config["minimum_proportion_insertions"] if config["minimum_proportion_insertions"] else "") + \
+            " " + " ".join(input_files)
 
 rule finish:
     input:
         expand(os.path.join(config["output_dir"], "gene_report.tsv")),
         expand(os.path.join(config["output_dir"], "comparison", "{type}", "plot_absscatter.png"), type=["original", "forward", "reverse", "combined"]),
-        #expand(os.path.join(config["output_dir"],"comparison","{type}","{type}.compare.csv"), type=["original", "forward", "reverse", "combined"]),
-        expand(os.path.join(config["output_dir"],"comparison","{type}","essentiality.csv"), type=["original", "forward", "reverse", "combined"]),
-        expand(os.path.join(config["output_dir"],"analysis","{plot}","{type}.count.tsv.essen.csv"), type=["original", "combined", "forward", "reverse"], plot=plotnames)
+        #expand(os.path.join(config["output_dir"], "comparison", "{type}", "{type}.compare.csv"), type=["original", "forward", "reverse", "combined"]),
+        expand(os.path.join(config["output_dir"], "comparison", "{type}", "essentiality.csv"), type=["original", "forward", "reverse", "combined"]),
+        expand(os.path.join(config["output_dir"], "analysis", "{plot}", "{type}.count.tsv.essen.csv"), type=["original", "combined", "forward", "reverse"], plot=plotnames)
     run:
         print("All done!")
 
@@ -49,10 +71,21 @@ rule prepare_embl:
         prime_feature_size="--prime_feature_size=" + config["prime_feature_size"] if config["prime_feature_size"] else ""
     shell: "tradis compare prepare_embl --output={output} {params.minimum_threshold} {params.window_size} {params.window_interval} {params.prime_feature_size} --emblfile {input.embl} {input.plot}"
 
+rule normalise:
+    input:
+        input_files
+    output:
+        norm_files
+    message: "Normalising plot files"
+    log: os.path.join(config["output_dir"], "analysis", "normalise.log")
+    params:
+        cmd=make_normalise_cmd() if config["normalise"] else make_no_normalise_cmd()
+    shell: "{params.cmd}"
+
 
 rule split_plots:
     input:
-        p=lambda wildcards: plot_lut[wildcards.plot]
+        p=lambda wildcards: norm_lut[wildcards.plot]
     output:
         c=os.path.join(config["output_dir"], "analysis", "{plot}", "combined.plot.gz"),
         f=os.path.join(config["output_dir"], "analysis", "{plot}", "forward.plot.gz"),
@@ -74,34 +107,9 @@ rule count_plots:
     params:
         output_dir=os.path.join(config["output_dir"], "analysis", "{plot}"),
         suffix="count.tsv"
-    wildcard_constraints:
-        type="(?!original).*"
     shell: "tradis plot count -o {params.output_dir} -s {params.suffix} {input.embl} {input.p}"
 
 
-rule copy_original:
-    input:
-        lambda wildcards: plot_lut[wildcards.plot]
-    output:
-        os.path.join(config["output_dir"],"analysis","{plot}","original.plot.gz")
-    message: "Copying original plot file to target folder: {input}"
-    params:
-        output_dir=os.path.join(config["output_dir"],"analysis","{plot}")
-    shell: "if [[ $(file {input} | grep ASCII | wc -l | cut -f 1) > 0 ]]; then gzip -c {input} > {output}; else cp {input} {output}; fi"
-
-
-rule count_original_plots:
-	input:
-		p=rules.copy_original.output,
-		embl=config["annotations"]
-	output:
-		os.path.join(config["output_dir"], "analysis", "{plot}", "original.count.tsv")
-	message: "Analysing original plot and embl file {input.p}, {input.embl}"
-	params:
-		output_dir=os.path.join(config["output_dir"], "analysis", "{plot}")
-	shell: "tradis plot count -o {params.output_dir} -s count.tsv {input.embl} {input.p}"
-
-
 rule essentiality:
     input:
         c=os.path.join(config["output_dir"], "analysis", "{plot}", "{type}.count.tsv"),
Original file line number	Diff line number	Diff line change
Expand Up		@@ -119,4 +119,4 @@ abline(v=2, col="red")


		#write results
		write.table(diff,file=opt$output,append=FALSE, quote=TRUE, sep=",", row.names=FALSE, col.names=c("locus_tag","gene_name","function","logFC","logCPM","PValue","q.value"))
		write.table(diff,file=opt$output,append=FALSE, quote=FALSE, sep=",", row.names=FALSE, col.names=c("locus_tag","gene_name","function","logFC","logCPM","PValue","q.value"))