Normalisation and bug fixes (#39)

* fix(essentiality): essentiality analysis report now contains data Previously we were using the .all.csv as input to the essentiality report file, however this expect the .tsv input from the insertion site analysis output. This fix changes that. Also we remove the .all.csv file from output generated by the R script as it doesn't serve any purpose. Also updated the docs. fixing stuff * feat(normalisation): adding plot file normalisation to the compare pipeline We add normalisation to the beginning of the plot compare pipeline. fix no normalisation issue --------- Co-authored-by: Sarah Bastkowski <[email protected]>
quadram-institute-bioscience · Feb 25, 2024 · 5e47b2e · 5e47b2e
1 parent f843d9d
commit 5e47b2e
Show file tree

Hide file tree

Showing 22 changed files with 472 additions and 203 deletions.
diff --git a/README.md b/README.md
@@ -149,6 +149,9 @@ conda activate quatradis
 # Install mamba if you haven't already
 conda install -c conda-forge mamba
 
+# Install edgeR separately because this doesn't get imported correctly with quatradis.
+mamba install -c conda-forge -c bioconda bioconductor-edger
+
 # Install quatradis
 mamba install -c conda-forge -c bioconda quatradis
 ```

diff --git a/docs/pipeline_combine.png b/docs/pipeline_combine.png
diff --git a/docs/pipeline_combine.pu b/docs/pipeline_combine.pu
@@ -12,7 +12,7 @@ split again
 end split
 
 #lightblue:start tradis pipeline compare;
-
+:normalise plot files;
 split
 : create count file for
 original plot and original embl;
@@ -37,10 +37,8 @@ split
 each plot file;
 :essentiality analysis;
 split again
-:compare plot files
-create figures;
-split again
-: logfc comparison;
+:compare insertion sites
+create logfc and pval for genes;
 : create gene report;
 end split
 

diff --git a/quatradis/comparison/cli.py b/quatradis/comparison/cli.py
@@ -3,7 +3,7 @@
 from quatradis.comparison.scatterplot import scatterplot_run
 from quatradis.comparison.presence_absence import presence_absence_run
 from quatradis.comparison.comparison import prep_essentiality_pipeline_output_for_comparison, run_comparisons, \
-    generate_logfc_plot
+    insertion_site_comparison
 from quatradis.comparison.gene_stats import gene_statistics
 from quatradis.comparison.plot_logfc import PlotLogOptions
 from quatradis.comparison.split import split_plot as sp
@@ -16,9 +16,9 @@ def add_subparser(subparsers):
     compare_parser_desc = "Comparative analysis and visualisation of TraDIS experiments (albatradis)"
     compare_parser = subparsers.add_parser("compare", help=compare_parser_desc)
     compare_subparsers = compare_parser.add_subparsers(title=compare_parser_desc)
-    create_parser("logfc_plot", compare_subparsers, logfc_plot, add_logfc_options,
-                  "Run logfc analysis for a specific plot file direction: forward, reverse, combined, original over all experiments.",
-                  usage="tradis compare logfc_plot [options] --condition_dirs <condition_plotfiles>+ --control_dirs <control_plotfiles>+ <EMBL_file> <analysis_type>")
+    create_parser("insertion_sites", compare_subparsers, insertion_site_comparison_cli, add_insertion_site_comparison_options,
+                  "Run an insertion site comparison for a specific plot file direction: forward, reverse, combined, original over all experiments to produce a file including edgeR output (logfc, p and q values)",
+                  usage="tradis compare insertion_sites [options] --condition_dirs <condition_plotfiles>+ --control_dirs <control_plotfiles>+ <EMBL_file> <analysis_type>")
     create_parser("presence_absence", compare_subparsers, presence_absence, add_pa_options,
                   "Take in gene report files and produce a heatmap",
                   usage="tradis compare presence_absence [options] <EMBLfile> <gene_reports>+")
@@ -42,7 +42,7 @@ def add_subparser(subparsers):
                   usage="tradis compare essentiality_analysis [options] --controls <control_gene_files> --conditions <condition_gene_files> --ess_controls <control_essential_gene_files> --ess_conditions <condition_essential_gene_files>")
 
 
-def add_logfc_options(parser):
+def add_insertion_site_comparison_options(parser):
     parser.add_argument('emblfile', help='Annotation file in EMBL format', type=str)
     parser.add_argument('analysis_type', help='Analysis type: forward, reverse, combined, original', type=str)
     parser.add_argument('--controls',
@@ -71,7 +71,7 @@ def add_logfc_options(parser):
                         required=True)
 
 
-def logfc_plot(args):
+def insertion_site_comparison_cli(args):
     if len(args.controls) != len(args.conditions):
         raise "Must have equal number of control and condition files to process"
 
@@ -92,8 +92,8 @@ def logfc_plot(args):
         report_decreased_insertions=not args.dont_report_decreased_insertions,
         minimum_block=args.minimum_block)
 
-    plotfile, scoresfile = generate_logfc_plot(args.analysis_type, args.controls, args.conditions,
-                                               args.emblfile, args.prefix, options, args.verbose)
+    plotfile, scoresfile = insertion_site_comparison(args.analysis_type, args.controls, args.conditions,
+                                                     args.emblfile, args.prefix, options, args.verbose)
 
     return plotfile, scoresfile
 

diff --git a/quatradis/comparison/comparison.py b/quatradis/comparison/comparison.py
@@ -1,11 +1,9 @@
-import subprocess
-from tempfile import mkstemp
 import os
-import csv
 import shutil
+import subprocess
+from tempfile import mkstemp
 
 from quatradis.comparison.plot_logfc import PlotLog
-from quatradis.comparison.essentiality import GeneEssentiality
 from quatradis.util.file_handle_helpers import ensure_output_dir_exists
 
 
@@ -45,32 +43,36 @@ def run_comparisons(controls_all, conditions_all, annotations, prefix, compariso
     if verbose:
         print("Running comparison between condition and control and write plot files and gene report.\n")
 
-    forward_plotfile, forward_scoresfile = generate_logfc_plot('forward', controls_all, conditions_all,
-                                                               annotations, prefix, comparison_options, verbose)
+    forward_plotfile, forward_scoresfile = insertion_site_comparison('forward', controls_all, conditions_all, annotations,
+                                                                     prefix, comparison_options, verbose)
     if verbose:
         print("Forward insertions have been compared.\n")
-    reverse_plotfile, reverse_scoresfile = generate_logfc_plot('reverse', controls_all, conditions_all, annotations,
-                                                               prefix, comparison_options, verbose)
+    reverse_plotfile, reverse_scoresfile = insertion_site_comparison('reverse', controls_all, conditions_all, annotations,
+                                                                     prefix, comparison_options, verbose)
     if verbose:
         print("Reverse insertions have been compared.\n")
-    combined_plotfile, combined_scoresfile = generate_logfc_plot('combined', controls_all, conditions_all, annotations,
-                                                                 prefix, comparison_options, verbose)
+    combined_plotfile, combined_scoresfile = insertion_site_comparison('combined', controls_all, conditions_all, annotations,
+                                                                       prefix, comparison_options, verbose)
     if verbose:
         print("Combined insertions have been compared.\n")
-    original_plotfile, original_scoresfile = generate_logfc_plot('original', controls_all, conditions_all, annotations,
-                                                                 prefix, comparison_options, verbose)
+    original_plotfile, original_scoresfile = insertion_site_comparison('original', controls_all, conditions_all, annotations,
+                                                                       prefix, comparison_options, verbose)
     if verbose:
         print("Original insertions have been compared.\n")
 
     return forward_plotfile, reverse_plotfile, combined_plotfile, original_plotfile, \
            forward_scoresfile, reverse_scoresfile, combined_scoresfile, original_scoresfile
 
 
-def generate_logfc_plot(analysis_type, controls_all, conditions_all,
-                        annotations, prefix, options, verbose):
+def insertion_site_comparison(analysis_type, controls_all, conditions_all,
+                              annotations, prefix, options, verbose):
+
+    # Create CSV report file containing LogFC and PValue information for genes based on their insertion sites
     t = TradisComparisonRunner(conditions_all, controls_all, verbose, options.minimum_block,
                                analysis_type, prefix)
     t.run()
+
+    # Create plot sytle files containing the logfc and p and q values at every position in the reference
     p = PlotLog(t.get_report_file_path(), annotations, options,
                 output_plot_filename=os.path.join(prefix, analysis_type + ".logfc.plot"),
                 output_scores_filename=os.path.join(prefix, analysis_type + ".pqvals.plot"))

diff --git a/quatradis/comparison/essentiality.py b/quatradis/comparison/essentiality.py
@@ -74,8 +74,9 @@ def run(self):
             print(cmd)
         subprocess.check_output(cmd, shell=True)
 
+        os.remove(self.count_file + ".all.csv")
+
         if self.verbose:
-            print("all.csv\t" + self.count_file + ".all.csv")
             print("essen.csv\t" + self.count_file + ".essen.csv")
 
         return self

diff --git a/quatradis/comparison/essentiality_analysis.py b/quatradis/comparison/essentiality_analysis.py
@@ -1,9 +1,9 @@
 import csv
 import os
 from dataclasses import dataclass
-from tempfile import mkstemp
 
 from quatradis.comparison.essentiality import GeneEssentiality
+from quatradis.util.file_handle_helpers import ensure_output_dir_exists
 
 
 @dataclass
@@ -13,10 +13,11 @@ class EssentialityInput:
     only_ess_control_files: list
     only_ess_condition_files: list
 
-def gene_names_from_essentiality_file(filename, verbose=False):
-    with open(filename, 'r') as fileh:
-        reader = csv.reader(fileh, delimiter=',', quotechar='"')
-        gene_names = [r[1] for r in reader if r[1] != 'gene_name']
+
+def gene_names_from_essentiality_file(filename):
+    with open(filename, "r") as fileh:
+        reader = csv.reader(fileh, delimiter=",", quotechar='"')
+        gene_names = [r[1] for r in reader if r[1] != "gene_name"]
         print("Number of all genes:" + str(len(gene_names)))
 
     return gene_names
@@ -26,19 +27,20 @@ def get_all_gene_names(control_files, condition_files):
     all_gene_names = set()
 
     for filename in condition_files:
-        with open(filename, 'r') as fileh:
-            reader = csv.reader(fileh, delimiter='\t', quotechar='"')
-            gene_names1 = [r[1] for r in reader if len(r) > 1 and r[1] != 'gene_name']
+        with open(filename, "r") as fileh:
+            reader = csv.reader(fileh, delimiter="\t", quotechar='"')
+            gene_names1 = [r[1] for r in reader if len(r) > 1 and r[1] != "gene_name"]
             all_gene_names = all_gene_names.union(set(gene_names1))
 
-    for f in control_files:
-        with open(filename, 'r') as fileh:
-            reader = csv.reader(fileh, delimiter='\t', quotechar='"')
-            gene_names2 = [r[1] for r in reader if len(r) > 1 and r[1] != 'gene_name']
+    for filename in control_files:
+        with open(filename, "r") as fileh:
+            reader = csv.reader(fileh, delimiter="\t", quotechar='"')
+            gene_names2 = [r[1] for r in reader if len(r) > 1 and r[1] != "gene_name"]
             all_gene_names = all_gene_names.union(set(gene_names2))
 
     return list(all_gene_names)
 
+
 def all_gene_essentiality(input: EssentialityInput, analysis_type, verbose=False):
 
     all_gene_names = get_all_gene_names(input.control_files, input.condition_files)
@@ -48,7 +50,7 @@ def all_gene_essentiality(input: EssentialityInput, analysis_type, verbose=False
     genes_ess = {g: GeneEssentiality() for g in all_gene_names}
     if analysis_type == "original":
         for f in input.only_ess_condition_files:
-            ess_gene_names = gene_names_from_essentiality_file(f, verbose)
+            ess_gene_names = gene_names_from_essentiality_file(f)
             if verbose:
                 print("ess_gene_names condition: " + str(len(ess_gene_names)))
                 print("genes_ess: " + str(len(genes_ess)))
@@ -57,7 +59,7 @@ def all_gene_essentiality(input: EssentialityInput, analysis_type, verbose=False
                     genes_ess[e].condition += 1
                 genes_ess[e].number_of_reps = len(input.only_ess_condition_files)
         for f in input.only_ess_control_files:
-            ess_gene_names = gene_names_from_essentiality_file(f, verbose)
+            ess_gene_names = gene_names_from_essentiality_file(f)
             if verbose:
                 print("ess_gene_names control: " + str(len(ess_gene_names)))
                 print("genes_ess: " + str(len(genes_ess)))
@@ -73,16 +75,19 @@ def all_gene_essentiality(input: EssentialityInput, analysis_type, verbose=False
 
     return genes_ess
 
-def add_gene_essentiality_to_file(input_filename, output_filename, genes_ess, analysis_type):
+
+def add_gene_essentiality_to_file(
+    input_filename, output_filename, genes_ess, analysis_type
+):
     """
     We can add information on gene essentiality to the comparison output,
     but this will not reflect full set of essential genes as the output does not contain all genes
     In order to prevent confusion this is "switched" off, but could be used if uncommented
     """
-    with open(input_filename, 'r') as inputfh:
+    with open(input_filename, "r") as inputfh:
         output_content = []
 
-        reader = csv.reader(inputfh, delimiter=',', quotechar='"')
+        reader = csv.reader(inputfh, delimiter=",", quotechar='"')
         input_content = [r for r in reader]
         # if analysis_type == "original":
         #     print("Number of cells: " + len(input_content))
@@ -99,21 +104,34 @@ def add_gene_essentiality_to_file(input_filename, output_filename, genes_ess, an
 
         output_content = input_content
 
-        with open(output_filename, 'w') as outputfh:
+        with open(output_filename, "w") as outputfh:
             for line in output_content:
                 outputfh.write(",".join(line) + "\n")
 
 
-def essentiality_analysis(input: EssentialityInput, output_dir, analysis_type, output_filename=""):
+def essentiality_analysis(
+    input: EssentialityInput, output_dir, analysis_type, output_filename=""
+):
 
-    #out_csv = mkstemp()
+    # out_csv = mkstemp()
     genes_ess = all_gene_essentiality(input, analysis_type)
-    #add_gene_essentiality_to_file(out_csv, output_filename, genes_ess, analysis_type)
-    #os.remove(out_csv)
+    # add_gene_essentiality_to_file(out_csv, output_filename, genes_ess, analysis_type)
+    # os.remove(out_csv)
 
+    ensure_output_dir_exists(output_dir)
     ess = open(os.path.join(output_dir, "essentiality.csv"), "w+")
     ess.write("Gene,Essentiality,Control,Condition,Replicates\n")
     for e in genes_ess:
-        ess.write(",".join([e, str(genes_ess[e].status()), str(genes_ess[e].control),
-                            str(genes_ess[e].condition), str(genes_ess[e].number_of_reps)]) + "\n")
+        ess.write(
+            ",".join(
+                [
+                    e,
+                    str(genes_ess[e].status()),
+                    str(genes_ess[e].control),
+                    str(genes_ess[e].condition),
+                    str(genes_ess[e].number_of_reps),
+                ]
+            )
+            + "\n"
+        )
     ess.close()
diff --git a/quatradis/comparison/scatterplot.py b/quatradis/comparison/scatterplot.py
@@ -48,7 +48,7 @@ def __init__(self, conditions, controls, window_size, prefix, normalise=False, v
         self.normalise = normalise
 
         if normalise:
-            n = NormalisePlots(self.conditions + self.controls, 0.0000001, verbose=self.verbose)
+            n = NormalisePlots(self.conditions + self.controls, 0.0000001, output_dir="normalised", verbose=self.verbose)
             plotfiles, max_reads = n.create_normalised_files()
             self.conditions = plotfiles[0:len(self.conditions)]
             self.controls = plotfiles[len(self.conditions):]

diff --git a/quatradis/comparison/split.py b/quatradis/comparison/split.py
@@ -27,7 +27,7 @@ def get_combined_file_path(self):
         return self._construct_file_path("combined")
 
     def _create_split_plot_file(self, forward, reverse, filename):
-        p = PlotFromValuesGenerator(forward, reverse, filename, self.gzipped)
+        p = PlotFromValuesGenerator(forward, reverse, filename)
         p.construct_file()
 
     def create_all_files(self):

diff --git a/quatradis/comparison/tradis_comparison.R b/quatradis/comparison/tradis_comparison.R
@@ -119,4 +119,4 @@ abline(v=2, col="red")
 
 
 #write results
-write.table(diff,file=opt$output,append=FALSE, quote=TRUE, sep=",", row.names=FALSE, col.names=c("locus_tag","gene_name","function","logFC","logCPM","PValue","q.value"))
+write.table(diff,file=opt$output,append=FALSE, quote=FALSE, sep=",", row.names=FALSE, col.names=c("locus_tag","gene_name","function","logFC","logCPM","PValue","q.value"))
Original file line number	Diff line number	Diff line change
Expand Up		@@ -119,4 +119,4 @@ abline(v=2, col="red")


		#write results
		write.table(diff,file=opt$output,append=FALSE, quote=TRUE, sep=",", row.names=FALSE, col.names=c("locus_tag","gene_name","function","logFC","logCPM","PValue","q.value"))
		write.table(diff,file=opt$output,append=FALSE, quote=FALSE, sep=",", row.names=FALSE, col.names=c("locus_tag","gene_name","function","logFC","logCPM","PValue","q.value"))