Skip to content

Commit

Permalink
Normalisation and bug fixes (#39)
Browse files Browse the repository at this point in the history
* fix(essentiality): essentiality analysis report now contains data

Previously we were using the .all.csv as input to the essentiality report file, however this expect the .tsv input from the insertion site analysis output.  This fix changes that.

Also we remove the .all.csv file from output generated by the R script as it doesn't serve any purpose.

Also updated the docs.

fixing stuff

* feat(normalisation): adding plot file normalisation to the compare pipeline

We add normalisation to the beginning of the plot compare pipeline.

fix no normalisation issue

---------

Co-authored-by: Sarah Bastkowski <[email protected]>
  • Loading branch information
maplesond and sbastkowski authored Feb 25, 2024
1 parent f843d9d commit 5e47b2e
Show file tree
Hide file tree
Showing 22 changed files with 472 additions and 203 deletions.
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,9 @@ conda activate quatradis
# Install mamba if you haven't already
conda install -c conda-forge mamba

# Install edgeR separately because this doesn't get imported correctly with quatradis.
mamba install -c conda-forge -c bioconda bioconductor-edger

# Install quatradis
mamba install -c conda-forge -c bioconda quatradis
```
Expand Down
Binary file modified docs/pipeline_combine.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
8 changes: 3 additions & 5 deletions docs/pipeline_combine.pu
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ split again
end split

#lightblue:start tradis pipeline compare;

:normalise plot files;
split
: create count file for
original plot and original embl;
Expand All @@ -37,10 +37,8 @@ split
each plot file;
:essentiality analysis;
split again
:compare plot files
create figures;
split again
: logfc comparison;
:compare insertion sites
create logfc and pval for genes;
: create gene report;
end split

Expand Down
16 changes: 8 additions & 8 deletions quatradis/comparison/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from quatradis.comparison.scatterplot import scatterplot_run
from quatradis.comparison.presence_absence import presence_absence_run
from quatradis.comparison.comparison import prep_essentiality_pipeline_output_for_comparison, run_comparisons, \
generate_logfc_plot
insertion_site_comparison
from quatradis.comparison.gene_stats import gene_statistics
from quatradis.comparison.plot_logfc import PlotLogOptions
from quatradis.comparison.split import split_plot as sp
Expand All @@ -16,9 +16,9 @@ def add_subparser(subparsers):
compare_parser_desc = "Comparative analysis and visualisation of TraDIS experiments (albatradis)"
compare_parser = subparsers.add_parser("compare", help=compare_parser_desc)
compare_subparsers = compare_parser.add_subparsers(title=compare_parser_desc)
create_parser("logfc_plot", compare_subparsers, logfc_plot, add_logfc_options,
"Run logfc analysis for a specific plot file direction: forward, reverse, combined, original over all experiments.",
usage="tradis compare logfc_plot [options] --condition_dirs <condition_plotfiles>+ --control_dirs <control_plotfiles>+ <EMBL_file> <analysis_type>")
create_parser("insertion_sites", compare_subparsers, insertion_site_comparison_cli, add_insertion_site_comparison_options,
"Run an insertion site comparison for a specific plot file direction: forward, reverse, combined, original over all experiments to produce a file including edgeR output (logfc, p and q values)",
usage="tradis compare insertion_sites [options] --condition_dirs <condition_plotfiles>+ --control_dirs <control_plotfiles>+ <EMBL_file> <analysis_type>")
create_parser("presence_absence", compare_subparsers, presence_absence, add_pa_options,
"Take in gene report files and produce a heatmap",
usage="tradis compare presence_absence [options] <EMBLfile> <gene_reports>+")
Expand All @@ -42,7 +42,7 @@ def add_subparser(subparsers):
usage="tradis compare essentiality_analysis [options] --controls <control_gene_files> --conditions <condition_gene_files> --ess_controls <control_essential_gene_files> --ess_conditions <condition_essential_gene_files>")


def add_logfc_options(parser):
def add_insertion_site_comparison_options(parser):
parser.add_argument('emblfile', help='Annotation file in EMBL format', type=str)
parser.add_argument('analysis_type', help='Analysis type: forward, reverse, combined, original', type=str)
parser.add_argument('--controls',
Expand Down Expand Up @@ -71,7 +71,7 @@ def add_logfc_options(parser):
required=True)


def logfc_plot(args):
def insertion_site_comparison_cli(args):
if len(args.controls) != len(args.conditions):
raise "Must have equal number of control and condition files to process"

Expand All @@ -92,8 +92,8 @@ def logfc_plot(args):
report_decreased_insertions=not args.dont_report_decreased_insertions,
minimum_block=args.minimum_block)

plotfile, scoresfile = generate_logfc_plot(args.analysis_type, args.controls, args.conditions,
args.emblfile, args.prefix, options, args.verbose)
plotfile, scoresfile = insertion_site_comparison(args.analysis_type, args.controls, args.conditions,
args.emblfile, args.prefix, options, args.verbose)

return plotfile, scoresfile

Expand Down
30 changes: 16 additions & 14 deletions quatradis/comparison/comparison.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
import subprocess
from tempfile import mkstemp
import os
import csv
import shutil
import subprocess
from tempfile import mkstemp

from quatradis.comparison.plot_logfc import PlotLog
from quatradis.comparison.essentiality import GeneEssentiality
from quatradis.util.file_handle_helpers import ensure_output_dir_exists


Expand Down Expand Up @@ -45,32 +43,36 @@ def run_comparisons(controls_all, conditions_all, annotations, prefix, compariso
if verbose:
print("Running comparison between condition and control and write plot files and gene report.\n")

forward_plotfile, forward_scoresfile = generate_logfc_plot('forward', controls_all, conditions_all,
annotations, prefix, comparison_options, verbose)
forward_plotfile, forward_scoresfile = insertion_site_comparison('forward', controls_all, conditions_all, annotations,
prefix, comparison_options, verbose)
if verbose:
print("Forward insertions have been compared.\n")
reverse_plotfile, reverse_scoresfile = generate_logfc_plot('reverse', controls_all, conditions_all, annotations,
prefix, comparison_options, verbose)
reverse_plotfile, reverse_scoresfile = insertion_site_comparison('reverse', controls_all, conditions_all, annotations,
prefix, comparison_options, verbose)
if verbose:
print("Reverse insertions have been compared.\n")
combined_plotfile, combined_scoresfile = generate_logfc_plot('combined', controls_all, conditions_all, annotations,
prefix, comparison_options, verbose)
combined_plotfile, combined_scoresfile = insertion_site_comparison('combined', controls_all, conditions_all, annotations,
prefix, comparison_options, verbose)
if verbose:
print("Combined insertions have been compared.\n")
original_plotfile, original_scoresfile = generate_logfc_plot('original', controls_all, conditions_all, annotations,
prefix, comparison_options, verbose)
original_plotfile, original_scoresfile = insertion_site_comparison('original', controls_all, conditions_all, annotations,
prefix, comparison_options, verbose)
if verbose:
print("Original insertions have been compared.\n")

return forward_plotfile, reverse_plotfile, combined_plotfile, original_plotfile, \
forward_scoresfile, reverse_scoresfile, combined_scoresfile, original_scoresfile


def generate_logfc_plot(analysis_type, controls_all, conditions_all,
annotations, prefix, options, verbose):
def insertion_site_comparison(analysis_type, controls_all, conditions_all,
annotations, prefix, options, verbose):

# Create CSV report file containing LogFC and PValue information for genes based on their insertion sites
t = TradisComparisonRunner(conditions_all, controls_all, verbose, options.minimum_block,
analysis_type, prefix)
t.run()

# Create plot sytle files containing the logfc and p and q values at every position in the reference
p = PlotLog(t.get_report_file_path(), annotations, options,
output_plot_filename=os.path.join(prefix, analysis_type + ".logfc.plot"),
output_scores_filename=os.path.join(prefix, analysis_type + ".pqvals.plot"))
Expand Down
3 changes: 2 additions & 1 deletion quatradis/comparison/essentiality.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,9 @@ def run(self):
print(cmd)
subprocess.check_output(cmd, shell=True)

os.remove(self.count_file + ".all.csv")

if self.verbose:
print("all.csv\t" + self.count_file + ".all.csv")
print("essen.csv\t" + self.count_file + ".essen.csv")

return self
Expand Down
66 changes: 42 additions & 24 deletions quatradis/comparison/essentiality_analysis.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import csv
import os
from dataclasses import dataclass
from tempfile import mkstemp

from quatradis.comparison.essentiality import GeneEssentiality
from quatradis.util.file_handle_helpers import ensure_output_dir_exists


@dataclass
Expand All @@ -13,10 +13,11 @@ class EssentialityInput:
only_ess_control_files: list
only_ess_condition_files: list

def gene_names_from_essentiality_file(filename, verbose=False):
with open(filename, 'r') as fileh:
reader = csv.reader(fileh, delimiter=',', quotechar='"')
gene_names = [r[1] for r in reader if r[1] != 'gene_name']

def gene_names_from_essentiality_file(filename):
with open(filename, "r") as fileh:
reader = csv.reader(fileh, delimiter=",", quotechar='"')
gene_names = [r[1] for r in reader if r[1] != "gene_name"]
print("Number of all genes:" + str(len(gene_names)))

return gene_names
Expand All @@ -26,19 +27,20 @@ def get_all_gene_names(control_files, condition_files):
all_gene_names = set()

for filename in condition_files:
with open(filename, 'r') as fileh:
reader = csv.reader(fileh, delimiter='\t', quotechar='"')
gene_names1 = [r[1] for r in reader if len(r) > 1 and r[1] != 'gene_name']
with open(filename, "r") as fileh:
reader = csv.reader(fileh, delimiter="\t", quotechar='"')
gene_names1 = [r[1] for r in reader if len(r) > 1 and r[1] != "gene_name"]
all_gene_names = all_gene_names.union(set(gene_names1))

for f in control_files:
with open(filename, 'r') as fileh:
reader = csv.reader(fileh, delimiter='\t', quotechar='"')
gene_names2 = [r[1] for r in reader if len(r) > 1 and r[1] != 'gene_name']
for filename in control_files:
with open(filename, "r") as fileh:
reader = csv.reader(fileh, delimiter="\t", quotechar='"')
gene_names2 = [r[1] for r in reader if len(r) > 1 and r[1] != "gene_name"]
all_gene_names = all_gene_names.union(set(gene_names2))

return list(all_gene_names)


def all_gene_essentiality(input: EssentialityInput, analysis_type, verbose=False):

all_gene_names = get_all_gene_names(input.control_files, input.condition_files)
Expand All @@ -48,7 +50,7 @@ def all_gene_essentiality(input: EssentialityInput, analysis_type, verbose=False
genes_ess = {g: GeneEssentiality() for g in all_gene_names}
if analysis_type == "original":
for f in input.only_ess_condition_files:
ess_gene_names = gene_names_from_essentiality_file(f, verbose)
ess_gene_names = gene_names_from_essentiality_file(f)
if verbose:
print("ess_gene_names condition: " + str(len(ess_gene_names)))
print("genes_ess: " + str(len(genes_ess)))
Expand All @@ -57,7 +59,7 @@ def all_gene_essentiality(input: EssentialityInput, analysis_type, verbose=False
genes_ess[e].condition += 1
genes_ess[e].number_of_reps = len(input.only_ess_condition_files)
for f in input.only_ess_control_files:
ess_gene_names = gene_names_from_essentiality_file(f, verbose)
ess_gene_names = gene_names_from_essentiality_file(f)
if verbose:
print("ess_gene_names control: " + str(len(ess_gene_names)))
print("genes_ess: " + str(len(genes_ess)))
Expand All @@ -73,16 +75,19 @@ def all_gene_essentiality(input: EssentialityInput, analysis_type, verbose=False

return genes_ess

def add_gene_essentiality_to_file(input_filename, output_filename, genes_ess, analysis_type):

def add_gene_essentiality_to_file(
input_filename, output_filename, genes_ess, analysis_type
):
"""
We can add information on gene essentiality to the comparison output,
but this will not reflect full set of essential genes as the output does not contain all genes
In order to prevent confusion this is "switched" off, but could be used if uncommented
"""
with open(input_filename, 'r') as inputfh:
with open(input_filename, "r") as inputfh:
output_content = []

reader = csv.reader(inputfh, delimiter=',', quotechar='"')
reader = csv.reader(inputfh, delimiter=",", quotechar='"')
input_content = [r for r in reader]
# if analysis_type == "original":
# print("Number of cells: " + len(input_content))
Expand All @@ -99,21 +104,34 @@ def add_gene_essentiality_to_file(input_filename, output_filename, genes_ess, an

output_content = input_content

with open(output_filename, 'w') as outputfh:
with open(output_filename, "w") as outputfh:
for line in output_content:
outputfh.write(",".join(line) + "\n")


def essentiality_analysis(input: EssentialityInput, output_dir, analysis_type, output_filename=""):
def essentiality_analysis(
input: EssentialityInput, output_dir, analysis_type, output_filename=""
):

#out_csv = mkstemp()
# out_csv = mkstemp()
genes_ess = all_gene_essentiality(input, analysis_type)
#add_gene_essentiality_to_file(out_csv, output_filename, genes_ess, analysis_type)
#os.remove(out_csv)
# add_gene_essentiality_to_file(out_csv, output_filename, genes_ess, analysis_type)
# os.remove(out_csv)

ensure_output_dir_exists(output_dir)
ess = open(os.path.join(output_dir, "essentiality.csv"), "w+")
ess.write("Gene,Essentiality,Control,Condition,Replicates\n")
for e in genes_ess:
ess.write(",".join([e, str(genes_ess[e].status()), str(genes_ess[e].control),
str(genes_ess[e].condition), str(genes_ess[e].number_of_reps)]) + "\n")
ess.write(
",".join(
[
e,
str(genes_ess[e].status()),
str(genes_ess[e].control),
str(genes_ess[e].condition),
str(genes_ess[e].number_of_reps),
]
)
+ "\n"
)
ess.close()
2 changes: 1 addition & 1 deletion quatradis/comparison/scatterplot.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def __init__(self, conditions, controls, window_size, prefix, normalise=False, v
self.normalise = normalise

if normalise:
n = NormalisePlots(self.conditions + self.controls, 0.0000001, verbose=self.verbose)
n = NormalisePlots(self.conditions + self.controls, 0.0000001, output_dir="normalised", verbose=self.verbose)
plotfiles, max_reads = n.create_normalised_files()
self.conditions = plotfiles[0:len(self.conditions)]
self.controls = plotfiles[len(self.conditions):]
Expand Down
2 changes: 1 addition & 1 deletion quatradis/comparison/split.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def get_combined_file_path(self):
return self._construct_file_path("combined")

def _create_split_plot_file(self, forward, reverse, filename):
p = PlotFromValuesGenerator(forward, reverse, filename, self.gzipped)
p = PlotFromValuesGenerator(forward, reverse, filename)
p.construct_file()

def create_all_files(self):
Expand Down
2 changes: 1 addition & 1 deletion quatradis/comparison/tradis_comparison.R
Original file line number Diff line number Diff line change
Expand Up @@ -119,4 +119,4 @@ abline(v=2, col="red")


#write results
write.table(diff,file=opt$output,append=FALSE, quote=TRUE, sep=",", row.names=FALSE, col.names=c("locus_tag","gene_name","function","logFC","logCPM","PValue","q.value"))
write.table(diff,file=opt$output,append=FALSE, quote=FALSE, sep=",", row.names=FALSE, col.names=c("locus_tag","gene_name","function","logFC","logCPM","PValue","q.value"))
Loading

0 comments on commit 5e47b2e

Please sign in to comment.