Skip to content

Commit

Permalink
fix(essentiality): essentiality analysis report now contains data
Browse files Browse the repository at this point in the history
Previously we were using the .all.csv as input to the essentiality report file, however this expect the .tsv input from the insertion site analysis output.  This fix changes that.

Also we remove the .all.csv file from output generated by the R script as it doesn't serve any purpose.

Also updated the docs.

fixing stuff
  • Loading branch information
sbastkowski committed Feb 24, 2024
1 parent f843d9d commit 8c7ac03
Show file tree
Hide file tree
Showing 12 changed files with 164 additions and 71 deletions.
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,9 @@ conda activate quatradis
# Install mamba if you haven't already
conda install -c conda-forge mamba

# Install edgeR separately because this doesn't get imported correctly with quatradis.
mamba install -c conda-forge -c bioconda bioconductor-edger

# Install quatradis
mamba install -c conda-forge -c bioconda quatradis
```
Expand Down
Binary file modified docs/pipeline_combine.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
6 changes: 2 additions & 4 deletions docs/pipeline_combine.pu
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,8 @@ split
each plot file;
:essentiality analysis;
split again
:compare plot files
create figures;
split again
: logfc comparison;
:compare insertion sites
create logfc and pval for genes;
: create gene report;
end split

Expand Down
16 changes: 8 additions & 8 deletions quatradis/comparison/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from quatradis.comparison.scatterplot import scatterplot_run
from quatradis.comparison.presence_absence import presence_absence_run
from quatradis.comparison.comparison import prep_essentiality_pipeline_output_for_comparison, run_comparisons, \
generate_logfc_plot
insertion_site_comparison
from quatradis.comparison.gene_stats import gene_statistics
from quatradis.comparison.plot_logfc import PlotLogOptions
from quatradis.comparison.split import split_plot as sp
Expand All @@ -16,9 +16,9 @@ def add_subparser(subparsers):
compare_parser_desc = "Comparative analysis and visualisation of TraDIS experiments (albatradis)"
compare_parser = subparsers.add_parser("compare", help=compare_parser_desc)
compare_subparsers = compare_parser.add_subparsers(title=compare_parser_desc)
create_parser("logfc_plot", compare_subparsers, logfc_plot, add_logfc_options,
"Run logfc analysis for a specific plot file direction: forward, reverse, combined, original over all experiments.",
usage="tradis compare logfc_plot [options] --condition_dirs <condition_plotfiles>+ --control_dirs <control_plotfiles>+ <EMBL_file> <analysis_type>")
create_parser("insertion_sites", compare_subparsers, insertion_site_comparison_cli, add_insertion_site_comparison_options,
"Run an insertion site comparison for a specific plot file direction: forward, reverse, combined, original over all experiments to produce a file including edgeR output (logfc, p and q values)",
usage="tradis compare insertion_sites [options] --condition_dirs <condition_plotfiles>+ --control_dirs <control_plotfiles>+ <EMBL_file> <analysis_type>")
create_parser("presence_absence", compare_subparsers, presence_absence, add_pa_options,
"Take in gene report files and produce a heatmap",
usage="tradis compare presence_absence [options] <EMBLfile> <gene_reports>+")
Expand All @@ -42,7 +42,7 @@ def add_subparser(subparsers):
usage="tradis compare essentiality_analysis [options] --controls <control_gene_files> --conditions <condition_gene_files> --ess_controls <control_essential_gene_files> --ess_conditions <condition_essential_gene_files>")


def add_logfc_options(parser):
def add_insertion_site_comparison_options(parser):
parser.add_argument('emblfile', help='Annotation file in EMBL format', type=str)
parser.add_argument('analysis_type', help='Analysis type: forward, reverse, combined, original', type=str)
parser.add_argument('--controls',
Expand Down Expand Up @@ -71,7 +71,7 @@ def add_logfc_options(parser):
required=True)


def logfc_plot(args):
def insertion_site_comparison_cli(args):
if len(args.controls) != len(args.conditions):
raise "Must have equal number of control and condition files to process"

Expand All @@ -92,8 +92,8 @@ def logfc_plot(args):
report_decreased_insertions=not args.dont_report_decreased_insertions,
minimum_block=args.minimum_block)

plotfile, scoresfile = generate_logfc_plot(args.analysis_type, args.controls, args.conditions,
args.emblfile, args.prefix, options, args.verbose)
plotfile, scoresfile = insertion_site_comparison(args.analysis_type, args.controls, args.conditions,
args.emblfile, args.prefix, options, args.verbose)

return plotfile, scoresfile

Expand Down
30 changes: 16 additions & 14 deletions quatradis/comparison/comparison.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
import subprocess
from tempfile import mkstemp
import os
import csv
import shutil
import subprocess
from tempfile import mkstemp

from quatradis.comparison.plot_logfc import PlotLog
from quatradis.comparison.essentiality import GeneEssentiality
from quatradis.util.file_handle_helpers import ensure_output_dir_exists


Expand Down Expand Up @@ -45,32 +43,36 @@ def run_comparisons(controls_all, conditions_all, annotations, prefix, compariso
if verbose:
print("Running comparison between condition and control and write plot files and gene report.\n")

forward_plotfile, forward_scoresfile = generate_logfc_plot('forward', controls_all, conditions_all,
annotations, prefix, comparison_options, verbose)
forward_plotfile, forward_scoresfile = insertion_site_comparison('forward', controls_all, conditions_all, annotations,
prefix, comparison_options, verbose)
if verbose:
print("Forward insertions have been compared.\n")
reverse_plotfile, reverse_scoresfile = generate_logfc_plot('reverse', controls_all, conditions_all, annotations,
prefix, comparison_options, verbose)
reverse_plotfile, reverse_scoresfile = insertion_site_comparison('reverse', controls_all, conditions_all, annotations,
prefix, comparison_options, verbose)
if verbose:
print("Reverse insertions have been compared.\n")
combined_plotfile, combined_scoresfile = generate_logfc_plot('combined', controls_all, conditions_all, annotations,
prefix, comparison_options, verbose)
combined_plotfile, combined_scoresfile = insertion_site_comparison('combined', controls_all, conditions_all, annotations,
prefix, comparison_options, verbose)
if verbose:
print("Combined insertions have been compared.\n")
original_plotfile, original_scoresfile = generate_logfc_plot('original', controls_all, conditions_all, annotations,
prefix, comparison_options, verbose)
original_plotfile, original_scoresfile = insertion_site_comparison('original', controls_all, conditions_all, annotations,
prefix, comparison_options, verbose)
if verbose:
print("Original insertions have been compared.\n")

return forward_plotfile, reverse_plotfile, combined_plotfile, original_plotfile, \
forward_scoresfile, reverse_scoresfile, combined_scoresfile, original_scoresfile


def generate_logfc_plot(analysis_type, controls_all, conditions_all,
annotations, prefix, options, verbose):
def insertion_site_comparison(analysis_type, controls_all, conditions_all,
annotations, prefix, options, verbose):

# Create CSV report file containing LogFC and PValue information for genes based on their insertion sites
t = TradisComparisonRunner(conditions_all, controls_all, verbose, options.minimum_block,
analysis_type, prefix)
t.run()

# Create plot sytle files containing the logfc and p and q values at every position in the reference
p = PlotLog(t.get_report_file_path(), annotations, options,
output_plot_filename=os.path.join(prefix, analysis_type + ".logfc.plot"),
output_scores_filename=os.path.join(prefix, analysis_type + ".pqvals.plot"))
Expand Down
3 changes: 2 additions & 1 deletion quatradis/comparison/essentiality.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,9 @@ def run(self):
print(cmd)
subprocess.check_output(cmd, shell=True)

os.remove(self.count_file + ".all.csv")

if self.verbose:
print("all.csv\t" + self.count_file + ".all.csv")
print("essen.csv\t" + self.count_file + ".essen.csv")

return self
Expand Down
59 changes: 39 additions & 20 deletions quatradis/comparison/essentiality_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from tempfile import mkstemp

from quatradis.comparison.essentiality import GeneEssentiality
from quatradis.util.file_handle_helpers import ensure_output_dir_exists


@dataclass
Expand All @@ -13,10 +14,11 @@ class EssentialityInput:
only_ess_control_files: list
only_ess_condition_files: list

def gene_names_from_essentiality_file(filename, verbose=False):
with open(filename, 'r') as fileh:
reader = csv.reader(fileh, delimiter=',', quotechar='"')
gene_names = [r[1] for r in reader if r[1] != 'gene_name']

def gene_names_from_essentiality_file(filename):
with open(filename, "r") as fileh:
reader = csv.reader(fileh, delimiter=",", quotechar='"')
gene_names = [r[1] for r in reader if r[1] != "gene_name"]
print("Number of all genes:" + str(len(gene_names)))

return gene_names
Expand All @@ -26,19 +28,20 @@ def get_all_gene_names(control_files, condition_files):
all_gene_names = set()

for filename in condition_files:
with open(filename, 'r') as fileh:
reader = csv.reader(fileh, delimiter='\t', quotechar='"')
gene_names1 = [r[1] for r in reader if len(r) > 1 and r[1] != 'gene_name']
with open(filename, "r") as fileh:
reader = csv.reader(fileh, delimiter="\t", quotechar='"')
gene_names1 = [r[1] for r in reader if len(r) > 1 and r[1] != "gene_name"]
all_gene_names = all_gene_names.union(set(gene_names1))

for f in control_files:
with open(filename, 'r') as fileh:
reader = csv.reader(fileh, delimiter='\t', quotechar='"')
gene_names2 = [r[1] for r in reader if len(r) > 1 and r[1] != 'gene_name']
with open(filename, "r") as fileh:
reader = csv.reader(fileh, delimiter="\t", quotechar='"')
gene_names2 = [r[1] for r in reader if len(r) > 1 and r[1] != "gene_name"]
all_gene_names = all_gene_names.union(set(gene_names2))

return list(all_gene_names)


def all_gene_essentiality(input: EssentialityInput, analysis_type, verbose=False):

all_gene_names = get_all_gene_names(input.control_files, input.condition_files)
Expand Down Expand Up @@ -73,16 +76,19 @@ def all_gene_essentiality(input: EssentialityInput, analysis_type, verbose=False

return genes_ess

def add_gene_essentiality_to_file(input_filename, output_filename, genes_ess, analysis_type):

def add_gene_essentiality_to_file(
input_filename, output_filename, genes_ess, analysis_type
):
"""
We can add information on gene essentiality to the comparison output,
but this will not reflect full set of essential genes as the output does not contain all genes
In order to prevent confusion this is "switched" off, but could be used if uncommented
"""
with open(input_filename, 'r') as inputfh:
with open(input_filename, "r") as inputfh:
output_content = []

reader = csv.reader(inputfh, delimiter=',', quotechar='"')
reader = csv.reader(inputfh, delimiter=",", quotechar='"')
input_content = [r for r in reader]
# if analysis_type == "original":
# print("Number of cells: " + len(input_content))
Expand All @@ -99,21 +105,34 @@ def add_gene_essentiality_to_file(input_filename, output_filename, genes_ess, an

output_content = input_content

with open(output_filename, 'w') as outputfh:
with open(output_filename, "w") as outputfh:
for line in output_content:
outputfh.write(",".join(line) + "\n")


def essentiality_analysis(input: EssentialityInput, output_dir, analysis_type, output_filename=""):
def essentiality_analysis(
input: EssentialityInput, output_dir, analysis_type, output_filename=""
):

#out_csv = mkstemp()
# out_csv = mkstemp()
genes_ess = all_gene_essentiality(input, analysis_type)
#add_gene_essentiality_to_file(out_csv, output_filename, genes_ess, analysis_type)
#os.remove(out_csv)
# add_gene_essentiality_to_file(out_csv, output_filename, genes_ess, analysis_type)
# os.remove(out_csv)

ensure_output_dir_exists(output_dir)
ess = open(os.path.join(output_dir, "essentiality.csv"), "w+")
ess.write("Gene,Essentiality,Control,Condition,Replicates\n")
for e in genes_ess:
ess.write(",".join([e, str(genes_ess[e].status()), str(genes_ess[e].control),
str(genes_ess[e].condition), str(genes_ess[e].number_of_reps)]) + "\n")
ess.write(
",".join(
[
e,
str(genes_ess[e].status()),
str(genes_ess[e].control),
str(genes_ess[e].condition),
str(genes_ess[e].number_of_reps),
]
)
+ "\n"
)
ess.close()
11 changes: 5 additions & 6 deletions quatradis/pipelines/compare.smk
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ rule finish:
expand(os.path.join(config["output_dir"], "comparison", "{type}", "plot_absscatter.png"), type=["original", "forward", "reverse", "combined"]),
#expand(os.path.join(config["output_dir"],"comparison","{type}","{type}.compare.csv"), type=["original", "forward", "reverse", "combined"]),
expand(os.path.join(config["output_dir"],"comparison","{type}","essentiality.csv"), type=["original", "forward", "reverse", "combined"]),
expand(os.path.join(config["output_dir"],"analysis","{plot}","{type}.count.tsv.all.csv"), type=["original", "combined", "forward", "reverse"], plot=plotnames)
expand(os.path.join(config["output_dir"],"analysis","{plot}","{type}.count.tsv.essen.csv"), type=["original", "combined", "forward", "reverse"], plot=plotnames)
run:
print("All done!")

Expand Down Expand Up @@ -106,7 +106,6 @@ rule essentiality:
input:
c=os.path.join(config["output_dir"], "analysis", "{plot}", "{type}.count.tsv"),
output:
all=os.path.join(config["output_dir"], "analysis", "{plot}", "{type}.count.tsv.all.csv"),
ess=os.path.join(config["output_dir"], "analysis", "{plot}", "{type}.count.tsv.essen.csv")
log: os.path.join(config["output_dir"], "analysis", "{plot}", "{type}.count.tsv.essen.log")
message: "Determining gene essentiality for {input}"
Expand All @@ -128,7 +127,7 @@ rule plot:
shell: "tradis compare figures {params.window_size} {params.prefix} {params.controls} {params.conditions} > /dev/null 2>&1"


rule logfc:
rule insertion_site_comparison:
input:
controls=lambda wildcards: expand(os.path.join(config["output_dir"], "analysis", "{control}", wildcards.tt + ".count.tsv"), control=controlnames),
conditions=lambda wildcards: expand(os.path.join(config["output_dir"], "analysis", "{condition}", wildcards.tt + ".count.tsv"), condition=conditionnames),
Expand All @@ -143,7 +142,7 @@ rule logfc:
output_dir="--prefix=" + os.path.join(config["output_dir"], "comparison", "{tt}"),
zcat=ZCAT_CMD
message: "Calculating logfc for {output}"
shell: "SEQLENGTH=$({params.zcat} {params.combined} | wc -l); tradis compare logfc_plot {input.embl} {params.tt} {params.output_dir} -g ${{SEQLENGTH}} --verbose --controls {input.controls} --conditions {input.conditions} > {log} 2>&1"
shell: "SEQLENGTH=$({params.zcat} {params.combined} | wc -l); tradis compare insertion_sites {input.embl} {params.tt} {params.output_dir} -g ${{SEQLENGTH}} --verbose --controls {input.controls} --conditions {input.conditions} > {log} 2>&1"


rule gene_stats:
Expand All @@ -165,8 +164,8 @@ rule gene_stats:

rule essentiality_analysis:
input:
controls = lambda wildcards: expand(os.path.join(config["output_dir"],"analysis","{control}",wildcards.type + ".count.tsv.all.csv"),control=controlnames),
conditions = lambda wildcards: expand(os.path.join(config["output_dir"],"analysis","{condition}",wildcards.type + ".count.tsv.all.csv"),condition=conditionnames),
controls = lambda wildcards: expand(os.path.join(config["output_dir"],"analysis","{control}",wildcards.type + ".count.tsv"),control=controlnames),
conditions = lambda wildcards: expand(os.path.join(config["output_dir"],"analysis","{condition}",wildcards.type + ".count.tsv"),condition=conditionnames),
ess_controls= lambda wildcards: expand(os.path.join(config["output_dir"],"analysis","{control}",wildcards.type + ".count.tsv.essen.csv"),control=controlnames),
ess_conditions=lambda wildcards: expand(os.path.join(config["output_dir"],"analysis","{condition}",wildcards.type + ".count.tsv.essen.csv"),condition=conditionnames)
output:
Expand Down
2 changes: 1 addition & 1 deletion quatradis/tisp/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ def get_plot_file_length(plot_file, working_dir):

def read(self):
handle = PlotParser.create_file_handle(self.filename)
insert_site_array = pandas.read_csv(handle, delim_whitespace=True, dtype=float, engine='c',
insert_site_array = pandas.read_csv(handle, sep='\s+', dtype=float, engine='c',
header=None).values

self.genome_length = len(insert_site_array)
Expand Down
Loading

0 comments on commit 8c7ac03

Please sign in to comment.