From b112d9c12216e2c15c38af9ac47612b2dc05d7a5 Mon Sep 17 00:00:00 2001 From: Max Date: Mon, 28 Oct 2024 14:36:58 +0100 Subject: [PATCH] feat!: igvf outputs (#129) * refactor: removed statistics from final barcode to oligo map * refactor outputs * fix scripts due to renaming headers * fix assignment statistic due to new output * refactor!: moving files. not attched counts are not used as well as median for scaling * adding logs --------- Co-authored-by: Max Schubach --- .gitignore | 5 +- workflow/Snakefile | 18 ++ workflow/rules/assigned_counts.smk | 66 +++++++- workflow/rules/assignment.smk | 2 +- workflow/rules/assignment/statistic.smk | 2 +- workflow/rules/common.smk | 28 ++++ workflow/rules/statistic/assigned_counts.smk | 4 +- workflow/rules/statistic/bc_overlap.smk | 12 +- workflow/rules/statistic/correlation.smk | 12 +- .../scripts/assignment/statistic_assignment.R | 82 ++++----- workflow/scripts/count/combine_replicates.py | 22 ++- workflow/scripts/count/make_master_tables.R | 156 +++++++++++------- workflow/scripts/count/merge_label.py | 68 +++++--- .../count/merge_replicates_barcode_counts.py | 131 ++++++++------- .../count/plot_perInsertCounts_correlation.R | 14 +- .../count/plot_perInsertCounts_stats.R | 19 +-- .../variants/correlateVariantTables.py | 2 +- .../variants/generateMasterVariantTable.py | 28 ++-- .../scripts/variants/generateVariantTable.py | 8 +- 19 files changed, 426 insertions(+), 253 deletions(-) diff --git a/.gitignore b/.gitignore index 38d5e812..9267b83a 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,9 @@ logs !config/* !resources !resources/** +resources/**/.local +resources/**/.cache +resources/**/.ipython !workflow !workflow/** !.gitattributes @@ -27,4 +30,4 @@ mix_data *report.html *.simg *results -.DS_Store \ No newline at end of file +.DS_Store diff --git a/workflow/Snakefile b/workflow/Snakefile index c9eef965..6abde68b 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -134,6 +134,15 @@ rule all: "results/experiments/{project}/assigned_counts/{assignment}/{config}/{condition}_allreps_merged.combined.tsv.gz", "results/experiments/{project}/assigned_counts/{assignment}/{config}/{condition}_allreps_minThreshold_merged.combined.tsv.gz", "results/experiments/{project}/assigned_counts/{assignment}/{config}/{condition}_allreps_merged_barcode_assigned_counts.tsv.gz", + "results/experiments/{project}/assigned_counts/{assignment}/{config}/{condition}_allreps_minThreshold_merged_barcode_assigned_counts.tsv.gz", + "results/experiments/{project}/reporter_experiment.oligo.{condition}.{assignment}.{config}.all.tsv.gz", + "results/experiments/{project}/reporter_experiment.barcode.{condition}.{assignment}.{config}.all.tsv.gz", + ] + ), + getOutputProjectConditionAssignmentConfigThreshold_helper( + [ + "results/experiments/{project}/reporter_experiment.barcode.{condition}.{assignment}.{config}.min_oligo_threshold_{threshold}.tsv.gz", + "results/experiments/{project}/reporter_experiment.oligo.{condition}.{assignment}.{config}.min_oligo_threshold_{threshold}.tsv.gz", ] ), # assignment statistic @@ -263,6 +272,15 @@ rule all_experiments: "results/experiments/{project}/assigned_counts/{assignment}/{config}/{condition}_allreps_merged.combined.tsv.gz", "results/experiments/{project}/assigned_counts/{assignment}/{config}/{condition}_allreps_minThreshold_merged.combined.tsv.gz", "results/experiments/{project}/assigned_counts/{assignment}/{config}/{condition}_allreps_merged_barcode_assigned_counts.tsv.gz", + "results/experiments/{project}/assigned_counts/{assignment}/{config}/{condition}_allreps_minThreshold_merged_barcode_assigned_counts.tsv.gz", + "results/experiments/{project}/reporter_experiment.oligo.{condition}.{assignment}.{config}.all.tsv.gz", + "results/experiments/{project}/reporter_experiment.barcode.{condition}.{assignment}.{config}.all.tsv.gz", + ] + ), + getOutputProjectConditionAssignmentConfigThreshold_helper( + [ + "results/experiments/{project}/reporter_experiment.barcode.{condition}.{assignment}.{config}.min_oligo_threshold_{threshold}.tsv.gz", + "results/experiments/{project}/reporter_experiment.oligo.{condition}.{assignment}.{config}.min_oligo_threshold_{threshold}.tsv.gz", ] ), # assignment statistic diff --git a/workflow/rules/assigned_counts.smk b/workflow/rules/assigned_counts.smk index ef0f1197..cb2c0699 100644 --- a/workflow/rules/assigned_counts.smk +++ b/workflow/rules/assigned_counts.smk @@ -67,7 +67,9 @@ rule assigned_counts_assignBarcodes: script=getScript("count/merge_BC_and_assignment.py"), output: counts="results/experiments/{project}/assigned_counts/{assignment}/{condition}_{replicate}_{type}_final_counts.config.{config}.tsv.gz", - stats="results/experiments/{project}/statistic/assigned_counts/{assignment}/{condition}_{replicate}_{type}_{config}.statistic.tsv.gz", + statistic=temp( + "results/experiments/{project}/statistic/assigned_counts/{assignment}/{condition}_{replicate}_{type}_{config}.statistic.tsv.gz" + ), params: name="{condition}_{replicate}_{type}", log: @@ -79,7 +81,7 @@ rule assigned_counts_assignBarcodes: python {input.script} --counts {input.counts} \ --assignment {input.association} \ --output {output.counts} \ - --statistic {output.stats} \ + --statistic {output.statistic} \ --name {params.name} &> {log} """ @@ -97,7 +99,9 @@ rule assigned_counts_dna_rna_merge: output: counts="results/experiments/{project}/assigned_counts/{assignment}/{config}/{condition}_{replicate}_merged_assigned_counts.tsv.gz", bc_counts="results/experiments/{project}/assigned_counts/{assignment}/{config}/{condition}_{replicate}_barcode_assigned_counts.tsv.gz", - stats="results/experiments/{project}/statistic/assigned_counts/{assignment}/{config}/{condition}_{replicate}_merged_assigned_counts.statistic.tsv.gz", + statistic=temp( + "results/experiments/{project}/statistic/assigned_counts/{assignment}/{config}/{condition}_{replicate}_merged_assigned_counts.statistic.tsv.gz" + ), params: minRNACounts=lambda wc: config["experiments"][wc.project]["configs"][ wc.config @@ -116,7 +120,7 @@ rule assigned_counts_dna_rna_merge: --assignment {input.association} \ --output {output.counts} \ --bcOutput {output.bc_counts} \ - --statistic {output.stats} &> {log} + --statistic {output.statistic} &> {log} """ @@ -137,7 +141,6 @@ rule assigned_counts_make_master_tables: all="results/experiments/{project}/assigned_counts/{assignment}/{config}/{condition}_allreps_merged.tsv.gz", thresh="results/experiments/{project}/assigned_counts/{assignment}/{config}/{condition}_allreps_minThreshold_merged.tsv.gz", params: - cond="{condition}", files=lambda wc: ",".join( expand( "results/experiments/{project}/assigned_counts/{assignment}/{config}/{condition}_{replicate}_merged_assigned_counts.tsv.gz", @@ -161,7 +164,6 @@ rule assigned_counts_make_master_tables: shell: """ Rscript {input.script} \ - --condition {params.cond} \ --threshold {params.thresh} \ --files {params.files} \ --replicates {params.replicates} \ @@ -185,7 +187,8 @@ rule assigned_counts_combine_replicates_barcode_output: ), script=getScript("count/merge_replicates_barcode_counts.py"), output: - bc_merged="results/experiments/{project}/assigned_counts/{assignment}/{config}/{condition}_allreps_merged_barcode_assigned_counts.tsv.gz", + bc_merged_thresh="results/experiments/{project}/assigned_counts/{assignment}/{config}/{condition}_allreps_minThreshold_merged_barcode_assigned_counts.tsv.gz", + bc_merged_all="results/experiments/{project}/assigned_counts/{assignment}/{config}/{condition}_allreps_merged_barcode_assigned_counts.tsv.gz", params: thresh=lambda wc: config["experiments"][wc.project]["configs"][wc.config][ "filter" @@ -218,7 +221,8 @@ rule assigned_counts_combine_replicates_barcode_output: python {input.script} {params.bc_counts} \ --threshold {params.thresh} \ {params.replicates} \ - --output {output.bc_merged} &> {log} + --output-threshold {output.bc_merged_thresh} \ + --output {output.bc_merged_all} &> {log} """ @@ -250,3 +254,49 @@ rule assigned_counts_combine_replicates: {params.label_file} \ --output {output} &> {log} """ + + +rule assigned_counts_copy_final_all_files: + """ + Will copy final files to the main folder so that it is creal which files to use. + """ + conda: + "../envs/default.yaml" + input: + all=lambda wc: "results/experiments/{project}/assigned_counts/{assignment}/{config}/{condition}_allreps_merged.tsv.gz", + bc_all=lambda wc: "results/experiments/{project}/assigned_counts/{assignment}/{config}/{condition}_allreps_merged_barcode_assigned_counts.tsv.gz", + output: + all="results/experiments/{project}/reporter_experiment.oligo.{condition}.{assignment}.{config}.all.tsv.gz", + bc_all="results/experiments/{project}/reporter_experiment.barcode.{condition}.{assignment}.{config}.all.tsv.gz", + log: + temp( + "results/logs/assigned_counts/copy_final_all_files.{project}.{condition}.{assignment}.{config}.log" + ), + shell: + """ + cp {input.all} {output.all} &> {log} + cp {input.bc_all} {output.bc_all} &>> {log} + """ + + +rule assigned_counts_copy_final_thresh_files: + """ + Will copy final files to the main folder so that it is creal which files to use. + """ + conda: + "../envs/default.yaml" + input: + thresh=lambda wc: "results/experiments/{project}/assigned_counts/{assignment}/{config}/{condition}_allreps_minThreshold_merged.tsv.gz", + bc_thresh=lambda wc: "results/experiments/{project}/assigned_counts/{assignment}/{config}/{condition}_allreps_minThreshold_merged_barcode_assigned_counts.tsv.gz", + output: + thresh="results/experiments/{project}/reporter_experiment.oligo.{condition}.{assignment}.{config}.min_oligo_threshold_{threshold}.tsv.gz", + bc_thresh="results/experiments/{project}/reporter_experiment.barcode.{condition}.{assignment}.{config}.min_oligo_threshold_{threshold}.tsv.gz", + log: + temp( + "results/logs/assigned_counts/copy_final_thresh_files.{project}.{condition}.{assignment}.{config}.{threshold}.log" + ), + shell: + """ + cp {input.thresh} {output.thresh} &> {log} + cp {input.bc_thresh} {output.bc_thresh} &>> {log} + """ diff --git a/workflow/rules/assignment.smk b/workflow/rules/assignment.smk index f3d7a572..620e2f20 100644 --- a/workflow/rules/assignment.smk +++ b/workflow/rules/assignment.smk @@ -248,7 +248,7 @@ rule assignment_filter: python {input.script} \ -m {params.min_support} -f {params.fraction} {params.unknown_other} {params.ambiguous} | \ tee >(gzip -c > {output.ambigous}) | \ - awk -v "OFS=\\t" -F"\\t" '{{ if (($2 != \"ambiguous\") && ($2 != \"other\")) {{ print $0 }} }}' | \ + awk -v "OFS=\\t" -F"\\t" '{{ if (($2 != \"ambiguous\") && ($2 != \"other\")) {{ print $1,$2 }} }}' | \ gzip -c > {output.final} 2> {log.err}; gzip -l {output.final} | awk 'NR==2 {{exit($2==0)}}' || {{ echo "Error: Empty barcode file {output.final}. No barcodes detected!" >> {log.err}; exit 1; }} """ diff --git a/workflow/rules/assignment/statistic.smk b/workflow/rules/assignment/statistic.smk index 35b6c64f..3af3cfad 100644 --- a/workflow/rules/assignment/statistic.smk +++ b/workflow/rules/assignment/statistic.smk @@ -48,7 +48,7 @@ rule assignment_statistic_assignment: conda: "../../envs/r.yaml" input: - bc="results/assignment/{assignment}/assignment_barcodes.{assignment_config}.tsv.gz", + bc="results/assignment/{assignment}/assignment_barcodes_with_ambigous.{assignment_config}.tsv.gz", script=getScript("assignment/statistic_assignment.R"), output: stats="results/assignment/{assignment}/statistic/assignment.{assignment_config}.tsv.gz", diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index c0615c9b..63be7717 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -374,6 +374,34 @@ def getOutputProjectConditionAssignmentConfig_helper(files): return output +def getOutputProjectConditionAssignmentConfigThreshold_helper(files): + """ + Inserts {project}, {condition}, {assignment} {config} (from configs of project) and Threshold from config into given file. + """ + output = [] + projects = getProjects() + for project in projects: + try: + conditions = getConditions(project) + for condition in conditions: + for conf in getConfigs(project): + threshold = config["experiments"][project]["configs"][conf][ + "filter" + ]["bc_threshold"] + for file in files: + output += expand( + file, + project=project, + condition=condition, + assignment=getProjectAssignments(project), + config=conf, + threshold=threshold, + ) + except MissingAssignmentInConfigException: + continue + return output + + def getOutputProjectAssignmentConfig_helper(files, betweenReplicates=False): """ Inserts {project}, {assignment} and {config} (from configs of project) from config into given file. diff --git a/workflow/rules/statistic/assigned_counts.smk b/workflow/rules/statistic/assigned_counts.smk index 9a32af44..15ae1ad7 100644 --- a/workflow/rules/statistic/assigned_counts.smk +++ b/workflow/rules/statistic/assigned_counts.smk @@ -86,7 +86,9 @@ rule statistic_assigned_counts_combine_stats_dna_rna_merge: ), script=getScript("count/merge_statistic_tables.py"), output: - "results/experiments/{project}/statistic/assigned_counts/{assignment}/{config}/combined/{condition}_merged_assigned_counts.statistic.tsv.gz", + temp( + "results/experiments/{project}/statistic/assigned_counts/{assignment}/{config}/combined/{condition}_merged_assigned_counts.statistic.tsv.gz" + ), params: cond="{condition}", statistic=lambda wc: " ".join( diff --git a/workflow/rules/statistic/bc_overlap.smk b/workflow/rules/statistic/bc_overlap.smk index 77e01cb8..7ac65bd1 100644 --- a/workflow/rules/statistic/bc_overlap.smk +++ b/workflow/rules/statistic/bc_overlap.smk @@ -50,7 +50,7 @@ rule statistic_bc_overlap_combine_counts: conda: "../../envs/default.yaml" input: - stats=lambda wc: expand( + statistic=lambda wc: expand( "results/experiments/{{project}}/statistic/bc_overlap/counts/overlapBCandCounts.{condition}_{type}.{config}.tsv", type=["DNA", "RNA"], condition=getConditions(wc.project), @@ -74,8 +74,8 @@ rule statistic_bc_overlap_combine_counts: """ set +o pipefail; ( - cat {input.stats[0]} | head -n 1; - for i in {input.stats}; do + cat {input.statistic[0]} | head -n 1; + for i in {input.statistic}; do cat $i | tail -n +2 done; ) > {output} 2> {log} @@ -86,7 +86,7 @@ rule statistic_bc_overlap_combine_assigned_counts: conda: "../../envs/default.yaml" input: - stats=lambda wc: expand( + statistic=lambda wc: expand( "results/experiments/{{project}}/statistic/bc_overlap/assigned_counts/{{assignment}}/overlapBCandCounts.{condition}_{type}.{{config}}.tsv", type=["DNA", "RNA"], condition=getConditions(wc.project), @@ -111,8 +111,8 @@ rule statistic_bc_overlap_combine_assigned_counts: """ set +o pipefail; ( - cat {input.stats[0]} | head -n 1; - for i in {input.stats}; do + cat {input.statistic[0]} | head -n 1; + for i in {input.statistic}; do cat $i | tail -n +2 done; ) > {output} 2> {log} diff --git a/workflow/rules/statistic/correlation.smk b/workflow/rules/statistic/correlation.smk index 4175ec4a..675c8b6a 100644 --- a/workflow/rules/statistic/correlation.smk +++ b/workflow/rules/statistic/correlation.smk @@ -45,7 +45,9 @@ rule statistic_correlation_bc_counts: "Plot": "Ratio", }, ), - "results/experiments/{project}/statistic/barcode/{raw_or_assigned}/{condition}_{config}_barcode_correlation.tsv", + temp( + "results/experiments/{project}/statistic/barcode/{raw_or_assigned}/{condition}_{config}_barcode_correlation.tsv" + ), params: replicates=lambda wc: ",".join( getMergedCounts(wc.project, wc.raw_or_assigned, wc.condition, wc.config)[1] @@ -287,8 +289,12 @@ rule statistic_correlation_calculate: ), }, ), - "results/experiments/{project}/statistic/assigned_counts/{assignment}/{config}/{condition}_correlation.tsv", - "results/experiments/{project}/statistic/assigned_counts/{assignment}/{config}/{condition}_correlation_minThreshold.tsv", + temp( + "results/experiments/{project}/statistic/assigned_counts/{assignment}/{config}/{condition}_correlation.tsv" + ), + temp( + "results/experiments/{project}/statistic/assigned_counts/{assignment}/{config}/{condition}_correlation_minThreshold.tsv" + ), params: cond="{condition}", files=lambda wc: ",".join( diff --git a/workflow/scripts/assignment/statistic_assignment.R b/workflow/scripts/assignment/statistic_assignment.R index 9beb9092..e699d586 100644 --- a/workflow/scripts/assignment/statistic_assignment.R +++ b/workflow/scripts/assignment/statistic_assignment.R @@ -3,38 +3,38 @@ library(optparse) option_list <- list( - make_option(c("-i", "--input"), - type = "character", - help = "Assignment parcode file" - ), - make_option(c("-p", "--plot"), - type = "character", - help = "histogram plot" - ), - make_option(c("-s", "--statistic"), - type = "character", - help = "Statistics" - ) + make_option(c("-i", "--input"), + type = "character", + help = "Assignment parcode file" + ), + make_option(c("-p", "--plot"), + type = "character", + help = "histogram plot" + ), + make_option(c("-s", "--statistic"), + type = "character", + help = "Statistics" + ) ) arguments <- parse_args(OptionParser(option_list = option_list), positional_arguments = TRUE) opt <- arguments$options if (!"input" %in% names(opt)) { - stop("--input parameter must be provided. See script usage (--help)") + stop("--input parameter must be provided. See script usage (--help)") } if (!"plot" %in% names(opt)) { - stop("--plot parameter must be provided. See script usage (--help)") + stop("--plot parameter must be provided. See script usage (--help)") } if (!"statistic" %in% names(opt)) { - stop("--statistic parameter must be provided. See script usage (--help)") + stop("--statistic parameter must be provided. See script usage (--help)") } bcs <- read_delim(opt$input, delim = "\t", escape_double = FALSE, col_names = FALSE, trim_ws = TRUE) bcs <- bcs %>% - filter(!(X2 %in% c("other", "ambiguous"))) %>% - separate(X4, c("support", "all"), sep = "/", convert = TRUE) + filter(!(X2 %in% c("other", "ambiguous"))) %>% + separate(X4, c("support", "all"), sep = "/", convert = TRUE) @@ -42,13 +42,13 @@ avg_support <- mean(bcs$support / bcs$all) median_support <- median(bcs$support / bcs$all) bcs <- bcs %>% - group_by(X2) %>% - count() %>% - ungroup() + group_by(X2) %>% + count() %>% + ungroup() oligos_support <- bcs %>% - filter(n >= 15) %>% - nrow() + filter(n >= 15) %>% + nrow() output <- data.frame(Value = c(avg_support, median_support, oligos_support)) @@ -57,23 +57,25 @@ rownames(output) <- c("Average support of BC for Oligo:", "Median support of BC write.table(output, file = opt$statistic, quote = FALSE, sep = "\t", row.names = TRUE, col.names = FALSE) p <- ggplot(bcs, aes(n)) + - geom_histogram(bins = 150) + - xlab("Number of BCs") + - ylab("Number of Oligos") + - geom_vline(aes(xintercept = mean(bcs$n)), col = "red", linewidth = 2) + - geom_text( - aes( - label = sprintf("\nmean = %.2f", mean(bcs$n)), - x = Inf, y = Inf - ), hjust = 1.1, vjust = 1.3, colour = "red" - ) + - geom_vline(aes(xintercept = median(bcs$n)), col = "blue", linewidth = 2) + - geom_text( - aes( - label = sprintf("median = %.2f\n", median(bcs$n)), - x = Inf, y = Inf - ), hjust = 1.1, vjust = 1.3, colour = "blue" - ) + - theme_bw() + geom_histogram(bins = 150) + + xlab("Number of BCs") + + ylab("Number of Oligos") + + geom_vline(aes(xintercept = mean(bcs$n)), col = "red", linewidth = 2) + + geom_text( + aes( + label = sprintf("\nmean = %.2f", mean(bcs$n)), + x = Inf, y = Inf + ), + hjust = 1.1, vjust = 1.3, colour = "red" + ) + + geom_vline(aes(xintercept = median(bcs$n)), col = "blue", linewidth = 2) + + geom_text( + aes( + label = sprintf("median = %.2f\n", median(bcs$n)), + x = Inf, y = Inf + ), + hjust = 1.1, vjust = 1.3, colour = "blue" + ) + + theme_bw() ggsave(opt$plot, p, type = "cairo", dpi = 300) diff --git a/workflow/scripts/count/combine_replicates.py b/workflow/scripts/count/combine_replicates.py index 7e3e69d5..d5af1c44 100755 --- a/workflow/scripts/count/combine_replicates.py +++ b/workflow/scripts/count/combine_replicates.py @@ -42,22 +42,21 @@ def cli(input_file, label_file, output_file): df_out.to_csv(output_file, sep="\t", index=False) def combine_replicates(label_file, df_allreps, total_dna_counts, total_rna_counts): - df_allreps = df_allreps.groupby(by=["condition", "name"]).aggregate( + df_allreps = df_allreps.groupby(by=["oligo_name"]).aggregate( { "replicate": "count", "dna_counts": ["sum", "mean"], "rna_counts": ["sum", "mean"], "dna_normalized": "mean", "rna_normalized": "mean", - "ratio": "mean", - "log2": "mean", - "n_obs_bc": ["sum", "mean"], + "log2FoldChange": "mean", + "n_bc": ["sum", "mean"], } ) df_allreps = df_allreps.reset_index() - df_out = df_allreps.iloc[:, 0:2] - df_out.columns = ["condition", "name"] + df_out = df_allreps.iloc[:, 0:1] + df_out.columns = ["oligo_name"] df_out["replicates"] = df_allreps.replicate["count"] @@ -70,20 +69,19 @@ def combine_replicates(label_file, df_allreps, total_dna_counts, total_rna_count df_out["dna_normalized"] = df_out["dna_counts"] / total_dna_counts * scaling df_out["rna_normalized"] = df_out["rna_counts"] / total_rna_counts * scaling - df_out["ratio"] = df_out["rna_normalized"] / df_out["dna_normalized"] - df_out["log2"] = np.log2(df_out.ratio) + df_out["log2FoldChange"] = np.log2(df_out["rna_normalized"] / df_out["dna_normalized"]) df_out["mean_dna_counts"] = df_allreps.dna_counts["mean"] df_out["mean_rna_counts"] = df_allreps.rna_counts["mean"] df_out["mean_dna_normalized"] = df_allreps.dna_normalized["mean"] df_out["mean_rna_normalized"] = df_allreps.rna_normalized["mean"] - df_out["mean_ratio"] = df_allreps.ratio["mean"] - df_out["mean_log2"] = df_allreps.log2["mean"] + + df_out["mean_log2FoldChange"] = df_allreps.log2FoldChange["mean"] if label_file: - df_labels = pd.read_csv(label_file, sep="\t", names=["name", "Label"]) - df_out = df_out.join(df_labels.set_index("name"), on="name") + df_labels = pd.read_csv(label_file, sep="\t", names=["oligo_name", "Label"]) + df_out = df_out.join(df_labels.set_index("oligo_name"), on="oligo_name") return df_out diff --git a/workflow/scripts/count/make_master_tables.R b/workflow/scripts/count/make_master_tables.R index 569337f9..03195627 100644 --- a/workflow/scripts/count/make_master_tables.R +++ b/workflow/scripts/count/make_master_tables.R @@ -1,4 +1,4 @@ -#Adapted from Vikram Agarwal by Gracie Gordon +# Adapted from Vikram Agarwal by Gracie Gordon and Max Schubach library(dplyr) @@ -6,30 +6,39 @@ library(optparse) option_list <- list( - make_option(c("-c", "--condition"), type="character", - help="Condition name"), - make_option(c("-l", "--label"), type="character", - help="Label file. (optional)"), - make_option(c("-f", "--files"), type="character", - help="Comma separated input files of assigned counts"), - make_option(c("-r", "--replicates"), type="character", - help="Comma separated name of the replicates (same order than files)"), - make_option(c("-a", "--output-all"), type="character", - help="Output file of master table. No BC threshold filter (optional)."), - make_option(c("-o", "--output"), type="character", - help="Output file of master table filtered by --threshold"), - make_option(c("-s", "--statistic"), type="character", - help="Statistic of master table and filtered master table"), - make_option(c("-t", "--threshold"), type="integer", default=10, - help="Number of required barcodes (default 10)") + make_option(c("-l", "--label"), + type = "character", + help = "Label file. (optional)" + ), + make_option(c("-f", "--files"), + type = "character", + help = "Comma separated input files of assigned counts" + ), + make_option(c("-r", "--replicates"), + type = "character", + help = "Comma separated name of the replicates (same order than files)" + ), + make_option(c("-a", "--output-all"), + type = "character", + help = "Output file of master table. No BC threshold filter (optional)." + ), + make_option(c("-o", "--output"), + type = "character", + help = "Output file of master table filtered by --threshold" + ), + make_option(c("-s", "--statistic"), + type = "character", + help = "Statistic of master table and filtered master table" + ), + make_option(c("-t", "--threshold"), + type = "integer", default = 10, + help = "Number of required barcodes (default 10)" + ) ) -arguments <- parse_args(OptionParser(option_list=option_list), positional_arguments=TRUE) +arguments <- parse_args(OptionParser(option_list = option_list), positional_arguments = TRUE) opt <- arguments$options -if (!"condition" %in% names(opt)) { - stop("--condition parameter must be provided. See script usage (--help)") -} if (!"files" %in% names(opt)) { stop("--files parameter must be provided. See script usage (--help)") } @@ -44,69 +53,88 @@ if (!"statistic" %in% names(opt)) { } - -#exp=args[1] -cond=opt$cond -thresh=opt$threshold -files <- strsplit(opt$files,",")[[1]] -replicates=strsplit(opt$replicates,",")[[1]] +# exp=args[1] +cond <- opt$cond +thresh <- opt$threshold +files <- strsplit(opt$files, ",")[[1]] +replicates <- strsplit(opt$replicates, ",")[[1]] if (length(files) != length(replicates)) { - stop("Number of input files must be euqal to number of replicates") + stop("Number of input files must be euqal to number of replicates") } -outfile=opt$output -avg_outfile=opt$statistic -#out=args[3] +outfile <- opt$output +avg_outfile <- opt$statistic -##MAKE MASTER TABLE -masterTable = data.frame() -for (i in 1:length(files)){ - file=files[i] - rep=replicates[i] +precision <- 4 - table = as.data.frame(read.table(file,header=TRUE),stringsAsFactors=FALSE) - table$condition = cond - table$replicate = rep - - masterTable <- masterTable %>% bind_rows(table) -} +## MAKE MASTER TABLE +master_table <- data.frame() +for (i in seq_len(length(files))) { + file <- files[i] + rep <- replicates[i] -masterTable <- masterTable %>% group_by(condition,replicate) %>% - select(condition, replicate, name, dna_counts, rna_counts, dna_normalized, rna_normalized, ratio, log2, n_obs_bc) + table <- as.data.frame(read.table(file, header = TRUE), stringsAsFactors = FALSE) + table$replicate <- rep -masterTableFiltered <- masterTable %>% - filter(n_obs_bc >= thresh) %>% - mutate(ratio = ratio/median(ratio), log2 = round(log2(ratio),8)) -masterTable <- masterTable %>% mutate(ratio = ratio/median(ratio), log2 = round(log2(ratio),8)) + master_table <- master_table %>% bind_rows(table) +} -writeFile <- function(file,table) { +master_table <- master_table %>% + group_by(replicate) %>% + filter(!oligo_name %in% c("no_BC")) %>% + mutate( + dna_normalized = round(dna_normalized, precision), + rna_normalized = round(rna_normalized, precision), + ratio = round(ratio, precision), + log2FoldChange = round(log2FoldChange, precision) + ) + +master_table_filtered <- master_table %>% + filter(n_bc >= thresh) +# TODO use this? Maybe not because it normalizes across all replicates +# master_table_filtered %>% mutate(ratio = ratio / median(ratio)) + +# TODO use this? Maybe not because it normalizes across all replicates +# master_table %>% mutate(ratio = ratio / median(ratio)) + +write_file <- function(file, table) { gz <- gzfile(file, "w") - write.table(table,file=gz,quote=FALSE,sep='\t',row.names=FALSE,col.names=TRUE ) + write.table( + table, + file = gz, quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE + ) close(gz) } -writeFile(outfile,masterTableFiltered) +write_file( + outfile, + master_table_filtered %>% + select(replicate, oligo_name, dna_counts, rna_counts, dna_normalized, rna_normalized, log2FoldChange, n_bc) +) if ("output-all" %in% names(opt)) { - writeFile(opt$`output-all`, masterTable) + write_file( + opt$`output-all`, + master_table %>% + select(replicate, oligo_name, dna_counts, rna_counts, dna_normalized, rna_normalized, log2FoldChange, n_bc) + ) } -##MAKE AVERAGED ACROSS REPLICATES -makeAverageAcrossReplicates <- function(table, name) { +## MAKE AVERAGED ACROSS REPLICATES +make_average_across_replicates <- function(table, name) { avg <- table %>% summarize( - mean_ratio=mean(ratio), - mean_log2=log2(mean(ratio)), - mean_n_obs_bc=mean(n_obs_bc) - ) - avg$BC_filter = name + mean_ratio = mean(ratio), + mean_log2FoldChange = mean(log2FoldChange), + mean_n_bc = mean(n_bc) + ) + avg$BC_filter <- name return(avg) } -all_avg <- makeAverageAcrossReplicates(masterTable, 'None') %>% - bind_rows(makeAverageAcrossReplicates(masterTableFiltered, paste0("n_obs_bc >= ", thresh))) - -writeFile(avg_outfile, all_avg) +all_avg <- make_average_across_replicates(master_table, "None") %>% + bind_rows(make_average_across_replicates(master_table_filtered, paste0("n_bc >= ", thresh))) +write_file(avg_outfile, all_avg) -print('done') +print("done") diff --git a/workflow/scripts/count/merge_label.py b/workflow/scripts/count/merge_label.py index 2a0b16d3..3117f1d3 100644 --- a/workflow/scripts/count/merge_label.py +++ b/workflow/scripts/count/merge_label.py @@ -39,6 +39,21 @@ type=int, help="Minimum number of DNA counts required per barcode. If 0 pesudocounts are used.", ) +@click.option( + "--inlclude-not-assigned-for-normalization/--exclude-not-assigned-for-normalization", + "normalize_with_not_assigned", + default=False, + show_default=True, + help="Use barcodes that are not assigned for normalization.", +) +@click.option( + "--scaling", + "scaling", + default=10**6, + show_default=True, + type=float, + help="Scaling parameter. Usually counts per million (10**6).", +) @click.option( "--output", "output_file", @@ -64,6 +79,8 @@ def cli( assignment_file, minRNACounts, minDNACounts, + normalize_with_not_assigned, + scaling, output_file, statistic_file, bc_output_file, @@ -97,19 +114,19 @@ def cli( header=None, usecols=[0, 1], sep="\t", - names=["Barcode", "Oligo"], + names=["barcode", "oligo"], ) # drop duplicated barcodes!!! - assoc.drop_duplicates("Barcode", keep=False, inplace=True) - assoc.set_index("Barcode", inplace=True) + assoc.drop_duplicates("barcode", keep=False, inplace=True) + assoc.set_index("barcode", inplace=True) - statistic["oligos design"] = assoc.Oligo.nunique() + statistic["oligos design"] = assoc.oligo.nunique() statistic[["barcodes design"]] = assoc.shape[0] # get count df click.echo("Read count file...") counts = pd.read_csv( - counts_file, sep="\t", header=None, names=["Barcode", "dna_count", "rna_count"] + counts_file, sep="\t", header=None, names=["barcode", "dna_count", "rna_count"] ) # filter counts = counts[ @@ -120,11 +137,11 @@ def cli( # fill in labels from dictionary click.echo("Combine assignment with count file...") - counts = pd.merge(assoc, counts, how="right", on="Barcode") - counts.Oligo.fillna("no_BC", inplace=True) - counts.rename(columns={"Oligo": "name"}, inplace=True) - # counts = counts[['name', 'Barcode', 'dna_count', 'rna_count']] - statistic[["unknown barcodes dna/rna"]] = counts[counts.name == "no_BC"].shape[0] + counts = pd.merge(assoc, counts, how="right", on="barcode") + counts.oligo.fillna("no_BC", inplace=True) + counts.rename(columns={"oligo": "oligo_name"}, inplace=True) + # counts = counts[['oligo_name', 'barcode', 'dna_count', 'rna_count']] + statistic[["unknown barcodes dna/rna"]] = counts[counts.oligo_name == "no_BC"].shape[0] statistic["matched barcodes"] = ( statistic["barcodes dna/rna"] - statistic["unknown barcodes dna/rna"] @@ -137,14 +154,18 @@ def cli( counts.to_csv(bc_output_file, index=False, sep="\t", compression="gzip") # remove Barcorde. Not needed anymore - counts.drop(["Barcode"], axis=1, inplace=True) + counts.drop(["barcode"], axis=1, inplace=True) # number of DNA and RNA counts total_dna_counts = sum(counts["dna_count"]) total_rna_counts = sum(counts["rna_count"]) + unknown_dna_counts = sum(counts[counts.oligo_name == "no_BC"]["dna_count"]) + unknown_rna_counts = sum(counts[counts.oligo_name == "no_BC"]["rna_count"]) + statistic["total dna counts"] = total_dna_counts statistic["total rna counts"] = total_rna_counts + statistic["avg dna counts per bc"] = ( statistic["total dna counts"] / statistic["barcodes dna/rna"] ) @@ -152,9 +173,18 @@ def cli( statistic["total rna counts"] / statistic["barcodes dna/rna"] ) - grouped_label = counts.groupby("name").agg( + # group by oligo name and inlcude or exclude not assigned barcodes depending on flag given + if normalize_with_not_assigned: + grouped_label = counts.groupby("oligo_name").agg( {"dna_count": ["sum", "count"], "rna_count": ["sum", "count"]} - ) + ) + else: + grouped_label = counts[counts.oligo_name != "no_BC"].groupby("oligo_name").agg( + {"dna_count": ["sum", "count"], "rna_count": ["sum", "count"]} + ) + total_dna_counts -= unknown_dna_counts + total_rna_counts -= unknown_rna_counts + grouped_label.reset_index(inplace=True) # add pseudo BC counts to total number of counts if needed @@ -169,18 +199,16 @@ def cli( click.echo(grouped_label.head()) - output["name"] = grouped_label["name"] + output["oligo_name"] = grouped_label["oligo_name"] output["dna_counts"] = grouped_label.dna_count["sum"] output["rna_counts"] = grouped_label.rna_count["sum"] - statistic["oligos dna/rna"] = len(grouped_label) - 1 + statistic["oligos dna/rna"] = len(grouped_label) -1 if normalize_with_not_assigned else len(grouped_label) statistic["avg dna/rna barcodes per oligo"] = ( - sum(grouped_label[grouped_label["name"] != "no_BC"].dna_count["count"]) + sum(grouped_label[grouped_label["oligo_name"] != "no_BC"].dna_count["count"]) / statistic["oligos dna/rna"] ) - # scaling = 10**min([len(str(total_dna_counts))-1,len(str(total_rna_counts))-1]) - scaling = 10**6 output["dna_normalized"] = ( ( @@ -207,9 +235,9 @@ def cli( ) output["ratio"] = output["rna_normalized"] / output["dna_normalized"] - output["log2"] = np.log2(output.ratio) + output["log2FoldChange"] = np.log2(output.ratio) - output["n_obs_bc"] = grouped_label.dna_count["count"] + output["n_bc"] = grouped_label.dna_count["count"] click.echo(output_file) diff --git a/workflow/scripts/count/merge_replicates_barcode_counts.py b/workflow/scripts/count/merge_replicates_barcode_counts.py index a5f79056..a5495264 100644 --- a/workflow/scripts/count/merge_replicates_barcode_counts.py +++ b/workflow/scripts/count/merge_replicates_barcode_counts.py @@ -29,82 +29,93 @@ ) @click.option( "--output", + "output_threshold_file", + required=True, + type=click.Path(writable=True), + help="Output file.", +) +@click.option( + "--output-threshold", "output_file", required=True, type=click.Path(writable=True), help="Output file.", ) -def cli(counts_files, bc_thresh, replicates, output_file): - """ - Merge the associated barcode count files of all replicates. - """ - - # ensure there are as many replicates as there are files - if len(replicates) != len(counts_files): - raise ( - click.BadParameter( - "Number of replicates ({}) doesn't equal the number of files ({}).".format( - len(replicates), len(counts_files) - ) - ) - ) +def cli(counts_files, bc_thresh, replicates, output_threshold_file, output_file): + """ + Merge the associated barcode count files of all replicates. + """ - # check if every file exists - for file in counts_files: - if not os.path.exists(file): - raise (click.BadParameter("{}: file not found".format(file))) + # ensure there are as many replicates as there are files + if len(replicates) != len(counts_files): + raise ( + click.BadParameter( + "Number of replicates ({}) doesn't equal the number of files ({}).".format( + len(replicates), len(counts_files) + ) + ) + ) - all_reps = [] - for file in counts_files: - curr_rep = -1 - # find the replicate name of the current file - for rep in replicates: - if rep in os.path.basename(file).split("_")[1]: - curr_rep = rep - break - if curr_rep == -1: - raise (click.BadParameter("{}: incorrect file".format(file))) - df = pd.read_csv(file, sep="\t") - df['replicate'] = curr_rep - all_reps.append(df) + # check if every file exists + for file in counts_files: + if not os.path.exists(file): + raise (click.BadParameter("{}: file not found".format(file))) - df = pd.concat(all_reps) - df = df[df["name"] != "no_BC"] + all_reps = [] + for file in counts_files: + curr_rep = -1 + # find the replicate name of the current file + for rep in replicates: + if rep in os.path.basename(file).split("_")[1]: + curr_rep = rep + break + if curr_rep == -1: + raise (click.BadParameter("{}: incorrect file".format(file))) + df = pd.read_csv(file, sep="\t") + df['replicate'] = curr_rep + all_reps.append(df) - # only keep oligo's with a number of barcodes of at least the given threshold - df_filtered = df.groupby(["name", "replicate"]).filter(lambda x: len(x) >= bc_thresh) + df = pd.concat(all_reps) + df = df[df["oligo_name"] != "no_BC"] - # pivot table to make a dna and rna count column for every replicate - df_filtered = df_filtered.pivot_table( - values=["dna_count", "rna_count"], - index=["Barcode", "name"], - columns="replicate", - aggfunc='first' - ) - df_filtered = df_filtered.sort_values("name") + def pivot_table(df, replicates): + # pivot table to make a dna and rna count column for every replicate + df = df.pivot_table( + values=["dna_count", "rna_count"], + index=["barcode", "oligo_name"], + columns="replicate", + aggfunc='first' + ) + df = df.sort_values("oligo_name") - # order columns to have dna then rna count of each replicate - col_order = sum( - [ - ["dna_count_" + rep, "rna_count_" + rep] - for rep in replicates - ], - [], - ) + # order columns to have dna then rna count of each replicate + col_order = sum( + [ + ["dna_count_" + rep, "rna_count_" + rep] + for rep in replicates + ], + [], + ) - df_filtered = df_filtered.reset_index() + df = df.reset_index() - df_filtered.columns = ['_'.join(col).strip() if col[1] else col[0] for col in df_filtered.columns.values] - - df_filtered = df_filtered[["Barcode", "name"] + col_order] + df.columns = ['_'.join(col).strip() if col[1] else col[0] for col in df.columns.values] + + df = df[["barcode", "oligo_name"] + col_order] - for col in col_order: - df_filtered[col] = df_filtered[col].astype('Int64') + for col in col_order: + df[col] = df[col].astype('Int32') + + return df - # write to output file - df_filtered.to_csv(output_file, sep="\t", index=False, compression="gzip") + # only keep oligo's with a number of barcodes of at least the given threshold + df_filtered = df.groupby(["oligo_name", "replicate"]).filter(lambda x: len(x) >= bc_thresh) + + # write to output file + pivot_table(df_filtered,replicates).to_csv(output_threshold_file, sep="\t", index=False, compression="gzip") + pivot_table(df,replicates).to_csv(output_file, sep="\t", index=False, compression="gzip") if __name__ == "__main__": - cli() + cli() diff --git a/workflow/scripts/count/plot_perInsertCounts_correlation.R b/workflow/scripts/count/plot_perInsertCounts_correlation.R index 6042e2d1..9e07dd73 100644 --- a/workflow/scripts/count/plot_perInsertCounts_correlation.R +++ b/workflow/scripts/count/plot_perInsertCounts_correlation.R @@ -64,7 +64,7 @@ if ("label" %in% names(opt)) { header = TRUE, stringsAsFactors = FALSE )) - colnames(label_f) <- c("name", "label") + colnames(label_f) <- c("oligo_name", "label") use_labels <- TRUE } else { use_labels <- FALSE @@ -281,7 +281,7 @@ read_data <- function(file) { header = TRUE, stringsAsFactors = FALSE ) %>% - filter(name != "no_BC") %>% + filter(oligo_name != "no_BC") %>% mutate( dna_normalized_log2 = log2(dna_normalized), rna_normalized_log2 = log2(rna_normalized), @@ -306,7 +306,7 @@ for (n in 1:(data %>% nrow())) { if (use_labels) { all <- all %>% - left_join(label_f, by = c("name")) %>% + left_join(label_f, by = c("oligo_name")) %>% mutate(label = replace_na(label, "NA")) } else { if (nrow(all) > 0) { # can be 0 when no BCs are assigned @@ -339,13 +339,13 @@ if (data %>% nrow() > 1 && nrow(all) > 1) { n_oligos_r2 <- data2 %>% nrow() n_oligos_r1_thres <- data1 %>% - filter(n_obs_bc >= thresh) %>% + filter(n_bc >= thresh) %>% nrow() n_oligos_r2_thres <- data2 %>% - filter(n_obs_bc >= thresh) %>% + filter(n_bc >= thresh) %>% nrow() - res <- data1 %>% inner_join(data2, by = c("name")) + res <- data1 %>% inner_join(data2, by = c("oligo_name")) plots_correlations_dna[[i]] <- plot_correlations_dna(res, cond, r1, r2, "pairwise") @@ -369,7 +369,7 @@ if (data %>% nrow() > 1 && nrow(all) > 1) { # Min Threshold res <- - res %>% filter(n_obs_bc.x >= thresh, n_obs_bc.y >= thresh) + res %>% filter(n_bc.x >= thresh, n_bc.y >= thresh) plots_cor_min_thresh_dna[[i]] <- plot_correlations_dna(res, cond, r1, r2, "pairwise_minThreshold") plots_cor_min_thresh_rna[[i]] <- diff --git a/workflow/scripts/count/plot_perInsertCounts_stats.R b/workflow/scripts/count/plot_perInsertCounts_stats.R index 93f68988..00528297 100644 --- a/workflow/scripts/count/plot_perInsertCounts_stats.R +++ b/workflow/scripts/count/plot_perInsertCounts_stats.R @@ -65,7 +65,7 @@ if ("label" %in% names(opt)) { header = TRUE, stringsAsFactors = FALSE )) - colnames(label_f) <- c("name", "label") + colnames(label_f) <- c("oligo_name", "label") use_labels <- TRUE } else { use_labels <- FALSE @@ -95,11 +95,10 @@ read_data <- function(file) { header = TRUE, stringsAsFactors = FALSE ) %>% - filter(name != "no_BC") %>% + filter(oligo_name != "no_BC") %>% mutate( dna_normalized_log2 = log2(dna_normalized), rna_normalized_log2 = log2(rna_normalized), - ratio_log2 = log2(ratio) ) return(data) } @@ -117,7 +116,7 @@ for (n in 1:(data %>% nrow())) { if (use_labels) { all <- all %>% - left_join(label_f, by = c("name")) %>% + left_join(label_f, by = c("oligo_name")) %>% mutate(label = replace_na(label, "NA")) } else { all$label <- "NA" @@ -128,7 +127,7 @@ print("Histogram plots, RNA/DNA correlation plots, Violin plots") plot_median_dna_rna_cor <- function(data) { data <- data %>% - group_by(name) %>% + group_by(oligo_name) %>% summarise(dna_normalized = median(log10(dna_normalized)), rna_normalized = median(log10(rna_normalized)), n = n()) data <- data %>% filter(n == length(replicates)) p <- ggplot(data, aes(x = dna_normalized, y = rna_normalized)) + @@ -151,7 +150,7 @@ plot_median_dna_rna_cor <- function(data) { } plot_group_bc_per_insert <- function(data) { - bp <- ggplot(data, aes(x = label, y = log2, fill = label)) + + bp <- ggplot(data, aes(x = label, y = log2FoldChange, fill = label)) + geom_violin() + geom_boxplot(width = 0.1, fill = "white") + xlab("insert") + @@ -180,7 +179,7 @@ ggsave(sprintf("%s_dna_vs_rna.png", outdir), width = 10, height = 10 ) ggsave(sprintf("%s_dna_vs_rna_minThreshold.png", outdir), - plot_median_dna_rna_cor(all %>% filter(n_obs_bc >= thresh)), + plot_median_dna_rna_cor(all %>% filter(n_bc >= thresh)), width = 10, height = 10 ) @@ -196,9 +195,9 @@ for (n in 1:(data %>% nrow())) { assigned_counts <- all %>% filter(replicate == rep) # Histograms - intercept <- median(assigned_counts$n_obs_bc) + intercept <- median(assigned_counts$n_bc) hist_plot_list[[n]] <- - ggplot(assigned_counts, aes(x = n_obs_bc)) + + ggplot(assigned_counts, aes(x = n_bc)) + geom_histogram(bins = 300) + geom_vline(xintercept = intercept, colour = "red") + xlim(0, 300) + @@ -210,7 +209,7 @@ for (n in 1:(data %>% nrow())) { ggtitle(paste("replicate", rep, sep = " ")) box_plot_insert_thresh_list[[n]] <- - plot_group_bc_per_insert(assigned_counts %>% filter(n_obs_bc >= thresh)) + + plot_group_bc_per_insert(assigned_counts %>% filter(n_bc >= thresh)) + ggtitle(paste("replicate", rep, sep = " ")) } diff --git a/workflow/scripts/variants/correlateVariantTables.py b/workflow/scripts/variants/correlateVariantTables.py index a1d69069..b79e9f33 100644 --- a/workflow/scripts/variants/correlateVariantTables.py +++ b/workflow/scripts/variants/correlateVariantTables.py @@ -32,7 +32,7 @@ def cli(condition, variants, bc_threshold, output_file): def filterOnThreshold(variants, threshold): - return(variants.query('n_obs_bc_ALT >= %d & n_obs_bc_REF >= %d' % (threshold, threshold))) + return(variants.query('n_bc_ALT >= %d & n_bc_REF >= %d' % (threshold, threshold))) output = pd.DataFrame() for i in range(len(variants)): diff --git a/workflow/scripts/variants/generateMasterVariantTable.py b/workflow/scripts/variants/generateMasterVariantTable.py index f77c90f0..0828466c 100644 --- a/workflow/scripts/variants/generateMasterVariantTable.py +++ b/workflow/scripts/variants/generateMasterVariantTable.py @@ -49,18 +49,18 @@ def cli(input_files, minRNACounts, minDNACounts, output_file): click.echo("Create new expression values...") df = df.groupby(['ID', 'REF', 'ALT']).agg(dna_counts_REF=('dna_counts_REF', sum), rna_counts_REF=('rna_counts_REF', sum), - n_obs_bc_REF=('n_obs_bc_REF', sum), + n_bc_REF=('n_bc_REF', sum), dna_counts_ALT=('dna_counts_ALT', sum), rna_counts_ALT=('rna_counts_ALT', sum), - n_obs_bc_ALT=('n_obs_bc_ALT', sum) + n_bc_ALT=('n_bc_ALT', sum) ).reset_index() scaling = 10**6 - df_total = df[['dna_counts_REF', 'rna_counts_REF', 'n_obs_bc_REF', - 'dna_counts_ALT', 'rna_counts_ALT', 'n_obs_bc_ALT']].sum() + df_total = df[['dna_counts_REF', 'rna_counts_REF', 'n_bc_REF', + 'dna_counts_ALT', 'rna_counts_ALT', 'n_bc_ALT']].sum() - total_bc = df_total['n_obs_bc_REF'] + df_total['n_obs_bc_ALT'] + total_bc = df_total['n_bc_REF'] + df_total['n_bc_ALT'] total_dna = df_total['dna_counts_REF'] + df_total['dna_counts_ALT'] + total_bc * pseudocountDNA total_rna = df_total['rna_counts_REF'] + df_total['rna_counts_ALT'] + total_bc * pseudocountRNA @@ -68,8 +68,8 @@ def cli(input_files, minRNACounts, minDNACounts, output_file): def normalize(df, total, count_type, ref_alt, pseudocount, scaling): return((( - df["%s_counts_%s" % (count_type, ref_alt)] + pseudocount * df['n_obs_bc_%s' % ref_alt] - )/df['n_obs_bc_%s' % ref_alt])/total*scaling) + df["%s_counts_%s" % (count_type, ref_alt)] + pseudocount * df['n_bc_%s' % ref_alt] + )/df['n_bc_%s' % ref_alt])/total*scaling) for ref_alt in ["REF", "ALT"]: for count_type in ["dna", "rna"]: @@ -80,20 +80,20 @@ def normalize(df, total, count_type, ref_alt, pseudocount, scaling): df, total, count_type, ref_alt, pseudocount, scaling) df['ratio_%s' % ref_alt] = df['rna_normalized_%s' % ref_alt] / df['dna_normalized_%s' % ref_alt] - df['log2_%s' % ref_alt] = np.log2(df['ratio_%s' % ref_alt]) + df['log2FoldChange_%s' % ref_alt] = np.log2(df['ratio_%s' % ref_alt]) - df["log2_expression"] = np.log2(df['ratio_ALT']/df['ratio_REF']) + df["log2FoldChange_expression"] = np.log2(df['ratio_ALT']/df['ratio_REF']) # fill NA and set correct output types df.fillna(0, inplace=True) - df = df.astype(dtype={'dna_counts_REF': 'int64', 'rna_counts_REF': 'int64', 'n_obs_bc_REF': 'int64', - 'dna_counts_ALT': 'int64', 'rna_counts_ALT': 'int64', 'n_obs_bc_ALT': 'int64'}, copy=False) + df = df.astype(dtype={'dna_counts_REF': 'int64', 'rna_counts_REF': 'int64', 'n_bc_REF': 'int64', + 'dna_counts_ALT': 'int64', 'rna_counts_ALT': 'int64', 'n_bc_ALT': 'int64'}, copy=False) df = df.reindex(columns=["ID", "REF", "ALT", "dna_counts_REF", "rna_counts_REF", - "dna_normalized_REF", "rna_normalized_REF", "ratio_REF", "log2_REF", - "n_obs_bc_REF", "dna_counts_ALT", "rna_counts_ALT", + "dna_normalized_REF", "rna_normalized_REF", "ratio_REF", "log2FoldChange_REF", + "n_bc_REF", "dna_counts_ALT", "rna_counts_ALT", "dna_normalized_ALT", "rna_normalized_ALT", "ratio_ALT", - "log2_ALT", "n_obs_bc_ALT", "log2_expression"]) + "log2FoldChange_ALT", "n_bc_ALT", "log2FoldChange_expression"]) # write output click.echo("Write files...") diff --git a/workflow/scripts/variants/generateVariantTable.py b/workflow/scripts/variants/generateVariantTable.py index fa2d719a..bf465248 100644 --- a/workflow/scripts/variants/generateVariantTable.py +++ b/workflow/scripts/variants/generateVariantTable.py @@ -32,18 +32,18 @@ def cli(counts_file, declaration_file, output_file): # get count df click.echo("Read count file...") - # expected column names: name dna_counts rna_counts dna_normalized rna_normalized ratio log2 n_obs_bc + # expected column names: name dna_counts rna_counts dna_normalized rna_normalized ratio log2FoldChangeFoldChange n_bc counts = pd.read_csv(counts_file, header=0, sep="\t", index_col=0) # join ref with index of counts output = declaration.join(counts, on='REF') # join again for the alt columns output = output.join(counts, on='ALT', lsuffix='_REF', rsuffix='_ALT') - output["log2_expression"] = np.log2(output['ratio_ALT']/output['ratio_REF']) + output["log2FoldChange_expression"] = np.log2(output['ratio_ALT']/output['ratio_REF']) # fill NA and set correct output types output.fillna(0, inplace=True) - output = output.astype(dtype={'dna_counts_REF': 'int64', 'rna_counts_REF': 'int64', 'n_obs_bc_REF': 'int64', - 'dna_counts_ALT': 'int64', 'rna_counts_ALT': 'int64', 'n_obs_bc_ALT': 'int64'}, + output = output.astype(dtype={'dna_counts_REF': 'int64', 'rna_counts_REF': 'int64', 'n_bc_REF': 'int64', + 'dna_counts_ALT': 'int64', 'rna_counts_ALT': 'int64', 'n_bc_ALT': 'int64'}, copy=False) # write output