Merge pull request #4 from kircherlab/develop

kircherlab · Oct 1, 2020 · 96adb92 · 96adb92
2 parents df43d85 + bbef199
commit 96adb92
Show file tree

Hide file tree

Showing 5 changed files with 144 additions and 25 deletions.
diff --git a/.gitignore b/.gitignore
@@ -15,4 +15,5 @@ analysis/*
 !.test
 !LICENSE
 !README.md
+!CHANGELOG.md
 !*.smk
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -0,0 +1,19 @@
+# cfDNA-Workflow Changelog
+
+## develop
+
+## v0.1.2
+
+### refactor
+
+- overlays.py
+    - adding samples is now wrapped in a function
+    - flanking regions are now determined dynamically
+
+### documentation
+
+- added README.md in config dir
+
+- updated README.md in root dir
+    - updated input sections for workflows documentation
+    - added links to .tsv files and README.md in config dir
diff --git a/README.md b/README.md
@@ -135,9 +135,14 @@ Windowed protection scores are calculated for all provided regions with addition
 
 #### Input
 
-- configured by user:
-    - sample bam file (samples.tsv)
-    - bed file containing regions of interest (e.g. TFBS), all having the same length (regions.tsv)
+
+- configured by the user ([samples.tsv](config/samples.tsv)):
+    - samples
+    - path to sample bam files
+- configured by the user ([regions.tsv](config/regions.tsv)):
+    - bed file containing regions of interest (e.g. TFBS), all having the same length
+
+**Note:** More information about config files [here](config/README.md)
 
 #### Output
 
@@ -168,10 +173,12 @@ WPS was used to calculate periodograms of genomic regions using Fast Fourier Tra
     - annotations
     - labels
     - RNAtable from Protein Atlas
-- configured by the user (samples.tsv):
+- configured by the user ([samples.tsv](config/samples.tsv)):
     - samples
     - path to sample bam files
 
+**Note:** More information about config files [here](config/README.md)
+
 #### Output
 
 - fft_summary tables (results/intermediate/body/fft_summaries)

diff --git a/config/README.md b/config/README.md
@@ -0,0 +1,47 @@
+# Documentation for config files
+
+## config.yml
+
+The config.yml file configures values that should stay constant between samples.
+
+```yml
+samples: "config/samples.tsv" # .tsv file containing sample names and locations
+regions: "config/regions.tsv" # .tsv file containing bed files with regions of interest
+
+tissue: ["CACO.2", "MCF7", "PC.3"] # proteinAtlas tissues for generating plots in GE workflow
+refSample: "NPH001" # reference sample for rank correlation comparison
+minRL: 120 # minimum read length for calculating WPS
+maxRL: 180 # maximum read length for calculating WPS
+bpProtection: 120 # value for WPS window
+```
+
+## samples.tsv
+
+The samples.tsv contains a header with four columns:
+
+```bash
+ID	sample	path	ref_samples
+experimentID	testsample1	"/path/to/testsample1.bam"	testsample2,testsample3
+experimentID	testsample2	"/path/to/testsample2.bam"	testsample1,testsample3
+experimentID	testsample3	"/path/to/testsample3.bam"	testsample1,testsample2
+```
+
+- **ID** - ID for a certain analysis to create identifiable directories and/or filenames
+- **sample** - sample name used to identify files
+- **path** - path to input file
+- **ref_sample** - Reference sample for some        visualizations/calculations. ref_samples are comma separated, must be in present in the sample column and every sample needs a ref_sample (e.g. itself).
+
+## regions.tsv
+
+The regions.tsv contains a header with two columns:
+
+```text
+target  path
+gene1   /path/to/gene1.bed
+TF1     /pat/to/TF1BS.bed
+```
+
+- **target** - describes the targets defined in the correspoding .bed file
+- **path** - path to input .bed file containing coordinates of interest (all coordinates should be centered around a specific feature and of same length)
+
+**Note:** .bed has to contain the first 6 fields (chrom, chromStart, chromEnd, name, value, strand), even though name and value are not actively used.
diff --git a/workflow/scripts/WPS/overlays.py b/workflow/scripts/WPS/overlays.py
@@ -6,14 +6,13 @@
 :Date: 08.09.2020
 """
 
-# imports
-
-import pandas as pd
-
-from scipy import stats
 import matplotlib
-matplotlib.use('pdf')
+import pandas as pd
 from matplotlib.backends.backend_pdf import PdfPages
+from scipy import stats
+
+matplotlib.use("pdf")
+
 
 # get variables from snakefile
 
@@ -26,35 +25,81 @@
 target = snakemake.params["target"]
 outfile = snakemake.output[0]
 
-# load tables containing position specific scores for all defined target regions +-1000 bp
+# def functions
+
+
+def calculate_flanking_regions(val: int):
+    """Calculates flanking regions for point of interest.
+
+    Args:
+        val (int): should be length of value vector
+
+    Raises:
+        TypeError: Only integers are allowed
+
+    Returns:
+        [iterator]: range of values around center point (e.g. range(-1000,1000))
+    """
+
+    if not isinstance(val, int):
+        raise TypeError("Only integers are allowed")
+
+    if val % 2 == 0:
+        flank = int(val / 2)
+        region = range(-flank, flank)
+    elif val % 2 == 1:
+        flank_l = int(val / 2 - 0.5)
+        flank_r = int(val / 2 + 0.5)
+        region = range(-flank_l, flank_r)
+    return region
+
+
+def add_sample(path: str):
+    """Reads .csv file, calculates mean over all rows and divides by trimmed mean.
+
+    Args:
+        path (str): Path to a .csv file
+
+    Returns:
+        [type]: [description]
+    """
+    sample = pd.read_csv(path, header=None).mean()
+    sample = sample / stats.trim_mean(sample, 0.25)
+    return sample
+
+
+# load tables containing position specific scores for all defined target regions
 # average over all regions per sample and substract the trimmed mean to normalise
 
+
 av_WPS = pd.DataFrame()
-av_WPS[sample_ID] = pd.read_csv(WPS, header=None).mean()
-av_WPS[sample_ID] = av_WPS[sample_ID]/stats.trim_mean(av_WPS[sample_ID], 0.25)
+av_WPS[sample_ID] = add_sample(WPS)
 for (ref_ID, WPS_ref) in zip(ref_IDs, WPS_refs):
-    av_WPS[ref_ID] = pd.read_csv(WPS_ref, header=None).mean()
-    av_WPS[ref_ID] = av_WPS[ref_ID]/stats.trim_mean(av_WPS[ref_ID], 0.25)
+    av_WPS[ref_ID] = add_sample(WPS_ref)
 
-av_WPS["position"] = range(-1001,1001)
+av_WPS["position"] = calculate_flanking_regions(len(av_WPS))
 av_WPS = av_WPS.set_index("position")
 
-
 av_COV = pd.DataFrame()
-av_COV[sample_ID] = pd.read_csv(COV, header=None).mean()
-av_COV[sample_ID] = av_COV[sample_ID]/stats.trim_mean(av_COV[sample_ID], 0.25)
+av_COV[sample_ID] = add_sample(COV)
 for (ref_ID, COV_ref) in zip(ref_IDs, COV_refs):
-    print(ref_ID, COV_ref)
-    av_COV[ref_ID] = pd.read_csv(COV_ref, header=None).mean()
-    av_COV[ref_ID] = av_COV[ref_ID]/stats.trim_mean(av_COV[ref_ID], 0.25)
+    av_COV[ref_ID] = add_sample(COV_ref)
 
-av_COV["position"] = range(-1001,1001)
+av_COV["position"] = calculate_flanking_regions(len(av_WPS))
 av_COV = av_COV.set_index("position")
 
 # create line plots and save to a single pdf
 
 with PdfPages(outfile) as pdf:
-    Fig_WPS = av_WPS.plot(title=f"adjusted WPS: {target} target regions", xlabel="Position relative to target site", ylabel="adjusted WPS")
-    Fig_Cov = av_COV.plot(title=f"adjusted read coverage: {target} target regions", xlabel="Position relative to target site", ylabel="adjusted read coverage")
+    Fig_WPS = av_WPS.plot(
+        title=f"adjusted WPS: {target} target regions",
+        xlabel="Position relative to target site",
+        ylabel="normalized WPS",
+    )
+    Fig_Cov = av_COV.plot(
+        title=f"adjusted read coverage: {target} target regions",
+        xlabel="Position relative to target site",
+        ylabel="normalized read coverage",
+    )
     pdf.savefig(Fig_WPS.get_figure())
     pdf.savefig(Fig_Cov.get_figure())