diff --git a/.gitignore b/.gitignore index 83c4247..338adbb 100644 --- a/.gitignore +++ b/.gitignore @@ -15,4 +15,5 @@ analysis/* !.test !LICENSE !README.md +!CHANGELOG.md !*.smk \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..32e21c4 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,19 @@ +# cfDNA-Workflow Changelog + +## develop + +## v0.1.2 + +### refactor + +- overlays.py + - adding samples is now wrapped in a function + - flanking regions are now determined dynamically + +### documentation + +- added README.md in config dir + +- updated README.md in root dir + - updated input sections for workflows documentation + - added links to .tsv files and README.md in config dir diff --git a/README.md b/README.md index 1ec0561..9a15f71 100644 --- a/README.md +++ b/README.md @@ -135,9 +135,14 @@ Windowed protection scores are calculated for all provided regions with addition #### Input -- configured by user: - - sample bam file (samples.tsv) - - bed file containing regions of interest (e.g. TFBS), all having the same length (regions.tsv) + +- configured by the user ([samples.tsv](config/samples.tsv)): + - samples + - path to sample bam files +- configured by the user ([regions.tsv](config/regions.tsv)): + - bed file containing regions of interest (e.g. TFBS), all having the same length + +**Note:** More information about config files [here](config/README.md) #### Output @@ -168,10 +173,12 @@ WPS was used to calculate periodograms of genomic regions using Fast Fourier Tra - annotations - labels - RNAtable from Protein Atlas -- configured by the user (samples.tsv): +- configured by the user ([samples.tsv](config/samples.tsv)): - samples - path to sample bam files +**Note:** More information about config files [here](config/README.md) + #### Output - fft_summary tables (results/intermediate/body/fft_summaries) diff --git a/config/README.md b/config/README.md new file mode 100644 index 0000000..ed5e900 --- /dev/null +++ b/config/README.md @@ -0,0 +1,47 @@ +# Documentation for config files + +## config.yml + +The config.yml file configures values that should stay constant between samples. + +```yml +samples: "config/samples.tsv" # .tsv file containing sample names and locations +regions: "config/regions.tsv" # .tsv file containing bed files with regions of interest + +tissue: ["CACO.2", "MCF7", "PC.3"] # proteinAtlas tissues for generating plots in GE workflow +refSample: "NPH001" # reference sample for rank correlation comparison +minRL: 120 # minimum read length for calculating WPS +maxRL: 180 # maximum read length for calculating WPS +bpProtection: 120 # value for WPS window +``` + +## samples.tsv + +The samples.tsv contains a header with four columns: + +```bash +ID sample path ref_samples +experimentID testsample1 "/path/to/testsample1.bam" testsample2,testsample3 +experimentID testsample2 "/path/to/testsample2.bam" testsample1,testsample3 +experimentID testsample3 "/path/to/testsample3.bam" testsample1,testsample2 +``` + +- **ID** - ID for a certain analysis to create identifiable directories and/or filenames +- **sample** - sample name used to identify files +- **path** - path to input file +- **ref_sample** - Reference sample for some visualizations/calculations. ref_samples are comma separated, must be in present in the sample column and every sample needs a ref_sample (e.g. itself). + +## regions.tsv + +The regions.tsv contains a header with two columns: + +```text +target path +gene1 /path/to/gene1.bed +TF1 /pat/to/TF1BS.bed +``` + +- **target** - describes the targets defined in the correspoding .bed file +- **path** - path to input .bed file containing coordinates of interest (all coordinates should be centered around a specific feature and of same length) + +**Note:** .bed has to contain the first 6 fields (chrom, chromStart, chromEnd, name, value, strand), even though name and value are not actively used. diff --git a/workflow/scripts/WPS/overlays.py b/workflow/scripts/WPS/overlays.py index 596868e..349b94c 100755 --- a/workflow/scripts/WPS/overlays.py +++ b/workflow/scripts/WPS/overlays.py @@ -6,14 +6,13 @@ :Date: 08.09.2020 """ -# imports - -import pandas as pd - -from scipy import stats import matplotlib -matplotlib.use('pdf') +import pandas as pd from matplotlib.backends.backend_pdf import PdfPages +from scipy import stats + +matplotlib.use("pdf") + # get variables from snakefile @@ -26,35 +25,81 @@ target = snakemake.params["target"] outfile = snakemake.output[0] -# load tables containing position specific scores for all defined target regions +-1000 bp +# def functions + + +def calculate_flanking_regions(val: int): + """Calculates flanking regions for point of interest. + + Args: + val (int): should be length of value vector + + Raises: + TypeError: Only integers are allowed + + Returns: + [iterator]: range of values around center point (e.g. range(-1000,1000)) + """ + + if not isinstance(val, int): + raise TypeError("Only integers are allowed") + + if val % 2 == 0: + flank = int(val / 2) + region = range(-flank, flank) + elif val % 2 == 1: + flank_l = int(val / 2 - 0.5) + flank_r = int(val / 2 + 0.5) + region = range(-flank_l, flank_r) + return region + + +def add_sample(path: str): + """Reads .csv file, calculates mean over all rows and divides by trimmed mean. + + Args: + path (str): Path to a .csv file + + Returns: + [type]: [description] + """ + sample = pd.read_csv(path, header=None).mean() + sample = sample / stats.trim_mean(sample, 0.25) + return sample + + +# load tables containing position specific scores for all defined target regions # average over all regions per sample and substract the trimmed mean to normalise + av_WPS = pd.DataFrame() -av_WPS[sample_ID] = pd.read_csv(WPS, header=None).mean() -av_WPS[sample_ID] = av_WPS[sample_ID]/stats.trim_mean(av_WPS[sample_ID], 0.25) +av_WPS[sample_ID] = add_sample(WPS) for (ref_ID, WPS_ref) in zip(ref_IDs, WPS_refs): - av_WPS[ref_ID] = pd.read_csv(WPS_ref, header=None).mean() - av_WPS[ref_ID] = av_WPS[ref_ID]/stats.trim_mean(av_WPS[ref_ID], 0.25) + av_WPS[ref_ID] = add_sample(WPS_ref) -av_WPS["position"] = range(-1001,1001) +av_WPS["position"] = calculate_flanking_regions(len(av_WPS)) av_WPS = av_WPS.set_index("position") - av_COV = pd.DataFrame() -av_COV[sample_ID] = pd.read_csv(COV, header=None).mean() -av_COV[sample_ID] = av_COV[sample_ID]/stats.trim_mean(av_COV[sample_ID], 0.25) +av_COV[sample_ID] = add_sample(COV) for (ref_ID, COV_ref) in zip(ref_IDs, COV_refs): - print(ref_ID, COV_ref) - av_COV[ref_ID] = pd.read_csv(COV_ref, header=None).mean() - av_COV[ref_ID] = av_COV[ref_ID]/stats.trim_mean(av_COV[ref_ID], 0.25) + av_COV[ref_ID] = add_sample(COV_ref) -av_COV["position"] = range(-1001,1001) +av_COV["position"] = calculate_flanking_regions(len(av_WPS)) av_COV = av_COV.set_index("position") # create line plots and save to a single pdf with PdfPages(outfile) as pdf: - Fig_WPS = av_WPS.plot(title=f"adjusted WPS: {target} target regions", xlabel="Position relative to target site", ylabel="adjusted WPS") - Fig_Cov = av_COV.plot(title=f"adjusted read coverage: {target} target regions", xlabel="Position relative to target site", ylabel="adjusted read coverage") + Fig_WPS = av_WPS.plot( + title=f"adjusted WPS: {target} target regions", + xlabel="Position relative to target site", + ylabel="normalized WPS", + ) + Fig_Cov = av_COV.plot( + title=f"adjusted read coverage: {target} target regions", + xlabel="Position relative to target site", + ylabel="normalized read coverage", + ) pdf.savefig(Fig_WPS.get_figure()) pdf.savefig(Fig_Cov.get_figure())