Merge branch 'release-v0.2.0'

kircherlab · Apr 1, 2021 · 7e10fcc · 7e10fcc
2 parents 96adb92 + da21c72
commit 7e10fcc
Show file tree

Hide file tree

Showing 52 changed files with 5,876 additions and 88,496 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,13 +1,33 @@
 *
-analysis
-analysis/*
-!analysis/README.md
-!config
-!config/*
-!resources
-!resources/**
-!workflow
-!workflow/**
+
+!resources/
+!resources/genome/
+!resources/genome/**
+
+!resources/annotations/
+!resources/annotations/**
+
+!resources/blacklists/
+!resources/blacklists/**
+
+!resources/protein_atlas/
+!resources/protein_atlas/**
+
+!workflow/
+!workflow/envs/
+!workflow/envs/**
+
+!workflow/schemas/
+!workflow/schemas/**
+
+!workflow/scripts/
+!workflow/scripts/**
+
+!config/
+!config/example.config.yml
+!config/samples.tsv
+!config/regions.tsv
+
 !.gitignore
 !.gitattributes
 !.editorconfig

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,36 @@
 
 ## develop
 
+### refactor
+
+- changed config.yml to example.config.yml
+- updated schemas
+- updated Documentation
+
+### new files
+
+- updated blacklists for both genome builds
+- updated genome files for both genome builds
+
+### new feature
+
+- snakefile_GE_analysis:
+    - updated ProteinAtlas
+        - 3 types: Blood, Tissue, Tissue+cell-lines
+    - added support for GRCh38
+
+- snakefile_WPS:
+    - added support for normalization by random background sequences
+        - generate random background sequences not overlapping target regions
+        - calculate WPS,COV,STARTS table for background sequences
+
+- extractFromBAM_RegionBed_WPS_Cov.py
+    - added output for fragment endpoints (STARTS output)
+    - added strand specificity
+
+- overlays.py:
+    - integrated normalization by background sequences
+
 ## v0.1.2
 
 ### refactor
@@ -16,4 +46,4 @@
 
 - updated README.md in root dir
     - updated input sections for workflows documentation
-    - added links to .tsv files and README.md in config dir
+    - added links to .tsv files and README.md in config dir
diff --git a/README.md b/README.md
@@ -135,10 +135,12 @@ Windowed protection scores are calculated for all provided regions with addition
 
 #### Input
 
-
 - configured by the user ([samples.tsv](config/samples.tsv)):
+    - analysis ID
     - samples
-    - path to sample bam files
+    - path to sample .bam files
+    - reference samples for plotting
+    - genome build per sample
 - configured by the user ([regions.tsv](config/regions.tsv)):
     - bed file containing regions of interest (e.g. TFBS), all having the same length
 
@@ -172,10 +174,15 @@ WPS was used to calculate periodograms of genomic regions using Fast Fourier Tra
 - included in the repository:
     - annotations
     - labels
-    - RNAtable from Protein Atlas
+    - RNAtable from Protein Atlas 
+        - blood atlas ["Blood"]
+        - protein atlas tissues ["Tissue"]
+        - protein atlas tissues + cell lines ["Extended"]
 - configured by the user ([samples.tsv](config/samples.tsv)):
+    - analysis ID
     - samples
     - path to sample bam files
+    - genome build per sample
 
 **Note:** More information about config files [here](config/README.md)
 
@@ -184,4 +191,4 @@ WPS was used to calculate periodograms of genomic regions using Fast Fourier Tra
 - fft_summary tables (results/intermediate/body/fft_summaries)
 - plots showing intensities across tissues (results/plots)
 - table showing correlation with tissues/cell lines
-- table showing correlation rank difference to reference sample
+- table showing correlation rank difference to reference sample
diff --git a/config/README.md b/config/README.md
@@ -8,28 +8,46 @@ The config.yml file configures values that should stay constant between samples.
 samples: "config/samples.tsv" # .tsv file containing sample names and locations
 regions: "config/regions.tsv" # .tsv file containing bed files with regions of interest
 
-tissue: ["CACO.2", "MCF7", "PC.3"] # proteinAtlas tissues for generating plots in GE workflow
-refSample: "NPH001" # reference sample for rank correlation comparison
+proteinAtlas: "Blood" # RNAtable name; one of "Blood", "Tissue", "Extended"
+tissue: ["NK_cell", "memory_B_cell", "classical_monocyte", "basophil", "memory_CD4_T_cell", "memory_CD8_T_cell"] # tissues for generating plots
+refSample: "BH01" # reference sample for rank correlation comparison
 minRL: 120 # minimum read length for calculating WPS
 maxRL: 180 # maximum read length for calculating WPS
-bpProtection: 120 # value for WPS window
+bpProtection: 120 # value for WPS window
+
+## genome build specific options ##
+
+GRCh37:
+  genome: "resources/genome/hg19.fa.genome" #full .genome file
+  genome_autosomes: "resources/genome/hg19.fa.genome.regular_autosomes" # .genome file reduced to regular autosomes
+  UCSC_gap: "resources/blacklists/UCSC/UCSC_gap.hg19.bed" # UCSC_gap file in .bed format
+  transcriptAnno: "resources/annotations/transcriptAnno-GRCh37.103.tsv.gz" # file containing TSSs
+
+GRCh38:
+  genome: "resources/genome/hg38.fa.genome" #full .genome file
+  genome_autosomes: "resources/genome/hg38.fa.genome.regular_autosomes" # .genome file reduced to regular autosomes
+  UCSC_gap: "resources/blacklists/UCSC/UCSC_gap.hg38.bed" # UCSC_gap file in .bed format
+  transcriptAnno: "resources/annotations/transcriptAnno-GRCh38.103.tsv.gz" # file containing TSSs
 ```
 
 ## samples.tsv
 
 The samples.tsv contains a header with five columns:
 
 ```bash
-ID	sample	path	ref_samples
-experimentID	testsample1	"/path/to/testsample1.bam"	testsample2,testsample3
-experimentID	testsample2	"/path/to/testsample2.bam"	testsample1,testsample3
-experimentID	testsample3	"/path/to/testsample3.bam"	testsample1,testsample2
+ID	sample	path	ref_samples	genome_build
+experimentID	testsample1	"/path/to/testsample1.bam"	testsample2,testsample3	GRCh37
+experimentID	testsample2	"/path/to/testsample2.bam"	testsample1,testsample3	GRCh37
+experimentID	testsample3	"/path/to/testsample3.bam"	testsample1,testsample2	GRCh38
 ```
 
 - **ID** - ID for a certain analysis to create identifiable directories and/or filenames
 - **sample** - sample name used to identify files
 - **path** - path to input file
 - **ref_samples** - Reference samples for some visualizations/calculations. ref_samples are comma separated, must be present in the sample column, and every sample needs at least one ref_sample (e.g. itself).
+- **genome_build** - Defines the genome build to be used for a specific sample. Valid options are ["GRCh37","GRCh38"].
+
+**Note:** Input files should match the specified genome build.
 
 ## regions.tsv
 

diff --git a/config/config.yml b/config/config.yml
diff --git a/config/example.config.yml b/config/example.config.yml
@@ -0,0 +1,23 @@
+samples: "config/samples.tsv" # .tsv file containing sample names and locations
+regions: "config/regions.tsv" # .tsv file containing bed files with regions of interest
+
+proteinAtlas: "Blood" # RNAtable name; one of "Blood", "Tissue", "Extended"
+tissue: ["NK_cell", "memory_B_cell", "classical_monocyte", "basophil", "memory_CD4_T_cell", "memory_CD8_T_cell"] # tissues for generating plots
+refSample: "BH01" # reference sample for rank correlation comparison
+minRL: 120 # minimum read length for calculating WPS
+maxRL: 180 # maximum read length for calculating WPS
+bpProtection: 120 # value for WPS window
+
+## genome build specific options ##
+
+GRCh37:
+  genome: "resources/genome/hg19.fa.genome" #full .genome file
+  genome_autosomes: "resources/genome/hg19.fa.genome.regular_autosomes" # .genome file reduced to regular autosomes
+  UCSC_gap: "resources/blacklists/UCSC/UCSC_gap.hg19.bed" # UCSC_gap file in .bed format
+  transcriptAnno: "resources/annotations/transcriptAnno-GRCh37.103.tsv.gz" # file containing TSSs
+
+GRCh38:
+  genome: "resources/genome/hg38.fa.genome" #full .genome file
+  genome_autosomes: "resources/genome/hg38.fa.genome.regular_autosomes" # .genome file reduced to regular autosomes
+  UCSC_gap: "resources/blacklists/UCSC/UCSC_gap.hg38.bed" # UCSC_gap file in .bed format
+  transcriptAnno: "resources/annotations/transcriptAnno-GRCh38.103.tsv.gz" # file containing TSSs
diff --git a/config/samples.tsv b/config/samples.tsv
@@ -1,4 +1,4 @@
-ID	sample	path	ref_samples
-experimentID	testsample1	"/path/to/testsample1.bam"	testsample2,testsample3
-experimentID	testsample2	"/path/to/testsample2.bam"	testsample1,testsample3
-experimentID	testsample3	"/path/to/testsample3.bam"	testsample1,testsample2
+ID	sample	path	ref_samples	genome_build
+experimentID	testsample1	"/path/to/testsample1.bam"	testsample2,testsample3	GRCh37
+experimentID	testsample2	"/path/to/testsample2.bam"	testsample1,testsample3	GRCh37
+experimentID	testsample3	"/path/to/testsample3.bam"	testsample1,testsample2	GRCh38
diff --git a/resources/README.md b/resources/README.md
@@ -0,0 +1,18 @@
+# Resources
+
+This folder is meant to contain all resources necessary for running the workflow, for example reference sequences or databases. Wherever feasible, they can also be downloaded programmatically via rules defined in the pipeline.
+
+
+
+## Files
+
+- resources
+    - protein_atlas
+        - RNAtableBlood.tsv.gz (from [https://www.proteinatlas.org/download/rna_blood_cell.tsv.zip](https://www.proteinatlas.org/download/rna_blood_cell.tsv.zip))
+        - labels_Blood.tsv
+        - RNAtableTissue.tsv.gz (from [https://www.proteinatlas.org/download/rna_tissue_consensus.tsv.zip](https://www.proteinatlas.org/download/rna_tissue_consensus.tsv.zip))
+        - labels_Tissue.tsv
+        - RNAtableExtended.tsv.gz (from [https://www.proteinatlas.org/download/rna_tissue_consensus.tsv.zip](https://www.proteinatlas.org/download/rna_tissue_consensus.tsv.zip) and [https://www.proteinatlas.org/download/rna_celline.tsv.zip](https://www.proteinatlas.org/download/rna_celline.tsv.zip))
+        - labels_Extended.tsv  
+    - annotations/transcriptAnno-GRCh37.103.tsv.gz (unique GeneIDs, not filtered)
+    - annotations/transcriptAnno-GRCh38.103.tsv.gz (unique GeneIDs, not filtered)
diff --git a/resources/annotations/transcriptAnno-GRCh37.103.tsv.gz b/resources/annotations/transcriptAnno-GRCh37.103.tsv.gz