forked from ohsu-cedar-comp-hub/ChIP-seq-pipeline
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Snakefile
127 lines (108 loc) · 4.84 KB
/
Snakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
__author__ = "Breeshey Roskams-Hieter"
__email__ = "[email protected]"
__license__ = "MIT"
# improved by Garth Kong
"""Computation Hub omic data processing pipeline"""
# NOTE(review): the plotly imports are not referenced anywhere in this file;
# presumably rule bodies in the included .smk files rely on them being
# importable — confirm before removing.
import datetime
import sys
import os
import pandas as pd
import plotly as plt
import plotly.express as px
import plotly.graph_objects as go
# Run timestamp, e.g. "2024-01-31_12:00:00" (not used elsewhere in this file).
timestamp = ('{:%Y-%m-%d_%H:%M:%S}'.format(datetime.datetime.now()))
# Snakemake directive: load pipeline settings from config.yaml in the cwd.
configfile:"config.yaml"
project_id = config["project_id"]  # prefix used to name project-level outputs
seq_type = config["seq_type"]      # "SE" for single-end, anything else is treated as paired-end
# Sample metadata table keyed by SampleID; read everything as str so values
# like the Peak_call column compare cleanly against string literals.
md = pd.read_table(config["samples"], index_col=["SampleID"], dtype=str)
# Contrast definitions for the DESeq2 step; must provide a "Factor" column
# (consumed below to build FACTORS).
contrasts = pd.read_csv(config['contrasts'], sep = "\t")
# Discover case FASTQs by filename pattern. Single-end files are named
# "{sample}.fastq.gz"; otherwise paired-end "{sample}_R1.fastq.gz" is assumed.
if config["seq_type"]=="SE":
    CASES, = glob_wildcards("samples/cases/{sample}.fastq.gz")
    COND_REPS, FACTORS, = glob_wildcards("samples/cases/{cond_rep}_{factor}.fastq.gz")
else:
    CASES, = glob_wildcards("samples/cases/{sample}_R1.fastq.gz")
    COND_REPS, FACTORS, = glob_wildcards("samples/cases/{cond_rep}_{factor}_R1.fastq.gz")
# CASES glob for "{cond}{rep}_{factor}"
# COND_REPS glob for "{cond}{rep}" only
# FACTORS glob for "{factor}" only
if config["seq_type"]=="SE":
    CONTROLS, = glob_wildcards("samples/controls/{sample}.fastq.gz")
else:
    CONTROLS, = glob_wildcards("samples/controls/{sample}_R1.fastq.gz")
CASES = sorted(CASES)
CONTROLS = sorted(CONTROLS)
COND_REPS = sorted(set(COND_REPS))
## multiple samples may use the same control input/IgG files
CONTROLS_UNIQUE = list(set(CONTROLS))
SAMPLES = CASES + CONTROLS_UNIQUE
# NOTE(review): this overwrites the glob-derived FACTORS above — the
# authoritative factor list is the contrasts table, not the filenames.
FACTORS = set(contrasts.Factor)
# Pre-create per-rule log directories so cluster jobs have somewhere to write
# stdout/stderr before the scheduler launches them. Newly created paths are
# printed so the first run shows what was set up.
rule_dirs = ['mapReads','makeTracks','bb2bed','call_peaks']
for rule in rule_dirs:
    # Compute the path once instead of re-joining it in both branches.
    log_out = os.path.join(os.getcwd(), 'logs', rule)
    if not os.path.exists(log_out):
        # exist_ok guards against the TOCTOU race where another pipeline
        # invocation creates the directory between the check and makedirs,
        # which previously raised FileExistsError.
        os.makedirs(log_out, exist_ok=True)
        print(log_out)
def get_peak(wildcards):
    """Look up the MACS2 peak-calling mode for a sample.

    Reads the 'Peak_call' column of the module-level metadata table ``md``
    (indexed by SampleID, loaded as str). Returns "broad" or "narrow"; any
    other value yields the sentinel string "ERROR" so a misconfigured sample
    surfaces visibly downstream instead of silently picking a mode.
    """
    # Select with a scalar column label so .loc returns the cell value
    # directly. The original list-selection (["Peak_call"]) produced a
    # single-element ndarray, relying on fragile array truthiness for the
    # string comparison.
    peak_call = md.loc[wildcards.sample, "Peak_call"]
    if peak_call == "broad":
        return "broad"
    elif peak_call == "narrow":
        return "narrow"
    else:
        return "ERROR"
def message(mes):
    """Emit a progress note on stderr, prefixed with "|--- " for easy grepping."""
    print("|--- " + mes, file=sys.stderr)
# Announce the work plan on stderr so it appears in scheduler logs.
for sample in SAMPLES:
    message("Sample " + sample + " will be processed")
for case in CASES:
    message("case " + case + " will be expanded")
# Optional target: an HTML summary report, requested only when gen_report is
# enabled in config.yaml.
essential_report = []
if config["gen_report"] == True: essential_report.append("results/essential_report.html"); message("Generating essential report")
# ChIP-Screen
# Enumerate the {db}/{cell_line} directory layout of the shared public
# ChIP bed library on this cluster (hard-coded absolute path).
DB, CELL_LINE, = glob_wildcards("/home/groups/MaxsonLab/kongg/chip_seq/data/beds/{db}/{cell_line}/tf_chip.bed.gz")
chip_screen = []
if config["chip_screen"] == True:
    message("Will report results of the ChIP-screener")
    # One screened bed plus one .out summary per factor per db/cell-line pair.
    for factor in list(FACTORS):
        for db, cell_line in zip(DB, CELL_LINE):
            s = "/".join(["results", "chip_screen", db, cell_line, factor + "_screen.bed.gz"])
            f = "/".join(["results", "chip_screen", db, cell_line, factor + "_screen.out"])
            chip_screen.append(s)
            chip_screen.append(f)
# Example cluster invocation:
# snakemake -j 99 --use-conda --rerun-incomplete --latency-wait 60 --keep-going --profile slurm --cluster-config cluster2.json
# Lightweight rules that run on the submit host instead of as cluster jobs.
localrules: frip_plot, de_stats, align_stats, merge_counts, essential_report
# Target rule: listing every final output here drives the whole DAG.
rule all:
    input:
        # pre-process = align + make tracks + macs2 + homer
        expand("samples/bams/{sample}.mapped.dedup.sorted.bam", sample = SAMPLES),
        expand("samples/bigBed/{sample}.all.bb", sample = SAMPLES),
        expand("samples/bigwig/{sample}.bw", sample = SAMPLES),
        expand("samples/macs/{sample}/{sample}_peaks.narrowPeak", sample = CASES),
        expand("results/motifs/{sample}/homerResults.html", sample = CASES),
        "results/qc/align_stats.txt",
        # pre-DESeq2 = consensus peaks + counts table
        "samples/macs/all_peaks.bed",
        "results/qc/consensus_stats.txt",
        expand("samples/macs/{factor}_peaks.bed", factor = FACTORS),
        expand("samples/macs/counts/{sample}_{factor}.txt", sample = COND_REPS, factor = FACTORS),
        # DESeq2 = pca + normalized counts + analyze contrasts
        "results/de/de_stats.txt",
        expand("results/de/plots/{factor}_pca.png", factor = FACTORS),
        expand(["results/de/{factor}/{factor}_deseq2_norm_counts.txt",
            "results/de/{factor}/{factor}_deseq2_lognorm_counts.txt"], factor = FACTORS),
        directory(expand("results/de/{factor}", factor = FACTORS)),
        # quality control = frip + fastqc-screen + duplicate rate
        # NOTE(review): R1/R2 screens are requested even in SE mode — confirm
        # the fastq_screen rule tolerates missing R2 for single-end runs.
        "results/qc/frip.html",
        expand(["results/qc/fastq_screen/{cases}/{cases}_{dir}_screen.{ext}",
            "results/qc/fastq_screen/{controls}/{controls}_{dir}_screen.{ext}"],
            dir = ["R1", "R2"],
            cases = CASES,
            controls = CONTROLS_UNIQUE,
            ext = ["png", "txt", "html"]),
        # essential report (empty lists when the corresponding config flags are off)
        essential_report,
        chip_screen
# Rule definitions live in separate files; included in workflow order so
# later files may reference rules defined in earlier ones.
include: "rules/align.smk"
include: "rules/peaks.smk"
include: "rules/qc.smk"
include: "rules/deseq2_report.smk"