forked from agalitsyna/RedClib
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathproject.yml
279 lines (251 loc) · 13.1 KB
/
project.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
input:
fastq_paths:
# Input syntax:
# library_name:
# - forward.library.fastq or .gz
# - reverse.library.fastq or .gz
test:
- /home/agalicina/soft/RedClib/data/fastq/sample_R1_001.fastq.gz
- /home/agalicina/soft/RedClib/data/fastq/sample_R2_001.fastq.gz
redc-fibr:
- sra:SRR10010324
oligos: # Paths to files with oligos
# Bridge fasta has 1 record:
bridge_forward: /home/agalicina/soft/RedClib/data/oligos/br_37_for.fasta
bridge_reverse: /home/agalicina/soft/RedClib/data/oligos/br_37_rev.fasta
# GGG fasta has 1 record:
ggg: /home/agalicina/soft/RedClib/data/oligos/ggg.fasta
# Adaptors files have 2 records for two variants of adaptors:
adaptor_forward: /home/agalicina/soft/RedClib/data/oligos/for_20.fasta
adaptor_reverse: /home/agalicina/soft/RedClib/data/oligos/rev_20.fasta
adaptor_reverse_short: /home/agalicina/soft/RedClib/data/oligos/rev_16.fasta
oligos_variants: # Numbers of records (variants) in the files with oligos
bridge_forward: 1 #3 # Modify this for multiple variants, e.g. br_37_for_variants.fasta
bridge_reverse: 1 #3
ggg: 1
adaptor_forward: 2
adaptor_reverse: 2
adaptor_reverse_short: 1
genome:
assembly_name: hg19
# If auto_download_genome in on, download the genome with assembly_name from UCSC,
# build the index and calculate the chromosome sizes.
auto_download_genome: true
# If auto_download is off, the local files will be used instead.
# "fasta" and "chromsizes" can be URLs instead.
# Note that if "fasta" is URL then index_prefix is ignored and index will be build de novo.
#fasta: /home/user/GENOMES/hg19/hg19.fa
#chromsizes: /home/user/GENOMES/hg19/hg19.chromsizes.txt
#index_prefix: /home/user/GENOMES/hg19/hisat2/genome
rna_annotation:
# We use RNA annotation from Gencode, it's custom name:
rna_annotation_name: gencode19
# Can be local file or URL:
genes_gtf: ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_19/gencode.v19.chr_patch_hapl_scaff.annotation.gtf.gz
protocol:
read_length:
test: 151
redc-fibr: 125
# Keys for loading restriction enzymes data:
renz:
# Syntax, see filters.restriction for example usage of keys:
# key: actual_restr_enz_name
nla: NlaIII
mme: MmeI
dpn: DpnII
mse: MseI
# Length of bridge (should correspond to the bridge length in bridge_forward/bridge_reverse files):
bridge_length: 37
run:
# Input fastq files are split into chunks with this number of reads per chunk:
chunksize: 1000000
# For real-life usage, values aroun 1e6 are optimal
# The larger the chunk size, the more memory will be consumed.
# With fine chunks, the merge step will take a long time.
# Length of read for detection of PCR duplications:
fastuniq_crop: 50
# Parameters for reads quality trimming:
params_trimmomatic: "SLIDINGWINDOW:5:26 MINLEN:0"
# Length of RNA segments for the detection of overlap of forward/reverse read
rna_complementary_length: 14
# Check presence of GA in the bridge:
check_GA: true
# DNA extension (e.g. restriction site). Set to "" or remove if no restirction site extension is needed.
dna_extension: "CATG"
# Min size for DNA/RNA1/RNA2 part so that it's subjected to mapping:
min_substring_size: 14
# Run restriction by these enzymes for the specified segments:
check_restriction: true
restriction_check:
dna:
- nla
dna_nonextended:
- nla
rna1:
- nla
- mmep # Mme recognised on plus chain
- mmen # Mme recognised on minus chain
rna2:
- nla
- mmep
- mmen
# Filters are named criteria that will be reported in the final file with statistics.
# Write filters as if they were Python operations on numpy vectors (numpy is imported as np).
### List of available variables:
# id, is_notPCRdup, seqR1, seqR1_len, seqR2, seqR2_len, trimF, trimR,
# has_GA, has_nobridge, bridge_end, bridge_nmm, bridge_start, has_noggg, ggg_end, ggg_start,
# dna_R1_len_notrim, dna_R1_len_trim, dna_R1_start, dna_R1_end, dna_R1_end_trim,
# dna_chr, dna_start, dna_end, dna_strand, dna_cigar, dna_is_mapped, dna_is_not_multi, dna_nlen,
# dna_noNla_chr, dna_noNla_is_mapped, dna_noNla_is_not_multi, dna_noNla_cigar,
# rna1_R1_start, rna1_R1_end, rna1_R1_end_trim, rna1_R1_len_notrim, rna1_R1_len_trim,
# rna1_chr, rna1_start, rna1_end, rna1_strand, rna1_cigar, rna1_is_mapped, rna1_is_not_multi, rna1_nlen,
# rna2_R2_start, rna2_R2_end, rna2_R2_end_trim, rna2_R2_len_notrim, rna2_R2_len_trim,
# rna2_chr, rna2_start, rna2_end, rna2_strand, rna2_cigar, rna2_is_mapped, rna2_is_not_multi, rna2_nlen
#
### For restriction filters, the keys and renzymes are loaded from protocol.renz section of this file.
# The segments are compared against the renzymes recognition sites in run.restriction_check section.
# If restriction enzyme with name nla appeared for rna1 fragment in restriction_check, and it corresponds
# to enzyme with palindromic recognition site (e.g. NlaIII), the following variables are accessible:
# rna1_end_nla_left, rna1_end_nla_right, rna1_start_nla_left, rna1_start_nla_right
### If enzyme is not palindromic, then the DNA strand matters, and we hit + (p) and - (n) strands separately:
# rna1_start_mmep_left, rna1_start_mmep_right, rna1_start_mmen_left, rna1_start_mmen_right,
# rna1_end_mmep_left, rna1_end_mmep_right, rna1_end_mmen_left, rna1_end_mmen_right
filters:
# Specify the restriction filters:
restriction:
rna2_mme_failed: # <- filter name
- '(rna2_strand=="+") & (rna2_start_mmep_left<=-26) & (rna2_start_mmep_left>=-27)'
- '(rna2_strand=="+") & (rna2_start_mmen_right<=19) & (rna2_start_mmen_right>=-18)'
- '(rna2_strand=="-") & (rna2_end_mmep_left<=-25) & (rna2_end_mmep_left>=-26)'
- '(rna2_strand=="-") & (rna2_end_mmen_right<=21) & (rna2_end_mmen_right>=20)'
rna2_nla_failed:
- '(rna2_strand=="+") & (rna2_start_nla_left>=-5) & (rna2_start_nla_left<=0)'
- '(rna2_strand=="+") & (rna2_start_nla_right>=0) & (rna2_start_nla_right<=1)'
- '(rna2_strand=="-") & (rna2_end_nla_left>=-5) & (rna2_end_nla_left<=0)'
- '(rna2_strand=="-") & (rna2_end_nla_right>=0) & (rna2_end_nla_right<=1)'
rna1_nla_failed:
- '(rna1_strand=="+") & (rna1_start_nla_left>=-5) & (rna1_start_nla_left<=0)'
- '(rna1_strand=="+") & (rna1_start_nla_right>=0) & (rna1_start_nla_right<=1)'
- '(rna1_strand=="-") & (rna1_end_nla_left>=-5) & (rna1_end_nla_left<=0)'
- '(rna1_strand=="-") & (rna1_end_nla_right>=0) & (rna1_end_nla_right<=1)'
# Understanding the distance filtration for NlaIII example:
# # For mappings on positive RNA strand (+):
# -5 CATGN|->N
# -4 CATG|->NN
# -3 CAT|->GNN
# -2 CA|->TGNN
# -1 C|->ATGNN
# 0 |->CATGNN
# 1 |->NCATGN
# # For mappings on negative RNA strand (-)
# -5 |->NCATGN
# -4 |->CATGNN
# -3 C|->ATGNN
# -2 CA|->TGNN
# -1 CAT|->GNN
# 0 CATG|->NN
# 1 CATGN|->N
# Pattern for filtering the relevant chromosomes:
canonical_chromosomes: "chr[0-9,X,Y]|chr[0-9][0-9]"
additional_filters:
RNAsDirectionPassed: "(rna2_chr==rna1_chr) & (rna2_strand!=rna1_strand)"
RNADNASamePos: "(dna_chr==rna2_chr) & (dna_strand!=rna2_strand) & (np.abs(dna_start-rna2_end)<=1)"
# Filter 1: Read is not PCR duplication
F1_notDup: "(is_notPCRdup)"
# Filter 2: there is a bridge in the read, with last AG letters, it is not cut by the quality filter
F2_goodBridge: "~has_nobridge & has_GA"
# Filter 3: Reverse read starts with GGG
F3_goodGGG: "~has_noggg"
# Filter 4: DNA length 18-20
F4_goodDNAlength: "(dna_R1_len_notrim>=18)&(dna_R1_len_notrim<=20)"
# Filter 5: RNA is >= 14 bp before trimming and RNA1 >= 14 bp
F5_goodRNAlength: "(rna1_R1_len_notrim>=14)&(rna2_R2_len_notrim>=14)"
# Filter 6: Trimming passed
F6_passedTrimming: "(trimF>0)&(trimR>0)"
# Filter 6a: Trimmed segments are > 14 bp
F6a_passedTrimming_strict: "(dna_R1_len_trim_adjusted>=18)&(dna_R1_len_trim_adjusted<=20)&(rna1_R1_len_trim_adjusted>=14)&(rna2_R2_len_trim_adjusted>=14)"
# Filter 7: All three segments are uniquely mapped to chromosomes
F7_uniqueCanonical: "dna_is_not_multi & rna1_is_not_multi & rna2_is_not_multi & dna_is_mapped & rna1_is_mapped & rna2_is_mapped & dna_chr_canonical & rna1_chr_canonical & rna2_chr_canonical"
F7a_RNAsameChr: "rna1_chr==rna2_chr"
F7b_RNAoppositeStrands: "rna1_strand!=rna2_strand"
# Filter 8a: Same pos of DNA and RNA
F8a_notRNADNASamePos: "~RNADNASamePos"
# Filter 8b: Confirm absence of NlaIII and MmeI influence on RNA
F8b_RNAnotDigested: "~rna1_nla_failed & ~rna2_nla_failed & ~rna2_mme_failed"
# Filter 9: RNA1 and RNA distance is small enough
F9_closeRNA: "np.abs(rna2_start-rna1_start)<1e4"
# Applying sequential filters, similar ones were used in the supplementary table for RedC paper:
V0: "(np.ones(len(F1_notDup)))"
V1: "(F1_notDup)"
V2: "(F1_notDup & F2_goodBridge)"
V3: "(V2 & F3_goodGGG)"
V4: "(V3 & F4_goodDNAlength)"
V35: "(V3 & F5_goodRNAlength)"
V5: "(V4 & F5_goodRNAlength)"
V24: "(V2 & F4_goodDNAlength)"
V25: "(V2 & F5_goodRNAlength)"
V6: "(V5 & F6_passedTrimming)"
V6a: "(V5 & F6a_passedTrimming_strict)"
V7: "(V5 & F6_passedTrimming & F6a_passedTrimming_strict & F7_uniqueCanonical)"
V7a: "(V7 & F7a_RNAsameChr)"
V7b: "(V7 & F7a_RNAsameChr & F7b_RNAoppositeStrands)"
V8a: "(V7b & F8a_notRNADNASamePos)"
V8b: "(V7b & F8a_notRNADNASamePos & F8b_RNAnotDigested)"
V9: "(V8b & F9_closeRNA)"
# Report the sum over the following columns:
report_stats:
- V0
- V1
- V2
- V3
- V4
- V5
- V35
- V24
- V25
- V6
- V6a
- V7
- V7a
- V7b
- V8a
- V8b
- V9
# Output table with filtered reads:
output:
make_final_table: true
tables:
# You may specify one or multiple tables for different filters and sets of output columns.
# Syntax:
# table_name:
# filter: "filter_name"
# header: "<list-of-columns>"
resulting_table1:
filter: "V9"
header: "id rna1_chr rna1_start rna1_end rna1_strand rna1_cigar rna2_chr rna2_start rna2_end rna2_strand rna2_cigar dna_chr dna_start dna_end dna_strand dna_cigar"
# Demonstration of alternative table structure. Note that this table will be huge and useless for real data:
resulting_table2:
filter: "V1"
header: "id V9 seqR1 seqR2"
make_cooler: true
cooler_properties: # Describe what to use for coolers (required):
table_name: resulting_table1
resolution: 1000
c1: 2 # Column with chromosome name for RNA part
c2: 12 # Column with chromosome name for DNA part
p1: 3 # Column with start for RNA part
p2: 13 # Column with start for DNA part
# Storage of intermediary and final files:
dirs:
genome: 'results/genome/'
fastq: 'results/fastq/'
table: 'results/table/'
cindex: 'results/cindex/'
cout: 'results/cout/'
filtered_fastq: 'results/filtered_fastq/'
sam: 'results/sam/'
bed: 'results/bed/'
hdf5: 'results/hdf5/'
stats: 'results/stats/'
final_table: 'results/final_table/'
cooler: 'results/cooler/'