# multi-DegePrime.yaml
---
################################################################################
# Directories
################################################################################
# Directory that holds the input virus FASTA file(s). Give an absolute path.
input_dir: /share/data3/yangjunbo/multiPCR/multiPrime2/virus_sequence/Rhinovirus
# Directory into which the pipeline writes all of its output files, including
# the summary files.
# NOTE(review): this was previously described as "relative to current
# directory", but the value configured here is an absolute path - confirm which
# forms the pipeline accepts.
results_dir: /share/data3/yangjunbo/multiPCR/multiPrime2/virus_sequence/Rhinovirus/multiPrime1.results
# Directory that receives the log files created during processing.
log_dir: /share/data3/yangjunbo/multiPCR/multiPrime2/virus_sequence/Rhinovirus/multiPrime1.logs
# Directory containing the pipeline scripts.
# NOTE(review): the paths in this section look specific to one machine; adjust
# them before running the pipeline elsewhere.
scripts_dir: /share/data3/yangjunbo/git_storage/multiPrime/scripts
################################################################################
# Sample information
################################################################################
# Name(s) of the input file(s), one list entry per sample (for example:
# sample1). Each name is reused throughout the entire pipeline as the name of
# the corresponding output sample.
# NOTE(review): this list was previously described as naming "fastq" files,
# while input_dir is documented as holding FASTA files - confirm the format.
virus:
  - Rhinovirus
################################################################################
# Preprocessing
################################################################################
#-------------------------------------------------------------------------------
# Clustering with cd-hit
#-------------------------------------------------------------------------------
# Sequence-identity threshold passed to cd-hit as `-c`.
# Suggested value: 0.7; minimum: 0.65.
# If the sequences in the input file are highly similar to each other, this can
# be set to 0.9.
# Known bug - do not use 1: with a value of 1 the pipeline generates primers
# with the no-cluster method.
identity: 0.7
# Maximum number of sequences per cluster used in the multiple alignment
# (muscle). That many sequences are chosen at random from the cluster for the
# following steps.
# This does not affect the primer coverage calculated for the final primer set,
# because all sequences in the cluster are used for the final coverage check.
max_seq: 500
#-------------------------------------------------------------------------------
# Primer design with DegePrime (DegePrime.pl)
#-------------------------------------------------------------------------------
# Passed to DegePrime.pl as `-d`; described by the pipeline as the number of
# degenerate nucleotides.
# NOTE(review): presumably the maximum degeneracy allowed for a primer - confirm
# against the DegePrime.pl usage text.
degeneracy: 10
# Passed to DegePrime.pl as `-l`: length of the degenerate primers.
primer_len: 18
#-------------------------------------------------------------------------------
# Candidate primers from the DegePrime output
#-------------------------------------------------------------------------------
# Parameters of get_degePrimer. Details are listed in the usage of
# get_degePrimer.py.
# Filter primers by match rate:
#   [number of sequences that match the selected primer] /
#   [number of sequences that span the selected primer]
# Only primers whose fraction is greater than this value are retained.
# Default: 0.6.
# Some clusters contain fewer than 10 sequences; the default threshold of 0.6
# is not suitable for them, so the threshold can be adjusted here.
# If a cluster is very large (> 100000 sequences), 500 sequences are taken at
# random to produce the output.
coverage: 0.5
# PCR product size: primers are filtered by product size. Default: 150,400.
# Written as two comma-separated values (presumably lower,upper bound).
# Quoted so that every YAML loader returns the literal comma-separated string
# instead of attempting to type it as a number.
PRODUCT_size: "150,1200"
# Filter primers by GC content. Default: 0.45,0.65.
# Written as two comma-separated values (presumably lower,upper bound); quoted
# for the same reason as PRODUCT_size.
gc_content: "0.2,0.7"
# Hairpin filter: distance of the minimal paired bases. Default: 4.
# Example (the distance is the number of X): AGCT[XXXX]AGCT
distance: 4
# Filter primers by rank number (match number): number of sequences from
# DegePrime.
# NOTE(review): the exact meaning of this threshold is not clear from this
# file - confirm against the usage of get_degePrimer.py.
rank_number: 1000
# Filter primers by the position of degenerate bases. For example, `-t 4` means
# that no degenerate base may appear within the last four bases of a primer
# during primer pre-filtering. Default: 4.
end: 4
# Adaptor sequences used for the subsequent NGS step; hairpin and dimer
# detection is performed for adaptor-primer combinations.
# Format: two comma-separated sequences, for example
#   TCTTTCCCTACACGACGCTCTTCCGATCT,TCTTTCCCTACACGACGCTCTTCCGATCT
# If the adaptor is not known yet but one is needed for sequencing, the example
# sequence can be used instead, because the adaptor sequence generally does not
# affect the primer set.
# If no adaptor is needed for the next step, set adaptor: ",".
adaptor: "TCTTTCCCTACACGACGCTCTTCCGATCT,TGGAGTTCAGACGTGTGCTCTTCCGATCT"
#-------------------------------------------------------------------------------
# Core primer set
#-------------------------------------------------------------------------------
# Minimum number of sequences in each cluster.
# NOTE(review): presumably clusters with fewer sequences than this are left out
# of the core primer set - confirm against the pipeline rules.
core_number: 10
#-------------------------------------------------------------------------------
# Maximal primer set from the get_degePrimer output
#-------------------------------------------------------------------------------
# Parameters of get_Maxprimerset_V4.
# This step does not consider dimer formation at the 5' end of the primers,
# because it is designed for NGS: no dimers are expected to form between the
# NGS adaptors and the primers.
# `step` is the distance between primers, i.e. the number of columns from
# primer1_F to primer2_F in the input table. Do not change this parameter.
# Example row and its column layout:
#GTGTGCTCGTGACCTTGA CCACAATTGCCACGTTAG 159 3 1.0 GTGTGCTCGTGACCTTGA GGTGTCTTGTTGGAAGGG 181 3
# primer1_F Primer1_R Product_len number_match coverage primer2_F primer2_R Product_len number_match
# 1 2 3 4 5 next(1) 2 3 4
step: 5
# Selection method; the original note reads "maximal maximum".
# NOTE(review): the accepted values and the meaning of "T" are not documented in
# this file - confirm against the usage of get_Maxprimerset_V4.
method: T