# multi-DegePrime.yaml
# Configuration file; the file name above is kept as a comment so that the
# stream contains exactly one YAML document.

---
################################################################################
# directories
################################################################################
# Directory in which the input virus fasta file(s) are located.
# Absolute path.
input_dir: /share/data3/yangjunbo/multiPCR/multiPrime2/virus_sequence/Rhinovirus

# Directory in which the pipeline writes all output files
# (this also includes the summary files). An absolute path is used here.
# NOTE(review): this comment previously said "relative to current directory";
# confirm whether relative paths are also accepted.
results_dir: /share/data3/yangjunbo/multiPCR/multiPrime2/virus_sequence/Rhinovirus/multiPrime1.results

# Directory to which the log files created during processing are written.
log_dir: /share/data3/yangjunbo/multiPCR/multiPrime2/virus_sequence/Rhinovirus/multiPrime1.logs

# Directory containing the pipeline scripts.
scripts_dir: /share/data3/yangjunbo/git_storage/multiPrime/scripts

################################################################################
# sample(s) information
################################################################################
# Name(s) of the input fasta file(s), one list entry per sample
# (for example: sample1).
# NOTE(review): this comment previously said "fastq", but input_dir is
# documented as holding fasta files - confirm. Presumably the name is the file
# name without its extension - confirm.
# Each name is used throughout the entire pipeline to name the output samples.
virus:
  - Rhinovirus
################################################################################
# preprocessing
################################################################################

#-------------------------------------------------------------------------------
# cluster by cd-hit
#-------------------------------------------------------------------------------
# Passed to cd-hit as -c {identity}. Suggested value: 0.7; minimum: 0.65.
# If the sequences in your input file are highly similar to each other,
# you can set this parameter to 0.9.
# KNOWN BUG: do not set this parameter to 1. With a value of 1, primers are
# generated by the no-cluster method instead.
identity: 0.7
# Maximum number of sequences per cluster used in the multiple alignment
# (muscle). {max_seq} sequences are chosen at random from each cluster for the
# next steps.
# This does not affect the primer coverage calculated for the final primer set,
# because all sequences in the cluster are used to check the final coverage.
max_seq: 500

#-------------------------------------------------------------------------------
# primer design by DegePrime
#-------------------------------------------------------------------------------
# Passed to DegePrime.pl as -d {degeneracy}.
# NOTE(review): previously described as "number of the degenerate nucleotide";
# DegePrime documents -d as the maximum degeneracy of a primer - confirm.
degeneracy: 10
# Passed to DegePrime.pl as -l {primer_len}: length of the degenerate primers.
primer_len: 18

#-------------------------------------------------------------------------------
## get candidate primers from DegePrime output
##-------------------------------------------------------------------------------
# Parameter of get_degePrimer.
# Filters primers by match rate:
#   [number of sequences that match the selected primer] /
#   [number of sequences that span the selected primer].
# Only primers with a match rate greater than this value are retained.
# Default: 0.6.
# Some clusters contain fewer than 10 sequences; in that case the default
# threshold of 0.6 may not be suitable and you can change it here.
# If a cluster is very large (> 100000 sequences), 500 sequences are taken at
# random to produce the output.
# Details are listed in the usage message of get_degePrimer.py.
coverage: 0.5

# PCR product size: primers are filtered by product size.
# Given as a comma-separated pair (presumably min,max - confirm).
# Quoted so that every YAML parser reads the value as a string.
# Default: 150,400.
PRODUCT_size: "150,1200"

# GC content: primers are filtered by GC content.
# Given as a comma-separated pair (presumably min,max - confirm).
# Default: 0.45,0.65.
gc_content: "0.2,0.7"

# Hairpin filter: the distance between the minimal paired bases. Default: 4.
# Example (the distance is the number of X): AGCT[XXXX]AGCT
distance: 4

# Filter primers by rank number (match number):
# number of sequences from DegePrime.
# NOTE(review): the exact meaning of this threshold is not explained here;
# see the usage message of get_degePrimer.py.
rank_number: 1000
# Filter primers by the position of degenerate bases. For example, [-t 4] means
# that degenerate bases must not appear in the last four bases of a primer
# during primer pre-filtering. Default: 4.
end: 4

# Adaptor sequences that will be used for the subsequent NGS step. They are
# used for hairpin and dimer detection between adaptor and primer.
# Format: two comma-separated sequences, for example:
#   TCTTTCCCTACACGACGCTCTTCCGATCT,TCTTTCCCTACACGACGCTCTTCCGATCT
# If you do not know which adaptor will be used but need one for the
# subsequent sequencing, you can use the example sequences instead, because
# the adaptor sequence generally does not affect the primer set.
# If you do not need an adaptor for the next step, set adaptor: ",".
adaptor: "TCTTTCCCTACACGACGCTCTTCCGATCT,TGGAGTTCAGACGTGTGCTCTTCCGATCT"
#-------------------------------------------------------------------------------
### get core primer set
###-------------------------------------------------------------------------------
# Minimum number of sequences in each cluster.
# NOTE(review): presumably clusters with fewer sequences than this are not
# used for the core primer set - confirm against the pipeline.
core_number: 10
#-------------------------------------------------------------------------------
## get max primer set from get_degePrimer
##-------------------------------------------------------------------------------
# Parameters of get_Maxprimerset_V4.
# This step does not consider dimer formation at the 5' end of the primers,
# because it is designed for NGS: dimers will not form between the NGS adaptors
# and the primers.

# Distance between primers: the number of columns from primer1_F to primer2_F
# in the table layout shown below. Do not change this parameter.
#GTGTGCTCGTGACCTTGA   CCACAATTGCCACGTTAG      159              3            1.0        GTGTGCTCGTGACCTTGA      GGTGTCTTGTTGGAAGGG      181                3
#    primer1_F             Primer1_R       Product_len    number_match    coverage        primer2_F               primer2_R      Product_len     number_match
#       1                    2                 3               4             5             next(1)                     2               3                 4

step: 5

# maximal maximum
# NOTE(review): the meaning of this option and of the value T is not
# documented in this file - confirm against the usage of get_Maxprimerset_V4.
method: T