From 4bde5bc9b89e305ee2d683f5c6268045a7e6c695 Mon Sep 17 00:00:00 2001 From: joybio Date: Wed, 19 Apr 2023 13:04:10 +0800 Subject: [PATCH] update --- multiPrime3.py | 469 +++++++++++++++++++++++++++++++++++++++++++++++ multiPrime3.yaml | 129 +++++++++++++ 2 files changed, 598 insertions(+) create mode 100644 multiPrime3.py create mode 100644 multiPrime3.yaml diff --git a/multiPrime3.py b/multiPrime3.py new file mode 100644 index 0000000..4726a9b --- /dev/null +++ b/multiPrime3.py @@ -0,0 +1,469 @@ +configfile: "multiPrime2.yaml" +#__version__ = "2.0.2" +#__date__ = "2022-7-28" +#__author__ = "Junbo Yang" +#__email__ = yang_junbo_hi@126.com; 1806389316@pku.edu.cn +#__description__ = "A Snakemake workflow to design multiPCR primers and get Max_primerset" + +import os + +virus = config["virus"] + +def aggregate_input(wildcards): + checkpoint_output = checkpoints.extract_cluster_fa.get(**wildcards).output[1] + return expand(config["results_dir"] + "/Clusters_cprimer/{i}.candidate.primers.txt", + i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fa")).i) +#Here a new directory will be created for each sample by the checkpoint. +#After completion of the checkpoint. the aggregate_input function is re-evaluated as previously. +#The values of the wildcard i is this time used to expand the pattern "post/{sample}/{i}.txt", +#such that the rule intermediate is executed for each of the determined clusters. + +rule all: + ## LOCAL ## +# ''' +# Defines the target rule by specifying all final output files. +# Additionally, the cluster_logs dir is deleted if +# snakemake was run locally. +# ''' + input: + config["results_dir"] + "/Primers_set/candidate_primers_sets.txt", + config["results_dir"] + "/Primers_set/final_maxprimers_set.xls", + config["results_dir"] + "/Primers_set/Coverage_stast.xls", + config["results_dir"] + "/Primers_set/candidate_primers_sets.number", + config["results_dir"] + "/Primers_set/final_maxprimers_set.fa.dimer", + config["results_dir"] + "/Core_primers_set/core_Coverage_stast.xls", + config["results_dir"] + "/Core_primers_set/core_final_maxprimers_set.fa.dimer", + config["results_dir"] + "/Core_primers_set/core_candidate_primers_sets.number", + config["results_dir"] + "/Core_primers_set/BWT_coverage/core_final_maxprimers_set.out" + +#------------------------------------------------------------------------------------------- +# seq_format rule 1: Dependency packages - python +#------------------------------------------------------------------------------------------- +rule seq_format: + input: + config["input_dir"] + "/{virus}.fa" + output: + config["results_dir"] + "/Total_fa/{virus}.format.fa" + params: + config["scripts_dir"] + message: + "Step1: format fasta file.." + shell: + ''' + python {params}/seq_format.py -i {input} -o {output} + ''' +#------------------------------------------------------------------------------------------- +# extract_ rule 2: Dependency packages - None +#------------------------------------------------------------------------------------------- +#rule extract_seq: +# input: +# config["results_dir"] + "/Total_fa/{virus}.format.fa" +# output: +# config["results_dir"] + "/Total_fa/{virus}.format.fa", +# message: +# "Step2: extract from the raw fasta .." +# shell: +# ''' +# cat {input} | grep -A 1 "" > {output[0]} +# ''' +#------------------------------------------------------------------------------------------- +# rmdup rule 3: Dependency packages - cd-hit +#------------------------------------------------------------------------------------------- +rule rmdup: + input: + config["results_dir"] + "/Total_fa/{virus}.format.fa" + output: + config["results_dir"] + "/Total_fa/{virus}.format.rmdup.cluster.fa" + message: + "Step3: Remove duplicated sequence .." + shell: + ''' + cd-hit -M 0 -T 0 -i {input} -o {output} -c 1 + ''' +#------------------------------------------------------------------------------------------- +# bowtie2-index rule 2: Dependency packages - bowtie2 +#------------------------------------------------------------------------------------------- +#rule makedb: +# input: +# config["results_dir"] + "/Total_fa/{virus}.format.rmdup.cluster.fa" +# output: +# touch(config["results_dir"] + "/Bowtie_db/done") +# params: +# output_prefix=config["results_dir"] + "/Bowtie_db/genome" +# message: +# "Step2: Build index for BWT (bowtie2) .." +# shell: +# ''' +# bowtie2-build {input} {params.output_prefix} +# ''' +#------------------------------------------------------------------------------------------- +# cluster_by_identity rule 4: Dependency packages - cd-hit || suggest: identity=0.8 +#------------------------------------------------------------------------------------------- +rule cluster_by_identity: + input: + config["results_dir"] + "/Total_fa/{virus}.format.rmdup.cluster.fa" + output: + config["results_dir"] + "/Total_fa/{virus}.format.rmdup.cluster.uniq.fa", + config["results_dir"] + "/Total_fa/{virus}.format.rmdup.cluster.uniq.fa.clstr" + # It is allowed that output name not used in the shell. + message: + "Step4: Cluster sequences by identity .." + params: + identity=config['identity'] + shell: + ''' + cd-hit -M 0 -T 0 -i {input} -o {output[0]} -c {params.identity} + ''' +#------------------------------------------------------------------------------------------- +# extract_cluster_fa rule 5: Dependency packages - None +#------------------------------------------------------------------------------------------- +checkpoint extract_cluster_fa: + input: + expand(config["results_dir"] + "/Total_fa/{virus}.format.rmdup.cluster.fa", + virus = virus), + expand(config["results_dir"] + "/Total_fa/{virus}.format.rmdup.cluster.uniq.fa.clstr", + virus = virus) + output: + config["results_dir"] + "/cluster.txt", + directory(config["results_dir"] + "/Clusters_fa"), + config["results_dir"] + "/cluster.identities.txt", + config["results_dir"] + "/history.txt", + params: + script = config["scripts_dir"], + max_seq = config["max_seq"], + threshold = config["seq_number_ANI"], + drop = config["drop"], + ani = config["ani"] + message: + "Step5: Extract fasta from cd-hit results .." + shell: + ''' + python {params.script}/extract_cluster.py -i {input[0]} -c {input[1]} \ + -m {params.max_seq} -o {output[0]} -y {output[2]} -d {output[1]}; + + python {params.script}/merge_cluster_by_ANI.py -i {output[0]} -p 20 -t {params.threshold} \ + -o {output[3]} -d {params.drop} -a {params.ani} + ''' + +#------------------------------------------------------------------------------------------- +# merge_cluster 6: Dependency packages - fastANI +#------------------------------------------------------------------------------------------- +#rule merge_cluster: +# input: +# config["results_dir"] + "/cluster.txt" +# output: +# config["results_dir"] + "/history.txt" +# message: +# "Step6: Merging cluster by ANI.." +# shell: +# ''' +# python {params.script}/merge_cluster_by_ANI.py -i {input} -p 20 -t 20 -o {output} +# ''' + +#------------------------------------------------------------------------------------------- +# alignment_by_muscle rule 6: Dependency packages - None +#------------------------------------------------------------------------------------------- +rule alignment_by_muscle: + input: + config["results_dir"] + "/Clusters_fa/{i}.tfa" + output: + config["results_dir"] + "/Clusters_msa/{i}.tmsa" + params: + script = config["scripts_dir"] + resources: + mem_mb = 10000 + message: + "Step6: Alignment by MAFFT/muscle .." + shell: + ''' + python {params.script}/run_mafft.py -i {input} -o {output} + ''' +# muscle -in {input} -out {output} +# for long input. +# mafft --auto {input} > {output} +#------------------------------------------------------------------------------------------- +# multiPrime rule 7: Dependency packages - multiPrime-core +#------------------------------------------------------------------------------------------- +rule multiPrime: + input: + rules.alignment_by_muscle.output + output: + config["results_dir"] + "/Clusters_primer/{i}.top.primer.out" + log: + config["log_dir"] + "/multiPrime_{i}.log" + resources: + mem_mb = 10000 + params: + script = config["scripts_dir"], + dege_number = config["dege_number"], + degeneracy = config["degeneracy"], + primer_len = config["primer_len"], + min_PCR_size = config["PRODUCT_size"].split(",")[0], + variation = config["variation"], + coordinate = config["coordinate"], + GC = config["gc_content"], + coverage = config["coverage"] + message: + "Step7: Design primers by multiPrime .." + shell: + ''' + python {params.script}/multiPrime-core_V16.py -i {input} -n {params.dege_number} \ + -d {params.degeneracy} -v {params.variation} -c {params.coordinate} \ + -g {params.GC} -s {params.min_PCR_size} -l {params.primer_len} \ + -o {output} -f {params.coverage} -p 1 2>&1 > {log} + ''' +#------------------------------------------------------------------------------------------- +# get_degePrimer rule 8: Dependency packages - pandas, biopython, math, operator,functools +#------------------------------------------------------------------------------------------- +rule get_multiPrime: + input: + primer = config["results_dir"] + "/Clusters_primer/{i}.top.primer.out", + ref_fa = config["results_dir"] + "/Clusters_fa/{i}.tfa" + output: + config["results_dir"] + "/Clusters_cprimer/{i}.candidate.primers.txt" + log: + config["log_dir"] + "/get_multiPrime_{i}.log" + resources: + mem_mb = 10000 + params: + script = config["scripts_dir"], + fraction = config["coverage"], + size = config["PRODUCT_size"], + # maxseq=config["max_seq"], + gc_content = config["gc_content"], + distance = config["distance"], + adaptor = config["adaptor"], + end = config["end"] + message: + "Step8: Filter candidate primeri pairs for each cluster (hairpin, dimer (F-R) check) .." + shell: + ''' + python {params.script}/get_multiPrime.py -i {input.primer} -r {input.ref_fa} \ + -f {params.fraction} -s {params.size} -g {params.gc_content} -e {params.end} \ + -d {params.distance} -a {params.adaptor} -m 0\ + -o {output} -p 1 2>&1 > {log} + ''' + +#------------------------------------------------------------------------------------------- +# aggregate_candidate_primers rule 9: Dependency packages - None +#------------------------------------------------------------------------------------------- +rule aggregate_candidate_primers: + input: + aggregate_input + output: + config["results_dir"] + "/Primers_set/candidate_primers_sets.txt" + message: + "Step9: Prepare all candidate primers for primer selection .." + shell: + ''' + cat {input} > {output} + ''' +#------------------------------------------------------------------------------------------- +# get_candidate_primer_fa rule 10: Dependency packages - None +#------------------------------------------------------------------------------------------- +rule get_candidate_primer_fa: + input: + config["results_dir"] + "/Primers_set/candidate_primers_sets.txt" + output: + config["results_dir"] + "/Primers_set/candidate_primers_sets.number", + directory(config["results_dir"] + "/Primers_set/candidate_primers_sets") + params: + script = config["scripts_dir"], + step = config["step"] + message: + "Step10: Get candidate primers .." + shell: + ''' + python {params.script}/candidate_primer_txt2fa.py -i {input} -s {params.step} \ + -n {output[0]} -o {output[1]} + ''' + +#------------------------------------------------------------------------------------------- +# get_Maxprimerset rule 11: Dependency packages - python, pandas, biopython +#------------------------------------------------------------------------------------------- +rule get_Maxprimerset: + input: + config["results_dir"] + "/Primers_set/candidate_primers_sets.txt" + output: + config["results_dir"] + "/Primers_set/final_maxprimers_set.xls" + log: + config["log_dir"] + "/get_Maxprimerset.log" + params: + script = config["scripts_dir"], + step = config["step"], + method = config["method"] + message: + "Step11: Try to find Max_primer_set..." + shell: + ''' + python {params.script}/get_Maxprimerset.py -i {input} -s {params.step} \ + -m {params.method} -o {output} \ + 2>&1 > {log} + ''' +#------------------------------------------------------------------------------------------- +# get_core_primer_set rule 12: Dependency packages - python +#------------------------------------------------------------------------------------------- +rule get_core_primer_set: + input: + config["results_dir"] + "/Primers_set/candidate_primers_sets.txt" + output: + config["results_dir"] + "/Core_primers_set/core_candidate_primers_sets.txt" + message: + "step12: Extract core primer set..." + params: + script = config["scripts_dir"], + number = config["core_number"] + shell: + """ + python {params.script}/core_primerset_extraction.py -i {input} -o {output} \ + -n {params.number} + """ +#------------------------------------------------------------------------------------------- +# get_core_Maxprimerset rule 13: Dependency packages - python +#------------------------------------------------------------------------------------------- +rule get_core_Maxprimerset: + input: + config["results_dir"] + "/Core_primers_set/core_candidate_primers_sets.txt" + output: + config["results_dir"] + "/Core_primers_set/core_final_maxprimers_set.xls" + message: + "Step13: Try to find core Max_primer_set .." + params: + script = config["scripts_dir"], + step = config["step"], + method = config["method"] + log: + config["log_dir"] + "/get_core_Maxprimerset.log" + shell: + """ + python {params.script}/get_Maxprimerset.py -i {input} -s {params.step} \ + -m {params.method} -o {output} \ + 2>&1 > {log} + """ +#------------------------------------------------------------------------------------------- +# format_transition rule 14: Dependency packages - python +#------------------------------------------------------------------------------------------- +rule format_transition: + input: + config["results_dir"] + "/Core_primers_set/core_candidate_primers_sets.txt" + output: + directory(config["results_dir"] + "/Core_primers_set/core_candidate_primers_sets"), + config["results_dir"] + "/Core_primers_set/core_candidate_primers_sets.number" + params: + script = config["scripts_dir"], + step = config["step"] + message: + "Step14: Format transition .." + shell: + """ + python {params.script}/candidate_primer_txt2fa.py -i {input} -s {params.step} \ + -o {output[0]} -n {output[1]} + """ +#------------------------------------------------------------------------------------------- +# get_all_PCR_product rule 15: Dependency packages - python +#------------------------------------------------------------------------------------------- +rule get_all_PCR_product: + input: + config["results_dir"] + "/Primers_set/final_maxprimers_set.xls", + expand(config["results_dir"] + "/Total_fa/{virus}.format.fa",virus = virus) + output: + directory(config["results_dir"] + "/Primers_set/PCR_product"), + config["results_dir"] + "/Primers_set/Coverage_stast.xls" + params: + config["scripts_dir"] + message: + "Step15: Extract PCR product from the input virus sequence (non-mismatch) .." + shell: + ''' + python {params}/extract_PCR_product.py -i {input[0]} -r {input[1]} -p 10 \ + -f xls -o {output[0]} -s {output[1]} + ''' +#------------------------------------------------------------------------------------------- +# get_core_PCR_product rule 16: Dependency packages - python +#------------------------------------------------------------------------------------------- +rule get_core_PCR_product: + input: + config["results_dir"] + "/Core_primers_set/core_final_maxprimers_set.xls", + expand(config["results_dir"] + "/Total_fa/{virus}.format.fa",virus = virus) + output: + directory(config["results_dir"] + "/Core_primers_set/core_PCR_product"), + config["results_dir"] + "/Core_primers_set/core_Coverage_stast.xls" + params: + config["scripts_dir"] + message: + "Step16: Extract core PCR product from the input virus sequence (non-mismatch) .." + shell: + ''' + python {params}/extract_PCR_product.py -i {input[0]} -r {input[1]} -p 10 \ + -f xls -o {output[0]} -s {output[1]} + ''' +#------------------------------------------------------------------------------------------- +# mfeprimer_check rule 17: Dependency packages - mfeprimer-3.2.6 +#------------------------------------------------------------------------------------------- +rule all_mfeprimer_check: + input: + config["results_dir"] + "/Primers_set/final_maxprimers_set.xls" + output: + config["results_dir"] + "/Primers_set/final_maxprimers_set.fa", + config["results_dir"] + "/Primers_set/final_maxprimers_set.fa.hairpin", + config["results_dir"] + "/Primers_set/final_maxprimers_set.fa.dimer", + config["results_dir"] + "/Primers_set/final_maxprimers_set.fa.findimer" + params: + config["scripts_dir"] + message: + "Step17: Hairpin and dimer check by mfeprimer .." + + shell: + """ + python {params}/primerset_format.py -i {input} -o {output[0]} + {params}/mfeprimer-3.2.6 hairpin -i {output[0]} -o {output[1]} + {params}/mfeprimer-3.2.6 dimer -i {output[0]} -o {output[2]} + python {params}/finDimer.py -i {output[0]} -o {output[3]} + """ +#------------------------------------------------------------------------------------------- +# core_mfeprimer_check rule 18: Dependency packages - mfeprimer-3.2.6 +#------------------------------------------------------------------------------------------- +rule core_mfeprimer_check: + input: + config["results_dir"] + "/Core_primers_set/core_final_maxprimers_set.xls" + output: + config["results_dir"] + "/Core_primers_set/core_final_maxprimers_set.fa", + config["results_dir"] + "/Core_primers_set/core_final_maxprimers_set.fa.hairpin", + config["results_dir"] + "/Core_primers_set/core_final_maxprimers_set.fa.dimer", + config["results_dir"] + "/Core_primers_set/core_final_maxprimers_set.fa.findimer" + params: + config["scripts_dir"] + message: + "Step18: Hairpin and dimer check by mfeprimer.. " + shell: + """ + python {params}/primerset_format.py -i {input} -o {output[0]} + {params}/mfeprimer-3.2.6 hairpin -i {output[0]} -o {output[1]} + {params}/mfeprimer-3.2.6 dimer -i {output[0]} -o {output[2]} + python {params}/finDimer.py -i {output[0]} -o {output[3]} + """ +#------------------------------------------------------------------------------------------- +# core_primer_coverage rule 19: Dependency packages - bowtie2 +#------------------------------------------------------------------------------------------- +rule BWT_validation: + input: + config["results_dir"] + "/Core_primers_set/core_final_maxprimers_set.fa", + expand(config["results_dir"] + "/Total_fa/{virus}.format.fa",virus = virus) + output: + config["results_dir"] + "/Core_primers_set/BWT_coverage/core_final_maxprimers_set.out" + params: + script = config["scripts_dir"], + primer_len = config["primer_len"], + message: + "Step19: Primer coverage clculation .. " + shell: + """ + python {params.script}/primer_coverage_validation_by_BWT.py -i {input[0]} -r {input[1]} \ + -l {params.primer_len} -t 1 -s 50,2000 -o {output} + """ + +#------------------------------------------------------------------------------------------- +# Done! +#------------------------------------------------------------------------------------------- + + diff --git a/multiPrime3.yaml b/multiPrime3.yaml new file mode 100644 index 0000000..69ceabf --- /dev/null +++ b/multiPrime3.yaml @@ -0,0 +1,129 @@ +--- +################################################################################ +# directories +################################################################################ +# directory, where is/are the input virus fasta file(s) located +# absolute path +input_dir: /share/data3/yangjunbo/multiPCR/multiPrime2/virus_sequence/Rhinovirus/ + +# directory, in which the pipeline writes all output files (relative to current directory) +# (this also includes the summary files) +results_dir: /share/data3/yangjunbo/multiPCR/multiPrime2/virus_sequence/Rhinovirus/results + +# directory to write the log-files, created during processing, to +log_dir: /share/data3/yangjunbo/multiPCR/multiPrime2/virus_sequence/Rhinovirus/logs + +# directory of scripts. +scripts_dir: /share/data3/yangjunbo/git_storage/multiPrime/scripts + +################################################################################ +# sample(s) information +################################################################################ +# name(s) of the input fastq file(s) +# please type here: sample1 +# this name is used throughout the entire pipeline as name for the output samples +virus: + - Rhinovirus +################################################################################ +# preprocessing +################################################################################ + +#------------------------------------------------------------------------------- +# cluster by cd-hit +#------------------------------------------------------------------------------- +#Params in cd-hit -c {}. Suggestion: 0.7. minimum: 0.65 +#If the identity of sequences in your input file is very high, you can set this param as 0.9. +#BUG (dont use 1): If you set this param as 1. It will generate primers by no-cluster methods. +identity: 0.7 + +# minmum sequence number in cluster +# multiPrime will try to merge those clusters with less than {seq_number_ANI} into those clusters +# greater than {seq_number_ANI}. It is used to reduce cluster number. +# If seq_number_ANI = 0, then all clusters will be processed +# If seq_number_ANI = 1, then no clusters will be processed +# Suggestion: 10 or 0 +seq_number_ANI: 1 +# Threshold of average nucleotide identity, minimum:0.7. +ani: 0.8 +# Merge or drop those clusters with rare sequences and shows high ANI with others. T: drop; F: merge into others +# Carefully use "F". +drop: "F" + +#Max sequence number of each cluster used in multi-alignment {muscle}. +#We randomly choose {maxseq} sequences in cluster for next steps. +#It wont affect the calculation of primer coverage in the final primer set, +#cause we used all sequence in cluster to check the final coverage. +max_seq: 500 + +#------------------------------------------------------------------------------- +## primer design by multiPrime +##------------------------------------------------------------------------------- +#param in multiPrime-core -n {}. number of the degenerate nucleotide +dege_number: 4 +#param in multiPrime-core -d {}. degeneracy +degeneracy: 10 +#param in multiPrime-core -l {}. length of the degenerate primers +primer_len: 18 +#variation. max mis-match in the calculation of mis-coverage +variation: 1 +#coordinate:. mis-match position is not allowed to located in certain positions in the calculation of mis-coverage +coordinate: -17,-1 + +#------------------------------------------------------------------------------- +## get candidate primers from multiPrime output +##------------------------------------------------------------------------------- +#param of get_multiPrime. +#Filter primers by match rate: +#[Number of sequences that match the selected primer] / [Number of sequences that span the selected primer]. +#Only primers with fraction [> frac] will retain. default: 0.6. +#Sometimes the sequence number of some clusters is less than 10. +#coverage threshold [0.6] is not suitable. you can reset this threshold. +#If sequence number of some clusters is very large {> 100000}. it will take {500} sequences randomly to get output +#Details are list in the usage of get_multiPrime.py +coverage: 0.7 + +#PCR PRODUCT SIZE. Filter primers by PRODUCT size.default [250,700]. +PRODUCT_size: 150,1200 + +#Filter primers by GC content. default [0.45,0.65]. +gc_content: 0.2,0.7 + +#Filter param of hairpin, which means distance of the minimal paired bases. Default: 4. Example:(number of X) AGCT[XXXX]AGCT +distance: 4 + +#Filter primers by degenerate base position. e.g. [-t 4] means I dont want degenerate base +#appear at the end four bases when primer pre-filter. Default: 4. +end: 4 + +#Adaptor sequence, which is used for NGS next. Hairpin or dimer detection for adaptor--primer. +#For example: TCTTTCCCTACACGACGCTCTTCCGATCT,TCTTTCCCTACACGACGCTCTTCCGATCT. if None, use comma. +#If you don't know which adaptor will be used, and you really need an adaptor for next sequencing. you can use the example sequence to instead. +#because adaptor sequence will not affect primer set generally. +#If you don't need adaptor for the next step. set adaptor: ",". +adaptor: "TCTTTCCCTACACGACGCTCTTCCGATCT,TGGAGTTCAGACGTGTGCTCTTCCGATCT" +#------------------------------------------------------------------------------- +### get core primer set +###------------------------------------------------------------------------------- +# min sequence in each cluster. +core_number: 10 +#------------------------------------------------------------------------------- +## get max primer set from get_multiPrime +##------------------------------------------------------------------------------- +#params of get_Maxprimerset_V4. +#This step will not consider the dimer formation of 5' (primers), cause it is designed for the NGS. +#It wont form dimers between adaptors of NGS and primers. + +#distance between primers; column number of primer1_F to primer2_F. Do not change this param. +#GTGTGCTCGTGACCTTGA CCACAATTGCCACGTTAG 159 3 1.0 GTGTGCTCGTGACCTTGA GGTGTCTTGTTGGAAGGG 181 3 +# primer1_F Primer1_R Product_len number_match coverage primer2_F primer2_R Product_len number_match +# 1 2 3 4 5 next(1) 2 3 4 + +step: 5 + +#maximal maximum +method: T + + + + +