-
Notifications
You must be signed in to change notification settings - Fork 0
/
ancestry_prediction_scRNAseq.yaml
137 lines (106 loc) · 8.49 KB
/
ancestry_prediction_scRNAseq.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
---
####################################################################################
##### The following arguments are for indicating file locations on your system #####
####################################################################################
refs:
  ## Genome the sequencing data have been aligned to: hg38 or hg19
  genome: hg38
  ## Path to the reference hg19 fasta to be used for remapping for the
  ## freebayes demultiplexing steps. Ideally this would be the same reference
  ## used for the original mapping, but any reference on the same genome with
  ## the same 'chr' encoding will do
  hg19_fasta: /path/to/hg19/reference/genome.fa
  ## ONLY NEEDED IF DATA ORIGINALLY MAPPED TO HG38
  ## Path to the reference hg38 fasta to be used for remapping for the
  ## freebayes demultiplexing steps. Ideally this would be the same reference
  ## used for the original mapping, but any reference on the same genome with
  ## the same 'chr' encoding will do
  hg38_fasta: /path/to/hg38/reference/genome.fa
inputs:
  ## Sample metadata file that has two columns: 'Pool' and 'N'. The Pool
  ## values should be the exact names of the parent folders for the scRNAseq
  ## output
  metadata_file: /path/to/samples_meta.tsv
  ## The complete path to the singularity image that has all the software
  singularity_image: /path/to/singularity/image.sif
  ## List of paths to bind to Singularity. You can specify multiple
  ## directories by adding a "," between them. Eg. ${DIRECTORY1},${DIRECTORY2}.
  ## Singularity will bind the directory that you are running from +
  ## subfolders but will not be able to find anything above unless it is in
  ## this argument
  bind_path: /path
  ## The parent directory that has directories for each pool and the
  ## scRNA-seq output below it
  scRNAseq_dir: /path/to/scRNAseq/parent/directory
  ## The directory that contains each of the barcode files with per-barcode
  ## annotation. The pool name needs to be within the file name. These should
  ## be filtered to remove doublets and contain only cells assigned to an
  ## individual
  barcode_annotation_dir: /path/to/barcodes/annotation/directory
  ## Leave as None for the first run of the pipeline. This will be the file of
  ## SNPs common across all sites and samples. It will be generated by sending
  ## your snp list files to Drew Neavin and the garvan institute
  ## ([email protected]) to create a common list of snps.
  ## Note: None is not a YAML null, so this value is the string "None";
  ## it is quoted here only to make that explicit.
  common_snps: "None"
  ## NOTE(review): presumably the bam tag that stores the cell barcode -
  ## confirm against the pipeline rules before changing
  barcode_tag: "CB"
outputs:
  ## Parent output directory (placeholder path - replace with a real location)
  ## NOTE(review): presumably all pipeline results are written below this
  ## directory - confirm against the pipeline rules
  outdir: /path/to/parent/out/dir
#############################################################################################################################
##### The following arguments are if you have reference SNPs for these individuals for method accuracy testing purposes #####
#############################################################################################################################
snp:
  ## Set to true or false depending on whether you have reference SNP genotype
  ## data to be predicted for pipeline accuracy testing purposes
  ref_snp_predict: false
  ## Reference SNP genotype vcf (file.vcf) or plink files base (of pgen files
  ## or bfiles)
  ref_snp: /path/to/unimputed/reference/file
#########################################################################################################################################
##### The following arguments are common parameters such as memory and threads that may need to be changed depending on the dataset #####
#########################################################################################################################################
freebayes_ancestry:
  ### Following parameters are for bam subsetting by individual - will only be used if multi-individual sample multiplexing was used
  subset_bam_memory: 4
  subset_bam_threads: 8

  ### Following parameters are for indexing the individual subset bam
  index_memory: 4
  index_threads: 2

  ### Following parameters are for identifying regions to be used for freebayes-parallel
  parallel_freebayes_regions_memory: 4
  parallel_freebayes_regions_threads: 2
  parallel_freebayes_regions_N: 100000  ## Number of regions to make for freebayes parallel

  ### Following options for freebayes execution: freebayes_pipeline.py -i {input.bam} -b {input.barcodes} -f {params.fasta} -t {threads} -o {params.out} -k {params.N} --common_variants {input.snps}
  freebayes_memory: 8  ## used for both memory and disk allocation
  freebayes_threads: 1

  ### Following options for lifting freebayes vcf results to hg19 (for ancestry prediction)
  freebayes_update_vcf_memory: 4  ## used for both memory and disk allocation
  freebayes_update_vcf_threads: 2

  ### Following options for converting freebayes vcf to plink
  freebayes_vcf2plink_memory: 8  ## used for both memory and disk allocation
  freebayes_vcf2plink_threads: 2

  ### Following options for identifying common snps between data and 1000G reference
  freebayes_common_snps_memory: 12  ## used for both memory and disk allocation
  freebayes_common_snps_threads: 1

  ### Following options for identifying common snps between pools in your dataset
  common_snps_across_pools_memory: 12  ## used for both memory and disk allocation
  ## NOTE(review): the key below is spelled "poolss" (double s), unlike the
  ## matching *_memory key above. It is left unchanged because the pipeline may
  ## look it up by this exact name - confirm before renaming.
  common_snps_across_poolss_threads: 2

  ### Following options for subsetting data by SNPs provided across the different pools, sites and datasets
  subset_common_snps_memory: 12  ## used for both memory and disk allocation
  subset_common_snps_threads: 1

  ### Following options for pruning 1000G data with indep
  freebayes_prune_1000g_memory: 5  ## used for both memory and disk allocation
  freebayes_prune_1000g_threads: 1

  ### Following options for contingency to deal with duplicated snps - to remove from both 1000G and dataset
  freebayes_final_pruning_memory: 5  ## used for both memory and disk allocation
  freebayes_final_pruning_threads: 1

  ### Following options for subsetting for indicated snps + update X, Y, mt to 23, 24, 25, 26 chrs
  freebayes_split_data_memory: 4  ## used for both memory and disk allocation
  freebayes_split_data_threads: 1

  ### Following options for projecting 1000 genomes data
  freebayes_pca_1000g_memory: 4  ## used for both memory and disk allocation
  freebayes_pca_1000g_threads: 1

  ### Following options for projecting data to 1000 genome pc space
  freebayes_pca_project_memory: 8  ## used for both memory and disk allocation
  freebayes_pca_project_threads: 2

  ### Following options for predicting cell annotation using 1000 genomes reference in the original dataset
  freebayes_pca_projection_assign_original_memory: 12  ## used for both memory and disk allocation
  freebayes_pca_projection_assign_original_threads: 2
### These rules will only run if ref_snp_predict in the snp section above is set to True
reference_ancestry_predictions:
  ### Following options for correlating individual IDs to clusters
  freebayes_correlate_indivs_memory: 8  ## used for both memory and disk allocation
  freebayes_correlate_indivs_threads: 2

  ### Following options for converting freebayes vcf to plink
  ### NOTE(review): the keys below are named reference_vcf2plink_*, so this comment may have been meant to say "reference vcf" - confirm
  reference_vcf2plink_memory: 8  ## used for both memory and disk allocation
  reference_vcf2plink_threads: 2

  ### Following options for identifying common snps between data and 1000G reference
  reference_common_snps_memory: 24  ## used for both memory and disk allocation
  reference_common_snps_threads: 1

  ### Following options for pruning 1000G data with indep
  reference_prune_1000g_memory: 5  ## used for both memory and disk allocation
  reference_prune_1000g_threads: 1

  ### Following options for contingency to deal with duplicated snps - to remove from both 1000G and dataset
  reference_final_pruning_memory: 5  ## used for both memory and disk allocation
  reference_final_pruning_threads: 1

  ### Following options for subsetting for indicated snps + update X, Y, mt to 23, 24, 25, 26 chrs
  reference_split_data_memory: 4  ## used for both memory and disk allocation
  reference_split_data_threads: 1

  ### Following options for projecting 1000 genomes data
  reference_pca_1000g_memory: 4  ## used for both memory and disk allocation
  reference_pca_1000g_threads: 1

  ### Following options for projecting data to 1000 genome pc space
  reference_pca_project_memory: 8  ## used for both memory and disk allocation
  reference_pca_project_threads: 2

  ### Following options for predicting cell annotation using 1000 genomes reference in the original dataset
  reference_pca_projection_assign_original_memory: 12  ## used for both memory and disk allocation
  reference_pca_projection_assign_original_threads: 2

  ### Following options for comparing freebayes predicted annotations to reference annotations
  reference_freebayes_comparison_memory: 16  ## used for both memory and disk allocation
  reference_freebayes_comparison_threads: 4