# ancestry_prediction_scRNAseq.yaml
#
# NOTE: the file name above is kept as a comment on purpose. Written as a bare
# line ahead of the `---` marker it is parsed as its own one-scalar document,
# turning this file into a two-document stream that single-document loaders
# reject.
---
####################################################################################
##### The following arguments are for indicating file locations on your system #####
####################################################################################
refs:
  # Genome build the sequencing data have been aligned to: hg38 or hg19.
  genome: hg38

  # Path to the reference hg19 fasta to be used for remapping for the
  # freebayes demultiplexing steps. Ideally this is the same reference that
  # was used for the original mapping, but any reference on the same genome
  # with the same 'chr' encoding will do.
  hg19_fasta: /path/to/hg19/reference/genome.fa

  # ONLY NEEDED IF DATA ORIGINALLY MAPPED TO HG38.
  # Path to the reference hg38 fasta to be used for remapping for the
  # freebayes demultiplexing steps. Ideally this is the same reference that
  # was used for the original mapping, but any reference on the same genome
  # with the same 'chr' encoding will do.
  hg38_fasta: /path/to/hg38/reference/genome.fa

inputs:
  # Sample metadata file that has two columns: 'Pool' and 'N'. The Pool values
  # should be the exact names of the parent folders for the scRNAseq output.
  metadata_file: /path/to/samples_meta.tsv

  # Complete path to the singularity image that contains all the software.
  singularity_image: /path/to/singularity/image.sif

  # List of paths to bind to Singularity. Multiple directories can be given by
  # separating them with a ",", e.g. ${DIRECTORY1},${DIRECTORY2}.
  # Singularity binds the directory you are running from (plus its subfolders)
  # but cannot see anything above it unless it is listed in this argument.
  bind_path: /path

  # Parent directory that holds one directory per pool, with the scRNA-seq
  # output below each of them.
  scRNAseq_dir: /path/to/scRNAseq/parent/directory

  # Directory that contains the barcode files with per-barcode annotation.
  # The pool name needs to be within each file name. The files should be
  # filtered to remove doublets and contain only cells assigned to an
  # individual.
  barcode_annotation_dir: /path/to/barcodes/annotation/directory

  # Leave as None for the first run of the pipeline. Afterwards this is the
  # file of SNPs common across all sites and samples, which is generated by
  # sending your snp list files to Drew Neavin and the Garvan Institute
  # (d.neavin@garvan.org.au) to create a common list of snps.
  # NOTE(review): the plain scalar None loads as the string "None", not as a
  # YAML null - presumably that is what the pipeline compares against; confirm
  # before changing it.
  common_snps: None

  # NOTE(review): undocumented in the original file - presumably the BAM tag
  # that holds the cell barcode; confirm against the rules that read it.
  barcode_tag: "CB"

outputs:
  # Parent directory for the pipeline output (placeholder path - replace with
  # a location on your system).
  outdir: /path/to/parent/out/dir


#############################################################################################################################
##### The following arguments are if you have reference SNPs for these individuals for method accuracy testing purposes #####
#############################################################################################################################
snp:
  # Set to true or false depending on whether you have reference SNP genotype
  # data to be predicted for pipeline accuracy testing purposes.
  # Written as the canonical lowercase YAML boolean.
  ref_snp_predict: false

  # Reference SNP genotype vcf (file.vcf) or plink files base (of pgen files
  # or bfiles).
  ref_snp: /path/to/unimputed/reference/file


#########################################################################################################################################
##### The following arguments are common parameters such as memory and threads that may need to be changed depending on the dataset #####
#########################################################################################################################################
### Per-step resource settings for the freebayes ancestry prediction steps.
### Each step has a <step>_memory and a <step>_threads entry. The memory units
### are not stated in this file (presumably GB - confirm against the rules that
### consume these values).
freebayes_ancestry:
  ### Following parameters are for bam subsetting by individual - will only be used if multi-individual sample multiplexing was used
  subset_bam_memory: 4
  subset_bam_threads: 8

  ### Following parameters are for indexing the individual subset bam
  index_memory: 4
  index_threads: 2

  ### Following parameters are for identifying regions to be used for freebayes-parallel
  parallel_freebayes_regions_memory: 4
  parallel_freebayes_regions_threads: 2
  parallel_freebayes_regions_N: 100000 ## Number of regions to make for freebayes parallel

  ### Following options for freebayes execution: freebayes_pipeline.py -i {input.bam} -b {input.barcodes} -f {params.fasta} -t {threads} -o {params.out} -k {params.N} --common_variants {input.snps}
  freebayes_memory: 8 ## used for both memory and disk allocation
  freebayes_threads: 1

  ### Following options for lifting freebayes vcf results to hg19 (for ancestry prediction)
  freebayes_update_vcf_memory: 4 ## used for both memory and disk allocation
  freebayes_update_vcf_threads: 2

  ### Following options for converting freebayes vcf to plink
  freebayes_vcf2plink_memory: 8 ## used for both memory and disk allocation
  freebayes_vcf2plink_threads: 2

  ### Following options for identifying common snps between data and 1000G reference
  freebayes_common_snps_memory: 12 ## used for both memory and disk allocation
  freebayes_common_snps_threads: 1

  ### Following options for identifying common snps between pools in your dataset
  common_snps_across_pools_memory: 12 ## used for both memory and disk allocation
  ### NOTE(review): the key below is spelled "poolss" (double "s"), unlike its
  ### "common_snps_across_pools_memory" sibling. It is left unchanged because the
  ### rules that read it are not visible here - confirm which spelling they
  ### expect before renaming.
  common_snps_across_poolss_threads: 2

  ### Following options for subsetting data by SNPs provided across the different pools, sites and datasets
  subset_common_snps_memory: 12 ## used for both memory and disk allocation
  subset_common_snps_threads: 1

  ### Following options for pruning 1000G data with indep
  freebayes_prune_1000g_memory: 5  ## used for both memory and disk allocation
  freebayes_prune_1000g_threads: 1

  ### Following options for contingency to deal with duplicated snps - to remove from both 1000G and dataset
  freebayes_final_pruning_memory: 5  ## used for both memory and disk allocation
  freebayes_final_pruning_threads: 1

  ### Following options for subsetting for indicated snps + update X, Y, mt to 23, 24, 25, 26 chrs
  freebayes_split_data_memory: 4  ## used for both memory and disk allocation
  freebayes_split_data_threads: 1

  ### Following options for projecting 1000 genomes data
  freebayes_pca_1000g_memory: 4  ## used for both memory and disk allocation
  freebayes_pca_1000g_threads: 1

  ### Following options for projecting data to 1000 genome pc space
  freebayes_pca_project_memory: 8  ## used for both memory and disk allocation
  freebayes_pca_project_threads: 2

  ### Following options for predicting cell annotation using 1000 genomes reference in the original dataset
  freebayes_pca_projection_assign_original_memory: 12  ## used for both memory and disk allocation
  freebayes_pca_projection_assign_original_threads: 2


### These rules will only run if ref_snp_predict in the snp section above is set to true
### Per-step resource settings for the reference-genotype steps. Each step has
### a <step>_memory and a <step>_threads entry. The memory units are not stated
### in this file (presumably GB - confirm against the rules that consume them).
reference_ancestry_predictions:
  ### Following options for correlating individual IDs to clusters
  freebayes_correlate_indivs_memory: 8  ## used for both memory and disk allocation
  freebayes_correlate_indivs_threads: 2

  ### Following options for converting the reference vcf to plink
  ### NOTE(review): the original comment said "freebayes vcf"; judging by the
  ### reference_ key prefix this step handles the reference vcf - confirm.
  reference_vcf2plink_memory: 8 ## used for both memory and disk allocation
  reference_vcf2plink_threads: 2

  ### Following options for identifying common snps between data and 1000G reference
  reference_common_snps_memory: 24 ## used for both memory and disk allocation
  reference_common_snps_threads: 1

  ### Following options for pruning 1000G data with indep
  reference_prune_1000g_memory: 5  ## used for both memory and disk allocation
  reference_prune_1000g_threads: 1

  ### Following options for contingency to deal with duplicated snps - to remove from both 1000G and dataset
  reference_final_pruning_memory: 5  ## used for both memory and disk allocation
  reference_final_pruning_threads: 1

  ### Following options for subsetting for indicated snps + update X, Y, mt to 23, 24, 25, 26 chrs
  reference_split_data_memory: 4  ## used for both memory and disk allocation
  reference_split_data_threads: 1

  ### Following options for projecting 1000 genomes data
  reference_pca_1000g_memory: 4  ## used for both memory and disk allocation
  reference_pca_1000g_threads: 1

  ### Following options for projecting data to 1000 genome pc space
  reference_pca_project_memory: 8  ## used for both memory and disk allocation
  reference_pca_project_threads: 2

  ### Following options for predicting cell annotation using 1000 genomes reference in the original dataset
  reference_pca_projection_assign_original_memory: 12  ## used for both memory and disk allocation
  reference_pca_projection_assign_original_threads: 2

  ### Following options for comparing freebayes predicted annotations to reference annotations
  reference_freebayes_comparison_memory: 16  ## used for both memory and disk allocation
  reference_freebayes_comparison_threads: 4