diff --git a/GoogleCloud/.DS_Store b/GoogleCloud/.DS_Store
new file mode 100644
index 0000000..19663ef
Binary files /dev/null and b/GoogleCloud/.DS_Store differ
diff --git a/GoogleCloud/amethyst/Snakefile b/GoogleCloud/amethyst/Snakefile
new file mode 100644
index 0000000..4ec6c25
--- /dev/null
+++ b/GoogleCloud/amethyst/Snakefile
@@ -0,0 +1,425 @@
+# Grab a list of all the samples
+# Assumes all the file names have the format R[12]/{sample}_R[12].fastq
+import glob
+import re
+
+pattern = re.compile(r"00_data/fastq/R1/(.*)_R1\.fastq$")
+
+files = glob.glob('00_data/fastq/R1/*.fastq')
+
+SAMPLES = []
+for file in files:
+    match = pattern.match(file)
+    if match:
+        SAMPLES.append(match.group(1))
+
+ruleorder: fastqc > multiqc > fastp > megahit > bbdb > bbmap > prodigal > interleave > sourmash > maxbin2 > checkm > dRep
+# Master rule that snakemake uses to determine which files need to be
+# generated.
+rule all:
+    input:
+        expand("00_data/fastq/R1/{sample}_R1.fastq", sample=SAMPLES),
+        expand("00_data/fastq/R2/{sample}_R2.fastq", sample=SAMPLES),
+        expand("00_data/fastq/fastqc-R1/{sample}_R1_fastqc.html", sample=SAMPLES),
+        expand("00_data/fastq/fastqc-R2/{sample}_R2_fastqc.html", sample=SAMPLES),
+        "00_data/fastq/fastqc-R1/multiqc_report.html",
+        "00_data/fastq/fastqc-R2/multiqc_report.html",
+        expand("01_qc/trimmed_reads/test/{sample}_1.fq", sample=SAMPLES),
+        expand("01_qc/trimmed_reads/test/{sample}_2.fq", sample=SAMPLES),
+        expand("02_assembly/{sample}/{sample}.contigs.fa", sample=SAMPLES),
+        expand("02_assembly/{sample}.1.bt2", sample=SAMPLES),
+        expand("02_assembly/{sample}/{sample}.sam", sample=SAMPLES),
+        expand("02_assembly/{sample}/prodigal/{sample}_contig_cords.gbk", sample=SAMPLES),
+        expand("02_assembly/{sample}/prodigal/{sample}_contig_orfs.faa", sample=SAMPLES),
+        expand("02_assembly/{sample}/prodigal/{sample}_contig_orfs.fna", sample=SAMPLES),
+        expand("01_qc/trimmed_reads/test/{sample}_1.fq", sample=SAMPLES),
+        expand("01_qc/trimmed_reads/test/{sample}_2.fq", sample=SAMPLES),
+        expand("02_assembly/sourmash/tax_out/{sample}_sourmash_gather_out.csv", sample=SAMPLES),
+        expand("02_assembly/{sample}_MaxBin.abundance", sample=SAMPLES),
+        expand("02_assembly/{sample}/{sample}.contigs.fa", sample=SAMPLES),
+        "02_assembly/dRep_out/log/logger.log",
+        "03_assignment/GTDBtk/mashoutput.msh",
+        "02_assembly/checkm/results/checkm.log",
+        "03_assignment/GTDBtk/gtdbtk.log",
+        "README.md",
+        #expand("02_assembly/{sample}/prodigal/{sample}/{sample}.gff", sample=SAMPLES),
+        #expand("02_assembly/{sample}/prodigal/{sample}/{sample}.faa", sample=SAMPLES),
+
+
+# Run all the samples through FastQC (no -c: that flag names a contaminants file)
+rule fastqc:
+    conda:
+        "mg-qc"
+    input:
+        r1 = "00_data/fastq/R1/{sample}_R1.fastq",
+        r2 = "00_data/fastq/R2/{sample}_R2.fastq"
+    output:
+        o1 = "00_data/fastq/fastqc-R1/{sample}_R1_fastqc.html",
+        o2 = "00_data/fastq/fastqc-R2/{sample}_R2_fastqc.html"
+    group: 1
+    priority: 13
+    params:
+        outfolder1 = "00_data/fastq/fastqc-R1",
+        outfolder2 = "00_data/fastq/fastqc-R2"
+    threads: 5
+    log:
+        "logs_full/fastqc/{sample}.log"
+    benchmark:
+        "benchmarks_full/fastqc/{sample}.txt"
+    shell:
+        """
+        mkdir -p "{params.outfolder1}"
+        mkdir -p "{params.outfolder2}"
+        fastqc -t {threads} -o {params.outfolder1} {input.r1}
+        fastqc -t {threads} -o {params.outfolder2} {input.r2}
+        """
+
+# Run MultiQC on the FastQC reports
+rule multiqc:
+    conda:
+        "mg-qc"
+    input:  # every FastQC report must exist before MultiQC scans the folders
+        expand("00_data/fastq/fastqc-R1/{sample}_R1_fastqc.html", sample=SAMPLES),
+        expand("00_data/fastq/fastqc-R2/{sample}_R2_fastqc.html", sample=SAMPLES)
+    output:
+        "00_data/fastq/fastqc-R1/multiqc_report.html",
+        "00_data/fastq/fastqc-R2/multiqc_report.html"
+    group: 2
+    priority: 12
+    log:
+        "logs_full/multiqc/multiqc.log"
+    benchmark:
+        "benchmarks_full/multiqc/multiqc.txt"
+    shell:
+        """
+        cd 00_data/fastq/fastqc-R1
+
+        multiqc --export . -f
+        cd ..
+        cd fastqc-R2
+        multiqc --export . -f
+        """
+
+# Run fastp (worker threads come from the threads directive via --thread)
+rule fastp:
+    conda:
+        "/home/jupyter/amethyst-test-data/multitrim/multitrim.yml"
+    input:
+        r1 = "00_data/fastq/R1/{sample}_R1.fastq",
+        r2 = "00_data/fastq/R2/{sample}_R2.fastq"
+    output:
+        o1 = "01_qc/trimmed_reads/test/{sample}_1.fq",
+        o2 = "01_qc/trimmed_reads/test/{sample}_2.fq",
+        o3 = "01_qc/trimmed_reads/test/{sample}_test_report.html"
+    priority: 11
+    threads: 10
+    log:
+        "logs_full/fastp/{sample}.log"
+    benchmark:
+        "benchmarks_full/fastp/{sample}.txt"
+    shell:
+        """
+        fastp \
+        -i {input.r1} \
+        -o {output.o1} \
+        -I {input.r2} \
+        -O {output.o2} \
+        --detect_adapter_for_pe \
+        -g -l 50 -W 4 -M 20 \
+        --cut_front \
+        --thread {threads} \
+        -h {output.o3}
+        """
+# Run bbnorm
+#rule bbnorm:
+#    conda:
+#        "mg-norm"
+#    input:
+#        r1 = "01_qc/trimmed_reads/test/{sample}_1.fq.gz",
+#        r2 = "01_qc/trimmed_reads/test/{sample}_2.fq.gz"
+#    output:
+#        o1 = "01_qc/{sample}_normalized.fq.gz"
+#    priority: 10
+#    params:
+#        r1 = "01_qc/trimmed_reads/test/{sample}_1.fq.gz",
+#        r2 = "01_qc/trimmed_reads/test/{sample}_2.fq.gz"
+#    log:
+#        "logs_full/bbnorm/{sample}.log"
+#    benchmark:
+#        "benchmarks_full/bbnorm/{sample}.txt"
+#    shell:
+#        """
+#        bbmap/bbnorm.sh in={input.r1} in2={input.r2} out={output.o1} target=100 min=5 interleaved=FALSE -Xmx50g
+#        """
+# Run megahit
+# snakemake will create the output folders since that is the location of the
+# output files we specify. megahit refuses to run if its output folder already
+# exists, so because of this, we have to remove the folder snakemake creates
+# before we do anything.
+# Right now megahit is set to use 20 threads and 85% of the machine's
+# memory (-m 0.85). This will probably need to be adjusted when used under
+# other situations.
+rule megahit:
+    conda:
+        "mg-assembly"
+    input:
+        r1 = "01_qc/trimmed_reads/test/{sample}_1.fq",
+        r2 = "01_qc/trimmed_reads/test/{sample}_2.fq"
+    output:
+        o1 = "02_assembly/{sample}/{sample}.contigs.fa"
+    priority: 9
+    params:
+        r1 = "02_assembly/{sample}_R1.fq",
+        r2 = "02_assembly/{sample}_R2.fq",
+        outfolder = "02_assembly/{sample}",
+        prefix = "{sample}"
+    threads: 20
+    log:
+        "logs_full/megahit/{sample}.log"
+    benchmark:
+        "benchmarks_full/megahit/{sample}.txt"
+    shell:
+        """
+        rm -rf {params.outfolder}
+        cat {input.r1} > {params.r1}
+        cat {input.r2} > {params.r2}
+        megahit -1 {params.r1} -2 {params.r2} -m 0.85 -t {threads} \
+        --min-contig-len 20 --out-prefix {params.prefix} \
+        -o {params.outfolder} --k-min 21 --k-max 21
+        rm {params.r1} {params.r2}
+        """
+# Build the bowtie2 index for each assembly
+rule bbdb:
+    conda:
+        "mg-binning"
+    input:
+        seq = "02_assembly/{sample}/{sample}.contigs.fa"
+    output:
+        o1 = "02_assembly/{sample}.1.bt2",
+        o2 = "02_assembly/{sample}.2.bt2",
+        o3 = "02_assembly/{sample}.3.bt2",
+        o4 = "02_assembly/{sample}.4.bt2",
+        o5 = "02_assembly/{sample}.rev.1.bt2",
+        o6 = "02_assembly/{sample}.rev.2.bt2"
+    priority: 8
+    params:
+        basename="02_assembly/{sample}"
+    threads: 20
+    log:
+        "logs_full/bbdb/{sample}.log"
+    benchmark:
+        "benchmarks_full/bbdb/{sample}.txt"
+    shell:
+        """
+        bowtie2-build --threads {threads} {input.seq} {params.basename}
+        """
+# Map reads to the assembly; bowtie2 prints its alignment summary on stderr
+rule bbmap:
+    conda:
+        "mg-binning"
+    input:
+        r1 = "01_qc/trimmed_reads/test/{sample}_1.fq",
+        r2 = "01_qc/trimmed_reads/test/{sample}_2.fq",
+        o1 = "02_assembly/{sample}.1.bt2",
+        o2 = "02_assembly/{sample}.2.bt2",
+        o3 = "02_assembly/{sample}.3.bt2",
+        o4 = "02_assembly/{sample}.4.bt2",
+        o5 = "02_assembly/{sample}.rev.1.bt2",
+        o6 = "02_assembly/{sample}.rev.2.bt2"
+    output:
+        o1 = "02_assembly/{sample}/{sample}.sam",
+        log = "02_assembly/{sample}.bowtie2.log"
+    priority: 7
+    params:
+        o2 = "02_assembly/{sample}"
+    threads: 32
+    log:
+        "logs_full/bbmap/{sample}.log"
+    benchmark:
+        "benchmarks_full/bbmap/{sample}.txt"
+    shell:
+        """
+        bowtie2 --threads {threads} -x {params.o2} -1 {input.r1} \
+        -2 {input.r2} -S {output.o1} 2> {output.log}
+        """
+rule prodigal:
+    conda:
+        "mg-assembly"
+    input:
+        r1 = "02_assembly/{sample}/{sample}.contigs.fa"
+    output:
+        o1 = "02_assembly/{sample}/prodigal/{sample}_contig_cords.gbk",
+        o2 = "02_assembly/{sample}/prodigal/{sample}_contig_orfs.faa",
+        o3 = "02_assembly/{sample}/prodigal/{sample}_contig_orfs.fna"
+    priority: 6
+    threads: 32
+    log:
+        "logs_full/prodigal/{sample}.log"
+    benchmark:
+        "benchmarks_full/prodigal/{sample}.txt"
+    shell:
+        """
+        prodigal -i {input.r1} -o {output.o1} -a {output.o2} -d {output.o3}
+        """
+#rule prokka:
+#    conda:
+#        "mg-assembly2"
+#    input:
+#        r1 = "02_assembly/{sample}/{sample}.contigs.fa"
+#    output:
+#        o1 = "02_assembly/{sample}/prodigal/{sample}/{sample}.gff",
+#        o2 = "02_assembly/{sample}/prodigal/{sample}/{sample}.faa"
+#    priority: 5
+#    params:
+#        outfolder = "02_assembly/{sample}/prodigal/{sample}",
+#        prefix = "{sample}"
+#    threads: 32
+#    log:
+#        "logs_full/prokka/{sample}.log"
+#    benchmark:
+#        "benchmarks_full/prokka/{sample}.txt"
+#    shell:
+#        """
+#        prokka {input.r1} --outdir {params.outfolder} --prefix {params.prefix} --force --cpu 0
+#        """
+
+rule interleave:
+    conda:
+        "mg-diversity"
+    input:
+        r1 = "01_qc/trimmed_reads/test/{sample}_1.fq",
+        r2 = "01_qc/trimmed_reads/test/{sample}_2.fq",
+    output:
+        o1 = "01_qc/interleaved/{sample}_interleaved.fq",
+    priority: 4
+    params: reformat = "/home/jupyter/amethyst/dbs/bbmap/reformat.sh"  # machine-specific BBTools path
+    threads: 20
+    log:
+        "logs_full/bbint/{sample}.log"
+    benchmark:
+        "benchmarks_full/bbint/{sample}.txt"
+    shell:
+        """
+        {params.reformat} in1={input.r1} in2={input.r2} out={output.o1}
+        """
+rule sourmash:
+    conda:
+        "mg-diversity"
+    input:
+        o1 = "01_qc/interleaved/{sample}_interleaved.fq"
+    output:
+        o2 = "02_assembly/sourmash/tax_out/{sample}_reads.sig",
+        o3 = "02_assembly/sourmash/tax_out/{sample}_sourmash_gather_out.csv",
+    priority: 3
+    params:
+        outfolder2 = "02_assembly/sourmash/tax_out/{sample}",
+        db = "./dbs/gtdb-rs202.genomic-reps.k31.zip"
+    threads: 20
+    log:
+        "logs_full/sourmash/{sample}.log"
+    benchmark:
+        "benchmarks_full/sourmash/{sample}.txt"
+    shell:
+        """
+        sourmash sketch dna {input.o1} -o {output.o2}
+        sourmash gather {output.o2} {params.db} -o {output.o3} --ignore-abundance
+        sourmash tax metagenome -g {output.o3} -t ./dbs/gtdb-rs202.taxonomy.v2.csv -o {params.outfolder2} --output-format csv_summary --force
+        sourmash tax metagenome -g {output.o3} -t ./dbs/gtdb-rs202.taxonomy.v2.csv -o {params.outfolder2} --output-format krona --rank family --force
+        """
+rule maxbin2:
+    conda:
+        "mg-binning2"
+    input:
+        r1 = "02_assembly/{sample}/{sample}.contigs.fa",
+        r2 = "01_qc/trimmed_reads/test/{sample}_1.fq",
+        r3 = "01_qc/trimmed_reads/test/{sample}_2.fq"
+    output:
+        o2 = "02_assembly/{sample}_MaxBin.abundance"
+    priority: 4
+    params:
+        outfolder = "02_assembly/{sample}_MaxBin"
+    threads: 20
+    log:
+        "logs_full/maxbin/{sample}.log"
+    benchmark:
+        "benchmarks_full/maxbin/{sample}.txt"
+    shell:
+        """
+        run_MaxBin.pl -contig {input.r1} -min_contig_length 100 \
+        -reads {input.r2} -reads2 {input.r3} \
+        -out {params.outfolder} -thread {threads}
+        """
+
+rule checkm:
+    conda:
+        "checkm"
+    input:  # all assemblies must be finished before they are copied for CheckM
+        r1 = expand("02_assembly/{sample}/{sample}.contigs.fa", sample=SAMPLES)
+    output:
+        o2 = "02_assembly/checkm/results/checkm.log"
+    params:
+        outfolder = "02_assembly/checkm",
+        outfolder2 = "02_assembly/checkm/results"
+    log:
+        "logs_full/checkm/checkm.log"
+    benchmark:
+        "benchmarks_full/checkm/checkm.txt"
+    shell:
+        """
+        export CHECKM_DATA_PATH=./dbs/
+        cp -n {input.r1} {params.outfolder}
+        test -f {output.o2} || checkm lineage_wf -t 8 -x fa {params.outfolder} {params.outfolder2} > {log} 2>&1
+        """
+rule dRep:
+    conda:
+        "mg-binning3"
+    input:  # MaxBin2 must have written its bins (02_assembly/*.fasta) first
+        r1 = expand("02_assembly/{sample}_MaxBin.abundance", sample=SAMPLES)
+    output:
+        o1 = "02_assembly/dRep_out/log/logger.log"
+    priority: 1
+    params:
+        infolder = "02_assembly/dRep_data",
+        outfolder = "02_assembly/dRep_out"
+    threads: 20
+    log:
+        "logs_full/dRep/dRep.log"
+    benchmark:
+        "benchmarks_full/dRep/dRep.txt"
+    shell:
+        """
+        if [ -d "{params.infolder}" ]; then
+            rm -rf "{params.infolder}"
+        fi
+        if [ -d "{params.outfolder}" ]; then
+            rm -rf "{params.outfolder}"
+        fi
+        mkdir -p "{params.infolder}"
+        mkdir -p "{params.outfolder}"
+        cp 02_assembly/*.fasta "{params.infolder}"
+        dRep dereplicate "{params.outfolder}" -g "{params.infolder}"/*.fasta --ignoreGenomeQuality --SkipSecondary -p {threads}
+        """
+
+
+rule GTDBtk:
+    conda:
+        "mg-binning3"
+    input:  # dRep's declared output, not its log file
+        r1 = "02_assembly/dRep_out/log/logger.log"
+    output:
+        o2 = "03_assignment/GTDBtk/gtdbtk.log",
+        o3 = "03_assignment/GTDBtk/mashoutput.msh"
+    params:
+        o1 = "03_assignment/GTDBtk",
+        i1 = "02_assembly/dRep_data/"
+    threads: 20
+    log:
+        "logs_full/GTDBtk/gtdb.log"
+    benchmark:
+        "benchmarks_full/GTDBtk/bm.txt"
+    shell:
+        """
+        mkdir -p 03_assignment/
+        mkdir -p 03_assignment/GTDBtk
+        gtdbtk classify_wf --mash_db {output.o3} --genome_dir {params.i1} --out_dir {params.o1} --extension fasta --cpus {threads} --pplacer_cpus {threads} --debug > logs_full/gtdbtk_std.out 2> logs_full/gtdbtk_std.err
+        """
\ No newline at end of file
diff --git a/GoogleCloud/mega-non-model-wgs-snakeflow b/GoogleCloud/mega-non-model-wgs-snakeflow/README.md
similarity index 100%
rename from GoogleCloud/mega-non-model-wgs-snakeflow
rename to GoogleCloud/mega-non-model-wgs-snakeflow/README.md
diff --git a/GoogleCloud/mega-non-model-wgs-snakeflow/config/chromosomes.tsv b/GoogleCloud/mega-non-model-wgs-snakeflow/config/chromosomes.tsv
new file mode 100644
index 0000000..f2ec40f
--- /dev/null
+++ b/GoogleCloud/mega-non-model-wgs-snakeflow/config/chromosomes.tsv
@@ -0,0 +1,32 @@
+chrom num_bases
+NC_051269.1 44063945
+NC_051270.1 41642849
+NC_051271.1 41172918
+NW_023618430.1 101937
+NC_051272.1 39310580
+NC_051273.1 38607003
+NC_051274.1 36325703
+NC_051275.1 36146424
+NC_051276.1 36006835
+NC_051277.1 35235349
+NC_051278.1 34964250
+NC_051279.1 34912176 +NC_051280.1 34196205 +NC_051281.1 33154565 +NW_023618431.1 250492 +NW_023618432.1 178401 +NC_051282.1 32625488 +NW_023618433.1 278450 +NW_023618434.1 173117 +NC_051283.1 32625529 +NC_051284.1 32153643 +NC_051285.1 30648620 +NC_051286.1 29458520 +NC_051287.1 28767290 +NC_051288.1 27725676 +NW_023618435.1 104377 +NC_051289.1 27312080 +NC_051290.1 24788685 +NW_023618436.1 113387 +NC_051291.1 23426545 +NC_051292.1 16640398 diff --git a/GoogleCloud/mega-non-model-wgs-snakeflow/config/config.yaml b/GoogleCloud/mega-non-model-wgs-snakeflow/config/config.yaml new file mode 100644 index 0000000..8b2d3e6 --- /dev/null +++ b/GoogleCloud/mega-non-model-wgs-snakeflow/config/config.yaml @@ -0,0 +1,127 @@ +units: config/units.tsv +chromosomes: config/chromosomes.tsv +scaffold_groups: config/scaffold_groups.tsv +scatter_intervals_file: config/scatters_5000000.tsv + +# this file is only needed when treating different samples +# as different species, for indel realignment, etc. +indel_grps: config/igrps-species.tsv + + +rclone_data: False + +# This will typically be left at 0, unless you want to do +# BQSR. +bqsr_rounds: 0 + +# to specify some downsampling levels to do you can put them in the +# depths list here. +downsample_bams: + depths: [] + bqsr_round: 0 + +# leave this as one of your maf_cutoffs so that we can do BQSR still +# as before, though we probably won't. +bqsr_maf: 0.01 + +# these following ones are irrelevant if you are not doing BQSR, but they +# have to be in the config still +bqsr_qual: 37 +bqsr_qd: 15 + + +# the following must be a list, even if it is just one element +maf_cutoffs: [0.01] + + + +# where to send the results +rclone_base: "gdrive-rclone:Bioinformatic-Project-Archives/rockfish-genomics/rockfish-lanes-1-and-2" + + +# this is the default value for the first GenomicsDBImport run. +# If you want to change it, you should typically change it +# on the command line. 
+genomics_db_import_num: 0 + + +# Eric modified this to be able to easily handle genomes of non-model +# organisms that are not yet on Ensembl, etc. +ref: + # name of the species. (will simply be put in the SnpEff config). + # Put underscores for spaces + species_name: Sebastes_umbrosus + # name you want to use for the genome version + genome_version_name: fSebUmb1.pri + # the URL where this can be downloaded, if public and easy to + # get with wget. If not, then you should hand-download + # genome.fasta and genome.gff or genome.gtf + genome_url: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/015/220/745/GCF_015220745.1_fSebUmb1.pri/GCF_015220745.1_fSebUmb1.pri_genomic.fna.gz + # if there is a GFF or GTF file, it has to have a .gff or + # .gtf extension. This is where you put the URL for it + gff_or_gtf_url: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/015/220/745/GCF_015220745.1_fSebUmb1.pri/GCF_015220745.1_fSebUmb1.pri_genomic.gtf.gz + + +filtering: + # Set to true in order to apply machine learning based recalibration of + # quality scores instead of hard filtering. + vqsr: false + hard: + # hard filtering as outlined in GATK docs + # (https://gatkforums.broadinstitute.org/gatk/discussion/2806/howto-apply-hard-filters-to-a-call-set) + snvs: + "QD < 2.0 || FS > 60.0 || MQ < 40.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0" + indels: + "QD < 2.0 || FS > 200.0 || ReadPosRankSum < -20.0" + +processing: + remove-duplicates: true + # Uncomment and point to a bed file with, e.g., captured regions if necessary, + # see https://gatkforums.broadinstitute.org/gatk/discussion/4133/when-should-i-use-l-to-pass-in-a-list-of-intervals. + restrict-regions: chr24_26_28.bed + # If regions are restricted, uncomment this to enlarge them by the given value in order to include + # flanking areas. 
+ # region-padding: 100 + +params: + gatk: + # the default for haplotype caller is to require very high base quality scores + # because BQSR on non-model organisms doesn't work for crap, and Nina's group + # found it better to just require high base quality scores. + HaplotypeCaller: " --min-base-quality-score 33 --minimum-mapping-quality 20 " + BaseRecalibrator: "" + GenotypeGVCFs: "" + VariantRecalibrator: "" + picard: + MarkDuplicates: " --TAGGING_POLICY All --CREATE_INDEX " + fastp: + pe: + trimmer: + # See fastp manual for adding additional options, e.g. for adapter trimming. + - " --adapter_sequence=AGATCGGAAGAGCACACGTCTGAACTCCAGTCA" + - " --adapter_sequence_r2=AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT" + - " --detect_adapter_for_pe --cut_right --cut_right_window_size 4 --cut_right_mean_quality 20 " + # single-end is currently broken + trimmomatic: + pe: + trimmer: + # See trimmomatic manual for adding additional options, e.g. for adapter trimming. + - "ILLUMINACLIP:resources/adapters/TruSeq3-PE-2.fa:2:30:10" + - "LEADING:3" + - "TRAILING:3" + - "SLIDINGWINDOW:4:20" + - "MINLEN:36" + se: + trimmer: + # See trimmomatic manual for adding additional options, e.g. for adapter trimming. + - "LEADING:3" + - "TRAILING:3" + - "SLIDINGWINDOW:4:20" + - "MINLEN:36" + vep: + plugins: + # Add any plugin from https://www.ensembl.org/info/docs/tools/vep/script/vep_plugins.html + # Plugin args can be passed as well, e.g. "LoFtool,path/to/custom/scores.txt". + - LoFtool + # extra command line arguments (e.g. 
--sift, see docs) + extra: "" diff --git a/GoogleCloud/mega-non-model-wgs-snakeflow/config/igrps-species.tsv b/GoogleCloud/mega-non-model-wgs-snakeflow/config/igrps-species.tsv new file mode 100644 index 0000000..70a5dd5 --- /dev/null +++ b/GoogleCloud/mega-non-model-wgs-snakeflow/config/igrps-species.tsv @@ -0,0 +1,44 @@ +sample sample_id indel_grp +111_31 111_31 rosenblatti +111_33 111_33 rosenblatti +111_34 111_34 rosenblatti +111_35 111_35 rosenblatti +111_55 111_55 rosenblatti +111_56 111_56 rosenblatti +111_57 111_57 rosenblatti +18_37 18_37 rosenblatti +18_38 18_38 rosenblatti +221_49 221_49 rosenblatti +221_50 221_50 rosenblatti +238_55 238_55 rosenblatti +240_12 240_12 rosenblatti +240_16 240_16 rosenblatti +240_17 240_17 rosenblatti +336_15 336_15 rosenblatti +336_16 336_16 rosenblatti +336_31 336_31 rosenblatti +336_36 336_36 rosenblatti +336_38 336_38 rosenblatti +336_55 336_55 rosenblatti +336_56 336_56 rosenblatti +336_70 336_70 rosenblatti +51_71 51_71 rosenblatti +51_72 51_72 rosenblatti +51_73 51_73 rosenblatti +51_74 51_74 rosenblatti +51_75 51_75 rosenblatti +51_76 51_76 rosenblatti +8_35 8_35 rosenblatti +8_41 8_41 rosenblatti +8_43 8_43 rosenblatti +8_44 8_44 rosenblatti +8_45 8_45 rosenblatti +8_47 8_47 rosenblatti +8_49 8_49 rosenblatti +8_74 8_74 rosenblatti +8_75 8_75 rosenblatti +8_76 8_76 rosenblatti +8_77 8_77 rosenblatti +8_78 8_78 rosenblatti +8_80 8_80 rosenblatti +8_82 8_82 rosenblatti diff --git a/GoogleCloud/mega-non-model-wgs-snakeflow/config/meta.csv b/GoogleCloud/mega-non-model-wgs-snakeflow/config/meta.csv new file mode 100644 index 0000000..286c2e9 --- /dev/null +++ b/GoogleCloud/mega-non-model-wgs-snakeflow/config/meta.csv @@ -0,0 +1,43 @@ 
+NMFS_DNA_ID,BOX_ID,BOX_POSITION,SAMPLE_ID,BATCH_ID,PROJECT_NAME,GENUS,SPECIES,Cluster_Morph,LENGTH,WEIGHT,SEX,AGE,REPORTED_LIFE_STAGE,PHENOTYPE,HATCHERY_MARK,TAG_NUMBER,COLLECTION_DATE,ESTIMATED_DATE,PICKER,PICK_DATE,LEFTOVER_SAMPLE,SAMPLE_COMMENTS,SPECIES_CODE,COMMON_NAME,LANDFALL_PORT,CRUISE,HAUL,SITE,STATE_M,COUNTY_M,LATITUDE_M,LONGITUDE_M,LOCATION_COMMENTS_M +,,,8_41,x,,Sebastes,rosenblatti,,,,,,,,,,,1996,,,,From J. Hyde,greenblotched rockfish,,,,,Palos Verdes,CA,,33.81383,-118.439, +,,,8_43,x,,Sebastes,rosenblatti,,,,,,,,,,,1996,,,,From J. Hyde,greenblotched rockfish,,,,,Palos Verdes,CA,,33.81383,-118.439, +,,,8_44,x,,Sebastes,rosenblatti,,,,,,,,,,,1996,,,,From J. Hyde,greenblotched rockfish,,,,,Palos Verdes,CA,,33.81383,-118.439, +,,,8_45,x,,Sebastes,rosenblatti,,,,,,,,,,,1996,,,,From J. Hyde,greenblotched rockfish,,,,,Palos Verdes,CA,,33.81383,-118.439, +,,,8_47,x,,Sebastes,rosenblatti,,,,,,,,,,,1996,,,,From J. Hyde,greenblotched rockfish,,,,,Palos Verdes,CA,,33.81383,-118.439, +,,,8_49,x,,Sebastes,eos,,,,,,,,,,,1996,,,,From J. Hyde,pink rockfish,,,,,Palos Verdes,CA,,33.81383,-118.439, +,,,8_74,x,,Sebastes,rosenblatti,,,,,,,,,,,1996,,,,From J. Hyde,greenblotched rockfish,,,,,Guadalupe Island,MX,,29.15917,-118.27, +,,,8_76,x,,Sebastes,rosenblatti,,,,,,,,,,,1996,,,,From J. Hyde,greenblotched rockfish,,,,,Guadalupe Island,MX,,29.15917,-118.27, +,,,8_77,x,,Sebastes,rosenblatti,,,,,,,,,,,1996,,,,From J. Hyde,greenblotched rockfish,,,,,Guadalupe Island,MX,,29.15917,-118.27, +,,,8_78,x,,Sebastes,rosenblatti,,,,,,,,,,,1996,,,,From J. Hyde,greenblotched rockfish,,,,,Guadalupe Island,MX,,29.15917,-118.27, +,,,8_80,x,,Sebastes,rosenblatti,,,,,,,,,,,1996,,,,From J. Hyde,greenblotched rockfish,,,,,Guadalupe Island,MX,,29.15917,-118.27, +,,,8_82,x,,Sebastes,chlorostictus,,,,,,,,,,,1996,,,,From J. Hyde,greenspotted rockfish,,,,,Guadalupe Island,MX,,29.15917,-118.27, +,,,111_31,x,,Sebastes,rosenblatti,,,,,,,,,,,1994,,,,From J. 
Hyde,greenblotched rockfish,,,,,60 Mile Bank,CA,,32.10517,-118.237, +,,,111_33,x,,Sebastes,rosenblatti,,,,,,,,,,,1994,,,,From J. Hyde,greenblotched rockfish,,,,,60 Mile Bank,CA,,32.10517,-118.237, +,,,111_34,x,,Sebastes,rosenblatti,,,,,,,,,,,1994,,,,From J. Hyde,greenblotched rockfish,,,,,La Jolla,CA,,32.87333,-117.312, +,,,111_35,x,,Sebastes,rosenblatti,,,,,,,,,,,1994,,,,From J. Hyde,greenblotched rockfish,,,,,La Jolla,CA,,32.87333,-117.312, +,,,111_55,x,,Sebastes,rosenblatti,,,,,,,,,,,1994,,,,From J. Hyde,greenblotched rockfish,,,,,San Nicholas Island,CA,,32.87333,-117.312, +,,,111_56,x,,Sebastes,rosenblatti,,,,,,,,,,,1994,,,,From J. Hyde,greenblotched rockfish,,,,,San Nicholas Island,CA,,33.20083,-119.512, +,,,111_57,x,,Sebastes,chlorostictus,,,,,,,,,,,1994,,,,From J. Hyde,greenspotted rockfish,,,,,San Nicholas Island,CA,,33.20083,-119.512, +,,,51_71,x,,Sebastes,chlorostictus,,,,,,,,,,,1998,,,,From J. Hyde,greenspotted rockfish,,,,,Point Reyes,CA,,38.075,-123.527, +,,,51_72,x,,Sebastes,chlorostictus,,,,,,,,,,,1998,,,,From J. Hyde,greenspotted rockfish,,,,,Point Reyes,CA,,38.075,-123.527, +,,,51_73,x,,Sebastes,chlorostictus,,,,,,,,,,,1998,,,,From J. Hyde,greenspotted rockfish,,,,,Point Reyes,CA,,38.075,-123.527, +,,,51_74,x,,Sebastes,chlorostictus,,,,,,,,,,,1998,,,,From J. Hyde,greenspotted rockfish,,,,,Point Reyes,CA,,38.075,-123.527, +,,,51_75,x,,Sebastes,chlorostictus,,,,,,,,,,,1998,,,,From J. Hyde,greenspotted rockfish,,,,,Point Reyes,CA,,38.075,-123.527, +,,,51_76,x,,Sebastes,chlorostictus,,,,,,,,,,,1998,,,,From J. Hyde,greenspotted rockfish,,,,,Point Reyes,CA,,38.075,-123.527, +,,,240_16,x,,Sebastes,chlorostictus,,,,,,,,,,,2005,,,,From J. Hyde,greenspotted rockfish,,,,,Osborne Bank,CA,,33.36,-119.03, +,,,240_17,x,,Sebastes,chlorostictus,,,,,,,,,,,2005,,,,From J. Hyde,greenspotted rockfish,,,,,Osborne Bank,CA,,33.36,-119.03, +,,,336_15,x,,Sebastes,chlorostictus,,,,,,,,,,,2007,,,,From J. 
Hyde,greenspotted rockfish,,,,,Tanner Bank,CA,,32.7,-119.06, +,,,336_31,x,,Sebastes,chlorostictus,,,,,,,,,,,2007,,,,From J. Hyde,greenspotted rockfish,,,,,San Clemente Island,CA,,32.78,-118.36, +,,,336_36,x,,Sebastes,chlorostictus,,,,,,,,,,,2007,,,,From J. Hyde,greenspotted rockfish,,,,,San Clemente Island,CA,,32.78,-118.4, +,,,336_38,x,,Sebastes,chlorostictus,,,,,,,,,,,2007,,,,From J. Hyde,greenspotted rockfish,,,,,San Clemente Island,CA,,32.78,-118.4, +,,,336_55,x,,Sebastes,chlorostictus,,,,,,,,,,,2007,,,,From J. Hyde,greenspotted rockfish,,,,,San Nicholas Island,CA,,33.28,-119.51, +,,,336_70,x,,Sebastes,chlorostictus,,,,,,,,,,,2007,,,,From J. Hyde,greenspotted rockfish,,,,,San Nicholas Island,CA,,33.28,-119.51, +,,,18_37,x,,Sebastes,rosenblatti,,,,,,,,,,,2018,,,,Collected by Aguilar,greenblotched rockfish,,,,,Palos Verdes,CA,,33.6834349,-118.320934, +,,,18_38,x,,Sebastes,eos,,,,,,,,,,,2018,,,,Collected by Aguilar,pink rockfish,,,,,Palos Verdes,CA,,33.6834349,-118.320934, +,,,8_35,x,,Sebastes,chlorostictus,,,,,,,,,,,1996,,,,From J. Hyde,greenspotted rockfish,,,,,Palos Verdes,CA,,33.81383,-118.439, +,,,240_12,x,,Sebastes,chlorostictus,,,,,,,,,,,2005,,,,From J. Hyde,greenspotted rockfish,,,,,Osborne Bank,CA,,33.36,-119.03, +,,,336_16,x,,Sebastes,chlorostictus,,,,,,,,,,,2007,,,,From J. Hyde,greenspotted rockfish,,,,,Tanner Bank,CA,,32.7,-119.06, +,,,336_56,x,,Sebastes,chlorostictus,,,,,,,,,,,2007,,,,From J. Hyde,greenspotted rockfish,,,,,San Nicholas Island,CA,,33.28,-119.51, +,,,238_55,x,,Sebates ,eos,,,,,,,,,,,2007,,,,From J. Hyde,pink rockfish,,,,,Nine Mile Bank,CA,,32.83333,-117.25, +,,,221_49,x,,Sebates ,eos,,,,,,,,,,,2005,,,,From J. Hyde,pink rockfish,,,,,Santa Rosa Flats,CA,,33.67993,-120, +,,,221_50,x,,Sebates ,eos,,,,,,,,,,,2005,,,,From J. 
Hyde,pink rockfish,,,,,Santa Rosa Flats,CA,,33.67993,-120, diff --git a/GoogleCloud/mega-non-model-wgs-snakeflow/config/scaffold_groups.tsv b/GoogleCloud/mega-non-model-wgs-snakeflow/config/scaffold_groups.tsv new file mode 100644 index 0000000..14d074f --- /dev/null +++ b/GoogleCloud/mega-non-model-wgs-snakeflow/config/scaffold_groups.tsv @@ -0,0 +1,108 @@ +id chrom len cumul +scaff_group001 NW_023618437.1 204126 204126 +scaff_group001 NW_023618438.1 114053 318179 +scaff_group001 NW_023618439.1 246245 564424 +scaff_group001 NW_023618440.1 422125 986549 +scaff_group001 NW_023618441.1 251291 1237840 +scaff_group001 NW_023618442.1 161087 1398927 +scaff_group001 NW_023618443.1 107603 1506530 +scaff_group001 NW_023618444.1 105840 1612370 +scaff_group001 NW_023618445.1 104370 1716740 +scaff_group001 NW_023618446.1 45793 1762533 +scaff_group001 NW_023618447.1 99780 1862313 +scaff_group001 NW_023618448.1 98809 1961122 +scaff_group001 NW_023618449.1 98658 2059780 +scaff_group001 NW_023618450.1 90733 2150513 +scaff_group001 NW_023618451.1 86800 2237313 +scaff_group001 NW_023618452.1 87547 2324860 +scaff_group001 NW_023618453.1 84445 2409305 +scaff_group001 NW_023618454.1 77169 2486474 +scaff_group001 NW_023618455.1 76524 2562998 +scaff_group001 NW_023618456.1 75864 2638862 +scaff_group001 NW_023618457.1 75517 2714379 +scaff_group001 NW_023618458.1 74501 2788880 +scaff_group001 NW_023618459.1 74818 2863698 +scaff_group001 NW_023618460.1 74196 2937894 +scaff_group001 NW_023618461.1 73345 3011239 +scaff_group001 NW_023618462.1 72699 3083938 +scaff_group001 NW_023618463.1 71536 3155474 +scaff_group001 NW_023618464.1 71225 3226699 +scaff_group001 NW_023618465.1 64522 3291221 +scaff_group001 NW_023618466.1 62348 3353569 +scaff_group001 NW_023618467.1 60741 3414310 +scaff_group001 NW_023618468.1 59442 3473752 +scaff_group001 NW_023618469.1 10174 3483926 +scaff_group001 NW_023618470.1 58209 3542135 +scaff_group001 NW_023618471.1 57847 3599982 +scaff_group001 NW_023618472.1 57411 
3657393 +scaff_group001 NW_023618473.1 57104 3714497 +scaff_group001 NW_023618474.1 56904 3771401 +scaff_group001 NW_023618475.1 54570 3825971 +scaff_group001 NW_023618476.1 54391 3880362 +scaff_group001 NW_023618477.1 32169 3912531 +scaff_group001 NW_023618478.1 52788 3965319 +scaff_group001 NW_023618479.1 52159 4017478 +scaff_group001 NW_023618480.1 52063 4069541 +scaff_group001 NW_023618481.1 51365 4120906 +scaff_group001 NW_023618482.1 50260 4171166 +scaff_group001 NW_023618483.1 49436 4220602 +scaff_group001 NW_023618484.1 64152 4284754 +scaff_group001 NW_023618485.1 49333 4334087 +scaff_group001 NW_023618486.1 48336 4382423 +scaff_group001 NW_023618487.1 48680 4431103 +scaff_group001 NW_023618488.1 48018 4479121 +scaff_group001 NW_023618489.1 47477 4526598 +scaff_group001 NW_023618490.1 46746 4573344 +scaff_group001 NW_023618491.1 46882 4620226 +scaff_group001 NW_023618492.1 46310 4666536 +scaff_group001 NW_023618493.1 43696 4710232 +scaff_group001 NW_023618494.1 43652 4753884 +scaff_group001 NW_023618495.1 42768 4796652 +scaff_group001 NW_023618496.1 42464 4839116 +scaff_group001 NW_023618497.1 42129 4881245 +scaff_group001 NW_023618498.1 41579 4922824 +scaff_group001 NW_023618499.1 40765 4963589 +scaff_group001 NW_023618500.1 40549 5004138 +scaff_group001 NW_023618501.1 39165 5043303 +scaff_group001 NW_023618502.1 39231 5082534 +scaff_group001 NW_023618503.1 38418 5120952 +scaff_group001 NW_023618504.1 38339 5159291 +scaff_group001 NW_023618505.1 38270 5197561 +scaff_group001 NW_023618506.1 38104 5235665 +scaff_group001 NW_023618507.1 36413 5272078 +scaff_group001 NW_023618508.1 35189 5307267 +scaff_group001 NW_023618509.1 34840 5342107 +scaff_group001 NW_023618510.1 34679 5376786 +scaff_group001 NW_023618511.1 33985 5410771 +scaff_group001 NW_023618512.1 33753 5444524 +scaff_group001 NW_023618513.1 32839 5477363 +scaff_group001 NW_023618514.1 32835 5510198 +scaff_group001 NW_023618515.1 31384 5541582 +scaff_group001 NW_023618516.1 31237 5572819 
+scaff_group001 NW_023618517.1 31102 5603921 +scaff_group001 NW_023618518.1 29418 5633339 +scaff_group001 NW_023618519.1 27743 5661082 +scaff_group001 NW_023618520.1 27412 5688494 +scaff_group001 NW_023618521.1 26008 5714502 +scaff_group001 NW_023618522.1 25196 5739698 +scaff_group001 NW_023618523.1 24512 5764210 +scaff_group001 NW_023618524.1 22772 5786982 +scaff_group001 NW_023618525.1 22257 5809239 +scaff_group001 NW_023618526.1 21389 5830628 +scaff_group001 NW_023618527.1 21196 5851824 +scaff_group001 NW_023618528.1 20509 5872333 +scaff_group001 NW_023618529.1 17830 5890163 +scaff_group001 NW_023618530.1 7215 5897378 +scaff_group001 NW_023618531.1 3314 5900692 +scaff_group001 NW_023618532.1 173666 6074358 +scaff_group001 NW_023618533.1 217730 6292088 +scaff_group001 NW_023618534.1 182171 6474259 +scaff_group001 NW_023618535.1 167199 6641458 +scaff_group001 NW_023618536.1 162409 6803867 +scaff_group001 NW_023618537.1 155123 6958990 +scaff_group001 NW_023618538.1 151542 7110532 +scaff_group001 NW_023618539.1 146466 7256998 +scaff_group001 NW_023618540.1 143925 7400923 +scaff_group001 NW_023618541.1 137484 7538407 +scaff_group001 NW_023618542.1 137558 7675965 +scaff_group001 NW_023618543.1 116618 7792583 diff --git a/GoogleCloud/mega-non-model-wgs-snakeflow/config/scatters_5000000.tsv b/GoogleCloud/mega-non-model-wgs-snakeflow/config/scatters_5000000.tsv new file mode 100644 index 0000000..4cba8ba --- /dev/null +++ b/GoogleCloud/mega-non-model-wgs-snakeflow/config/scatters_5000000.tsv @@ -0,0 +1,284 @@ +id scatter_idx chrom start end scatter_length +NC_051269.1 scat_0001 NC_051269.1 1 4895994 4895994 +NC_051269.1 scat_0002 NC_051269.1 4895995 9791988 4895994 +NC_051269.1 scat_0003 NC_051269.1 9791989 14687982 4895994 +NC_051269.1 scat_0004 NC_051269.1 14687983 19583976 4895994 +NC_051269.1 scat_0005 NC_051269.1 19583977 24479970 4895994 +NC_051269.1 scat_0006 NC_051269.1 24479971 29375964 4895994 +NC_051269.1 scat_0007 NC_051269.1 29375965 34271958 4895994 
+NC_051269.1 scat_0008 NC_051269.1 34271959 39167952 4895994 +NC_051269.1 scat_0009 NC_051269.1 39167953 44063945 4895993 +NC_051270.1 scat_0001 NC_051270.1 1 4626984 4626984 +NC_051270.1 scat_0002 NC_051270.1 4626985 9253968 4626984 +NC_051270.1 scat_0003 NC_051270.1 9253969 13880952 4626984 +NC_051270.1 scat_0004 NC_051270.1 13880953 18507936 4626984 +NC_051270.1 scat_0005 NC_051270.1 18507937 23134920 4626984 +NC_051270.1 scat_0006 NC_051270.1 23134921 27761904 4626984 +NC_051270.1 scat_0007 NC_051270.1 27761905 32388888 4626984 +NC_051270.1 scat_0008 NC_051270.1 32388889 37015872 4626984 +NC_051270.1 scat_0009 NC_051270.1 37015873 41642849 4626977 +NC_051271.1 scat_0001 NC_051271.1 1 4574769 4574769 +NC_051271.1 scat_0002 NC_051271.1 4574770 9149538 4574769 +NC_051271.1 scat_0003 NC_051271.1 9149539 13724307 4574769 +NC_051271.1 scat_0004 NC_051271.1 13724308 18299076 4574769 +NC_051271.1 scat_0005 NC_051271.1 18299077 22873845 4574769 +NC_051271.1 scat_0006 NC_051271.1 22873846 27448614 4574769 +NC_051271.1 scat_0007 NC_051271.1 27448615 32023383 4574769 +NC_051271.1 scat_0008 NC_051271.1 32023384 36598152 4574769 +NC_051271.1 scat_0009 NC_051271.1 36598153 41172918 4574766 +NW_023618430.1 scat_0001 NW_023618430.1 1 101937 101937 +NC_051272.1 scat_0001 NC_051272.1 1 4913823 4913823 +NC_051272.1 scat_0002 NC_051272.1 4913824 9827646 4913823 +NC_051272.1 scat_0003 NC_051272.1 9827647 14741469 4913823 +NC_051272.1 scat_0004 NC_051272.1 14741470 19655292 4913823 +NC_051272.1 scat_0005 NC_051272.1 19655293 24569115 4913823 +NC_051272.1 scat_0006 NC_051272.1 24569116 29482938 4913823 +NC_051272.1 scat_0007 NC_051272.1 29482939 34396761 4913823 +NC_051272.1 scat_0008 NC_051272.1 34396762 39310580 4913819 +NC_051273.1 scat_0001 NC_051273.1 1 4825876 4825876 +NC_051273.1 scat_0002 NC_051273.1 4825877 9651752 4825876 +NC_051273.1 scat_0003 NC_051273.1 9651753 14477628 4825876 +NC_051273.1 scat_0004 NC_051273.1 14477629 19303504 4825876 +NC_051273.1 scat_0005 NC_051273.1 
19303505 24129380 4825876 +NC_051273.1 scat_0006 NC_051273.1 24129381 28955256 4825876 +NC_051273.1 scat_0007 NC_051273.1 28955257 33781132 4825876 +NC_051273.1 scat_0008 NC_051273.1 33781133 38607003 4825871 +NC_051274.1 scat_0001 NC_051274.1 1 4540713 4540713 +NC_051274.1 scat_0002 NC_051274.1 4540714 9081426 4540713 +NC_051274.1 scat_0003 NC_051274.1 9081427 13622139 4540713 +NC_051274.1 scat_0004 NC_051274.1 13622140 18162852 4540713 +NC_051274.1 scat_0005 NC_051274.1 18162853 22703565 4540713 +NC_051274.1 scat_0006 NC_051274.1 22703566 27244278 4540713 +NC_051274.1 scat_0007 NC_051274.1 27244279 31784991 4540713 +NC_051274.1 scat_0008 NC_051274.1 31784992 36325703 4540712 +NC_051275.1 scat_0001 NC_051275.1 1 4518303 4518303 +NC_051275.1 scat_0002 NC_051275.1 4518304 9036606 4518303 +NC_051275.1 scat_0003 NC_051275.1 9036607 13554909 4518303 +NC_051275.1 scat_0004 NC_051275.1 13554910 18073212 4518303 +NC_051275.1 scat_0005 NC_051275.1 18073213 22591515 4518303 +NC_051275.1 scat_0006 NC_051275.1 22591516 27109818 4518303 +NC_051275.1 scat_0007 NC_051275.1 27109819 31628121 4518303 +NC_051275.1 scat_0008 NC_051275.1 31628122 36146424 4518303 +NC_051276.1 scat_0001 NC_051276.1 1 4500855 4500855 +NC_051276.1 scat_0002 NC_051276.1 4500856 9001710 4500855 +NC_051276.1 scat_0003 NC_051276.1 9001711 13502565 4500855 +NC_051276.1 scat_0004 NC_051276.1 13502566 18003420 4500855 +NC_051276.1 scat_0005 NC_051276.1 18003421 22504275 4500855 +NC_051276.1 scat_0006 NC_051276.1 22504276 27005130 4500855 +NC_051276.1 scat_0007 NC_051276.1 27005131 31505985 4500855 +NC_051276.1 scat_0008 NC_051276.1 31505986 36006835 4500850 +NC_051277.1 scat_0001 NC_051277.1 1 4404419 4404419 +NC_051277.1 scat_0002 NC_051277.1 4404420 8808838 4404419 +NC_051277.1 scat_0003 NC_051277.1 8808839 13213257 4404419 +NC_051277.1 scat_0004 NC_051277.1 13213258 17617676 4404419 +NC_051277.1 scat_0005 NC_051277.1 17617677 22022095 4404419 +NC_051277.1 scat_0006 NC_051277.1 22022096 26426514 4404419 
+NC_051277.1 scat_0007 NC_051277.1 26426515 30830933 4404419 +NC_051277.1 scat_0008 NC_051277.1 30830934 35235349 4404416 +NC_051278.1 scat_0001 NC_051278.1 1 4994893 4994893 +NC_051278.1 scat_0002 NC_051278.1 4994894 9989786 4994893 +NC_051278.1 scat_0003 NC_051278.1 9989787 14984679 4994893 +NC_051278.1 scat_0004 NC_051278.1 14984680 19979572 4994893 +NC_051278.1 scat_0005 NC_051278.1 19979573 24974465 4994893 +NC_051278.1 scat_0006 NC_051278.1 24974466 29969358 4994893 +NC_051278.1 scat_0007 NC_051278.1 29969359 34964250 4994892 +NC_051279.1 scat_0001 NC_051279.1 1 4987454 4987454 +NC_051279.1 scat_0002 NC_051279.1 4987455 9974908 4987454 +NC_051279.1 scat_0003 NC_051279.1 9974909 14962362 4987454 +NC_051279.1 scat_0004 NC_051279.1 14962363 19949816 4987454 +NC_051279.1 scat_0005 NC_051279.1 19949817 24937270 4987454 +NC_051279.1 scat_0006 NC_051279.1 24937271 29924724 4987454 +NC_051279.1 scat_0007 NC_051279.1 29924725 34912176 4987452 +NC_051280.1 scat_0001 NC_051280.1 1 4885173 4885173 +NC_051280.1 scat_0002 NC_051280.1 4885174 9770346 4885173 +NC_051280.1 scat_0003 NC_051280.1 9770347 14655519 4885173 +NC_051280.1 scat_0004 NC_051280.1 14655520 19540692 4885173 +NC_051280.1 scat_0005 NC_051280.1 19540693 24425865 4885173 +NC_051280.1 scat_0006 NC_051280.1 24425866 29311038 4885173 +NC_051280.1 scat_0007 NC_051280.1 29311039 34196205 4885167 +NC_051281.1 scat_0001 NC_051281.1 1 4736367 4736367 +NC_051281.1 scat_0002 NC_051281.1 4736368 9472734 4736367 +NC_051281.1 scat_0003 NC_051281.1 9472735 14209101 4736367 +NC_051281.1 scat_0004 NC_051281.1 14209102 18945468 4736367 +NC_051281.1 scat_0005 NC_051281.1 18945469 23681835 4736367 +NC_051281.1 scat_0006 NC_051281.1 23681836 28418202 4736367 +NC_051281.1 scat_0007 NC_051281.1 28418203 33154565 4736363 +NW_023618431.1 scat_0001 NW_023618431.1 1 250492 250492 +NW_023618432.1 scat_0001 NW_023618432.1 1 178401 178401 +NC_051282.1 scat_0001 NC_051282.1 1 4660784 4660784 +NC_051282.1 scat_0002 NC_051282.1 4660785 
9321568 4660784 +NC_051282.1 scat_0003 NC_051282.1 9321569 13982352 4660784 +NC_051282.1 scat_0004 NC_051282.1 13982353 18643136 4660784 +NC_051282.1 scat_0005 NC_051282.1 18643137 23303920 4660784 +NC_051282.1 scat_0006 NC_051282.1 23303921 27964704 4660784 +NC_051282.1 scat_0007 NC_051282.1 27964705 32625488 4660784 +NW_023618433.1 scat_0001 NW_023618433.1 1 278450 278450 +NW_023618434.1 scat_0001 NW_023618434.1 1 173117 173117 +NC_051283.1 scat_0001 NC_051283.1 1 4660790 4660790 +NC_051283.1 scat_0002 NC_051283.1 4660791 9321580 4660790 +NC_051283.1 scat_0003 NC_051283.1 9321581 13982370 4660790 +NC_051283.1 scat_0004 NC_051283.1 13982371 18643160 4660790 +NC_051283.1 scat_0005 NC_051283.1 18643161 23303950 4660790 +NC_051283.1 scat_0006 NC_051283.1 23303951 27964740 4660790 +NC_051283.1 scat_0007 NC_051283.1 27964741 32625529 4660789 +NC_051284.1 scat_0001 NC_051284.1 1 4593378 4593378 +NC_051284.1 scat_0002 NC_051284.1 4593379 9186756 4593378 +NC_051284.1 scat_0003 NC_051284.1 9186757 13780134 4593378 +NC_051284.1 scat_0004 NC_051284.1 13780135 18373512 4593378 +NC_051284.1 scat_0005 NC_051284.1 18373513 22966890 4593378 +NC_051284.1 scat_0006 NC_051284.1 22966891 27560268 4593378 +NC_051284.1 scat_0007 NC_051284.1 27560269 32153643 4593375 +NC_051285.1 scat_0001 NC_051285.1 1 4378375 4378375 +NC_051285.1 scat_0002 NC_051285.1 4378376 8756750 4378375 +NC_051285.1 scat_0003 NC_051285.1 8756751 13135125 4378375 +NC_051285.1 scat_0004 NC_051285.1 13135126 17513500 4378375 +NC_051285.1 scat_0005 NC_051285.1 17513501 21891875 4378375 +NC_051285.1 scat_0006 NC_051285.1 21891876 26270250 4378375 +NC_051285.1 scat_0007 NC_051285.1 26270251 30648620 4378370 +NC_051286.1 scat_0001 NC_051286.1 1 4909754 4909754 +NC_051286.1 scat_0002 NC_051286.1 4909755 9819508 4909754 +NC_051286.1 scat_0003 NC_051286.1 9819509 14729262 4909754 +NC_051286.1 scat_0004 NC_051286.1 14729263 19639016 4909754 +NC_051286.1 scat_0005 NC_051286.1 19639017 24548770 4909754 +NC_051286.1 scat_0006 
NC_051286.1 24548771 29458520 4909750 +NC_051287.1 scat_0001 NC_051287.1 1 4794549 4794549 +NC_051287.1 scat_0002 NC_051287.1 4794550 9589098 4794549 +NC_051287.1 scat_0003 NC_051287.1 9589099 14383647 4794549 +NC_051287.1 scat_0004 NC_051287.1 14383648 19178196 4794549 +NC_051287.1 scat_0005 NC_051287.1 19178197 23972745 4794549 +NC_051287.1 scat_0006 NC_051287.1 23972746 28767290 4794545 +NC_051288.1 scat_0001 NC_051288.1 1 4620946 4620946 +NC_051288.1 scat_0002 NC_051288.1 4620947 9241892 4620946 +NC_051288.1 scat_0003 NC_051288.1 9241893 13862838 4620946 +NC_051288.1 scat_0004 NC_051288.1 13862839 18483784 4620946 +NC_051288.1 scat_0005 NC_051288.1 18483785 23104730 4620946 +NC_051288.1 scat_0006 NC_051288.1 23104731 27725676 4620946 +NW_023618435.1 scat_0001 NW_023618435.1 1 104377 104377 +NC_051289.1 scat_0001 NC_051289.1 1 4552014 4552014 +NC_051289.1 scat_0002 NC_051289.1 4552015 9104028 4552014 +NC_051289.1 scat_0003 NC_051289.1 9104029 13656042 4552014 +NC_051289.1 scat_0004 NC_051289.1 13656043 18208056 4552014 +NC_051289.1 scat_0005 NC_051289.1 18208057 22760070 4552014 +NC_051289.1 scat_0006 NC_051289.1 22760071 27312080 4552010 +NC_051290.1 scat_0001 NC_051290.1 1 4957737 4957737 +NC_051290.1 scat_0002 NC_051290.1 4957738 9915474 4957737 +NC_051290.1 scat_0003 NC_051290.1 9915475 14873211 4957737 +NC_051290.1 scat_0004 NC_051290.1 14873212 19830948 4957737 +NC_051290.1 scat_0005 NC_051290.1 19830949 24788685 4957737 +NW_023618436.1 scat_0001 NW_023618436.1 1 113387 113387 +NC_051291.1 scat_0001 NC_051291.1 1 4685309 4685309 +NC_051291.1 scat_0002 NC_051291.1 4685310 9370618 4685309 +NC_051291.1 scat_0003 NC_051291.1 9370619 14055927 4685309 +NC_051291.1 scat_0004 NC_051291.1 14055928 18741236 4685309 +NC_051291.1 scat_0005 NC_051291.1 18741237 23426545 4685309 +NC_051292.1 scat_0001 NC_051292.1 1 4160100 4160100 +NC_051292.1 scat_0002 NC_051292.1 4160101 8320200 4160100 +NC_051292.1 scat_0003 NC_051292.1 8320201 12480300 4160100 +NC_051292.1 scat_0004 
NC_051292.1 12480301 16640398 4160098 +scaff_group001 scat_0001.1 NW_023618437.1 1 204126 4963589 +scaff_group001 scat_0001.2 NW_023618438.1 1 114053 4963589 +scaff_group001 scat_0001.3 NW_023618439.1 1 246245 4963589 +scaff_group001 scat_0001.4 NW_023618440.1 1 422125 4963589 +scaff_group001 scat_0001.5 NW_023618441.1 1 251291 4963589 +scaff_group001 scat_0001.6 NW_023618442.1 1 161087 4963589 +scaff_group001 scat_0001.7 NW_023618443.1 1 107603 4963589 +scaff_group001 scat_0001.8 NW_023618444.1 1 105840 4963589 +scaff_group001 scat_0001.9 NW_023618445.1 1 104370 4963589 +scaff_group001 scat_0001.10 NW_023618446.1 1 45793 4963589 +scaff_group001 scat_0001.11 NW_023618447.1 1 99780 4963589 +scaff_group001 scat_0001.12 NW_023618448.1 1 98809 4963589 +scaff_group001 scat_0001.13 NW_023618449.1 1 98658 4963589 +scaff_group001 scat_0001.14 NW_023618450.1 1 90733 4963589 +scaff_group001 scat_0001.15 NW_023618451.1 1 86800 4963589 +scaff_group001 scat_0001.16 NW_023618452.1 1 87547 4963589 +scaff_group001 scat_0001.17 NW_023618453.1 1 84445 4963589 +scaff_group001 scat_0001.18 NW_023618454.1 1 77169 4963589 +scaff_group001 scat_0001.19 NW_023618455.1 1 76524 4963589 +scaff_group001 scat_0001.20 NW_023618456.1 1 75864 4963589 +scaff_group001 scat_0001.21 NW_023618457.1 1 75517 4963589 +scaff_group001 scat_0001.22 NW_023618458.1 1 74501 4963589 +scaff_group001 scat_0001.23 NW_023618459.1 1 74818 4963589 +scaff_group001 scat_0001.24 NW_023618460.1 1 74196 4963589 +scaff_group001 scat_0001.25 NW_023618461.1 1 73345 4963589 +scaff_group001 scat_0001.26 NW_023618462.1 1 72699 4963589 +scaff_group001 scat_0001.27 NW_023618463.1 1 71536 4963589 +scaff_group001 scat_0001.28 NW_023618464.1 1 71225 4963589 +scaff_group001 scat_0001.29 NW_023618465.1 1 64522 4963589 +scaff_group001 scat_0001.30 NW_023618466.1 1 62348 4963589 +scaff_group001 scat_0001.31 NW_023618467.1 1 60741 4963589 +scaff_group001 scat_0001.32 NW_023618468.1 1 59442 4963589 +scaff_group001 scat_0001.33 
NW_023618469.1 1 10174 4963589 +scaff_group001 scat_0001.34 NW_023618470.1 1 58209 4963589 +scaff_group001 scat_0001.35 NW_023618471.1 1 57847 4963589 +scaff_group001 scat_0001.36 NW_023618472.1 1 57411 4963589 +scaff_group001 scat_0001.37 NW_023618473.1 1 57104 4963589 +scaff_group001 scat_0001.38 NW_023618474.1 1 56904 4963589 +scaff_group001 scat_0001.39 NW_023618475.1 1 54570 4963589 +scaff_group001 scat_0001.40 NW_023618476.1 1 54391 4963589 +scaff_group001 scat_0001.41 NW_023618477.1 1 32169 4963589 +scaff_group001 scat_0001.42 NW_023618478.1 1 52788 4963589 +scaff_group001 scat_0001.43 NW_023618479.1 1 52159 4963589 +scaff_group001 scat_0001.44 NW_023618480.1 1 52063 4963589 +scaff_group001 scat_0001.45 NW_023618481.1 1 51365 4963589 +scaff_group001 scat_0001.46 NW_023618482.1 1 50260 4963589 +scaff_group001 scat_0001.47 NW_023618483.1 1 49436 4963589 +scaff_group001 scat_0001.48 NW_023618484.1 1 64152 4963589 +scaff_group001 scat_0001.49 NW_023618485.1 1 49333 4963589 +scaff_group001 scat_0001.50 NW_023618486.1 1 48336 4963589 +scaff_group001 scat_0001.51 NW_023618487.1 1 48680 4963589 +scaff_group001 scat_0001.52 NW_023618488.1 1 48018 4963589 +scaff_group001 scat_0001.53 NW_023618489.1 1 47477 4963589 +scaff_group001 scat_0001.54 NW_023618490.1 1 46746 4963589 +scaff_group001 scat_0001.55 NW_023618491.1 1 46882 4963589 +scaff_group001 scat_0001.56 NW_023618492.1 1 46310 4963589 +scaff_group001 scat_0001.57 NW_023618493.1 1 43696 4963589 +scaff_group001 scat_0001.58 NW_023618494.1 1 43652 4963589 +scaff_group001 scat_0001.59 NW_023618495.1 1 42768 4963589 +scaff_group001 scat_0001.60 NW_023618496.1 1 42464 4963589 +scaff_group001 scat_0001.61 NW_023618497.1 1 42129 4963589 +scaff_group001 scat_0001.62 NW_023618498.1 1 41579 4963589 +scaff_group001 scat_0001.63 NW_023618499.1 1 40765 4963589 +scaff_group001 scat_0002 NW_023618500.1 1 40549 2828994 +scaff_group001 scat_0002 NW_023618501.1 1 39165 2828994 +scaff_group001 scat_0002 NW_023618502.1 1 39231 
2828994 +scaff_group001 scat_0002 NW_023618503.1 1 38418 2828994 +scaff_group001 scat_0002 NW_023618504.1 1 38339 2828994 +scaff_group001 scat_0002 NW_023618505.1 1 38270 2828994 +scaff_group001 scat_0002 NW_023618506.1 1 38104 2828994 +scaff_group001 scat_0002 NW_023618507.1 1 36413 2828994 +scaff_group001 scat_0002 NW_023618508.1 1 35189 2828994 +scaff_group001 scat_0002 NW_023618509.1 1 34840 2828994 +scaff_group001 scat_0002 NW_023618510.1 1 34679 2828994 +scaff_group001 scat_0002 NW_023618511.1 1 33985 2828994 +scaff_group001 scat_0002 NW_023618512.1 1 33753 2828994 +scaff_group001 scat_0002 NW_023618513.1 1 32839 2828994 +scaff_group001 scat_0002 NW_023618514.1 1 32835 2828994 +scaff_group001 scat_0002 NW_023618515.1 1 31384 2828994 +scaff_group001 scat_0002 NW_023618516.1 1 31237 2828994 +scaff_group001 scat_0002 NW_023618517.1 1 31102 2828994 +scaff_group001 scat_0002 NW_023618518.1 1 29418 2828994 +scaff_group001 scat_0002 NW_023618519.1 1 27743 2828994 +scaff_group001 scat_0002 NW_023618520.1 1 27412 2828994 +scaff_group001 scat_0002 NW_023618521.1 1 26008 2828994 +scaff_group001 scat_0002 NW_023618522.1 1 25196 2828994 +scaff_group001 scat_0002 NW_023618523.1 1 24512 2828994 +scaff_group001 scat_0002 NW_023618524.1 1 22772 2828994 +scaff_group001 scat_0002 NW_023618525.1 1 22257 2828994 +scaff_group001 scat_0002 NW_023618526.1 1 21389 2828994 +scaff_group001 scat_0002 NW_023618527.1 1 21196 2828994 +scaff_group001 scat_0002 NW_023618528.1 1 20509 2828994 +scaff_group001 scat_0002 NW_023618529.1 1 17830 2828994 +scaff_group001 scat_0002 NW_023618530.1 1 7215 2828994 +scaff_group001 scat_0002 NW_023618531.1 1 3314 2828994 +scaff_group001 scat_0002 NW_023618532.1 1 173666 2828994 +scaff_group001 scat_0002 NW_023618533.1 1 217730 2828994 +scaff_group001 scat_0002 NW_023618534.1 1 182171 2828994 +scaff_group001 scat_0002 NW_023618535.1 1 167199 2828994 +scaff_group001 scat_0002 NW_023618536.1 1 162409 2828994 +scaff_group001 scat_0002 NW_023618537.1 1 155123 
2828994 +scaff_group001 scat_0002 NW_023618538.1 1 151542 2828994 +scaff_group001 scat_0002 NW_023618539.1 1 146466 2828994 +scaff_group001 scat_0002 NW_023618540.1 1 143925 2828994 +scaff_group001 scat_0002 NW_023618541.1 1 137484 2828994 +scaff_group001 scat_0002 NW_023618542.1 1 137558 2828994 +scaff_group001 scat_0002 NW_023618543.1 1 116618 2828994 diff --git a/GoogleCloud/mega-non-model-wgs-snakeflow/config/units.tsv b/GoogleCloud/mega-non-model-wgs-snakeflow/config/units.tsv new file mode 100644 index 0000000..e90f4f0 --- /dev/null +++ b/GoogleCloud/mega-non-model-wgs-snakeflow/config/units.tsv @@ -0,0 +1,44 @@ +sample unit library flowcell platform lane sample_id barcode fq1 fq2 kb1 kb2 +111_31 1 lib1 whatevs ILLUMINA 1 111_31 111_31 data/111_31_R1.fastq.gz data/111_31_R2.fastq.gz 1693921.631 1888278.541 +111_33 1 lib1 whatevs ILLUMINA 1 111_33 111_33 data/111_33_R1.fastq.gz data/111_33_R2.fastq.gz 1765908.38 1780170.862 +111_34 1 lib1 whatevs ILLUMINA 1 111_34 111_34 data/111_34_R1.fastq.gz data/111_34_R2.fastq.gz 1530198.425 1672207.299 +111_35 1 lib1 whatevs ILLUMINA 1 111_35 111_35 data/111_35_R1.fastq.gz data/111_35_R2.fastq.gz 2126581.937 2533394.433 +111_55 1 lib1 whatevs ILLUMINA 1 111_55 111_55 data/111_55_R1.fastq.gz data/111_55_R2.fastq.gz 1998278.053 2231404.085 +111_56 1 lib1 whatevs ILLUMINA 1 111_56 111_56 data/111_56_R1.fastq.gz data/111_56_R2.fastq.gz 2463275.861 2947696.793 +111_57 1 lib1 whatevs ILLUMINA 1 111_57 111_57 data/111_57_R1.fastq.gz data/111_57_R2.fastq.gz 2021582.016 2329525.83 +18_37 1 lib1 whatevs ILLUMINA 1 18_37 18_37 data/18_37_R1.fastq.gz data/18_37_R2.fastq.gz 2244852.189 2591387.032 +18_38 1 lib1 whatevs ILLUMINA 1 18_38 18_38 data/18_38_R1.fastq.gz data/18_38_R2.fastq.gz 3136391.175 2726511.546 +221_49 1 lib1 whatevs ILLUMINA 1 221_49 221_49 data/221_49_R1.fastq.gz data/221_49_R2.fastq.gz 1753876.524 1984610.355 +221_50 1 lib1 whatevs ILLUMINA 1 221_50 221_50 data/221_50_R1.fastq.gz data/221_50_R2.fastq.gz 2978285.352 
2388413.84 +238_55 1 lib1 whatevs ILLUMINA 1 238_55 238_55 data/238_55_R1.fastq.gz data/238_55_R2.fastq.gz 1598702.763 1843382.406 +240_12 1 lib1 whatevs ILLUMINA 1 240_12 240_12 data/240_12_R1.fastq.gz data/240_12_R2.fastq.gz 1250295.467 1193329.391 +240_16 1 lib1 whatevs ILLUMINA 1 240_16 240_16 data/240_16_R1.fastq.gz data/240_16_R2.fastq.gz 2978313.528 2896319.632 +240_17 1 lib1 whatevs ILLUMINA 1 240_17 240_17 data/240_17_R1.fastq.gz data/240_17_R2.fastq.gz 1710870.26 2010172.805 +336_15 1 lib1 whatevs ILLUMINA 1 336_15 336_15 data/336_15_R1.fastq.gz data/336_15_R2.fastq.gz 1553853.147 1810588.894 +336_16 1 lib1 whatevs ILLUMINA 1 336_16 336_16 data/336_16_R1.fastq.gz data/336_16_R2.fastq.gz 1396722.709 1359621.872 +336_31 1 lib1 whatevs ILLUMINA 1 336_31 336_31 data/336_31_R1.fastq.gz data/336_31_R2.fastq.gz 1437762.797 1685443.433 +336_36 1 lib1 whatevs ILLUMINA 1 336_36 336_36 data/336_36_R1.fastq.gz data/336_36_R2.fastq.gz 1414805.613 1655513.266 +336_38 1 lib1 whatevs ILLUMINA 1 336_38 336_38 data/336_38_R1.fastq.gz data/336_38_R2.fastq.gz 1808386.968 2183086.325 +336_55 1 lib1 whatevs ILLUMINA 1 336_55 336_55 data/336_55_R1.fastq.gz data/336_55_R2.fastq.gz 2489536.105 2936176.938 +336_56 1 lib1 whatevs ILLUMINA 1 336_56 336_56 data/336_56_R1.fastq.gz data/336_56_R2.fastq.gz 1216287.746 1196769.05 +336_70 1 lib1 whatevs ILLUMINA 1 336_70 336_70 data/336_70_R1.fastq.gz data/336_70_R2.fastq.gz 2351028.128 2743010.741 +51_71 1 lib1 whatevs ILLUMINA 1 51_71 51_71 data/51_71_R1.fastq.gz data/51_71_R2.fastq.gz 2804494.578 3247021.488 +51_72 1 lib1 whatevs ILLUMINA 1 51_72 51_72 data/51_72_R1.fastq.gz data/51_72_R2.fastq.gz 2341439.407 2695547.308 +51_73 1 lib1 whatevs ILLUMINA 1 51_73 51_73 data/51_73_R1.fastq.gz data/51_73_R2.fastq.gz 1381225.801 1570612.867 +51_74 1 lib1 whatevs ILLUMINA 1 51_74 51_74 data/51_74_R1.fastq.gz data/51_74_R2.fastq.gz 2197938.542 2550664.792 +51_75 1 lib1 whatevs ILLUMINA 1 51_75 51_75 data/51_75_R1.fastq.gz data/51_75_R2.fastq.gz 
2359503.134 2741136.905 +51_76 1 lib1 whatevs ILLUMINA 1 51_76 51_76 data/51_76_R1.fastq.gz data/51_76_R2.fastq.gz 1861500.315 2157536.376 +8_35 1 lib1 whatevs ILLUMINA 1 8_35 8_35 data/8_35_R1.fastq.gz data/8_35_R2.fastq.gz 1205254.846 1188218.778 +8_41 1 lib1 whatevs ILLUMINA 1 8_41 8_41 data/8_41_R1.fastq.gz data/8_41_R2.fastq.gz 1708010.932 1977446.635 +8_43 1 lib1 whatevs ILLUMINA 1 8_43 8_43 data/8_43_R1.fastq.gz data/8_43_R2.fastq.gz 1368282.901 1566076.613 +8_44 1 lib1 whatevs ILLUMINA 1 8_44 8_44 data/8_44_R1.fastq.gz data/8_44_R2.fastq.gz 2890441.549 3380174.3 +8_45 1 lib1 whatevs ILLUMINA 1 8_45 8_45 data/8_45_R1.fastq.gz data/8_45_R2.fastq.gz 1761547.334 1996261.721 +8_47 1 lib1 whatevs ILLUMINA 1 8_47 8_47 data/8_47_R1.fastq.gz data/8_47_R2.fastq.gz 1520024.539 1803399.904 +8_49 1 lib1 whatevs ILLUMINA 1 8_49 8_49 data/8_49_R1.fastq.gz data/8_49_R2.fastq.gz 2399236.94 2699213.422 +8_74 1 lib1 whatevs ILLUMINA 1 8_74 8_74 data/8_74_R1.fastq.gz data/8_74_R2.fastq.gz 1951267.072 2292361.244 +8_75 1 lib1 whatevs ILLUMINA 1 8_75 8_75 data/8_75_R1.fastq.gz data/8_75_R2.fastq.gz 1424747.481 1407468.385 +8_76 1 lib1 whatevs ILLUMINA 1 8_76 8_76 data/8_76_R1.fastq.gz data/8_76_R2.fastq.gz 2209244.084 2441998.085 +8_77 1 lib1 whatevs ILLUMINA 1 8_77 8_77 data/8_77_R1.fastq.gz data/8_77_R2.fastq.gz 2167315.136 2542445.371 +8_78 1 lib1 whatevs ILLUMINA 1 8_78 8_78 data/8_78_R1.fastq.gz data/8_78_R2.fastq.gz 2584928.033 2969979.419 +8_80 1 lib1 whatevs ILLUMINA 1 8_80 8_80 data/8_80_R1.fastq.gz data/8_80_R2.fastq.gz 1945430.29 2200151.228 +8_82 1 lib1 whatevs ILLUMINA 1 8_82 8_82 data/8_82_R1.fastq.gz data/8_82_R2.fastq.gz 1798971.371 2057053.718 diff --git a/GoogleCloud/mega-non-model-wgs-snakeflow/hpcc-profiles/.DS_Store b/GoogleCloud/mega-non-model-wgs-snakeflow/hpcc-profiles/.DS_Store new file mode 100644 index 0000000..4a878e4 Binary files /dev/null and b/GoogleCloud/mega-non-model-wgs-snakeflow/hpcc-profiles/.DS_Store differ diff --git 
a/GoogleCloud/mega-non-model-wgs-snakeflow/hpcc-profiles/slurm/alpine/config.yaml b/GoogleCloud/mega-non-model-wgs-snakeflow/hpcc-profiles/slurm/alpine/config.yaml new file mode 100644 index 0000000..878877e --- /dev/null +++ b/GoogleCloud/mega-non-model-wgs-snakeflow/hpcc-profiles/slurm/alpine/config.yaml @@ -0,0 +1,60 @@ +cluster: + mkdir -p results/slurm_logs/{rule} && + sbatch + --partition=amilan,csu + --cpus-per-task={threads} + --mem={resources.mem_mb} + --time={resources.time} + --job-name=smk-{rule}-{wildcards} + --output=results/slurm_logs/{rule}/{rule}-{wildcards}-%j.out + --error=results/slurm_logs/{rule}/{rule}-{wildcards}-%j.err + --parsable +default-resources: + - time="08:00:00" + - mem_mb=3740 + - tmpdir="results/snake-tmp" +restart-times: 0 +max-jobs-per-second: 10 +max-status-checks-per-second: 50 +local-cores: 1 +latency-wait: 60 +cores: 2400 +jobs: 950 +keep-going: True +rerun-incomplete: True +printshellcmds: True +use-conda: True +rerun-trigger: mtime +cluster-status: status-sacct-robust.sh +cluster-cancel: scancel +cluster-cancel-nargs: 4000 + + +set-threads: + map_reads: 4 + realigner_target_creator: 4 + genomics_db_import_chromosomes: 2 + genomics_db_import_scaffold_groups: 2 + genomics_db2vcf_scattered: 2 +set-resources: + map_reads: + mem_mb: 14960 + time: "23:59:59" + make_gvcf_sections: + mem_mb: 3600 + time: "23:59:59" + genomics_db_import_chromosomes: + mem_mb: 7480 + time: "23:59:59" + genomics_db_import_scaffold_groups: + mem_mb: 11000 + time: "23:59:59" + genomics_db2vcf_scattered: + mem_mb: 11000 + time: "23:59:59" + multiqc_dir: + mem_mb: 37000 + bwa_index: + mem_mb: 37000 + realigner_target_creator: + mem_mb: 14960 \ No newline at end of file diff --git a/GoogleCloud/mega-non-model-wgs-snakeflow/hpcc-profiles/slurm/alpine/status-sacct-robust.sh b/GoogleCloud/mega-non-model-wgs-snakeflow/hpcc-profiles/slurm/alpine/status-sacct-robust.sh new file mode 100644 index 0000000..eddc323 --- /dev/null +++ 
b/GoogleCloud/mega-non-model-wgs-snakeflow/hpcc-profiles/slurm/alpine/status-sacct-robust.sh @@ -0,0 +1,45 @@ +#!/usr/bin/env bash + +# Check status of Slurm job. More robust because runs `sacct` multiple times if +# needed + +jobid="$1" + +if [[ "$jobid" == Submitted ]] +then + echo smk-simple-slurm: Invalid job ID: "$jobid" >&2 + echo smk-simple-slurm: Did you remember to add the flag --parsable to your sbatch call? >&2 + exit 1 +fi + +function get_status(){ + sacct -j "$1" --format State --noheader | head -n 1 | awk '{print $1}' +} + +for i in {1..20} +do + output=`get_status "$jobid"` + if [[ ! -z $output ]] + then + break + else + sleep 3 + fi +done + +if [[ -z $output ]] +then + echo sacct failed to return the status for jobid "$jobid" >&2 + echo Maybe you need to use scontrol instead? >&2 + exit 1 +fi + +if [[ $output =~ ^(COMPLETED).* ]] +then + echo success +elif [[ $output =~ ^(RUNNING|PENDING|COMPLETING|CONFIGURING|SUSPENDED).* ]] +then + echo running +else + echo failed +fi diff --git a/GoogleCloud/mega-non-model-wgs-snakeflow/hpcc-profiles/slurm/azhop5/config.yaml b/GoogleCloud/mega-non-model-wgs-snakeflow/hpcc-profiles/slurm/azhop5/config.yaml new file mode 100644 index 0000000..26d1c50 --- /dev/null +++ b/GoogleCloud/mega-non-model-wgs-snakeflow/hpcc-profiles/slurm/azhop5/config.yaml @@ -0,0 +1,59 @@ +cluster: + mkdir -p results/slurm_logs/{rule} && + sbatch + --partition=execute + --cpus-per-task={threads} + --mem={resources.mem_mb} + --time={resources.time} + --job-name=smk-{rule}-{wildcards} + --output=results/slurm_logs/{rule}/{rule}-{wildcards}-%j.out + --error=results/slurm_logs/{rule}/{rule}-{wildcards}-%j.err + --parsable +default-resources: + - time="08:00:00" + - mem_mb=3700 +restart-times: 0 +max-jobs-per-second: 10 +max-status-checks-per-second: 50 +local-cores: 1 +latency-wait: 60 +cores: 320 +jobs: 300 +keep-going: True +rerun-incomplete: True +printshellcmds: True +use-conda: True +rerun-trigger: mtime +cluster-status: 
status-sacct-robust.sh +cluster-cancel: scancel +cluster-cancel-nargs: 4000 + + +set-threads: + map_reads: 4 + realigner_target_creator: 4 + genomics_db_import_chromosomes: 2 + genomics_db_import_scaffold_groups: 2 + genomics_db2vcf_scattered: 2 +set-resources: + map_reads: + mem_mb: 14800 + time: "23:59:59" + make_gvcf_sections: + mem_mb: 3700 + time: "23:59:59" + genomics_db_import_chromosomes: + mem_mb: 7400 + time: "23:59:59" + genomics_db_import_scaffold_groups: + mem_mb: 11100 + time: "23:59:59" + genomics_db2vcf_scattered: + mem_mb: 11100 + time: "23:59:59" + multiqc_dir: + mem_mb: 37000 + bwa_index: + mem_mb: 37000 + realigner_target_creator: + mem_mb: 14960 \ No newline at end of file diff --git a/GoogleCloud/mega-non-model-wgs-snakeflow/hpcc-profiles/slurm/azhop5/status-sacct-robust.sh b/GoogleCloud/mega-non-model-wgs-snakeflow/hpcc-profiles/slurm/azhop5/status-sacct-robust.sh new file mode 100644 index 0000000..eddc323 --- /dev/null +++ b/GoogleCloud/mega-non-model-wgs-snakeflow/hpcc-profiles/slurm/azhop5/status-sacct-robust.sh @@ -0,0 +1,45 @@ +#!/usr/bin/env bash + +# Check status of Slurm job. More robust because runs `sacct` multiple times if +# needed + +jobid="$1" + +if [[ "$jobid" == Submitted ]] +then + echo smk-simple-slurm: Invalid job ID: "$jobid" >&2 + echo smk-simple-slurm: Did you remember to add the flag --parsable to your sbatch call? >&2 + exit 1 +fi + +function get_status(){ + sacct -j "$1" --format State --noheader | head -n 1 | awk '{print $1}' +} + +for i in {1..20} +do + output=`get_status "$jobid"` + if [[ ! -z $output ]] + then + break + else + sleep 3 + fi +done + +if [[ -z $output ]] +then + echo sacct failed to return the status for jobid "$jobid" >&2 + echo Maybe you need to use scontrol instead? 
>&2 + exit 1 +fi + +if [[ $output =~ ^(COMPLETED).* ]] +then + echo success +elif [[ $output =~ ^(RUNNING|PENDING|COMPLETING|CONFIGURING|SUSPENDED).* ]] +then + echo running +else + echo failed +fi diff --git a/GoogleCloud/mega-non-model-wgs-snakeflow/hpcc-profiles/slurm/google/config.yaml b/GoogleCloud/mega-non-model-wgs-snakeflow/hpcc-profiles/slurm/google/config.yaml new file mode 100644 index 0000000..a50d1be --- /dev/null +++ b/GoogleCloud/mega-non-model-wgs-snakeflow/hpcc-profiles/slurm/google/config.yaml @@ -0,0 +1,29 @@ +cluster: + mkdir -p results/slurm_logs/{rule} && + sbatch + --partition=compute + --cpus-per-task={threads} + --mem={resources.mem_mb} + --time={resources.time} + --job-name=smk-{rule}-{wildcards} + --output=results/slurm_logs/{rule}/{rule}-{wildcards}-%j.out + --error=results/slurm_logs/{rule}/{rule}-{wildcards}-%j.err + --parsable +default-resources: + - time="08:00:00" + - mem_mb=4800 +restart-times: 0 +max-jobs-per-second: 10 +max-status-checks-per-second: 50 +local-cores: 1 +latency-wait: 60 +cores: 600 +jobs: 1200 +keep-going: True +rerun-incomplete: True +printshellcmds: True +use-conda: True +#cluster-status: status-sacct-robust.sh +#cluster-cancel: scancel +#cluster-cancel-nargs: 1000 + diff --git a/GoogleCloud/mega-non-model-wgs-snakeflow/hpcc-profiles/slurm/google/status-sacct-robust.sh b/GoogleCloud/mega-non-model-wgs-snakeflow/hpcc-profiles/slurm/google/status-sacct-robust.sh new file mode 100644 index 0000000..eddc323 --- /dev/null +++ b/GoogleCloud/mega-non-model-wgs-snakeflow/hpcc-profiles/slurm/google/status-sacct-robust.sh @@ -0,0 +1,45 @@ +#!/usr/bin/env bash + +# Check status of Slurm job. More robust because runs `sacct` multiple times if +# needed + +jobid="$1" + +if [[ "$jobid" == Submitted ]] +then + echo smk-simple-slurm: Invalid job ID: "$jobid" >&2 + echo smk-simple-slurm: Did you remember to add the flag --parsable to your sbatch call? 
>&2 + exit 1 +fi + +function get_status(){ + sacct -j "$1" --format State --noheader | head -n 1 | awk '{print $1}' +} + +for i in {1..20} +do + output=`get_status "$jobid"` + if [[ ! -z $output ]] + then + break + else + sleep 3 + fi +done + +if [[ -z $output ]] +then + echo sacct failed to return the status for jobid "$jobid" >&2 + echo Maybe you need to use scontrol instead? >&2 + exit 1 +fi + +if [[ $output =~ ^(COMPLETED).* ]] +then + echo success +elif [[ $output =~ ^(RUNNING|PENDING|COMPLETING|CONFIGURING|SUSPENDED).* ]] +then + echo running +else + echo failed +fi diff --git a/GoogleCloud/mega-non-model-wgs-snakeflow/hpcc-profiles/slurm/sedna-deeper-seq/config.yaml b/GoogleCloud/mega-non-model-wgs-snakeflow/hpcc-profiles/slurm/sedna-deeper-seq/config.yaml new file mode 100644 index 0000000..af0cd19 --- /dev/null +++ b/GoogleCloud/mega-non-model-wgs-snakeflow/hpcc-profiles/slurm/sedna-deeper-seq/config.yaml @@ -0,0 +1,43 @@ +cluster: + mkdir -p results/slurm_logs/{rule} && + sbatch + --exclude=node[29-36] + --cpus-per-task={threads} + --mem={resources.mem_mb} + --time={resources.time} + --job-name=smk-{rule}-{wildcards} + --output=results/slurm_logs/{rule}/{rule}-{wildcards}-%j.out + --error=results/slurm_logs/{rule}/{rule}-{wildcards}-%j.err + --parsable +default-resources: + - time="08:00:00" + - mem_mb=4800 +restart-times: 0 +max-jobs-per-second: 10 +max-status-checks-per-second: 50 +local-cores: 1 +latency-wait: 60 +cores: 600 +jobs: 1200 +keep-going: True +rerun-incomplete: True +printshellcmds: True +use-conda: True +cluster-status: status-sacct-robust.sh +cluster-cancel: scancel +cluster-cancel-nargs: 1000 + +set-threads: + map_reads: 20 + trim_reads_pe: 5 +set-resources: + map_reads: + mem_mb: 94000 + time: "7-00:00:00" + mark_duplicates: + mem_mb: 47000 + time: "7-00:00:00" + trim_reads_pe: + mem_mb: 23500 + time: "7-00:00:00" + diff --git a/GoogleCloud/mega-non-model-wgs-snakeflow/hpcc-profiles/slurm/sedna-deeper-seq/status-sacct-robust.sh 
b/GoogleCloud/mega-non-model-wgs-snakeflow/hpcc-profiles/slurm/sedna-deeper-seq/status-sacct-robust.sh new file mode 100644 index 0000000..eddc323 --- /dev/null +++ b/GoogleCloud/mega-non-model-wgs-snakeflow/hpcc-profiles/slurm/sedna-deeper-seq/status-sacct-robust.sh @@ -0,0 +1,45 @@ +#!/usr/bin/env bash + +# Check status of Slurm job. More robust because runs `sacct` multiple times if +# needed + +jobid="$1" + +if [[ "$jobid" == Submitted ]] +then + echo smk-simple-slurm: Invalid job ID: "$jobid" >&2 + echo smk-simple-slurm: Did you remember to add the flag --parsable to your sbatch call? >&2 + exit 1 +fi + +function get_status(){ + sacct -j "$1" --format State --noheader | head -n 1 | awk '{print $1}' +} + +for i in {1..20} +do + output=`get_status "$jobid"` + if [[ ! -z $output ]] + then + break + else + sleep 3 + fi +done + +if [[ -z $output ]] +then + echo sacct failed to return the status for jobid "$jobid" >&2 + echo Maybe you need to use scontrol instead? >&2 + exit 1 +fi + +if [[ $output =~ ^(COMPLETED).* ]] +then + echo success +elif [[ $output =~ ^(RUNNING|PENDING|COMPLETING|CONFIGURING|SUSPENDED).* ]] +then + echo running +else + echo failed +fi diff --git a/GoogleCloud/mega-non-model-wgs-snakeflow/hpcc-profiles/slurm/sedna/config.yaml b/GoogleCloud/mega-non-model-wgs-snakeflow/hpcc-profiles/slurm/sedna/config.yaml new file mode 100644 index 0000000..41da070 --- /dev/null +++ b/GoogleCloud/mega-non-model-wgs-snakeflow/hpcc-profiles/slurm/sedna/config.yaml @@ -0,0 +1,29 @@ +cluster: + mkdir -p results/slurm_logs/{rule} && + sbatch + --partition=compute + --cpus-per-task={threads} + --mem={resources.mem_mb} + --time={resources.time} + --job-name=smk-{rule}-{wildcards} + --output=results/slurm_logs/{rule}/{rule}-{wildcards}-%j.out + --error=results/slurm_logs/{rule}/{rule}-{wildcards}-%j.err + --parsable +default-resources: + - time="08:00:00" + - mem_mb=4800 +restart-times: 0 +max-jobs-per-second: 10 +max-status-checks-per-second: 50 +local-cores: 1 
+latency-wait: 60 +cores: 600 +jobs: 1200 +keep-going: True +rerun-incomplete: True +printshellcmds: True +use-conda: True +cluster-status: status-sacct-robust.sh +cluster-cancel: scancel +cluster-cancel-nargs: 1000 + diff --git a/GoogleCloud/mega-non-model-wgs-snakeflow/hpcc-profiles/slurm/sedna/status-sacct-robust.sh b/GoogleCloud/mega-non-model-wgs-snakeflow/hpcc-profiles/slurm/sedna/status-sacct-robust.sh new file mode 100644 index 0000000..eddc323 --- /dev/null +++ b/GoogleCloud/mega-non-model-wgs-snakeflow/hpcc-profiles/slurm/sedna/status-sacct-robust.sh @@ -0,0 +1,45 @@ +#!/usr/bin/env bash + +# Check status of Slurm job. More robust because runs `sacct` multiple times if +# needed + +jobid="$1" + +if [[ "$jobid" == Submitted ]] +then + echo smk-simple-slurm: Invalid job ID: "$jobid" >&2 + echo smk-simple-slurm: Did you remember to add the flag --parsable to your sbatch call? >&2 + exit 1 +fi + +function get_status(){ + sacct -j "$1" --format State --noheader | head -n 1 | awk '{print $1}' +} + +for i in {1..20} +do + output=`get_status "$jobid"` + if [[ ! -z $output ]] + then + break + else + sleep 3 + fi +done + +if [[ -z $output ]] +then + echo sacct failed to return the status for jobid "$jobid" >&2 + echo Maybe you need to use scontrol instead? 
>&2 + exit 1 +fi + +if [[ $output =~ ^(COMPLETED).* ]] +then + echo success +elif [[ $output =~ ^(RUNNING|PENDING|COMPLETING|CONFIGURING|SUSPENDED).* ]] +then + echo running +else + echo failed +fi diff --git a/GoogleCloud/mega-non-model-wgs-snakeflow/hpcc-profiles/slurm/summit/config.yaml b/GoogleCloud/mega-non-model-wgs-snakeflow/hpcc-profiles/slurm/summit/config.yaml new file mode 100644 index 0000000..a1a349a --- /dev/null +++ b/GoogleCloud/mega-non-model-wgs-snakeflow/hpcc-profiles/slurm/summit/config.yaml @@ -0,0 +1,26 @@ +cluster: + mkdir -p results/slurm_logs/{rule} && + sbatch + --partition={resources.partition} + --qos={resources.qos} + --cpus-per-task={threads} + --mem={resources.mem_mb} + --time={resources.time} + --job-name=smk-{rule}-{wildcards} + --output=results/slurm_logs/{rule}/{rule}-{wildcards}-%j.out + --error=results/slurm_logs/{rule}/{rule}-{wildcards}-%j.err +default-resources: + - partition=shas + - qos=normal + - mem_mb=4700 + - time="01:00:00" +restart-times: 0 +max-jobs-per-second: 10 +max-status-checks-per-second: 1 +local-cores: 1 +latency-wait: 60 +jobs: 500 +keep-going: True +rerun-incomplete: True +printshellcmds: True +use-conda: True \ No newline at end of file diff --git a/GoogleCloud/tourmaline/config.yaml b/GoogleCloud/tourmaline/config.yaml new file mode 100644 index 0000000..8db9524 --- /dev/null +++ b/GoogleCloud/tourmaline/config.yaml @@ -0,0 +1,236 @@ +# config.yaml - configuration file for the Tourmaline Snakemake workflow +# Compatible with qiime2-2023.5 +# User MUST edit these parameters before running their own data. +# Detailed instructions: https://github.com/aomlomics/tourmaline/wiki. 
+ +# METADATA FILE + +# Metadata file must be named as follows (or use symbolic link): +# - 00-data/metadata.tsv + +# Standardization is recommended through use of MIxS/MIMARKS (https://gensc.org/mixs/) column headers, for example: +# - submitted_to_insdc: {boolean} +# - investigation_type: [eukaryote|bacteria_archaea|plasmid|virus|organelle|metagenome|metatranscriptome|mimarks-survey|mimarks-specimen|misag|mimag|miuvig] +# - project_name: {text} +# - experimental_factor (text or EFO and/or OBI): {termLabel} {[termID]}|{text} +# - lat_lon (decimal degrees): {float} {float} + +# FASTQ DATA (choose one option) + +# Option 1 - if you want to start from manifest files and import per-sample fastq sequences, +# Name your manifest file(s) as follows (or use symbolic links): +# - paired-end sequences: 00-data/manifest_pe.csv +# - single-end sequences: 00-data/manifest_se.csv + +# Option 2 - if your fastq sequences are already archived in qza format, +# Name your qza file(s) as follows (or use symbolic links): +# - paired-end sequences: 01-imported/fastq_pe.qza +# - single-end sequences: 01-imported/fastq_se.qza + +# REFERENCE DATABASE (choose one option) + +# Option 1 - if your reference database is not yet imported, +# Name your reference fna and tsv files as follows (or use symbolic links): +# - reference sequences: 00-data/refseqs.fna +# - reference taxonomy: 00-data/reftax.tsv + +# Option 2 - if your reference database is already archived in qza format, +# Name your qza files as follows (or use symbolic links): +# - reference sequences: 01-imported/refseqs.qza +# - reference taxonomy: 01-imported/reftax.qza + +# Provide a descriptive name for the reference database used, no spaces (e.g., original file name, source and version) +database_name: silva-138-99-515-806_q2-2021.2 + +# DENOISE + +# DADA2 PAIRED-END +# For more info run: qiime dada2 denoise-paired --help + +dada2pe_trunc_len_f: 240 +dada2pe_trunc_len_r: 190 +dada2pe_trim_left_f: 0 +dada2pe_trim_left_r: 0 
+dada2pe_max_ee_f: 2 +dada2pe_max_ee_r: 2 +dada2pe_trunc_q: 2 +dada2pe_pooling_method: independent +dada2pe_chimera_method: consensus +dada2pe_min_fold_parent_over_abundance: 1 +dada2pe_n_reads_learn: 1000000 +dada2pe_hashed_feature_ids: --p-hashed-feature-ids + +# DADA2 SINGLE-END +# For more info run: qiime dada2 denoise-single --help + +dada2se_trunc_len: 240 +dada2se_trim_left: 0 +dada2se_max_ee: 2 +dada2se_trunc_q: 2 +dada2se_pooling_method: independent +dada2se_chimera_method: consensus +dada2se_min_fold_parent_over_abundance: 1 +dada2se_n_reads_learn: 1000000 +dada2se_hashed_feature_ids: --p-hashed-feature-ids + +# DEBLUR SINGLE-END +# For more info run: qiime deblur denoise-other --help + +deblur_trim_length: 240 +deblur_sample_stats: --p-sample-stats +deblur_mean_error: 0.005 +deblur_indel_prob: 0.01 +deblur_indel_max: 3 +deblur_min_reads: 10 +deblur_min_size: 2 +deblur_hashed_feature_ids: --p-hashed-feature-ids + +# TAXONOMIC CLASSIFICATION + +# Taxonomic classification of the representative sequences. +# - method: choose from: naive-bayes, consensus-blast, consensus-vsearch +# - parameters: add arbitrary parameters or at least one parameter; see "qiime feature-classifier COMMAND --help" +# For more info run: qiime feature-classifier [fit-classifier-naive-bayes|classify-sklearn|classify-consensus-blast|classify-consensus-vsearch] --help +classify_method: consensus-vsearch +classify_parameters: --verbose + +# Table of feature counts per sample collapsed by chosen taxonomic level. +# taxa_level - taxonomic level at which the features should be collapsed. Default 7 +# For more info run: qiime taxa collapse --help +classify_taxalevel: 7 + +# MULTIPLE SEQUENCE ALIGNMENT + +# Multiple sequence alignment of the representative sequences. 
+# - method: choose from: muscle v5, clustalo, mafft +# - muscle_iters: number of refinement iterations (integer, default 100) +# For more info run: muscle, clustalo --help, qiime alignment [mafft|mask] --help + +alignment_method: muscle +alignment_muscle_iters: 50 + +# OUTLIER DETECTION + +# Representative sequence outlier detection using odseq. +# - distance_metric: choose from: linear, affine +# - bootstrap_replicates: number of bootstrap replicates (integer) +# - threshold: probability to be at right of the bootstrap scores distribution (float) +# For more info see: https://www.bioconductor.org/packages/release/bioc/html/odseq.html + +odseq_distance_metric: linear +odseq_bootstrap_replicates: 100 +odseq_threshold: 0.025 + +# SUBSAMPLING (RAREFACTION) + +# Subsample (rarefy) data to have an even number of observations per sample. +# - core_sampling_depth: subsampling depth for core diversity analyses +# - alpha_max_depth: maximum subsampling depth for alpha diversity rarefaction analysis + +core_sampling_depth: 500 +alpha_max_depth: 500 + +# BETA GROUP SIGNIFICANCE + +# Statistical test for difference between samples grouped by a metadata variable (column). +# - column: metadata column to test beta group significance; this column must appear in the metadata file 00-data/metadata.tsv +# - method: choose from: permanova, anosim, permdisp +# - pairwise: choose from: --p-no-pairwise, --p-pairwise (can be very slow) + +beta_group_column: region +beta_group_method: permanova +beta_group_pairwise: --p-pairwise + +# DEICODE BETA DIVERSITY + +# Robust Aitchison PCA (and biplot ordination) with automatically estimated underlying low-rank structure. 
+# Parameters are set to recommended defaults: +# - min_sample_count: minimum sum cutoff of samples across all features +# - min_feature_count: minimum sum cutoff of features across all samples +# - min_feature_frequency: minimum percentage of samples a feature must appear with value greater than zero +# - max_iterations: number of iterations to optimize the solution +# - num_features: number of most important features (arrows) to display in the biplot ordination +# For more info run: qiime deicode auto-rpca --help + +deicode_min_sample_count: 500 +deicode_min_feature_count: 10 +deicode_min_feature_frequency: 0 +deicode_max_iterations: 5 +deicode_num_features: 5 + +# REPORT THEME + +# Report theme for html version. +# Choose from: github, gothic, newsprint, night, pixyll, whitey. + +report_theme: github + +# FILTERING + +# Filtering is implemented by filtering commands only, eg "snakemake dada2_pe_report_filtered", and is applied to each of +# representative sequences, taxonomy, and feature table. + +# FILTER SAMPLES BY ID +# The file of sample IDs to be removed must have the sample IDs in the 1st column with a header line. Any number of additional columns can be added. +# The file must be named 00-data/samples_to_filter_{method}.tsv, where method is dada2-pe, dada2-se, or deblur-se. +# For more info run: qiime feature-table filter-samples --help +# TO SKIP FILTERING BY SAMPLE ID: provide a file with only headers (default) or don't run filtering commands. + +# FILTER SAMPLES BY METADATA +# Samples can be removed or retained based on their metadata, using SQLite WHERE-clause syntax inside double quotes. +# For more info run: qiime feature-table filter-samples --help +# TO SKIP FILTERING BY SAMPLE METADATA: provide "none" (default) or don't run filtering commands. +# EXAMPLE: "[region]='Open Water'" +metadata_filter: none + +# FILTER BY FEATURE ID +# The file of feature IDs to be filtered must have a header line with two columns: 1. "featureid", 2. anything. 
+# The file must be named 00-data/repseqs_to_filter_{method}.tsv, where method is dada2-pe, dada2-se, or deblur-se. +# If filtering outliers, just copy 02-output-{method}-{filter}/02-alignment-tree/repseqs_to_filter_outliers.tsv to the above filename. +# Add any additional feature IDs to be filtered (no duplicates allowed). +# For more info run: qiime feature-table [filter-seqs|filter-features] --help +# TO SKIP FILTERING BY FEATURE ID: provide only nonsense feature IDs in the above files (default) or don't run filtering commands. + +# FILTER BY TAXONOMY +# Features with taxonomy containing these terms will be filtered. +# Separate terms with commas (e.g., mitochondria,chloroplast,eukaryota,unassigned). +# Terms are not case-sensitive. +# For more info run: qiime taxa filter-seqs --help +# TO SKIP FILTERING BY TAXONOMY: provide a nonsense term or don't run filtering commands. + +exclude_terms: eukaryota,archaea,mitochondria,chloroplast,unassigned + +# FILTER BY LENGTH +# Set minimum and maximum sequence lengths to filter representative sequences by. +# Limits are inclusive, ie, greater than or equal to minimum, less than or equal to maximum. +# For more info run: qiime feature-table filter-seqs --help +# TO SKIP FILTERING BY LENGTH: set values to extreme values, eg (0, 10000) or don't run filtering commands. + +repseq_min_length: 0 +repseq_max_length: 260 + +# FILTER BY ABUNDANCE & PREVALENCE +# Set minimum abundance/prevalence limits for filtering. +# Values are floats range(0,1) +# Limit is inclusive, ie, greater than or equal to minimum. Samples with frequency of 0 after filtering will also be removed. +# For more info run: qiime feature-table filter-features-conditionally --help +# TO SKIP FILTERING BY ABUNDANCE/PREVALENCE: set values to 0 or don't run filtering commands. + +repseq_min_abundance: 0.01 +repseq_min_prevalence: 0.1 + +# THREADS + +# Max number of threads for individual rules. 
#!/bin/bash

#SBATCH --job-name=Trinity
#SBATCH -c 30
#SBATCH --mem=200G
#SBATCH -t 0
#SBATCH --mail-type=ALL

#SBATCH -p compute
#SBATCH --nodelist=hpcsmall-compute-ghpc-19

# Slurm batch job: run Trinity (with its built-in Trimmomatic step) on every
# *.R1.fq.gz / *.R2.fq.gz file found in ${IN}, writing the assembly and a
# combined stdout/stderr log under ${OUT}.
#
# Exits 1 before Trinity is started when no input files are found or when the
# R1 and R2 lists differ in length.
#
# NOTE(review): the job requests 30 CPUs (-c 30) but Trinity is given
#   --CPU 20 -- confirm whether the 10-core gap is intentional.
# NOTE(review): --max_memory equals the whole --mem allocation (200G), which
#   leaves no headroom for anything else in the job -- confirm.

# so we can use conda with the scheduler
source ~/.bashrc

# load the trinity environment
mamba activate trinity-2.15.1

VER=1

# Setup
BASE=/home/giles_goetz_noaa_gov/testing/trinity
IN="${BASE}/raw"
OUT="${BASE}/trinity/trinity.all.v${VER}"

# Trimmomatic settings, handed to Trinity via --quality_trimming_params
ADAPTER_FILE="${BASE}/TruSeq3-PE-2.fa"
TRIM_STRING="ILLUMINACLIP:${ADAPTER_FILE}"
TRIM_STRING+=":2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:20 MINLEN:50"

# -p already succeeds when the directory exists, so no separate -d test.
mkdir -p "${OUT}"

# Print all arguments joined by commas (no trailing comma).
join_by_comma() {
    local IFS=','
    printf '%s' "$*"
}

# Collect the read files with globs instead of parsing `ls`; nullglob makes
# an unmatched pattern expand to an empty array rather than to itself.
shopt -s nullglob
r1_files=("${IN}"/*.R1.fq.gz)
r2_files=("${IN}"/*.R2.fq.gz)
shopt -u nullglob

if (( ${#r1_files[@]} == 0 )); then
    echo "No *.R1.fq.gz files found in ${IN}" >&2
    exit 1
fi

# The two lists are matched up by position (both globs expand in sorted
# order), so they must at least be the same length.
if (( ${#r1_files[@]} != ${#r2_files[@]} )); then
    echo "Found ${#r1_files[@]} R1 files but ${#r2_files[@]} R2 files in ${IN}" >&2
    exit 1
fi

FILES_R1=$(join_by_comma "${r1_files[@]}")
FILES_R2=$(join_by_comma "${r2_files[@]}")

# Running Trinity, setting a really high memory limit since
# we don't know how much it is going to use.
Trinity \
    --seqType fq \
    --CPU 20 \
    --output "${OUT}" \
    --max_memory 200G \
    --left "${FILES_R1}" \
    --right "${FILES_R2}" \
    --trimmomatic \
    --quality_trimming_params "${TRIM_STRING}" \
    &> "${OUT}/trinity.v${VER}.log"