diff --git a/FAQ.md b/FAQ.md index f50509f..40c64f1 100644 --- a/FAQ.md +++ b/FAQ.md @@ -4,7 +4,7 @@ ______________________ #### *VEBA* has so many modules and capabilities, how can I get a feel for how to use it for my dataset(s)? -Check out the [walkthroughs](https://github.com/jolespin/veba/tree/main/walkthroughs) where there are step-by-step workflows for different types of data. +Check out the [walkthroughs](https://github.com/jolespin/veba/tree/main/walkthroughs) where there are step-by-step workflows for different types of data. For a visual walkthrough of the modules, watch the [Getting started with VEBA](https://www.youtube.com/watch?v=pqrIffWNuug) YouTube video. There are several video tutorials on our [YouTube Channel @VEBA-Multiomics](https://www.youtube.com/@VEBA-Multiomics) covering topics such as how to get started, how to install/configure databases, custom installations/databases, Docker usage on local machines, and the end-to-end walkthrough in real-ish time.
@@ -28,7 +28,7 @@ Since this is more advanced usage, you'll have to go through and comment out the ______________________ -### How can I install just a single module and a subset of the database required for that module? +#### How can I install just a single module and a subset of the database required for that module? This can be done easily with a custom installation. For example, let's say you want to only use the `annotate.py` module. You would go to the [module table](https://github.com/jolespin/veba/blob/main/bin/README.md) to see that `annotate.py` module uses the `VEBA-annotate_env` and the `Annotate` database. Then you would install the custom build as follows: @@ -644,7 +644,7 @@ Are there any large contigs? What's the N50? ______________________ -### How can I use Docker or Singularity to run VEBA? +#### How can I use Docker or Singularity to run VEBA? Check out the [*VEBA* walkthroughs for Docker, Singularity, and AWS](https://github.com/jolespin/veba/tree/main/walkthroughs#containerization-and-aws). diff --git a/README.md b/README.md index f1cdb9f..c37de97 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ [issues-shield]: https://img.shields.io/github/issues/jolespin/veba.svg?style=for-the-badge [issues-url]: https://github.com/jolespin/veba/issues -[![Schematic](images/graphical-abstract.png)](images/Schematic.pdf) +[![Schematic](images/graphical-abstract.png)](images/graphical-abstract.pdf) ### What is VEBA? The *Viral Eukaryotic Bacterial Archaeal* (VEBA) is an open-source software suite developed with all domains of microorganisms as the primary objective (not post hoc adjustments) including prokaryotic, eukaryotic, and viral organisms. VEBA is an end-to-end metagenomics and bioprospecting software suite that can directly recover and analyze eukaryotic and viral genomes in addition to prokaryotic genomes with native support for candidate phyla radiation (CPR). VEBA implements a novel iterative binning procedure and an optional hybrid sample-specific/multi-sample framework that recovers more genomes than non-iterative methods. To optimize the microeukaryotic gene calling and taxonomic classifications, VEBA includes a consensus microeukaryotic database containing protists and fungi compiled from several existing databases. VEBA also provides a unique clustering-based dereplication strategy allowing for sample-specific genomes and proteins to be directly compared across non-overlapping biological samples. VEBA also automates biosynthetic gene cluster identification and novelty scores for bioprospecting. diff --git a/images/Schematic.pdf b/images/Schematic.pdf index 6defa45..2f35471 100644 Binary files a/images/Schematic.pdf and b/images/Schematic.pdf differ diff --git a/images/Schematic.png b/images/Schematic.png index 3e0c963..70f3e91 100644 Binary files a/images/Schematic.png and b/images/Schematic.png differ diff --git a/images/Schematic/Schematic_v1.1.x.gslides b/images/Schematic/Schematic_v1.1.x.gslides index b6cb45e..3878c5a 100644 --- a/images/Schematic/Schematic_v1.1.x.gslides +++ b/images/Schematic/Schematic_v1.1.x.gslides @@ -1 +1 @@ -{"":"WARNING! DO NOT EDIT THIS FILE! ANY CHANGES MADE WILL BE LOST!","doc_id":"1kgxUEfnlyF3h1ieYR2wABSSanKIEUFYrJBDDQUJbLJc","resource_key":"","email":"jol.espinoz@gmail.com"} +{"":"WARNING! DO NOT EDIT THIS FILE! ANY CHANGES MADE WILL BE LOST!","doc_id":"1dPtRoigCmvF6asvqCR3pJAJ-daAUF_tAoYPzXHQ1Edc","resource_key":"","email":"jol.espinoz@gmail.com"} diff --git a/images/Schematic/Schematic_v1.2.0.gslides b/images/Schematic/Schematic_v1.2.0.gslides index 678272c..395eca6 100644 --- a/images/Schematic/Schematic_v1.2.0.gslides +++ b/images/Schematic/Schematic_v1.2.0.gslides @@ -1 +1 @@ -{"":"WARNING! DO NOT EDIT THIS FILE! ANY CHANGES MADE WILL BE LOST!","doc_id":"1wZhXu19GhIcY0E0YscQaCIK6MEPK4nllNFDAeHJtsHw","resource_key":"","email":"jol.espinoz@gmail.com"} +{"":"WARNING! DO NOT EDIT THIS FILE! ANY CHANGES MADE WILL BE LOST!","doc_id":"1MYo9n8YFHYN9UE-jtBSSmVgztkRfnoCMdOCPdHlOehw","resource_key":"","email":"jol.espinoz@gmail.com"} diff --git a/images/Schematic/Schematic_v2.0.0.gslides b/images/Schematic/Schematic_v2.0.0.gslides index f128947..b23e4a8 100644 --- a/images/Schematic/Schematic_v2.0.0.gslides +++ b/images/Schematic/Schematic_v2.0.0.gslides @@ -1 +1 @@ -{"":"WARNING! DO NOT EDIT THIS FILE! ANY CHANGES MADE WILL BE LOST!","doc_id":"1F97THCJYgPImOFqPAeb56k8sGvpGHPUHwUKFfypv8ms","resource_key":"","email":"jol.espinoz@gmail.com"} +{"":"WARNING! DO NOT EDIT THIS FILE! ANY CHANGES MADE WILL BE LOST!","doc_id":"13EHm4cj_U80pumbc2NIeiDpvlkDmh-3mk-iUP6cBRgU","resource_key":"","email":"jol.espinoz@gmail.com"} diff --git a/images/devel/Modules/assembly.md b/images/devel/Modules/assembly.md deleted file mode 100644 index 5c793a0..0000000 --- a/images/devel/Modules/assembly.md +++ /dev/null @@ -1,44 +0,0 @@ - - -```mermaid -%%{init: { "flowchart": { "curve": "linear" } } }%% - -%% Available curve styles include basis, bumpX, bumpY, cardinal, catmullRom, linear, monotoneX, monotoneY, natural, step, stepAfter, and stepBefore. %%% - -graph LR - -subgraph "`**assembly**`" - - %% Programs - METASPADES["metaSPAdes"] - SAMTOOLS["samtools"] - BOWTIE2_INDEX["bowtie2-build"] - BOWTIE2["bowtie2"] - FEATURECOUNTS["featureCounts"] - SEQKIT["seqkit stats"] - - %% inputs - READS[\"cleaned_1/2.fastq.gz"/] - - %% outputs - STATS["statistics.tsv"] - - %% FastP - READS --repair.sh--> METASPADES - METASPADES --> ASSEMBLY["scaffolds.fasta"] - ASSEMBLY --"fasta_to_saf.py"--> SAF["scaffolds.fasta.saf"] - - %% Bowtie2 - ASSEMBLY --> BOWTIE2_INDEX --> INDEX["scaffolds.fasta.*.bt2"] - - READS & INDEX --> BOWTIE2 --> SAMTOOLS --> BAM["mapped.sorted.bam"] - - %% featureCounts - BAM & SAF --> FEATURECOUNTS --> COUNTS["counts.tsv"] - - ASSEMBLY --> SEQKIT --> STATS - -end - - -``` \ No newline at end of file diff --git a/images/devel/Modules/binning-prokaryotic.md b/images/devel/Modules/binning-prokaryotic.md deleted file mode 100644 index d65acc4..0000000 --- a/images/devel/Modules/binning-prokaryotic.md +++ /dev/null @@ -1,65 +0,0 @@ - - -```mermaid -%%{init: { "flowchart": { "curve": "linear" } } }%% - -%% Available curve styles include basis, bumpX, bumpY, cardinal, catmullRom, linear, monotoneX, monotoneY, natural, step, stepAfter, and stepBefore. %%% - -graph TD - - - %% Programs - COVERM["coverm"] - PYRODIGAL["Pyrodigal"] - METABAT2["Metabat2"] - MAXBIN2_107["MaxBin2(MarkerSet=107)"] - MAXBIN2_40["MaxBin2(MarkerSet=40)"] - CONCOCT["CONCOCT"] - DASTOOL["DAS_Tool"] - TIARA["Tiara"] - CHECKM2["CheckM2"] - BARRNAP["barrnap"] - TRNASCANSE["tRNAscan-SE"] - FEATURECOUNTS["featureCounts"] - SEQKIT["seqkit stats"] - - %% inputs - ASSEMBLY["scaffolds.fasta"] - BAM["mapped.sorted.bam"] - - %% outputs - STATS["statistics.tsv"] - - BAM --> COVERM --> COVERAGE["coverage.tsv"] - ASSEMBLY --> PYRODIGAL --> PROTEINS["proteins.fasta"] & CDS["cds.fasta"] & GFF["gene_models.gff"] - -subgraph "`**_N_ iterative binning-prokaryotic**`" - - ASSEMBLY & COVERAGE --> METABAT2 --> MAGS_METABAT["MAGsMetabat2"] - ASSEMBLY & COVERAGE --> MAXBIN2_107 --> MAGS_MAXBIN2_107["MAGsMaxBin2_107"] - ASSEMBLY & COVERAGE --> MAXBIN2_40 --> MAGS_MAXBIN2_40["MAGsMaxBin2_40"] - ASSEMBLY & COVERAGE --> CONCOCT --> MAGS_CONCOCT["MAGsCONCOCT"] - - MAGS_MAXBIN2_107 & MAGS_MAXBIN2_40 & MAGS_CONCOCT & PROTEINS --> DASTOOL - - DASTOOL --> CANDIDATE_MAGS["MAGsCandidate"] - - CANDIDATE_MAGS --> TIARA - TIARA --> MAGS_P["MAGsProkaryotic"] - TIARA --x MAGS_E["MAGsEukaryotic"] - - MAGS_P & PROTEINS --> CHECKM2 - - CHECKM2 --> MAGS_PASSED["MAGsPassed"] - CHECKM2 --x MAGS_FAILED["MAGsFailed"] --> UNBINNED["unbinned.fasta"] --> BEGINNING["Repeat with unbinned.fasta"] - - -end - -MAGS_PASSED --> BARRNAP --> RRNA["MAGS.rRNA.fasta"] -MAGS_PASSED --> TRNASCANSE --> TRNA["MAGS.TRNA.fasta"] - -MAGS_PASSED & CDS & RRNA & TRNA --> SEQKIT --> STATS - - -``` \ No newline at end of file diff --git a/images/devel/Modules/mermaid_test.md b/images/devel/Modules/mermaid_test.md deleted file mode 100644 index e04fd00..0000000 --- a/images/devel/Modules/mermaid_test.md +++ /dev/null @@ -1,114 +0,0 @@ - - -```mermaid -%%{init: { "flowchart": { "curve": "linear" } } }%% - -%% Available curve styles include basis, bumpX, bumpY, cardinal, catmullRom, linear, monotoneX, monotoneY, natural, step, stepAfter, and stepBefore. %%% - -graph TD -subgraph "`**preprocessing**`" - %% modules - PREPROCESS_SHORT(["`_preprocess-short_`"]) - PREPROCESS_LONG(["`_preprocess-long_`"]) - - %% inputs - R1[\"Illumina_1.fastq.gz"/] - R2[\"Illumina_2.fastq.gz"/] - LONG[\"ONT|PacBio.fastq.gz"/] - - - %% databases - CONTAMINATION[(Contamination)] - KMER[(K-mer Profiles)] - - %% --- - - - %% preprocess/-long - R1 & R2 --> PREPROCESS_SHORT - CONTAMINATION -.-> PREPROCESS_SHORT - KMER -.-> PREPROCESS_SHORT - - LONG --> PREPROCESS_LONG - CONTAMINATION -.-> PREPROCESS_LONG - KMER -.-> PREPROCESS_LONG -end - -subgraph "`**assembly**`" - %%inputs - ASSEMBLY(["`_assembly|assembly-long_`"]) - - %% outputs - ASSEMBLY_FASTA[["assembly.fasta"]] - BAM[["mapped.sorted.bam"]] - - %% assembly/-long - PREPROCESS_SHORT --cleaned_1/2.fastq.gz--> ASSEMBLY - PREPROCESS_LONG --cleaned.fastq.gz--> ASSEMBLY - ASSEMBLY --> ASSEMBLY_FASTA & BAM -end - -%% -- - -subgraph "`**binning**`" - %% modules - BINNING_VIRAL(["`_binning-viral_`"]) - BINNING_PROKARYOTIC(["`_binning-prokaryotic_`"]) - BINNING_EUKARYOTIC(["`_binning-eukaryotic_`"]) - - - %% outputs - GENOMES_AND_GENE_MODELS("Genomes & Gene Models") - GENOMES[["Genomes"]] - GENE_MODELS[["Gene Models"]] - - %% databases - %%CHECKV[("CheckV")]--> BINNING_VIRAL - %%GENOMAD[("geNomad")]--> BINNING_VIRAL - - %% -- - %% binning-viral - ASSEMBLY_FASTA & BAM --> BINNING_VIRAL - - %% binning-prokaryotic - BINNING_VIRAL --unbinned.fasta--> BINNING_PROKARYOTIC - BAM --> BINNING_PROKARYOTIC - - %% binning-eukaryotic - BINNING_PROKARYOTIC --unbinned.fasta--> BINNING_EUKARYOTIC - BAM --> BINNING_EUKARYOTIC - - %% coverage - %% COVERAGE("coverage|coverage-long") - - BINNING_VIRAL & BINNING_PROKARYOTIC & BINNING_EUKARYOTIC --"genome-resolved"--> GENOMES_AND_GENE_MODELS - GENOMES_AND_GENE_MODELS --> GENOMES & GENE_MODELS - - -end - -%% -- - -subgraph "`**clustering**`" - %% modules - CLUSTER("`_cluster_`") - - %% output - PROTEIN_CLUSTERS[["SLC-specific Protein Clusters (SSPC)"]] - GENOME_CLUSTERS[["Species-level Clusters (SLC)"]] - - - %% cluster - GENOMES & GENE_MODELS--> CLUSTER - CLUSTER --> GENOME_CLUSTERS - CLUSTER --> PROTEIN_CLUSTERS - -end - -subgraph "`**annotation**`" -ANNOTATE("`_annotate_`") - -GENE_MODELS & PROTEIN_CLUSTERS --> ANNOTATE -end - -``` \ No newline at end of file diff --git a/images/devel/Modules/preprocess.md b/images/devel/Modules/preprocess.md deleted file mode 100644 index af1938e..0000000 --- a/images/devel/Modules/preprocess.md +++ /dev/null @@ -1,53 +0,0 @@ - - -```mermaid -%%{init: { "flowchart": { "curve": "linear" } } }%% - -%% Available curve styles include basis, bumpX, bumpY, cardinal, catmullRom, linear, monotoneX, monotoneY, natural, step, stepAfter, and stepBefore. %%% - -graph LR - -subgraph "`**preprocess**`" - - %% Programs - FASTP["FastP"] - BOWTIE2["Bowtie2"] - SEQKIT["seqkit stats"] - BBDUK["BBDuk"] - - %% Databases - CONTAMINATION[("Contamination")] - KMERS[("K-mer Profiles")] - - %% inputs - READS[\"Illumina_1/2.fastq.gz"/] - - %% outputs - STATS["statistics.tsv"] - - %% FastP - READS --> FASTP - - FASTP --"trimmed_1/2.fastq.gz"--> BOWTIE2 - - %% Bowtie2 - CONTAMINATION --> BOWTIE2 - BOWTIE2 --"cleaned_1/2.fastq.gz"--> BBDUK - BOWTIE2 --"contaminated_1/2.fastq.gz"--> STATS - - %%BBDuk - KMERS --> BBDUK - - READS --> SEQKIT - BOWTIE2 --> SEQKIT - BBDUK --"cleaned_1/2.non-kmer_hits.fastq.gz"--> SEQKIT - BBDUK --"cleaned_1/2.kmer_hits.fastq.gz"--> SEQKIT - - SEQKIT --> STATS - - - -end - - -``` \ No newline at end of file