From 6c9d48b3ec1da29549095279d09d283b2a790ec7 Mon Sep 17 00:00:00 2001 From: "Josh L. Espinoza" Date: Tue, 19 Dec 2023 14:52:26 -0800 Subject: [PATCH 1/4] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9e6a39a..a24e58a 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,7 @@ ___________________________________________________________________ ### Announcements -* **What's new in `VEBA v1.3.0`?** +* **What's new in `VEBA v1.4.0`?** * **`VEBA` Modules:** From 60ec7c4cab9bc9097f30df04f779450133ff46e5 Mon Sep 17 00:00:00 2001 From: "Josh L. Espinoza" Date: Tue, 19 Dec 2023 14:57:06 -0800 Subject: [PATCH 2/4] v1.4.0 redo --- images/Schematic/Schematic_v1.1.x.gslides | 1 + images/Schematic/Schematic_v1.2.0.gslides | 1 + 2 files changed, 2 insertions(+) create mode 100644 images/Schematic/Schematic_v1.1.x.gslides create mode 100644 images/Schematic/Schematic_v1.2.0.gslides diff --git a/images/Schematic/Schematic_v1.1.x.gslides b/images/Schematic/Schematic_v1.1.x.gslides new file mode 100644 index 0000000..5f06a64 --- /dev/null +++ b/images/Schematic/Schematic_v1.1.x.gslides @@ -0,0 +1 @@ +{"":"WARNING! DO NOT EDIT THIS FILE! ANY CHANGES MADE WILL BE LOST!","doc_id":"1L0LdxYJxvgSgINjKZXaOJS9UKtFbf_RC8lYxAFUhCTw","resource_key":"","email":"jespinoz.jcvi@gmail.com"} diff --git a/images/Schematic/Schematic_v1.2.0.gslides b/images/Schematic/Schematic_v1.2.0.gslides new file mode 100644 index 0000000..b2f70ae --- /dev/null +++ b/images/Schematic/Schematic_v1.2.0.gslides @@ -0,0 +1 @@ +{"":"WARNING! DO NOT EDIT THIS FILE! ANY CHANGES MADE WILL BE LOST!","doc_id":"1WzXffcWcl84a__OQHP5Qx0jeM50b2HCZZMT9vzNtvQw","resource_key":"","email":"jespinoz.jcvi@gmail.com"} From 9212788250535c8fac2bbd464bf7aaa67a187b81 Mon Sep 17 00:00:00 2001 From: "Josh L. 
Espinoza" Date: Tue, 19 Dec 2023 15:22:38 -0800 Subject: [PATCH 3/4] v1.4.1 Fixed version info --- CHANGELOG.md | 32 +-- README.md | 4 +- VERSION | 2 +- images/Schematic/Schematic_v1.1.x.gslides | 1 - images/Schematic/Schematic_v1.2.0.gslides | 1 - install/DATABASE.md | 212 ++++++++++++++++++- install/README.md | 2 +- install/download_databases.sh | 8 +- walkthroughs/adapting_commands_for_aws.md | 2 +- walkthroughs/adapting_commands_for_docker.md | 6 +- 10 files changed, 218 insertions(+), 52 deletions(-) delete mode 100644 images/Schematic/Schematic_v1.1.x.gslides delete mode 100644 images/Schematic/Schematic_v1.2.0.gslides diff --git a/CHANGELOG.md b/CHANGELOG.md index 7f1709c..d24460e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,7 +6,7 @@ ________________________________________________________________ #### Current Releases: -**Release v1.4.0 Highlights:** +**Release v1.4.1 Highlights:** * **`VEBA` Modules:** @@ -21,36 +21,6 @@ ________________________________________________________________ * Completely rebuilt `VEBA's Microeukaryotic Protein Database` to produce a clustered database `MicroEuk100/90/50` similar to `UniRef100/90/50`. Available on [doi:10.5281/zenodo.10139450](https://zenodo.org/records/10139451). - * **Number of sequences:** - - * MicroEuk100 = 79,920,431 (19 GB) - - * MicroEuk90 = 51,767,730 (13 GB) - - * MicroEuk50 = 29,898,853 (6.5 GB) - - - - * **Number of source organisms per dataset:** - - * MycoCosm = 2503 - - * PhycoCosm = 174 - - * EnsemblProtists = 233 - - * MMETSP = 759 - - * TARA_SAGv1 = 8 - - * EukProt = 366 - - * EukZoo = 27 - - * TARA_SMAGv1 = 389 - - * NR_Protists-Fungi = 48217 -
**Release v1.4.0 Details** * [2023.12.15] - Added `profile-taxonomic.py` module which uses `sylph` to build a sketch database for genomes and queries the genome database similar to `Kraken` for taxonomic abundance. diff --git a/README.md b/README.md index a24e58a..3d2973b 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,7 @@ ___________________________________________________________________ ### Announcements -* **What's new in `VEBA v1.4.0`?** +* **What's new in `VEBA v1.4.1`?** * **`VEBA` Modules:** @@ -67,7 +67,7 @@ ___________________________________________________________________ ### Installation and databases -**Current Stable Version:** [`v1.4.0`](https://github.com/jolespin/veba/releases/tag/v1.4.0) +**Current Stable Version:** [`v1.4.1`](https://github.com/jolespin/veba/releases/tag/v1.4.1) **Current Database Version:** `VDB_v6` diff --git a/VERSION b/VERSION index a0fef3f..35cf735 100644 --- a/VERSION +++ b/VERSION @@ -1,2 +1,2 @@ -1.4.0b +1.4.1 VDB_v6 diff --git a/images/Schematic/Schematic_v1.1.x.gslides b/images/Schematic/Schematic_v1.1.x.gslides deleted file mode 100644 index 5f06a64..0000000 --- a/images/Schematic/Schematic_v1.1.x.gslides +++ /dev/null @@ -1 +0,0 @@ -{"":"WARNING! DO NOT EDIT THIS FILE! ANY CHANGES MADE WILL BE LOST!","doc_id":"1L0LdxYJxvgSgINjKZXaOJS9UKtFbf_RC8lYxAFUhCTw","resource_key":"","email":"jespinoz.jcvi@gmail.com"} diff --git a/images/Schematic/Schematic_v1.2.0.gslides b/images/Schematic/Schematic_v1.2.0.gslides deleted file mode 100644 index b2f70ae..0000000 --- a/images/Schematic/Schematic_v1.2.0.gslides +++ /dev/null @@ -1 +0,0 @@ -{"":"WARNING! DO NOT EDIT THIS FILE! 
ANY CHANGES MADE WILL BE LOST!","doc_id":"1WzXffcWcl84a__OQHP5Qx0jeM50b2HCZZMT9vzNtvQw","resource_key":"","email":"jespinoz.jcvi@gmail.com"} diff --git a/install/DATABASE.md b/install/DATABASE.md index 9bb0c5b..21d013a 100644 --- a/install/DATABASE.md +++ b/install/DATABASE.md @@ -21,20 +21,218 @@ Please cite the following sources if these marker sets are used in any way: Espinoza, Josh (2022): Profile HMM marker sets. figshare. Dataset. https://doi.org/10.6084/m9.figshare.19616016.v1 #### Microeukaryotic protein database: -A protein database is required not only for eukaryotic gene calls using MetaEuk but can also be used for MAG annotation. Many eukaryotic protein databases exist such as MMETSP, EukZoo, and EukProt, yet these are limited to marine environments, include prokaryotic sequences, or include eukaryotic sequences for organisms that would not be expected to be binned out of metagenomes such as metazoans. We combined and dereplicated MMETSP, EukZoo, EukProt, and NCBI non-redundant to include only microeukaryotes such as protists and fungi. This optimized microeukaryotic database ensures that only eukaryotic exons expected to be represented in metagenomes are utilized for eukaryotic gene modeling and the resulting MetaEuk reference targets are used for eukaryotic MAG classification. VEBA’s microeukaryotic protein database includes 48,006,918 proteins from 42,922 microeukaryotic strains. +VEBA’s Microeukaryotic Protein Database has been completely redesigned using the logic of UniRef and their clustered database. The previous microeukaryotic protein database contained 48,006,918 proteins from 44,647 source organisms while the updated database, MicroEuk, contains 79,920,430 proteins from 52,495 source organisms. As in the prior major release, MicroEuk concentrates on microeukaryotic organisms while excluding higher eukaryotes as these organisms are the primary eukaryotes targeted by shotgun metagenomics and metatranscriptomics. 
Source organisms in this context are defined as organisms from which the proteins were derived. + +**Number of sequences:** + + * MicroEuk100 = 79,920,431 (19 GB) + * MicroEuk90 = 51,767,730 (13 GB) + * MicroEuk50 = 29,898,853 (6.5 GB) + + + +**Number of source organisms per dataset:** + +* MycoCosm = 2503 +* PhycoCosm = 174 +* EnsemblProtists = 233 +* MMETSP = 759 +* TARA_SAGv1 = 8 +* EukProt = 366 +* EukZoo = 27 +* TARA_SMAGv1 = 389 +* NR_Protists-Fungi = 48217 **Current:** -* [VDB-Microeukaryotic\_v2.1](https://zenodo.org/record/7485114) available on Zenodo +* [MicroEuk\_v3](https://zenodo.org/records/10139451) available on Zenodo **Deprecated:** -* [VDB-Microeukaryotic\_v1](https://figshare.com/articles/dataset/Microeukaryotic_Protein_Database/19668855) available on FigShare +* [MicroEuk\_v2](https://zenodo.org/record/7485114) available on Zenodo + +* [MicroEuk\_v1](https://figshare.com/articles/dataset/Microeukaryotic_Protein_Database/19668855) available on FigShare #### Database Structure: **Current:** -*VEBA Database* version: `VDB_v5.2` (243 GB) + + *VEBA Database* version: `VDB_v6` (272 GB) + + * Added `MicroEuk_v3` + +``` + tree -L 3 . +. 
+├── ACCESS_DATE +├── Annotate +│   ├── CAZy +│   │   └── CAZyDB.07262023.dmnd +│   ├── KOFAM +│   │   ├── ko_list +│   │   └── profiles +│   ├── MIBiG +│   │   └── mibig_v3.1.dmnd +│   ├── MicrobeAnnotator-KEGG +│   │   ├── KEGG_Bifurcating_Module_Information.pkl +│   │   ├── KEGG_Bifurcating_Module_Information.pkl.md5 +│   │   ├── KEGG_Module_Information.txt +│   │   ├── KEGG_Module_Information.txt.md5 +│   │   ├── KEGG_Regular_Module_Information.pkl +│   │   ├── KEGG_Regular_Module_Information.pkl.md5 +│   │   ├── KEGG_Structural_Module_Information.pkl +│   │   └── KEGG_Structural_Module_Information.pkl.md5 +│   ├── MicrobeAnnotator-KEGG.tar.gz +│   ├── NCBIfam-AMRFinder +│   │   ├── NCBIfam-AMRFinder.changelog.txt +│   │   ├── NCBIfam-AMRFinder.hmm.gz +│   │   └── NCBIfam-AMRFinder.tsv +│   ├── Pfam +│   │   ├── Pfam-A.hmm.gz +│   │   └── relnotes.txt +│   ├── UniRef +│   │   ├── uniref50.dmnd +│   │   ├── uniref50.release_note +│   │   ├── uniref90.dmnd +│   │   └── uniref90.release_note +│   └── VFDB +│   └── VFDB_setA_pro.dmnd +├── Classify +│   ├── CheckM2 +│   │   └── uniref100.KO.1.dmnd +│   ├── CheckV +│   │   ├── genome_db +│   │   ├── hmm_db +│   │   └── README.txt +│   ├── geNomad +│   │   ├── genomad_db +│   │   ├── genomad_db.dbtype +│   │   ├── genomad_db_h +│   │   ├── genomad_db_h.dbtype +│   │   ├── genomad_db_h.index +│   │   ├── genomad_db.index +│   │   ├── genomad_db.lookup +│   │   ├── genomad_db_mapping +│   │   ├── genomad_db.source +│   │   ├── genomad_db_taxonomy +│   │   ├── genomad_integrase_db +│   │   ├── genomad_integrase_db.dbtype +│   │   ├── genomad_integrase_db_h +│   │   ├── genomad_integrase_db_h.dbtype +│   │   ├── genomad_integrase_db_h.index +│   │   ├── genomad_integrase_db.index +│   │   ├── genomad_integrase_db.lookup +│   │   ├── genomad_integrase_db.source +│   │   ├── genomad_marker_metadata.tsv +│   │   ├── genomad_mini_db -> genomad_db +│   │   ├── genomad_mini_db.dbtype +│   │   ├── genomad_mini_db_h -> 
genomad_db_h +│   │   ├── genomad_mini_db_h.dbtype -> genomad_db_h.dbtype +│   │   ├── genomad_mini_db_h.index -> genomad_db_h.index +│   │   ├── genomad_mini_db.index +│   │   ├── genomad_mini_db.lookup -> genomad_db.lookup +│   │   ├── genomad_mini_db_mapping -> genomad_db_mapping +│   │   ├── genomad_mini_db.source -> genomad_db.source +│   │   ├── genomad_mini_db_taxonomy -> genomad_db_taxonomy +│   │   ├── mini_set_ids +│   │   ├── names.dmp +│   │   ├── nodes.dmp +│   │   ├── plasmid_hallmark_annotation.txt +│   │   ├── version.txt +│   │   └── virus_hallmark_annotation.txt +│   ├── GTDB +│   │   ├── fastani +│   │   ├── markers +│   │   ├── mash +│   │   ├── masks +│   │   ├── metadata +│   │   ├── mrca_red +│   │   ├── msa +│   │   ├── pplacer +│   │   ├── radii +│   │   ├── split +│   │   ├── taxonomy +│   │   └── temp +│   ├── MicroEuk +│   │   ├── MicroEuk100 +│   │   ├── MicroEuk100.dbtype +│   │   ├── MicroEuk100.eukaryota_odb10 +│   │   ├── MicroEuk100.eukaryota_odb10.dbtype +│   │   ├── MicroEuk100.eukaryota_odb10_h +│   │   ├── MicroEuk100.eukaryota_odb10_h.dbtype +│   │   ├── MicroEuk100.eukaryota_odb10_h.index +│   │   ├── MicroEuk100.eukaryota_odb10.index +│   │   ├── MicroEuk100.eukaryota_odb10.lookup +│   │   ├── MicroEuk100.eukaryota_odb10.source +│   │   ├── MicroEuk100_h +│   │   ├── MicroEuk100_h.dbtype +│   │   ├── MicroEuk100_h.index +│   │   ├── MicroEuk100.index +│   │   ├── MicroEuk100.lookup +│   │   ├── MicroEuk100_mapping +│   │   ├── MicroEuk100.source +│   │   ├── MicroEuk100_taxonomy +│   │   ├── MicroEuk50 +│   │   ├── MicroEuk50.dbtype +│   │   ├── MicroEuk50_h +│   │   ├── MicroEuk50_h.dbtype +│   │   ├── MicroEuk50_h.index +│   │   ├── MicroEuk50.index +│   │   ├── MicroEuk50.lookup +│   │   ├── MicroEuk50.source +│   │   ├── MicroEuk90 +│   │   ├── MicroEuk90.dbtype +│   │   ├── MicroEuk90_h +│   │   ├── MicroEuk90_h.dbtype +│   │   ├── MicroEuk90_h.index +│   │   ├── MicroEuk90.index +│   │   ├── MicroEuk90.lookup +│   │   
├── MicroEuk90.source +│   │   ├── source_taxonomy.tsv.gz +│   │   ├── source_to_lineage.dict.pkl.gz +│   │   └── target_to_source.dict.pkl.gz +│   └── NCBITaxonomy +│   ├── citations.dmp +│   ├── delnodes.dmp +│   ├── division.dmp +│   ├── gc.prt +│   ├── gencode.dmp +│   ├── merged.dmp +│   ├── names.dmp +│   ├── nodes.dmp +│   └── readme.txt +├── Contamination +│   ├── AntiFam +│   │   ├── AntiFam.hmm.gz +│   │   ├── relnotes +│   │   └── version +│   ├── chm13v2.0 +│   │   ├── chm13v2.0.1.bt2 +│   │   ├── chm13v2.0.2.bt2 +│   │   ├── chm13v2.0.3.bt2 +│   │   ├── chm13v2.0.4.bt2 +│   │   ├── chm13v2.0.rev.1.bt2 +│   │   └── chm13v2.0.rev.2.bt2 +│   └── kmers +│   └── ribokmers.fa.gz +└── MarkerSets + ├── Archaea_76.hmm.gz + ├── Bacteria_71.hmm.gz + ├── CPR_43.hmm.gz + ├── eukaryota_odb10.hmm.gz + ├── eukaryota_odb10.scores_cutoff.tsv.gz + ├── Fungi_593.hmm.gz + ├── Protista_83.hmm.gz + └── README + +36 directories, 124 files + +``` + +**Deprecated:** + +
+ *VEBA Database* version: `VDB_v5.2` (243 GB)
* Added `MicrobeAnnotator-KEGG` [Zenodo: 10020074](https://zenodo.org/records/10020074) which includes KEGG module pathway information from [`MicrobeAnnotator`](https://doi.org/10.1186/s12859-020-03940-5). * Added `CAZy` protein sequences from [`dbCAN2`](https://academic.oup.com/nar/article/46/W1/W95/4996582) @@ -194,7 +392,7 @@ tree -L 3 . 37 directories, 112 files ``` -**Deprecated:** +
*VEBA Database* version: `VDB_v5.1` @@ -340,6 +538,7 @@ tree -L 3 . ├── Protista_83.hmm.gz └── README ``` +
@@ -481,6 +680,7 @@ tree -L 3 . ├── Protista_83.hmm.gz └── README ``` +
@@ -622,6 +822,7 @@ tree -L 3 . 31 directories, 96 files ``` +
@@ -731,6 +932,7 @@ tree -L 3 . 35 directories, 60 files ``` + diff --git a/install/README.md b/install/README.md index f92e986..7f796d0 100644 --- a/install/README.md +++ b/install/README.md @@ -85,7 +85,7 @@ The `VEBA` installation is going to configure some `conda` environments for you ``` # For stable version, download and decompress the tarball: -VERSION="1.4.0" +VERSION="1.4.1" wget https://github.com/jolespin/veba/archive/refs/tags/v${VERSION}.tar.gz tar -xvf v${VERSION}.tar.gz && mv veba-${VERSION} veba diff --git a/install/download_databases.sh b/install/download_databases.sh index 06c4d48..d7ac1ce 100644 --- a/install/download_databases.sh +++ b/install/download_databases.sh @@ -1,5 +1,5 @@ #!/bin/bash -# __version__ = "2023.12.11" +# __version__ = "2023.12.19" # VEBA_DATABASE_VERSION = "VDB_v6" # MICROEUKAYROTIC_DATABASE_VERSION = "MicroEuk_v3" @@ -114,11 +114,7 @@ mmseqs createdb --compressed 1 ${DATABASE_DIRECTORY}/MicroEuk_v3/MicroEuk100.faa # MicroEuk100.eukaryota_odb10 gzip -d ${DATABASE_DIRECTORY}/MicroEuk_v3/MicroEuk100.eukaryota_odb10.list.gz -seqkit grep -f ${DATABASE_DIRECTORY}/MicroEuk_v3/MicroEuk100.eukaryota_odb10.list ${DATABASE_DIRECTORY}/MicroEuk_v3/MicroEuk100.faa | mmseqs createdb --compressed 1 stdin ${DATABASE_DIRECTORY}/Classify/MicroEuk/MicroEuk100 - -# MicroEuk90 -gzip -d -c ${DATABASE_DIRECTORY}/MicroEuk_v3/MicroEuk90_clusters.tsv.gz | cut -f1 | sort -u > ${DATABASE_DIRECTORY}/MicroEuk_v3/MicroEuk90.list -seqkit grep -f ${DATABASE_DIRECTORY}/MicroEuk_v3/MicroEuk90.list ${DATABASE_DIRECTORY}/MicroEuk_v3/MicroEuk100.faa | mmseqs createdb --compressed 1 stdin ${DATABASE_DIRECTORY}/Classify/MicroEuk/MicroEuk90 +seqkit grep -f ${DATABASE_DIRECTORY}/MicroEuk_v3/MicroEuk100.eukaryota_odb10.list ${DATABASE_DIRECTORY}/MicroEuk_v3/MicroEuk100.faa | mmseqs createdb --compressed 1 stdin ${DATABASE_DIRECTORY}/Classify/MicroEuk/MicroEuk100.eukaryota_odb10 # MicroEuk90 gzip -d -c ${DATABASE_DIRECTORY}/MicroEuk_v3/MicroEuk90_clusters.tsv.gz | cut 
-f1 | sort -u > ${DATABASE_DIRECTORY}/MicroEuk_v3/MicroEuk90.list diff --git a/walkthroughs/adapting_commands_for_aws.md b/walkthroughs/adapting_commands_for_aws.md index 091fe36..bd74421 100644 --- a/walkthroughs/adapting_commands_for_aws.md +++ b/walkthroughs/adapting_commands_for_aws.md @@ -38,7 +38,7 @@ This job definition pulls the [jolespin/veba_preprocess](https://hub.docker.com/ "jobDefinitionName": "preprocess__S1", "type": "container", "containerProperties": { - "image": "jolespin/veba_preprocess:1.4.0", + "image": "jolespin/veba_preprocess:1.4.1", "command": [ "preprocess.py", "-1", diff --git a/walkthroughs/adapting_commands_for_docker.md b/walkthroughs/adapting_commands_for_docker.md index b6d4899..b16c23c 100644 --- a/walkthroughs/adapting_commands_for_docker.md +++ b/walkthroughs/adapting_commands_for_docker.md @@ -24,7 +24,7 @@ Refer to the [Docker documentation](https://docs.docker.com/engine/install/). Let's say you wanted to use the `preprocess` module. Download the Docker image as so: ``` -VERSION=1.4.0 +VERSION=1.4.1 docker image pull jolespin/veba_preprocess:${VERSION} ``` @@ -36,7 +36,7 @@ For example, here's how we would run the `preprocess.py` module. First let's ju ```bash # Version -VERSION=1.4.0 +VERSION=1.4.1 # Image DOCKER_IMAGE="jolespin/veba_preprocess:${VERSION}" @@ -90,7 +90,7 @@ CMD="preprocess.py -1 ${CONTAINER_INPUT_DIRECTORY}/${R1} -2 ${CONTAINER_INPUT_DI # Docker # Version -VERSION=1.4.0 +VERSION=1.4.1 # Image DOCKER_IMAGE="jolespin/veba_preprocess:${VERSION}" From b76259fa3f3ba41d1bff9a52a4bdcc7d33ef9598 Mon Sep 17 00:00:00 2001 From: "Josh L. 
Espinoza" Date: Tue, 19 Dec 2023 15:23:47 -0800 Subject: [PATCH 4/4] merge --- images/Schematic/Schematic_v1.1.x.gslides | 1 + images/Schematic/Schematic_v1.2.0.gslides | 1 + 2 files changed, 2 insertions(+) create mode 100644 images/Schematic/Schematic_v1.1.x.gslides create mode 100644 images/Schematic/Schematic_v1.2.0.gslides diff --git a/images/Schematic/Schematic_v1.1.x.gslides b/images/Schematic/Schematic_v1.1.x.gslides new file mode 100644 index 0000000..5f06a64 --- /dev/null +++ b/images/Schematic/Schematic_v1.1.x.gslides @@ -0,0 +1 @@ +{"":"WARNING! DO NOT EDIT THIS FILE! ANY CHANGES MADE WILL BE LOST!","doc_id":"1L0LdxYJxvgSgINjKZXaOJS9UKtFbf_RC8lYxAFUhCTw","resource_key":"","email":"jespinoz.jcvi@gmail.com"} diff --git a/images/Schematic/Schematic_v1.2.0.gslides b/images/Schematic/Schematic_v1.2.0.gslides new file mode 100644 index 0000000..b2f70ae --- /dev/null +++ b/images/Schematic/Schematic_v1.2.0.gslides @@ -0,0 +1 @@ +{"":"WARNING! DO NOT EDIT THIS FILE! ANY CHANGES MADE WILL BE LOST!","doc_id":"1WzXffcWcl84a__OQHP5Qx0jeM50b2HCZZMT9vzNtvQw","resource_key":"","email":"jespinoz.jcvi@gmail.com"}