From fa9f6a78646d2f8a3aaea8971e663cc3375019b1 Mon Sep 17 00:00:00 2001 From: marieBvr Date: Mon, 5 Jul 2021 16:11:34 +0200 Subject: [PATCH] Fix #1 update taxonomy db creation --- db/loadTaxonomy.pl | 45 +++++++++++++++++++----------------- docs/source/prerequisite.rst | 8 ++++--- 2 files changed, 29 insertions(+), 24 deletions(-) diff --git a/db/loadTaxonomy.pl b/db/loadTaxonomy.pl index 67648a3..f1ed40a 100755 --- a/db/loadTaxonomy.pl +++ b/db/loadTaxonomy.pl @@ -28,6 +28,8 @@ "acc_prot=s" => \$data_acc_prot, "acc_wgs=s"=> \$data_acc_wgs, "acc_nucl=s"=> \$data_acc_nucl, + "dead_prot=s"=> \$data_dead_acc_prot, + "dead_nucl=s"=> \$data_dead_acc_nucl, # "gi_nucl=s" => \$gi_nucl, "gi_prot=s" => \$gi_prot, "names=s" => \$data_names, @@ -246,27 +248,28 @@ sub _set_options { } -sub help { -my $prog = basename($0); -print STDERR < taxonomyStructure.sql path. (Default: $taxo_struct_dmp) - -index taxonomyIndex.sql path. (Default: $taxo_index_dmp) - -acc_prot prot.accession2taxid. (Default: $data_acc_prot) - -acc_nucl nucl.accession2taxid. (Default: $data_acc_wgs) - -names names.dmp file. (Default: $data_names) - -nodes nodes.dmp file. (Default: $data_nodes) - -gi_prot gi_taxid_prot.dmp file (Default: $gi_prot) - -v Verbosity level. (0 -> 4). -EOF -exit(1); + ### OPTIONS ### + -struct taxonomyStructure.sql path. (Default: $taxo_struct_dmp) + -index taxonomyIndex.sql path. (Default: $taxo_index_dmp) + -acc_prot prot.accession2taxid. (Default: $data_acc_prot) + -acc_nucl nucl.accession2taxid. (Default: $data_acc_wgs) + -names names.dmp file. (Default: $data_names) + -nodes nodes.dmp file. (Default: $data_nodes) + -gi_prot gi_taxid_prot.dmp file (Default: $gi_prot) + -v Verbosity level. (0 -> 4). 
+ EOF + exit(1); + } } diff --git a/docs/source/prerequisite.rst b/docs/source/prerequisite.rst index 875abac..194822e 100644 --- a/docs/source/prerequisite.rst +++ b/docs/source/prerequisite.rst @@ -44,6 +44,8 @@ Perl external libraries * String::Random * Bio::SearchIO:blastxml * Bio::SeqIO +* Expect +* GD::Simple Perl included libraries @@ -116,7 +118,7 @@ NCBI Taxonomy wget ftp://ftp.ncbi.nih.gov/pub/taxonomy/accession2taxid/dead_wgs.accession2taxid.gz ; gunzip dead_wgs.accession2taxid.gz cat nucl_wgs.accession2taxid nucl_gb.accession2taxid dead_wgs.accession2taxid > acc2taxid.nucl wget ftp://ftp.ncbi.nih.gov/pub/taxonomy/accession2taxid/dead_nucl.accession2taxid.gz; gunzip dead_nucl.accession2taxid.gz; - wget ftp://ftp.ncbi.nih.gov/pub/taxonomy/gi_taxid_prot.dmp.gz; gunzip gi_taxid_prot.dmp.gz; + wget ftp://ftp.ncbi.nih.gov/pub/taxonomy/obsolete/gi_taxid_prot.dmp.gz; gunzip gi_taxid_prot.dmp.gz; Optionally you can combine multiple accession2taxid file with a simple cat. But keep separated nucl and prot accessions as they will be loaded in two different tables. @@ -124,7 +126,7 @@ Launch the loadTaxonomy.pl script that will create the sqlite database. The scri .. code-block:: bash - ./loadTaxonomy.pl -struct taxonomyStructure.sql -index taxonomyIndex.sql -acc_prot acc2taxid.prot -acc_nucl acc2taxid.nucl -names names.dmp -nodes nodes.dmp -gi_prot gi_taxid_prot.dmp + ./loadTaxonomy.pl -struct taxonomyStructure.sql -index taxonomyIndex.sql -acc_prot acc2taxid.prot -acc_nucl acc2taxid.nucl -names names.dmp -nodes nodes.dmp -gi_prot gi_taxid_prot.dmp -acc_wgs acc2taxid.nucl PFAM taxonomy @@ -140,7 +142,7 @@ Be carefull, the files you will download have a size of ~900Mo. ftp://ftp.ncbi.nih.gov/pub/mmdb/cdd/fasta.tar.gz tar -xzf fasta.tar.gz; - mkdir pfam + mkdir fasta mv pfam*.FASTA fasta/ cd pfam/