add viral database compression
MaryamZaheri committed Mar 14, 2023
1 parent 0f4dae7 commit edc8f7d
Showing 6 changed files with 168 additions and 37 deletions.
2 changes: 2 additions & 0 deletions src/virmet/cli.py
@@ -98,6 +98,7 @@ def main():
parser_fetch.add_argument('--bact', help='bacterial (RefSeq)', action='store_true')
parser_fetch.add_argument('--fungal', help='fungal (RefSeq)', action='store_true')
parser_fetch.add_argument('--bovine', help='bovine (Bos taurus)', action='store_true')
parser_fetch.add_argument('--no_db_compression', help='do not compress the viral database', action='store_true', default=False)
parser_fetch.set_defaults(func=fetch_db)

# create the parser for command "update"
@@ -106,6 +107,7 @@ def main():
parser_update.add_argument('--bact', help='update bacterial database', action='store_true', default=False)
parser_update.add_argument('--fungal', help='update fungal database', action='store_true', default=False)
parser_update.add_argument('--picked', help='file with additional sequences, one GI per line', default=None)
parser_update.add_argument('--update_min_date', help='update viral [n]ucleotide/[p]rotein databases with sequences published after the date YYYY/MM/DD (e.g. 2022/09/01)', default=None)
parser_update.set_defaults(func=update_db)

# create the parser for command "index"
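Taken together, the two new options behave as in this stripped-down parser (an illustrative sketch, not the full virmet CLI; the set_defaults dispatch is omitted):

import argparse

parser = argparse.ArgumentParser(prog='virmet')
sub = parser.add_subparsers(dest='command')

fetch = sub.add_parser('fetch')
fetch.add_argument('--viral', choices=['n', 'p'])
fetch.add_argument('--no_db_compression', action='store_true', default=False)

update = sub.add_parser('update')
update.add_argument('--viral', choices=['n', 'p'])
update.add_argument('--update_min_date', default=None)

args = parser.parse_args(['fetch', '--viral', 'n'])
print(args.no_db_compression)  # False -> the fetched viral database is compressed
args = parser.parse_args(['update', '--viral', 'n', '--update_min_date', '2022/09/01'])
print(args.update_min_date)    # 2022/09/01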
98 changes: 86 additions & 12 deletions src/virmet/common.py
@@ -10,8 +10,10 @@
import multiprocessing as mp
import pandas as pd

DB_DIR = '/data/virmet_databases/'
N_FILES_BACT = 5
DB_DIR = '/data/virmet_databases'
DB_DIR_UPDATE = '/data/virmet_databases_update'
N_FILES_BACT = 16
MAX_TAXID = 10000 # max number of sequences belonging to the same taxid in compressed viral database

# retry decorator, taken from RepoPhlan
# https://bitbucket.org/nsegata/repophlan/src/5804db9d341165f72c4f7e448691ce90a1465764/repophlan_get_microbes.py?at=ncbi2017&fileviewer=file-view-default
@@ -82,6 +84,8 @@ def ftp_down(remote_url, local_url=None):

# decompressing
elif remote_url.endswith('.gz') and not outname.endswith('.gz'):
if os.path.exists(outname):
outhandle = open(outname, 'a')
else:
@@ -98,7 +102,7 @@ else:
else:
outhandle = open(outname, 'wb')
logging.debug('Downloading %s', remote_url)
with urllib.request.urlopen(remote_url, timeout=30) as f:
with urllib.request.urlopen(remote_url, timeout=300) as f:
outhandle.write(f.read())

# uncompressed to uncompressed
@@ -115,28 +119,97 @@ def ftp_down(remote_url, local_url=None):
# outhandle.close()
return outhandle
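The urlopen timeout was raised from 30 s to 300 s so that large RefSeq files on slow mirrors do not abort mid-transfer. A minimal sketch of the pattern (fetch_bytes is a hypothetical helper, not part of virmet):

import urllib.request

def fetch_bytes(remote_url, timeout=300):
    """Download a URL into memory; a stalled connection fails after `timeout` seconds."""
    with urllib.request.urlopen(remote_url, timeout=timeout) as f:
        return f.read()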

def random_reduction(viral_mode):
"""Subsample taxids that are over-represented in the viral database.

Sequences are grouped by taxid; any taxid with more than MAX_TAXID
(10,000) reference sequences is randomly subsampled down to that cap,
while all RefSeq entries are kept. The motivation is SARS-CoV-2, which
accounted for roughly 90% of the viral database at the beginning of 2023.
"""

if viral_mode == 'n':
logging.info('compressing viral nuccore sequences')
target_dir = os.path.join(DB_DIR_UPDATE, 'viral_nuccore')
elif viral_mode == 'p':
logging.info('compressing viral protein sequences')
target_dir = os.path.join(DB_DIR_UPDATE, 'viral_protein')
logging.info('Database real path for compression: %s', os.path.realpath(target_dir))
os.chdir(target_dir)

viral_info_file = os.path.join(target_dir, 'viral_seqs_info.tsv')
viral_fasta_file = os.path.join(target_dir, 'viral_database.fasta')

viral_info = pd.read_table(viral_info_file, names=['accn', 'TaxId', 'seq_len', 'Organism', 'Title', 'accn_version'])
viral_info.drop_duplicates(subset=['accn_version'], inplace=True)
# Compress at the taxid level:
# move any taxid with more than MAX_TAXID reference sequences to a separate
# pandas table and subsample it out of the main table
TaxId_to_counter_df = viral_info['TaxId'].value_counts().reset_index()
TaxId_to_counter_df.rename(columns={"TaxId": "TaxId_count", "index": "TaxId_num"}, inplace=True)
TaxId_to_percentage = viral_info['TaxId'].value_counts(normalize=True).reset_index()
TaxId_to_counter_df['percentage'] = TaxId_to_percentage['TaxId']
TaxId_to_counter_filtered_df = TaxId_to_counter_df[TaxId_to_counter_df["TaxId_count"] > MAX_TAXID]

viral_info_subsampled = viral_info.copy()
random.seed(100)
removed_set = set()
for _, row in TaxId_to_counter_filtered_df.iterrows():
taxid_to_subsample = int(row['TaxId_num'])
viral_info_to_subsample_df = viral_info_subsampled[viral_info_subsampled['TaxId'] == taxid_to_subsample]
accn_set = set(viral_info_to_subsample_df['accn_version'])
# subsample this taxid down to MAX_TAXID sequences; sorted() keeps the draw
# reproducible and satisfies random.sample on Python >= 3.11
selected_accn_ls = random.sample(sorted(accn_set), MAX_TAXID)

# keep rows of other taxids, rows that were selected, and all RefSeq entries
# (RefSeq accessions contain the character '_')
viral_info_subsampled = viral_info_subsampled.loc[(viral_info_subsampled['TaxId'] != taxid_to_subsample) | viral_info_subsampled['accn_version'].isin(selected_accn_ls) | viral_info_subsampled['accn_version'].str.contains('_')]
# record the accessions that were dropped, as extra information
removed_set = removed_set | (accn_set - set(selected_accn_ls))

viral_info_subsampled.drop_duplicates(subset=['accn_version'], inplace=True)
# write one accession per line; this list is consumed by seqtk below
viral_info_subsampled.accn_version.to_csv('outfile.csv', sep='\n', index=False, header=False)
# extract the selected accessions from the fasta file using seqtk
subsample_fasta_command = 'seqtk subseq %s outfile.csv > viral_database_subsampled.fasta' %(viral_fasta_file)
run_child(subsample_fasta_command)
os.rename('viral_database.fasta', 'viral_database_original_rmdup.fasta')
os.rename('viral_database_subsampled.fasta', 'viral_database.fasta')
TaxId_to_counter_filtered_df.to_csv('filtered_taxids.csv', sep=',', index=False)
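A toy version of the thresholding step in random_reduction, with the cap lowered from MAX_TAXID to 2 so the effect shows on five made-up records:

import pandas as pd

toy = pd.DataFrame({
    'accn_version': ['OQ000001.1', 'OQ000002.1', 'NC_045512.2', 'OQ000003.1', 'AF033819.3'],
    'TaxId': [2697049, 2697049, 2697049, 2697049, 11676],
})
cap = 2  # stands in for MAX_TAXID
counts = (toy['TaxId'].value_counts()
          .rename_axis('TaxId_num').reset_index(name='TaxId_count'))
print(counts[counts['TaxId_count'] > cap]['TaxId_num'].tolist())
# [2697049] -> only this taxid is subsampled down to the cap; NC_045512.2,
# a RefSeq accession (contains '_'), survives the filter unconditionally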

def viral_query(viral_db):
def viral_query(viral_db, update_min_date=None):
# Viruses, Taxonomy ID: 10239
# Human adenovirus A, Taxonomy ID: 129875 (only for testing, 7 hits)
# Mastadenovirus, Taxonomy ID: 10509 (only for testing, 440 hits)
# Alphatorquevirus Taxonomy ID: 687331
# Cellular organisms, Taxonomy ID: 131567 (to avoid chimeras)
txid = '10239' # change here for viruses or smaller taxa
query_text = '-query \"txid%s [orgn] AND (\\"complete genome\\" [Title] OR srcdb_refseq[prop])' % txid
query_text += ' NOT wgs[PROP] NOT \\"cellular organisms\\"[Organism] NOT AC_000001[PACC] : AC_999999[PACC]\"'
query_text = '-query \"txid%s [orgn] AND (\\"complete genome\\" [Title] OR \\"complete segment\\" [Title] OR srcdb_refseq[prop])' % txid
query_text += ' NOT \\"cellular organisms\\"[Organism] NOT AC_000001[PACC] : AC_999999[PACC]\"'

if update_min_date:
logging.info('Viral database update restricted to sequences published after %s', update_min_date)
query_text += ' -datetype PDAT -mindate %s' % update_min_date # PDAT: publication date; MDAT would filter by modification date

query_text += ' > ncbi_search'

if viral_db == 'n':
target_dir = os.path.join(DB_DIR, 'viral_nuccore')
target_dir = os.path.join(DB_DIR_UPDATE, 'viral_nuccore')
search_text = '-db nuccore ' + query_text
elif viral_db == 'p':
target_dir = os.path.join(DB_DIR, 'viral_protein')
target_dir = os.path.join(DB_DIR_UPDATE, 'viral_protein')
search_text = '-db protein ' + query_text
try:
os.mkdir(target_dir)
except FileExistsError:
pass
os.chdir(target_dir)
logging.info('Database real path: %s', os.path.realpath(target_dir))
return 'esearch ' + search_text
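For orientation, a hedged reconstruction of the full command string the builder emits for nucleotide mode with a date floor (simplified from viral_query; note the leading space before -datetype):

txid = '10239'
query_text = ('-query \"txid%s [orgn] AND (\\"complete genome\\" [Title] OR '
              '\\"complete segment\\" [Title] OR srcdb_refseq[prop])' % txid)
query_text += ' NOT \\"cellular organisms\\"[Organism] NOT AC_000001[PACC] : AC_999999[PACC]\"'
update_min_date = '2022/09/01'
if update_min_date:
    query_text += ' -datetype PDAT -mindate %s' % update_min_date
query_text += ' > ncbi_search'
print('esearch -db nuccore ' + query_text)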


Expand All @@ -158,7 +231,7 @@ def bact_fung_query(query_type=None, download=True, info_file=None):
with urllib.request.urlopen(url) as f:
print(f.read().decode('utf-8'), file=bh)
bh.close()
querinfo = pd.read_csv(info_file, sep='\t', header=0, skiprows=1)
querinfo = pd.read_csv(info_file, sep='\t', header=0, skiprows=1, dtype={'excluded_from_refseq': str})
querinfo.rename(columns={'# assembly_accession': 'assembly_accession'}, inplace=True)
if query_type == 'bacteria':
gb = querinfo[(querinfo.assembly_level == 'Complete Genome') &
@@ -169,7 +242,8 @@ def bact_fung_query(query_type=None, download=True, info_file=None):
(querinfo.version_status == 'latest') &
(querinfo.genome_rep == 'Full') &
(querinfo.release_type == 'Major')]
gb.set_index('assembly_accession')
gb.set_index('assembly_accession', inplace=True)
gb = gb[gb['ftp_path'] != 'na'] # drop assemblies without an ftp path
x = gb['ftp_path'].apply(lambda col: col + '/' + col.split('/')[-1] + '_genomic.fna.gz')
gb = gb.assign(ftp_genome_path=x)
all_urls = list(gb['ftp_genome_path'])
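A worked example of the ftp_path-to-URL mapping above; the E. coli K-12 assembly path is used purely as an illustration:

ftp_path = ('https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/005/845/'
            'GCF_000005845.2_ASM584v2')
genome_url = ftp_path + '/' + ftp_path.split('/')[-1] + '_genomic.fna.gz'
print(genome_url.rsplit('/', 1)[-1])  # GCF_000005845.2_ASM584v2_genomic.fna.gz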
@@ -196,8 +270,8 @@ def download_genomes(all_urls, prefix, n_files=1):
dl_pairs = []
for i, seqs in enumerate(seqs_urls):
fasta_out = 'fasta/%s%d.fasta' % (prefix, i + 1)
if os.path.exists(fasta_out):
os.remove(fasta_out)
# keep any existing chunk file: fetch_bacterial now downloads in two
# batches and appends to the same fasta/bact*.fasta files
dl_pairs.append((fasta_out, seqs))

# run download in parallel
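One plausible chunking behind `seqs_urls` (an assumption, shown for orientation only; the actual grouping code sits outside this hunk): a round-robin split into N_FILES_BACT buckets.

N_FILES_BACT = 16
all_urls = ['url_%d' % i for i in range(100)]   # stand-in URLs
seqs_urls = [all_urls[i::N_FILES_BACT] for i in range(N_FILES_BACT)]
dl_pairs = [('fasta/bact%d.fasta' % (i + 1), seqs)
            for i, seqs in enumerate(seqs_urls)]
print(dl_pairs[0][0], len(dl_pairs[0][1]))      # fasta/bact1.fasta 7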
54 changes: 40 additions & 14 deletions src/virmet/fetch.py
@@ -4,12 +4,13 @@
import logging

from virmet.common import viral_query, bact_fung_query, ftp_down, run_child, \
download_genomes, get_accs, DB_DIR, N_FILES_BACT
download_genomes, get_accs, DB_DIR_UPDATE, N_FILES_BACT, random_reduction


def fetch_viral(viral_mode):
DB_DIR = DB_DIR_UPDATE
def fetch_viral(viral_mode, no_compression=False):
"""Download nucleotide or protein database."""
# define the search nuccore/protein

if viral_mode == 'n':
logging.info('downloading viral nuccore sequences')
target_dir = os.path.join(DB_DIR, 'viral_nuccore')
@@ -19,12 +20,13 @@ def fetch_viral(viral_mode):
target_dir = os.path.join(DB_DIR, 'viral_protein')
cml_search = viral_query('p')
# run the search and download
logging.info('Database real path: %s', os.path.realpath(target_dir))
os.chdir(target_dir)
run_child(cml_search)
cml_fetch_fasta = 'efetch -format fasta < ncbi_search > viral_database.fasta'
run_child(cml_fetch_fasta)
cml_efetch_xtract = 'efetch -format docsum < ncbi_search | xtract'
cml_efetch_xtract += ' -pattern DocumentSummary -element Caption TaxId Slen Organism Title > viral_seqs_info.tsv'
cml_efetch_xtract += ' -pattern DocumentSummary -element Caption TaxId Slen Organism Title AccessionVersion > viral_seqs_info.tsv'
run_child(cml_efetch_xtract)
logging.info('downloaded viral seqs info in %s', target_dir)
logging.info('saving viral taxonomy')
@@ -35,6 +37,17 @@ def fetch_viral(viral_mode):
accs_2 = set([l.split()[0] for l in open('viral_accn_taxid.dmp')])
assert accs_1 == accs_2, accs_1 ^ accs_2
logging.info('taxonomy and fasta sequences match')
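The accession cross-check above can be replayed on made-up records; on mismatch, the assertion message is the symmetric difference of the two sets:

fasta_accs = {'NC_045512.2', 'OQ000001.1'}            # accessions in the fasta
taxid_lines = ['NC_045512.2\t2697049', 'OQ000001.1\t2697049']
taxid_accs = {l.split()[0] for l in taxid_lines}      # accessions in viral_accn_taxid.dmp
assert fasta_accs == taxid_accs, fasta_accs ^ taxid_accs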

rmdup_cmd = 'seqkit rmdup -i viral_database.fasta -o viral_database_rmdup.fasta -D duplicated_names.txt'
run_child(rmdup_cmd)
os.rename('viral_database.fasta', 'viral_database_original.fasta')
os.rename('viral_database_rmdup.fasta', 'viral_database.fasta')

if not no_compression:
logging.info('compressing the viral database')
random_reduction(viral_mode)

os.chdir(DB_DIR)
logging.info('downloading taxonomy databases')
@@ -50,29 +63,39 @@ def fetch_viral(viral_mode):
os.remove(ftd)
except OSError:
logging.warning('Could not find file %s', ftd)


try:
run_child('bgzip names.dmp')
run_child('bgzip nodes.dmp')
except Exception:
logging.debug('Could not compress names.dmp and nodes.dmp.')

def fetch_bacterial():
"""Download the three bacterial sequence databases."""
target_dir = os.path.join(DB_DIR, 'bacteria')
logging.info('Database real path: %s', os.path.realpath(target_dir))
try:
os.mkdir(target_dir)
except FileExistsError:
pass
os.chdir(target_dir)

# first download summary file with all ftp paths and return urls
all_urls = bact_fung_query(query_type='bacteria')
logging.info('%d bacterial genomes were found', len(all_urls))
# then download genomic_fna.gz files
download_genomes(all_urls, prefix='bact', n_files=N_FILES_BACT)
# download in two batches; download_genomes appends to existing chunk files
mid = len(all_urls)//2
download_genomes(all_urls[:mid], prefix='bact', n_files=N_FILES_BACT)
logging.info('second half of the bacterial genomes starts')
download_genomes(all_urls[mid:], prefix='bact', n_files=N_FILES_BACT)
for j in range(1, N_FILES_BACT+1):
run_child('bgzip fasta/bact%d.fasta' % j)


def fetch_human():
"""Download human genome and annotations."""
target_dir = os.path.join(DB_DIR, 'human')
logging.info('Database real path: %s', os.path.realpath(target_dir))
try:
os.mkdir(target_dir)
except FileExistsError:
@@ -99,6 +122,7 @@ def fetch_human():
def fetch_fungal():
"""Download fungal sequences."""
target_dir = os.path.join(DB_DIR, 'fungi')
logging.info('Database real path: %s', os.path.realpath(target_dir))
try:
os.mkdir(target_dir)
except FileExistsError:
@@ -115,6 +139,7 @@ def fetch_fungal():
def fetch_bovine():
"""Download cow genome and annotations."""
target_dir = os.path.join(DB_DIR, 'bovine')
logging.info('Database real path: %s', os.path.realpath(target_dir))
try:
os.mkdir(target_dir)
except FileExistsError:
@@ -128,27 +153,27 @@ def fetch_bovine():
chromosomes = ['chr%d' % chrom for chrom in range(1, 30)]
chromosomes.append('chrX') # chrY is missing
logging.info('Downloading bovine genome')
local_file_name = os.path.join(target_dir, 'fasta', 'ref_Bos_taurus_GCF_002263795.1_ARS-UCD1.2.fasta')
local_file_name = os.path.join(target_dir, 'fasta', 'ref_Bos_taurus_GCF_002263795.2_ARS-UCD1.3.fasta')
if os.path.exists(local_file_name):
os.remove(local_file_name)
for chrom in chromosomes:
logging.debug('Downloading bovine chromosome %s', chrom)
fasta_url = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Bos_taurus/latest_assembly_versions/GCF_002263795.1_ARS-UCD1.2/GCF_002263795.1_ARS-UCD1.2_assembly_structure/Primary_Assembly/assembled_chromosomes/FASTA/%s.fna.gz' % chrom
fasta_url = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Bos_taurus/latest_assembly_versions/GCF_002263795.2_ARS-UCD1.3/GCF_002263795.2_ARS-UCD1.3_assembly_structure/Primary_Assembly/assembled_chromosomes/FASTA/%s.fna.gz' % chrom
download_handle = ftp_down(fasta_url, local_file_name)
download_handle.close()
logging.debug('Downloaded bovine chromosome %s', chrom)
fasta_url ='ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Bos_taurus/latest_assembly_versions/GCF_002263795.1_ARS-UCD1.2/GCF_002263795.1_ARS-UCD1.2_assembly_structure/non-nuclear/assembled_chromosomes/FASTA/chrMT.fna.gz'
fasta_url ='ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Bos_taurus/latest_assembly_versions/GCF_002263795.2_ARS-UCD1.3/GCF_002263795.2_ARS-UCD1.3_assembly_structure/non-nuclear/assembled_chromosomes/FASTA/chrMT.fna.gz'
download_handle = ftp_down(fasta_url, local_file_name)
download_handle.close()
logging.debug('Downloaded bovine chromosome MT')
fasta_url ='ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Bos_taurus/latest_assembly_versions/GCF_002263795.1_ARS-UCD1.2/GCF_002263795.1_ARS-UCD1.2_assembly_structure/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fna.gz'
fasta_url ='ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Bos_taurus/latest_assembly_versions/GCF_002263795.2_ARS-UCD1.3/GCF_002263795.2_ARS-UCD1.3_assembly_structure/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fna.gz'
download_handle = ftp_down(fasta_url, local_file_name)
download_handle.close()
logging.debug('Downloaded bovine chromosome unplaced')

run_child('bgzip %s' % local_file_name)
logging.info('Downloading gff annotation file')
gff_url = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Bos_taurus/latest_assembly_versions/GCF_002263795.1_ARS-UCD1.2/GCF_002263795.1_ARS-UCD1.2_genomic.gff.gz'
gff_url = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Bos_taurus/latest_assembly_versions/GCF_002263795.2_ARS-UCD1.3/GCF_002263795.2_ARS-UCD1.3_genomic.gff.gz'
download_handle = ftp_down(gff_url)
download_handle.close()

@@ -157,7 +182,8 @@ def main(args):
"""What the main does."""
logging.info('now in fetch_data')
if args.viral:
fetch_viral(args.viral)
fetch_viral(args.viral, args.no_db_compression)
if args.bact:
fetch_bacterial()
elif args.human:
8 changes: 4 additions & 4 deletions src/virmet/index.py
@@ -9,9 +9,9 @@
import datetime
from shutil import rmtree
import multiprocessing as mp
from virmet.common import run_child, DB_DIR, N_FILES_BACT

from virmet.common import run_child, DB_DIR_UPDATE, N_FILES_BACT

DB_DIR = DB_DIR_UPDATE

def single_bwa_index(index_params):
'''run a single bwa indexing job'''
in_fasta, index_prefix = index_params
@@ -23,7 +23,7 @@ def main(args):
def main(args):
'''only function doing all the indexing'''
logging.info('now in index')

logging.info('Database real path: %s', os.path.realpath(DB_DIR))
if args.viral == 'n':
target_dir = os.path.join(DB_DIR, 'viral_nuccore')
os.chdir(target_dir)
@@ -82,7 +82,7 @@ def main(args):
os.mkdir(bwa_dir)
except FileExistsError as err:
logging.warning('FileExistsError: %s' % err)
fasta_file = os.path.join(DB_DIR, 'bovine', 'fasta', 'ref_Bos_taurus_GCF_002263795.1_ARS-UCD1.2.fasta.gz')
fasta_file = os.path.join(DB_DIR, 'bovine', 'fasta', 'ref_Bos_taurus_GCF_002263795.2_ARS-UCD1.3.fasta.gz')
index_prefix = os.path.join(bwa_dir, 'bt_ref')
index_pairs.append((fasta_file, index_prefix))

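Each (fasta, prefix) pair ultimately drives a bwa index call; a hedged sketch of the job body, assuming run_child wraps a shell invocation (bwa must be on PATH):

import subprocess

def single_bwa_index(index_params):
    '''Run a single bwa indexing job (sketch of the function truncated above).'''
    in_fasta, index_prefix = index_params
    return subprocess.run('bwa index -p %s %s' % (index_prefix, in_fasta),
                          shell=True, check=True)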
