add viral database compression
MaryamZaheri committed Mar 14, 2023
1 parent 0f4dae7 commit edc8f7d
Showing 6 changed files with 168 additions and 37 deletions.
2 changes: 2 additions & 0 deletions src/virmet/cli.py
@@ -98,6 +98,7 @@ def main():
parser_fetch.add_argument('--bact', help='bacterial (RefSeq)', action='store_true')
parser_fetch.add_argument('--fungal', help='fungal (RefSeq)', action='store_true')
parser_fetch.add_argument('--bovine', help='bovine (Bos taurus)', action='store_true')
parser_fetch.add_argument('--no_db_compression', help='do not compress the viral database', action='store_true', default=False)
parser_fetch.set_defaults(func=fetch_db)

# create the parser for command "update"
@@ -106,6 +107,7 @@ def main():
parser_update.add_argument('--bact', help='update bacterial database', action='store_true', default=False)
parser_update.add_argument('--fungal', help='update fungal database', action='store_true', default=False)
parser_update.add_argument('--picked', help='file with additional sequences, one GI per line', default=None)
parser_update.add_argument('--update_min_date', help='update viral [n]ucleotide/[p]rotein databases with sequences published after the date YYYY/MM/DD (e.g. 2022/09/01)', default=None)
parser_update.set_defaults(func=update_db)

# create the parser for command "index"
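Taken together, the two new options behave as in this stripped-down parser (an illustrative sketch, not the full virmet CLI; the set_defaults dispatch is omitted):

import argparse

parser = argparse.ArgumentParser(prog='virmet')
sub = parser.add_subparsers(dest='command')

fetch = sub.add_parser('fetch')
fetch.add_argument('--viral', choices=['n', 'p'])
fetch.add_argument('--no_db_compression', action='store_true', default=False)

update = sub.add_parser('update')
update.add_argument('--viral', choices=['n', 'p'])
update.add_argument('--update_min_date', default=None)

args = parser.parse_args(['fetch', '--viral', 'n'])
print(args.no_db_compression)  # False -> the fetched viral database is compressed
args = parser.parse_args(['update', '--viral', 'n', '--update_min_date', '2022/09/01'])
print(args.update_min_date)    # 2022/09/01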
98 changes: 86 additions & 12 deletions src/virmet/common.py
@@ -10,8 +10,10 @@
import multiprocessing as mp
import pandas as pd

DB_DIR = '/data/virmet_databases/'
N_FILES_BACT = 5
DB_DIR = '/data/virmet_databases'
DB_DIR_UPDATE = '/data/virmet_databases_update'
N_FILES_BACT = 16
MAX_TAXID = 10000 # max number of sequences belonging to the same taxid in compressed viral database

# retry decorator, taken from RepoPhlan
# https://bitbucket.org/nsegata/repophlan/src/5804db9d341165f72c4f7e448691ce90a1465764/repophlan_get_microbes.py?at=ncbi2017&fileviewer=file-view-default
@@ -82,6 +84,8 @@ def ftp_down(remote_url, local_url=None):

# decompressing
elif remote_url.endswith('.gz') and not outname.endswith('.gz'):
if os.path.exists(outname):
outhandle = open(outname, 'a')
else:
@@ -98,7 +102,7 @@ else:
else:
outhandle = open(outname, 'wb')
logging.debug('Downloading %s', remote_url)
with urllib.request.urlopen(remote_url, timeout=30) as f:
with urllib.request.urlopen(remote_url, timeout=300) as f:
outhandle.write(f.read())

# uncompressed to uncompressed
@@ -115,28 +119,97 @@ def ftp_down(remote_url, local_url=None):
# outhandle.close()
return outhandle
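The urlopen timeout was raised from 30 s to 300 s so that large RefSeq files on slow mirrors do not abort mid-transfer. A minimal sketch of the pattern (fetch_bytes is a hypothetical helper, not part of virmet):

import urllib.request

def fetch_bytes(remote_url, timeout=300):
    """Download a URL into memory; a stalled connection fails after `timeout` seconds."""
    with urllib.request.urlopen(remote_url, timeout=timeout) as f:
        return f.read()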

def random_reduction(viral_mode):
"""Subsample taxids that are over-represented in the viral database.

Sequences are grouped by taxid; any taxid with more than MAX_TAXID
(10,000) reference sequences is randomly subsampled down to that cap,
while all RefSeq entries are kept. The motivation is SARS-CoV-2, which
accounted for roughly 90% of the viral database at the beginning of 2023.
"""

if viral_mode == 'n':
logging.info('compressing viral nuccore sequences')
target_dir = os.path.join(DB_DIR_UPDATE, 'viral_nuccore')
elif viral_mode == 'p':
logging.info('compressing viral protein sequences')
target_dir = os.path.join(DB_DIR_UPDATE, 'viral_protein')
logging.info('Database real path for compression: %s', os.path.realpath(target_dir))
os.chdir(target_dir)

viral_info_file = os.path.join(target_dir, 'viral_seqs_info.tsv')
viral_fasta_file = os.path.join(target_dir, 'viral_database.fasta')

viral_info = pd.read_table(viral_info_file, names=['accn', 'TaxId', 'seq_len', 'Organism', 'Title', 'accn_version'])
viral_info.drop_duplicates(subset=['accn_version'], inplace=True)
# Compress at the taxid level:
# move any taxid with more than MAX_TAXID reference sequences to a separate
# pandas table and subsample it out of the main table
TaxId_to_counter_df = viral_info['TaxId'].value_counts().reset_index()
TaxId_to_counter_df.rename(columns={"TaxId": "TaxId_count", "index": "TaxId_num"}, inplace=True)
TaxId_to_percentage = viral_info['TaxId'].value_counts(normalize=True).reset_index()
TaxId_to_counter_df['percentage'] = TaxId_to_percentage['TaxId']
TaxId_to_counter_filtered_df = TaxId_to_counter_df[TaxId_to_counter_df["TaxId_count"] > MAX_TAXID]

viral_info_subsampled = viral_info.copy()
random.seed(100)
removed_set = set()
for _, row in TaxId_to_counter_filtered_df.iterrows():
taxid_to_subsample = int(row['TaxId_num'])
viral_info_to_subsample_df = viral_info_subsampled[viral_info_subsampled['TaxId'] == taxid_to_subsample]
accn_set = set(viral_info_to_subsample_df['accn_version'])
# subsample this taxid down to MAX_TAXID sequences; sorted() keeps the draw
# reproducible and satisfies random.sample on Python >= 3.11
selected_accn_ls = random.sample(sorted(accn_set), MAX_TAXID)

# keep rows of other taxids, rows that were selected, and all RefSeq entries
# (RefSeq accessions contain the character '_')
viral_info_subsampled = viral_info_subsampled.loc[(viral_info_subsampled['TaxId'] != taxid_to_subsample) | viral_info_subsampled['accn_version'].isin(selected_accn_ls) | viral_info_subsampled['accn_version'].str.contains('_')]
# record the accessions that were dropped, as extra information
removed_set = removed_set | (accn_set - set(selected_accn_ls))

viral_info_subsampled.drop_duplicates(subset=['accn_version'], inplace=True)
# write one accession per line; this list is consumed by seqtk below
viral_info_subsampled.accn_version.to_csv('outfile.csv', sep='\n', index=False, header=False)
# extract the selected accessions from the fasta file using seqtk
subsample_fasta_command = 'seqtk subseq %s outfile.csv > viral_database_subsampled.fasta' %(viral_fasta_file)
run_child(subsample_fasta_command)
os.rename('viral_database.fasta', 'viral_database_original_rmdup.fasta')
os.rename('viral_database_subsampled.fasta', 'viral_database.fasta')
TaxId_to_counter_filtered_df.to_csv('filtered_taxids.csv', sep=',', index=False)
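A toy version of the thresholding step in random_reduction, with the cap lowered from MAX_TAXID to 2 so the effect shows on five made-up records:

import pandas as pd

toy = pd.DataFrame({
    'accn_version': ['OQ000001.1', 'OQ000002.1', 'NC_045512.2', 'OQ000003.1', 'AF033819.3'],
    'TaxId': [2697049, 2697049, 2697049, 2697049, 11676],
})
cap = 2  # stands in for MAX_TAXID
counts = (toy['TaxId'].value_counts()
          .rename_axis('TaxId_num').reset_index(name='TaxId_count'))
print(counts[counts['TaxId_count'] > cap]['TaxId_num'].tolist())
# [2697049] -> only this taxid is subsampled down to the cap; NC_045512.2,
# a RefSeq accession (contains '_'), survives the filter unconditionally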

def viral_query(viral_db):
def viral_query(viral_db, update_min_date=None):
# Viruses, Taxonomy ID: 10239
# Human adenovirus A, Taxonomy ID: 129875 (only for testing, 7 hits)
# Mastadenovirus, Taxonomy ID: 10509 (only for testing, 440 hits)
# Alphatorquevirus Taxonomy ID: 687331
# Cellular organisms, Taxonomy ID: 131567 (to avoid chimeras)
txid = '10239' # change here for viruses or smaller taxa
query_text = '-query \"txid%s [orgn] AND (\\"complete genome\\" [Title] OR srcdb_refseq[prop])' % txid
query_text += ' NOT wgs[PROP] NOT \\"cellular organisms\\"[Organism] NOT AC_000001[PACC] : AC_999999[PACC]\"'
query_text = '-query \"txid%s [orgn] AND (\\"complete genome\\" [Title] OR \\"complete segment\\" [Title] OR srcdb_refseq[prop])' % txid
query_text += ' NOT \\"cellular organisms\\"[Organism] NOT AC_000001[PACC] : AC_999999[PACC]\"'

if update_min_date:
logging.info('Viral database update restricted to sequences published after %s', update_min_date)
query_text += ' -datetype PDAT -mindate %s' % update_min_date # PDAT: publication date; MDAT would filter by modification date

query_text += ' > ncbi_search'

if viral_db == 'n':
target_dir = os.path.join(DB_DIR, 'viral_nuccore')
target_dir = os.path.join(DB_DIR_UPDATE, 'viral_nuccore')
search_text = '-db nuccore ' + query_text
elif viral_db == 'p':
target_dir = os.path.join(DB_DIR, 'viral_protein')
target_dir = os.path.join(DB_DIR_UPDATE, 'viral_protein')
search_text = '-db protein ' + query_text
try:
os.mkdir(target_dir)
except FileExistsError:
pass
os.chdir(target_dir)
logging.info('Database real path: %s', os.path.realpath(target_dir))
return 'esearch ' + search_text
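For orientation, a hedged reconstruction of the full command string the builder emits for nucleotide mode with a date floor (simplified from viral_query; note the leading space before -datetype):

txid = '10239'
query_text = ('-query \"txid%s [orgn] AND (\\"complete genome\\" [Title] OR '
              '\\"complete segment\\" [Title] OR srcdb_refseq[prop])' % txid)
query_text += ' NOT \\"cellular organisms\\"[Organism] NOT AC_000001[PACC] : AC_999999[PACC]\"'
update_min_date = '2022/09/01'
if update_min_date:
    query_text += ' -datetype PDAT -mindate %s' % update_min_date
query_text += ' > ncbi_search'
print('esearch -db nuccore ' + query_text)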


Expand All @@ -158,7 +231,7 @@ def bact_fung_query(query_type=None, download=True, info_file=None):
with urllib.request.urlopen(url) as f:
print(f.read().decode('utf-8'), file=bh)
bh.close()
querinfo = pd.read_csv(info_file, sep='\t', header=0, skiprows=1)
querinfo = pd.read_csv(info_file, sep='\t', header=0, skiprows=1, dtype={'excluded_from_refseq': str})
querinfo.rename(columns={'# assembly_accession': 'assembly_accession'}, inplace=True)
if query_type == 'bacteria':
gb = querinfo[(querinfo.assembly_level == 'Complete Genome') &
@@ -169,7 +242,8 @@ def bact_fung_query(query_type=None, download=True, info_file=None):
(querinfo.version_status == 'latest') &
(querinfo.genome_rep == 'Full') &
(querinfo.release_type == 'Major')]
gb.set_index('assembly_accession')
gb.set_index('assembly_accession', inplace=True)
gb = gb[gb['ftp_path'] != 'na'] # drop assemblies without an ftp path
x = gb['ftp_path'].apply(lambda col: col + '/' + col.split('/')[-1] + '_genomic.fna.gz')
gb = gb.assign(ftp_genome_path=x)
all_urls = list(gb['ftp_genome_path'])
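A worked example of the ftp_path-to-URL mapping above; the E. coli K-12 assembly path is used purely as an illustration:

ftp_path = ('https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/005/845/'
            'GCF_000005845.2_ASM584v2')
genome_url = ftp_path + '/' + ftp_path.split('/')[-1] + '_genomic.fna.gz'
print(genome_url.rsplit('/', 1)[-1])  # GCF_000005845.2_ASM584v2_genomic.fna.gz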
@@ -196,8 +270,8 @@ def download_genomes(all_urls, prefix, n_files=1):
dl_pairs = []
for i, seqs in enumerate(seqs_urls):
fasta_out = 'fasta/%s%d.fasta' % (prefix, i + 1)
if os.path.exists(fasta_out):
os.remove(fasta_out)
# keep any existing chunk file: fetch_bacterial now downloads in two
# batches and appends to the same fasta/bact*.fasta files
dl_pairs.append((fasta_out, seqs))

# run download in parallel
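One plausible chunking behind `seqs_urls` (an assumption, shown for orientation only; the actual grouping code sits outside this hunk): a round-robin split into N_FILES_BACT buckets.

N_FILES_BACT = 16
all_urls = ['url_%d' % i for i in range(100)]   # stand-in URLs
seqs_urls = [all_urls[i::N_FILES_BACT] for i in range(N_FILES_BACT)]
dl_pairs = [('fasta/bact%d.fasta' % (i + 1), seqs)
            for i, seqs in enumerate(seqs_urls)]
print(dl_pairs[0][0], len(dl_pairs[0][1]))      # fasta/bact1.fasta 7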
54 changes: 40 additions & 14 deletions src/virmet/fetch.py
@@ -4,12 +4,13 @@
import logging

from virmet.common import viral_query, bact_fung_query, ftp_down, run_child, \
download_genomes, get_accs, DB_DIR, N_FILES_BACT
download_genomes, get_accs, DB_DIR_UPDATE, N_FILES_BACT, random_reduction


def fetch_viral(viral_mode):
DB_DIR = DB_DIR_UPDATE
def fetch_viral(viral_mode, no_compression=False):
"""Download nucleotide or protein database."""
# define the search nuccore/protein

if viral_mode == 'n':
logging.info('downloading viral nuccore sequences')
target_dir = os.path.join(DB_DIR, 'viral_nuccore')
@@ -19,12 +20,13 @@ def fetch_viral(viral_mode):
target_dir = os.path.join(DB_DIR, 'viral_protein')
cml_search = viral_query('p')
# run the search and download
logging.info('Database real path: %s', os.path.realpath(target_dir))
os.chdir(target_dir)
run_child(cml_search)
cml_fetch_fasta = 'efetch -format fasta < ncbi_search > viral_database.fasta'
run_child(cml_fetch_fasta)
cml_efetch_xtract = 'efetch -format docsum < ncbi_search | xtract'
cml_efetch_xtract += ' -pattern DocumentSummary -element Caption TaxId Slen Organism Title > viral_seqs_info.tsv'
cml_efetch_xtract += ' -pattern DocumentSummary -element Caption TaxId Slen Organism Title AccessionVersion > viral_seqs_info.tsv'
run_child(cml_efetch_xtract)
logging.info('downloaded viral seqs info in %s', target_dir)
logging.info('saving viral taxonomy')
@@ -35,6 +37,17 @@ def fetch_viral(viral_mode):
accs_2 = set([l.split()[0] for l in open('viral_accn_taxid.dmp')])
assert accs_1 == accs_2, accs_1 ^ accs_2
logging.info('taxonomy and fasta sequences match')
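The accession cross-check above can be replayed on made-up records; on mismatch, the assertion message is the symmetric difference of the two sets:

fasta_accs = {'NC_045512.2', 'OQ000001.1'}            # accessions in the fasta
taxid_lines = ['NC_045512.2\t2697049', 'OQ000001.1\t2697049']
taxid_accs = {l.split()[0] for l in taxid_lines}      # accessions in viral_accn_taxid.dmp
assert fasta_accs == taxid_accs, fasta_accs ^ taxid_accs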

rmdup_cmd = 'seqkit rmdup -i viral_database.fasta -o viral_database_rmdup.fasta -D duplicated_names.txt'
run_child(rmdup_cmd)
os.rename('viral_database.fasta', 'viral_database_original.fasta')
os.rename('viral_database_rmdup.fasta', 'viral_database.fasta')

if not no_compression:
logging.info('compressing the viral database')
random_reduction(viral_mode)

os.chdir(DB_DIR)
logging.info('downloading taxonomy databases')
@@ -50,29 +63,39 @@ def fetch_viral(viral_mode):
os.remove(ftd)
except OSError:
logging.warning('Could not find file %s', ftd)


try:
run_child('bgzip names.dmp')
run_child('bgzip nodes.dmp')
except Exception:
logging.debug('Could not compress names.dmp and nodes.dmp.')

def fetch_bacterial():
"""Download the three bacterial sequence databases."""
target_dir = os.path.join(DB_DIR, 'bacteria')
logging.info('Database real path: %s', os.path.realpath(target_dir))
try:
os.mkdir(target_dir)
except FileExistsError:
pass
os.chdir(target_dir)

# first download summary file with all ftp paths and return urls
all_urls = bact_fung_query(query_type='bacteria')
logging.info('%d bacterial genomes were found', len(all_urls))
# then download genomic_fna.gz files
download_genomes(all_urls, prefix='bact', n_files=N_FILES_BACT)
# download in two batches; download_genomes appends to existing chunk files
mid = len(all_urls)//2
download_genomes(all_urls[:mid], prefix='bact', n_files=N_FILES_BACT)
logging.info('second half of the bacterial genomes starts')
download_genomes(all_urls[mid:], prefix='bact', n_files=N_FILES_BACT)
for j in range(1, N_FILES_BACT+1):
run_child('bgzip fasta/bact%d.fasta' % j)


def fetch_human():
"""Download human genome and annotations."""
target_dir = os.path.join(DB_DIR, 'human')
logging.info('Database real path: %s', os.path.realpath(target_dir))
try:
os.mkdir(target_dir)
except FileExistsError:
@@ -99,6 +122,7 @@ def fetch_human():
def fetch_fungal():
"""Download fungal sequences."""
target_dir = os.path.join(DB_DIR, 'fungi')
logging.info('Database real path: %s', os.path.realpath(target_dir))
try:
os.mkdir(target_dir)
except FileExistsError:
@@ -115,6 +139,7 @@ def fetch_fungal():
def fetch_bovine():
"""Download cow genome and annotations."""
target_dir = os.path.join(DB_DIR, 'bovine')
logging.info('Database real path: %s', os.path.realpath(target_dir))
try:
os.mkdir(target_dir)
except FileExistsError:
@@ -128,27 +153,27 @@ def fetch_bovine():
chromosomes = ['chr%d' % chrom for chrom in range(1, 30)]
chromosomes.append('chrX') # chrY is missing
logging.info('Downloading bovine genome')
local_file_name = os.path.join(target_dir, 'fasta', 'ref_Bos_taurus_GCF_002263795.1_ARS-UCD1.2.fasta')
local_file_name = os.path.join(target_dir, 'fasta', 'ref_Bos_taurus_GCF_002263795.2_ARS-UCD1.3.fasta')
if os.path.exists(local_file_name):
os.remove(local_file_name)
for chrom in chromosomes:
logging.debug('Downloading bovine chromosome %s', chrom)
fasta_url = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Bos_taurus/latest_assembly_versions/GCF_002263795.1_ARS-UCD1.2/GCF_002263795.1_ARS-UCD1.2_assembly_structure/Primary_Assembly/assembled_chromosomes/FASTA/%s.fna.gz' % chrom
fasta_url = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Bos_taurus/latest_assembly_versions/GCF_002263795.2_ARS-UCD1.3/GCF_002263795.2_ARS-UCD1.3_assembly_structure/Primary_Assembly/assembled_chromosomes/FASTA/%s.fna.gz' % chrom
download_handle = ftp_down(fasta_url, local_file_name)
download_handle.close()
logging.debug('Downloaded bovine chromosome %s', chrom)
fasta_url ='ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Bos_taurus/latest_assembly_versions/GCF_002263795.1_ARS-UCD1.2/GCF_002263795.1_ARS-UCD1.2_assembly_structure/non-nuclear/assembled_chromosomes/FASTA/chrMT.fna.gz'
fasta_url ='ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Bos_taurus/latest_assembly_versions/GCF_002263795.2_ARS-UCD1.3/GCF_002263795.2_ARS-UCD1.3_assembly_structure/non-nuclear/assembled_chromosomes/FASTA/chrMT.fna.gz'
download_handle = ftp_down(fasta_url, local_file_name)
download_handle.close()
logging.debug('Downloaded bovine chromosome MT')
fasta_url ='ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Bos_taurus/latest_assembly_versions/GCF_002263795.1_ARS-UCD1.2/GCF_002263795.1_ARS-UCD1.2_assembly_structure/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fna.gz'
fasta_url ='ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Bos_taurus/latest_assembly_versions/GCF_002263795.2_ARS-UCD1.3/GCF_002263795.2_ARS-UCD1.3_assembly_structure/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fna.gz'
download_handle = ftp_down(fasta_url, local_file_name)
download_handle.close()
logging.debug('Downloaded bovine chromosome unplaced')

run_child('bgzip %s' % local_file_name)
logging.info('Downloading gff annotation file')
gff_url = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Bos_taurus/latest_assembly_versions/GCF_002263795.1_ARS-UCD1.2/GCF_002263795.1_ARS-UCD1.2_genomic.gff.gz'
gff_url = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Bos_taurus/latest_assembly_versions/GCF_002263795.2_ARS-UCD1.3/GCF_002263795.2_ARS-UCD1.3_genomic.gff.gz'
download_handle = ftp_down(gff_url)
download_handle.close()

@@ -157,7 +182,8 @@ def main(args):
"""What the main does."""
logging.info('now in fetch_data')
if args.viral:
fetch_viral(args.viral)
fetch_viral(args.viral, args.no_db_compression)
if args.bact:
fetch_bacterial()
elif args.human:
8 changes: 4 additions & 4 deletions src/virmet/index.py
@@ -9,9 +9,9 @@
import datetime
from shutil import rmtree
import multiprocessing as mp
from virmet.common import run_child, DB_DIR, N_FILES_BACT

from virmet.common import run_child, DB_DIR_UPDATE, N_FILES_BACT

DB_DIR = DB_DIR_UPDATE

def single_bwa_index(index_params):
'''run a single bwa indexing job'''
in_fasta, index_prefix = index_params
@@ -23,7 +23,7 @@ def main(args):
def main(args):
'''only function doing all the indexing'''
logging.info('now in index')

logging.info('Database real path: %s', os.path.realpath(DB_DIR))
if args.viral == 'n':
target_dir = os.path.join(DB_DIR, 'viral_nuccore')
os.chdir(target_dir)
@@ -82,7 +82,7 @@ def main(args):
os.mkdir(bwa_dir)
except FileExistsError as err:
logging.warning('FileExistsError: %s' % err)
fasta_file = os.path.join(DB_DIR, 'bovine', 'fasta', 'ref_Bos_taurus_GCF_002263795.1_ARS-UCD1.2.fasta.gz')
fasta_file = os.path.join(DB_DIR, 'bovine', 'fasta', 'ref_Bos_taurus_GCF_002263795.2_ARS-UCD1.3.fasta.gz')
index_prefix = os.path.join(bwa_dir, 'bt_ref')
index_pairs.append((fasta_file, index_prefix))

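Each (fasta, prefix) pair ultimately drives a bwa index call; a hedged sketch of the job body, assuming run_child wraps a shell invocation (bwa must be on PATH):

import subprocess

def single_bwa_index(index_params):
    '''Run a single bwa indexing job (sketch of the function truncated above).'''
    in_fasta, index_prefix = index_params
    return subprocess.run('bwa index -p %s %s' % (index_prefix, in_fasta),
                          shell=True, check=True)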
