Skip to content

Commit

Permalink
Sets download of precomputed indexes as the default option when insta…
Browse files Browse the repository at this point in the history
…lling the database
  • Loading branch information
abmiguez committed Jan 17, 2023
1 parent 611d37d commit fe40ea5
Showing 1 changed file with 68 additions and 55 deletions.
123 changes: 68 additions & 55 deletions metaphlan/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,46 +83,26 @@ def download(url, download_file, force=False):
sys.stderr.write("\nWarning: Unable to download " + url + "\n")
else:
sys.stderr.write("\nFile {} already present!\n".format(download_file))

def download_unpack_tar(download_file_name, folder, bowtie2_build, nproc, use_zenodo):
"""
Download the url to the file and decompress into the folder
"""

# Create the folder if it does not already exist
if not os.path.isdir(folder):
try:
os.makedirs(folder)
except EnvironmentError:
sys.exit("ERROR: Unable to create folder for database install: " + folder)

# Check the directory permissions
if not os.access(folder, os.W_OK):
sys.exit("ERROR: The directory is not writeable: " + folder + ". "
"Please modify the permissions.")

#local path of the tarfile and md5file


def download_and_untar(download_file_name, folder, origin):
#local path of the tarfile and md5file
tar_file = os.path.join(folder, download_file_name + ".tar")
md5_file = os.path.join(folder, download_file_name + ".md5")

#Download the list of all the files in the FPT
url_tar_file = "http://cmprod1.cibio.unitn.it/biobakery4/metaphlan_databases/{}.tar".format(download_file_name)
url_md5_file = "http://cmprod1.cibio.unitn.it/biobakery4/metaphlan_databases/{}.md5".format(download_file_name)

url_tar_file = "{}/{}.tar".format(origin, download_file_name)
url_md5_file = "{}/{}.md5".format(origin, download_file_name)
# download tar and MD5 checksum
download(url_tar_file, tar_file)
download(url_md5_file, md5_file)

md5_md5 = None
md5_tar = None

if os.path.isfile(md5_file):
with open(md5_file) as f:
for row in f:
md5_md5 = row.strip().split(' ')[0]
else:
sys.stderr.write('File "{}" not found!\n'.format(md5_file))

# compute MD5 of .tar.bz2
if os.path.isfile(tar_file):
hash_md5 = hashlib.md5()
Expand All @@ -134,23 +114,44 @@ def download_unpack_tar(download_file_name, folder, bowtie2_build, nproc, use_ze
md5_tar = hash_md5.hexdigest()[:32]
else:
sys.stderr.write('File "{}" not found!\n'.format(tar_file))

if (md5_tar is None) or (md5_md5 is None):
sys.exit("MD5 checksums not found, something went wrong!")

# compare checksums
if md5_tar != md5_md5:
sys.exit("MD5 checksums do not correspond! If this happens again, you should remove the database files and "
"rerun MetaPhlAn so they are re-downloaded")

# untar
try:
tarfile_handle = tarfile.open(tar_file)
tarfile_handle.extractall(path=folder)
tarfile_handle.close()
os.remove(tar_file)
os.remove(md5_file)
except EnvironmentError:
sys.stderr.write("Warning: Unable to extract {}.\n".format(tar_file))

def download_unpack_tar(download_file_name, folder, bowtie2_build, nproc, use_zenodo):
"""
Download the url to the file and decompress into the folder
"""

# Create the folder if it does not already exist
if not os.path.isdir(folder):
try:
os.makedirs(folder)
except EnvironmentError:
sys.exit("ERROR: Unable to create folder for database install: " + folder)

# Check the directory permissions
if not os.access(folder, os.W_OK):
sys.exit("ERROR: The directory is not writeable: " + folder + ". "
"Please modify the permissions.")

sys.stderr.write('\n\Downloading and uncompressing indexes\n')
download_and_untar("{}_bt2".format(download_file_name), folder, "http://cmprod1.cibio.unitn.it/biobakery4/metaphlan_databases/bowtie2_indexes")
sys.stderr.write('\nDownloading and uncompressing additional files\n')
download_and_untar(download_file_name, folder, "http://cmprod1.cibio.unitn.it/biobakery4/metaphlan_databases")

# uncompress sequences
for bz2_file in iglob(os.path.join(folder, download_file_name + "_*.fna.bz2")):
fna_file = bz2_file[:-4]
Expand All @@ -162,37 +163,20 @@ def download_unpack_tar(download_file_name, folder, bowtie2_build, nproc, use_ze
bz2.BZ2File(bz2_file, 'rb') as bz2_h:
for data in iter(lambda: bz2_h.read(100 * 1024), b''):
fna_h.write(data)
os.remove(bz2_file)

# join fasta files
sys.stderr.write('\n\nJoining FASTA databases\n')
with open(os.path.join(folder, download_file_name + ".fna"), 'w') as fna_h:
for fna_file in iglob(os.path.join(folder, download_file_name + "_*.fna")):
with open(fna_file, 'r') as fna_r:
for line in fna_r:
fna_h.write(line)
fna_file = os.path.join(folder, download_file_name + ".fna")
os.remove(bz2_file)

# build bowtie2 indexes
if not glob(os.path.join(folder, download_file_name + "*.bt2l")):
bt2_base = os.path.join(folder, download_file_name)
bt2_cmd = [bowtie2_build, '--quiet']

if nproc > 1:
bt2_build_output = subp.check_output([bowtie2_build, '--usage'], stderr=subp.STDOUT)

if 'threads' in str(bt2_build_output):
bt2_cmd += ['--threads', str(nproc)]

bt2_cmd += ['-f', fna_file, bt2_base]

sys.stderr.write('\nBuilding Bowtie2 indexes\n')

build_bwt_indexes(folder, download_file_name, bowtie2_build, nproc)
else:
try:
subp.check_call(bt2_cmd)
subp.check_call(['bowtie2-inspect', '-n', os.path.join(folder, download_file_name)], stdout=subp.DEVNULL, stderr=subp.DEVNULL)
except Exception as e:
sys.stderr.write("Fatal error running '{}'\nError message: '{}'\n\n".format(' '.join(bt2_cmd), e))
sys.exit(1)
sys.stderr.write('Downloaded indexes are not compatible with the installed verion of Bowtie2\n')
sys.stderr.write('Building indexes from the FASTA files\n')
for btw_file in iglob(os.path.join(folder, download_file_name + "*.bt2l")):
os.remove(btw_file)
build_bwt_indexes(folder, download_file_name, bowtie2_build, nproc)

try:
for bt2 in glob(os.path.join(folder, download_file_name + "*.bt2l")):
Expand All @@ -207,6 +191,35 @@ def download_unpack_tar(download_file_name, folder, bowtie2_build, nproc, use_ze
for fna_file in iglob(os.path.join(folder, download_file_name + "_*.fna")):
if not fna_file.endswith('_VSG.fna'):
os.remove(fna_file)


def build_bwt_indexes(folder, download_file_name, bowtie2_build, nproc):
sys.stderr.write('\n\nJoining FASTA databases\n')
with open(os.path.join(folder, download_file_name + ".fna"), 'w') as fna_h:
for fna_file in iglob(os.path.join(folder, download_file_name + "_*.fna")):
with open(fna_file, 'r') as fna_r:
for line in fna_r:
fna_h.write(line)
fna_file = os.path.join(folder, download_file_name + ".fna")

bt2_base = os.path.join(folder, download_file_name)
bt2_cmd = [bowtie2_build, '--quiet']

if nproc > 1:
bt2_build_output = subp.check_output([bowtie2_build, '--usage'], stderr=subp.STDOUT)

if 'threads' in str(bt2_build_output):
bt2_cmd += ['--threads', str(nproc)]

bt2_cmd += ['-f', fna_file, bt2_base]

sys.stderr.write('\nBuilding Bowtie2 indexes\n')

try:
subp.check_call(bt2_cmd)
except Exception as e:
sys.stderr.write("Fatal error running '{}'\nError message: '{}'\n\n".format(' '.join(bt2_cmd), e))
sys.exit(1)

def download_unpack_zip(url,download_file_name,folder,software_name):
"""
Expand Down

0 comments on commit fe40ea5

Please sign in to comment.