From fe40ea530157dafd71178f4b399590083ad28759 Mon Sep 17 00:00:00 2001 From: Aitor Blanco Date: Tue, 17 Jan 2023 16:59:53 +0100 Subject: [PATCH] Sets download of precomputed indexes as the default option when installing the database --- metaphlan/__init__.py | 123 +++++++++++++++++++++++------------------- 1 file changed, 68 insertions(+), 55 deletions(-) diff --git a/metaphlan/__init__.py b/metaphlan/__init__.py index 2c8b631..4e17731 100644 --- a/metaphlan/__init__.py +++ b/metaphlan/__init__.py @@ -83,46 +83,26 @@ def download(url, download_file, force=False): sys.stderr.write("\nWarning: Unable to download " + url + "\n") else: sys.stderr.write("\nFile {} already present!\n".format(download_file)) - -def download_unpack_tar(download_file_name, folder, bowtie2_build, nproc, use_zenodo): - """ - Download the url to the file and decompress into the folder - """ - - # Create the folder if it does not already exist - if not os.path.isdir(folder): - try: - os.makedirs(folder) - except EnvironmentError: - sys.exit("ERROR: Unable to create folder for database install: " + folder) - - # Check the directory permissions - if not os.access(folder, os.W_OK): - sys.exit("ERROR: The directory is not writeable: " + folder + ". " - "Please modify the permissions.") - - #local path of the tarfile and md5file + + +def download_and_untar(download_file_name, folder, origin): + #local path of the tarfile and md5file tar_file = os.path.join(folder, download_file_name + ".tar") md5_file = os.path.join(folder, download_file_name + ".md5") - #Download the list of all the files in the FPT - url_tar_file = "http://cmprod1.cibio.unitn.it/biobakery4/metaphlan_databases/{}.tar".format(download_file_name) - url_md5_file = "http://cmprod1.cibio.unitn.it/biobakery4/metaphlan_databases/{}.md5".format(download_file_name) - + url_tar_file = "{}/{}.tar".format(origin, download_file_name) + url_md5_file = "{}/{}.md5".format(origin, download_file_name) # download tar and MD5 checksum download(url_tar_file, tar_file) download(url_md5_file, md5_file) - md5_md5 = None md5_tar = None - if os.path.isfile(md5_file): with open(md5_file) as f: for row in f: md5_md5 = row.strip().split(' ')[0] else: sys.stderr.write('File "{}" not found!\n'.format(md5_file)) - # compute MD5 of .tar.bz2 if os.path.isfile(tar_file): hash_md5 = hashlib.md5() @@ -134,23 +114,44 @@ def download_unpack_tar(download_file_name, folder, bowtie2_build, nproc, use_ze md5_tar = hash_md5.hexdigest()[:32] else: sys.stderr.write('File "{}" not found!\n'.format(tar_file)) - if (md5_tar is None) or (md5_md5 is None): sys.exit("MD5 checksums not found, something went wrong!") - # compare checksums if md5_tar != md5_md5: sys.exit("MD5 checksums do not correspond! If this happens again, you should remove the database files and " "rerun MetaPhlAn so they are re-downloaded") - # untar try: tarfile_handle = tarfile.open(tar_file) tarfile_handle.extractall(path=folder) tarfile_handle.close() + os.remove(tar_file) + os.remove(md5_file) except EnvironmentError: sys.stderr.write("Warning: Unable to extract {}.\n".format(tar_file)) +def download_unpack_tar(download_file_name, folder, bowtie2_build, nproc, use_zenodo): + """ + Download the url to the file and decompress into the folder + """ + + # Create the folder if it does not already exist + if not os.path.isdir(folder): + try: + os.makedirs(folder) + except EnvironmentError: + sys.exit("ERROR: Unable to create folder for database install: " + folder) + + # Check the directory permissions + if not os.access(folder, os.W_OK): + sys.exit("ERROR: The directory is not writeable: " + folder + ". " + "Please modify the permissions.") + + sys.stderr.write('\n\Downloading and uncompressing indexes\n') + download_and_untar("{}_bt2".format(download_file_name), folder, "http://cmprod1.cibio.unitn.it/biobakery4/metaphlan_databases/bowtie2_indexes") + sys.stderr.write('\nDownloading and uncompressing additional files\n') + download_and_untar(download_file_name, folder, "http://cmprod1.cibio.unitn.it/biobakery4/metaphlan_databases") + # uncompress sequences for bz2_file in iglob(os.path.join(folder, download_file_name + "_*.fna.bz2")): fna_file = bz2_file[:-4] @@ -162,37 +163,20 @@ def download_unpack_tar(download_file_name, folder, bowtie2_build, nproc, use_ze bz2.BZ2File(bz2_file, 'rb') as bz2_h: for data in iter(lambda: bz2_h.read(100 * 1024), b''): fna_h.write(data) - os.remove(bz2_file) - - # join fasta files - sys.stderr.write('\n\nJoining FASTA databases\n') - with open(os.path.join(folder, download_file_name + ".fna"), 'w') as fna_h: - for fna_file in iglob(os.path.join(folder, download_file_name + "_*.fna")): - with open(fna_file, 'r') as fna_r: - for line in fna_r: - fna_h.write(line) - fna_file = os.path.join(folder, download_file_name + ".fna") + os.remove(bz2_file) # build bowtie2 indexes if not glob(os.path.join(folder, download_file_name + "*.bt2l")): - bt2_base = os.path.join(folder, download_file_name) - bt2_cmd = [bowtie2_build, '--quiet'] - - if nproc > 1: - bt2_build_output = subp.check_output([bowtie2_build, '--usage'], stderr=subp.STDOUT) - - if 'threads' in str(bt2_build_output): - bt2_cmd += ['--threads', str(nproc)] - - bt2_cmd += ['-f', fna_file, bt2_base] - - sys.stderr.write('\nBuilding Bowtie2 indexes\n') - + build_bwt_indexes(folder, download_file_name, bowtie2_build, nproc) + else: try: - subp.check_call(bt2_cmd) + subp.check_call(['bowtie2-inspect', '-n', os.path.join(folder, download_file_name)], stdout=subp.DEVNULL, stderr=subp.DEVNULL) except Exception as e: - sys.stderr.write("Fatal error running '{}'\nError message: '{}'\n\n".format(' '.join(bt2_cmd), e)) - sys.exit(1) + sys.stderr.write('Downloaded indexes are not compatible with the installed verion of Bowtie2\n') + sys.stderr.write('Building indexes from the FASTA files\n') + for btw_file in iglob(os.path.join(folder, download_file_name + "*.bt2l")): + os.remove(btw_file) + build_bwt_indexes(folder, download_file_name, bowtie2_build, nproc) try: for bt2 in glob(os.path.join(folder, download_file_name + "*.bt2l")): @@ -207,6 +191,35 @@ def download_unpack_tar(download_file_name, folder, bowtie2_build, nproc, use_ze for fna_file in iglob(os.path.join(folder, download_file_name + "_*.fna")): if not fna_file.endswith('_VSG.fna'): os.remove(fna_file) + + +def build_bwt_indexes(folder, download_file_name, bowtie2_build, nproc): + sys.stderr.write('\n\nJoining FASTA databases\n') + with open(os.path.join(folder, download_file_name + ".fna"), 'w') as fna_h: + for fna_file in iglob(os.path.join(folder, download_file_name + "_*.fna")): + with open(fna_file, 'r') as fna_r: + for line in fna_r: + fna_h.write(line) + fna_file = os.path.join(folder, download_file_name + ".fna") + + bt2_base = os.path.join(folder, download_file_name) + bt2_cmd = [bowtie2_build, '--quiet'] + + if nproc > 1: + bt2_build_output = subp.check_output([bowtie2_build, '--usage'], stderr=subp.STDOUT) + + if 'threads' in str(bt2_build_output): + bt2_cmd += ['--threads', str(nproc)] + + bt2_cmd += ['-f', fna_file, bt2_base] + + sys.stderr.write('\nBuilding Bowtie2 indexes\n') + + try: + subp.check_call(bt2_cmd) + except Exception as e: + sys.stderr.write("Fatal error running '{}'\nError message: '{}'\n\n".format(' '.join(bt2_cmd), e)) + sys.exit(1) def download_unpack_zip(url,download_file_name,folder,software_name): """