Skip to content

Commit

Permalink
Merge pull request #19942 from ThomasHoffmann77/20240220124705_new_pr…
Browse files Browse the repository at this point in the history
…_AlphaFold232

{bio}[foss/2023a] AlphaFold v2.3.2, dm-haiku v0.0.12, tensorstore v0.1.65 w/ CUDA v12.1.1
  • Loading branch information
boegel authored Oct 11, 2024
2 parents 4303c2e + c81b3e5 commit 96515d3
Show file tree
Hide file tree
Showing 6 changed files with 700 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
# AlphaFold and the extra Python packages it needs are installed together as one bundle
easyblock = 'PythonBundle'

name = 'AlphaFold'
version = '2.3.2'
# %(cudaver)s is an EasyBuild template; the CUDA version comes from the dependencies list
versionsuffix = '-CUDA-%(cudaver)s'

homepage = 'https://deepmind.com/research/case-studies/alphafold'
description = (
    "AlphaFold can predict protein structures with atomic accuracy "
    "even where no similar structure is known"
)

toolchain = {'name': 'foss', 'version': '2023a'}

# only needed while building
builddependencies = [
    ('poetry', '1.5.1'),
]

# runtime dependencies; entries carrying versionsuffix are the CUDA-enabled variants
dependencies = [
    ('Python', '3.11.3'),
    ('CUDA', '12.1.1', '', SYSTEM),
    ('SciPy-bundle', '2023.07'),
    ('PyYAML', '6.0'),
    # doesn't require TF-gpu
    ('TensorFlow', '2.13.0'),
    ('Biopython', '1.83'),
    ('HH-suite', '3.3.0'),
    ('HMMER', '3.4'),
    ('Kalign', '3.4.0'),
    # also provides absl-py
    # requirement is ==0.3.25!
    ('jax', '0.4.25', versionsuffix),
    ('UCX-CUDA', '1.14.1', versionsuffix),
    ('cuDNN', '8.9.2.26', versionsuffix, SYSTEM),
    ('NCCL', '2.18.3', versionsuffix),
    ('OpenMM', '8.0.0', versionsuffix),
    ('dm-tree', '0.1.8'),
    ('dm-haiku', '0.0.12', versionsuffix),
]

# openstructure commit from which stereo_chemical_props.txt is downloaded;
# the file ends up in alphafold/common (see docker/Dockerfile in AlphaFold repository)
local_scp_commit = '7102c6'

components = [
    ('stereo_chemical_props.txt', local_scp_commit, {
        'easyblock': 'Binary',
        'source_urls': [
            'https://git.scicore.unibas.ch/schwede/openstructure/-/raw/'
            + local_scp_commit
            + '/modules/mol/alg/src/',
        ],
        'sources': [
            {
                'download_filename': 'stereo_chemical_props.txt',
                # keep the commit in the local file name of the downloaded source
                'filename': 'stereo_chemical_props-' + local_scp_commit + '.txt',
                # nothing to unpack: just copy the file under its original name
                'extract_cmd': "cp %s ./stereo_chemical_props.txt",
            },
        ],
        'checksums': [
            # stereo_chemical_props-7102c6.txt
            '24510899eeb49167cffedec8fa45363a4d08279c0c637a403b452f7d0ac09451',
        ],
    }),
]

use_pip = True

# Python packages installed as extensions of the bundle; AlphaFold itself comes last
exts_list = [
    ('PDBFixer', '1.9', {
        'source_urls': ['https://github.com/openmm/pdbfixer/archive/refs/tags/'],
        'sources': [{'download_filename': '%(version)s.tar.gz', 'filename': SOURCE_TAR_GZ}],
        'checksums': ['88b9a77e50655f89d0eb2075093773e82c27a4cef842cb7d735c877b20cd39fb'],
    }),
    ('tabulate', '0.9.0', {
        'checksums': ['0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c'],
    }),
    ('websocket-client', '1.5.1', {
        # import name differs from the package name
        'modulename': 'websocket',
        'checksums': ['3f09e6d8230892547132177f575a4e3e73cfdf06526e20cc02aa1c3b47184d40'],
    }),
    ('docker', '7.0.0', {
        'checksums': ['323736fb92cd9418fc5e7133bc953e11a9da04f4483f828b527db553f1e7e5a3'],
    }),
    ('immutabledict', '4.1.0', {
        'checksums': ['93d100ccd2cd09a1fd3f136b9328c6e59529ba341de8bb499437f6819159fe8a'],
    }),
    ('contextlib2', '21.6.0', {
        'checksums': ['ab1e2bfe1d01d968e1b7e8d9023bc51ef3509bba217bb730cee3827e1ee82869'],
    }),
    ('ml_collections', '0.1.1', {
        # create empty requirements files before running the install command
        'preinstallopts': "touch requirements.txt && touch requirements-test.txt && ",
        'checksums': ['3fefcc72ec433aa1e5d32307a3e474bbb67f405be814ea52a2166bfc9dbe68cc'],
    }),
    (name, version, {
        'patches': [
            'AlphaFold-2.0.0_fix-packages.patch',
            'AlphaFold-2.3.2_data-dep-paths-shebang-UniRef30.patch',
            'AlphaFold-2.0.0_n-cpu.patch',
            'AlphaFold-2.0.1_setup_rm_tfcpu.patch',
            'AlphaFold-2.3.2_use_openmm_8.0.0.patch',
            'AlphaFold-2.3.2_BioPythonPDBData.patch',
        ],
        'source_urls': ['https://github.com/deepmind/alphafold/archive/refs/tags/'],
        'sources': [{'download_filename': 'v%(version)s.tar.gz', 'filename': SOURCE_TAR_GZ}],
        # one entry per source/patch, keyed by file name
        'checksums': [
            {'AlphaFold-2.3.2.tar.gz':
             '4ea8005ba1b573fa1585e4c29b7d188c5cbfa59b4e4761c9f0c15c9db9584a8e'},
            {'AlphaFold-2.0.0_fix-packages.patch':
             '826d2d1a5d6ac52c51a60ba210e1947d5631a1e2d76f8815305b5d23f74458db'},
            {'AlphaFold-2.3.2_data-dep-paths-shebang-UniRef30.patch':
             '58cd0ce4094afe76909649abe68034c4fbdb500967f5c818f49b530356dc012b'},
            {'AlphaFold-2.0.0_n-cpu.patch':
             'dfda4dd5f9aba19fe2b6eb9a0ec583d12dcefdfee8ab8803fc57ad48d582db04'},
            {'AlphaFold-2.0.1_setup_rm_tfcpu.patch':
             '1a2e4e843bd9a4d15ee39e6c37cc63ba281311cc7a0a5610f0e43b52ef93faac'},
            {'AlphaFold-2.3.2_use_openmm_8.0.0.patch':
             'bbef940c0c959040aaf3984ec47777a229c164517b54616a2688d58fae636d84'},
            {'AlphaFold-2.3.2_BioPythonPDBData.patch':
             'e4483a525ae5c4dc5a5f633bed8cf5337c329e64b603ab7b684a9d18cd26a22f'},
        ],
    }),
]

# site-packages directory inside the installation prefix
local_pylibdir = '%(installdir)s/lib/python%(pyshortver)s/site-packages'
# command that symlinks stereo_chemical_props.txt into the installed alphafold/common directory
local_link_scp = ' '.join([
    'ln -s',
    '%(installdir)s/stereo_chemical_props.txt',
    local_pylibdir + '/alphafold/common',
])

postinstallcmds = [
    # copy the run_alphafold*.py scripts and the scripts directory from the build directory
    'cp %(builddir)s/AlphaFold/alphafold-%(version)s/run_alphafold*.py %(installdir)s/bin',
    'cp -rpP %(builddir)s/AlphaFold/alphafold-%(version)s/scripts %(installdir)s',
    # provide an 'alphafold' command that points at run_alphafold.py
    'cd %(installdir)s/bin && ln -s run_alphafold.py alphafold',
    'chmod a+x %(installdir)s/bin/run_alphafold.py',
    local_link_scp,
]

# files and directories that must exist in the installation
sanity_check_paths = {
    'files': [
        'bin/alphafold',
        'bin/pdbfixer',
        'bin/run_alphafold.py',
        'stereo_chemical_props.txt',
    ],
    'dirs': [
        'lib/python%(pyshortver)s/site-packages',
        'scripts',
    ],
}

# commands that must succeed with the installation in place
sanity_check_commands = [
    "pdbfixer --help",
    "python -m openmm.testInstallation",
    "python -c 'import alphafold'",
    # the help output goes through grep to check for the expected description
    "alphafold --help 2>&1 | grep 'Full AlphaFold protein structure prediction script'",
    "python %(installdir)s/bin/run_alphafold_test.py",
]

sanity_pip_check = True

# extra environment variables defined by the generated module
modextravars = {
    # these allow to make predictions on proteins that would typically be too long to fit into GPU memory;
    # see https://github.com/deepmind/alphafold/blob/main/docker/run_docker.py
    'TF_FORCE_UNIFIED_MEMORY': '1',
    # jaxlib 0.4.1: https://jax.readthedocs.io/en/latest/changelog.html#jaxlib-0-4-1-dec-13-2022
    # "The behavior of XLA_PYTHON_CLIENT_MEM_FRACTION=.XX has been changed to allocate XX% of the total GPU memory
    # instead of the previous behavior of using currently available GPU memory to calculate preallocation. Please refer
    # to GPU memory allocation for more details."
    # https://jax.readthedocs.io/en/latest/gpu_memory_allocation.html
    'XLA_PYTHON_CLIENT_MEM_FRACTION': '2.5',
    #
    # Download with $EBROOTALPHAFOLD/scripts/download_all_data.sh /path/to/AlphaFold_DBs/$EBVERSIONALPHAFOLD
    # (the template must be %(version)s so the path ends in the AlphaFold version)
    'ALPHAFOLD_DATA_DIR': '/path/to/AlphaFold_DBs/%(version)s',  # please adapt
    # Adapt in order to use a different version of UniRef30 by default,
    # e.g., v2023_02 from https://wwwuser.gwdg.de/~compbiol/uniclust/2023_02/UniRef30_2023_02_hhsuite.tar.gz:
    'ALPHAFOLD_UNIREF30_VER': '2021_03',
    # unset or set to 'CPU' in order not to run the energy minimization on GPU; PR#189
    'OPENMM_RELAX': 'CUDA',
}

# note about the optional, newer UniRef30 release
postinstallmsgs = [
    "A newer version of UniRef30 (2023_02) is available at: "
    "https://wwwuser.gwdg.de/~compbiol/uniclust/2023_02/UniRef30_2023_02_hhsuite.tar.gz. "
    "Untar to $ALPHAFOLD_DATA_DIR/uniref30/ "
    "and set the default version accordingly "
    "by changing modextravars:ALPHAFOLD_UNIREF30_VER.",
]

moduleclass = 'bio'
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Thomas Hoffmann, EMBL Heidelberg, [email protected], 2024/10
# BioPython 1.83 does not provide protein_letters_3to1 in Bio.Data.SCOPdata but in Bio.Data.PDBData (and Bio.Data.IUPACData)
diff -ru -ru alphafold-2.3.2/alphafold/data/mmcif_parsing.py alphafold-2.3.2_BioPythonSCOPData/alphafold/data/mmcif_parsing.py
--- alphafold-2.3.2/alphafold/data/mmcif_parsing.py 2024-02-19 09:55:16.359778490 +0100
+++ alphafold-2.3.2_BioPythonSCOPData/alphafold/data/mmcif_parsing.py 2023-03-27 13:50:49.000000000 +0200
@@ -21,7 +21,7 @@

from absl import logging
from Bio import PDB
-from Bio.Data import SCOPData
+from Bio.Data import PDBData as SCOPData

# Type aliases:
ChainId = str
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
pick up on $ALPHAFOLD_DATA_DIR to specify location to downloaded data
(see https://github.com/deepmind/alphafold/blob/main/docker/run_docker.py);
pick up on HH-suite, HHMER, Kalign dependencies provided via EasyBuild
author: Kenneth Hoste (HPC-UGent)
update 2.0.1 -> 2.1.0/2.1.2/2.3.0/2.3.2: Thomas Hoffmann (EMBL);
uniref30 version env. variable (THEMBL)

diff -ru alphafold-2.3.2/run_alphafold.py alphafold-2.3.2_data-dep-paths-shebang-UniRef30/run_alphafold.py
--- alphafold-2.3.2/run_alphafold.py 2023-03-27 13:50:49.000000000 +0200
+++ alphafold-2.3.2_data-dep-paths-shebang-UniRef30/run_alphafold.py 2024-10-11 11:34:06.330278962 +0200
@@ -1,3 +1,4 @@
+#!/usr/bin/env python
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -42,6 +43,48 @@
import numpy as np

# Internal import (7716).
+use_reduced_dbs = any("--db_preset=reduced_dbs" in s for s in sys.argv[1:])
+use_monomer_preset = not any("--model_preset=multimer" in s for s in sys.argv[1:])
+
+data_dir = os.getenv('ALPHAFOLD_DATA_DIR')
+use_gpu_relax = os.getenv('OPENMM_RELAX')=='CUDA'
+uniref30_ver = os.getenv('ALPHAFOLD_UNIREF30_VER')
+if not uniref30_ver: uniref30_ver = '2021_03'
+
+if data_dir:
+ mgnify_database_path = os.path.join(data_dir, 'mgnify', 'mgy_clusters_2022_05.fa')
+ uniref90_database_path = os.path.join(data_dir, 'uniref90', 'uniref90.fasta')
+ template_mmcif_dir = os.path.join(data_dir, 'pdb_mmcif', 'mmcif_files')
+ obsolete_pdbs_path = os.path.join(data_dir, 'pdb_mmcif', 'obsolete.dat')
+ if use_monomer_preset:
+ pdb_seqres_database_path = None
+ uniprot_database_path = None
+ pdb70_database_path = os.path.join(data_dir, 'pdb70', 'pdb70')
+ else:
+ pdb_seqres_database_path = os.path.join(data_dir, 'pdb_seqres', 'pdb_seqres.txt')
+ uniprot_database_path = os.path.join(data_dir, 'uniprot', 'uniprot.fasta')
+ pdb70_database_path = None
+ if use_reduced_dbs:
+ small_bfd_database_path = os.path.join(data_dir, 'small_bfd','bfd-first_non_consensus_sequences.fasta')
+ uniref30_database_path = None
+ bfd_database_path = None
+ else:
+ small_bfd_database_path = None
+ bfd_database_path = os.path.join(data_dir, 'bfd', 'bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt')
+ uniref30_database_path = os.path.join(data_dir, 'uniref30', 'UniRef30_%s' % uniref30_ver)
+else:
+ sys.stderr.write("$ALPHAFOLD_DATA_DIR is not defined!")
+ uniref90_database_path = None
+ mgnify_database_path = None
+ bfd_database_path = None
+ uniref30_database_path = None
+ pdb70_database_path = None
+ template_mmcif_dir = None
+ obsolete_pdbs_path = None
+ small_bfd_database_path = None
+ uniprot_database_path = None
+ pdb_seqres_database_path = None
+ use_gpu_relax = None

logging.set_verbosity(logging.INFO)

@@ -59,7 +102,7 @@
'separated by commas. All FASTA paths must have a unique basename as the '
'basename is used to name the output directories for each prediction.')

-flags.DEFINE_string('data_dir', None, 'Path to directory of supporting data.')
+flags.DEFINE_string('data_dir', data_dir, 'Path to directory of supporting data.')
flags.DEFINE_string('output_dir', None, 'Path to a directory that will '
'store the results.')
flags.DEFINE_string('jackhmmer_binary_path', shutil.which('jackhmmer'),
@@ -71,32 +114,32 @@
flags.DEFINE_string('hmmsearch_binary_path', shutil.which('hmmsearch'),
'Path to the hmmsearch executable.')
flags.DEFINE_string('hmmbuild_binary_path', shutil.which('hmmbuild'),
- 'Path to the hmmbuild executable.')
+ 'Path to the hmmbuild executable.')
flags.DEFINE_string('kalign_binary_path', shutil.which('kalign'),
- 'Path to the Kalign executable.')
-flags.DEFINE_string('uniref90_database_path', None, 'Path to the Uniref90 '
- 'database for use by JackHMMER.')
-flags.DEFINE_string('mgnify_database_path', None, 'Path to the MGnify '
- 'database for use by JackHMMER.')
-flags.DEFINE_string('bfd_database_path', None, 'Path to the BFD '
- 'database for use by HHblits.')
-flags.DEFINE_string('small_bfd_database_path', None, 'Path to the small '
- 'version of BFD used with the "reduced_dbs" preset.')
-flags.DEFINE_string('uniref30_database_path', None, 'Path to the UniRef30 '
- 'database for use by HHblits.')
-flags.DEFINE_string('uniprot_database_path', None, 'Path to the Uniprot '
- 'database for use by JackHMMer.')
-flags.DEFINE_string('pdb70_database_path', None, 'Path to the PDB70 '
- 'database for use by HHsearch.')
-flags.DEFINE_string('pdb_seqres_database_path', None, 'Path to the PDB '
- 'seqres database for use by hmmsearch.')
-flags.DEFINE_string('template_mmcif_dir', None, 'Path to a directory with '
- 'template mmCIF structures, each named <pdb_id>.cif')
+ 'Path to the Kalign executable.')
+flags.DEFINE_string('uniref90_database_path', uniref90_database_path, 'Path to the Uniref90 '
+ 'database for use by JackHMMER.')
+flags.DEFINE_string('mgnify_database_path', mgnify_database_path, 'Path to the MGnify '
+ 'database for use by JackHMMER.')
+flags.DEFINE_string('bfd_database_path', bfd_database_path, 'Path to the BFD '
+ 'database for use by HHblits.')
+flags.DEFINE_string('small_bfd_database_path', small_bfd_database_path, 'Path to the small '
+ 'version of BFD used with the "reduced_dbs" preset.')
+flags.DEFINE_string('uniref30_database_path', uniref30_database_path, 'Path to the UniRef30 '
+ 'database for use by HHblits.')
+flags.DEFINE_string('uniprot_database_path', uniprot_database_path, 'Path to the Uniprot '
+ 'database for use by JackHMMer.')
+flags.DEFINE_string('pdb70_database_path', pdb70_database_path, 'Path to the PDB70 '
+ 'database for use by HHsearch.')
+flags.DEFINE_string('pdb_seqres_database_path', pdb_seqres_database_path, 'Path to the PDB '
+ 'seqres database for use by hmmsearch.')
+flags.DEFINE_string('template_mmcif_dir', template_mmcif_dir, 'Path to a directory with '
+ 'template mmCIF structures, each named <pdb_id>.cif')
flags.DEFINE_string('max_template_date', None, 'Maximum template release date '
- 'to consider. Important if folding historical test sets.')
-flags.DEFINE_string('obsolete_pdbs_path', None, 'Path to file containing a '
- 'mapping from obsolete PDB IDs to the PDB IDs of their '
- 'replacements.')
+ 'to consider. Important if folding historical test sets.')
+flags.DEFINE_string('obsolete_pdbs_path', obsolete_pdbs_path, 'Path to file containing a '
+ 'mapping from obsolete PDB IDs to the PDB IDs of their '
+ 'replacements.')
flags.DEFINE_enum('db_preset', 'full_dbs',
['full_dbs', 'reduced_dbs'],
'Choose preset MSA database configuration - '
@@ -137,7 +180,7 @@
'distracting stereochemical violations but might help '
'in case you are having issues with the relaxation '
'stage.')
-flags.DEFINE_boolean('use_gpu_relax', None, 'Whether to relax on GPU. '
+flags.DEFINE_boolean('use_gpu_relax', use_gpu_relax, 'Whether to relax on GPU. '
'Relax on GPU can be much faster than CPU, so it is '
'recommended to enable if possible. GPUs must be available'
' if this setting is enabled.')
@@ -334,6 +377,10 @@
'sure it is installed on your system.')

use_small_bfd = FLAGS.db_preset == 'reduced_dbs'
+ if use_small_bfd and data_dir:
+ bfd_database_path = None
+ uniref30_database_path = None
+
_check_flag('small_bfd_database_path', 'db_preset',
should_be_set=use_small_bfd)
_check_flag('bfd_database_path', 'db_preset',
@@ -456,13 +503,7 @@
flags.mark_flags_as_required([
'fasta_paths',
'output_dir',
- 'data_dir',
- 'uniref90_database_path',
- 'mgnify_database_path',
- 'template_mmcif_dir',
'max_template_date',
- 'obsolete_pdbs_path',
- 'use_gpu_relax',
])

app.run(main)
Loading

0 comments on commit 96515d3

Please sign in to comment.