Merge pull request #69 from cole-group/daskify_rlist

Daskify rlist - the cornerstone for the 2.0.0 release.
cole-group · Jun 16, 2024 · 7835d91 · 7835d91
2 parents eec50f3 + 387fadc
commit 7835d91
Show file tree

Hide file tree

Showing 33 changed files with 14,841 additions and 435 deletions.
diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
@@ -10,7 +10,7 @@ jobs:
  fail-fast: false
  matrix:
  os: [ "ubuntu-latest" ]
- python-version: [ "3.8", "3.9", "3.10"]
+ python-version: ["3.9", "3.10"]
  max-parallel: 5
 
  steps:
@@ -19,7 +19,7 @@ jobs:
  uses: conda-incubator/[email protected]
  with:
  activate-environment: fegrow
- environment-file: env.yml
+ environment-file: environment.yml
  python-version: ${{ matrix.python-version }}
  auto-update-conda: true
  auto-activate-base: false

diff --git a/MANIFEST.in b/MANIFEST.in
@@ -4,3 +4,5 @@ include fegrow/data/fpscores.pkl.gz
 include fegrow/version.txt
 include LICENSE.txt
 include fegrow/testing/data/*sdf
+include fegrow/testing/data/cs50k_scored49578_unique47710.csv.zip
+include fegrow/testing/data/5R83_rec.pdb
diff --git a/env.yml → environment.yml b/env.yml → environment.yml
@@ -9,7 +9,7 @@ dependencies:
  - conda-forge::pip
  - conda-forge::rdkit
  - conda-forge::prody
- - conda-forge::openff-toolkit>=0.11
+ - conda-forge::openff-toolkit>=0.13
  - conda-forge::parmed
  - conda-forge::openmm>=7.5.0
  - conda-forge::openmm-ml
@@ -21,8 +21,12 @@ dependencies:
  - conda-forge::torchani>2.2.0
  - conda-forge::pytorch-gpu>=1.10.0
  - conda-forge::openmm-torch>=0.8
- - conda-forge::pint<0.20.0
+ - conda-forge::pint!=0.2
  - conda-forge::py3dmol>=1.8.0
  - conda-forge::mols2grid>=0.2.4
  - conda-forge::networkx
  - conda-forge::pint-pandas
+ - conda-forge::dask
+ - conda-forge::modAL
+ - conda-forge::cachey
+ - conda-forge::pyparsing<3.1.0 # temporary fix for prody
diff --git a/fegrow/__init__.py b/fegrow/__init__.py
@@ -2,10 +2,9 @@
 
 from .conformers import generate_conformers, WrongCoreForMolecule
 from .package import (
- RList,
+ ChemSpace,
  RMol,
- rep2D,
- build_molecules,
+ build_molecule,
  RGroups,
  Linkers,
 )
@@ -18,14 +17,13 @@
 __all__ = [
  RMol,
  generate_conformers,
- rep2D,
  fix_receptor,
  optimise_in_receptor,
  tox_props,
  sort_conformers,
  RGroups,
  Linkers,
- build_molecules,
+ build_molecule,
  __version__,
  WrongCoreForMolecule,
 ]
diff --git a/fegrow/al.py b/fegrow/al.py
@@ -0,0 +1,159 @@
+import functools
+import time
+from typing import Callable
+import logging
+
+import dask
+import numpy as np
+from sklearn import linear_model, neural_network, ensemble, gaussian_process
+from modAL.acquisition import max_UCB, max_EI, max_PI
+
+
+logger = logging.getLogger(__name__)
+
+
def _dask_tanimito_similarity(a, b):
    """Compute the pairwise Tanimoto similarity between the rows of two arrays.

    Both inputs are wrapped in chunked dask arrays and the similarity is
    evaluated as ``ab / (aa + bb.T - ab)``, where ``ab`` is the matrix product
    of ``a`` with the transpose of ``b`` and ``aa`` / ``bb`` are the per-row
    sums of ``a`` / ``b``.

    FIXME: matmul is no longer required here because the work is not limited
    to a single core; this can be transitioned to simple row-by-row
    dispatching.

    Args:
        a: 2D array-like with one item per row. Presumably binary
            fingerprints, since the row sum stands in for the self-overlap
            -- TODO confirm against the callers.
        b: 2D array-like with the same number of columns as ``a``.

    Returns:
        The computed (in-memory) similarity matrix with one row per row of
        ``a`` and one column per row of ``b``.
    """
    # A bare ``import dask`` does not load the ``dask.array`` submodule, so
    # import it explicitly instead of relying on another module having done so.
    from dask import array as dask_array

    logger.info(
        "About to compute tanimoto for array lengths %d and %d", len(a), len(b)
    )
    start = time.time()

    chunk_size = 8_000
    lazy_a = dask_array.from_array(a, chunks=chunk_size)
    lazy_b = dask_array.from_array(b, chunks=chunk_size)

    # Per-row sums, kept as column vectors so they broadcast against ``ab``.
    aa = dask_array.sum(lazy_a, axis=1, keepdims=True)
    bb = dask_array.sum(lazy_b, axis=1, keepdims=True)
    ab = dask_array.matmul(lazy_a, lazy_b.T)

    # NOTE(review): a pair of all-zero rows gives a zero denominator here;
    # that case is not handled specially.
    similarity = dask_array.true_divide(ab, aa + bb.T - ab)
    td_computed = similarity.compute()

    logger.info(
        "Computed tanimoto similarity in %.2fs for array lengths %d and %d",
        time.time() - start,
        len(a),
        len(b),
    )
    return td_computed
+
+
class TanimotoKernel(
    gaussian_process.kernels.NormalizedKernelMixin,
    gaussian_process.kernels.StationaryKernelMixin,
    gaussian_process.kernels.Kernel,
):
    """Gaussian process kernel whose value is the Tanimoto similarity.

    The kernel takes no hyperparameters and does not provide gradients.
    """

    def __init__(self):
        """Create the kernel; there is nothing to configure."""

    def __call__(self, X, Y=None, eval_gradient=False):  # pylint: disable=invalid-name
        """Evaluate the kernel as a pairwise Tanimoto similarity matrix.

        Args:
            X: Numpy array with shape [batch_size_a, num_features].
            Y: Numpy array with shape [batch_size_b, num_features]. When
                omitted, X is compared against itself.
            eval_gradient: Whether the gradient is requested; not supported.

        Returns:
            Numpy array with shape [batch_size_a, batch_size_b].

        Raises:
            NotImplementedError: If eval_gradient is True.
        """
        # Gradients are not available for this kernel.
        if eval_gradient:
            raise NotImplementedError

        right_hand_side = X if Y is None else Y
        return _dask_tanimito_similarity(X, right_hand_side)
+
+
class Query:
    """Factories for query (acquisition) strategies.

    Every factory returns a ``functools.partial`` whose ``keywords`` contain a
    ``fegrow_label`` entry naming the strategy.
    """

    @staticmethod
    def Greedy() -> Callable:
        """Takes the best instances by inference value sorted in ascending order.

        Returns:
            The greedy function, wrapped in a ``functools.partial`` labelled
            ``fegrow_label="greedy"``.
        """

        def greedy(optimizer, features, n_instances=1, *, fegrow_label=None):
            """Select the instances with the lowest predicted values.

            Args:
                optimizer: BaseLearner. Model to use to score instances.
                features: modALinput. Featurization of the instances to choose from.
                n_instances: Integer. The number of instances to select.
                fegrow_label: Ignored. Accepted only so that the labelled
                    partial returned by ``Greedy`` can be called directly.

            Returns:
                Indices of the instances chosen (in no particular order). If
                ``n_instances`` is at least the number of candidates, all
                indices are returned.
            """
            predictions = np.asarray(optimizer.predict(features))
            n_candidates = len(predictions)

            # np.argpartition rejects kth == len(array); when everything (or
            # more) is requested, every candidate is selected.
            if n_instances >= n_candidates:
                return np.arange(n_candidates, dtype=np.intp)

            return np.argpartition(predictions, n_instances)[:n_instances]

        return functools.partial(greedy, fegrow_label="greedy")

    @staticmethod
    def PI(tradeoff: float = 0) -> Callable:
        """
        Maximum PI query strategy. Selects the instance with highest probability of improvement.

        Args:
            tradeoff: Value controlling the tradeoff parameter.

        Returns:
            ``max_PI`` with ``tradeoff`` pre-populated and labelled
            ``fegrow_label="PI"``.
        """
        return functools.partial(max_PI, tradeoff=tradeoff, fegrow_label="PI")

    @staticmethod
    def EI(tradeoff: float = 0) -> Callable:
        """
        Maximum EI query strategy. Selects the instance with highest expected improvement.

        Args:
            tradeoff: Value controlling the tradeoff parameter.

        Returns:
            ``max_EI`` with ``tradeoff`` pre-populated and labelled
            ``fegrow_label="EI"``.
        """
        return functools.partial(max_EI, tradeoff=tradeoff, fegrow_label="EI")

    @staticmethod
    def UCB(beta: float = 1) -> Callable:
        """
        Maximum UCB query strategy. Selects the instance with highest upper confidence bound.

        Args:
            beta: Value controlling the beta parameter.

        Returns:
            ``max_UCB`` with ``beta`` pre-populated and labelled
            ``fegrow_label="UCB"``.
        """
        return functools.partial(max_UCB, beta=beta, fegrow_label="UCB")
+
+
class Model:
    """Factories for the regressors; keyword arguments are forwarded unchanged."""

    @staticmethod
    def linear(**model_params):
        """Return a ``LinearRegression`` built with ``model_params``."""
        regressor = linear_model.LinearRegression(**model_params)
        return regressor

    @staticmethod
    def elastic_net(**model_params):
        """Return an ``ElasticNetCV`` built with ``model_params``."""
        regressor = linear_model.ElasticNetCV(**model_params)
        return regressor

    @staticmethod
    def random_forest(**model_params):
        """Return a ``RandomForestRegressor`` built with ``model_params``."""
        regressor = ensemble.RandomForestRegressor(**model_params)
        return regressor

    @staticmethod
    def gradient_boosting_regressor(**model_params):
        """Return a ``GradientBoostingRegressor`` built with ``model_params``."""
        regressor = ensemble.GradientBoostingRegressor(**model_params)
        return regressor

    @staticmethod
    def gaussian_process(**model_params):
        """Return a ``GaussianProcessRegressor`` that uses a ``TanimotoKernel``.

        The kernel is always supplied here, so ``model_params`` must not
        contain ``kernel``.
        """
        tanimoto = TanimotoKernel()
        return gaussian_process.GaussianProcessRegressor(
            kernel=tanimoto, **model_params
        )

    @staticmethod
    def mlp_regressor(**model_params):
        """Return an ``MLPRegressor`` built with ``model_params``."""
        regressor = neural_network.MLPRegressor(**model_params)
        return regressor
+
+
+
+
+
+