Skip to content

Commit

Permalink
Merge pull request #69 from cole-group/daskify_rlist
Browse files Browse the repository at this point in the history
Daskify rlist - the cornerstone for the 2.0.0 release.
  • Loading branch information
bieniekmateusz authored Jun 16, 2024
2 parents eec50f3 + 387fadc commit 7835d91
Show file tree
Hide file tree
Showing 33 changed files with 14,841 additions and 435 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ jobs:
fail-fast: false
matrix:
os: [ "ubuntu-latest" ]
python-version: [ "3.8", "3.9", "3.10"]
python-version: ["3.9", "3.10"]
max-parallel: 5

steps:
Expand All @@ -19,7 +19,7 @@ jobs:
uses: conda-incubator/[email protected]
with:
activate-environment: fegrow
environment-file: env.yml
environment-file: environment.yml
python-version: ${{ matrix.python-version }}
auto-update-conda: true
auto-activate-base: false
Expand Down
2 changes: 2 additions & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,5 @@ include fegrow/data/fpscores.pkl.gz
include fegrow/version.txt
include LICENSE.txt
include fegrow/testing/data/*sdf
include fegrow/testing/data/cs50k_scored49578_unique47710.csv.zip
include fegrow/testing/data/5R83_rec.pdb
8 changes: 6 additions & 2 deletions env.yml → environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ dependencies:
- conda-forge::pip
- conda-forge::rdkit
- conda-forge::prody
- conda-forge::openff-toolkit>=0.11
- conda-forge::openff-toolkit>=0.13
- conda-forge::parmed
- conda-forge::openmm>=7.5.0
- conda-forge::openmm-ml
Expand All @@ -21,8 +21,12 @@ dependencies:
- conda-forge::torchani>2.2.0
- conda-forge::pytorch-gpu>=1.10.0
- conda-forge::openmm-torch>=0.8
- conda-forge::pint<0.20.0
- conda-forge::pint!=0.2
- conda-forge::py3dmol>=1.8.0
- conda-forge::mols2grid>=0.2.4
- conda-forge::networkx
- conda-forge::pint-pandas
- conda-forge::dask
- conda-forge::modAL
- conda-forge::cachey
- conda-forge::pyparsing<3.1.0 # temporary fix for prody
8 changes: 3 additions & 5 deletions fegrow/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,9 @@

from .conformers import generate_conformers, WrongCoreForMolecule
from .package import (
RList,
ChemSpace,
RMol,
rep2D,
build_molecules,
build_molecule,
RGroups,
Linkers,
)
Expand All @@ -18,14 +17,13 @@
__all__ = [
RMol,
generate_conformers,
rep2D,
fix_receptor,
optimise_in_receptor,
tox_props,
sort_conformers,
RGroups,
Linkers,
build_molecules,
build_molecule,
__version__,
WrongCoreForMolecule,
]
159 changes: 159 additions & 0 deletions fegrow/al.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
import functools
import time
from typing import Callable
import logging

import dask
import numpy as np
from sklearn import linear_model, neural_network, ensemble, gaussian_process
from modAL.acquisition import max_UCB, max_EI, max_PI


logger = logging.getLogger(__name__)


def _dask_tanimito_similarity(a, b):
    """Compute the pairwise similarity matrix between the rows of ``a`` and ``b``.

    For every pair of rows the value is ``ab / (aa + bb - ab)``, where ``ab``
    is the dot product of the two rows and ``aa`` / ``bb`` are the row sums.
    The work is split into dask chunks of 8000 rows and evaluated eagerly.

    NOTE(review): the row sum only equals the squared norm for 0/1 valued
    rows, so this presumably expects binary fingerprints - confirm with the
    callers. A pair of rows that are both all-zero yields 0/0.

    Fixme: this does not need to use matmul anymore because it's not a single
    core. This can be transitioned to simple row by row dispatching.

    Args:
        a: 2D array-like, one row per item.
        b: 2D array-like with the same number of columns as ``a``.

    Returns:
        The computed (non-lazy) array with shape ``[len(a), len(b)]``.
    """
    # ``import dask`` on its own does not load the ``dask.array`` submodule, so
    # ``dask.array`` would only resolve if another module had imported it
    # first. Importing it here makes the function self-sufficient.
    import dask.array

    logger.info(f"About to compute tanimoto for array lengths {len(a)} and {len(b)}")
    start = time.time()
    chunk_size = 8_000
    da = dask.array.from_array(a, chunks=chunk_size)
    db = dask.array.from_array(b, chunks=chunk_size)
    # Row sums kept as column vectors so they broadcast against ``ab`` below.
    aa = dask.array.sum(da, axis=1, keepdims=True)
    bb = dask.array.sum(db, axis=1, keepdims=True)
    ab = dask.array.matmul(da, db.T)
    td = dask.array.true_divide(ab, aa + bb.T - ab)
    # Everything above only builds the task graph; this triggers the work.
    td_computed = td.compute()
    logger.info(f"Computed tanimoto similarity in {time.time() - start:.2f}s for array lengths {len(a)} and {len(b)}")
    return td_computed


class TanimotoKernel(
    gaussian_process.kernels.NormalizedKernelMixin,
    gaussian_process.kernels.StationaryKernelMixin,
    gaussian_process.kernels.Kernel,
):
    """Gaussian process kernel whose values are pairwise Tanimoto similarities."""

    def __init__(self):
        """Create the kernel; it takes no parameters."""

    def __call__(self, X, Y=None, eval_gradient=False):  # pylint: disable=invalid-name
        """Evaluate the kernel between the rows of ``X`` and the rows of ``Y``.

        Args:
            X: Numpy array with shape [batch_size_a, num_features].
            Y: Numpy array with shape [batch_size_b, num_features]. When it is
                None the kernel is evaluated between ``X`` and itself.
            eval_gradient: Gradients are not supported, so this must be falsy.

        Returns:
            Numpy array with shape [batch_size_a, batch_size_b].

        Raises:
            NotImplementedError: If eval_gradient is True.
        """
        if eval_gradient:
            raise NotImplementedError
        other = X if Y is None else Y
        return _dask_tanimito_similarity(X, other)


class Query:
    """Factories for the query (acquisition) strategies.

    Every factory returns a ``functools.partial`` whose keywords carry a
    ``fegrow_label`` entry naming the strategy.

    NOTE(review): ``greedy`` below does not accept a ``fegrow_label`` keyword
    (and the modAL acquisition functions presumably do not either), so the
    consumer is presumably expected to remove that entry from
    ``partial.keywords`` before calling the partial - confirm with the caller.
    """

    @staticmethod
    def Greedy() -> Callable:
        """Build the greedy strategy, which picks the lowest predicted values.

        Returns:
            The greedy function wrapped in a partial labelled "greedy".
        """

        def greedy(optimizer, features, n_instances=1):
            """Pick the instances with the lowest predicted values.

            The selected indices are not sorted by prediction.

            Args:
                optimizer: BaseLearner. Model to use to score instances.
                features: modALinput. Featurization of the instances to choose from.
                n_instances: Integer. The number of instances to select.

            Returns:
                Indices of the instances chosen. If ``n_instances`` is at
                least the number of available instances, all indices are
                returned.
            """
            predictions = np.asarray(optimizer.predict(features))
            # argpartition requires kth < len(predictions); asking for the
            # whole pool (or more) simply selects every instance.
            if predictions.ndim == 1 and n_instances >= predictions.shape[0]:
                return np.arange(predictions.shape[0])
            return np.argpartition(predictions, n_instances)[:n_instances]

        return functools.partial(greedy, fegrow_label="greedy")

    @staticmethod
    def PI(tradeoff: float = 0) -> Callable:
        """
        Maximum PI query strategy. Selects the instance with highest probability of improvement.

        Args:
            tradeoff: Value controlling the tradeoff parameter.

        Returns:
            The function with pre-populated parameters.
        """
        return functools.partial(max_PI, tradeoff=tradeoff, fegrow_label="PI")

    @staticmethod
    def EI(tradeoff: float = 0) -> Callable:
        """
        Maximum EI query strategy. Selects the instance with highest expected improvement.

        Args:
            tradeoff: Value controlling the tradeoff parameter.

        Returns:
            The function with pre-populated parameters.
        """
        return functools.partial(max_EI, tradeoff=tradeoff, fegrow_label="EI")

    @staticmethod
    def UCB(beta: float = 1) -> Callable:
        """
        Maximum UCB query strategy. Selects the instance with highest upper confidence bound.

        Args:
            beta: Value controlling the beta parameter.

        Returns:
            The function with pre-populated parameters.
        """
        return functools.partial(max_UCB, beta=beta, fegrow_label="UCB")


class Model:
    """Factories for the regressors available as surrogate models.

    Every factory forwards its keyword arguments unchanged to the
    constructor of the underlying scikit-learn estimator.
    """

    @staticmethod
    def linear(**model_params):
        """Return a ``LinearRegression`` built from ``model_params``."""
        estimator_cls = linear_model.LinearRegression
        return estimator_cls(**model_params)

    @staticmethod
    def elastic_net(**model_params):
        """Return an ``ElasticNetCV`` built from ``model_params``."""
        estimator_cls = linear_model.ElasticNetCV
        return estimator_cls(**model_params)

    @staticmethod
    def random_forest(**model_params):
        """Return a ``RandomForestRegressor`` built from ``model_params``."""
        estimator_cls = ensemble.RandomForestRegressor
        return estimator_cls(**model_params)

    @staticmethod
    def gradient_boosting_regressor(**model_params):
        """Return a ``GradientBoostingRegressor`` built from ``model_params``."""
        estimator_cls = ensemble.GradientBoostingRegressor
        return estimator_cls(**model_params)

    @staticmethod
    def gaussian_process(**model_params):
        """Return a ``GaussianProcessRegressor`` that uses a ``TanimotoKernel``."""
        # Inside this body ``gaussian_process`` is the module-level sklearn
        # import, not this method: class attributes are not in function scope.
        tanimoto = TanimotoKernel()
        return gaussian_process.GaussianProcessRegressor(kernel=tanimoto, **model_params)

    @staticmethod
    def mlp_regressor(**model_params):
        """Return an ``MLPRegressor`` built from ``model_params``."""
        estimator_cls = neural_network.MLPRegressor
        return estimator_cls(**model_params)






Loading

0 comments on commit 7835d91

Please sign in to comment.