From 3cb762fd083e300da5b7188d486397bc1ca6d296 Mon Sep 17 00:00:00 2001 From: Jackson Burns Date: Wed, 29 Mar 2023 16:20:17 -0400 Subject: [PATCH] minhash is not a binary descriptor, move to exptl desc list --- AIMSim-demo.ipynb | 4 ++-- README.md | 2 +- aimsim/ops/descriptor.py | 8 ++++---- tests/test_Descriptor.py | 26 ++++++++++++++++++++++++++ 4 files changed, 33 insertions(+), 7 deletions(-) diff --git a/AIMSim-demo.ipynb b/AIMSim-demo.ipynb index bb162ad8..0a08e044 100644 --- a/AIMSim-demo.ipynb +++ b/AIMSim-demo.ipynb @@ -112,13 +112,13 @@ "|1|morgan|\n", "|2|topological|\n", "|3|daylight|\n", - "|4|minhash|\n", "\n", "Each of these fingerprints should be generally applicable for chemical problems, though they are all provided to serve as an easy way to compare the results according to fingerprinting approach.\n", "\n", "Additional descriptors are included with _AIMSim_ which are not mathematically compatible with some of the similarity measures. When such a descriptor is selected, the corresponding similarity measure will be removed from the dropdown.\n", "\n", "The `Exp. Descriptors` checkbox adds additional molecular descriptors into the `Molecular Descriptor` dropdown. These are marked as _experimental_ because they are generated using third-party libraries over which we have very little or no control. The descriptors generated by these libraries should be used only when the user has a very specific need for a descriptor as implemented in one of the packages below:\n", + " - [minhash](https://github.com/reymond-group/mhfp): Much faster fingerprint for similarity searches, supports non-binary similarity metrics.\n", " - [ccbmlib](https://doi.org/10.12688/f1000research.22292.2): All molecular fingerprints included in the `ccbmlib` library have been reproduced in _AIMSim_. Read about these fingerprints [in the `ccbmlib` repository](https://github.com/vogt-m/ccbmlib).\n", " - [mordred](https://doi.org/10.1186/s13321-018-0258-y): All 1000+ descriptors included in `mordred` are available in _AIMSim_, though as of Januray 2022 it seems that `mordred` is no longer being maintained and has a significant amount of bugs. Use at your own risk. (command line only)\n", " - [PaDELPy](https://doi.org/10.1002/jcc.21707): [This package](https://github.com/ecrl/padelpy) provides access to all of the molecular descriptors included as part of the PaDEL-Descriptor standalone Java program. (command line only)\n", @@ -3227,7 +3227,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.7.15" } }, "nbformat": 4, diff --git a/README.md b/README.md index 246bf97c..b718bc97 100644 --- a/README.md +++ b/README.md @@ -70,10 +70,10 @@ Start `AIMSim` with a prepared configuration YAML file (`config.yaml`): 1. Morgan Fingerprint (Equivalent to the ECFP fingerprints) 2. RDKit Topological Fingerprint 3. RDKit Daylight Fingerprint -4. MinHash Fingerprint (see [MHFP](https://github.com/reymond-group/mhfp)) _The following are available via command line use (config.yaml) only:_ +4. MinHash Fingerprint (see [MHFP](https://github.com/reymond-group/mhfp)) 5. All fingerprints available from the [ccbmlib](https://github.com/vogt-m/ccbmlib) package (_specify 'ccbmlib:descriptorname' for command line input_). 6. All descriptors and fingerprints available from [PaDELPy](https://github.com/ecrl/padelpy), an interface to PaDEL-Descriptor. (_specify 'padelpy:desciptorname' for command line input._). 7. All descriptors available through the [Mordred](https://github.com/mordred-descriptor/mordred) library (_specify 'mordred:desciptorname' for command line input._). To enable this option, you must install with `pip install 'aimsim[mordred]'` (see disclaimer in the Installation section above). diff --git a/aimsim/ops/descriptor.py b/aimsim/ops/descriptor.py index 72aed31f..87cb0f5e 100644 --- a/aimsim/ops/descriptor.py +++ b/aimsim/ops/descriptor.py @@ -265,7 +265,7 @@ def _set_ccbmlib_fingerprint(self, molecule_graph, descriptor, **kwargs): self.label_ = descriptor self.params_ = {} - def _set_minhash_fprint(self, molecule_graph, **kwargs): + def _set_minhash_fingerprint(self, molecule_graph, **kwargs): """Set the descriptor to the minhash fingerprint. Args: @@ -273,7 +273,7 @@ def _set_minhash_fprint(self, molecule_graph, **kwargs): """ mhfp_encoder = MHFPEncoder( - n_permutations=kwargs["n_premutations"], + n_permutations=kwargs["n_permutations"], seed=kwargs["seed"], ) fp = mhfp_encoder.encode_mol( @@ -307,7 +307,7 @@ def make_fingerprint( morgan_params = {"radius": 3, "n_bits": 1024} morgan_params.update(fingerprint_params) self._set_morgan_fingerprint(molecule_graph=molecule_graph, **morgan_params) - if fingerprint_type == "minhash_fingerprint": + elif fingerprint_type == "minhash_fingerprint": minhash_params = { "n_permutations": 2048, "seed": 42, @@ -499,7 +499,6 @@ def get_supported_fprints(): "morgan_fingerprint", "topological_fingerprint", "daylight_fingerprint", - "minhash_fingerprint", ] @staticmethod @@ -515,6 +514,7 @@ def get_all_supported_descriptors(): "morgan_fingerprint", "topological_fingerprint", "daylight_fingerprint", + "minhash_fingerprint", "maccs_keys", "atom-pair_fingerprint", "torsion_fingerprint", diff --git a/tests/test_Descriptor.py b/tests/test_Descriptor.py index ebdf11d2..662440f8 100644 --- a/tests/test_Descriptor.py +++ b/tests/test_Descriptor.py @@ -273,6 +273,31 @@ def test_padelpy_descriptors(self): with self.assertRaises(ValueError): descriptor.to_rdkit() + def test_minhash_fingerprint(self): + """Test creation of minhash fingerprint""" + mol_graph = MolFromSmiles("CCOCC") + descriptor = Descriptor() + descriptor.make_fingerprint( + molecule_graph=mol_graph, fingerprint_type="minhash_fingerprint" + ) + self.assertTrue( + descriptor.check_init(), + "Expected Descriptor object to be initialized", + ) + self.assertEqual( + descriptor.label_, + "minhash_fingerprint", + "Expected label of descriptor initialized with " + "{} to match the fingerprint".format("minhash_fingerprint"), + ) + self.assertIsInstance( + descriptor.to_numpy(), + np.ndarray, + "Expected numpy.ndarray from to_numpy()", + ) + with self.assertRaises(ValueError): + descriptor.to_rdkit() + def test_ccbmlib_descriptors(self): """Test ability to passthrough descriptors to ccbmlib.""" mol_graph = MolFromSmiles("CCOCC") @@ -310,6 +335,7 @@ def test_exptl_descriptors(self): "maccs_keys", "atom-pair_fingerprint", "torsion_fingerprint", + "minhash_fingerprint", ] for desc in fprint_list: descriptor = Descriptor()