Merge pull request #179 from biothings/indexer-update
Add MyChemIndexer class
DylanWelzel authored Jun 17, 2024
2 parents e13aa96 + a56dae5 commit e6fd7c3
Showing 17 changed files with 862 additions and 837 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/app_tests.yml
@@ -12,7 +12,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
+        python-version: ['3.7', '3.8', '3.9', '3.10', '3.11', '3.12']
     steps:
       - name: Checkout source
         uses: actions/checkout@v3
@@ -28,7 +28,7 @@ jobs:
         run: pip install pytest
       - name: Run App Tests
         run: pytest test_local.py
-        working-directory: src/tests
+        working-directory: src/tests
     services:
       Elasticsearch:
         image: docker.elastic.co/elasticsearch/elasticsearch:8.6.0
2 changes: 1 addition & 1 deletion requirements_web.txt
@@ -3,5 +3,5 @@

# biothings[web_extra]==0.12.4

-# Fixes from 0.12.x up to 2024-05-06
+# Fixes from 0.12.x up to 2024-06-14
git+https://github.com/biothings/biothings.api@639302ace2f9dd90c8a9de57aa16c5a0d4beac27#egg=biothings[web_extra]
2 changes: 1 addition & 1 deletion src/config_hub.py
@@ -10,7 +10,7 @@
 INDEX_CONFIG = {
     "indexer_select": {
         # default
-        None: "hub.dataindex.indexer.DrugIndexer",
+        None: "hub.dataindex.indexer.MyChemIndexer",
     },
     "env": {
         "prod": {
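For illustration, roughly what that indexer_select entry amounts to: the hub looks up a dotted path and resolves it to an indexer class. The resolve_indexer helper below is a hypothetical stand-in, not the hub's actual resolution code.

import importlib

# Hypothetical helper, shown only to illustrate the dotted-path convention used by
# INDEX_CONFIG["indexer_select"]; the biothings hub performs its own resolution.
def resolve_indexer(dotted_path):
    module_path, class_name = dotted_path.rsplit(".", 1)
    return getattr(importlib.import_module(module_path), class_name)

indexer_cls = resolve_indexer("hub.dataindex.indexer.MyChemIndexer")  # -> MyChemIndexer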
34 changes: 28 additions & 6 deletions src/hub/dataindex/indexer.py
@@ -1,11 +1,33 @@
-import asyncio
+from copy import deepcopy
 
 from biothings.hub.dataindex.indexer import Indexer
 
+DEFAULT_INDEX_MAPPINGS = {
+    "properties": {
+        "all": {"type": "text"},
+        "name": {
+            "type": "text",
+            "fields": {
+                "raw": {
+                    "type": "keyword",
+                    "ignore_above": 128,
+                    "normalizer": "keyword_lowercase_normalizer"
+                }
+            },
+            "copy_to": "all"
+        }
+    }
+}
 
-class DrugIndexer(Indexer):
-    pass
-
-    # @asyncio.coroutine
-    # def index(self, job_manager, steps=("pre", "index", "post"), batch_size=2500, ids=None, mode="index"):
-    #     return super().index(job_manager, steps=steps, batch_size=batch_size, ids=ids, mode=mode)
+class MyChemIndexer(Indexer):
+    def __init__(self, build_doc, indexer_env, index_name):
+        super().__init__(build_doc, indexer_env, index_name)
+
+        new_mappings = deepcopy(DEFAULT_INDEX_MAPPINGS)
+
+        self.es_index_mappings["properties"].update(
+            new_mappings["properties"])
+
+        self.logger.debug("Updated Index mappings: %s",
+                          dict(self.es_index_mappings))
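As a standalone sketch of what the constructor above does, the same deepcopy-and-update merge can be run on a plain dict; the starting es_index_mappings value here is made up, since in the hub it comes from the build configuration.

from copy import deepcopy

DEFAULT_INDEX_MAPPINGS = {
    "properties": {
        "all": {"type": "text"},
        "name": {
            "type": "text",
            "fields": {
                "raw": {
                    "type": "keyword",
                    "ignore_above": 128,
                    "normalizer": "keyword_lowercase_normalizer",
                }
            },
            "copy_to": "all",
        },
    }
}

# Made-up stand-in for the mappings the Indexer base class would have assembled.
es_index_mappings = {"properties": {"chebi": {"properties": {"id": {"type": "keyword"}}}}}

# Same merge as MyChemIndexer.__init__: copy the defaults and layer them on top, so
# every index gains the shared "all" and "name" fields that the sources copy_to.
es_index_mappings["properties"].update(deepcopy(DEFAULT_INDEX_MAPPINGS)["properties"])

print(sorted(es_index_mappings["properties"]))  # ['all', 'chebi', 'name']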
3 changes: 2 additions & 1 deletion src/hub/dataload/sources/aeolus/aeolus_upload.py
@@ -110,7 +110,8 @@ def get_mapping(klass):
"drug_name": {
"type": "text",
"copy_to": [
"all"
"all",
"name"
]
},
"pt": {
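With aeolus.drug_name (and the matching fields changed below for ChEBI and ChEMBL) copied into the shared "name" field, one query can match drug names regardless of source. A hedged example using an 8.x elasticsearch-py client, matching the ES 8.6.0 service in the workflow above; the host and index name are placeholders.

from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")  # placeholder host

# A single match query on "name" now covers aeolus.drug_name, chebi.name,
# chembl.pref_name, etc., because each of them is copy_to'd into "name".
resp = es.search(index="mychem_current", query={"match": {"name": "imatinib"}})
for hit in resp["hits"]["hits"]:
    print(hit["_id"], hit["_score"])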
20 changes: 11 additions & 9 deletions src/hub/dataload/sources/chebi/chebi_upload.py
@@ -1,15 +1,15 @@
 import os
-import pymongo
 
-from .chebi_parser import ChebiParser, CompoundReader, OntologyReader
-from .exclusion_ids import exclusion_ids
-from hub.dataload.uploader import BaseDrugUploader
-from biothings.utils.mongo import get_src_db
 import biothings.hub.dataload.storage as storage
+import pymongo
 from biothings.utils.exclude_ids import ExcludeFieldsById
+from biothings.utils.mongo import get_src_db
 
+from hub.dataload.uploader import BaseDrugUploader
 from hub.datatransform.keylookup import MyChemKeyLookup
 
+from .chebi_parser import ChebiParser, CompoundReader, OntologyReader
+from .exclusion_ids import exclusion_ids
 
SRC_META = {
"url": 'https://www.ebi.ac.uk/chebi/',
@@ -38,7 +38,7 @@ class ChebiUploader(BaseDrugUploader):
- `chebi.xrefs.patent`
`ExcludeFieldsById` acts like a filter to truncate the length of such long lists to 1,000.
See the comment on the ExcludeFieldsById for use of this class.
"""
exclude_fields = ExcludeFieldsById(exclusion_ids, [
@@ -53,10 +53,12 @@ def load_data(self, data_folder):
         self.logger.info("Load data from '%s'" % data_folder)
 
         sdf_input_file = os.path.join(data_folder, "ChEBI_complete.sdf")
-        assert os.path.exists(sdf_input_file), "Can't find input file '%s'" % sdf_input_file
+        assert os.path.exists(
+            sdf_input_file), "Can't find input file '%s'" % sdf_input_file
 
         obo_input_file = os.path.join(data_folder, "chebi_lite.obo")
-        assert os.path.exists(obo_input_file), "Can't find input file '%s'" % obo_input_file
+        assert os.path.exists(
+            obo_input_file), "Can't find input file '%s'" % obo_input_file
 
         # get others source collection for inchi key conversion
         drugbank_col = get_src_db()["drugbank"]
@@ -276,7 +278,7 @@ def get_mapping(klass):
             },
             "name": {
                 "type": "text",
-                'copy_to': ['all'],
+                'copy_to': ['all', 'name'],
             },
             "charge": {
                 "type": "integer"
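The ChebiUploader docstring above describes ExcludeFieldsById as a filter that truncates over-long lists (for example chebi.xrefs.patent) to 1,000 entries for the documents listed in exclusion_ids. A simplified stand-in for that behaviour, not the real biothings.utils.exclude_ids implementation:

TRUNCATE_AT = 1000

def truncate_long_lists(doc, exclusion_ids, fields):
    # Simplified illustration of the behaviour described in the ChebiUploader docstring:
    # only documents whose _id is in exclusion_ids are touched, and only the listed
    # dotted-path fields (e.g. "chebi.xrefs.patent") are cut down to 1,000 entries.
    if doc.get("_id") not in exclusion_ids:
        return doc
    for path in fields:
        parent = doc
        *parents, leaf = path.split(".")
        for key in parents:
            parent = parent.get(key, {})  # assumes dict nesting along the path
        value = parent.get(leaf)
        if isinstance(value, list) and len(value) > TRUNCATE_AT:
            parent[leaf] = value[:TRUNCATE_AT]
    return doc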
37 changes: 24 additions & 13 deletions src/hub/dataload/sources/chembl/chembl_upload.py
@@ -1,15 +1,18 @@
"""
Chembl uploader
"""
import glob

# pylint: disable=E0401, E0611
import os
import glob

import biothings.hub.dataload.storage as storage
from biothings.hub.dataload.uploader import ParallelizedSourceUploader

from hub.dataload.uploader import BaseDrugUploader
from hub.datatransform.keylookup import MyChemKeyLookup
from .chembl_parser import AuxiliaryDataLoader, MoleculeDataLoader, load_chembl_data

from .chembl_parser import AuxiliaryDataLoader, MoleculeDataLoader, load_chembl_data

SRC_META = {
"url": 'https://www.ebi.ac.uk/chembl/',
@@ -50,13 +53,19 @@ def jobs(self):
         this method will be called by self.update_data() and then generate arguments for self.load_data() method,
         allowing parallelization
         """
-        molecule_filepaths = glob.glob(os.path.join(self.data_folder, self.MOLECULE_FILENAME_PATTERN))
-        mol_data_loaders = [MoleculeDataLoader(molecule_filepath=filepath) for filepath in molecule_filepaths]
+        molecule_filepaths = glob.glob(os.path.join(
+            self.data_folder, self.MOLECULE_FILENAME_PATTERN))
+        mol_data_loaders = [MoleculeDataLoader(
+            molecule_filepath=filepath) for filepath in molecule_filepaths]
 
-        drug_indication_filepaths = glob.iglob(os.path.join(self.data_folder, self.DRUG_INDICATION_FILENAME_PATTERN))
-        mechanism_filepaths = glob.iglob(os.path.join(self.data_folder, self.MECHANISM_FILENAME_PATTERN))
-        target_filepaths = glob.iglob(os.path.join(self.data_folder, self.TARGET_FILENAME_PATTERN))
-        binding_site_filepaths = glob.iglob(os.path.join(self.data_folder, self.BINDING_SITE_FILENAME_PATTERN))
+        drug_indication_filepaths = glob.iglob(os.path.join(
+            self.data_folder, self.DRUG_INDICATION_FILENAME_PATTERN))
+        mechanism_filepaths = glob.iglob(os.path.join(
+            self.data_folder, self.MECHANISM_FILENAME_PATTERN))
+        target_filepaths = glob.iglob(os.path.join(
+            self.data_folder, self.TARGET_FILENAME_PATTERN))
+        binding_site_filepaths = glob.iglob(os.path.join(
+            self.data_folder, self.BINDING_SITE_FILENAME_PATTERN))
         aux_data_loader = AuxiliaryDataLoader(drug_indication_filepaths=drug_indication_filepaths,
                                               mechanism_filepaths=mechanism_filepaths,
                                               target_filepaths=target_filepaths,
@@ -67,7 +76,8 @@ def jobs(self):

     def load_data(self, mol_data_loader: MoleculeDataLoader, aux_data_loader: AuxiliaryDataLoader):
         """load data from an input file"""
-        self.logger.info("Load data from file '%s'" % mol_data_loader.molecule_filepath)
+        self.logger.info("Load data from file '%s'" %
+                         mol_data_loader.molecule_filepath)
 
         return self.keylookup(load_chembl_data, debug=True)(mol_data_loader, aux_data_loader)

@@ -103,9 +113,9 @@ def get_mapping(cls):
                 }
             }
         },
-        "first_approval": {
-            "type": "integer"
-        },
+        "first_approval": {
+            "type": "integer"
+        },
         "indication_refs": {
             "properties": {
                 "id": {
@@ -535,7 +545,8 @@ def get_mapping(cls):
"pref_name": {
"type": "text",
"copy_to": [
"all"
"all",
"name"
]
},
"first_approval": {
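The jobs() docstring above explains the contract: self.update_data() calls jobs() to generate argument tuples, and each tuple is fed to load_data() so uploads can run in parallel. A simplified sketch of that dispatch; the real ParallelizedSourceUploader hands scheduling to the hub's job manager rather than a thread pool.

from concurrent.futures import ThreadPoolExecutor

def run_upload_jobs(uploader, max_workers=4):
    # Each tuple yielded by jobs() -- here (mol_data_loader, aux_data_loader) --
    # becomes one independent load_data() call; the pool is only a stand-in for
    # the hub's own job manager.
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = [pool.submit(uploader.load_data, *args) for args in uploader.jobs()]
        return [future.result() for future in futures]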
@@ -11,7 +11,8 @@
"type": "keyword"
},
"name": {
"type": "text"
"type": "text",
"copy_to": ["name"]
},
"cas_number": {
"normalizer": "keyword_lowercase_normalizer",
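These copy_to additions feed the shared "name" field defined in DEFAULT_INDEX_MAPPINGS above, which also carries a "name.raw" keyword sub-field (lowercase-normalized, ignored above 128 characters). That makes exact-value lookups and aggregations possible alongside full-text search; an example with the same placeholder client and index as before.

from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")  # placeholder host

# Exact-value lookup via the keyword sub-field defined in DEFAULT_INDEX_MAPPINGS...
exact = es.search(index="mychem_current", query={"term": {"name.raw": "ibuprofen"}})

# ...and a terms aggregation over the same sub-field.
top_names = es.search(
    index="mychem_current",
    size=0,
    aggs={"top_names": {"terms": {"field": "name.raw", "size": 10}}},
)
print(exact["hits"]["total"],
      [bucket["key"] for bucket in top_names["aggregations"]["top_names"]["buckets"]])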
