Add "name" copy_to group to the name fields in source mappings

biothings · Jun 12, 2024 · 71c52db · 71c52db
1 parent 858e4fa
commit 71c52db
Show file tree

Hide file tree

Showing 11 changed files with 820 additions and 793 deletions.
diff --git a/src/hub/dataload/sources/aeolus/aeolus_upload.py b/src/hub/dataload/sources/aeolus/aeolus_upload.py
@@ -110,7 +110,8 @@ def get_mapping(klass):
                     "drug_name": {
                         "type": "text",
                         "copy_to": [
-                            "all"
+                            "all",
+                            "name"
                         ]
                     },
                     "pt": {

diff --git a/src/hub/dataload/sources/chebi/chebi_upload.py b/src/hub/dataload/sources/chebi/chebi_upload.py
@@ -1,15 +1,15 @@
 import os
-import pymongo
 
-from .chebi_parser import ChebiParser, CompoundReader, OntologyReader
-from .exclusion_ids import exclusion_ids
-from hub.dataload.uploader import BaseDrugUploader
-from biothings.utils.mongo import get_src_db
 import biothings.hub.dataload.storage as storage
+import pymongo
 from biothings.utils.exclude_ids import ExcludeFieldsById
+from biothings.utils.mongo import get_src_db
 
+from hub.dataload.uploader import BaseDrugUploader
 from hub.datatransform.keylookup import MyChemKeyLookup
 
+from .chebi_parser import ChebiParser, CompoundReader, OntologyReader
+from .exclusion_ids import exclusion_ids
 
 SRC_META = {
     "url": 'https://www.ebi.ac.uk/chebi/',
@@ -38,7 +38,7 @@ class ChebiUploader(BaseDrugUploader):
     - `chebi.xrefs.patent`
 
     `ExcludeFieldsById` acts like a filter to truncate the length of such long lists to 1,000.
-    
+
     See the comment on the ExcludeFieldsById for use of this class.
     """
     exclude_fields = ExcludeFieldsById(exclusion_ids, [
@@ -53,10 +53,12 @@ def load_data(self, data_folder):
         self.logger.info("Load data from '%s'" % data_folder)
 
         sdf_input_file = os.path.join(data_folder, "ChEBI_complete.sdf")
-        assert os.path.exists(sdf_input_file), "Can't find input file '%s'" % sdf_input_file
+        assert os.path.exists(
+            sdf_input_file), "Can't find input file '%s'" % sdf_input_file
 
         obo_input_file = os.path.join(data_folder, "chebi_lite.obo")
-        assert os.path.exists(obo_input_file), "Can't find input file '%s'" % obo_input_file
+        assert os.path.exists(
+            obo_input_file), "Can't find input file '%s'" % obo_input_file
 
         # get other source collections for InChI key conversion
         drugbank_col = get_src_db()["drugbank"]
@@ -276,7 +278,7 @@ def get_mapping(klass):
                     },
                     "name": {
                         "type": "text",
-                        'copy_to': ['all'],
+                        'copy_to': ['all', 'name'],
                     },
                     "charge": {
                         "type": "integer"

diff --git a/src/hub/dataload/sources/chembl/chembl_upload.py b/src/hub/dataload/sources/chembl/chembl_upload.py
@@ -1,15 +1,18 @@
 """
 Chembl uploader
 """
+import glob
+
 # pylint: disable=E0401, E0611
 import os
-import glob
+
 import biothings.hub.dataload.storage as storage
 from biothings.hub.dataload.uploader import ParallelizedSourceUploader
+
 from hub.dataload.uploader import BaseDrugUploader
 from hub.datatransform.keylookup import MyChemKeyLookup
-from .chembl_parser import AuxiliaryDataLoader, MoleculeDataLoader, load_chembl_data
 
+from .chembl_parser import AuxiliaryDataLoader, MoleculeDataLoader, load_chembl_data
 
 SRC_META = {
     "url": 'https://www.ebi.ac.uk/chembl/',
@@ -50,13 +53,19 @@ def jobs(self):
         this method is called by self.update_data() and generates the arguments for the self.load_data() method,
         allowing parallelization
         """
-        molecule_filepaths = glob.glob(os.path.join(self.data_folder, self.MOLECULE_FILENAME_PATTERN))
-        mol_data_loaders = [MoleculeDataLoader(molecule_filepath=filepath) for filepath in molecule_filepaths]
+        molecule_filepaths = glob.glob(os.path.join(
+            self.data_folder, self.MOLECULE_FILENAME_PATTERN))
+        mol_data_loaders = [MoleculeDataLoader(
+            molecule_filepath=filepath) for filepath in molecule_filepaths]
 
-        drug_indication_filepaths = glob.iglob(os.path.join(self.data_folder, self.DRUG_INDICATION_FILENAME_PATTERN))
-        mechanism_filepaths = glob.iglob(os.path.join(self.data_folder, self.MECHANISM_FILENAME_PATTERN))
-        target_filepaths = glob.iglob(os.path.join(self.data_folder, self.TARGET_FILENAME_PATTERN))
-        binding_site_filepaths = glob.iglob(os.path.join(self.data_folder, self.BINDING_SITE_FILENAME_PATTERN))
+        drug_indication_filepaths = glob.iglob(os.path.join(
+            self.data_folder, self.DRUG_INDICATION_FILENAME_PATTERN))
+        mechanism_filepaths = glob.iglob(os.path.join(
+            self.data_folder, self.MECHANISM_FILENAME_PATTERN))
+        target_filepaths = glob.iglob(os.path.join(
+            self.data_folder, self.TARGET_FILENAME_PATTERN))
+        binding_site_filepaths = glob.iglob(os.path.join(
+            self.data_folder, self.BINDING_SITE_FILENAME_PATTERN))
         aux_data_loader = AuxiliaryDataLoader(drug_indication_filepaths=drug_indication_filepaths,
                                               mechanism_filepaths=mechanism_filepaths,
                                               target_filepaths=target_filepaths,
@@ -67,7 +76,8 @@ def jobs(self):
 
     def load_data(self, mol_data_loader: MoleculeDataLoader, aux_data_loader: AuxiliaryDataLoader):
         """load data from an input file"""
-        self.logger.info("Load data from file '%s'" % mol_data_loader.molecule_filepath)
+        self.logger.info("Load data from file '%s'" %
+                         mol_data_loader.molecule_filepath)
 
         return self.keylookup(load_chembl_data, debug=True)(mol_data_loader, aux_data_loader)
 
@@ -103,9 +113,9 @@ def get_mapping(cls):
                                     }
                                 }
                             },
-			    "first_approval": {
-				"type": "integer"
- 			    },
+                            "first_approval": {
+                                "type": "integer"
+                            },
                             "indication_refs": {
                                 "properties": {
                                     "id": {
@@ -535,7 +545,8 @@ def get_mapping(cls):
                     "pref_name": {
                         "type": "text",
                         "copy_to": [
-                            "all"
+                            "all",
+                            "name"
                         ]
                     },
                     "first_approval": {

diff --git a/src/hub/dataload/sources/drugbank_open/drugbank_open_mapping.py b/src/hub/dataload/sources/drugbank_open/drugbank_open_mapping.py
@@ -11,7 +11,8 @@
                 "type": "keyword"
             },
             "name": {
-                "type": "text"
+                "type": "text",
+                "copy_to": ["name"]
             },
             "cas_number": {
                 "normalizer": "keyword_lowercase_normalizer",
-Original file line number
+Diff line change
@@ -110,7 +110,8 @@ def get_mapping(klass):
                         "drug_name": {
                             "type": "text",
                             "copy_to": [
-                                "all"
+                                "all",
+                                "name"
                             ]
                         },
                         "pt": {
@@ Expand Down @@