Skip to content

Commit

Permalink
add name group to mapping for sources
Browse files Browse the repository at this point in the history
  • Loading branch information
DylanWelzel committed Jun 12, 2024
1 parent 858e4fa commit 71c52db
Show file tree
Hide file tree
Showing 11 changed files with 820 additions and 793 deletions.
3 changes: 2 additions & 1 deletion src/hub/dataload/sources/aeolus/aeolus_upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,8 @@ def get_mapping(klass):
"drug_name": {
"type": "text",
"copy_to": [
"all"
"all",
"name"
]
},
"pt": {
Expand Down
20 changes: 11 additions & 9 deletions src/hub/dataload/sources/chebi/chebi_upload.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
import os
import pymongo

from .chebi_parser import ChebiParser, CompoundReader, OntologyReader
from .exclusion_ids import exclusion_ids
from hub.dataload.uploader import BaseDrugUploader
from biothings.utils.mongo import get_src_db
import biothings.hub.dataload.storage as storage
import pymongo
from biothings.utils.exclude_ids import ExcludeFieldsById
from biothings.utils.mongo import get_src_db

from hub.dataload.uploader import BaseDrugUploader
from hub.datatransform.keylookup import MyChemKeyLookup

from .chebi_parser import ChebiParser, CompoundReader, OntologyReader
from .exclusion_ids import exclusion_ids

SRC_META = {
"url": 'https://www.ebi.ac.uk/chebi/',
Expand Down Expand Up @@ -38,7 +38,7 @@ class ChebiUploader(BaseDrugUploader):
- `chebi.xrefs.patent`
`ExcludeFieldsById` acts like a filter to truncate the length of such long lists to 1,000.
See the comment on the ExcludeFieldsById for use of this class.
"""
exclude_fields = ExcludeFieldsById(exclusion_ids, [
Expand All @@ -53,10 +53,12 @@ def load_data(self, data_folder):
self.logger.info("Load data from '%s'" % data_folder)

sdf_input_file = os.path.join(data_folder, "ChEBI_complete.sdf")
assert os.path.exists(sdf_input_file), "Can't find input file '%s'" % sdf_input_file
assert os.path.exists(
sdf_input_file), "Can't find input file '%s'" % sdf_input_file

obo_input_file = os.path.join(data_folder, "chebi_lite.obo")
assert os.path.exists(obo_input_file), "Can't find input file '%s'" % obo_input_file
assert os.path.exists(
obo_input_file), "Can't find input file '%s'" % obo_input_file

# get other sources' collections for InChI key conversion
drugbank_col = get_src_db()["drugbank"]
Expand Down Expand Up @@ -276,7 +278,7 @@ def get_mapping(klass):
},
"name": {
"type": "text",
'copy_to': ['all'],
'copy_to': ['all', 'name'],
},
"charge": {
"type": "integer"
Expand Down
37 changes: 24 additions & 13 deletions src/hub/dataload/sources/chembl/chembl_upload.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,18 @@
"""
Chembl uploader
"""
import glob

# pylint: disable=E0401, E0611
import os
import glob

import biothings.hub.dataload.storage as storage
from biothings.hub.dataload.uploader import ParallelizedSourceUploader

from hub.dataload.uploader import BaseDrugUploader
from hub.datatransform.keylookup import MyChemKeyLookup
from .chembl_parser import AuxiliaryDataLoader, MoleculeDataLoader, load_chembl_data

from .chembl_parser import AuxiliaryDataLoader, MoleculeDataLoader, load_chembl_data

SRC_META = {
"url": 'https://www.ebi.ac.uk/chembl/',
Expand Down Expand Up @@ -50,13 +53,19 @@ def jobs(self):
this method will be called by self.update_data() and then generates arguments for the self.load_data() method,
allowing parallelization
"""
molecule_filepaths = glob.glob(os.path.join(self.data_folder, self.MOLECULE_FILENAME_PATTERN))
mol_data_loaders = [MoleculeDataLoader(molecule_filepath=filepath) for filepath in molecule_filepaths]
molecule_filepaths = glob.glob(os.path.join(
self.data_folder, self.MOLECULE_FILENAME_PATTERN))
mol_data_loaders = [MoleculeDataLoader(
molecule_filepath=filepath) for filepath in molecule_filepaths]

drug_indication_filepaths = glob.iglob(os.path.join(self.data_folder, self.DRUG_INDICATION_FILENAME_PATTERN))
mechanism_filepaths = glob.iglob(os.path.join(self.data_folder, self.MECHANISM_FILENAME_PATTERN))
target_filepaths = glob.iglob(os.path.join(self.data_folder, self.TARGET_FILENAME_PATTERN))
binding_site_filepaths = glob.iglob(os.path.join(self.data_folder, self.BINDING_SITE_FILENAME_PATTERN))
drug_indication_filepaths = glob.iglob(os.path.join(
self.data_folder, self.DRUG_INDICATION_FILENAME_PATTERN))
mechanism_filepaths = glob.iglob(os.path.join(
self.data_folder, self.MECHANISM_FILENAME_PATTERN))
target_filepaths = glob.iglob(os.path.join(
self.data_folder, self.TARGET_FILENAME_PATTERN))
binding_site_filepaths = glob.iglob(os.path.join(
self.data_folder, self.BINDING_SITE_FILENAME_PATTERN))
aux_data_loader = AuxiliaryDataLoader(drug_indication_filepaths=drug_indication_filepaths,
mechanism_filepaths=mechanism_filepaths,
target_filepaths=target_filepaths,
Expand All @@ -67,7 +76,8 @@ def jobs(self):

def load_data(self, mol_data_loader: MoleculeDataLoader, aux_data_loader: AuxiliaryDataLoader):
"""load data from an input file"""
self.logger.info("Load data from file '%s'" % mol_data_loader.molecule_filepath)
self.logger.info("Load data from file '%s'" %
mol_data_loader.molecule_filepath)

return self.keylookup(load_chembl_data, debug=True)(mol_data_loader, aux_data_loader)

Expand Down Expand Up @@ -103,9 +113,9 @@ def get_mapping(cls):
}
}
},
"first_approval": {
"type": "integer"
},
"first_approval": {
"type": "integer"
},
"indication_refs": {
"properties": {
"id": {
Expand Down Expand Up @@ -535,7 +545,8 @@ def get_mapping(cls):
"pref_name": {
"type": "text",
"copy_to": [
"all"
"all",
"name"
]
},
"first_approval": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@
"type": "keyword"
},
"name": {
"type": "text"
"type": "text",
"copy_to": ["name"]
},
"cas_number": {
"normalizer": "keyword_lowercase_normalizer",
Expand Down
Loading

0 comments on commit 71c52db

Please sign in to comment.