Merge pull request #179 from biothings/indexer-update
Add MyChemIndexer class
DylanWelzel authored Jun 17, 2024
2 parents e13aa96 + a56dae5 commit e6fd7c3
Showing 17 changed files with 862 additions and 837 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/app_tests.yml
@@ -12,7 +12,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
+        python-version: ['3.7', '3.8', '3.9', '3.10', '3.11', '3.12']
     steps:
       - name: Checkout source
         uses: actions/checkout@v3
@@ -28,7 +28,7 @@ jobs:
         run: pip install pytest
       - name: Run App Tests
         run: pytest test_local.py
-        working-directory: src/tests
+        working-directory: src/tests
     services:
       Elasticsearch:
         image: docker.elastic.co/elasticsearch/elasticsearch:8.6.0
2 changes: 1 addition & 1 deletion requirements_web.txt
@@ -3,5 +3,5 @@

# biothings[web_extra]==0.12.4

-# Fixes from 0.12.x up to 2024-05-06
+# Fixes from 0.12.x up to 2024-06-14
git+https://github.com/biothings/biothings.api@639302ace2f9dd90c8a9de57aa16c5a0d4beac27#egg=biothings[web_extra]
2 changes: 1 addition & 1 deletion src/config_hub.py
@@ -10,7 +10,7 @@
 INDEX_CONFIG = {
     "indexer_select": {
         # default
-        None: "hub.dataindex.indexer.DrugIndexer",
+        None: "hub.dataindex.indexer.MyChemIndexer",
     },
     "env": {
         "prod": {
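For illustration, roughly what that indexer_select entry amounts to: the hub looks up a dotted path and resolves it to an indexer class. The resolve_indexer helper below is a hypothetical stand-in, not the hub's actual resolution code.

import importlib

# Hypothetical helper, shown only to illustrate the dotted-path convention used by
# INDEX_CONFIG["indexer_select"]; the biothings hub performs its own resolution.
def resolve_indexer(dotted_path):
    module_path, class_name = dotted_path.rsplit(".", 1)
    return getattr(importlib.import_module(module_path), class_name)

indexer_cls = resolve_indexer("hub.dataindex.indexer.MyChemIndexer")  # -> MyChemIndexer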
34 changes: 28 additions & 6 deletions src/hub/dataindex/indexer.py
@@ -1,11 +1,33 @@
-import asyncio
+from copy import deepcopy
 
 from biothings.hub.dataindex.indexer import Indexer
 
+DEFAULT_INDEX_MAPPINGS = {
+    "properties": {
+        "all": {"type": "text"},
+        "name": {
+            "type": "text",
+            "fields": {
+                "raw": {
+                    "type": "keyword",
+                    "ignore_above": 128,
+                    "normalizer": "keyword_lowercase_normalizer"
+                }
+            },
+            "copy_to": "all"
+        }
+    }
+}
 
-class DrugIndexer(Indexer):
-    pass
-
-    # @asyncio.coroutine
-    # def index(self, job_manager, steps=("pre", "index", "post"), batch_size=2500, ids=None, mode="index"):
-    #     return super().index(job_manager, steps=steps, batch_size=batch_size, ids=ids, mode=mode)
+class MyChemIndexer(Indexer):
+    def __init__(self, build_doc, indexer_env, index_name):
+        super().__init__(build_doc, indexer_env, index_name)
+
+        new_mappings = deepcopy(DEFAULT_INDEX_MAPPINGS)
+
+        self.es_index_mappings["properties"].update(
+            new_mappings["properties"])
+
+        self.logger.debug("Updated Index mappings: %s",
+                          dict(self.es_index_mappings))
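As a standalone sketch of what the constructor above does, the same deepcopy-and-update merge can be run on a plain dict; the starting es_index_mappings value here is made up, since in the hub it comes from the build configuration.

from copy import deepcopy

DEFAULT_INDEX_MAPPINGS = {
    "properties": {
        "all": {"type": "text"},
        "name": {
            "type": "text",
            "fields": {
                "raw": {
                    "type": "keyword",
                    "ignore_above": 128,
                    "normalizer": "keyword_lowercase_normalizer",
                }
            },
            "copy_to": "all",
        },
    }
}

# Made-up stand-in for the mappings the Indexer base class would have assembled.
es_index_mappings = {"properties": {"chebi": {"properties": {"id": {"type": "keyword"}}}}}

# Same merge as MyChemIndexer.__init__: copy the defaults and layer them on top, so
# every index gains the shared "all" and "name" fields that the sources copy_to.
es_index_mappings["properties"].update(deepcopy(DEFAULT_INDEX_MAPPINGS)["properties"])

print(sorted(es_index_mappings["properties"]))  # ['all', 'chebi', 'name']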
3 changes: 2 additions & 1 deletion src/hub/dataload/sources/aeolus/aeolus_upload.py
@@ -110,7 +110,8 @@ def get_mapping(klass):
"drug_name": {
"type": "text",
"copy_to": [
"all"
"all",
"name"
]
},
"pt": {
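With aeolus.drug_name (and the matching fields changed below for ChEBI and ChEMBL) copied into the shared "name" field, one query can match drug names regardless of source. A hedged example using an 8.x elasticsearch-py client, matching the ES 8.6.0 service in the workflow above; the host and index name are placeholders.

from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")  # placeholder host

# A single match query on "name" now covers aeolus.drug_name, chebi.name,
# chembl.pref_name, etc., because each of them is copy_to'd into "name".
resp = es.search(index="mychem_current", query={"match": {"name": "imatinib"}})
for hit in resp["hits"]["hits"]:
    print(hit["_id"], hit["_score"])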
20 changes: 11 additions & 9 deletions src/hub/dataload/sources/chebi/chebi_upload.py
@@ -1,15 +1,15 @@
 import os
-import pymongo
 
-from .chebi_parser import ChebiParser, CompoundReader, OntologyReader
-from .exclusion_ids import exclusion_ids
-from hub.dataload.uploader import BaseDrugUploader
-from biothings.utils.mongo import get_src_db
 import biothings.hub.dataload.storage as storage
+import pymongo
 from biothings.utils.exclude_ids import ExcludeFieldsById
+from biothings.utils.mongo import get_src_db
 
+from hub.dataload.uploader import BaseDrugUploader
 from hub.datatransform.keylookup import MyChemKeyLookup
 
+from .chebi_parser import ChebiParser, CompoundReader, OntologyReader
+from .exclusion_ids import exclusion_ids
 
SRC_META = {
"url": 'https://www.ebi.ac.uk/chebi/',
@@ -38,7 +38,7 @@ class ChebiUploader(BaseDrugUploader):
- `chebi.xrefs.patent`
`ExcludeFieldsById` acts like a filter to truncate the length of such long lists to 1,000.
See the comment on the ExcludeFieldsById for use of this class.
"""
exclude_fields = ExcludeFieldsById(exclusion_ids, [
@@ -53,10 +53,12 @@ def load_data(self, data_folder):
         self.logger.info("Load data from '%s'" % data_folder)
 
         sdf_input_file = os.path.join(data_folder, "ChEBI_complete.sdf")
-        assert os.path.exists(sdf_input_file), "Can't find input file '%s'" % sdf_input_file
+        assert os.path.exists(
+            sdf_input_file), "Can't find input file '%s'" % sdf_input_file
 
         obo_input_file = os.path.join(data_folder, "chebi_lite.obo")
-        assert os.path.exists(obo_input_file), "Can't find input file '%s'" % obo_input_file
+        assert os.path.exists(
+            obo_input_file), "Can't find input file '%s'" % obo_input_file
 
         # get others source collection for inchi key conversion
         drugbank_col = get_src_db()["drugbank"]
@@ -276,7 +278,7 @@ def get_mapping(klass):
             },
             "name": {
                 "type": "text",
-                'copy_to': ['all'],
+                'copy_to': ['all', 'name'],
             },
             "charge": {
                 "type": "integer"
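The ChebiUploader docstring above describes ExcludeFieldsById as a filter that truncates over-long lists (for example chebi.xrefs.patent) to 1,000 entries for the documents listed in exclusion_ids. A simplified stand-in for that behaviour, not the real biothings.utils.exclude_ids implementation:

TRUNCATE_AT = 1000

def truncate_long_lists(doc, exclusion_ids, fields):
    # Simplified illustration of the behaviour described in the ChebiUploader docstring:
    # only documents whose _id is in exclusion_ids are touched, and only the listed
    # dotted-path fields (e.g. "chebi.xrefs.patent") are cut down to 1,000 entries.
    if doc.get("_id") not in exclusion_ids:
        return doc
    for path in fields:
        parent = doc
        *parents, leaf = path.split(".")
        for key in parents:
            parent = parent.get(key, {})  # assumes dict nesting along the path
        value = parent.get(leaf)
        if isinstance(value, list) and len(value) > TRUNCATE_AT:
            parent[leaf] = value[:TRUNCATE_AT]
    return doc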
37 changes: 24 additions & 13 deletions src/hub/dataload/sources/chembl/chembl_upload.py
@@ -1,15 +1,18 @@
"""
Chembl uploader
"""
import glob

# pylint: disable=E0401, E0611
import os
import glob

import biothings.hub.dataload.storage as storage
from biothings.hub.dataload.uploader import ParallelizedSourceUploader

from hub.dataload.uploader import BaseDrugUploader
from hub.datatransform.keylookup import MyChemKeyLookup
from .chembl_parser import AuxiliaryDataLoader, MoleculeDataLoader, load_chembl_data

from .chembl_parser import AuxiliaryDataLoader, MoleculeDataLoader, load_chembl_data

SRC_META = {
"url": 'https://www.ebi.ac.uk/chembl/',
@@ -50,13 +53,19 @@ def jobs(self):
         this method will be called by self.update_data() and then generate arguments for self.load_data() method,
         allowing parallelization
         """
-        molecule_filepaths = glob.glob(os.path.join(self.data_folder, self.MOLECULE_FILENAME_PATTERN))
-        mol_data_loaders = [MoleculeDataLoader(molecule_filepath=filepath) for filepath in molecule_filepaths]
+        molecule_filepaths = glob.glob(os.path.join(
+            self.data_folder, self.MOLECULE_FILENAME_PATTERN))
+        mol_data_loaders = [MoleculeDataLoader(
+            molecule_filepath=filepath) for filepath in molecule_filepaths]
 
-        drug_indication_filepaths = glob.iglob(os.path.join(self.data_folder, self.DRUG_INDICATION_FILENAME_PATTERN))
-        mechanism_filepaths = glob.iglob(os.path.join(self.data_folder, self.MECHANISM_FILENAME_PATTERN))
-        target_filepaths = glob.iglob(os.path.join(self.data_folder, self.TARGET_FILENAME_PATTERN))
-        binding_site_filepaths = glob.iglob(os.path.join(self.data_folder, self.BINDING_SITE_FILENAME_PATTERN))
+        drug_indication_filepaths = glob.iglob(os.path.join(
+            self.data_folder, self.DRUG_INDICATION_FILENAME_PATTERN))
+        mechanism_filepaths = glob.iglob(os.path.join(
+            self.data_folder, self.MECHANISM_FILENAME_PATTERN))
+        target_filepaths = glob.iglob(os.path.join(
+            self.data_folder, self.TARGET_FILENAME_PATTERN))
+        binding_site_filepaths = glob.iglob(os.path.join(
+            self.data_folder, self.BINDING_SITE_FILENAME_PATTERN))
         aux_data_loader = AuxiliaryDataLoader(drug_indication_filepaths=drug_indication_filepaths,
                                               mechanism_filepaths=mechanism_filepaths,
                                               target_filepaths=target_filepaths,
@@ -67,7 +76,8 @@ def jobs(self):

     def load_data(self, mol_data_loader: MoleculeDataLoader, aux_data_loader: AuxiliaryDataLoader):
         """load data from an input file"""
-        self.logger.info("Load data from file '%s'" % mol_data_loader.molecule_filepath)
+        self.logger.info("Load data from file '%s'" %
+                         mol_data_loader.molecule_filepath)
 
         return self.keylookup(load_chembl_data, debug=True)(mol_data_loader, aux_data_loader)

@@ -103,9 +113,9 @@ def get_mapping(cls):
                 }
             }
         },
-        "first_approval": {
-            "type": "integer"
-        },
+        "first_approval": {
+            "type": "integer"
+        },
         "indication_refs": {
             "properties": {
                 "id": {
@@ -535,7 +545,8 @@ def get_mapping(cls):
"pref_name": {
"type": "text",
"copy_to": [
"all"
"all",
"name"
]
},
"first_approval": {
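The jobs() docstring above explains the contract: self.update_data() calls jobs() to generate argument tuples, and each tuple is fed to load_data() so uploads can run in parallel. A simplified sketch of that dispatch; the real ParallelizedSourceUploader hands scheduling to the hub's job manager rather than a thread pool.

from concurrent.futures import ThreadPoolExecutor

def run_upload_jobs(uploader, max_workers=4):
    # Each tuple yielded by jobs() -- here (mol_data_loader, aux_data_loader) --
    # becomes one independent load_data() call; the pool is only a stand-in for
    # the hub's own job manager.
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = [pool.submit(uploader.load_data, *args) for args in uploader.jobs()]
        return [future.result() for future in futures]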
@@ -11,7 +11,8 @@
"type": "keyword"
},
"name": {
"type": "text"
"type": "text",
"copy_to": ["name"]
},
"cas_number": {
"normalizer": "keyword_lowercase_normalizer",
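These copy_to additions feed the shared "name" field defined in DEFAULT_INDEX_MAPPINGS above, which also carries a "name.raw" keyword sub-field (lowercase-normalized, ignored above 128 characters). That makes exact-value lookups and aggregations possible alongside full-text search; an example with the same placeholder client and index as before.

from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")  # placeholder host

# Exact-value lookup via the keyword sub-field defined in DEFAULT_INDEX_MAPPINGS...
exact = es.search(index="mychem_current", query={"term": {"name.raw": "ibuprofen"}})

# ...and a terms aggregation over the same sub-field.
top_names = es.search(
    index="mychem_current",
    size=0,
    aggs={"top_names": {"terms": {"field": "name.raw", "size": 10}}},
)
print(exact["hits"]["total"],
      [bucket["key"] for bucket in top_names["aggregations"]["top_names"]["buckets"]])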
