diff --git a/src/hub/dataload/sources/pharmgkb/pharmgkb_parser.py b/src/hub/dataload/sources/pharmgkb/pharmgkb_parser.py index 967d371..ec81116 100644 --- a/src/hub/dataload/sources/pharmgkb/pharmgkb_parser.py +++ b/src/hub/dataload/sources/pharmgkb/pharmgkb_parser.py @@ -4,21 +4,12 @@ from biothings.utils.dataload import dict_sweep, unlist -try: - from biothings import config - logging = config.logger -except ImportError: - import logging - LOG_LEVEL = logging.INFO - logging.basicConfig(level=LOG_LEVEL, format='%(asctime)s: %(message)s') - csv.field_size_limit(sys.maxsize) def load_data(tsv_file): _file = open(tsv_file) reader = csv.DictReader(_file, delimiter='\t') - _dict = {} for row in reader: _id = row["PharmGKB Accession Id"] _d = restr_dict(row) @@ -92,7 +83,6 @@ def _parse_brand_mixtures(mixtures): parsed_mixtures.append( {"brand_name": brand_name, "mixture": ingredients}) return parsed_mixtures - _d = {} for key, val in iter(d.items()): if key in ["SMILES", "Name", "Type", "InChI"]: @@ -122,34 +112,19 @@ def _parse_brand_mixtures(mixtures): def clean_up(d): _li = ['xrefs', 'external_vocabulary'] _d = {} - - def extract_primary_id(value): - # Here, we prioritize extracting the numeric value before any comma. - # This function returns the first numeric sequence found in the string. - matches = re.findall(r'\d+', value) - return int(matches[0]) if matches else None - for key, val in iter(d.items()): if key in _li: for ele in val: idx = ele.find(':') + # Note: original pharmgkb keys do not have '.' k = transform_xrefs_fieldnames(ele[0:idx]) - v = ele[idx+1:].strip() - + v = ele[idx+1:] if k in ["pubchem.cid", "pubchem.sid"]: - try: - v = int(v) - except ValueError: - v = extract_primary_id(v) - if v is None: - logging.warning( - f"Failed to extract primary ID for {k}: {ele}. Skipping this entry.") - continue - + v = int(v) # Handle nested elements (ex: 'wikipedia.url_stub') here sub_d = sub_field(k, v) _d.update(sub_d) - + # 'xrefs' and 'external_vocabulary' are merged if 'external_vocabulary' in d.keys(): d.pop('external_vocabulary') d.update({'xrefs': _d}) diff --git a/src/hub/dataload/sources/unii/unii_parser.py b/src/hub/dataload/sources/unii/unii_parser.py index 86e6a27..d038814 100644 --- a/src/hub/dataload/sources/unii/unii_parser.py +++ b/src/hub/dataload/sources/unii/unii_parser.py @@ -65,12 +65,12 @@ def load_data(input_file): del record['unii']['_id'] if 'display name' in record['unii']: record['unii']['display_name'] = record['unii'].pop( - 'display name') + 'display name').strip() else: for subr in record['unii']: del subr['_id'] if 'display name' in subr: - subr['display_name'] = subr.pop('display name') + subr['display_name'] = subr.pop('display name').strip() # convert fields to integer record = int_convert(record, include_keys=['unii.pubchem'])