
Commit

handle merge conflicts with main branch
DylanWelzel committed Jun 13, 2024
1 parent 37a742b commit 2de0fde
Showing 2 changed files with 6 additions and 31 deletions.
33 changes: 4 additions & 29 deletions src/hub/dataload/sources/pharmgkb/pharmgkb_parser.py
@@ -4,21 +4,12 @@

from biothings.utils.dataload import dict_sweep, unlist

try:
    from biothings import config
    logging = config.logger
except ImportError:
    import logging
    LOG_LEVEL = logging.INFO
    logging.basicConfig(level=LOG_LEVEL, format='%(asctime)s: %(message)s')

csv.field_size_limit(sys.maxsize)


def load_data(tsv_file):
    _file = open(tsv_file)
    reader = csv.DictReader(_file, delimiter='\t')
    _dict = {}
    for row in reader:
        _id = row["PharmGKB Accession Id"]
        _d = restr_dict(row)
@@ -92,7 +83,6 @@ def _parse_brand_mixtures(mixtures):
            parsed_mixtures.append(
                {"brand_name": brand_name, "mixture": ingredients})
        return parsed_mixtures

    _d = {}
    for key, val in iter(d.items()):
        if key in ["SMILES", "Name", "Type", "InChI"]:
@@ -122,34 +112,19 @@ def _parse_brand_mixtures(mixtures):
def clean_up(d):
    _li = ['xrefs', 'external_vocabulary']
    _d = {}

    def extract_primary_id(value):
        # Here, we prioritize extracting the numeric value before any comma.
        # This function returns the first numeric sequence found in the string.
        matches = re.findall(r'\d+', value)
        return int(matches[0]) if matches else None

    for key, val in iter(d.items()):
        if key in _li:
            for ele in val:
                idx = ele.find(':')
                # Note: original pharmgkb keys do not have '.'
                k = transform_xrefs_fieldnames(ele[0:idx])
                v = ele[idx+1:].strip()

                v = ele[idx+1:]
                if k in ["pubchem.cid", "pubchem.sid"]:
                    try:
                        v = int(v)
                    except ValueError:
                        v = extract_primary_id(v)
                        if v is None:
                            logging.warning(
                                f"Failed to extract primary ID for {k}: {ele}. Skipping this entry.")
                            continue

                    v = int(v)
                # Handle nested elements (ex: 'wikipedia.url_stub') here
                sub_d = sub_field(k, v)
                _d.update(sub_d)

    # 'xrefs' and 'external_vocabulary' are merged
    if 'external_vocabulary' in d.keys():
        d.pop('external_vocabulary')
    d.update({'xrefs': _d})
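For context, the xref handling in the hunk above splits each raw entry such as "pubchem.cid:12345" at the first colon, normalizes the key, and coerces PubChem identifiers to integers, falling back to the first numeric run when the value contains extra text. Below is a minimal standalone sketch of that logic, for illustration only: parse_xref is a hypothetical name, and the transform_xrefs_fieldnames and sub_field helpers used by the real parser are omitted here.

import re


def parse_xref(ele):
    # Split a raw entry like "pubchem.cid:12345" at the first ':'.
    idx = ele.find(':')
    k = ele[0:idx]  # the real parser also normalizes k via transform_xrefs_fieldnames
    v = ele[idx + 1:].strip()
    if k in ["pubchem.cid", "pubchem.sid"]:
        try:
            v = int(v)
        except ValueError:
            # Fall back to the first numeric run, mirroring the
            # extract_primary_id helper shown in the hunk above.
            matches = re.findall(r'\d+', v)
            v = int(matches[0]) if matches else None
    return k, v


# parse_xref("pubchem.cid:12345")         -> ("pubchem.cid", 12345)
# parse_xref("pubchem.sid:12345, 67890")  -> ("pubchem.sid", 12345)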
4 changes: 2 additions & 2 deletions src/hub/dataload/sources/unii/unii_parser.py
@@ -65,12 +65,12 @@ def load_data(input_file):
        del record['unii']['_id']
        if 'display name' in record['unii']:
            record['unii']['display_name'] = record['unii'].pop(
                'display name')
                'display name').strip()
    else:
        for subr in record['unii']:
            del subr['_id']
            if 'display name' in subr:
                subr['display_name'] = subr.pop('display name')
                subr['display_name'] = subr.pop('display name').strip()

    # convert fields to integer
    record = int_convert(record, include_keys=['unii.pubchem'])
