From 2de0fde14c6e77b7cade433872b7ed5ff3726952 Mon Sep 17 00:00:00 2001
From: Dylan Welzel
Date: Thu, 13 Jun 2024 16:43:50 -0700
Subject: [PATCH] handle merge conflicts with main branch

---
 .../sources/pharmgkb/pharmgkb_parser.py       | 33 +++----------------
 src/hub/dataload/sources/unii/unii_parser.py  |  4 +--
 2 files changed, 6 insertions(+), 31 deletions(-)

diff --git a/src/hub/dataload/sources/pharmgkb/pharmgkb_parser.py b/src/hub/dataload/sources/pharmgkb/pharmgkb_parser.py
index 967d371..ec81116 100644
--- a/src/hub/dataload/sources/pharmgkb/pharmgkb_parser.py
+++ b/src/hub/dataload/sources/pharmgkb/pharmgkb_parser.py
@@ -4,21 +4,12 @@
 from biothings.utils.dataload import dict_sweep, unlist
 
-try:
-    from biothings import config
-    logging = config.logger
-except ImportError:
-    import logging
-    LOG_LEVEL = logging.INFO
-    logging.basicConfig(level=LOG_LEVEL, format='%(asctime)s: %(message)s')
-
 csv.field_size_limit(sys.maxsize)
 
 
 def load_data(tsv_file):
     _file = open(tsv_file)
     reader = csv.DictReader(_file, delimiter='\t')
-
     _dict = {}
     for row in reader:
         _id = row["PharmGKB Accession Id"]
         _d = restr_dict(row)
@@ -92,7 +83,6 @@ def _parse_brand_mixtures(mixtures):
             parsed_mixtures.append(
                 {"brand_name": brand_name, "mixture": ingredients})
         return parsed_mixtures
-
     _d = {}
     for key, val in iter(d.items()):
         if key in ["SMILES", "Name", "Type", "InChI"]:
@@ -122,34 +112,19 @@ def _parse_brand_mixtures(mixtures):
 def clean_up(d):
     _li = ['xrefs', 'external_vocabulary']
     _d = {}
-
-    def extract_primary_id(value):
-        # Here, we prioritize extracting the numeric value before any comma.
-        # This function returns the first numeric sequence found in the string.
-        matches = re.findall(r'\d+', value)
-        return int(matches[0]) if matches else None
-
     for key, val in iter(d.items()):
         if key in _li:
             for ele in val:
                 idx = ele.find(':')
+                # Note: original pharmgkb keys do not have '.'
                 k = transform_xrefs_fieldnames(ele[0:idx])
-                v = ele[idx+1:].strip()
-
+                v = ele[idx+1:]
                 if k in ["pubchem.cid", "pubchem.sid"]:
-                    try:
-                        v = int(v)
-                    except ValueError:
-                        v = extract_primary_id(v)
-                        if v is None:
-                            logging.warning(
-                                f"Failed to extract primary ID for {k}: {ele}. Skipping this entry.")
-                            continue
-
+                    v = int(v)
                 # Handle nested elements (ex: 'wikipedia.url_stub') here
                 sub_d = sub_field(k, v)
                 _d.update(sub_d)
-
+    # 'xrefs' and 'external_vocabulary' are merged
     if 'external_vocabulary' in d.keys():
         d.pop('external_vocabulary')
         d.update({'xrefs': _d})
diff --git a/src/hub/dataload/sources/unii/unii_parser.py b/src/hub/dataload/sources/unii/unii_parser.py
index 86e6a27..d038814 100644
--- a/src/hub/dataload/sources/unii/unii_parser.py
+++ b/src/hub/dataload/sources/unii/unii_parser.py
@@ -65,12 +65,12 @@ def load_data(input_file):
             del record['unii']['_id']
             if 'display name' in record['unii']:
                 record['unii']['display_name'] = record['unii'].pop(
-                    'display name')
+                    'display name').strip()
         else:
             for subr in record['unii']:
                 del subr['_id']
                 if 'display name' in subr:
-                    subr['display_name'] = subr.pop('display name')
+                    subr['display_name'] = subr.pop('display name').strip()
 
     # convert fields to integer
     record = int_convert(record, include_keys=['unii.pubchem'])
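
Illustrative note (not part of the patch): after this change, clean_up() splits each 'xrefs'/'external_vocabulary' entry on the first ':', maps the prefix to a dotted field name, and casts pubchem ids with a plain int(). The sketch below is a minimal, self-contained approximation of that flow; the input value, transform_xrefs_fieldnames, and sub_field here are stand-ins assumed for illustration, not the module's real helpers.

    # Sketch only: approximates the simplified xref handling in clean_up().
    def transform_xrefs_fieldnames(name):
        # assumption: maps a PharmGKB source label to a dotted field name
        return {"PubChem Compound": "pubchem.cid",
                "PubChem Substance": "pubchem.sid",
                "Wikipedia": "wikipedia.url_stub"}.get(name, name.lower())

    def sub_field(k, v):
        # assumption: expands a dotted key into a nested dict
        out = {}
        parts = k.split('.')
        cur = out
        for part in parts[:-1]:
            cur = cur.setdefault(part, {})
        cur[parts[-1]] = v
        return out

    xrefs = ["PubChem Compound:2244", "Wikipedia:Aspirin"]  # hypothetical input
    _d = {}
    for ele in xrefs:
        idx = ele.find(':')
        k = transform_xrefs_fieldnames(ele[0:idx])
        v = ele[idx + 1:]
        if k in ["pubchem.cid", "pubchem.sid"]:
            v = int(v)  # plain cast; the value is expected to be purely numeric
        _d.update(sub_field(k, v))

    print(_d)  # {'pubchem': {'cid': 2244}, 'wikipedia': {'url_stub': 'Aspirin'}}

Because the fallback parsing and logging are removed, a non-numeric pubchem value would now raise ValueError instead of being skipped with a warning.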