From 5be407be01c5eaf35135fddd412b0c75608eef26 Mon Sep 17 00:00:00 2001
From: Pradeep Bashyal <pbashyal@nmdp.org>
Date: Tue, 29 Oct 2024 15:36:24 -0500
Subject: [PATCH 1/4] Only single `lgx` reductions

- For `lgx` reduction, duplicate alleles are no longer tracked separately. An allele with more than one `lgx` mapping is now mapped to the lowest-numbered allele in its list.
---
 pyard/ard.py                     |  4 +---
 pyard/data_repository.py         | 30 ++++++++++++++++++++----------
 pyard/db.py                      | 10 ----------
 pyard/mappings.py                |  1 -
 tests/features/p_g_group.feature | 14 +++++++-------
 5 files changed, 28 insertions(+), 31 deletions(-)

diff --git a/pyard/ard.py b/pyard/ard.py
index 61033e4..fcab4e9 100644
--- a/pyard/ard.py
+++ b/pyard/ard.py
@@ -246,9 +246,7 @@ def _redux_allele(
         elif redux_type == "P" and allele in self.ars_mappings.p_group:
             return self.ars_mappings.p_group[allele]
         elif redux_type in ["lgx", "lg"]:
-            if allele in self.ars_mappings.dup_lgx:
-                redux_allele = self.ars_mappings.dup_lgx[allele]
-            elif allele in self.ars_mappings.lgx_group:
+            if allele in self.ars_mappings.lgx_group:
                 redux_allele = self.ars_mappings.lgx_group[allele]
             else:
                 # for 'lgx' or 'lg' mode when allele is not in G group,
diff --git a/pyard/data_repository.py b/pyard/data_repository.py
index e21ecf7..bac642a 100644
--- a/pyard/data_repository.py
+++ b/pyard/data_repository.py
@@ -50,6 +50,7 @@
     get_1field_allele,
 )
 from .serology import broad_splits_dna_mapping, SerologyMapping
+from .smart_sort import smart_sort_comparator
 
 
 def expression_reduce(df):
@@ -113,15 +114,6 @@ def generate_ard_mapping(db_connection: sqlite3.Connection, imgt_version) -> ARS
     mlgx = df_g_group.drop_duplicates(["2d", "lgx"])["2d"].value_counts()
     multiple_lgx_list = mlgx[mlgx > 1].index.to_list()
 
-    # Keep only the alleles that have more than 1 mapping
-    dup_lgx = (
-        df_g_group[df_g_group["2d"].isin(multiple_lgx_list)][["lgx", "2d"]]
-        .drop_duplicates()
-        .groupby("2d", as_index=True)
-        .agg("/".join)
-        .to_dict()["lgx"]
-    )
-
     # Extract G group mapping
     df_g = pd.concat(
         [
@@ -154,6 +146,25 @@ def generate_ard_mapping(db_connection: sqlite3.Connection, imgt_version) -> ARS
     )
     lgx_group = df_lgx.set_index("A")["lgx"].to_dict()
 
+    # Find the alleles that have more than 1 mapping
+    dup_lgx = (
+        df_g_group[df_g_group["2d"].isin(multiple_lgx_list)][["lgx", "2d"]]
+        .drop_duplicates()
+        .groupby("2d", as_index=True)
+        .agg(list)
+        .to_dict()["lgx"]
+    )
+    print(dup_lgx)
+    # Do not keep duplicate alleles for lgx (Issue #333): keep only the
+    # lowest-numbered allele of each list, e.g.
+    # DPA1*02:02/DPA1*02:07 ==> DPA1*02:02
+    lowest_numbered_dup_lgx = {
+        k: sorted(v, key=functools.cmp_to_key(smart_sort_comparator))[0]
+        for k, v in dup_lgx.items()
+    }
+    # Update the lgx_group with the allele with the lowest number
+    lgx_group.update(lowest_numbered_dup_lgx)
+
     # Extract exon mapping
     df_exon = pd.concat(
         [
@@ -164,7 +175,6 @@ def generate_ard_mapping(db_connection: sqlite3.Connection, imgt_version) -> ARS
 
     ars_mapping = ARSMapping(
         dup_g=dup_g,
-        dup_lgx=dup_lgx,
         g_group=g_group,
         p_group=p_group,
         lgx_group=lgx_group,
diff --git a/pyard/db.py b/pyard/db.py
index 65f6a52..45ce425 100644
--- a/pyard/db.py
+++ b/pyard/db.py
@@ -461,9 +461,6 @@ def set_user_version(connection: sqlite3.Connection, version: int):
 
 def load_ars_mappings(db_connection):
     dup_g = load_dict(db_connection, table_name="dup_g", columns=("allele", "g_group"))
-    dup_lgx = load_dict(
-        db_connection, table_name="dup_lgx", columns=("allele", "lgx_group")
-    )
     g_group = load_dict(db_connection, table_name="g_group", columns=("allele", "g"))
     p_group = load_dict(db_connection, table_name="p_group", columns=("allele", "p"))
     lgx_group = load_dict(
@@ -475,7 +472,6 @@ def load_ars_mappings(db_connection):
     p_not_g = load_dict(db_connection, table_name="p_not_g", columns=("allele", "lgx"))
     return ARSMapping(
         dup_g=dup_g,
-        dup_lgx=dup_lgx,
         g_group=g_group,
         p_group=p_group,
         lgx_group=lgx_group,
@@ -497,12 +493,6 @@ def save_ars_mappings(db_connection: sqlite3.Connection, ars_mapping: ARSMapping
         dictionary=ars_mapping.dup_g,
         columns=("allele", "g_group"),
     )
-    save_dict(
-        db_connection,
-        table_name="dup_lgx",
-        dictionary=ars_mapping.dup_lgx,
-        columns=("allele", "lgx_group"),
-    )
     save_dict(
         db_connection,
         table_name="g_group",
diff --git a/pyard/mappings.py b/pyard/mappings.py
index 07b4f2f..9a42a13 100644
--- a/pyard/mappings.py
+++ b/pyard/mappings.py
@@ -23,7 +23,6 @@
 
 ars_mapping_tables = [
     "dup_g",
-    "dup_lgx",
     "g_group",
     "p_group",
     "lgx_group",
diff --git a/tests/features/p_g_group.feature b/tests/features/p_g_group.feature
index 3c5b877..8f8e75d 100644
--- a/tests/features/p_g_group.feature
+++ b/tests/features/p_g_group.feature
@@ -90,10 +90,10 @@ Feature: P and G Groups
       | C*02:10    | lg    | C*02:02g     |
       | C*02:10    | lgx   | C*02:02      |
 
-    Examples: lgx with duplicates
-      | Allele        | Level | Redux Allele            |
-      | DPA1*02:12    | lgx   | DPA1*02:02/DPA1*02:07   |
-      | DPA1*02:12    | lg    | DPA1*02:02g/DPA1*02:07g |
-      | DQA1*03:03    | lgx   | DQA1*03:01              |
-      | DQA1*03:03    | lg    | DQA1*03:01g             |
-      | DQA1*03:03:09 | lg    | DQA1*03:03g             |
+    Examples: lgx redux with duplicate G groups
+      | Allele        | Level | Redux Allele |
+      | DPA1*02:12    | lgx   | DPA1*02:02   |
+      | DPA1*02:12    | lg    | DPA1*02:02g  |
+      | DQA1*03:03    | lgx   | DQA1*03:01   |
+      | DQA1*03:03    | lg    | DQA1*03:01g  |
+      | DQA1*03:03:09 | lg    | DQA1*03:03g  |

From 43b4bfe46bbd2157e71725a16b4aac02179844aa Mon Sep 17 00:00:00 2001
From: Pradeep Bashyal <pbashyal@nmdp.org>
Date: Tue, 29 Oct 2024 15:36:24 -0500
Subject: [PATCH 2/4] Only single `lgx` reductions

- For `lgx` reduction, duplicate alleles are no longer tracked separately. An allele with more than one `lgx` mapping is now mapped to the lowest-numbered allele in its list.
---
 pyard/ard.py                     |  4 +---
 pyard/data_repository.py         | 29 +++++++++++++++++++----------
 pyard/db.py                      | 10 ----------
 pyard/mappings.py                |  1 -
 tests/features/p_g_group.feature | 14 +++++++-------
 5 files changed, 27 insertions(+), 31 deletions(-)

diff --git a/pyard/ard.py b/pyard/ard.py
index 61033e4..fcab4e9 100644
--- a/pyard/ard.py
+++ b/pyard/ard.py
@@ -246,9 +246,7 @@ def _redux_allele(
         elif redux_type == "P" and allele in self.ars_mappings.p_group:
             return self.ars_mappings.p_group[allele]
         elif redux_type in ["lgx", "lg"]:
-            if allele in self.ars_mappings.dup_lgx:
-                redux_allele = self.ars_mappings.dup_lgx[allele]
-            elif allele in self.ars_mappings.lgx_group:
+            if allele in self.ars_mappings.lgx_group:
                 redux_allele = self.ars_mappings.lgx_group[allele]
             else:
                 # for 'lgx' or 'lg' mode when allele is not in G group,
diff --git a/pyard/data_repository.py b/pyard/data_repository.py
index e21ecf7..5f03bba 100644
--- a/pyard/data_repository.py
+++ b/pyard/data_repository.py
@@ -50,6 +50,7 @@
     get_1field_allele,
 )
 from .serology import broad_splits_dna_mapping, SerologyMapping
+from .smart_sort import smart_sort_comparator
 
 
 def expression_reduce(df):
@@ -113,15 +114,6 @@ def generate_ard_mapping(db_connection: sqlite3.Connection, imgt_version) -> ARS
     mlgx = df_g_group.drop_duplicates(["2d", "lgx"])["2d"].value_counts()
     multiple_lgx_list = mlgx[mlgx > 1].index.to_list()
 
-    # Keep only the alleles that have more than 1 mapping
-    dup_lgx = (
-        df_g_group[df_g_group["2d"].isin(multiple_lgx_list)][["lgx", "2d"]]
-        .drop_duplicates()
-        .groupby("2d", as_index=True)
-        .agg("/".join)
-        .to_dict()["lgx"]
-    )
-
     # Extract G group mapping
     df_g = pd.concat(
         [
@@ -154,6 +146,24 @@ def generate_ard_mapping(db_connection: sqlite3.Connection, imgt_version) -> ARS
     )
     lgx_group = df_lgx.set_index("A")["lgx"].to_dict()
 
+    # Find the alleles that have more than 1 mapping
+    dup_lgx = (
+        df_g_group[df_g_group["2d"].isin(multiple_lgx_list)][["lgx", "2d"]]
+        .drop_duplicates()
+        .groupby("2d", as_index=True)
+        .agg(list)
+        .to_dict()["lgx"]
+    )
+    # Do not keep duplicate alleles for lgx (Issue #333): keep only the
+    # lowest-numbered allele of each list, e.g.
+    # DPA1*02:02/DPA1*02:07 ==> DPA1*02:02
+    lowest_numbered_dup_lgx = {
+        k: sorted(v, key=functools.cmp_to_key(smart_sort_comparator))[0]
+        for k, v in dup_lgx.items()
+    }
+    # Update the lgx_group with the allele with the lowest number
+    lgx_group.update(lowest_numbered_dup_lgx)
+
     # Extract exon mapping
     df_exon = pd.concat(
         [
@@ -164,7 +174,6 @@ def generate_ard_mapping(db_connection: sqlite3.Connection, imgt_version) -> ARS
 
     ars_mapping = ARSMapping(
         dup_g=dup_g,
-        dup_lgx=dup_lgx,
         g_group=g_group,
         p_group=p_group,
         lgx_group=lgx_group,
diff --git a/pyard/db.py b/pyard/db.py
index 65f6a52..45ce425 100644
--- a/pyard/db.py
+++ b/pyard/db.py
@@ -461,9 +461,6 @@ def set_user_version(connection: sqlite3.Connection, version: int):
 
 def load_ars_mappings(db_connection):
     dup_g = load_dict(db_connection, table_name="dup_g", columns=("allele", "g_group"))
-    dup_lgx = load_dict(
-        db_connection, table_name="dup_lgx", columns=("allele", "lgx_group")
-    )
     g_group = load_dict(db_connection, table_name="g_group", columns=("allele", "g"))
     p_group = load_dict(db_connection, table_name="p_group", columns=("allele", "p"))
     lgx_group = load_dict(
@@ -475,7 +472,6 @@ def load_ars_mappings(db_connection):
     p_not_g = load_dict(db_connection, table_name="p_not_g", columns=("allele", "lgx"))
     return ARSMapping(
         dup_g=dup_g,
-        dup_lgx=dup_lgx,
         g_group=g_group,
         p_group=p_group,
         lgx_group=lgx_group,
@@ -497,12 +493,6 @@ def save_ars_mappings(db_connection: sqlite3.Connection, ars_mapping: ARSMapping
         dictionary=ars_mapping.dup_g,
         columns=("allele", "g_group"),
     )
-    save_dict(
-        db_connection,
-        table_name="dup_lgx",
-        dictionary=ars_mapping.dup_lgx,
-        columns=("allele", "lgx_group"),
-    )
     save_dict(
         db_connection,
         table_name="g_group",
diff --git a/pyard/mappings.py b/pyard/mappings.py
index 07b4f2f..9a42a13 100644
--- a/pyard/mappings.py
+++ b/pyard/mappings.py
@@ -23,7 +23,6 @@
 
 ars_mapping_tables = [
     "dup_g",
-    "dup_lgx",
     "g_group",
     "p_group",
     "lgx_group",
diff --git a/tests/features/p_g_group.feature b/tests/features/p_g_group.feature
index 3c5b877..8f8e75d 100644
--- a/tests/features/p_g_group.feature
+++ b/tests/features/p_g_group.feature
@@ -90,10 +90,10 @@ Feature: P and G Groups
       | C*02:10    | lg    | C*02:02g     |
       | C*02:10    | lgx   | C*02:02      |
 
-    Examples: lgx with duplicates
-      | Allele        | Level | Redux Allele            |
-      | DPA1*02:12    | lgx   | DPA1*02:02/DPA1*02:07   |
-      | DPA1*02:12    | lg    | DPA1*02:02g/DPA1*02:07g |
-      | DQA1*03:03    | lgx   | DQA1*03:01              |
-      | DQA1*03:03    | lg    | DQA1*03:01g             |
-      | DQA1*03:03:09 | lg    | DQA1*03:03g             |
+    Examples: lgx redux with duplicate G groups
+      | Allele        | Level | Redux Allele |
+      | DPA1*02:12    | lgx   | DPA1*02:02   |
+      | DPA1*02:12    | lg    | DPA1*02:02g  |
+      | DQA1*03:03    | lgx   | DQA1*03:01   |
+      | DQA1*03:03    | lg    | DQA1*03:01g  |
+      | DQA1*03:03:09 | lg    | DQA1*03:03g  |

From 165f6031e31bd69a29b13a1d72d6e4f998f09ae3 Mon Sep 17 00:00:00 2001
From: Pradeep Bashyal <pbashyal@nmdp.org>
Date: Tue, 29 Oct 2024 15:42:24 -0500
Subject: [PATCH 3/4] Apply HLA- prefix correctly if the result is an allele
 list

---
 pyard/ard.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pyard/ard.py b/pyard/ard.py
index fcab4e9..88d8e7b 100644
--- a/pyard/ard.py
+++ b/pyard/ard.py
@@ -189,6 +189,8 @@ def _redux_allele(
             hla, allele_name = allele.split("-")
             redux_allele = self._redux_allele(allele_name, redux_type)
             if redux_allele:
+                if "/" in redux_allele:
+                    return "/".join(["HLA-" + ra for ra in redux_allele.split("/")])
                 return "HLA-" + redux_allele
             else:
                 return redux_allele

From f91fef4395d7958e2b9ce600633ca1475d725640 Mon Sep 17 00:00:00 2001
From: Pradeep Bashyal <pbashyal@nmdp.org>
Date: Tue, 29 Oct 2024 15:45:59 -0500
Subject: [PATCH 4/4] =?UTF-8?q?Bump=20version:=201.5.1=20=E2=86=92=201.5.2?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 Dockerfile        | 2 +-
 api-spec.yaml     | 2 +-
 pyard/__init__.py | 2 +-
 setup.cfg         | 2 +-
 setup.py          | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 2b57dd5..52d1bd7 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -4,7 +4,7 @@ LABEL MAINTAINER="Pradeep Bashyal"
 
 WORKDIR /app
 
-ARG PY_ARD_VERSION=1.5.1
+ARG PY_ARD_VERSION=1.5.2
 
 COPY requirements.txt /app
 RUN pip install --no-cache-dir --upgrade pip && \
diff --git a/api-spec.yaml b/api-spec.yaml
index 5604fca..83ddb6f 100644
--- a/api-spec.yaml
+++ b/api-spec.yaml
@@ -2,7 +2,7 @@ openapi: 3.0.3
 info:
   title: ARD Reduction
   description: Reduce to ARD Level
-  version: "1.5.1"
+  version: "1.5.2"
 servers:
   - url: 'http://localhost:8080'
 tags:
diff --git a/pyard/__init__.py b/pyard/__init__.py
index a4e65fa..a2cc703 100644
--- a/pyard/__init__.py
+++ b/pyard/__init__.py
@@ -26,7 +26,7 @@
 from .misc import get_imgt_db_versions as db_versions
 
 __author__ = """NMDP Bioinformatics"""
-__version__ = "1.5.1"
+__version__ = "1.5.2"
 
 
 def init(
diff --git a/setup.cfg b/setup.cfg
index 9d7b2e8..038a518 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 1.5.1
+current_version = 1.5.2
 commit = True
 tag = True
 
diff --git a/setup.py b/setup.py
index af6b9c6..65ed2c2 100644
--- a/setup.py
+++ b/setup.py
@@ -36,7 +36,7 @@
 
 setup(
     name="py-ard",
-    version="1.5.1",
+    version="1.5.2",
     description="ARD reduction for HLA with Python",
     long_description=readme,
     long_description_content_type="text/markdown",