From 5be407be01c5eaf35135fddd412b0c75608eef26 Mon Sep 17 00:00:00 2001 From: Pradeep Bashyal Date: Tue, 29 Oct 2024 15:36:24 -0500 Subject: [PATCH 1/4] Only single `lgx` reductions - For `lgx` reduction, tracking of duplicate alleles is removed. For duplicate alleles, map it to the lower numbered allele in the list. --- pyard/ard.py | 4 +--- pyard/data_repository.py | 30 ++++++++++++++++++++---------- pyard/db.py | 10 ---------- pyard/mappings.py | 1 - tests/features/p_g_group.feature | 14 +++++++------- 5 files changed, 28 insertions(+), 31 deletions(-) diff --git a/pyard/ard.py b/pyard/ard.py index 61033e4..fcab4e9 100644 --- a/pyard/ard.py +++ b/pyard/ard.py @@ -246,9 +246,7 @@ def _redux_allele( elif redux_type == "P" and allele in self.ars_mappings.p_group: return self.ars_mappings.p_group[allele] elif redux_type in ["lgx", "lg"]: - if allele in self.ars_mappings.dup_lgx: - redux_allele = self.ars_mappings.dup_lgx[allele] - elif allele in self.ars_mappings.lgx_group: + if allele in self.ars_mappings.lgx_group: redux_allele = self.ars_mappings.lgx_group[allele] else: # for 'lgx' or 'lg' mode when allele is not in G group, diff --git a/pyard/data_repository.py b/pyard/data_repository.py index e21ecf7..bac642a 100644 --- a/pyard/data_repository.py +++ b/pyard/data_repository.py @@ -50,6 +50,7 @@ get_1field_allele, ) from .serology import broad_splits_dna_mapping, SerologyMapping +from .smart_sort import smart_sort_comparator def expression_reduce(df): @@ -113,15 +114,6 @@ def generate_ard_mapping(db_connection: sqlite3.Connection, imgt_version) -> ARS mlgx = df_g_group.drop_duplicates(["2d", "lgx"])["2d"].value_counts() multiple_lgx_list = mlgx[mlgx > 1].index.to_list() - # Keep only the alleles that have more than 1 mapping - dup_lgx = ( - df_g_group[df_g_group["2d"].isin(multiple_lgx_list)][["lgx", "2d"]] - .drop_duplicates() - .groupby("2d", as_index=True) - .agg("/".join) - .to_dict()["lgx"] - ) - # Extract G group mapping df_g = pd.concat( [ @@ -154,6 +146,25 @@ def generate_ard_mapping(db_connection: sqlite3.Connection, imgt_version) -> ARS ) lgx_group = df_lgx.set_index("A")["lgx"].to_dict() + # Find the alleles that have more than 1 mapping + dup_lgx = ( + df_g_group[df_g_group["2d"].isin(multiple_lgx_list)][["lgx", "2d"]] + .drop_duplicates() + .groupby("2d", as_index=True) + .agg(list) + .to_dict()["lgx"] + ) + print(dup_lgx) + # Do not keep duplicate alleles for lgx. Issue #333 + # DPA1*02:02/DPA1*02:07 ==> DPA1*02:02 + # + lowest_numbered_dup_lgx = { + k: sorted(v, key=functools.cmp_to_key(smart_sort_comparator))[0] + for k, v in dup_lgx.items() + } + # Update the lgx_group with the allele with the lowest number + lgx_group.update(lowest_numbered_dup_lgx) + # Extract exon mapping df_exon = pd.concat( [ @@ -164,7 +175,6 @@ def generate_ard_mapping(db_connection: sqlite3.Connection, imgt_version) -> ARS ars_mapping = ARSMapping( dup_g=dup_g, - dup_lgx=dup_lgx, g_group=g_group, p_group=p_group, lgx_group=lgx_group, diff --git a/pyard/db.py b/pyard/db.py index 65f6a52..45ce425 100644 --- a/pyard/db.py +++ b/pyard/db.py @@ -461,9 +461,6 @@ def set_user_version(connection: sqlite3.Connection, version: int): def load_ars_mappings(db_connection): dup_g = load_dict(db_connection, table_name="dup_g", columns=("allele", "g_group")) - dup_lgx = load_dict( - db_connection, table_name="dup_lgx", columns=("allele", "lgx_group") - ) g_group = load_dict(db_connection, table_name="g_group", columns=("allele", "g")) p_group = load_dict(db_connection, table_name="p_group", columns=("allele", "p")) lgx_group = load_dict( @@ -475,7 +472,6 @@ def load_ars_mappings(db_connection): p_not_g = load_dict(db_connection, table_name="p_not_g", columns=("allele", "lgx")) return ARSMapping( dup_g=dup_g, - dup_lgx=dup_lgx, g_group=g_group, p_group=p_group, lgx_group=lgx_group, @@ -497,12 +493,6 @@ def save_ars_mappings(db_connection: sqlite3.Connection, ars_mapping: ARSMapping dictionary=ars_mapping.dup_g, columns=("allele", "g_group"), ) - save_dict( - db_connection, - table_name="dup_lgx", - dictionary=ars_mapping.dup_lgx, - columns=("allele", "lgx_group"), - ) save_dict( db_connection, table_name="g_group", diff --git a/pyard/mappings.py b/pyard/mappings.py index 07b4f2f..9a42a13 100644 --- a/pyard/mappings.py +++ b/pyard/mappings.py @@ -23,7 +23,6 @@ ars_mapping_tables = [ "dup_g", - "dup_lgx", "g_group", "p_group", "lgx_group", diff --git a/tests/features/p_g_group.feature b/tests/features/p_g_group.feature index 3c5b877..8f8e75d 100644 --- a/tests/features/p_g_group.feature +++ b/tests/features/p_g_group.feature @@ -90,10 +90,10 @@ Feature: P and G Groups | C*02:10 | lg | C*02:02g | | C*02:10 | lgx | C*02:02 | - Examples: lgx with duplicates - | Allele | Level | Redux Allele | - | DPA1*02:12 | lgx | DPA1*02:02/DPA1*02:07 | - | DPA1*02:12 | lg | DPA1*02:02g/DPA1*02:07g | - | DQA1*03:03 | lgx | DQA1*03:01 | - | DQA1*03:03 | lg | DQA1*03:01g | - | DQA1*03:03:09 | lg | DQA1*03:03g | + Examples: lgx redux with duplicate G groups + | Allele | Level | Redux Allele | + | DPA1*02:12 | lgx | DPA1*02:02 | + | DPA1*02:12 | lg | DPA1*02:02g | + | DQA1*03:03 | lgx | DQA1*03:01 | + | DQA1*03:03 | lg | DQA1*03:01g | + | DQA1*03:03:09 | lg | DQA1*03:03g | From 43b4bfe46bbd2157e71725a16b4aac02179844aa Mon Sep 17 00:00:00 2001 From: Pradeep Bashyal Date: Tue, 29 Oct 2024 15:36:24 -0500 Subject: [PATCH 2/4] Only single `lgx` reductions - For `lgx` reduction, tracking of duplicate alleles is removed. For duplicate alleles, map it to the lower numbered allele in the list. --- pyard/ard.py | 4 +--- pyard/data_repository.py | 29 +++++++++++++++++++---------- pyard/db.py | 10 ---------- pyard/mappings.py | 1 - tests/features/p_g_group.feature | 14 +++++++------- 5 files changed, 27 insertions(+), 31 deletions(-) diff --git a/pyard/ard.py b/pyard/ard.py index 61033e4..fcab4e9 100644 --- a/pyard/ard.py +++ b/pyard/ard.py @@ -246,9 +246,7 @@ def _redux_allele( elif redux_type == "P" and allele in self.ars_mappings.p_group: return self.ars_mappings.p_group[allele] elif redux_type in ["lgx", "lg"]: - if allele in self.ars_mappings.dup_lgx: - redux_allele = self.ars_mappings.dup_lgx[allele] - elif allele in self.ars_mappings.lgx_group: + if allele in self.ars_mappings.lgx_group: redux_allele = self.ars_mappings.lgx_group[allele] else: # for 'lgx' or 'lg' mode when allele is not in G group, diff --git a/pyard/data_repository.py b/pyard/data_repository.py index e21ecf7..5f03bba 100644 --- a/pyard/data_repository.py +++ b/pyard/data_repository.py @@ -50,6 +50,7 @@ get_1field_allele, ) from .serology import broad_splits_dna_mapping, SerologyMapping +from .smart_sort import smart_sort_comparator def expression_reduce(df): @@ -113,15 +114,6 @@ def generate_ard_mapping(db_connection: sqlite3.Connection, imgt_version) -> ARS mlgx = df_g_group.drop_duplicates(["2d", "lgx"])["2d"].value_counts() multiple_lgx_list = mlgx[mlgx > 1].index.to_list() - # Keep only the alleles that have more than 1 mapping - dup_lgx = ( - df_g_group[df_g_group["2d"].isin(multiple_lgx_list)][["lgx", "2d"]] - .drop_duplicates() - .groupby("2d", as_index=True) - .agg("/".join) - .to_dict()["lgx"] - ) - # Extract G group mapping df_g = pd.concat( [ @@ -154,6 +146,24 @@ def generate_ard_mapping(db_connection: sqlite3.Connection, imgt_version) -> ARS ) lgx_group = df_lgx.set_index("A")["lgx"].to_dict() + # Find the alleles that have more than 1 mapping + dup_lgx = ( + df_g_group[df_g_group["2d"].isin(multiple_lgx_list)][["lgx", "2d"]] + .drop_duplicates() + .groupby("2d", as_index=True) + .agg(list) + .to_dict()["lgx"] + ) + # Do not keep duplicate alleles for lgx. Issue #333 + # DPA1*02:02/DPA1*02:07 ==> DPA1*02:02 + # + lowest_numbered_dup_lgx = { + k: sorted(v, key=functools.cmp_to_key(smart_sort_comparator))[0] + for k, v in dup_lgx.items() + } + # Update the lgx_group with the allele with the lowest number + lgx_group.update(lowest_numbered_dup_lgx) + # Extract exon mapping df_exon = pd.concat( [ @@ -164,7 +174,6 @@ def generate_ard_mapping(db_connection: sqlite3.Connection, imgt_version) -> ARS ars_mapping = ARSMapping( dup_g=dup_g, - dup_lgx=dup_lgx, g_group=g_group, p_group=p_group, lgx_group=lgx_group, diff --git a/pyard/db.py b/pyard/db.py index 65f6a52..45ce425 100644 --- a/pyard/db.py +++ b/pyard/db.py @@ -461,9 +461,6 @@ def set_user_version(connection: sqlite3.Connection, version: int): def load_ars_mappings(db_connection): dup_g = load_dict(db_connection, table_name="dup_g", columns=("allele", "g_group")) - dup_lgx = load_dict( - db_connection, table_name="dup_lgx", columns=("allele", "lgx_group") - ) g_group = load_dict(db_connection, table_name="g_group", columns=("allele", "g")) p_group = load_dict(db_connection, table_name="p_group", columns=("allele", "p")) lgx_group = load_dict( @@ -475,7 +472,6 @@ def load_ars_mappings(db_connection): p_not_g = load_dict(db_connection, table_name="p_not_g", columns=("allele", "lgx")) return ARSMapping( dup_g=dup_g, - dup_lgx=dup_lgx, g_group=g_group, p_group=p_group, lgx_group=lgx_group, @@ -497,12 +493,6 @@ def save_ars_mappings(db_connection: sqlite3.Connection, ars_mapping: ARSMapping dictionary=ars_mapping.dup_g, columns=("allele", "g_group"), ) - save_dict( - db_connection, - table_name="dup_lgx", - dictionary=ars_mapping.dup_lgx, - columns=("allele", "lgx_group"), - ) save_dict( db_connection, table_name="g_group", diff --git a/pyard/mappings.py b/pyard/mappings.py index 07b4f2f..9a42a13 100644 --- a/pyard/mappings.py +++ b/pyard/mappings.py @@ -23,7 +23,6 @@ ars_mapping_tables = [ "dup_g", - "dup_lgx", "g_group", "p_group", "lgx_group", diff --git a/tests/features/p_g_group.feature b/tests/features/p_g_group.feature index 3c5b877..8f8e75d 100644 --- a/tests/features/p_g_group.feature +++ b/tests/features/p_g_group.feature @@ -90,10 +90,10 @@ Feature: P and G Groups | C*02:10 | lg | C*02:02g | | C*02:10 | lgx | C*02:02 | - Examples: lgx with duplicates - | Allele | Level | Redux Allele | - | DPA1*02:12 | lgx | DPA1*02:02/DPA1*02:07 | - | DPA1*02:12 | lg | DPA1*02:02g/DPA1*02:07g | - | DQA1*03:03 | lgx | DQA1*03:01 | - | DQA1*03:03 | lg | DQA1*03:01g | - | DQA1*03:03:09 | lg | DQA1*03:03g | + Examples: lgx redux with duplicate G groups + | Allele | Level | Redux Allele | + | DPA1*02:12 | lgx | DPA1*02:02 | + | DPA1*02:12 | lg | DPA1*02:02g | + | DQA1*03:03 | lgx | DQA1*03:01 | + | DQA1*03:03 | lg | DQA1*03:01g | + | DQA1*03:03:09 | lg | DQA1*03:03g | From 165f6031e31bd69a29b13a1d72d6e4f998f09ae3 Mon Sep 17 00:00:00 2001 From: Pradeep Bashyal Date: Tue, 29 Oct 2024 15:42:24 -0500 Subject: [PATCH 3/4] Apply HLA- prefix correctly if the result is an allele list --- pyard/ard.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyard/ard.py b/pyard/ard.py index fcab4e9..88d8e7b 100644 --- a/pyard/ard.py +++ b/pyard/ard.py @@ -189,6 +189,8 @@ def _redux_allele( hla, allele_name = allele.split("-") redux_allele = self._redux_allele(allele_name, redux_type) if redux_allele: + if "/" in redux_allele: + return "/".join(["HLA-" + ra for ra in redux_allele.split("/")]) return "HLA-" + redux_allele else: return redux_allele From f91fef4395d7958e2b9ce600633ca1475d725640 Mon Sep 17 00:00:00 2001 From: Pradeep Bashyal Date: Tue, 29 Oct 2024 15:45:59 -0500 Subject: [PATCH 4/4] =?UTF-8?q?Bump=20version:=201.5.1=20=E2=86=92=201.5.2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile | 2 +- api-spec.yaml | 2 +- pyard/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Dockerfile b/Dockerfile index 2b57dd5..52d1bd7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,7 +4,7 @@ LABEL MAINTAINER="Pradeep Bashyal" WORKDIR /app -ARG PY_ARD_VERSION=1.5.1 +ARG PY_ARD_VERSION=1.5.2 COPY requirements.txt /app RUN pip install --no-cache-dir --upgrade pip && \ diff --git a/api-spec.yaml b/api-spec.yaml index 5604fca..83ddb6f 100644 --- a/api-spec.yaml +++ b/api-spec.yaml @@ -2,7 +2,7 @@ openapi: 3.0.3 info: title: ARD Reduction description: Reduce to ARD Level - version: "1.5.1" + version: "1.5.2" servers: - url: 'http://localhost:8080' tags: diff --git a/pyard/__init__.py b/pyard/__init__.py index a4e65fa..a2cc703 100644 --- a/pyard/__init__.py +++ b/pyard/__init__.py @@ -26,7 +26,7 @@ from .misc import get_imgt_db_versions as db_versions __author__ = """NMDP Bioinformatics""" -__version__ = "1.5.1" +__version__ = "1.5.2" def init( diff --git a/setup.cfg b/setup.cfg index 9d7b2e8..038a518 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 1.5.1 +current_version = 1.5.2 commit = True tag = True diff --git a/setup.py b/setup.py index af6b9c6..65ed2c2 100644 --- a/setup.py +++ b/setup.py @@ -36,7 +36,7 @@ setup( name="py-ard", - version="1.5.1", + version="1.5.2", description="ARD reduction for HLA with Python", long_description=readme, long_description_content_type="text/markdown",