Skip to content

Commit

Permalink
PnotG (#178)
Browse files Browse the repository at this point in the history
* works but not for G mode

* ping mode

* updated test to fix its behavior

* remove lambdas, add and remove comments

* fixed test expected results to new correct answer

* ping tests

* lint
  • Loading branch information
mmaiers-nmdp authored Sep 29, 2022
1 parent 47e2529 commit 0952a16
Show file tree
Hide file tree
Showing 7 changed files with 138 additions and 19 deletions.
76 changes: 62 additions & 14 deletions pyard/data_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,13 @@
from collections import namedtuple
import functools
import sqlite3

import pandas as pd

from . import db
from .broad_splits import broad_splits_dna_mapping
from .broad_splits import broad_splits_ser_mapping
from .misc import get_2field_allele, get_3field_allele, number_of_fields
from .misc import expression_chars
from .misc import expression_chars, get_G_name, get_P_name

# GitHub URL where IMGT HLA files are downloaded.
from pyard.smart_sort import smart_sort_comparator
Expand All @@ -46,6 +45,7 @@
"lgx_group",
"exon_group",
"p_group",
"p_not_g",
]
ARSMapping = namedtuple("ARSMapping", ars_mapping_tables)

Expand Down Expand Up @@ -102,6 +102,9 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
p_group = db.load_dict(
db_connection, table_name="p_group", columns=("allele", "p")
)
p_not_g = db.load_dict(
db_connection, table_name="p_not_g", columns=("allele", "lgx")
)
return ARSMapping(
dup_g=dup_g,
dup_lg=dup_lg,
Expand All @@ -111,13 +114,46 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
lgx_group=lgx_group,
exon_group=exon_group,
p_group=p_group,
p_not_g=p_not_g,
)

# load the hla_nom_g.txt
ars_G_url = f"{IMGT_HLA_URL}{imgt_version}/wmda/hla_nom_g.txt"
df = pd.read_csv(ars_G_url, skiprows=6, names=["Locus", "A", "G"], sep=";").dropna()

# the G-group is named for its first allele
df["G"] = df["A"].apply(get_G_name)

# load the hla_nom_p.txt
ars_P_url = f"{IMGT_HLA_URL}{imgt_version}/wmda/hla_nom_p.txt"
# example: C*;06:06:01:01/06:06:01:02/06:271;06:06P
df_P = pd.read_csv(
ars_P_url, skiprows=6, names=["Locus", "A", "P"], sep=";"
).dropna()

# the P-group is named for its first allele
df_P["P"] = df_P["A"].apply(get_P_name)

# convert slash delimited string to a list
df_P["A"] = df_P["A"].apply(lambda a: a.split("/"))
df_P = df_P.explode("A")
# C* 06:06:01:01/06:06:01:02/06:271 06:06P
df_P["A"] = df_P["Locus"] + df_P["A"]
df_P["P"] = df_P["Locus"] + df_P["P"]
# C* 06:06:01:01 06:06P
# C* 06:06:01:02 06:06P
# C* 06:271 06:06P
p_group = df_P.set_index("A")["P"].to_dict()
df_P["2d"] = df_P["A"].apply(get_2field_allele)
# lgx has the P-group name without the P for comparison
df_P["lgx"] = df_P["P"].apply(get_2field_allele)

# convert slash delimited string to a list
df["A"] = df["A"].apply(lambda a: a.split("/"))
# convert the list into separate rows for each element
df = df.explode("A")

# A* + 02:01 = A*02:01
df["A"] = df["Locus"] + df["A"]
df["G"] = df["Locus"] + df["G"]

Expand All @@ -126,8 +162,24 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
df["lg"] = df["G"].apply(lambda a: ":".join(a.split(":")[0:2]) + "g")
df["lgx"] = df["G"].apply(lambda a: ":".join(a.split(":")[0:2]))

# compare df_P["2d"] with df["2d"] to find 2-field alleles in the
# P-group that aren't in the G-group
PnotinG = set(df_P["2d"]) - set(df["2d"])

# filter to find these 2-field alleles (2d) in the P-group data frame
df_PnotG = df_P[df_P["2d"].isin(PnotinG)]

# dictionary which will define the table
p_not_g = df_PnotG.set_index("A")["lgx"].to_dict()

# multiple Gs
# goal: identify 2-field alleles that are in multiple G-groups

# group by 2d and G, and select the 2d column and count the columns
mg = df.drop_duplicates(["2d", "G"])["2d"].value_counts()
# filter out the mg with count > 1, leaving only duplicates
# take the index from the 2d version the data frame, make that a column
# and turn that into a list
multiple_g_list = mg[mg > 1].reset_index()["index"].to_list()

# Keep only the alleles that have more than 1 mapping
Expand Down Expand Up @@ -202,18 +254,13 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
)
exon_group = df_exon.set_index("A")["exon"].to_dict()

# P groups
ars_P_url = f"{IMGT_HLA_URL}{imgt_version}/wmda/hla_nom_p.txt"
df_P = pd.read_csv(
ars_P_url, skiprows=6, names=["Locus", "A", "P"], sep=";"
).dropna()
df_P["A"] = df_P["A"].apply(lambda a: a.split("/"))
df_P = df_P.explode("A")
df_P["A"] = df_P["Locus"] + df_P["A"]
df_P["P"] = df_P["Locus"] + df_P["P"]
p_group = df_P.set_index("A")["P"].to_dict()

# save
db.save_dict(
db_connection,
table_name="p_not_g",
dictionary=p_not_g,
columns=("allele", "lgx"),
)
db.save_dict(
db_connection,
table_name="dup_g",
Expand Down Expand Up @@ -256,7 +303,7 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
db.save_dict(
db_connection,
table_name="p_group",
dictionary=exon_group,
dictionary=p_group,
columns=("allele", "p"),
)

Expand All @@ -269,6 +316,7 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
lgx_group=lgx_group,
exon_group=exon_group,
p_group=p_group,
p_not_g=p_not_g,
)


Expand Down
34 changes: 34 additions & 0 deletions pyard/misc.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# List of expression characters
expression_chars = ["N", "Q", "L", "S"]
# List of P and G characters
PandG_chars = ["P", "G"]


def get_n_field_allele(allele: str, n: int, preserve_expression=False) -> str:
Expand All @@ -20,12 +22,44 @@ def get_n_field_allele(allele: str, n: int, preserve_expression=False) -> str:


def get_3field_allele(a: str) -> str:
    """Truncate allele *a* to at most 3 fields, stripping any trailing
    P/G group designator first."""
    trimmed = a[:-1] if a[-1] in PandG_chars else a
    return get_n_field_allele(trimmed, 3)


def get_2field_allele(a: str) -> str:
    """Truncate allele *a* to at most 2 fields, stripping any trailing
    P/G group designator first."""
    trimmed = a[:-1] if a[-1] in PandG_chars else a
    return get_n_field_allele(trimmed, 2)


def number_of_fields(allele: str) -> int:
    """Return the number of colon-separated fields in *allele*."""
    # N fields are separated by N-1 colons.
    return allele.count(":") + 1


# computes a valid G name based on the ambiguity string
def get_G_name(a: str) -> str:
    """Derive the G-group name from a '/'-delimited ambiguity string.

    The group is named after its first allele: strip any trailing P/G or
    expression character, then use the first 3 fields (padding a 2-field
    allele with ':01') and append the 'G' suffix.
    """
    first = a.split("/")[0]
    if first[-1] in PandG_chars + expression_chars:
        first = first[:-1]
    fields = first.split(":")
    if len(fields) == 2:
        return f"{first}:01G"
    return ":".join(fields[:3]) + "G"


# computes a valid P name based on the ambiguity string
def get_P_name(a: str) -> str:
    """Derive the P-group name from a '/'-delimited ambiguity string.

    The group is named after its first allele: strip any trailing P/G or
    expression character, then keep the first 2 fields and append 'P'.
    """
    first = a.split("/")[0]
    if first[-1] in PandG_chars + expression_chars:
        first = first[:-1]
    return ":".join(first.split(":")[:2]) + "P"


# NOTE(review): this appears to be a second, identical definition of
# number_of_fields in the same module (one is also defined above the
# get_G_name/get_P_name additions); the later definition silently shadows
# the earlier one — confirm and remove one copy.
def number_of_fields(allele: str) -> int:
    # Fields in an HLA allele name are separated by ':'.
    return len(allele.split(":"))
10 changes: 9 additions & 1 deletion pyard/pyard.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
"reduce_XX": True,
"reduce_MAC": True,
"reduce_shortnull": True,
"ping": False,
"map_drb345_to_drbx": True,
"verbose_log": True,
}
Expand Down Expand Up @@ -140,7 +141,7 @@ def __del__(self):
self.db_connection.close()

@functools.lru_cache(maxsize=max_cache_size)
def redux(self, allele: str, redux_type: VALID_REDUCTION_TYPES) -> str:
def redux(self, allele: str, redux_type: VALID_REDUCTION_TYPES, reping=True) -> str:
"""
Does ARS reduction with allele and ARS type
Expand Down Expand Up @@ -172,6 +173,13 @@ def redux(self, allele: str, redux_type: VALID_REDUCTION_TYPES) -> str:
if allele.endswith(("P", "G")):
if redux_type in ["lg", "lgx", "G"]:
allele = allele[:-1]
if self._config["ping"] and reping:
if redux_type in ("lg", "lgx", "U2"):
if allele in self.ars_mappings.p_not_g:
return self.ars_mappings.p_not_g[allele]
else:
return self.redux(allele, redux_type, False)

if redux_type == "G" and allele in self.ars_mappings.g_group:
if allele in self.ars_mappings.dup_g:
return self.ars_mappings.dup_g[allele]
Expand Down
6 changes: 6 additions & 0 deletions tests/environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,9 @@

def before_all(context):
    """behave hook: build the shared ARD instances used by all scenarios."""
    # default ARD instance, IMGT/HLA version 3.44.0
    context.ard = ARD("3440", data_dir="/tmp/py-ard")

    # a second ARD instance with P-not-G ("ping") mode enabled
    ping_config = {"ping": True}
    context.ard_ping = ARD("3440", data_dir="/tmp/py-ard", config=ping_config)
23 changes: 20 additions & 3 deletions tests/features/allele.feature
Original file line number Diff line number Diff line change
@@ -1,6 +1,20 @@
Feature: Alleles

Scenario Outline:
Scenario Outline: allele reduction with ping

Given the allele as <Allele>
When reducing on the <Level> level with ping
Then the reduced allele is found to be <Redux Allele>

Examples:
| Allele | Level | Redux Allele |
| C*02:02 | lg | C*02:02g |
| C*02:02 | lgx | C*02:02 |
| C*02:10 | lg | C*02:02g |
| C*02:10 | lgx | C*02:02 |
| C*06:17 | lgx | C*06:02 |

Scenario Outline: allele reduction

Given the allele as <Allele>
When reducing on the <Level> level
Expand All @@ -21,5 +35,8 @@ Feature: Alleles

| DRB1*14:06:01 | lgx | DRB1*14:06 |
| DRB1*14:06:01 | lg | DRB1*14:06g |
| C*02:02 | lg | C*02:02g/C*02:10g |
| C*02:02 | lgx | C*02:02/C*02:10 |
| C*02:02 | lg | C*02:02g |
| C*02:02 | lgx | C*02:02 |
| C*02:10 | lg | C*02:02g |
| C*02:10 | lgx | C*02:02 |
| C*06:17 | lgx | C*06:17 |
6 changes: 6 additions & 0 deletions tests/steps/redux_allele.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,12 @@ def step_impl(context, level):
context.redux_allele = context.ard.redux(context.allele, level)


@when("reducing on the {level} level with ping")
def step_impl(context, level):
context.level = level
context.redux_allele = context.ard_ping.redux(context.allele, level)


@when("reducing on the {level} level (ambiguous)")
def step_impl(context, level):
context.level = level
Expand Down
2 changes: 1 addition & 1 deletion tests/test_pyard.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,4 +156,4 @@ def test_allele_duplicated(self):
# https://github.com/nmdp-bioinformatics/py-ard/issues/135
allele_code = "C*02:ACMGS"
allele_code_rx = self.ard.redux_gl(allele_code, "lgx")
self.assertEqual(allele_code_rx, "C*02:02/C*02:10")
self.assertEqual(allele_code_rx, "C*02:02")

0 comments on commit 0952a16

Please sign in to comment.