From 30039c999f069a307c1a95b535c037513220307f Mon Sep 17 00:00:00 2001 From: Pradeep Bashyal Date: Wed, 21 Aug 2024 17:25:13 -0500 Subject: [PATCH] Batch Reductions Update: - `homozygosify_glstring`: When creating a GL String, if one of the two typ alleles (typ1/typ2) is not available, homozygosify using the other typ allele. - `suppress_reduced_locus_column`: When creating a GL String, you may not want the reduced locus columns in the output; this option suppresses them. --- extras/reduce_conf.json | 2 ++ extras/reduce_conf_glstring.json | 2 ++ extras/sample.csv | 2 +- extras/sample_glstring.csv | 2 +- scripts/pyard-reduce-csv | 32 +++++++++++++++++++++++++++----- 5 files changed, 33 insertions(+), 7 deletions(-) diff --git a/extras/reduce_conf.json b/extras/reduce_conf.json index 8e83d32..eacad7b 100644 --- a/extras/reduce_conf.json +++ b/extras/reduce_conf.json @@ -123,6 +123,8 @@ "new_column_for_redux": true, "reduced_column_prefix": "reduced_", "generate_glstring": true, + "homozygosify_glstring": true, + "suppress_reduced_locus_column": true, "output_file_format": "csv", "apply_compression": "gzip", "verbose_log": true diff --git a/extras/reduce_conf_glstring.json b/extras/reduce_conf_glstring.json index 24c8d9b..e89e635 100644 --- a/extras/reduce_conf_glstring.json +++ b/extras/reduce_conf_glstring.json @@ -27,6 +27,8 @@ "new_column_for_redux": true, "reduced_column_prefix": "reduced_", "generate_glstring": true, + "homozygosify_glstring": true, + "suppress_reduced_locus_column": true, "output_file_format": "csv", "apply_compression": "gzip", "verbose_log": true diff --git a/extras/sample.csv b/extras/sample.csv index 75d3f3f..9de2567 100644 --- a/extras/sample.csv +++ b/extras/sample.csv @@ -1,4 +1,4 @@ 
rid,did,r_a_typ1,r_a_typ2,r_b_typ1,r_b_typ2,r_c_typ1,r_c_typ2,r_drb1_typ1,r_drb1_typ2,r_dpb1_typ1,r_dpb1_typ2,d_a_typ1,d_a_typ2,d_b_typ1,d_b_typ2,d_c_typ1,d_c_typ2,d_drb1_typ1,d_drb1_typ2,d_dpb1_typ1,d_dpb1_typ2,r_drb3_typ1,r_drb3_typ2,r_drb4_typ1,r_drb4_typ2,r_drb5_typ1,r_drb5_typ2,d_drb3_typ1,d_drb3_typ2,d_drb4_typ1,d_drb4_typ2,d_drb5_typ1,d_drb5_typ2 -2110,123,A*01:AB,A*29:79,B*18:67,B*51:275,C*05:01:19,C*02:85:02,DRB1*03:03,DRB1*14:144,DPB1*193:01:01,DPB1*582:01:01,A*01:AB,A*29:79,B*18:67,B*51:275,C*05:01:19,C*02:85:02,DRB1*03:03,DRB1*14:144,DPB1*193:01:01,DPB1*582:01:01,DRB3*02:189,DRB3*03:09,NNNN,NNNN,NNNN,NNNN,NNNN,NNNN,NNNN,NNNN,DRB5*01:93,DRB5*02:02:01 +2110,123,A*01:AB,,B*18:67,B*51:275,C*05:01:19,C*02:85:02,DRB1*03:03,DRB1*14:144,DPB1*193:01:01,DPB1*582:01:01,A*01:AB,A*29:79,B*18:67,B*51:275,C*05:01:19,C*02:85:02,DRB1*03:03,DRB1*14:144,DPB1*193:01:01,DPB1*582:01:01,DRB3*02:189,DRB3*03:09,NNNN,NNNN,NNNN,NNNN,NNNN,NNNN,NNNN,NNNN,DRB5*01:93,DRB5*02:02:01 2111,456,A*01:01:42,A*30:12:02,B*44:02:32,B*35:42,C*03:148,C*04:322,DRB1*13:01:16,DRB1*15:80N,DPB1*914:01:01,DPB1*278:01:01,A*01:01:42,A*30:12:02,B*44:02:32,B*35:42,C*03:148,C*04:322,DRB1*13:01:16,DRB1*15:80N,DPB1*914:01:01,DPB1*278:01:01,NNNN,NNNN,DRB4*01:53,DRB4*01:31,NNNN,NNNN,NNNN,NNNN,NNNN,NNNN,DRB5*01:102,DRB5*01:103 2113,789,A*02:247,A*03:227,B*15:570,B*07:02:01:17,C*16:01:10,C*06:102,DRB1*13:156,DRB1*14:167:01,DPB1*405:01:01,DPB1*479:01:01,A*02:247,A*03:227,B*15:570,B*07:02:01:17,C*16:01:10,C*06:102,DRB1*13:156,DRB1*14:167:01,DPB1*405:01:01,DPB1*479:01:01,NNNN,NNNN,DRB4*01:79,DRB4*01:119,NNNN,NNNN,DRB3*02:189,DRB3*03:09,NNNN,NNNN,NNNN,NNNN diff --git a/extras/sample_glstring.csv b/extras/sample_glstring.csv index 0e96dfc..8a10387 100644 --- a/extras/sample_glstring.csv +++ b/extras/sample_glstring.csv @@ -1,3 +1,3 @@ rid,did,recip_gl,donor_gl -123,456,A*02:GNF+A*03:XYZ^B*07:ABD+B*44:AWA,A*02:01:01+A*03:01:01^B*07:RVXR+B*44:XYAG 
+123,456,A*02:GNF+A*03:AB^B*07:ABD+B*44:AWA,A*02:01:01+A*03:01:01^B*07:RVXR+B*44:XYAG 789,345,A*01:TUS+A*24:02:01G^B*08:ARGR+B*08:ARGS,A*02:01:01+A*01:PXTD^B*51:01:01G+B*40:BWUP diff --git a/scripts/pyard-reduce-csv b/scripts/pyard-reduce-csv index 6dc45e5..dedec87 100755 --- a/scripts/pyard-reduce-csv +++ b/scripts/pyard-reduce-csv @@ -231,15 +231,17 @@ def reduce_locus_columns(df, ard_config, locus_column_mapping, verbose): for locus in locus_column_mapping[subject]: slug_column = locus + "_slug" slug_columns.append(slug_column) - if len(locus_column_mapping[subject][locus]) > 1: - df[slug_column] = ( - df[locus_column_mapping[subject][locus][0]] - + "+" - + df[locus_column_mapping[subject][locus][1]] + locus_typ_pair = locus_column_mapping[subject][locus] + if len(locus_typ_pair) > 1: + df[slug_column] = df[locus_typ_pair].apply( + create_reduced_slug, axis=1 ) else: df[slug_column] = df[locus_column_mapping[subject][locus][0]] + if ard_config.get("suppress_reduced_locus_column"): + df.drop(columns=locus_typ_pair, inplace=True) + df[subject + "_gl"] = df[slug_columns].agg("^".join, axis=1) df[subject + "_gl"] = df[subject + "_gl"].apply( lambda gl: gl.replace("^+", "") @@ -247,6 +249,26 @@ def reduce_locus_columns(df, ard_config, locus_column_mapping, verbose): df.drop(columns=slug_columns, inplace=True) +def create_reduced_slug(locus_typ1_typ2_pair): + typ1 = locus_typ1_typ2_pair.iloc[0] + typ2 = locus_typ1_typ2_pair.iloc[1] + + if not typ1 and not typ2: + return "" + + if typ1 and typ2: + return typ1 + "+" + typ2 + elif ard_config.get("homozygosify_glstring"): + if typ1: + return typ1 + "+" + typ1 + if typ2: + return typ2 + "+" + typ2 + else: + if typ2: + return typ2 + return typ1 + + def reduce_glstring(glstring: str) -> str: try: return ard.redux(glstring, ard_config["redux_type"])