From b1db899502d5d9e05915c1c09171482e92dbe601 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Mon, 10 Jul 2023 12:00:55 -0400 Subject: [PATCH 1/6] Adding print to see what is getting captured --- iAnnotateSV/AnnotationForKinaseDomain.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/iAnnotateSV/AnnotationForKinaseDomain.py b/iAnnotateSV/AnnotationForKinaseDomain.py index 26218cd..6c63d1f 100644 --- a/iAnnotateSV/AnnotationForKinaseDomain.py +++ b/iAnnotateSV/AnnotationForKinaseDomain.py @@ -128,6 +128,10 @@ def run(svDFA, refPath, ctPath, allctPath, upPath, verbose): def processData(chrom, transcript, refDF, upDF): transcripts = (refDF[refDF['name'] == transcript]) if (len(transcripts) > 1): + print(transcript,"\n") + print(chrom,"\n") + print(transcripts[transcripts['chrom'] == chrom].index,"\n") + print(refDF[refDF['name'] == transcript].index,"\n") transcriptIdx, = (transcripts[transcripts['chrom'] == chrom].index) else: try: From 65d62e2cc727f51c304e2627dfcf4daec4e9d482 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Mon, 10 Jul 2023 12:42:05 -0400 Subject: [PATCH 2/6] Update AnnotationForKinaseDomain.py --- iAnnotateSV/AnnotationForKinaseDomain.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/iAnnotateSV/AnnotationForKinaseDomain.py b/iAnnotateSV/AnnotationForKinaseDomain.py index 6c63d1f..33f51d4 100644 --- a/iAnnotateSV/AnnotationForKinaseDomain.py +++ b/iAnnotateSV/AnnotationForKinaseDomain.py @@ -132,10 +132,10 @@ def processData(chrom, transcript, refDF, upDF): print(chrom,"\n") print(transcripts[transcripts['chrom'] == chrom].index,"\n") print(refDF[refDF['name'] == transcript].index,"\n") - transcriptIdx, = (transcripts[transcripts['chrom'] == chrom].index) + transcriptIdx, *addIdx = (transcripts[transcripts['chrom'] == chrom].index) else: try: - transcriptIdx, = (refDF[refDF['name'] == transcript].index) + transcriptIdx, *addIdx = (refDF[refDF['name'] == transcript].index) except ValueError: return (None, None, None) From 02c155bf47f9be9cc11bcd9109d4336cbf227a77 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Mon, 10 Jul 2023 13:01:29 -0400 Subject: [PATCH 3/6] Update AnnotationForKinaseDomain.py --- iAnnotateSV/AnnotationForKinaseDomain.py | 183 +++++++++-------------- 1 file changed, 69 insertions(+), 114 deletions(-) diff --git a/iAnnotateSV/AnnotationForKinaseDomain.py b/iAnnotateSV/AnnotationForKinaseDomain.py index 33f51d4..3560fd4 100644 --- a/iAnnotateSV/AnnotationForKinaseDomain.py +++ b/iAnnotateSV/AnnotationForKinaseDomain.py @@ -3,6 +3,8 @@ @Ronak Shah ''' + +import contextlib import os import sys import pandas as pd @@ -55,16 +57,8 @@ def run(svDFA, refPath, ctPath, allctPath, upPath, verbose): "iAnnotateSV::AnnotateForKinaseDomain: Checking Entry %d in Uniprot data", count) chr1 = str(row.loc['chr1']) chr2 = str(row.loc['chr2']) - if(chr1.startswith('chr')): - chr1 = chr1 - else: - chr1 = "chr" + chr1 - if(chr2.startswith('chr')): - chr2 = chr2 - else: - chr2 = "chr" + chr2 - pos1 = int(row.loc['pos1']) - pos2 = int(row.loc['pos2']) + chr1 = chr1 if (chr1.startswith('chr')) else f"chr{chr1}" + chr2 = chr2 if (chr2.startswith('chr')) else f"chr{chr2}" gene1 = str(row.loc['gene1']) gene2 = str(row.loc['gene2']) site1 = str(row.loc['site1']) @@ -93,22 +87,22 @@ def run(svDFA, refPath, ctPath, allctPath, upPath, verbose): kanno1 = None kanno2 = None - if(fusion != "-"): - # First Gene +, Second Gene - - fusionevent = re.search(r'\{(.*)\}', fusion) - if(fusionevent): - eventType = fusionevent.group(1) - if(":" in eventType): + if (fusion != "-"): + if fusionevent := re.search(r'\{(.*)\}', fusion): + eventType = fusionevent[1] + if (":" in eventType): # print fusion, fusionevent, eventType (egene1, egene2) = (str(eventType)).split(":") - if(transcript1): + if transcript1: + pos1 = int(row.loc['pos1']) kanno1 = getKinaseInfo( chr1, pos1, gene1, egene1, egene2, transcript1, refDF, upDF) else: kanno1 = None - if(transcript2): + if transcript2: + pos2 = int(row.loc['pos2']) kanno2 = getKinaseInfo( chr2, pos2, gene2, egene1, egene2, transcript2, refDF, upDF) else: @@ -132,10 +126,10 @@ def processData(chrom, transcript, refDF, upDF): print(chrom,"\n") print(transcripts[transcripts['chrom'] == chrom].index,"\n") print(refDF[refDF['name'] == transcript].index,"\n") - transcriptIdx, *addIdx = (transcripts[transcripts['chrom'] == chrom].index) + transcriptIdx = getValueOrDefault(transcripts[transcripts['chrom'] == chrom].index,0) else: try: - transcriptIdx, *addIdx = (refDF[refDF['name'] == transcript].index) + transcriptIdx = getValueOrDefault(refDF[refDF['name'] == transcript].index,0) except ValueError: return (None, None, None) @@ -149,27 +143,19 @@ def processData(chrom, transcript, refDF, upDF): # print upDF.iloc[index],"\n" chromStart = upDF.iloc[index]['chromStart'] chromEnd = upDF.iloc[index]['chromEnd'] - if ((chromStart >= refTxSt) and (chromEnd <= refTxEn)): - # print "Chr" , chromStart,chromEnd, refTxSt, refTxEn,"\n" - if (upDF.iloc[index]['annotationType'] == 'domain'): - up_recordIndex.append(index) + if (chromStart >= refTxSt) and (chromEnd <= refTxEn) and upDF.iloc[index]['annotationType'] == 'domain': + up_recordIndex.append(index) allMaxVal = [] allMinVal = [] - for index, val in enumerate(up_recordIndex): + for val in up_recordIndex: chromStart = upDF.iloc[val]['chromStart'] chromEnd = upDF.iloc[val]['chromEnd'] maxVal = max(refTxEn, chromEnd) allMaxVal.append(maxVal) minVal = min(refTxSt, chromStart) allMinVal.append(minVal) - if (allMaxVal): - max_len = max(allMaxVal) - else: - max_len = refTxEn - if (allMinVal): - min_len = max(allMinVal) - else: - min_len = refTxSt + max_len = max(allMaxVal, default=refTxEn) + min_len = max(allMinVal, default=refTxSt) return (up_recordIndex, max_len, min_len) @@ -180,111 +166,80 @@ def getKinaseInfo(chrom, pos, gene, egene1, egene2, transcript, refDF, upDF): strand = refDF.strand[refDF.name[refDF.name == transcript].index.tolist()[ 0]] #kanno = None - if(strand == "+"): - if(egene1 == gene): + if (strand == "+"): + if (egene1 == gene): # print "Here1" # See if Kinase occurs after the breakpoint or within the breakpoint - for index, val in enumerate(domainIdx): + for val in domainIdx: chromStart = upDF.iloc[val]['chromStart'] chromEnd = upDF.iloc[val]['chromEnd'] fname = upDF.iloc[val]['name'] - if("Protein kinase" in fname): + if ("Protein kinase" in fname): if (pos > chromEnd): - kanno = "Kinase Domain Included" + return "Kinase Domain Included" else: - if(chromStart <= pos): - if(pos <= chromEnd): - kanno = "Partial Kinase Domain Included" - else: - kanno = "Kinase Domain Not Included" - else: - if(chromEnd <= pos): - if(pos <= chromStart): - kanno = "Partial Kinase Domain Included" - else: - kanno = "Kinase Domain Not Included" - else: - kanno = "Kinase Domain Not Included" - # print gene, pos, chromStart, chromEnd, transcript, strand, kanno - return(kanno) - - if(egene2 == gene): + return ( + "Partial Kinase Domain Included" + if chromStart > pos + and chromEnd <= pos <= chromStart + or (chromStart <= pos) + else "Kinase Domain Not Included" + ) + if (egene2 == gene): # print "Here2" # See if Kinase occurs after the breakpoint or within the breakpoint - for index, val in enumerate(domainIdx): + for val in domainIdx: chromStart = upDF.iloc[val]['chromStart'] chromEnd = upDF.iloc[val]['chromEnd'] fname = upDF.iloc[val]['name'] - if("Protein kinase" in fname): - if(pos < chromStart): - kanno = "Kinase Domain Included" + if ("Protein kinase" in fname): + if (pos < chromStart): + return "Kinase Domain Included" else: - if(chromStart <= pos): - if(pos <= chromEnd): - kanno = "Partial Kinase Domain Included" - else: - kanno = "Kinase Domain Not Included" - else: - if(chromEnd <= pos): - if(pos <= chromStart): - kanno = "Partial Kinase Domain Included" - else: - kanno = "Kinase Domain Not Included" - else: - kanno = "Kinase Domain Not Included" - # print gene, pos, chromStart, chromEnd, transcript, strand, kanno - return(kanno) + return ( + "Partial Kinase Domain Included" + if (pos <= chromEnd) + else "Kinase Domain Not Included" + ) else: - if(egene1 == gene): + if (egene1 == gene): # print "Here3" # See if Kinase occurs after the breakpoint or within the breakpoint - for index, val in enumerate(domainIdx): + for val in domainIdx: chromStart = upDF.iloc[val]['chromStart'] chromEnd = upDF.iloc[val]['chromEnd'] fname = upDF.iloc[val]['name'] if ("Protein kinase" in fname): - if(pos < chromStart): - kanno = "Kinase Domain Included" + if (pos < chromStart): + return "Kinase Domain Included" else: - if(chromStart <= pos): - if(pos <= chromEnd): - kanno = "Partial Kinase Domain Included" - else: - kanno = "Kinase Domain Not Included" - else: - if(chromEnd <= pos): - if(pos <= chromStart): - kanno = "Partial Kinase Domain Included" - else: - kanno = "Kinase Domain Not Included" - else: - kanno = "Kinase Domain Not Included" - # print gene, pos, chromStart, chromEnd, transcript, strand, kanno - return(kanno) - - if(egene2 == gene): + return ( + "Partial Kinase Domain Included" + if (pos <= chromEnd) + else "Kinase Domain Not Included" + ) + if (egene2 == gene): # print "Here4" # See if Kinase occurs after the breakpoint or within the breakpoint - for index, val in enumerate(domainIdx): + for val in domainIdx: chromStart = upDF.iloc[val]['chromStart'] chromEnd = upDF.iloc[val]['chromEnd'] fname = upDF.iloc[val]['name'] - if("Protein kinase" in fname): - if(pos > chromEnd): - kanno = "Kinase Domain Included" + if ("Protein kinase" in fname): + if (pos > chromEnd): + return "Kinase Domain Included" else: - if(chromStart <= pos): - if(pos <= chromEnd): - kanno = "Partial Kinase Domain Included" - else: - kanno = "Kinase Domain Not Included" - else: - if(chromEnd <= pos): - if(pos <= chromStart): - kanno = "Partial Kinase Domain Included" - else: - kanno = "Kinase Domain Not Included" - else: - kanno = "Kinase Domain Not Included" - # print gene, pos, chromStart, chromEnd, transcript, strand, kanno - return(kanno) + return ( + "Partial Kinase Domain Included" + if chromStart > pos + and chromEnd <= pos <= chromStart + or (chromStart <= pos) + else "Kinase Domain Not Included" + ) + +def getValueOrDefault(value, index, default=None): + returnValue = default + + with contextlib.suppress(Exception): + returnValue = value[index] + return returnValue \ No newline at end of file From 6228763fd3a458eeae6c3cb1cb091509506f64ce Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Mon, 10 Jul 2023 13:03:57 -0400 Subject: [PATCH 4/6] Update AnnotationForKinaseDomain.py --- iAnnotateSV/AnnotationForKinaseDomain.py | 177 +++++++++++++++-------- 1 file changed, 116 insertions(+), 61 deletions(-) diff --git a/iAnnotateSV/AnnotationForKinaseDomain.py b/iAnnotateSV/AnnotationForKinaseDomain.py index 3560fd4..da5ded3 100644 --- a/iAnnotateSV/AnnotationForKinaseDomain.py +++ b/iAnnotateSV/AnnotationForKinaseDomain.py @@ -3,8 +3,6 @@ @Ronak Shah ''' - -import contextlib import os import sys import pandas as pd @@ -57,8 +55,16 @@ def run(svDFA, refPath, ctPath, allctPath, upPath, verbose): "iAnnotateSV::AnnotateForKinaseDomain: Checking Entry %d in Uniprot data", count) chr1 = str(row.loc['chr1']) chr2 = str(row.loc['chr2']) - chr1 = chr1 if (chr1.startswith('chr')) else f"chr{chr1}" - chr2 = chr2 if (chr2.startswith('chr')) else f"chr{chr2}" + if(chr1.startswith('chr')): + chr1 = chr1 + else: + chr1 = "chr" + chr1 + if(chr2.startswith('chr')): + chr2 = chr2 + else: + chr2 = "chr" + chr2 + pos1 = int(row.loc['pos1']) + pos2 = int(row.loc['pos2']) gene1 = str(row.loc['gene1']) gene2 = str(row.loc['gene2']) site1 = str(row.loc['site1']) @@ -87,22 +93,22 @@ def run(svDFA, refPath, ctPath, allctPath, upPath, verbose): kanno1 = None kanno2 = None - if (fusion != "-"): - if fusionevent := re.search(r'\{(.*)\}', fusion): - eventType = fusionevent[1] - if (":" in eventType): + if(fusion != "-"): + # First Gene +, Second Gene - + fusionevent = re.search(r'\{(.*)\}', fusion) + if(fusionevent): + eventType = fusionevent.group(1) + if(":" in eventType): # print fusion, fusionevent, eventType (egene1, egene2) = (str(eventType)).split(":") - if transcript1: - pos1 = int(row.loc['pos1']) + if(transcript1): kanno1 = getKinaseInfo( chr1, pos1, gene1, egene1, egene2, transcript1, refDF, upDF) else: kanno1 = None - if transcript2: - pos2 = int(row.loc['pos2']) + if(transcript2): kanno2 = getKinaseInfo( chr2, pos2, gene2, egene1, egene2, transcript2, refDF, upDF) else: @@ -143,19 +149,27 @@ def processData(chrom, transcript, refDF, upDF): # print upDF.iloc[index],"\n" chromStart = upDF.iloc[index]['chromStart'] chromEnd = upDF.iloc[index]['chromEnd'] - if (chromStart >= refTxSt) and (chromEnd <= refTxEn) and upDF.iloc[index]['annotationType'] == 'domain': - up_recordIndex.append(index) + if ((chromStart >= refTxSt) and (chromEnd <= refTxEn)): + # print "Chr" , chromStart,chromEnd, refTxSt, refTxEn,"\n" + if (upDF.iloc[index]['annotationType'] == 'domain'): + up_recordIndex.append(index) allMaxVal = [] allMinVal = [] - for val in up_recordIndex: + for index, val in enumerate(up_recordIndex): chromStart = upDF.iloc[val]['chromStart'] chromEnd = upDF.iloc[val]['chromEnd'] maxVal = max(refTxEn, chromEnd) allMaxVal.append(maxVal) minVal = min(refTxSt, chromStart) allMinVal.append(minVal) - max_len = max(allMaxVal, default=refTxEn) - min_len = max(allMinVal, default=refTxSt) + if (allMaxVal): + max_len = max(allMaxVal) + else: + max_len = refTxEn + if (allMinVal): + min_len = max(allMinVal) + else: + min_len = refTxSt return (up_recordIndex, max_len, min_len) @@ -166,80 +180,121 @@ def getKinaseInfo(chrom, pos, gene, egene1, egene2, transcript, refDF, upDF): strand = refDF.strand[refDF.name[refDF.name == transcript].index.tolist()[ 0]] #kanno = None - if (strand == "+"): - if (egene1 == gene): + if(strand == "+"): + if(egene1 == gene): # print "Here1" # See if Kinase occurs after the breakpoint or within the breakpoint - for val in domainIdx: + for index, val in enumerate(domainIdx): chromStart = upDF.iloc[val]['chromStart'] chromEnd = upDF.iloc[val]['chromEnd'] fname = upDF.iloc[val]['name'] - if ("Protein kinase" in fname): + if("Protein kinase" in fname): if (pos > chromEnd): - return "Kinase Domain Included" + kanno = "Kinase Domain Included" else: - return ( - "Partial Kinase Domain Included" - if chromStart > pos - and chromEnd <= pos <= chromStart - or (chromStart <= pos) - else "Kinase Domain Not Included" - ) - if (egene2 == gene): + if(chromStart <= pos): + if(pos <= chromEnd): + kanno = "Partial Kinase Domain Included" + else: + kanno = "Kinase Domain Not Included" + else: + if(chromEnd <= pos): + if(pos <= chromStart): + kanno = "Partial Kinase Domain Included" + else: + kanno = "Kinase Domain Not Included" + else: + kanno = "Kinase Domain Not Included" + # print gene, pos, chromStart, chromEnd, transcript, strand, kanno + return(kanno) + + if(egene2 == gene): # print "Here2" # See if Kinase occurs after the breakpoint or within the breakpoint - for val in domainIdx: + for index, val in enumerate(domainIdx): chromStart = upDF.iloc[val]['chromStart'] chromEnd = upDF.iloc[val]['chromEnd'] fname = upDF.iloc[val]['name'] - if ("Protein kinase" in fname): - if (pos < chromStart): - return "Kinase Domain Included" + if("Protein kinase" in fname): + if(pos < chromStart): + kanno = "Kinase Domain Included" else: - return ( - "Partial Kinase Domain Included" - if (pos <= chromEnd) - else "Kinase Domain Not Included" - ) + if(chromStart <= pos): + if(pos <= chromEnd): + kanno = "Partial Kinase Domain Included" + else: + kanno = "Kinase Domain Not Included" + else: + if(chromEnd <= pos): + if(pos <= chromStart): + kanno = "Partial Kinase Domain Included" + else: + kanno = "Kinase Domain Not Included" + else: + kanno = "Kinase Domain Not Included" + # print gene, pos, chromStart, chromEnd, transcript, strand, kanno + return(kanno) else: - if (egene1 == gene): + if(egene1 == gene): # print "Here3" # See if Kinase occurs after the breakpoint or within the breakpoint - for val in domainIdx: + for index, val in enumerate(domainIdx): chromStart = upDF.iloc[val]['chromStart'] chromEnd = upDF.iloc[val]['chromEnd'] fname = upDF.iloc[val]['name'] if ("Protein kinase" in fname): - if (pos < chromStart): - return "Kinase Domain Included" + if(pos < chromStart): + kanno = "Kinase Domain Included" else: - return ( - "Partial Kinase Domain Included" - if (pos <= chromEnd) - else "Kinase Domain Not Included" - ) - if (egene2 == gene): + if(chromStart <= pos): + if(pos <= chromEnd): + kanno = "Partial Kinase Domain Included" + else: + kanno = "Kinase Domain Not Included" + else: + if(chromEnd <= pos): + if(pos <= chromStart): + kanno = "Partial Kinase Domain Included" + else: + kanno = "Kinase Domain Not Included" + else: + kanno = "Kinase Domain Not Included" + # print gene, pos, chromStart, chromEnd, transcript, strand, kanno + return(kanno) + + if(egene2 == gene): # print "Here4" # See if Kinase occurs after the breakpoint or within the breakpoint - for val in domainIdx: + for index, val in enumerate(domainIdx): chromStart = upDF.iloc[val]['chromStart'] chromEnd = upDF.iloc[val]['chromEnd'] fname = upDF.iloc[val]['name'] - if ("Protein kinase" in fname): - if (pos > chromEnd): - return "Kinase Domain Included" + if("Protein kinase" in fname): + if(pos > chromEnd): + kanno = "Kinase Domain Included" else: - return ( - "Partial Kinase Domain Included" - if chromStart > pos - and chromEnd <= pos <= chromStart - or (chromStart <= pos) - else "Kinase Domain Not Included" - ) + if(chromStart <= pos): + if(pos <= chromEnd): + kanno = "Partial Kinase Domain Included" + else: + kanno = "Kinase Domain Not Included" + else: + if(chromEnd <= pos): + if(pos <= chromStart): + kanno = "Partial Kinase Domain Included" + else: + kanno = "Kinase Domain Not Included" + else: + kanno = "Kinase Domain Not Included" + # print gene, pos, chromStart, chromEnd, transcript, strand, kanno + return(kanno) def getValueOrDefault(value, index, default=None): returnValue = default - with contextlib.suppress(Exception): + try: returnValue = value[index] + except Exception: + pass + return returnValue \ No newline at end of file From 776d17a5a17a0cbe357945b9c0e04950c43074fe Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Mon, 10 Jul 2023 13:10:27 -0400 Subject: [PATCH 5/6] Update AnnotationForKinaseDomain.py --- iAnnotateSV/AnnotationForKinaseDomain.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/iAnnotateSV/AnnotationForKinaseDomain.py b/iAnnotateSV/AnnotationForKinaseDomain.py index da5ded3..1d63015 100644 --- a/iAnnotateSV/AnnotationForKinaseDomain.py +++ b/iAnnotateSV/AnnotationForKinaseDomain.py @@ -128,10 +128,6 @@ def run(svDFA, refPath, ctPath, allctPath, upPath, verbose): def processData(chrom, transcript, refDF, upDF): transcripts = (refDF[refDF['name'] == transcript]) if (len(transcripts) > 1): - print(transcript,"\n") - print(chrom,"\n") - print(transcripts[transcripts['chrom'] == chrom].index,"\n") - print(refDF[refDF['name'] == transcript].index,"\n") transcriptIdx = getValueOrDefault(transcripts[transcripts['chrom'] == chrom].index,0) else: try: From 1e845e94effb3138774963f825137b0fd73ca544 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Mon, 10 Jul 2023 13:40:15 -0400 Subject: [PATCH 6/6] Update AnnotationForKinaseDomain.py --- iAnnotateSV/AnnotationForKinaseDomain.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/iAnnotateSV/AnnotationForKinaseDomain.py b/iAnnotateSV/AnnotationForKinaseDomain.py index 1d63015..030f4da 100644 --- a/iAnnotateSV/AnnotationForKinaseDomain.py +++ b/iAnnotateSV/AnnotationForKinaseDomain.py @@ -293,4 +293,4 @@ def getValueOrDefault(value, index, default=None): except Exception: pass - return returnValue \ No newline at end of file + return returnValue