From 3e195d9769df0187258cfe1f666d02f9d41303a8 Mon Sep 17 00:00:00 2001 From: Arjun Arkal Rao Date: Tue, 15 May 2018 18:24:10 -0700 Subject: [PATCH] Fix event with inconsistency in normals with many IARS (resolves #284) resolves #284 We now print a message and return 'N' * peplen --- src/protect/binding_prediction/common.py | 64 +++++++++++++----------- 1 file changed, 34 insertions(+), 30 deletions(-) diff --git a/src/protect/binding_prediction/common.py b/src/protect/binding_prediction/common.py index 307c666a..3bead87e 100644 --- a/src/protect/binding_prediction/common.py +++ b/src/protect/binding_prediction/common.py @@ -329,7 +329,7 @@ def pept_diff(p1, p2): return sum([p1[i] != p2[i] for i in range(len(p1))]) -def _get_normal_peptides(mhc_df, iars, peplen): +def _get_normal_peptides(job, mhc_df, iars, peplen): """ Get the corresponding normal peptides for the tumor peptides that have already been subjected to mhc:peptide binding prediction. @@ -345,38 +345,42 @@ def _get_normal_peptides(mhc_df, iars, peplen): for pred in mhc_df.itertuples(): containing_iars = [i for i, sl in iars.items() if pred.pept in sl[0]] assert len(containing_iars) != 0, "No IARS contained the peptide" - if len(containing_iars) > 1: - # If there are multiple IARs, they all or none of them have to have a corresponding - # normal. - assert len(set([len(y) for x, y in iars.items() if x in containing_iars])) == 1 if len(iars[containing_iars[0]]) == 1: # This is a fusion and has no corresponding normal normal_peptides.append('N' * peplen) else: - tum, norm = iars[containing_iars[0]] - pos = tum.find(pred.pept) - temp_normal_pept = norm[pos:pos + peplen] - ndiff = pept_diff(pred.pept, temp_normal_pept) - assert ndiff != 0 - if ndiff == 1: - normal_peptides.append(norm[pos:pos + peplen]) + # If there are multiple IARs, they all or none of them have to have a corresponding + # normal.
+ if len(set([len(y) for x, y in iars.items() if x in containing_iars])) != 1: + job.fileStore.logToMaster('Some IARS were found to contain the substring but were' + 'inconsistent with the presence of a corresponding ' + 'normal.') + normal_peptides.append('N' * peplen) else: - if len(tum) == len(norm): - # Too (2+) many single nucleotide changes to warrant having a normal counterpart - # This might be an artifact - normal_peptides.append('N' * peplen) + tum, norm = iars[containing_iars[0]] + pos = tum.find(pred.pept) + temp_normal_pept = norm[pos:pos + peplen] + ndiff = pept_diff(pred.pept, temp_normal_pept) + assert ndiff != 0 + if ndiff == 1: + normal_peptides.append(norm[pos:pos + peplen]) else: - # There is an indel in play. The difference cannot be in the last AA as that - # would have come out properly in the first case. There is a possibility that - # the indel was in the first AA causing a shift. We can handle that by looking - # at the suffix. - pos = norm.find(pred.pept[1:]) - if pos != -1: - # The suffix was found, - normal_peptides.append(norm[pos-1:pos + peplen]) - else: - # The indel was too large to warrant having a normal counterpart + if len(tum) == len(norm): + # Too (2+) many single nucleotide changes to warrant having a normal + # counterpart. This might be an artifact normal_peptides.append('N' * peplen) + else: + # There is an indel in play. The difference cannot be in the last AA as that + # would have come out properly in the first case. There is a possibility + # that the indel was in the first AA causing a shift. We can handle that by + # looking at the suffix. 
+ pos = norm.find(pred.pept[1:]) + if pos != -1: + # The suffix was found, + normal_peptides.append(norm[pos-1:pos + peplen]) + else: + # The indel was too large to warrant having a normal counterpart + normal_peptides.append('N' * peplen) mhc_df['normal_pept'] = normal_peptides return mhc_df, normal_peptides @@ -417,7 +421,7 @@ def predict_normal_binding(job, binding_result, transgened_files, allele, peplen 'predictor': None} elif predictor == 'Consensus': results = _process_consensus_mhcii(mhc_file) - results, peptides = _get_normal_peptides(results, iars, peplen) + results, peptides = _get_normal_peptides(job, results, iars, peplen) with open('peptides.faa', 'w') as pfile: for pept in peptides: print('>', pept, '\n', pept, sep='', file=pfile) @@ -431,7 +435,7 @@ def predict_normal_binding(job, binding_result, transgened_files, allele, peplen 'predictor': 'Consensus'} elif predictor == 'Sturniolo': results = _process_sturniolo_mhcii(mhc_file) - results, peptides = _get_normal_peptides(results, iars, peplen) + results, peptides = _get_normal_peptides(job, results, iars, peplen) with open('peptides.faa', 'w') as pfile: for pept in peptides: print('>', pept, '\n', pept, sep='', file=pfile) @@ -445,7 +449,7 @@ def predict_normal_binding(job, binding_result, transgened_files, allele, peplen 'predictor': 'Sturniolo'} elif predictor == 'netMHCIIpan': results = _process_net_mhcii(mhc_file) - results, peptides = _get_normal_peptides(results, iars, peplen) + results, peptides = _get_normal_peptides(job, results, iars, peplen) with open('peptides.faa', 'w') as pfile: for pept in peptides: print('>', pept, '\n', pept, sep='', file=pfile) @@ -465,7 +469,7 @@ def predict_normal_binding(job, binding_result, transgened_files, allele, peplen mhc_file = job.fileStore.readGlobalFile(binding_result, os.path.join(work_dir, 'mhci_results')) results = _process_mhci(mhc_file) - results, peptides = _get_normal_peptides(results, iars, peplen) + results, peptides = 
_get_normal_peptides(job, results, iars, peplen) with open('peptides.faa', 'w') as pfile: for pept in peptides: print('>', pept, '\n', pept, sep='', file=pfile)