From aa1b40804cc4e51935db94a3e46b2a37f1ba78a3 Mon Sep 17 00:00:00 2001
From: mgiulini <mrcgiulini@gmail.com>
Date: Tue, 26 Mar 2024 12:33:52 +0100
Subject: [PATCH] added renumbering input pdb logic

---
 src/arctic3d/cli.py                      |  46 +++++---
 src/arctic3d/modules/input.py            |  25 ++--
 src/arctic3d/modules/interface_matrix.py |   7 +-
 src/arctic3d/modules/output.py           |   8 +-
 src/arctic3d/modules/pdb.py              | 140 ++++++++++++++++-------
 src/arctic3d/modules/sequence.py         |  82 ++++++++++---
 6 files changed, 222 insertions(+), 86 deletions(-)

diff --git a/src/arctic3d/cli.py b/src/arctic3d/cli.py
index a39567f..5772d97 100644
--- a/src/arctic3d/cli.py
+++ b/src/arctic3d/cli.py
@@ -8,7 +8,7 @@
 from arctic3d.modules.blast import run_blast
 from arctic3d.modules.clustering import filter_clusters
 from arctic3d.modules.cluster_interfaces import cluster_interfaces
-from arctic3d.modules.input import Input
+from arctic3d.modules.input import is_fasta, is_pdb, is_uniprot
 from arctic3d.modules.interface import (
     get_interface_residues,
     read_interface_residues,
@@ -25,14 +25,25 @@
 
 
 argument_parser = argparse.ArgumentParser()
+# three arguments: uniprot_id, fasta_file, pdb_file
 argument_parser.add_argument(
-    "input_arg",
-    help="",
+    "--input_uniprot",
+    help="Uniprot ID to search",
+)
+
+argument_parser.add_argument(
+    "--input_fasta",
+    help="FASTA file",
+)
+
+argument_parser.add_argument(
+    "--input_pdb",
+    help="input PDB file to be used",
 )
 
 argument_parser.add_argument(
     "--db",
-    help="",
+    help="database to use for blast search",
 )
 
 argument_parser.add_argument(
@@ -170,7 +181,9 @@ def maincli():
 
 
 def main(
-    input_arg,
+    input_uniprot,
+    input_fasta,
+    input_pdb,
     db,
     interface_file,
     out_partner,
@@ -192,22 +205,24 @@ def main(
     init_message = get_init_message()
     log.info(init_message)
     st_time = time.time()
-    inp = Input(input_arg)
+
+    # handling input files
     input_files = {}
-    # retrieve uniprot information
-    if inp.is_fasta():
-        input_files["fasta"] = Path(inp.arg)
+    if is_fasta(input_fasta):
+        input_files["fasta"] = Path(input_fasta)
         uniprot_id = run_blast(input_files["fasta"], db)
-    if inp.is_uniprot():
-        uniprot_id = inp.arg.upper()
-    if inp.is_pdb():
-        input_files["pdb"] = Path(inp.arg)
+    if is_pdb(input_pdb):
+        input_files["pdb"] = Path(input_pdb)
         if not interface_file:
             fasta_f = to_fasta(input_files["pdb"], temp=False)
             uniprot_id = run_blast(fasta_f.name, db)
         else:
             input_files["interface_file"] = Path(interface_file)
             uniprot_id = None
+    # 25/7/2023: if the pdb is provided, the blast uniprot_id can be
+    # overwritten by specifying the uniprot_id input argument
+    if is_uniprot(input_uniprot):
+        uniprot_id = input_uniprot.upper()
 
     # create output folder
     run_dir_path = create_output_folder(run_dir, uniprot_id)
@@ -256,10 +271,11 @@ def main(
 
     if interface_residues:
         # retrieve pdb file
-        if inp.is_pdb():
+        if "pdb" in input_files:
             if not interface_file:
                 log.warning(
-                    f"""Input pdb file submitted without interface file. Renumbering input pdb to match uniprot ID {uniprot_id}."""
+                    f"input pdb file submitted without interface file. "
+                    f"Renumbering input pdb to match uniprot ID {uniprot_id}"
                 )
             # interfaces will be filtered later
             filtered_interfaces = None
diff --git a/src/arctic3d/modules/input.py b/src/arctic3d/modules/input.py
index a168716..c757ba6 100644
--- a/src/arctic3d/modules/input.py
+++ b/src/arctic3d/modules/input.py
@@ -1,12 +1,19 @@
-class Input:
-    def __init__(self, input_arg):
-        self.arg = input_arg
+def is_fasta(input_fasta):
+    if input_fasta:
+        return input_fasta.endswith(".fasta")
+    else:
+        return False
 
-    def is_fasta(self):
-        return self.arg.endswith(".fasta")
 
-    def is_pdb(self):
-        return self.arg.endswith(".pdb")
+def is_pdb(input_pdb):
+    if input_pdb:
+        return input_pdb.endswith(".pdb")
+    else:
+        return False
 
-    def is_uniprot(self):
-        return len(self.arg.split(".")) == 1
+
+def is_uniprot(input_uniprot):
+    if input_uniprot:
+        return len(input_uniprot.split(".")) == 1
+    else:
+        return False
diff --git a/src/arctic3d/modules/interface_matrix.py b/src/arctic3d/modules/interface_matrix.py
index 43ac873..bcf7880 100644
--- a/src/arctic3d/modules/interface_matrix.py
+++ b/src/arctic3d/modules/interface_matrix.py
@@ -8,6 +8,8 @@
 import pandas as pd
 from scipy.spatial.distance import cdist
 
+from arctic3d.modules.sequence import LETTERS
+
 SIGMA = 1.9
 
 log = logging.getLogger("arctic3d.log")
@@ -239,7 +241,10 @@ def interface_matrix(interface_dict, pdb_path, int_cov_cutoff=0.7):
     if not os.path.exists(pdb_path):
         raise Exception(f"pdb_path {pdb_path} does not exist")
     mdu = mda.Universe(pdb_path)
-    pdb_resids = mdu.select_atoms("name CA").resids
+    pdb_resids = mdu.select_atoms(
+        f'protein and name CA and not icode in {" ".join(LETTERS)}'
+    ).resids
+    print(f"pdb_resids {pdb_resids}")
     retained_interfaces = filter_interfaces(
         interface_dict, pdb_resids, int_cov_cutoff
     )
diff --git a/src/arctic3d/modules/output.py b/src/arctic3d/modules/output.py
index 93a868e..062579b 100644
--- a/src/arctic3d/modules/output.py
+++ b/src/arctic3d/modules/output.py
@@ -229,9 +229,13 @@ def output_pdb(pdb_f, cl_residues_probs):
         with open(new_filename, "w") as wfile:
             for ln in original_content:
                 if ln.startswith("ATOM"):
-                    resid = int(ln[23:27].strip())
+                    resid = int(ln[23:26].strip())
+                    ins_code = ln[26]
                     new_beta = 0.00
-                    if resid in cl_residues_probs[cl_id].keys():
+                    if (
+                        resid in cl_residues_probs[cl_id].keys()
+                        and ins_code == " "
+                    ):
                         new_beta = (
                             st_beta
                             + (100 - st_beta) * cl_residues_probs[cl_id][resid]
diff --git a/src/arctic3d/modules/pdb.py b/src/arctic3d/modules/pdb.py
index fdcc076..ca98567 100644
--- a/src/arctic3d/modules/pdb.py
+++ b/src/arctic3d/modules/pdb.py
@@ -1,6 +1,5 @@
 import logging
 import os
-import shutil
 from pathlib import Path
 
 import jsonpickle
@@ -17,7 +16,7 @@
 
 from arctic3d.functions import make_request
 from arctic3d.modules.interface_matrix import filter_interfaces
-from arctic3d.modules.sequence import align_sequences, to_fasta
+from arctic3d.modules.sequence import cycle_alignment, LETTERS, to_fasta
 
 log = logging.getLogger("arctic3d.log")
 
@@ -423,6 +422,78 @@ def fetch_pdb_files(pdb_to_fetch, uniprot_id):
     return validated_pdb_and_cifs
 
 
+def write_renumbered_pdb(pdb_torenum, pdb_aln_string, uniprot_aln_string):
+    """
+    Writes down the renumbered pdb file based on the alignment between the
+     pdb and the uniprot sequence.
+
+    Insertions in the pdb sequence are marked with a letter in the insertion
+    code column (column 27).
+
+    Parameters
+    ----------
+    pdb_torenum : str or Path
+        input pdb file
+
+    pdb_renum : str or Path
+        output pdb file
+
+    pdb_aln_string : str
+        pdb sequence in the alignment
+
+    uniprot_aln_string : str
+        uniprot sequence in the alignment
+    """
+    pdb_renum = Path(f"{pdb_torenum.stem}-renum.pdb")
+    resid_idx, prev_resid, uniprot_resid = -1, -1, 1
+    uniprot_letter_idx = 0
+
+    file_content = ""
+    with open(pdb_renum, "w") as wfile:
+        with open(pdb_torenum) as rfile:
+            for ln in rfile:
+                if ln.startswith(RECORDS):
+                    resid = ln[22:26].strip()
+                    if resid != prev_resid:
+                        resid_idx += 1
+                        found_uniprot = False
+                        while found_uniprot is False:
+                            if pdb_aln_string[resid_idx] == "-":
+                                # unaligned uniprot residue: we skip it and
+                                # increment the uniprot_resid
+                                uniprot_resid += 1
+                                resid_idx += 1
+                            else:
+                                found_uniprot = True
+                        # pdb residue is not a gap (anymore)
+                        if uniprot_aln_string[resid_idx] == "-":
+                            # unaligned residue: we add a letter in the
+                            # insertion code column
+                            if uniprot_letter_idx > len(LETTERS) - 1:
+                                log.warning(
+                                    "A lot of gaps in the alignment..."
+                                    "reinitializing letters..."
+                                )
+                                uniprot_letter_idx = 0
+                            letter = LETTERS[uniprot_letter_idx]
+                            res_to_write = f"{uniprot_resid-1}{letter}"
+                            uniprot_letter_idx += 1
+                        else:
+                            # aligned residue, write down its uniprot residue
+                            res_to_write = f"{uniprot_resid} "
+                            uniprot_resid += 1
+                            uniprot_letter_idx = 0
+                    prev_resid = resid
+                    # renumbering takes place here
+                    n_spaces = 5 - len(res_to_write)
+                    resid_str = f"{' ' * n_spaces}{res_to_write} "
+                    file_content += f"{ln[:22]}{resid_str}{ln[28:]}"
+                else:
+                    file_content += f"{ln}"
+            wfile.write(file_content)
+    return pdb_renum
+
+
 def renumber_pdb_from_uniprot(pdb_f, uniprot_id):
     """
     Renumbers a pdb file based on the information coming from the corresponding
@@ -440,7 +511,9 @@ def renumber_pdb_from_uniprot(pdb_f, uniprot_id):
     out_pdb_renum : Path
         renumbered pdb file
     """
-    log.warning("Uniprot-based renumbering should not be completely trusted...")
+    log.warning(
+        "Uniprot-based renumbering should not be completely trusted..."
+    )
     url = f"{UNIPROT_API_URL}/{uniprot_id}"
     try:
         pdb_dict = make_request(url, None)
@@ -453,48 +526,33 @@ def renumber_pdb_from_uniprot(pdb_f, uniprot_id):
     fasta_f = to_fasta(pdb_f, temp=False)
     fasta_sequences = SeqIO.parse(open(fasta_f.name), "fasta")
 
-    # looping over sequences
-    max_id = -1.0
-    for fasta in fasta_sequences:
-        name, seq = fasta.id, str(fasta.seq)
-        aln_fname, top_aln = align_sequences(ref_seq, seq)
-        identity = str(top_aln).count("|") / float(min(len(ref_seq), len(seq)))
-
-        log.info(f"sequence {name} has identity {identity}")
-        if identity > max_id:
-            max_id = identity
-            max_id_chain = name.split("|")[1]
-            shutil.copy(aln_fname, f"{uniprot_id}.aln")
-    os.unlink(aln_fname)
-
+    aln_fname = f"{uniprot_id}.aln"
+    max_id_chain, max_id = cycle_alignment(fasta_sequences, ref_seq, aln_fname)
     # preprocess pdb before renumbering
     log.info(
-        f"Renumbering chain {max_id_chain} of {pdb_f} ({max_id} identity with {uniprot_id})"
+        f"Renumbering chain {max_id_chain} of {pdb_f}"
+        f" ({(max_id*100):.2f}% identity with {uniprot_id})"
+    )
+    if max_id < 0.9:
+        log.warning(
+            f"Identity between {max_id_chain} and {uniprot_id} lower than 90%"
+        )
+    pdb_torenum = selchain_pdb(pdb_f, max_id_chain)
+    pdb_numb_lines = open(f"{uniprot_id}.aln", "r").read().split("\n")
+    nlines_pdb = list(range(2, len(pdb_numb_lines), 4))
+    nlines_uniprot = list(range(0, len(pdb_numb_lines), 4))
+
+    pdb_aln_string = "".join(
+        [pdb_numb_lines[n].split()[2] for n in nlines_pdb]
+    )
+    uniprot_aln_string = "".join(
+        [pdb_numb_lines[n].split()[2] for n in nlines_uniprot]
+    )
+
+    pdb_renum = write_renumbered_pdb(
+        pdb_torenum, pdb_aln_string, uniprot_aln_string
     )
-    pdb_torenum = preprocess_pdb(pdb_f, max_id_chain)
-    pdb_numb = open(f"{uniprot_id}.aln", "r").read().split("\n")[2]
-    pdb_renum = Path(f"{pdb_torenum.stem}-renum.pdb")
 
-    # renumbering. Pdb is aligned to uniprot, so each letter in the alignment
-    # is positioned at the correct residue index (adjusted with + 1)
-    numbering_list = [n + 1 for n in range(len(pdb_numb)) if pdb_numb[n] != "-"]
-    resid_idx, prev_resid = -1, -1
-    file_content = ""
-    with open(pdb_renum, "w") as wfile:
-        with open(pdb_torenum) as rfile:
-            for ln in rfile:
-                if ln.startswith(RECORDS):
-                    resid = ln[22:26].strip()
-                    if resid != prev_resid:
-                        resid_idx += 1
-                    prev_resid = resid
-                    # renumbering takes place here
-                    n_spaces = 4 - len(str(numbering_list[resid_idx]))
-                    resid_str = f"{' ' * n_spaces}{numbering_list[resid_idx]} "
-                    file_content += f"{ln[:22]}{resid_str}{ln[27:]}"
-                else:
-                    file_content += f"{ln}"
-            wfile.write(file_content)
     os.unlink(pdb_torenum)
     out_pdb_renum = pdb_renum.rename(f"{pdb_f.stem}-{uniprot_id}.pdb")
     return out_pdb_renum
diff --git a/src/arctic3d/modules/sequence.py b/src/arctic3d/modules/sequence.py
index 8da889c..79b643f 100644
--- a/src/arctic3d/modules/sequence.py
+++ b/src/arctic3d/modules/sequence.py
@@ -1,26 +1,39 @@
+import logging
 import tempfile
 
 from Bio import Align
 from Bio.Align import substitution_matrices
 from pdbtools.pdb_tofasta import pdb_to_fasta
 
-
-def load_seq(fasta_file):
-    """
-    Load sequence.
-
-    Parameters
-    ----------
-    fasta_file : str
-        Fasta file.
-
-    Returns
-    -------
-    fasta_seq : str
-        Fasta sequence.
-
-    """
-    pass
+import os
+import shutil
+
+log = logging.getLogger("arctic3d.log")
+
+LETTERS = [
+    "A",
+        "B",
+        "C",
+        "D",
+        "E",
+        "F",
+        "G",
+        "H",
+        "I",
+        "K",
+        "L",
+        "M",
+        "N",
+        "P",
+        "Q",
+        "R",
+        "S",
+        "T",
+        "V",
+        "W",
+        "Y",
+        "Z",
+    ]
 
 
 def to_fasta(pdb_f, temp):
@@ -74,9 +87,42 @@ def align_sequences(seq1, seq2):
     """
     aln_fname = "blosum80.aln"
     aligner = Align.PairwiseAligner()
-    aligner.substitution_matrix = substitution_matrices.load("BLOSUM80")
+    aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")
     alns = aligner.align(seq1, seq2)
     top_aln = alns[0]
     with open(aln_fname, "w") as fh:
         fh.write(str(top_aln))
     return aln_fname, top_aln
+
+
+def cycle_alignment(fasta_sequences, ref_seq, output_aln_fname):
+    """
+    Perform pairwise alignment between a reference sequence and a list of
+     sequences.
+
+    Parameters
+    ----------
+    fasta_sequences : list
+        List of sequences
+
+    ref_seq : str
+        Reference sequence
+
+    output_aln_fname : str
+        Output alignment file name
+    """
+    # looping over sequences
+    max_id = -1.0
+    for fasta in fasta_sequences:
+        name, seq = fasta.id, str(fasta.seq)
+        aln_fname, top_aln = align_sequences(ref_seq, seq)
+        identity = str(top_aln).count("|") / float(min(len(ref_seq), len(seq)))
+        # compute percentage and logging
+        perc_identity = identity * 100
+        log.info(f"sequence {name} has {perc_identity:.2f}% sequence identity")
+        if identity > max_id:
+            max_id = identity
+            max_id_chain = name.split("|")[1]
+            shutil.copy(aln_fname, output_aln_fname)
+    os.unlink(aln_fname)
+    return max_id_chain, max_id