Skip to content

Commit

Permalink
added renumbering input pdb logic
Browse files Browse the repository at this point in the history
  • Loading branch information
mgiulini committed Mar 26, 2024
1 parent 2fda40a commit aa1b408
Show file tree
Hide file tree
Showing 6 changed files with 222 additions and 86 deletions.
46 changes: 31 additions & 15 deletions src/arctic3d/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from arctic3d.modules.blast import run_blast
from arctic3d.modules.clustering import filter_clusters
from arctic3d.modules.cluster_interfaces import cluster_interfaces
from arctic3d.modules.input import Input
from arctic3d.modules.input import is_fasta, is_pdb, is_uniprot
from arctic3d.modules.interface import (
get_interface_residues,
read_interface_residues,
Expand All @@ -25,14 +25,25 @@


argument_parser = argparse.ArgumentParser()
# three arguments: uniprot_id, fasta_file, pdb_file
argument_parser.add_argument(
"input_arg",
help="",
"--input_uniprot",
help="Uniprot ID to search",
)

argument_parser.add_argument(
"--input_fasta",
help="FASTA file",
)

argument_parser.add_argument(
"--input_pdb",
help="input PDB file to be used",
)

argument_parser.add_argument(
"--db",
help="",
help="database to use for blast search",
)

argument_parser.add_argument(
Expand Down Expand Up @@ -170,7 +181,9 @@ def maincli():


def main(
input_arg,
input_uniprot,
input_fasta,
input_pdb,
db,
interface_file,
out_partner,
Expand All @@ -192,22 +205,24 @@ def main(
init_message = get_init_message()
log.info(init_message)
st_time = time.time()
inp = Input(input_arg)

# handling input files
input_files = {}
# retrieve uniprot information
if inp.is_fasta():
input_files["fasta"] = Path(inp.arg)
if is_fasta(input_fasta):
input_files["fasta"] = Path(input_fasta)
uniprot_id = run_blast(input_files["fasta"], db)
if inp.is_uniprot():
uniprot_id = inp.arg.upper()
if inp.is_pdb():
input_files["pdb"] = Path(inp.arg)
if is_pdb(input_pdb):
input_files["pdb"] = Path(input_pdb)
if not interface_file:
fasta_f = to_fasta(input_files["pdb"], temp=False)
uniprot_id = run_blast(fasta_f.name, db)
else:
input_files["interface_file"] = Path(interface_file)
uniprot_id = None
# 25/7/2023: if the pdb is provided, the blast uniprot_id can be
# overwritten by specifying the uniprot_id input argument
if is_uniprot(input_uniprot):
uniprot_id = input_uniprot.upper()

# create output folder
run_dir_path = create_output_folder(run_dir, uniprot_id)
Expand Down Expand Up @@ -256,10 +271,11 @@ def main(

if interface_residues:
# retrieve pdb file
if inp.is_pdb():
if "pdb" in input_files:
if not interface_file:
log.warning(
f"""Input pdb file submitted without interface file. Renumbering input pdb to match uniprot ID {uniprot_id}."""
f"input pdb file submitted without interface file. "
f"Renumbering input pdb to match uniprot ID {uniprot_id}"
)
# interfaces will be filtered later
filtered_interfaces = None
Expand Down
25 changes: 16 additions & 9 deletions src/arctic3d/modules/input.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,19 @@
class Input:
def __init__(self, input_arg):
self.arg = input_arg
def is_fasta(input_fasta):
if input_fasta:
return input_fasta.endswith(".fasta")
else:
return False

def is_fasta(self):
return self.arg.endswith(".fasta")

def is_pdb(self):
return self.arg.endswith(".pdb")
def is_pdb(input_pdb):
if input_pdb:
return input_pdb.endswith(".pdb")
else:
return False

def is_uniprot(self):
return len(self.arg.split(".")) == 1

def is_uniprot(input_uniprot):
if input_uniprot:
return len(input_uniprot.split(".")) == 1
else:
return False
7 changes: 6 additions & 1 deletion src/arctic3d/modules/interface_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
import pandas as pd
from scipy.spatial.distance import cdist

from arctic3d.modules.sequence import LETTERS

SIGMA = 1.9

log = logging.getLogger("arctic3d.log")
Expand Down Expand Up @@ -239,7 +241,10 @@ def interface_matrix(interface_dict, pdb_path, int_cov_cutoff=0.7):
if not os.path.exists(pdb_path):
raise Exception(f"pdb_path {pdb_path} does not exist")
mdu = mda.Universe(pdb_path)
pdb_resids = mdu.select_atoms("name CA").resids
pdb_resids = mdu.select_atoms(
f'protein and name CA and not icode in {" ".join(LETTERS)}'
).resids
print(f"pdb_resids {pdb_resids}")
retained_interfaces = filter_interfaces(
interface_dict, pdb_resids, int_cov_cutoff
)
Expand Down
8 changes: 6 additions & 2 deletions src/arctic3d/modules/output.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,9 +229,13 @@ def output_pdb(pdb_f, cl_residues_probs):
with open(new_filename, "w") as wfile:
for ln in original_content:
if ln.startswith("ATOM"):
resid = int(ln[23:27].strip())
resid = int(ln[23:26].strip())
ins_code = ln[26]
new_beta = 0.00
if resid in cl_residues_probs[cl_id].keys():
if (
resid in cl_residues_probs[cl_id].keys()
and ins_code == " "
):
new_beta = (
st_beta
+ (100 - st_beta) * cl_residues_probs[cl_id][resid]
Expand Down
140 changes: 99 additions & 41 deletions src/arctic3d/modules/pdb.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import logging
import os
import shutil
from pathlib import Path

import jsonpickle
Expand All @@ -17,7 +16,7 @@

from arctic3d.functions import make_request
from arctic3d.modules.interface_matrix import filter_interfaces
from arctic3d.modules.sequence import align_sequences, to_fasta
from arctic3d.modules.sequence import cycle_alignment, LETTERS, to_fasta

log = logging.getLogger("arctic3d.log")

Expand Down Expand Up @@ -423,6 +422,78 @@ def fetch_pdb_files(pdb_to_fetch, uniprot_id):
return validated_pdb_and_cifs


def write_renumbered_pdb(pdb_torenum, pdb_aln_string, uniprot_aln_string):
"""
Writes down the renumbered pdb file based on the alignment between the
pdb and the uniprot sequence.
Insertions in the pdb sequence are marked with a letter in the insertion
code column (column 27).
Parameters
----------
pdb_torenum : str or Path
input pdb file
pdb_renum : str or Path
output pdb file
pdb_aln_string : str
pdb sequence in the alignment
uniprot_aln_string : str
uniprot sequence in the alignment
"""
pdb_renum = Path(f"{pdb_torenum.stem}-renum.pdb")
resid_idx, prev_resid, uniprot_resid = -1, -1, 1
uniprot_letter_idx = 0

file_content = ""
with open(pdb_renum, "w") as wfile:
with open(pdb_torenum) as rfile:
for ln in rfile:
if ln.startswith(RECORDS):
resid = ln[22:26].strip()
if resid != prev_resid:
resid_idx += 1
found_uniprot = False
while found_uniprot is False:
if pdb_aln_string[resid_idx] == "-":
# unaligned uniprot residue: we skip it and
# increment the uniprot_resid
uniprot_resid += 1
resid_idx += 1
else:
found_uniprot = True
# pdb residue is not a gap (anymore)
if uniprot_aln_string[resid_idx] == "-":
# unaligned residue: we add a letter in the
# insertion code column
if uniprot_letter_idx > len(LETTERS) - 1:
log.warning(
"A lot of gaps in the alignment..."
"reinitializing letters..."
)
uniprot_letter_idx = 0
letter = LETTERS[uniprot_letter_idx]
res_to_write = f"{uniprot_resid-1}{letter}"
uniprot_letter_idx += 1
else:
# aligned residue, write down its uniprot residue
res_to_write = f"{uniprot_resid} "
uniprot_resid += 1
uniprot_letter_idx = 0
prev_resid = resid
# renumbering takes place here
n_spaces = 5 - len(res_to_write)
resid_str = f"{' ' * n_spaces}{res_to_write} "
file_content += f"{ln[:22]}{resid_str}{ln[28:]}"
else:
file_content += f"{ln}"
wfile.write(file_content)
return pdb_renum


def renumber_pdb_from_uniprot(pdb_f, uniprot_id):
"""
Renumbers a pdb file based on the information coming from the corresponding
Expand All @@ -440,7 +511,9 @@ def renumber_pdb_from_uniprot(pdb_f, uniprot_id):
out_pdb_renum : Path
renumbered pdb file
"""
log.warning("Uniprot-based renumbering should not be completely trusted...")
log.warning(
"Uniprot-based renumbering should not be completely trusted..."
)
url = f"{UNIPROT_API_URL}/{uniprot_id}"
try:
pdb_dict = make_request(url, None)
Expand All @@ -453,48 +526,33 @@ def renumber_pdb_from_uniprot(pdb_f, uniprot_id):
fasta_f = to_fasta(pdb_f, temp=False)
fasta_sequences = SeqIO.parse(open(fasta_f.name), "fasta")

# looping over sequences
max_id = -1.0
for fasta in fasta_sequences:
name, seq = fasta.id, str(fasta.seq)
aln_fname, top_aln = align_sequences(ref_seq, seq)
identity = str(top_aln).count("|") / float(min(len(ref_seq), len(seq)))

log.info(f"sequence {name} has identity {identity}")
if identity > max_id:
max_id = identity
max_id_chain = name.split("|")[1]
shutil.copy(aln_fname, f"{uniprot_id}.aln")
os.unlink(aln_fname)

aln_fname = f"{uniprot_id}.aln"
max_id_chain, max_id = cycle_alignment(fasta_sequences, ref_seq, aln_fname)
# preprocess pdb before renumbering
log.info(
f"Renumbering chain {max_id_chain} of {pdb_f} ({max_id} identity with {uniprot_id})"
f"Renumbering chain {max_id_chain} of {pdb_f}"
f" ({(max_id*100):.2f}% identity with {uniprot_id})"
)
if max_id < 0.9:
log.warning(
f"Identity between {max_id_chain} and {uniprot_id} lower than 90%"
)
pdb_torenum = selchain_pdb(pdb_f, max_id_chain)
pdb_numb_lines = open(f"{uniprot_id}.aln", "r").read().split("\n")
nlines_pdb = list(range(2, len(pdb_numb_lines), 4))
nlines_uniprot = list(range(0, len(pdb_numb_lines), 4))

pdb_aln_string = "".join(
[pdb_numb_lines[n].split()[2] for n in nlines_pdb]
)
uniprot_aln_string = "".join(
[pdb_numb_lines[n].split()[2] for n in nlines_uniprot]
)

pdb_renum = write_renumbered_pdb(
pdb_torenum, pdb_aln_string, uniprot_aln_string
)
pdb_torenum = preprocess_pdb(pdb_f, max_id_chain)
pdb_numb = open(f"{uniprot_id}.aln", "r").read().split("\n")[2]
pdb_renum = Path(f"{pdb_torenum.stem}-renum.pdb")

# renumbering. Pdb is aligned to uniprot, so each letter in the alignment
# is positioned at the correct residue index (adjusted with + 1)
numbering_list = [n + 1 for n in range(len(pdb_numb)) if pdb_numb[n] != "-"]
resid_idx, prev_resid = -1, -1
file_content = ""
with open(pdb_renum, "w") as wfile:
with open(pdb_torenum) as rfile:
for ln in rfile:
if ln.startswith(RECORDS):
resid = ln[22:26].strip()
if resid != prev_resid:
resid_idx += 1
prev_resid = resid
# renumbering takes place here
n_spaces = 4 - len(str(numbering_list[resid_idx]))
resid_str = f"{' ' * n_spaces}{numbering_list[resid_idx]} "
file_content += f"{ln[:22]}{resid_str}{ln[27:]}"
else:
file_content += f"{ln}"
wfile.write(file_content)
os.unlink(pdb_torenum)
out_pdb_renum = pdb_renum.rename(f"{pdb_f.stem}-{uniprot_id}.pdb")
return out_pdb_renum
Expand Down
Loading

0 comments on commit aa1b408

Please sign in to comment.