From aa1b40804cc4e51935db94a3e46b2a37f1ba78a3 Mon Sep 17 00:00:00 2001 From: mgiulini Date: Tue, 26 Mar 2024 12:33:52 +0100 Subject: [PATCH] added renumbering input pdb logic --- src/arctic3d/cli.py | 46 +++++--- src/arctic3d/modules/input.py | 25 ++-- src/arctic3d/modules/interface_matrix.py | 7 +- src/arctic3d/modules/output.py | 8 +- src/arctic3d/modules/pdb.py | 140 ++++++++++++++++------- src/arctic3d/modules/sequence.py | 82 ++++++++++--- 6 files changed, 222 insertions(+), 86 deletions(-) diff --git a/src/arctic3d/cli.py b/src/arctic3d/cli.py index a39567f..5772d97 100644 --- a/src/arctic3d/cli.py +++ b/src/arctic3d/cli.py @@ -8,7 +8,7 @@ from arctic3d.modules.blast import run_blast from arctic3d.modules.clustering import filter_clusters from arctic3d.modules.cluster_interfaces import cluster_interfaces -from arctic3d.modules.input import Input +from arctic3d.modules.input import is_fasta, is_pdb, is_uniprot from arctic3d.modules.interface import ( get_interface_residues, read_interface_residues, @@ -25,14 +25,25 @@ argument_parser = argparse.ArgumentParser() +# three arguments: uniprot_id, fasta_file, pdb_file argument_parser.add_argument( - "input_arg", - help="", + "--input_uniprot", + help="Uniprot ID to search", +) + +argument_parser.add_argument( + "--input_fasta", + help="FASTA file", +) + +argument_parser.add_argument( + "--input_pdb", + help="input PDB file to be used", ) argument_parser.add_argument( "--db", - help="", + help="database to use for blast search", ) argument_parser.add_argument( @@ -170,7 +181,9 @@ def maincli(): def main( - input_arg, + input_uniprot, + input_fasta, + input_pdb, db, interface_file, out_partner, @@ -192,22 +205,24 @@ def main( init_message = get_init_message() log.info(init_message) st_time = time.time() - inp = Input(input_arg) + + # handling input files input_files = {} - # retrieve uniprot information - if inp.is_fasta(): - input_files["fasta"] = Path(inp.arg) + if is_fasta(input_fasta): + input_files["fasta"] = Path(input_fasta) uniprot_id = run_blast(input_files["fasta"], db) - if inp.is_uniprot(): - uniprot_id = inp.arg.upper() - if inp.is_pdb(): - input_files["pdb"] = Path(inp.arg) + if is_pdb(input_pdb): + input_files["pdb"] = Path(input_pdb) if not interface_file: fasta_f = to_fasta(input_files["pdb"], temp=False) uniprot_id = run_blast(fasta_f.name, db) else: input_files["interface_file"] = Path(interface_file) uniprot_id = None + # 25/7/2023: if the pdb is provided, the blast uniprot_id can be + # overwritten by specifying the uniprot_id input argument + if is_uniprot(input_uniprot): + uniprot_id = input_uniprot.upper() # create output folder run_dir_path = create_output_folder(run_dir, uniprot_id) @@ -256,10 +271,11 @@ def main( if interface_residues: # retrieve pdb file - if inp.is_pdb(): + if "pdb" in input_files: if not interface_file: log.warning( - f"""Input pdb file submitted without interface file. Renumbering input pdb to match uniprot ID {uniprot_id}.""" + f"input pdb file submitted without interface file. " + f"Renumbering input pdb to match uniprot ID {uniprot_id}" ) # interfaces will be filtered later filtered_interfaces = None diff --git a/src/arctic3d/modules/input.py b/src/arctic3d/modules/input.py index a168716..c757ba6 100644 --- a/src/arctic3d/modules/input.py +++ b/src/arctic3d/modules/input.py @@ -1,12 +1,19 @@ -class Input: - def __init__(self, input_arg): - self.arg = input_arg +def is_fasta(input_fasta): + if input_fasta: + return input_fasta.endswith(".fasta") + else: + return False - def is_fasta(self): - return self.arg.endswith(".fasta") - def is_pdb(self): - return self.arg.endswith(".pdb") +def is_pdb(input_pdb): + if input_pdb: + return input_pdb.endswith(".pdb") + else: + return False - def is_uniprot(self): - return len(self.arg.split(".")) == 1 + +def is_uniprot(input_uniprot): + if input_uniprot: + return len(input_uniprot.split(".")) == 1 + else: + return False diff --git a/src/arctic3d/modules/interface_matrix.py b/src/arctic3d/modules/interface_matrix.py index 43ac873..bcf7880 100644 --- a/src/arctic3d/modules/interface_matrix.py +++ b/src/arctic3d/modules/interface_matrix.py @@ -8,6 +8,8 @@ import pandas as pd from scipy.spatial.distance import cdist +from arctic3d.modules.sequence import LETTERS + SIGMA = 1.9 log = logging.getLogger("arctic3d.log") @@ -239,7 +241,10 @@ def interface_matrix(interface_dict, pdb_path, int_cov_cutoff=0.7): if not os.path.exists(pdb_path): raise Exception(f"pdb_path {pdb_path} does not exist") mdu = mda.Universe(pdb_path) - pdb_resids = mdu.select_atoms("name CA").resids + pdb_resids = mdu.select_atoms( + f'protein and name CA and not icode in {" ".join(LETTERS)}' + ).resids + print(f"pdb_resids {pdb_resids}") retained_interfaces = filter_interfaces( interface_dict, pdb_resids, int_cov_cutoff ) diff --git a/src/arctic3d/modules/output.py b/src/arctic3d/modules/output.py index 93a868e..062579b 100644 --- a/src/arctic3d/modules/output.py +++ b/src/arctic3d/modules/output.py @@ -229,9 +229,13 @@ def output_pdb(pdb_f, cl_residues_probs): with open(new_filename, "w") as wfile: for ln in original_content: if ln.startswith("ATOM"): - resid = int(ln[23:27].strip()) + resid = int(ln[23:26].strip()) + ins_code = ln[26] new_beta = 0.00 - if resid in cl_residues_probs[cl_id].keys(): + if ( + resid in cl_residues_probs[cl_id].keys() + and ins_code == " " + ): new_beta = ( st_beta + (100 - st_beta) * cl_residues_probs[cl_id][resid] diff --git a/src/arctic3d/modules/pdb.py b/src/arctic3d/modules/pdb.py index fdcc076..ca98567 100644 --- a/src/arctic3d/modules/pdb.py +++ b/src/arctic3d/modules/pdb.py @@ -1,6 +1,5 @@ import logging import os -import shutil from pathlib import Path import jsonpickle @@ -17,7 +16,7 @@ from arctic3d.functions import make_request from arctic3d.modules.interface_matrix import filter_interfaces -from arctic3d.modules.sequence import align_sequences, to_fasta +from arctic3d.modules.sequence import cycle_alignment, LETTERS, to_fasta log = logging.getLogger("arctic3d.log") @@ -423,6 +422,78 @@ def fetch_pdb_files(pdb_to_fetch, uniprot_id): return validated_pdb_and_cifs +def write_renumbered_pdb(pdb_torenum, pdb_aln_string, uniprot_aln_string): + """ + Writes down the renumbered pdb file based on the alignment between the + pdb and the uniprot sequence. + + Insertions in the pdb sequence are marked with a letter in the insertion + code column (column 27). + + Parameters + ---------- + pdb_torenum : str or Path + input pdb file + + pdb_renum : str or Path + output pdb file + + pdb_aln_string : str + pdb sequence in the alignment + + uniprot_aln_string : str + uniprot sequence in the alignment + """ + pdb_renum = Path(f"{pdb_torenum.stem}-renum.pdb") + resid_idx, prev_resid, uniprot_resid = -1, -1, 1 + uniprot_letter_idx = 0 + + file_content = "" + with open(pdb_renum, "w") as wfile: + with open(pdb_torenum) as rfile: + for ln in rfile: + if ln.startswith(RECORDS): + resid = ln[22:26].strip() + if resid != prev_resid: + resid_idx += 1 + found_uniprot = False + while found_uniprot is False: + if pdb_aln_string[resid_idx] == "-": + # unaligned uniprot residue: we skip it and + # increment the uniprot_resid + uniprot_resid += 1 + resid_idx += 1 + else: + found_uniprot = True + # pdb residue is not a gap (anymore) + if uniprot_aln_string[resid_idx] == "-": + # unaligned residue: we add a letter in the + # insertion code column + if uniprot_letter_idx > len(LETTERS) - 1: + log.warning( + "A lot of gaps in the alignment..." + "reinitializing letters..." + ) + uniprot_letter_idx = 0 + letter = LETTERS[uniprot_letter_idx] + res_to_write = f"{uniprot_resid-1}{letter}" + uniprot_letter_idx += 1 + else: + # aligned residue, write down its uniprot residue + res_to_write = f"{uniprot_resid} " + uniprot_resid += 1 + uniprot_letter_idx = 0 + prev_resid = resid + # renumbering takes place here + n_spaces = 5 - len(res_to_write) + resid_str = f"{' ' * n_spaces}{res_to_write} " + file_content += f"{ln[:22]}{resid_str}{ln[28:]}" + else: + file_content += f"{ln}" + wfile.write(file_content) + return pdb_renum + + def renumber_pdb_from_uniprot(pdb_f, uniprot_id): """ Renumbers a pdb file based on the information coming from the corresponding @@ -440,7 +511,9 @@ def renumber_pdb_from_uniprot(pdb_f, uniprot_id): out_pdb_renum : Path renumbered pdb file """ - log.warning("Uniprot-based renumbering should not be completely trusted...") + log.warning( + "Uniprot-based renumbering should not be completely trusted..." + ) url = f"{UNIPROT_API_URL}/{uniprot_id}" try: pdb_dict = make_request(url, None) @@ -453,48 +526,33 @@ def renumber_pdb_from_uniprot(pdb_f, uniprot_id): fasta_f = to_fasta(pdb_f, temp=False) fasta_sequences = SeqIO.parse(open(fasta_f.name), "fasta") - # looping over sequences - max_id = -1.0 - for fasta in fasta_sequences: - name, seq = fasta.id, str(fasta.seq) - aln_fname, top_aln = align_sequences(ref_seq, seq) - identity = str(top_aln).count("|") / float(min(len(ref_seq), len(seq))) - - log.info(f"sequence {name} has identity {identity}") - if identity > max_id: - max_id = identity - max_id_chain = name.split("|")[1] - shutil.copy(aln_fname, f"{uniprot_id}.aln") - os.unlink(aln_fname) - + aln_fname = f"{uniprot_id}.aln" + max_id_chain, max_id = cycle_alignment(fasta_sequences, ref_seq, aln_fname) # preprocess pdb before renumbering log.info( - f"Renumbering chain {max_id_chain} of {pdb_f} ({max_id} identity with {uniprot_id})" + f"Renumbering chain {max_id_chain} of {pdb_f}" + f" ({(max_id*100):.2f}% identity with {uniprot_id})" + ) + if max_id < 0.9: + log.warning( + f"Identity between {max_id_chain} and {uniprot_id} lower than 90%" + ) + pdb_torenum = selchain_pdb(pdb_f, max_id_chain) + pdb_numb_lines = open(f"{uniprot_id}.aln", "r").read().split("\n") + nlines_pdb = list(range(2, len(pdb_numb_lines), 4)) + nlines_uniprot = list(range(0, len(pdb_numb_lines), 4)) + + pdb_aln_string = "".join( + [pdb_numb_lines[n].split()[2] for n in nlines_pdb] + ) + uniprot_aln_string = "".join( + [pdb_numb_lines[n].split()[2] for n in nlines_uniprot] + ) + + pdb_renum = write_renumbered_pdb( + pdb_torenum, pdb_aln_string, uniprot_aln_string ) - pdb_torenum = preprocess_pdb(pdb_f, max_id_chain) - pdb_numb = open(f"{uniprot_id}.aln", "r").read().split("\n")[2] - pdb_renum = Path(f"{pdb_torenum.stem}-renum.pdb") - # renumbering. Pdb is aligned to uniprot, so each letter in the alignment - # is positioned at the correct residue index (adjusted with + 1) - numbering_list = [n + 1 for n in range(len(pdb_numb)) if pdb_numb[n] != "-"] - resid_idx, prev_resid = -1, -1 - file_content = "" - with open(pdb_renum, "w") as wfile: - with open(pdb_torenum) as rfile: - for ln in rfile: - if ln.startswith(RECORDS): - resid = ln[22:26].strip() - if resid != prev_resid: - resid_idx += 1 - prev_resid = resid - # renumbering takes place here - n_spaces = 4 - len(str(numbering_list[resid_idx])) - resid_str = f"{' ' * n_spaces}{numbering_list[resid_idx]} " - file_content += f"{ln[:22]}{resid_str}{ln[27:]}" - else: - file_content += f"{ln}" - wfile.write(file_content) os.unlink(pdb_torenum) out_pdb_renum = pdb_renum.rename(f"{pdb_f.stem}-{uniprot_id}.pdb") return out_pdb_renum diff --git a/src/arctic3d/modules/sequence.py b/src/arctic3d/modules/sequence.py index 8da889c..79b643f 100644 --- a/src/arctic3d/modules/sequence.py +++ b/src/arctic3d/modules/sequence.py @@ -1,26 +1,39 @@ +import logging import tempfile from Bio import Align from Bio.Align import substitution_matrices from pdbtools.pdb_tofasta import pdb_to_fasta - -def load_seq(fasta_file): - """ - Load sequence. - - Parameters - ---------- - fasta_file : str - Fasta file. - - Returns - ------- - fasta_seq : str - Fasta sequence. - - """ - pass +import os +import shutil + +log = logging.getLogger("arctic3d.log") + +LETTERS = [ + "A", + "B", + "C", + "D", + "E", + "F", + "G", + "H", + "I", + "K", + "L", + "M", + "N", + "P", + "Q", + "R", + "S", + "T", + "V", + "W", + "Y", + "Z", + ] def to_fasta(pdb_f, temp): @@ -74,9 +87,42 @@ def align_sequences(seq1, seq2): """ aln_fname = "blosum80.aln" aligner = Align.PairwiseAligner() - aligner.substitution_matrix = substitution_matrices.load("BLOSUM80") + aligner.substitution_matrix = substitution_matrices.load("BLOSUM62") alns = aligner.align(seq1, seq2) top_aln = alns[0] with open(aln_fname, "w") as fh: fh.write(str(top_aln)) return aln_fname, top_aln + + +def cycle_alignment(fasta_sequences, ref_seq, output_aln_fname): + """ + Perform pairwise alignment between a reference sequence and a list of + sequences. + + Parameters + ---------- + fasta_sequences : list + List of sequences + + ref_seq : str + Reference sequence + + output_aln_fname : str + Output alignment file name + """ + # looping over sequences + max_id = -1.0 + for fasta in fasta_sequences: + name, seq = fasta.id, str(fasta.seq) + aln_fname, top_aln = align_sequences(ref_seq, seq) + identity = str(top_aln).count("|") / float(min(len(ref_seq), len(seq))) + # compute percentage and logging + perc_identity = identity * 100 + log.info(f"sequence {name} has {perc_identity:.2f}% sequence identity") + if identity > max_id: + max_id = identity + max_id_chain = name.split("|")[1] + shutil.copy(aln_fname, output_aln_fname) + os.unlink(aln_fname) + return max_id_chain, max_id