Skip to content

Commit

Permalink
Refactoring for MS2Rescore implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
rodvrees committed Feb 2, 2024
1 parent 4af9fb7 commit 2dc1890
Show file tree
Hide file tree
Showing 4 changed files with 101 additions and 100 deletions.
2 changes: 1 addition & 1 deletion im2deep/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
"""IM2Deep: Deep learning framework for peptide collisional cross section prediction."""

__version__ = "0.1.1"
__version__ = "0.1.2"
92 changes: 83 additions & 9 deletions im2deep/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,19 @@
from typing import Optional

import click
# import pandas as pd
import pandas as pd

# from deeplc import DeepLC
# from psm_utils.io import read_file
# from psm_utils.io.exceptions import PSMUtilsIOException
# from psm_utils.io.peptide_record import peprec_to_proforma
# from psm_utils.psm import PSM
# from psm_utils.psm_list import PSMList
from psm_utils.io import read_file
from psm_utils.io.exceptions import PSMUtilsIOException
from psm_utils.io.peptide_record import peprec_to_proforma
from psm_utils.psm import PSM
from psm_utils.psm_list import PSMList
from rich.logging import RichHandler

from im2deep._exceptions import IM2DeepError
from im2deep.im2deep import predict_ccs

# from im2deep.calibrate import linear_calibration

REFERENCE_DATASET_PATH = Path(__file__).parent / "reference_data" / "reference_ccs.zip"
Expand Down Expand Up @@ -48,7 +50,8 @@ def setup_logging(passed_level):
handlers=[RichHandler()],
)

#Command line arguments TODO: Make config_parser script

# Command line arguments. TODO: Create a config_parser script.
@click.command()
@click.argument("psm_file", type=click.Path(exists=True, dir_okay=False))
@click.option(
Expand Down Expand Up @@ -98,7 +101,6 @@ def setup_logging(passed_level):
default=2,
help="Charge state to use for calibration. Only used if calibrate_per_charge is set to False.",
)

def main(
psm_file: str,
calibration_file: Optional[str] = None,
Expand All @@ -111,11 +113,83 @@ def main(
):
"""Command line interface to IM2Deep."""
setup_logging(log_level)

with open(psm_file) as f:
first_line_pred = f.readline().strip()
if calibration_file:
with open(calibration_file) as fc:
first_line_cal = fc.readline().strip()

if "modifications" in first_line_pred.split(",") and "seq" in first_line_pred.split(","):
# Read input file
df_pred = pd.read_csv(psm_file)
df_pred.fillna("", inplace=True)

list_of_psms = []
for seq, mod, charge, ident in zip(
df_pred["seq"], df_pred["modifications"], df_pred["charge"], df_pred.index
):
list_of_psms.append(
PSM(peptidoform=peprec_to_proforma(seq, mod, charge), spectrum_id=ident)
)
psm_list_pred = PSMList(psm_list=list_of_psms)

else:
# psm_list_pred = read_file(file_pred)
try:
psm_list_pred = read_file(psm_file)
except PSMUtilsIOException:
LOGGER.error("Invalid input file. Please check the format of the input file.")
sys.exit(1)

psm_list_cal = []
if (
calibration_file
and "modifications" in first_line_cal.split(",")
and "seq" in first_line_cal.split(",")
):
df_cal = pd.read_csv(calibration_file)
df_cal.fillna("", inplace=True)
del calibration_file

list_of_cal_psms = []
for seq, mod, charge, ident, CCS in zip(
df_cal["seq"],
df_cal["modifications"],
df_cal["charge"],
df_cal.index,
df_cal["CCS"],
):
list_of_cal_psms.append(
PSM(peptidoform=peprec_to_proforma(seq, mod, charge), spectrum_id=ident)
)
psm_list_cal = PSMList(psm_list=list_of_cal_psms)
psm_list_cal_df = psm_list_cal.to_dataframe()
psm_list_cal_df["ccs_observed"] = df_cal["CCS"]
del df_cal

else:
LOGGER.error(
"Invalid calibration file. Please check the format of the calibration file."
)
sys.exit(1)

if not output_file:
output_file = Path(psm_file).parent / (Path(psm_file).stem + "_IM2Deep-predictions.csv")
try:
predict_ccs(psm_file, calibration_file, REFERENCE_DATASET_PATH, output_file, model_name, calibrate_per_charge, use_charge_state, n_jobs)
predict_ccs(
psm_list_pred,
psm_list_cal_df,
output_file=output_file,
model_name=model_name,
calibrate_per_charge=calibrate_per_charge,
use_charge_state=use_charge_state,
n_jobs=n_jobs,
)
except IM2DeepError as e:
LOGGER.error(e)
sys.exit(1)


if __name__ == "__main__":
main()
7 changes: 4 additions & 3 deletions im2deep/calibrate.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,9 +78,10 @@ def get_ccs_shift(
"""Calculating CCS shift based on {} overlapping peptide-charge pairs
between PSMs and reference dataset""".format(both.shape[0])
)
LOGGER.debug(both.columns)
# Mean amount by which the observed CCS in the calibration data exceeds the
# reference CCS; predictions need to be increased by this amount.
return 0 if both.shape[0] == 0 else np.mean(both["observed_ccs"] - both["CCS"])
return 0 if both.shape[0] == 0 else np.mean(both["ccs_observed"] - both["CCS"])


def get_ccs_shift_per_charge(cal_df: pd.DataFrame, reference_dataset: pd.DataFrame) -> ndarray:
Expand Down Expand Up @@ -123,7 +124,7 @@ def get_ccs_shift_per_charge(cal_df: pd.DataFrame, reference_dataset: pd.DataFra
how="inner",
suffixes=("_ref", "_data"),
)
return both.groupby("charge").apply(lambda x: np.mean(x["observed_ccs"] - x["CCS"])).to_dict()
return both.groupby("charge").apply(lambda x: np.mean(x["ccs_observed"] - x["CCS"])).to_dict()


def calculate_ccs_shift(
Expand Down Expand Up @@ -184,7 +185,7 @@ def linear_calibration(
)
else:
shift_factor = calculate_ccs_shift(
preds_df, reference_dataset, per_charge=False, use_charge_state=use_charge_state
calibration_dataset, reference_dataset, per_charge=False, use_charge_state=use_charge_state
)
preds_df["predicted_ccs_calibrated"] = preds_df.apply(
lambda x: x["predicted_ccs"] + shift_factor, axis=1
Expand Down
100 changes: 13 additions & 87 deletions im2deep/im2deep.py
Original file line number Diff line number Diff line change
@@ -1,89 +1,32 @@
import logging
import sys
from pathlib import Path

import pandas as pd
from deeplc import DeepLC
from psm_utils.io import read_file
from psm_utils.io.exceptions import PSMUtilsIOException
from psm_utils.io.peptide_record import peprec_to_proforma
from psm_utils.psm import PSM
from psm_utils.psm_list import PSMList
from rich.logging import RichHandler

from im2deep.calibrate import linear_calibration

LOGGER = logging.getLogger(__name__)
REFERENCE_DATASET_PATH = Path(__file__).parent / "reference_data" / "reference_ccs.zip"


# TODO: get file reading out of the function
def predict_ccs(
file_pred,
file_cal=None,
psm_list_pred: PSMList,
psm_list_cal_df=None,
file_reference=REFERENCE_DATASET_PATH,
file_pred_out=None,
output_file=None,
model_name="tims",
calibrate_per_charge=True,
use_charge_state=2,
n_jobs=None,
write_output=True,
):
"""Run IM2Deep."""
LOGGER.info("IM2Deep started.")
reference_dataset = pd.read_csv(file_reference)

with open(file_pred) as f:
first_line_pred = f.readline().strip()
if file_cal:
with open(file_cal) as fc:
first_line_cal = fc.readline().strip()

if "modifications" in first_line_pred.split(",") and "seq" in first_line_pred.split(","):
# Read input file
df_pred = pd.read_csv(file_pred)
df_pred.fillna("", inplace=True)

list_of_psms = []
for seq, mod, charge, ident in zip(
df_pred["seq"], df_pred["modifications"], df_pred["charge"], df_pred.index
):
list_of_psms.append(
PSM(peptidoform=peprec_to_proforma(seq, mod, charge), spectrum_id=ident)
)
psm_list_pred = PSMList(psm_list=list_of_psms)

else:
# psm_list_pred = read_file(file_pred)
try:
psm_list_pred = read_file(file_pred)
except PSMUtilsIOException:
LOGGER.error("Invalid input file. Please check the format of the input file.")
sys.exit(1)

psm_list_cal = []
if (
file_cal
and "modifications" in first_line_cal.split(",")
and "seq" in first_line_cal.split(",")
):
df_cal = pd.read_csv(file_cal)
df_cal.fillna("", inplace=True)
del file_cal

list_of_cal_psms = []
for seq, mod, charge, ident, CCS in zip(
df_cal["seq"], df_cal["modifications"], df_cal["charge"], df_cal.index, df_cal["CCS"]
):
list_of_cal_psms.append(
PSM(peptidoform=peprec_to_proforma(seq, mod, charge), spectrum_id=ident)
)
psm_list_cal = PSMList(psm_list=list_of_cal_psms)
psm_list_cal_df = psm_list_cal.to_dataframe()
psm_list_cal_df["observed_ccs"] = df_cal["CCS"]
del df_cal

else:
LOGGER.error("Invalid calibration file. Please check the format of the calibration file.")
sys.exit(1)

if model_name == "tims":
path_model = Path(__file__).parent / "models" / "TIMS"

Expand All @@ -103,34 +46,17 @@ def predict_ccs(
per_charge=calibrate_per_charge,
use_charge_state=use_charge_state,
)

LOGGER.info("Writing output file...")
if file_pred_out:
file_pred_out = open(file_pred_out, "w")
file_pred_out.write("seq,modifications,charge,predicted CCS\n")
for seq, mod, charge, ident, CCS in zip(
df_pred["seq"],
df_pred["modifications"],
df_pred["charge"],
df_pred.index,
calibrated_psm_list_pred_df["predicted_ccs_calibrated"],
):
file_pred_out.write(f"{seq},{mod},{charge},{CCS}\n")
file_pred_out.close()
else:
#Get path of psm file
output_file = Path(file_pred).parent / (Path(file_pred).stem + "_IM2Deep-predictions.csv")
LOGGER.info("Writing output file to %s", output_file)
if write_output:
LOGGER.info("Writing output file...")
output_file = open(output_file, "w")
output_file.write("seq,modifications,charge,predicted CCS\n")
for seq, mod, charge, ident, CCS in zip(
df_pred["seq"],
df_pred["modifications"],
df_pred["charge"],
df_pred.index,
for peptidoform, charge, CCS in zip(
calibrated_psm_list_pred_df["peptidoform"],
calibrated_psm_list_pred_df["charge"],
calibrated_psm_list_pred_df["predicted_ccs_calibrated"],
):
output_file.write(f"{seq},{mod},{charge},{CCS}\n")
output_file.write(f"{peptidoform},{charge},{CCS}\n")
output_file.close()

LOGGER.info("IM2Deep finished!")
return calibrated_psm_list_pred_df["predicted_ccs_calibrated"]

0 comments on commit 2dc1890

Please sign in to comment.