Skip to content

Commit

Permalink
Merge pull request #103 from compomics/diann-io
Browse files Browse the repository at this point in the history
Add I/O support for some DIA search engines
  • Loading branch information
RalfG authored Nov 6, 2024
2 parents 27b8591 + 029ba11 commit 133559a
Show file tree
Hide file tree
Showing 14 changed files with 695 additions and 4 deletions.
7 changes: 5 additions & 2 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,10 @@ Supported file formats
===================================================================================================================== ======================== =============== ===============
File format psm_utils tag Read support Write support
===================================================================================================================== ======================== =============== ===============
`AlphaDIA precursors TSV <https://alphadia.readthedocs.io/en/latest/quickstart.html#output-files>`_ ``alphadia`` ✅ ❌
`DIA-NN TSV <https://github.com/vdemichev/DiaNN#output>`_ ``diann`` ✅ ❌
`FlashLFQ generic TSV <https://github.com/smith-chem-wisc/FlashLFQ/wiki/Identification-Input-Formats>`_ ``flashlfq`` ✅ ✅
`FragPipe PSM TSV <https://fragpipe.nesvilab.org/docs/tutorial_fragpipe_outputs.html#psmtsv/>`_ ``fragpipe`` ✅ ❌
`ionbot CSV <https://ionbot.cloud/>`_ ``ionbot`` ✅ ❌
`OpenMS idXML <https://www.openms.de/>`_ ``idxml`` ✅ ✅
`MaxQuant msms.txt <https://www.maxquant.org/>`_ ``msms`` ✅ ❌
Expand All @@ -99,10 +102,10 @@ Supported file formats
`Peptide Record <https://psm-utils.readthedocs.io/en/stable/api/psm_utils.io/#module-psm_utils.io.peptide_record>`_ ``peprec`` ✅ ✅
`pepXML <http://tools.proteomecenter.org/wiki/index.php?title=Formats:pepXML>`_ ``pepxml`` ✅ ❌
`Percolator tab <https://github.com/percolator/percolator/wiki/Interface>`_ ``percolator`` ✅ ✅
Proteome Discoverer MSF ``proteome_discoverer`` ✅ ❌
`Proteome Discoverer MSF <#>`_ ``proteome_discoverer`` ✅ ❌
`Sage Parquet <https://github.com/lazear/sage/blob/v0.14.7/DOCS.md#interpreting-sage-output>`_ ``sage_parquet`` ✅ ❌
`Sage TSV <https://github.com/lazear/sage/blob/v0.14.7/DOCS.md#interpreting-sage-output>`_ ``sage_tsv`` ✅ ❌
ProteoScape Parquet ``proteoscape`` ✅ ❌
`ProteoScape Parquet <#>`_ ``proteoscape`` ✅ ❌
`TSV <https://psm-utils.readthedocs.io/en/stable/api/psm_utils.io/#module-psm_utils.io.tsv>`_ ``tsv`` ✅ ✅
`X!Tandem XML <https://www.thegpm.org/tandem/>`_ ``xtandem`` ✅ ❌
===================================================================================================================== ======================== =============== ===============
Expand Down
24 changes: 24 additions & 0 deletions docs/source/api/psm_utils.io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,22 @@ psm_utils.io



psm_utils.io.alphadia
#####################

.. automodule:: psm_utils.io.alphadia
:members:
:inherited-members:


psm_utils.io.diann
##################

.. automodule:: psm_utils.io.diann
:members:
:inherited-members:


psm_utils.io.flashlfq
#####################

Expand All @@ -15,6 +31,14 @@ psm_utils.io.flashlfq
:inherited-members:


psm_utils.io.fragpipe
#####################

.. automodule:: psm_utils.io.fragpipe
:members:
:inherited-members:


psm_utils.io.idxml
##################

Expand Down
4 changes: 4 additions & 0 deletions example_files/alphadia.precursors.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
base_width_mobility base_width_rt rt_observed mobility_observed mono_ms1_intensity top_ms1_intensity sum_ms1_intensity weighted_ms1_intensity weighted_mass_deviation weighted_mass_error mz_observed mono_ms1_height top_ms1_height sum_ms1_height weighted_ms1_height isotope_intensity_correlation isotope_height_correlation n_observations intensity_correlation height_correlation intensity_fraction height_fraction intensity_fraction_weighted height_fraction_weighted mean_observation_score sum_b_ion_intensity sum_y_ion_intensity diff_b_y_ion_intensity f_masked fragment_scan_correlation template_scan_correlation fragment_frame_correlation top3_frame_correlation template_frame_correlation top3_b_ion_correlation n_b_ions top3_y_ion_correlation n_y_ions cycle_fwhm mobility_fwhm delta_frame_peak top_3_ms2_mass_error mean_ms2_mass_error n_overlapping mean_overlapping_intensity mean_overlapping_mass_error precursor_idx rank frame_center scan_center score elution_group_idx frame_start scan_stop frame_stop scan_start proteins rt_calibrated flat_frag_start_idx charge mods decoy sequence mz_library channel genes i_0 flat_frag_stop_idx i_2 i_1 i_3 mobility_library rt_library mod_sites delta_rt n_K n_R n_P _decoy proba qval _candidate_idx valid candidate_idx run mod_seq_hash mod_seq_charge_hash pg_master pg pg_qval intensity
0.000000 40.673340 2800.518555 0.000001 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 894.337830 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.000000 0.968887 0.845673 1.000000 1.000000 1.000000 1.000000 0.000000 0.000000 15.266385 -15.266385 1.000000 0.000000 0.000000 0.929785 0.975279 0.000000 0.000000 0.000000 0.948546 12.000000 14.244627 0.000000 -0.500000 0.132713 -0.218829 0.000000 0.000000 0.000000 10447876 0 72329 0 136.160126 5238821 71876 1 72933 0 P18899 2347.609131 59818105 3 0 SSYGSSSNDDSYGSSNNDDSYGSSNK 894.337830 0 DDR48_YEAST 0.273118 59818117 0.249391 0.348172 0.129319 0.948457 1399.216187 452.909424 1 0 0 0.000000 0.000000 0.000000 10447876 True 10447876 LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_01 8562405370847133435 8562405370847133438 P18899 P18899 0.000000 190103852.035206
0.000000 40.745483 1647.208252 0.000001 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 986.440491 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.000000 0.991654 0.992141 1.000000 1.000000 1.000000 1.000000 0.000000 0.000000 14.408463 -14.408463 1.000000 0.000000 0.000000 0.738752 0.974915 0.000000 0.000000 0.000000 0.880488 12.000000 9.885651 0.000000 0.000000 -0.391579 -0.698411 0.000000 0.000000 0.000000 8793636 0 42431 0 122.278320 4411698 41978 1 43035 0 Q9ULU4 1670.462402 49907897 2 0 SSQGSSSSTQSAPSETASASK 986.440491 0 PKCB1_HUMAN 0.380560 49907909 0.190793 0.352861 0.075786 1.158085 387.834503 -23.254150 1 0 1 0.000000 0.000000 0.000000 8793636 True 8793636 LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_01 5824087303549386971 5824087303549386973 Q9ULU4 Q9ULU4 0.000000 195496849.073322
0.000000 52.349121 2678.317139 0.000001 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 905.432312 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.000000 0.986449 0.931379 1.000000 1.000000 1.000000 1.000000 0.000000 0.000000 16.636572 -16.636572 1.000000 0.000000 0.000000 0.978579 0.996334 0.000000 0.000000 0.000000 0.988605 12.000000 13.867673 0.000000 0.000000 -0.432777 0.780247 0.000000 0.000000 0.000000 7132549 0 69158 0 152.012512 3581144 68554 1 69913 0 O60763 2646.791260 39980635 2 0 SSQTSGTNEQSSAIVSAR 905.432312 0 USO1_HUMAN 0.404900 39980647 0.177361 0.352328 0.065410 1.110423 1774.035034 31.525879 0 1 0 0.000000 0.000000 0.000000 7132549 True 7132549 LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_01 14912031975374993231 14912031975374993233 O60763 O60763 0.000000 406414129.849395
6 changes: 6 additions & 0 deletions example_files/fragpipe.psm.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
Spectrum Spectrum File Peptide Modified Peptide Extended Peptide Prev AA Next AA Peptide Length Charge Retention Observed Mass Calibrated Observed Mass Observed M/Z Calibrated Observed M/Z Calculated Peptide Mass Calculated M/Z Delta Mass Expectation Hyperscore Nextscore Probability Number of Enzymatic Termini Number of Missed Cleavages Protein Start Protein End Intensity Assigned Modifications Observed Modifications Purity Is Unique Protein Protein ID Entry Name Gene Protein Description Mapped Genes Mapped Proteins
LFQ_Orbitrap_AIF_Human_01.100000.100000.0 D:\test\interact-LFQ_Orbitrap_AIF_Human_01_rank1.pep.xml SEDCFILDHGK SEDCFILDHGK PFAQGAIK.SEDCFILDHGK.DGKIFVWK K D 11 3 3813.8638 1319.5804 1319.5807 440.8674 440.8675 1319.5815 440.8678 -0.0008 0.01264961000000 19.3701 15.5657 0.9968 2 0 328 338 0.0000 4C(57.0214) 0.00 false sp|GELS_HUMAN| GELS_HUMAN GSN sp|P06396|GELS_HUMAN
LFQ_Orbitrap_AIF_Human_01.100002.100002.0 D:\test\interact-LFQ_Orbitrap_AIF_Human_01_rank1.pep.xml FLLEAGADQEHK KGHIEMVR.FLLEAGADQEHK.TDEMHTAI R T 12 3 3813.9346 1356.6663 1356.6665 453.2294 453.2294 1356.6672 453.2297 -0.0007 0.01950739000000 18.9370 14.3831 0.9985 2 0 419 430 0.0000 0.00 false sp|O75179|ANR17_HUMAN O75179 ANR17_HUMAN ANKRD17 Ankyrin repeat domain-containing protein 17 ANKHD1 sp|Q8IWZ3|ANKH1_HUMAN
LFQ_Orbitrap_AIF_Human_01.100004.100004.0 D:\test\interact-LFQ_Orbitrap_AIF_Human_01_rank1.pep.xml REELSNVLAAMR REELSNVLAAM[147]R THIRAKRK.REELSNVLAAMR.KAAAKKD K K 12 3 3814.0050 1403.7197 1403.7198 468.9138 468.9139 1403.7190 468.9136 0.0008 0.00008879724000 24.3292 15.9192 0.9998 2 1 87 98 0.0000 11M(15.9949) 0.00 true sp|Q9Y3U8|RL36_HUMAN Q9Y3U8 RL36_HUMAN RPL36 Large ribosomal subunit protein eL36
LFQ_Orbitrap_AIF_Human_01.100040.100040.0 D:\test\interact-LFQ_Orbitrap_AIF_Human_01_rank1.pep.xml LHISPSNMTNQNTPEYMEK LHISPSNM[147]TNQNTPEYMEKc[17] EYFGPDFK.LHISPSNMTNQNTPEYMEK.IKQRIFEN K I 19 3 3815.4023 2248.0256 2248.0251 750.3491 750.3490 2248.0254 750.3491 -0.0002 0.00194418200000 21.2429 21.2429 0.7143 2 0 344 362 0.0000 8M(15.9949), C-term(-0.9840) 0.00 true sp|Q92769|HDAC2_HUMAN Q92769 HDAC2_HUMAN HDAC2 Histone deacetylase 2
LFQ_Orbitrap_AIF_Human_01.101373.101373.0 D:\test\interact-LFQ_Orbitrap_AIF_Human_01_rank1.pep.xml ANIAVQR n[43]ANIAVQR .ANIAVQR.IKREFKEV M I 7 2 3866.1475 812.4501 812.4503 407.2323 407.2324 812.4505 407.2325 -0.0002 0.11090580000000 17.1991 14.1196 0.9898 2 0 2 8 0.0000 N-term(42.0106) 0.00 true sp|P61086|UBE2K_HUMAN P61086 UBE2K_HUMAN UBE2K Ubiquitin-conjugating enzyme E2 K
25 changes: 23 additions & 2 deletions psm_utils/io/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,10 @@

from rich.progress import track

import psm_utils.io.alphadia as alphadia
import psm_utils.io.diann as diann
import psm_utils.io.flashlfq as flashlfq
import psm_utils.io.fragpipe as fragpipe
import psm_utils.io.idxml as idxml
import psm_utils.io.ionbot as ionbot
import psm_utils.io.maxquant as maxquant
Expand Down Expand Up @@ -113,13 +116,31 @@
"extension": ".parquet",
"filename_pattern": r"^.*(?:_|\.)sage.parquet$",
},
"parquet": { # List after proteoscape and sage to avoid extension matching conflicts
"fragpipe": {
"reader": fragpipe.FragPipeReader,
"writer": None,
"extension": ".tsv",
"filename_pattern": r"^.*(?:_|\.)?psm\.tsv$",
},
"alphadia": {
"reader": alphadia.AlphaDIAReader,
"writer": None,
"extension": ".tsv",
"filename_pattern": r"^.*(?:_|\.)?precursors\.tsv$",
},
"diann": {
"reader": diann.DIANNTSVReader,
"writer": None,
"extension": ".tsv",
"filename_pattern": r"^.*(?:_|\.)?diann\.tsv$",
},
"parquet": { # List after more specific Parquet patterns to avoid matching conflicts
"reader": parquet.ParquetReader,
"writer": parquet.ParquetWriter,
"extension": ".parquet",
"filename_pattern": r"^.*\.parquet$",
},
"tsv": { # List after sage to avoid extension matching conflicts
"tsv": { # List after more specific TSV patterns to avoid matching conflicts
"reader": tsv.TSVReader,
"writer": tsv.TSVWriter,
"extension": ".tsv",
Expand Down
112 changes: 112 additions & 0 deletions psm_utils/io/alphadia.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
"""Reader for PSM files from the AlphaDIA search engine."""

from __future__ import annotations

import csv
from abc import ABC
from typing import Iterable, Optional

from psm_utils.io._base_classes import ReaderBase
from psm_utils.io._utils import set_csv_field_size_limit
from psm_utils.psm import PSM
from psm_utils.psm_list import PSMList

# Executed once at import time through the project helper from ``psm_utils.io._utils``.
# NOTE(review): presumably raises the ``csv`` module's field size limit so that very
# long TSV fields can be parsed — confirm against the helper's implementation.
set_csv_field_size_limit()

# Names of the AlphaDIA columns that are copied as-is from every parsed row into the
# PSM's ``rescoring_features`` mapping. Columns that are absent from a row are skipped
# by the reader instead of raising.
# TODO: check
RESCORING_FEATURES = [
"rt_observed",
"mobility_observed",
"mz_observed",
"charge",
"delta_rt",
]


class AlphaDIAReader(ReaderBase, ABC):
    """Reader for AlphaDIA ``precursors.tsv`` files."""

    def __init__(self, filename, *args, **kwargs):
        """
        Reader for AlphaDIA ``precursors.tsv`` file.

        Parameters
        ----------
        filename : str or Path
            Path to PSM file.

        """
        super().__init__(filename, *args, **kwargs)
        self.filename = filename

    def __iter__(self) -> Iterable[PSM]:
        """Iterate over file and return PSMs one-by-one."""
        # ``newline=""`` is what the csv module documents for file objects, so that
        # line endings are handled by the csv parser itself.
        with open(self.filename, newline="") as msms_in:
            reader = csv.DictReader(msms_in, delimiter="\t")
            for row in reader:
                yield self._get_peptide_spectrum_match(row)

    def _get_peptide_spectrum_match(self, psm_dict) -> PSM:
        """
        Parse a single PSM from an AlphaDIA PSM file.

        Parameters
        ----------
        psm_dict : dict
            Mapping of AlphaDIA column names to values for a single row.

        Returns
        -------
        PSM
            PSM built from the row.

        Raises
        ------
        KeyError
            If a required column (e.g. ``sequence``, ``run``, ``score``) is missing.

        """
        # Optional columns: copy the ones that are present, silently skip the others.
        rescoring_features = {
            ft: psm_dict[ft] for ft in RESCORING_FEATURES if ft in psm_dict
        }

        return PSM(
            peptidoform=self._parse_peptidoform(
                psm_dict["sequence"], psm_dict["mods"], psm_dict["mod_sites"], psm_dict["charge"]
            ),
            spectrum_id=psm_dict["frame_start"],  # TODO: needs to be checked
            run=psm_dict["run"],
            spectrum=psm_dict["frame_start"],  # TODO: needs to be checked
            is_decoy=bool(int(psm_dict["decoy"])),
            score=psm_dict["score"],
            qvalue=psm_dict["qval"],
            pep=psm_dict["proba"],
            precursor_mz=psm_dict["mz_observed"],
            retention_time=psm_dict["rt_observed"],
            ion_mobility=psm_dict["mobility_observed"],
            protein_list=psm_dict["proteins"].split(";"),
            rank=int(psm_dict["rank"]) + 1,  # AlphaDIA ranks are 0-based
            source="AlphaDIA",
            provenance_data=({"alphadia_filename": str(self.filename)}),
            metadata={},
            rescoring_features=rescoring_features,
        )

    @staticmethod
    def _parse_peptidoform(sequence: str, mods: str, mod_sites, charge: Optional[str]) -> str:
        """
        Parse a peptidoform from an AlphaDIA PSM file.

        Parameters
        ----------
        sequence : str
            Unmodified peptide sequence.
        mods : str
            Semicolon-separated modifications in ``name@target`` form. Only the part
            before ``@`` is used. Empty or non-string values mean "no modifications".
        mod_sites
            Semicolon-separated sites, parallel to ``mods``. ``0`` denotes the
            N-terminus, ``-1`` the C-terminus, any other value the 1-based residue
            position.
        charge : str, optional
            Precursor charge. Appended as ``/charge`` when truthy.

        Returns
        -------
        str
            Peptidoform string with bracketed modification names and optional charge.

        """
        # Only a non-empty string describes modifications. Rows that originate from a
        # pandas DataFrame can carry NaN here, which is a truthy float without
        # ``.split``; treat it like an unmodified peptide instead of crashing.
        if isinstance(mods, str) and mods:
            sequence_list = [""] + list(sequence) + [""]  # N-term, sequence, C-term
            # ``str()`` / ``float()`` tolerate a site that was parsed as a number
            # (e.g. ``3`` or ``3.0``) rather than read as text.
            for mod, site in zip(mods.split(";"), str(mod_sites).split(";")):
                site = int(float(site))
                name = mod.split("@")[0]
                if site == 0:
                    # N-terminal modification
                    sequence_list[0] = f"[{name}]-"
                elif site == -1:
                    # C-terminal modification
                    sequence_list[-1] = f"-[{name}]"
                else:
                    # Sequence modification; the leading N-term slot makes the
                    # 1-based site usable as a list index directly
                    sequence_list[site] = f"{sequence_list[site]}[{name}]"
            sequence = "".join(sequence_list)

        # Add charge
        if charge:
            sequence += f"/{int(float(charge))}"

        return sequence

    @classmethod
    def from_dataframe(cls, dataframe) -> PSMList:
        """
        Create a PSMList from an AlphaDIA Pandas DataFrame.

        Parameters
        ----------
        dataframe : pandas.DataFrame
            Table with the AlphaDIA precursor columns, one PSM per row.

        Returns
        -------
        PSMList
            One PSM per DataFrame row, in row order.

        """
        # A single reader without a backing file is enough to parse every row.
        reader = cls("")
        return PSMList(
            psm_list=[
                reader._get_peptide_spectrum_match(entry)
                for entry in dataframe.to_dict(orient="records")
            ]
        )
125 changes: 125 additions & 0 deletions psm_utils/io/diann.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
"""
Reader for PSM files from DIA-NN.

Reads the '.tsv' file as defined on the
`DIA-NN documentation page <https://github.com/vdemichev/DiaNN/tree/1.8.1?tab=readme-ov-file#main-output-reference>`_.

Notes
-----
- DIA-NN calculates q-values at both the run and library level. The run-level q-value is used as
  the PSM q-value.
- DIA-NN currently does not return precursor m/z values.
- DIA-NN currently does not support C-terminal modifications in its searches.

"""

from __future__ import annotations

import csv
import re
from typing import Iterable, Optional

from psm_utils.io._base_classes import ReaderBase
from psm_utils.io._utils import set_csv_field_size_limit
from psm_utils.psm import PSM
from psm_utils.psm_list import PSMList

# Executed once at import time through the project helper from ``psm_utils.io._utils``.
# NOTE(review): presumably raises the ``csv`` module's field size limit so that very
# long TSV fields can be parsed — confirm against the helper's implementation.
set_csv_field_size_limit()

# Names of the DIA-NN report columns that are copied as-is from every parsed row into
# the PSM's ``rescoring_features`` mapping. Columns that are absent from a row are
# skipped by the reader instead of raising.
RESCORING_FEATURES = [
"RT",
"Predicted.RT",
"iRT",
"Predicted.iRT",
"Ms1.Profile.Corr",
"Ms1.Area",
"IM",
"iIM",
"Predicted.IM",
"Predicted.iIM",
]


class DIANNTSVReader(ReaderBase):
    """Reader for DIA-NN '.tsv' report files."""

    def __init__(self, filename, *args, **kwargs) -> None:
        """
        Reader for DIA-NN '.tsv' file.

        Parameters
        ----------
        filename : str or Path
            Path to PSM file.

        """
        super().__init__(filename, *args, **kwargs)
        self.filename = filename

    def __iter__(self) -> Iterable[PSM]:
        """Iterate over file and return PSMs one-by-one."""
        # ``newline=""`` is what the csv module documents for file objects, so that
        # line endings are handled by the csv parser itself.
        with open(self.filename, newline="") as msms_in:
            reader = csv.DictReader(msms_in, delimiter="\t")
            for row in reader:
                yield self._get_peptide_spectrum_match(row)

    def _get_peptide_spectrum_match(self, psm_dict) -> PSM:
        """
        Parse a single PSM from a DIA-NN PSM file.

        Parameters
        ----------
        psm_dict : dict
            Mapping of DIA-NN column names to values for a single row.

        Returns
        -------
        PSM
            PSM built from the row. ``is_decoy`` is always ``False`` and
            ``precursor_mz`` and ``rank`` are always ``None``.

        Raises
        ------
        KeyError
            If a required column (e.g. ``Modified.Sequence``, ``Run``) is missing.

        """
        # Optional columns: copy the ones that are present, silently skip the others.
        rescoring_features = {
            ft: psm_dict[ft] for ft in RESCORING_FEATURES if ft in psm_dict
        }

        return PSM(
            peptidoform=self._parse_peptidoform(
                psm_dict["Modified.Sequence"], psm_dict["Precursor.Charge"]
            ),
            spectrum_id=psm_dict["MS2.Scan"],
            run=psm_dict["Run"],
            is_decoy=False,
            qvalue=psm_dict["Q.Value"],
            pep=float(psm_dict["PEP"]),
            score=float(psm_dict["CScore"]),
            precursor_mz=None,  # Not returned by DIA-NN :(
            retention_time=float(psm_dict["RT"]),
            ion_mobility=float(psm_dict["IM"]),
            protein_list=psm_dict["Protein.Ids"].split(";"),
            source="diann",
            rank=None,
            provenance_data=({"diann_filename": str(self.filename)}),
            rescoring_features=rescoring_features,
            metadata={},
        )

    @staticmethod
    def _parse_peptidoform(peptide: str, charge: Optional[str]) -> str:
        """
        Convert a DIA-NN modified sequence into a peptidoform string.

        Parameters
        ----------
        peptide : str
            Modified sequence with modifications written as ``(UniMod:<id>)``.
        charge : str, optional
            Precursor charge. Appended as ``/charge`` when truthy.

        Returns
        -------
        str
            Sequence with ``[UNIMOD:<id>]`` modifications, a hyphen after a leading
            (N-terminal) modification, and the optional charge suffix.

        """
        # Add charge
        if charge:
            peptide += f"/{int(float(charge))}"

        # Replace parentheses with square brackets and capitalize UniMod prefix
        peptide = re.sub(r"\(UniMod:(\d+)\)", r"[UNIMOD:\1]", peptide)

        # Add hyphen for N-terminal modifications: if [UNIMOD:n] occurs before the first
        # amino acid, a hyphen is inserted right after its closing bracket.
        # ``startswith`` (rather than indexing) keeps an empty string from raising.
        if peptide.startswith("["):
            peptide = peptide.replace("]", "]-", 1)

        # C-terminal modifications are currently not supported in DIA-NN

        return peptide

    @classmethod
    def from_dataframe(cls, dataframe) -> PSMList:
        """
        Create a PSMList from a DIA-NN Pandas DataFrame.

        Parameters
        ----------
        dataframe : pandas.DataFrame
            Table with the DIA-NN report columns, one PSM per row.

        Returns
        -------
        PSMList
            One PSM per DataFrame row, in row order.

        """
        # A single reader without a backing file is enough to parse every row.
        reader = cls("")
        # The keyword must be ``psm_list`` (as used by the AlphaDIA reader);
        # the previous ``ptm_list`` keyword did not match it.
        return PSMList(
            psm_list=[
                reader._get_peptide_spectrum_match(entry)
                for entry in dataframe.to_dict(orient="records")
            ]
        )
Loading

0 comments on commit 133559a

Please sign in to comment.