-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #103 from compomics/diann-io
Add I/O support for some DIA search engines
- Loading branch information
Showing
14 changed files
with
695 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
base_width_mobility base_width_rt rt_observed mobility_observed mono_ms1_intensity top_ms1_intensity sum_ms1_intensity weighted_ms1_intensity weighted_mass_deviation weighted_mass_error mz_observed mono_ms1_height top_ms1_height sum_ms1_height weighted_ms1_height isotope_intensity_correlation isotope_height_correlation n_observations intensity_correlation height_correlation intensity_fraction height_fraction intensity_fraction_weighted height_fraction_weighted mean_observation_score sum_b_ion_intensity sum_y_ion_intensity diff_b_y_ion_intensity f_masked fragment_scan_correlation template_scan_correlation fragment_frame_correlation top3_frame_correlation template_frame_correlation top3_b_ion_correlation n_b_ions top3_y_ion_correlation n_y_ions cycle_fwhm mobility_fwhm delta_frame_peak top_3_ms2_mass_error mean_ms2_mass_error n_overlapping mean_overlapping_intensity mean_overlapping_mass_error precursor_idx rank frame_center scan_center score elution_group_idx frame_start scan_stop frame_stop scan_start proteins rt_calibrated flat_frag_start_idx charge mods decoy sequence mz_library channel genes i_0 flat_frag_stop_idx i_2 i_1 i_3 mobility_library rt_library mod_sites delta_rt n_K n_R n_P _decoy proba qval _candidate_idx valid candidate_idx run mod_seq_hash mod_seq_charge_hash pg_master pg pg_qval intensity | ||
0.000000 40.673340 2800.518555 0.000001 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 894.337830 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.000000 0.968887 0.845673 1.000000 1.000000 1.000000 1.000000 0.000000 0.000000 15.266385 -15.266385 1.000000 0.000000 0.000000 0.929785 0.975279 0.000000 0.000000 0.000000 0.948546 12.000000 14.244627 0.000000 -0.500000 0.132713 -0.218829 0.000000 0.000000 0.000000 10447876 0 72329 0 136.160126 5238821 71876 1 72933 0 P18899 2347.609131 59818105 3 0 SSYGSSSNDDSYGSSNNDDSYGSSNK 894.337830 0 DDR48_YEAST 0.273118 59818117 0.249391 0.348172 0.129319 0.948457 1399.216187 452.909424 1 0 0 0.000000 0.000000 0.000000 10447876 True 10447876 LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_01 8562405370847133435 8562405370847133438 P18899 P18899 0.000000 190103852.035206 | ||
0.000000 40.745483 1647.208252 0.000001 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 986.440491 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.000000 0.991654 0.992141 1.000000 1.000000 1.000000 1.000000 0.000000 0.000000 14.408463 -14.408463 1.000000 0.000000 0.000000 0.738752 0.974915 0.000000 0.000000 0.000000 0.880488 12.000000 9.885651 0.000000 0.000000 -0.391579 -0.698411 0.000000 0.000000 0.000000 8793636 0 42431 0 122.278320 4411698 41978 1 43035 0 Q9ULU4 1670.462402 49907897 2 0 SSQGSSSSTQSAPSETASASK 986.440491 0 PKCB1_HUMAN 0.380560 49907909 0.190793 0.352861 0.075786 1.158085 387.834503 -23.254150 1 0 1 0.000000 0.000000 0.000000 8793636 True 8793636 LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_01 5824087303549386971 5824087303549386973 Q9ULU4 Q9ULU4 0.000000 195496849.073322 | ||
0.000000 52.349121 2678.317139 0.000001 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 905.432312 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.000000 0.986449 0.931379 1.000000 1.000000 1.000000 1.000000 0.000000 0.000000 16.636572 -16.636572 1.000000 0.000000 0.000000 0.978579 0.996334 0.000000 0.000000 0.000000 0.988605 12.000000 13.867673 0.000000 0.000000 -0.432777 0.780247 0.000000 0.000000 0.000000 7132549 0 69158 0 152.012512 3581144 68554 1 69913 0 O60763 2646.791260 39980635 2 0 SSQTSGTNEQSSAIVSAR 905.432312 0 USO1_HUMAN 0.404900 39980647 0.177361 0.352328 0.065410 1.110423 1774.035034 31.525879 0 1 0 0.000000 0.000000 0.000000 7132549 True 7132549 LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_01 14912031975374993231 14912031975374993233 O60763 O60763 0.000000 406414129.849395 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
Spectrum Spectrum File Peptide Modified Peptide Extended Peptide Prev AA Next AA Peptide Length Charge Retention Observed Mass Calibrated Observed Mass Observed M/Z Calibrated Observed M/Z Calculated Peptide Mass Calculated M/Z Delta Mass Expectation Hyperscore Nextscore Probability Number of Enzymatic Termini Number of Missed Cleavages Protein Start Protein End Intensity Assigned Modifications Observed Modifications Purity Is Unique Protein Protein ID Entry Name Gene Protein Description Mapped Genes Mapped Proteins | ||
LFQ_Orbitrap_AIF_Human_01.100000.100000.0 D:\test\interact-LFQ_Orbitrap_AIF_Human_01_rank1.pep.xml SEDCFILDHGK SEDCFILDHGK PFAQGAIK.SEDCFILDHGK.DGKIFVWK K D 11 3 3813.8638 1319.5804 1319.5807 440.8674 440.8675 1319.5815 440.8678 -0.0008 0.01264961000000 19.3701 15.5657 0.9968 2 0 328 338 0.0000 4C(57.0214) 0.00 false sp|GELS_HUMAN| GELS_HUMAN GSN sp|P06396|GELS_HUMAN | ||
LFQ_Orbitrap_AIF_Human_01.100002.100002.0 D:\test\interact-LFQ_Orbitrap_AIF_Human_01_rank1.pep.xml FLLEAGADQEHK KGHIEMVR.FLLEAGADQEHK.TDEMHTAI R T 12 3 3813.9346 1356.6663 1356.6665 453.2294 453.2294 1356.6672 453.2297 -0.0007 0.01950739000000 18.9370 14.3831 0.9985 2 0 419 430 0.0000 0.00 false sp|O75179|ANR17_HUMAN O75179 ANR17_HUMAN ANKRD17 Ankyrin repeat domain-containing protein 17 ANKHD1 sp|Q8IWZ3|ANKH1_HUMAN | ||
LFQ_Orbitrap_AIF_Human_01.100004.100004.0 D:\test\interact-LFQ_Orbitrap_AIF_Human_01_rank1.pep.xml REELSNVLAAMR REELSNVLAAM[147]R THIRAKRK.REELSNVLAAMR.KAAAKKD K K 12 3 3814.0050 1403.7197 1403.7198 468.9138 468.9139 1403.7190 468.9136 0.0008 0.00008879724000 24.3292 15.9192 0.9998 2 1 87 98 0.0000 11M(15.9949) 0.00 true sp|Q9Y3U8|RL36_HUMAN Q9Y3U8 RL36_HUMAN RPL36 Large ribosomal subunit protein eL36 | ||
LFQ_Orbitrap_AIF_Human_01.100040.100040.0 D:\test\interact-LFQ_Orbitrap_AIF_Human_01_rank1.pep.xml LHISPSNMTNQNTPEYMEK LHISPSNM[147]TNQNTPEYMEKc[17] EYFGPDFK.LHISPSNMTNQNTPEYMEK.IKQRIFEN K I 19 3 3815.4023 2248.0256 2248.0251 750.3491 750.3490 2248.0254 750.3491 -0.0002 0.00194418200000 21.2429 21.2429 0.7143 2 0 344 362 0.0000 8M(15.9949), C-term(-0.9840) 0.00 true sp|Q92769|HDAC2_HUMAN Q92769 HDAC2_HUMAN HDAC2 Histone deacetylase 2 | ||
LFQ_Orbitrap_AIF_Human_01.101373.101373.0 D:\test\interact-LFQ_Orbitrap_AIF_Human_01_rank1.pep.xml ANIAVQR n[43]ANIAVQR .ANIAVQR.IKREFKEV M I 7 2 3866.1475 812.4501 812.4503 407.2323 407.2324 812.4505 407.2325 -0.0002 0.11090580000000 17.1991 14.1196 0.9898 2 0 2 8 0.0000 N-term(42.0106) 0.00 true sp|P61086|UBE2K_HUMAN P61086 UBE2K_HUMAN UBE2K Ubiquitin-conjugating enzyme E2 K |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,112 @@ | ||
"""Reader for PSM files from the AlphaDIA search engine.""" | ||
|
||
from __future__ import annotations | ||
|
||
import csv | ||
from abc import ABC | ||
from typing import Iterable, Optional | ||
|
||
from psm_utils.io._base_classes import ReaderBase | ||
from psm_utils.io._utils import set_csv_field_size_limit | ||
from psm_utils.psm import PSM | ||
from psm_utils.psm_list import PSMList | ||
|
||
set_csv_field_size_limit() | ||
|
||
# TODO: check
# Column names that AlphaDIAReader copies into ``PSM.rescoring_features``; a column that is
# absent from a given row is skipped rather than raising.
RESCORING_FEATURES = [
    "rt_observed",
    "mobility_observed",
    "mz_observed",
    "charge",
    "delta_rt",
]
|
||
|
||
class AlphaDIAReader(ReaderBase, ABC):
    """Reader that converts rows of an AlphaDIA ``precursor.tsv`` file into PSM objects."""

    def __init__(self, filename, *args, **kwargs):
        """
        Reader for AlphaDIA ``precursor.tsv`` file.

        Parameters
        ----------
        filename : str or Path
            Path to PSM file.

        """
        super().__init__(filename, *args, **kwargs)
        self.filename = filename

    def __iter__(self) -> Iterable[PSM]:
        """Iterate over file and return PSMs one-by-one."""
        # newline="" leaves line-ending handling to the csv module, as its documentation
        # recommends for file objects passed to a reader.
        with open(self.filename, newline="") as msms_in:
            reader = csv.DictReader(msms_in, delimiter="\t")
            for row in reader:
                yield self._get_peptide_spectrum_match(row)

    def _get_peptide_spectrum_match(self, psm_dict) -> PSM:
        """
        Parse a single PSM from a AlphaDIA PSM file.

        Parameters
        ----------
        psm_dict : dict
            Mapping of AlphaDIA column name to value for one row.

        Returns
        -------
        PSM
            PSM built from the row; columns listed in ``RESCORING_FEATURES`` that are present
            in the row are copied to ``rescoring_features``.

        Raises
        ------
        KeyError
            If one of the columns read directly below is missing from the row.

        """
        # Only keep the rescoring features that are actually present in this row
        rescoring_features = {ft: psm_dict[ft] for ft in RESCORING_FEATURES if ft in psm_dict}

        return PSM(
            peptidoform=self._parse_peptidoform(
                psm_dict["sequence"], psm_dict["mods"], psm_dict["mod_sites"], psm_dict["charge"]
            ),
            spectrum_id=psm_dict["frame_start"],  # TODO: needs to be checked
            run=psm_dict["run"],
            spectrum=psm_dict["frame_start"],  # TODO: needs to be checked
            is_decoy=bool(int(psm_dict["decoy"])),
            score=psm_dict["score"],
            qvalue=psm_dict["qval"],
            pep=psm_dict["proba"],
            precursor_mz=psm_dict["mz_observed"],
            retention_time=psm_dict["rt_observed"],
            ion_mobility=psm_dict["mobility_observed"],
            protein_list=psm_dict["proteins"].split(";"),
            rank=int(psm_dict["rank"]) + 1,  # AlphaDIA ranks are 0-based
            source="AlphaDIA",
            provenance_data=({"alphadia_filename": str(self.filename)}),
            metadata={},
            rescoring_features=rescoring_features,
        )

    @staticmethod
    def _parse_peptidoform(sequence: str, mods: str, mod_sites, charge: Optional[str]) -> str:
        """
        Parse a peptidoform from a AlphaDIA PSM file.

        Parameters
        ----------
        sequence : str
            Unmodified amino acid sequence.
        mods : str
            Semicolon-separated modifications; only the part before ``@`` is used as the name.
        mod_sites : str
            Semicolon-separated sites, paired with ``mods`` by position. ``0`` marks the
            N-terminus, ``-1`` the C-terminus, any other value a 1-based residue position.
        charge : str, optional
            Precursor charge; appended as ``/<charge>`` when truthy.

        Returns
        -------
        str
            Peptidoform string with bracketed modification names and optional charge suffix.

        """
        # Parse modifications. The isinstance check matters for the DataFrame route: pandas
        # represents an empty cell as NaN, a float that is truthy but has no ``split``.
        if isinstance(mods, str) and mods:
            sequence_list = [""] + list(sequence) + [""]  # N-term, sequence, C-term
            for mod, site in zip(mods.split(";"), mod_sites.split(";")):
                site = int(site)
                name = mod.split("@")[0]
                # N-terminal modification
                if site == 0:
                    sequence_list[0] = f"[{name}]-"
                # C-terminal modification
                elif site == -1:
                    sequence_list[-1] = f"-[{name}]"
                # Sequence modification
                else:
                    sequence_list[site] = f"{sequence_list[site]}[{name}]"
            sequence = "".join(sequence_list)

        # Add charge
        if charge:
            sequence += f"/{int(float(charge))}"

        return sequence

    @classmethod
    def from_dataframe(cls, dataframe) -> PSMList:
        """Create a PSMList from a AlphaDIA Pandas DataFrame."""
        # A single file-less reader is enough to parse every record
        reader = cls("")
        return PSMList(
            psm_list=[
                reader._get_peptide_spectrum_match(entry)
                for entry in dataframe.to_dict(orient="records")
            ]
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,125 @@ | ||
"""
Reader for PSM files from DIA-NN.

Reads the '.tsv' file as defined on the
`DIA-NN documentation page <https://github.com/vdemichev/DiaNN/tree/1.8.1?tab=readme-ov-file#main-output-reference>`_.

Notes
-----

- DIA-NN calculates q-values at both the run and library level. The run-level q-value is used as
  the PSM q-value.
- DIA-NN currently does not return precursor m/z values.
- DIA-NN currently does not support C-terminal modifications in its searches.

"""
|
||
from __future__ import annotations | ||
|
||
import csv | ||
import re | ||
from typing import Iterable, Optional | ||
|
||
from psm_utils.io._base_classes import ReaderBase | ||
from psm_utils.io._utils import set_csv_field_size_limit | ||
from psm_utils.psm import PSM | ||
from psm_utils.psm_list import PSMList | ||
|
||
set_csv_field_size_limit() | ||
|
||
# Column names that DIANNTSVReader copies into ``PSM.rescoring_features``; a column that is
# absent from a given row is skipped rather than raising.
RESCORING_FEATURES = [
    "RT",
    "Predicted.RT",
    "iRT",
    "Predicted.iRT",
    "Ms1.Profile.Corr",
    "Ms1.Area",
    "IM",
    "iIM",
    "Predicted.IM",
    "Predicted.iIM",
]
|
||
|
||
class DIANNTSVReader(ReaderBase):
    """Reader that converts rows of a DIA-NN '.tsv' report into PSM objects."""

    def __init__(self, filename, *args, **kwargs) -> None:
        """
        Reader for DIA-NN '.tsv' file.

        Parameters
        ----------
        filename : str or Path
            Path to PSM file.

        """
        super().__init__(filename, *args, **kwargs)
        self.filename = filename

    def __iter__(self) -> Iterable[PSM]:
        """Iterate over file and return PSMs one-by-one."""
        # newline="" leaves line-ending handling to the csv module, as its documentation
        # recommends for file objects passed to a reader.
        with open(self.filename, newline="") as msms_in:
            reader = csv.DictReader(msms_in, delimiter="\t")
            for row in reader:
                yield self._get_peptide_spectrum_match(row)

    def _get_peptide_spectrum_match(self, psm_dict) -> PSM:
        """
        Parse a single PSM from a DIA-NN PSM file.

        Parameters
        ----------
        psm_dict : dict
            Mapping of DIA-NN column name to value for one row.

        Returns
        -------
        PSM
            PSM built from the row; columns listed in ``RESCORING_FEATURES`` that are present
            in the row are copied to ``rescoring_features``.

        Raises
        ------
        KeyError
            If one of the columns read directly below is missing from the row.

        """
        # Only keep the rescoring features that are actually present in this row
        rescoring_features = {ft: psm_dict[ft] for ft in RESCORING_FEATURES if ft in psm_dict}

        return PSM(
            peptidoform=self._parse_peptidoform(
                psm_dict["Modified.Sequence"], psm_dict["Precursor.Charge"]
            ),
            spectrum_id=psm_dict["MS2.Scan"],
            run=psm_dict["Run"],
            is_decoy=False,
            qvalue=psm_dict["Q.Value"],
            pep=float(psm_dict["PEP"]),
            score=float(psm_dict["CScore"]),
            precursor_mz=None,  # Not returned by DIA-NN :(
            retention_time=float(psm_dict["RT"]),
            ion_mobility=float(psm_dict["IM"]),
            protein_list=psm_dict["Protein.Ids"].split(";"),
            source="diann",
            rank=None,
            provenance_data=({"diann_filename": str(self.filename)}),
            rescoring_features=rescoring_features,
            metadata={},
        )

    @staticmethod
    def _parse_peptidoform(peptide: str, charge: Optional[str]) -> str:
        """
        Convert a DIA-NN modified sequence into a peptidoform string.

        Parameters
        ----------
        peptide : str
            Modified sequence with modifications written as ``(UniMod:<id>)``.
        charge : str, optional
            Precursor charge; appended as ``/<charge>`` when truthy.

        Returns
        -------
        str
            Sequence with modifications rewritten as ``[UNIMOD:<id>]``, a hyphen after a
            leading (N-terminal) modification, and the optional charge suffix.

        """
        # Add charge
        if charge:
            peptide += f"/{int(float(charge))}"

        # Replace parentheses with square brackets and capitalize UniMod prefix
        pattern = r"\(UniMod:(\d+)\)"
        replacement = r"[UNIMOD:\1]"
        peptide = re.sub(pattern, replacement, peptide)

        # Add hyphen for N-terminal modifications
        # If [UNIMOD:n] occurs before the first amino acid, a hyphen is added before the first
        # amino acid. startswith (rather than indexing) keeps an empty sequence from raising.
        if peptide.startswith("["):
            # Hyphen after the closing bracket
            peptide = peptide.replace("]", "]-", 1)

        # C-terminal modifications are currently not supported in DIA-NN

        return peptide

    @classmethod
    def from_dataframe(cls, dataframe) -> PSMList:
        """Create a PSMList from a DIA-NN Pandas DataFrame."""
        # A single file-less reader is enough to parse every record
        reader = cls("")
        return PSMList(
            psm_list=[
                reader._get_peptide_spectrum_match(entry)
                for entry in dataframe.to_dict(orient="records")
            ]
        )
Oops, something went wrong.