Merge pull request #103 from compomics/diann-io

Add I/O support for some DIA search engines
compomics · Nov 6, 2024 · 133559a · 133559a
2 parents 27b8591 + 029ba11
commit 133559a
Show file tree

Hide file tree

Showing 14 changed files with 695 additions and 4 deletions.
diff --git a/README.rst b/README.rst
@@ -89,7 +89,10 @@ Supported file formats
 ===================================================================================================================== ======================== =============== ===============
  File format                                                                                                           psm_utils tag            Read support    Write support
 ===================================================================================================================== ======================== =============== ===============
+ `AlphaDIA precursors TSV <https://alphadia.readthedocs.io/en/latest/quickstart.html#output-files>`_                   ``alphadia``             ✅              ❌
+ `DIA-NN TSV <https://github.com/vdemichev/DiaNN#output>`_                                                             ``diann``                ✅              ❌
  `FlashLFQ generic TSV <https://github.com/smith-chem-wisc/FlashLFQ/wiki/Identification-Input-Formats>`_               ``flashlfq``             ✅              ✅
+ `FragPipe PSM TSV <https://fragpipe.nesvilab.org/docs/tutorial_fragpipe_outputs.html#psmtsv/>`_                       ``fragpipe``             ✅              ❌
  `ionbot CSV <https://ionbot.cloud/>`_                                                                                 ``ionbot``               ✅              ❌
  `OpenMS idXML <https://www.openms.de/>`_                                                                              ``idxml``                ✅              ✅
  `MaxQuant msms.txt <https://www.maxquant.org/>`_                                                                      ``msms``                 ✅              ❌
@@ -99,10 +102,10 @@ Supported file formats
  `Peptide Record <https://psm-utils.readthedocs.io/en/stable/api/psm_utils.io/#module-psm_utils.io.peptide_record>`_   ``peprec``               ✅              ✅
  `pepXML <http://tools.proteomecenter.org/wiki/index.php?title=Formats:pepXML>`_                                       ``pepxml``               ✅              ❌
  `Percolator tab <https://github.com/percolator/percolator/wiki/Interface>`_                                           ``percolator``           ✅              ✅
- Proteome Discoverer MSF                                                                                               ``proteome_discoverer``  ✅              ❌
+ `Proteome Discoverer MSF <#>`_                                                                                        ``proteome_discoverer``  ✅              ❌
  `Sage Parquet <https://github.com/lazear/sage/blob/v0.14.7/DOCS.md#interpreting-sage-output>`_                        ``sage_parquet``         ✅              ❌
  `Sage TSV <https://github.com/lazear/sage/blob/v0.14.7/DOCS.md#interpreting-sage-output>`_                            ``sage_tsv``             ✅              ❌
- ProteoScape Parquet                                                                                                   ``proteoscape``          ✅              ❌
+ `ProteoScape Parquet <#>`_                                                                                            ``proteoscape``          ✅              ❌
  `TSV <https://psm-utils.readthedocs.io/en/stable/api/psm_utils.io/#module-psm_utils.io.tsv>`_                         ``tsv``                  ✅              ✅
  `X!Tandem XML <https://www.thegpm.org/tandem/>`_                                                                      ``xtandem``              ✅              ❌
 ===================================================================================================================== ======================== =============== ===============

diff --git a/docs/source/api/psm_utils.io.rst b/docs/source/api/psm_utils.io.rst
@@ -7,6 +7,22 @@ psm_utils.io
 
 
 
+psm_utils.io.alphadia
+#####################
+
+.. automodule:: psm_utils.io.alphadia
+   :members:
+   :inherited-members:
+
+
+psm_utils.io.diann
+##################
+
+.. automodule:: psm_utils.io.diann
+   :members:
+   :inherited-members:
+
+
 psm_utils.io.flashlfq
 #####################
 
@@ -15,6 +31,14 @@ psm_utils.io.flashlfq
    :inherited-members:
 
 
+psm_utils.io.fragpipe
+#####################
+
+.. automodule:: psm_utils.io.fragpipe
+   :members:
+   :inherited-members:
+
+
 psm_utils.io.idxml
 ##################
 

diff --git a/example_files/alphadia.precursors.tsv b/example_files/alphadia.precursors.tsv
@@ -0,0 +1,4 @@
+base_width_mobility	base_width_rt	rt_observed	mobility_observed	mono_ms1_intensity	top_ms1_intensity	sum_ms1_intensity	weighted_ms1_intensity	weighted_mass_deviation	weighted_mass_error	mz_observed	mono_ms1_height	top_ms1_height	sum_ms1_height	weighted_ms1_height	isotope_intensity_correlation	isotope_height_correlation	n_observations	intensity_correlation	height_correlation	intensity_fraction	height_fraction	intensity_fraction_weighted	height_fraction_weighted	mean_observation_score	sum_b_ion_intensity	sum_y_ion_intensity	diff_b_y_ion_intensity	f_masked	fragment_scan_correlation	template_scan_correlation	fragment_frame_correlation	top3_frame_correlation	template_frame_correlation	top3_b_ion_correlation	n_b_ions	top3_y_ion_correlation	n_y_ions	cycle_fwhm	mobility_fwhm	delta_frame_peak	top_3_ms2_mass_error	mean_ms2_mass_error	n_overlapping	mean_overlapping_intensity	mean_overlapping_mass_error	precursor_idx	rank	frame_center	scan_center	score	elution_group_idx	frame_start	scan_stop	frame_stop	scan_start	proteins	rt_calibrated	flat_frag_start_idx	charge	mods	decoy	sequence	mz_library	channel	genes	i_0	flat_frag_stop_idx	i_2	i_1	i_3	mobility_library	rt_library	mod_sites	delta_rt	n_K	n_R	n_P	_decoy	proba	qval	_candidate_idx	valid	candidate_idx	run	mod_seq_hash	mod_seq_charge_hash	pg_master	pg	pg_qval	intensity
+0.000000	40.673340	2800.518555	0.000001	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	894.337830	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	2.000000	0.968887	0.845673	1.000000	1.000000	1.000000	1.000000	0.000000	0.000000	15.266385	-15.266385	1.000000	0.000000	0.000000	0.929785	0.975279	0.000000	0.000000	0.000000	0.948546	12.000000	14.244627	0.000000	-0.500000	0.132713	-0.218829	0.000000	0.000000	0.000000	10447876	0	72329	0	136.160126	5238821	71876	1	72933	0	P18899	2347.609131	59818105	3		0	SSYGSSSNDDSYGSSNNDDSYGSSNK	894.337830	0	DDR48_YEAST	0.273118	59818117	0.249391	0.348172	0.129319	0.948457	1399.216187		452.909424	1	0	0	0.000000	0.000000	0.000000	10447876	True	10447876	LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_01	8562405370847133435	8562405370847133438	P18899	P18899	0.000000	190103852.035206
+0.000000	40.745483	1647.208252	0.000001	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	986.440491	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	2.000000	0.991654	0.992141	1.000000	1.000000	1.000000	1.000000	0.000000	0.000000	14.408463	-14.408463	1.000000	0.000000	0.000000	0.738752	0.974915	0.000000	0.000000	0.000000	0.880488	12.000000	9.885651	0.000000	0.000000	-0.391579	-0.698411	0.000000	0.000000	0.000000	8793636	0	42431	0	122.278320	4411698	41978	1	43035	0	Q9ULU4	1670.462402	49907897	2		0	SSQGSSSSTQSAPSETASASK	986.440491	0	PKCB1_HUMAN	0.380560	49907909	0.190793	0.352861	0.075786	1.158085	387.834503		-23.254150	1	0	1	0.000000	0.000000	0.000000	8793636	True	8793636	LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_01	5824087303549386971	5824087303549386973	Q9ULU4	Q9ULU4	0.000000	195496849.073322
+0.000000	52.349121	2678.317139	0.000001	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	905.432312	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	2.000000	0.986449	0.931379	1.000000	1.000000	1.000000	1.000000	0.000000	0.000000	16.636572	-16.636572	1.000000	0.000000	0.000000	0.978579	0.996334	0.000000	0.000000	0.000000	0.988605	12.000000	13.867673	0.000000	0.000000	-0.432777	0.780247	0.000000	0.000000	0.000000	7132549	0	69158	0	152.012512	3581144	68554	1	69913	0	O60763	2646.791260	39980635	2		0	SSQTSGTNEQSSAIVSAR	905.432312	0	USO1_HUMAN	0.404900	39980647	0.177361	0.352328	0.065410	1.110423	1774.035034		31.525879	0	1	0	0.000000	0.000000	0.000000	7132549	True	7132549	LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_01	14912031975374993231	14912031975374993233	O60763	O60763	0.000000	406414129.849395
diff --git a/example_files/fragpipe.psm.tsv b/example_files/fragpipe.psm.tsv
@@ -0,0 +1,6 @@
+Spectrum	Spectrum File	Peptide	Modified Peptide	Extended Peptide	Prev AA	Next AA	Peptide Length	Charge	Retention	Observed Mass	Calibrated Observed Mass	Observed M/Z	Calibrated Observed M/Z	Calculated Peptide Mass	Calculated M/Z	Delta Mass	Expectation	Hyperscore	Nextscore	Probability	Number of Enzymatic Termini	Number of Missed Cleavages	Protein Start	Protein End	Intensity	Assigned Modifications	Observed Modifications	Purity	Is Unique	Protein	Protein ID	Entry Name	Gene	Protein Description	Mapped Genes	Mapped Proteins
+LFQ_Orbitrap_AIF_Human_01.100000.100000.0	D:\test\interact-LFQ_Orbitrap_AIF_Human_01_rank1.pep.xml	SEDCFILDHGK	SEDCFILDHGK	PFAQGAIK.SEDCFILDHGK.DGKIFVWK	K	D	11	3	3813.8638	1319.5804	1319.5807	440.8674	440.8675	1319.5815	440.8678	-0.0008	0.01264961000000	19.3701	15.5657	0.9968	2	0	328	338	0.0000	4C(57.0214)		0.00	false	sp|GELS_HUMAN|	GELS_HUMAN				GSN	sp|P06396|GELS_HUMAN
+LFQ_Orbitrap_AIF_Human_01.100002.100002.0	D:\test\interact-LFQ_Orbitrap_AIF_Human_01_rank1.pep.xml	FLLEAGADQEHK		KGHIEMVR.FLLEAGADQEHK.TDEMHTAI	R	T	12	3	3813.9346	1356.6663	1356.6665	453.2294	453.2294	1356.6672	453.2297	-0.0007	0.01950739000000	18.9370	14.3831	0.9985	2	0	419	430	0.0000			0.00	false	sp|O75179|ANR17_HUMAN	O75179	ANR17_HUMAN	ANKRD17	Ankyrin repeat domain-containing protein 17	ANKHD1	sp|Q8IWZ3|ANKH1_HUMAN
+LFQ_Orbitrap_AIF_Human_01.100004.100004.0	D:\test\interact-LFQ_Orbitrap_AIF_Human_01_rank1.pep.xml	REELSNVLAAMR	REELSNVLAAM[147]R	THIRAKRK.REELSNVLAAMR.KAAAKKD	K	K	12	3	3814.0050	1403.7197	1403.7198	468.9138	468.9139	1403.7190	468.9136	0.0008	0.00008879724000	24.3292	15.9192	0.9998	2	1	87	98	0.0000	11M(15.9949)		0.00	true	sp|Q9Y3U8|RL36_HUMAN	Q9Y3U8	RL36_HUMAN	RPL36	Large ribosomal subunit protein eL36
+LFQ_Orbitrap_AIF_Human_01.100040.100040.0	D:\test\interact-LFQ_Orbitrap_AIF_Human_01_rank1.pep.xml	LHISPSNMTNQNTPEYMEK	LHISPSNM[147]TNQNTPEYMEKc[17]	EYFGPDFK.LHISPSNMTNQNTPEYMEK.IKQRIFEN	K	I	19	3	3815.4023	2248.0256	2248.0251	750.3491	750.3490	2248.0254	750.3491	-0.0002	0.00194418200000	21.2429	21.2429	0.7143	2	0	344	362	0.0000	8M(15.9949), C-term(-0.9840)		0.00	true	sp|Q92769|HDAC2_HUMAN	Q92769	HDAC2_HUMAN	HDAC2	Histone deacetylase 2
+LFQ_Orbitrap_AIF_Human_01.101373.101373.0	D:\test\interact-LFQ_Orbitrap_AIF_Human_01_rank1.pep.xml	ANIAVQR	n[43]ANIAVQR	.ANIAVQR.IKREFKEV	M	I	7	2	3866.1475	812.4501	812.4503	407.2323	407.2324	812.4505	407.2325	-0.0002	0.11090580000000	17.1991	14.1196	0.9898	2	0	2	8	0.0000	N-term(42.0106)		0.00	true	sp|P61086|UBE2K_HUMAN	P61086	UBE2K_HUMAN	UBE2K	Ubiquitin-conjugating enzyme E2 K
diff --git a/psm_utils/io/__init__.py b/psm_utils/io/__init__.py
@@ -8,7 +8,10 @@
 
 from rich.progress import track
 
+import psm_utils.io.alphadia as alphadia
+import psm_utils.io.diann as diann
 import psm_utils.io.flashlfq as flashlfq
+import psm_utils.io.fragpipe as fragpipe
 import psm_utils.io.idxml as idxml
 import psm_utils.io.ionbot as ionbot
 import psm_utils.io.maxquant as maxquant
@@ -113,13 +116,31 @@
         "extension": ".parquet",
         "filename_pattern": r"^.*(?:_|\.)sage.parquet$",
     },
-    "parquet": {  # List after proteoscape and sage to avoid extension matching conflicts
+    "fragpipe": {
+        "reader": fragpipe.FragPipeReader,
+        "writer": None,
+        "extension": ".tsv",
+        "filename_pattern": r"^.*(?:_|\.)?psm\.tsv$",
+    },
+    "alphadia": {
+        "reader": alphadia.AlphaDIAReader,
+        "writer": None,
+        "extension": ".tsv",
+        "filename_pattern": r"^.*(?:_|\.)?precursors\.tsv$",
+    },
+    "diann": {
+        "reader": diann.DIANNTSVReader,
+        "writer": None,
+        "extension": ".tsv",
+        "filename_pattern": r"^.*(?:_|\.)?diann\.tsv$",
+    },
+    "parquet": {  # List after more specific Parquet patterns to avoid matching conflicts
         "reader": parquet.ParquetReader,
         "writer": parquet.ParquetWriter,
         "extension": ".parquet",
         "filename_pattern": r"^.*\.parquet$",
     },
-    "tsv": {  # List after sage to avoid extension matching conflicts
+    "tsv": {  # List after more specific TSV patterns to avoid matching conflicts
         "reader": tsv.TSVReader,
         "writer": tsv.TSVWriter,
         "extension": ".tsv",

diff --git a/psm_utils/io/alphadia.py b/psm_utils/io/alphadia.py
@@ -0,0 +1,112 @@
+"""Reader for PSM files from the AlphaDIA search engine."""
+
+from __future__ import annotations
+
+import csv
+from abc import ABC
+from typing import Iterable, Optional
+
+from psm_utils.io._base_classes import ReaderBase
+from psm_utils.io._utils import set_csv_field_size_limit
+from psm_utils.psm import PSM
+from psm_utils.psm_list import PSMList
+
+set_csv_field_size_limit()
+
+# TODO: check
+# AlphaDIA column names that are copied into each PSM's ``rescoring_features`` when the
+# column is present in the parsed row; missing columns are skipped.
+RESCORING_FEATURES = [
+    "rt_observed",
+    "mobility_observed",
+    "mz_observed",
+    "charge",
+    "delta_rt",
+]
+
+
+class AlphaDIAReader(ReaderBase, ABC):
+    def __init__(self, filename, *args, **kwargs):
+        """
+        Reader for AlphaDIA ``precursors.tsv`` file.
+
+        Parameters
+        ----------
+        filename : str or Path
+            Path to PSM file.
+
+        """
+        super().__init__(filename, *args, **kwargs)
+        self.filename = filename
+
+    def __iter__(self) -> Iterable[PSM]:
+        """Iterate over file and return PSMs one-by-one."""
+        # newline="" leaves line-ending handling to the csv module, as its docs recommend
+        with open(self.filename, newline="") as msms_in:
+            reader = csv.DictReader(msms_in, delimiter="\t")
+            for row in reader:
+                yield self._get_peptide_spectrum_match(row)
+
+    def _get_peptide_spectrum_match(self, psm_dict) -> PSM:
+        """
+        Parse a single PSM from an AlphaDIA PSM file.
+
+        Parameters
+        ----------
+        psm_dict : dict
+            One row of the file, mapping column names to values.
+
+        Returns
+        -------
+        PSM
+
+        Raises
+        ------
+        KeyError
+            If a required column (e.g. ``sequence``, ``run``, ``score``) is missing.
+
+        """
+        # Rescoring feature columns are optional: only copy the ones present in this row
+        rescoring_features = {ft: psm_dict[ft] for ft in RESCORING_FEATURES if ft in psm_dict}
+
+        return PSM(
+            peptidoform=self._parse_peptidoform(
+                psm_dict["sequence"], psm_dict["mods"], psm_dict["mod_sites"], psm_dict["charge"]
+            ),
+            spectrum_id=psm_dict["frame_start"],  # TODO: needs to be checked
+            run=psm_dict["run"],
+            spectrum=psm_dict["frame_start"],  # TODO: needs to be checked
+            is_decoy=bool(int(psm_dict["decoy"])),
+            score=psm_dict["score"],
+            qvalue=psm_dict["qval"],
+            pep=psm_dict["proba"],
+            precursor_mz=psm_dict["mz_observed"],
+            retention_time=psm_dict["rt_observed"],
+            ion_mobility=psm_dict["mobility_observed"],
+            protein_list=psm_dict["proteins"].split(";"),
+            rank=int(psm_dict["rank"]) + 1,  # AlphaDIA ranks are 0-based
+            source="AlphaDIA",
+            provenance_data=({"alphadia_filename": str(self.filename)}),
+            metadata={},
+            rescoring_features=rescoring_features,
+        )
+
+    @staticmethod
+    def _parse_peptidoform(
+        sequence: str, mods: str, mod_sites: str, charge: Optional[str]
+    ) -> str:
+        """
+        Parse a peptidoform from an AlphaDIA PSM file.
+
+        Parameters
+        ----------
+        sequence : str
+            Unmodified amino acid sequence.
+        mods : str
+            Semicolon-separated modifications; only the part before ``@`` is used as the
+            modification name. Empty if the peptide is unmodified.
+        mod_sites : str
+            Semicolon-separated integer positions, paired in order with ``mods``: ``0`` is the
+            N-terminus, ``-1`` the C-terminus, any other value a 1-based residue position.
+        charge : str, optional
+            Precursor charge; appended as a ``/charge`` suffix when not empty.
+
+        Returns
+        -------
+        str
+            Sequence with modification names in square brackets and optional charge suffix.
+
+        Examples
+        --------
+        >>> AlphaDIAReader._parse_peptidoform("ACDE", "Carbamidomethyl@C", "2", "2")
+        'AC[Carbamidomethyl]DE/2'
+
+        """
+        # Parse modifications
+        if mods:
+            # Empty first and last slots hold the terminal modifications, which also makes
+            # the 1-based residue positions line up with the list indices
+            sequence_list = [""] + list(sequence) + [""]  # N-term, sequence, C-term
+            for mod, site in zip(mods.split(";"), mod_sites.split(";")):
+                site = int(site)
+                name = mod.split("@")[0]
+                # N-terminal modification
+                if site == 0:
+                    sequence_list[0] = f"[{name}]-"
+                # C-terminal modification
+                elif site == -1:
+                    sequence_list[-1] = f"-[{name}]"
+                # Sequence modification
+                else:
+                    sequence_list[site] = f"{sequence_list[site]}[{name}]"
+            sequence = "".join(sequence_list)
+
+        # Add charge
+        if charge:
+            sequence += f"/{int(float(charge))}"
+
+        return sequence
+
+    @classmethod
+    def from_dataframe(cls, dataframe) -> PSMList:
+        """Create a PSMList from an AlphaDIA Pandas DataFrame."""
+        # A single placeholder reader (empty filename) is enough to parse every row
+        reader = cls("")
+        return PSMList(
+            psm_list=[
+                reader._get_peptide_spectrum_match(entry)
+                for entry in dataframe.to_dict(orient="records")
+            ]
+        )
diff --git a/psm_utils/io/diann.py b/psm_utils/io/diann.py
@@ -0,0 +1,125 @@
+"""
+Reader for PSM files from DIA-NN
+
+Reads the '.tsv' file as defined on the
+`DIA-NN documentation page <https://github.com/vdemichev/DiaNN/tree/1.8.1?tab=readme-ov-file#main-output-reference>`_.
+
+Notes
+-----
+
+- DIA-NN calculates q-values at both the run and library level. The run-level q-value is used as
+  the PSM q-value.
+- DIA-NN currently does not return precursor m/z values.
+- DIA-NN currently does not support C-terminal modifications in its searches.
+
+"""
+
+from __future__ import annotations
+
+import csv
+import re
+from typing import Iterable, Optional
+
+from psm_utils.io._base_classes import ReaderBase
+from psm_utils.io._utils import set_csv_field_size_limit
+from psm_utils.psm import PSM
+from psm_utils.psm_list import PSMList
+
+set_csv_field_size_limit()
+
+# DIA-NN column names that are copied into each PSM's ``rescoring_features`` when the
+# column is present in the parsed row; missing columns are skipped.
+RESCORING_FEATURES = [
+    "RT",
+    "Predicted.RT",
+    "iRT",
+    "Predicted.iRT",
+    "Ms1.Profile.Corr",
+    "Ms1.Area",
+    "IM",
+    "iIM",
+    "Predicted.IM",
+    "Predicted.iIM",
+]
+
+
+class DIANNTSVReader(ReaderBase):
+    def __init__(self, filename, *args, **kwargs) -> None:
+        """
+        Reader for DIA-NN '.tsv' file.
+
+        Parameters
+        ----------
+        filename : str or Path
+            Path to PSM file.
+
+        """
+        super().__init__(filename, *args, **kwargs)
+        self.filename = filename
+
+    def __iter__(self) -> Iterable[PSM]:
+        """Iterate over file and return PSMs one-by-one."""
+        # newline="" leaves line-ending handling to the csv module, as its docs recommend
+        with open(self.filename, newline="") as msms_in:
+            reader = csv.DictReader(msms_in, delimiter="\t")
+            for row in reader:
+                yield self._get_peptide_spectrum_match(row)
+
+    def _get_peptide_spectrum_match(self, psm_dict) -> PSM:
+        """
+        Parse a single PSM from a DIA-NN PSM file.
+
+        Parameters
+        ----------
+        psm_dict : dict
+            One row of the file, mapping column names to values.
+
+        Returns
+        -------
+        PSM
+
+        Raises
+        ------
+        KeyError
+            If a required column (e.g. ``Modified.Sequence``, ``Run``, ``Q.Value``) is missing.
+
+        """
+        # Rescoring feature columns are optional: only copy the ones present in this row
+        rescoring_features = {ft: psm_dict[ft] for ft in RESCORING_FEATURES if ft in psm_dict}
+
+        return PSM(
+            peptidoform=self._parse_peptidoform(
+                psm_dict["Modified.Sequence"], psm_dict["Precursor.Charge"]
+            ),
+            spectrum_id=psm_dict["MS2.Scan"],
+            run=psm_dict["Run"],
+            is_decoy=False,  # No decoy column is read; every PSM is marked as target
+            qvalue=float(psm_dict["Q.Value"]),  # Run-level q-value
+            pep=float(psm_dict["PEP"]),
+            score=float(psm_dict["CScore"]),
+            precursor_mz=None,  # Not returned by DIA-NN :(
+            retention_time=float(psm_dict["RT"]),
+            ion_mobility=float(psm_dict["IM"]),
+            protein_list=psm_dict["Protein.Ids"].split(";"),
+            source="diann",
+            rank=None,
+            provenance_data=({"diann_filename": str(self.filename)}),
+            rescoring_features=rescoring_features,
+            metadata={},
+        )
+
+    @staticmethod
+    def _parse_peptidoform(peptide: str, charge: Optional[str]) -> str:
+        """
+        Convert a DIA-NN modified sequence into a peptidoform string.
+
+        Parameters
+        ----------
+        peptide : str
+            Modified sequence with modifications written as ``(UniMod:n)``.
+        charge : str, optional
+            Precursor charge; appended as a ``/charge`` suffix when not empty.
+
+        Returns
+        -------
+        str
+            Sequence with ``[UNIMOD:n]`` modifications, a hyphen after a leading (N-terminal)
+            modification, and the optional charge suffix.
+
+        Examples
+        --------
+        >>> DIANNTSVReader._parse_peptidoform("(UniMod:1)AC(UniMod:4)DE", "2")
+        '[UNIMOD:1]-AC[UNIMOD:4]DE/2'
+
+        """
+        # Add charge
+        if charge:
+            peptide += f"/{int(float(charge))}"
+
+        # Replace parentheses with square brackets and capitalize UniMod prefix
+        pattern = r"\(UniMod:(\d+)\)"
+        replacement = r"[UNIMOD:\1]"
+        peptide = re.sub(pattern, replacement, peptide)
+
+        # Add hyphen for N-terminal modifications
+        # If [UNIMOD:n] occurs before the first amino acid, a hyphen is added before the first
+        # amino acid. ``startswith`` is used so an empty sequence does not raise an IndexError.
+        if peptide.startswith("["):
+            # Hyphen after the closing bracket
+            peptide = peptide.replace("]", "]-", 1)
+
+        # C-terminal modifications are currently not supported in DIA-NN
+
+        return peptide
+
+    @classmethod
+    def from_dataframe(cls, dataframe) -> PSMList:
+        """Create a PSMList from a DIA-NN Pandas DataFrame."""
+        # A single placeholder reader (empty filename) is enough to parse every row
+        reader = cls("")
+        return PSMList(
+            psm_list=[
+                reader._get_peptide_spectrum_match(entry)
+                for entry in dataframe.to_dict(orient="records")
+            ]
+        )