Skip to content

Commit

Permalink
Merge pull request #96 from compomics/feature/flashlfq-output
Browse files Browse the repository at this point in the history
Add reader and writer for FlashLFQ
  • Loading branch information
RalfG authored Sep 5, 2024
2 parents 83dff49 + 12b1540 commit 565934a
Show file tree
Hide file tree
Showing 8 changed files with 374 additions and 2 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,12 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## Unreleased
## [1.1.0] - 2024-09-05

### Added

- `Peptidoform`: Add `modified_sequence` property to return the modified sequence in ProForma format, but without charge state.
- `io`: Add support for reading and writing FlashLFQ generic TSV files.


## [1.0.1] - 2024-08-28
Expand Down
1 change: 1 addition & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ Supported file formats
===================================================================================================================== ======================== =============== ===============
File format psm_utils tag Read support Write support
===================================================================================================================== ======================== =============== ===============
`FlashLFQ generic TSV <https://github.com/smith-chem-wisc/FlashLFQ/wiki/Identification-Input-Formats>`_ ``flashlfq`` ✅ ✅
`ionbot CSV <https://ionbot.cloud/>`_ ``ionbot`` ✅ ❌
`OpenMS idXML <https://www.openms.de/>`_ ``idxml`` ✅ ✅
`MaxQuant msms.txt <https://www.maxquant.org/>`_ ``msms`` ✅ ❌
Expand Down
8 changes: 8 additions & 0 deletions docs/source/api/psm_utils.io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,14 @@ psm_utils.io



psm_utils.io.flashlfq
##################

.. automodule:: psm_utils.io.flashlfq
:members:
:inherited-members:


psm_utils.io.idxml
##################

Expand Down
18 changes: 18 additions & 0 deletions example_files/example.flashlfq.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
File Name Scan Retention Time Precursor Charge Base Sequence Full Sequence Peptide Monoisotopic Mass Protein Accession
SmallCalibratible_Yeast 24.80555 2 KAPAGGAADAAAK KAPAGGAADAAAK
SmallCalibratible_Yeast 24.95372 2 KAPAAAPAASK KAPAAAPAASK
SmallCalibratible_Yeast 24.77032 2 KQAIETANK KQAIETANK
SmallCalibratible_Yeast 24.17319 2 RVDEGGAQDK RVDEGGAQDK
SmallCalibratible_Yeast 24.26695 2 KDAEPQSDSTTSK KDAEPQSDSTTSK
SmallCalibratible_Yeast 24.10798 2 EKAEAEAEK EKAEAEAEK
SmallCalibratible_Yeast 24.06874 2 EKAEAEAEK EKAEAEAEK
SmallCalibratible_Yeast 24.77398 2 FKEEDEKESQR FKEEDEKESQR
SmallCalibratible_Yeast 24.90638 2 YDHEASSSYK YDHEASSSYK
SmallCalibratible_Yeast 24.40345 3 SKDVTDSATTKK SKDVTDSATTKK
SmallCalibratible_Yeast 24.71679 2 FKEEDEKESQR FKEEDEKESQR
SmallCalibratible_Yeast 24.39968 2 ALKQEGAANK ALKQEGAANK
SmallCalibratible_Yeast 24.67303 2 SKDVTDSATTK SKDVTDSATTK
SmallCalibratible_Yeast 24.45053 2 KLEDHPK KLEDHPK
SmallCalibratible_Yeast 24.77398 1 HIDAGAK HIDAGAK
SmallCalibratible_Yeast 24.9022 2 YLAKEEEKK YLAKEEEKK
SmallCalibratible_Yeast 24.76278 2 YAGEVSHDDK YAGEVSHDDK
2 changes: 1 addition & 1 deletion psm_utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Common utilities for parsing and handling PSMs, and search engine results."""

__version__ = "1.0.1"
__version__ = "1.1.0"
__all__ = ["Peptidoform", "PSM", "PSMList"]

from warnings import filterwarnings
Expand Down
7 changes: 7 additions & 0 deletions psm_utils/io/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from rich.progress import track

import psm_utils.io.flashlfq as flashlfq
import psm_utils.io.idxml as idxml
import psm_utils.io.ionbot as ionbot
import psm_utils.io.maxquant as maxquant
Expand All @@ -28,6 +29,12 @@
from psm_utils.psm_list import PSMList

FILETYPES = {
"flashlfq": {
"reader": flashlfq.FlashLFQReader,
"writer": flashlfq.FlashLFQWriter,
"extension": ".tsv",
"filename_pattern": r"^.*\.flashlfq\.tsv$",
},
"ionbot": {
"reader": ionbot.IonbotReader,
"writer": None,
Expand Down
228 changes: 228 additions & 0 deletions psm_utils/io/flashlfq.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,228 @@
"""
Reader and writer for the FlashLFQ generic TSV format.
See the `FlashLFQ documentation <https://github.com/smith-chem-wisc/FlashLFQ/wiki/Identification-Input-Formats>`_
for more information on the format.
Notes
-----
- The FlashLFQ format does not contain the actual spectrum identifier. When reading a FlashLFQ
file, the spectrum identifier is set to the row number in the file.
- The FlashLFQ format does not contain the precursor m/z, but the theoretical monoisotopic mass.
This value is not read into the PSM object, but can be calculated from the peptidoform.
- To read from a FlashLFQ file, the ``Full Sequence`` column is expected to contain a ProForma v2
compatible peptidoform notation.
"""

from __future__ import annotations

import csv
import logging
from pathlib import Path
from typing import Optional, Union

import numpy as np

from psm_utils.io._base_classes import ReaderBase, WriterBase
from psm_utils.io._utils import set_csv_field_size_limit
from psm_utils.io.exceptions import PSMUtilsIOException
from psm_utils.psm import PSM
from psm_utils.psm_list import PSMList

set_csv_field_size_limit()

LOGGER = logging.getLogger(__name__)


class FlashLFQReader(ReaderBase):
"""Reader for FlashLFQ TSV format."""

required_columns = ["Full Sequence", "Precursor Charge"]

def __iter__(self):
"""Iterate over file and return PSMs one-by-one."""
with open(self.filename, "rt") as open_file:
reader = csv.DictReader(open_file, delimiter="\t")
if not all(col in reader.fieldnames for col in self.required_columns):
raise PSMUtilsIOException(
f"FlashLFQ TSV file must contain the following columns: {self.required_columns}"
)
for i, row in enumerate(reader):
yield self._parse_entry(row, spectrum_id=str(i))

def _parse_entry(self, entry: dict, spectrum_id) -> PSM:
"""Parse single FlashLFQ TSV entry to :py:class:`~psm_utils.psm.PSM`."""
# Replace empty strings with None
entry = {k: v if v else None for k, v in entry.items()}

# Parse entry
return PSM(
peptidoform=f"{entry['Full Sequence']}/{entry['Precursor Charge']}",
spectrum_id=spectrum_id,
run=entry.get("File Name"),
retention_time=entry.get("Scan Retention Time"),
protein_list=self._parse_protein_list(entry.get("Protein Accessions")),
)

@staticmethod
def _parse_protein_list(protein_accessions: Optional[str]) -> list[str]:
"""Parse protein list string to list of protein accessions."""
if not protein_accessions:
return []
elif ";" in protein_accessions: # Docs define separator as semicolon
return protein_accessions.split(";")
elif "|" in protein_accessions: # Example file uses pipe
return protein_accessions.split("|")
else:
return [protein_accessions] # Single protein


class FlashLFQWriter(WriterBase):
"""Reader for FlashLFQ TSV format."""

def __init__(
self,
filename: Union[str, Path],
*args,
fdr_threshold: float = 0.01,
only_targets: bool = True,
**kwargs,
):
"""
Reader for psm_utils TSV format.
Parameters
----------
filename
Path to PSM file.
fdr_threshold
FDR threshold for filtering PSMs.
only_targets
If True, only target PSMs are written to file. If False, both target and decoy PSMs
are written.
"""
super().__init__(filename, *args, **kwargs)

self.fdr_threshold = fdr_threshold
self.only_targets = only_targets

self._open_file = None
self._writer = None
self.fieldnames = None

def __enter__(self) -> FlashLFQWriter:
if Path(self.filename).is_file():
# Get fieldnames from existing file
with open(self.filename, "rt") as open_file:
# Get fieldnames
self.fieldnames = open_file.readline().strip().split("\t")
mode = "at"
else:
# Set default fieldnames
self.fieldnames = [
"File Name",
"Base Sequence",
"Full Sequence",
"Peptide Monoisotope Mass",
"Scan Retention Time",
"Precursor Charge",
"Protein Accessions",
]
mode = "wt"

# Open file and writer
self._open_file = open(self.filename, mode, newline="")
self._writer = csv.DictWriter(
self._open_file,
fieldnames=self.fieldnames,
extrasaction="ignore",
delimiter="\t",
)

if mode == "wt":
self._writer.writeheader()

return self

def __exit__(self, *args, **kwargs) -> None:
self._open_file.close()
self._open_file = None
self._writer = None

def write_psm(self, psm: PSM):
"""
Write a single PSM to new or existing PSM file.
Parameters
----------
psm
PSM object to write.
"""
if psm.qvalue and psm.qvalue > self.fdr_threshold:
return
if self.only_targets and psm.is_decoy:
return

entry = self._psm_to_entry(psm)
try:
self._writer.writerow(entry)
except AttributeError as e:
raise PSMUtilsIOException(
f"`write_psm` method can only be called if `{self.__class__.__qualname__}`"
"is opened in context (i.e., using the `with` statement)."
) from e

def write_file(self, psm_list: PSMList):
"""
Write an entire PSMList to a new PSM file.
Parameters
----------
psm_list
PSMList object to write to file.
"""
# Filter out decoys
if self.only_targets:
# Accept both None and False
target_mask = np.array([not psm.is_decoy for psm in psm_list])
LOGGER.debug(f"Skipping {~target_mask.sum()} decoy PSMs for FlashLFQ file.")
else:
target_mask = np.ones(len(psm_list), dtype=bool)

# Filter out PSMs above FDR threshold
if any(psm.qvalue is None for psm in psm_list):
LOGGER.warning(
"Not all PSMs have a q-value. Skipping FDR filtering for FlashLFQ file."
)
fdr_mask = np.ones(len(psm_list), dtype=bool)
else:
fdr_mask = psm_list["qvalue"] <= self.fdr_threshold
filtered_by_fdr = (~fdr_mask & target_mask).sum()
LOGGER.debug(f"Skipping {filtered_by_fdr} PSMs above FDR threshold for FlashLFQ file.")

filtered_psm_list = psm_list[target_mask & fdr_mask]

with open(self.filename, "wt", newline="") as f:
writer = csv.DictWriter(
f, fieldnames=self.fieldnames, delimiter="\t", extrasaction="ignore"
)
writer.writeheader()
for psm in filtered_psm_list:
writer.writerow(self._psm_to_entry(psm))

@staticmethod
def _psm_to_entry(psm: PSM) -> dict:
"""Convert :py:class:`~psm_utils.psm.PSM` to FlashLFQ TSV entry."""
return {
"File Name": psm.run,
"Base Sequence": psm.peptidoform.sequence,
"Full Sequence": psm.peptidoform.modified_sequence,
"Peptide Monoisotope Mass": psm.peptidoform.theoretical_mass,
"Scan Retention Time": psm.retention_time,
"Precursor Charge": psm.peptidoform.precursor_charge,
"Protein Accessions": ";".join(psm.protein_list),
}
Loading

0 comments on commit 565934a

Please sign in to comment.