From abd1eea41625cfc3e12deb28848f894aa63e6ded Mon Sep 17 00:00:00 2001 From: RalfG Date: Fri, 25 Aug 2023 14:40:17 +0200 Subject: [PATCH 01/13] - Fixed: io.Percolator: Allow rescoring features that are not in feature names (dictwriter extrasaction) - Changed: Make io reader read_file method inheritable. --- psm_utils/io/_base_classes.py | 5 ++--- psm_utils/io/idxml.py | 8 +------- psm_utils/io/maxquant.py | 28 ++++++------------------- psm_utils/io/msamanda.py | 13 ++---------- psm_utils/io/mzid.py | 4 ---- psm_utils/io/peptide_record.py | 28 ++++++------------------- psm_utils/io/percolator.py | 37 +++++++++------------------------- psm_utils/io/sage.py | 13 ++++-------- psm_utils/io/tsv.py | 16 +++++---------- psm_utils/io/xtandem.py | 12 +---------- 10 files changed, 37 insertions(+), 127 deletions(-) diff --git a/psm_utils/io/_base_classes.py b/psm_utils/io/_base_classes.py index 60a5a00..f4c3cfc 100644 --- a/psm_utils/io/_base_classes.py +++ b/psm_utils/io/_base_classes.py @@ -41,11 +41,9 @@ def __exit__(self, *args, **kwargs): def __iter__(self): raise NotImplementedError() - @abstractmethod def read_file(self) -> PSMList: """Read full PSM file into a PSMList object.""" - raise NotImplementedError() - + return PSMList(psm_list=[psm for psm in self.__iter__()]) class WriterBase(ABC): """Abstract base class for PSM file writers.""" @@ -63,6 +61,7 @@ def __exit__(self, *args, **kwargs): @abstractmethod def write_psm(self, psm: PSM): """Write a single PSM to the PSM file.""" + raise NotImplementedError() @abstractmethod def write_file(self, psm_list: PSMList): diff --git a/psm_utils/io/idxml.py b/psm_utils/io/idxml.py index b02ba99..fced5ca 100644 --- a/psm_utils/io/idxml.py +++ b/psm_utils/io/idxml.py @@ -35,10 +35,6 @@ def __iter__(self): for peptide_hit in entry["PeptideHit"]: yield self._parse_psm(entry, peptide_hit) - def read_file(self) -> PSMList: - """Read full PSM file into a PSMList object.""" - return PSMList(psm_list=[psm for psm in self.__iter__()]) - @staticmethod def _parse_peptidoform(sequence: str, charge: int): """ @@ -74,9 +70,7 @@ def _parse_is_decoy(target_decoy: str): def _parse_psm(self, entry: dict, peptide_hit: dict) -> PSM: """Parse idXML PSM to :py:class:`~psm_utils.psm.PSM`.""" return PSM( - peptidoform=self._parse_peptidoform( - peptide_hit["sequence"], peptide_hit["charge"] - ), + peptidoform=self._parse_peptidoform(peptide_hit["sequence"], peptide_hit["charge"]), spectrum_id=entry["spectrum_reference"], is_decoy=self._parse_is_decoy(peptide_hit["target_decoy"]), score=peptide_hit["score"], diff --git a/psm_utils/io/maxquant.py b/psm_utils/io/maxquant.py index 89c2c9c..55814be 100644 --- a/psm_utils/io/maxquant.py +++ b/psm_utils/io/maxquant.py @@ -84,10 +84,6 @@ def __iter__(self): psm = self._get_peptide_spectrum_match(psm_dict) yield psm - def read_file(self) -> PSMList: - """Read full MaxQuant msms.txt PSM file into a PSMList object.""" - return PSMList(psm_list=[psm for psm in self.__iter__()]) - def _validate_msms(self) -> None: with open(self.filename, "r") as msms_file: msms_reader = csv.DictReader(msms_file, delimiter="\t") @@ -97,23 +93,17 @@ def _validate_msms(self) -> None: def _evaluate_columns(columns) -> bool: """Case insensitive column evaluation msms file.""" columns = list(map(lambda col: col.lower(), columns)) - column_check = [ - True if col.lower() in columns else False for col in MSMS_REQUIRED_COLUMNS - ] + column_check = [True if col.lower() in columns else False for col in MSMS_REQUIRED_COLUMNS] if not all(column_check): raise 
MSMSParsingError( f"Missing columns: {list(compress(MSMS_REQUIRED_COLUMNS, list(~np.array(column_check))))}" ) - def _get_peptide_spectrum_match( - self, psm_dict: dict[str, str | float] - ) -> PSM: + def _get_peptide_spectrum_match(self, psm_dict: dict[str, str | float]) -> PSM: """Return a PSM object from MaxQuant msms.txt PSM file.""" psm = PSM( - peptidoform=self._parse_peptidoform( - psm_dict["Modified sequence"], psm_dict["Charge"] - ), + peptidoform=self._parse_peptidoform(psm_dict["Modified sequence"], psm_dict["Charge"]), spectrum_id=psm_dict["Scan number"], run=psm_dict["Raw file"], is_decoy=psm_dict["Reverse"] == "+", @@ -148,21 +138,15 @@ def _parse_peptidoform(modified_seq: str, charge: int) -> Peptidoform: # if N-term mod if match.start() == 0: - modified_seq = re.sub( - f"\({se_mod_string}\)", f"[{match[1]}]-", modified_seq - ) + modified_seq = re.sub(f"\({se_mod_string}\)", f"[{match[1]}]-", modified_seq) # if C-term mod elif match.end() == modified_seq_len: - modified_seq = re.sub( - f"\({se_mod_string}\)", f"-[{match[1]}]", modified_seq - ) + modified_seq = re.sub(f"\({se_mod_string}\)", f"-[{match[1]}]", modified_seq) # if modification on amino acid else: - modified_seq = re.sub( - f"\({se_mod_string}\)", f"[{match[1]}]", modified_seq - ) + modified_seq = re.sub(f"\({se_mod_string}\)", f"[{match[1]}]", modified_seq) modified_seq += f"/{charge}" diff --git a/psm_utils/io/msamanda.py b/psm_utils/io/msamanda.py index 77d24bc..b79c7fd 100644 --- a/psm_utils/io/msamanda.py +++ b/psm_utils/io/msamanda.py @@ -13,7 +13,6 @@ from psm_utils.exceptions import PSMUtilsException from psm_utils.io._base_classes import ReaderBase from psm_utils.psm import PSM, Peptidoform -from psm_utils.psm_list import PSMList logger = logging.getLogger(__name__) @@ -66,10 +65,6 @@ def __iter__(self): for psm_dict in reader: yield self._get_peptide_spectrum_match(psm_dict) - def read_file(self) -> PSMList: - """Read full PSM file into a PSMList object.""" - return PSMList(psm_list=[psm for psm in self.__iter__()]) - def _evaluate_columns(self, columns) -> bool: """Column evaluation for MS Amanda file.""" # Check if required columns are present @@ -84,9 +79,7 @@ def _evaluate_columns(self, columns) -> bool: self._present_columns.append("Rank") # Get list of present rescoring features - self._rescoring_feature_columns = [ - col for col in RESCORING_FEATURES if col in columns - ] + self._rescoring_feature_columns = [col for col in RESCORING_FEATURES if col in columns] # Add remaining columns to metadata self._metadata_columns = [ @@ -116,9 +109,7 @@ def _get_peptide_spectrum_match(self, psm_dict: dict[str, str | float]) -> PSM: if col in self._rescoring_feature_columns }, metadata={ - col: str(value) - for col, value in psm_dict.items() - if col in self._metadata_columns + col: str(value) for col, value in psm_dict.items() if col in self._metadata_columns }, ) if self._has_rank_column: diff --git a/psm_utils/io/mzid.py b/psm_utils/io/mzid.py index 8c37657..0aae4ec 100644 --- a/psm_utils/io/mzid.py +++ b/psm_utils/io/mzid.py @@ -166,10 +166,6 @@ def __iter__(self): spectrum_id, spectrum_title, run, rt, entry ) - def read_file(self) -> PSMList: - """Read full mzid file to PSM list object.""" - return PSMList(psm_list=[psm for psm in self]) - @staticmethod def _get_xml_namespace(root_tag): """Get the namespace of the xml root.""" diff --git a/psm_utils/io/peptide_record.py b/psm_utils/io/peptide_record.py index c4b2a2a..2c1e6da 100644 --- a/psm_utils/io/peptide_record.py +++ b/psm_utils/io/peptide_record.py 
@@ -199,9 +199,7 @@ def __init__( # Define named tuple for single Peptide Record entries, based on # configured columns columns = self._peprec.required_columns + self._peprec.optional_columns - self.PeprecEntry = namedtuple( - "PeprecEntry", columns, defaults=[None for _ in columns] - ) + self.PeprecEntry = namedtuple("PeprecEntry", columns, defaults=[None for _ in columns]) def __iter__(self) -> Iterable[PSM]: """Iterate over file and return PSMs one-by-one.""" @@ -212,16 +210,6 @@ def __iter__(self) -> Iterable[PSM]: psm = self._entry_to_psm(entry, filename=self.filename) yield psm - def read_file(self) -> PSMList: - """Read full Peptide Record PSM file into a PSMList object.""" - psm_list = [] - with open(self.filename) as peprec_in: - reader = csv.DictReader(peprec_in, delimiter=self._peprec.separator) - for row in reader: - entry = self.PeprecEntry(**row) - psm_list.append(self._entry_to_psm(entry, filename=self.filename)) - return PSMList(psm_list=psm_list) - @staticmethod def _entry_to_psm(entry: NamedTuple, filename: Optional[str] = None) -> PSM: """Parse single Peptide Record entry to `PSM`.""" @@ -280,8 +268,7 @@ def __enter__(self) -> PeptideRecordWriter: self._open_file = open(self.filename, "wt", newline="") self._writer = csv.DictWriter( self._open_file, - fieldnames=_PeptideRecord.required_columns - + _PeptideRecord.optional_columns, + fieldnames=_PeptideRecord.required_columns + _PeptideRecord.optional_columns, extrasaction="ignore", delimiter=" ", ) @@ -352,9 +339,7 @@ def write_file(self, psm_list: PSMList): """ with open(self.filename, "wt", newline="") as f: - fieldnames = ( - _PeptideRecord.required_columns + _PeptideRecord.optional_columns - ) + fieldnames = _PeptideRecord.required_columns + _PeptideRecord.optional_columns writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter=" ") writer.writeheader() for psm in psm_list: @@ -391,9 +376,7 @@ def peprec_to_proforma( peptide = [""] + list(peptide) + [""] # Add modification labels - for position, label in zip( - modifications.split("|")[::2], modifications.split("|")[1::2] - ): + for position, label in zip(modifications.split("|")[::2], modifications.split("|")[1::2]): try: peptide[int(position)] += f"[{label}]" except ValueError as e: @@ -402,7 +385,8 @@ def peprec_to_proforma( ) from e except IndexError as e: raise InvalidPeprecModificationError( - f"PEPREC modification has invalid position {position} in peptide `{''.join(peptide)}`." + f"PEPREC modification has invalid position {position} in " + f"peptide `{''.join(peptide)}`." 
) from e # Add dashes between residues and termini, and join sequence diff --git a/psm_utils/io/percolator.py b/psm_utils/io/percolator.py index bd23af2..200f40b 100644 --- a/psm_utils/io/percolator.py +++ b/psm_utils/io/percolator.py @@ -111,13 +111,6 @@ def __iter__(self) -> Iterable[PSM]: psm = self._parse_entry(entry) yield psm - def read_file(self) -> PSMList: - """Read full PSM file into a PSMList object.""" - psm_list = [] - for psm in self.__iter__(): - psm_list.append(psm) - return PSMList(psm_list=psm_list) - @staticmethod def _read_header(filename): with open(filename, "rt") as f: @@ -184,19 +177,11 @@ def _parse_entry(self, entry): peptidoform=peptidoform, spectrum_id=entry[self.id_column], is_decoy=is_decoy, - score=float(entry[self.score_column.lower()]) - if self.score_column - else None, + score=float(entry[self.score_column.lower()]) if self.score_column else None, qvalue=entry["q-value"] if "q-value" in entry else None, - pep=entry["posterior_error_prob"] - if "posterior_error_prob" in entry - else None, - precursor_mz=float(entry[self.mz_column.lower()]) - if self.mz_column - else None, - retention_time=float(entry[self.rt_column.lower()]) - if self.rt_column - else None, + pep=entry["posterior_error_prob"] if "posterior_error_prob" in entry else None, + precursor_mz=float(entry[self.mz_column.lower()]) if self.mz_column else None, + retention_time=float(entry[self.rt_column.lower()]) if self.rt_column else None, protein_list=protein_list, source="percolator", provenance_data={"filename": str(self.filename)}, @@ -224,7 +209,7 @@ def __init__( style: str Percolator Tab style. One of {``pin``, ``pout``}. If ``pin``, the columns ``SpecId``, ``Label``, ``ScanNr``, ``ChargeN``, ``PSMScore``, ``Peptide``, and - ``Proteins`` are written alongside the requested feature names + ``Proteins`` are written alongside the requested feature names (see ``feature_names``). If ``pout``, the columns ``PSMId``, ``Label``, ``score``, ``q-value``, ``posterior_error_prob``, ``peptide``, and ``proteinIds`` are written. feature_names: list[str], optional @@ -257,9 +242,7 @@ def __init__( "proteinIds", ] else: - raise ValueError( - "Invalid Percolator Tab style. Should be one of {`pin`, `pout`}." - ) + raise ValueError("Invalid Percolator Tab style. Should be one of {`pin`, `pout`}.") self.style = style self._open_file = None self._writer = None @@ -280,9 +263,7 @@ def __enter__(self) -> PercolatorTabWriter: fieldnames = line.strip().split("\t") break else: - raise ValueError( - f"File {self.filename} is not a valid Percolator Tab file." 
- ) + raise ValueError(f"File {self.filename} is not a valid Percolator Tab file.") # Determine last scan number open_file.seek(0) last_line = None @@ -336,7 +317,9 @@ def write_file(self, psm_list: PSMList): with _PercolatorTabIO( self.filename, "wt", newline="", protein_separator=self._protein_separator ) as f: - writer = csv.DictWriter(f, fieldnames=self._columns, delimiter="\t") + writer = csv.DictWriter( + f, fieldnames=self._columns, delimiter="\t", extrasaction="ignore" + ) writer.writeheader() for i, psm in enumerate(psm_list): entry = self._psm_to_entry(psm) diff --git a/psm_utils/io/sage.py b/psm_utils/io/sage.py index a16d6f0..25abe82 100644 --- a/psm_utils/io/sage.py +++ b/psm_utils/io/sage.py @@ -48,13 +48,6 @@ def __iter__(self) -> Iterable[PSM]: psm = self._get_peptide_spectrum_match(row) yield psm - def read_file(self) -> PSMList: - """Read full PSM file into a PSMList object.""" - psm_list = [] - for psm in self.__iter__(): - psm_list.append(psm) - return PSMList(psm_list=psm_list) - def _get_peptide_spectrum_match(self, psm_dict) -> PSM: """Parse a single PSM from a sage PSM file.""" rescoring_features = {} @@ -96,8 +89,10 @@ def _get_peptide_spectrum_match(self, psm_dict) -> PSM: spectrum_id=psm_dict["scannr"], run=Path(psm_dict["filename"]).stem, is_decoy=True - if psm_dict["label"] == "-1" else False - if psm_dict["label"] == "1" else None, + if psm_dict["label"] == "-1" + else False + if psm_dict["label"] == "1" + else None, qvalue=psm_dict["spectrum_q"], score=float(psm_dict[self.score_column]), precursor_mz=self._parse_precursor_mz(psm_dict["expmass"], psm_dict["charge"]), diff --git a/psm_utils/io/tsv.py b/psm_utils/io/tsv.py index ee7b92b..60f7cc1 100644 --- a/psm_utils/io/tsv.py +++ b/psm_utils/io/tsv.py @@ -68,10 +68,6 @@ def __iter__(self): for row in reader: yield PSM(**self._parse_entry(row)) - def read_file(self) -> PSMList: - """Read full PSM file into a PSMList object.""" - return PSMList(psm_list=[psm for psm in self.__iter__()]) - @staticmethod def _parse_entry(entry: dict): """Parse single TSV entry to :py:class:`~psm_utils.psm.PSM`.""" @@ -211,7 +207,9 @@ def write_file(self, psm_list: PSMList): if not self.fieldnames: raise ValueError("`example_psm` required when writing to new file.") with open(self.filename, "wt", newline="") as f: - writer = csv.DictWriter(f, fieldnames=self.fieldnames, delimiter="\t") + writer = csv.DictWriter( + f, fieldnames=self.fieldnames, delimiter="\t", extrasaction="ignore" + ) writer.writeheader() for psm in psm_list: writer.writerow(self._psm_to_entry(psm)) @@ -228,15 +226,11 @@ def _psm_to_entry(psm: PSM) -> dict: # Flatten dictionary items if entry["provenance_data"]: - entry.update( - {"provenance:" + k: v for k, v in entry["provenance_data"].items()} - ) + entry.update({"provenance:" + k: v for k, v in entry["provenance_data"].items()}) if entry["metadata"]: entry.update({"meta:" + k: v for k, v in entry["metadata"].items()}) if entry["rescoring_features"]: - entry.update( - {"rescoring:" + k: v for k, v in entry["rescoring_features"].items()} - ) + entry.update({"rescoring:" + k: v for k, v in entry["rescoring_features"].items()}) del entry["provenance_data"] del entry["metadata"] del entry["rescoring_features"] diff --git a/psm_utils/io/xtandem.py b/psm_utils/io/xtandem.py index 8bf60a9..52e8ccd 100644 --- a/psm_utils/io/xtandem.py +++ b/psm_utils/io/xtandem.py @@ -57,9 +57,7 @@ class XTandemReader(ReaderBase): - def __init__( - self, filename: str | Path, *args, decoy_prefix="DECOY_", **kwargs - ) -> None: + def 
__init__(self, filename: str | Path, *args, decoy_prefix="DECOY_", **kwargs) -> None: """ Reader for X!Tandem XML PSM files. @@ -101,14 +99,6 @@ def __iter__(self): psm = self._parse_entry(entry) yield psm - def read_file(self) -> PSMList: - """Read full PSM file into a PSMList object.""" - psm_list = [] - with tandem.read(str(self.filename)) as reader: - for entry in reader: - psm_list.append(self._parse_entry(entry)) - return PSMList(psm_list=psm_list) - def _parse_peptidoform(self, peptide_entry, charge: int) -> Peptidoform: """Parse X!Tandem XML peptide entry to :py:class:`~psm_utils.peptidoform.Peptidoform`.""" if "aa" in peptide_entry: From 82da9a2a7cc6906cabdc036bb13bf5645e0f8b59 Mon Sep 17 00:00:00 2001 From: RalfG Date: Fri, 25 Aug 2023 16:45:24 +0200 Subject: [PATCH 02/13] Added: - Support for Proteome Discoverer MSF files - Peptidoform: Added support for initialization from a pyteomics.proforma.ProForma object - PSMList: Added append and extend methods. Changed: - PSM: Values of the rescoring_features dictionary are now coerced to floats Fixed: - Fix issue where `psm_list["protein_list"]` resulted in a Numpy error due to inconsistent shape of the lists. --- .gitignore | 7 +- README.rst | 27 +- docs/source/api/psm_utils.io.rst | 22 +- psm_utils/io/__init__.py | 7 + psm_utils/io/_pd_msf_tables.py | 799 ++++++++++++++++++++++++++++ psm_utils/io/peptideshaker.py | 201 ------- psm_utils/io/proteome_discoverer.py | 304 +++++++++++ psm_utils/peptidoform.py | 54 +- psm_utils/psm.py | 2 +- psm_utils/psm_list.py | 28 +- pyproject.toml | 25 +- tests/test_io/test_msamanda.py | 1 - 12 files changed, 1203 insertions(+), 274 deletions(-) create mode 100644 psm_utils/io/_pd_msf_tables.py delete mode 100644 psm_utils/io/peptideshaker.py create mode 100644 psm_utils/io/proteome_discoverer.py diff --git a/.gitignore b/.gitignore index b6e4761..0432116 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ +# Ruff +.ruff_cache/ + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] @@ -103,7 +106,8 @@ celerybeat.pid # Environments .env -.venv +.venv/ +.venv*/ env/ venv/ ENV/ @@ -127,3 +131,4 @@ dmypy.json # Pyre type checker .pyre/ +.vscode/settings.json diff --git a/README.rst b/README.rst index 86008a6..23a293d 100644 --- a/README.rst +++ b/README.rst @@ -86,19 +86,20 @@ Goals and non-goals Supported file formats ********************** -===================================================================================================================== =============== =============== =============== - File format psm_utils tag Read support Write support -===================================================================================================================== =============== =============== =============== - `OpenMS idXML `_ ``idxml`` ✅ ❌ - `MaxQuant msms.txt `_ ``msms`` ✅ ❌ - `MS Amanda CSV `_ ``msamanda`` ✅ ❌ - `mzIdentML `_ ``mzid`` ✅ ✅ - `Peptide Record `_ ``peprec`` ✅ ✅ - `Percolator tab `_ ``percolator`` ✅ ✅ - `Sage `_ ``sage`` ✅ ❌ - `TSV `_ ``tsv`` ✅ ✅ - `X!Tandem XML `_ ``xtandem`` ✅ ❌ -===================================================================================================================== =============== =============== =============== +===================================================================================================================== ======================== =============== =============== + File format psm_utils tag Read support Write support 
+===================================================================================================================== ======================== =============== =============== + `OpenMS idXML `_ ``idxml`` ✅ ❌ + `MaxQuant msms.txt `_ ``msms`` ✅ ❌ + `MS Amanda CSV `_ ``msamanda`` ✅ ❌ + `mzIdentML `_ ``mzid`` ✅ ✅ + `Peptide Record `_ ``peprec`` ✅ ✅ + `Percolator tab `_ ``percolator`` ✅ ✅ + Proteome Discoverer MSF ``proteome_discoverer`` ✅ ❌ + `Sage `_ ``sage`` ✅ ❌ + `TSV `_ ``tsv`` ✅ ✅ + `X!Tandem XML `_ ``xtandem`` ✅ ❌ +===================================================================================================================== ======================== =============== =============== Legend: ✅ Supported, ❌ Unsupported diff --git a/docs/source/api/psm_utils.io.rst b/docs/source/api/psm_utils.io.rst index 3f4956d..dcd169b 100644 --- a/docs/source/api/psm_utils.io.rst +++ b/docs/source/api/psm_utils.io.rst @@ -8,7 +8,7 @@ psm_utils.io psm_utils.io.idxml -##################### +################## .. automodule:: psm_utils.io.idxml :members: @@ -25,7 +25,7 @@ psm_utils.io.maxquant psm_utils.io.msamanda -########################## +##################### .. automodule:: psm_utils.io.msamanda :members: @@ -34,7 +34,7 @@ psm_utils.io.msamanda psm_utils.io.mzid -##################### +################# .. automodule:: psm_utils.io.mzid :members: @@ -52,7 +52,7 @@ psm_utils.io.peptide_record psm_utils.io.percolator -########################### +####################### .. automodule:: psm_utils.io.percolator :members: @@ -60,8 +60,16 @@ psm_utils.io.percolator +psm_utils.io.proteome_discoverer +################################ +.. automodule:: psm_utils.io.proteome_discoverer + :members: + :inherited-members: + + + psm_utils.io.sage -########################### +################# .. automodule:: psm_utils.io.sage :members: @@ -70,7 +78,7 @@ psm_utils.io.sage psm_utils.io.tsv -########################## +################ .. automodule:: psm_utils.io.tsv :members: @@ -79,7 +87,7 @@ psm_utils.io.tsv psm_utils.io.xtandem -########################## +#################### .. 
automodule:: psm_utils.io.xtandem :members: diff --git a/psm_utils/io/__init__.py b/psm_utils/io/__init__.py index 0d27a5c..be0b424 100644 --- a/psm_utils/io/__init__.py +++ b/psm_utils/io/__init__.py @@ -14,6 +14,7 @@ import psm_utils.io.mzid as mzid import psm_utils.io.peptide_record as peptide_record import psm_utils.io.percolator as percolator +import psm_utils.io.proteome_discoverer as proteome_discoverer import psm_utils.io.tsv as tsv import psm_utils.io.xtandem as xtandem import psm_utils.io.sage as sage @@ -53,6 +54,12 @@ "extension": ".percolator.txt", "filename_pattern": r"^.*\.(?:(?:pin)|(?:pout))$", }, + "proteome_discoverer": { + "reader": proteome_discoverer.MSFReader, + "writer": None, + "extension": ".msf", + "filename_pattern": r"^.*\.msf$", + }, "tsv": { "reader": tsv.TSVReader, "writer": tsv.TSVWriter, diff --git a/psm_utils/io/_pd_msf_tables.py b/psm_utils/io/_pd_msf_tables.py new file mode 100644 index 0000000..a125da8 --- /dev/null +++ b/psm_utils/io/_pd_msf_tables.py @@ -0,0 +1,799 @@ +"""SQLAlchemy models for Mascot MSF files.""" + +from sqlalchemy import ( + CHAR, + BigInteger, + Boolean, + Column, + DateTime, + Float, + Index, + Integer, + LargeBinary, + SmallInteger, + String, + Table, + Text, + UniqueConstraint, + text, +) +from sqlalchemy.ext.declarative import declarative_base +from sqlalchemy.sql.sqltypes import NullType + +Base = declarative_base() +metadata = Base.metadata + + +class AminoAcidModification(Base): + __tablename__ = "AminoAcidModifications" + + AminoAcidModificationID = Column(Integer, primary_key=True) + ModificationName = Column(String, nullable=False) + DeltaMass = Column(Float) + Substitution = Column(String) + LeavingGroup = Column(String) + Abbreviation = Column(String, nullable=False) + PositionType = Column(Integer, nullable=False) + IsActive = Column(Boolean) + DeltaAverageMass = Column(Float) + UnimodAccession = Column(String) + IsSubstitution = Column(Boolean, nullable=False, server_default=text("0")) + + +class AminoAcidModificationsAminoAcid(Base): + __tablename__ = "AminoAcidModificationsAminoAcids" + + AminoAcidModificationID = Column(Integer, primary_key=True, nullable=False) + AminoAcidID = Column(Integer, primary_key=True, nullable=False) + Classification = Column(Integer, nullable=False) + + +class AminoAcidModificationsAminoAcidsNL(Base): + __tablename__ = "AminoAcidModificationsAminoAcidsNL" + + AminoAcidModificationID = Column(Integer, primary_key=True, nullable=False) + AminoAcidID = Column(Integer, primary_key=True, nullable=False) + NeutralLossID = Column(Integer, primary_key=True, nullable=False) + + +class AminoAcidModificationsNeutralLoss(Base): + __tablename__ = "AminoAcidModificationsNeutralLosses" + + NeutralLossID = Column(Integer, primary_key=True) + Name = Column(String, nullable=False) + MonoisotopicMass = Column(Float, nullable=False) + AverageMass = Column(Float, nullable=False) + + +class AminoAcid(Base): + __tablename__ = "AminoAcids" + + AminoAcidID = Column(Integer, primary_key=True) + AminoAcidName = Column(String, nullable=False) + OneLetterCode = Column(CHAR) + ThreeLetterCode = Column(CHAR) + MonoisotopicMass = Column(Float, nullable=False) + AverageMass = Column(Float, nullable=False) + SumFormula = Column(String) + + +class AnnotationDataVersion(Base): + __tablename__ = "AnnotationDataVersion" + + PcDataVersion = Column(Integer, primary_key=True) + PcDataRelease = Column(BigInteger, nullable=False) + + +class AnnotationDataset(Base): + __tablename__ = "AnnotationDataset" + + DatasetId = 
Column(Integer, primary_key=True) + Name = Column(String, nullable=False) + DisplayName = Column(String, nullable=False) + Guid = Column(String, nullable=False) + Description = Column(Text) + + +class AnnotationGroup(Base): + __tablename__ = "AnnotationGroups" + + AnnotationGroupId = Column(Integer, primary_key=True, nullable=False) + Description = Column(Text) + DatasetId = Column(Integer, primary_key=True, nullable=False) + Position = Column(Integer, nullable=False) + ColorR = Column(Integer, nullable=False) + ColorG = Column(Integer, nullable=False) + ColorB = Column(Integer, nullable=False) + GroupDefinition = Column(LargeBinary) + + +class AnnotationType(Base): + __tablename__ = "AnnotationTypes" + + AnnotationTypeId = Column(Integer, primary_key=True) + Name = Column(String, nullable=False) + Description = Column(Text) + + +class Annotation(Base): + __tablename__ = "Annotations" + + AnnotationId = Column(Integer, primary_key=True) + Accession = Column(String, nullable=False) + Description = Column(Text) + type = Column(Integer) + + +class AnnotationsAnnotationGroup(Base): + __tablename__ = "AnnotationsAnnotationGroups" + + AnnotationId = Column(Integer, primary_key=True, nullable=False) + AnnotationGroupId = Column(Integer, primary_key=True, nullable=False) + + +class AnnotationsProtein(Base): + __tablename__ = "AnnotationsProtein" + + proteinID = Column(Integer, primary_key=True, nullable=False) + AnnotationId = Column(Integer, primary_key=True, nullable=False) + Evidence = Column(Integer, primary_key=True) + PositionBegin = Column(Integer, primary_key=True) + PositionEnd = Column(Integer) + ProteinAccession = Column(String, primary_key=True, nullable=False) + + +class Chromatogram(Base): + __tablename__ = "Chromatograms" + + FileID = Column(Integer, primary_key=True, nullable=False) + TraceType = Column(Integer, primary_key=True, nullable=False) + Chromatogram = Column(String, nullable=False) + + +class CustomDataField(Base): + __tablename__ = "CustomDataFields" + + FieldID = Column(Integer, primary_key=True) + Guid = Column(String, nullable=False) + DisplayName = Column(String, nullable=False) + SourceNodeNumber = Column(Integer, nullable=False) + TargetNodeNumber = Column(Integer, nullable=False) + DataType = Column(Integer, nullable=False) + DataTarget = Column(Integer, nullable=False) + Version = Column(Float, nullable=False) + AccessMode = Column(Integer, server_default=text("0")) + Visibility = Column(Integer, server_default=text("0")) + GroupVisibility = Column(Integer, server_default=text("0")) + Format = Column(String) + PlotType = Column(Integer, nullable=False) + DataPurpose = Column(String) + + +class CustomDataPeptide(Base): + __tablename__ = "CustomDataPeptides" + + FieldID = Column(Integer, primary_key=True, nullable=False) + PeptideID = Column(Integer, primary_key=True, nullable=False, index=True) + FieldValue = Column(String) + + +class CustomDataPeptidesDecoy(Base): + __tablename__ = "CustomDataPeptides_decoy" + + FieldID = Column(Integer, primary_key=True, nullable=False) + PeptideID = Column(Integer, primary_key=True, nullable=False, index=True) + FieldValue = Column(String) + + +class CustomDataProcessingNode(Base): + __tablename__ = "CustomDataProcessingNodes" + + FieldID = Column(Integer, primary_key=True, nullable=False) + ProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False, index=True) + FieldValue = Column(String) + + +class CustomDataProtein(Base): + __tablename__ = "CustomDataProteins" + + FieldID = Column(Integer, primary_key=True, 
nullable=False) + ProteinID = Column(Integer, primary_key=True, nullable=False, index=True) + FieldValue = Column(String) + + +class CustomDataProteinsDecoy(Base): + __tablename__ = "CustomDataProteins_decoy" + + FieldID = Column(Integer, primary_key=True, nullable=False) + ProteinID = Column(Integer, primary_key=True, nullable=False, index=True) + FieldValue = Column(String) + + +class CustomDataSpectra(Base): + __tablename__ = "CustomDataSpectra" + + FieldID = Column(Integer, primary_key=True, nullable=False) + SpectrumID = Column(Integer, primary_key=True, nullable=False, index=True) + FieldValue = Column(String) + + +class Enzyme(Base): + __tablename__ = "Enzymes" + + EnzymeID = Column(Integer, primary_key=True) + Name = Column(String, nullable=False) + Abbreviation = Column(String, nullable=False) + Seperator = Column(String, nullable=False) + NonSeperator = Column(String, nullable=False) + Offset = Column(Integer, nullable=False) + + +class EnzymesCleavageSpecificity(Base): + __tablename__ = "EnzymesCleavageSpecificities" + + EnzymeID = Column(Integer, primary_key=True, nullable=False) + Specificity = Column(Integer, primary_key=True, nullable=False) + + +class EventAnnotation(Base): + __tablename__ = "EventAnnotations" + __table_args__ = ( + Index( + "IX_EventAnnotations_IsotopePatternID_QuanResultID", "IsotopePatternID", "QuanResultID" + ), + Index("IX_EventAnnotations_QuanResultID_QuanChannelID", "QuanResultID", "QuanChannelID"), + ) + + EventID = Column(Integer, primary_key=True) + Charge = Column(SmallInteger, nullable=False) + IsotopePatternID = Column(Integer, nullable=False) + QuanResultID = Column(Integer, nullable=False) + QuanChannelID = Column(Integer, nullable=False) + + +class EventAreaAnnotation(Base): + __tablename__ = "EventAreaAnnotations" + + EventID = Column(Integer, primary_key=True) + Charge = Column(SmallInteger, nullable=False) + IsotopePatternID = Column(Integer, nullable=False, index=True) + QuanResultID = Column(Integer, nullable=False) + + +class Event(Base): + __tablename__ = "Events" + __table_args__ = ( + Index("IX_Events_FileID_LeftRT_RightRT", "FileID", "LeftRT", "RightRT"), + Index("IX_Events_FileID_RT", "FileID", "RT"), + ) + + EventID = Column(Integer, primary_key=True) + Mass = Column(Float, nullable=False) + MassAvg = Column(Float, nullable=False) + Area = Column(Float, nullable=False) + Intensity = Column(Float, nullable=False) + PeakWidth = Column(Float, nullable=False) + RT = Column(Float, nullable=False) + LeftRT = Column(Float, nullable=False) + RightRT = Column(Float, nullable=False) + SN = Column(Float, nullable=False, server_default=text("0.0")) + FileID = Column(Integer, nullable=False) + + +class FastaFile(Base): + __tablename__ = "FastaFiles" + + FastaFileID = Column(Integer, primary_key=True) + FileName = Column(String, nullable=False) + State = Column(Integer, nullable=False) + VirtualFileName = Column(String, nullable=False) + FileSize = Column(BigInteger, nullable=False) + FileTime = Column(BigInteger, nullable=False) + NumberOfProteins = Column(BigInteger) + NumberOfAminoAcids = Column(BigInteger) + FileHashCode = Column(BigInteger) + Hidden = Column(Boolean, nullable=False) + IsSrfImport = Column(Boolean, nullable=False) + IsScheduledForDeletion = Column(Boolean, nullable=False, server_default=text("0")) + + +class FastaFilesProteinAnnotation(Base): + __tablename__ = "FastaFilesProteinAnnotations" + + FastaFileID = Column(Integer, primary_key=True, nullable=False) + ProteinAnnotationID = Column(Integer, primary_key=True, 
nullable=False, index=True) + + +class FileInfo(Base): + __tablename__ = "FileInfos" + + FileID = Column(Integer, primary_key=True) + FileName = Column(String, nullable=False) + FileTime = Column(String, nullable=False) + FileSize = Column(BigInteger, nullable=False) + PhysicalFileName = Column(String, nullable=False) + FileType = Column(SmallInteger, nullable=False) + + +class MassPeakRelation(Base): + __tablename__ = "MassPeakRelations" + + MassPeakID = Column(Integer, primary_key=True, nullable=False) + RelatedMassPeakID = Column(Integer, primary_key=True, nullable=False) + + +class MassPeak(Base): + __tablename__ = "MassPeaks" + + MassPeakID = Column(Integer, primary_key=True) + Charge = Column(SmallInteger) + Intensity = Column(Float) + Mass = Column(Float) + ScanNumbers = Column(String) + FileID = Column(Integer) + PercentIsolationInterference = Column(Float) + IonInjectTime = Column(Integer) + + +class PeptideScore(Base): + __tablename__ = "PeptideScores" + + ProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False) + PeptideID = Column(Integer, primary_key=True, nullable=False) + ScoreID = Column(Integer, primary_key=True, nullable=False) + ProcessingNodeID = Column(Integer) + ScoreValue = Column(Float, nullable=False) + + +class PeptideScoreDecoy(Base): + __tablename__ = "PeptideScores_decoy" + + ProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False) + PeptideID = Column(Integer, primary_key=True, nullable=False) + ScoreID = Column(Integer, primary_key=True, nullable=False) + ProcessingNodeID = Column(Integer) + ScoreValue = Column(Float, nullable=False) + + +class Peptide(Base): + __tablename__ = "Peptides" + + ProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False) + PeptideID = Column(Integer, primary_key=True, nullable=False, index=True) + SpectrumID = Column(Integer, nullable=False, index=True) + TotalIonsCount = Column(SmallInteger, nullable=False) + MatchedIonsCount = Column(SmallInteger, nullable=False) + ConfidenceLevel = Column(SmallInteger, nullable=False) + SearchEngineRank = Column(Integer, nullable=False) + Hidden = Column(Boolean, nullable=False, server_default=text("0")) + Sequence = Column(String) + Annotation = Column(String) + UniquePeptideSequenceID = Column(Integer, nullable=False, server_default=text("1")) + MissedCleavages = Column(SmallInteger, nullable=False) + + +class PeptidesAminoAcidModification(Base): + __tablename__ = "PeptidesAminoAcidModifications" + + ProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False) + PeptideID = Column(Integer, primary_key=True, nullable=False) + AminoAcidModificationID = Column(Integer, primary_key=True, nullable=False) + Position = Column(Integer, primary_key=True, nullable=False) + + +class PeptidesAminoAcidModificationsDecoy(Base): + __tablename__ = "PeptidesAminoAcidModifications_decoy" + + ProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False) + PeptideID = Column(Integer, primary_key=True, nullable=False) + AminoAcidModificationID = Column(Integer, primary_key=True, nullable=False) + Position = Column(Integer, primary_key=True, nullable=False) + + +class PeptidesProtein(Base): + __tablename__ = "PeptidesProteins" + + ProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False) + PeptideID = Column(Integer, primary_key=True, nullable=False, index=True) + ProteinID = Column(Integer, primary_key=True, nullable=False) + + +class PeptidesProteinDecoy(Base): + __tablename__ = "PeptidesProteins_decoy" + + ProcessingNodeNumber = 
Column(Integer, primary_key=True, nullable=False) + PeptideID = Column(Integer, primary_key=True, nullable=False, index=True) + ProteinID = Column(Integer, primary_key=True, nullable=False) + + +class PeptidesReferenceSpectra(Base): + __tablename__ = "PeptidesReferenceSpectra" + + PeptideID = Column(Integer, primary_key=True) + ReferenceSpectrumID = Column(Integer, nullable=False) + + +class PeptidesTerminalModification(Base): + __tablename__ = "PeptidesTerminalModifications" + + ProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False) + PeptideID = Column(Integer, primary_key=True, nullable=False) + TerminalModificationID = Column(Integer, primary_key=True, nullable=False) + + +class PeptidesTerminalModificationDecoy(Base): + __tablename__ = "PeptidesTerminalModifications_decoy" + + ProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False) + PeptideID = Column(Integer, primary_key=True, nullable=False) + TerminalModificationID = Column(Integer, primary_key=True, nullable=False) + + +class PeptideDecoy(Base): + __tablename__ = "Peptides_decoy" + + ProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False) + PeptideID = Column(Integer, primary_key=True, nullable=False, index=True) + SpectrumID = Column(Integer, nullable=False, index=True) + TotalIonsCount = Column(SmallInteger, nullable=False) + MatchedIonsCount = Column(SmallInteger, nullable=False) + ConfidenceLevel = Column(SmallInteger, nullable=False) + SearchEngineRank = Column(Integer, nullable=False) + Sequence = Column(String) + Annotation = Column(String) + UniquePeptideSequenceID = Column(Integer, nullable=False, server_default=text("1")) + MissedCleavages = Column(SmallInteger, nullable=False) + + +t_PrecursorIonAreaSearchSpectra = Table( + "PrecursorIonAreaSearchSpectra", + metadata, + Column("QuanResultID", Integer, nullable=False, index=True), + Column("SearchSpectrumID", Integer), +) + + +t_PrecursorIonQuanResults = Table( + "PrecursorIonQuanResults", + metadata, + Column("QuanChannelID", Integer, nullable=False), + Column("QuanResultID", Integer, nullable=False), + Column("Mass", Float, nullable=False), + Column("Charge", Integer, nullable=False), + Column("Area", Float), + Column("RetentionTime", Float), + Index( + "IX_PrecursorIonQuanResults_QuanResultID_QuanChannelID", "QuanResultID", "QuanChannelID" + ), +) + + +t_PrecursorIonQuanResultsSearchSpectra = Table( + "PrecursorIonQuanResultsSearchSpectra", + metadata, + Column("ProcessingNodeNumber", Integer, nullable=False), + Column("QuanResultID", Integer, nullable=False, index=True), + Column("SearchSpectrumID", Integer, index=True), +) + + +t_ProcessingNodeConnectionPoints = Table( + "ProcessingNodeConnectionPoints", + metadata, + Column("ProcessingNodeID", Integer, nullable=False), + Column("Interface", String, nullable=False), + Column("ConnectionDirection", Integer, nullable=False), + Column("ConnectionMode", Integer, nullable=False), + Column("ConnectionMultiplicity", Integer, nullable=False), + Column("ConnectionRequirement", Integer, nullable=False), + Column("DataTypeSpecialization", String, nullable=False), + Column("ConnectionDisplayName", String, nullable=False), +) + + +class ProcessingNodeExtension(Base): + __tablename__ = "ProcessingNodeExtensions" + + ExtensionID = Column(Integer, primary_key=True) + ProcessingNodeNumber = Column(Integer, nullable=False) + Guid = Column(String, nullable=False) + Purpose = Column(String, nullable=False) + PurposeDetail = Column(String) + MajorVersion = Column(Integer, 
nullable=False) + MinorVersion = Column(Integer, nullable=False) + Settings = Column(Text) + + +class ProcessingNodeFilterParameter(Base): + __tablename__ = "ProcessingNodeFilterParameters" + + ProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False) + FilterParameterName = Column(String, primary_key=True, nullable=False) + FilterModuleTypeID = Column(Integer, nullable=False) + FilterModuleNumber = Column(Integer, nullable=False) + ProcessingNodeID = Column(Integer, nullable=False) + FilterParameterValue = Column(Float, nullable=False) + + +t_ProcessingNodeInterfaces = Table( + "ProcessingNodeInterfaces", + metadata, + Column("ProcessingNodeID", Integer, nullable=False), + Column("InterfaceKind", Integer, nullable=False), + Column("InterfaceName", String, nullable=False), +) + + +class ProcessingNodeParameter(Base): + __tablename__ = "ProcessingNodeParameters" + + ProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False) + ParameterName = Column(String, primary_key=True, nullable=False) + FriendlyName = Column(String, nullable=False) + ProcessingNodeID = Column(Integer, nullable=False) + IntendedPurpose = Column(Integer, nullable=False) + PurposeDetails = Column(String, nullable=False) + Hidden = Column(Boolean, nullable=False) + Advanced = Column(Boolean, nullable=False) + Category = Column(String, nullable=False) + Position = Column(Integer, nullable=False) + ParameterValue = Column(String, nullable=False) + ValueDisplayString = Column(String, nullable=False) + + +class ProcessingNodeScore(Base): + __tablename__ = "ProcessingNodeScores" + __table_args__ = (UniqueConstraint("ProcessingNodeID", "ScoreName"),) + + ProcessingNodeID = Column(Integer, nullable=False) + ScoreID = Column(Integer, primary_key=True) + ScoreName = Column(String, nullable=False) + FriendlyName = Column(String, nullable=False) + Description = Column(String, nullable=False) + FormatString = Column(String, nullable=False) + ScoreCategory = Column(Integer, nullable=False) + Hidden = Column(Boolean, nullable=False) + IsMainScore = Column(Boolean, nullable=False) + ScoreGUID = Column(String, nullable=False) + + +class ProcessingNode(Base): + __tablename__ = "ProcessingNodes" + + ProcessingNodeNumber = Column(Integer, primary_key=True) + ProcessingNodeID = Column(Integer, nullable=False) + ProcessingNodeParentNumber = Column(String, nullable=False) + NodeName = Column(String) + FriendlyName = Column(String, nullable=False) + MajorVersion = Column(Integer, nullable=False) + MinorVersion = Column(Integer, nullable=False) + NodeComment = Column(String) + NodeGUID = Column(String, nullable=False) + ProcessingNodeState = Column(Integer, nullable=False, server_default=text("0")) + + +class ProcessingNodesSpectra(Base): + __tablename__ = "ProcessingNodesSpectra" + + SendingProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False) + SpectrumID = Column(Integer, primary_key=True, nullable=False, index=True) + + +class ProteinAnnotation(Base): + __tablename__ = "ProteinAnnotations" + __table_args__ = ( + Index( + "IX_ProteinAnnotations_ProteinID_DescriptionHashCode", + "ProteinID", + "DescriptionHashCode", + ), + ) + + ProteinAnnotationID = Column(Integer, primary_key=True) + ProteinID = Column(Integer, nullable=False) + DescriptionHashCode = Column(BigInteger, nullable=False) + Description = Column(Text, nullable=False) + TaxonomyID = Column(Integer, nullable=False, index=True) + + +class ProteinIdentificationGroup(Base): + __tablename__ = "ProteinIdentificationGroups" + + 
ProteinIdentificationGroupId = Column(Integer, primary_key=True, nullable=False) + ProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False) + + +class ProteinScore(Base): + __tablename__ = "ProteinScores" + + ProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False) + ProteinID = Column(Integer, primary_key=True, nullable=False) + ProteinIdentificationGroupID = Column(Integer, nullable=False) + ProteinScore = Column(Float, nullable=False) + Coverage = Column(Float, nullable=False, server_default=text("0")) + + +class ProteinScoresDecoy(Base): + __tablename__ = "ProteinScores_decoy" + + ProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False) + ProteinID = Column(Integer, primary_key=True, nullable=False) + ProteinIdentificationGroupID = Column(Integer, nullable=False) + ProteinScore = Column(Float, nullable=False) + Coverage = Column(Float, nullable=False, server_default=text("0")) + + +class Protein(Base): + __tablename__ = "Proteins" + + ProteinID = Column(Integer, primary_key=True) + Sequence = Column(Text, nullable=False) + SequenceHashCode = Column(BigInteger, nullable=False, index=True) + IsMasterProtein = Column(Boolean, nullable=False, server_default=text("0")) + + +t_ProteinsProteinGroups = Table( + "ProteinsProteinGroups", + metadata, + Column("ProteinID", Integer, nullable=False), + Column("ProteinGroupID", Integer, nullable=False), +) + + +class PtmAnnotationDatum(Base): + __tablename__ = "PtmAnnotationData" + + AnnotationType = Column(Integer, primary_key=True, nullable=False) + ProteinId = Column(Integer, primary_key=True, nullable=False) + AnnotationId = Column(Integer, primary_key=True, nullable=False) + Position = Column(Integer, primary_key=True, nullable=False) + Annotation = Column(String) + + +class ReferenceSpectra(Base): + __tablename__ = "ReferenceSpectra" + + ReferenceSpectrumId = Column(Integer, primary_key=True) + Sequence = Column(String, nullable=False) + SequenceHashCode = Column(BigInteger, nullable=False) + Spectrum = Column(String, nullable=False) + SpectrumHashCode = Column(BigInteger, nullable=False) + Comment = Column(Text) + CommentHashCode = Column(BigInteger, nullable=False) + + +class ReporterIonQuanResult(Base): + __tablename__ = "ReporterIonQuanResults" + __table_args__ = ( + Index( + "IX_ReporterIonQuanResults_ProcessingNodeNumber_SpectrumID", + "ProcessingNodeNumber", + "SpectrumID", + ), + ) + + ProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False) + QuanChannelID = Column(Integer, primary_key=True, nullable=False) + SpectrumID = Column(Integer, primary_key=True, nullable=False) + Mass = Column(Float, nullable=False) + Height = Column(Float) + + +t_ReporterIonQuanResultsSearchSpectra = Table( + "ReporterIonQuanResultsSearchSpectra", + metadata, + Column("ProcessingNodeNumber", Integer, nullable=False), + Column("SpectrumID", Integer, nullable=False), + Column("SearchSpectrumID", Integer, index=True), +) + + +class ScanEvent(Base): + __tablename__ = "ScanEvents" + + ScanEventID = Column(Integer, primary_key=True) + MSLevel = Column(Integer, nullable=False) + Polarity = Column(Integer, nullable=False) + ScanType = Column(Integer, nullable=False) + Ionization = Column(Integer, nullable=False) + MassAnalyzer = Column(Integer, nullable=False) + ActivationType = Column(Integer, nullable=False) + + +class SchemaInfo(Base): + __tablename__ = "SchemaInfo" + + Version = Column(Integer, primary_key=True) + Kind = Column(String, nullable=False) + Date = Column(DateTime, nullable=False) + 
SoftwareVersion = Column(String, nullable=False) + Comment = Column(Text, nullable=False) + + +class Spectrum(Base): + __tablename__ = "Spectra" + + UniqueSpectrumID = Column(Integer, primary_key=True) + Spectrum = Column(String, nullable=False) + SpectrumHashCode = Column(BigInteger) + + +class SpectrumHeader(Base): + __tablename__ = "SpectrumHeaders" + + SpectrumID = Column(Integer, primary_key=True) + MassPeakID = Column(Integer) + ScanEventID = Column(Integer) + LastScan = Column(Integer) + FirstScan = Column(Integer) + RetentionTime = Column(Float) + Hidden = Column(Boolean, nullable=False, server_default=text("0")) + ScanNumbers = Column(String) + Charge = Column(SmallInteger) + Mass = Column(Float) + CreatingProcessingNodeNumber = Column(Integer, nullable=False) + UniqueSpectrumID = Column(Integer, nullable=False, server_default=text("0")) + + +class SpectrumScore(Base): + __tablename__ = "SpectrumScores" + + ProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False) + SpectrumID = Column(Integer, primary_key=True, nullable=False) + Score = Column(Float, nullable=False) + + +t_TaxonomyNames = Table( + "TaxonomyNames", + metadata, + Column("TaxonomyID", Integer, nullable=False, index=True), + Column("Name", String), + Column("NameCategory", Integer, nullable=False), +) + + +class TaxonomyNode(Base): + __tablename__ = "TaxonomyNodes" + __table_args__ = ( + Index("IX_TaxonomyNodes_LeftNodeIndex_RightNodeIndex", "LeftNodeIndex", "RightNodeIndex"), + ) + + TaxonomyID = Column(Integer, primary_key=True, unique=True) + ParentTaxonomyID = Column(Integer, nullable=False) + TaxonomyRank = Column(Integer, nullable=False) + LeftNodeIndex = Column(Integer, nullable=False) + RightNodeIndex = Column(Integer, nullable=False) + + +t_WorkflowInfo = Table( + "WorkflowInfo", + metadata, + Column("WorkflowName", String, nullable=False), + Column("WorkflowDescription", String, nullable=False), + Column("WorkflowState", Integer, nullable=False, server_default=text("0")), + Column("WorkflowStartDate", DateTime, nullable=False), + Column("WorkflowTemplate", String, nullable=False), + Column("User", String, nullable=False), + Column("WorkflowGUID", String, nullable=False), + Column("MachineGUID", String, nullable=False), + Column("MachineName", String, nullable=False), + Column("MergeSimilarIdentificationResults", Boolean, nullable=False), + Column("IsValid", Boolean, nullable=False), + Column("Version", Integer, nullable=False), +) + + +class WorkflowMessage(Base): + __tablename__ = "WorkflowMessages" + + MessageID = Column(Integer, primary_key=True) + ProcessingNodeID = Column(Integer, nullable=False) + ProcessingNodeNumber = Column(Integer, nullable=False) + Time = Column(BigInteger, nullable=False) + MessageKind = Column(Integer, nullable=False) + Message = Column(String, nullable=False) + + +t_sqlite_sequence = Table( + "sqlite_sequence", metadata, Column("name", NullType), Column("seq", NullType) +) diff --git a/psm_utils/io/peptideshaker.py b/psm_utils/io/peptideshaker.py deleted file mode 100644 index e7a876d..0000000 --- a/psm_utils/io/peptideshaker.py +++ /dev/null @@ -1,201 +0,0 @@ -"""PeptideShaker Extended PSM Report.""" - -import logging -import os -from typing import Union - -import click -import numpy as np -import pandas as pd - -from psm_utils.io.peptide_record import PeptideRecord - -logger = logging.getLogger(__name__) - - -@pd.api.extensions.register_dataframe_accessor("ext_psm_report") -class ExtendedPsmReportAccessor: - """ - Pandas extension for PeptideShaker Extended 
PSM Reports. - - Examples - -------- - >>> import pandas as pd - >>> from ms2rescore.peptideshaker import ExtendedPsmReportAccessor - >>> psm_report = pd.DataFrame.ext_psm_report.from_tsv(kwargs["input_psm_report"]) - >>> peprec = psm_report.ext_psm_report.to_peprec() - """ - - def __init__(self, pandas_obj: pd.DataFrame) -> None: - """Pandas extension for PeptideShaker Extended PSM Reports.""" - self._obj = pandas_obj - self._validate() - - def _validate(self): - """Validate Pandas DataFrame as Extended PSM Report.""" - # TODO: Implement validation of PSM report DataFrame - self.drop_invalid_amino_acids() - - def drop_invalid_amino_acids(self, invalid_amino_acids=r"[BJOUXZ]"): - """Drop all PSMs (rows) with peptides containing invalid amino acids.""" - to_drop = self._obj[ - self._obj['Sequence'].str.contains(invalid_amino_acids, regex=True) - ].index - if len(to_drop) > 0: - logger.warning( - "Dropping %i PSMs from report due to invalid amino acids (%s)", - len(to_drop), - invalid_amino_acids - ) - self._obj = self._obj.drop(index=to_drop) - - @staticmethod - def from_tsv(path: Union[str, os.PathLike]) -> pd.DataFrame: - """Read Extended PSM Report from TSV file.""" - ext_psm_report = pd.read_csv(path, sep="\t", index_col=0) - ext_psm_report.ext_psm_report._validate() - return ext_psm_report - - @staticmethod - def from_xls(path: Union[str, os.PathLike]) -> pd.DataFrame: - """Read Extended PSM Report from XLS file.""" - ext_psm_report = pd.read_excel(path, sheet_name=0, index_col=0) - pd.ext_psm_report._validate(ext_psm_report) - return ext_psm_report - - @staticmethod - def from_file(path: Union[str, os.PathLike]) -> pd.DataFrame: - """Read Extended PSM Report from file, inferring filetype from extension.""" - ext = os.path.splitext(path)[-1].lower() - if (ext == ".tsv") or (ext == ".txt"): - return pd.DataFrame.ext_psm_report.from_tsv(path) - elif (ext == ".xls") or (ext == ".xlsx"): - return pd.DataFrame.ext_psm_report.from_xls(path) - else: - raise NotImplementedError( - f"Extended PSM Report with filetype extension {ext} is not supported." - ) - - @staticmethod - def _parse_modification(modified_seq): - """ - Parse modified sequence to peprec modification string. - - TODO: Do not hardcode modification mapping. - TODO: Refactor method (e.g. use regex for matching). - TODO: Parse C-term modifications - - """ - # Initiate variables for nterm, seq and cterm - mod_list = list() - nterm, seq, cterm = modified_seq.split("-") - - # Initiatle variable for nterm - pyro_bool = False - - # Initiate variables for seq - mod_index = 0 - mod_description = False # to check if it's an amino acid (False) or a description in < ... 
> (True) - - # Check amino terminus for modifications - if nterm == "ace": - mod_list.append("0|Acetyl") - elif nterm == "pyro": - pyro_bool = True - elif nterm != "NH2": - print("Unknown N-terminal modification: {}".format(nterm)) - - # Check internal sequence - for char in seq: - if char == "<": - mod_peprec = "{}|".format(mod_index) - mod_name = "" - mod_description = True - elif char == ">": - mod_description = False - if mod_name == 'ox': - mod_peprec += 'Oxidation' - elif mod_name == 'cmm': - mod_peprec += 'Carbamidomethyl' - elif mod_name == 'deam': - mod_peprec += 'Deamidated' - else: - logger.warning("Unknown internal modification: %s", mod_name) - mod_list.append("{}".format(mod_peprec)) # peprec format - mod_peprec = "" - - else: - if pyro_bool: - if char == 'C': - mod_name = "Pyro-carbamidomethyl" - elif char == 'Q': - mod_name = "Gln->pyro-Glu" - elif char == 'E': - mod_name = "Glu->pyro-Glu" - elif char == 'P': - mod_name = "Pro->pyro-Glu" - else: - logger.warning("Unknown N-terminal pyro modification from %s", char) - mod_list.append("1|{}".format(mod_name)) - pyro_bool = False - mod_index += 1 - mod_name = "" - else: - if mod_description: - mod_name += char - else: - mod_index += 1 - - mods_peprec = "|".join(mod_list) - if mods_peprec == "": - mods_peprec = "-" - - return mods_peprec - - def to_peprec(self): - """Convert Extended PSM Report to PEPREC.""" - column_mapping = { - "Spectrum Title": "spec_id", - "Modified Sequence": "modifications", - "Sequence": "peptide", - "Measured Charge": "charge", - "Decoy": "Label", - "RT": "observed_retention_time", - "Confidence [%]": "psm_score", - } - - # Convert DataFrame to PEPREC - df = self._obj[column_mapping.keys()].rename(columns=column_mapping) - df["charge"] = df["charge"].str.strip("+") - df["modifications"] = df["modifications"].apply(self._parse_modification) - df["Label"] = df["Label"].apply( - lambda x: 1 if x == 0 else (-1 if x == 1 else np.nan) - ) - if df["Label"].isna().any(): - raise ValueError( - "Missing target/decoy labels in PeptideShaker Extended PSM " - "Report." - ) - - peprec = PeptideRecord() - peprec.df = df - return peprec - - def get_search_engine_features(self): - """Get pandas.DataFrame with search engine features.""" - # TODO: Implement this! 
- raise NotImplementedError - - -@click.command() -@click.argument("input-psm-report") -@click.argument("output-peprec") -def main(**kwargs): - """Convert Extended PSM Report to PEPREC.""" - psm_report = pd.DataFrame.ext_psm_report.from_file(kwargs["input_psm_report"]) - peprec = psm_report.ext_psm_report.to_peprec() - peprec.to_csv(kwargs["output_peprec"]) - - -if __name__ == "__main__": - main() diff --git a/psm_utils/io/proteome_discoverer.py b/psm_utils/io/proteome_discoverer.py new file mode 100644 index 0000000..edb7a5e --- /dev/null +++ b/psm_utils/io/proteome_discoverer.py @@ -0,0 +1,304 @@ +"""Reader for Proteome Discoverer MSF PSM files.""" + +import logging +import re +from collections import defaultdict +from pathlib import Path +from typing import Dict, List, Tuple, Union + +import pyteomics.proforma as proforma +from sqlalchemy import create_engine +from sqlalchemy.orm import sessionmaker + +import psm_utils.io._pd_msf_tables as msf +from psm_utils import PSM, Peptidoform +from psm_utils.io._base_classes import ReaderBase + +logger = logging.getLogger(__name__) + +COMPATIBLE_VERSIONS = [79] + + +class MSFReader(ReaderBase): + """Reader for Proteome Discoverer MSF files.""" + + def __init__( + self, + filename: Union[str, Path], + *args, + **kwargs, + ) -> None: + """ + Reader for Proteome Discoverer MSF file. + + Parameters + ---------- + filename: str, pathlib.Path + Path to MSF file. + + """ + super().__init__(filename, *args, **kwargs) + + self._engine = create_engine(f"sqlite:///{self.filename.as_posix()}") + self._session = sessionmaker(bind=self._engine)() + + self._check_version() + + def __len__(self): + """Return number of PSMs in file.""" + return sum( + self._session.query(peptide).count() for peptide in [msf.Peptide, msf.PeptideDecoy] + ) + + def __iter__(self): + """Iterate over file and return PSMs one-by-one.""" + for is_decoy in [False, True]: + modifications = self._get_modifications(is_decoy) + terminal_modifications = self._get_terminal_modifications(is_decoy) + protein_entries = self._get_protein_entries(is_decoy) + main_score = self._get_main_score(is_decoy) + secondary_scores = self._get_secondary_scores(is_decoy) + + for entry in self._iter_peptides(is_decoy): + peptide_id = entry.PeptideDecoy.PeptideID if is_decoy else entry.Peptide.PeptideID + yield self._parse_entry( + entry, + modifications[peptide_id], + terminal_modifications[peptide_id], + protein_entries[peptide_id], + main_score[peptide_id], + secondary_scores[peptide_id], + is_decoy, + ) + + def _check_version(self): + """Check if MSF file version is compatible.""" + version = self._session.query(msf.SchemaInfo.Version).first()[0] + if version not in COMPATIBLE_VERSIONS: + logger.warning( + f"MSF file version {version} might not be compatible with this reader. " + f"Checked versions are: {COMPATIBLE_VERSIONS}." 
+            )
+
+    def _iter_peptides(self, is_decoy: bool):
+        """Iterate over peptides in MSF file."""
+        Peptide = msf.PeptideDecoy if is_decoy else msf.Peptide
+        for entry in (
+            self._session.query(Peptide, msf.SpectrumHeader, msf.MassPeak, msf.FileInfo)
+            .select_from(Peptide)
+            .join(msf.SpectrumHeader, Peptide.SpectrumID == msf.SpectrumHeader.SpectrumID)
+            .join(msf.MassPeak, msf.MassPeak.MassPeakID == msf.SpectrumHeader.MassPeakID)
+            .join(msf.FileInfo, msf.FileInfo.FileID == msf.MassPeak.FileID)
+        ):
+            yield entry
+
+    def _get_modifications(self, is_decoy: bool) -> Dict[int, List[Tuple[int, int]]]:
+        """Get all modifications per peptide ID."""
+        PeptidesAminoAcidModification = (
+            msf.PeptidesAminoAcidModificationsDecoy
+            if is_decoy
+            else msf.PeptidesAminoAcidModification
+        )
+        query = (
+            self._session.query(
+                PeptidesAminoAcidModification.PeptideID,
+                PeptidesAminoAcidModification.Position,
+                msf.AminoAcidModification.UnimodAccession,
+            )
+            .select_from(PeptidesAminoAcidModification)
+            .join(
+                msf.AminoAcidModification,
+                PeptidesAminoAcidModification.AminoAcidModificationID
+                == msf.AminoAcidModification.AminoAcidModificationID,
+            )
+        )
+        modifications_by_peptide = defaultdict(list)
+        for peptide_id, position, unimod_accession in query:
+            modifications_by_peptide[peptide_id].append((position, unimod_accession))
+
+        return modifications_by_peptide
+
+    def _get_terminal_modifications(self, is_decoy: bool) -> Dict[int, List[Tuple[int, int]]]:
+        """Get terminal modifications for a peptide."""
+        PeptidesTerminalModification = (
+            msf.PeptidesTerminalModificationDecoy if is_decoy else msf.PeptidesTerminalModification
+        )
+        query = (
+            self._session.query(
+                PeptidesTerminalModification.PeptideID,
+                msf.AminoAcidModification.PositionType,
+                msf.AminoAcidModification.UnimodAccession,
+            )
+            .select_from(msf.AminoAcidModification)
+            .join(
+                PeptidesTerminalModification,
+                PeptidesTerminalModification.TerminalModificationID
+                == msf.AminoAcidModification.AminoAcidModificationID,
+            )
+        )
+        terminal_modifications = defaultdict(list)
+        for peptide_id, position_type, unimod_accession in query:
+            terminal_modifications[peptide_id].append((position_type, unimod_accession))
+        return terminal_modifications
+
+    def _get_protein_entries(self, is_decoy: bool) -> Dict[int, List[str]]:
+        """Get protein descriptions for a peptide."""
+        PeptidesProtein = msf.PeptidesProteinDecoy if is_decoy else msf.PeptidesProtein
+        query = (
+            self._session.query(PeptidesProtein.PeptideID, msf.ProteinAnnotation.Description)
+            .select_from(PeptidesProtein)
+            .join(
+                msf.ProteinAnnotation,
+                PeptidesProtein.ProteinID == msf.ProteinAnnotation.ProteinID,
+            )
+        )
+        proteins = defaultdict(list)
+        for peptide_id, description in query:
+            proteins[peptide_id].append(re.sub(r"^>", "", description))
+        return proteins
+
+    def _get_main_score(self, is_decoy: bool) -> Dict[int, Tuple[float, str]]:
+        """Get main score and its name for a peptide."""
+        PeptideScore = msf.PeptideScoreDecoy if is_decoy else msf.PeptideScore
+        query = (
+            self._session.query(
+                PeptideScore.PeptideID, PeptideScore.ScoreValue, msf.ProcessingNodeScore.ScoreName
+            )
+            .select_from(PeptideScore)
+            .join(
+                msf.ProcessingNodeScore,
+                msf.ProcessingNodeScore.ScoreID == PeptideScore.ScoreID,
+            )
+            .filter(msf.ProcessingNodeScore.IsMainScore == True)  # noqa: E712
+        )
+        scores = dict()
+        for peptide_id, score_value, score_name in query:
+            scores[peptide_id] = (score_value, score_name)
+        return scores
+
+    def _get_secondary_scores(self, is_decoy: bool) -> Dict[int, Dict[str,
float]]: + """Get secondary scores and their names for a peptide.""" + PeptideScore = msf.PeptideScoreDecoy if is_decoy else msf.PeptideScore + query = ( + self._session.query( + PeptideScore.PeptideID, PeptideScore.ScoreValue, msf.ProcessingNodeScore.ScoreName + ) + .select_from(PeptideScore) + .join( + msf.ProcessingNodeScore, + msf.ProcessingNodeScore.ScoreID == PeptideScore.ScoreID, + ) + .filter(msf.ProcessingNodeScore.IsMainScore == False) # noqa: E712 + ) + scores = defaultdict(dict) + for peptide_id, score_value, score_name in query: + scores[peptide_id][score_name] = score_value + return scores + + def _compile_peptidoform( + self, + sequence: str, + charge: int, + modifications: List[Tuple[int, int]], + terminal_modifications: List[Tuple[int, int]], + ) -> Peptidoform: + """ + Compile a peptidoform from a sequence, charge, and list of (terminal) modifications. + + Parameters + ---------- + sequence + The stripped sequence of the peptidoform. + charge + Precursor charge. + modifications + List of tuples of the form (position, unimod identifier). + terminal_modifications + List of tuples of the form (position type, unimod identifier). + + Notes + ----- + The position type is either 1 (Any N-term), 2 (Any C-term), 3 (Protein N-term), or 4 + (Protein C-term). Position type 0 (Anywhere) should not be present in the + terminal_modifications list. + + """ + modifications_dict = defaultdict(list) + for position, unimod_id in modifications: + modifications_dict[position].append(proforma.process_tag_tokens(f"U:{unimod_id}")) + + n_term = [ + proforma.process_tag_tokens(f"U:{unimod_id}") + for position_type, unimod_id in terminal_modifications + if position_type in [1, 3] # Position types 'Any N-term' or 'Protein N-term' + ] + c_term = [ + proforma.process_tag_tokens(f"U:{unimod_id}") + for position_type, unimod_id in terminal_modifications + if position_type in [2, 4] # Position types 'Any C-term' or 'Protein C-term' + ] + + sequence = [(aa, modifications_dict[i] or None) for i, aa in enumerate(sequence)] + properties = { + "n_term": n_term, + "c_term": c_term, + "charge_state": proforma.ChargeState(charge), + "unlocalized_modifications": [], + "labile_modifications": [], + "fixed_modifications": [], + "intervals": [], + "isotopes": [], + "group_ids": [], + } + + return Peptidoform(proforma.ProForma(sequence, properties)) + + def _parse_entry( + self, + entry: Tuple[msf.Peptide, msf.SpectrumHeader, msf.MassPeak, msf.FileInfo], + modifications: List[Tuple[int, int]], + terminal_modifications: List[Tuple[int, int]], + protein_entries: List[str], + main_score: Tuple[float, str], + secondary_scores: Dict[str, float], + is_decoy: bool, + ) -> PSM: + """Parse an entry from the MSF file.""" + peptide = entry.PeptideDecoy if is_decoy else entry.Peptide + return PSM( + peptidoform=self._compile_peptidoform( + peptide.Sequence, + entry.SpectrumHeader.Charge, + modifications, + terminal_modifications, + ), + spectrum_id=entry.SpectrumHeader.LastScan, + run=Path(entry.FileInfo.FileName).stem, + is_decoy=is_decoy, + score=main_score[0], + qvalue=None, + pep=None, + precursor_mz=entry.MassPeak.Mass, + retention_time=entry.SpectrumHeader.RetentionTime, + ion_mobility=None, + protein_list=protein_entries, + rank=peptide.SearchEngineRank, + source="proteome_discoverer", + provenance_data={ + "scan_numbers": entry.SpectrumHeader.ScanNumbers, + }, + metadata={ + "ms1_intensity": str(entry.MassPeak.Intensity), + "ms1_percent_isolation_interference": str( + entry.MassPeak.PercentIsolationInterference + ), + 
"ms1_ion_inject_time": str(entry.MassPeak.IonInjectTime), + "main_score_name": main_score[1], + **secondary_scores, + }, + rescoring_features={ + "missed_cleavages": peptide.MissedCleavages, + "total_ions_count": peptide.TotalIonsCount, + "matched_ions_count": peptide.MatchedIonsCount, + }, + ) diff --git a/psm_utils/peptidoform.py b/psm_utils/peptidoform.py index f7de6a1..814a8df 100644 --- a/psm_utils/peptidoform.py +++ b/psm_utils/peptidoform.py @@ -12,20 +12,15 @@ class Peptidoform: Peptide sequence, modifications and charge state represented in ProForma notation. """ - def __init__(self, proforma_sequence: str) -> None: + def __init__(self, proforma_sequence: [str, proforma.ProForma]) -> None: """ Peptide sequence, modifications and charge state represented in ProForma notation. Parameters ---------- - proforma_sequence : str - Peptidoform sequence in ProForma v2 notation. - - Examples - -------- - >>> peptidoform = Peptidoform("ACDM[Oxidation]EK") - >>> peptidoform.theoretical_mass - 711.2567622919099 + proforma_sequence + Peptidoform sequence in ProForma v2 notation as :py:class:`str` or + :py:class:`pyteomics.proforma.ProForma` object. Attributes ---------- @@ -34,18 +29,30 @@ def __init__(self, proforma_sequence: str) -> None: properties : dict[str, Any] Dict with sequence-wide properties. + Examples + -------- + >>> peptidoform = Peptidoform("ACDM[Oxidation]EK") + >>> peptidoform.theoretical_mass + 711.2567622919099 + """ - try: - self.parsed_sequence, self.properties = proforma.parse(proforma_sequence) - except proforma.ProFormaError as e: - raise PeptidoformException( - f"Could not parse ProForma sequence: {proforma_sequence}" - ) from e + if isinstance(proforma_sequence, str): + try: + self.parsed_sequence, self.properties = proforma.parse(proforma_sequence) + except proforma.ProFormaError as e: + raise PeptidoformException( + f"Could not parse ProForma sequence: {proforma_sequence}" + ) from e + elif isinstance(proforma_sequence, proforma.ProForma): + self.parsed_sequence = proforma_sequence.sequence + self.properties = proforma_sequence.properties + else: + raise TypeError( + f"Expected ProForma sequence or ProForma object, got {type(proforma_sequence)}." + ) if self.properties["isotopes"]: - raise NotImplementedError( - "Peptidoforms with isotopes are currently not supported." - ) + raise NotImplementedError("Peptidoforms with isotopes are currently not supported.") def __repr__(self) -> str: return f"{self.__class__.__qualname__}('{self.proforma}')" @@ -186,8 +193,7 @@ def sequential_composition(self) -> list[mass.Composition]: position_comp += tag.composition except (AttributeError, KeyError) as e: raise ModificationException( - "Cannot resolve composition for modification " - f"{tag.value}." + "Cannot resolve composition for modification " f"{tag.value}." ) from e comp_list.append(position_comp) @@ -282,9 +288,7 @@ def sequential_theoretical_mass(self) -> float: try: position_mass = mass.std_aa_mass[aa] except (AttributeError, KeyError) as e: - raise AmbiguousResidueException( - f"Cannot resolve mass for amino acid {aa}." 
- ) from e + raise AmbiguousResidueException(f"Cannot resolve mass for amino acid {aa}.") from e # Fixed modifications if aa in fixed_rules: position_mass += fixed_rules[aa] @@ -428,9 +432,7 @@ def _rename_modification_list(mods): "fixed_modifications", ]: if self.properties[mod_type]: - self.properties[mod_type] = _rename_modification_list( - self.properties[mod_type] - ) + self.properties[mod_type] = _rename_modification_list(self.properties[mod_type]) def add_fixed_modifications( self, modification_rules: list[tuple[str, list[str]]] | dict[str, list[str]] diff --git a/psm_utils/psm.py b/psm_utils/psm.py index 8dbf5e7..affa2ab 100644 --- a/psm_utils/psm.py +++ b/psm_utils/psm.py @@ -27,7 +27,7 @@ class PSM(BaseModel): source: Optional[str] = None provenance_data: Optional[Dict[str, str]] = dict() metadata: Optional[Dict[str, str]] = dict() - rescoring_features: Optional[Dict[str, str]] = dict() + rescoring_features: Optional[Dict[str, float]] = dict() class Config: arbitrary_types_allowed = True # Allows non-pydantic class Peptidoform diff --git a/psm_utils/psm_list.py b/psm_utils/psm_list.py index 8efaf6a..ff71c32 100644 --- a/psm_utils/psm_list.py +++ b/psm_utils/psm_list.py @@ -1,7 +1,6 @@ from __future__ import annotations import re -from itertools import compress from typing import Iterable, List, Sequence import numpy as np @@ -99,7 +98,12 @@ def __getitem__(self, item) -> PSM | list[PSM]: return PSMList(psm_list=self.psm_list[item]) elif isinstance(item, str): # Return PSM property as array across full PSMList - return np.array([psm[item] for psm in self.psm_list]) + try: + # Let NumPy coerce dtype (e.g., multidimensional arrays) + return np.array([psm[item] for psm in self.psm_list]) + except ValueError: + # If dtype is not consistent, force dtype to be object + return np.array([psm[item] for psm in self.psm_list], dtype=object) elif _is_iterable_of_bools(item): # Return new PSMList with items that were True return PSMList(psm_list=[self.psm_list[i] for i in np.flatnonzero(item)]) @@ -121,7 +125,7 @@ def __setitem__(self, item, values: Sequence) -> None: @property def collections(self) -> list: """List of collections in :py:class:`PSMList`.""" - if (self["collection"] != None).any(): + if (self["collection"] != None).any(): # noqa: E711 return list(np.unique(self["collection"])) else: return [None] @@ -129,11 +133,19 @@ def collections(self) -> list: @property def runs(self) -> list: """List of runs in :py:class:`PSMList`.""" - if (self["run"] != None).any(): + if (self["run"] != None).any(): # noqa: E711 return list(np.unique(self["run"])) else: return [None] + def append(self, psm: PSM) -> None: + """Append PSM to :py:class:`PSMList`.""" + self.psm_list.append(psm) + + def extend(self, psm_list: PSMList) -> None: + """Extend :py:class:`PSMList` with another :py:class:`PSMList`.""" + self.psm_list.extend(psm_list) + def get_psm_dict(self): """Get nested dictionary of PSMs by collection, run, and spectrum_id.""" psm_dict = {} @@ -199,9 +211,7 @@ def find_decoys(self, decoy_pattern: str) -> None: """ decoy_pattern = re.compile(decoy_pattern) for psm in self: - psm.is_decoy = all( - [decoy_pattern.search(p) is not None for p in psm.protein_list] - ) + psm.is_decoy = all([decoy_pattern.search(p) is not None for p in psm.protein_list]) def calculate_qvalues(self, reverse: bool = True, **kwargs) -> None: """ @@ -289,9 +299,7 @@ def add_fixed_modifications( ] for psm in self.psm_list: if psm.peptidoform.properties["fixed_modifications"]: - 
psm.peptidoform.properties["fixed_modifications"].extend( - modification_rules - ) + psm.peptidoform.properties["fixed_modifications"].extend(modification_rules) else: psm.peptidoform.properties["fixed_modifications"] = modification_rules diff --git a/pyproject.toml b/pyproject.toml index d2005e0..f47b71e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,12 +2,12 @@ name = "psm-utils" description = "Common utilities for parsing and handling peptide-spectrum matches and search engine results." readme = "README.rst" -license = {file = "LICENSE"} +license = { file = "LICENSE" } keywords = ["proteomics", "peptide", "spectrum", "identification", "parsing"] authors = [ - {name = "Ralf Gabriels", email = "ralf@gabriels.dev"}, - {name = "Robbin Bouwmeester", email = "robbin.bouwmeester@ugent.be"}, - {name = "Arthur Declercq", email = "arthur.declercq@ugent.be"}, + { name = "Ralf Gabriels", email = "ralf@gabriels.dev" }, + { name = "Robbin Bouwmeester", email = "robbin.bouwmeester@ugent.be" }, + { name = "Arthur Declercq", email = "arthur.declercq@ugent.be" }, ] classifiers = [ "Intended Audience :: Science/Research", @@ -28,15 +28,11 @@ dependencies = [ "click", "rich", "pydantic", + "sqlalchemy", ] [project.optional-dependencies] -dev = [ - "black", - "isort>5", - "pytest", - "pytest-cov" -] +dev = ["black", "isort>5", "pytest", "pytest-cov"] docs = [ "sphinx", "numpydoc>=1,<2", @@ -47,10 +43,7 @@ docs = [ "sphinx_rtd_theme", "sphinx-autobuild", ] -online = [ - "streamlit", - "plotly", -] +online = ["streamlit", "plotly"] [project.urls] GitHub = "https://github.com/compomics/psm_utils" @@ -74,3 +67,7 @@ profile = "black" [tool.black] line-length = 99 target-version = ['py37'] + +[tool.ruff] +line-length = 99 +target-version = 'py37' diff --git a/tests/test_io/test_msamanda.py b/tests/test_io/test_msamanda.py index cecb62e..8007b02 100644 --- a/tests/test_io/test_msamanda.py +++ b/tests/test_io/test_msamanda.py @@ -1,7 +1,6 @@ import pytest import psm_utils.io.msamanda as msamanda -from psm_utils import peptidoform, psm, psm_list TEST_COL = [ "Title", From ab8390d790209e871cf4baab0a630664a9cc918d Mon Sep 17 00:00:00 2001 From: RalfG Date: Fri, 25 Aug 2023 16:53:25 +0200 Subject: [PATCH 03/13] Black formatting (line length 99) --- psm_utils/io/__init__.py | 8 ++------ psm_utils/io/_base_classes.py | 1 + 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/psm_utils/io/__init__.py b/psm_utils/io/__init__.py index be0b424..f77e6f5 100644 --- a/psm_utils/io/__init__.py +++ b/psm_utils/io/__init__.py @@ -92,9 +92,7 @@ def _infer_filetype(filename: str): """Infer filetype from filename.""" for filetype, properties in FILETYPES.items(): - if re.fullmatch( - properties["filename_pattern"], str(filename), flags=re.IGNORECASE - ): + if re.fullmatch(properties["filename_pattern"], str(filename), flags=re.IGNORECASE): return filetype else: raise PSMUtilsIOException("Could not infer filetype.") @@ -260,9 +258,7 @@ def convert( if _supports_write_psm(writer_cls): # Setup iterator, potentially with progress bar iterator = ( - track(reader, description="[green]Converting file") - if show_progressbar - else reader + track(reader, description="[green]Converting file") if show_progressbar else reader ) # Get example PSM and instantiate writer diff --git a/psm_utils/io/_base_classes.py b/psm_utils/io/_base_classes.py index f4c3cfc..60f9c19 100644 --- a/psm_utils/io/_base_classes.py +++ b/psm_utils/io/_base_classes.py @@ -45,6 +45,7 @@ def read_file(self) -> PSMList: """Read full PSM file into a PSMList 
object.""" return PSMList(psm_list=[psm for psm in self.__iter__()]) + class WriterBase(ABC): """Abstract base class for PSM file writers.""" From c20f728465975ef28b958f596e22431dff3311d8 Mon Sep 17 00:00:00 2001 From: RalfG Date: Fri, 25 Aug 2023 17:50:18 +0200 Subject: [PATCH 04/13] `io.mzid`: Allow score key not to be present in all PSMs in a single mzid file `io.mzid`: Add support for user to define custom score key `io.mzid`: Add `Proteome Discoverer Delta Score` to known scores --- psm_utils/io/mzid.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/psm_utils/io/mzid.py b/psm_utils/io/mzid.py index 0aae4ec..0a1ef20 100644 --- a/psm_utils/io/mzid.py +++ b/psm_utils/io/mzid.py @@ -57,6 +57,7 @@ "ProteinProspector:score", "ProteinScape:SequestMetaScore", "ProteomeDiscoverer:Delta Score", + "Proteome Discoverer Delta Score", "SEQUEST:xcorr", "SIM-XL score ", "SQID:score ", @@ -87,7 +88,7 @@ class MzidReader(ReaderBase): - def __init__(self, filename: str | Path, *args, **kwargs) -> None: + def __init__(self, filename: str | Path, *args, score_key: str = None, **kwargs) -> None: """ Reader for mzIdentML PSM files. @@ -95,6 +96,9 @@ def __init__(self, filename: str | Path, *args, **kwargs) -> None: ---------- filename: str, pathlib.Path Path to PSM file. + score_key: str, optional + Name of the score metric to use as PSM score. If not provided, the score metric is + inferred from the file if one of the child parameters of ``MS:1001143`` is present. Examples -------- @@ -127,8 +131,10 @@ def __init__(self, filename: str | Path, *args, **kwargs) -> None: """ super().__init__(filename, *args, **kwargs) + + self.score_key = score_key + self._non_metadata_keys = None - self._score_key = None self._rt_key = None self._spectrum_rt_key = None self._qvalue_key = None @@ -258,12 +264,17 @@ def _get_peptide_spectrum_match( else: psm_spectrum_id = spectrum_id + try: + score = sii[self.score_key] + except KeyError: + score = None + psm = PSM( peptidoform=peptidoform, spectrum_id=psm_spectrum_id, run=run, is_decoy=is_decoy, - score=sii[self._score_key], + score=score, qvalue=sii[self._qvalue_key] if self._qvalue_key else None, pep=sii[self._pep_key] if self._pep_key else None, precursor_mz=precursor_mz, @@ -288,8 +299,9 @@ def _get_non_metadata_keys(self, keys: list): "Modification", ] # Get the score key and add to default keys - self._score_key = self._infer_score_name(keys) - default_keys.append(self._score_key) + if not self.score_key: + self.score_key = self._infer_score_name(keys) + default_keys.append(self.score_key) # Get the q-value key and add to default keys self._qvalue_key = self._infer_qvalue_name(keys) From 86a0b1d3c734252280135137fdb4536b1d99da46 Mon Sep 17 00:00:00 2001 From: RalfG Date: Fri, 25 Aug 2023 18:12:11 +0200 Subject: [PATCH 05/13] Fix sage test with coerced float rescoring_features --- tests/test_data/results.sage.tsv | 2 +- tests/test_io/test_sage.py | 37 ++++++++++++++++++++++++++------ 2 files changed, 31 insertions(+), 8 deletions(-) diff --git a/tests/test_data/results.sage.tsv b/tests/test_data/results.sage.tsv index 6840337..9d6ec16 100644 --- a/tests/test_data/results.sage.tsv +++ b/tests/test_data/results.sage.tsv @@ -1,2 +1,2 @@ peptide proteins num_proteins filename scannr rank label expmass calcmass charge peptide_len missed_cleavages isotope_error precursor_ppm fragment_ppm hyperscore delta_next delta_best rt aligned_rt predicted_rt delta_rt_model matched_peaks longest_b longest_y longest_y_pct matched_intensity_pct 
scored_candidates poisson sage_discriminant_score posterior_error spectrum_q peptide_q protein_q ms1_intensity ms2_intensity -LQSRPAAPPAPGPGQLTLR sp|Q99536|VAT1_HUMAN 1 LQSRPAAPPAPGPGQLTLR.mzML controllerType=0 controllerNumber=1 scan=30069 1 1 1926.0815 1926.08 3 19 0 0.0 0.8239083 0.5347518 71.78844460255384 71.78844460255384 0.0 108.2854 NaN 0.0 NaN 22 9 12 0.6315789 50.785 1 -1.9562811911083433 1.2944585 1.0 1.0 1.0 1.0 306146180.0 56930696.0 +LQSRPAAPPAPGPGQLTLR sp|Q99536|VAT1_HUMAN 1 LQSRPAAPPAPGPGQLTLR.mzML controllerType=0 controllerNumber=1 scan=30069 1 1 1926.0815 1926.08 3 19 0 0.0 0.8239083 0.5347518 71.78844460255384 71.78844460255384 0.0 108.2854 0.0 0.0 0.0 22 9 12 0.6315789 50.785 1 -1.9562811911083433 1.2944585 1.0 1.0 1.0 1.0 306146180.0 56930696.0 diff --git a/tests/test_io/test_sage.py b/tests/test_io/test_sage.py index d7addd6..10d2bcc 100644 --- a/tests/test_io/test_sage.py +++ b/tests/test_io/test_sage.py @@ -1,12 +1,12 @@ """Tests for psm_utils.io.sage.""" -from psm_utils.psm import PSM from psm_utils.io.sage import SageReader +from psm_utils.psm import PSM test_psm = PSM( - peptidoform='LQSRPAAPPAPGPGQLTLR/3', - spectrum_id='controllerType=0 controllerNumber=1 scan=30069', - run='LQSRPAAPPAPGPGQLTLR', + peptidoform="LQSRPAAPPAPGPGQLTLR/3", + spectrum_id="controllerType=0 controllerNumber=1 scan=30069", + run="LQSRPAAPPAPGPGQLTLR", collection=None, spectrum=None, is_decoy=False, @@ -16,11 +16,34 @@ precursor_mz=643.0349916987367, retention_time=108.2854, ion_mobility=None, - protein_list=['sp|Q99536|VAT1_HUMAN'], + protein_list=["sp|Q99536|VAT1_HUMAN"], rank=1, - source='sage', + source="sage", metadata={}, - rescoring_features={'expmass': '1926.0815', 'calcmass': '1926.08', 'peptide_len': '19', 'missed_cleavages': '0', 'isotope_error': '0.0', 'precursor_ppm': '0.8239083', 'fragment_ppm': '0.5347518', 'hyperscore': '71.78844460255384', 'delta_next': '71.78844460255384', 'delta_best': '0.0', 'delta_rt_model': 'NaN', 'aligned_rt': 'NaN', 'predicted_rt': '0.0', 'matched_peaks': '22', 'longest_b': '9', 'longest_y': '12', 'longest_y_pct': '0.6315789', 'matched_intensity_pct': '50.785', 'scored_candidates': '1', 'poisson': '-1.9562811911083433', 'ms1_intensity': '306146180.0', 'ms2_intensity': '56930696.0'}, + rescoring_features={ + "expmass": 1926.0815, + "calcmass": 1926.08, + "peptide_len": 19.0, + "missed_cleavages": 0.0, + "isotope_error": 0.0, + "precursor_ppm": 0.8239083, + "fragment_ppm": 0.5347518, + "hyperscore": 71.78844460255384, + "delta_next": 71.78844460255384, + "delta_best": 0.0, + "delta_rt_model": 0.0, + "aligned_rt": 0.0, + "predicted_rt": 0.0, + "matched_peaks": 22.0, + "longest_b": 9.0, + "longest_y": 12.0, + "longest_y_pct": 0.6315789, + "matched_intensity_pct": 50.785, + "scored_candidates": 1.0, + "poisson": -1.9562811911083433, + "ms1_intensity": 306146180.0, + "ms2_intensity": 56930696.0, + }, ) From 22373f9b670858bf990b81aab0c1134cdf2b1c77 Mon Sep 17 00:00:00 2001 From: Buur Louise <77845950+louisebuur@users.noreply.github.com> Date: Tue, 12 Sep 2023 16:31:43 +0200 Subject: [PATCH 06/13] Update msamanda.py --- psm_utils/io/msamanda.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/psm_utils/io/msamanda.py b/psm_utils/io/msamanda.py index b79c7fd..3040a74 100644 --- a/psm_utils/io/msamanda.py +++ b/psm_utils/io/msamanda.py @@ -121,16 +121,15 @@ def _parse_peptidoform(seq, modifications, charge): "Parse MSAmanda sequence, modifications and charge to proforma sequence" peptide = [""] + [aa.upper() for aa in seq] + 
[""] pattern = re.compile( - r"(?P[A-Z])(?P-term|\d+)\((?P[A-Za-z]+)\|([-0-9.]+)\|(variable|fixed)\);?" + r"(?:(?:(?P[A-Z])(?P\d+))|(?P[CN]-Term))\((?P[^|()]+)\|(?P[-0-9.]+)\|(?Pvariable|fixed)\);?" ) for match in pattern.finditer(modifications): - if match.group("loc") == "-term": - if match.group("site") == "N": - peptide[0] = peptide[0] + f'[{match.group("mod_name")}]' - elif match.group("site") == "C": - peptide[-1] = peptide[-1] + f'[{match.group("mod_name")}]' - else: + if match.group("term") == "N-Term": + peptide[0] = peptide[0] + f'[{match.group("mod_name")}]' + elif match.group("term") == "C-Term": + peptide[-1] = peptide[-1] + f'[{match.group("mod_name")}]' + if match.group("loc") is not None: peptide[int(match.group("loc"))] = ( peptide[int(match.group("loc"))] + f'[{match.group("mod_name")}]' ) From dcf28dba8a405b4fb7c6c34a5061a022769025b9 Mon Sep 17 00:00:00 2001 From: louisebuur <77845950+louisebuur@users.noreply.github.com> Date: Wed, 13 Sep 2023 14:57:14 +0200 Subject: [PATCH 07/13] Update test_msamanda.py --- tests/test_io/test_msamanda.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_io/test_msamanda.py b/tests/test_io/test_msamanda.py index 8007b02..48f6820 100644 --- a/tests/test_io/test_msamanda.py +++ b/tests/test_io/test_msamanda.py @@ -50,7 +50,7 @@ def test_parse_peptidoform(self): 4, ), ("TLPMFHDEEHAR", "", 3), - ("VSAGEIAVTGAGR", "C-term(Amidated|-0.984016|variable)", 2), + ("VSAGEIAVTGAGR", "C-Term(Amidated|-0.984016|variable)", 2), ("VQAELDETK", "", 2), ], "expected_output": [ From 7be325ed8736b42ff68d86049915d694f16bf19e Mon Sep 17 00:00:00 2001 From: louisebuur <77845950+louisebuur@users.noreply.github.com> Date: Wed, 13 Sep 2023 15:08:51 +0200 Subject: [PATCH 08/13] Update test_msamanda.py --- tests/test_io/test_msamanda.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_io/test_msamanda.py b/tests/test_io/test_msamanda.py index 48f6820..82e94f1 100644 --- a/tests/test_io/test_msamanda.py +++ b/tests/test_io/test_msamanda.py @@ -46,7 +46,7 @@ def test_parse_peptidoform(self): ), ( "LRDTcLQK", - "N-term(Acetyl|40|variable);C5(Carbamidomethyl|57.021464|fixed)", + "N-Term(Acetyl|40|variable);C5(Carbamidomethyl|57.021464|fixed)", 4, ), ("TLPMFHDEEHAR", "", 3), From adb0f875be9b429ffc96578d9858c3abc86c50c1 Mon Sep 17 00:00:00 2001 From: RalfG Date: Wed, 13 Sep 2023 15:45:57 +0200 Subject: [PATCH 09/13] Set newer build>os configuration for readthedocs --- .readthedocs.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.readthedocs.yml b/.readthedocs.yml index 0a400bd..3f8e86a 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -1,7 +1,11 @@ version: 2 +build: + os: ubuntu-22.04 + tools: + python: "3.11" + python: - version: '3.8' install: - method: pip path: . From 357c7495336b2276bb2564119b90bd380bcfcca3 Mon Sep 17 00:00:00 2001 From: RalfG Date: Wed, 13 Sep 2023 15:47:27 +0200 Subject: [PATCH 10/13] io.mzid: Give `PeptideShaker PSM score` priority over other potential search engine scores (required for correct PeptideShaker mzid parsing) --- psm_utils/io/mzid.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/psm_utils/io/mzid.py b/psm_utils/io/mzid.py index 0a1ef20..315b153 100644 --- a/psm_utils/io/mzid.py +++ b/psm_utils/io/mzid.py @@ -29,7 +29,9 @@ # Excerpt from MS:1001143 items (PSM-level search engine specific statistic) # Not all child terms are used, as not all statistics are direct scores. 
+# Items are sorted by priority (if more scores are present, the first found one is used) STANDARD_SEARCHENGINE_SCORES = [ + "PeptideShaker PSM score", "Amanda:AmandaScore", "Andromeda:score", "Byonic:Score", @@ -49,7 +51,6 @@ "OMSSA:evalue", "OpenPepXL:score", "PEAKS:peptideScore", - "PeptideShaker PSM score", "Phenyx:Pepzscore", "ProLuCID:xcorr", "ProSight:specral C-score", From 5434309255366087bd6391abd066c187819b0540 Mon Sep 17 00:00:00 2001 From: RalfG Date: Wed, 13 Sep 2023 15:47:59 +0200 Subject: [PATCH 11/13] Upgrade Github action versions --- .github/workflows/publish.yml | 34 +++++++------- .github/workflows/test.yml | 87 +++++++++++++++++------------------ 2 files changed, 60 insertions(+), 61 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 29b3934..91ac915 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -8,24 +8,24 @@ jobs: publish: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: '3.8' + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.8" - - name: Install dependencies - run: | - python -m pip install --upgrade pip flit - pip install --editable .[dev] + - name: Install dependencies + run: | + python -m pip install --upgrade pip flit + pip install --editable .[dev] - - name: Test package - run: | - pytest + - name: Test package + run: | + pytest - - name: Build and publish to PyPI - env: - FLIT_USERNAME: ${{ secrets.PYPI_USERNAME }} - FLIT_PASSWORD: ${{ secrets.PYPI_PASSWORD }} - run: flit publish + - name: Build and publish to PyPI + env: + FLIT_USERNAME: ${{ secrets.PYPI_USERNAME }} + FLIT_PASSWORD: ${{ secrets.PYPI_PASSWORD }} + run: flit publish diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 766f11e..bea1eee 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -10,64 +10,63 @@ jobs: test-with-codecov: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - - name: Set up Python 3.8 - uses: actions/setup-python@v4 - with: - python-version: '3.8' + - name: Set up Python 3.8 + uses: actions/setup-python@v4 + with: + python-version: "3.8" - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install flit flake8 + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install flit flake8 - - name: Lint with flake8 - run: | - # stop the build if there are Python syntax errors or undefined names - flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics - # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide - flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - - name: Install with Flit - run: flit install --deps all + - name: Install with Flit + run: flit install --deps all - - name: Test with pytest and codecov - run: | - pytest --cov=psm_utils tests - - - name: Upload coverage reports to Codecov - uses: codecov/codecov-action@v3 + - name: Test with pytest and codecov + run: | + pytest --cov=psm_utils tests + - name: Upload coverage reports to Codecov + uses: codecov/codecov-action@v3 test-platforms: runs-on: ubuntu-latest strategy: matrix: - python-version: ['3.8', '3.9', '3.10', '3.11'] + python-version: ["3.8", "3.9", "3.10", "3.11"] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} - - name: Install package and its dependencies - run: | - python -m pip install --upgrade pip - pip install flit flake8 + - name: Install package and its dependencies + run: | + python -m pip install --upgrade pip + pip install flit flake8 - - name: Lint with flake8 - run: | - # stop the build if there are Python syntax errors or undefined names - flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics - # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide - flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - - name: Install with Flit - run: flit install --deps all + - name: Install with Flit + run: flit install --deps all - - name: Test with pytest - run: | - pytest + - name: Test with pytest + run: | + pytest From 7fc6cd775b93835a936f5c82ec55af2706162c03 Mon Sep 17 00:00:00 2001 From: RalfG Date: Wed, 13 Sep 2023 15:58:38 +0200 Subject: [PATCH 12/13] online: Remove useless `== True` --- online/pages/1_PSM_file_statistics.py | 31 ++++++++------------------- 1 file changed, 9 insertions(+), 22 deletions(-) diff --git a/online/pages/1_PSM_file_statistics.py b/online/pages/1_PSM_file_statistics.py index f143853..9daf6f7 100644 --- a/online/pages/1_PSM_file_statistics.py +++ b/online/pages/1_PSM_file_statistics.py @@ -92,14 +92,14 @@ def _input_form(self): "Score type: order", options=[True, False], format_func=lambda x: "Higher score is better" - if x == True + if x else "Lower score is better", ) self.state["log_scale"] = row[1].radio( "Score type: scale", options=[False, True], format_func=lambda x: "Logarithmic scale (e.g., e-value)" - if x == True + if x else "Linear scale (e.g., Andromeda score)", help=( """ @@ -142,9 +142,7 @@ def _read_file(self): # Write file to disk for psm_utils; then read with NamedTemporaryFile(mode="wb", delete=False) as tmp_file: if self.state["input_file"].name.lower().endswith(".gz"): - tmp_file.write( - gzip.decompress(self.state["input_file"].getvalue()) - ) + tmp_file.write(gzip.decompress(self.state["input_file"].getvalue())) else: tmp_file.write(self.state["input_file"].getvalue()) tmp_file.flush() @@ -179,7 +177,7 @@ def _prepare_psms(self): ) # If no q-values, try to calculate - if (psm_list["qvalue"] == None).any(): + if (psm_list["qvalue"] == None).any(): # noqa: E711 # If no decoys, display error if percent_decoys == 0.0: st.error( @@ -231,13 +229,9 @@ def _show_results(self): n_collections = psm_df["collection"].unique().shape[0] n_runs = psm_df[["run", "collection"]].drop_duplicates().shape[0] - n_spectra = ( - psm_df[["spectrum_id", "run", "collection"]].drop_duplicates().shape[0] - ) + n_spectra = psm_df[["spectrum_id", "run", "collection"]].drop_duplicates().shape[0] n_psms = psm_df.shape[0] - n_peptidoforms = ( - psm_df["peptidoform"].apply(lambda x: x.proforma).unique().shape[0] - ) + n_peptidoforms = psm_df["peptidoform"].apply(lambda x: x.proforma).unique().shape[0] percent_decoys = np.count_nonzero(psm_list["is_decoy"]) / len(psm_list) row_1 = st.columns(3) @@ -262,19 +256,12 @@ def _show_results(self): else: psm_df_filtered = psm_df[psm_df["qvalue"] <= self.state["fdr_threshold"]] n_spectra = ( - psm_df_filtered[["spectrum_id", "run", "collection"]] - .drop_duplicates() - .shape[0] + psm_df_filtered[["spectrum_id", "run", "collection"]].drop_duplicates().shape[0] ) n_psms = psm_df_filtered.shape[0] - n_peptides = ( - psm_df["peptidoform"].apply(lambda x: x.sequence).unique().shape[0] - ) + n_peptides = psm_df["peptidoform"].apply(lambda x: x.sequence).unique().shape[0] n_peptidoforms = ( - psm_df_filtered["peptidoform"] - .apply(lambda x: x.proforma) - .unique() - .shape[0] + psm_df_filtered["peptidoform"].apply(lambda x: x.proforma).unique().shape[0] ) row_3 = st.columns(4) From 600697b01b53d4b478389dc6bc0c0be54688718e Mon Sep 17 00:00:00 2001 From: RalfG Date: Fri, 15 Sep 2023 11:51:33 +0200 Subject: [PATCH 13/13] io.tsv: Raise more descriptive error if TSV row cannot be parsed into PSM --- psm_utils/io/tsv.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 
deletions(-) diff --git a/psm_utils/io/tsv.py b/psm_utils/io/tsv.py index 60f7cc1..98751f8 100644 --- a/psm_utils/io/tsv.py +++ b/psm_utils/io/tsv.py @@ -52,6 +52,8 @@ from pathlib import Path from typing import Optional +from pydantic import ValidationError + from psm_utils.io._base_classes import ReaderBase, WriterBase from psm_utils.io.exceptions import PSMUtilsIOException from psm_utils.psm import PSM @@ -66,10 +68,13 @@ def __iter__(self): with open(self.filename, "rt") as open_file: reader = csv.DictReader(open_file, delimiter="\t") for row in reader: - yield PSM(**self._parse_entry(row)) + try: + yield PSM(**self._parse_entry(row)) + except ValidationError as e: + raise PSMUtilsIOException(f"Could not parse PSM from row: `{row}`") from e @staticmethod - def _parse_entry(entry: dict): + def _parse_entry(entry: dict) -> dict: """Parse single TSV entry to :py:class:`~psm_utils.psm.PSM`.""" # Replace empty strings with None entry = {k: v if v else None for k, v in entry.items()}
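
A minimal usage sketch of the reader-facing changes in this series. File paths and the chosen score
name are illustrative only; `MzidReader` and its new `score_key` argument come from the mzid patches
above, `PSMUtilsIOException` is imported as shown in the tsv.py diff, and `TSVReader` is assumed to
be the reader class defined in `psm_utils/io/tsv.py`.

    from psm_utils.io.exceptions import PSMUtilsIOException
    from psm_utils.io.mzid import MzidReader
    from psm_utils.io.tsv import TSVReader

    # Override score inference with an explicit score key; "PeptideShaker PSM score"
    # is one of the known scores and now takes priority during inference as well.
    psm_list = MzidReader("peptide_shaker.mzid", score_key="PeptideShaker PSM score").read_file()

    # TSV rows that cannot be validated into a PSM now surface as a descriptive
    # PSMUtilsIOException instead of a bare pydantic ValidationError.
    try:
        TSVReader("results.tsv").read_file()
    except PSMUtilsIOException as exc:
        print(exc)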