diff --git a/src/gpsea/model/_protein.py b/src/gpsea/model/_protein.py index 2fe55684..6db36e09 100644 --- a/src/gpsea/model/_protein.py +++ b/src/gpsea/model/_protein.py @@ -118,21 +118,28 @@ class FeatureType(enum.Enum): A region of interest that cannot be described in other subsections. """ + ZINC_FINGER = enum.auto() + """ + A zinc finger is a small, functional, independently folded domain that coordinates one or more zinc ions to stabilize its structure through cysteine and/or histidine residues. + """ + @staticmethod def from_string(category: str) -> "FeatureType": - cat_lover = category.lower() - if cat_lover == "repeat": + cat_lower = category.lower() + if cat_lower == "repeat": return FeatureType.REGION - elif cat_lover == "motif": + elif cat_lower == "motif": return FeatureType.MOTIF - elif cat_lover == "domain": + elif cat_lower == "domain": return FeatureType.DOMAIN - elif cat_lover == "region": + elif cat_lower == "region": return FeatureType.REGION - elif cat_lover == "coiled coil": + elif cat_lower == "coiled coil": return FeatureType.REGION - elif cat_lover == "compositional bias": + elif cat_lower == "compositional bias": return FeatureType.COMPOSITIONAL_BIAS + elif cat_lower == "zinc finger": + return FeatureType.ZINC_FINGER else: raise ValueError(f'Unrecognized protein feature type: "{category}"') @@ -361,19 +368,16 @@ def from_uniprot_json( regions = list() for feature in data["features"]: - try: - region_name = feature["description"] - locus = feature["location"] - region_start = int(locus["start"]["value"]) - 1 # convert to 0-based coordinates - region_end = int(locus["end"]["value"]) - feature_type = FeatureType.from_string(feature["type"]) - finfo = FeatureInfo( - name=region_name, region=Region(start=region_start, end=region_end) - ) - pfeature = ProteinFeature.create(info=finfo, feature_type=feature_type) - regions.append(pfeature) - except Exception as feature_exception: - print(f"Could not parse feature: {str(feature_exception)} 
(skipping)") + region_name = feature["description"] + locus = feature["location"] + region_start = int(locus["start"]["value"]) - 1 # convert to 0-based coordinates + region_end = int(locus["end"]["value"]) + feature_type = FeatureType.from_string(feature["type"]) + finfo = FeatureInfo( + name=region_name, region=Region(start=region_start, end=region_end) + ) + pfeature = ProteinFeature.create(info=finfo, feature_type=feature_type) + regions.append(pfeature) return ProteinMetadata( protein_id=protein_id, diff --git a/src/gpsea/preprocessing/_uniprot.py b/src/gpsea/preprocessing/_uniprot.py index ccdf5ed7..03ee6663 100644 --- a/src/gpsea/preprocessing/_uniprot.py +++ b/src/gpsea/preprocessing/_uniprot.py @@ -110,7 +110,9 @@ def annotate(self, protein_id: str) -> ProteinMetadata: Args: protein_id (str): A protein ID Returns: - Sequence[ProteinMetadata]: A sequence of ProteinMetadata objects, or an empty sequence if no data was found. + ProteinMetadata: A :class:`~gpsea.model.ProteinMetadata` corresponding to the input `protein_id`. + Raises: + ValueError: in case of issues with `protein_id`, I/O issues, or parsing the REST response. 
""" if not isinstance(protein_id, str): raise ValueError(f'Protein ID must be a str but it was {type(protein_id)}') diff --git a/tests/preprocessing/data/uniprot_response/P17010_manual_download.json b/tests/preprocessing/data/uniprot_response/P17010_manual_download.json new file mode 100644 index 00000000..eb8e66d3 --- /dev/null +++ b/tests/preprocessing/data/uniprot_response/P17010_manual_download.json @@ -0,0 +1,282 @@ +{ + "entryType": "UniProtKB reviewed (Swiss-Prot)", + "primaryAccession": "P17010", + "features": [ + { + "type": "Zinc finger", + "location": { + "start": { + "value": 425, + "modifier": "EXACT" + }, + "end": { + "value": 447, + "modifier": "EXACT" + } + }, + "description": "C2H2-type 1", + "evidences": [ + { + "evidenceCode": "ECO:0000255", + "source": "PROSITE-ProRule", + "id": "PRU00042" + } + ] + }, + { + "type": "Zinc finger", + "location": { + "start": { + "value": 456, + "modifier": "EXACT" + }, + "end": { + "value": 478, + "modifier": "EXACT" + } + }, + "description": "C2H2-type 2", + "evidences": [ + { + "evidenceCode": "ECO:0000255", + "source": "PROSITE-ProRule", + "id": "PRU00042" + } + ] + }, + { + "type": "Zinc finger", + "location": { + "start": { + "value": 488, + "modifier": "EXACT" + }, + "end": { + "value": 510, + "modifier": "EXACT" + } + }, + "description": "C2H2-type 3", + "evidences": [ + { + "evidenceCode": "ECO:0000255", + "source": "PROSITE-ProRule", + "id": "PRU00042" + } + ] + }, + { + "type": "Zinc finger", + "location": { + "start": { + "value": 519, + "modifier": "EXACT" + }, + "end": { + "value": 542, + "modifier": "EXACT" + } + }, + "description": "C2H2-type 4", + "evidences": [ + { + "evidenceCode": "ECO:0000255", + "source": "PROSITE-ProRule", + "id": "PRU00042" + } + ] + }, + { + "type": "Zinc finger", + "location": { + "start": { + "value": 548, + "modifier": "EXACT" + }, + "end": { + "value": 570, + "modifier": "EXACT" + } + }, + "description": "C2H2-type 5", + "evidences": [ + { + "evidenceCode": 
"ECO:0000255", + "source": "PROSITE-ProRule", + "id": "PRU00042" + } + ] + }, + { + "type": "Zinc finger", + "location": { + "start": { + "value": 576, + "modifier": "EXACT" + }, + "end": { + "value": 599, + "modifier": "EXACT" + } + }, + "description": "C2H2-type 6", + "evidences": [ + { + "evidenceCode": "ECO:0000255", + "source": "PROSITE-ProRule", + "id": "PRU00042" + } + ] + }, + { + "type": "Zinc finger", + "location": { + "start": { + "value": 605, + "modifier": "EXACT" + }, + "end": { + "value": 627, + "modifier": "EXACT" + } + }, + "description": "C2H2-type 7", + "evidences": [ + { + "evidenceCode": "ECO:0000255", + "source": "PROSITE-ProRule", + "id": "PRU00042" + } + ] + }, + { + "type": "Zinc finger", + "location": { + "start": { + "value": 633, + "modifier": "EXACT" + }, + "end": { + "value": 656, + "modifier": "EXACT" + } + }, + "description": "C2H2-type 8", + "evidences": [ + { + "evidenceCode": "ECO:0000255", + "source": "PROSITE-ProRule", + "id": "PRU00042" + } + ] + }, + { + "type": "Zinc finger", + "location": { + "start": { + "value": 662, + "modifier": "EXACT" + }, + "end": { + "value": 684, + "modifier": "EXACT" + } + }, + "description": "C2H2-type 9", + "evidences": [ + { + "evidenceCode": "ECO:0000255", + "source": "PROSITE-ProRule", + "id": "PRU00042" + } + ] + }, + { + "type": "Zinc finger", + "location": { + "start": { + "value": 690, + "modifier": "EXACT" + }, + "end": { + "value": 713, + "modifier": "EXACT" + } + }, + "description": "C2H2-type 10", + "evidences": [ + { + "evidenceCode": "ECO:0000255", + "source": "PROSITE-ProRule", + "id": "PRU00042" + } + ] + }, + { + "type": "Zinc finger", + "location": { + "start": { + "value": 719, + "modifier": "EXACT" + }, + "end": { + "value": 741, + "modifier": "EXACT" + } + }, + "description": "C2H2-type 11", + "evidences": [ + { + "evidenceCode": "ECO:0000255", + "source": "PROSITE-ProRule", + "id": "PRU00042" + } + ] + }, + { + "type": "Zinc finger", + "location": { + "start": { + "value": 
747, + "modifier": "EXACT" + }, + "end": { + "value": 770, + "modifier": "EXACT" + } + }, + "description": "C2H2-type 12", + "evidences": [ + { + "evidenceCode": "ECO:0000255", + "source": "PROSITE-ProRule", + "id": "PRU00042" + } + ] + }, + { + "type": "Zinc finger", + "location": { + "start": { + "value": 776, + "modifier": "EXACT" + }, + "end": { + "value": 798, + "modifier": "EXACT" + } + }, + "description": "C2H2-type 13", + "evidences": [ + { + "evidenceCode": "ECO:0000255", + "source": "PROSITE-ProRule", + "id": "PRU00042" + } + ] + } + ], + "extraAttributes": { + "uniParcId": "UPI000013C504" + } +} \ No newline at end of file diff --git a/tests/preprocessing/test_uniprot_json.py b/tests/preprocessing/test_uniprot_json.py index 6765640d..51558b5c 100644 --- a/tests/preprocessing/test_uniprot_json.py +++ b/tests/preprocessing/test_uniprot_json.py @@ -9,6 +9,10 @@ ITPR1_protein_len = 2758 +#P17010 +ZFX_protein_len = 805 +ZFX_protein_id = "NP_001171555.1" + class TestUniprotJsonToMetadata: """ Test function that ingests UniProt JSON and transforms it to a ProteinMetadata object @@ -36,6 +40,28 @@ def q8izt6_protein_metadata( protein_length=ITPR1_protein_len, ) + @pytest.fixture + def P17010_json_file_path( + self, + fpath_preprocessing_data_dir: str, + ) -> str: + return os.path.join(fpath_preprocessing_data_dir, "uniprot_response", "P17010_manual_download.json") + + @pytest.fixture + def P17010_protein_metadata( + self, + P17010_json_file_path: str, + ) -> ProteinMetadata: + """ + :returns: ProteinMetadata created from a downloaded UniProt JSON file + """ + return ProteinMetadata.from_uniprot_json( + protein_id=ZFX_protein_id, + label=ZFX_protein_id, + uniprot_json=P17010_json_file_path, + protein_length=ZFX_protein_len, + ) + def test_general_info( self, q8izt6_protein_metadata: ProteinMetadata, @@ -68,3 +94,20 @@ def test_first_feature( assert feature_0.info.start == 919 assert feature_0.info.end == 1056 assert feature_0.info.name == "Calponin-homology (CH) 
1" + + + def test_ZFX( + self, + P17010_protein_metadata: ProteinMetadata, + ): + """ + :[{"type":"Zinc finger", + "location":{"start":{"value":425,"modifier":"EXACT"}, + "end":{"value":447,"modifier":"EXACT"}}," + """ + assert P17010_protein_metadata is not None + feature_0 = P17010_protein_metadata.protein_features[0] + assert feature_0.feature_type == FeatureType.ZINC_FINGER + assert feature_0.info.start == 424 # 0-based, half-open interval: start inclusive, end exclusive + assert feature_0.info.end == 447 +