Skip to content

Commit

Permalink
Merge pull request #346 from monarch-initiative/zinc_finger
Browse files Browse the repository at this point in the history
adding zinc finger to ProteinFeatures
  • Loading branch information
ielis authored Nov 13, 2024
2 parents 89185fd + 9b6a6b1 commit 4263bca
Show file tree
Hide file tree
Showing 4 changed files with 352 additions and 21 deletions.
44 changes: 24 additions & 20 deletions src/gpsea/model/_protein.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,21 +118,28 @@ class FeatureType(enum.Enum):
A region of interest that cannot be described in other subsections.
"""

ZINC_FINGER = enum.auto()
"""
A zinc finger is a small, functional, independently folded domain whose structure is stabilized by one or more zinc ions, which it coordinates through cysteine and/or histidine residues.
"""

@staticmethod
def from_string(category: str) -> "FeatureType":
cat_lover = category.lower()
if cat_lover == "repeat":
cat_lower = category.lower()
if cat_lower == "repeat":
return FeatureType.REGION
elif cat_lover == "motif":
elif cat_lower == "motif":
return FeatureType.MOTIF
elif cat_lover == "domain":
elif cat_lower == "domain":
return FeatureType.DOMAIN
elif cat_lover == "region":
elif cat_lower == "region":
return FeatureType.REGION
elif cat_lover == "coiled coil":
elif cat_lower == "coiled coil":
return FeatureType.REGION
elif cat_lover == "compositional bias":
elif cat_lower == "compositional bias":
return FeatureType.COMPOSITIONAL_BIAS
elif cat_lower == "zinc finger":
return FeatureType.ZINC_FINGER
else:
raise ValueError(f'Unrecognized protein feature type: "{category}"')

Expand Down Expand Up @@ -361,19 +368,16 @@ def from_uniprot_json(

regions = list()
for feature in data["features"]:
try:
region_name = feature["description"]
locus = feature["location"]
region_start = int(locus["start"]["value"]) - 1 # convert to 0-based coordinates
region_end = int(locus["end"]["value"])
feature_type = FeatureType.from_string(feature["type"])
finfo = FeatureInfo(
name=region_name, region=Region(start=region_start, end=region_end)
)
pfeature = ProteinFeature.create(info=finfo, feature_type=feature_type)
regions.append(pfeature)
except Exception as feature_exception:
print(f"Could not parse feature: {str(feature_exception)} (skipping)")
region_name = feature["description"]
locus = feature["location"]
region_start = int(locus["start"]["value"]) - 1 # convert to 0-based coordinates
region_end = int(locus["end"]["value"])
feature_type = FeatureType.from_string(feature["type"])
finfo = FeatureInfo(
name=region_name, region=Region(start=region_start, end=region_end)
)
pfeature = ProteinFeature.create(info=finfo, feature_type=feature_type)
regions.append(pfeature)

return ProteinMetadata(
protein_id=protein_id,
Expand Down
4 changes: 3 additions & 1 deletion src/gpsea/preprocessing/_uniprot.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,9 @@ def annotate(self, protein_id: str) -> ProteinMetadata:
Args:
protein_id (str): A protein ID
Returns:
Sequence[ProteinMetadata]: A sequence of ProteinMetadata objects, or an empty sequence if no data was found.
ProteinMetadata: A :class:`~gpsea.model.ProteinMetadata` corresponding to the input `protein_id`.
Raises:
ValueError: if `protein_id` is invalid, if an I/O error occurs, or if the REST response cannot be parsed.
"""
if not isinstance(protein_id, str):
raise ValueError(f'Protein ID must be a str but it was {type(protein_id)}')
Expand Down
282 changes: 282 additions & 0 deletions tests/preprocessing/data/uniprot_response/P17010_manual_download.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,282 @@
{
"entryType": "UniProtKB reviewed (Swiss-Prot)",
"primaryAccession": "P17010",
"features": [
{
"type": "Zinc finger",
"location": {
"start": {
"value": 425,
"modifier": "EXACT"
},
"end": {
"value": 447,
"modifier": "EXACT"
}
},
"description": "C2H2-type 1",
"evidences": [
{
"evidenceCode": "ECO:0000255",
"source": "PROSITE-ProRule",
"id": "PRU00042"
}
]
},
{
"type": "Zinc finger",
"location": {
"start": {
"value": 456,
"modifier": "EXACT"
},
"end": {
"value": 478,
"modifier": "EXACT"
}
},
"description": "C2H2-type 2",
"evidences": [
{
"evidenceCode": "ECO:0000255",
"source": "PROSITE-ProRule",
"id": "PRU00042"
}
]
},
{
"type": "Zinc finger",
"location": {
"start": {
"value": 488,
"modifier": "EXACT"
},
"end": {
"value": 510,
"modifier": "EXACT"
}
},
"description": "C2H2-type 3",
"evidences": [
{
"evidenceCode": "ECO:0000255",
"source": "PROSITE-ProRule",
"id": "PRU00042"
}
]
},
{
"type": "Zinc finger",
"location": {
"start": {
"value": 519,
"modifier": "EXACT"
},
"end": {
"value": 542,
"modifier": "EXACT"
}
},
"description": "C2H2-type 4",
"evidences": [
{
"evidenceCode": "ECO:0000255",
"source": "PROSITE-ProRule",
"id": "PRU00042"
}
]
},
{
"type": "Zinc finger",
"location": {
"start": {
"value": 548,
"modifier": "EXACT"
},
"end": {
"value": 570,
"modifier": "EXACT"
}
},
"description": "C2H2-type 5",
"evidences": [
{
"evidenceCode": "ECO:0000255",
"source": "PROSITE-ProRule",
"id": "PRU00042"
}
]
},
{
"type": "Zinc finger",
"location": {
"start": {
"value": 576,
"modifier": "EXACT"
},
"end": {
"value": 599,
"modifier": "EXACT"
}
},
"description": "C2H2-type 6",
"evidences": [
{
"evidenceCode": "ECO:0000255",
"source": "PROSITE-ProRule",
"id": "PRU00042"
}
]
},
{
"type": "Zinc finger",
"location": {
"start": {
"value": 605,
"modifier": "EXACT"
},
"end": {
"value": 627,
"modifier": "EXACT"
}
},
"description": "C2H2-type 7",
"evidences": [
{
"evidenceCode": "ECO:0000255",
"source": "PROSITE-ProRule",
"id": "PRU00042"
}
]
},
{
"type": "Zinc finger",
"location": {
"start": {
"value": 633,
"modifier": "EXACT"
},
"end": {
"value": 656,
"modifier": "EXACT"
}
},
"description": "C2H2-type 8",
"evidences": [
{
"evidenceCode": "ECO:0000255",
"source": "PROSITE-ProRule",
"id": "PRU00042"
}
]
},
{
"type": "Zinc finger",
"location": {
"start": {
"value": 662,
"modifier": "EXACT"
},
"end": {
"value": 684,
"modifier": "EXACT"
}
},
"description": "C2H2-type 9",
"evidences": [
{
"evidenceCode": "ECO:0000255",
"source": "PROSITE-ProRule",
"id": "PRU00042"
}
]
},
{
"type": "Zinc finger",
"location": {
"start": {
"value": 690,
"modifier": "EXACT"
},
"end": {
"value": 713,
"modifier": "EXACT"
}
},
"description": "C2H2-type 10",
"evidences": [
{
"evidenceCode": "ECO:0000255",
"source": "PROSITE-ProRule",
"id": "PRU00042"
}
]
},
{
"type": "Zinc finger",
"location": {
"start": {
"value": 719,
"modifier": "EXACT"
},
"end": {
"value": 741,
"modifier": "EXACT"
}
},
"description": "C2H2-type 11",
"evidences": [
{
"evidenceCode": "ECO:0000255",
"source": "PROSITE-ProRule",
"id": "PRU00042"
}
]
},
{
"type": "Zinc finger",
"location": {
"start": {
"value": 747,
"modifier": "EXACT"
},
"end": {
"value": 770,
"modifier": "EXACT"
}
},
"description": "C2H2-type 12",
"evidences": [
{
"evidenceCode": "ECO:0000255",
"source": "PROSITE-ProRule",
"id": "PRU00042"
}
]
},
{
"type": "Zinc finger",
"location": {
"start": {
"value": 776,
"modifier": "EXACT"
},
"end": {
"value": 798,
"modifier": "EXACT"
}
},
"description": "C2H2-type 13",
"evidences": [
{
"evidenceCode": "ECO:0000255",
"source": "PROSITE-ProRule",
"id": "PRU00042"
}
]
}
],
"extraAttributes": {
"uniParcId": "UPI000013C504"
}
}
Loading

0 comments on commit 4263bca

Please sign in to comment.