From 3881606023c78a785f26e1a1abc07ed5cdfb67c2 Mon Sep 17 00:00:00 2001 From: SportsDynamicsDS Date: Wed, 17 Jul 2024 09:26:43 +0000 Subject: [PATCH] Enriched metadata with date, game_week and game_id --- kloppy/domain/models/common.py | 7 ++++ .../serializers/event/sportec/deserializer.py | 9 +++++ .../event/statsperform/deserializer.py | 6 ++++ .../event/statsperform/parsers/base.py | 13 +++++++- .../event/statsperform/parsers/f24_xml.py | 30 +++++++++++++++-- .../event/statsperform/parsers/ma1_json.py | 33 +++++++++++++++++-- .../serializers/tracking/secondspectrum.py | 25 ++++++++++++-- .../tracking/sportec/deserializer.py | 11 ++++++- .../serializers/tracking/statsperform.py | 9 ++++- .../serializers/tracking/tracab/tracab_dat.py | 9 ++++- .../files/second_spectrum_fake_metadata.json | 4 ++- kloppy/tests/test_helpers.py | 5 ++- kloppy/tests/test_secondspectrum.py | 22 +++++++++++-- kloppy/tests/test_sportec.py | 25 ++++++++++++++ kloppy/tests/test_statsperform.py | 27 ++++++++++++--- kloppy/tests/test_tracab.py | 18 +++++++++- 16 files changed, 231 insertions(+), 22 deletions(-) diff --git a/kloppy/domain/models/common.py b/kloppy/domain/models/common.py index 88d98264..c057b9e6 100644 --- a/kloppy/domain/models/common.py +++ b/kloppy/domain/models/common.py @@ -1003,6 +1003,10 @@ class Metadata: orientation: See [`Orientation`][kloppy.domain.models.common.Orientation] flags: provider: See [`Provider`][kloppy.domain.models.common.Provider] + date: Date of the game. + game_week: Game week (or match day) of the game. It can also be the stage + (ex: "8th Finals"), if the game is happening during a cup or a play-off. + game_id: Game id of the game from the provider. """ teams: List[Team] @@ -1014,6 +1018,9 @@ class Metadata: coordinate_system: CoordinateSystem score: Optional[Score] = None frame_rate: Optional[float] = None + date: Optional[datetime] = None + game_week: Optional[str] = None + game_id: Optional[str] = None attributes: Optional[Dict] = field(default_factory=dict, compare=False) def __post_init__(self): diff --git a/kloppy/infra/serializers/event/sportec/deserializer.py b/kloppy/infra/serializers/event/sportec/deserializer.py index 4119574c..7b8ccf08 100644 --- a/kloppy/infra/serializers/event/sportec/deserializer.py +++ b/kloppy/infra/serializers/event/sportec/deserializer.py @@ -404,6 +404,12 @@ def deserialize(self, inputs: SportecEventDataInputs) -> EventDataset: event_root = objectify.fromstring(inputs.event_data.read()) with performance_logging("parse data", logger=logger): + date = parse( + match_root.MatchInformation.General.attrib["KickoffTime"] + ).astimezone(timezone.utc) + game_week = match_root.MatchInformation.General.attrib["MatchDay"] + game_id = match_root.MatchInformation.General.attrib["MatchId"] + sportec_metadata = sportec_metadata_from_xml_elm(match_root) teams = home_team, away_team = sportec_metadata.teams transformer = self.get_transformer( @@ -632,6 +638,9 @@ def deserialize(self, inputs: SportecEventDataInputs) -> EventDataset: flags=~(DatasetFlag.BALL_STATE | DatasetFlag.BALL_OWNING_TEAM), provider=Provider.SPORTEC, coordinate_system=transformer.get_to_coordinate_system(), + date=date, + game_week=game_week, + game_id=game_id, ) return EventDataset( diff --git a/kloppy/infra/serializers/event/statsperform/deserializer.py b/kloppy/infra/serializers/event/statsperform/deserializer.py index d25b203c..05f92e3f 100644 --- a/kloppy/infra/serializers/event/statsperform/deserializer.py +++ b/kloppy/infra/serializers/event/statsperform/deserializer.py @@ -614,6 +614,9 @@ def deserialize(self, inputs: StatsPerformInputs) -> EventDataset: periods = metadata_parser.extract_periods() score = metadata_parser.extract_score() teams = metadata_parser.extract_lineups() + date = events_parser.extract_date() + game_week = events_parser.extract_game_week() + game_id = events_parser.extract_game_id() raw_events = [ event for event in events_parser.extract_events() @@ -827,6 +830,9 @@ def deserialize(self, inputs: StatsPerformInputs) -> EventDataset: if inputs.event_feed.upper() == "F24" else Provider.STATSPERFORM, coordinate_system=transformer.get_to_coordinate_system(), + date=date, + game_week=game_week, + game_id=game_id, ) return EventDataset( diff --git a/kloppy/infra/serializers/event/statsperform/parsers/base.py b/kloppy/infra/serializers/event/statsperform/parsers/base.py index 9bb60f43..90a97ffe 100644 --- a/kloppy/infra/serializers/event/statsperform/parsers/base.py +++ b/kloppy/infra/serializers/event/statsperform/parsers/base.py @@ -12,7 +12,6 @@ from datetime import datetime from dataclasses import dataclass, field -from typing import List, Optional @dataclass @@ -53,6 +52,18 @@ def extract_score(self) -> Optional[Score]: """Return the score of the game.""" return None + def extract_date(self) -> Optional[str]: + """Return the date of the game.""" + return None + + def extract_game_week(self) -> Optional[str]: + """Return the game_week of the game.""" + return None + + def extract_game_id(self) -> Optional[str]: + """Return the game_id of the game.""" + return None + def extract_lineups(self) -> Tuple[Team, Team]: """Return the home and away team.""" raise NotImplementedError diff --git a/kloppy/infra/serializers/event/statsperform/parsers/f24_xml.py b/kloppy/infra/serializers/event/statsperform/parsers/f24_xml.py index 34096577..f32dbd95 100644 --- a/kloppy/infra/serializers/event/statsperform/parsers/f24_xml.py +++ b/kloppy/infra/serializers/event/statsperform/parsers/f24_xml.py @@ -1,9 +1,9 @@ """XML parser for Opta F24 feeds.""" import pytz -from datetime import datetime -from typing import List +from datetime import datetime, timezone +from typing import List, Optional +from dateutil.parser import parse -from kloppy.domain import Period from .base import OptaXMLParser, OptaEvent @@ -53,3 +53,27 @@ def extract_events(self) -> List[OptaEvent]: ) for event in game_elm.iterchildren("Event") ] + + def extract_date(self) -> Optional[str]: + """Return the date of the game.""" + game_elm = self.root.find("Game") + if game_elm and "game_date" in game_elm.attrib: + return parse(game_elm.attrib["game_date"]).astimezone(timezone.utc) + else: + return None + + def extract_game_week(self) -> Optional[str]: + """Return the game_week of the game.""" + game_elm = self.root.find("Game") + if game_elm and "matchday" in game_elm.attrib: + return game_elm.attrib["matchday"] + else: + return None + + def extract_game_id(self) -> Optional[str]: + """Return the game_id of the game.""" + game_elm = self.root.find("Game") + if game_elm and "id" in game_elm.attrib: + return game_elm.attrib["id"] + else: + return None diff --git a/kloppy/infra/serializers/event/statsperform/parsers/ma1_json.py b/kloppy/infra/serializers/event/statsperform/parsers/ma1_json.py index a3a97ec2..9f4764b7 100644 --- a/kloppy/infra/serializers/event/statsperform/parsers/ma1_json.py +++ b/kloppy/infra/serializers/event/statsperform/parsers/ma1_json.py @@ -1,6 +1,6 @@ """JSON parser for Stats Perform MA1 feeds.""" import pytz -from datetime import datetime +from datetime import datetime, timezone from typing import Any, Optional, List, Tuple, Dict from kloppy.domain import Period, Score, Team, Ground, Player @@ -30,7 +30,13 @@ def extract_periods(self) -> List[Period]: return parsed_periods def extract_score(self) -> Optional[Score]: - return None + live_data = self.root["liveData"] + match_details = live_data["matchDetails"] + home_score = match_details["scores"]["total"]["home"] + away_score = match_details["scores"]["total"]["away"] + if home_score is None or away_score is None: + return None + return Score(home=home_score, away=away_score) def extract_lineups(self) -> Tuple[Team, Team]: teams = {} @@ -76,6 +82,29 @@ def extract_lineups(self) -> Tuple[Team, Team]: raise DeserializationError("Lineup incomplete") return home_team, away_team + def extract_date(self) -> Optional[str]: + """Return the date of the game.""" + if "matchInfo" in self.root and "date" in self.root["matchInfo"]: + return datetime.strptime( + self.root["matchInfo"]["date"], "%Y-%m-%dZ" + ).astimezone(timezone.utc) + else: + return None + + def extract_game_week(self) -> Optional[str]: + """Return the game_week of the game.""" + if "matchInfo" in self.root and "week" in self.root["matchInfo"]: + return self.root["matchInfo"]["week"] + else: + return None + + def extract_game_id(self) -> Optional[str]: + """Return the game_id of the game.""" + if "matchInfo" in self.root and "id" in self.root["matchInfo"]: + return self.root["matchInfo"]["id"] + else: + return None + def _parse_teams(self) -> List[Dict[str, Any]]: parsed_teams = [] match_info = self.root["matchInfo"] diff --git a/kloppy/infra/serializers/tracking/secondspectrum.py b/kloppy/infra/serializers/tracking/secondspectrum.py index 0412c15f..ca4441e8 100644 --- a/kloppy/infra/serializers/tracking/secondspectrum.py +++ b/kloppy/infra/serializers/tracking/secondspectrum.py @@ -1,8 +1,8 @@ import json import logging -from datetime import timedelta +from datetime import datetime, timedelta, timezone import warnings -from typing import Tuple, Dict, Optional, Union, NamedTuple, IO +from typing import Dict, Optional, Union, NamedTuple, IO from lxml import objectify @@ -23,6 +23,7 @@ Player, Provider, PlayerData, + Score, ) from kloppy.utils import Readable, performance_logging @@ -290,16 +291,34 @@ def _iter(): ) orientation = Orientation.NOT_SET + if metadata: + score = Score( + home=metadata["homeScore"], away=metadata["awayScore"] + ) + year, month, day = ( + metadata["year"], + metadata["month"], + metadata["day"], + ) + date = datetime(year, month, day, 0, 0, tzinfo=timezone.utc) + game_id = metadata["ssiId"] + else: + score = None + date = None + game_id = None + metadata = Metadata( teams=teams, periods=periods, pitch_dimensions=transformer.get_to_coordinate_system().pitch_dimensions, - score=None, + score=score, frame_rate=frame_rate, orientation=orientation, provider=Provider.SECONDSPECTRUM, flags=DatasetFlag.BALL_OWNING_TEAM | DatasetFlag.BALL_STATE, coordinate_system=transformer.get_to_coordinate_system(), + date=date, + game_id=game_id, ) return TrackingDataset( diff --git a/kloppy/infra/serializers/tracking/sportec/deserializer.py b/kloppy/infra/serializers/tracking/sportec/deserializer.py index 038cb3ab..d4b8ab59 100644 --- a/kloppy/infra/serializers/tracking/sportec/deserializer.py +++ b/kloppy/infra/serializers/tracking/sportec/deserializer.py @@ -2,7 +2,8 @@ import warnings from collections import defaultdict from typing import NamedTuple, Optional, Union, IO -from datetime import timedelta +from datetime import timedelta, timezone +from dateutil.parser import parse from lxml import objectify @@ -128,6 +129,11 @@ def deserialize( ) with performance_logging("parse raw data", logger=logger): + date = parse( + match_root.MatchInformation.General.attrib["KickoffTime"] + ).astimezone(timezone.utc) + game_week = match_root.MatchInformation.General.attrib["MatchDay"] + game_id = match_root.MatchInformation.General.attrib["MatchId"] def _iter(): player_map = {} @@ -229,6 +235,9 @@ def _iter(): provider=Provider.SPORTEC, flags=DatasetFlag.BALL_OWNING_TEAM | DatasetFlag.BALL_STATE, coordinate_system=transformer.get_to_coordinate_system(), + date=date, + game_week=game_week, + game_id=game_id, ) return TrackingDataset( diff --git a/kloppy/infra/serializers/tracking/statsperform.py b/kloppy/infra/serializers/tracking/statsperform.py index 88d47965..ae25eda0 100644 --- a/kloppy/infra/serializers/tracking/statsperform.py +++ b/kloppy/infra/serializers/tracking/statsperform.py @@ -136,6 +136,10 @@ def deserialize(self, inputs: StatsPerformInputs) -> TrackingDataset: for period in meta_data_parser.extract_periods() } teams_list = list(meta_data_parser.extract_lineups()) + score = meta_data_parser.extract_score() + date = meta_data_parser.extract_date() + game_week = meta_data_parser.extract_game_week() + game_id = meta_data_parser.extract_game_id() with performance_logging("Loading tracking data", logger=logger): tracking_data = inputs.raw_data.read().decode("ascii").splitlines() @@ -192,12 +196,15 @@ def _iter(): teams=teams_list, periods=list(periods.values()), pitch_dimensions=transformer.get_to_coordinate_system().pitch_dimensions, - score=None, + score=score, frame_rate=frame_rate, orientation=orientation, provider=Provider.STATSPERFORM, flags=DatasetFlag.BALL_STATE, coordinate_system=transformer.get_to_coordinate_system(), + date=date, + game_week=game_week, + game_id=game_id, ) return TrackingDataset( diff --git a/kloppy/infra/serializers/tracking/tracab/tracab_dat.py b/kloppy/infra/serializers/tracking/tracab/tracab_dat.py index a878c9e3..0ccf6895 100644 --- a/kloppy/infra/serializers/tracking/tracab/tracab_dat.py +++ b/kloppy/infra/serializers/tracking/tracab/tracab_dat.py @@ -1,8 +1,9 @@ import logging -from datetime import timedelta +from datetime import timedelta, timezone import warnings from typing import Dict, Optional, Union import html +from dateutil.parser import parse from lxml import objectify @@ -169,6 +170,10 @@ def deserialize(self, inputs: TRACABInputs) -> TrackingDataset: pitch_size_height = float( match.attrib["fPitchYSizeMeters"].replace(",", ".") ) + date = parse(meta_data.match.attrib["dtDate"]).astimezone( + timezone.utc + ) + game_id = meta_data.match.attrib["iId"] periods = [] for period in match.iterchildren(tag="period"): @@ -269,6 +274,8 @@ def _iter(): provider=Provider.TRACAB, flags=DatasetFlag.BALL_OWNING_TEAM | DatasetFlag.BALL_STATE, coordinate_system=transformer.get_to_coordinate_system(), + date=date, + game_id=game_id, ) return TrackingDataset( diff --git a/kloppy/tests/files/second_spectrum_fake_metadata.json b/kloppy/tests/files/second_spectrum_fake_metadata.json index 546620f3..18490768 100644 --- a/kloppy/tests/files/second_spectrum_fake_metadata.json +++ b/kloppy/tests/files/second_spectrum_fake_metadata.json @@ -343,7 +343,9 @@ "optaUuid": "plq0s041krxbe84x8t7ic4pwp" } ], - "ssiId": null, + "homeScore": 2, + "awayScore": 1, + "ssiId": "1234456", "optaId": 1234456, "optaUuid": "evpjwmqgrefu1yb27oh74i8yw", "homeSsiId": "zvmye4srxdo1zvczazdppju0a03sado8a74z", diff --git a/kloppy/tests/test_helpers.py b/kloppy/tests/test_helpers.py index 0c421891..01f9c0e7 100644 --- a/kloppy/tests/test_helpers.py +++ b/kloppy/tests/test_helpers.py @@ -60,6 +60,9 @@ def _get_tracking_dataset(self): score=None, provider=None, coordinate_system=None, + date="2024-05-19T13:30:00", + game_week="35", + game_id="2374516", ) tracking_data = TrackingDataset( @@ -170,7 +173,7 @@ def test_transform_to_orientation(self): # Create a dataset with the KLOPPY pitch dimensions # and HOME_AWAY orientation original = self._get_tracking_dataset().transform( - to_pitch_dimensions=to_pitch_dimensions, + to_pitch_dimensions=to_pitch_dimensions ) assert original.metadata.orientation == Orientation.HOME_AWAY assert original.frames[0].ball_coordinates == Point3D(x=1, y=0, z=0) diff --git a/kloppy/tests/test_secondspectrum.py b/kloppy/tests/test_secondspectrum.py index b31f7a4a..f640bc25 100644 --- a/kloppy/tests/test_secondspectrum.py +++ b/kloppy/tests/test_secondspectrum.py @@ -1,11 +1,9 @@ -import logging -from datetime import timedelta +from datetime import datetime, timedelta, timezone from pathlib import Path import pytest from kloppy.domain import ( - AttackingDirection, Orientation, Provider, Point, @@ -100,6 +98,24 @@ def test_correct_deserialization( assert pitch_dimensions.y_dim.min == -33.985 assert pitch_dimensions.y_dim.max == 33.985 + # Check enriched metadata + date = dataset.metadata.date + if date: + assert isinstance(date, datetime) + assert date == datetime( + 1900, 1, 26, 0, 0, tzinfo=timezone.utc + ) + + game_week = dataset.metadata.game_week + if game_week: + assert isinstance(game_week, str) + assert game_week == "1" + + game_id = dataset.metadata.game_id + if game_id: + assert isinstance(game_id, str) + assert game_id == "1234456" + def test_correct_normalized_deserialization( self, meta_data: Path, raw_data: Path, additional_meta_data: Path ): diff --git a/kloppy/tests/test_sportec.py b/kloppy/tests/test_sportec.py index e85e5540..38381fad 100644 --- a/kloppy/tests/test_sportec.py +++ b/kloppy/tests/test_sportec.py @@ -203,3 +203,28 @@ def test_load_only_alive_frames(self, raw_data: Path, meta_data: Path): only_alive=True, ) assert len(dataset) == 199 + + def test_enriched_metadata(self, raw_data: Path, meta_data: Path): + dataset = sportec.load_tracking( + raw_data=raw_data, + meta_data=meta_data, + coordinates="sportec", + only_alive=True, + ) + + date = dataset.metadata.date + if date: + assert isinstance(date, datetime) + assert date == datetime( + 2020, 6, 5, 18, 30, 0, 210000, tzinfo=timezone.utc + ) + + game_week = dataset.metadata.game_week + if game_week: + assert isinstance(game_week, str) + assert game_week == "30" + + game_id = dataset.metadata.game_id + if game_id: + assert isinstance(game_id, str) + assert game_id == "DFL-MAT-003BN1" diff --git a/kloppy/tests/test_statsperform.py b/kloppy/tests/test_statsperform.py index d35b039a..57923711 100644 --- a/kloppy/tests/test_statsperform.py +++ b/kloppy/tests/test_statsperform.py @@ -145,6 +145,22 @@ def test_periods(self, tracking_dataset: TrackingDataset): 2020, 8, 23, 12, 56, 30, tzinfo=timezone.utc ) + def test_enriched_metadata(self, tracking_dataset: TrackingDataset): + date = tracking_dataset.metadata.date + if date: + assert isinstance(date, datetime) + assert date == datetime(2020, 8, 23, 0, 0, tzinfo=timezone.utc) + + game_week = tracking_dataset.metadata.game_week + if game_week: + assert isinstance(game_week, str) + assert game_week == "1" + + game_id = tracking_dataset.metadata.game_id + if game_id: + assert isinstance(game_id, str) + assert game_id == "7ijuqohwgmplbxdj1625sxwfe" + class TestStatsPerformEvent: """Tests related to deserializing the MA3 event data feed. @@ -155,10 +171,13 @@ class TestStatsPerformEvent: def test_deserialize_all(self, event_dataset: EventDataset): assert event_dataset.metadata.provider == Provider.STATSPERFORM - assert event_dataset.metadata.coordinate_system == OptaCoordinateSystem( - # StatsPerform does not provide pitch dimensions - pitch_length=None, - pitch_width=None, + assert ( + event_dataset.metadata.coordinate_system + == OptaCoordinateSystem( + # StatsPerform does not provide pitch dimensions + pitch_length=None, + pitch_width=None, + ) ) assert len(event_dataset.records) == 1652 diff --git a/kloppy/tests/test_tracab.py b/kloppy/tests/test_tracab.py index 3298e2ce..192f7166 100644 --- a/kloppy/tests/test_tracab.py +++ b/kloppy/tests/test_tracab.py @@ -1,5 +1,5 @@ from pathlib import Path -from datetime import timedelta +from datetime import datetime, timedelta, timezone import pytest @@ -234,6 +234,22 @@ def test_correct_normalized_deserialization( player_home_1 ].coordinates == Point(x=1.0019047619047619, y=0.49602941176470583) + date = dataset.metadata.date + if date: + assert isinstance(date, datetime) + assert date == datetime( + 2023, 12, 15, 20, 32, 20, tzinfo=timezone.utc + ) + + game_week = dataset.metadata.game_week + if game_week: + assert isinstance(game_week, str) + + game_id = dataset.metadata.game_id + if game_id: + assert isinstance(game_id, str) + assert game_id == "1" + class TestTracabMeta2: def test_correct_deserialization(