Skip to content

Commit

Permalink
Enriched metadata with date, game_week and game_id
Browse files Browse the repository at this point in the history
  • Loading branch information
SportsDynamicsDS committed Jul 18, 2024
1 parent d001eb0 commit 3881606
Show file tree
Hide file tree
Showing 16 changed files with 231 additions and 22 deletions.
7 changes: 7 additions & 0 deletions kloppy/domain/models/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -1003,6 +1003,10 @@ class Metadata:
orientation: See [`Orientation`][kloppy.domain.models.common.Orientation]
flags:
provider: See [`Provider`][kloppy.domain.models.common.Provider]
date: Date of the game, as a timezone-aware `datetime` (the deserializers
convert it to UTC).
game_week: Game week (or match day) of the game. For a cup or play-off
game it can instead hold the stage (e.g. "Round of 16").
game_id: Identifier of the game as assigned by the provider.
"""

teams: List[Team]
Expand All @@ -1014,6 +1018,9 @@ class Metadata:
coordinate_system: CoordinateSystem
score: Optional[Score] = None
frame_rate: Optional[float] = None
date: Optional[datetime] = None
game_week: Optional[str] = None
game_id: Optional[str] = None
attributes: Optional[Dict] = field(default_factory=dict, compare=False)

def __post_init__(self):
Expand Down
9 changes: 9 additions & 0 deletions kloppy/infra/serializers/event/sportec/deserializer.py
Original file line number Diff line number Diff line change
Expand Up @@ -404,6 +404,12 @@ def deserialize(self, inputs: SportecEventDataInputs) -> EventDataset:
event_root = objectify.fromstring(inputs.event_data.read())

with performance_logging("parse data", logger=logger):
date = parse(
match_root.MatchInformation.General.attrib["KickoffTime"]
).astimezone(timezone.utc)
game_week = match_root.MatchInformation.General.attrib["MatchDay"]
game_id = match_root.MatchInformation.General.attrib["MatchId"]

sportec_metadata = sportec_metadata_from_xml_elm(match_root)
teams = home_team, away_team = sportec_metadata.teams
transformer = self.get_transformer(
Expand Down Expand Up @@ -632,6 +638,9 @@ def deserialize(self, inputs: SportecEventDataInputs) -> EventDataset:
flags=~(DatasetFlag.BALL_STATE | DatasetFlag.BALL_OWNING_TEAM),
provider=Provider.SPORTEC,
coordinate_system=transformer.get_to_coordinate_system(),
date=date,
game_week=game_week,
game_id=game_id,
)

return EventDataset(
Expand Down
6 changes: 6 additions & 0 deletions kloppy/infra/serializers/event/statsperform/deserializer.py
Original file line number Diff line number Diff line change
Expand Up @@ -614,6 +614,9 @@ def deserialize(self, inputs: StatsPerformInputs) -> EventDataset:
periods = metadata_parser.extract_periods()
score = metadata_parser.extract_score()
teams = metadata_parser.extract_lineups()
date = events_parser.extract_date()
game_week = events_parser.extract_game_week()
game_id = events_parser.extract_game_id()
raw_events = [
event
for event in events_parser.extract_events()
Expand Down Expand Up @@ -827,6 +830,9 @@ def deserialize(self, inputs: StatsPerformInputs) -> EventDataset:
if inputs.event_feed.upper() == "F24"
else Provider.STATSPERFORM,
coordinate_system=transformer.get_to_coordinate_system(),
date=date,
game_week=game_week,
game_id=game_id,
)

return EventDataset(
Expand Down
13 changes: 12 additions & 1 deletion kloppy/infra/serializers/event/statsperform/parsers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@

from datetime import datetime
from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
Expand Down Expand Up @@ -53,6 +52,18 @@ def extract_score(self) -> Optional[Score]:
"""Return the score of the game."""
return None

def extract_date(self) -> Optional[datetime]:
    """Return the date of the game.

    Default for parsers whose feed does not expose a date: always
    ``None``. Parsers that can read the date are expected to return a
    ``datetime`` instead.
    """
    return None

def extract_game_week(self) -> Optional[str]:
    """Return the game week (match day) of the game.

    Default for parsers whose feed does not expose a game week: always
    ``None``.
    """
    return None

def extract_game_id(self) -> Optional[str]:
    """Return the provider's identifier of the game.

    Default for parsers whose feed does not expose a game id: always
    ``None``.
    """
    return None

def extract_lineups(self) -> Tuple[Team, Team]:
"""Return the home and away team."""
raise NotImplementedError
Expand Down
30 changes: 27 additions & 3 deletions kloppy/infra/serializers/event/statsperform/parsers/f24_xml.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
"""XML parser for Opta F24 feeds."""
import pytz
from datetime import datetime
from typing import List
from datetime import datetime, timezone
from typing import List, Optional
from dateutil.parser import parse

from kloppy.domain import Period
from .base import OptaXMLParser, OptaEvent


Expand Down Expand Up @@ -53,3 +53,27 @@ def extract_events(self) -> List[OptaEvent]:
)
for event in game_elm.iterchildren("Event")
]

def extract_date(self) -> Optional[datetime]:
    """Return the date of the game as a UTC ``datetime``.

    The value is read from the ``game_date`` attribute of the ``Game``
    element.

    Returns:
        The parsed date converted to UTC, or ``None`` when the ``Game``
        element or its ``game_date`` attribute is missing.
    """
    game_elm = self.root.find("Game")
    # Compare with None explicitly: the truth value of an XML element
    # reflects whether it has children, not whether it was found.
    if game_elm is None or "game_date" not in game_elm.attrib:
        return None
    # NOTE(review): if ``game_date`` carries no UTC offset, astimezone()
    # interprets the parsed value in the machine's local timezone --
    # confirm which timezone the feed uses.
    return parse(game_elm.attrib["game_date"]).astimezone(timezone.utc)

def extract_game_week(self) -> Optional[str]:
    """Return the game week (match day) of the game.

    The value is read from the ``matchday`` attribute of the ``Game``
    element.

    Returns:
        The attribute value as a string, or ``None`` when the ``Game``
        element or its ``matchday`` attribute is missing.
    """
    game_elm = self.root.find("Game")
    # Compare with None explicitly: the truth value of an XML element
    # reflects whether it has children, not whether it was found.
    if game_elm is None:
        return None
    return game_elm.attrib.get("matchday")

def extract_game_id(self) -> Optional[str]:
    """Return the provider's identifier of the game.

    The value is read from the ``id`` attribute of the ``Game`` element.

    Returns:
        The attribute value as a string, or ``None`` when the ``Game``
        element or its ``id`` attribute is missing.
    """
    game_elm = self.root.find("Game")
    # Compare with None explicitly: the truth value of an XML element
    # reflects whether it has children, not whether it was found.
    if game_elm is None:
        return None
    return game_elm.attrib.get("id")
33 changes: 31 additions & 2 deletions kloppy/infra/serializers/event/statsperform/parsers/ma1_json.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""JSON parser for Stats Perform MA1 feeds."""
import pytz
from datetime import datetime
from datetime import datetime, timezone
from typing import Any, Optional, List, Tuple, Dict

from kloppy.domain import Period, Score, Team, Ground, Player
Expand Down Expand Up @@ -30,7 +30,13 @@ def extract_periods(self) -> List[Period]:
return parsed_periods

def extract_score(self) -> Optional[Score]:
    """Return the total score of the game, if the feed provides one.

    The goals are read from
    ``liveData.matchDetails.scores.total.{home,away}``.

    Returns:
        A ``Score`` built from the home and away totals, or ``None`` when
        either total is missing or null.
    """
    match_details = self.root["liveData"]["matchDetails"]
    # The score section may be absent; treat that the same as an explicit
    # null score instead of failing with a KeyError.
    total = match_details.get("scores", {}).get("total", {})
    home_score = total.get("home")
    away_score = total.get("away")
    if home_score is None or away_score is None:
        return None
    return Score(home=home_score, away=away_score)

def extract_lineups(self) -> Tuple[Team, Team]:
teams = {}
Expand Down Expand Up @@ -76,6 +82,29 @@ def extract_lineups(self) -> Tuple[Team, Team]:
raise DeserializationError("Lineup incomplete")
return home_team, away_team

def extract_date(self) -> Optional[datetime]:
    """Return the date of the game as a UTC ``datetime``.

    The value is read from ``matchInfo.date``, which is expected in the
    form ``YYYY-MM-DDZ``.

    Returns:
        Midnight UTC of the game date, or ``None`` when the feed has no
        ``matchInfo`` section or no ``date`` entry in it.
    """
    match_info = self.root.get("matchInfo", {})
    if "date" not in match_info:
        return None
    # strptime() yields a naive datetime. The trailing "Z" marks the value
    # as UTC, so attach the timezone directly; astimezone() would first
    # interpret the naive value in the machine's local timezone and shift
    # it, possibly onto a different calendar day.
    return datetime.strptime(match_info["date"], "%Y-%m-%dZ").replace(
        tzinfo=timezone.utc
    )

def extract_game_week(self) -> Optional[str]:
    """Return the game week of the game.

    The value is read from ``matchInfo.week``.

    Returns:
        The stored value, or ``None`` when the feed has no ``matchInfo``
        section or no ``week`` entry in it.
    """
    return self.root.get("matchInfo", {}).get("week")

def extract_game_id(self) -> Optional[str]:
    """Return the provider's identifier of the game.

    The value is read from ``matchInfo.id``.

    Returns:
        The stored value, or ``None`` when the feed has no ``matchInfo``
        section or no ``id`` entry in it.
    """
    return self.root.get("matchInfo", {}).get("id")

def _parse_teams(self) -> List[Dict[str, Any]]:
parsed_teams = []
match_info = self.root["matchInfo"]
Expand Down
25 changes: 22 additions & 3 deletions kloppy/infra/serializers/tracking/secondspectrum.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import json
import logging
from datetime import timedelta
from datetime import datetime, timedelta, timezone
import warnings
from typing import Tuple, Dict, Optional, Union, NamedTuple, IO
from typing import Dict, Optional, Union, NamedTuple, IO

from lxml import objectify

Expand All @@ -23,6 +23,7 @@
Player,
Provider,
PlayerData,
Score,
)

from kloppy.utils import Readable, performance_logging
Expand Down Expand Up @@ -290,16 +291,34 @@ def _iter():
)
orientation = Orientation.NOT_SET

if metadata:
score = Score(
home=metadata["homeScore"], away=metadata["awayScore"]
)
year, month, day = (
metadata["year"],
metadata["month"],
metadata["day"],
)
date = datetime(year, month, day, 0, 0, tzinfo=timezone.utc)
game_id = metadata["ssiId"]
else:
score = None
date = None
game_id = None

metadata = Metadata(
teams=teams,
periods=periods,
pitch_dimensions=transformer.get_to_coordinate_system().pitch_dimensions,
score=None,
score=score,
frame_rate=frame_rate,
orientation=orientation,
provider=Provider.SECONDSPECTRUM,
flags=DatasetFlag.BALL_OWNING_TEAM | DatasetFlag.BALL_STATE,
coordinate_system=transformer.get_to_coordinate_system(),
date=date,
game_id=game_id,
)

return TrackingDataset(
Expand Down
11 changes: 10 additions & 1 deletion kloppy/infra/serializers/tracking/sportec/deserializer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
import warnings
from collections import defaultdict
from typing import NamedTuple, Optional, Union, IO
from datetime import timedelta
from datetime import timedelta, timezone
from dateutil.parser import parse

from lxml import objectify

Expand Down Expand Up @@ -128,6 +129,11 @@ def deserialize(
)

with performance_logging("parse raw data", logger=logger):
date = parse(
match_root.MatchInformation.General.attrib["KickoffTime"]
).astimezone(timezone.utc)
game_week = match_root.MatchInformation.General.attrib["MatchDay"]
game_id = match_root.MatchInformation.General.attrib["MatchId"]

def _iter():
player_map = {}
Expand Down Expand Up @@ -229,6 +235,9 @@ def _iter():
provider=Provider.SPORTEC,
flags=DatasetFlag.BALL_OWNING_TEAM | DatasetFlag.BALL_STATE,
coordinate_system=transformer.get_to_coordinate_system(),
date=date,
game_week=game_week,
game_id=game_id,
)

return TrackingDataset(
Expand Down
9 changes: 8 additions & 1 deletion kloppy/infra/serializers/tracking/statsperform.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,10 @@ def deserialize(self, inputs: StatsPerformInputs) -> TrackingDataset:
for period in meta_data_parser.extract_periods()
}
teams_list = list(meta_data_parser.extract_lineups())
score = meta_data_parser.extract_score()
date = meta_data_parser.extract_date()
game_week = meta_data_parser.extract_game_week()
game_id = meta_data_parser.extract_game_id()

with performance_logging("Loading tracking data", logger=logger):
tracking_data = inputs.raw_data.read().decode("ascii").splitlines()
Expand Down Expand Up @@ -192,12 +196,15 @@ def _iter():
teams=teams_list,
periods=list(periods.values()),
pitch_dimensions=transformer.get_to_coordinate_system().pitch_dimensions,
score=None,
score=score,
frame_rate=frame_rate,
orientation=orientation,
provider=Provider.STATSPERFORM,
flags=DatasetFlag.BALL_STATE,
coordinate_system=transformer.get_to_coordinate_system(),
date=date,
game_week=game_week,
game_id=game_id,
)

return TrackingDataset(
Expand Down
9 changes: 8 additions & 1 deletion kloppy/infra/serializers/tracking/tracab/tracab_dat.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import logging
from datetime import timedelta
from datetime import timedelta, timezone
import warnings
from typing import Dict, Optional, Union
import html
from dateutil.parser import parse

from lxml import objectify

Expand Down Expand Up @@ -169,6 +170,10 @@ def deserialize(self, inputs: TRACABInputs) -> TrackingDataset:
pitch_size_height = float(
match.attrib["fPitchYSizeMeters"].replace(",", ".")
)
date = parse(meta_data.match.attrib["dtDate"]).astimezone(
timezone.utc
)
game_id = meta_data.match.attrib["iId"]

periods = []
for period in match.iterchildren(tag="period"):
Expand Down Expand Up @@ -269,6 +274,8 @@ def _iter():
provider=Provider.TRACAB,
flags=DatasetFlag.BALL_OWNING_TEAM | DatasetFlag.BALL_STATE,
coordinate_system=transformer.get_to_coordinate_system(),
date=date,
game_id=game_id,
)

return TrackingDataset(
Expand Down
4 changes: 3 additions & 1 deletion kloppy/tests/files/second_spectrum_fake_metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -343,7 +343,9 @@
"optaUuid": "plq0s041krxbe84x8t7ic4pwp"
}
],
"ssiId": null,
"homeScore": 2,
"awayScore": 1,
"ssiId": "1234456",
"optaId": 1234456,
"optaUuid": "evpjwmqgrefu1yb27oh74i8yw",
"homeSsiId": "zvmye4srxdo1zvczazdppju0a03sado8a74z",
Expand Down
5 changes: 4 additions & 1 deletion kloppy/tests/test_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,9 @@ def _get_tracking_dataset(self):
score=None,
provider=None,
coordinate_system=None,
date="2024-05-19T13:30:00",
game_week="35",
game_id="2374516",
)

tracking_data = TrackingDataset(
Expand Down Expand Up @@ -170,7 +173,7 @@ def test_transform_to_orientation(self):
# Create a dataset with the KLOPPY pitch dimensions
# and HOME_AWAY orientation
original = self._get_tracking_dataset().transform(
to_pitch_dimensions=to_pitch_dimensions,
to_pitch_dimensions=to_pitch_dimensions
)
assert original.metadata.orientation == Orientation.HOME_AWAY
assert original.frames[0].ball_coordinates == Point3D(x=1, y=0, z=0)
Expand Down
22 changes: 19 additions & 3 deletions kloppy/tests/test_secondspectrum.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
import logging
from datetime import timedelta
from datetime import datetime, timedelta, timezone
from pathlib import Path

import pytest

from kloppy.domain import (
AttackingDirection,
Orientation,
Provider,
Point,
Expand Down Expand Up @@ -100,6 +98,24 @@ def test_correct_deserialization(
assert pitch_dimensions.y_dim.min == -33.985
assert pitch_dimensions.y_dim.max == 33.985

# Check enriched metadata
date = dataset.metadata.date
if date:
assert isinstance(date, datetime)
assert date == datetime(
1900, 1, 26, 0, 0, tzinfo=timezone.utc
)

game_week = dataset.metadata.game_week
if game_week:
assert isinstance(game_week, str)
assert game_week == "1"

game_id = dataset.metadata.game_id
if game_id:
assert isinstance(game_id, str)
assert game_id == "1234456"

def test_correct_normalized_deserialization(
self, meta_data: Path, raw_data: Path, additional_meta_data: Path
):
Expand Down
Loading

0 comments on commit 3881606

Please sign in to comment.