Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add game lineups to dataset #231

Merged
merged 6 commits into from
Oct 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 83 additions & 0 deletions dbt/models/base/base_game_lineups.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
-- Base model: one row per player listed in a game line-up.
-- Each raw row is a JSON document describing a single game; the home and away
-- clubs each carry a `starting_lineup` and a `substitutes` array, and the four
-- arrays are exploded into player rows below.
with

-- Parse every raw line and rank the copies of each game, highest season first.
json_game_lineups as (

    select
        json(value) as raw_json_row,
        -- 4th '/'-separated segment of the source file path is used as the season
        str_split(filename, '/')[4] as season,
        cast(json_extract_string(raw_json_row, '$.game_id') as integer) as game_id,
        row_number() over (partition by game_id order by season desc) as n

    from {{ source("raw_tfmkt", "game_lineups") }}

),

-- Keep a single copy of each game: the one ranked first above.
latest_game_lineups as (

    select
        raw_json_row,
        game_id

    from json_game_lineups

    where n = 1

),

-- The four CTEs below share one shape: unnest one line-up array into a row per
-- player and take the 5th '/'-separated segment of the club href as `club_id`.
home_club_starting_lineup as (

    select
        unnest(
            json_transform(
                json_extract(raw_json_row, '$.home_club.starting_lineup'),
                '["JSON"]'
            )
        ) as json_row,
        cast(
            str_split(json_extract_string(raw_json_row, '$.home_club.href'), '/')[5]
            as integer
        ) as club_id,
        'starting_lineup' as "type",
        game_id

    from latest_game_lineups

),

home_club_substitutes as (

    select
        unnest(
            json_transform(
                json_extract(raw_json_row, '$.home_club.substitutes'),
                '["JSON"]'
            )
        ) as json_row,
        cast(
            str_split(json_extract_string(raw_json_row, '$.home_club.href'), '/')[5]
            as integer
        ) as club_id,
        'substitutes' as "type",
        game_id

    from latest_game_lineups

),

away_club_starting_lineup as (

    select
        unnest(
            json_transform(
                json_extract(raw_json_row, '$.away_club.starting_lineup'),
                '["JSON"]'
            )
        ) as json_row,
        cast(
            str_split(json_extract_string(raw_json_row, '$.away_club.href'), '/')[5]
            as integer
        ) as club_id,
        'starting_lineup' as "type",
        game_id

    from latest_game_lineups

),

away_club_substitutes as (

    select
        unnest(
            json_transform(
                json_extract(raw_json_row, '$.away_club.substitutes'),
                '["JSON"]'
            )
        ) as json_row,
        cast(
            str_split(json_extract_string(raw_json_row, '$.away_club.href'), '/')[5]
            as integer
        ) as club_id,
        'substitutes' as "type",
        game_id

    from latest_game_lineups

),

-- Stack the four player sets; columns are listed by name so the union does
-- not depend on the column order of the CTEs above.
all_game_lineups as (

    select json_row, club_id, "type", game_id
    from home_club_starting_lineup

    union all

    select json_row, club_id, "type", game_id
    from home_club_substitutes

    union all

    select json_row, club_id, "type", game_id
    from away_club_starting_lineup

    union all

    select json_row, club_id, "type", game_id
    from away_club_substitutes

)

-- Flatten each player's JSON object into typed columns.
select
    game_id,
    club_id,
    "type",
    (json_row ->> 'number') as "number",
    -- 5th '/'-separated segment of the player href is used as the player id
    cast(str_split((json_row ->> 'href'), '/')[5] as integer) as player_id,
    (json_row ->> 'name') as "player_name",
    cast((json_row ->> 'team_captain') as integer) as "team_captain",
    (json_row ->> 'position') as "position"

from all_game_lineups
32 changes: 30 additions & 2 deletions dbt/models/base/base_games.sql
Original file line number Diff line number Diff line change
@@ -1,5 +1,18 @@
with
json_games as (
json_game_lineups as (

select
json(value) as json_row,
str_split(filename, '/')[4] as season,
json_extract_string(json_row, '$.game_id') as game_id,
json_extract_string(json_row, '$.home_club.formation') as home_club_formation,
json_extract_string(json_row, '$.away_club.formation') as away_club_formation,
row_number() over (partition by game_id order by season desc) as n

from {{ source("raw_tfmkt", "game_lineups") }}

),
json_raw_games as (

select
json(value) as json_row,
Expand All @@ -9,6 +22,19 @@ with

from {{ source("raw_tfmkt", "games") }}

),
json_games as (

select
json_raw_games.*,
json_game_lineups.home_club_formation,
json_game_lineups.away_club_formation

from json_raw_games
left join json_game_lineups
on json_raw_games.game_id = json_game_lineups.game_id and json_game_lineups.n = 1
where json_raw_games.n = 1

)

select
Expand Down Expand Up @@ -53,7 +79,9 @@ select
json_extract_string(json_row, '$.referee') as referee,
(
'https://www.transfermarkt.co.uk' || json_extract_string(json_row, '$.href')
) as url
) as url,
home_club_formation,
away_club_formation

from json_games

Expand Down
10 changes: 10 additions & 0 deletions dbt/models/base/sources.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,3 +48,13 @@ sources:
columns=struct_pack(value := 'VARCHAR'), delim='\1', quote='\0',
filename=True
)

# Raw game line-ups source. Every file matching the glob below is read with
# read_csv into a single VARCHAR column named `value` (no header row), and
# filename=True adds the originating file path as a `filename` column.
# NOTE(review): delim='\1' and quote='\0' look chosen so that a line is never
# split or unquoted, i.e. each JSON line arrives whole in `value` -- confirm.
- name: game_lineups
meta:
external_location: >
read_csv(
'../data/raw/*/game_lineups.json.gz',
header=False,
columns=struct_pack(value := 'VARCHAR'), delim='\1', quote='\0',
filename=True
)
22 changes: 22 additions & 0 deletions dbt/models/curated/game_lineups.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
{#- Columns hashed together to build the `game_lineups_id` surrogate key. -#}
{%- set surrogate_key_columns = [
    'game_id',
    'player_id',
    'club_id',
    'type',
    'player_name',
    'position',
    'team_captain',
    'number'
] -%}

-- Curated game line-ups: every column of the base model, prefixed with a
-- surrogate key and sorted by game, club and line-up type.
with base_lineups as (

    select * from {{ ref('base_game_lineups') }}

)

select
    {{ dbt_utils.generate_surrogate_key(surrogate_key_columns) }} as game_lineups_id,
    base_lineups.*

from base_lineups

order by
    game_id,
    club_id,
    "type"
23 changes: 23 additions & 0 deletions dbt/models/curated/models.yml
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,8 @@ models:
- away_club_name
- home_club_manager_name
- away_club_manager_name
- home_club_formation
- away_club_formation
- stadium
- attendance
- referee
Expand Down Expand Up @@ -282,3 +284,24 @@ models:
- name: minute
tests:
- not_null

# Tests for the curated `game_lineups` model: the table must expose the listed
# columns, its row count must fall inside the bounds below, and the
# `game_lineups_id` key must be populated and unique.
- name: game_lineups
tests:
- dbt_expectations.expect_table_columns_to_contain_set:
column_list:
- game_id
- club_id
- player_id
- player_name
- number
- position
- team_captain
- type
# NOTE(review): the row-count bounds are hard-coded; presumably sized to the
# expected dataset volume -- revisit as more seasons are added.
- dbt_expectations.expect_table_row_count_to_be_between:
min_value: 81000
max_value: 1340000
columns:
- name: game_lineups_id
tests:
- not_null
- unique
5 changes: 3 additions & 2 deletions scripts/acquire.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,8 @@ class Asset():
'games': 'competitions',
'clubs': 'competitions',
'players': 'clubs',
'appearances': 'players'
'appearances': 'players',
'game_lineups': 'games'
}

def __init__(self, name) -> None:
Expand Down Expand Up @@ -223,7 +224,7 @@ def acquire_on_cloud(job_name, job_queue, job_definition, branch, message, args,
local_parser.add_argument(
'--asset',
help="Name of the asset to be acquired",
choices=['clubs', 'players', 'games', 'appearances', 'all'],
choices=['clubs', 'players', 'games', 'game_lineups', 'appearances', 'all'],
required=True
)
local_parser.add_argument(
Expand Down
38 changes: 38 additions & 0 deletions transfermarkt_datasets/assets/cur_game_lineups.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
from transfermarkt_datasets.core.asset import Asset
from transfermarkt_datasets.core.schema import Schema, Field

class CurGameLineupsAsset(Asset):
    """Curated `game_lineups` asset, published as `game_lineups.csv.gz`.

    Declares the asset's name, description and output file name, and builds
    the schema (fields and primary key) when an instance is created.
    """

    name = "cur_game_lineups"
    description = """
    The `game_lineups` asset contains one row per game player in the dataset.
    Players are extracted from the game ["line-ups"](https://www.transfermarkt.co.uk/spielbericht/aufstellung/spielbericht/3098550) in transfermarkt and they are tied to one particular `game`, identified by the `game_id` column.
    """
    file_name = "game_lineups.csv.gz"

    def __init__(self, *args, **kwargs) -> None:
        """Forward all arguments to `Asset` and attach the line-ups schema."""
        super().__init__(*args, **kwargs)

        self.schema = Schema(
            fields=[
                Field(
                    name='game_lineups_id',
                    type='string',
                    description="Surrogate key"
                ),
                Field(name='game_id', type='integer'),
                Field(name='player_id', type='integer'),
                Field(name='club_id', type='integer'),
                Field(name='type', type='string'),
                Field(name='player_name', type='string'),
                # The base dbt model casts `team_captain` to integer, so the
                # declared type follows the data rather than 'string'.
                Field(name='team_captain', type='integer'),
                Field(name='number', type='string'),
                Field(name='position', type='string'),
            ]
        )

        # NOTE(review): the dbt tests treat `game_lineups_id` as the unique
        # key, while this composite key is declared here -- confirm both hold.
        self.schema.primary_key = [
            'game_id',
            'player_id',
            'club_id',
        ]
2 changes: 2 additions & 0 deletions transfermarkt_datasets/assets/cur_games.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ def __init__(self, *args, **kwargs) -> None:
Field(name='away_club_name', type='string', tags=["explore"]),
Field(name='home_club_manager_name', type='string'),
Field(name='away_club_manager_name', type='string'),
Field(name='home_club_formation', type='string'),
Field(name='away_club_formation', type='string'),
Field(name='stadium', type='string'),
Field(name='attendance', type='integer'),
Field(name='referee', type='string'),
Expand Down