Skip to content

Commit

Permalink
Add game lineups to dataset (#231)
Browse files Browse the repository at this point in the history
  • Loading branch information
LarchLiu authored Oct 10, 2023
1 parent bcb1e57 commit 8fd956b
Show file tree
Hide file tree
Showing 8 changed files with 211 additions and 4 deletions.
83 changes: 83 additions & 0 deletions dbt/models/base/base_game_lineups.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
-- Base model: flattens the raw game line-up JSON documents into one row per
-- listed player (starting line-up and substitutes of both clubs) per game.
with
-- Parse every raw line as JSON and rank the copies of each game by season.
json_game_lineups as (

select
json(value) as raw_json_row,
-- season is the 4th '/'-separated segment of the source file path
str_split(filename, '/')[4] as season,
json_extract_string(raw_json_row, '$.game_id')::integer as game_id,
-- n = 1 marks, for each game_id, the row whose season sorts last
row_number() over (partition by game_id order by season desc) as n

from {{ source("raw_tfmkt", "game_lineups") }}

),
-- One row per element of $.home_club.starting_lineup (n = 1 copy of the game only).
home_club_starting_lineup as (

select
unnest(json_transform(json_extract(raw_json_row, '$.home_club.starting_lineup'), '["JSON"]')) as json_row,
-- club_id is the 5th '/'-separated segment of the club href
(str_split(json_extract_string(raw_json_row, '$.home_club.href'), '/')[5])::integer as club_id,
'starting_lineup' as "type",
game_id
from json_game_lineups

where n = 1

),
-- One row per element of $.home_club.substitutes (n = 1 copy of the game only).
home_club_substitutes as (

select
unnest(json_transform(json_extract(raw_json_row, '$.home_club.substitutes'), '["JSON"]')) as json_row,
(str_split(json_extract_string(raw_json_row, '$.home_club.href'), '/')[5])::integer as club_id,
'substitutes' as "type",
game_id
from json_game_lineups

where n = 1

),
-- One row per element of $.away_club.starting_lineup (n = 1 copy of the game only).
away_club_starting_lineup as (

select
unnest(json_transform(json_extract(raw_json_row, '$.away_club.starting_lineup'), '["JSON"]')) as json_row,
(str_split(json_extract_string(raw_json_row, '$.away_club.href'), '/')[5])::integer as club_id,
'starting_lineup' as "type",
game_id
from json_game_lineups

where n = 1

),
-- One row per element of $.away_club.substitutes (n = 1 copy of the game only).
away_club_substitutes as (

select
unnest(json_transform(json_extract(raw_json_row, '$.away_club.substitutes'), '["JSON"]')) as json_row,
(str_split(json_extract_string(raw_json_row, '$.away_club.href'), '/')[5])::integer as club_id,
'substitutes' as "type",
game_id
from json_game_lineups

where n = 1

),
-- Stack the four per-club, per-list row sets; they share the same column layout
-- (json_row, club_id, type, game_id), which is what the positional UNION ALL needs.
all_game_lineups as (

select * from home_club_starting_lineup
UNION ALL
select * from home_club_substitutes
UNION ALL
select * from away_club_starting_lineup
UNION ALL
select * from away_club_substitutes

)

-- Project the per-player JSON object into typed columns.
-- NOTE(review): the trailing comma after "position" relies on the SQL engine
-- accepting a trailing comma in the select list (DuckDB documents this) —
-- confirm before porting this model to another engine.
select
game_id,
club_id,
"type",
(json_row ->> 'number') as "number",
-- player_id is the 5th '/'-separated segment of the player href
(str_split((json_row ->> 'href'), '/')[5])::integer as player_id,
(json_row ->> 'name') as "player_name",
(json_row ->> 'team_captain')::integer as "team_captain",
(json_row ->> 'position') as "position",

from all_game_lineups
32 changes: 30 additions & 2 deletions dbt/models/base/base_games.sql
Original file line number Diff line number Diff line change
@@ -1,5 +1,18 @@
with
json_games as (
json_game_lineups as (

select
json(value) as json_row,
str_split(filename, '/')[4] as season,
json_extract_string(json_row, '$.game_id') as game_id,
json_extract_string(json_row, '$.home_club.formation') as home_club_formation,
json_extract_string(json_row, '$.away_club.formation') as away_club_formation,
row_number() over (partition by game_id order by season desc) as n

from {{ source("raw_tfmkt", "game_lineups") }}

),
json_raw_games as (

select
json(value) as json_row,
Expand All @@ -9,6 +22,19 @@ with

from {{ source("raw_tfmkt", "games") }}

),
json_games as (

select
json_raw_games.*,
json_game_lineups.home_club_formation,
json_game_lineups.away_club_formation

from json_raw_games
left join json_game_lineups
on json_raw_games.game_id = json_game_lineups.game_id and json_game_lineups.n = 1
where json_raw_games.n = 1

)

select
Expand Down Expand Up @@ -53,7 +79,9 @@ select
json_extract_string(json_row, '$.referee') as referee,
(
'https://www.transfermarkt.co.uk' || json_extract_string(json_row, '$.href')
) as url
) as url,
home_club_formation,
away_club_formation

from json_games

Expand Down
10 changes: 10 additions & 0 deletions dbt/models/base/sources.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,3 +48,13 @@ sources:
columns=struct_pack(value := 'VARCHAR'), delim='\1', quote='\0',
filename=True
)
# Raw game line-ups source: globs game_lineups.json.gz one directory level
# below ../data/raw and loads it through read_csv as a single VARCHAR column
# named `value`; filename=True requests the source file path alongside it.
# NOTE(review): delim='\1' and quote='\0' presumably keep each JSON line
# unsplit in that single column — confirm against the read_csv documentation.
- name: game_lineups
meta:
external_location: >
read_csv(
'../data/raw/*/game_lineups.json.gz',
header=False,
columns=struct_pack(value := 'VARCHAR'), delim='\1', quote='\0',
filename=True
)
22 changes: 22 additions & 0 deletions dbt/models/curated/game_lineups.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
-- Curated game line-ups: every row of the base model plus a surrogate key.
with lineups as (

    select *
    from {{ ref('base_game_lineups') }}

)

select
    -- the surrogate key is derived from all eight columns of the base model
    {{ dbt_utils.generate_surrogate_key([
        'game_id',
        'player_id',
        'club_id',
        'type',
        'player_name',
        'position',
        'team_captain',
        'number'
    ]) }} as game_lineups_id,
    lineups.*

from lineups

order by
    game_id,
    club_id,
    "type"
23 changes: 23 additions & 0 deletions dbt/models/curated/models.yml
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,8 @@ models:
- away_club_name
- home_club_manager_name
- away_club_manager_name
- home_club_formation
- away_club_formation
- stadium
- attendance
- referee
Expand Down Expand Up @@ -282,3 +284,24 @@ models:
- name: minute
tests:
- not_null

- name: game_lineups
tests:
- dbt_expectations.expect_table_columns_to_contain_set:
column_list:
- game_id
- club_id
- player_id
- player_name
- number
- position
- team_captain
- type
- dbt_expectations.expect_table_row_count_to_be_between:
min_value: 81000
max_value: 1340000
columns:
- name: game_lineups_id
tests:
- not_null
- unique
5 changes: 3 additions & 2 deletions scripts/acquire.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,8 @@ class Asset():
'games': 'competitions',
'clubs': 'competitions',
'players': 'clubs',
'appearances': 'players'
'appearances': 'players',
'game_lineups': 'games'
}

def __init__(self, name) -> None:
Expand Down Expand Up @@ -223,7 +224,7 @@ def acquire_on_cloud(job_name, job_queue, job_definition, branch, message, args,
local_parser.add_argument(
'--asset',
help="Name of the asset to be acquired",
choices=['clubs', 'players', 'games', 'appearances', 'all'],
choices=['clubs', 'players', 'games', 'game_lineups', 'appearances', 'all'],
required=True
)
local_parser.add_argument(
Expand Down
38 changes: 38 additions & 0 deletions transfermarkt_datasets/assets/cur_game_lineups.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
from transfermarkt_datasets.core.asset import Asset
from transfermarkt_datasets.core.schema import Schema, Field

class CurGameLineupsAsset(Asset):
    """Curated asset describing the `game_lineups.csv.gz` file.

    Declares the asset name, its human-readable description, the output file
    name and the schema (fields and primary key) of the game line-ups data.
    """

    name = "cur_game_lineups"
    # The description is Markdown-style text; its lines are kept flush left so
    # the string content carries no leading indentation.
    description = """
The `game_lineups` asset contains one row per game player in the dataset.
Players are extracted from the game ["line-ups"](https://www.transfermarkt.co.uk/spielbericht/aufstellung/spielbericht/3098550) in transfermarkt and they are tied to one particular `game`, identified by the `game_id` column.
"""
    file_name = "game_lineups.csv.gz"

    def __init__(self, *args, **kwargs) -> None:
        """Initialise the base asset, then attach the line-ups schema.

        All positional and keyword arguments are forwarded unchanged to the
        parent `Asset` constructor.
        """
        super().__init__(*args, **kwargs)

        self.schema = Schema(
            fields=[
                Field(
                    name='game_lineups_id',
                    type='string',
                    description="Surrogate key"
                ),
                Field(name='game_id', type='integer'),
                Field(name='player_id', type='integer'),
                Field(name='club_id', type='integer'),
                Field(name='type', type='string'),
                Field(name='player_name', type='string'),
                # NOTE(review): the base dbt model casts team_captain to
                # integer, while this field is declared as a string — confirm
                # which type the published file is meant to carry.
                Field(name='team_captain', type='string'),
                Field(name='number', type='string'),
                Field(name='position', type='string'),
            ]
        )

        # Natural key of a row: one player of one club in one game.
        self.schema.primary_key = [
            'game_id',
            'player_id',
            'club_id',
        ]
2 changes: 2 additions & 0 deletions transfermarkt_datasets/assets/cur_games.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ def __init__(self, *args, **kwargs) -> None:
Field(name='away_club_name', type='string', tags=["explore"]),
Field(name='home_club_manager_name', type='string'),
Field(name='away_club_manager_name', type='string'),
Field(name='home_club_formation', type='string'),
Field(name='away_club_formation', type='string'),
Field(name='stadium', type='string'),
Field(name='attendance', type='integer'),
Field(name='referee', type='string'),
Expand Down

0 comments on commit 8fd956b

Please sign in to comment.