-
Notifications
You must be signed in to change notification settings - Fork 2
/
getOdds.R
145 lines (115 loc) · 5.21 KB
/
getOdds.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
# Scrape player prices from an odds page and convert them to implied
# probabilities.
#
# Arguments:
#   url    Location handed to read_html(). Player names are read from the
#          '.leftPad' nodes and prices from the '.eventprice' nodes; the two
#          sets are paired by position.
#   teams  Character vector of team names; '.leftPad' entries equal to one of
#          these are dropped so only player rows remain.
#
# Returns a data frame with columns `player` and `probability`
# (1 / (fractional odds + 1), rounded to 4 decimals). Only the first row for
# each player is kept.
getOdds <- function (url, teams) {
  webpage <- read_html(url)

  # ---- Player names ----
  players_data <- html_text(html_nodes(webpage, '.leftPad'))
  # Drop the section heading, anything containing a digit, and team names
  players_data <- players_data[!grepl("Goalscorer Markets", players_data)]
  players_data <- players_data[!grepl("\\d", players_data)]
  players_data <- players_data[!players_data %in% teams]
  # Strip the last two characters of every remaining entry
  # (NOTE(review): presumably trailing padding in the page markup - confirm)
  players_data <- substr(players_data, 1, nchar(players_data) - 2)
  players_data <- players_data[players_data != '']
  players_data <- players_data[!grepl("\\n", players_data)]

  # ---- Prices ----
  # trimws() removes whatever newline/tab padding surrounds the price, rather
  # than relying on one exact run of "\n\t..." characters.
  odds <- trimws(html_text(html_nodes(webpage, '.eventprice')))
  # Replace 'evens' with 1/1
  odds[odds %in% 'EVS'] <- '1/1'
  # '1/1000' becomes '0'; that has no denominator, so its probability is NA.
  # NOTE(review): looks like a placeholder price - confirm NA is the intent.
  odds[odds %in% '1/1000'] <- '0'

  # Fractional odds a/b -> 1 - x / (x + 1) with x = a / b.
  # vapply() always yields a numeric vector, including for zero prices.
  probability <- vapply(
    strsplit(odds, split = "/"),
    function(parts) {
      parts <- as.numeric(parts)
      ratio <- parts[1] / parts[2]
      round(1 - (ratio / (ratio + 1)), 4)
    },
    numeric(1)
  )

  # data.frame() would silently recycle the shorter vector when one length is
  # a multiple of the other, pairing players with the wrong prices.
  if (length(players_data) != length(probability)) {
    stop("getOdds: found ", length(players_data), " player names but ",
         length(probability), " prices; cannot pair them by position",
         call. = FALSE)
  }

  # Bind players and odds
  result <- data.frame('player' = players_data, 'probability' = probability)

  # Keep first occurrence of each player (focusing on next gameweek)
  result[!duplicated(result$player), ]
}
# Scrape the lineups page at `url` and link every listed player to an FPL id.
#
# Arguments:
#   url    Location handed to read_html(). The code assumes team names sit in
#          'h2' nodes and player names in '.player-name' nodes, with 11
#          consecutive players per team in the same order as the headings.
#   fpl.2  Data frame of FPL players; the columns id, web_name, player_name,
#          second_name and team are used.
#
# Returns a data frame with a column `id` (one row per linked FPL id, plus one
# row with id = NA for each scraped player that could not be linked) and a
# constant column pred_lineup = 1.
getLineups <- function (url, fpl.2) {
  players_per_team <- 11

  webpage <- read_html(url)
  players <- html_nodes(webpage, '.player-name')
  teams <- html_nodes(webpage, 'h2')

  # Team headings, shortened to the spellings used for matching on `team`;
  # teamid is the heading's position on the page.
  teams_data <- data.frame(team = html_text(teams), stringsAsFactors = FALSE) %>%
    mutate(team = case_when(team == "Tottenham Hotspur" ~ "Spurs",
                            team == "Huddersfield Town" ~ "Huddersfield",
                            team == "Brighton and Hove Albion" ~ "Brighton",
                            team == "Leicester City" ~ "Leicester",
                            team == "Manchester City" ~ "Man City",
                            team == "Manchester United" ~ "Man Utd",
                            team == "Newcastle United" ~ "Newcastle",
                            team == "Stoke City" ~ "Stoke",
                            team == "Swansea City" ~ "Swansea",
                            team == "West Bromwich Albion" ~ "West Brom",
                            team == "West Ham United" ~ "West Ham",
                            TRUE ~ team)) %>%
    mutate(teamid = row_number())

  # Replace accented characters with plain ASCII equivalents.
  accent_map <- c("ß" = "ss", "Ö" = "O", "ã" = "a", "í" = "i",
                  "õ" = "o", "á" = "a", "é" = "e")
  strip_accents <- function(x) {
    for (accented in names(accent_map)) {
      x <- gsub(accented, accent_map[[accented]], x)
    }
    x
  }

  # Derive the number of teams from the scraped names instead of assuming
  # exactly 20 teams; a count that is not a multiple of 11 cannot be assigned
  # to teams by position and would otherwise be recycled or fail obscurely.
  scraped_names <- html_text(players)
  if (length(scraped_names) == 0 ||
      length(scraped_names) %% players_per_team != 0) {
    stop("getLineups: expected a positive multiple of ", players_per_team,
         " player names but found ", length(scraped_names), call. = FALSE)
  }
  n_teams <- length(scraped_names) %/% players_per_team

  # Player names and teams
  players_data <- data.frame(teamid = rep(seq_len(n_teams),
                                          each = players_per_team),
                             web_name = scraped_names,
                             stringsAsFactors = FALSE) %>%
    mutate(web_name = strip_accents(web_name)) %>%
    inner_join(teams_data, by = "teamid") %>%
    mutate(web_name = gsub("Ayoze Perez", "Ayoze", web_name),
           web_name = gsub("Mat Ryan", "Ryan", web_name),
           web_name = gsub("S Cook", "Steve Cook", web_name),
           # Only a bare "Cook" becomes "Lewis Cook". A plain substring match
           # would also rewrite the "Steve Cook" produced on the line above
           # into "Steve Lewis Cook".
           web_name = gsub("^(\\s*)Cook(\\s*)$", "\\1Lewis Cook\\2",
                           web_name)) %>%
    mutate(player_name = web_name,
           second_name = web_name) %>%
    select(-teamid)

  # --------------- Data matching --------------
  # FPL columns needed for linking and for the result
  fpl_formatch <- fpl.2 %>%
    select(id, web_name, player_name, second_name, team)

  # Scraped players are the left-hand table of the linkage
  left <- players_data

  # To do - sort out matching. Why doesn't partial match work? Check against the example.
  # Link to FPL data on name (fuzzy string distance) and team
  matches.out <- fastLink(
    dfA = left,
    dfB = fpl_formatch,
    varnames = c("web_name", "team"),
    stringdist.match = c("web_name"), # Variables treated as strings for fuzzy matching
    #partial.match = c("web_name"), # Variables where the algorithm should check for partial matches
    verbose = TRUE,
    return.all = TRUE
    #threshold.match = .01 # Match probability threshold. The default is .85
  )

  # Match rate, estimated false positive rate (FDR) and estimated false
  # negative rate (FNR). Printed explicitly: inside a function a bare
  # summary() call is evaluated and then discarded.
  print(summary(matches.out))

  # Row indices of the matched pairs in `left` (a) and `fpl_formatch` (b)
  a <- matches.out$matches$inds.a
  b <- matches.out$matches$inds.b

  # Compile matched data: tag each scraped row with the FPL row it matched,
  # then pull in the FPL columns and the posterior match probability.
  left[a, 'matchindex'] <- b
  namesmatched <- cbind(fpl_formatch[b, ],
                        "matchindex" = b,
                        "match" = matches.out$posterior)
  matched.data <- left_join(left,
                            namesmatched,
                            by = "matchindex")

  # Keep the most likely match for each id (highest posterior ranks first).
  # Rows without a match have NA id/posterior and are all kept.
  dedup <- matched.data %>%
    group_by(id) %>%
    mutate(rank = ifelse(is.na(match),
                         1,
                         rank(-match, ties.method = 'first'))) %>%
    filter(rank == 1)

  # Return player id's and whether they're in the starting lineup
  dedup %>%
    ungroup %>%
    select(id) %>%
    mutate(pred_lineup = 1)
}