diff --git a/R/helper_funcs.R b/R/helper_funcs.R index 619a309..7208d5c 100644 --- a/R/helper_funcs.R +++ b/R/helper_funcs.R @@ -11,13 +11,30 @@ get_mfl_id = function(id_col = NULL, player_name = NULL, first = NULL, team = team, id = NA ) + max_len = max(lengths(l_p_info)) + length_1 = lengths(l_p_info) == 1 + + l_p_info[length_1] = lapply(l_p_info[length_1], function(x) { + rep(x, max_len) + }) + + if(!is.null(player_name)) { + if(is.null(first)) { + l_p_info$first = sub("\\s+.*$", "", player_name) + } + + if(is.null(last)) { + l_p_info$last = sub(".*?\\s+", "", player_name) + } + } + l_p_info = Filter(Negate(is.null), l_p_info) l_p_info = lapply(l_p_info, function(x) { - x = rename_vec(x, unlist(pos_corrections)) + x = rename_vec(toupper(x), unlist(pos_corrections)) x = rename_vec(x, unlist(team_corrections)) - x = gsub("[[:punct:]]+", "", x) - x = gsub("\\s+(?i)(defense|jr|sr|[iv]+)$", "", x) + x = gsub("\\s+(defense|jr|sr|[iv]+)\\.?$", "", tolower(x)) + x = gsub("[[:punct:]]+|\\s+", "", x) x }) @@ -34,57 +51,64 @@ get_mfl_id = function(id_col = NULL, player_name = NULL, first = NULL, } ref_table = player_table %>% + mutate(across(where(is.character), tolower)) %>% transmute(id = id, player_name = paste(first_name, last_name), - player_name = gsub("\\s+(?i)(defense|jr|sr|[iv]+)$", "", player_name), - player_name = gsub("[[:punct:]]|\\s+", "", tolower(player_name)), - last = gsub("\\s+(?i)(defense|jr|sr|[iv]+)$", "", last_name), - last = gsub("[[:punct:]]|\\s+", "", tolower(last)), - pos = rename_vec(position, unlist(pos_corrections)), - team = rename_vec(team, unlist(team_corrections))) - - - df_p_info = as.data.frame(l_p_info) + player_name = gsub("\\s+(defense|jr|sr|[iv]+)\\.?$", "", player_name), + player_name = gsub("[[:punct:]]|\\s+", "", player_name), + last = gsub("\\s+(defense|jr|sr|[iv]+)\\.?$", "", last_name), + last = gsub("[[:punct:]]|\\s+", "", last), + first = gsub("\\s+(defense|jr|sr|[iv]+)\\.?$", "", first_name), + first = gsub("[[:punct:]]|\\s+", "", first), + pos = rename_vec(toupper(position), unlist(pos_corrections)), + pos = tolower(pos), + team = rename_vec(toupper(team), unlist(team_corrections)), + team = tolower(team)) # If pos = DST, replace by team name - if("pos" %in% names(df_p_info)) { - df_p_info$id = ifelse( - df_p_info$pos == "DST", - ref_table$id[match(df_p_info$team, ref_table$team)], - df_p_info$id + if("pos" %in% names(l_p_info)) { + l_p_info$id = ifelse( + l_p_info$pos == "DST", + ref_table$id[match(l_p_info$team, ref_table$team)], + l_p_info$id ) - if(all(df_p_info$pos == "DST")) { - return(df_p_info$id) - } } - if(!"player_name" %in% names(df_p_info)) { - df_p_info$player_name = tolower(paste0(df_p_info$first, df_p_info$last)) - } else { - df_p_info$player_name = gsub("\\s+", "", tolower(df_p_info$player_name)) - } + col_combos = list( + c("player_name", "pos", "team"), + c("last", "pos", "team"), + c("player_name", "team"), + c("player_name", "pos"), + c("last", "team"), + c("first", "pos", "team") + ) + combo_idx = vapply(col_combos, function(x) { + all(x %in% names(l_p_info)) + }, logical(1L)) - df_p_info = df_p_info %>% - mutate( - id = dplyr::case_when( - !is.na(id) ~ id, - paste0(player_name, pos, team) %in% do.call(paste0, ref_table[c("player_name", "pos", "team")]) ~ - ref_table$id[match(paste0(player_name, pos, team), do.call(paste0, ref_table[c("player_name", "pos", "team")]))], - paste0(player_name, pos, team) %in% do.call(paste0, ref_table[c("last", "pos", "team")]) ~ - ref_table$id[match(paste0(player_name, pos, team), do.call(paste0, ref_table[c("last", "pos", "team")]))], - paste0(player_name, pos) %in% do.call(paste0, ref_table[c("player_name", "pos")]) ~ - ref_table$id[match(paste0(player_name, pos), do.call(paste0, ref_table[c("player_name", "pos")]))], - paste0(player_name, pos) %in% do.call(paste0, ref_table[c("last", "pos")]) ~ - ref_table$id[match(paste0(player_name, pos), do.call(paste0, ref_table[c("last", "pos")]))], - paste0(player_name, team) %in% do.call(paste0, ref_table[c("player_name", "team")]) ~ - ref_table$id[match(paste0(player_name, team), do.call(paste0, ref_table[c("player_name", "team")]))], - paste0(player_name, team) %in% do.call(paste0, ref_table[c("last", "team")]) ~ - ref_table$id[match(paste0(player_name, team), do.call(paste0, ref_table[c("last", "team")]))], - TRUE ~ NA_character_ - ) - ) + for(combo in col_combos[combo_idx]) { + id_idx = is.na(l_p_info$id) + + l_p_info_vec = do.call(paste0, l_p_info[combo])[id_idx] + ref_table_vec = do.call(paste0, ref_table[combo]) + + # Removing dups from reftable + ref_dups = ref_table_vec[duplicated(ref_table_vec)] + keep_in_ref = !ref_table_vec %in% ref_dups + ref_table_vec = ref_table_vec[keep_in_ref] + + + l = lapply(l_p_info_vec, function(y) { + which(ref_table_vec %in% y) + }) + l[lengths(l) != 1] = NA_integer_ + match_vec = unlist(l) + + l_p_info$id[id_idx] = ref_table$id[keep_in_ref][match_vec] + + } + l_p_info$id - df_p_info$id } get_scrape_year <- function(date) { @@ -144,6 +168,17 @@ row_sd = function(x, na.rm = FALSE) { r_sd } +impute_and_score_sources = function(data_result, scoring_rules) { + scoring_objs = make_scoring_tables(scoring_rules) + + data_result = impute_via_rates_and_mean(data_result, scoring_objs) + data_result = impute_bonus_cols(data_result, scoring_objs$scoring_tables) + + data_result[] = source_points(data_result, scoring_rules, return_data_result = TRUE) + data_result +} + + # Returns new player_id table update_player_id_table = function(player_id_table = NULL, id_column, value) { diff --git a/R/source_scrapes.R b/R/source_scrapes.R index c53e2b8..d560ff8 100644 --- a/R/source_scrapes.R +++ b/R/source_scrapes.R @@ -44,12 +44,12 @@ scrape_cbs = function(pos = c("QB", "RB", "WR", "TE", "K", "DST"), season = NULL # Get PID if(pos == "DST") { - site_id = html_page %>% + cbs_id = html_page %>% rvest::html_elements("span.TeamName a") %>% rvest::html_attr("href") %>% sub(".*?([A-Z]{2,3}).*", "\\1", .) } else { - site_id = html_page %>% + cbs_id = html_page %>% rvest::html_elements("table > tbody > tr > td:nth-child(1) > span.CellPlayerName--long > span > a") %>% rvest::html_attr("href") %>% sub(".*?([0-9]+).*", "\\1", .) @@ -65,20 +65,21 @@ scrape_cbs = function(pos = c("QB", "RB", "WR", "TE", "K", "DST"), season = NULL out_df = out_df %>% tidyr::extract(player, c("player", "pos", "team"), ".*?\\s{2,}[A-Z]{1,3}\\s{2,}[A-Z]{2,3}\\s{2,}(.*?)\\s{2,}(.*?)\\s{2,}(.*)") %>% - dplyr::mutate(src_id = site_id, + dplyr::mutate(src_id = cbs_id, data_src = "CBS", id = player_ids$id[match(src_id, player_ids$cbs_id)]) out_df$id = get_mfl_id( + id_col = cbs_id, player_name = out_df$player, pos = out_df$pos, team = out_df$team ) } else { - out_df$team = site_id + out_df$team = cbs_id out_df$data_src = "CBS" dst_ids = ff_player_data[ff_player_data$position == "Def", c("id", "team")] dst_ids$team[dst_ids$team == "OAK"] = "LV" - out_df$id = dst_ids$id[match(site_id, dst_ids$team)] + out_df$id = dst_ids$id[match(cbs_id, dst_ids$team)] out_df$src_id = player_ids$cbs_id[match(out_df$id, player_ids$id)] } @@ -546,7 +547,14 @@ scrape_walterfootball <- function(pos = c("QB", "RB", "WR", "TE", "K"), # FleaFlicker ---- scrape_fleaflicker <- function(pos = c("QB", "RB", "WR", "TE", "K", "DST", "DL", "LB", "DB"), - season = 2022, week = NULL, draft = FALSE, weekly = TRUE) { + season = NULL, week = NULL, draft = FALSE, weekly = TRUE) { + + if(is.null(season)) { + season = get_scrape_year() + } + if(is.null(week)) { + week = get_scrape_week() + } # IDP positions if("DL" %in% pos) { @@ -609,8 +617,9 @@ scrape_fleaflicker <- function(pos = c("QB", "RB", "WR", "TE", "K", "DST", "DL", position, "&tableOffset=", offset) - Sys.sleep(1L) - + if(i != 1L) { + Sys.sleep(2L) + } # 20 rows of player data by position html_page <- site_session %>% @@ -672,13 +681,21 @@ scrape_fleaflicker <- function(pos = c("QB", "RB", "WR", "TE", "K", "DST", "DL", mutate(player = gsub("^Q(?=[A-Z])", "", player, perl = TRUE)) %>% extract(player, into = c("first_name", "last_name", "pos_temp", "team", "bye"), regex = "(.*?)\\s+(.*?)\\s+(.*?)\\s+(.*?)\\s+.*(\\d+)\\)$", convert = TRUE) %>% - unite("player", first_name:last_name, sep = " ") %>% + tidyr::unite("player", first_name:last_name, sep = " ", remove = FALSE) %>% mutate(data_src = "FleaFlicker") %>% # rename for now so while loop works mutate(src_id = fleaflicker_id, - id = get_mfl_id(fleaflicker_id, pos = pos, player_name = player), + id = get_mfl_id( + id_col = fleaflicker_id, + player_name = player, + first = first_name, + last = last_name, + pos = pos, + team = team + ), pos_temp = pos) %>% - rename(pos = pos_temp) + rename(pos = pos_temp) %>% + select(-first_name, -last_name) } ) @@ -1196,7 +1213,8 @@ scrape_espn = function(pos = c("QB", "RB", "WR", "TE", "K", "DST"), season = NUL out_df$id = ffanalytics:::get_mfl_id( out_df$espn_id, player_name = out_df$player_name, - pos = out_df$position + pos = out_df$position, + team = out_df$team ) } diff --git a/R/sysdata.rda b/R/sysdata.rda index f2a9385..f1406bd 100644 Binary files a/R/sysdata.rda and b/R/sysdata.rda differ diff --git a/data-raw/player_directories2.R b/data-raw/player_directories2.R new file mode 100644 index 0000000..f49c904 --- /dev/null +++ b/data-raw/player_directories2.R @@ -0,0 +1,267 @@ + +# Info -------------------------------------------------------------------- + +# This script will parse player directories for CBS, FFToday, Fantasypros and NFL. +# Make sure you load the ffanalytics package first to have access to tidyverse and rvest. +# The command to create the sysdata.rda file is usethis::use_data(player_ids, overwrite = TRUE, internal = TRUE). +# This command needs to be executed from the package root folder. + +#### CBS Players #### ---- +devtools::load_all() +library(dplyr) + +scrape_cbs = ffanalytics:::scrape_cbs() + +final_cbs = dplyr::bind_rows(scrape_cbs) %>% + dplyr::transmute(merge_id = gsub("[[:punct:]]|\\s+", "", tolower(player)), + merge_id = paste0(gsub("\\s+", "", merge_id), "_", tolower(pos)), + cbs_id = src_id, + id) + +#### FFToday Players #### ---- + +scrape_fft = ffanalytics:::scrape_fftoday() + +final_fft = dplyr::bind_rows(scrape_fft) %>% + dplyr::transmute(player = ifelse(pos == "DST", team, player), + merge_id = gsub("[[:punct:]]|\\s+", "", tolower(player)), + merge_id = paste0(gsub("\\s+", "", merge_id), "_", tolower(pos)), + fftoday_id = src_id, + id, + player = NULL) + + + +#### FantasyPros #### ---- +# Fantasy pros numeric ids + +# Getting Players from last years stats +# Getting links + +scrape_fp = scrape_fantasypros() + +final_fp_all = dplyr::bind_rows(scrape_fp) %>% + dplyr::distinct() %>% + transmute(player = ifelse(pos == "DST", team, player), + merge_id = gsub("[[:punct:]]|\\s+", "", tolower(player)), + merge_id = paste0(gsub("\\s+", "", merge_id), "_", tolower(pos)), + fantasypro_num_id = src_id, + id, + player = NULL) + +#### NFL Players #### ---- + +scrape_nfl = ffanalytics:::scrape_nfl() + +final_nfl = dplyr::bind_rows(scrape_nfl) %>% + transmute(player = ifelse(pos == "DST", team, player), + merge_id = gsub("[[:punct:]]|\\s+", "", tolower(player)), + merge_id = paste0(gsub("\\s+", "", merge_id), "_", tolower(pos)), + nfl_id = src_id, + id, + player = NULL) + + + + +#### NUmber fire #### ---- +scrape_nf = scrape_numberfire() + +final_nf = scrape_nf %>% + dplyr::bind_rows() %>% + transmute(player = ifelse(pos == "DST", team, player), + merge_id = gsub("[[:punct:]]|\\s+", "", tolower(player)), + merge_id = paste0(gsub("\\s+", "", merge_id), "_", tolower(pos)), + numfire_id = src_id, + id, + player = NULL) + + + +#### RTSports #### ---- + +rt_scrape = ffanalytics:::scrape_rtsports() + +final_rt = rt_scrape %>% + dplyr::bind_rows() %>% + transmute(player = ifelse(pos == "DST", team, player), + merge_id = gsub("[[:punct:]]|\\s+", "", tolower(player)), + merge_id = paste0(gsub("\\s+", "", merge_id), "_", tolower(pos)), + rts_id = src_id, + id, + player = NULL) + + +#### Fleaflicker ---- + +ff_scrape = scrape_fleaflicker() + +final_flfl = ff_scrape %>% + dplyr::bind_rows() %>% + transmute(player = ifelse(pos == "DST", team, player), + merge_id = gsub("[[:punct:]]|\\s+", "", tolower(player)), + merge_id = paste0(gsub("\\s+", "", merge_id), "_", tolower(pos)), + fleaflicker_id = src_id, + id, + player = NULL) + + +#### Yahoo ---- + +yahoo_draft_info = ffanalytics:::yahoo_draft() + +final_yahoo = yahoo_draft_info %>% + transmute(merge_id = gsub("[[:punct:]]|\\s+", "", tolower(player_name)), + merge_id = paste0(gsub("\\s+", "", merge_id), "_", tolower(pos)), + stats_id = yahoo_id, + id) + + +# Getting ESPN ID's +scrape_espn = ffanalytics:::scrape_espn() + +final_espn = scrape_espn %>% + dplyr::bind_rows() %>% + transmute(merge_id = gsub("[[:punct:]]|\\s+", "", tolower(player)), + merge_id = paste0(gsub("\\s+", "", merge_id), "_", tolower(pos)), + espn_id = src_id, + id) + + +# Cleaning up above scrapes + +rm(list = grep("^(?!final).+", ls(), value = TRUE, perl = TRUE)) +gc() + + + +# updating player_ids table by name & pos + +curr_ids = ffanalytics:::player_ids + +my_fl_ids = httr::GET("https://api.myfantasyleague.com/2023/export?TYPE=players&L=&APIKEY=&DETAILS=1&SINCE=&PLAYERS=&JSON=1") %>% + httr::content() %>% + `[[`("players") %>% + `[[`("player") %>% + purrr::map(tibble::as_tibble) %>% + dplyr::bind_rows() %>% + mutate(nfl_id = basename(nfl_id)) %>% # prob unnecessary in future + tidyr::extract(name, c("last_name", "first_name"), "(.+),\\s(.+)") %>% + mutate(across(everything(), ~gsub("(?![.-])[[:punct:]]", "", ., perl = TRUE))) %>% + dplyr::mutate(name = paste0(first_name," ",last_name)) + +updated_ids = my_fl_ids %>% + select(first_name, last_name, position, id, team, ends_with("_id")) %>% + filter(if_any(ends_with("_id"), ~ . != 0 & !is.na(.))) %>% + mutate(position = ifelse(position %in% names(pos_corrections), unlist(pos_corrections)[position], position), + merge_id = paste0(first_name, last_name), + merge_id = gsub("[[:punct:]]|\\s+", "", tolower(merge_id)), + merge_id = paste0(merge_id, "_", tolower(position))) + +new_ids = mget(ls(pattern = "^final")) +new_ids = Reduce(function(x, y) { + full_join(x, y, "merge_id") %>% + mutate(id = case_when( + !is.na(id.x) & !is.na(id.y) & id.x != id.y ~ NA_character_, + TRUE ~ coalesce(id.x, id.y) + )) %>% + select(-id.x, -id.y) +}, new_ids) %>% + filter(!grepl("_(dst|def)$", merge_id)) %>% + distinct() + +# Updating the common columns in the myfantasyleague data +# common_cols = setdiff(intersect(names(new_ids), grep("_id$", names(updated_ids), value = TRUE)), "merge_id") +curr_cols = setdiff(grep("_id$", names(curr_ids), value = TRUE), "id") + + +for(j in curr_cols) { + + if(j %in% names(updated_ids) && j %in% names(new_ids)) { + + df_updated = updated_ids[c("id", "merge_id", j)] + updated_name = paste0(j, "_updated") + names(df_updated)[3] = updated_name + + df_new = new_ids[!is.na(new_ids[[j]]) & !is.na(new_ids$id), c("id", "merge_id", j)] + new_name = paste0(j, "_new") + names(df_new)[3] = new_name + + curr_ids = curr_ids %>% + full_join(df_updated, "id") %>% + left_join(distinct(select(df_new, -id)), "merge_id") %>% + left_join(distinct(select(df_new, -merge_id)), "id") + + curr_ids[[j]] = do.call(dplyr::coalesce, curr_ids[grepl(j, names(curr_ids), fixed = TRUE)]) + curr_ids[grep(paste0(j, "_.+|merge_id"), names(curr_ids))] = NULL + + } else if(j %in% names(updated_ids)) { + + df_updated = updated_ids[c("id", j)] + updated_name = paste0(j, "_updated") + names(df_updated)[2] = updated_name + + curr_ids = curr_ids %>% + full_join(df_updated, "id") + + curr_ids[[j]] = coalesce(curr_ids[[j]], curr_ids[[updated_name]]) + curr_ids[[updated_name]] = NULL + + } else if(j %in% names(new_ids)) { + + df_updated = updated_ids[c("id", "merge_id")] + + df_new = new_ids[!is.na(new_ids[[j]]) & !is.na(new_ids$id), c("id", "merge_id", j)] + new_name = paste0(j, "_new") + names(df_new)[3] = new_name + + curr_ids = curr_ids %>% + full_join(df_updated, "id") %>% + left_join(distinct(select(df_new, -id)), "merge_id") %>% + left_join(distinct(select(df_new, -merge_id)), "id") + + + curr_ids[[j]] = do.call(dplyr::coalesce, curr_ids[grepl(j, names(curr_ids), fixed = TRUE)]) + curr_ids[grep(paste0(j, "_.+|merge_id"), names(curr_ids))] = NULL + + } +} + + +# Run necessary QA. Looks at the data. Etc.. + + + +dim(ffanalytics:::player_ids) +dim(curr_ids) + + +curr_ids[curr_ids$id %in% curr_ids$id[duplicated(curr_ids$id)], ] + + + +colSums(!is.na(ffanalytics:::player_ids)) +colSums(!is.na(curr_ids)) + +# New - old +colSums(!is.na(curr_ids)) - colSums(!is.na(ffanalytics:::player_ids)) + +# any duplicates +sum(duplicated(curr_ids$id)) +sum(duplicated(ffanalytics:::player_ids)) + +temp_file = tempfile(fileext = ".rds") +print(temp_file) +saveRDS(curr_ids, temp_file) + + +# After running necessary QA, replace data +player_ids = readRDS(temp_file) +usethis::use_data(bonus_col_sets, bonus_col_coefs, + pts_bracket_coefs, player_ids, + overwrite = TRUE, internal = TRUE) + + + + +