-
Notifications
You must be signed in to change notification settings - Fork 12
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
tidy_rules (attempt 2) #22
Conversation
@topepo Please let me know if this does not fit the package, I will create a standalone package to do the above. |
Sorry for the delay. I'm look at these now. I do see some parsing errors: library(Cubist)
#> Loading required package: lattice
data("attrition", package = "rsample")
attrition <- tibble::as_tibble(attrition)
cubist_model_commitees <-
Cubist::cubist(
x = attrition %>% dplyr::select(-MonthlyIncome,-Attrition),
y = attrition %>% dplyr::select(MonthlyIncome) %>% unlist(),
committees = 7
)
rules <- tidy_rules(cubist_model_commitees)
# picking up the next line's output
rules[8,]
#> # A tibble: 1 x 8
#> support mean min max error lhs rhs committees
#> <int> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <int>
#> 1 69 19192. 18041 19999 416 JobLev… (13633) + (1076 * Jo… 1
rules$rhs[8]
#> [1] "(13633) + (1076 * JobLevel) + (8 * TotalWorkingYears * Model * 2:)"
# Rule 1/8: [69 cases, mean 19191.8, range 18041 to 19999, est err 416.0]
#
# if
# JobLevel > 4
# then
# outcome = 13633 + 1076 JobLevel + 8 TotalWorkingYears
#
# Model 2: Created on 2019-03-05 by the reprex package (v0.2.1) I might try to take a little time to prototype something for you using the |
I took some time and looked over notes from 2012 (!) about the model file. HEre's some code that's not perfect but is a good start on the model file. Maybe you could check it over and make suggestions (it needs more error trapping). library(stringr)
library(tidyverse)
parse_model_file <- function(txt) {
txt_rows <- str_split(txt, pattern = "\n") %>% unlist()
# These are the markers for where committees start
comm_inds <- str_which(txt_rows, "^rules=")
num_comm <- length(comm_inds)
# container for results for each committee
comms <- list(length = num_comm)
# Within each committee, elements are `type` (for each condition) and `coeff`
# (for model eq). A rule starts with a `conds` element and that tells us
# how many lines make up the rule elements. The rule elements start right after
# the `rules` line. Immediately after these is a single line with the information
# on the regression equation.
for (i in seq_along(comm_inds)) {
loc <- comm_inds[i]
# Get the locations of the model file that encompasses the committee's rows
if (i < num_comm) {
uppr <- comm_inds[i + 1] - 1
} else {
uppr <- length(txt_rows)
}
num_rules <- rule_info(txt_rows[loc])
comm_data <-
tibble(
rule_num = 1:num_rules,
rule = NA,
eq = NA
)
# Where are the lines that show the `conds` lines
attr_inds <- find_cond_info(txt_rows, loc, uppr)
cond_att <- map_dfr(attr_inds, parse_cond, txt = txt_rows)
comm_data <- bind_cols(comm_data, cond_att)
# Loop over all of the rules and get their rule conditions
for (j in seq_along(attr_inds)) {
att_loc <- attr_inds[j] + 1:comm_data$conds[j]
atts <- map_chr(txt_rows[att_loc], make_conds)
atts <- str_c(atts, collapse = " & ")
comm_data$rule[j] <- atts
}
# Get regression equations
eq_ind <- attr_inds + comm_data$conds + 1
comm_data$eq <- map_chr(txt_rows[eq_ind], get_reg_eq)
comm_data$committee <- i
comms[[i]] <- comm_data
}
res <-
bind_rows(comms) %>%
dplyr::select(committee, rule_num, rule, eq, cover, mean,
lower = loval, upper = hival, err = esterr)
res
}
# ------------------------------------------------------------------------------
find_cond_info <- function(txt, strt = 0, stp = 0) {
txt <- txt[(strt + 1):(stp - 1)]
str_which(txt, "^conds=") + strt
}
parse_cond <- function(ind, txt) {
entires <- str_split(txt[ind], " ") %>% unlist()
tmp <- map(entires, ~ str_split(.x, pattern = "=") %>% unlist())
nms <- map_chr(tmp, pluck, 1)
vals <- map(tmp, str_remove_all, pattern = "\"")
vals <- map_dbl(vals, ~ pluck(.x, 2) %>% as.numeric())
names(vals) <- nms
as.data.frame(t(vals))
}
rule_info <- function(txt) {
txt <- str_remove_all(txt, "\"")
txt <- str_remove(txt, "^rules=")
as.integer(txt)
}
# ------------------------------------------------------------------------------
get_reg_eq <- function(txt) {
entires <- str_split(txt, " ") %>% unlist()
n <- length(entires)
vals <- map_chr(entires, reg_terms)
lp <- vals[1]
vals <- vals[-1]
if (length(vals) > 0) {
n_elem <- length(vals)
if (n_elem %% 2 != 0) {
stop("number of remaining terms not even", call. = FALSE)
}
n_terms <- n_elem/2
split_terms <- split(vals, rep(1:n_terms, each = 2))
terms <- map_chr(split_terms, paste_slopes)
terms <- str_c(terms, collapse = " + ")
lp <- str_c(lp, " + ", terms, collapse = "")
}
lp
}
reg_terms <- function(txt) {
if (str_detect(txt, "^coeff")) {
val <- str_remove(txt[1], "coeff=\"")
val <- str_remove(val, "\"")
} else {
val <- str_remove(txt[1], "att=\"")
val <- str_remove(val, "\"")
}
val
}
paste_slopes <- function(txt) {
str_c("(", txt[2], "*", txt[1], ")", sep = " ")
}
# ------------------------------------------------------------------------------
make_conds <- function(txt) {
res <- map_chr(txt, single_cond)
res <- str_c(res, collapse = " & ")
res <- str_replace_all(res, "\"", "'")
res
}
single_cond <- function(txt) {
if (str_detect(txt, "type=\"2")) {
res <- cond_2(txt)
} else {
res <- cond_3(txt)
}
res
}
cond_2 <- function(txt) {
entires <- str_split(txt, " ") %>% unlist
rms <- "(att=\")|(cut=\")|(result=\")"
entires <- map_chr(entires, str_remove_all, rms)
entires <- map_chr(entires, str_remove_all, "\"")
str_c("(", entires[2], entires[4], entires[3], ")", sep = " ")
}
cond_3 <- function(txt) {
entires <- str_split(txt, " ") %>% unlist
var_name <- entires[2]
var_name <- str_remove(var_name, "att=\"")
var_name <- str_remove(var_name, "\"")
elts <- entires[3]
elts <- str_remove(elts, "elts=")
str_c("(", var_name, " %in% c(", elts, ") )", sep = " ")
}
# ------------------------------------------------------------------------------
library(Cubist)
data("attrition", package = "rsample")
attrition <- tibble::as_tibble(attrition)
cubist_model_commitees <-
Cubist::cubist(
x = attrition %>% dplyr::select(-MonthlyIncome, -Attrition),
y = attrition %>% dplyr::select(MonthlyIncome) %>% unlist(),
committees = 7
)
# debug(parse_model_file)
committe_info <- parse_model_file(cubist_model_commitees$model)
committe_info$rule
committe_info$eq (edit) removed unused functions |
It looks like this is in the |
@topepo Thanks for pointing out errors in the code which made tidyrules possible. Let me know if any of the work with tidyrules is helful in tidymodels/rules. I will be happy to contribute. |
No problem. I have to get a new version of I saw that
|
Following up our discussion here: topepo/C5.0#16