tidy_rules (attempt 2) #22

talegari · 2018-09-05T07:30:22Z

Following up our discussion here: topepo/C5.0#16

talegari · 2019-02-06T15:18:34Z

@topepo Please let me know if this does not fit the package, I will create a standalone package to do the above.

topepo · 2019-03-05T20:28:48Z

Sorry for the delay. I'm look at these now.

I do see some parsing errors:

library(Cubist)
#> Loading required package: lattice

data("attrition", package = "rsample")
attrition <- tibble::as_tibble(attrition)


cubist_model_commitees <-
  Cubist::cubist(
    x   = attrition %>% dplyr::select(-MonthlyIncome,-Attrition),
    y = attrition %>% dplyr::select(MonthlyIncome) %>% unlist(),
    committees = 7
  )

rules <- tidy_rules(cubist_model_commitees)
# picking up the next line's output
rules[8,]
#> # A tibble: 1 x 8
#>   support   mean   min   max error lhs     rhs                   committees
#>     <int>  <dbl> <dbl> <dbl> <dbl> <chr>   <chr>                      <int>
#> 1      69 19192. 18041 19999   416 JobLev… (13633) + (1076 * Jo…          1
rules$rhs[8]
#> [1] "(13633) + (1076 * JobLevel) + (8 * TotalWorkingYears * Model * 2:)"
# Rule 1/8: [69 cases, mean 19191.8, range 18041 to 19999, est err 416.0]
# 
# if
# JobLevel > 4
# then
# outcome = 13633 + 1076 JobLevel + 8 TotalWorkingYears
# 
# Model 2:

Created on 2019-03-05 by the reprex package (v0.2.1)

I might try to take a little time to prototype something for you using the model object.

topepo · 2019-03-06T01:26:45Z

I took some time and looked over notes from 2012 (!) about the model file. HEre's some code that's not perfect but is a good start on the model file.

Maybe you could check it over and make suggestions (it needs more error trapping).

library(stringr)
library(tidyverse)

parse_model_file <- function(txt) {
  txt_rows <- str_split(txt, pattern = "\n") %>% unlist()
  
  # These are the markers for where committees start
  comm_inds <- str_which(txt_rows, "^rules=")
  num_comm <- length(comm_inds)
  # container for results for each committee
  comms <- list(length = num_comm)
  
  # Within each committee, elements are `type` (for each condition) and `coeff`
  # (for model eq). A rule starts with a `conds` element and that tells us 
  # how many lines make up the rule elements. The rule elements start right after
  # the `rules` line. Immediately after these is a single line with the information
  # on the regression equation. 
  
  for (i in seq_along(comm_inds)) {
    loc <- comm_inds[i]
    # Get the locations of the model file that encompasses the committee's rows
    if (i < num_comm) {
      uppr <- comm_inds[i + 1] - 1
    } else {
      uppr <- length(txt_rows)
    }
    num_rules <- rule_info(txt_rows[loc])
    comm_data <-
      tibble(
        rule_num = 1:num_rules,
        rule = NA,
        eq = NA
      )
    # Where are the lines that show the `conds` lines
    attr_inds <- find_cond_info(txt_rows, loc, uppr)
    cond_att <- map_dfr(attr_inds, parse_cond, txt = txt_rows)
    comm_data <- bind_cols(comm_data, cond_att)
    
    # Loop over all of the rules and get their rule conditions
    for (j in seq_along(attr_inds)) {
      att_loc <- attr_inds[j] + 1:comm_data$conds[j]
      atts <- map_chr(txt_rows[att_loc], make_conds)
      atts <- str_c(atts, collapse = " & ")
      comm_data$rule[j] <- atts
    }

    # Get regression equations
    eq_ind <- attr_inds + comm_data$conds + 1
    comm_data$eq <- map_chr(txt_rows[eq_ind], get_reg_eq)
    comm_data$committee <- i
    comms[[i]] <- comm_data
  }
  res <- 
    bind_rows(comms) %>% 
    dplyr::select(committee, rule_num, rule, eq, cover, mean, 
                  lower = loval, upper = hival, err = esterr)
  res
}

# ------------------------------------------------------------------------------

find_cond_info <- function(txt, strt = 0, stp = 0) {
  txt <- txt[(strt + 1):(stp - 1)]
  str_which(txt, "^conds=") + strt
}

parse_cond <- function(ind, txt) {
  entires <- str_split(txt[ind], " ") %>% unlist()
  tmp <- map(entires, ~ str_split(.x, pattern = "=") %>% unlist())
  nms <- map_chr(tmp, pluck, 1)
  vals <- map(tmp, str_remove_all, pattern = "\"")
  vals <- map_dbl(vals, ~ pluck(.x, 2) %>% as.numeric())
  names(vals) <- nms
  as.data.frame(t(vals))
}

rule_info <- function(txt) {
  txt <- str_remove_all(txt, "\"")
  txt <- str_remove(txt, "^rules=")
  as.integer(txt)
}


# ------------------------------------------------------------------------------

get_reg_eq <- function(txt) {
  entires <- str_split(txt, " ") %>% unlist()
  n <- length(entires)
  vals <- map_chr(entires, reg_terms)
  lp <- vals[1]
  vals <- vals[-1]
  if (length(vals) > 0) {
    n_elem <- length(vals)
    if (n_elem %% 2 != 0) {
      stop("number of remaining terms not even", call. = FALSE)
    }
    n_terms <- n_elem/2
    split_terms <- split(vals, rep(1:n_terms, each = 2))
    terms <- map_chr(split_terms, paste_slopes)
    terms <- str_c(terms, collapse = " + ")
    lp <- str_c(lp, " + ", terms, collapse = "")
  }
  lp
}

reg_terms <- function(txt) {
  if (str_detect(txt, "^coeff")) {
    val <- str_remove(txt[1], "coeff=\"")
    val <- str_remove(val, "\"")
  } else {
    val <- str_remove(txt[1], "att=\"")
    val <- str_remove(val, "\"") 
  }
  val
}

paste_slopes <- function(txt) {
  str_c("(", txt[2], "*", txt[1], ")", sep = " ")
}

# ------------------------------------------------------------------------------

make_conds <- function(txt) {
  res <- map_chr(txt, single_cond)
  res <- str_c(res, collapse = " & ")
  res <- str_replace_all(res, "\"", "'")
  res
}

single_cond <- function(txt) {
  if (str_detect(txt, "type=\"2")) {
    res <- cond_2(txt)
  } else {
    res <- cond_3(txt)
  }
  res
}

cond_2 <- function(txt) {
  entires <- str_split(txt, " ") %>% unlist
  rms <- "(att=\")|(cut=\")|(result=\")"
  entires <- map_chr(entires, str_remove_all, rms)
  entires <- map_chr(entires, str_remove_all, "\"")
  str_c("(", entires[2], entires[4], entires[3], ")", sep = " ")
}

cond_3 <- function(txt) {
  entires <- str_split(txt, " ") %>% unlist
  var_name <- entires[2]
  var_name <- str_remove(var_name, "att=\"")
  var_name <- str_remove(var_name, "\"")
  elts <- entires[3]
  elts <- str_remove(elts, "elts=")
  str_c("(", var_name, " %in% c(", elts, ") )", sep = " ")
}

# ------------------------------------------------------------------------------

library(Cubist)

data("attrition", package = "rsample")
attrition <- tibble::as_tibble(attrition)

cubist_model_commitees <-
  Cubist::cubist(
    x = attrition %>% dplyr::select(-MonthlyIncome, -Attrition),
    y = attrition %>% dplyr::select(MonthlyIncome) %>% unlist(),
    committees = 7
  )

# debug(parse_model_file)
committe_info <- parse_model_file(cubist_model_commitees$model)
committe_info$rule
committe_info$eq

(edit) removed unused functions

topepo · 2020-01-07T03:13:38Z

It looks like this is in the tidyrules package

talegari · 2020-01-07T04:29:51Z

@topepo Thanks for pointing out errors in the code which made tidyrules possible. Let me know if any of the work with tidyrules is helful in tidymodels/rules. I will be happy to contribute.

topepo · 2020-01-07T17:28:20Z

No problem. I have to get a new version of Cubist on CRAN due to some gcc changes.

I saw that tidyrules was on cran so I removed it here and updated the vignette to use it.

rules only contains thin wrappers for C5 and cubist so that they can be used with parsnip. If you'd like to contribute to tidypredict, there are two related issues tidymodels/tidypredict#57 and tidymodels/tidypredict#58 that would use some help.

talegari added 2 commits August 3, 2018 14:46

tidy_rules

62ef28e

handled comments from max in topepo/C5.0#16 (comment)

3980c89

talegari mentioned this pull request Sep 5, 2018

Tidy rules topepo/C5.0#16

Closed

topepo closed this Jan 7, 2020

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

tidy_rules (attempt 2) #22

tidy_rules (attempt 2) #22

talegari commented Sep 5, 2018

talegari commented Feb 6, 2019

topepo commented Mar 5, 2019

topepo commented Mar 6, 2019 •

edited

Loading

topepo commented Jan 7, 2020

talegari commented Jan 7, 2020

topepo commented Jan 7, 2020

tidy_rules (attempt 2) #22

tidy_rules (attempt 2) #22

Conversation

talegari commented Sep 5, 2018

talegari commented Feb 6, 2019

topepo commented Mar 5, 2019

topepo commented Mar 6, 2019 • edited Loading

topepo commented Jan 7, 2020

talegari commented Jan 7, 2020

topepo commented Jan 7, 2020

topepo commented Mar 6, 2019 •

edited

Loading