<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en"><head>
<meta charset="utf-8">
<meta name="generator" content="quarto-1.4.549">
<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">
<meta name="dcterms.date" content="2024-03-17">
<title>Learning with (Decision) Trees</title>
<style>
code{white-space: pre-wrap;}
span.smallcaps{font-variant: small-caps;}
div.columns{display: flex; gap: min(4vw, 1.5em);}
div.column{flex: auto; overflow-x: auto;}
div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
ul.task-list{list-style: none;}
ul.task-list li input[type="checkbox"] {
width: 0.8em;
margin: 0 0.8em 0.2em -1em; /* quarto-specific, see https://github.com/quarto-dev/quarto-cli/issues/4556 */
vertical-align: middle;
}
/* CSS for syntax highlighting */
pre > code.sourceCode { white-space: pre; position: relative; }
pre > code.sourceCode > span { line-height: 1.25; }
pre > code.sourceCode > span:empty { height: 1.2em; }
.sourceCode { overflow: visible; }
code.sourceCode > span { color: inherit; text-decoration: inherit; }
div.sourceCode { margin: 1em 0; }
pre.sourceCode { margin: 0; }
@media screen {
div.sourceCode { overflow: auto; }
}
@media print {
pre > code.sourceCode { white-space: pre-wrap; }
pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; }
}
pre.numberSource code
{ counter-reset: source-line 0; }
pre.numberSource code > span
{ position: relative; left: -4em; counter-increment: source-line; }
pre.numberSource code > span > a:first-child::before
{ content: counter(source-line);
position: relative; left: -1em; text-align: right; vertical-align: baseline;
border: none; display: inline-block;
-webkit-touch-callout: none; -webkit-user-select: none;
-khtml-user-select: none; -moz-user-select: none;
-ms-user-select: none; user-select: none;
padding: 0 4px; width: 4em;
}
pre.numberSource { margin-left: 3em; padding-left: 4px; }
div.sourceCode
{ }
@media screen {
pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
}
</style>
<script src="1.1-DecisionTrees_files/libs/clipboard/clipboard.min.js"></script>
<script src="1.1-DecisionTrees_files/libs/quarto-html/quarto.js"></script>
<script src="1.1-DecisionTrees_files/libs/quarto-html/popper.min.js"></script>
<script src="1.1-DecisionTrees_files/libs/quarto-html/tippy.umd.min.js"></script>
<script src="1.1-DecisionTrees_files/libs/quarto-html/anchor.min.js"></script>
<link href="1.1-DecisionTrees_files/libs/quarto-html/tippy.css" rel="stylesheet">
<link href="1.1-DecisionTrees_files/libs/quarto-html/quarto-syntax-highlighting.css" rel="stylesheet" id="quarto-text-highlighting-styles">
<script src="1.1-DecisionTrees_files/libs/bootstrap/bootstrap.min.js"></script>
<link href="1.1-DecisionTrees_files/libs/bootstrap/bootstrap-icons.css" rel="stylesheet">
<link href="1.1-DecisionTrees_files/libs/bootstrap/bootstrap.min.css" rel="stylesheet" id="quarto-bootstrap" data-mode="light">
<script src="https://polyfill.io/v3/polyfill.min.js?features=es6"></script>
<script src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-chtml-full.js" type="text/javascript"></script>
<script type="text/javascript">
const typesetMath = (el) => {
if (window.MathJax) {
// MathJax Typeset
window.MathJax.typeset([el]);
} else if (window.katex) {
// KaTeX Render
var mathElements = el.getElementsByClassName("math");
var macros = [];
for (var i = 0; i < mathElements.length; i++) {
var texText = mathElements[i].firstChild;
if (mathElements[i].tagName == "SPAN") {
window.katex.render(texText.data, mathElements[i], {
displayMode: mathElements[i].classList.contains('display'),
throwOnError: false,
macros: macros,
fleqn: false
});
}
}
}
}
window.Quarto = {
typesetMath
};
</script>
</head>
<body>
<div id="quarto-content" class="page-columns page-rows-contents page-layout-article">
<div id="quarto-margin-sidebar" class="sidebar margin-sidebar">
<nav id="TOC" role="doc-toc" class="toc-active">
<h2 id="toc-title">Table of contents</h2>
<ul>
<li><a href="#introduction" id="toc-introduction" class="nav-link active" data-scroll-target="#introduction">Introduction</a>
<ul class="collapse">
<li><a href="#supervised-learning" id="toc-supervised-learning" class="nav-link" data-scroll-target="#supervised-learning">Supervised Learning</a></li>
</ul></li>
<li><a href="#decision-tree-basics" id="toc-decision-tree-basics" class="nav-link" data-scroll-target="#decision-tree-basics">Decision tree basics</a>
<ul class="collapse">
<li><a href="#decision-tree" id="toc-decision-tree" class="nav-link" data-scroll-target="#decision-tree">Decision tree</a></li>
<li><a href="#training-and-testing-decision-trees" id="toc-training-and-testing-decision-trees" class="nav-link" data-scroll-target="#training-and-testing-decision-trees">Training and testing decision trees</a></li>
<li><a href="#basic-notation" id="toc-basic-notation" class="nav-link" data-scroll-target="#basic-notation">Basic notation</a></li>
<li><a href="#tree-growing-procedure" id="toc-tree-growing-procedure" class="nav-link" data-scroll-target="#tree-growing-procedure">Tree-Growing Procedure</a></li>
<li><a href="#choosing-the-best-split-for-a-variable" id="toc-choosing-the-best-split-for-a-variable" class="nav-link" data-scroll-target="#choosing-the-best-split-for-a-variable">Choosing the best split for a Variable</a>
<ul class="collapse">
<li><a href="#splitting-strategies" id="toc-splitting-strategies" class="nav-link" data-scroll-target="#splitting-strategies">Splitting Strategies</a></li>
<li><a href="#node-information-functions" id="toc-node-information-functions" class="nav-link" data-scroll-target="#node-information-functions">Node information functions</a></li>
<li><a href="#information-gain" id="toc-information-gain" class="nav-link" data-scroll-target="#information-gain">Information gain</a></li>
</ul></li>
<li><a href="#example" id="toc-example" class="nav-link" data-scroll-target="#example">Example</a></li>
<li><a href="#recursive-partitioning" id="toc-recursive-partitioning" class="nav-link" data-scroll-target="#recursive-partitioning">Recursive Partitioning</a></li>
<li><a href="#stop-spliting" id="toc-stop-spliting" class="nav-link" data-scroll-target="#stop-spliting">Stop spliting</a></li>
<li><a href="#plurality-rule" id="toc-plurality-rule" class="nav-link" data-scroll-target="#plurality-rule">Plurality rule</a></li>
<li><a href="#choosing-the-best-split-in-regression-trees" id="toc-choosing-the-best-split-in-regression-trees" class="nav-link" data-scroll-target="#choosing-the-best-split-in-regression-trees">Choosing the best split in Regression Trees</a></li>
<li><a href="#estimating-the-misclassification-rate" id="toc-estimating-the-misclassification-rate" class="nav-link" data-scroll-target="#estimating-the-misclassification-rate">Estimating the misclassification rate</a></li>
<li><a href="#tree-pruning" id="toc-tree-pruning" class="nav-link" data-scroll-target="#tree-pruning">Tree Pruning</a>
<ul class="collapse">
<li><a href="#cost-complexity-pruning" id="toc-cost-complexity-pruning" class="nav-link" data-scroll-target="#cost-complexity-pruning">Cost complexity pruning</a></li>
</ul></li>
<li><a href="#advantages-and-disadvantages-of-trees" id="toc-advantages-and-disadvantages-of-trees" class="nav-link" data-scroll-target="#advantages-and-disadvantages-of-trees">Advantages and disadvantages of trees</a></li>
</ul></li>
<li><a href="#bibliography" id="toc-bibliography" class="nav-link" data-scroll-target="#bibliography">Bibliography</a></li>
</ul>
<div class="quarto-alternate-formats"><h2>Other Formats</h2><ul><li><a href="1.1-DecisionTrees.pdf"><i class="bi bi-file-pdf"></i>PDF</a></li></ul></div></nav>
</div>
<main class="content" id="quarto-document-content">
<header id="title-block-header" class="quarto-title-block default">
<div class="quarto-title">
<h1 class="title">Learning with (Decision) Trees</h1>
</div>
<div class="quarto-title-meta">
<div>
<div class="quarto-title-meta-heading">Authors</div>
<div class="quarto-title-meta-contents">
<p>Esteban Vegas </p>
<p>Ferran Reverter </p>
<p>Alex Sanchez </p>
</div>
</div>
<div>
<div class="quarto-title-meta-heading">Published</div>
<div class="quarto-title-meta-contents">
<p class="date">March 17, 2024</p>
</div>
</div>
</div>
</header>
<div class="cell">
<div class="sourceCode cell-code" id="cb1"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="fu">options</span>(<span class="at">width=</span><span class="dv">100</span>) </span>
<span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a><span class="cf">if</span>(<span class="sc">!</span><span class="fu">require</span>(<span class="st">"knitr"</span>)) <span class="fu">install.packages</span>(<span class="st">"knitr"</span>)</span>
<span id="cb1-3"><a href="#cb1-3" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(<span class="st">"knitr"</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<section id="introduction" class="level1">
<h1>Introduction</h1>
<p>Decision trees have been around for a number of years. Their recent revival is due to the discovery that ensembles of slightly different trees tend to generalize far better, that is, to produce much higher accuracy on previously unseen data. Ensembles of trees will be discussed later; let us focus first on individual trees.</p>
<section id="supervised-learning" class="level2">
<h2 class="anchored" data-anchor-id="supervised-learning">Supervised Learning</h2>
<p>Classification and regression are supervised learning tasks. The learning set is of the form: <span class="math display">\[
\mathcal{L}=\{(\mathbf{x}_i,y_i)\}_{i=1}^n
\]</span> Classification (Two classes, binary) <span class="math display">\[
\mathbf{x}\in\mathbb{R}^d,\qquad y\in\{-1,+1\}
\]</span> Regression <span class="math display">\[
\mathbf{x}\in\mathbb{R}^d,\qquad y\in\mathbb{R}
\]</span></p>
</section>
</section>
<section id="decision-tree-basics" class="level1">
<h1>Decision tree basics</h1>
<p><img src="images/rf0.jpg" class="img-fluid" style="width:75.0%"></p>
<section id="decision-tree" class="level2">
<h2 class="anchored" data-anchor-id="decision-tree">Decision tree</h2>
<p>See Fig. 1.</p>
<ol type="1">
<li>A tree is a set of nodes and edges organized in a hierarchical fashion. In contrast to a graph, in a tree there are no loops. Internal nodes are denoted with circles and terminal nodes with squares.</li>
<li>A decision tree is a tree where each split node stores a Boolean test function to be applied to the incoming data. Each leaf stores the final answer (predictor).</li>
</ol>
</section>
<section id="training-and-testing-decision-trees" class="level2">
<h2 class="anchored" data-anchor-id="training-and-testing-decision-trees">Training and testing decision trees</h2>
<p><img src="images/rf00.jpg" class="img-fluid" style="width:75.0%"></p>
</section>
<section id="basic-notation" class="level2">
<h2 class="anchored" data-anchor-id="basic-notation">Basic notation</h2>
<p>See Fig. 2.</p>
<ol type="1">
<li>Input data is represented as a collection of points in the <span class="math inline">\(d\)</span>-dimensional space which are labeled by their feature responses.</li>
<li>A decision tree is a hierarchical structure of connected nodes.</li>
<li>Training a decision tree involves sending all training data <span class="math inline">\(\{v\}\)</span> into the tree and optimizing the parameters of the split nodes so as to minimize a chosen cost function.</li>
<li>During testing, a split (internal) node applies a test to the input data <span class="math inline">\(v\)</span> and sends it to the appropriate child. The process is repeated until a leaf (terminal) node is reached (beige path).</li>
</ol>
<p><img src="images/partition.jpg" class="img-fluid" style="width:75.0%"></p>
</section>
<section id="tree-growing-procedure" class="level2">
<h2 class="anchored" data-anchor-id="tree-growing-procedure">Tree-Growing Procedure</h2>
<p>See Fig. 3.</p>
<p>In order to grow a classification tree, we need to answer four basic questions:</p>
<ol type="1">
<li>How do we choose the Boolean conditions for splitting at each node?</li>
<li>Which criterion should we use to split a parent node into its two child nodes?</li>
<li>How do we decide when a node becomes a terminal node (i.e., when to stop splitting)?</li>
<li>How do we assign a class to a terminal node?</li>
</ol>
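<p>As a concrete point of reference, the sketch below grows a small classification tree in R with the <code>rpart</code> package (an illustration, assuming the package is available; by default <code>rpart</code> answers these four questions with binary Gini-based splits, minimum-node-size stopping rules, and plurality-vote class labels at the leaves):</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"># A minimal sketch: grow a classification tree on the built-in iris data
if(!require("rpart")) install.packages("rpart")
library(rpart)

fit &lt;- rpart(Species ~ ., data = iris, method = "class")
print(fit)            # text view of the splits and the leaf classes
plot(fit); text(fit)  # quick plot of the fitted tree</code></pre></div>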
</section>
<section id="choosing-the-best-split-for-a-variable" class="level2">
<h2 class="anchored" data-anchor-id="choosing-the-best-split-for-a-variable">Choosing the best split for a Variable</h2>
<section id="splitting-strategies" class="level3">
<h3 class="anchored" data-anchor-id="splitting-strategies">Splitting Strategies</h3>
<p>At each node, the tree-growing algorithm has to decide on which variable it is “best” to split. To do so, it enumerates every possible split over all variables present at that node, evaluates each one, and picks the best in some sense.</p>
<p>For a description of splitting rules, we need to distinguish between ordinal (or continuous) and nominal (or categorical) variables. For a continuous or ordinal variable, the number of possible splits at a given node is one fewer than the number of its distinctly observed values. For a categorical variable with <span class="math inline">\(m\)</span> distinct categories, there are <span class="math inline">\(2^{m-1}-1\)</span> distinct binary splits.</p>
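<p>These counts are easy to check in R (a small sketch with made-up values):</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"># Continuous/ordinal: one split fewer than the number of distinct values
x &lt;- c(2.1, 3.5, 3.5, 7.0, 9.2)
length(unique(x)) - 1      # 3 possible splits

# Categorical with m levels: 2^(m-1) - 1 distinct binary partitions
m &lt;- nlevels(factor(c("a", "b", "c")))
2^(m - 1) - 1              # 3 possible splits</code></pre></div>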
<p>We first need to choose the best split for a given variable. Accordingly, we need a measure of the goodness of a split. Let <span class="math inline">\(C_1\)</span>,…,<span class="math inline">\(C_K\)</span> be the <span class="math inline">\(K\geq 2\)</span> classes. For node <span class="math inline">\(\tau\)</span>, we denote by <span class="math display">\[
P(X\in C_k\vert \tau)
\]</span> the conditional probability that an observation <span class="math inline">\(x\)</span> is in <span class="math inline">\(C_k\)</span> given that it falls into the node <span class="math inline">\(\tau\)</span>.</p>
</section>
<section id="node-information-functions" class="level3">
<h3 class="anchored" data-anchor-id="node-information-functions">Node information functions</h3>
<p>See Fig. 4.</p>
<p><img src="images/impurity.jpg" class="img-fluid" style="width:75.0%"></p>
<p>Fig. 4: Node impurity functions for the two-class case. The entropy function (rescaled) is the red curve, the Gini index is the green curve, and the resubstitution estimate of the misclassification rate is the blue curve.</p>
<p>For node <span class="math inline">\(\tau\)</span>, the node information function <span class="math inline">\(i(\tau)\)</span> is defined by <span class="math display">\[
i(\tau)=\phi\Big(p(1\vert \tau),...,p(K\vert \tau)\Big)
\]</span> where <span class="math inline">\(p(k\vert \tau)\)</span> is an estimate of <span class="math inline">\(P(X\in C_k\vert \tau)\)</span>.</p>
<p>We require <span class="math inline">\(\phi\)</span> to be a symmetric function, defined on the set of all <span class="math inline">\(K\)</span>-tuples of probabilities <span class="math inline">\((p_1,\dots,p_k)\)</span> with unit sum, minimized at the points <span class="math inline">\((1,0,\cdots,0)\)</span>, <span class="math inline">\((0,1,\cdots,0)\)</span>, <span class="math inline">\((0,0,\cdots,1)\)</span>, and maximized at the point <span class="math inline">\((\frac{1}{K},\frac{1}{K},\cdots,\frac{1}{K})\)</span>. One such function <span class="math inline">\(\phi\)</span> is the entropy function, <span class="math display">\[
i(\tau)=-\sum_{i=1}^K p(k\vert \tau)\log p(k\vert \tau)
\]</span></p>
<p>An alternative to the entropy is the Gini index, given by</p>
<p><span class="math display">\[
i(\tau)=1-\sum_{k=1}^K p(k\vert \tau)^2
\]</span></p>
<p>A third choice is the misclassification rate,</p>
<p><span class="math display">\[
i(\tau)=1-\max_k p(k\vert \tau)
\]</span></p>
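<p>All three node information functions take only a vector of class proportions; a minimal sketch in R (not tied to any particular package):</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"># p: vector of class proportions p(k | tau), summing to one
entropy  &lt;- function(p) -sum(ifelse(p &gt; 0, p * log(p), 0))
gini     &lt;- function(p) 1 - sum(p^2)
misclass &lt;- function(p) 1 - max(p)

p &lt;- c(0.7, 0.3)
entropy(p)   # 0.6109
gini(p)      # 0.42
misclass(p)  # 0.3</code></pre></div>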
</section>
<section id="information-gain" class="level3">
<h3 class="anchored" data-anchor-id="information-gain">Information gain</h3>
<p><img src="images/rf11.jpg" class="img-fluid" style="width:80.0%"></p>
<p>See Fig. 5.</p>
<p>Suppose that at node <span class="math inline">\(\tau\)</span> we apply split <span class="math inline">\(s\)</span>, so that a proportion <span class="math inline">\(p_L\)</span> of the observations drops down to the left child-node <span class="math inline">\(\tau_L\)</span> and the remaining proportion <span class="math inline">\(p_R\)</span> drops down to the right child-node <span class="math inline">\(\tau_R\)</span>.</p>
<p>The goodness of split <span class="math inline">\(s\)</span> at node <span class="math inline">\(\tau\)</span> is given by the reduction in impurity gained by splitting the parent node <span class="math inline">\(\tau\)</span> into its child nodes, <span class="math inline">\(\tau_L\)</span> and <span class="math inline">\(\tau_R\)</span>,</p>
<p><span class="math display">\[
I(s,\tau)= i(\tau)-p_L i(\tau_L)-p_R i(\tau_R)
\]</span></p>
<p>The best split for the single variable <span class="math inline">\(X_j\)</span> is the one that has the largest value of <span class="math inline">\(I(s,\tau)\)</span> over all <span class="math inline">\(s\in\mathcal{S}_j\)</span>, the set of possible distinct splits for <span class="math inline">\(X_j\)</span>.</p>
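<p>Given the impurity functions above, the gain for one candidate split can be computed directly from the labels at the node (a sketch; <code>entropy</code> is the function defined earlier):</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"># y: class labels at node tau; left: TRUE where x_j &lt;= s
info_gain &lt;- function(y, left, i = entropy) {
  pL &lt;- mean(left); pR &lt;- 1 - pL
  i(prop.table(table(y))) -
    pL * i(prop.table(table(y[left]))) -
    pR * i(prop.table(table(y[!left])))
}</code></pre></div>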
</section>
</section>
<section id="example" class="level2">
<h2 class="anchored" data-anchor-id="example">Example</h2>
<p>Consider first the parent node <span class="math inline">\(\tau\)</span>. Estimate <span class="math inline">\(P(+1\vert \tau)\)</span> by <span class="math inline">\(n_{+1}/n_{++}\)</span> and <span class="math inline">\(P(-1\vert \tau)\)</span> by <span class="math inline">\(n_{+2}/n_{++}\)</span>; the estimated impurity function is then: <span class="math display">\[
i(\tau)=-\big(\frac{n_{+1}}{n_{++}}\big)\log\big(\frac{n_{+1}}{n_{++}}\big)-\big(\frac{n_{+2}}{n_{++}}\big)\log\big(\frac{n_{+2}}{n_{++}}\big)
\]</span> Note that <span class="math inline">\(i(\tau)\)</span> is completely independent of the type of proposed split.</p>
<p>Now consider the child nodes <span class="math inline">\(\tau_L\)</span> and <span class="math inline">\(\tau_R\)</span>. We estimate <span class="math inline">\(p_L=n_{1+}/n_{++}\)</span> and <span class="math inline">\(p_R=n_{2+}/n_{++}\)</span>.</p>
<p>For <span class="math inline">\(X_j\leq s\)</span>, we estimate <span class="math inline">\(P(\frac{+1}{\tau_L})=n_{11}/n_{1+}\)</span> and <span class="math inline">\(P(\frac{-1}{\tau_L})=n_{12}/n_{1+}\)</span> For the condition <span class="math inline">\(X_j> s\)</span>, we estimate <span class="math inline">\(P(\frac{+1}{\tau_R})=n_{21}/n_{2+}\)</span> and <span class="math inline">\(P(\frac{-1}{\tau_R})=n_{22}/n_{2+}\)</span>. We then compute: <span class="math display">\[
i(\tau_L)=-\big(\frac{n_{11}}{n_{1+}}\big)\log\big(\frac{n_{11}}{n_{1+}}\big)-\big(\frac{n_{12}}{n_{1+}}\big)\log\big(\frac{n_{12}}{n_{1+}}\big)
\]</span> <span class="math display">\[
i(\tau_R)=-\big(\frac{n_{21}}{n_{2+}}\big)\log\big(\frac{n_{21}}{n_{2+}}\big)-\big(\frac{n_{22}}{n_{2+}}\big)\log\big(\frac{n_{22}}{n_{2+}}\big)
\]</span> and then we can compute <span class="math inline">\(I(s,\tau)\)</span>.</p>
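<p>As a worked numerical check, take the hypothetical counts <span class="math inline">\(n_{11}=40\)</span>, <span class="math inline">\(n_{12}=10\)</span>, <span class="math inline">\(n_{21}=20\)</span>, <span class="math inline">\(n_{22}=30\)</span> and evaluate the formulas above in R (reusing the <code>entropy</code> function from before):</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"># Rows: tau_L and tau_R; columns: the two classes
n &lt;- matrix(c(40, 10,
              20, 30), nrow = 2, byrow = TRUE)
pL &lt;- sum(n[1, ]) / sum(n)                 # 0.5
pR &lt;- 1 - pL                               # 0.5
i_parent &lt;- entropy(colSums(n) / sum(n))   # classes split 60/40 overall
i_L &lt;- entropy(n[1, ] / sum(n[1, ]))
i_R &lt;- entropy(n[2, ] / sum(n[2, ]))
i_parent - pL * i_L - pR * i_R             # I(s, tau) = 0.086</code></pre></div>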
</section>
<section id="recursive-partitioning" class="level2">
<h2 class="anchored" data-anchor-id="recursive-partitioning">Recursive Partitioning</h2>
</section>
<section id="stop-spliting" class="level2">
<h2 class="anchored" data-anchor-id="stop-spliting">Stop spliting</h2>
</section>
<section id="plurality-rule" class="level2">
<h2 class="anchored" data-anchor-id="plurality-rule">Plurality rule</h2>
<p>See Fig. 6.</p>
<p>How do we associate a class with a terminal node?</p>
<p>Suppose at terminal node <span class="math inline">\(\tau\)</span> there are <span class="math inline">\(n(\tau)\)</span> observations, of which <span class="math inline">\(n_k(\tau)\)</span> are from class <span class="math inline">\(C_k\)</span>, <span class="math inline">\(k=1,...,K\)</span>. Then, the class which corresponds to the largest of the <span class="math inline">\(\{n_1(\tau),...,n_K(\tau)\}\)</span> is assigned to <span class="math inline">\(\tau\)</span>.</p>
<p>This is called the plurality rule. This rule can be derived from the Bayes rule classifier, where we assign node <span class="math inline">\(\tau\)</span> to class <span class="math inline">\(C_i\)</span> if <span class="math inline">\(p(i|\tau) = \max_k p(k|\tau)\)</span>.</p>
<p><img src="images/rf1.jpg" class="img-fluid" style="width:75.0%"></p>
</section>
<section id="choosing-the-best-split-in-regression-trees" class="level2">
<h2 class="anchored" data-anchor-id="choosing-the-best-split-in-regression-trees">Choosing the best split in Regression Trees</h2>
<p>We now discuss the process of building a regression tree. Roughly speaking, there are two steps:</p>
<ol type="1">
<li>We divide the predictor space, that is, the set of possible values for <span class="math inline">\(X_1,...,X_p\)</span>, into <span class="math inline">\(J\)</span> distinct and non-overlapping regions, <span class="math inline">\(R_1,...,R_J\)</span>.</li>
<li>For every observation that falls into region <span class="math inline">\(R_j\)</span>, we make the same prediction, which is simply the mean of the response values of the training observations in <span class="math inline">\(R_j\)</span>.</li>
</ol>
<p>For instance, suppose that in Step 1 we obtain two regions, <span class="math inline">\(R_1\)</span> and <span class="math inline">\(R_2\)</span>, and that the response mean of the training observations in the first region is 10, while the response mean of the training observations in the second region is 20. Then for a given observation <span class="math inline">\(X=x\)</span>, if <span class="math inline">\(x\in R_1\)</span> we will predict a value of 10, and if <span class="math inline">\(x\in R_2\)</span> we will predict a value of 20.</p>
<p>We now elaborate on Step 1 above. How do we construct the regions <span class="math inline">\(R_1,...,R_J\)</span>? In theory, the regions could have any shape. However, we choose to divide the predictor space into high-dimensional rectangles, or boxes, for simplicity and for ease of interpretation of the resulting predictive model. The goal is to find boxes <span class="math inline">\(R_1,...,R_J\)</span> that minimize the RSS (Residual Sum of Squares), given by <span class="math display">\[
\sum_{j=1}^J\sum_{i\in R_j}(y_i-\hat{y}_{R_j})^2
\]</span></p>
<p>where <span class="math inline">\(\hat{y}_{R_j}\)</span> is the mean response for the training observations within the j-th box. Unfortunately, it is computationally infeasible to consider every possible partition of the feature space into <span class="math inline">\(J\)</span> boxes. For this reason, we take a top-down, greedy approach that is known as recursive binary splitting. The approach is top-down because it begins at the top of the tree (at which point all observations belong to a single region) and then successively splits the predictor space; each split is indicated via two new branches further down on the tree. It is greedy because at each step of the tree-building process, the best split is made at that particular step, rather than looking ahead and picking a split that will lead to a better tree in some future step.</p>
<p>In order to perform recursive binary splitting, we first select the predictor <span class="math inline">\(X_j\)</span> and the cutpoint <span class="math inline">\(s\)</span> such that splitting the predictor space into the regions <span class="math inline">\(\{X|X_j \leq s\}\)</span> and <span class="math inline">\(\{X|X_j > s\}\)</span> leads to the greatest possible reduction in RSS. (The notation <span class="math inline">\(\{X|X_j \leq s\}\)</span> means the region of predictor space in which <span class="math inline">\(X_j\)</span> takes on a value less than <span class="math inline">\(s\)</span>). That is, we consider all predictors <span class="math inline">\(X_1,...,X_p\)</span>, and all possible values of the cutpoint <span class="math inline">\(s\)</span> for each of the predictors, and then choose the predictor and cutpoint such that the resulting tree has the lowest RSS. In greater detail, for any <span class="math inline">\(j\)</span> and <span class="math inline">\(s\)</span>, we define the pair of half-planes <span class="math display">\[
R_1(j,s)=\{X|X_j \leq s\},\qquad R_2(j,s)=\{X|X_j > s\}
\]</span> and we seek the value of <span class="math inline">\(j\)</span> and <span class="math inline">\(s\)</span> that minimize the equation <span class="math display">\[\begin{equation}\label{tree1}
\sum_{i:x_i\in R_1}(y_i-\hat{y}_{R_1})^2+\sum_{i:x_i\in R_2}(y_i-\hat{y}_{R_2})^2
\end{equation}\]</span></p>
<p>where <span class="math inline">\(\hat{y}_{R_1}\)</span> is the mean response for the training observations in <span class="math inline">\(R_1(j,s)\)</span>, and <span class="math inline">\(\hat{y}_{R_2}\)</span> is the mean response for the training observations in <span class="math inline">\(R_2(j,s)\)</span>. Finding the values of <span class="math inline">\(j\)</span> and <span class="math inline">\(s\)</span> that minimize (<span class="math inline">\(\ref{tree1}\)</span>) can be done quite quickly, especially when the number of features <span class="math inline">\(p\)</span> is not too large.</p>
<p>Next, we repeat the process, looking for the best predictor and best cutpoint in order to split the data further so as to minimize the RSS within each of the resulting regions. However, this time, instead of splitting the entire predictor space, we split one of the two previously identified regions. We now have three regions. Again, we look to split one of these three regions further, so as to minimize the RSS. The process continues until a stopping criterion is reached; for instance, we may continue until no region contains more than five observations.</p>
<p>Once the regions <span class="math inline">\(R_1,...,R_J\)</span> have been created, we predict the response for a given test observation using the mean of the training observations in the region to which that test observation belongs.</p>
</section>
<section id="estimating-the-misclassification-rate" class="level2">
<h2 class="anchored" data-anchor-id="estimating-the-misclassification-rate">Estimating the misclassification rate</h2>
<p>Let <span class="math inline">\(T\)</span> be the tree classifier and let <span class="math inline">\(\tilde{T}=\{\tau_1,\tau_2,...,\tau_L\}\)</span> denote the set of all terminal nodes of <span class="math inline">\(T\)</span>. We can now estimate the true misclassification rate, <span class="math display">\[\begin{equation}\label{e1}
R(T)=\sum_{l=1}^LR(\tau_l)P(\tau_l),
\end{equation}\]</span> for <span class="math inline">\(T\)</span>, where <span class="math inline">\(P(\tau)\)</span> is the probability that an observation falls into node <span class="math inline">\(\tau\)</span> and <span class="math inline">\(R(\tau)\)</span> is the within-node misclassification rate of an observation in node <span class="math inline">\(\tau\)</span>.</p>
<p>If we estimate <span class="math inline">\(R(\tau)\)</span> by <span class="math display">\[
r(\tau)=1-\max_kp(k|\tau)
\]</span> and we estimate <span class="math inline">\(P(\tau_l)\)</span> by the proportion <span class="math inline">\(p(\tau_l)\)</span> of all observations that fall into node <span class="math inline">\(\tau_l\)</span>, then, the resubstitution estimate of <span class="math inline">\(R(T)\)</span> is</p>
<p><span class="math display">\[
\hat{R}(T)=\sum_{l=1}^Lr(\tau_l)p(\tau_l).
\]</span></p>
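<p>A direct translation into R (a sketch with hypothetical per-leaf class counts <span class="math inline">\(n_k(\tau_l)\)</span>):</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"># counts[[l]] holds the class counts at terminal node tau_l
counts &lt;- list(c(30, 5), c(4, 20), c(10, 11))
n_total &lt;- sum(unlist(counts))
Rhat &lt;- sum(sapply(counts, function(nk) {
  p_tau &lt;- sum(nk) / n_total       # p(tau_l)
  r_tau &lt;- 1 - max(nk) / sum(nk)   # r(tau_l) = 1 - max_k p(k | tau_l)
  r_tau * p_tau
}))
Rhat   # resubstitution estimate of R(T)</code></pre></div>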
<p>The resubstitution estimate <span class="math inline">\(\hat{R}(T)\)</span>, however, leaves much to be desired as an estimate of <span class="math inline">\(R(T)\)</span>.</p>
<p>First, bigger trees (i.e., more splitting) have smaller values of <span class="math inline">\(\hat{R}(T)\)</span>; that is, <span class="math inline">\(\hat{R}(T')\leq \hat{R}(T)\)</span>, where <span class="math inline">\(T'\)</span> is formed by splitting a terminal node of <span class="math inline">\(T\)</span>. For example, if a tree is allowed to grow until every terminal node contains only a single observation, then each node is classified by the class of its single observation and <span class="math inline">\(\hat{R}(T)=0\)</span>.</p>
<p>Second, using only the resubstitution estimate tends to generate trees that are too big for the given data.</p>
<p>Third, the resubstitution estimate <span class="math inline">\(\hat{R}(T)\)</span> is a much-too-optimistic estimate of <span class="math inline">\(R(T)\)</span>. More realistic estimates of <span class="math inline">\(R(T)\)</span> are given below.</p>
</section>
<section id="tree-pruning" class="level2">
<h2 class="anchored" data-anchor-id="tree-pruning">Tree Pruning</h2>
<p>Since decision trees have a very high tendency to over-fit the data, a smaller tree with fewer splits may generalize better: lower variance (estimation error) at the cost of a little bias (approximation error).</p>
<p>One possible alternative to the process described above is to build the tree only so long as the decrease in the node impurity measure due to each split exceeds some (high) threshold. However, owing to the greedy nature of the splitting algorithm, this strategy is too short-sighted: a seemingly worthless split early on in the tree might be followed by a very good split, i.e., one that leads to a large reduction in impurity later on.</p>
<p>Therefore, a better strategy is to grow a very large tree <span class="math inline">\(T_0\)</span>, and then prune it back in order to obtain a subtree.</p>
<section id="cost-complexity-pruning" class="level3">
<h3 class="anchored" data-anchor-id="cost-complexity-pruning">Cost complexity pruning</h3>
<p>A sequence of trees indexed by a nonnegative tuning parameter <span class="math inline">\(\alpha\)</span> is considered. For each value of <span class="math inline">\(\alpha\)</span> there corresponds a subtree <span class="math inline">\(T\subset T_0\)</span> such that the penalized misclassification rate <span class="math display">\[
R_\alpha(T)=R(T)+\alpha|T|
\]</span> is as small as possible. Here <span class="math inline">\(|T|\)</span> indicates the number of terminal nodes of the subtree <span class="math inline">\(T\)</span>, Think of <span class="math inline">\(\alpha|T|\)</span> as a penalty term for tree size, so that <span class="math inline">\(R_\alpha(T)\)</span> penalizes <span class="math inline">\(R(T)\)</span> (<span class="math inline">\(\ref{e1}\)</span>) for generating too large a tree. For each <span class="math inline">\(\alpha\)</span>, we then choose that subtree <span class="math inline">\(T(\alpha)\)</span> of <span class="math inline">\(T_0\)</span> that minimizes <span class="math inline">\(R_\alpha(T)\)</span>.</p>
<p>The tuning parameter <span class="math inline">\(\alpha\)</span> controls a trade-off between the subtree’s complexity and its fit to the training data. When <span class="math inline">\(\alpha=0\)</span>, then the subtree <span class="math inline">\(T\)</span> will simply equal <span class="math inline">\(T_0\)</span>. As <span class="math inline">\(\alpha\)</span> increases, there is a price to pay for having a tree with many terminal nodes, and so the above equation will tend to be minimized for a smaller subtree.</p>
<p>Breiman et al. (1984) showed that for every <span class="math inline">\(\alpha\)</span>, there exists a unique smallest minimizing subtree.</p>
<p>Depending on the cost of each additional leaf (i.e., the <span class="math inline">\(\alpha\)</span> value), different sub-trees of <span class="math inline">\(T_0\)</span> minimize the error-complexity measure. Breiman and his colleagues proved that, although <span class="math inline">\(\alpha\)</span> can run through a continuum of values, there is a sequence of pruned trees such that each element is optimal for a range of <span class="math inline">\(\alpha\)</span>, and so there is only a finite number of interesting <span class="math inline">\(\alpha\)</span> values:</p>
<p><span class="math display">\[
0 = \alpha_0 < \alpha_1 < \alpha_2 < \alpha_3 < \cdots < \alpha_M,
\]</span></p>
<p>Furthermore, they developed an algorithm that generates the corresponding nested family of pruned trees, each a subtree of the previous one,</p>
<p><span class="math display">\[
T_0 \succ T_1 \succ T_2 \succ T_3 \succ \cdots \succ T_M,
\]</span></p>
<p>such that each <span class="math inline">\(T_i\)</span> in the sequence is characterised by adifferent value <span class="math inline">\(\alpha_i\)</span>. They proved that each tree <span class="math inline">\(T_i\)</span> in this sequence is optimal from the error-complexity perspective within the interval <span class="math inline">\([\alpha_i,\alpha_{i+1})\)</span>.</p>
<p>So far, we have constructed a finite sequence of decreasing-size subtrees <span class="math inline">\(T_1,T_2,T_3,\dots,T_M\)</span> by pruning more and more nodes from <span class="math inline">\(T_0\)</span>. When do we stop pruning? Which subtree of the sequence do we choose as the “best” pruned subtree? The choice of the best subtree depends upon having a good estimate of the misclassification rate <span class="math inline">\(R(T_k)\)</span> corresponding to the subtree <span class="math inline">\(T_k\)</span>. Breiman et al. (1984) offered two estimation methods: use an independent test sample or use cross-validation. When the data set is very large, use of an independent test set is straightforward and computationally efficient, and is, generally, the preferred estimation method. For smaller data sets, cross-validation is preferred.</p>
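<p>In R, this whole pipeline — grow a large <span class="math inline">\(T_0\)</span>, compute the pruning sequence, and pick a subtree by cross-validation — is available through <code>rpart</code>, whose complexity parameter <code>cp</code> plays the role of a rescaled <span class="math inline">\(\alpha\)</span>. A sketch:</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">library(rpart)

# Grow a deliberately large tree T0 (cp = 0 disables the complexity stop)
fit &lt;- rpart(Species ~ ., data = iris, method = "class",
             control = rpart.control(cp = 0, minsplit = 2))

printcp(fit)   # cp table: one row per subtree in the pruning sequence,
               # with cross-validated error (xerror) for each

# Prune back to the subtree with the smallest cross-validated error
best_cp &lt;- fit$cptable[which.min(fit$cptable[, "xerror"]), "CP"]
pruned  &lt;- prune(fit, cp = best_cp)</code></pre></div>
<p>(The common “one standard error” variant instead picks the smallest subtree whose <code>xerror</code> is within one <code>xstd</code> of the minimum.)</p>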
</section>
</section>
<section id="advantages-and-disadvantages-of-trees" class="level2">
<h2 class="anchored" data-anchor-id="advantages-and-disadvantages-of-trees">Advantages and disadvantages of trees</h2>
<ol type="1">
<li>Trees are very easy to explain to people. In fact, they are even easier to explain than linear regression!</li>
<li>Some people believe that decision trees more closely mirror human decision-making than do the regression and classification approaches.</li>
<li>Trees can be displayed graphically, and are easily interpreted even by a non-expert (especially if they are small).</li>
<li>Trees can easily handle qualitative predictors without the need to create dummy variables.</li>
<li>Unfortunately, trees generally do not have the same level of predictive accuracy as some of the other regression and classification approaches.</li>
</ol>
<p>However, by aggregating many decision trees, using methods like bagging, random forests, and boosting, the predictive performance of trees can be substantially improved. We introduce these concepts next.</p>
</section>
</section>
<section id="bibliography" class="level1">
<h1>Bibliography</h1>
<p>L. Breiman, J.H. Friedman, R.A. Olshen and C.J. Stone. Classification and Regression Trees. Wadsworth, 1984.</p>
<p>A. Criminisi, J. Shotton and E. Konukoglu. Decision Forests for Classification, Regression, Density Estimation, Manifold Learning and Semi-Supervised Learning. Microsoft Research Technical Report TR-2011-114, 2011.</p>
<p>T. Hastie, R. Tibshirani and J. Friedman. The Elements of Statistical Learning: Data Mining, Inference, and Prediction. Second Edition. Springer, 2009.</p>
</section>
</main>
<!-- /main column -->
<script id="quarto-html-after-body" type="application/javascript">
window.document.addEventListener("DOMContentLoaded", function (event) {
const toggleBodyColorMode = (bsSheetEl) => {
const mode = bsSheetEl.getAttribute("data-mode");
const bodyEl = window.document.querySelector("body");
if (mode === "dark") {
bodyEl.classList.add("quarto-dark");
bodyEl.classList.remove("quarto-light");
} else {
bodyEl.classList.add("quarto-light");
bodyEl.classList.remove("quarto-dark");
}
}
const toggleBodyColorPrimary = () => {
const bsSheetEl = window.document.querySelector("link#quarto-bootstrap");
if (bsSheetEl) {
toggleBodyColorMode(bsSheetEl);
}
}
toggleBodyColorPrimary();
const icon = "";
const anchorJS = new window.AnchorJS();
anchorJS.options = {
placement: 'right',
icon: icon
};
anchorJS.add('.anchored');
const isCodeAnnotation = (el) => {
for (const clz of el.classList) {
if (clz.startsWith('code-annotation-')) {
return true;
}
}
return false;
}
const clipboard = new window.ClipboardJS('.code-copy-button', {
text: function(trigger) {
const codeEl = trigger.previousElementSibling.cloneNode(true);
for (const childEl of codeEl.children) {
if (isCodeAnnotation(childEl)) {
childEl.remove();
}
}
return codeEl.innerText;
}
});
clipboard.on('success', function(e) {
// button target
const button = e.trigger;
// don't keep focus
button.blur();
// flash "checked"
button.classList.add('code-copy-button-checked');
var currentTitle = button.getAttribute("title");
button.setAttribute("title", "Copied!");
let tooltip;
if (window.bootstrap) {
button.setAttribute("data-bs-toggle", "tooltip");
button.setAttribute("data-bs-placement", "left");
button.setAttribute("data-bs-title", "Copied!");
tooltip = new bootstrap.Tooltip(button,
{ trigger: "manual",
customClass: "code-copy-button-tooltip",
offset: [0, -8]});
tooltip.show();
}
setTimeout(function() {
if (tooltip) {
tooltip.hide();
button.removeAttribute("data-bs-title");
button.removeAttribute("data-bs-toggle");
button.removeAttribute("data-bs-placement");
}
button.setAttribute("title", currentTitle);
button.classList.remove('code-copy-button-checked');
}, 1000);
// clear code selection
e.clearSelection();
});
function tippyHover(el, contentFn, onTriggerFn, onUntriggerFn) {
const config = {
allowHTML: true,
maxWidth: 500,
delay: 100,
arrow: false,
appendTo: function(el) {
return el.parentElement;
},
interactive: true,
interactiveBorder: 10,
theme: 'quarto',
placement: 'bottom-start',
};
if (contentFn) {
config.content = contentFn;
}
if (onTriggerFn) {
config.onTrigger = onTriggerFn;
}
if (onUntriggerFn) {
config.onUntrigger = onUntriggerFn;
}
window.tippy(el, config);
}
const noterefs = window.document.querySelectorAll('a[role="doc-noteref"]');
for (var i=0; i<noterefs.length; i++) {
const ref = noterefs[i];
tippyHover(ref, function() {
// use id or data attribute instead here
let href = ref.getAttribute('data-footnote-href') || ref.getAttribute('href');
try { href = new URL(href).hash; } catch {}
const id = href.replace(/^#\/?/, "");
const note = window.document.getElementById(id);
return note.innerHTML;
});
}
const xrefs = window.document.querySelectorAll('a.quarto-xref');
const processXRef = (id, note) => {
// Strip column container classes
const stripColumnClz = (el) => {
el.classList.remove("page-full", "page-columns");
if (el.children) {
for (const child of el.children) {
stripColumnClz(child);
}
}
}
stripColumnClz(note)
if (id === null || id.startsWith('sec-')) {
// Special case sections, only their first couple elements
const container = document.createElement("div");
if (note.children && note.children.length > 2) {
container.appendChild(note.children[0].cloneNode(true));
for (let i = 1; i < note.children.length; i++) {
const child = note.children[i];
if (child.tagName === "P" && child.innerText === "") {
continue;
} else {
container.appendChild(child.cloneNode(true));
break;
}
}
if (window.Quarto?.typesetMath) {
window.Quarto.typesetMath(container);
}
return container.innerHTML
} else {
if (window.Quarto?.typesetMath) {
window.Quarto.typesetMath(note);
}
return note.innerHTML;
}
} else {
// Remove any anchor links if they are present
const anchorLink = note.querySelector('a.anchorjs-link');
if (anchorLink) {
anchorLink.remove();
}
if (window.Quarto?.typesetMath) {
window.Quarto.typesetMath(note);
}
// TODO in 1.5, we should make sure this works without a callout special case
if (note.classList.contains("callout")) {
return note.outerHTML;
} else {
return note.innerHTML;
}
}
}
for (var i=0; i<xrefs.length; i++) {
const xref = xrefs[i];
tippyHover(xref, undefined, function(instance) {
instance.disable();
let url = xref.getAttribute('href');
let hash = undefined;
if (url.startsWith('#')) {
hash = url;
} else {
try { hash = new URL(url).hash; } catch {}
}
if (hash) {
const id = hash.replace(/^#\/?/, "");
const note = window.document.getElementById(id);
if (note !== null) {
try {
const html = processXRef(id, note.cloneNode(true));
instance.setContent(html);
} finally {
instance.enable();
instance.show();
}
} else {
// See if we can fetch this
fetch(url.split('#')[0])
.then(res => res.text())
.then(html => {
const parser = new DOMParser();
const htmlDoc = parser.parseFromString(html, "text/html");
const note = htmlDoc.getElementById(id);
if (note !== null) {
const html = processXRef(id, note);
instance.setContent(html);
}
}).finally(() => {
instance.enable();
instance.show();
});
}
} else {
// See if we can fetch a full url (with no hash to target)
// This is a special case and we should probably do some content thinning / targeting
fetch(url)
.then(res => res.text())
.then(html => {
const parser = new DOMParser();
const htmlDoc = parser.parseFromString(html, "text/html");
const note = htmlDoc.querySelector('main.content');
if (note !== null) {
// This should only happen for chapter cross references
// (since there is no id in the URL)
// remove the first header
if (note.children.length > 0 && note.children[0].tagName === "HEADER") {
note.children[0].remove();
}
const html = processXRef(null, note);
instance.setContent(html);
}
}).finally(() => {
instance.enable();
instance.show();
});
}
}, function(instance) {
});
}
let selectedAnnoteEl;
const selectorForAnnotation = ( cell, annotation) => {
let cellAttr = 'data-code-cell="' + cell + '"';
let lineAttr = 'data-code-annotation="' + annotation + '"';
const selector = 'span[' + cellAttr + '][' + lineAttr + ']';
return selector;
}
const selectCodeLines = (annoteEl) => {
const doc = window.document;
const targetCell = annoteEl.getAttribute("data-target-cell");
const targetAnnotation = annoteEl.getAttribute("data-target-annotation");
const annoteSpan = window.document.querySelector(selectorForAnnotation(targetCell, targetAnnotation));
const lines = annoteSpan.getAttribute("data-code-lines").split(",");
const lineIds = lines.map((line) => {
return targetCell + "-" + line;
})
let top = null;
let height = null;
let parent = null;
if (lineIds.length > 0) {
//compute the position of the single el (top and bottom and make a div)
const el = window.document.getElementById(lineIds[0]);
top = el.offsetTop;
height = el.offsetHeight;
parent = el.parentElement.parentElement;
if (lineIds.length > 1) {
const lastEl = window.document.getElementById(lineIds[lineIds.length - 1]);
const bottom = lastEl.offsetTop + lastEl.offsetHeight;
height = bottom - top;
}
if (top !== null && height !== null && parent !== null) {
// cook up a div (if necessary) and position it
let div = window.document.getElementById("code-annotation-line-highlight");
if (div === null) {
div = window.document.createElement("div");
div.setAttribute("id", "code-annotation-line-highlight");
div.style.position = 'absolute';
parent.appendChild(div);
}
div.style.top = top - 2 + "px";
div.style.height = height + 4 + "px";
div.style.left = 0;
let gutterDiv = window.document.getElementById("code-annotation-line-highlight-gutter");
if (gutterDiv === null) {
gutterDiv = window.document.createElement("div");
gutterDiv.setAttribute("id", "code-annotation-line-highlight-gutter");
gutterDiv.style.position = 'absolute';
const codeCell = window.document.getElementById(targetCell);
const gutter = codeCell.querySelector('.code-annotation-gutter');
gutter.appendChild(gutterDiv);
}
gutterDiv.style.top = top - 2 + "px";
gutterDiv.style.height = height + 4 + "px";
}
selectedAnnoteEl = annoteEl;
}
};
const unselectCodeLines = () => {
const elementsIds = ["code-annotation-line-highlight", "code-annotation-line-highlight-gutter"];
elementsIds.forEach((elId) => {
const div = window.document.getElementById(elId);
if (div) {
div.remove();
}
});
selectedAnnoteEl = undefined;
};
// Handle positioning of the toggle
window.addEventListener(
"resize",
throttle(() => {
elRect = undefined;
if (selectedAnnoteEl) {
selectCodeLines(selectedAnnoteEl);
}
}, 10)
);
function throttle(fn, ms) {
let throttle = false;
let timer;
return (...args) => {
if(!throttle) { // first call gets through
fn.apply(this, args);
throttle = true;
} else { // all the others get throttled
if(timer) clearTimeout(timer); // cancel #2
timer = setTimeout(() => {
fn.apply(this, args);
timer = throttle = false;
}, ms);
}
};
}
// Attach click handler to the DT
const annoteDls = window.document.querySelectorAll('dt[data-target-cell]');
for (const annoteDlNode of annoteDls) {
annoteDlNode.addEventListener('click', (event) => {
const clickedEl = event.target;
if (clickedEl !== selectedAnnoteEl) {
unselectCodeLines();
const activeEl = window.document.querySelector('dt[data-target-cell].code-annotation-active');
if (activeEl) {
activeEl.classList.remove('code-annotation-active');
}
selectCodeLines(clickedEl);
clickedEl.classList.add('code-annotation-active');
} else {
// Unselect the line
unselectCodeLines();
clickedEl.classList.remove('code-annotation-active');
}
});
}
const findCites = (el) => {
const parentEl = el.parentElement;
if (parentEl) {
const cites = parentEl.dataset.cites;
if (cites) {
return {
el,
cites: cites.split(' ')
};
} else {
return findCites(el.parentElement)
}
} else {
return undefined;
}
};
var bibliorefs = window.document.querySelectorAll('a[role="doc-biblioref"]');
for (var i=0; i<bibliorefs.length; i++) {
const ref = bibliorefs[i];
const citeInfo = findCites(ref);
if (citeInfo) {
tippyHover(citeInfo.el, function() {
var popup = window.document.createElement('div');
citeInfo.cites.forEach(function(cite) {
var citeDiv = window.document.createElement('div');
citeDiv.classList.add('hanging-indent');
citeDiv.classList.add('csl-entry');
var biblioDiv = window.document.getElementById('ref-' + cite);
if (biblioDiv) {
citeDiv.innerHTML = biblioDiv.innerHTML;
}
popup.appendChild(citeDiv);
});
return popup.innerHTML;
});
}
}
});
</script>
</div> <!-- /content -->
</body></html>