<!-- index.html -->

<!DOCTYPE html>
<html lang="zh-Hans">
<head>
  <!-- Document metadata and asset wiring for the slidify (io2012) deck. -->
  <!-- The charset comes first so the encoding is known before the parser
       reaches the non-ASCII title below. -->
  <meta charset="utf-8">
  <title>R 语言中级培训</title>
  <meta name="description" content="R 语言中级培训">
  <meta name="author" content="Copyright @Transwarp Inc. | All Rights Reserved">
  <meta name="generator" content="slidify">
  <meta name="apple-mobile-web-app-capable" content="yes">
  <meta http-equiv="X-UA-Compatible" content="chrome=1">

  <!-- io2012 framework styles; phone.css applies only on small devices. -->
  <link rel="stylesheet" href="./libraries/frameworks/io2012/css/default.css" media="all">
  <link rel="stylesheet" href="./libraries/frameworks/io2012/css/phone.css"
    media="only screen and (max-device-width: 480px)">
  <link rel="stylesheet" href="./libraries/frameworks/io2012/css/slidify.css">
  <!-- Syntax-highlighting theme for the R code samples. -->
  <link rel="stylesheet" href="./libraries/highlighters/highlight.js/css/zenburn.css">
  <base target="_blank"> <!-- This amazingness opens all links in a new tab. -->

  <!-- Grab CDN jQuery, fall back to local if offline.
       The CDN is requested over HTTPS so the script is not blocked as mixed
       content when the deck itself is served over HTTPS. -->
  <script src="https://ajax.aspnetcdn.com/ajax/jQuery/jquery-1.7.min.js"></script>
  <!-- If the CDN request failed, window.jQuery is undefined and the local copy is written in. -->
  <script>window.jQuery || document.write('<script src="./libraries/widgets/quiz/js/jquery.js"><\/script>')</script>
  <!-- RequireJS bootstraps the slide engine named in data-main. -->
  <script data-main="./libraries/frameworks/io2012/js/slides"
    src="./libraries/frameworks/io2012/js/require-1.0.8.min.js">
  </script>

</head>
<body style="opacity: 0">
  <slides class="layout-widescreen">
    
    <!-- LOGO SLIDE -->
        <slide class="title-slide segue nobackground">
  <hgroup class="auto-fadein">
    <h1>R 语言中级培训</h1>
    <h2>星环数据挖掘组_daitao.xing</h2>
    <p>Copyright @Transwarp Inc. | All Rights Reserved<br/></p>
  </hgroup>
  <article></article>  
</slide>
    

    <!-- SLIDES -->
    <slide class="" id="slide-1" style="background:;">
  <article data-timings="">
    
  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-2" style="background:;">
  <hgroup>
    <h2>目录</h2>
  </hgroup>
  <article data-timings="">
    <ul>
<li>R的核心</li>
<li>替代For的Apply家族函数</li>
<li>管道操作</li>
<li>数据获取</li>
<li>R高效数据处理</li>
<li>其他有关数据科学的topic </li>
<li>深入了解数据:ggplot2</li>
</ul>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="segue dark" id="slide-3" style="background:;">
  <hgroup>
    <h2>R的核心:函数式编程思想</h2>
  </hgroup>
  <article data-timings="">
    
  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-4" style="background:;">
  <hgroup>
    <h2>FP</h2>
  </hgroup>
  <article data-timings="">
    <pre><code class="r">1+2
</code></pre>

<pre><code>## [1] 3
</code></pre>

<pre><code class="r">&#39;+&#39;(1,2)
</code></pre>

<pre><code>## [1] 3
</code></pre>

<pre><code class="r">funs &lt;- c(mean, median, sd, mad, IQR)
</code></pre>

<ul>
<li>Anonymous functions </li>
<li>Closures</li>
<li>List of functions</li>
</ul>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-5" style="background:;">
  <article data-timings="">
    <pre><code class="r">summary &lt;- function(x) {
 c(mean(x, na.rm = TRUE),
   median(x, na.rm = TRUE),
   sd(x, na.rm = TRUE),
   mad(x, na.rm = TRUE),
   IQR(x, na.rm = TRUE))
}
summary &lt;- function(x) {
  funs &lt;- c(mean, median, sd, mad, IQR)
  lapply(funs, function(f) f(x, na.rm = TRUE))
}
</code></pre>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-6" style="background:;">
  <hgroup>
    <h2>闭包</h2>
  </hgroup>
  <article data-timings="">
    <pre><code class="r">power &lt;- function(exponent) {
  function(x) {
    x ^ exponent
  }
}
</code></pre>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-7" style="background:;">
  <hgroup>
    <h2>Lists of functions</h2>
  </hgroup>
  <article data-timings="">
    <pre><code class="r">compute_mean &lt;- list(
  base = function(x) mean(x),
  sum = function(x) sum(x) / length(x),
  manual = function(x) {
    total &lt;- 0
    n &lt;- length(x)
    for (i in seq_along(x)) {
      total &lt;- total + x[i] / n
    }
    total
  }
)

x &lt;- runif(1e5)
compute_mean$sum(x)
compute_mean[[&quot;sum&quot;]](x)
</code></pre>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-8" style="background:;">
  <hgroup>
    <h2>FP的效率</h2>
  </hgroup>
  <article data-timings="">
    <pre><code class="r">x &lt;- c()
system.time({
    for(i in 1:100000){
      if(i %% 2 ==0)
        x &lt;- c(x,i)
    } 
})
</code></pre>

<pre><code>##    user  system elapsed 
##   4.475   0.608   5.089
</code></pre>

<pre><code class="r">system.time({
  x &lt;- 1:100000
  x[x %% 2 == 0]
})
</code></pre>

<pre><code>##    user  system elapsed 
##   0.002   0.001   0.003
</code></pre>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="segue dark" id="slide-9" style="background:;">
  <hgroup>
    <h2>R中的管道操作</h2>
  </hgroup>
  <article data-timings="">
    
  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-10" style="background:;">
  <hgroup>
    <h2>管道操作</h2>
  </hgroup>
  <article data-timings="">
    <ul>
<li>shell |</li>
<li>magrittr 和 pipeR(renkun)</li>
<li>上一步输出为下一步的输入</li>
<li>比管道更加灵活（主动判断应该填入的位置）</li>
<li>%&gt;% 将结果输送到函数的第一个参数</li>
<li>%&gt;&gt;% 将结果输送到表达式中.的位置</li>
<li>recharts</li>
</ul>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-11" style="background:;">
  <hgroup>
    <h2>自定义的管道</h2>
  </hgroup>
  <article data-timings="">
    <pre><code class="r">`%^_^%` &lt;- function(from,to) {
  cat(paste(from,&quot;smiles to&quot;,to))
}
&quot;Ken&quot; %^_^% &quot;Jenny&quot;
</code></pre>

<pre><code>## Ken smiles to Jenny
</code></pre>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-12" style="background:;">
  <article data-timings="">
    <pre><code class="r">library(magrittr)
library(pipeR)
rnorm(10000,mean=10,sd=1) %&gt;&gt;% 
  sample(.,size=length(.)*0.1,replace=FALSE) %&gt;% 
  log %&gt;%
  diff %&gt;&gt;%
  plot(.,col=&quot;red&quot;,type=&quot;l&quot;,
    main=sprintf(&quot;length: %d&quot;,length(.)))
</code></pre>

<p><img src="assets/fig/unnamed-chunk-8-1.png" alt="抽样数据取对数并差分后的红色折线图"> </p>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="segue dark" id="slide-13" style="background:;">
  <hgroup>
    <h2>R apply家族</h2>
  </hgroup>
  <article data-timings="">
    
  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-14" style="background:;">
  <hgroup>
    <h2>替代for的高效函数</h2>
  </hgroup>
  <article data-timings="">
    <ul>
<li>apply</li>
<li>mapply</li>
<li>lapply</li>
<li>sapply</li>
</ul>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-15" style="background:;">
  <hgroup>
    <h2>apply(X,margin,FUN)</h2>
  </hgroup>
  <article data-timings="">
    <p>对对象的每一个部分施加函数</p>

<pre><code class="r">apply(iris[,1:3],2,max)
</code></pre>

<pre><code>## Sepal.Length  Sepal.Width Petal.Length 
##          7.9          4.4          6.9
</code></pre>

<p>也可以对高维数组操作</p>

<pre><code class="r">x &lt;- 1:27
dim(x) &lt;- c(3,3,3)
apply(x, c(1,2), FUN = paste,collapse =&quot;,&quot;)
</code></pre>

<pre><code>##      [,1]      [,2]      [,3]     
## [1,] &quot;1,10,19&quot; &quot;4,13,22&quot; &quot;7,16,25&quot;
## [2,] &quot;2,11,20&quot; &quot;5,14,23&quot; &quot;8,17,26&quot;
## [3,] &quot;3,12,21&quot; &quot;6,15,24&quot; &quot;9,18,27&quot;
</code></pre>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-16" style="background:;">
  <hgroup>
    <h2>lapply(X,FUN)和sapply(X,FUN)</h2>
  </hgroup>
  <article data-timings="">
    <p>对对象的每一个元素进行操作(当对象是DF时,逐列进行)</p>

<pre><code class="r">temp &lt;- iris[,1:3]
head(sapply(temp,as.character))
</code></pre>

<pre><code>##      Sepal.Length Sepal.Width Petal.Length
## [1,] &quot;5.1&quot;        &quot;3.5&quot;       &quot;1.4&quot;       
## [2,] &quot;4.9&quot;        &quot;3&quot;         &quot;1.4&quot;       
## [3,] &quot;4.7&quot;        &quot;3.2&quot;       &quot;1.3&quot;       
## [4,] &quot;4.6&quot;        &quot;3.1&quot;       &quot;1.5&quot;       
## [5,] &quot;5&quot;          &quot;3.6&quot;       &quot;1.4&quot;       
## [6,] &quot;5.4&quot;        &quot;3.9&quot;       &quot;1.7&quot;
</code></pre>

<p>借助高效的管道函数，我们可以构造出更灵活的用法</p>

<pre><code class="r">sapply(1:3, . %&gt;% seq_len %&gt;% sum)
</code></pre>

<pre><code>## [1] 1 3 6
</code></pre>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-17" style="background:;">
  <article data-timings="">
    <pre><code class="r">funs2 &lt;- list(
  sum = function(x, ...) sum(x, ..., na.rm = TRUE),
  mean = function(x, ...) mean(x, ..., na.rm = TRUE),
  median = function(x, ...) median(x, ..., na.rm = TRUE)
)
lapply(funs2, function(f) f(x))
</code></pre>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-18" style="background:;">
  <hgroup>
    <h2>mapply(FUN, ..., MoreArgs = NULL, ...)</h2>
  </hgroup>
  <article data-timings="">
    <ul>
<li>对多个对象逐个元素进行操作(比如对DF中的多列同时操作)</li>
</ul>

<pre><code class="r">mapply(paste,
       1:5,letters[1:5],LETTERS[1:5],
       MoreArgs = list(sep=&#39;-&#39;))
</code></pre>

<pre><code>## [1] &quot;1-a-A&quot; &quot;2-b-B&quot; &quot;3-c-C&quot; &quot;4-d-D&quot; &quot;5-e-E&quot;
</code></pre>

<ul>
<li>当函数FUN需要多个参数输入时</li>
</ul>

<pre><code class="r">test &lt;- c(&quot;0&quot;,&quot;01&quot;,&quot;002&quot;)
res &lt;- mapply(function(x, y) paste0(rep(x, y), collapse = &quot;&quot;), 0, 3- nchar(test))
paste0(res,test)
</code></pre>

<pre><code>## [1] &quot;000&quot; &quot;001&quot; &quot;002&quot;
</code></pre>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-19" style="background:;">
  <hgroup>
    <h2>rollapply</h2>
  </hgroup>
  <article data-timings="">
    <pre><code class="r">library(zoo)
</code></pre>

<pre><code>## 
## Attaching package: &#39;zoo&#39;
## 
## The following objects are masked from &#39;package:base&#39;:
## 
##     as.Date, as.Date.numeric
</code></pre>

<pre><code class="r">z &lt;- rnorm(6)
rollapply(z, 2, sum)
</code></pre>

<pre><code>## [1]  0.8225873  2.0785559 -0.6813391 -1.9719372 -0.4199782
</code></pre>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-20" style="background:;">
  <hgroup>
    <h2>案例1：bootstrap 抽样</h2>
  </hgroup>
  <article data-timings="">
    <pre><code class="r">boot_lm &lt;- function(formula,data,...){
    function(){
      lm(formula = formula,
         data = data[sample(nrow(data),replace = T),],...)
    }
}

iris_boot &lt;- boot_lm(Sepal.Length ~ Petal.Length,iris)
bstrap &lt;- sapply(X= 1:1000,
                 FUN = function(x) iris_boot()$coef)
apply(bstrap,MARGIN = 1,FUN = quantile,prob =c(0.025,0.5,0.975))
</code></pre>

<pre><code>##       (Intercept) Petal.Length
## 2.5%     4.163022    0.3718770
## 50%      4.305461    0.4095615
## 97.5%    4.446607    0.4439611
</code></pre>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-21" style="background:;">
  <hgroup>
    <h2>案例2：担保链分析中的递归</h2>
  </hgroup>
  <article data-timings="">
    <pre><code class="r">findCon = function(uid,friends.connected,friends.whole){
  tmp=c()
  for(i in friends.connected){
    for(j in 1:length(i)){
      tmp1 = unique(rbind(friends.whole[friends.whole$to == i[j],],friends.whole[friends.whole$from == i[j],]))
      tmp = unique(rbind(tmp,tmp1))
    }
  }
  if(dim(unique(rbind(tmp,friends.connected)))[1] == dim(friends.connected)[1]){
    return(friends.connected)
  }else {
    friends.connected=unique(rbind(tmp,friends.connected))
    findCon(uid, friends.connected,friends.whole)
  }
}
uid=&quot;20111214000138&quot;
system.time({
friends.connected = unique(rbind(friends.whole[friends.whole$to ==uid,],friends.whole[friends.whole$from == uid,]))
friends.1=findCon(uid,friends.connected,friends.whole)
friends.1=friends.1[complete.cases(friends.1),]
})
</code></pre>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-22" style="background:;">
  <article data-timings="">
    <pre><code class="r">friends.whole&lt;-read.table(&quot;data.csv&quot;,header=T,sep=&quot;,&quot;,col.names=c(&quot;from&quot;,&quot;to&quot;))
findCon1 = function(friends.connected,friends.whole){
  index &lt;- 1:nrow(friends.whole)
  len1 &lt;- nrow(as.data.frame(friends.connected))
  index_target &lt;- unique(unlist(as.list((friends.connected))))
  tmp &lt;- sapply(friends.whole,function(x,y) y[x %in% index_target],index) %&gt;%
                unlist() %&gt;%
                unique()
  friends.connected &lt;- friends.whole[tmp,]
  if(nrow(friends.connected) == len1){
    return(friends.connected)
  }else {
    findCon1(friends.connected,friends.whole)
  }
}
uid=&quot;20111214000138&quot;
system.time({friends.1=findCon1(uid,friends.whole) %&gt;%
            unique()
})
</code></pre>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-23" style="background:;">
  <article data-timings="">
    <p><img src="./assets/fig/xiaoguo.png" alt="效果对比"></p>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="segue dark" id="slide-24" style="background:;">
  <hgroup>
    <h2>R的数据获取，web scraping</h2>
  </hgroup>
  <article data-timings="">
    
  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-25" style="background:;">
  <article data-timings="">
    <ul>
<li>XML package 结构化网页数据抓取</li>
<li>rvest package(Hadley god) （非结构化网页数据抓取）</li>
</ul>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-26" style="background:;">
  <hgroup>
    <h2>结构化网页数据抓取:XML package</h2>
  </hgroup>
  <article data-timings="">
    <pre><code class="r">library(XML)
url &lt;- &#39;http://www.basketball-reference.com/teams/NYK/2015_games.html&#39; 
tables &lt;- readHTMLTable(url,
                        stringsAsFactors = FALSE,
                        header=F)
data &lt;- tables[[1]]
head(data,2)
</code></pre>

<pre><code>##   V1                V2        V3 V4        V5 V6                  V7 V8 V9
## 1  1 Wed, Oct 29, 2014 8:00p EST    Box Score          Chicago Bulls  L   
## 2  2 Thu, Oct 30, 2014 8:00p EST    Box Score  @ Cleveland Cavaliers  W   
##   V10 V11 V12 V13 V14 V15
## 1  80 104   0   1 L 1    
## 2  95  90   1   1 W 1
</code></pre>

<p>查看网页上的超链接</p>

<pre><code class="r">getHTMLLinks(url)
</code></pre>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-27" style="background:;">
  <hgroup>
    <h2>非结构化数据的获取:rvest package</h2>
  </hgroup>
  <article data-timings="">
    <p><a href="https://www.w3schools.com/cssref/css_selectors.asp">CSS 选择器参考</a></p>

<pre><code class="r">library(rvest)
freak &lt;- html_session(&quot;http://torrentfreak.com/top-10-most-pirated-movies-of-the-week-130304/&quot;)
freak %&gt;% 
  html_nodes(&quot;.widg-topcomments-post-title&quot;) %&gt;% 
  html_text() %&gt;% .[1:2]
</code></pre>

<pre><code>## [1] &quot;Transmission Releases Long-Awaited BitTorrent Client For Windows&quot;
## [2] &quot;Pirated ‘Star Wars: The Force Awakens’ Blu-Ray Leaks Online&quot;
</code></pre>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="segue dark" id="slide-28" style="background:;">
  <hgroup>
    <h2>特征工程和数据预处理</h2>
  </hgroup>
  <article data-timings="">
    
  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-29" style="background:;">
  <hgroup>
    <h2>reshape2</h2>
  </hgroup>
  <article data-timings="">
    
  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-30" style="background:;">
  <hgroup>
    <h2>数据的两种形状</h2>
  </hgroup>
  <article data-timings="">
    <p>统计中待分析的数据框通常有两种形式</p>

<ul>
<li>长型数据（堆叠数据），长型数据是各变量取值在一列中，而对应的变量名在另一列。</li>
<li>宽型数据（非堆叠数据），宽型数据一般是各变量取值类型一致，而变量以不同列的形式构成。</li>
</ul>

<p>例如iris的前四列子集即是一个典型的宽型数据。下面先将宽型数据转为长型数据，再将其还原为宽型数据：</p>

<pre><code class="r">data_w &lt;- iris[,1:4]
data_l &lt;- stack(data_w)
data_w &lt;- unstack(data_l)
</code></pre>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-31" style="background:;">
  <hgroup>
    <h2>数据的两种形状</h2>
  </hgroup>
  <article data-timings="">
    <p>只要在一列中存在分类变量，都可以将其看作是长型数据。在上例中iris的前四列可以看作是宽型数据，但最后两列可以看作是一个长型数据。可以根据Species变量将数据转为宽型。并得到各花种类的平均值。</p>

<pre><code class="r">subdata &lt;- iris[,4:5]
data_w &lt;- unstack(subdata)
colMeans(data_w)
</code></pre>

<pre><code>##     setosa versicolor  virginica 
##      0.246      1.326      2.026
</code></pre>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-32" style="background:;">
  <hgroup>
    <h2>数据重塑计算</h2>
  </hgroup>
  <article data-timings="">
    <p>在实践中这种单纯的长宽格式互转并不多见，因为我们并不是需要不同的数据格式，而需要不同格式下的分析结果。在上例中我们先转换数据格式再计算分析结果，而更常见的是一步直接得到分析结果。此时我们需要的是更为强大的reshape2包。</p>

<pre><code class="r">library(reshape2)
dcast(data=subdata,             # 分析对象
      formula=Species~.,        # 数据分组的方式
      value.var=&#39;Petal.Width&#39;,  # 要计算的数值对象
      fun=mean)                 # 计算用函数名
</code></pre>

<pre><code>##      Species     .
## 1     setosa 0.246
## 2 versicolor 1.326
## 3  virginica 2.026
</code></pre>

<p>dcast的思路和aggregate很相似，都是根据变量切分数据，再对分组后的数据进行计算，但dcast的输出格式和功能在多维情况下要方便很多。</p>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-33" style="background:;">
  <hgroup>
    <h2>数据重塑计算</h2>
  </hgroup>
  <article data-timings="">
    <p>reshape2包中另一个重要的函数是melt函数，它将一个宽型数据融合成一个长型数据。例如我们将iris数据集进行融合。</p>

<pre><code class="r">iris_long &lt;- melt(data=iris,     # 要融合的对象
                  id=&#39;Species&#39;)  # 哪些变量不参与到融合中
</code></pre>

<p>一个纯粹的长型数据，只包含一个数值变量，其它均为分类变量。而一个纯粹的宽型数据，则不包含分类变量，均为数值变量。而现实中你遇到要处理的数据，则多半是二者的混杂，正如iris数据集那样。</p>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-34" style="background:;">
  <hgroup>
    <h2>数据重塑计算</h2>
  </hgroup>
  <article data-timings="">
    <p>melt和dcast正如同是铁匠的两种得力工具，melt可以看作是炼炉，负责融合数据，成为一个纯粹的长型。而dcast则可以看作是铁锤，负责重铸数据，使之成为需要的格式，同时加以分析。下面的例子就是将之前生成的数据进行汇总计算</p>

<pre><code class="r">dcast(data=iris_long,
      formula=Species~variable,
      value.var=&#39;value&#39;,fun=mean)
</code></pre>

<pre><code>##      Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1     setosa        5.006       3.428        1.462       0.246
## 2 versicolor        5.936       2.770        4.260       1.326
## 3  virginica        6.588       2.974        5.552       2.026
</code></pre>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-35" style="background:;">
  <hgroup>
    <h2>小练习</h2>
  </hgroup>
  <article data-timings="">
    <p>tips数据集练习，它是一个餐厅侍者收集的关于小费的数据，其中包含了七个变量，包括总费用、付小费的金额、付款者性别、是否吸烟、日期、用餐时段、顾客人数。要比较不同性别顾客支付的平均小费金额是否不同，可以按sex变量汇集数据。</p>

<pre><code class="r">dcast(tips,sex~.,value.var=&#39;tip&#39;,fun=mean)
</code></pre>

<pre><code>##      sex        .
## 1 Female 2.833448
## 2   Male 3.089618
</code></pre>

<p>又或者，按sex和size变量划分数据，分别计算小费金额，可以观察到用餐人数越多时，小费相应给的越多，而且男性顾客一般会比女性顾客大方一点。</p>

<pre><code class="r">dcast(tips,sex~size,value.var=&#39;tip&#39;,fun=mean)
</code></pre>

<pre><code>##      sex        1        2        3        4    5    6
## 1 Female 1.276667 2.528448 3.250000 4.021111 5.14 4.60
## 2   Male 1.920000 2.614184 3.476667 4.172143 3.75 5.85
</code></pre>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-36" style="background:;">
  <hgroup>
    <h2>dcast函数的使用前提</h2>
  </hgroup>
  <article data-timings="">
    <ul>
<li>数据中已经存在分类变量，例如sex或者smoke</li>
<li>根据分类变量划分数据</li>
<li>再计算某个数值变量的指标</li>
</ul>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-37" style="background:;">
  <hgroup>
    <h2>更复杂的需求</h2>
  </hgroup>
  <article data-timings="">
    <p>如果我们想同时计算出不同性别顾客的小费和总费用。但现有的数据集中并没有这种分类变量，怎么处理呢？</p>

<p>一种是笨一点的方法，将前面用过的方法用两次，然后合并这两个结果。但这种方法在多变量情况下并不好。</p>

<pre><code class="r">dcast(tips,sex~.,value.var=&#39;tip&#39;,fun=mean)
</code></pre>

<pre><code>##      sex        .
## 1 Female 2.833448
## 2   Male 3.089618
</code></pre>

<pre><code class="r">dcast(tips,sex~.,value.var=&#39;total_bill&#39;,fun=mean)
</code></pre>

<pre><code>##      sex        .
## 1 Female 18.05690
## 2   Male 20.74408
</code></pre>

<p>另一种推荐的方法就是使用前面提到的melt函数，先将数据融合成纯粹的长型数据，再用dcast重铸。</p>

<pre><code class="r">tips_melt &lt;- melt(data = tips, id.vars=c(&#39;sex&#39;,&#39;smoker&#39;,&#39;time&#39;,&#39;size&#39;,&#39;day&#39;))
dcast(data = tips_melt, sex ~ variable, value.var=&#39;value&#39;,fun= mean)
</code></pre>

<pre><code>##      sex total_bill      tip
## 1 Female   18.05690 2.833448
## 2   Male   20.74408 3.089618
</code></pre>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-38" style="background:;">
  <hgroup>
    <h2>更复杂的需求</h2>
  </hgroup>
  <article data-timings="">
    <p>要同时考虑不同性别和吸烟习惯的顾客给小费的相对比例。</p>

<pre><code class="r">tips_mean &lt;- dcast(data = tips_melt, sex+ smoker~ variable, fun= mean)
tips_mean$rate &lt;- with(tips_mean,tip/total_bill)
tips_mean
</code></pre>

<pre><code>##      sex smoker total_bill      tip      rate
## 1 Female     No   18.10519 2.773519 0.1531892
## 2 Female    Yes   17.97788 2.931515 0.1630623
## 3   Male     No   19.79124 3.113402 0.1573122
## 4   Male    Yes   22.28450 3.051167 0.1369188
</code></pre>

<p>在dcast函数中的公式同时考虑到了三个分类变量，在第二步计算了小费相对于总餐费的比率，可以清楚的看到，吸烟的女性顾客相对是最大方的，而吸烟的男性则是最小气的。</p>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-39" style="background:;">
  <hgroup>
    <h2>时间相关数据的类别</h2>
  </hgroup>
  <article data-timings="">
    <ul>
<li>时间类对象，仅包含日期和时间信息的数据</li>
<li>时间序列类对象，在一个普通的数据对象上附加了时间戳的数据</li>
</ul>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-40" style="background:;">
  <hgroup>
    <h2>时间类对象</h2>
  </hgroup>
  <article data-timings="">
    <ul>
<li>简单的Date类型，只包含日期而不包含时钟信息</li>
<li>复杂的POSIXct类型。不仅包括日期还包括了时钟和时区信息。</li>
</ul>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-41" style="background:;">
  <hgroup>
    <h2>Date类型</h2>
  </hgroup>
  <article data-timings="">
    <p>数据量少的情况下，可以手工输入为字符串格式，然后转为Date类型，数据量多的话应从外部文件输入，再转为Date格式，两种方式都需要as.Date函数。</p>

<pre><code class="r">date1 &lt;- &#39;1989-05-04&#39;
date1 &lt;- as.Date(date1)
class(date1)
</code></pre>

<pre><code>## [1] &quot;Date&quot;
</code></pre>

<pre><code class="r">date1 &lt;- &#39;05/04/1989&#39;
date1 &lt;- as.Date(date1,format=&#39;%m/%d/%Y&#39;)
</code></pre>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-42" style="background:;">
  <hgroup>
    <h2>Date类型</h2>
  </hgroup>
  <article data-timings="">
    <p>通常的输入格式是用短横隔开，如果是其它格式，则在as.Date函数内需要有format参数来确定。Date类数据可以进行常规的加减和比较。</p>

<pre><code class="r">date2 &lt;- date1 + 31
date2 - date1
</code></pre>

<pre><code>## Time difference of 31 days
</code></pre>

<pre><code class="r">date2 &gt; date1
</code></pre>

<pre><code>## [1] TRUE
</code></pre>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-43" style="background:;">
  <hgroup>
    <h2>Date类型</h2>
  </hgroup>
  <article data-timings="">
    <p>时间类数据都是从1970年1月1日作为起始点计算。例如计算从那天开始直到现在的天数。</p>

<pre><code class="r">Sys.Date() - structure(0, class=&#39;Date&#39;)
</code></pre>

<pre><code>## Time difference of 16899 days
</code></pre>

<p>我们也可以创建一个日期向量，并进行计算。</p>

<pre><code class="r">dates &lt;- seq(date1, length=4, by=&#39;day&#39;)
format(dates, &#39;%w&#39;)
</code></pre>

<pre><code>## [1] &quot;4&quot; &quot;5&quot; &quot;6&quot; &quot;0&quot;
</code></pre>

<pre><code class="r">weekdays(dates)
</code></pre>

<pre><code>## [1] &quot;星期四&quot; &quot;星期五&quot; &quot;星期六&quot; &quot;星期日&quot;
</code></pre>

<p>如果需要了解更多日期的格式转换，可以参见strptime函数的帮助。</p>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-44" style="background:;">
  <hgroup>
    <h2>POSIXct类型</h2>
  </hgroup>
  <article data-timings="">
    <p>POSIXct类型的数据创建和计算是类似的。</p>

<pre><code class="r">time1 &lt;- &#39;1989-05-04&#39;
time1 &lt;- as.POSIXct(time1)
time1 &lt;- &quot;2011-03-1 01:30:00&quot;
time1 &lt;- as.POSIXct(time1,format=&quot;%Y-%m-%d %H:%M:%S&quot;)
time1 &lt;- as.POSIXct(&quot;2011-03-1 01:30:00&quot;,tz=&#39;GMT&#39;)
time2 &lt;- seq(from=time1,to=Sys.time(),by=&#39;month&#39;)
</code></pre>

<p>POSIXct类型的数据可以不包括时钟信息，或者在日期后加空格以冒号分隔时钟信息，也可以加上时区缩写。如果对输入格式有特别要求，可以使用format参数对输入格式进行设定，再行转换。</p>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-45" style="background:;">
  <hgroup>
    <h2>POSIXct类型</h2>
  </hgroup>
  <article data-timings="">
    <p>之前我们都是输入字符串再转为时间，这种方式有点繁琐，我们也可以直接从数值转为时间</p>

<pre><code class="r">time1 &lt;- ISOdatetime(2011,1,1,0,0,0)
rtimes &lt;- ISOdatetime(2013, rep(4:5,5), sample(30,10), 0, 0, 0)
</code></pre>

<p>ISOdatetime函数能将数值转为POSIXct时间对象，六个输入数值参数分别为年、月、日、时、分、秒。上面第二行代码使用了向量化特性，随机生成了10个时间。</p>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="segue dark" id="slide-46" style="background:;">
  <hgroup>
    <h2>字符串处理</h2>
  </hgroup>
  <article data-timings="">
    
  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-47" style="background:;">
  <hgroup>
    <h2>字符串处理概要</h2>
  </hgroup>
  <article data-timings="">
    <p>在文本数据挖掘日趋重要的背景下，在处理字符这种非结构化数据时，你需要能够熟练的操作字符串对象。</p>

<ul>
<li>获取字符串长度：<code>nchar()</code></li>
<li>字符串分割：<code>strsplit()</code></li>
<li>字符串拼接：<code>paste()</code></li>
<li>字符串截取：<code>substr()</code></li>
<li>字符串替代：<code>gsub()</code></li>
<li>字符串匹配：<code>grep()</code></li>
</ul>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-48" style="background:;">
  <hgroup>
    <h2>获取字符串长度</h2>
  </hgroup>
  <article data-timings="">
    <p>nchar()能够获取字符串的长度，它也支持字符串向量操作。注意它和length()的结果是有区别的。</p>

<pre><code class="r">fruit &lt;- &#39;apple orange grape banana&#39;
nchar(fruit)
</code></pre>

<pre><code>## [1] 25
</code></pre>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-49" style="background:;">
  <hgroup>
    <h2>字符串分割</h2>
  </hgroup>
  <article data-timings="">
    <p>strsplit()负责将字符串按照某种分割形式将其进行划分，需要设定分隔符。下面我们是用空格来作为分隔符将fruit分为四个元素。</p>

<pre><code class="r">strsplit(fruit,split=&#39; &#39;)
</code></pre>

<pre><code>## [[1]]
## [1] &quot;apple&quot;  &quot;orange&quot; &quot;grape&quot;  &quot;banana&quot;
</code></pre>

<pre><code class="r">fruitvec &lt;- unlist(strsplit(fruit,split=&#39; &#39;))
</code></pre>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-50" style="background:;">
  <hgroup>
    <h2>字符串拼接</h2>
  </hgroup>
  <article data-timings="">
    <p>paste()负责将若干个字符串相连结，返回成单独的字符串。其优点在于，就算有的处理对象不是字符型也能自动转为字符型。另一个相似的函数paste0是设置无需分隔符的拼接。</p>

<pre><code class="r">paste(fruitvec,collapse=&#39;,&#39;)
</code></pre>

<pre><code>## [1] &quot;apple,orange,grape,banana&quot;
</code></pre>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-51" style="background:;">
  <hgroup>
    <h2>字符串截取</h2>
  </hgroup>
  <article data-timings="">
    <p>substr()能对给定的字符串对象取出子集，其参数是子集所处的起始和终止位置。</p>

<pre><code class="r">substr(fruit, 1,5)
</code></pre>

<pre><code>## [1] &quot;apple&quot;
</code></pre>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-52" style="background:;">
  <hgroup>
    <h2>字符串替代</h2>
  </hgroup>
  <article data-timings="">
    <p>gsub()负责搜索字符串的特定表达式，并用新的内容加以替代。sub()函数是类似的，但只替代第一个发现结果。</p>

<pre><code class="r">gsub(&#39;apple&#39;,&#39;strawberry&#39;,fruit)
</code></pre>

<pre><code>## [1] &quot;strawberry orange grape banana&quot;
</code></pre>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-53" style="background:;">
  <hgroup>
    <h2>字符串匹配</h2>
  </hgroup>
  <article data-timings="">
    <p>grep()负责搜索给定字符串对象中特定表达式 ，并返回其位置索引。grepl()函数与之类似，但其后面的&quot;l&quot;则意味着返回的将是逻辑值。</p>

<pre><code class="r">grep(&#39;grape&#39;,fruitvec)
</code></pre>

<pre><code>## [1] 3
</code></pre>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-54" style="background:;">
  <hgroup>
    <h2>dplyr(<a href="http://hadley.nz/">hadley</a>)</h2>
  </hgroup>
  <article data-timings="">
    
  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-55" style="background:;">
  <article data-timings="">
    <ul>
<li>filter() (and slice())</li>
<li>arrange()</li>
<li>select() (and rename())</li>
<li>distinct()</li>
<li>mutate() (and transmute())</li>
<li>summarise()</li>
<li>sample_n() and sample_frac()</li>
</ul>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-56" style="background:;">
  <article data-timings="">
    <pre><code class="r">library(dplyr)
library(nycflights13)
dim(flights)
</code></pre>

<pre><code>## [1] 336776     16
</code></pre>

<pre><code class="r">#&gt; [1] 336776     16
head(flights)
</code></pre>

<pre><code>## Source: local data frame [6 x 16]
## 
##    year month   day dep_time dep_delay arr_time arr_delay carrier tailnum
##   (int) (int) (int)    (int)     (dbl)    (int)     (dbl)   (chr)   (chr)
## 1  2013     1     1      517         2      830        11      UA  N14228
## 2  2013     1     1      533         4      850        20      UA  N24211
## 3  2013     1     1      542         2      923        33      AA  N619AA
## 4  2013     1     1      544        -1     1004       -18      B6  N804JB
## 5  2013     1     1      554        -6      812       -25      DL  N668DN
## 6  2013     1     1      554        -4      740        12      UA  N39463
## Variables not shown: flight (int), origin (chr), dest (chr), air_time
##   (dbl), distance (dbl), hour (dbl), minute (dbl)
</code></pre>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-57" style="background:;">
  <hgroup>
    <h2>filter() 和slice()</h2>
  </hgroup>
  <article data-timings="">
    <pre><code class="r">filter(flights, month == 1, day == 1)
</code></pre>

<pre><code>## Source: local data frame [842 x 16]
## 
##     year month   day dep_time dep_delay arr_time arr_delay carrier tailnum
##    (int) (int) (int)    (int)     (dbl)    (int)     (dbl)   (chr)   (chr)
## 1   2013     1     1      517         2      830        11      UA  N14228
## 2   2013     1     1      533         4      850        20      UA  N24211
## 3   2013     1     1      542         2      923        33      AA  N619AA
## 4   2013     1     1      544        -1     1004       -18      B6  N804JB
## 5   2013     1     1      554        -6      812       -25      DL  N668DN
## 6   2013     1     1      554        -4      740        12      UA  N39463
## 7   2013     1     1      555        -5      913        19      B6  N516JB
## 8   2013     1     1      557        -3      709       -14      EV  N829AS
## 9   2013     1     1      557        -3      838        -8      B6  N593JB
## 10  2013     1     1      558        -2      753         8      AA  N3ALAA
## ..   ...   ...   ...      ...       ...      ...       ...     ...     ...
## Variables not shown: flight (int), origin (chr), dest (chr), air_time
##   (dbl), distance (dbl), hour (dbl), minute (dbl)
</code></pre>

<pre><code class="r">slice(flights,1:10)
</code></pre>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-58" style="background:;">
  <hgroup>
    <h2>arrange</h2>
  </hgroup>
  <article data-timings="">
    <pre><code class="r">arrange(flights, year, month, day)
</code></pre>

<pre><code>## Source: local data frame [336,776 x 16]
## 
##     year month   day dep_time dep_delay arr_time arr_delay carrier tailnum
##    (int) (int) (int)    (int)     (dbl)    (int)     (dbl)   (chr)   (chr)
## 1   2013     1     1      517         2      830        11      UA  N14228
## 2   2013     1     1      533         4      850        20      UA  N24211
## 3   2013     1     1      542         2      923        33      AA  N619AA
## 4   2013     1     1      544        -1     1004       -18      B6  N804JB
## 5   2013     1     1      554        -6      812       -25      DL  N668DN
## 6   2013     1     1      554        -4      740        12      UA  N39463
## 7   2013     1     1      555        -5      913        19      B6  N516JB
## 8   2013     1     1      557        -3      709       -14      EV  N829AS
## 9   2013     1     1      557        -3      838        -8      B6  N593JB
## 10  2013     1     1      558        -2      753         8      AA  N3ALAA
## ..   ...   ...   ...      ...       ...      ...       ...     ...     ...
## Variables not shown: flight (int), origin (chr), dest (chr), air_time
##   (dbl), distance (dbl), hour (dbl), minute (dbl)
</code></pre>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-59" style="background:;">
  <hgroup>
    <h2>select() (and rename())</h2>
  </hgroup>
  <article data-timings="">
    <pre><code class="r">select(flights, year:month, day)
</code></pre>

<pre><code>## Source: local data frame [336,776 x 3]
## 
##     year month   day
##    (int) (int) (int)
## 1   2013     1     1
## 2   2013     1     1
## 3   2013     1     1
## 4   2013     1     1
## 5   2013     1     1
## 6   2013     1     1
## 7   2013     1     1
## 8   2013     1     1
## 9   2013     1     1
## 10  2013     1     1
## ..   ...   ...   ...
</code></pre>

<pre><code class="r">rename(flights, tail_num = tailnum)
</code></pre>

<pre><code>## Source: local data frame [336,776 x 16]
## 
##     year month   day dep_time dep_delay arr_time arr_delay carrier
##    (int) (int) (int)    (int)     (dbl)    (int)     (dbl)   (chr)
## 1   2013     1     1      517         2      830        11      UA
## 2   2013     1     1      533         4      850        20      UA
## 3   2013     1     1      542         2      923        33      AA
## 4   2013     1     1      544        -1     1004       -18      B6
## 5   2013     1     1      554        -6      812       -25      DL
## 6   2013     1     1      554        -4      740        12      UA
## 7   2013     1     1      555        -5      913        19      B6
## 8   2013     1     1      557        -3      709       -14      EV
## 9   2013     1     1      557        -3      838        -8      B6
## 10  2013     1     1      558        -2      753         8      AA
## ..   ...   ...   ...      ...       ...      ...       ...     ...
## Variables not shown: tail_num (chr), flight (int), origin (chr), dest
##   (chr), air_time (dbl), distance (dbl), hour (dbl), minute (dbl)
</code></pre>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-60" style="background:;">
  <hgroup>
    <h2>distinct()</h2>
  </hgroup>
  <article data-timings="">
    <pre><code class="r">distinct(select(flights, tailnum))
</code></pre>

<pre><code>## Source: local data frame [4,044 x 1]
## 
##    tailnum
##      (chr)
## 1   N14228
## 2   N24211
## 3   N619AA
## 4   N804JB
## 5   N668DN
## 6   N39463
## 7   N516JB
## 8   N829AS
## 9   N593JB
## 10  N3ALAA
## ..     ...
</code></pre>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-61" style="background:;">
  <hgroup>
    <h2>mutate() (and transmute())</h2>
  </hgroup>
  <article data-timings="">
    <pre><code class="r">mutate(flights,
  gain = arr_delay - dep_delay,
  gain_per_hour = gain / (air_time / 60)
)
</code></pre>

<pre><code>## Source: local data frame [336,776 x 18]
## 
##     year month   day dep_time dep_delay arr_time arr_delay carrier tailnum
##    (int) (int) (int)    (int)     (dbl)    (int)     (dbl)   (chr)   (chr)
## 1   2013     1     1      517         2      830        11      UA  N14228
## 2   2013     1     1      533         4      850        20      UA  N24211
## 3   2013     1     1      542         2      923        33      AA  N619AA
## 4   2013     1     1      544        -1     1004       -18      B6  N804JB
## 5   2013     1     1      554        -6      812       -25      DL  N668DN
## 6   2013     1     1      554        -4      740        12      UA  N39463
## 7   2013     1     1      555        -5      913        19      B6  N516JB
## 8   2013     1     1      557        -3      709       -14      EV  N829AS
## 9   2013     1     1      557        -3      838        -8      B6  N593JB
## 10  2013     1     1      558        -2      753         8      AA  N3ALAA
## ..   ...   ...   ...      ...       ...      ...       ...     ...     ...
## Variables not shown: flight (int), origin (chr), dest (chr), air_time
##   (dbl), distance (dbl), hour (dbl), minute (dbl), gain (dbl),
##   gain_per_hour (dbl)
</code></pre>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-62" style="background:;">
  <hgroup>
    <h2>summarise()</h2>
  </hgroup>
  <article data-timings="">
    <ul>
<li>n()</li>
<li>sum()</li>
<li>mean()</li>
</ul>

<pre><code class="r">summarise(flights,
  delay = mean(dep_delay, na.rm = TRUE))
</code></pre>

<pre><code>## Source: local data frame [1 x 1]
## 
##      delay
##      (dbl)
## 1 12.63907
</code></pre>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-63" style="background:;">
  <hgroup>
    <h2>sample_n() and sample_frac()</h2>
  </hgroup>
  <article data-timings="">

<pre><code class="r">sample_n(flights, 10)
sample_frac(flights, 0.01)
</code></pre>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-64" style="background:;">
  <hgroup>
    <h2>group operations</h2>
  </hgroup>
  <article data-timings="">
    <pre><code class="r">by_tailnum &lt;- group_by(flights, tailnum)
delay &lt;- summarise(by_tailnum,
  count = n(),
  dist = mean(distance, na.rm = TRUE),
  delay = mean(arr_delay, na.rm = TRUE))
delay &lt;- filter(delay, count &gt; 20, dist &lt; 2000)
</code></pre>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-65" style="background:;">
  <hgroup>
    <h2>chain</h2>
  </hgroup>
  <article data-timings="">
    <pre><code class="r">flights %&gt;%
  group_by(year, month, day) %&gt;%
  select(arr_delay, dep_delay) %&gt;%
  summarise(
    arr = mean(arr_delay, na.rm = TRUE),
    dep = mean(dep_delay, na.rm = TRUE)
  ) %&gt;%
  filter(arr &gt; 30 | dep &gt; 30) %&gt;%
  head(3)
</code></pre>

<pre><code>## Source: local data frame [3 x 5]
## Groups: year, month [2]
## 
##    year month   day      arr      dep
##   (int) (int) (int)    (dbl)    (dbl)
## 1  2013     1    16 34.24736 24.61287
## 2  2013     1    31 32.60285 28.65836
## 3  2013     2    11 36.29009 39.07360
</code></pre>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-66" style="background:;">
  <hgroup>
    <h2>窗口函数</h2>
  </hgroup>
  <article data-timings="">
    <ul>
<li>ranking and ordering functions

<ul>
<li>row_number(),min_rank(),dense_rank(),cume_dist(),percent_rank(),ntile()</li>
<li>lead(),lag()</li>
</ul></li>
<li>累计聚合函数

<ul>
<li>cumsum(),cummin(),cummax(),cumall(),cumany(),cummean(),n(),sum()</li>
</ul></li>
</ul>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-67" style="background:;">
  <article data-timings="">
    <pre><code class="r">library(Lahman)
batting &lt;- select(tbl_df(Batting), playerID, yearID, teamID, G, AB:H) 
batting &lt;- arrange(batting, playerID, yearID, teamID)
players &lt;- group_by(batting, playerID)

# 找出每一位选手击球次数最多的两年
filter(players, min_rank(desc(H)) &lt;= 2 &amp; H &gt; 0)
# 对于每一位选手，按照每年参加过的比赛次数排序
mutate(players, G_rank = min_rank(G))

# 对于每一位选手，找出比去年表现好的年份
filter(players, G &gt; lag(G))
</code></pre>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="segue dark" id="slide-68" style="background:;">
  <hgroup>
    <h2>数据科学中的其他课题</h2>
  </hgroup>
  <article data-timings="">
    
  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-69" style="background:;">
  <hgroup>
    <h2>缺失值处理和异常值检测</h2>
  </hgroup>
  <article data-timings="">
    <ul>
<li>mice和outlier</li>
<li>缺失值处理

<ul>
<li>缺失的原因：误输入，异常值，特殊值</li>
<li>数据分布的特性</li>
<li>删除、插补、回归拟合</li>
</ul></li>
<li>异常值检测

<ul>
<li>噪音</li>
<li>特殊值</li>
</ul></li>
</ul>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="segue dark" id="slide-70" style="background:;">
  <hgroup>
    <h2>直观地了解数据:ggplot2</h2>
  </hgroup>
  <article data-timings="">
    
  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-71" style="background:;">
  <article data-timings="">
    <pre><code class="r">library(ggplot2)
ggplot(mpg,aes(hwy,cty)) +
  geom_point(aes(color =as.factor(cyl))) +
  geom_smooth(method = &#39;lm&#39;) +
  coord_cartesian()+
  theme_bw()
</code></pre>

<p><img src="assets/fig/unnamed-chunk-57-1.png" alt="plot of chunk unnamed-chunk-57"> </p>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-72" style="background:;">
  <article data-timings="">
    <pre><code class="r">ggplot(mpg,aes(hwy,cty)) +
  geom_point(aes(color =as.factor(cyl))) +
  geom_smooth(aes(color = as.factor(cyl)),method = &#39;lm&#39;)
</code></pre>

<p><img src="assets/fig/unnamed-chunk-58-1.png" alt="plot of chunk unnamed-chunk-58"> </p>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-73" style="background:;">
  <article data-timings="">
    <ul>
<li>ggplot(data,aes(x,y,...))</li>
<li>geom_*(data,aes(x,y,color,fill,shape,size,...))

<ul>
<li>geom_histogram(binwidth =5)</li>
<li>geom_bar(stat= &quot;identity&quot;,position = c(&quot;dodge&quot;,&quot;stack&quot;,&quot;fill&quot;))</li>
<li>geom_point()</li>
<li>geom_text(aes(label))</li>
<li>geom_boxplot()</li>
</ul></li>
<li>facet_*(formula) 分面（按网格画图）

<ul>
<li>facet_grid(x~y) </li>
<li>facet_wrap(~x)</li>
</ul></li>
</ul>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-74" style="background:;">
  <article data-timings="">
    <pre><code class="r">ggplot(mtcars) +
  geom_histogram(aes(mpg),binwidth = 2)
</code></pre>

<p><img src="assets/fig/unnamed-chunk-60-1.png" alt="plot of chunk unnamed-chunk-60"> </p>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-75" style="background:;">
  <article data-timings="">
    <pre><code class="r">ggplot(mtcars) + 
  geom_bar(aes(as.factor(cyl),fill = as.factor(gear)))
</code></pre>

<p><img src="assets/fig/unnamed-chunk-61-1.png" alt="plot of chunk unnamed-chunk-61"> </p>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-76" style="background:;">
  <article data-timings="">
    <pre><code class="r">ggplot(mtcars) + 
  geom_bar(aes(as.factor(cyl),fill = as.factor(gear)),position = &quot;fill&quot;)
</code></pre>

<p><img src="assets/fig/unnamed-chunk-62-1.png" alt="plot of chunk unnamed-chunk-62"> </p>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-77" style="background:;">
  <article data-timings="">
    <pre><code class="r">data &lt;- table(as.factor(mtcars$cyl),as.factor(mtcars$gear))
data 
</code></pre>

<pre><code>##    
##      3  4  5
##   4  1  8  2
##   6  2  4  1
##   8 12  0  2
</code></pre>

<pre><code class="r">data &lt;- as.data.frame(data)
head(data)
</code></pre>

<pre><code>##   Var1 Var2 Freq
## 1    4    3    1
## 2    6    3    2
## 3    8    3   12
## 4    4    4    8
## 5    6    4    4
## 6    8    4    0
</code></pre>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-78" style="background:;">
  <article data-timings="">
    <pre><code class="r">ggplot(data) + 
  geom_bar(aes(x= Var1,y = Freq,fill =Var2),stat = &quot;identity&quot;)
</code></pre>

<p><img src="assets/fig/unnamed-chunk-64-1.png" alt="plot of chunk unnamed-chunk-64"> </p>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-79" style="background:;">
  <article data-timings="">
    <pre><code class="r">ggplot(mtcars) +
  geom_boxplot(aes(x = as.factor(cyl),y = mpg))
</code></pre>

<p><img src="assets/fig/unnamed-chunk-65-1.png" alt="plot of chunk unnamed-chunk-65"> </p>

  </article>
  <!-- Presenter Notes -->
</slide>

<slide class="" id="slide-80" style="background:;">
  <article data-timings="">
    <pre><code class="r">ggplot(mtcars) +
  geom_point(aes(x = mpg,y = disp,color = as.factor(vs))) +
  facet_grid(cyl~gear)
</code></pre>

<p><img src="assets/fig/unnamed-chunk-66-1.png" alt="plot of chunk unnamed-chunk-66"> </p>

  </article>
  <!-- Presenter Notes -->
</slide>

    <slide class="backdrop"></slide>
  </slides>
  <div class="pagination pagination-small" id='io2012-ptoc' style="display:none;">
    <ul>
      <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=1 title='NA'>
         1
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=2 title='目录'>
         2
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=3 title='R的核心:函数式编程思想'>
         3
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=4 title='FP'>
         4
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=5 title='NA'>
         5
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=6 title='闭包'>
         6
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=7 title='Lists of functions'>
         7
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=8 title='FP的效率'>
         8
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=9 title='R中的管道操作'>
         9
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=10 title='管道操作'>
         10
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=11 title='自定义的管道'>
         11
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=12 title='NA'>
         12
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=13 title='R apply家族'>
         13
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=14 title='替代for的高效函数'>
         14
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=15 title='apply(X,margin,FUN)'>
         15
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=16 title='lapply(X,FUN)和sapply(X,FUN)'>
         16
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=17 title='NA'>
         17
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=18 title='Mapply(FUN,....,MoreArg = ,....)'>
         18
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=19 title='rollapply'>
         19
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=20 title='案例1：bootstrap 抽样'>
         20
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=21 title='案例2：担保链分析中的递归'>
         21
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=22 title='NA'>
         22
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=23 title='NA'>
         23
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=24 title='R的数据获取，web scraping'>
         24
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=25 title='NA'>
         25
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=26 title='结构化网页数据抓取:XML package'>
         26
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=27 title='非结构化数据的获取:rvest package'>
         27
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=28 title='特征工程和数据预处理'>
         28
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=29 title='reshape2'>
         29
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=30 title='数据的两种形状'>
         30
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=31 title='数据的两种形状'>
         31
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=32 title='数据重塑计算'>
         32
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=33 title='数据重塑计算'>
         33
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=34 title='数据重塑计算'>
         34
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=35 title='小练习'>
         35
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=36 title='dcast函数的使用前提'>
         36
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=37 title='更复杂的需求'>
         37
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=38 title='更复杂的需求'>
         38
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=39 title='时间相关数据的类别'>
         39
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=40 title='时间类对象'>
         40
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=41 title='Date类型'>
         41
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=42 title='Date类型'>
         42
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=43 title='Date类型'>
         43
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=44 title='POSIXct类型'>
         44
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=45 title='POSIXct类型'>
         45
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=46 title='字符串处理'>
         46
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=47 title='字符串处理概要'>
         47
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=48 title='获取字符串长度'>
         48
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=49 title='字符串分割'>
         49
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=50 title='字符串拼接'>
         50
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=51 title='字符串截取'>
         51
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=52 title='字符串替代'>
         52
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=53 title='字符串匹配'>
         53
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=54 title='dplyr(hadley)'>
         54
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=55 title='NA'>
         55
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=56 title='NA'>
         56
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=57 title='filter() 和slice()'>
         57
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=58 title='arrange'>
         58
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=59 title='select() (and rename())'>
         59
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=60 title='distinct()'>
         60
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=61 title='mutate() (and transmute())'>
         61
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=62 title='summarise()'>
         62
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=63 title='NA'>
         63
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=64 title='group operations'>
         64
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=65 title='chain'>
         65
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=66 title='窗口函数'>
         66
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=67 title='NA'>
         67
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=68 title='数据科学中的其他课题'>
         68
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=69 title='缺失值处理和异常值检测'>
         69
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=70 title='直观地了解数据:ggplot2'>
         70
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=71 title='NA'>
         71
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=72 title='NA'>
         72
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=73 title='NA'>
         73
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=74 title='NA'>
         74
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=75 title='NA'>
         75
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=76 title='NA'>
         76
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=77 title='NA'>
         77
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=78 title='NA'>
         78
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=79 title='NA'>
         79
      </a>
    </li>
    <li>
      <a href="#" target="_self" rel='tooltip' 
        data-slide=80 title='NA'>
         80
      </a>
    </li>
  </ul>
  </div>  <!--[if IE]>
    <script 
      src="http://ajax.googleapis.com/ajax/libs/chrome-frame/1/CFInstall.min.js">  
    </script>
    <script>CFInstall.check({mode: 'overlay'});</script>
  <![endif]-->
  <!-- Load Javascripts for Widgets -->
  
  <!-- LOAD HIGHLIGHTER JS FILES -->
  <script src="./libraries/highlighters/highlight.js/highlight.pack.js"></script>
  <script>hljs.initHighlightingOnLoad();</script>
  <!-- DONE LOADING HIGHLIGHTER JS FILES -->
   
</body>
</html>