From 08c76d36ab80dca76f6afc631604422b60f1e3fd Mon Sep 17 00:00:00 2001 From: Wei Wei Date: Thu, 29 Sep 2016 17:39:28 -0400 Subject: [PATCH 1/2] Here is my markdown document! --- Class 7 Instructions.Rmd | 25 +++++++++++++++---------- Class7.Rproj | 13 +++++++++++++ 2 files changed, 28 insertions(+), 10 deletions(-) create mode 100644 Class7.Rproj diff --git a/Class 7 Instructions.Rmd b/Class 7 Instructions.Rmd index 5ae641a..cec4556 100644 --- a/Class 7 Instructions.Rmd +++ b/Class 7 Instructions.Rmd @@ -18,7 +18,7 @@ library(tidyr, dplyr) ##Upload wide format instructor data (instructor_activity_wide.csv) ```{r} -data_wide <- read.table("~/Documents/NYU/EDCT2550/Assignments/Assignment 3/instructor_activity_wide.csv", sep = ",", header = TRUE) +data_wide <- read.table("instructor_activity_wide.csv", sep = ",", header = TRUE) #Now view the data you have uploaded and notice how its structure: each variable is a date and each row is a type of measure. View(data_wide) @@ -59,9 +59,11 @@ instructor_data <- spread(data_long, variables, measure) ##Now we have a workable instructor data set!The next step is to create a workable student data set. Upload the data "student_activity.csv". View your file once you have uploaded it and then draw on a piece of paper the structure that you want before you attempt to code it. Write the code you use in the chunk below. (Hint: you can do it in one step) ```{r} - +student_data <- read.table("student_activity.csv", sep = ",", header = TRUE) +student_data <- spread(student_data, variable, measure) ``` + ##Now that you have workable student data set, subset it to create a data set that only includes data from the second class. To do this we will use the dplyr package (We will need to call dplyr in the command by writing dplyr:: because dplyr uses commands that exist in other packages but to do different operations.) 
@@ -75,7 +77,7 @@ student_data_2 <- dplyr::filter(student_data, date == 20160204) Now subset the student_activity data frame to create a data frame that only includes students who have sat at table 4. Write your code in the following chunk: ```{r} - +student_data_3 <- dplyr::filter(student_data, table == 4) ``` ##Make a new variable @@ -89,18 +91,18 @@ instructor_data <- dplyr::mutate(instructor_data, total_sleep = s_deep + s_light Now, refering to the cheat sheet, create a data frame called "instructor_sleep" that contains ONLY the total_sleep variable. Write your code in the following code chunk: ```{r} - +instructor_sleep <- dplyr::select(instructor_data, total_sleep) ``` Now, we can combine several commands together to create a new variable that contains a grouping. The following code creates a weekly grouping variable called "week" in the instructor data set: -```{r} +```{r, eval=FLASE} instructor_data <- dplyr::mutate(instructor_data, week = dplyr::ntile(date, 3)) ``` Create the same variables for the student data frame, write your code in the code chunk below: -```{r} - +```{r, eval=FLASE} +student_data <- dplyr::mutate(student_data, week = dplyr::ntile(date, 3)) ``` ##Sumaraizing @@ -117,7 +119,8 @@ student_data %>% dplyr::group_by(date) %>% dplyr::summarise(mean(motivation)) Create two new data sets using this method. One that sumarizes average motivation for students for each week (student_week) and another than sumarizes "m_active_time" for the instructor per week (instructor_week). 
Write your code in the following chunk: ```{r} - +student_week <- student_data %>% dplyr::group_by(week) %>% dplyr::summarise(mean(motivation)) +instructor_week <- instructor_data %>% dplyr::group_by(week) %>% dplyr::summarise(mean(m_active_time)) ``` ##Merging @@ -130,8 +133,10 @@ merge <- dplyr::full_join(instructor_week, student_week, "week") ##Visualize Visualize the relationship between these two variables (mean motivation and mean instructor activity) with the "plot" command and then run a Pearson correlation test (hint: cor.test()). Write the code for the these commands below: -```{r} - +```{r, eval=FLASE} +names(merge)<-c("week", "avg_student", "avg_instructor") +plot(merge$avg_student, merge$avg_instructor) +cor.test(merge$avg_student, merge$avg_instructor) ``` Fnally save your markdown document and your plot to this folder and comit, push and pull your repo to submit. diff --git a/Class7.Rproj b/Class7.Rproj new file mode 100644 index 0000000..8e3c2eb --- /dev/null +++ b/Class7.Rproj @@ -0,0 +1,13 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX From 5b79f005dc06027fdc2c8ee8836913f2af953125 Mon Sep 17 00:00:00 2001 From: Wei Wei Date: Thu, 29 Sep 2016 20:04:30 -0400 Subject: [PATCH 2/2] Here are my markdown document and the plot! 
--- .gitignore | 4 + Class 7 Instructions.Rmd | 36 +++--- Class_7_Instructions.html | 249 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 271 insertions(+), 18 deletions(-) create mode 100644 .gitignore create mode 100644 Class_7_Instructions.html diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5b6a065 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +.Rproj.user +.Rhistory +.RData +.Ruserdata diff --git a/Class 7 Instructions.Rmd b/Class 7 Instructions.Rmd index cec4556..cf31fc0 100644 --- a/Class 7 Instructions.Rmd +++ b/Class 7 Instructions.Rmd @@ -9,7 +9,7 @@ date: "February 13, 2016" ##Install packages for manipulating data We will use two packages: tidyr and dplyr -```{r} +```{r, eval=FALSE} #Insall packages install.packages("tidyr", "dplyr") #Load packages @@ -17,7 +17,7 @@ library(tidyr, dplyr) ``` ##Upload wide format instructor data (instructor_activity_wide.csv) -```{r} +```{r, eval=FALSE} data_wide <- read.table("instructor_activity_wide.csv", sep = ",", header = TRUE) #Now view the data you have uploaded and notice how its structure: each variable is a date and each row is a type of measure. @@ -37,7 +37,7 @@ The gather command requires the following input arguments: - value: Name of new value column - ...: Names of source columns that contain values -```{r} +```{r, eval=FALSE} data_long <- gather(data_wide, date, variables) #Rename the variables so we don't get confused about what is what! names(data_long) <- c("variables", "date", "measure") @@ -52,13 +52,13 @@ The spread function requires the following input: - key: Name of column containing the new column names - value: Name of column containing values -```{r} +```{r, eval=FALSE} instructor_data <- spread(data_long, variables, measure) ``` ##Now we have a workable instructor data set!The next step is to create a workable student data set. Upload the data "student_activity.csv". 
View your file once you have uploaded it and then draw on a piece of paper the structure that you want before you attempt to code it. Write the code you use in the chunk below. (Hint: you can do it in one step) -```{r} +```{r, eval=FALSE} student_data <- read.table("student_activity.csv", sep = ",", header = TRUE) student_data <- spread(student_data, variable, measure) ``` @@ -70,13 +70,13 @@ To do this we will use the dplyr package (We will need to call dplyr in the comm Notice that the way we subset is with a logical rule, in this case date == 20160204. In R, when we want to say that something "equals" something else we need to use a double equals sign "==". (A single equals sign means the same as <-). -```{r} +```{r, eval=FALSE} student_data_2 <- dplyr::filter(student_data, date == 20160204) ``` Now subset the student_activity data frame to create a data frame that only includes students who have sat at table 4. Write your code in the following chunk: -```{r} +```{r, eval=FALSE} student_data_3 <- dplyr::filter(student_data, table == 4) ``` @@ -84,31 +84,31 @@ student_data_3 <- dplyr::filter(student_data, table == 4) It is useful to be able to make new variables for analysis. We can either apend a new variable to our dataframe or we can replace some variables with a new variable. Below we will use the "mutate" function to create a new variable "total_sleep" from the light and deep sleep variables in the instructor data. -```{r} +```{r, eval=FALSE} instructor_data <- dplyr::mutate(instructor_data, total_sleep = s_deep + s_light) ``` Now, refering to the cheat sheet, create a data frame called "instructor_sleep" that contains ONLY the total_sleep variable. Write your code in the following code chunk: -```{r} +```{r, eval=FALSE} instructor_sleep <- dplyr::select(instructor_data, total_sleep) ``` Now, we can combine several commands together to create a new variable that contains a grouping. 
The following code creates a weekly grouping variable called "week" in the instructor data set: -```{r, eval=FLASE} +```{r, eval=FALSE} instructor_data <- dplyr::mutate(instructor_data, week = dplyr::ntile(date, 3)) ``` Create the same variables for the student data frame, write your code in the code chunk below: -```{r, eval=FLASE} +```{r, eval=FALSE} student_data <- dplyr::mutate(student_data, week = dplyr::ntile(date, 3)) ``` ##Sumaraizing Next we will summarize the student data. First we can simply take an average of one of our student variables such as motivation: -```{r} +```{r, eval=FALSE} student_data %>% dplyr::summarise(mean(motivation)) #That isn't super interesting, so let's break it down by week: @@ -118,7 +118,7 @@ student_data %>% dplyr::group_by(date) %>% dplyr::summarise(mean(motivation)) Create two new data sets using this method. One that sumarizes average motivation for students for each week (student_week) and another than sumarizes "m_active_time" for the instructor per week (instructor_week). Write your code in the following chunk: -```{r} +```{r, eval=FALSE} student_week <- student_data %>% dplyr::group_by(week) %>% dplyr::summarise(mean(motivation)) instructor_week <- instructor_data %>% dplyr::group_by(week) %>% dplyr::summarise(mean(m_active_time)) ``` @@ -126,17 +126,17 @@ instructor_week <- instructor_data %>% dplyr::group_by(week) %>% dplyr::summaris ##Merging Now we will merge these two data frames using dplyr. -```{r} +```{r,eval=FALSE } merge <- dplyr::full_join(instructor_week, student_week, "week") ``` ##Visualize Visualize the relationship between these two variables (mean motivation and mean instructor activity) with the "plot" command and then run a Pearson correlation test (hint: cor.test()). 
Write the code for the these commands below: -```{r, eval=FLASE} -names(merge)<-c("week", "avg_student", "avg_instructor") -plot(merge$avg_student, merge$avg_instructor) -cor.test(merge$avg_student, merge$avg_instructor) +```{r, eval=FALSE} +names(merge)<-c("week", "student_avg", "instructor_avg") +plot(merge$student_avg, merge$instructor_avg) +cor.test(merge$student_avg, merge$instructor_avg) ``` Fnally save your markdown document and your plot to this folder and comit, push and pull your repo to submit. diff --git a/Class_7_Instructions.html b/Class_7_Instructions.html new file mode 100644 index 0000000..81ee06e --- /dev/null +++ b/Class_7_Instructions.html @@ -0,0 +1,249 @@ + + + + + + + + + + + + + + + +Assignment 3 + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + +
+

In this assignment you will be practising data tidying. You will be using the data we have collected from class and data generated from the instructor wearing a wristband activity tracker.

+
+
+

First, you need to import into R a data set containing information about Charles’ activity for the last three weeks. You can find this data set within the Assignment 3 repository you cloned to create this project.

+
+
+

Install packages for manipulating data

+

We will use two packages: tidyr and dplyr

+
#Insall packages
+install.packages("tidyr", "dplyr")
+#Load packages
+library(tidyr, dplyr)
+
+
+

Upload wide format instructor data (instructor_activity_wide.csv)

+
data_wide <- read.table("instructor_activity_wide.csv", sep = ",", header = TRUE)
+
+#Now view the data you have uploaded and notice how its structure: each variable is a date and each row is a type of measure.
+View(data_wide)
+
+#R doesn't like having variable names that consist only of numbers so, as you can see, every variable starts with the letter "X". The numbers represent dates in the format year-month-day.
+
+
+

This is not a convenient format for us to analyze. What we need is for each type of measure to be a column. Your first task is to convert wide format to long format data. To do this we will use the “gather” function: gather(data, time, variables)

+

The gather command requires the following input arguments:

+
    +
  • data: Data object
  • +
  • key: Name of new key column (made from names of data columns)
  • +
  • value: Name of new value column
  • +
  • …: Names of source columns that contain values
  • +
+
data_long <- gather(data_wide, date, variables)
+#Rename the variables so we don't get confused about what is what!
+names(data_long) <- c("variables", "date", "measure")
+#Take a look at your new data, looks weird huh?
+View(data_long)
+
+
+

Now convert this long format into separate columns using the “spread” function to separate by the type of measure

+

The spread function requires the following input:

+
    +
  • data: Data object
  • +
  • key: Name of column containing the new column names
  • +
  • value: Name of column containing values
  • +
+
instructor_data <- spread(data_long, variables, measure)
+
+
+

Now we have a workable instructor data set! The next step is to create a workable student data set. Upload the data “student_activity.csv”. View your file once you have uploaded it and then draw on a piece of paper the structure that you want before you attempt to code it. Write the code you use in the chunk below. (Hint: you can do it in one step)

+
student_data <- read.table("student_activity.csv", sep = ",", header = TRUE)
+student_data <- spread(student_data, variable, measure)
+
+
+

Now that you have a workable student data set, subset it to create a data set that only includes data from the second class.

+

To do this we will use the dplyr package. (We will need to call dplyr explicitly in each command by writing dplyr:: because dplyr uses command names that also exist in other packages, where they perform different operations.)

+

Notice that the way we subset is with a logical rule, in this case date == 20160204. In R, when we want to say that something “equals” something else we need to use a double equals sign “==”. (A single equals sign means the same as <-).

+
student_data_2 <- dplyr::filter(student_data, date == 20160204)
+

Now subset the student_activity data frame to create a data frame that only includes students who have sat at table 4. Write your code in the following chunk:

+
student_data_3 <- dplyr::filter(student_data, table == 4)
+
+
+

Make a new variable

+

It is useful to be able to make new variables for analysis. We can either append a new variable to our dataframe or we can replace some variables with a new variable. Below we will use the “mutate” function to create a new variable “total_sleep” from the light and deep sleep variables in the instructor data.

+
instructor_data <- dplyr::mutate(instructor_data, total_sleep = s_deep + s_light)
+

Now, referring to the cheat sheet, create a data frame called “instructor_sleep” that contains ONLY the total_sleep variable. Write your code in the following code chunk:

+
instructor_sleep <- dplyr::select(instructor_data, total_sleep)
+

Now, we can combine several commands together to create a new variable that contains a grouping. The following code creates a weekly grouping variable called “week” in the instructor data set:

+
instructor_data <- dplyr::mutate(instructor_data, week = dplyr::ntile(date, 3))
+

Create the same variable for the student data frame. Write your code in the code chunk below:

+
student_data <- dplyr::mutate(student_data, week = dplyr::ntile(date, 3))
+
+
+

Summarizing

+

Next we will summarize the student data. First we can simply take an average of one of our student variables such as motivation:

+
student_data %>% dplyr::summarise(mean(motivation))
+
+#That isn't super interesting, so let's break it down by week:
+
+student_data %>% dplyr::group_by(date) %>% dplyr::summarise(mean(motivation))
+

Create two new data sets using this method: one that summarizes average motivation for students for each week (student_week) and another that summarizes “m_active_time” for the instructor per week (instructor_week). Write your code in the following chunk:

+
student_week <- student_data %>% dplyr::group_by(week) %>% dplyr::summarise(mean(motivation))
+instructor_week <- instructor_data %>% dplyr::group_by(week) %>% dplyr::summarise(mean(m_active_time))
+
+
+

Merging

+

Now we will merge these two data frames using dplyr.

+
merge <- dplyr::full_join(instructor_week, student_week, "week")
+
+
+

Visualize

+

Visualize the relationship between these two variables (mean motivation and mean instructor activity) with the “plot” command and then run a Pearson correlation test (hint: cor.test()). Write the code for these commands below:

+
names(merge)<-c("week", "student_avg", "instructor_avg")
+plot(merge$student_avg, merge$instructor_avg)
+cor.test(merge$student_avg, merge$instructor_avg)
+

Finally, save your markdown document and your plot to this folder and commit, push and pull your repo to submit.

+
+ + + + +
+ + + + + + + +