-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathimport_sql_export.R
134 lines (106 loc) · 4.08 KB
/
import_sql_export.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# Purpose: Sample code for reading in data, using SQL in R, and exporting
# Programmer: Ben Porter
# Date: 2/3/2014
# github link: https://github.com/benporter/R-import-sql-export/tree/master
# Load Libraries
library(XLConnect) #needed if using the readWorksheetFromFile() or writeWorksheetToFile() function
library(xlsx) #needed if using the read.xlsx() function
library(sqldf) #needed if using the sqldf() function
############################
# Part 1: Reading in Data
############################
# prints the current working directory
getwd()
# sets the current working directory
setwd("C:/Users/nbkznwu/Documents/R/QMAP Training Course/sample files/SAT Data/")
getwd()
#lists out the files in the current working directory
list.files()
# Import 1: XLConnect method for reading in .XLSX files
df <- readWorksheetFromFile(file="SAT Scores 2012 by State.xlsx",
sheet = "rdata",
header = TRUE)
# Import 2: xlsx library method for reading in .XLSX files
df <- read.xlsx(file="SAT Scores 2012 by State.xlsx",
sheetName="rdata")
# Import 3: CSV files (no additional packages needed)
df <- read.csv(file="SAT Scores 2012 by State.csv")
# Import 4: TXT files, tab delimited
df <- read.table(file="SAT Scores 2012 by State.txt",
header=TRUE,
sep="\t")
###########################################
# Part 2: Inspect and Clean Up the Data
###########################################
#inspect data
nrow(df) # number of rows
ncol(df) # number of columns
head(df) # first 6 rows
tail(df) # last 6 rows
df # prints entire dataset
# counts the number of missing values for each column
sapply(df, function(x) sum(is.na(x)))
# counts the number of null values for each column
sapply(df, function(x) sum(is.null(x)))
# counts the number of empty string values for each column
sapply(df, function(x) sum(x==""))
# compute summary statistics on each column
summary(df)
# SQL doesn't like periods in column names, so use gsub() to replace . with _
colnames(df)
colnames(df) <- gsub(colnames(df),pattern=".",replacement="_",fixed = TRUE)
colnames(df)
###########################################
# Part 3: Use SQL Against the Data
###########################################
# Notes:
# sqldf uses sqlite syntax
# use double quotes around the entire sql statement and single quotes around strings
# Count the number of records
sqldf("select count(*) as record_count
from df")
# get a string of the column names, comma separated
columnList <- paste(colnames(df),collapse=", ")
columnList
# count the unique number of records
# example of a sub-query and example of using variables in the query using the paste() function
sqldf(paste("select count(*)
from ( select count(*) as record_count
from df
group by " , columnList,
")"
)
)
# create a table, called df_nc where ST=NC
df_nc <- sqldf("select *,
1 as beststate
from df
where State='North Carolina'")
# left join example, orginal dataset and NC datatset
df_joined <- sqldf("select df.*,
df_nc.beststate
from df
left join
df_nc on df.State=df_nc.State")
# case logic example, used to change NA's to 0's
df_joined <- sqldf("select *,
case when beststate = 1 then 1
else 0
end as beststate_cleaned
from df_joined")
###########################################
# Part 4: Export the data
###########################################
# XLConnect library method to write an .XLSX file
writeWorksheetToFile(file = "SAT_output_file.xlsx",
data = df_joined,
sheet = "CustomTabName",
startRow = 1)
# Built-in method to write a .CSV file
write.table(x=df_joined,
file = "SAT_output_file.csv",
sep = ",")
# Built-in method to write a tab delimited .txt file
write.table(x=df_joined,
file = "SAT_output_file.txt",
sep = "\t")