-
Notifications
You must be signed in to change notification settings - Fork 0
/
comparing adjacent counties separate script.R
247 lines (205 loc) · 11.9 KB
/
comparing adjacent counties separate script.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
#doing the same process as "comparing adjacent counties.R", but for all the files in the fullStateComparisons folder
#hit the Source button in the top right to make the inputs work correctly
#have FIPSInfo pulled up so that you can search for counties that don't have a FIPS match
library(jsonlite)
library(stringr)
library(dplyr)
library(readr)
library(ggplot2)
library(colorspace)
# readIn <- read_json(r"(.\adjacentCountiesData\tennessee & virginia vs kentucky.txt)")
# readIn <- read_json(r"(.\adjacentCountiesData\fullStateComparisons\tennessee vs kentucky full.txt)")
readline(prompt="press enter") #stops the other readline from screwing up
# colorspace <- colorspace::hclwizard(gui = "tcltk")
paths <- list.files(path = "./adjacentCountiesData/fullStateComparisons", pattern = ".txt?")
FIPSInfo <- read_delim("adjacentCountiesData/www2.census.gov_geo_docs_reference_codes2020_national_county2020.txt",
delim = "|", escape_double = FALSE, trim_ws = TRUE)
#remove "County" from the end of each
FIPSInfo$COUNTYNAME <- str_remove(FIPSInfo$COUNTYNAME, " County$")
#create a combined fips column
FIPSInfo$COMBINEDFIPS <- as.numeric(paste(sep = "", FIPSInfo$STATEFP, FIPSInfo$COUNTYFP))
for(path in paths){
readIn <- read_json(paste(sep = "", "./adjacentCountiesData/fullStateComparisons/", path))
pathNoTxt <- substr(path, 1, nchar(path)-4)
#create a standardized list of counties based on the MapChart JSON file
#comes from https://www.mapchart.net/usa-counties.html
countiesList <- readIn$groups[[1]]$paths
counties <- data.frame(countyName = character(), stateAbb = character())
for(county in countiesList){
#have to do a dumb split for counties with "St." in them like St. Louis, because they're stored as St__Louis__MN which splits wrong
split <- str_split(county, "__")
lengthSplit <- length(split[[1]])
counties <- counties %>% add_row(countyName = paste(split[[1]][1:lengthSplit-1], collapse=". "), stateAbb = split[[1]][lengthSplit])
}
#construct a list of matching FIPS just based on state and county matches
#prints what counties aren't found so you can manually search later
fipMatches <- list()
for(i in 1:length(counties[[1]])){
FIP <- -1
state <- counties[i,2]
county <- counties[i,1]
county <- str_replace_all(county, "_", " ")
# print(paste("searching for", state, county))
filter <- FIPSInfo %>% filter(STATE == state) %>% filter(COUNTYNAME == county)
#a match was found if the length is 1
if(dim(filter)[1] == 1){
# print(paste("match found", filter$COMBINEDFIPS[1]))
FIP <- filter$COMBINEDFIPS[1]
} else {
print(paste("no match found for" , county, state))
counties$fipMatches[counties$countyName==county] <- readline(prompt = paste(sep="", "what is the FIPS id for: ", county, ",", state, "? "))
Sys.sleep(1) #stop readline from skipping, maybe?
}
fipMatches <- c(fipMatches, FIP)
}
counties$fipMatches <- as.numeric(fipMatches)
#adding norton's FIP
# counties$fipMatches[39] <- 51720
# counties$fipMatches[counties$countyName=="Van_Buren"] <- 47175
#########################################
#creating plots for HDM and insurance levels based on the MapQuest group
# hrtDisease <- read_csv("./formattedData/formattedAtlasDatawStateSorted.csv")
# percentInsured <- read_csv("./formattedData/percentInsured.csv")
didTable <- read_csv("./formattedData/DidTable.csv")
# hrtDiseasefiltered <- filter(hrtDisease, cnty_fips %in% counties$fipMatches)
# percentInsuredfiltered <- filter(percentInsured, combinedfips %in% counties$fipMatches)
#filter didTable to only include counties in the MapQuest group
didTablefiltered <- filter(didTable, GEO_ID %in% counties$fipMatches)
#split them based on their expansion status
# unique(didTablefiltered$YEAR_TREATED)
expandedList <- filter(didTablefiltered, YEAR_TREATED == 2014)
notExpandedList <- filter(didTablefiltered, (YEAR_TREATED > 2018 | YEAR_TREATED == 0))
paste("# of unintentional exclusions:", dim(didTablefiltered)[1] - dim(expandedList)[1] - dim(notExpandedList)[1])
expandedList$GROUP <- "Expanded Medicaid"
notExpandedList$GROUP <- "Did Not Expand Medicaid"
combinedList <- rbind(expandedList, notExpandedList)
combinedList$YEAR <- as.character(combinedList$YEAR) #fixes graph
years <- unique(combinedList$YEAR)
hdmPlotSeries <- data.frame(year = character(), meanValue = numeric(), sd = numeric(), group = character())
groups <- c("Expanded Medicaid", "Did Not Expand Medicaid")
for(y in years){
for(g in groups){
filteredByYearGroup <- combinedList %>% filter(YEAR == y & GROUP == g)
hdmPlotSeries <- add_row(hdmPlotSeries, year = y, meanValue = mean(filteredByYearGroup$HRTDISEASE), sd = sd(filteredByYearGroup$HRTDISEASE), group = g)
}
}
# ggplot(hdmPlotSeries, aes(x = year, y = meanValue, group = group))
cbPalette <- c("#E69F00", "#56B4E9", "#009E73", "#CC79A7", "#F0E442", "#0072B2", "#D55E00", "#999999")
ggplot(hdmPlotSeries, aes(x = year, y = meanValue, group = group, color = group)) + geom_line(linewidth = 2.4) + geom_point(
aes(fill = group),
size = 5,
pch = 21, # Type of point that allows us to have both color (border) and fill.
colour = "#FFFFFF",
stroke = 1 # The width of the border, i.e. stroke.
) + labs(title = paste(sep="", "Average Heart Disease Mortality of Selected Counties Based on Medicaid Expansion Status, 2010-2019"),
x = "Year",
y = "Age-standardized Mortality Rate per 100,000") + theme(plot.title = element_text(size=8)
) + scale_fill_manual(values=cbPalette) + scale_colour_manual(values=cbPalette)
ggsave(paste(sep = "", "figures/Comparing Adjacent States/", "hdmPlot", pathNoTxt, ".pdf"), # jpg, png, eps, tex, etc.
plot = last_plot(), # or an explicit ggplot object name,
width = 7, height = 5,
units = "in", # other options c("in", "cm", "mm"),
dpi = 300)
years <- unique(combinedList$YEAR)
insurancePlotSeries <- data.frame(year = character(), meanValue = numeric(), group = character())
groups <- c("Expanded Medicaid", "Did Not Expand Medicaid")
for(y in years){
for(g in groups){
filteredByYearGroup <- combinedList %>% filter(YEAR == y & GROUP == g)
insurancePlotSeries <- add_row(insurancePlotSeries, year = y, meanValue = mean(filteredByYearGroup$INSURANCERATE), group = g)
}
}
ggplot(insurancePlotSeries, aes(x = year, y = meanValue, group = group, color = group)) + geom_line(linewidth = 2.4) + geom_point(
aes(fill = group),
size = 5,
pch = 21, # Type of point that allows us to have both color (border) and fill.
colour = "#FFFFFF",
stroke = 1 # The width of the border, i.e. stroke.
) + labs(title = paste(sep="", "Average Percent of People Covered by Insurance of Selected Counties Based on Medicaid Expansion Status, 2010-2019"),
x = "Year",
y = "% Covered by Insurance") + theme(plot.title = element_text(size=8)
) + scale_fill_manual(values=cbPalette) + scale_colour_manual(values=cbPalette)
ggsave(paste(sep = "", "figures/Comparing Adjacent States/", "insurancePlot", pathNoTxt, ".pdf"), # jpg, png, eps, tex, etc.
plot = last_plot(), # or an explicit ggplot object name,
width = 7, height = 5,
units = "in", # other options c("in", "cm", "mm"),
dpi = 300)
############################################
#plotting HDM and insurance on maps
library(usmap)
#bing AI code btw, fun!
#find the change in HDM from 2010 to 2019, then plot it on the map
# Filter the rows where YEAR is either 2010 or 2019
df <- filter(combinedList, YEAR %in% c(2010,2019))
# Group by GEO_ID and calculate the difference in HRTDISEASE between 2010 and 2019
df <- df %>%
group_by(GEO_ID) %>%
summarize(change = (HRTDISEASE[YEAR == 2019] - HRTDISEASE[YEAR == 2010])/HRTDISEASE[YEAR == 2010] * 100)
# Rename the columns for plot_usmap
df <- df %>%
rename(fips = GEO_ID, values = change)
p <- plot_usmap(data = df, regions = "states", include = unique(counties$stateAbb)) +
labs(title = "Heart Disease Mortality Change from 2010 to 2019 by County") +
theme(panel.background=element_blank()) + scale_fill_gradient2(low = "#005696", high = "#8E063B", mid = "lightgrey", midpoint = 0, name = "% Change in HDM", limits = c(-55, 80)) + theme(legend.position = "right")
plot(p)
pdf(paste(sep = "", "figures/Comparing Adjacent States/", "HDMMap", pathNoTxt, ".pdf"), width = 7, height = 5)
print(p)
dev.off()
#same, but for insurance instead
# Filter the rows where YEAR is either 2010 or 2019
df <- filter(combinedList, YEAR %in% c(2010,2019))
# Group by GEO_ID and calculate the difference in HRTDISEASE between 2010 and 2019
df <- df %>%
group_by(GEO_ID) %>%
summarize(change = (INSURANCERATE[YEAR == 2019] - INSURANCERATE[YEAR == 2010])/INSURANCERATE[YEAR == 2010] * 100)
# Rename the columns for plot_usmap
df <- df %>%
rename(fips = GEO_ID, values = change)
p <- plot_usmap(data = df, regions = "states", include = unique(counties$stateAbb)) +
labs(title = "Insurance Coverage Change from 2010 to 2019 by County") +
theme(panel.background=element_blank()) + scale_fill_gradient2(high = "#005696", low = "#8E063B", mid = "lightgrey", midpoint = 0, name = "% Change in Insurance Coverage", limits = c(-30, 30)) + theme(legend.position = "right")
# scale_fill_gradientn(
# colors=viridis(3, option = "viridis"), name = "% Change in Insurance Coverage", limits = c(-10, 30))
pdf(paste(sep = "", "figures/Comparing Adjacent States/", "InsuranceMap", pathNoTxt, ".pdf"), width = 7, height = 5)
plot(p)
dev.off()
#also plot the treatment (just for being obvious)
treatment <- combinedList %>% group_by(GEO_ID) %>% summarize(expanded = ifelse((sum(TREATED) > 4), "Expanded", "Did Not Expand")) %>% rename(fips = GEO_ID, values = expanded)
p <- plot_usmap(data = treatment, regions = "states", include = unique(counties$stateAbb)) +
labs(title = "Medicaid Expansion Status by County") +
theme(panel.background=element_blank()) + theme(legend.position = "right")
pdf(paste(sep = "", "figures/Comparing Adjacent States/", "TreatmentMap", pathNoTxt, ".pdf"), width = 7, height = 5)
plot(p)
print(p)
dev.off()
library(synthdid)
# #trying out a synth did to see what difference it makes
# #https://synth-inference.github.io/synthdid/articles/synthdid.html
combinedList[combinedList$YEAR_TREATED>2018,]$TREATED <- 0
setup <- synthdid::panel.matrices(as.data.frame(combinedList), unit = 1, time = 2, treatment = 3, outcome = 6)
tau.hat = synthdid_estimate(setup$Y, setup$N0, setup$T0)
sink(paste(sep = "", "figures/Comparing Adjacent States/", "SDIDSummary", pathNoTxt, ".txt")) #opens sink
print(tau.hat)
print(summary(tau.hat))
sink() #closes sink
pdf(paste(sep = "", "figures/Comparing Adjacent States/", "SDIDResults2", pathNoTxt, ".pdf"), width = 5, height = 3.5)
p <- synthdid_plot(tau.hat) + scale_color_discrete(labels=c('Synthetic Control', 'Treated')) +
labs(x='Date',
y='Preventable Heart Disease\n Mortality Rate per 100,000') + scale_x_continuous(breaks = 2010:2020)
plot(p)
dev.off()
pdf(paste(sep = "", "figures/Comparing Adjacent States/", "SDIDResultsLinedUp", pathNoTxt, ".pdf"), width = 7, height = 5)
p <- synthdid_plot(tau.hat, overlay = 1)
plot(p)
dev.off()
# issues with multicollinearity on the obesity variable in the foels within adjust.outcome.for.x, will look at later
# library(xsynthdid)
# dat <- as.data.frame(combinedList)
# dat$y.adj <- adjust.outcome.for.x(combinedList,unit=1,time = 2, treatment = 3, outcome = 6,x=c(8:10, 12))
# pm <- panel.matrices(dat, unit=1,time = 2,treatment = 3, outcome = "y.adj")
# sdid.adj = synthdid_estimate(Y=pm$Y,N0=pm$N0,T0 = pm$T0)
# sdid.adj
# p <- synthdid_plot(sdid.adj) + scale_color_discrete(labels=c('Synthetic Control', 'Treated')) +
# labs(x='Date',
# y='Preventable Heart Disease\n Mortality Rate') + scale_x_continuous(breaks = 2010:2020)
}