-
Notifications
You must be signed in to change notification settings - Fork 0
/
cleaning nationwide obesity data.R
62 lines (47 loc) · 2.55 KB
/
cleaning nationwide obesity data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
library(dplyr)
library(tibble)
library(zoo)
# read_csv()/write_csv() come from readr and ggplot()/aes()/geom_*() from
# ggplot2; both are used below, so attach them explicitly.
library(readr)
library(ggplot2)
# Load the nationwide NCHS obesity table and fill in the years it does not
# report. After the steps below the file's own columns carry the odd-year
# labels (1999, 2001, ...) and the inserted placeholder columns the even ones.
# NOTE(review): this assumes the CSV has exactly one column per even year
# placeholder (10) -- confirm against the raw file.
obesity <- read_csv("rawData/Nationwide Obesity from NCHS/data.csv", skip = 1)

# One all-NA placeholder column per even year, 2000-2018.
evenYears <- seq(from = 2000, to = 2018, by = 2)
evenYearsT <- as_tibble(matrix(NA, ncol = length(evenYears)))
names(evenYearsT) <- evenYears

# Interleave the placeholders with the reported columns. Walking from the last
# column to the first means an insertion never shifts a column that has not
# been visited yet.
for (col in seq(length(obesity), 1)) {
  obesity <- add_column(obesity, evenYearsT[, col], .after = col)
}
names(obesity) <- 1999:2018

# Reshape to one row per year: row 1 of the table is used as the obesity
# series and row 2 as the severe-obesity series.
obesityLong <- data.frame(
  year = names(obesity),
  obesity = as.numeric(obesity[1, ]),
  severeObesity = as.numeric(obesity[2, ])
)

# Row 20 (2018) has no later value to interpolate towards, so it is
# extrapolated first: the row-19 (2017) value plus half of the change between
# row 17 (2015) and row 19. The remaining NA years are then filled by linear
# interpolation.
obesityLong <- obesityLong %>%
  mutate(
    obesity = replace(
      obesity, 20,
      obesity[19] + (obesity[19] - obesity[17]) / 2
    ),
    severeObesity = replace(
      severeObesity, 20,
      severeObesity[19] + (severeObesity[19] - severeObesity[17]) / 2
    )
  ) %>%
  mutate(
    obesity = na.approx(obesity),
    severeObesity = na.approx(severeObesity)
  )
# Append a 2019 row that simply carries the row-20 (2018) values forward,
# i.e. assumes no change from the previous year.
obesityLong <- add_row(
  obesityLong,
  data.frame(
    year = as.character(2019),
    obesity = obesityLong$obesity[20],
    severeObesity = obesityLong$severeObesity[20]
  )
)

# Quick visual sanity check of both completed series.
plot(obesityLong$obesity)
plot(obesityLong$severeObesity)

# Move the years into the row names so that, once transposed, they become the
# column headers of the wide table (one row per measure, one column per year).
rownames(obesityLong) <- obesityLong$year
obesityLong[["year"]] <- NULL
obesityFinal <- as.data.frame(t(obesityLong))
write.csv(
  obesityFinal,
  file = "./formattedData/nationwideObesityByYear1999-2018.csv"
)
# Prepare the data behind figure 5d.
# The years currently live in the row names; restore them as a (character)
# column so they can be filtered on and plotted.
obesityLong$year <- row.names(obesityLong)
# ggplot(obesityLong, aes(x = year, y = obesity)) + geom_point()
#
# #convert year to be e.g. '02, but i need to make a factor in the correct order, otherwise '99 is after everything else
# obesityLong$year <- paste(sep="", "'", substr(obesityLong$year, 3, 4))
# obesityLong$year <- factor(obesityLong$year, levels = obesityLong$year)

# Keep 2010 through 2018 (drops everything before 2010 and the carried-forward
# 2019 row). `year` is character, so compare it as an integer: comparing the
# strings against numbers would coerce the numbers to text and rely on
# locale-dependent string ordering.
obesityLong <- obesityLong %>%
  filter(as.integer(year) >= 2010 & as.integer(year) < 2019)

# Export the plotted values alongside the figure.
write_csv(
  obesityLong %>% select(year, obesity),
  "figures/final paper/figure5ddata.csv"
)
# Figure 5d: nationwide obesity prevalence as a line with point markers.
# `year` is a character column, so the x axis is discrete; `group = 1` puts
# all rows in a single group so geom_line() connects them.
figure5d <- ggplot(obesityLong, aes(x = year, y = obesity, group = 1)) +
  geom_line(linewidth = 1.5, color = "black") +
  geom_point(
    fill = "black",
    size = 3.5,
    shape = 21, # Type of point that allows both a border colour and a fill.
    colour = "#FFFFFF",
    stroke = 1 # The width of the border, i.e. stroke.
  ) +
  labs(
    title = "Obesity prevalence in the US, 2010-2018",
    x = "Year",
    y = "Nationwide obesity prevalence (%)"
  ) +
  theme(plot.title = element_text(size = 14))

# Render to PNG. The plot is print()ed explicitly: a ggplot object is only
# drawn when printed, and top-level auto-printing does not happen when this
# script is run through source(), which would leave the file blank.
png(
  filename = "figures/final paper/figure5d.png",
  units = "in",
  width = 5.95,
  height = 3.5,
  res = 300,
  type = "cairo"
)
print(figure5d)
dev.off()