-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpart1_wrangling.qmd
executable file
·950 lines (708 loc) · 30.2 KB
/
part1_wrangling.qmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
---
title: "Part I — Data Wrangling & Feature Engineering"
subtitle: "GPCO 468 Capstone Project"
author: "Putra Farrel Azhar"
format: html
editor: visual
out: html
---
# Preliminary logistics
## Loading Packages
```{r message = FALSE}
rm(list = ls())
library(tidyverse)
library(janitor)
library(fixest)
library(didimputation)
library(did)
library(knitr)
library(here)
library(DT)
library(plm)
library(lfe)
library(stargazer)
library(ivreg)
library(ggplot2)
library(ggthemes)
library(didimputation)
library(pwr)
library(WebPower)
library(ICC)
library(fishmethods)
library(parameters)
library(clubSandwich)
library(pdftools)
library(tidygeocoder)
library(tigris)
library(lubridate)
library(extrafont)
library(expss)
library(htmltools)
library(webshot)
library(sf)
library(tigris)
set.seed(0000)
```
## Metadata about the EV dataset
Electric Vehicle Population Size History By County Metadata Updated: January 19, 2024
This shows the number of vehicles that were registered by Washington State Department of Licensing (DOL) each month. The data is separated by county for passenger vehicles and trucks.
DOL integrates National Highway Traffic Safety Administration (NHTSA) data and the Environmental Protection Agency (EPA) fuel efficiency ratings with DOL titling and registration data to create this information.
# EV dataset
```{r message = FALSE}
# Loading the raw ev registration data
ev <- read_csv(here("data_raw", "AFV.csv"))
ev <- clean_names(ev)
# The temporal scope of the outcome variable
mindate <- min(ev$date)
maxdate <- max(ev$date)
print(paste("The oldest date is ", mindate, " and the newest date is ", maxdate))
```
# Feature engineering on the EV dataset
```{r}
# The temporal scope of the outcome variable
mindate <- min(ev$date)
maxdate <- max(ev$date)
print(paste("The oldest date is ", mindate, " and the newest date is ", maxdate))
```
## Removing states that have less than 2 counties
```{r}
# Create a table with the number of distinct counties for each state
distinct_counties_per_state <- ev %>%
group_by(state) %>%
summarise(Number_of_Distinct_Counties = n_distinct(county))
# Print the distinct counties table
print(distinct_counties_per_state)
ev <- ev %>%
filter(!state %in% c("SD",
"ND",
"MS",
"ME",
"DE",
"DC"))
# Total number of unique counties
n_county = length(unique(ev$county))
print(n_county)
# Total number of unique states
n_state = length(unique(ev$state))
print(n_state)
```
## Assessment of NAs in the EV dataset
```{r}
na_ev <- sapply(ev, function(x) sum(is.na(x)))
# If you want to view the counts
print(na_ev)
ev_clean <- na.omit(ev)
na_ev_clean <- sapply(ev_clean, function(x) sum(is.na(x)))
# If you want to view the counts
print(na_ev_clean)
# The temporal scope of the outcome variable
mindate_clean <- min(ev_clean$date)
maxdate_clean <- max(ev_clean$date)
print(paste("The oldest date is ", mindate, " and the newest date is ", maxdate))
```
## Renaming the variable names of the EV dataset
```{r}
ev_clean <- ev_clean %>%
rename(
type = vehicle_primary_use,
bev = battery_electric_vehicles_be_vs,
phev = plug_in_hybrid_electric_vehicles_phe_vs,
ev = electric_vehicle_ev_total,
non_ev = non_electric_vehicle_total,
tot_vehicle = total_vehicles,
ev_percent = percent_electric_vehicles
)
```
## Adding year-month variable in the cleaned EV dataset
```{r}
# First, ensure the 'date' column is a character if it's not already
ev_clean$date <- as.character(ev_clean$date)
# Now, convert it to a Date object using as.Date() with the correct format
ev_clean$date <- as.Date(ev_clean$date, format = "%B %d %Y")
# Create the year_month variable
ev_clean$year_month <- format(ev_clean$date, "%Y-%m")
```
## Saving the cleaned EV dataset
```{r}
write.csv(ev_clean, here("data_clean", "ev_clean.csv"), row.names = FALSE)
```
# Charging station dataset
```{r}
# Loading the raw ev charging station data
station <- read_csv(here("data_raw", "EVI.csv"))
station <- clean_names(station)
```
# Feature engineering on the station dataset
## Assesssment of raw NAs
```{r}
# Count NA values in each column
na_station <- colSums(is.na(station))
# Create a data frame
na_station_df <- data.frame(Column = names(na_station), NA_Count = na_station)
# Print or view the data frame
print(na_station_df)
```
## Subsetting the variables
```{r}
# subset the variables
station_clean <- station[, c("id",
"city",
"state",
"zip",
"access_code",
"open_date",
"latitude",
"longitude"
)]
```
## Assesssment of clean NAs
```{r}
# Count NA values in each column
na_station_clean <- colSums(is.na(station_clean))
# Create a dataframe
na_station_clean_df <- data.frame(Column = names(na_station_clean), NA_Count = na_station_clean)
# Print or view the dataframe
print(na_station_clean_df)
# removing all charging station that have NAs
station_clean <- na.omit(station_clean)
```
## Filtering only to states to match EV dataset
```{r}
# remove white space in the state variable
ev_clean <- ev_clean %>%
mutate(state = trimws(state))
station_clean <- station_clean %>%
mutate(state = trimws(state))
# distinct states
distinct_state_station <- unique(station_clean$state)
distinct_state_ev <- unique(ev_clean$state)
# print of distinct states
print(distinct_state_station)
print(distinct_state_ev)
# list of state codes to keep
valid_state_codes <- c("WA", "CA", "CO", "TX", "OK", "NJ", "IN", "AK", "AZ", "OR", "RI", "VA", "TN", "NY", "MD", "NC", "FL", "ID", "UT", "MO", "AL", "LA", "IL", "GA", "NV", "MI", "HI", "SC", "CT", "KY", "PR", "NE", "AR", "MT", "NH", "PA", "NM", "MA", "MN", "KS", "OH", "IA", "WI", "WY")
# filter the data frame to include only rows with state codes in the specified list
station_clean <- station_clean[station_clean$state %in% valid_state_codes, ]
# final check to count the distinct states
length_state_station_clean <- length(unique(station_clean$state))
length_state_ev_clean <- length(unique(ev_clean$state))
# Print or view the count of distinct states
print(length_state_station_clean)
print(length_state_ev_clean)
```
# Identifying the county of the station dataset
```{r}
# downloading U.S. counties boundaries
options(tigris_class = "sf")
counties <- counties()
# converting the station_clean to a .shp file
station_clean_sf <- st_as_sf(station_clean, coords = c("longitude", "latitude"), crs = 4326)
# Check CRS of the station data
st_crs(station_clean_sf)
# Check CRS of the counties data
st_crs(counties)
# Transform the CRS of station_clean_sf to match counties
station_clean_sf <- st_transform(station_clean_sf, st_crs(counties))
station_with_county <- st_join(station_clean_sf, counties)
# cleaning the names
station_with_county <- clean_names(station_with_county)
names(station_with_county)[names(station_with_county) == "name"] <- "county"
```
# Creating public and private charging station dataset
## Subsetting the variables on the station sf dataset
```{r}
# subset the variables
station_with_county <- station_with_county[, c("id",
"city",
"state",
"zip",
"access_code",
"open_date",
"geometry",
"county",
"aland",
"awater"
)]
```
## Creating the year_month variable
```{r}
## creatin a backup
# station_backup <- station_with_county
# station_with_county <- station_backup
# Convert to Date and then to year_month format
station_with_county$open_date <- as.Date(station_with_county$open_date)
station_with_county$year_month <- format(station_with_county$open_date, "%Y-%m")
```
## Seperating two types of charging station
```{r}
# subset the variables
station_temp <- station_with_county[, c("id",
"state",
"access_code",
"county",
"aland",
"awater",
"year_month"
)]
# converting back to non-spatial data frame
station_temp <- st_drop_geometry(station_temp)
# public charging stations
public <- c("public")
public_station <- station_temp[station_temp$access_code %in% public, ]
public_station <- public_station[, -which(names(public_station) == "access_code")]
# subset the variables in public
public_station <- public_station[, c("id",
"state",
"county",
"year_month"
)]
# private charging stations
# public charging stations
private <- c("private")
private_station <- station_temp[station_temp$access_code %in% private, ]
private_station <- private_station[, -which(names(private_station) == "access_code")]
# subset the variables in public
private_station <- private_station[, c("id",
"state",
"county",
"year_month"
)]
```
## Saving the two types of charging station dataset
```{r}
write.csv(public_station, here("data_clean", "public_station.csv"), row.names = FALSE)
write.csv(private_station, here("data_clean", "private_station.csv"), row.names = FALSE)
write.csv(station_with_county, here("data_clean", "charging_station.shp"), row.names = FALSE)
write.csv(station_temp, here("data_clean", "charging_station.csv"), row.names = FALSE)
```
# Aggregating the public station by year_month and county (running count)
```{r}
# Ensure year_month is recognized as a Date
public_station$year_month <- as.Date(paste0(public_station$year_month, "-01"))
# Aggregate the data to get a running total of charging stations
treat_public <- public_station %>%
dplyr::group_by(county, state) %>%
dplyr::arrange(year_month) %>%
dplyr::mutate(tally_of_the_charging_station = cumsum(!is.na(id))) %>%
dplyr::select(county, state, year_month, tally_of_the_charging_station) %>%
dplyr::ungroup()
# Convert year_month back to "YYYY-MM" format for the final DataFrame
treat_public$year_month <- format(treat_public$year_month, "%Y-%m")
# View the first few rows of the final aggregated data frame
head(treat_public)
# Collapse the data frame to get the total tally of the charging station for each year_month
collapsed_treat_public <- treat_public %>%
dplyr::group_by(county, state, year_month) %>%
dplyr::summarize(total_tally_of_the_charging_station = max(tally_of_the_charging_station), .groups = 'drop')
# View the first few rows of the collapsed data frame
head(collapsed_treat_public)
# Filter the dataset for entries from 2017-01 onwards before expanding
filtered_treat_public <- collapsed_treat_public %>%
filter(as.Date(paste0(year_month, "-01")) >= as.Date("2017-01-01"))
# Identify the full range of year_month values from 2017-01 onwards
full_dates <- seq(as.Date("2017-01-01"),
max(as.Date(paste0(filtered_treat_public$year_month, "-01"))),
by="month")
# Convert full_dates back to "YYYY-MM" format
full_dates_formatted <- format(full_dates, "%Y-%m")
# Expand the filtered dataset to include all combinations of county, state, and year_month from 2017-01
expanded_treat_public <- filtered_treat_public %>%
tidyr::complete(county, state, year_month = full_dates_formatted, fill = list(total_tally_of_the_charging_station = 0)) %>%
dplyr::group_by(county, state) %>%
dplyr::arrange(year_month) %>%
dplyr::mutate(total_tally_of_the_charging_station = cummax(total_tally_of_the_charging_station))
# View the first few rows of the filtered, expanded, and filled data frame
head(expanded_treat_public)
```
## Assessment of NAs
```{r}
# Count NA values in each column
na_expanded_treat_public <- colSums(is.na(collapsed_treat_public))
# Create a dataframe
na_expanded_treat_public_df <- data.frame(Column = names(na_expanded_treat_public), NA_Count = na_expanded_treat_public)
# Print or view the dataframe
print(na_expanded_treat_public_df)
```
# Merging the public charging station treatment with EV dataset
## Merging the two dataset
```{r}
# performing the left join
df <- ev_clean %>%
left_join(expanded_treat_public, by = c("state",
"county",
"year_month"))
df <- df %>%
rename(
public_station = total_tally_of_the_charging_station)
```
## Assessment of NAs
```{r}
# Count NA values in each column
na_df <- colSums(is.na(df))
# Create a dataframe
na_df_dataframe <- data.frame(Column = names(na_df), NA_Count = na_df)
# Print or view the dataframe
print(na_df_dataframe)
# counties that have NAs
counties_states_with_NA <- df %>%
filter(is.na(public_station)) %>%
distinct(county, state)
# View the result
print(counties_states_with_NA)
```
## Removing county with NAs
```{r}
# Remove these counties from the dataset
df_clean <- df %>%
anti_join(counties_states_with_NA, by = c("county", "state"))
# View the first few rows of the cleaned data frame
head(df_clean)
```
## Final assessment of NAs
```{r}
# Count NA values in each column
na_df_clean <- colSums(is.na(df_clean))
# Create a dataframe
na_df_clean_df <- data.frame(Column = names(na_df_clean), NA_Count = na_df_clean)
# Print or view the dataframe
print(na_df_clean_df)
```
# NEVI dataset
```{r}
# Loading the raw NEVI funding data
nevi <- readxl::read_xlsx(here("data_raw", "funding.xlsx"))
nevi <- clean_names(nevi)
view(nevi)
```
# Feature engineeering on the NEVI dataset
## Fixing and converting date to year_month
```{r}
# First, ensure the 'date' column is a character if it's not already
nevi$date <- as.character(nevi$date)
# Now, convert it to a Date object using as.Date() with the correct format
nevi$date <- as.Date(nevi$date, format = "%Y-%m-%d")
# Create the year_month variable
nevi$year_month <- format(nevi$date, "%Y-%m")
# removing the date column
nevi_clean <- subset(nevi, select = -date)
# Update the 'year_month' column for the specific years 2022 and 2023
nevi_clean$year_month <- ifelse(nevi_clean$year_month == "2022-09", "2022-01",
ifelse(nevi_clean$year_month %in% c("2023-09", "2023-10"), "2023-01",
nevi_clean$year_month))
write.csv(nevi_clean, here("data_clean", "nevi_clean.csv"), row.names = FALSE)
```
## change the state to state 2-alphabet code
```{r}
# Load the nevi_clean dataset
nevi_clean <- read.csv(here("data_clean", "nevi_clean.csv"), stringsAsFactors = FALSE)
# Create a named vector with state names as names and codes as values
state_names_to_codes <- c("Alabama" = "AL", "Alaska" = "AK", "Arizona" = "AZ", "Arkansas" = "AR",
"California" = "CA", "Colorado" = "CO", "Connecticut" = "CT", "Delaware" = "DE",
"Florida" = "FL", "Georgia" = "GA", "Hawaii" = "HI", "Idaho" = "ID", "Illinois" = "IL",
"Indiana" = "IN", "Iowa" = "IA", "Kansas" = "KS", "Kentucky" = "KY", "Louisiana" = "LA",
"Maine" = "ME", "Maryland" = "MD", "Massachusetts" = "MA", "Michigan" = "MI",
"Minnesota" = "MN", "Mississippi" = "MS", "Missouri" = "MO", "Montana" = "MT",
"Nebraska" = "NE", "Nevada" = "NV", "New Hampshire" = "NH", "New Jersey" = "NJ",
"New Mexico" = "NM", "New York" = "NY", "North Carolina" = "NC", "North Dakota" = "ND",
"Ohio" = "OH", "Oklahoma" = "OK", "Oregon" = "OR", "Pennsylvania" = "PA",
"Rhode Island" = "RI", "South Carolina" = "SC", "South Dakota" = "SD", "Tennessee" = "TN",
"Texas" = "TX", "Utah" = "UT", "Vermont" = "VT", "Virginia" = "VA", "Washington" = "WA",
"West Virginia" = "WV", "Wisconsin" = "WI", "Wyoming" = "WY", "District of Columbia" = "DC")
# Convert state names to their two-letter codes
nevi_clean$state <- state_names_to_codes[nevi_clean$state]
```
## Saving the cleaned NEVI dataset
```{r}
write.csv(nevi_clean, here("data_clean", "nevi_clean.csv"), row.names = FALSE)
```
# Merging NEVI and combined dataset
```{r}
# performing the left join
df_clean <- df_clean %>%
left_join(nevi_clean, by = c("state",
"year_month"))
df_clean <- df_clean %>%
mutate(funding = coalesce(funding, 0))
```
## Saving the merged NEVI dataset
```{r}
write.csv(df_clean, here("data_clean", "df_clean.csv"), row.names = FALSE)
```
# Control variable: county population dataset
## County population for year 2010 - 2022
```{r}
# county population estimate 2010 - 2019
pop1 <- readxl::read_xlsx(here("data_raw", "pop1.xlsx"))
# county population estimate 2020 - 2022
pop2 <- readxl::read_xlsx(here("data_raw", "pop2.xlsx"))
# State name to code mapping
state_names_to_codes <- c("Alabama" = "AL", "Alaska" = "AK", "Arizona" = "AZ", "Arkansas" = "AR",
"California" = "CA", "Colorado" = "CO", "Connecticut" = "CT", "Delaware" = "DE",
"Florida" = "FL", "Georgia" = "GA", "Hawaii" = "HI", "Idaho" = "ID", "Illinois" = "IL",
"Indiana" = "IN", "Iowa" = "IA", "Kansas" = "KS", "Kentucky" = "KY", "Louisiana" = "LA",
"Maine" = "ME", "Maryland" = "MD", "Massachusetts" = "MA", "Michigan" = "MI",
"Minnesota" = "MN", "Mississippi" = "MS", "Missouri" = "MO", "Montana" = "MT",
"Nebraska" = "NE", "Nevada" = "NV", "New Hampshire" = "NH", "New Jersey" = "NJ",
"New Mexico" = "NM", "New York" = "NY", "North Carolina" = "NC", "North Dakota" = "ND",
"Ohio" = "OH", "Oklahoma" = "OK", "Oregon" = "OR", "Pennsylvania" = "PA",
"Rhode Island" = "RI", "South Carolina" = "SC", "South Dakota" = "SD", "Tennessee" = "TN",
"Texas" = "TX", "Utah" = "UT", "Vermont" = "VT", "Virginia" = "VA", "Washington" = "WA",
"West Virginia" = "WV", "Wisconsin" = "WI", "Wyoming" = "WY", "District of Columbia" = "DC")
# Function to extract state code and clean county names
extract_and_clean <- function(df) {
df$state <- sapply(df$county, function(county_with_state) {
state_name <- str_extract(county_with_state, ",\\s*[^,]+$") %>%
sub(",\\s*", "", .) %>%
trimws()
state_names_to_codes[state_name]
})
df$county <- sapply(df$county, function(county_with_state) {
cleaned_name <- sub(" County.*", "", county_with_state)
cleaned_name <- sub("^\\.", "", cleaned_name)
str_extract(cleaned_name, "^[^,]+") %>% trimws()
})
df
}
# Apply transformations to both dataframes
pop1 <- extract_and_clean(pop1)
pop2 <- extract_and_clean(pop2)
# Merge the datasets by the cleaned 'county' and 'state' columns
combined_pop <- merge(pop1, pop2, by = c("county", "state"), all = TRUE)
# Check the first few rows of the combined dataset
head(combined_pop)
# save the combined_pop dataset
write.csv(combined_pop, here("data_clean", "combined_pop.csv"), row.names = FALSE)
```
## County population for year 2023 - 2024 and merge with combined pop
```{r}
# county population estimate 2023
combined_pop <- read_csv(here("data_clean", "combined_pop.csv"))
pop3_state <- readxl::read_xlsx(here("data_raw", "pop3_state.xlsx"))
# Clean the state names in pop3_state and convert them to two-letter codes
state_names_to_codes <- c("Alabama" = "AL", "Alaska" = "AK", "Arizona" = "AZ", "Arkansas" = "AR",
"California" = "CA", "Colorado" = "CO", "Connecticut" = "CT", "Delaware" = "DE",
"Florida" = "FL", "Georgia" = "GA", "Hawaii" = "HI", "Idaho" = "ID",
"Illinois" = "IL", "Indiana" = "IN", "Iowa" = "IA", "Kansas" = "KS",
"Kentucky" = "KY", "Louisiana" = "LA", "Maine" = "ME", "Maryland" = "MD",
"Massachusetts" = "MA", "Michigan" = "MI", "Minnesota" = "MN",
"Mississippi" = "MS", "Missouri" = "MO", "Montana" = "MT", "Nebraska" = "NE",
"Nevada" = "NV", "New Hampshire" = "NH", "New Jersey" = "NJ", "New Mexico" = "NM",
"New York" = "NY", "North Carolina" = "NC", "North Dakota" = "ND", "Ohio" = "OH",
"Oklahoma" = "OK", "Oregon" = "OR", "Pennsylvania" = "PA", "Rhode Island" = "RI",
"South Carolina" = "SC", "South Dakota" = "SD", "Tennessee" = "TN", "Texas" = "TX",
"Utah" = "UT", "Vermont" = "VT", "Virginia" = "VA", "Washington" = "WA",
"West Virginia" = "WV", "Wisconsin" = "WI", "Wyoming" = "WY", "District of Columbia" = "DC")
# Remove the leading period and convert state names to codes
pop3_state$state <- sub("^\\.", "", pop3_state$state)
pop3_state$state <- sapply(pop3_state$state, function(state_name) state_names_to_codes[state_name])
# Rename columns for clarity and merging
colnames(pop3_state) <- c("state", "total_state_pop_2023")
# Calculate the total 2022 state population within combined_pop
combined_pop <- combined_pop %>%
group_by(state) %>%
mutate(total_state_pop_2022 = sum(`2022`, na.rm = TRUE)) %>%
ungroup()
# Calculate each county's share of the 2022 state population
combined_pop <- combined_pop %>%
mutate(`2022_share` = (`2022` / total_state_pop_2022) * 100)
# Merge the 2023 state population data from pop3_state with combined_pop
combined_pop <- left_join(combined_pop, pop3_state, by = "state")
# Calculate the 2023 population for each county based on its 2022 share
combined_pop <- combined_pop %>%
mutate(`2023` = (`2022_share` / 100) * total_state_pop_2023)
# Inspect the first few rows of the updated combined_pop dataframe
head(combined_pop)
# Round up 2023 to the nearest whole number
combined_pop$`2023` <- ceiling(combined_pop$`2023`)
# Duplicate the population estimate for 2023 to 2024
combined_pop$`2024` <- combined_pop$`2023`
# Subset the combined population data years
# subset the variables
pop_all <- combined_pop[, c("county",
"state",
"2010",
"2011",
"2012",
"2013",
"2014",
"2015",
"2016",
"2017",
"2018",
"2019",
"2020",
"2021",
"2022",
"2023",
"2024"
)]
# omitting NAs
pop_all <- na.omit(pop_all)
# saving the county population 2010 - 2024 dataset
write.csv(pop_all, here("data_clean", "pop_all.csv"), row.names = FALSE)
```
## Subset and save population data for year 2017 - 2024
```{r}
# subset the variables
pop_17_24 <- combined_pop[, c("county",
"state",
"2017",
"2018",
"2019",
"2020",
"2021",
"2022",
"2023",
"2024"
)]
# saving the county population 2017 - 2024 dataset
write.csv(pop_17_24, here("data_clean", "pop_17_24.csv"), row.names = FALSE)
```
## Convert to a long-format with clean names
```{r}
# clean the column names
pop_17_24 <- clean_names(pop_17_24)
# Renaming the columns by removing 'x'
names(pop_17_24) <- sub("^x", "", names(pop_17_24))
# Check the names to verify
names(pop_17_24)
# Convert from wide to long format
pop_clean <- pop_17_24 %>%
pivot_longer(
cols = `2017`:`2024`,
names_to = "year_month",
values_to = "population"
) %>%
# Ensure the 'year_month' column is correctly formatted
mutate(year_month = paste0(year_month, "-01")) # Appending "-01" to make it a consistent year-month format
# View the first few rows of the transformed data
head(pop_clean)
# saving the cleaned county population 2017 - 2024 in long format dataset
write.csv(pop_clean, here("data_clean", "pop_clean.csv"), row.names = FALSE)
```
# Merging county population and combined dataset
```{r}
# Create a new column 'year' from 'year_month'
pop_clean <- pop_clean %>%
mutate(year = sub("-.*", "", year_month))
# Create a new column 'year' from 'year_month'
df_clean <- df_clean %>%
mutate(year = sub("-.*", "", year_month))
# Removing year_month column
pop_clean <- pop_clean[, c("county",
"state",
"year",
"population"
)]
# saving the cleaned county population 2017 - 2024 in long format dataset
write.csv(pop_clean, here("data_clean", "pop_clean.csv"), row.names = FALSE)
# performing the left join
df_clean <- df_clean %>%
left_join(pop_clean, by = c("county",
"state",
"year"
))
# saving the cleaned county population 2017 - 2024 in long format dataset
write.csv(df_clean, here("data_clean", "df_clean.csv"), row.names = FALSE)
```
# Control variable: state population
```{r}
# county population estimate 2017 - 2019
pop_clean <- read_csv(here("data_clean", "pop_clean.csv"))
# omit NAs
pop_clean <- na.omit(pop_clean)
# clean names
pop_clean <- clean_names(pop_clean)
```
## Aggregate the county level to state-level
```{r}
# Removing year_month column
state_pop <- pop_clean[, c("state",
"year",
"population"
)]
# Aggregate the population by state and year
state_pop <- state_pop %>%
group_by(state, year) %>%
summarise(state_population = sum(population, na.rm = TRUE)) %>%
ungroup()
# Check the aggregated data
head(state_pop)
```
# Merging state population and combined dataset
```{r}
# converting year variable as character
state_pop$year <- as.character(state_pop$year)
# performing the left join
df_clean <- df_clean %>%
left_join(state_pop, by = c("state",
"year"
))
```
# Instrument variable: NEVI
```{r}
# Assuming state_pop is your dataframe and it's already been loaded
df_clean <- df_clean %>%
mutate(nevi = case_when(
year < 2022 ~ 0,
year == 2022 ~ 1,
year >= 2023 & year <= 2024 ~ 2
))
# To make 'nevi' a factor (optional), you can convert it like this:
df_clean$nevi <- as.factor(df_clean$nevi)
# Checking the dataframe
head(state_pop)
```
# Control variable: gas price dataset
```{r}
# Define the data
data <- data.frame(
Year = c("2017", "2017", "2017", "2017", "2017", "2017", "2017", "2017", "2017", "2017", "2017", "2017",
"2018", "2018", "2018", "2018", "2018", "2018", "2018", "2018", "2018", "2018", "2018", "2018",
"2019", "2019", "2019", "2019", "2019", "2019", "2019", "2019", "2019", "2019", "2019", "2019",
"2020", "2020", "2020", "2020", "2020", "2020", "2020", "2020", "2020", "2020", "2020", "2020",
"2021", "2021", "2021", "2021", "2021", "2021", "2021", "2021", "2021", "2021", "2021", "2021",
"2022", "2022", "2022", "2022", "2022", "2022", "2022", "2022", "2022", "2022", "2022", "2022",
"2023", "2023", "2023", "2023", "2023", "2023", "2023", "2023", "2023", "2023", "2023", "2023",
"2024", "2024"),
Month = c("01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12",
"01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12",
"01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12",
"01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12",
"01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12",
"01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12",
"01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12",
"01", "02"),
Gas_Price = c(2.458, 2.416, 2.437, 2.528, 2.503, 2.460, 2.414, 2.494, 2.761, 2.621, 2.678, 2.594,
2.671, 2.705, 2.709, 2.873, 2.987, 2.970, 2.928, 2.914, 2.915, 2.943, 2.736, 2.457,
2.338, 2.393, 2.594, 2.881, 2.946, 2.804, 2.823, 2.707, 2.681, 2.724, 2.693, 2.645,
2.636, 2.533, 2.329, 1.938, 1.961, 2.170, 2.272, 2.272, 2.274, 2.248, 2.200, 2.284,
2.420, 2.587, 2.898, 2.948, 3.076, 3.157, 3.231, 3.255, 3.272, 3.384, 3.491, 3.406,
3.413, 3.611, 4.322, 4.213, 4.545, 5.032, 4.668, 4.087, 3.817, 3.935, 3.799, 3.324,
3.445, 3.501, 3.535, 3.711, 3.666, 3.684, 3.712, 3.954, 3.958, 3.742, 3.443, 3.257,
3.197, 3.328)
)
# Combine 'Year' and 'Month' columns to create 'year_month' column
data$year_month <- paste(data$Year, data$Month, sep = "-")
# Select data from January 2017 until February 2024
start_date <- "2017-01"
end_date <- "2024-02"
data_selected <- data[data$year_month >= start_date & data$year_month <= end_date, ]
# Create the final data frame
gas_clean <- data.frame(year_month = data_selected$year_month, Gas_Price = data_selected$Gas_Price)
gas_clean <- clean_names(gas_clean)
# Write the data frame to a CSV file
write.csv(gas_clean, here("data_clean", "gas_clean.csv"), row.names = FALSE)
```
# Merging gas and combined dataseet
```{r}
# performing the left join
df_clean <- df_clean %>%
left_join(gas_clean, by = c("year_month"
))
```
# Saving the final dataset
```{r}
# save the combined_pop dataset
write.csv(df_clean, here("data_clean", "df_final.csv"), row.names = FALSE)
```