analysis/part1_wrangling.qmd

---
title: "Part I — Data Wrangling & Feature Engineering"
subtitle: "GPCO 468 Capstone Project"
author: "Putra Farrel Azhar"
format: html
editor: visual
out: html
---

# Preliminary logistics

## Loading Packages

```{r message = FALSE}
rm(list = ls())
library(tidyverse)
library(janitor)
library(fixest)
library(didimputation)
library(did)
library(knitr)
library(here)
library(DT)
library(plm)
library(lfe)
library(stargazer)
library(ivreg)
library(ggplot2)
library(ggthemes)
library(didimputation)
library(pwr)        
library(WebPower)   
library(ICC)        
library(fishmethods)
library(parameters)
library(clubSandwich)
library(pdftools)
library(tidygeocoder)
library(tigris)
library(lubridate)
library(extrafont)
library(expss)
library(htmltools)
library(webshot)
library(sf)
library(tigris)
set.seed(0000)
```

## Metadata about the EV dataset

Electric Vehicle Population Size History By County Metadata Updated: January 19, 2024

This shows the number of vehicles that were registered by Washington State Department of Licensing (DOL) each month. The data is separated by county for passenger vehicles and trucks.

DOL integrates National Highway Traffic Safety Administration (NHTSA) data and the Environmental Protection Agency (EPA) fuel efficiency ratings with DOL titling and registration data to create this information.

# EV dataset

```{r message = FALSE}
# Loading the raw ev registration data
ev <- read_csv(here("data_raw", "AFV.csv"))

ev <- clean_names(ev)

# The temporal scope of the outcome variable
mindate <- min(ev$date)

maxdate <- max(ev$date)

print(paste("The oldest date is ", mindate, " and the newest date is ", maxdate))

```

# Feature engineering on the EV dataset

```{r}
# The temporal scope of the outcome variable
mindate <- min(ev$date)

maxdate <- max(ev$date)

print(paste("The oldest date is ", mindate, " and the newest date is ", maxdate))

```

## Removing states that have less than 2 counties

```{r}
# Create a table with the number of distinct counties for each state
distinct_counties_per_state <- ev %>%
  group_by(state) %>%
  summarise(Number_of_Distinct_Counties = n_distinct(county))

# Print the distinct counties table
print(distinct_counties_per_state)

ev <- ev %>%
  filter(!state %in% c("SD", 
                       "ND",
                       "MS",
                       "ME",
                       "DE",
                       "DC"))

# Total number of unique counties
n_county = length(unique(ev$county))

print(n_county)

# Total number of unique states
n_state = length(unique(ev$state))

print(n_state)

```

## Assessment of NAs in the EV dataset

```{r}
na_ev <- sapply(ev, function(x) sum(is.na(x)))

# If you want to view the counts
print(na_ev)

ev_clean <- na.omit(ev)

na_ev_clean <- sapply(ev_clean, function(x) sum(is.na(x)))

# If you want to view the counts
print(na_ev_clean)

# The temporal scope of the outcome variable
mindate_clean <- min(ev_clean$date)

maxdate_clean <- max(ev_clean$date)

print(paste("The oldest date is ", mindate, " and the newest date is ", maxdate))

```

## Renaming the variable names of the EV dataset

```{r}
ev_clean <- ev_clean %>%
  rename(
    type = vehicle_primary_use,
    bev = battery_electric_vehicles_be_vs,
    phev = plug_in_hybrid_electric_vehicles_phe_vs,
    ev = electric_vehicle_ev_total,
    non_ev = non_electric_vehicle_total,
    tot_vehicle = total_vehicles,
    ev_percent = percent_electric_vehicles
  )
```

## Adding year-month variable in the cleaned EV dataset

```{r}
# First, ensure the 'date' column is a character if it's not already
ev_clean$date <- as.character(ev_clean$date)

# Now, convert it to a Date object using as.Date() with the correct format
ev_clean$date <- as.Date(ev_clean$date, format = "%B %d %Y")

# Create the year_month variable
ev_clean$year_month <- format(ev_clean$date, "%Y-%m")
```

## Saving the cleaned EV dataset

```{r}
write.csv(ev_clean, here("data_clean", "ev_clean.csv"), row.names = FALSE)
```

# Charging station dataset

```{r}
# Loading the raw ev charging station data
station <- read_csv(here("data_raw", "EVI.csv"))

station <- clean_names(station)

```

# Feature engineering on the station dataset

## Assesssment of raw NAs

```{r}
# Count NA values in each column
na_station <- colSums(is.na(station))

# Create a data frame
na_station_df <- data.frame(Column = names(na_station), NA_Count = na_station)

# Print or view the data frame
print(na_station_df)
```

## Subsetting the variables

```{r}
# subset the variables
station_clean <- station[, c("id",
                             "city",
                             "state",
                             "zip",
                             "access_code",
                             "open_date",
                             "latitude",
                             "longitude"
                             )]

```

## Assesssment of clean NAs

```{r}
# Count NA values in each column
na_station_clean <- colSums(is.na(station_clean))

# Create a dataframe
na_station_clean_df <- data.frame(Column = names(na_station_clean), NA_Count = na_station_clean)

# Print or view the dataframe
print(na_station_clean_df)

# removing all charging station that have NAs
station_clean <- na.omit(station_clean)
```

## Filtering only to states to match EV dataset

```{r}
# remove white space in the state variable
ev_clean <- ev_clean %>%
  mutate(state = trimws(state))

station_clean <- station_clean %>%
  mutate(state = trimws(state))

# distinct states
distinct_state_station <- unique(station_clean$state)
distinct_state_ev <- unique(ev_clean$state)

# print of distinct states
print(distinct_state_station)
print(distinct_state_ev)

# list of state codes to keep
valid_state_codes <- c("WA", "CA", "CO", "TX", "OK", "NJ", "IN", "AK", "AZ", "OR", "RI", "VA", "TN", "NY", "MD", "NC", "FL", "ID", "UT", "MO", "AL", "LA", "IL", "GA", "NV", "MI", "HI", "SC", "CT", "KY", "PR", "NE", "AR", "MT", "NH", "PA", "NM", "MA", "MN", "KS", "OH", "IA", "WI", "WY")

# filter the data frame to include only rows with state codes in the specified list
station_clean <- station_clean[station_clean$state %in% valid_state_codes, ]

# final check to count the distinct states
length_state_station_clean <- length(unique(station_clean$state))
length_state_ev_clean <- length(unique(ev_clean$state))

# Print or view the count of distinct states
print(length_state_station_clean)
print(length_state_ev_clean)

```

# Identifying the county of the station dataset

```{r}
# downloading U.S. counties boundaries
options(tigris_class = "sf")
counties <- counties()

# converting the station_clean to a .shp file
station_clean_sf <- st_as_sf(station_clean, coords = c("longitude", "latitude"), crs = 4326)

# Check CRS of the station data
st_crs(station_clean_sf)

# Check CRS of the counties data
st_crs(counties)

# Transform the CRS of station_clean_sf to match counties
station_clean_sf <- st_transform(station_clean_sf, st_crs(counties))

station_with_county <- st_join(station_clean_sf, counties)

# cleaning the names
station_with_county <- clean_names(station_with_county)

names(station_with_county)[names(station_with_county) == "name"] <- "county"

```

# Creating public and private charging station dataset

## Subsetting the variables on the station sf dataset

```{r}
# subset the variables
station_with_county <- station_with_county[, c("id",
                                               "city", 
                                               "state",
                                               "zip",
                                               "access_code",
                                               "open_date",
                                               "geometry",
                                               "county",
                                               "aland",
                                               "awater"
                                               )]

```

## Creating the year_month variable

```{r}
## creatin a backup
# station_backup <- station_with_county
# station_with_county <- station_backup

# Convert to Date and then to year_month format
station_with_county$open_date <- as.Date(station_with_county$open_date)

station_with_county$year_month <- format(station_with_county$open_date, "%Y-%m")
```

## Seperating two types of charging station

```{r}
# subset the variables
station_temp <- station_with_county[, c("id", 
                                        "state",
                                        "access_code",
                                        "county",
                                        "aland",
                                        "awater",
                                        "year_month"
                                        )]

# converting back to non-spatial data frame
station_temp <- st_drop_geometry(station_temp)

# public charging stations
public <- c("public")

public_station <- station_temp[station_temp$access_code %in% public, ]

public_station <- public_station[, -which(names(public_station) == "access_code")]

# subset the variables in public
public_station <- public_station[, c("id", 
                                        "state",
                                        "county",
                                        "year_month"
                                        )]

# private charging stations
# public charging stations
private <- c("private")

private_station <- station_temp[station_temp$access_code %in% private, ]

private_station <- private_station[, -which(names(private_station) == "access_code")]

# subset the variables in public
private_station <- private_station[, c("id", 
                                        "state",
                                        "county",
                                        "year_month"
                                        )]

```

## Saving the two types of charging station dataset

```{r}
write.csv(public_station, here("data_clean", "public_station.csv"), row.names = FALSE)
write.csv(private_station, here("data_clean", "private_station.csv"), row.names = FALSE)
write.csv(station_with_county, here("data_clean", "charging_station.shp"), row.names = FALSE)
write.csv(station_temp, here("data_clean", "charging_station.csv"), row.names = FALSE)

```

# Aggregating the public station by year_month and county (running count)

```{r}
# Ensure year_month is recognized as a Date
public_station$year_month <- as.Date(paste0(public_station$year_month, "-01"))

# Aggregate the data to get a running total of charging stations
treat_public <- public_station %>%
  dplyr::group_by(county, state) %>%
  dplyr::arrange(year_month) %>%
  dplyr::mutate(tally_of_the_charging_station = cumsum(!is.na(id))) %>%
  dplyr::select(county, state, year_month, tally_of_the_charging_station) %>%
  dplyr::ungroup()

# Convert year_month back to "YYYY-MM" format for the final DataFrame
treat_public$year_month <- format(treat_public$year_month, "%Y-%m")

# View the first few rows of the final aggregated data frame
head(treat_public)

# Collapse the data frame to get the total tally of the charging station for each year_month
collapsed_treat_public <- treat_public %>%
  dplyr::group_by(county, state, year_month) %>%
  dplyr::summarize(total_tally_of_the_charging_station = max(tally_of_the_charging_station), .groups = 'drop')

# View the first few rows of the collapsed data frame
head(collapsed_treat_public)

# Filter the dataset for entries from 2017-01 onwards before expanding
filtered_treat_public <- collapsed_treat_public %>%
  filter(as.Date(paste0(year_month, "-01")) >= as.Date("2017-01-01"))

# Identify the full range of year_month values from 2017-01 onwards
full_dates <- seq(as.Date("2017-01-01"),
                  max(as.Date(paste0(filtered_treat_public$year_month, "-01"))),
                  by="month")

# Convert full_dates back to "YYYY-MM" format
full_dates_formatted <- format(full_dates, "%Y-%m")

# Expand the filtered dataset to include all combinations of county, state, and year_month from 2017-01
expanded_treat_public <- filtered_treat_public %>%
  tidyr::complete(county, state, year_month = full_dates_formatted, fill = list(total_tally_of_the_charging_station = 0)) %>%
  dplyr::group_by(county, state) %>%
  dplyr::arrange(year_month) %>%
  dplyr::mutate(total_tally_of_the_charging_station = cummax(total_tally_of_the_charging_station))

# View the first few rows of the filtered, expanded, and filled data frame
head(expanded_treat_public)

```

## Assessment of NAs

```{r}
# Count NA values in each column
na_expanded_treat_public <- colSums(is.na(collapsed_treat_public))

# Create a dataframe
na_expanded_treat_public_df <- data.frame(Column = names(na_expanded_treat_public), NA_Count = na_expanded_treat_public)

# Print or view the dataframe
print(na_expanded_treat_public_df)
```

# Merging the public charging station treatment with EV dataset

## Merging the two dataset

```{r}
# performing the left join
df <- ev_clean %>%
  left_join(expanded_treat_public, by = c("state", 
                                          "county", 
                                          "year_month"))
df <- df %>%
  rename(
    public_station = total_tally_of_the_charging_station)

```

## Assessment of NAs

```{r}
# Count NA values in each column
na_df <- colSums(is.na(df))

# Create a dataframe
na_df_dataframe <- data.frame(Column = names(na_df), NA_Count = na_df)

# Print or view the dataframe
print(na_df_dataframe)

# counties that have NAs
counties_states_with_NA <- df %>%
  filter(is.na(public_station)) %>%
  distinct(county, state)

# View the result
print(counties_states_with_NA)
```

## Removing county with NAs

```{r}
# Remove these counties from the dataset
df_clean <- df %>%
  anti_join(counties_states_with_NA, by = c("county", "state"))

# View the first few rows of the cleaned data frame
head(df_clean)
```

## Final assessment of NAs

```{r}
# Count NA values in each column
na_df_clean <- colSums(is.na(df_clean))

# Create a dataframe
na_df_clean_df <- data.frame(Column = names(na_df_clean), NA_Count = na_df_clean)

# Print or view the dataframe
print(na_df_clean_df)

```

# NEVI dataset

```{r}
# Loading the raw NEVI funding data
nevi <- readxl::read_xlsx(here("data_raw", "funding.xlsx"))

nevi <- clean_names(nevi)

view(nevi)
```

# Feature engineeering on the NEVI dataset

## Fixing and converting date to year_month

```{r}
# First, ensure the 'date' column is a character if it's not already
nevi$date <- as.character(nevi$date)

# Now, convert it to a Date object using as.Date() with the correct format
nevi$date <- as.Date(nevi$date, format = "%Y-%m-%d")

# Create the year_month variable
nevi$year_month <- format(nevi$date, "%Y-%m")

# removing the date column
nevi_clean <- subset(nevi, select = -date)

# Update the 'year_month' column for the specific years 2022 and 2023
nevi_clean$year_month <- ifelse(nevi_clean$year_month == "2022-09", "2022-01", 
                                ifelse(nevi_clean$year_month %in% c("2023-09", "2023-10"), "2023-01", 
                                       nevi_clean$year_month))

write.csv(nevi_clean, here("data_clean", "nevi_clean.csv"), row.names = FALSE)

```

## change the state to state 2-alphabet code

```{r}
# Load the nevi_clean dataset
nevi_clean <- read.csv(here("data_clean", "nevi_clean.csv"), stringsAsFactors = FALSE)

# Create a named vector with state names as names and codes as values
state_names_to_codes <- c("Alabama" = "AL", "Alaska" = "AK", "Arizona" = "AZ", "Arkansas" = "AR", 
                          "California" = "CA", "Colorado" = "CO", "Connecticut" = "CT", "Delaware" = "DE", 
                          "Florida" = "FL", "Georgia" = "GA", "Hawaii" = "HI", "Idaho" = "ID", "Illinois" = "IL", 
                          "Indiana" = "IN", "Iowa" = "IA", "Kansas" = "KS", "Kentucky" = "KY", "Louisiana" = "LA", 
                          "Maine" = "ME", "Maryland" = "MD", "Massachusetts" = "MA", "Michigan" = "MI", 
                          "Minnesota" = "MN", "Mississippi" = "MS", "Missouri" = "MO", "Montana" = "MT", 
                          "Nebraska" = "NE", "Nevada" = "NV", "New Hampshire" = "NH", "New Jersey" = "NJ", 
                          "New Mexico" = "NM", "New York" = "NY", "North Carolina" = "NC", "North Dakota" = "ND", 
                          "Ohio" = "OH", "Oklahoma" = "OK", "Oregon" = "OR", "Pennsylvania" = "PA", 
                          "Rhode Island" = "RI", "South Carolina" = "SC", "South Dakota" = "SD", "Tennessee" = "TN", 
                          "Texas" = "TX", "Utah" = "UT", "Vermont" = "VT", "Virginia" = "VA", "Washington" = "WA", 
                          "West Virginia" = "WV", "Wisconsin" = "WI", "Wyoming" = "WY", "District of Columbia" = "DC")

# Convert state names to their two-letter codes
nevi_clean$state <- state_names_to_codes[nevi_clean$state]

```

## Saving the cleaned NEVI dataset

```{r}
write.csv(nevi_clean, here("data_clean", "nevi_clean.csv"), row.names = FALSE)
 
```

# Merging NEVI and combined dataset

```{r}
# performing the left join
df_clean <- df_clean %>%
  left_join(nevi_clean, by = c("state", 
                               "year_month"))

df_clean <- df_clean %>%
  mutate(funding = coalesce(funding, 0))
```

## Saving the merged NEVI dataset

```{r}
write.csv(df_clean, here("data_clean", "df_clean.csv"), row.names = FALSE)
 
```

# Control variable: county population dataset

## County population for year 2010 - 2022

```{r}
# county population estimate 2010 - 2019
pop1 <- readxl::read_xlsx(here("data_raw", "pop1.xlsx"))

# county population estimate 2020 - 2022
pop2 <- readxl::read_xlsx(here("data_raw", "pop2.xlsx"))

# State name to code mapping
state_names_to_codes <- c("Alabama" = "AL", "Alaska" = "AK", "Arizona" = "AZ", "Arkansas" = "AR", 
                          "California" = "CA", "Colorado" = "CO", "Connecticut" = "CT", "Delaware" = "DE", 
                          "Florida" = "FL", "Georgia" = "GA", "Hawaii" = "HI", "Idaho" = "ID", "Illinois" = "IL", 
                          "Indiana" = "IN", "Iowa" = "IA", "Kansas" = "KS", "Kentucky" = "KY", "Louisiana" = "LA", 
                          "Maine" = "ME", "Maryland" = "MD", "Massachusetts" = "MA", "Michigan" = "MI", 
                          "Minnesota" = "MN", "Mississippi" = "MS", "Missouri" = "MO", "Montana" = "MT", 
                          "Nebraska" = "NE", "Nevada" = "NV", "New Hampshire" = "NH", "New Jersey" = "NJ", 
                          "New Mexico" = "NM", "New York" = "NY", "North Carolina" = "NC", "North Dakota" = "ND", 
                          "Ohio" = "OH", "Oklahoma" = "OK", "Oregon" = "OR", "Pennsylvania" = "PA", 
                          "Rhode Island" = "RI", "South Carolina" = "SC", "South Dakota" = "SD", "Tennessee" = "TN", 
                          "Texas" = "TX", "Utah" = "UT", "Vermont" = "VT", "Virginia" = "VA", "Washington" = "WA", 
                          "West Virginia" = "WV", "Wisconsin" = "WI", "Wyoming" = "WY", "District of Columbia" = "DC")

# Function to extract state code and clean county names
extract_and_clean <- function(df) {
  df$state <- sapply(df$county, function(county_with_state) {
    state_name <- str_extract(county_with_state, ",\\s*[^,]+$") %>% 
      sub(",\\s*", "", .) %>% 
      trimws()
    state_names_to_codes[state_name]
  })
  df$county <- sapply(df$county, function(county_with_state) {
    cleaned_name <- sub(" County.*", "", county_with_state)
    cleaned_name <- sub("^\\.", "", cleaned_name)
    str_extract(cleaned_name, "^[^,]+") %>% trimws()
  })
  df
}

# Apply transformations to both dataframes
pop1 <- extract_and_clean(pop1)
pop2 <- extract_and_clean(pop2)

# Merge the datasets by the cleaned 'county' and 'state' columns
combined_pop <- merge(pop1, pop2, by = c("county", "state"), all = TRUE)

# Check the first few rows of the combined dataset
head(combined_pop)

# save the combined_pop dataset
write.csv(combined_pop, here("data_clean", "combined_pop.csv"), row.names = FALSE)
```

## County population for year 2023 - 2024 and merge with combined pop

```{r}
# county population estimate 2023
combined_pop <- read_csv(here("data_clean", "combined_pop.csv"))

pop3_state <- readxl::read_xlsx(here("data_raw", "pop3_state.xlsx"))

# Clean the state names in pop3_state and convert them to two-letter codes
state_names_to_codes <- c("Alabama" = "AL", "Alaska" = "AK", "Arizona" = "AZ", "Arkansas" = "AR", 
                          "California" = "CA", "Colorado" = "CO", "Connecticut" = "CT", "Delaware" = "DE", 
                          "Florida" = "FL", "Georgia" = "GA", "Hawaii" = "HI", "Idaho" = "ID", 
                          "Illinois" = "IL", "Indiana" = "IN", "Iowa" = "IA", "Kansas" = "KS", 
                          "Kentucky" = "KY", "Louisiana" = "LA", "Maine" = "ME", "Maryland" = "MD", 
                          "Massachusetts" = "MA", "Michigan" = "MI", "Minnesota" = "MN", 
                          "Mississippi" = "MS", "Missouri" = "MO", "Montana" = "MT", "Nebraska" = "NE", 
                          "Nevada" = "NV", "New Hampshire" = "NH", "New Jersey" = "NJ", "New Mexico" = "NM", 
                          "New York" = "NY", "North Carolina" = "NC", "North Dakota" = "ND", "Ohio" = "OH", 
                          "Oklahoma" = "OK", "Oregon" = "OR", "Pennsylvania" = "PA", "Rhode Island" = "RI", 
                          "South Carolina" = "SC", "South Dakota" = "SD", "Tennessee" = "TN", "Texas" = "TX", 
                          "Utah" = "UT", "Vermont" = "VT", "Virginia" = "VA", "Washington" = "WA", 
                          "West Virginia" = "WV", "Wisconsin" = "WI", "Wyoming" = "WY", "District of Columbia" = "DC")

# Remove the leading period and convert state names to codes
pop3_state$state <- sub("^\\.", "", pop3_state$state)
pop3_state$state <- sapply(pop3_state$state, function(state_name) state_names_to_codes[state_name])

# Rename columns for clarity and merging
colnames(pop3_state) <- c("state", "total_state_pop_2023")

# Calculate the total 2022 state population within combined_pop
combined_pop <- combined_pop %>%
  group_by(state) %>%
  mutate(total_state_pop_2022 = sum(`2022`, na.rm = TRUE)) %>%
  ungroup()

# Calculate each county's share of the 2022 state population
combined_pop <- combined_pop %>%
  mutate(`2022_share` = (`2022` / total_state_pop_2022) * 100)

# Merge the 2023 state population data from pop3_state with combined_pop
combined_pop <- left_join(combined_pop, pop3_state, by = "state")

# Calculate the 2023 population for each county based on its 2022 share
combined_pop <- combined_pop %>%
  mutate(`2023` = (`2022_share` / 100) * total_state_pop_2023)

# Inspect the first few rows of the updated combined_pop dataframe
head(combined_pop)

# Round up 2023 to the nearest whole number
combined_pop$`2023` <- ceiling(combined_pop$`2023`)

# Duplicate the population estimate for 2023  to 2024
combined_pop$`2024` <- combined_pop$`2023`

# Subset the combined population data years

# subset the variables
pop_all <- combined_pop[, c("county",
                             "state",
                             "2010",
                             "2011",
                             "2012",
                             "2013",
                             "2014",
                             "2015",
                             "2016",
                             "2017",
                             "2018",
                             "2019",
                             "2020",
                             "2021",
                             "2022",
                             "2023",
                             "2024"
                             )]

# omitting NAs
pop_all <- na.omit(pop_all)

# saving the county population 2010 - 2024 dataset
write.csv(pop_all, here("data_clean", "pop_all.csv"), row.names = FALSE)
```

## Subset and save population data for year 2017 - 2024

```{r}
# subset the variables
pop_17_24 <- combined_pop[, c("county",
                             "state",
                             "2017",
                             "2018",
                             "2019",
                             "2020",
                             "2021",
                             "2022",
                             "2023",
                             "2024"
                             )]

# saving the county population 2017 - 2024 dataset
write.csv(pop_17_24, here("data_clean", "pop_17_24.csv"), row.names = FALSE)
```

## Convert to a long-format with clean names

```{r}
# clean the column names
pop_17_24 <- clean_names(pop_17_24)

# Renaming the columns by removing 'x'
names(pop_17_24) <- sub("^x", "", names(pop_17_24))

# Check the names to verify
names(pop_17_24)

# Convert from wide to long format
pop_clean <- pop_17_24 %>%
  pivot_longer(
    cols = `2017`:`2024`, 
    names_to = "year_month", 
    values_to = "population"
  ) %>%
  # Ensure the 'year_month' column is correctly formatted
  mutate(year_month = paste0(year_month, "-01")) # Appending "-01" to make it a consistent year-month format

# View the first few rows of the transformed data
head(pop_clean)

# saving the cleaned county population 2017 - 2024 in long format dataset
write.csv(pop_clean, here("data_clean", "pop_clean.csv"), row.names = FALSE)
```

# Merging county population and combined dataset

```{r}
# Create a new column 'year' from 'year_month'
pop_clean <- pop_clean %>%
  mutate(year = sub("-.*", "", year_month))

# Create a new column 'year' from 'year_month'
df_clean <- df_clean %>%
  mutate(year = sub("-.*", "", year_month))

# Removing year_month column
pop_clean <- pop_clean[, c("county",
                           "state",
                           "year",
                           "population"
                           )]

# saving the cleaned county population 2017 - 2024 in long format dataset
write.csv(pop_clean, here("data_clean", "pop_clean.csv"), row.names = FALSE)

# performing the left join
df_clean <- df_clean %>%
  left_join(pop_clean, by = c("county", 
                              "state", 
                              "year"
                              ))

# saving the cleaned county population 2017 - 2024 in long format dataset
write.csv(df_clean, here("data_clean", "df_clean.csv"), row.names = FALSE)

```

# Control variable: state population

```{r}
# county population estimate 2017 - 2019
pop_clean <- read_csv(here("data_clean", "pop_clean.csv"))

# omit NAs
pop_clean <- na.omit(pop_clean)

# clean names
pop_clean <- clean_names(pop_clean)

```

## Aggregate the county level to state-level

```{r}
# Removing year_month column
state_pop <- pop_clean[, c("state",
                           "year",
                           "population"
                           )]

# Aggregate the population by state and year
state_pop <- state_pop %>%
  group_by(state, year) %>%
  summarise(state_population = sum(population, na.rm = TRUE)) %>%
  ungroup()

# Check the aggregated data
head(state_pop)

```

# Merging state population and combined dataset

```{r}
# converting year variable as character
state_pop$year <- as.character(state_pop$year)

# performing the left join
df_clean <- df_clean %>%
  left_join(state_pop, by = c("state", 
                              "year"
                              ))
```

# Instrument variable: NEVI

```{r}
# Assuming state_pop is your dataframe and it's already been loaded
df_clean <- df_clean %>%
  mutate(nevi = case_when(
    year < 2022 ~ 0,
    year == 2022 ~ 1,
    year >= 2023 & year <= 2024 ~ 2
  ))

# To make 'nevi' a factor (optional), you can convert it like this:
df_clean$nevi <- as.factor(df_clean$nevi)

# Checking the dataframe
head(state_pop)

```

# Control variable: gas price dataset

```{r}
# Define the data
data <- data.frame(
  Year = c("2017", "2017", "2017", "2017", "2017", "2017", "2017", "2017", "2017", "2017", "2017", "2017",
           "2018", "2018", "2018", "2018", "2018", "2018", "2018", "2018", "2018", "2018", "2018", "2018",
           "2019", "2019", "2019", "2019", "2019", "2019", "2019", "2019", "2019", "2019", "2019", "2019",
           "2020", "2020", "2020", "2020", "2020", "2020", "2020", "2020", "2020", "2020", "2020", "2020",
           "2021", "2021", "2021", "2021", "2021", "2021", "2021", "2021", "2021", "2021", "2021", "2021",
           "2022", "2022", "2022", "2022", "2022", "2022", "2022", "2022", "2022", "2022", "2022", "2022",
           "2023", "2023", "2023", "2023", "2023", "2023", "2023", "2023", "2023", "2023", "2023", "2023",
           "2024", "2024"),
  Month = c("01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12",
            "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12",
            "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12",
            "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12",
            "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12",
            "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12",
            "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12",
            "01", "02"),
  Gas_Price = c(2.458, 2.416, 2.437, 2.528, 2.503, 2.460, 2.414, 2.494, 2.761, 2.621, 2.678, 2.594,
                2.671, 2.705, 2.709, 2.873, 2.987, 2.970, 2.928, 2.914, 2.915, 2.943, 2.736, 2.457,
                2.338, 2.393, 2.594, 2.881, 2.946, 2.804, 2.823, 2.707, 2.681, 2.724, 2.693, 2.645,
                2.636, 2.533, 2.329, 1.938, 1.961, 2.170, 2.272, 2.272, 2.274, 2.248, 2.200, 2.284,
                2.420, 2.587, 2.898, 2.948, 3.076, 3.157, 3.231, 3.255, 3.272, 3.384, 3.491, 3.406,
                3.413, 3.611, 4.322, 4.213, 4.545, 5.032, 4.668, 4.087, 3.817, 3.935, 3.799, 3.324,
                3.445, 3.501, 3.535, 3.711, 3.666, 3.684, 3.712, 3.954, 3.958, 3.742, 3.443, 3.257,
                3.197, 3.328)
)

# Combine 'Year' and 'Month' columns to create 'year_month' column
data$year_month <- paste(data$Year, data$Month, sep = "-")

# Select data from January 2017 until February 2024
start_date <- "2017-01"
end_date <- "2024-02"
data_selected <- data[data$year_month >= start_date & data$year_month <= end_date, ]

# Create the final data frame
gas_clean <- data.frame(year_month = data_selected$year_month, Gas_Price = data_selected$Gas_Price)

gas_clean <- clean_names(gas_clean)

# Write the data frame to a CSV file
write.csv(gas_clean, here("data_clean", "gas_clean.csv"), row.names = FALSE)
```

# Merging gas and combined dataseet

```{r}
# performing the left join
df_clean <- df_clean %>%
  left_join(gas_clean, by = c("year_month"
                              ))
```

# Saving the final dataset

```{r}
# save the combined_pop dataset
write.csv(df_clean, here("data_clean", "df_final.csv"), row.names = FALSE)
```