analysis/part2_estimation.qmd

---
title: "Part II — Data Analysis & Causal Inference"
subtitle: "GPCO 468 Capstone Project"
author: "Putra Farrel Azhar"
format: html
editor: visual
out: html
---

# Loading packages

```{r message = FALSE}
rm(list = ls())
library(tidyverse)
library(janitor)
library(fixest)
library(didimputation)
library(did)
library(knitr)
library(here)
library(DT)
library(plm)
library(lfe)
library(stargazer)
library(ivreg)
library(ggplot2)
library(ggthemes)
library(didimputation)
library(pwr)        
library(WebPower)   
library(ICC)        
library(fishmethods)
library(parameters)
library(clubSandwich)
library(pdftools)
library(tidygeocoder)
library(tigris)
library(lubridate)
library(extrafont)
library(expss)
library(htmltools)
library(webshot)
library(sf)
library(tigris)
set.seed(0000)
```

# Loading datasets

```{r message = FALSE}
# Read both cleaned CSVs from data_clean/ and normalise their column names
# with clean_names(), one pipeline per dataset.

# Final panel dataset (the estimation sample used by the models)
df_final <- here("data_clean", "df_final.csv") %>%
  read_csv() %>%
  clean_names()

# EV clean dataset (aggregated by year_month for the growth plot)
ev_clean <- here("data_clean", "ev_clean.csv") %>%
  read_csv() %>%
  clean_names()

```

# Additional cleaning & wrangling

```{r}
# Build 6-period lags of both outcome variables within each (state, county)
# series. Rows are sorted by year_month inside each group first because the
# lag shifts by row position, not by calendar time.
#
# dplyr::lag() is namespaced on purpose: this document attaches many packages
# after the tidyverse, and a bare lag() that resolved to stats::lag() would
# return a plain vector unshifted, silently producing wrong lag columns.
#
# NOTE(review): a 6-row shift equals 6 months only if every (state, county)
# group holds exactly one row per consecutive month. The models also use a
# `type` column; if the panel has one row per type, `type` belongs in
# group_by() as well — verify against the data.
# NOTE(review): bev_lag6 / phev_lag6 hold the outcome from 6 rows EARLIER and
# are later used as dependent variables against the current public_station.
# If the intent is "stations now -> uptake 6 months later", lag the regressor
# (or lead the outcome) instead — confirm the intended timing.
df_final <- df_final %>%
  arrange(state, county, year_month) %>%
  group_by(state, county) %>%
  mutate(
    bev_lag6 = dplyr::lag(bev, n = 6),
    phev_lag6 = dplyr::lag(phev, n = 6)
  ) %>%
  ungroup()

# Tabulate the number of missing values per column before any rows are dropped
na_df_final <- colSums(is.na(df_final))
na_df_final_df <- data.frame(Column = names(na_df_final), NA_Count = na_df_final)
print(na_df_final_df)

# Keep complete cases only. This drops every row with an NA in ANY column,
# which includes the first 6 rows of each group where the lags are undefined.
df_final <- na.omit(df_final)

```

# BEVs and PHEVs growth plot

```{r, fig.width=8, fig.height=6.5}
# Figure 1: totals of BEV and PHEV counts per year_month, with the two NEVI
# funding rounds marked by vertical lines.

# Sum BEV and PHEV counts over all rows of each year_month
ev_aggregated <- ev_clean %>%
  group_by(year_month) %>%
  summarise(
    BEV = sum(bev, na.rm = TRUE),
    PHEV = sum(phev, na.rm = TRUE)
  )

# Convert year_month to a Date pinned to the first day of the month
# (assumes year_month is stored as "YYYY-MM" — TODO confirm)
ev_aggregated$year_month <- as.Date(paste0(ev_aggregated$year_month, "-01"))

# Values reused for the axis breaks and for placing the round labels
bev_peak <- max(ev_aggregated$BEV)
phev_peak <- max(ev_aggregated$PHEV)
nevi_fy22 <- as.Date("2022-01-01")
nevi_fy23 <- as.Date("2023-01-01")
plot_font <- "Times New Roman"

# Line plot of both series
p1 <- ggplot(ev_aggregated, aes(x = year_month)) +
  geom_line(aes(y = BEV, color = "Battery electric (BEV)")) +
  geom_line(aes(y = PHEV, color = "Plug-in hybrid (PHEV)"), linetype = "dashed") +
  scale_color_manual(
    values = c(
      "Battery electric (BEV)" = "darkblue",
      "Plug-in hybrid (PHEV)" = "darkred"
    )
  ) +
  labs(
    x = "Date",
    y = "Number of DOL Registration Grants",
    caption = "Figure 1: Temporal Growth of EV Adoption. Data Source: Washington State Department of Licensing (DOL), Feb. 2024."
  ) +
  theme_stata(scheme = "s1mono") +
  theme(
    legend.title = element_blank(),
    legend.position = "bottom",
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 14),
    plot.caption = element_text(hjust = 0.5, size = 10),
    legend.text = element_text(size = 12)
  ) +
  scale_x_date(date_breaks = "6 months", date_labels = "%b %Y") +
  scale_y_continuous(
    breaks = seq(0, max(bev_peak, phev_peak), by = 25000),
    labels = scales::comma
  ) +
  # Vertical markers for the FY22 and FY23 rounds
  geom_vline(xintercept = as.numeric(nevi_fy22), linetype = "longdash", color = "black") +
  geom_vline(xintercept = as.numeric(nevi_fy23), linetype = "longdash", color = "black") +
  # annotate() draws each label exactly once; geom_text() with constant
  # aesthetics would redraw the label for every row of ev_aggregated.
  annotate(
    "text",
    x = nevi_fy22, y = bev_peak * 0.95, label = "NEVI FY22",
    angle = 90, vjust = 1.5, size = 3.5, family = plot_font
  ) +
  annotate(
    "text",
    x = nevi_fy23, y = phev_peak * 0.95, label = "NEVI FY23",
    angle = 90, vjust = 1.5, size = 3.5, family = plot_font
  )

# Apply the font settings to p1 itself so that the figure written by ggsave()
# is the same one that is printed in the document.
p1 <- p1 + theme(
  text = element_text(family = plot_font),
  axis.title.x = element_text(margin = margin(t = 10), family = plot_font),
  axis.title.y = element_text(margin = margin(r = 10), family = plot_font),
  plot.caption = element_text(hjust = 0.5, size = 10, family = plot_font),
  legend.text = element_text(size = 12, family = plot_font)
)

# Print the plot
p1

# Save the same plot to plots/figure1.png
ggsave(here("plots", "figure1.png"), p1, width = 10, height = 8, dpi = 500)
```

# Summary statistic

```{r}
#| results: asis
# Table 1: summary statistics of the estimation sample, rendered by stargazer.
# The chunk option above is needed because stargazer writes raw HTML to the
# output; without `results: asis` knitr shows that markup as verbatim text
# instead of a table.

# Columns reported in the table, in the same order as covariate.labels below
summary_vars <- c(
  "bev", "phev", "non_ev", "public_station", "funding",
  "population", "state_population", "nevi", "gas_price"
)
df_final_sum <- df_final[, summary_vars]

# Copy into a base data.frame before handing it to stargazer
sum_stat <- data.frame(df_final_sum)

# The "n" statistic already reports the number of observations per variable,
# so no separate observation-count line is added.
# NOTE(review): stargazer documents column.labels for regression tables —
# confirm it has any effect on a summary-statistics table.
stargazer(
  sum_stat,
  type = "html",
  summary.stat = c("mean", "sd", "min", "max", "n"),
  title = "Table 1. Summary statistics of the estimation sample",
  digits = 3,
  style = "qje",
  column.labels = c("Mean", "St. Dev.", "Min", "Max", "Obv"),
  covariate.labels = c(
    "Uptake of battery electric vehicles (#)",
    "Uptake of plug-in hybrids (#)",
    "Uptake of non-electric vehicles (#)",
    "Public Charging Stations (#)",
    "NEVI fundings ($ amount)",
    "County level population (#)",
    "State level population (#)",
    "NEVI funding round",
    "Avg. retail gasoline price ($ per gallon)"
  ),
  notes = "Note: NEVI fundings are distributed to each state at the beginning of year 2022 and 2023."
)

```

# Fixest dictionary

```{r}
# Display labels that etable() substitutes for raw variable names.
ev_dict <- c(
  # Outcomes
  bev = "BEVs",
  phev = "PHEVs",
  # Regressors and controls
  non_ev = "Non-EVs",
  public_station = "Public charging station",
  funding = "NEVI funding ($)",
  population = "County populations",
  state_population = "State populations",
  nevi = "NEVI round",
  gas_price = "Avg. retail gas price ($)",
  # presumably the coefficient name produced by the `type` control — verify
  typeTruck = "Type: Truck",
  # Fixed-effect identifiers
  year_month = "Year-month FE",
  state = "State FE",
  # The lagged outcomes deliberately share the labels of the unlagged ones
  bev_lag6 = "BEVs",
  phev_lag6 = "PHEVs",
  # Fitted regressor created in the IV chunk
  public_station_hat1 = "Public charging station (fitted)"
)

setFixest_dict(ev_dict)
```

# FE models

```{r}
# Two-way fixed-effects regressions (state and year-month FE) of EV uptake on
# the number of public charging stations. c(bev, phev) makes feols() estimate
# one model per outcome, which is why every header below spans 2 columns.

# m1: FE w/o control variables
m1 <- feols(
  c(bev, phev) ~ public_station | state + year_month,
  data = df_final
)

# m2: FE w/ control variables
m2 <- feols(
  c(bev, phev) ~ public_station + non_ev + type + gas_price + state_population |
    state + year_month,
  data = df_final
)

# m3: FE w/o control variables, 6-period lagged outcomes
# NOTE(review): m3 is not used by any table in this chunk.
m3 <- feols(
  c(bev_lag6, phev_lag6) ~ public_station | state + year_month,
  data = df_final
)

# m4: FE w/ control variables, 6-period lagged outcomes
m4 <- feols(
  c(bev_lag6, phev_lag6) ~ public_station + non_ev + type + gas_price + state_population |
    state + year_month,
  data = df_final
)

# l1: same controls as m2 plus county population
# (shown under the "reduced n-obv" header in the first table)
l1 <- feols(
  c(bev, phev) ~ public_station + non_ev + type + gas_price + state_population + population |
    state + year_month,
  data = df_final
)

# Table: no controls vs. controls vs. controls + county population
e1 <- etable(
  m1, m2, l1,
  view = TRUE,
  notes = c(
    "Table 1: Fixed effect models on the uptake of EVs",
    "Notes: Avg. retail gas price ($) was removed due to colinearity"
  ),
  headers = list(":_:" = list(
    "FE (w/o controls)" = 2,
    "FE (w/ controls)" = 2,
    "FE (w/ reduced n-obv)" = 2
  ))
)

# Table: contemporaneous vs. 6-month-lagged outcomes, both with controls
e2 <- etable(
  m2, m4,
  view = TRUE,
  notes = c(
    "Table 2: Lagged fixed effect models on the uptake of EVs",
    "Notes: Avg. retail gas price ($) was removed due to colinearity"
  ),
  headers = list(":_:" = list(
    "FE (w/o lag)" = 2,
    "FE (w/ 6-month lag)" = 2
  ))
)

# Combined table: no controls, controls, and controls with lagged outcomes
# NOTE(review): this table carries the same "Table 1" title as e1 — confirm
# which of the two is meant to be reported.
c2 <- etable(
  m1, m2, m4,
  view = TRUE,
  notes = c(
    "Table 1: Fixed effect models on the uptake of EVs",
    "Notes: Avg. retail gas price ($) was removed due to colinearity"
  ),
  headers = list(":_:" = list(
    "FE (w/o controls)" = 2,
    "FE (w/ controls)" = 2,
    "FE (w/ 6-month lag)" = 2
  ))
)

```

# IV: Funding models

```{r}
# Two-stage least squares carried out by hand, with NEVI funding as the
# instrument for the number of public charging stations.
# NOTE(review): feeding first-stage fitted values into a second feols() call
# gives the 2SLS point estimates, but the second-stage standard errors do not
# account for the first stage having been estimated. fixest's built-in IV
# formula (y ~ controls | FE | endogenous ~ instrument) does — consider
# switching before reporting inference from these tables.

# First stage: treatment on the instrument and the control variables
v1.1 <- feols(
  public_station ~ funding + non_ev + type + gas_price + state_population |
    state + year_month,
  data = df_final
)

# Fitted treatment values from the first stage, stored next to the data.
# The assignment needs one fitted value per row of df_final.
df_final$public_station_hat1 <- predict(v1.1)

# Second stage: outcomes on the fitted treatment and the control variables
v1.2 <- feols(
  c(bev, phev) ~ public_station_hat1 + non_ev + type + gas_price + state_population |
    state + year_month,
  data = df_final
)

# Second stage with the 6-period lagged outcomes
# NOTE(review): gas_price is in the first stage and in v1.2 but not here; the
# table note says it is dropped for collinearity, in which case the omission
# is harmless — confirm.
v1.2lag <- feols(
  c(bev_lag6, phev_lag6) ~ public_station_hat1 + non_ev + type + state_population |
    state + year_month,
  data = df_final
)

# One first-stage column, then two columns per second-stage specification
etable(
  v1.1, v1.2, v1.2lag,
  view = TRUE,
  notes = c(
    "Table 3: Two-stage least squares (2SLS) models on the uptake of EVs",
    "Notes: Avg. retail gas price ($) was removed due to colinearity"
  ),
  headers = list(":_:" = list(
    "First stage" = 1,
    "Second stage" = 2,
    "Second stage (w/ 6-month lag)" = 2
  ))
)

# Reduced form: lagged outcomes directly on the instrument, no controls or FE
v2 <- feols(c(bev_lag6, phev_lag6) ~ funding, data = df_final)

# v2 holds two models (bev_lag6 and phev_lag6), so the header spans 2 columns
etable(
  v2,
  view = TRUE,
  notes = c(
    "Table 4: Reduced-form results",
    "Notes: Correlation of funding (IV) w/o controls"
  ),
  headers = list(":_:" = list("Reduced Form" = 2))
)
```

# Power plots

```{r}
# Power calculation for a two-sided, two-sample t-test. The effect size d is
# left NULL so that pwr.t.test() solves for it from n, the significance level
# and the target power; the result is then plotted.
# NOTE(review): n = 18643 is hard-coded, and pwr.t.test() documents n as the
# number of observations per sample — confirm this is the intended figure.
pwr_1 <- pwr.t.test(
  n = 18643,
  d = NULL,
  sig.level = 0.05,
  power = 0.8,
  type = "two.sample",
  alternative = "two.sided"
)

plot(pwr_1)

```