14.us-name-origin.Rmd

---
title: "Representation analysis of name origin in the US"
---

```{r setup, include=FALSE}
# tidyverse supplies the dplyr/tidyr/ggplot2/forcats verbs used throughout;
# lubridate supplies year().
library(tidyverse)
library(lubridate)
# Project-local helpers. Presumably this defines region_breakdown(),
# gam_and_ci() and recode_region() used below -- not visible here, confirm.
source("utils/r-utils.R")
# Default ggplot2 theme for every plot in this notebook: black-and-white, no legend titles.
theme_set(theme_bw() + theme(legend.title = element_blank()))
```

Only keep articles from 2002 onward because few authors had nationality predictions before 2002 (mostly due to missing metadata).
See [093.summary-stats](093.summary-stats.html) for more details.

```{r}
# Load the saved workspace. Presumably it provides corr_authors, keynotes and
# nationalize_df used below -- TODO confirm against the script that writes it.
load("Rdata/raws.Rdata")

# Two-sided 95% critical value of the standard normal (about 1.96); multiplied
# by a standard error it gives a margin of error.
alpha_threshold <- qnorm(0.975)

# One row per (pmid, journal, publication_date, year, adjusted_citations) for
# US-affiliated entries from 2002 onward, holding the mean of each name-origin
# probability column across the joined author rows.
pubmed_nat_df <- corr_authors %>%
  filter(year(year) >= 2002) %>%
  # `countries` is comma-separated: expand to one row per country, keep "US".
  separate_rows(countries, sep = ",") %>%
  filter(countries == "US") %>%
  left_join(nationalize_df, by = c("fore_name", "last_name")) %>%
  group_by(pmid, journal, publication_date, year, adjusted_citations) %>%
  # Rows without a match in nationalize_df are NA after the join; na.rm skips
  # them. `.groups = "drop"` returns an ungrouped tibble.
  summarise(
    across(African:SouthAsian, ~ mean(.x, na.rm = TRUE)),
    .groups = "drop"
  )

# US-affiliated rows of `keynotes`, joined to the name-origin predictions.
# `afflcountries` is "|"-delimited, so it is expanded to one row per country
# before keeping "United States".
iscb_nat_df <- left_join(
  keynotes %>%
    separate_rows(afflcountries, sep = "\\|") %>%
    filter(afflcountries == "United States"),
  nationalize_df,
  by = c("fore_name", "last_name")
)

# Year range passed to the plotting helpers below.
start_year <- 1992
end_year <- 2019
n_years <- end_year - start_year

# Distinct journals and conferences in the US subsets, and how many of each.
my_jours <- unique(pubmed_nat_df$journal)
my_confs <- unique(iscb_nat_df$conference)
n_jours <- length(my_jours)
n_confs <- length(my_confs)

# Display labels for the ten name-origin groups ("<group> names").
region_levels <- paste0(
  c(
    "Celtic/English", "European", "East Asian", "Hispanic", "South Asian",
    "Arabic", "Hebrew", "African", "Nordic", "Greek"
  ),
  " names"
)

# Ten fill colours; presumably paired positionally with region_levels by the
# plotting helpers -- confirm in utils/r-utils.R.
region_cols <- c(
  "#ffffb3", "#fccde5", "#b3de69", "#fdb462", "#80b1d3",
  "#8dd3c7", "#bebada", "#fb8072", "#bc80bd", "#ccebc5"
)
```

## Organize data

Prepare data frames for later analyses:

- row-bind the name origin predictions for ISCB honorees and PubMed authors
- pivot long
- compute mean, standard error, and margin of error

```{r}
# Stack the honoree rows and the PubMed rows, then reshape to long form with
# one predicted probability per (row, region).
iscb_pubmed_oth <- bind_rows(
  # Honorees: `conference` is renamed to `journal` so both sources share a
  # column, and adjusted_citations is fixed at 1.
  iscb_nat_df %>%
    rename(journal = conference) %>%
    select(year, journal, African:SouthAsian, publication_date) %>%
    mutate(type = "Keynote speakers/Fellows", adjusted_citations = 1),
  pubmed_nat_df %>%
    select(year, journal, African:SouthAsian, publication_date, adjusted_citations) %>%
    mutate(type = "Pubmed authors")
) %>%
  # Extra pooled column: the sum of the seven groups listed here.
  mutate(OtherCategories = SouthAsian + Hispanic + Jewish + Muslim + Nordic + Greek + African) %>%
  pivot_longer(
    cols = c(African:SouthAsian, OtherCategories),
    names_to = "region",
    values_to = "probabilities"
  ) %>%
  filter(!is.na(probabilities)) %>%
  # The result stays grouped; the summarise() that follows relies on it.
  group_by(type, year, region)

# Per (type, year, region) group of iscb_pubmed_oth: mean probability, its
# standard error, and the margin of error (critical value times the SE).
iscb_pubmed_sum_oth <- summarise(
  iscb_pubmed_oth,
  mean_prob = mean(probabilities),
  se_prob = sd(probabilities) / sqrt(n()),
  me_prob = alpha_threshold * se_prob,
  .groups = "drop"
)

# Same summary without the pooled "OtherCategories" rows.
iscb_pubmed_sum <- filter(iscb_pubmed_sum_oth, region != "OtherCategories")
```

## Figures for paper

```{r fig.height=7, fig.width=9, warning=FALSE}
# Panel A of the combined figure. region_breakdown() comes from
# utils/r-utils.R; the meaning of its arguments is not visible from this file.
fig_us_name_origina <- iscb_pubmed_sum %>%
  # Keep rows dated before 2020 (`year` is compared against a date string).
  filter(year < "2020-01-01") %>%
  region_breakdown("main", region_levels, fct_rev(type)) +
  # Lay the fill legend out over two rows.
  guides(fill = guide_legend(nrow = 2))

# Regions shown in panel B and tested below, as raw `region` codes (the values
# produced by pivot_longer, before recode_region() is applied).
large_regions <- c("CelticEnglish", "EastAsian", "European", "OtherCategories")

## Mean and standard deviation of predicted probabilities:
# Panel B: summary rows and row-level observations for large_regions are both
# passed to gam_and_ci() (from utils/r-utils.R, not visible here).
fig_us_name_originb <- iscb_pubmed_sum_oth %>%
  filter(region %in% large_regions) %>%
  recode_region() %>%
  gam_and_ci(
    # Row-level data restricted and recoded the same way as the summary.
    df2 = iscb_pubmed_oth %>%
      filter(region %in% large_regions) %>%
      recode_region(),
    start_y = start_year, end_y = end_year
  ) +
  theme(
    # Legend placed inside the plotting area (relative coordinates).
    legend.position = c(0.88, 0.83),
    panel.grid.minor = element_blank(),
    legend.margin = margin(-0.5, 0, 0, 0, unit = "cm"),
    legend.text = element_text(size = 6)
  ) +
  # NOTE(review): facets are releveled with the raw codes in large_regions
  # although `region` has been through recode_region(); if that helper renames
  # the values, these levels will not match -- confirm.
  facet_wrap(vars(fct_relevel(region, large_regions)), nrow = 1)

# Stack panels A and B in one column (auto-labelled), giving the top panel
# 1.3x the height of the bottom one.
fig_us_name_origin <- cowplot::plot_grid(
  fig_us_name_origina,
  fig_us_name_originb,
  labels = "AUTO",
  ncol = 1,
  rel_heights = c(1.3, 1)
)
fig_us_name_origin

# Save as a 600-dpi PNG and as an SVG with the same width and height.
ggsave("figs/us_name_origin.png", plot = fig_us_name_origin, width = 6.5, height = 5.5, dpi = 600)
ggsave("figs/us_name_origin.svg", plot = fig_us_name_origin, width = 6.5, height = 5.5)
```

## Hypothesis testing

```{r}
# Modelling copy of the long table: grouping removed and "Pubmed authors" set
# as the reference level of `type`, so the binomial models below describe the
# odds of the other level ("Keynote speakers/Fellows").
iscb_lm <- iscb_pubmed_oth %>%
  ungroup() %>%
  mutate(
    # year = c(scale(year)),
    # year = as.factor(year),
    type = relevel(as.factor(type), ref = "Pubmed authors")
  )

# Fit a binomial GLM on the rows of iscb_lm for one region.
#   model_formula: formula with `type` as the response.
#   regioni: value of `region` to keep (e.g. "EastAsian").
# Rows with missing probabilities or dated before 2002 are excluded.
fit_region_glm <- function(model_formula, regioni) {
  glm(
    model_formula,
    data = iscb_lm %>%
      filter(region == regioni, !is.na(probabilities), year(year) >= 2002),
    family = "binomial"
  )
}

# Main-effects model: type ~ year + probabilities.
main_lm <- function(regioni) {
  fit_region_glm(type ~ year + probabilities, regioni)
}

# Model with the probabilities-by-year interaction added.
inte_lm <- function(regioni) {
  fit_region_glm(type ~ probabilities * year, regioni)
}

# One fitted model per region. Both lists are named by region so the printed
# coefficient tables are labelled; positional indexing still works.
main_list <- setNames(lapply(large_regions, main_lm), large_regions)
lapply(main_list, broom::tidy)

inte_list <- setNames(lapply(large_regions, inte_lm), large_regions)
lapply(inte_list, broom::tidy)

# Chi-squared test of the interaction model against the main-effects model,
# for every region rather than a hard-coded count.
for (regioni in large_regions) {
  print(anova(main_list[[regioni]], inte_list[[regioni]], test = "Chisq"))
}
```
Interaction terms do not predict `type` over and above the main effect of name origin probability and year (_p_ > 0.01).

```{r echo = F}
# Pull one statistic for the `probabilities` term of a main-effects model.
#   i: index into main_list.
#   colu: name of a column of the broom::tidy() table (e.g. "estimate", "p.value").
# Returns the value(s) of that column on the row(s) where term == "probabilities".
get_exp <- function(i, colu) {
  coef_tbl <- broom::tidy(main_list[[i]])
  coef_tbl[[colu]][coef_tbl$term == "probabilities"]
}

# Format numbers with up to five significant digits ("%g" style) for inline text.
print_p <- function(x) {
  sprintf("%0.5g", x)
}
```

## Conclusion

An East Asian name has `r exp(get_exp(2, 'estimate'))` times the odds of being selected as an honoree, significantly lower compared to other names ($\beta_\textrm{East Asian} =$ `r print_p(get_exp(2, 'estimate'))`, _P_ = `r print_p(get_exp(2, 'p.value'))`).
The two groups of scientists did not have a significant association with names predicted to be Celtic/English (_P_ = `r print_p(get_exp(1, 'p.value'))`), European (_P_ = `r print_p(get_exp(3, 'p.value'))`), or in Other categories (_P_ = `r print_p(get_exp(4, 'p.value'))`).

## Supplement

### Supplementary Figure S7 {#sup_fig_s7}
It's difficult to come to a conclusion for other regions with so few data points and the imperfect accuracy of our prediction.
There seems to be little difference between the proportion of keynote speakers of African, Arabic, South Asian and Hispanic origin and the corresponding proportion in the field.
However, just because a nationality isn't underrepresented against the field doesn't mean scientists from that nationality are appropriately represented.

```{r fig.height=6, warning=FALSE}
# Row-level observations for the individual regions (pooled "OtherCategories"
# excluded), passed through recode_region() -- presumably to map raw codes to
# display labels; the helper is not visible here.
df2 <- recode_region(
  filter(iscb_pubmed_oth, region != "OtherCategories")
)

# One facet per region, three per row, ordered by region_levels.
fig_s7 <- gam_and_ci(
  recode_region(iscb_pubmed_sum),
  df2 = df2,
  start_y = start_year, end_y = end_year
) +
  theme(legend.position = c(0.8, 0.1)) +
  facet_wrap(vars(fct_relevel(region, region_levels)), ncol = 3)

fig_s7

# Save the supplementary figure as PNG and SVG at 6 x 6.
ggsave("figs/fig_s7.png", plot = fig_s7, width = 6, height = 6)
ggsave("figs/fig_s7.svg", plot = fig_s7, width = 6, height = 6)
```


```{r}
# Print the R version and attached/loaded package versions for reproducibility.
sessionInfo()
```