Skip to content

Commit

Permalink
update with ggplot
Browse files Browse the repository at this point in the history
  • Loading branch information
dalejn committed Oct 18, 2020
1 parent 728365e commit fde08d3
Showing 1 changed file with 81 additions and 3 deletions.
84 changes: 81 additions & 3 deletions cleanBib.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -680,7 +680,7 @@
"hispanic = [10]\n",
"print ('looping through your references, predicting gender and race')\n",
"\n",
"columns=['Reference Key','Author','Gender','W','A']\n",
"columns=['CitationKey','Author','Gender','W','A', 'GendCat']\n",
"paper_df = pd.DataFrame(columns=columns)\n",
"\n",
"gender = []\n",
Expand Down Expand Up @@ -765,9 +765,9 @@
"\tif la_gender['gender'] == 'unknown':\n",
"\t\tla_g = gb[2:] \n",
"\t\n",
"\tfa_data = np.array([paper,'%s,%s'%(fa_fname,fa_lname),'%s,%s'%(fa_gender['gender'],fa_gender['accuracy']),fa_race[0],np.sum(fa_race[1:])]).reshape(1,5)\n",
"\tfa_data = np.array([paper,'%s,%s'%(fa_fname,fa_lname),'%s,%s'%(fa_gender['gender'],fa_gender['accuracy']),fa_race[0],np.sum(fa_race[1:]), '']).reshape(1,6)\n",
"\tpaper_df = paper_df.append(pd.DataFrame(fa_data,columns=columns),ignore_index =True)\n",
"\tla_data = np.array([paper,'%s,%s'%(la_fname,la_lname),'%s,%s'%(la_gender['gender'],la_gender['accuracy']),la_race[0],np.sum(la_race[1:])]).reshape(1,5)\n",
"\tla_data = np.array([paper,'%s,%s'%(la_fname,la_lname),'%s,%s'%(la_gender['gender'],la_gender['accuracy']),la_race[0],np.sum(la_race[1:]), '%s%s' % (fa_gender['gender'], la_gender['gender'])]).reshape(1,6)\n",
"\tpaper_df = paper_df.append(pd.DataFrame(la_data,columns=columns),ignore_index =True)\n",
"\n",
"\tmm = fa_g[0]*la_g[0]\n",
Expand Down Expand Up @@ -887,6 +887,84 @@
"paper_df.to_csv('/home/jovyan/predictions.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"kernel": "R"
},
"outputs": [],
"source": [
"# Plot histograms of the predicted author-gender categories of the cited papers #\n",
"library(ggplot2)\n",
"\n",
"names <- read.csv('/home/jovyan/predictions.csv', header = TRUE, stringsAsFactors = FALSE)\n",
"\n",
"# Recode the concatenated first/last author predictions (e.g., 'malefemale') into two-letter codes (e.g., 'MW').\n",
"# 'female' must be replaced before 'male' because 'male' is a substring of 'female'.\n",
"names$GendCat <- as.character(names$GendCat)\n",
"names$GendCat <- gsub('female', 'W', names$GendCat, fixed = TRUE)\n",
"names$GendCat <- gsub('male', 'M', names$GendCat, fixed = TRUE)\n",
"names$GendCat <- gsub('unknown', 'U', names$GendCat, fixed = TRUE)\n",
"\n",
"# predictions.csv holds two rows per cited paper (first author, then last author) and only the last-author row\n",
"# carries a gender category, so one citation == one row with a non-empty GendCat. Counting every row of the\n",
"# file instead would roughly halve all of the percentages computed below.\n",
"cited_cats <- names$GendCat[!is.na(names$GendCat) & nzchar(names$GendCat)]\n",
"total_citations <- length(cited_cats)\n",
"if (total_citations == 0) {\n",
"  stop('No gender categories found in /home/jovyan/predictions.csv', call. = FALSE)\n",
"}\n",
"\n",
"# Display order of the categories; categories outside this list are sorted to the end\n",
"category_order <- c('WW', 'MW', 'WM', 'MM', 'UW', 'UM', 'WU', 'MU', 'UU')\n",
"gend_cats <- unique(cited_cats) # get a vector of all the gender categories in your paper\n",
"gend_cats <- gend_cats[order(match(gend_cats, category_order))]\n",
"\n",
"# One row per gender category: the category (e.g., WW, MM), the number of citations in that category, and the\n",
"# percentage (number of citations in the category divided by total number of citations * 100)\n",
"number_per_cat <- tabulate(match(cited_cats, gend_cats), nbins = length(gend_cats))\n",
"dat_for_plot <- data.frame(gender_category = gend_cats,\n",
"                           number = number_per_cat,\n",
"                           percentage = number_per_cat / total_citations * 100,\n",
"                           stringsAsFactors = FALSE)\n",
"\n",
"\n",
"# Create a data frame with only the WW, MW, WM, MM categories and their base rates - to plot percent citations relative to benchmarks.\n",
"# Base rates are looked up per category rather than assigned by row position, so they stay attached to the right\n",
"# category whatever order the categories appear in the data. A benchmark category that was never cited is kept with 0 citations.\n",
"# NOTE(review): the rates are taken to be in WW, MW, WM, MM order, matching the order in which these categories were listed - confirm.\n",
"benchmark_cats <- c('WW', 'MW', 'WM', 'MM')\n",
"benchmark_rates <- c(6.7, 9.4, 25.5, 58.4)\n",
"benchmark_rows <- match(benchmark_cats, dat_for_plot$gender_category)\n",
"dat_for_baserate_plot <- data.frame(gender_category = benchmark_cats,\n",
"                                    number = dat_for_plot$number[benchmark_rows],\n",
"                                    percentage = dat_for_plot$percentage[benchmark_rows],\n",
"                                    baserate = benchmark_rates,\n",
"                                    stringsAsFactors = FALSE)\n",
"not_cited <- is.na(benchmark_rows)\n",
"dat_for_baserate_plot$number[not_cited] <- 0L\n",
"dat_for_baserate_plot$percentage[not_cited] <- 0\n",
"dat_for_baserate_plot$citation_rel_to_baserate <- dat_for_baserate_plot$percentage - dat_for_baserate_plot$baserate\n",
"\n",
"\n",
"# Plot the Histogram of Number of Papers per category against predicted gender category #\n",
"# (theme_classic() is a complete theme and replaces any theme settings added before it, so it is the only theme call needed)\n",
"dat_for_plot$gender_category <- factor(dat_for_plot$gender_category, levels = dat_for_plot$gender_category)\n",
"ggplot(dat_for_plot, aes(x = gender_category, y = number, fill = gender_category)) +\n",
"  geom_bar(stat = 'identity', width = 0.75, na.rm = TRUE, show.legend = TRUE) +\n",
"  scale_x_discrete(limits = category_order) +\n",
"  geom_text(aes(label = number), vjust = -0.3, color = 'black', size = 2.5) +\n",
"  xlab('Predicted gender category') + ylab('Number of papers') + ggtitle('') +\n",
"  theme_classic(base_size = 15)\n",
"\n",
"\n",
"# Plot the Histogram of % citations relative to benchmarks against predicted gender category\n",
"ggplot(dat_for_baserate_plot, aes(x = gender_category, y = citation_rel_to_baserate, fill = gender_category)) +\n",
"  geom_bar(stat = 'identity', width = 0.75, na.rm = TRUE, show.legend = TRUE) +\n",
"  scale_x_discrete(limits = benchmark_cats) +\n",
"  geom_text(aes(label = round(citation_rel_to_baserate, digits = 2)), vjust = -0.3, color = 'black', size = 2.5) +\n",
"  xlab('Predicted gender category') + ylab('% of citations relative to benchmarks') + ggtitle('') +\n",
"  theme_classic(base_size = 15)"
]
},
{
"cell_type": "markdown",
"metadata": {
Expand Down

0 comments on commit fde08d3

Please sign in to comment.