diff --git a/_freeze/chapters/evaluation_data/execute-results/html.json b/_freeze/chapters/evaluation_data/execute-results/html.json index 99b9010..1677ea7 100644 --- a/_freeze/chapters/evaluation_data/execute-results/html.json +++ b/_freeze/chapters/evaluation_data/execute-results/html.json @@ -1,7 +1,7 @@ { - "hash": "b9cd5e9fb7889a16ccdf9ec951bb5e0d", + "hash": "a4e7b5185bb73b611a1a82aea9bff999", "result": { - "markdown": "# Evaluation data: description, exploration, checks\n\n\n::: {.cell}\n\n```{.r .cell-code code-summary=\"load packages\"}\nsource(here::here(\"code\", \"shared_packages_code.R\"))\n\n#devtools::install_github(\"rethinkpriorities/rp-r-package\")\nlibrary(rethinkpriorities)\n\n#devtools::install_github(\"rethinkpriorities/r-noodling-package\") #mainly used playing in real time\nlibrary(rnoodling)\n\nlibrary(here)\nlibrary(dplyr)\nlibrary(pacman)\n\np_load(formattable, sparkline, install=FALSE)\n\np_load(DT, santoku, lme4, huxtable, janitor, emmeans, sjPlot, sjmisc, ggeffects, ggrepel, likert, labelled, plotly, stringr, install=FALSE)\n\np_load(ggthemes, paletteer, ggridges, install=FALSE)\n\nselect <- dplyr::select \n\noptions(knitr.duplicate.label = \"allow\")\n\noptions(mc.cores = parallel::detectCores())\n#rstan_options(auto_write = TRUE)\n\n#library(hunspell)\n\n#(brms)\n\n#devtools::install_github(\"bergant/airtabler\")\np_load(airtabler)\n\n#remotes::install_github(\"rmcelreath/rethinking\")\n#library(rethinking)\n```\n:::\n\n\n\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code code-summary=\"input from airtable\"}\nbase_id <- \"appbPYEw9nURln7Qg\"\n\n# Set your Airtable API key\n#Sys.setenv(AIRTABLE_API_KEY = \"\") \n#this should be set in my .Renviron file\n\n\n# Read data from a specific view\n\nevals <- air_get(base = base_id, \"output_eval\") \n\nall_pub_records <- data.frame()\npub_records <- air_select(base = base_id, table = \"crucial_research\")\n\n# Append the records to the list\nall_pub_records <- bind_rows(all_pub_records, pub_records)\n\n# While the length of the records list is 100 (the maximum), fetch more records\nwhile(nrow(pub_records) == 100) {\n # Get the ID of the last record in the list\n offset <- get_offset(pub_records)\n \n # Fetch the next 100 records, starting after the last ID\n pub_records <- air_select(base = base_id, table = \"crucial_research\", offset = offset)\n \n # Append the records to the df\n all_pub_records <- bind_rows(all_pub_records, pub_records)\n}\n```\n:::\n\n::: {.cell}\n\n```{.r .cell-code code-summary=\"just the useful and publish-able data, clean a bit\"}\ncolnames(evals) <- snakecase::to_snake_case(colnames(evals))\n\nevals_pub <- evals %>% \n dplyr::rename(stage_of_process = stage_of_process_todo_from_crucial_research_2) %>% \n mutate(stage_of_process = unlist(stage_of_process)) %>% \n dplyr::filter(stage_of_process == \"published\") %>% \n select(id, crucial_research, paper_abbrev, evaluator_name, category, source_main, author_agreement, overall, lb_overall, ub_overall, conf_index_overall, advancing_knowledge_and_practice, lb_advancing_knowledge_and_practice, ub_advancing_knowledge_and_practice, conf_index_advancing_knowledge_and_practice, methods_justification_reasonableness_validity_robustness, lb_methods_justification_reasonableness_validity_robustness, ub_methods_justification_reasonableness_validity_robustness, conf_index_methods_justification_reasonableness_validity_robustness, logic_communication, lb_logic_communication, ub_logic_communication, conf_index_logic_communication, 
engaging_with_real_world_impact_quantification_practice_realism_and_relevance, lb_engaging_with_real_world_impact_quantification_practice_realism_and_relevance, ub_engaging_with_real_world_impact_quantification_practice_realism_and_relevance, conf_index_engaging_with_real_world_impact_quantification_practice_realism_and_relevance, relevance_to_global_priorities, lb_relevance_to_global_priorities, ub_relevance_to_global_priorities, conf_index_relevance_to_global_priorities, journal_quality_predict, lb_journal_quality_predict, ub_journal_quality_predict, conf_index_journal_quality_predict, open_collaborative_replicable, conf_index_open_collaborative_replicable, lb_open_collaborative_replicable, ub_open_collaborative_replicable, merits_journal, lb_merits_journal, ub_merits_journal, conf_index_merits_journal)\n\nevals_pub %<>%\n tidyr::unnest_wider(category, names_sep = \"\") %>%\n tidyr::unnest_wider(paper_abbrev, names_sep = \"\") %>%\nmutate(across(everything(), unlist)) %>% #unlist list columns \n dplyr::rename(paper_abbrev = paper_abbrev1)\n\n#Todo -- check the unlist is not propagating the entry\n#Note: category, topic_subfield, and source have multiple meaningful categories. These will need care \n```\n:::\n\n::: {.cell}\n\n```{.r .cell-code code-summary=\"Shorten names \"}\nnew_names <- c(\n \"eval_name\" = \"evaluator_name\",\n \"cat_1\" = \"category1\",\n \"cat_2\" = \"category2\",\n \"cat_3\" = \"category3\",\n \"crucial_rsx\" = \"crucial_research\",\n \"conf_overall\" = \"conf_index_overall\",\n \"adv_knowledge\" = \"advancing_knowledge_and_practice\",\n \"lb_adv_knowledge\" = \"lb_advancing_knowledge_and_practice\",\n \"ub_adv_knowledge\" = \"ub_advancing_knowledge_and_practice\",\n \"conf_adv_knowledge\" = \"conf_index_advancing_knowledge_and_practice\",\n \"methods\" = \"methods_justification_reasonableness_validity_robustness\",\n \"lb_methods\" = \"lb_methods_justification_reasonableness_validity_robustness\",\n \"ub_methods\" = \"ub_methods_justification_reasonableness_validity_robustness\",\n \"conf_methods\" = \"conf_index_methods_justification_reasonableness_validity_robustness\",\n \"logic_comms\" = \"logic_communication\",\n \"lb_logic_comms\" = \"lb_logic_communication\",\n \"ub_logic_comms\" = \"ub_logic_communication\",\n \"conf_logic_comms\" = \"conf_index_logic_communication\",\n \"real_world\" = \"engaging_with_real_world_impact_quantification_practice_realism_and_relevance\",\n \"lb_real_world\" = \"lb_engaging_with_real_world_impact_quantification_practice_realism_and_relevance\",\n \"ub_real_world\" = \"ub_engaging_with_real_world_impact_quantification_practice_realism_and_relevance\",\n \"conf_real_world\" = \"conf_index_engaging_with_real_world_impact_quantification_practice_realism_and_relevance\",\n \"gp_relevance\" = \"relevance_to_global_priorities\",\n \"lb_gp_relevance\" = \"lb_relevance_to_global_priorities\",\n \"ub_gp_relevance\" = \"ub_relevance_to_global_priorities\",\n \"conf_gp_relevance\" = \"conf_index_relevance_to_global_priorities\",\n \"journal_predict\" = \"journal_quality_predict\",\n \"lb_journal_predict\" = \"lb_journal_quality_predict\",\n \"ub_journal_predict\" = \"ub_journal_quality_predict\",\n \"conf_journal_predict\" = \"conf_index_journal_quality_predict\",\n \"open_sci\" = \"open_collaborative_replicable\",\n \"conf_open_sci\" = \"conf_index_open_collaborative_replicable\",\n \"lb_open_sci\" = \"lb_open_collaborative_replicable\",\n \"ub_open_sci\" = \"ub_open_collaborative_replicable\",\n \"conf_merits_journal\" = 
\"conf_index_merits_journal\"\n)\n\nevals_pub <- evals_pub %>%\n rename(!!!new_names)\n\n# Function to insert a newline character every 15 characters\nwrap_text <- function(x, width = 15) {\n gsub(\"(.{1,15})\", \"\\\\1-\\n\", x)\n}\n\nevals_pub$source_main_wrapped <- wrap_text(evals_pub$source_main, 15)\n\nevals_pub$eval_name <- ifelse(\n grepl(\"^\\\\b\\\\w+\\\\b$|\\\\bAnonymous\\\\b\", evals_pub$eval_name),\n paste0(\"Anonymous_\", seq_along(evals_pub$eval_name)),\n evals_pub$eval_name\n)\n\n\n# make the old names into labels\n\n# Create a list of labels\nlabels <- str_replace_all(new_names, \"_\", \" \")\nlabels <- str_to_title(labels)\n \n# Assign labels to the dataframe\n# for(i in seq_along(labels)) {\n# col_name <- new_names[names(new_names)[i]]\n# label <- labels[i]\n# attr(evals_pub[[col_name]], \"label\") <- label\n# }\n# \n```\n:::\n\n\n\n\n### Reconcile uncertainty ratings and CIs {-}\n\nWhere people gave only confidence level 'dots', we impute CIs (confidence/credible intervals). We follow the correspondence described [here](https://effective-giving-marketing.gitbook.io/unjournal-x-ea-and-global-priorities-research/policies-projects-evaluation-workflow/evaluation/guidelines-for-evaluators#1-5-dots-explanation-and-relation-to-cis). (Otherwise where they gave actual CIs, we use these.)^[Note this is only a first-pass; a more sophisticated approach may be warranted in future.]\n\n::: {.callout-note collapse=\"true\"}\n## Dots to interval choices\n\n> 5 = Extremely confident, i.e., 90% confidence interval spans +/- 4 points or less)\n\nFor 0-100 ratings, code the LB as $min(R - 4\\times \\frac{R}{100},0)$ and the UB as $max(R + 4\\times \\frac{R}{100},0)$, where R is the stated (middle) rating. This 'scales' the CI, as interpreted, to be proportional to the rating, with a maximum 'interval' of about 8, with the rating is about 96.\n\n> 4 = Very*confident: 90% confidence interval +/- 8 points or less\n\nFor 0-100 ratings, code the LB as $min(R - 8\\times \\frac{R}{100},0)$ and the UB as $max(R + 8\\times \\frac{R}{100},0)$, where R is the stated (middle) rating. \n\n> 3 = Somewhat** confident: 90% confidence interval +/- 15 points or less \n\n> 2 = Not very** confident: 90% confidence interval, +/- 25 points or less\n\nComparable scaling for the 2-3 ratings as for the 4 and 5 rating.\n\n> 1 = Not** confident: (90% confidence interval +/- more than 25 points)\n \nCode LB as $min(R - 37.5\\times \\frac{R}{100},0)$ and the UB as $max(R + 37.5\\times \\frac{R}{100},0)$. \n \nThis is just a first-pass. There might be a more information-theoretic way of doing this. 
On the other hand, we might be switching the evaluations to use a different tool soon, perhaps getting rid of the 1-5 confidence ratings altogether.\n\n::: \n\n\n\n::: {.cell}\n\n```{.r .cell-code code-summary=\"reconcile explicit bounds and stated confidence level\"}\n# Define the baseline widths for each confidence rating\nbaseline_widths <- c(4, 8, 15, 25, 37.5)\n\n# Define a function to calculate the lower and upper bounds, where given only an index\ncalc_bounds <- function(rating, confidence, lb_explicit, ub_explicit, scale=100) {\n # Check if confidence is NA\n if (is.na(confidence)) {\n return(c(lb_explicit, ub_explicit)) # Return explicit bounds if confidence is NA\n } else {\n baseline_width <- baseline_widths[confidence]\n lb <- pmax(rating - baseline_width * rating / scale, 0)\n ub <- pmin(rating + baseline_width * rating / scale, scale)\n return(c(lb, ub))\n }\n}\n\n# Function to calculate bounds for a single category\ncalc_category_bounds <- function(df, category, scale=100) {\n # Calculate bounds\n bounds <- mapply(calc_bounds, df[[category]], df[[paste0(\"conf_\", category)]], df[[paste0(\"lb_\", category)]], df[[paste0(\"ub_\", category)]])\n \n # Convert to data frame and ensure it has the same number of rows as the input\n bounds_df <- as.data.frame(t(bounds))\n rownames(bounds_df) <- NULL\n \n # Add bounds to original data frame\n df[[paste0(category, \"_lb_imp\")]] <- bounds_df[, 1]\n df[[paste0(category, \"_ub_imp\")]] <- bounds_df[, 2]\n \n return(df)\n}\n\n\n# Lists of categories\n\nrating_cats <- c(\"overall\", \"adv_knowledge\", \"methods\", \"logic_comms\", \"real_world\", \"gp_relevance\", \"open_sci\")\n\n#... 'predictions' are currently 1-5 (0-5?)\npred_cats <- c(\"journal_predict\", \"merits_journal\")\n\n# Apply the function to each category\n# DR: I don't love this looping 'edit in place' code approach, but whatever\nfor (cat in rating_cats) {\n evals_pub <- calc_category_bounds(evals_pub, cat, scale=100)\n}\n\nfor (cat in pred_cats) {\n evals_pub <- calc_category_bounds(evals_pub, cat, scale=5)\n}\n```\n:::\n\n::: {.cell}\n\n```{.r .cell-code code-summary=\"save data for others' use\"}\nevals_pub %>% saveRDS(file = here(\"data\", \"evals.Rdata\"))\nevals_pub %>% write_csv(file = here(\"data\", \"evals.csv\"))\n\n#evals_pub %>% readRDS(file = here(\"data\", \"evals.Rdata\"))\n```\n:::\n\n\n \n# Basic presentation\n\n## What sorts of papers/projects are we considering and evaluating? \n\nIn this section, we give some simple data summaries and visualizations, for a broad description of The Unjournal's coverage. \n\nIn the interactive tables below we give some key attributes of the papers and the evaluators, and a preview of the evaluations.\n\n\n::: column-body-outset\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\n(\n all_evals_dt <- evals_pub %>%\n arrange(paper_abbrev, eval_name) %>%\n dplyr::select(paper_abbrev, crucial_rsx, eval_name, cat_1, cat_2, source_main_wrapped, author_agreement) %>%\n dplyr::select(-matches(\"ub_|lb_|conf\")) %>% \n #rename_all(~ gsub(\"_\", \" \", .)) %>% \n rename(\"Research _____________________\" = \"crucial_rsx\" \n ) %>%\n DT::datatable(\n caption = \"Evaluations (confidence bounds not shown)\", \n filter = 'top',\n rownames= FALSE,\n options = list(pageLength = 7)\n )\n)\n```\n\n::: {.cell-output-display}\n```{=html}\n
\n\n```\n:::\n:::\n\n\n\n\\\n\nNext, the 'middle ratings and predictions'.\n\n\n::: {.cell}\n\n```{.r .cell-code code-summary=\"Data datable (all shareable relevant data)\"}\n(\n all_evals_dt <- evals_pub %>%\n arrange(paper_abbrev, eval_name, overall) %>%\n dplyr::select(paper_abbrev, eval_name, all_of(rating_cats)) %>%\n DT::datatable(\n caption = \"Evaluations and predictions (confidence bounds not shown)\", \n filter = 'top',\n rownames= FALSE,\n options = list(pageLength = 7)\n )\n)\n```\n\n::: {.cell-output-display}\n```{=html}\n
\n\n```\n:::\n:::\n\n\\\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\n(\n all_evals_dt_ci <- evals_pub %>%\n arrange(paper_abbrev, eval_name) %>%\n dplyr::select(paper_abbrev, eval_name, conf_overall, rating_cats, matches(\"ub_imp|lb_imp\")) %>%\n DT::datatable(\n caption = \"Evaluations and (imputed*) confidence bounds)\", \n filter = 'top',\n rownames= FALSE,\n options = list(pageLength = 7)\n )\n)\n```\n:::\n\n:::\n\n\n::: {.callout-note collapse=\"true\"}\n## Next consider...\n\n- Composition of research evaluated\n - By field (economics, psychology, etc.)\n - By subfield of economics \n - By topic/cause area (Global health, economic development, impact of technology, global catastrophic risks, etc. )\n - By source (submitted, identified with author permission, direct evaluation)\n \n- Timing of intake and evaluation^[Consider: timing might be its own section or chapter; this is a major thing journals track, and we want to keep track of ourselves]\n\n:::\n\nThe funnel plot below starts with the paper we prioritized for likely Unjournal evaluation, marking these as 'considering'.\n\n\n::: {.cell}\n\n```{.r .cell-code}\n#Add in the 3 different evaluation input sources\n#update to be automated rather than hard-coded - to look at David's work here\n\npapers_considered <- all_pub_records %>% nrow()\n\npapers_deprio <- all_pub_records %>% filter(`stage of process/todo` == \"de-prioritized\") %>% nrow()\n\npapers_evaluated <- all_pub_records %>% filter(`stage of process/todo` %in% c(\"published\",\n \"contacting/awaiting_authors_response_to_evaluation\",\n \"awaiting_publication_ME_comments\",\n \"awaiting_evaluations\")) %>% nrow()\n\npapers_complete <- all_pub_records %>% filter(`stage of process/todo` == \"published\") %>% \nnrow()\n\npapers_in_progress <- papers_evaluated-papers_complete\n\npapers_still_in_consideration <- all_pub_records %>% filter(`stage of process/todo` == \"considering\") %>% nrow()\n\n\nfig <- plot_ly(\n type = \"sankey\",\n orientation = \"h\",\n\n node = list(\n label = c(\"Prioritized\", \"Eval uated\", \"Complete\", \"In progress\", \"Still in consideration\", \"De-prioritized\"),\n color = c(\"orange\", \"green\", \"green\", \"orange\", \"orange\", \"red\"),\n pad = 15,\n thickness = 20,\n line = list(\n color = \"black\",\n width = 0.5\n )\n ),\n\n link = list(\n source = c(0,1,1,0,0),\n target = c(1,2,3,4,5),\n value = c(\n papers_evaluated,\n papers_complete,\n papers_in_progress,\n papers_still_in_consideration,\n papers_deprio\n ))\n )\nfig <- fig %>% layout(\n title = \"Unjournal paper funnel\",\n font = list(\n size = 10\n )\n)\n\nfig \n```\n\n::: {.cell-output-display}\n```{=html}\n
\n\n```\n:::\n:::\n\n::: {.cell}\n\n```{.r .cell-code}\nsummary_df <- evals_pub %>%\n distinct(crucial_rsx, .keep_all = T) %>% \n group_by(cat_1) %>%\n summarise(count = n()) \n\nsummary_df$cat_1[is.na(summary_df$cat_1)] <- \"Unknown\"\n\nsummary_df <- summary_df %>%\n arrange(-desc(count)) %>%\n mutate(cat_1 = factor(cat_1, levels = unique(cat_1)))\n\n# Create stacked bar chart\nggplot(summary_df, aes(x = cat_1, y = count)) +\n geom_bar(stat = \"identity\") + \n theme_minimal() +\n labs(x = \"Paper category\", y = \"Count\", \n title = \"Count of evaluated papers by primary category\") \n```\n\n::: {.cell-output-display}\n![](evaluation_data_files/figure-html/unnamed-chunk-14-1.png){width=672}\n:::\n:::\n\n::: {.cell}\n\n```{.r .cell-code}\n# Bar plot\nggplot(evals_pub, aes(x = source_main_wrapped)) + \n geom_bar(position = \"stack\", stat = \"count\") +\n labs(x = \"Source\", y = \"Count\") +\n theme_light() +\n theme_minimal() +\n ggtitle(\"Evaluations by source of the paper\") + # add title\n theme(\n panel.grid.major = element_blank(),\n panel.grid.minor = element_blank(),\n text = element_text(size = 16), # changing all text size to 16\n axis.text.y = element_text(size = 10),\n axis.text.x = element_text(size = 14)\n )\n```\n\n::: {.cell-output-display}\n![](evaluation_data_files/figure-html/unnamed-chunk-16-1.png){width=672}\n:::\n:::\n\n::: {.cell}\n\n```{.r .cell-code}\nall_pub_records$is_evaluated = all_pub_records$`stage of process/todo` %in% c(\"published\",\n \"contacting/awaiting_authors_response_to_evaluation\",\n \"awaiting_publication_ME_comments\",\n \"awaiting_evaluations\")\n\nall_pub_records$source_main[all_pub_records$source_main == \"NA\"] <- \"Not applicable\" \nall_pub_records$source_main[all_pub_records$source_main == \"internal-from-syllabus-agenda-policy-database\"] <- \"Internal: syllabus, agenda, etc.\" \nall_pub_records$source_main = tidyr::replace_na(all_pub_records$source_main, \"Unknown\")\n\n\n\nggplot(all_pub_records, aes(x = fct_infreq(source_main), fill = is_evaluated)) + \n geom_bar(position = \"stack\", stat = \"count\") +\n labs(x = \"Source\", y = \"Count\", fill = \"Selected for\\nevaluation?\") +\n coord_flip() + # flipping the coordinates to have categories on y-axis (on the left)\n theme_light() +\n theme_minimal() +\n ggtitle(\"Evaluations by source of the paper\") +# add title\n theme(\n panel.grid.major = element_blank(),\n panel.grid.minor = element_blank(),\n text = element_text(size = 16), # changing all text size to 16\n axis.text.y = element_text(size = 12),\n axis.text.x = element_text(size = 14)\n )\n```\n\n::: {.cell-output-display}\n![](evaluation_data_files/figure-html/unnamed-chunk-18-1.png){width=672}\n:::\n:::\n\n\n\n### The distribution of ratings and predictions {-}\n\nNext, we present the ratings and predictions along with 'uncertainty measures'.^[We use \"ub imp\" (and \"lb imp\") to denote the upper and lower bounds given by evaluators.] Where evaluators gave only a 1-5 confidence level^[More or less, the ones who report a level for 'conf overall', although some people did this for some but not others], we use the imputations discussed and coded above. 
\n\n\n- For each category and prediction (overall and by paper)\n\n\n::: column-body-outset\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nwrap_text <- function(text, width) {\n sapply(strwrap(text, width = width, simplify = FALSE), paste, collapse = \"\\n\")\n}\n\nevals_pub$wrapped_pub_names <- wrap_text(evals_pub$paper_abbrev, width = 15)\n\n\n\n# Dot plot\nggplot(evals_pub, aes(x = paper_abbrev, y = overall)) +\n geom_point(stat = \"identity\", size = 4, shape = 1, colour = \"lightblue\", stroke = 3) +\n geom_text_repel(aes(label = eval_name), \n size = 3, \n box.padding = unit(0.35, \"lines\"),\n point.padding = unit(0.3, \"lines\")) +\n coord_flip() + # flipping the coordinates to have categories on y-axis (on the left)\n theme_light() +\n xlab(\"Paper\") + # remove x-axis label\n ylab(\"Overall score\") + # name y-axis\n ggtitle(\"Overall scores of evaluated papers\") +# add title\n theme(\n panel.grid.major = element_blank(),\n panel.grid.minor = element_blank(),\n text = element_text(size = 14), # changing all text size to 16\n axis.text.y = element_text(size = 8),\n axis.text.x = element_text(size = 12)\n )\n```\n\n::: {.cell-output-display}\n![](evaluation_data_files/figure-html/unnamed-chunk-20-1.png){width=672}\n:::\n:::\n\n:::\n\n\n::: column-body-outset\n\n\n::: {.cell}\n\n```{.r .cell-code}\nunit.scale = function(x) (x*100 - min(x*100)) / (max(x*100) - min(x*100))\nevaluations_table <- evals_pub %>%\n select(paper_abbrev, eval_name, cat_1, source_main, overall, adv_knowledge, methods, logic_comms, journal_predict) %>%\n arrange(desc(paper_abbrev))\n\nout = formattable(\n evaluations_table,\n list(\n #area(col = 5:8) ~ function(x) percent(x / 100, digits = 0),\n area(col = 5:8) ~ color_tile(\"#FA614B66\",\"#3E7DCC\"),\n `journal_predict` = proportion_bar(\"#DeF7E9\", unit.scale)\n )\n)\nout\n```\n\n::: {.cell-output-display}\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n
| paper_abbrev | eval_name | cat_1 | source_main | overall | adv_knowledge | methods | logic_comms | journal_predict |
|---|---|---|---|---|---|---|---|---|
| Well-being: Cash vs. psychotherapy | Anonymous_13 | GH&D | internal-NBER | 90 | 90 | 90 | 80 | 4.0 |
| Well-being: Cash vs. psychotherapy | Hannah Metzler | GH&D | internal-NBER | 75 | 70 | 90 | 75 | 3.0 |
| Nonprofit Govc.: Randomized healthcare DRC | Wayne Aaron Sandholtz | GH&D | internal-NBER | 65 | 70 | 60 | 55 | 3.6 |
| LT CEA: Resilient foods vs. AGI safety | Scott Janzwood | long-term-relevant | submitted | 65 | NA | NA | NA | NA |
| LT CEA: Resilient foods vs. AGI safety | Anca Hanea | long-term-relevant | submitted | 80 | 80 | 70 | 85 | 3.5 |
| LT CEA: Resilient foods vs. AGI safety | Alex Bates | long-term-relevant | submitted | 40 | 30 | 50 | 60 | 2.0 |
| Env. fx of prod.: ecological obs | Elias Cisneros | NA | internal-NBER | 88 | 90 | 75 | 80 | 4.0 |
| Env. fx of prod.: ecological obs | Anonymous_12 | NA | internal-NBER | 70 | 70 | 70 | 75 | 4.0 |
| CBT Human K, Ghana | Anonymous_11 | NA | internal-NBER | 75 | 60 | 90 | 70 | 4.0 |
| CBT Human K, Ghana | Anonymous_16 | NA | internal-NBER | 75 | 65 | 60 | 75 | NA |
| Banning wildlife trade can boost demand | Anonymous_3 | conservation | submitted | 75 | 70 | 80 | 70 | 3.0 |
| Banning wildlife trade can boost demand | Liew Jia Huan | conservation | submitted | 75 | 80 | 50 | 70 | 2.5 |
| Advance market commit. (vaccines) | David Manheim | policy | internal-from-syllabus-agenda-policy-database | 80 | 25 | 95 | 75 | 3.0 |
| Advance market commit. (vaccines) | Joel Tan | policy | internal-from-syllabus-agenda-policy-database | 79 | 90 | 70 | 70 | 5.0 |
| Advance market commit. (vaccines) | Dan Tortorice | policy | internal-from-syllabus-agenda-policy-database | 80 | 90 | 80 | 80 | 4.0 |
| AI and econ. growth | Seth Benzell | macroeconomics | internal-from-syllabus-agenda-policy-database | 80 | 75 | 80 | 70 | NA |
| AI and econ. growth | Phil Trammel | macroeconomics | internal-from-syllabus-agenda-policy-database | 92 | 97 | 70 | 45 | 3.5 |
\n:::\n:::\n\n:::\n\n\nNext, look for systematic variation \n\n- By field and topic area of paper\n\n- By submission/selection route\n\n- By evaluation manager\n\n... perhaps building a model of this. We are looking for systematic 'biases and trends', loosely speaking, to help us better understand how our evaluation system is working.\n\n\\\n\n\n\n\n### Relationship among the ratings (and predictions) {-} \n\n- Correlation matrix\n\n- ANOVA\n\n- PCA (Principle components)\n\n- With other 'control' factors?\n\n- How do the specific measures predict the aggregate ones (overall rating, merited publication)\n - CF 'our suggested weighting'\n\n\n## Aggregation of expert opinion (modeling)\n\n\n## Notes on sources and approaches\n\n\n::: {.callout-note collapse=\"true\"}\n\n## Hanea et al {-}\n(Consult, e.g., repliCATS/Hanea and others work; meta-science and meta-analysis approaches)\n\n`aggrecat` package\n\n> Although the accuracy, calibration, and informativeness of the majority of methods are very similar, a couple of the aggregation methods consistently distinguish themselves as among the best or worst. Moreover, the majority of methods outperform the usual benchmarks provided by the simple average or the median of estimates.\n\n[Hanea et al, 2021](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0256919#sec007)\n\n However, these are in a different context. Most of those measures are designed to deal with probablistic forecasts for binary outcomes, where the predictor also gives a 'lower bound' and 'upper bound' for that probability. We could roughly compare that to our continuous metrics with 90% CI's (or imputations for these).\n\nFurthermore, many (all their successful measures?) use 'performance-based weights', accessing metrics from prior prediction performance of the same forecasters We do not have these, nor do we have a sensible proxy for this. \n:::\n\n\n::: {.callout-note collapse=\"true\"}\n## D Veen et al (2017)\n\n[link](https://www.researchgate.net/profile/Duco-Veen/publication/319662351_Using_the_Data_Agreement_Criterion_to_Rank_Experts'_Beliefs/links/5b73e2dc299bf14c6da6c663/Using-the-Data-Agreement-Criterion-to-Rank-Experts-Beliefs.pdf)\n\n... we show how experts can be ranked based on their knowledge and their level of (un)certainty. By letting experts specify their knowledge in the form of a probability distribution, we can assess how accurately they can predict new data, and how appropriate their level of (un)certainty is. The expert’s specified probability distribution can be seen as a prior in a Bayesian statistical setting. We evaluate these priors by extending an existing prior-data (dis)agreement measure, the Data Agreement Criterion, and compare this approach to using Bayes factors to assess prior specification. We compare experts with each other and the data to evaluate their appropriateness. Using this method, new research questions can be asked and answered, for instance: Which expert predicts the new data best? Is there agreement between my experts and the data? Which experts’ representation is more valid or useful? Can we reach convergence between expert judgement and data? We provided an empirical example ranking (regional) directors of a large financial institution based on their predictions of turnover. 
\n\nBe sure to consult the [correction made here](https://www.semanticscholar.org/paper/Correction%3A-Veen%2C-D.%3B-Stoel%2C-D.%3B-Schalken%2C-N.%3B-K.%3B-Veen-Stoel/a2882e0e8606ef876133f25a901771259e7033b1)\n\n::: \n\n\n::: {.callout-note collapse=\"true\"}\n## Also seems relevant:\n\nSee [Gsheet HERE](https://docs.google.com/spreadsheets/d/14japw6eLGpGjEWy1MjHNJXU1skZY_GAIc2uC2HIUalM/edit#gid=0), generated from an Elicit.org inquiry.\n\n\n::: \n\n\n\nIn spite of the caveats in the fold above, we construct some measures of aggregate beliefs using the `aggrecat` package. We will make (and explain) some ad-hoc choices here. We present these:\n\n1. For each paper\n2. For categories of papers and cross-paper categories of evaluations\n3. For the overall set of papers and evaluations\n\nWe can also hold onto these aggregated metrics for later use in modeling.\n\n\n- Simple averaging\n\n- Bayesian approaches \n\n- Best-performing approaches from elsewhere \n\n- Assumptions over unit-level random terms \n\n### Explicit modeling of 'research quality' (for use in prizes, etc.) {-}\n\n- Use the above aggregation as the outcome of interest, or weight towards categories of greater interest?\n\n- Model with controls -- look for greatest positive residual? \n\n\n## Inter-rater reliability\n\n## Decomposing variation, dimension reduction, simple linear models\n\n\n## Later possiblities\n\n- Relation to evaluation text content (NLP?)\n\n- Relation/prediction of later outcomes (traditional publication, citations, replication)\n\n\n\n## Scoping our future coverage\n\nWe have funding to evaluate roughly 50-70 papers/projects per year, given our proposed incentives.\n\nConsider:\n\n- How many relevant NBER papers come out per year?\n\n- How much relevant work in other prestige archives?\n\n- What quotas do we want (by cause, etc.) 
and how feasible are these?\n\n", + "markdown": "# Evaluation data: description, exploration, checks\n\n\n::: {.cell}\n\n```{.r .cell-code code-summary=\"load packages\"}\nsource(here::here(\"code\", \"shared_packages_code.R\"))\n\n#devtools::install_github(\"rethinkpriorities/rp-r-package\")\nlibrary(rethinkpriorities)\n\n#devtools::install_github(\"rethinkpriorities/r-noodling-package\") #mainly used playing in real time\nlibrary(rnoodling)\n\nlibrary(here)\nlibrary(dplyr)\nlibrary(pacman)\n\np_load(formattable, sparkline, install=FALSE)\n\np_load(DT, santoku, lme4, huxtable, janitor, emmeans, sjPlot, sjmisc, ggeffects, ggrepel, likert, labelled, plotly, stringr, install=FALSE)\n\np_load(ggthemes, paletteer, ggridges, install=FALSE)\n\nselect <- dplyr::select \n\noptions(knitr.duplicate.label = \"allow\")\n\noptions(mc.cores = parallel::detectCores())\n#rstan_options(auto_write = TRUE)\n\n#library(hunspell)\n\n#(brms)\n\n#devtools::install_github(\"bergant/airtabler\")\np_load(airtabler)\n\n#remotes::install_github(\"rmcelreath/rethinking\")\n#library(rethinking)\n```\n:::\n\n\n\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code code-summary=\"input from airtable\"}\nbase_id <- \"appbPYEw9nURln7Qg\"\n\n# Set your Airtable API key\n#Sys.setenv(AIRTABLE_API_KEY = \"\") \n#this should be set in my .Renviron file\n\n\n# Read data from a specific view\n\nevals <- air_get(base = base_id, \"output_eval\") \n\nall_pub_records <- data.frame()\npub_records <- air_select(base = base_id, table = \"crucial_research\")\n\n# Append the records to the list\nall_pub_records <- bind_rows(all_pub_records, pub_records)\n\n# While the length of the records list is 100 (the maximum), fetch more records\nwhile(nrow(pub_records) == 100) {\n # Get the ID of the last record in the list\n offset <- get_offset(pub_records)\n \n # Fetch the next 100 records, starting after the last ID\n pub_records <- air_select(base = base_id, table = \"crucial_research\", offset = offset)\n \n # Append the records to the df\n all_pub_records <- bind_rows(all_pub_records, pub_records)\n}\n```\n:::\n\n::: {.cell}\n\n```{.r .cell-code code-summary=\"just the useful and publish-able data, clean a bit\"}\ncolnames(evals) <- snakecase::to_snake_case(colnames(evals))\n\nevals_pub <- evals %>% \n dplyr::rename(stage_of_process = stage_of_process_todo_from_crucial_research_2) %>% \n mutate(stage_of_process = unlist(stage_of_process)) %>% \n dplyr::filter(stage_of_process == \"published\") %>% \n select(id, crucial_research, paper_abbrev, evaluator_name, category, source_main, author_agreement, overall, lb_overall, ub_overall, conf_index_overall, advancing_knowledge_and_practice, lb_advancing_knowledge_and_practice, ub_advancing_knowledge_and_practice, conf_index_advancing_knowledge_and_practice, methods_justification_reasonableness_validity_robustness, lb_methods_justification_reasonableness_validity_robustness, ub_methods_justification_reasonableness_validity_robustness, conf_index_methods_justification_reasonableness_validity_robustness, logic_communication, lb_logic_communication, ub_logic_communication, conf_index_logic_communication, engaging_with_real_world_impact_quantification_practice_realism_and_relevance, lb_engaging_with_real_world_impact_quantification_practice_realism_and_relevance, ub_engaging_with_real_world_impact_quantification_practice_realism_and_relevance, conf_index_engaging_with_real_world_impact_quantification_practice_realism_and_relevance, relevance_to_global_priorities, lb_relevance_to_global_priorities, 
ub_relevance_to_global_priorities, conf_index_relevance_to_global_priorities, journal_quality_predict, lb_journal_quality_predict, ub_journal_quality_predict, conf_index_journal_quality_predict, open_collaborative_replicable, conf_index_open_collaborative_replicable, lb_open_collaborative_replicable, ub_open_collaborative_replicable, merits_journal, lb_merits_journal, ub_merits_journal, conf_index_merits_journal)\n\nevals_pub %<>%\n tidyr::unnest_wider(category, names_sep = \"\") %>%\n tidyr::unnest_wider(paper_abbrev, names_sep = \"\") %>%\nmutate(across(everything(), unlist)) %>% #unlist list columns \n dplyr::rename(paper_abbrev = paper_abbrev1)\n\n#Todo -- check the unlist is not propagating the entry\n#Note: category, topic_subfield, and source have multiple meaningful categories. These will need care \n```\n:::\n\n::: {.cell}\n\n```{.r .cell-code code-summary=\"Shorten names \"}\nnew_names <- c(\n \"eval_name\" = \"evaluator_name\",\n \"cat_1\" = \"category1\",\n \"cat_2\" = \"category2\",\n \"cat_3\" = \"category3\",\n \"crucial_rsx\" = \"crucial_research\",\n \"conf_overall\" = \"conf_index_overall\",\n \"adv_knowledge\" = \"advancing_knowledge_and_practice\",\n \"lb_adv_knowledge\" = \"lb_advancing_knowledge_and_practice\",\n \"ub_adv_knowledge\" = \"ub_advancing_knowledge_and_practice\",\n \"conf_adv_knowledge\" = \"conf_index_advancing_knowledge_and_practice\",\n \"methods\" = \"methods_justification_reasonableness_validity_robustness\",\n \"lb_methods\" = \"lb_methods_justification_reasonableness_validity_robustness\",\n \"ub_methods\" = \"ub_methods_justification_reasonableness_validity_robustness\",\n \"conf_methods\" = \"conf_index_methods_justification_reasonableness_validity_robustness\",\n \"logic_comms\" = \"logic_communication\",\n \"lb_logic_comms\" = \"lb_logic_communication\",\n \"ub_logic_comms\" = \"ub_logic_communication\",\n \"conf_logic_comms\" = \"conf_index_logic_communication\",\n \"real_world\" = \"engaging_with_real_world_impact_quantification_practice_realism_and_relevance\",\n \"lb_real_world\" = \"lb_engaging_with_real_world_impact_quantification_practice_realism_and_relevance\",\n \"ub_real_world\" = \"ub_engaging_with_real_world_impact_quantification_practice_realism_and_relevance\",\n \"conf_real_world\" = \"conf_index_engaging_with_real_world_impact_quantification_practice_realism_and_relevance\",\n \"gp_relevance\" = \"relevance_to_global_priorities\",\n \"lb_gp_relevance\" = \"lb_relevance_to_global_priorities\",\n \"ub_gp_relevance\" = \"ub_relevance_to_global_priorities\",\n \"conf_gp_relevance\" = \"conf_index_relevance_to_global_priorities\",\n \"journal_predict\" = \"journal_quality_predict\",\n \"lb_journal_predict\" = \"lb_journal_quality_predict\",\n \"ub_journal_predict\" = \"ub_journal_quality_predict\",\n \"conf_journal_predict\" = \"conf_index_journal_quality_predict\",\n \"open_sci\" = \"open_collaborative_replicable\",\n \"conf_open_sci\" = \"conf_index_open_collaborative_replicable\",\n \"lb_open_sci\" = \"lb_open_collaborative_replicable\",\n \"ub_open_sci\" = \"ub_open_collaborative_replicable\",\n \"conf_merits_journal\" = \"conf_index_merits_journal\"\n)\n\nevals_pub <- evals_pub %>%\n rename(!!!new_names)\n\n# Function to insert a newline character every 15 characters\nwrap_text <- function(x, width = 15) {\n gsub(\"(.{1,15})\", \"\\\\1-\\n\", x)\n}\n\nevals_pub$source_main_wrapped <- wrap_text(evals_pub$source_main, 15)\n\nevals_pub$eval_name <- ifelse(\n grepl(\"^\\\\b\\\\w+\\\\b$|\\\\bAnonymous\\\\b\", evals_pub$eval_name),\n 
paste0(\"Anonymous_\", seq_along(evals_pub$eval_name)),\n evals_pub$eval_name\n)\n\n\n# make the old names into labels\n\n# Create a list of labels\nlabels <- str_replace_all(new_names, \"_\", \" \")\nlabels <- str_to_title(labels)\n \n# Assign labels to the dataframe\n# for(i in seq_along(labels)) {\n# col_name <- new_names[names(new_names)[i]]\n# label <- labels[i]\n# attr(evals_pub[[col_name]], \"label\") <- label\n# }\n# \n```\n:::\n\n\n\n\n### Reconcile uncertainty ratings and CIs {-}\n\nWhere people gave only confidence level 'dots', we impute CIs (confidence/credible intervals). We follow the correspondence described [here](https://effective-giving-marketing.gitbook.io/unjournal-x-ea-and-global-priorities-research/policies-projects-evaluation-workflow/evaluation/guidelines-for-evaluators#1-5-dots-explanation-and-relation-to-cis). (Otherwise where they gave actual CIs, we use these.)^[Note this is only a first-pass; a more sophisticated approach may be warranted in future.]\n\n::: {.callout-note collapse=\"true\"}\n## Dots to interval choices\n\n> 5 = Extremely confident, i.e., 90% confidence interval spans +/- 4 points or less)\n\nFor 0-100 ratings, code the LB as $min(R - 4\\times \\frac{R}{100},0)$ and the UB as $max(R + 4\\times \\frac{R}{100},0)$, where R is the stated (middle) rating. This 'scales' the CI, as interpreted, to be proportional to the rating, with a maximum 'interval' of about 8, with the rating is about 96.\n\n> 4 = Very*confident: 90% confidence interval +/- 8 points or less\n\nFor 0-100 ratings, code the LB as $min(R - 8\\times \\frac{R}{100},0)$ and the UB as $max(R + 8\\times \\frac{R}{100},0)$, where R is the stated (middle) rating. \n\n> 3 = Somewhat** confident: 90% confidence interval +/- 15 points or less \n\n> 2 = Not very** confident: 90% confidence interval, +/- 25 points or less\n\nComparable scaling for the 2-3 ratings as for the 4 and 5 rating.\n\n> 1 = Not** confident: (90% confidence interval +/- more than 25 points)\n \nCode LB as $min(R - 37.5\\times \\frac{R}{100},0)$ and the UB as $max(R + 37.5\\times \\frac{R}{100},0)$. \n \nThis is just a first-pass. There might be a more information-theoretic way of doing this. 
On the other hand, we might be switching the evaluations to use a different tool soon, perhaps getting rid of the 1-5 confidence ratings altogether.\n\n::: \n\n\n\n::: {.cell}\n\n```{.r .cell-code code-summary=\"reconcile explicit bounds and stated confidence level\"}\n# Define the baseline widths for each confidence rating\nbaseline_widths <- c(4, 8, 15, 25, 37.5)\n\n# Define a function to calculate the lower and upper bounds, where given only an index\ncalc_bounds <- function(rating, confidence, lb_explicit, ub_explicit, scale=100) {\n # Check if confidence is NA\n if (is.na(confidence)) {\n return(c(lb_explicit, ub_explicit)) # Return explicit bounds if confidence is NA\n } else {\n baseline_width <- baseline_widths[confidence]\n lb <- pmax(rating - baseline_width * rating / scale, 0)\n ub <- pmin(rating + baseline_width * rating / scale, scale)\n return(c(lb, ub))\n }\n}\n\n# Function to calculate bounds for a single category\ncalc_category_bounds <- function(df, category, scale=100) {\n # Calculate bounds\n bounds <- mapply(calc_bounds, df[[category]], df[[paste0(\"conf_\", category)]], df[[paste0(\"lb_\", category)]], df[[paste0(\"ub_\", category)]])\n \n # Convert to data frame and ensure it has the same number of rows as the input\n bounds_df <- as.data.frame(t(bounds))\n rownames(bounds_df) <- NULL\n \n # Add bounds to original data frame\n df[[paste0(category, \"_lb_imp\")]] <- bounds_df[, 1]\n df[[paste0(category, \"_ub_imp\")]] <- bounds_df[, 2]\n \n return(df)\n}\n\n\n# Lists of categories\n\nrating_cats <- c(\"overall\", \"adv_knowledge\", \"methods\", \"logic_comms\", \"real_world\", \"gp_relevance\", \"open_sci\")\n\n#... 'predictions' are currently 1-5 (0-5?)\npred_cats <- c(\"journal_predict\", \"merits_journal\")\n\n# Apply the function to each category\n# DR: I don't love this looping 'edit in place' code approach, but whatever\nfor (cat in rating_cats) {\n evals_pub <- calc_category_bounds(evals_pub, cat, scale=100)\n}\n\nfor (cat in pred_cats) {\n evals_pub <- calc_category_bounds(evals_pub, cat, scale=5)\n}\n```\n:::\n\n::: {.cell}\n\n```{.r .cell-code code-summary=\"save data for others' use\"}\nevals_pub %>% saveRDS(file = here(\"data\", \"evals.Rdata\"))\nevals_pub %>% write_csv(file = here(\"data\", \"evals.csv\"))\n\n#evals_pub %>% readRDS(file = here(\"data\", \"evals.Rdata\"))\n```\n:::\n\n\n \n# Basic presentation\n\n## What sorts of papers/projects are we considering and evaluating? \n\nIn this section, we give some simple data summaries and visualizations, for a broad description of The Unjournal's coverage. \n\nIn the interactive tables below we give some key attributes of the papers and the evaluators, and a preview of the evaluations.\n\n\n::: column-body-outset\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\n(\n all_evals_dt <- evals_pub %>%\n arrange(paper_abbrev, eval_name) %>%\n dplyr::select(paper_abbrev, crucial_rsx, eval_name, cat_1, cat_2, source_main_wrapped, author_agreement) %>%\n dplyr::select(-matches(\"ub_|lb_|conf\")) %>% \n #rename_all(~ gsub(\"_\", \" \", .)) %>% \n rename(\"Research _____________________\" = \"crucial_rsx\" \n ) %>%\n DT::datatable(\n caption = \"Evaluations (confidence bounds not shown)\", \n filter = 'top',\n rownames= FALSE,\n options = list(pageLength = 7)\n )\n)\n```\n\n::: {.cell-output-display}\n```{=html}\n
\n\n```\n:::\n:::\n\n\n\n\\\n\nNext, the 'middle ratings and predictions'.\n\n\n::: {.cell}\n\n```{.r .cell-code code-summary=\"Data datable (all shareable relevant data)\"}\n(\n all_evals_dt <- evals_pub %>%\n arrange(paper_abbrev, eval_name, overall) %>%\n dplyr::select(paper_abbrev, eval_name, all_of(rating_cats)) %>%\n DT::datatable(\n caption = \"Evaluations and predictions (confidence bounds not shown)\", \n filter = 'top',\n rownames= FALSE,\n options = list(pageLength = 7)\n )\n)\n```\n\n::: {.cell-output-display}\n```{=html}\n
\n\n```\n:::\n:::\n\n\\\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\n(\n all_evals_dt_ci <- evals_pub %>%\n arrange(paper_abbrev, eval_name) %>%\n dplyr::select(paper_abbrev, eval_name, conf_overall, rating_cats, matches(\"ub_imp|lb_imp\")) %>%\n DT::datatable(\n caption = \"Evaluations and (imputed*) confidence bounds)\", \n filter = 'top',\n rownames= FALSE,\n options = list(pageLength = 7)\n )\n)\n```\n:::\n\n:::\n\n\n::: {.callout-note collapse=\"true\"}\n## Next consider...\n\n- Composition of research evaluated\n - By field (economics, psychology, etc.)\n - By subfield of economics \n - By topic/cause area (Global health, economic development, impact of technology, global catastrophic risks, etc. )\n - By source (submitted, identified with author permission, direct evaluation)\n \n- Timing of intake and evaluation^[Consider: timing might be its own section or chapter; this is a major thing journals track, and we want to keep track of ourselves]\n\n:::\n\nThe funnel plot below starts with the paper we prioritized for likely Unjournal evaluation, marking these as 'considering'.\n\n\n::: {.cell}\n\n```{.r .cell-code}\n#Add in the 3 different evaluation input sources\n#update to be automated rather than hard-coded - to look at David's work here\n\npapers_considered <- all_pub_records %>% nrow()\n\npapers_deprio <- all_pub_records %>% filter(`stage of process/todo` == \"de-prioritized\") %>% nrow()\n\npapers_evaluated <- all_pub_records %>% filter(`stage of process/todo` %in% c(\"published\",\n \"contacting/awaiting_authors_response_to_evaluation\",\n \"awaiting_publication_ME_comments\",\n \"awaiting_evaluations\")) %>% nrow()\n\npapers_complete <- all_pub_records %>% filter(`stage of process/todo` == \"published\") %>% \nnrow()\n\npapers_in_progress <- papers_evaluated - papers_complete\n\npapers_still_in_consideration <- all_pub_records %>% filter(`stage of process/todo` == \"considering\") %>% nrow()\n\n\n#todo: adjust wording of hover notes ('source, target...etc')\n\nfig <- plot_ly(\n type = \"sankey\",\n orientation = \"h\",\n\n node = list(\n label = c(\"Prioritized\", \"Evaluating(ed)\", \"Complete\", \"In progress\", \"Still in consideration\", \"De-prioritized\"),\n color = c(\"orange\", \"green\", \"green\", \"orange\", \"orange\", \"red\"),\n#Todo: adjust 'location' to group these left to right\n pad = 15,\n thickness = 20,\n line = list(\n color = \"black\",\n width = 0.5\n )\n ),\n\n link = list(\n source = c(0,1,1,0,0),\n target = c(1,2,3,4,5),\n value = c(\n papers_evaluated,\n papers_complete,\n papers_in_progress,\n papers_still_in_consideration,\n papers_deprio\n ))\n )\nfig <- fig %>% layout(\n title = \"Unjournal paper funnel\",\n font = list(\n size = 10\n )\n)\n\nfig \n```\n\n::: {.cell-output-display}\n```{=html}\n
\n\n```\n:::\n:::\n\n\n\n(In future, will make interactive/dashboards of the elements below)\n\n\n::: {.cell}\n\n```{.r .cell-code}\nsummary_df <- evals_pub %>%\n distinct(crucial_rsx, .keep_all = T) %>% \n group_by(cat_1) %>%\n summarise(count = n()) \n\nsummary_df$cat_1[is.na(summary_df$cat_1)] <- \"Unknown\"\n\nsummary_df <- summary_df %>%\n arrange(-desc(count)) %>%\n mutate(cat_1 = factor(cat_1, levels = unique(cat_1)))\n\n# Create stacked bar chart\nggplot(summary_df, aes(x = cat_1, y = count)) +\n geom_bar(stat = \"identity\") + \n theme_minimal() +\n labs(x = \"Paper category\", y = \"Count\", \n title = \"Count of evaluated papers by primary category\") \n```\n\n::: {.cell-output-display}\n![](evaluation_data_files/figure-html/unnamed-chunk-14-1.png){width=672}\n:::\n:::\n\n::: {.cell}\n\n```{.r .cell-code}\n# Bar plot\nggplot(evals_pub, aes(x = source_main_wrapped)) + \n geom_bar(position = \"stack\", stat = \"count\") +\n labs(x = \"Source\", y = \"Count\") +\n theme_light() +\n theme_minimal() +\n ggtitle(\"Pool of research/evaluations by paper source\") + # add title\n theme(\n panel.grid.major = element_blank(),\n panel.grid.minor = element_blank(),\n text = element_text(size = 16), # changing all text size to 16\n axis.text.y = element_text(size = 10),\n axis.text.x = element_text(size = 14)\n )\n```\n\n::: {.cell-output-display}\n![](evaluation_data_files/figure-html/unnamed-chunk-16-1.png){width=672}\n:::\n:::\n\n::: {.cell}\n\n```{.r .cell-code}\nall_pub_records$is_evaluated = all_pub_records$`stage of process/todo` %in% c(\"published\",\n \"contacting/awaiting_authors_response_to_evaluation\",\n \"awaiting_publication_ME_comments\",\n \"awaiting_evaluations\")\n\nall_pub_records$source_main[all_pub_records$source_main == \"NA\"] <- \"Not applicable\" \nall_pub_records$source_main[all_pub_records$source_main == \"internal-from-syllabus-agenda-policy-database\"] <- \"Internal: syllabus, agenda, etc.\" \nall_pub_records$source_main = tidyr::replace_na(all_pub_records$source_main, \"Unknown\")\n\n\n\nggplot(all_pub_records, aes(x = fct_infreq(source_main), fill = is_evaluated)) + \n geom_bar(position = \"stack\", stat = \"count\") +\n labs(x = \"Source\", y = \"Count\", fill = \"Selected for\\nevaluation?\") +\n coord_flip() + # flipping the coordinates to have categories on y-axis (on the left)\n theme_light() +\n theme_minimal() +\n ggtitle(\"Evaluations by source of the paper\") +# add title\n theme(\n panel.grid.major = element_blank(),\n panel.grid.minor = element_blank(),\n text = element_text(size = 16), # changing all text size to 16\n axis.text.y = element_text(size = 12),\n axis.text.x = element_text(size = 14)\n )\n```\n\n::: {.cell-output-display}\n![](evaluation_data_files/figure-html/unnamed-chunk-18-1.png){width=672}\n:::\n:::\n\n\n\n### The distribution of ratings and predictions {-}\n\nNext, we present the ratings and predictions along with 'uncertainty measures'.^[We use \"ub imp\" (and \"lb imp\") to denote the upper and lower bounds given by evaluators.] Where evaluators gave only a 1-5 confidence level^[More or less, the ones who report a level for 'conf overall', although some people did this for some but not others], we use the imputations discussed and coded above. 
\n\n\n- For each category and prediction (overall and by paper)\n\n\n::: column-body-outset\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nwrap_text <- function(text, width) {\n sapply(strwrap(text, width = width, simplify = FALSE), paste, collapse = \"\\n\")\n}\n\nevals_pub$wrapped_pub_names <- wrap_text(evals_pub$paper_abbrev, width = 15)\n\n\n#todo -- sort by average overall, use color and vertical spacing more\n#todo: introduce a carriage return into the paper names (workaround) to wrap these and save horizontal space\n\n\n# Dot plot\nggplot(evals_pub, aes(x = paper_abbrev, y = overall)) +\n geom_point(stat = \"identity\", size = 3, shape = 1, colour = \"lightblue\", stroke = 2) +\n geom_text_repel(aes(label = eval_name), \n size = 3, \n box.padding = unit(0.35, \"lines\"),\n point.padding = unit(0.3, \"lines\")) +\n coord_flip() + # flipping the coordinates to have categories on y-axis (on the left)\n theme_light() +\n xlab(\"Paper\") + # remove x-axis label\n ylab(\"Overall score\") + # name y-axis\n ggtitle(\"Overall scores of evaluated papers\") +# add title\n theme(\n panel.grid.major = element_blank(),\n panel.grid.minor = element_blank(),\n text = element_text(size = 14), # changing all text size to 16\n axis.text.y = element_text(size = 8),\n axis.text.x = element_text(size = 12)\n )\n```\n\n::: {.cell-output-display}\n![](evaluation_data_files/figure-html/unnamed-chunk-20-1.png){width=672}\n:::\n\n```{.r .cell-code}\n#todo -- add more vertical space between papers\n```\n:::\n\n:::\n\n\nIn future (todo), we aim to build a dashboard allowing people to use the complete set of ratings and predictions, and choose their own weightings. (Also incorporating the evaluator uncertainty in reasonable ways.)\n\n*The below should be fixed -- the column widths below are misleading*\n\n::: {.callout-note collapse=\"true\"}\n## Future vis\n\nSpider or radial chart \n\nEach rating is a dimension or attribute (potentially normalized)\npotentially superimpose a 'circle' for the suggested weighting or overall. \n\nEach paper gets its own spider, with all others (or the average) in faded color behind it as a comparator. \n\nIdeally user can switch on/off \n\nBeware -- people infer things from the shape's size\n\n\n::: \n\n::: column-body-outset\n\n\n::: {.cell}\n\n```{.r .cell-code}\nunit.scale = function(x) (x*100 - min(x*100)) / (max(x*100) - min(x*100))\nevaluations_table <- evals_pub %>%\n select(paper_abbrev, eval_name, cat_1, source_main, overall, adv_knowledge, methods, logic_comms, journal_predict) %>%\n arrange(desc(paper_abbrev))\n\nout = formattable(\n evaluations_table,\n list(\n #area(col = 5:8) ~ function(x) percent(x / 100, digits = 0),\n area(col = 5:8) ~ color_tile(\"#FA614B66\",\"#3E7DCC\"),\n `journal_predict` = proportion_bar(\"#DeF7E9\", unit.scale)\n )\n)\nout\n```\n\n::: {.cell-output-display}\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n
| paper_abbrev | eval_name | cat_1 | source_main | overall | adv_knowledge | methods | logic_comms | journal_predict |
|---|---|---|---|---|---|---|---|---|
| Well-being: Cash vs. psychotherapy | Anonymous_13 | GH&D | internal-NBER | 90 | 90 | 90 | 80 | 4.0 |
| Well-being: Cash vs. psychotherapy | Hannah Metzler | GH&D | internal-NBER | 75 | 70 | 90 | 75 | 3.0 |
| Nonprofit Govc.: Randomized healthcare DRC | Wayne Aaron Sandholtz | GH&D | internal-NBER | 65 | 70 | 60 | 55 | 3.6 |
| LT CEA: Resilient foods vs. AGI safety | Scott Janzwood | long-term-relevant | submitted | 65 | NA | NA | NA | NA |
| LT CEA: Resilient foods vs. AGI safety | Anca Hanea | long-term-relevant | submitted | 80 | 80 | 70 | 85 | 3.5 |
| LT CEA: Resilient foods vs. AGI safety | Alex Bates | long-term-relevant | submitted | 40 | 30 | 50 | 60 | 2.0 |
| Env. fx of prod.: ecological obs | Elias Cisneros | NA | internal-NBER | 88 | 90 | 75 | 80 | 4.0 |
| Env. fx of prod.: ecological obs | Anonymous_12 | NA | internal-NBER | 70 | 70 | 70 | 75 | 4.0 |
| CBT Human K, Ghana | Anonymous_11 | NA | internal-NBER | 75 | 60 | 90 | 70 | 4.0 |
| CBT Human K, Ghana | Anonymous_16 | NA | internal-NBER | 75 | 65 | 60 | 75 | NA |
| Banning wildlife trade can boost demand | Anonymous_3 | conservation | submitted | 75 | 70 | 80 | 70 | 3.0 |
| Banning wildlife trade can boost demand | Liew Jia Huan | conservation | submitted | 75 | 80 | 50 | 70 | 2.5 |
| Advance market commit. (vaccines) | David Manheim | policy | internal-from-syllabus-agenda-policy-database | 80 | 25 | 95 | 75 | 3.0 |
| Advance market commit. (vaccines) | Joel Tan | policy | internal-from-syllabus-agenda-policy-database | 79 | 90 | 70 | 70 | 5.0 |
| Advance market commit. (vaccines) | Dan Tortorice | policy | internal-from-syllabus-agenda-policy-database | 80 | 90 | 80 | 80 | 4.0 |
| AI and econ. growth | Seth Benzell | macroeconomics | internal-from-syllabus-agenda-policy-database | 80 | 75 | 80 | 70 | NA |
| AI and econ. growth | Phil Trammel | macroeconomics | internal-from-syllabus-agenda-policy-database | 92 | 97 | 70 | 45 | 3.5 |
\n:::\n:::\n\n:::\n\n\nNext, look for systematic variation \n\n- By field and topic area of paper\n\n- By submission/selection route\n\n- By evaluation manager\n\n... perhaps building a model of this. We are looking for systematic 'biases and trends', loosely speaking, to help us better understand how our evaluation system is working.\n\n\\\n\n\n\n\n### Relationship among the ratings (and predictions) {-} \n\n- Correlation matrix\n\n- ANOVA\n\n- PCA (Principle components)\n\n- With other 'control' factors?\n\n- How do the specific measures predict the aggregate ones (overall rating, merited publication)\n - CF 'our suggested weighting'\n\n\n## Aggregation of expert opinion (modeling)\n\n\n## Notes on sources and approaches\n\n\n::: {.callout-note collapse=\"true\"}\n\n## Hanea et al {-}\n(Consult, e.g., repliCATS/Hanea and others work; meta-science and meta-analysis approaches)\n\n`aggrecat` package\n\n> Although the accuracy, calibration, and informativeness of the majority of methods are very similar, a couple of the aggregation methods consistently distinguish themselves as among the best or worst. Moreover, the majority of methods outperform the usual benchmarks provided by the simple average or the median of estimates.\n\n[Hanea et al, 2021](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0256919#sec007)\n\n However, these are in a different context. Most of those measures are designed to deal with probablistic forecasts for binary outcomes, where the predictor also gives a 'lower bound' and 'upper bound' for that probability. We could roughly compare that to our continuous metrics with 90% CI's (or imputations for these).\n\nFurthermore, many (all their successful measures?) use 'performance-based weights', accessing metrics from prior prediction performance of the same forecasters We do not have these, nor do we have a sensible proxy for this. \n:::\n\n\n::: {.callout-note collapse=\"true\"}\n## D Veen et al (2017)\n\n[link](https://www.researchgate.net/profile/Duco-Veen/publication/319662351_Using_the_Data_Agreement_Criterion_to_Rank_Experts'_Beliefs/links/5b73e2dc299bf14c6da6c663/Using-the-Data-Agreement-Criterion-to-Rank-Experts-Beliefs.pdf)\n\n... we show how experts can be ranked based on their knowledge and their level of (un)certainty. By letting experts specify their knowledge in the form of a probability distribution, we can assess how accurately they can predict new data, and how appropriate their level of (un)certainty is. The expert’s specified probability distribution can be seen as a prior in a Bayesian statistical setting. We evaluate these priors by extending an existing prior-data (dis)agreement measure, the Data Agreement Criterion, and compare this approach to using Bayes factors to assess prior specification. We compare experts with each other and the data to evaluate their appropriateness. Using this method, new research questions can be asked and answered, for instance: Which expert predicts the new data best? Is there agreement between my experts and the data? Which experts’ representation is more valid or useful? Can we reach convergence between expert judgement and data? We provided an empirical example ranking (regional) directors of a large financial institution based on their predictions of turnover. 
\n\nBe sure to consult the [correction made here](https://www.semanticscholar.org/paper/Correction%3A-Veen%2C-D.%3B-Stoel%2C-D.%3B-Schalken%2C-N.%3B-K.%3B-Veen-Stoel/a2882e0e8606ef876133f25a901771259e7033b1)\n\n::: \n\n\n::: {.callout-note collapse=\"true\"}\n## Also seems relevant:\n\nSee [Gsheet HERE](https://docs.google.com/spreadsheets/d/14japw6eLGpGjEWy1MjHNJXU1skZY_GAIc2uC2HIUalM/edit#gid=0), generated from an Elicit.org inquiry.\n\n\n::: \n\n\n\nIn spite of the caveats in the fold above, we construct some measures of aggregate beliefs using the `aggrecat` package. We will make (and explain) some ad-hoc choices here. We present these:\n\n1. For each paper\n2. For categories of papers and cross-paper categories of evaluations\n3. For the overall set of papers and evaluations\n\nWe can also hold onto these aggregated metrics for later use in modeling.\n\n\n- Simple averaging\n\n- Bayesian approaches \n\n- Best-performing approaches from elsewhere \n\n- Assumptions over unit-level random terms \n\n### Explicit modeling of 'research quality' (for use in prizes, etc.) {-}\n\n- Use the above aggregation as the outcome of interest, or weight towards categories of greater interest?\n\n- Model with controls -- look for greatest positive residual? \n\n\n## Inter-rater reliability\n\n## Decomposing variation, dimension reduction, simple linear models\n\n\n## Later possiblities\n\n- Relation to evaluation text content (NLP?)\n\n- Relation/prediction of later outcomes (traditional publication, citations, replication)\n\n\n\n## Scoping our future coverage\n\nWe have funding to evaluate roughly 50-70 papers/projects per year, given our proposed incentives.\n\nConsider:\n\n- How many relevant NBER papers come out per year?\n\n- How much relevant work in other prestige archives?\n\n- What quotas do we want (by cause, etc.) 
and how feasible are these?\n\n", "supporting": [ "evaluation_data_files" ], diff --git a/_freeze/chapters/evaluation_data/figure-html/unnamed-chunk-16-1.png b/_freeze/chapters/evaluation_data/figure-html/unnamed-chunk-16-1.png index 4690280..f11e507 100644 Binary files a/_freeze/chapters/evaluation_data/figure-html/unnamed-chunk-16-1.png and b/_freeze/chapters/evaluation_data/figure-html/unnamed-chunk-16-1.png differ diff --git a/_freeze/chapters/evaluation_data/figure-html/unnamed-chunk-20-1.png b/_freeze/chapters/evaluation_data/figure-html/unnamed-chunk-20-1.png index 89cd41b..29df110 100644 Binary files a/_freeze/chapters/evaluation_data/figure-html/unnamed-chunk-20-1.png and b/_freeze/chapters/evaluation_data/figure-html/unnamed-chunk-20-1.png differ diff --git a/_publish.yml b/_publish.yml new file mode 100644 index 0000000..1ecbd98 --- /dev/null +++ b/_publish.yml @@ -0,0 +1,4 @@ +- source: project + netlify: + - id: e484e784-b6f7-4d9d-a85e-131611bc8264 + url: 'https://stirring-arithmetic-e23d09.netlify.app' diff --git a/chapters/evaluation_data.qmd b/chapters/evaluation_data.qmd index 2760244..4ec77be 100644 --- a/chapters/evaluation_data.qmd +++ b/chapters/evaluation_data.qmd @@ -396,18 +396,21 @@ papers_evaluated <- all_pub_records %>% filter(`stage of process/todo` %in% c(" papers_complete <- all_pub_records %>% filter(`stage of process/todo` == "published") %>% nrow() -papers_in_progress <- papers_evaluated-papers_complete +papers_in_progress <- papers_evaluated - papers_complete papers_still_in_consideration <- all_pub_records %>% filter(`stage of process/todo` == "considering") %>% nrow() +#todo: adjust wording of hover notes ('source, target...etc') + fig <- plot_ly( type = "sankey", orientation = "h", node = list( - label = c("Prioritized", "Eval uated", "Complete", "In progress", "Still in consideration", "De-prioritized"), + label = c("Prioritized", "Evaluating(ed)", "Complete", "In progress", "Still in consideration", "De-prioritized"), color = c("orange", "green", "green", "orange", "orange", "red"), +#Todo: adjust 'location' to group these left to right pad = 15, thickness = 20, line = list( @@ -439,6 +442,8 @@ fig ``` +(In future, will make interactive/dashboards of the elements below) + ```{r} summary_df <- evals_pub %>% @@ -474,7 +479,7 @@ ggplot(evals_pub, aes(x = source_main_wrapped)) + labs(x = "Source", y = "Count") + theme_light() + theme_minimal() + - ggtitle("Evaluations by source of the paper") + # add title + ggtitle("Pool of research/evaluations by paper source") + # add title theme( panel.grid.major = element_blank(), panel.grid.minor = element_blank(), @@ -535,10 +540,13 @@ wrap_text <- function(text, width) { evals_pub$wrapped_pub_names <- wrap_text(evals_pub$paper_abbrev, width = 15) +#todo -- sort by average overall, use color and vertical spacing more +#todo: introduce a carriage return into the paper names (workaround) to wrap these and save horizontal space + # Dot plot ggplot(evals_pub, aes(x = paper_abbrev, y = overall)) + - geom_point(stat = "identity", size = 4, shape = 1, colour = "lightblue", stroke = 3) + + geom_point(stat = "identity", size = 3, shape = 1, colour = "lightblue", stroke = 2) + geom_text_repel(aes(label = eval_name), size = 3, box.padding = unit(0.35, "lines"), @@ -555,10 +563,33 @@ ggplot(evals_pub, aes(x = paper_abbrev, y = overall)) + axis.text.y = element_text(size = 8), axis.text.x = element_text(size = 12) ) +#todo -- add more vertical space between papers + ``` ::: +In future (todo), we aim to build a dashboard 
allowing people to use the complete set of ratings and predictions, and choose their own weightings. (Also incorporating the evaluator uncertainty in reasonable ways.) + +*The below should be fixed -- the column widths below are misleading* + +::: {.callout-note collapse="true"} +## Future vis + +Spider or radial chart + +Each rating is a dimension or attribute (potentially normalized) +potentially superimpose a 'circle' for the suggested weighting or overall. + +Each paper gets its own spider, with all others (or the average) in faded color behind it as a comparator. + +Ideally user can switch on/off + +Beware -- people infer things from the shape's size + + +::: + ::: column-body-outset ```{r} diff --git a/docs/chapters/evaluation_data.html b/docs/chapters/evaluation_data.html index 5be249f..9187d86 100644 --- a/docs/chapters/evaluation_data.html +++ b/docs/chapters/evaluation_data.html @@ -217,7 +217,7 @@

-
load packages
source(here::here("code", "shared_packages_code.R"))
+
load packages
source(here::here("code", "shared_packages_code.R"))
 
 #devtools::install_github("rethinkpriorities/rp-r-package")
 library(rethinkpriorities)
@@ -471,8 +471,8 @@ 

-
save data for others’ use
evals_pub %>% saveRDS(file = here("data", "evals.Rdata"))
-evals_pub %>% write_csv(file = here("data", "evals.csv"))
+
save data for others’ use
evals_pub %>% saveRDS(file = here("data", "evals.Rdata"))
+evals_pub %>% write_csv(file = here("data", "evals.csv"))
 
 #evals_pub %>% readRDS(file = here("data", "evals.Rdata"))
@@ -502,8 +502,8 @@

)

-
- +
+


@@ -522,8 +522,8 @@

)

-
- +
+


@@ -587,18 +587,21 @@

papers_complete <- all_pub_records %>% filter(`stage of process/todo` == "published") %>% nrow() -papers_in_progress <- papers_evaluated-papers_complete +papers_in_progress <- papers_evaluated - papers_complete papers_still_in_consideration <- all_pub_records %>% filter(`stage of process/todo` == "considering") %>% nrow() +#todo: adjust wording of hover notes ('source, target...etc') + fig <- plot_ly( type = "sankey", orientation = "h", node = list( - label = c("Prioritized", "Eval uated", "Complete", "In progress", "Still in consideration", "De-prioritized"), + label = c("Prioritized", "Evaluating(ed)", "Complete", "In progress", "Still in consideration", "De-prioritized"), color = c("orange", "green", "green", "orange", "orange", "red"), +#Todo: adjust 'location' to group these left to right pad = 15, thickness = 20, line = list( @@ -627,10 +630,11 @@

fig
-
- +
+
+

(In future, will make interactive/dashboards of the elements below)

Code
summary_df <- evals_pub %>%
   distinct(crucial_rsx, .keep_all = T) %>% 
@@ -660,7 +664,7 @@ 

labs(x = "Source", y = "Count") + theme_light() + theme_minimal() + - ggtitle("Evaluations by source of the paper") + # add title + ggtitle("Pool of research/evaluations by paper source") + # add title theme( panel.grid.major = element_blank(), panel.grid.minor = element_blank(), @@ -716,10 +720,13 @@

evals_pub$wrapped_pub_names <- wrap_text(evals_pub$paper_abbrev, width = 15) +#todo -- sort by average overall, use color and vertical spacing more +#todo: introduce a carriage return into the paper names (workaround) to wrap these and save horizontal space + # Dot plot ggplot(evals_pub, aes(x = paper_abbrev, y = overall)) + - geom_point(stat = "identity", size = 4, shape = 1, colour = "lightblue", stroke = 3) + + geom_point(stat = "identity", size = 3, shape = 1, colour = "lightblue", stroke = 2) + geom_text_repel(aes(label = eval_name), size = 3, box.padding = unit(0.35, "lines"), @@ -739,11 +746,35 @@

+
Code
#todo -- add more vertical space between papers
+
+
+ +

In future (todo), we aim to build a dashboard allowing people to use the complete set of ratings and predictions, and choose their own weightings. (Also incorporating the evaluator uncertainty in reasonable ways.)

+

The below should be fixed – the column widths below are misleading

+
+ +
+
+

Spider or radial chart

+

Each rating is a dimension or attribute (potentially normalized) potentially superimpose a ‘circle’ for the suggested weighting or overall.

+

Each paper gets its own spider, with all others (or the average) in faded color behind it as a comparator.

+

Ideally user can switch on/off

+

Beware – people infer things from the shape’s size

+
-
Code
unit.scale = function(x) (x*100 - min(x*100)) / (max(x*100) - min(x*100))
+
Code
unit.scale = function(x) (x*100 - min(x*100)) / (max(x*100) - min(x*100))
 evaluations_table <- evals_pub %>%
   select(paper_abbrev, eval_name, cat_1, source_main, overall, adv_knowledge, methods, logic_comms, journal_predict) %>%
   arrange(desc(paper_abbrev))
@@ -1313,7 +1344,7 @@ 

2.3 Notes on sources and approaches

- -
+

(Consult, e.g., repliCATS/Hanea and others' work; meta-science and meta-analysis approaches)

aggrecat package

@@ -1336,7 +1367,7 @@

- -
+

link

… we show how experts can be ranked based on their knowledge and their level of (un)certainty. By letting experts specify their knowledge in the form of a probability distribution, we can assess how accurately they can predict new data, and how appropriate their level of (un)certainty is. The expert’s specified probability distribution can be seen as a prior in a Bayesian statistical setting. We evaluate these priors by extending an existing prior-data (dis)agreement measure, the Data Agreement Criterion, and compare this approach to using Bayes factors to assess prior specification. We compare experts with each other and the data to evaluate their appropriateness. Using this method, new research questions can be asked and answered, for instance: Which expert predicts the new data best? Is there agreement between my experts and the data? Which experts’ representation is more valid or useful? Can we reach convergence between expert judgement and data? We provided an empirical example ranking (regional) directors of a large financial institution based on their predictions of turnover.

@@ -1354,7 +1385,7 @@

- -
+

See Gsheet HERE, generated from an Elicit.org inquiry.

@@ -1570,705 +1601,736 @@

diff --git a/docs/chapters/evaluation_data_files/figure-html/unnamed-chunk-16-1.png b/docs/chapters/evaluation_data_files/figure-html/unnamed-chunk-16-1.png index 4690280..f11e507 100644 Binary files a/docs/chapters/evaluation_data_files/figure-html/unnamed-chunk-16-1.png and b/docs/chapters/evaluation_data_files/figure-html/unnamed-chunk-16-1.png differ diff --git a/docs/chapters/evaluation_data_files/figure-html/unnamed-chunk-20-1.png b/docs/chapters/evaluation_data_files/figure-html/unnamed-chunk-20-1.png index 89cd41b..29df110 100644 Binary files a/docs/chapters/evaluation_data_files/figure-html/unnamed-chunk-20-1.png and b/docs/chapters/evaluation_data_files/figure-html/unnamed-chunk-20-1.png differ diff --git a/docs/search.json b/docs/search.json index 55ba045..5e538d8 100644 --- a/docs/search.json +++ b/docs/search.json @@ -18,7 +18,7 @@ "href": "chapters/evaluation_data.html#what-sorts-of-papersprojects-are-we-considering-and-evaluating", "title": "\n1  Evaluation data: description, exploration, checks\n", "section": "\n2.1 What sorts of papers/projects are we considering and evaluating?", - "text": "2.1 What sorts of papers/projects are we considering and evaluating?\nIn this section, we give some simple data summaries and visualizations, for a broad description of The Unjournal’s coverage.\nIn the interactive tables below we give some key attributes of the papers and the evaluators, and a preview of the evaluations.\n\n\nCode(\n all_evals_dt <- evals_pub %>%\n arrange(paper_abbrev, eval_name) %>%\n dplyr::select(paper_abbrev, crucial_rsx, eval_name, cat_1, cat_2, source_main_wrapped, author_agreement) %>%\n dplyr::select(-matches(\"ub_|lb_|conf\")) %>% \n #rename_all(~ gsub(\"_\", \" \", .)) %>% \n rename(\"Research _____________________\" = \"crucial_rsx\" \n ) %>%\n DT::datatable(\n caption = \"Evaluations (confidence bounds not shown)\", \n filter = 'top',\n rownames= FALSE,\n options = list(pageLength = 7)\n )\n)\n\n\n\n\n\n\n\nNext, the ‘middle ratings and predictions’.\n\nData datable (all shareable relevant data)(\n all_evals_dt <- evals_pub %>%\n arrange(paper_abbrev, eval_name, overall) %>%\n dplyr::select(paper_abbrev, eval_name, all_of(rating_cats)) %>%\n DT::datatable(\n caption = \"Evaluations and predictions (confidence bounds not shown)\", \n filter = 'top',\n rownames= FALSE,\n options = list(pageLength = 7)\n )\n)\n\n\n\n\n\n\n\n\n\nCode(\n all_evals_dt_ci <- evals_pub %>%\n arrange(paper_abbrev, eval_name) %>%\n dplyr::select(paper_abbrev, eval_name, conf_overall, rating_cats, matches(\"ub_imp|lb_imp\")) %>%\n DT::datatable(\n caption = \"Evaluations and (imputed*) confidence bounds)\", \n filter = 'top',\n rownames= FALSE,\n options = list(pageLength = 7)\n )\n)\n\n\n\n\n\n\n\n\n\nNext consider…\n\n\n\n\n\n\nComposition of research evaluated\n\nBy field (economics, psychology, etc.)\nBy subfield of economics\nBy topic/cause area (Global health, economic development, impact of technology, global catastrophic risks, etc. 
)\nBy source (submitted, identified with author permission, direct evaluation)\n\n\nTiming of intake and evaluation2\n\n\n\n\n\nThe funnel plot below starts with the paper we prioritized for likely Unjournal evaluation, marking these as ‘considering’.\n\nCode#Add in the 3 different evaluation input sources\n#update to be automated rather than hard-coded - to look at David's work here\n\npapers_considered <- all_pub_records %>% nrow()\n\npapers_deprio <- all_pub_records %>% filter(`stage of process/todo` == \"de-prioritized\") %>% nrow()\n\npapers_evaluated <- all_pub_records %>% filter(`stage of process/todo` %in% c(\"published\",\n \"contacting/awaiting_authors_response_to_evaluation\",\n \"awaiting_publication_ME_comments\",\n \"awaiting_evaluations\")) %>% nrow()\n\npapers_complete <- all_pub_records %>% filter(`stage of process/todo` == \"published\") %>% \nnrow()\n\npapers_in_progress <- papers_evaluated-papers_complete\n\npapers_still_in_consideration <- all_pub_records %>% filter(`stage of process/todo` == \"considering\") %>% nrow()\n\n\nfig <- plot_ly(\n type = \"sankey\",\n orientation = \"h\",\n\n node = list(\n label = c(\"Prioritized\", \"Eval uated\", \"Complete\", \"In progress\", \"Still in consideration\", \"De-prioritized\"),\n color = c(\"orange\", \"green\", \"green\", \"orange\", \"orange\", \"red\"),\n pad = 15,\n thickness = 20,\n line = list(\n color = \"black\",\n width = 0.5\n )\n ),\n\n link = list(\n source = c(0,1,1,0,0),\n target = c(1,2,3,4,5),\n value = c(\n papers_evaluated,\n papers_complete,\n papers_in_progress,\n papers_still_in_consideration,\n papers_deprio\n ))\n )\nfig <- fig %>% layout(\n title = \"Unjournal paper funnel\",\n font = list(\n size = 10\n )\n)\n\nfig \n\n\n\n\n\n\nCodesummary_df <- evals_pub %>%\n distinct(crucial_rsx, .keep_all = T) %>% \n group_by(cat_1) %>%\n summarise(count = n()) \n\nsummary_df$cat_1[is.na(summary_df$cat_1)] <- \"Unknown\"\n\nsummary_df <- summary_df %>%\n arrange(-desc(count)) %>%\n mutate(cat_1 = factor(cat_1, levels = unique(cat_1)))\n\n# Create stacked bar chart\nggplot(summary_df, aes(x = cat_1, y = count)) +\n geom_bar(stat = \"identity\") + \n theme_minimal() +\n labs(x = \"Paper category\", y = \"Count\", \n title = \"Count of evaluated papers by primary category\") \n\n\n\n\n\nCode# Bar plot\nggplot(evals_pub, aes(x = source_main_wrapped)) + \n geom_bar(position = \"stack\", stat = \"count\") +\n labs(x = \"Source\", y = \"Count\") +\n theme_light() +\n theme_minimal() +\n ggtitle(\"Evaluations by source of the paper\") + # add title\n theme(\n panel.grid.major = element_blank(),\n panel.grid.minor = element_blank(),\n text = element_text(size = 16), # changing all text size to 16\n axis.text.y = element_text(size = 10),\n axis.text.x = element_text(size = 14)\n )\n\n\n\n\n\nCodeall_pub_records$is_evaluated = all_pub_records$`stage of process/todo` %in% c(\"published\",\n \"contacting/awaiting_authors_response_to_evaluation\",\n \"awaiting_publication_ME_comments\",\n \"awaiting_evaluations\")\n\nall_pub_records$source_main[all_pub_records$source_main == \"NA\"] <- \"Not applicable\" \nall_pub_records$source_main[all_pub_records$source_main == \"internal-from-syllabus-agenda-policy-database\"] <- \"Internal: syllabus, agenda, etc.\" \nall_pub_records$source_main = tidyr::replace_na(all_pub_records$source_main, \"Unknown\")\n\n\n\nggplot(all_pub_records, aes(x = fct_infreq(source_main), fill = is_evaluated)) + \n geom_bar(position = \"stack\", stat = \"count\") +\n labs(x = \"Source\", y = \"Count\", fill 
= \"Selected for\\nevaluation?\") +\n coord_flip() + # flipping the coordinates to have categories on y-axis (on the left)\n theme_light() +\n theme_minimal() +\n ggtitle(\"Evaluations by source of the paper\") +# add title\n theme(\n panel.grid.major = element_blank(),\n panel.grid.minor = element_blank(),\n text = element_text(size = 16), # changing all text size to 16\n axis.text.y = element_text(size = 12),\n axis.text.x = element_text(size = 14)\n )\n\n\n\n\nThe distribution of ratings and predictions\nNext, we present the ratings and predictions along with ‘uncertainty measures’.3 Where evaluators gave only a 1-5 confidence level4, we use the imputations discussed and coded above.\n\nFor each category and prediction (overall and by paper)\n\n\n\nCodewrap_text <- function(text, width) {\n sapply(strwrap(text, width = width, simplify = FALSE), paste, collapse = \"\\n\")\n}\n\nevals_pub$wrapped_pub_names <- wrap_text(evals_pub$paper_abbrev, width = 15)\n\n\n\n# Dot plot\nggplot(evals_pub, aes(x = paper_abbrev, y = overall)) +\n geom_point(stat = \"identity\", size = 4, shape = 1, colour = \"lightblue\", stroke = 3) +\n geom_text_repel(aes(label = eval_name), \n size = 3, \n box.padding = unit(0.35, \"lines\"),\n point.padding = unit(0.3, \"lines\")) +\n coord_flip() + # flipping the coordinates to have categories on y-axis (on the left)\n theme_light() +\n xlab(\"Paper\") + # remove x-axis label\n ylab(\"Overall score\") + # name y-axis\n ggtitle(\"Overall scores of evaluated papers\") +# add title\n theme(\n panel.grid.major = element_blank(),\n panel.grid.minor = element_blank(),\n text = element_text(size = 14), # changing all text size to 16\n axis.text.y = element_text(size = 8),\n axis.text.x = element_text(size = 12)\n )\n\n\n\n\n\n\n\nCodeunit.scale = function(x) (x*100 - min(x*100)) / (max(x*100) - min(x*100))\nevaluations_table <- evals_pub %>%\n select(paper_abbrev, eval_name, cat_1, source_main, overall, adv_knowledge, methods, logic_comms, journal_predict) %>%\n arrange(desc(paper_abbrev))\n\nout = formattable(\n evaluations_table,\n list(\n #area(col = 5:8) ~ function(x) percent(x / 100, digits = 0),\n area(col = 5:8) ~ color_tile(\"#FA614B66\",\"#3E7DCC\"),\n `journal_predict` = proportion_bar(\"#DeF7E9\", unit.scale)\n )\n)\nout\n\n\n\n\n\npaper_abbrev\n\n\neval_name\n\n\ncat_1\n\n\nsource_main\n\n\noverall\n\n\nadv_knowledge\n\n\nmethods\n\n\nlogic_comms\n\n\njournal_predict\n\n\n\n\n\nWell-being: Cash vs. psychotherapy\n\n\nAnonymous_13\n\n\nGH&D\n\n\ninternal-NBER\n\n\n90\n\n\n90\n\n\n90\n\n\n80\n\n\n4.0\n\n\n\n\nWell-being: Cash vs. psychotherapy\n\n\nHannah Metzler\n\n\nGH&D\n\n\ninternal-NBER\n\n\n75\n\n\n70\n\n\n90\n\n\n75\n\n\n3.0\n\n\n\n\nNonprofit Govc.: Randomized healthcare DRC\n\n\nWayne Aaron Sandholtz\n\n\nGH&D\n\n\ninternal-NBER\n\n\n65\n\n\n70\n\n\n60\n\n\n55\n\n\n3.6\n\n\n\n\nLT CEA: Resilient foods vs. AGI safety\n\n\nScott Janzwood\n\n\nlong-term-relevant\n\n\nsubmitted\n\n\n65\n\n\nNA\n\n\nNA\n\n\nNA\n\n\nNA\n\n\n\n\nLT CEA: Resilient foods vs. AGI safety\n\n\nAnca Hanea\n\n\nlong-term-relevant\n\n\nsubmitted\n\n\n80\n\n\n80\n\n\n70\n\n\n85\n\n\n3.5\n\n\n\n\nLT CEA: Resilient foods vs. AGI safety\n\n\nAlex Bates\n\n\nlong-term-relevant\n\n\nsubmitted\n\n\n40\n\n\n30\n\n\n50\n\n\n60\n\n\n2.0\n\n\n\n\nEnv. fx of prod.: ecological obs\n\n\nElias Cisneros\n\n\nNA\n\n\ninternal-NBER\n\n\n88\n\n\n90\n\n\n75\n\n\n80\n\n\n4.0\n\n\n\n\nEnv. 
fx of prod.: ecological obs\n\n\nAnonymous_12\n\n\nNA\n\n\ninternal-NBER\n\n\n70\n\n\n70\n\n\n70\n\n\n75\n\n\n4.0\n\n\n\n\nCBT Human K, Ghana\n\n\nAnonymous_11\n\n\nNA\n\n\ninternal-NBER\n\n\n75\n\n\n60\n\n\n90\n\n\n70\n\n\n4.0\n\n\n\n\nCBT Human K, Ghana\n\n\nAnonymous_16\n\n\nNA\n\n\ninternal-NBER\n\n\n75\n\n\n65\n\n\n60\n\n\n75\n\n\nNA\n\n\n\n\nBanning wildlife trade can boost demand\n\n\nAnonymous_3\n\n\nconservation\n\n\nsubmitted\n\n\n75\n\n\n70\n\n\n80\n\n\n70\n\n\n3.0\n\n\n\n\nBanning wildlife trade can boost demand\n\n\nLiew Jia Huan\n\n\nconservation\n\n\nsubmitted\n\n\n75\n\n\n80\n\n\n50\n\n\n70\n\n\n2.5\n\n\n\n\nAdvance market commit. (vaccines)\n\n\nDavid Manheim\n\n\npolicy\n\n\ninternal-from-syllabus-agenda-policy-database\n\n\n80\n\n\n25\n\n\n95\n\n\n75\n\n\n3.0\n\n\n\n\nAdvance market commit. (vaccines)\n\n\nJoel Tan\n\n\npolicy\n\n\ninternal-from-syllabus-agenda-policy-database\n\n\n79\n\n\n90\n\n\n70\n\n\n70\n\n\n5.0\n\n\n\n\nAdvance market commit. (vaccines)\n\n\nDan Tortorice\n\n\npolicy\n\n\ninternal-from-syllabus-agenda-policy-database\n\n\n80\n\n\n90\n\n\n80\n\n\n80\n\n\n4.0\n\n\n\n\nAI and econ. growth\n\n\nSeth Benzell\n\n\nmacroeconomics\n\n\ninternal-from-syllabus-agenda-policy-database\n\n\n80\n\n\n75\n\n\n80\n\n\n70\n\n\nNA\n\n\n\n\nAI and econ. growth\n\n\nPhil Trammel\n\n\nmacroeconomics\n\n\ninternal-from-syllabus-agenda-policy-database\n\n\n92\n\n\n97\n\n\n70\n\n\n45\n\n\n3.5\n\n\n\n\n\n\n\nNext, look for systematic variation\n\nBy field and topic area of paper\nBy submission/selection route\nBy evaluation manager\n\n… perhaps building a model of this. We are looking for systematic ‘biases and trends’, loosely speaking, to help us better understand how our evaluation system is working.\n\nRelationship among the ratings (and predictions)\n\nCorrelation matrix\nANOVA\nPCA (Principle components)\nWith other ‘control’ factors?\n\nHow do the specific measures predict the aggregate ones (overall rating, merited publication)\n\nCF ‘our suggested weighting’" + "text": "2.1 What sorts of papers/projects are we considering and evaluating?\nIn this section, we give some simple data summaries and visualizations, for a broad description of The Unjournal’s coverage.\nIn the interactive tables below we give some key attributes of the papers and the evaluators, and a preview of the evaluations.\n\n\nCode(\n all_evals_dt <- evals_pub %>%\n arrange(paper_abbrev, eval_name) %>%\n dplyr::select(paper_abbrev, crucial_rsx, eval_name, cat_1, cat_2, source_main_wrapped, author_agreement) %>%\n dplyr::select(-matches(\"ub_|lb_|conf\")) %>% \n #rename_all(~ gsub(\"_\", \" \", .)) %>% \n rename(\"Research _____________________\" = \"crucial_rsx\" \n ) %>%\n DT::datatable(\n caption = \"Evaluations (confidence bounds not shown)\", \n filter = 'top',\n rownames= FALSE,\n options = list(pageLength = 7)\n )\n)\n\n\n\n\n\n\n\nNext, the ‘middle ratings and predictions’.\n\nData datable (all shareable relevant data)(\n all_evals_dt <- evals_pub %>%\n arrange(paper_abbrev, eval_name, overall) %>%\n dplyr::select(paper_abbrev, eval_name, all_of(rating_cats)) %>%\n DT::datatable(\n caption = \"Evaluations and predictions (confidence bounds not shown)\", \n filter = 'top',\n rownames= FALSE,\n options = list(pageLength = 7)\n )\n)\n\n\n\n\n\n\n\n\n\nCode(\n all_evals_dt_ci <- evals_pub %>%\n arrange(paper_abbrev, eval_name) %>%\n dplyr::select(paper_abbrev, eval_name, conf_overall, rating_cats, matches(\"ub_imp|lb_imp\")) %>%\n DT::datatable(\n caption = \"Evaluations and (imputed*) confidence 
bounds)\", \n filter = 'top',\n rownames= FALSE,\n options = list(pageLength = 7)\n )\n)\n\n\n\n\n\n\n\n\n\nNext consider…\n\n\n\n\n\n\nComposition of research evaluated\n\nBy field (economics, psychology, etc.)\nBy subfield of economics\nBy topic/cause area (Global health, economic development, impact of technology, global catastrophic risks, etc. )\nBy source (submitted, identified with author permission, direct evaluation)\n\n\nTiming of intake and evaluation2\n\n\n\n\n\nThe funnel plot below starts with the paper we prioritized for likely Unjournal evaluation, marking these as ‘considering’.\n\nCode#Add in the 3 different evaluation input sources\n#update to be automated rather than hard-coded - to look at David's work here\n\npapers_considered <- all_pub_records %>% nrow()\n\npapers_deprio <- all_pub_records %>% filter(`stage of process/todo` == \"de-prioritized\") %>% nrow()\n\npapers_evaluated <- all_pub_records %>% filter(`stage of process/todo` %in% c(\"published\",\n \"contacting/awaiting_authors_response_to_evaluation\",\n \"awaiting_publication_ME_comments\",\n \"awaiting_evaluations\")) %>% nrow()\n\npapers_complete <- all_pub_records %>% filter(`stage of process/todo` == \"published\") %>% \nnrow()\n\npapers_in_progress <- papers_evaluated - papers_complete\n\npapers_still_in_consideration <- all_pub_records %>% filter(`stage of process/todo` == \"considering\") %>% nrow()\n\n\n#todo: adjust wording of hover notes ('source, target...etc')\n\nfig <- plot_ly(\n type = \"sankey\",\n orientation = \"h\",\n\n node = list(\n label = c(\"Prioritized\", \"Evaluating(ed)\", \"Complete\", \"In progress\", \"Still in consideration\", \"De-prioritized\"),\n color = c(\"orange\", \"green\", \"green\", \"orange\", \"orange\", \"red\"),\n#Todo: adjust 'location' to group these left to right\n pad = 15,\n thickness = 20,\n line = list(\n color = \"black\",\n width = 0.5\n )\n ),\n\n link = list(\n source = c(0,1,1,0,0),\n target = c(1,2,3,4,5),\n value = c(\n papers_evaluated,\n papers_complete,\n papers_in_progress,\n papers_still_in_consideration,\n papers_deprio\n ))\n )\nfig <- fig %>% layout(\n title = \"Unjournal paper funnel\",\n font = list(\n size = 10\n )\n)\n\nfig \n\n\n\n\n\n(In future, will make interactive/dashboards of the elements below)\n\nCodesummary_df <- evals_pub %>%\n distinct(crucial_rsx, .keep_all = T) %>% \n group_by(cat_1) %>%\n summarise(count = n()) \n\nsummary_df$cat_1[is.na(summary_df$cat_1)] <- \"Unknown\"\n\nsummary_df <- summary_df %>%\n arrange(-desc(count)) %>%\n mutate(cat_1 = factor(cat_1, levels = unique(cat_1)))\n\n# Create stacked bar chart\nggplot(summary_df, aes(x = cat_1, y = count)) +\n geom_bar(stat = \"identity\") + \n theme_minimal() +\n labs(x = \"Paper category\", y = \"Count\", \n title = \"Count of evaluated papers by primary category\") \n\n\n\n\n\nCode# Bar plot\nggplot(evals_pub, aes(x = source_main_wrapped)) + \n geom_bar(position = \"stack\", stat = \"count\") +\n labs(x = \"Source\", y = \"Count\") +\n theme_light() +\n theme_minimal() +\n ggtitle(\"Pool of research/evaluations by paper source\") + # add title\n theme(\n panel.grid.major = element_blank(),\n panel.grid.minor = element_blank(),\n text = element_text(size = 16), # changing all text size to 16\n axis.text.y = element_text(size = 10),\n axis.text.x = element_text(size = 14)\n )\n\n\n\n\n\nCodeall_pub_records$is_evaluated = all_pub_records$`stage of process/todo` %in% c(\"published\",\n \"contacting/awaiting_authors_response_to_evaluation\",\n 
\"awaiting_publication_ME_comments\",\n \"awaiting_evaluations\")\n\nall_pub_records$source_main[all_pub_records$source_main == \"NA\"] <- \"Not applicable\" \nall_pub_records$source_main[all_pub_records$source_main == \"internal-from-syllabus-agenda-policy-database\"] <- \"Internal: syllabus, agenda, etc.\" \nall_pub_records$source_main = tidyr::replace_na(all_pub_records$source_main, \"Unknown\")\n\n\n\nggplot(all_pub_records, aes(x = fct_infreq(source_main), fill = is_evaluated)) + \n geom_bar(position = \"stack\", stat = \"count\") +\n labs(x = \"Source\", y = \"Count\", fill = \"Selected for\\nevaluation?\") +\n coord_flip() + # flipping the coordinates to have categories on y-axis (on the left)\n theme_light() +\n theme_minimal() +\n ggtitle(\"Evaluations by source of the paper\") +# add title\n theme(\n panel.grid.major = element_blank(),\n panel.grid.minor = element_blank(),\n text = element_text(size = 16), # changing all text size to 16\n axis.text.y = element_text(size = 12),\n axis.text.x = element_text(size = 14)\n )\n\n\n\n\nThe distribution of ratings and predictions\nNext, we present the ratings and predictions along with ‘uncertainty measures’.3 Where evaluators gave only a 1-5 confidence level4, we use the imputations discussed and coded above.\n\nFor each category and prediction (overall and by paper)\n\n\n\nCodewrap_text <- function(text, width) {\n sapply(strwrap(text, width = width, simplify = FALSE), paste, collapse = \"\\n\")\n}\n\nevals_pub$wrapped_pub_names <- wrap_text(evals_pub$paper_abbrev, width = 15)\n\n\n#todo -- sort by average overall, use color and vertical spacing more\n#todo: introduce a carriage return into the paper names (workaround) to wrap these and save horizontal space\n\n\n# Dot plot\nggplot(evals_pub, aes(x = paper_abbrev, y = overall)) +\n geom_point(stat = \"identity\", size = 3, shape = 1, colour = \"lightblue\", stroke = 2) +\n geom_text_repel(aes(label = eval_name), \n size = 3, \n box.padding = unit(0.35, \"lines\"),\n point.padding = unit(0.3, \"lines\")) +\n coord_flip() + # flipping the coordinates to have categories on y-axis (on the left)\n theme_light() +\n xlab(\"Paper\") + # remove x-axis label\n ylab(\"Overall score\") + # name y-axis\n ggtitle(\"Overall scores of evaluated papers\") +# add title\n theme(\n panel.grid.major = element_blank(),\n panel.grid.minor = element_blank(),\n text = element_text(size = 14), # changing all text size to 16\n axis.text.y = element_text(size = 8),\n axis.text.x = element_text(size = 12)\n )\n\n\n\nCode#todo -- add more vertical space between papers\n\n\n\nIn future (todo), we aim to build a dashboard allowing people to use the complete set of ratings and predictions, and choose their own weightings. 
(Also incorporating the evaluator uncertainty in reasonable ways.)\nThe below should be fixed – the column widths below are misleading\n\n\n\n\n\n\nFuture vis\n\n\n\n\n\nSpider or radial chart\nEach rating is a dimension or attribute (potentially normalized) potentially superimpose a ‘circle’ for the suggested weighting or overall.\nEach paper gets its own spider, with all others (or the average) in faded color behind it as a comparator.\nIdeally user can switch on/off\nBeware – people infer things from the shape’s size\n\n\n\n\n\nCodeunit.scale = function(x) (x*100 - min(x*100)) / (max(x*100) - min(x*100))\nevaluations_table <- evals_pub %>%\n select(paper_abbrev, eval_name, cat_1, source_main, overall, adv_knowledge, methods, logic_comms, journal_predict) %>%\n arrange(desc(paper_abbrev))\n\nout = formattable(\n evaluations_table,\n list(\n #area(col = 5:8) ~ function(x) percent(x / 100, digits = 0),\n area(col = 5:8) ~ color_tile(\"#FA614B66\",\"#3E7DCC\"),\n `journal_predict` = proportion_bar(\"#DeF7E9\", unit.scale)\n )\n)\nout\n\n\n\n\n\npaper_abbrev\n\n\neval_name\n\n\ncat_1\n\n\nsource_main\n\n\noverall\n\n\nadv_knowledge\n\n\nmethods\n\n\nlogic_comms\n\n\njournal_predict\n\n\n\n\n\nWell-being: Cash vs. psychotherapy\n\n\nAnonymous_13\n\n\nGH&D\n\n\ninternal-NBER\n\n\n90\n\n\n90\n\n\n90\n\n\n80\n\n\n4.0\n\n\n\n\nWell-being: Cash vs. psychotherapy\n\n\nHannah Metzler\n\n\nGH&D\n\n\ninternal-NBER\n\n\n75\n\n\n70\n\n\n90\n\n\n75\n\n\n3.0\n\n\n\n\nNonprofit Govc.: Randomized healthcare DRC\n\n\nWayne Aaron Sandholtz\n\n\nGH&D\n\n\ninternal-NBER\n\n\n65\n\n\n70\n\n\n60\n\n\n55\n\n\n3.6\n\n\n\n\nLT CEA: Resilient foods vs. AGI safety\n\n\nScott Janzwood\n\n\nlong-term-relevant\n\n\nsubmitted\n\n\n65\n\n\nNA\n\n\nNA\n\n\nNA\n\n\nNA\n\n\n\n\nLT CEA: Resilient foods vs. AGI safety\n\n\nAnca Hanea\n\n\nlong-term-relevant\n\n\nsubmitted\n\n\n80\n\n\n80\n\n\n70\n\n\n85\n\n\n3.5\n\n\n\n\nLT CEA: Resilient foods vs. AGI safety\n\n\nAlex Bates\n\n\nlong-term-relevant\n\n\nsubmitted\n\n\n40\n\n\n30\n\n\n50\n\n\n60\n\n\n2.0\n\n\n\n\nEnv. fx of prod.: ecological obs\n\n\nElias Cisneros\n\n\nNA\n\n\ninternal-NBER\n\n\n88\n\n\n90\n\n\n75\n\n\n80\n\n\n4.0\n\n\n\n\nEnv. fx of prod.: ecological obs\n\n\nAnonymous_12\n\n\nNA\n\n\ninternal-NBER\n\n\n70\n\n\n70\n\n\n70\n\n\n75\n\n\n4.0\n\n\n\n\nCBT Human K, Ghana\n\n\nAnonymous_11\n\n\nNA\n\n\ninternal-NBER\n\n\n75\n\n\n60\n\n\n90\n\n\n70\n\n\n4.0\n\n\n\n\nCBT Human K, Ghana\n\n\nAnonymous_16\n\n\nNA\n\n\ninternal-NBER\n\n\n75\n\n\n65\n\n\n60\n\n\n75\n\n\nNA\n\n\n\n\nBanning wildlife trade can boost demand\n\n\nAnonymous_3\n\n\nconservation\n\n\nsubmitted\n\n\n75\n\n\n70\n\n\n80\n\n\n70\n\n\n3.0\n\n\n\n\nBanning wildlife trade can boost demand\n\n\nLiew Jia Huan\n\n\nconservation\n\n\nsubmitted\n\n\n75\n\n\n80\n\n\n50\n\n\n70\n\n\n2.5\n\n\n\n\nAdvance market commit. (vaccines)\n\n\nDavid Manheim\n\n\npolicy\n\n\ninternal-from-syllabus-agenda-policy-database\n\n\n80\n\n\n25\n\n\n95\n\n\n75\n\n\n3.0\n\n\n\n\nAdvance market commit. (vaccines)\n\n\nJoel Tan\n\n\npolicy\n\n\ninternal-from-syllabus-agenda-policy-database\n\n\n79\n\n\n90\n\n\n70\n\n\n70\n\n\n5.0\n\n\n\n\nAdvance market commit. (vaccines)\n\n\nDan Tortorice\n\n\npolicy\n\n\ninternal-from-syllabus-agenda-policy-database\n\n\n80\n\n\n90\n\n\n80\n\n\n80\n\n\n4.0\n\n\n\n\nAI and econ. growth\n\n\nSeth Benzell\n\n\nmacroeconomics\n\n\ninternal-from-syllabus-agenda-policy-database\n\n\n80\n\n\n75\n\n\n80\n\n\n70\n\n\nNA\n\n\n\n\nAI and econ. 
growth\n\n\nPhil Trammel\n\n\nmacroeconomics\n\n\ninternal-from-syllabus-agenda-policy-database\n\n\n92\n\n\n97\n\n\n70\n\n\n45\n\n\n3.5\n\n\n\n\n\n\n\nNext, look for systematic variation\n\nBy field and topic area of paper\nBy submission/selection route\nBy evaluation manager\n\n… perhaps building a model of this. We are looking for systematic ‘biases and trends’, loosely speaking, to help us better understand how our evaluation system is working.\n\nRelationship among the ratings (and predictions)\n\nCorrelation matrix\nANOVA\nPCA (Principle components)\nWith other ‘control’ factors?\n\nHow do the specific measures predict the aggregate ones (overall rating, merited publication)\n\nCF ‘our suggested weighting’" }, { "objectID": "chapters/evaluation_data.html#aggregation-of-expert-opinion-modeling",