diff --git a/.gitignore b/.gitignore index 16f7881..5b828ed 100644 --- a/.gitignore +++ b/.gitignore @@ -1,47 +1,33 @@ # History files .Rhistory .Rapp.history - # Session Data files .RData .RDataTmp - # User-specific files .Ruserdata - # Example code in package build process *-Ex.R - # Output files from R CMD build /*.tar.gz - # Output files from R CMD check /*.Rcheck/ - # RStudio files .Rproj.user/ - # produced vignettes - # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 .httr-oauth - # knitr and R markdown default cache directories *_cache/ /cache/ - # Temporary files created by R markdown *.utf8.md *.knit.md - # R Environment Variables .Renviron - - # translation temp files po/*~ - # RStudio Connect folder rsconnect/ - /.quarto/ +quarto-cli diff --git a/_freeze/chapters/evaluation_data/execute-results/html.json b/_freeze/chapters/evaluation_data/execute-results/html.json index dd8ab9d..99b9010 100644 --- a/_freeze/chapters/evaluation_data/execute-results/html.json +++ b/_freeze/chapters/evaluation_data/execute-results/html.json @@ -1,14 +1,18 @@ { - "hash": "41e3663b5e7b8e04b8cd544dbb677cb2", + "hash": "b9cd5e9fb7889a16ccdf9ec951bb5e0d", "result": { - "markdown": "# Evaluation data: description, exploration, checks\n\n\n::: {.cell}\n\n```{.r .cell-code code-summary=\"load packages\"}\n#devtools::install_github(\"rethinkpriorities/rp-r-package\")\nlibrary(rethinkpriorities)\n\n#devtools::install_github(\"rethinkpriorities/r-noodling-package\")\nlibrary(rnoodling)\n\nlibrary(here)\nsource(here::here(\"code\", \"shared_packages_code.R\"))\nlibrary(dplyr)\nlibrary(pacman)\n\np_load(DT, santoku, lme4, huxtable, janitor, emmeans, sjPlot, sjmisc, ggeffects, ggrepel, likert, labelled, plotly, stringr, install=FALSE)\n\np_load(ggthemes, paletteer, ggridges, install=FALSE)\n\nselect <- dplyr::select\n\noptions(knitr.duplicate.label = \"allow\")\n\noptions(mc.cores = parallel::detectCores())\n#rstan_options(auto_write = TRUE)\n\n#library(hunspell)\n\n#(brms)\n\n#devtools::install_github(\"bergant/airtabler\")\np_load(airtabler)\n\n#remotes::install_github(\"rmcelreath/rethinking\")\n#library(rethinking)\n```\n:::\n\n::: {.cell}\n\n```{.r .cell-code code-summary=\"install aggrecat package\"}\n#devtools::install_github(\"metamelb-repliCATS/aggreCAT\")\n```\n:::\n\n::: {.cell}\n\n```{.r .cell-code code-summary=\"input from airtable\"}\nbase_id <- \"appbPYEw9nURln7Qg\"\n\n# Set your Airtable API key\n#Sys.setenv(AIRTABLE_API_KEY = \"\") \n#this should be set in my .Renviron file\n\n\n# Read data from a specific view\nevals <- air_get(base = \"appbPYEw9nURln7Qg\", \"output_eval\") \n```\n:::\n\n::: {.cell}\n\n```{.r .cell-code code-summary=\"just the useful and publish-able data, clean a bit\"}\ncolnames(evals) <- snakecase::to_snake_case(colnames(evals))\n\nevals_pub <- evals %>% \n dplyr::rename(stage_of_process = stage_of_process_todo_from_crucial_research_2) %>% \n mutate(stage_of_process = unlist(stage_of_process)) %>% \n dplyr::filter(stage_of_process == \"published\") %>% \n select(id, crucial_research, evaluator_name, category, source_main, author_agreement, overall, lb_overall, ub_overall, conf_index_overall, advancing_knowledge_and_practice, lb_advancing_knowledge_and_practice, ub_advancing_knowledge_and_practice, conf_index_advancing_knowledge_and_practice, methods_justification_reasonableness_validity_robustness, lb_methods_justification_reasonableness_validity_robustness, ub_methods_justification_reasonableness_validity_robustness, 
conf_index_methods_justification_reasonableness_validity_robustness, logic_communication, lb_logic_communication, ub_logic_communication, conf_index_logic_communication, engaging_with_real_world_impact_quantification_practice_realism_and_relevance, lb_engaging_with_real_world_impact_quantification_practice_realism_and_relevance, ub_engaging_with_real_world_impact_quantification_practice_realism_and_relevance, conf_index_engaging_with_real_world_impact_quantification_practice_realism_and_relevance, relevance_to_global_priorities, lb_relevance_to_global_priorities, ub_relevance_to_global_priorities, conf_index_relevance_to_global_priorities, journal_quality_predict, lb_journal_quality_predict, ub_journal_quality_predict, conf_index_journal_quality_predict, open_collaborative_replicable, conf_index_open_collaborative_replicable, lb_open_collaborative_replicable, ub_open_collaborative_replicable, merits_journal, lb_merits_journal, ub_merits_journal, conf_index_merits_journal)\n```\n\n::: {.cell-output .cell-output-error}\n```\nError in `mutate()`:\nℹ In argument: `stage_of_process = unlist(stage_of_process)`.\nCaused by error:\n! `stage_of_process` must be size 20 or 1, not 23.\n```\n:::\n\n```{.r .cell-code code-summary=\"just the useful and publish-able data, clean a bit\"}\nevals_pub %<>%\nmutate(across(everything(), ~ map(.x, ~ ifelse(is.null(.x), NA, .x)), .names = \"{.col}_unlisted\")) %>% \n tidyr::unnest_wider(category, names_sep = \"\") %>%\nmutate(across(everything(), unlist)) #unlist list columns\n```\n\n::: {.cell-output .cell-output-error}\n```\nError in mutate(., across(everything(), ~map(.x, ~ifelse(is.null(.x), : object 'evals_pub' not found\n```\n:::\n\n```{.r .cell-code code-summary=\"just the useful and publish-able data, clean a bit\"}\n#Note: category, topic_subfield, and source have multiple meaningful categories. 
These will need care \n```\n:::\n\n::: {.cell}\n\n```{.r .cell-code code-summary=\"Shorten names \"}\nnew_names <- c(\n \"eval_name\" = \"evaluator_name\",\n \"cat_1\" = \"category1\",\n \"cat_2\" = \"category2\",\n \"cat_3\" = \"category3\",\n \"crucial_rsx\" = \"crucial_research\",\n \"conf_overall\" = \"conf_index_overall\",\n \"adv_knowledge\" = \"advancing_knowledge_and_practice\",\n \"lb_adv_knowledge\" = \"lb_advancing_knowledge_and_practice\",\n \"ub_adv_knowledge\" = \"ub_advancing_knowledge_and_practice\",\n \"conf_adv_knowledge\" = \"conf_index_advancing_knowledge_and_practice\",\n \"methods\" = \"methods_justification_reasonableness_validity_robustness\",\n \"lb_methods\" = \"lb_methods_justification_reasonableness_validity_robustness\",\n \"ub_methods\" = \"ub_methods_justification_reasonableness_validity_robustness\",\n \"conf_methods\" = \"conf_index_methods_justification_reasonableness_validity_robustness\",\n \"logic_comms\" = \"logic_communication\",\n \"lb_logic_comms\" = \"lb_logic_communication\",\n \"ub_logic_comms\" = \"ub_logic_communication\",\n \"conf_logic_comms\" = \"conf_index_logic_communication\",\n \"real_world\" = \"engaging_with_real_world_impact_quantification_practice_realism_and_relevance\",\n \"lb_real_world\" = \"lb_engaging_with_real_world_impact_quantification_practice_realism_and_relevance\",\n \"ub_real_world\" = \"ub_engaging_with_real_world_impact_quantification_practice_realism_and_relevance\",\n \"conf_real_world\" = \"conf_index_engaging_with_real_world_impact_quantification_practice_realism_and_relevance\",\n \"gp_relevance\" = \"relevance_to_global_priorities\",\n \"lb_gp_relevance\" = \"lb_relevance_to_global_priorities\",\n \"ub_gp_relevance\" = \"ub_relevance_to_global_priorities\",\n \"conf_gp_relevance\" = \"conf_index_relevance_to_global_priorities\",\n \"journal_predict\" = \"journal_quality_predict\",\n \"lb_journal_predict\" = \"lb_journal_quality_predict\",\n \"ub_journal_predict\" = \"ub_journal_quality_predict\",\n \"conf_journal_predict\" = \"conf_index_journal_quality_predict\",\n \"open_sci\" = \"open_collaborative_replicable\",\n \"conf_open_sci\" = \"conf_index_open_collaborative_replicable\",\n \"lb_open_sci\" = \"lb_open_collaborative_replicable\",\n \"ub_open_sci\" = \"ub_open_collaborative_replicable\",\n \"conf_merits_journal\" = \"conf_index_merits_journal\"\n)\n\nevals_pub <- evals_pub %>%\n rename(!!!new_names)\n```\n\n::: {.cell-output .cell-output-error}\n```\nError in rename(., !!!new_names): object 'evals_pub' not found\n```\n:::\n\n```{.r .cell-code code-summary=\"Shorten names \"}\n# make the old names into labels\n\nlibrary(stringr)\n \n# Create a list of labels\nlabels <- str_replace_all(new_names, \"_\", \" \")\nlabels <- str_to_title(labels)\n \n# Assign labels to the dataframe\n# for(i in seq_along(labels)) {\n# col_name <- new_names[names(new_names)[i]]\n# label <- labels[i]\n# attr(evals_pub[[col_name]], \"label\") <- label\n# }\n# \n```\n:::\n\n\n\n\n### Reconcile the uncertainty ratings and CIs (first-pass) {-}\n\nImpute CIs from stated confidence level 'dots', correspondence loosely described [here](https://effective-giving-marketing.gitbook.io/unjournal-x-ea-and-global-priorities-research/policies-projects-evaluation-workflow/evaluation/guidelines-for-evaluators#1-5-dots-explanation-and-relation-to-cis)\n\n::: {.callout-note collapse=\"true\"}\n## Dots to interval choices\n\n> 5 = Extremely confident, i.e., 90% confidence interval spans +/- 4 points or less)\n\nFor 0-100 ratings, code the LB as 
$min(R - 4\\times \\frac{R}{100},0)$ and the UB as $max(R + 4\\times \\frac{R}{100},0)$, where R is the stated (middle) rating. This 'scales' the CI, as interpreted, to be proportional to the rating, with a maximum 'interval' of about 8, with the rating is about 96.\n\n> 4 = Very*confident: 90% confidence interval +/- 8 points or less\n\nFor 0-100 ratings, code the LB as $min(R - 8\\times \\frac{R}{100},0)$ and the UB as $max(R + 8\\times \\frac{R}{100},0)$, where R is the stated (middle) rating. \n\n> 3 = Somewhat** confident: 90% confidence interval +/- 15 points or less \n\n> 2 = Not very** confident: 90% confidence interval, +/- 25 points or less\n\nComparable scaling for the 2-3 ratings as for the 4 and 5 rating.\n\n> 1 = Not** confident: (90% confidence interval +/- more than 25 points)\n \nCode LB as $min(R - 37.5\\times \\frac{R}{100},0)$ and the UB as $max(R + 37.5\\times \\frac{R}{100},0)$. \n \nThis is just a first-pass. There might be a more information-theoretic way of doing this. On the other hand, we might be switching the evaluations to use a different tool soon, perhaps getting rid of the 1-5 confidence ratings.\n\n::: \n\n\n\n::: {.cell}\n\n```{.r .cell-code code-summary=\"reconcile explicit bounds and stated confidence level\"}\n# Define the baseline widths for each confidence rating\nbaseline_widths <- c(4, 8, 15, 25, 37.5)\n\n# Define a function to calculate the lower and upper bounds, where given only an index\ncalc_bounds <- function(rating, confidence, lb_explicit, ub_explicit, scale=100) {\n # Check if confidence is NA\n if (is.na(confidence)) {\n return(c(lb_explicit, ub_explicit)) # Return explicit bounds if confidence is NA\n } else {\n baseline_width <- baseline_widths[confidence]\n lb <- pmax(rating - baseline_width * rating / scale, 0)\n ub <- pmin(rating + baseline_width * rating / scale, scale)\n return(c(lb, ub))\n }\n}\n\n# Function to calculate bounds for a single category\ncalc_category_bounds <- function(df, category, scale=100) {\n # Calculate bounds\n bounds <- mapply(calc_bounds, df[[category]], df[[paste0(\"conf_\", category)]], df[[paste0(\"lb_\", category)]], df[[paste0(\"ub_\", category)]])\n \n # Convert to data frame and ensure it has the same number of rows as the input\n bounds_df <- as.data.frame(t(bounds))\n rownames(bounds_df) <- NULL\n \n # Add bounds to original data frame\n df[[paste0(category, \"_lb_imp\")]] <- bounds_df[, 1]\n df[[paste0(category, \"_ub_imp\")]] <- bounds_df[, 2]\n \n return(df)\n}\n\n\n# Lists of categories\n\nrating_cats <- c(\"overall\", \"adv_knowledge\", \"methods\", \"logic_comms\", \"real_world\", \"gp_relevance\", \"open_sci\")\n\n#... 
'predictions' are currently 1-5 (0-5?)\npred_cats <- c(\"journal_predict\", \"merits_journal\")\n\n# Apply the function to each category\n# DR: I don't love this looping 'edit in place' code approach, but whatever\nfor (cat in rating_cats) {\n evals_pub <- calc_category_bounds(evals_pub, cat, scale=100)\n}\n```\n\n::: {.cell-output .cell-output-error}\n```\nError in calc_category_bounds(evals_pub, cat, scale = 100): object 'evals_pub' not found\n```\n:::\n\n```{.r .cell-code code-summary=\"reconcile explicit bounds and stated confidence level\"}\nfor (cat in pred_cats) {\n evals_pub <- calc_category_bounds(evals_pub, cat, scale=5)\n}\n```\n\n::: {.cell-output .cell-output-error}\n```\nError in calc_category_bounds(evals_pub, cat, scale = 5): object 'evals_pub' not found\n```\n:::\n:::\n\n::: {.cell}\n\n```{.r .cell-code code-summary=\"save data for others' use\"}\nevals_pub %>% saveRDS(file = here(\"data\", \"evals.Rdata\"))\n```\n\n::: {.cell-output .cell-output-error}\n```\nError in saveRDS(., file = here(\"data\", \"evals.Rdata\")): object 'evals_pub' not found\n```\n:::\n\n```{.r .cell-code code-summary=\"save data for others' use\"}\nevals_pub %>% write_csv(file = here(\"data\", \"evals.csv\"))\n```\n\n::: {.cell-output .cell-output-error}\n```\nError in is.data.frame(x): object 'evals_pub' not found\n```\n:::\n\n```{.r .cell-code code-summary=\"save data for others' use\"}\n#evals_pub %>% readRDS(file = here(\"data\", \"evals.Rdata\"))\n```\n:::\n\n\n \n# Basic presentation\n\n## Simple data summaries/codebooks/dashboards and visualization\n\nBelow, we give a data table of key attributes of the paper, the author, and the 'middle' ratings and predictions. \n\n\n::: {.cell}\n\n```{.r .cell-code code-summary=\"Data datable (all shareable relevant data)\"}\n(\n all_evals_dt <- evals_pub %>%\n arrange(crucial_rsx, eval_name) %>%\n dplyr::select(crucial_rsx, eval_name, everything())) %>%\n dplyr::select(-id) %>% \n dplyr::select(-matches(\"ub_|lb_|conf\")) %>% \n rename_all(~ gsub(\"_\", \" \", .)) %>% \n rename(\"Research _____________________\" = \"crucial rsx\" \n ) %>%\n DT::datatable(\n caption = \"Evaluations (confidence bounds not shown)\", \n filter = 'top',\n rownames= FALSE,\n options = list(pageLength = 7)\n )\n```\n\n::: {.cell-output .cell-output-error}\n```\nError in arrange(., crucial_rsx, eval_name): object 'evals_pub' not found\n```\n:::\n:::\n\n\nNext, we present the ratings and predictions along with 'uncertainty measures'. We use \"ub imp\" (and \"lb imp\") to denote the upper and lower bounds given by evaluators. Where evaluators gave only a 1-5 confidence level^[More or less, the ones who report a level for 'conf overall', although some people did this for some but not others], we use the imputations discussed and coded above. 
\n\n\n::: {.cell}\n\n```{.r .cell-code}\n(\n all_evals_dt_ci <- evals_pub %>%\n arrange(crucial_rsx, eval_name) %>%\n dplyr::select(crucial_rsx, eval_name, conf_overall, matches(\"ub_imp|lb_imp\")) %>%\n rename_all(~ gsub(\"_\", \" \", .)) %>% \n rename(\"Research _____________________\" = \"crucial rsx\" \n ) %>%\n DT::datatable(\n caption = \"Evaluations and (imputed*) confidence bounds)\", \n filter = 'top',\n rownames= FALSE,\n options = list(pageLength = 7)\n )\n)\n```\n\n::: {.cell-output .cell-output-error}\n```\nError in arrange(., crucial_rsx, eval_name): object 'evals_pub' not found\n```\n:::\n:::\n\n\n\n- Composition of research evaluated\n - By field (economics, psychology, etc.)\n - By subfield of economics \n - By topic/cause area (Global health, economic development, impact of technology, global catastrophic risks, etc. )\n - By source (submitted, identified with author permission, direct evaluation)\n \n- Timing of intake and evaluation \n\n\n### The distribution of ratings and predictions {-}\n\n- For each category and prediction (overall and by paper)\n\n- By field and topic area of paper\n\n- By submission/selection route\n\n- By evaluation manager\n\n\n### Relationship among the ratings (and predictions) {-} \n\n- Correlation matrix\n\n- ANOVA\n\n- PCI\n\n- With other 'control' factors?\n\n- How do the specific measures predict the aggregate ones (overall rating, merited publication)\n - CF 'our suggested weighting'\n\n\n## Aggregation of expert opinion (modeling)\n\n\n## Notes on sources and approaches\n\n\n::: {.callout-note collapse=\"true\"}\n## Hanea et al {-}\n(Consult, e.g., repliCATS/Hanea and others work; meta-science and meta-analysis approaches)\n\n`aggrecat` package\n\n> Although the accuracy, calibration, and informativeness of the majority of methods are very similar, a couple of the aggregation methods consistently distinguish themselves as among the best or worst. Moreover, the majority of methods outperform the usual benchmarks provided by the simple average or the median of estimates.\n\n[Hanea et al, 2021](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0256919#sec007)\n\n However, these are in a different context. Most of those measures are designed to deal with probablistic forecasts for binary outcomes, where the predictor also gives a 'lower bound' and 'upper bound' for that probability. We could roughly compare that to our continuous metrics with 90% CI's (or imputations for these).\n\nFurthermore, many (all their successful measures?) use 'performance-based weights', accessing metrics from prior prediction performance of the same forecasters We do not have these, nor do we have a sensible proxy for this. \n:::\n\n\n### D Veen et al (2017)\n\n[link](https://www.researchgate.net/profile/Duco-Veen/publication/319662351_Using_the_Data_Agreement_Criterion_to_Rank_Experts'_Beliefs/links/5b73e2dc299bf14c6da6c663/Using-the-Data-Agreement-Criterion-to-Rank-Experts-Beliefs.pdf)\n\n... we show how experts can be ranked based on their knowledge and their level of (un)certainty. By letting experts specify their knowledge in the form of a probability distribution, we can assess how accurately they can predict new data, and how appropriate their level of (un)certainty is. The expert’s specified probability distribution can be seen as a prior in a Bayesian statistical setting. 
We evaluate these priors by extending an existing prior-data (dis)agreement measure, the Data Agreement Criterion, and compare this approach to using Bayes factors to assess prior specification. We compare experts with each other and the data to evaluate their appropriateness. Using this method, new research questions can be asked and answered, for instance: Which expert predicts the new data best? Is there agreement between my experts and the data? Which experts’ representation is more valid or useful? Can we reach convergence between expert judgement and data? We provided an empirical example ranking (regional) directors of a large financial institution based on their predictions of turnover. \n\nBe sure to consult the [correction made here](https://www.semanticscholar.org/paper/Correction%3A-Veen%2C-D.%3B-Stoel%2C-D.%3B-Schalken%2C-N.%3B-K.%3B-Veen-Stoel/a2882e0e8606ef876133f25a901771259e7033b1)\n\n::: \n\n\n::: {.callout-note collapse=\"true\"}\n## Also seems relevant:\n\nSee [Gsheet HERE](https://docs.google.com/spreadsheets/d/14japw6eLGpGjEWy1MjHNJXU1skZY_GAIc2uC2HIUalM/edit#gid=0), generated from an Elicit.org inquiry.\n\n\n::: \n\n\n\nIn spite of the caveats in the fold above, we construct some measures of aggregate beliefs using the `aggrecat` package. We will make (and explain) some ad-hoc choices here. We present these:\n\n1. For each paper\n2. For categories of papers and cross-paper categories of evaluations\n3. For the overall set of papers and evaluations\n\nWe can also hold onto these aggregated metrics for later use in modeling.\n\n\n- Simple averaging\n\n- Bayesian approaches \n\n- Best-performing approaches from elsewhere \n\n- Assumptions over unit-level random terms \n\n### Explicit modeling of 'research quality' (for use in prizes, etc.) {-}\n\n- Use the above aggregation as the outcome of interest, or weight towards categories of greater interest?\n\n- Model with controls -- look for greatest positive residual? 
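To fix ideas, here is a minimal sketch of the 'simple averaging' baseline mentioned above. This is only an illustration, not one of the `aggrecat` methods: for each paper it takes the unweighted mean of the evaluators' midpoint ratings and of their (explicit or imputed) bounds for the 'overall' measure, using the column names constructed earlier in this chapter (`crucial_rsx`, `overall`, `overall_lb_imp`, `overall_ub_imp`); `simple_agg` is a name introduced here for illustration.

```r
library(dplyr)

# Sketch only: per-paper unweighted means of the 'overall' rating and its
# (explicit or imputed) 90% bounds; no performance-based or Bayesian weighting.
simple_agg <- evals_pub %>%
  group_by(crucial_rsx) %>%
  summarise(
    n_evals      = n(),
    mean_overall = mean(overall, na.rm = TRUE),
    mean_lb_imp  = mean(overall_lb_imp, na.rm = TRUE),
    mean_ub_imp  = mean(overall_ub_imp, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(mean_overall))
```

Any of the weighting schemes discussed above (e.g., performance-based weights or shrinkage toward a common mean) could replace the simple `mean()` calls here.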
\n\n\n## Inter-rater reliability\n\n## Decomposing variation, dimension reduction, simple linear models\n\n\n## Later possiblities\n\n- Relation to evaluation text content (NLP?)\n\n- Relation/prediction of later outcomes (traditional publication, citations, replication)\n\n\n", + "markdown": "# Evaluation data: description, exploration, checks\n\n\n::: {.cell}\n\n```{.r .cell-code code-summary=\"load packages\"}\nsource(here::here(\"code\", \"shared_packages_code.R\"))\n\n#devtools::install_github(\"rethinkpriorities/rp-r-package\")\nlibrary(rethinkpriorities)\n\n#devtools::install_github(\"rethinkpriorities/r-noodling-package\") #mainly used playing in real time\nlibrary(rnoodling)\n\nlibrary(here)\nlibrary(dplyr)\nlibrary(pacman)\n\np_load(formattable, sparkline, install=FALSE)\n\np_load(DT, santoku, lme4, huxtable, janitor, emmeans, sjPlot, sjmisc, ggeffects, ggrepel, likert, labelled, plotly, stringr, install=FALSE)\n\np_load(ggthemes, paletteer, ggridges, install=FALSE)\n\nselect <- dplyr::select \n\noptions(knitr.duplicate.label = \"allow\")\n\noptions(mc.cores = parallel::detectCores())\n#rstan_options(auto_write = TRUE)\n\n#library(hunspell)\n\n#(brms)\n\n#devtools::install_github(\"bergant/airtabler\")\np_load(airtabler)\n\n#remotes::install_github(\"rmcelreath/rethinking\")\n#library(rethinking)\n```\n:::\n\n\n\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code code-summary=\"input from airtable\"}\nbase_id <- \"appbPYEw9nURln7Qg\"\n\n# Set your Airtable API key\n#Sys.setenv(AIRTABLE_API_KEY = \"\") \n#this should be set in my .Renviron file\n\n\n# Read data from a specific view\n\nevals <- air_get(base = base_id, \"output_eval\") \n\nall_pub_records <- data.frame()\npub_records <- air_select(base = base_id, table = \"crucial_research\")\n\n# Append the records to the list\nall_pub_records <- bind_rows(all_pub_records, pub_records)\n\n# While the length of the records list is 100 (the maximum), fetch more records\nwhile(nrow(pub_records) == 100) {\n # Get the ID of the last record in the list\n offset <- get_offset(pub_records)\n \n # Fetch the next 100 records, starting after the last ID\n pub_records <- air_select(base = base_id, table = \"crucial_research\", offset = offset)\n \n # Append the records to the df\n all_pub_records <- bind_rows(all_pub_records, pub_records)\n}\n```\n:::\n\n::: {.cell}\n\n```{.r .cell-code code-summary=\"just the useful and publish-able data, clean a bit\"}\ncolnames(evals) <- snakecase::to_snake_case(colnames(evals))\n\nevals_pub <- evals %>% \n dplyr::rename(stage_of_process = stage_of_process_todo_from_crucial_research_2) %>% \n mutate(stage_of_process = unlist(stage_of_process)) %>% \n dplyr::filter(stage_of_process == \"published\") %>% \n select(id, crucial_research, paper_abbrev, evaluator_name, category, source_main, author_agreement, overall, lb_overall, ub_overall, conf_index_overall, advancing_knowledge_and_practice, lb_advancing_knowledge_and_practice, ub_advancing_knowledge_and_practice, conf_index_advancing_knowledge_and_practice, methods_justification_reasonableness_validity_robustness, lb_methods_justification_reasonableness_validity_robustness, ub_methods_justification_reasonableness_validity_robustness, conf_index_methods_justification_reasonableness_validity_robustness, logic_communication, lb_logic_communication, ub_logic_communication, conf_index_logic_communication, engaging_with_real_world_impact_quantification_practice_realism_and_relevance, lb_engaging_with_real_world_impact_quantification_practice_realism_and_relevance, 
ub_engaging_with_real_world_impact_quantification_practice_realism_and_relevance, conf_index_engaging_with_real_world_impact_quantification_practice_realism_and_relevance, relevance_to_global_priorities, lb_relevance_to_global_priorities, ub_relevance_to_global_priorities, conf_index_relevance_to_global_priorities, journal_quality_predict, lb_journal_quality_predict, ub_journal_quality_predict, conf_index_journal_quality_predict, open_collaborative_replicable, conf_index_open_collaborative_replicable, lb_open_collaborative_replicable, ub_open_collaborative_replicable, merits_journal, lb_merits_journal, ub_merits_journal, conf_index_merits_journal)\n\nevals_pub %<>%\n tidyr::unnest_wider(category, names_sep = \"\") %>%\n tidyr::unnest_wider(paper_abbrev, names_sep = \"\") %>%\nmutate(across(everything(), unlist)) %>% #unlist list columns \n dplyr::rename(paper_abbrev = paper_abbrev1)\n\n#Todo -- check the unlist is not propagating the entry\n#Note: category, topic_subfield, and source have multiple meaningful categories. These will need care \n```\n:::\n\n::: {.cell}\n\n```{.r .cell-code code-summary=\"Shorten names \"}\nnew_names <- c(\n \"eval_name\" = \"evaluator_name\",\n \"cat_1\" = \"category1\",\n \"cat_2\" = \"category2\",\n \"cat_3\" = \"category3\",\n \"crucial_rsx\" = \"crucial_research\",\n \"conf_overall\" = \"conf_index_overall\",\n \"adv_knowledge\" = \"advancing_knowledge_and_practice\",\n \"lb_adv_knowledge\" = \"lb_advancing_knowledge_and_practice\",\n \"ub_adv_knowledge\" = \"ub_advancing_knowledge_and_practice\",\n \"conf_adv_knowledge\" = \"conf_index_advancing_knowledge_and_practice\",\n \"methods\" = \"methods_justification_reasonableness_validity_robustness\",\n \"lb_methods\" = \"lb_methods_justification_reasonableness_validity_robustness\",\n \"ub_methods\" = \"ub_methods_justification_reasonableness_validity_robustness\",\n \"conf_methods\" = \"conf_index_methods_justification_reasonableness_validity_robustness\",\n \"logic_comms\" = \"logic_communication\",\n \"lb_logic_comms\" = \"lb_logic_communication\",\n \"ub_logic_comms\" = \"ub_logic_communication\",\n \"conf_logic_comms\" = \"conf_index_logic_communication\",\n \"real_world\" = \"engaging_with_real_world_impact_quantification_practice_realism_and_relevance\",\n \"lb_real_world\" = \"lb_engaging_with_real_world_impact_quantification_practice_realism_and_relevance\",\n \"ub_real_world\" = \"ub_engaging_with_real_world_impact_quantification_practice_realism_and_relevance\",\n \"conf_real_world\" = \"conf_index_engaging_with_real_world_impact_quantification_practice_realism_and_relevance\",\n \"gp_relevance\" = \"relevance_to_global_priorities\",\n \"lb_gp_relevance\" = \"lb_relevance_to_global_priorities\",\n \"ub_gp_relevance\" = \"ub_relevance_to_global_priorities\",\n \"conf_gp_relevance\" = \"conf_index_relevance_to_global_priorities\",\n \"journal_predict\" = \"journal_quality_predict\",\n \"lb_journal_predict\" = \"lb_journal_quality_predict\",\n \"ub_journal_predict\" = \"ub_journal_quality_predict\",\n \"conf_journal_predict\" = \"conf_index_journal_quality_predict\",\n \"open_sci\" = \"open_collaborative_replicable\",\n \"conf_open_sci\" = \"conf_index_open_collaborative_replicable\",\n \"lb_open_sci\" = \"lb_open_collaborative_replicable\",\n \"ub_open_sci\" = \"ub_open_collaborative_replicable\",\n \"conf_merits_journal\" = \"conf_index_merits_journal\"\n)\n\nevals_pub <- evals_pub %>%\n rename(!!!new_names)\n\n# Function to insert a newline character every 15 characters\nwrap_text <- function(x, 
width = 15) {\n gsub(\"(.{1,15})\", \"\\\\1-\\n\", x)\n}\n\nevals_pub$source_main_wrapped <- wrap_text(evals_pub$source_main, 15)\n\nevals_pub$eval_name <- ifelse(\n grepl(\"^\\\\b\\\\w+\\\\b$|\\\\bAnonymous\\\\b\", evals_pub$eval_name),\n paste0(\"Anonymous_\", seq_along(evals_pub$eval_name)),\n evals_pub$eval_name\n)\n\n\n# make the old names into labels\n\n# Create a list of labels\nlabels <- str_replace_all(new_names, \"_\", \" \")\nlabels <- str_to_title(labels)\n \n# Assign labels to the dataframe\n# for(i in seq_along(labels)) {\n# col_name <- new_names[names(new_names)[i]]\n# label <- labels[i]\n# attr(evals_pub[[col_name]], \"label\") <- label\n# }\n# \n```\n:::\n\n\n\n\n### Reconcile uncertainty ratings and CIs {-}\n\nWhere people gave only confidence level 'dots', we impute CIs (confidence/credible intervals). We follow the correspondence described [here](https://effective-giving-marketing.gitbook.io/unjournal-x-ea-and-global-priorities-research/policies-projects-evaluation-workflow/evaluation/guidelines-for-evaluators#1-5-dots-explanation-and-relation-to-cis). (Otherwise, where they gave actual CIs, we use these.)^[Note this is only a first-pass; a more sophisticated approach may be warranted in future.]\n\n::: {.callout-note collapse=\"true\"}\n## Dots to interval choices\n\n> 5 = Extremely confident, i.e., 90% confidence interval spans +/- 4 points or less\n\nFor 0-100 ratings, code the LB as $max(R - 4\\times \\frac{R}{100},0)$ and the UB as $min(R + 4\\times \\frac{R}{100},100)$, where R is the stated (middle) rating. This 'scales' the CI, as interpreted, to be proportional to the rating, with a maximum 'interval' of about 8 when the rating is about 96.\n\n> 4 = Very* confident: 90% confidence interval +/- 8 points or less\n\nFor 0-100 ratings, code the LB as $max(R - 8\\times \\frac{R}{100},0)$ and the UB as $min(R + 8\\times \\frac{R}{100},100)$, where R is the stated (middle) rating. \n\n> 3 = Somewhat** confident: 90% confidence interval +/- 15 points or less \n\n> 2 = Not very** confident: 90% confidence interval, +/- 25 points or less\n\nComparable scaling applies for the 2-3 ratings as for the 4 and 5 ratings.\n\n> 1 = Not** confident: (90% confidence interval +/- more than 25 points)\n \nCode the LB as $max(R - 37.5\\times \\frac{R}{100},0)$ and the UB as $min(R + 37.5\\times \\frac{R}{100},100)$. \n \nThis is just a first-pass. There might be a more information-theoretic way of doing this. 
On the other hand, we might be switching the evaluations to use a different tool soon, perhaps getting rid of the 1-5 confidence ratings altogether.\n\n::: \n\n\n\n::: {.cell}\n\n```{.r .cell-code code-summary=\"reconcile explicit bounds and stated confidence level\"}\n# Define the baseline widths for each confidence rating\nbaseline_widths <- c(4, 8, 15, 25, 37.5)\n\n# Define a function to calculate the lower and upper bounds, where given only an index\ncalc_bounds <- function(rating, confidence, lb_explicit, ub_explicit, scale=100) {\n # Check if confidence is NA\n if (is.na(confidence)) {\n return(c(lb_explicit, ub_explicit)) # Return explicit bounds if confidence is NA\n } else {\n baseline_width <- baseline_widths[confidence]\n lb <- pmax(rating - baseline_width * rating / scale, 0)\n ub <- pmin(rating + baseline_width * rating / scale, scale)\n return(c(lb, ub))\n }\n}\n\n# Function to calculate bounds for a single category\ncalc_category_bounds <- function(df, category, scale=100) {\n # Calculate bounds\n bounds <- mapply(calc_bounds, df[[category]], df[[paste0(\"conf_\", category)]], df[[paste0(\"lb_\", category)]], df[[paste0(\"ub_\", category)]])\n \n # Convert to data frame and ensure it has the same number of rows as the input\n bounds_df <- as.data.frame(t(bounds))\n rownames(bounds_df) <- NULL\n \n # Add bounds to original data frame\n df[[paste0(category, \"_lb_imp\")]] <- bounds_df[, 1]\n df[[paste0(category, \"_ub_imp\")]] <- bounds_df[, 2]\n \n return(df)\n}\n\n\n# Lists of categories\n\nrating_cats <- c(\"overall\", \"adv_knowledge\", \"methods\", \"logic_comms\", \"real_world\", \"gp_relevance\", \"open_sci\")\n\n#... 'predictions' are currently 1-5 (0-5?)\npred_cats <- c(\"journal_predict\", \"merits_journal\")\n\n# Apply the function to each category\n# DR: I don't love this looping 'edit in place' code approach, but whatever\nfor (cat in rating_cats) {\n evals_pub <- calc_category_bounds(evals_pub, cat, scale=100)\n}\n\nfor (cat in pred_cats) {\n evals_pub <- calc_category_bounds(evals_pub, cat, scale=5)\n}\n```\n:::\n\n::: {.cell}\n\n```{.r .cell-code code-summary=\"save data for others' use\"}\nevals_pub %>% saveRDS(file = here(\"data\", \"evals.Rdata\"))\nevals_pub %>% write_csv(file = here(\"data\", \"evals.csv\"))\n\n#evals_pub %>% readRDS(file = here(\"data\", \"evals.Rdata\"))\n```\n:::\n\n\n \n# Basic presentation\n\n## What sorts of papers/projects are we considering and evaluating? \n\nIn this section, we give some simple data summaries and visualizations, for a broad description of The Unjournal's coverage. \n\nIn the interactive tables below we give some key attributes of the papers and the evaluators, and a preview of the evaluations.\n\n\n::: column-body-outset\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\n(\n all_evals_dt <- evals_pub %>%\n arrange(paper_abbrev, eval_name) %>%\n dplyr::select(paper_abbrev, crucial_rsx, eval_name, cat_1, cat_2, source_main_wrapped, author_agreement) %>%\n dplyr::select(-matches(\"ub_|lb_|conf\")) %>% \n #rename_all(~ gsub(\"_\", \" \", .)) %>% \n rename(\"Research _____________________\" = \"crucial_rsx\" \n ) %>%\n DT::datatable(\n caption = \"Evaluations (confidence bounds not shown)\", \n filter = 'top',\n rownames= FALSE,\n options = list(pageLength = 7)\n )\n)\n```\n\n::: {.cell-output-display}\n```{=html}\n
\n\n```\n:::\n:::\n\n\n\n\\\n\nNext, the 'middle ratings and predictions'.\n\n\n::: {.cell}\n\n```{.r .cell-code code-summary=\"Data datable (all shareable relevant data)\"}\n(\n all_evals_dt <- evals_pub %>%\n arrange(paper_abbrev, eval_name, overall) %>%\n dplyr::select(paper_abbrev, eval_name, all_of(rating_cats)) %>%\n DT::datatable(\n caption = \"Evaluations and predictions (confidence bounds not shown)\", \n filter = 'top',\n rownames= FALSE,\n options = list(pageLength = 7)\n )\n)\n```\n\n::: {.cell-output-display}\n```{=html}\n\n\n```\n:::\n:::\n\n\\\n\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\n(\n all_evals_dt_ci <- evals_pub %>%\n arrange(paper_abbrev, eval_name) %>%\n dplyr::select(paper_abbrev, eval_name, conf_overall, rating_cats, matches(\"ub_imp|lb_imp\")) %>%\n DT::datatable(\n caption = \"Evaluations and (imputed*) confidence bounds)\", \n filter = 'top',\n rownames= FALSE,\n options = list(pageLength = 7)\n )\n)\n```\n:::\n\n:::\n\n\n::: {.callout-note collapse=\"true\"}\n## Next consider...\n\n- Composition of research evaluated\n - By field (economics, psychology, etc.)\n - By subfield of economics \n - By topic/cause area (Global health, economic development, impact of technology, global catastrophic risks, etc. )\n - By source (submitted, identified with author permission, direct evaluation)\n \n- Timing of intake and evaluation^[Consider: timing might be its own section or chapter; this is a major thing journals track, and we want to keep track of ourselves]\n\n:::\n\nThe funnel plot below starts with the paper we prioritized for likely Unjournal evaluation, marking these as 'considering'.\n\n\n::: {.cell}\n\n```{.r .cell-code}\n#Add in the 3 different evaluation input sources\n#update to be automated rather than hard-coded - to look at David's work here\n\npapers_considered <- all_pub_records %>% nrow()\n\npapers_deprio <- all_pub_records %>% filter(`stage of process/todo` == \"de-prioritized\") %>% nrow()\n\npapers_evaluated <- all_pub_records %>% filter(`stage of process/todo` %in% c(\"published\",\n \"contacting/awaiting_authors_response_to_evaluation\",\n \"awaiting_publication_ME_comments\",\n \"awaiting_evaluations\")) %>% nrow()\n\npapers_complete <- all_pub_records %>% filter(`stage of process/todo` == \"published\") %>% \nnrow()\n\npapers_in_progress <- papers_evaluated-papers_complete\n\npapers_still_in_consideration <- all_pub_records %>% filter(`stage of process/todo` == \"considering\") %>% nrow()\n\n\nfig <- plot_ly(\n type = \"sankey\",\n orientation = \"h\",\n\n node = list(\n label = c(\"Prioritized\", \"Eval uated\", \"Complete\", \"In progress\", \"Still in consideration\", \"De-prioritized\"),\n color = c(\"orange\", \"green\", \"green\", \"orange\", \"orange\", \"red\"),\n pad = 15,\n thickness = 20,\n line = list(\n color = \"black\",\n width = 0.5\n )\n ),\n\n link = list(\n source = c(0,1,1,0,0),\n target = c(1,2,3,4,5),\n value = c(\n papers_evaluated,\n papers_complete,\n papers_in_progress,\n papers_still_in_consideration,\n papers_deprio\n ))\n )\nfig <- fig %>% layout(\n title = \"Unjournal paper funnel\",\n font = list(\n size = 10\n )\n)\n\nfig \n```\n\n::: {.cell-output-display}\n```{=html}\n\n\n```\n:::\n:::\n\n::: {.cell}\n\n```{.r .cell-code}\nsummary_df <- evals_pub %>%\n distinct(crucial_rsx, .keep_all = T) %>% \n group_by(cat_1) %>%\n summarise(count = n()) \n\nsummary_df$cat_1[is.na(summary_df$cat_1)] <- \"Unknown\"\n\nsummary_df <- summary_df %>%\n arrange(-desc(count)) %>%\n mutate(cat_1 = factor(cat_1, levels = 
unique(cat_1)))\n\n# Create stacked bar chart\nggplot(summary_df, aes(x = cat_1, y = count)) +\n geom_bar(stat = \"identity\") + \n theme_minimal() +\n labs(x = \"Paper category\", y = \"Count\", \n title = \"Count of evaluated papers by primary category\") \n```\n\n::: {.cell-output-display}\n![](evaluation_data_files/figure-html/unnamed-chunk-14-1.png){width=672}\n:::\n:::\n\n::: {.cell}\n\n```{.r .cell-code}\n# Bar plot\nggplot(evals_pub, aes(x = source_main_wrapped)) + \n geom_bar(position = \"stack\", stat = \"count\") +\n labs(x = \"Source\", y = \"Count\") +\n theme_light() +\n theme_minimal() +\n ggtitle(\"Evaluations by source of the paper\") + # add title\n theme(\n panel.grid.major = element_blank(),\n panel.grid.minor = element_blank(),\n text = element_text(size = 16), # changing all text size to 16\n axis.text.y = element_text(size = 10),\n axis.text.x = element_text(size = 14)\n )\n```\n\n::: {.cell-output-display}\n![](evaluation_data_files/figure-html/unnamed-chunk-16-1.png){width=672}\n:::\n:::\n\n::: {.cell}\n\n```{.r .cell-code}\nall_pub_records$is_evaluated = all_pub_records$`stage of process/todo` %in% c(\"published\",\n \"contacting/awaiting_authors_response_to_evaluation\",\n \"awaiting_publication_ME_comments\",\n \"awaiting_evaluations\")\n\nall_pub_records$source_main[all_pub_records$source_main == \"NA\"] <- \"Not applicable\" \nall_pub_records$source_main[all_pub_records$source_main == \"internal-from-syllabus-agenda-policy-database\"] <- \"Internal: syllabus, agenda, etc.\" \nall_pub_records$source_main = tidyr::replace_na(all_pub_records$source_main, \"Unknown\")\n\n\n\nggplot(all_pub_records, aes(x = fct_infreq(source_main), fill = is_evaluated)) + \n geom_bar(position = \"stack\", stat = \"count\") +\n labs(x = \"Source\", y = \"Count\", fill = \"Selected for\\nevaluation?\") +\n coord_flip() + # flipping the coordinates to have categories on y-axis (on the left)\n theme_light() +\n theme_minimal() +\n ggtitle(\"Evaluations by source of the paper\") +# add title\n theme(\n panel.grid.major = element_blank(),\n panel.grid.minor = element_blank(),\n text = element_text(size = 16), # changing all text size to 16\n axis.text.y = element_text(size = 12),\n axis.text.x = element_text(size = 14)\n )\n```\n\n::: {.cell-output-display}\n![](evaluation_data_files/figure-html/unnamed-chunk-18-1.png){width=672}\n:::\n:::\n\n\n\n### The distribution of ratings and predictions {-}\n\nNext, we present the ratings and predictions along with 'uncertainty measures'.^[We use \"ub imp\" (and \"lb imp\") to denote the upper and lower bounds given by evaluators.] Where evaluators gave only a 1-5 confidence level^[More or less, the ones who report a level for 'conf overall', although some people did this for some but not others], we use the imputations discussed and coded above. 
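To make the imputation concrete, here is a minimal worked example mirroring the `calc_bounds()` logic coded above; the rating of 75 and confidence level of 3 are hypothetical values chosen for illustration, not taken from the data.

```r
# Hypothetical inputs (not from the data): midpoint rating 75, confidence level 3
baseline_widths <- c(4, 8, 15, 25, 37.5)
rating     <- 75
confidence <- 3
width      <- baseline_widths[confidence]          # 15 points on the 0-100 scale

lb_imp <- pmax(rating - width * rating / 100, 0)   # 63.75
ub_imp <- pmin(rating + width * rating / 100, 100) # 86.25
c(lb_imp, ub_imp)
```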
\n\n\n- For each category and prediction (overall and by paper)\n\n\n::: column-body-outset\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nwrap_text <- function(text, width) {\n sapply(strwrap(text, width = width, simplify = FALSE), paste, collapse = \"\\n\")\n}\n\nevals_pub$wrapped_pub_names <- wrap_text(evals_pub$paper_abbrev, width = 15)\n\n\n\n# Dot plot\nggplot(evals_pub, aes(x = paper_abbrev, y = overall)) +\n geom_point(stat = \"identity\", size = 4, shape = 1, colour = \"lightblue\", stroke = 3) +\n geom_text_repel(aes(label = eval_name), \n size = 3, \n box.padding = unit(0.35, \"lines\"),\n point.padding = unit(0.3, \"lines\")) +\n coord_flip() + # flipping the coordinates to have categories on y-axis (on the left)\n theme_light() +\n xlab(\"Paper\") + # remove x-axis label\n ylab(\"Overall score\") + # name y-axis\n ggtitle(\"Overall scores of evaluated papers\") +# add title\n theme(\n panel.grid.major = element_blank(),\n panel.grid.minor = element_blank(),\n text = element_text(size = 14), # changing all text size to 16\n axis.text.y = element_text(size = 8),\n axis.text.x = element_text(size = 12)\n )\n```\n\n::: {.cell-output-display}\n![](evaluation_data_files/figure-html/unnamed-chunk-20-1.png){width=672}\n:::\n:::\n\n:::\n\n\n::: column-body-outset\n\n\n::: {.cell}\n\n```{.r .cell-code}\nunit.scale = function(x) (x*100 - min(x*100)) / (max(x*100) - min(x*100))\nevaluations_table <- evals_pub %>%\n select(paper_abbrev, eval_name, cat_1, source_main, overall, adv_knowledge, methods, logic_comms, journal_predict) %>%\n arrange(desc(paper_abbrev))\n\nout = formattable(\n evaluations_table,\n list(\n #area(col = 5:8) ~ function(x) percent(x / 100, digits = 0),\n area(col = 5:8) ~ color_tile(\"#FA614B66\",\"#3E7DCC\"),\n `journal_predict` = proportion_bar(\"#DeF7E9\", unit.scale)\n )\n)\nout\n```\n\n::: {.cell-output-display}\npaper_abbrev | \neval_name | \ncat_1 | \nsource_main | \noverall | \nadv_knowledge | \nmethods | \nlogic_comms | \njournal_predict | \n
---|---|---|---|---|---|---|---|---|
Well-being: Cash vs. psychotherapy | Anonymous_13 | GH&D | internal-NBER | 90 | 90 | 90 | 80 | 4.0 |
Well-being: Cash vs. psychotherapy | Hannah Metzler | GH&D | internal-NBER | 75 | 70 | 90 | 75 | 3.0 |
Nonprofit Govc.: Randomized healthcare DRC | Wayne Aaron Sandholtz | GH&D | internal-NBER | 65 | 70 | 60 | 55 | 3.6 |
LT CEA: Resilient foods vs. AGI safety | Scott Janzwood | long-term-relevant | submitted | 65 | NA | NA | NA | NA |
LT CEA: Resilient foods vs. AGI safety | Anca Hanea | long-term-relevant | submitted | 80 | 80 | 70 | 85 | 3.5 |
LT CEA: Resilient foods vs. AGI safety | Alex Bates | long-term-relevant | submitted | 40 | 30 | 50 | 60 | 2.0 |
Env. fx of prod.: ecological obs | Elias Cisneros | NA | internal-NBER | 88 | 90 | 75 | 80 | 4.0 |
Env. fx of prod.: ecological obs | Anonymous_12 | NA | internal-NBER | 70 | 70 | 70 | 75 | 4.0 |
CBT Human K, Ghana | Anonymous_11 | NA | internal-NBER | 75 | 60 | 90 | 70 | 4.0 |
CBT Human K, Ghana | Anonymous_16 | NA | internal-NBER | 75 | 65 | 60 | 75 | NA |
Banning wildlife trade can boost demand | Anonymous_3 | conservation | submitted | 75 | 70 | 80 | 70 | 3.0 |
Banning wildlife trade can boost demand | Liew Jia Huan | conservation | submitted | 75 | 80 | 50 | 70 | 2.5 |
Advance market commit. (vaccines) | David Manheim | policy | internal-from-syllabus-agenda-policy-database | 80 | 25 | 95 | 75 | 3.0 |
Advance market commit. (vaccines) | Joel Tan | policy | internal-from-syllabus-agenda-policy-database | 79 | 90 | 70 | 70 | 5.0 |
Advance market commit. (vaccines) | Dan Tortorice | policy | internal-from-syllabus-agenda-policy-database | 80 | 90 | 80 | 80 | 4.0 |
AI and econ. growth | Seth Benzell | macroeconomics | internal-from-syllabus-agenda-policy-database | 80 | 75 | 80 | 70 | NA |
AI and econ. growth | Phil Trammel | macroeconomics | internal-from-syllabus-agenda-policy-database | 92 | 97 | 70 | 45 | 3.5 |