Commit
Create a standalone script to import data
- currently imports from Airtable
- tweak columns to fit the current spec
- add a step to the Publish action on GitHub
hughjonesd committed Apr 23, 2024
1 parent 69b9cfd commit d36e1ef
Showing 4 changed files with 212 additions and 6 deletions.
12 changes: 8 additions & 4 deletions .github/workflows/quarto-publish.yml
@@ -18,19 +18,23 @@ jobs:
      - name: Check out repository
        uses: actions/checkout@v4

      - name: Import unjournal data
        run: Rscript code/import-unjournal-data.R
        env: # Or as an environment variable
          AIRTABLE_API_KEY: ${{ secrets.AIRTABLE_API_KEY }}

      - name: Set up Quarto
        uses: quarto-dev/quarto-actions/setup@v2
        # uncomment below and fill to pin a version
        # version: SPECIFIC-QUARTO-VERSION-HERE

      # add software dependencies here and any libraries

      # From https://github.com/r-lib/actions/tree/v2-branch/setup-r
      - name: Setup R
        uses: r-lib/actions/setup-r@v2

      - name: Install system dependencies on Linux
        run: |
          sudo apt-get install jags libcurl4-openssl-dev \
            libharfbuzz-dev libfribidi-dev
      # From https://github.com/r-lib/actions/tree/v2/setup-renv
      - name: Setup dependencies with renv
3 changes: 2 additions & 1 deletion TODO.md
@@ -5,7 +5,8 @@ This is a place for planned or desirable technical changes.
Higher-level plans are discussed on
the [Unjournal coda.io project management website](https://coda.io/d/Project-Management-UJ_dOyXJoZ6imx/Projects_subw9#Projects_tuA9I/r30&view=full).

[ ] Render the quarto docs remotely via github actions.
[x] Render the quarto docs remotely via github actions.
[ ] Reimport from airtable *or* use pubpub API

# Evaluating publication predictions

2 changes: 1 addition & 1 deletion _quarto.yml
@@ -5,7 +5,7 @@ project:
book:
  title: "The Unjournal evaluations: data and analysis"
  author: "David Reinstein, Julia Bottesini, and the Unjournal team"
  repo-url: https://github.com/daaronr/unjournaldata/
  repo-url: https://github.com/unjournal/unjournaldata/
  repo-actions: [edit, issue]
  chapters:
    - index.qmd
201 changes: 201 additions & 0 deletions code/import-unjournal-data.R
@@ -0,0 +1,201 @@

# Standalone script to create data frames of Unjournal reviews.
# Currently uses Airtable; in future it could use the PubPub API.

# The environment variable AIRTABLE_API_KEY should be set to your Airtable
# Personal Access Token (PAT); PATs function the same way as the old API keys.
# Your PAT needs only read access to tables and table structure.
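
# For local runs, one way to supply the token (illustrative placeholder values)
# is a line in your ~/.Renviron file:
#   AIRTABLE_API_KEY=patXXXXXXXXXXXXXX.XXXXXXXX
# or, interactively, before sourcing this script:
#   Sys.setenv(AIRTABLE_API_KEY = "patXXXXXXXXXXXXXX.XXXXXXXX")
# In CI, the Publish workflow passes it in from the AIRTABLE_API_KEY repository secret.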

library(dplyr)
library(magrittr)   # for the %<>% assignment pipe used below
library(airtabler)
library(snakecase)
library(stringr)
library(here)
library(readr)

base_id <- "applDG6ifmUmeEJ7j" # new ID to cover "UJ - research & core members" base

pub_records <- air_select(base = base_id, table = "crucial_rsx")
all_pub_records <- pub_records
# Airtable returns at most 100 records per request, so a full batch
# means there may be more to fetch
while (nrow(pub_records) == 100) {
  # Get the pagination offset returned with the previous batch
  offset <- get_offset(pub_records)
  # Fetch the next batch of records, starting from this offset
  pub_records <- air_select(base = base_id, table = "crucial_rsx", offset = offset)
  # Append the new batch to the full data frame
  all_pub_records <- bind_rows(all_pub_records, pub_records)
}
rm(pub_records)

evals_pub <- air_get(base = base_id, "output_eval")
colnames(evals_pub) <- snakecase::to_snake_case(colnames(evals_pub))

evals_pub <- evals_pub %>%
dplyr::rename(stage_of_process = stage_of_process_todo_from_crucial_research_2) %>%
mutate(stage_of_process = unlist(stage_of_process)) %>%
dplyr::filter(grepl("published", stage_of_process)) %>%
select(id,
crucial_research,
paper_abbrev,
evaluator_name,
category,
source_main,
author_agreement,
overall,
lb_overall,
ub_overall,
conf_index_overall,
advancing_knowledge_and_practice,
lb_advancing_knowledge_and_practice,
ub_advancing_knowledge_and_practice,
conf_index_advancing_knowledge_and_practice,
methods_justification_reasonableness_validity_robustness,
lb_methods_justification_reasonableness_validity_robustness,
ub_methods_justification_reasonableness_validity_robustness,
conf_index_methods_justification_reasonableness_validity_robustness,
logic_communication, lb_logic_communication, ub_logic_communication,
conf_index_logic_communication,
engaging_with_real_world_impact_quantification_practice_realism_and_relevance,
lb_engaging_with_real_world_impact_quantification_practice_realism_and_relevance,
ub_engaging_with_real_world_impact_quantification_practice_realism_and_relevance,
conf_index_engaging_with_real_world_impact_quantification_practice_realism_and_relevance,
relevance_to_global_priorities,
lb_relevance_to_global_priorities,
ub_relevance_to_global_priorities,
conf_index_relevance_to_global_priorities,
journal_quality_predict,
lb_journal_quality_predict,
ub_journal_quality_predict,
conf_index_journal_quality_predict,
open_collaborative_replicable,
conf_index_open_collaborative_replicable,
lb_open_collaborative_replicable,
ub_open_collaborative_replicable,
merits_journal,
lb_merits_journal,
ub_merits_journal,
conf_index_merits_journal)

# shorten names (before you expand into columns)
new_names <- c(
"eval_name" = "evaluator_name",
"cat" = "category",
"crucial_rsx" = "crucial_research",
"conf_overall" = "conf_index_overall",
"adv_knowledge" = "advancing_knowledge_and_practice",
"lb_adv_knowledge" = "lb_advancing_knowledge_and_practice",
"ub_adv_knowledge" = "ub_advancing_knowledge_and_practice",
"conf_adv_knowledge" = "conf_index_advancing_knowledge_and_practice",
"methods" = "methods_justification_reasonableness_validity_robustness",
"lb_methods" = "lb_methods_justification_reasonableness_validity_robustness",
"ub_methods" = "ub_methods_justification_reasonableness_validity_robustness",
"conf_methods" = "conf_index_methods_justification_reasonableness_validity_robustness",
"logic_comms" = "logic_communication",
"lb_logic_comms" = "lb_logic_communication",
"ub_logic_comms" = "ub_logic_communication",
"conf_logic_comms" = "conf_index_logic_communication",
"real_world" = "engaging_with_real_world_impact_quantification_practice_realism_and_relevance",
"lb_real_world" = "lb_engaging_with_real_world_impact_quantification_practice_realism_and_relevance",
"ub_real_world" = "ub_engaging_with_real_world_impact_quantification_practice_realism_and_relevance",
"conf_real_world" = "conf_index_engaging_with_real_world_impact_quantification_practice_realism_and_relevance",
"gp_relevance" = "relevance_to_global_priorities",
"lb_gp_relevance" = "lb_relevance_to_global_priorities",
"ub_gp_relevance" = "ub_relevance_to_global_priorities",
"conf_gp_relevance" = "conf_index_relevance_to_global_priorities",
"journal_predict" = "journal_quality_predict",
"lb_journal_predict" = "lb_journal_quality_predict",
"ub_journal_predict" = "ub_journal_quality_predict",
"conf_journal_predict" = "conf_index_journal_quality_predict",
"open_sci" = "open_collaborative_replicable",
"conf_open_sci" = "conf_index_open_collaborative_replicable",
"lb_open_sci" = "lb_open_collaborative_replicable",
"ub_open_sci" = "ub_open_collaborative_replicable",
"conf_merits_journal" = "conf_index_merits_journal"
)

evals_pub <- evals_pub %>%
rename(!!!new_names)

# Create a list of labels with the old, longer names
labels <- str_replace_all(new_names, "_", " ") %>% str_to_title()


# expand categories into columns, unlist everything
evals_pub %<>%
tidyr::unnest_wider(cat, names_sep = "_") %>% # give each of these its own col
mutate(across(everything(), unlist)) # maybe check why some of these are lists in the first place

# clean the Anonymous names
evals_pub$eval_name <- ifelse(
grepl("^\\b\\w+\\b$|\\bAnonymous\\b", evals_pub$eval_name),
paste0("Anonymous_", seq_along(evals_pub$eval_name)),
evals_pub$eval_name
)

# only these variables are publicly shareable
all_papers_p <- all_pub_records %>%
dplyr::select(
id,
category,
# these columns seem no longer to exist:
# cfdc_DR,
# 'confidence -- user entered',
# cfdc_assessor,
# avg_cfdc,
cause_cat_1_text,
cause_cat_2_text,
topic_subfield_text,
eval_manager_text,
'publication status',
'Contacted author?',
'stage of process/todo',
'source_main',
'author permission?',
'Direct Kotahi Prize Submission?',
'createdTime'
)


saveRDS(all_papers_p, file = here("data", "all_papers_p.Rdata"))
write_csv(all_papers_p, file = here("data", "all_papers_p.csv"))

saveRDS(evals_pub, file = here("data", "evals.Rdata"))
write_csv(evals_pub, file = here("data", "evals.csv"))

# Beginnings of work for pubpub:
#
# simple access to pubpub v6 API
# function to get a collection of pubpubs
# function to get details of each pub
#
#
# library(httr)
# library(secretbase)
#
# url <- "https://unjournal.pubpub.org/api/login"
#
# password_hash <- secretbase::sha3("", bits = 512L)
# payload <- sprintf('{
# "email": "[email protected]",
# "password": "%s"
# }', password_hash)
#
# response <- VERB("POST", url,
# body = payload,
# content_type("application/json"),
# accept("application/json"),
# encode = "json")
#
# content(response, "text")
#
#
# url <- "https://www.pubpub.org/api/pubs/cashtransfersmetrics"
#
# response <- VERB("GET",
# url,
# content_type("application/octet-stream"),
# accept("application/json"))
#
# content(response, "text")
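#
# A possible next step (a sketch only, not used by the current import):
# a small helper wrapping the GET request above, assuming the same
# https://www.pubpub.org/api/pubs/<slug> endpoint and that httr is attached.
#
# get_pub <- function(slug) {
#   url <- paste0("https://www.pubpub.org/api/pubs/", slug)
#   response <- VERB("GET",
#                    url,
#                    content_type("application/octet-stream"),
#                    accept("application/json"))
#   content(response, "text")
# }
#
# # e.g. get_pub("cashtransfersmetrics")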
