From 06e70e741fda97eb263e5032c084271710dfcce7 Mon Sep 17 00:00:00 2001 From: egouldo Date: Thu, 1 Feb 2024 09:41:21 +1100 Subject: [PATCH 01/15] #52 change RO's afiiliation --- inst/ms/aggreCAT.qmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/ms/aggreCAT.qmd b/inst/ms/aggreCAT.qmd index decb031..f3f0939 100644 --- a/inst/ms/aggreCAT.qmd +++ b/inst/ms/aggreCAT.qmd @@ -47,7 +47,7 @@ author: affiliations: 'University of Melbourne' orcid: '0000-0003-2536-2596' - name: "Rose O'Dea" - affiliations: 'University of New South Wales' + affiliations: 'University of Melbourne' orcid: '0000-0001-8177-5075' - name: "Rebecca Groenewegen" affiliations: 'University of Melbourne' From 22513525461c074d82013e822a348191b4e36312 Mon Sep 17 00:00:00 2001 From: egouldo Date: Thu, 1 Feb 2024 10:02:46 +1100 Subject: [PATCH 02/15] upgrade README to mention that JAGS dep if using Bayes' agg methods --- README.md | 55 ++++++++++++++++++++---------------------------------- README.qmd | 2 ++ 2 files changed, 22 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index c655f97..e16cf01 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ -README -================ +# README + [![DOI](https://zenodo.org/badge/531484296.svg)](https://zenodo.org/badge/latestdoi/531484296) [![R-CMD-check](https://github.com/metamelb-repliCATS/aggreCAT/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/metamelb-repliCATS/aggreCAT/actions/workflows/R-CMD-check.yaml) @@ -90,6 +90,10 @@ Then load the package: library(aggreCAT) ``` +Note, if you wish to use any of the Bayesian aggregation methods,you +will need to have +[JAGS](https://sourceforge.net/projects/mcmc-jags/files/) installed. + # Getting Started with `aggreCAT` Below we provide a brief summary of the package, for a detailed @@ -145,29 +149,12 @@ IDEA protocol, best estimates, and upper and lower bounds are elicited from each participant, over two rounds. The judgement data is contained in the object `data_ratings`, described at `?data_ratings`. -
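For a quick feel for the data, you can inspect it directly once the
package is loaded (a sketch; the column details are documented at
`?data_ratings`):

``` r
library(aggreCAT)
library(dplyr)

# Structure of the three-point elicitation data
glimpse(data_ratings)

# Number of participants assessing each claim
data_ratings %>%
  distinct(paper_id, user_name) %>%
  count(paper_id)
```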
- - -
+ ## A minimal working example with `AverageWAgg()` -
- - -
- + Below we demonstrate how to use the most simple commonly implemented aggregation method `ArMean`, which takes the arithmetic mean of participant Best Estimates. We first use a small subset of 5 @@ -232,7 +219,7 @@ data_ratings %>% AverageWAgg() #> 8 ArMean 116 62.6 25 #> 9 ArMean 118 54.8 25 #> 10 ArMean 133 59.9 25 -#> # … with 15 more rows +#> # ℹ 15 more rows ``` And other times, we might want to trial different aggregation methods @@ -257,27 +244,25 @@ appropriately attributed and cited accordingly: ``` r citation("aggreCAT") +#> To cite aggreCAT in publications use: #> -#> To cite package 'aggreCAT' in publications use: -#> -#> Willcox A, Gray C, Gould E, Wilkinson D, Hanea A, Wintle B, E. O'Dea -#> R (????). _aggreCAT: Mathematically Aggregating Expert Judgments_. R -#> package version 0.0.0.9002, -#> . +#> Gould et al. aggreCAT: An R Package for Mathematically Aggregating +#> Expert Judgments (2023). MetArXiv #> #> A BibTeX entry for LaTeX users is #> -#> @Manual{, -#> title = {aggreCAT: Mathematically Aggregating Expert Judgments}, -#> author = {Aaron Willcox and Charles T. Gray and Elliot Gould and David Wilkinson and Anca Hanea and Bonnie Wintle and Rose {E. O'Dea}}, -#> note = {R package version 0.0.0.9002}, -#> url = {https://replicats.research.unimelb.edu.au/}, +#> @Article{, +#> title = {aggreCAT: An R Package for Mathematically Aggregating Expert Judgments}, +#> author = {{Gould} and {Elliot} and {Gray} and Charles T. and {Willcox} and {Aaron} and {O'Dea} and {Rose} and {Groenewgen} and {Rebecca} and {Wilkinson} and David P.}, +#> journal = {MetArXiv}, +#> year = {2023}, #> } ``` ## References -
+
diff --git a/README.qmd b/README.qmd index 040870e..c3fdd42 100644 --- a/README.qmd +++ b/README.qmd @@ -68,6 +68,8 @@ Then load the package: library(aggreCAT) ``` +Note, if you wish to use any of the Bayesian aggregation methods,you will need to have [JAGS](https://sourceforge.net/projects/mcmc-jags/files/) installed. + # Getting Started with `aggreCAT` Below we provide a brief summary of the package, for a detailed overview, please consult the manuscript [@Gould2022]. From a8de231a597d016bdff0d349672cdd6869689cc8 Mon Sep 17 00:00:00 2001 From: egouldo Date: Thu, 1 Feb 2024 10:22:04 +1100 Subject: [PATCH 03/15] #54 add note to README about rjags library issue and solution --- README.md | 2 ++ README.qmd | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index e16cf01..3a4d434 100644 --- a/README.md +++ b/README.md @@ -93,6 +93,8 @@ library(aggreCAT) Note, if you wish to use any of the Bayesian aggregation methods,you will need to have [JAGS](https://sourceforge.net/projects/mcmc-jags/files/) installed. +Note that some mac users may need to install jags dependencies from +source: `install.packages("rjags",type = "source")`. # Getting Started with `aggreCAT` diff --git a/README.qmd b/README.qmd index c3fdd42..a894e31 100644 --- a/README.qmd +++ b/README.qmd @@ -68,7 +68,7 @@ Then load the package: library(aggreCAT) ``` -Note, if you wish to use any of the Bayesian aggregation methods,you will need to have [JAGS](https://sourceforge.net/projects/mcmc-jags/files/) installed. +Note, if you wish to use any of the Bayesian aggregation methods,you will need to have [JAGS](https://sourceforge.net/projects/mcmc-jags/files/) installed. Note that some mac users may need to install jags dependencies from source: `install.packages("rjags",type = "source")`. # Getting Started with `aggreCAT` From 8de3c7699a70e0735e4cf055b2397f54cfb8492a Mon Sep 17 00:00:00 2001 From: egouldo Date: Thu, 1 Feb 2024 13:41:16 +1100 Subject: [PATCH 04/15] #54 rewrite some sentences following RO feebdack --- inst/ms/aggreCAT.qmd | 775 ++++++++++++++++++++++++------------------- 1 file changed, 428 insertions(+), 347 deletions(-) diff --git a/inst/ms/aggreCAT.qmd b/inst/ms/aggreCAT.qmd index f3f0939..90d6135 100644 --- a/inst/ms/aggreCAT.qmd +++ b/inst/ms/aggreCAT.qmd @@ -56,7 +56,7 @@ author: orcid: '0000-0002-9560-6499' affiliations: 'University of Melbourne' abstract: | - Structured elicitation protocols, such as the IDEA protocol, may be used to elicit expert judgements in the form of subjective probabilities from multiple experts. Judgements from individual experts about a particular phenomena must therefore be mathematically aggregated into a single prediction. The process of aggregation may be complicated when judgements are elicited with uncertainty bounds, and also when there are several rounds of elicitation. This paper presents the new R package [aggreCAT]{.pkg}, which provides 27 unique aggregation methods for combining individual judgements into a single, probabilistic measure. The aggregation methods were developed as a part of the Defense Advanced Research Projects Agency (DARPA) 'Systematizing Confidence in Open Research and Evidence' (SCORE) programme, which aims to generate confidence scores or estimates of 'claim credibility' for over 4000 research claims from the social and behavioural sciences. We provide several worked examples illustrating the underlying mechanics of the aggregation methods. 
We also describe a general workflow for using the software in practice, to facilitate its uptake for appropriate use-cases.
keywords: [mathematical aggregation, expert judgement, DARPA SCORE, replicability, R]
keywords-formatted: [mathematical aggregation, expert judgement, DARPA SCORE, replicability, "[R]{.proglang}"]
bibliography: bibliography.bib
@@ -86,28 +86,27 @@ library(kableExtra)

Expert judgement is frequently used to inform forecasting about
uncertain future events across a range of disciplines, including
ecology, conservation science, human geography, political science, and
management [@Sutherland2018]. It is best practice to elicit judgements
from diverse groups to capitalise on the 'wisdom of crowds'
[@hemming2017], because groups of experts tend to perform better than a
single expert [@Goossens2008]. However, judgements or forecasts must
then be distilled into a single forecast, ideally accompanied by
estimates of the uncertainty around those forecasts [@Hanea2021].
Judgements from multiple experts may be combined into a single forecast
using either behavioural approaches that force experts into forming
consensus, or mathematical approaches [@Goossens2008].

Although there are a variety of methods for mathematically aggregating
expert judgements into single point-predictions, there are few
open-source software implementations available to analysts or
researchers. 
The [R]{.proglang} [@R] package [expert]{.pkg} provides three -models of expert opinion to combine judgements elicited from groups of -experts (CITE) , and [SHELF]{.pkg} implements only a single method -(weighted linear pool) for aggregating expert judgements (CITE). Other -[R]{.proglang} packages providing methods to mathematically aggregate -expert judgements do so for non-point predictions, for example, -[opera]{.pkg}, which generates time-series predictions (CITE). In this -paper we present the [aggreCAT]{.pkg} package, which provides 27 +researchers. The [R]{.proglang} [@R] package [expert]{.pkg} provides +three models of expert opinion to combine judgements elicited from +groups of experts (CITE) , and [SHELF]{.pkg} implements only a single +method (weighted linear pool) for aggregating expert judgements (CITE). +Other [R]{.proglang} packages providing methods to mathematically +aggregate expert judgements do so for non-point predictions, for +example, [opera]{.pkg}, which generates time-series predictions (CITE). +In this paper we present the [aggreCAT]{.pkg} package, which provides 27 different methods for mathematically aggregating judgements within groups of experts into a single forecast. @@ -133,14 +132,15 @@ replication effort [@alipourfard2021]. Replications are time-consuming and costly [@Isager2020], and studies have shown that replication outcomes can be reliably elicited from researchers [@Gordon2020]. Consequently, the DARPA SCORE program -generated Confidence Scores for $> 4000$ SBS claims using expert elicitation based on two very -different strategies -- prediction markets [@Gordon2020] and the IDEA -protocol [@hemming2017], the latter of which is used by the repliCATS -project [@Fraser:2021]. A proportion of these research claims were randomly -selected for direct replication, against which the elicited and -aggregated Confidence Scores are 'ground-truthed'. These aim of the DARPA SCORE project is to -aid the development of artificial intelligence tools that can -automatically assign Confidence Scores. +generated Confidence Scores for $> 4000$ SBS claims using expert +elicitation based on two very different strategies -- prediction markets +[@Gordon2020] and the IDEA protocol [@hemming2017], the latter of which +is used by the repliCATS project [@Fraser:2021]. A proportion of these +research claims were randomly selected for direct replication, against +which the elicited and aggregated Confidence Scores are 'ground-truthed' +or verified. The aim of the DARPA SCORE project is to aid the +development of artificial intelligence tools that can automatically +assign Confidence Scores. #### The repliCATS IDEA protocol {#sec-IDEAprotocol} @@ -160,28 +160,31 @@ the repliCATS platform [@Pearson2021], a multi-user cloud-based software platform that implements the IDEA protocol, between July 7th 2019 and November 30th 2020. -For a single claim under assessment, between 4 and 15 experts -individually drew on background information to provide estimates of the -probability, including 4 numeric data points and one character data -point: an upper and lower bound, and best estimate of the event -probability, as well as justifications for their estimates, and a value -on the likert binary scale up to 7 rating the individuals' degree of -comprehension of the claim (Round 1, *Investigate*). 
In the *Discuss* -phase, three-point estimates from each group member are anonymously -presented to the group, who then collectively discuss differences in -opinion and provide potential evidence for these differences. Group -members subsequently provide a second set of probabilistic judgements -(Round 2, *Estimate*). Thus, for a single assessment, 2 sets of -judgements are elicited from each expert (*pre-* and *post-*group -discussion). - -During the fourth step, *Aggregate*, judgements are mathematically -aggregated into a single Confidence Score or forecast of +During the *Investigate* phase, individuals review the claim and draw on +background knowledge to provide estimates of the probability of the +claim replicating. These estimates are numeric (0 - 100%) and include a +best estimate, and upper and lower bounds on that estimate. Individuals +also provide a value on the likert binary scale up to 7, rating the +individuals' degree of comprehension of the claim. Individuals were also +asked to provide short comments justifying their estimates. + +In the *Discuss* phase, three-point estimates from each group member are +anonymously presented to the group, who then collectively discuss +differences in opinion and provide potential evidence for these +differences. Next, in the *Estimate* phase, individuals are asked to +provide a second set of three-point probablistic judgements. Thus, for a +single assessment, 2 sets of judgements are elicited from each expert, +*pre-* and *post-*group discussion, referred to here on as *Round 1* and +*Round 2* judgements, respectively. + +During the fourth phase, *Aggregate*, judgements are mathematically +aggregated into a single Confidence Score, or in this case, forecast of replicability. The repliCATS project developed 27 different methods for mathematically aggregating judgements elicited from groups of experts into Confidence Scores [@Hanea2021]. We developed the [aggreCAT]{.pkg} package to implement these aggregation methods and deliver Confidence -Scores for over 4000 SBS research claims as a part of the DARPA SCORE project. +Scores for over 4000 SBS research claims as a part of the DARPA SCORE +project. ![The IDEA protocol as deployed by the repliCATS project (reproduced with permission from Wintle et al. @@ -192,29 +195,29 @@ with permission from Wintle et al. In this paper we aim to provide a detailed overview of the [aggreCAT]{.pkg} package so that researchers may apply the aggregation functions described in [@Hanea2021] to their own expert elicitation -datasets where mathematical aggregation is required. Note that judgements -that have already been subjected to behavioural or consensus aggregation -may not be subsequently mathematically aggregated, however individual elicited -judgements may be aggregated mathematically as an alternative or complement -to behavioural or consensus-based aggregation. +datasets where mathematical aggregation is required. Note that +judgements that have already been subjected to behavioural or consensus +aggregation may not be subsequently mathematically aggregated, however +individual elicited judgements may be aggregated mathematically as an +alternative or complement to behavioural or consensus-based aggregation. We begin by formulating the problem of mathematically aggregating expert judgements. Each method, and its data requirements is summarised in -Table \ref{tbl-method-summary-table}. 
Before outlining key aggregation methods, we -briefly summarise package datasets, which were collected by the -repliCATS project. By first describing the datasets before describing -the aggregation methods in detail, we aim to provide a grounded -understanding of the different outputs of expert elicitation using the -repliCATS IDEA protocol, and the inputs available to the aggregation -functions. +Table \ref{tbl-method-summary-table}. Before outlining key aggregation +methods, we briefly summarise package datasets, which were collected by +the repliCATS project. By first describing the datasets before +describing the aggregation methods in detail, we aim to provide a +grounded understanding of the different outputs of expert elicitation +using the repliCATS IDEA protocol, and the inputs available to the +aggregation functions. Next, we describe and illustrate the main types of aggregators, which may be categorised according to their data requirements, mathematical -properties and computational implementation (@sec-focal-claims). By selecting -representative functions of each key aggregator type and applying them -to a subset of focal claims, we demonstrate the internal mechanics of -how these methods differently operationalise the data to generate -forecasts or Confidence Scores. We do not give advice on the +properties and computational implementation (@sec-focal-claims). By +selecting representative functions of each key aggregator type and +applying them to a subset of focal claims, we demonstrate the internal +mechanics of how these methods differently operationalise the data to +generate forecasts or Confidence Scores. We do not give advice on the circumstances in which each method should be used, instead, choice of aggregation method should be informed by the mathematical properties of the method, the desired properties of an aggregation, and the purpose @@ -224,8 +227,8 @@ each method as well as a discussion of their relative merits, see Finally, we provide a detailed workflow for aggregating expert judgments for multiple forecasts, using multiple aggregation functions, as -implemented by the repliCATS project in the course of delivering $>$ 4000 -Confidence Scores for the DARPA SCORE program. The [aggreCAT]{.pkg} +implemented by the repliCATS project in the course of delivering $>$ +4000 Confidence Scores for the DARPA SCORE program. The [aggreCAT]{.pkg} package provides a set of supporting functions for evaluating or ground-truthing aggregated forecasts or Confidence Scores against a set of known-outcomes, as well as functions for visualising comparisons of @@ -261,35 +264,38 @@ implementation and data requirements, because these inform the function arguments as well as the type and form of the data that is parsed to the aggregation functions. These aspects include: -- Elicitation requirement, number of elicitation rounds: the majority of - aggregation methods require data from only a single round of +- Elicitation requirement, number of elicitation rounds: the majority + of aggregation methods require data from only a single round of judgements, i.e. the final post-discussion estimates. However, some aggregation methods require data from both rounds of judgements, - which may be elicited using the IDEA protocol or other similar + which may be elicited using the IDEA protocol or another similarly structured elicitation protocol in which there are two rounds of judgements. 
-- Elicitation requirement, single point or three point elicitation: several - aggregation methods use only a single data point elicited from - individuals (their best estimate), however, most aggregation methods - require a best estimate, and estimates of uncertainty in the form of - upper and lower bounds. +- Elicitation requirement, single point or three point elicitation: + several aggregation methods use only a single data point elicited + from individuals (their best estimate), however, most aggregation + methods require a best estimate, and estimates of uncertainty in the + form of upper and lower bounds. - Number of claims / forecasts assessed by the individual: some weighted aggregation methods consist of weights that are calculated from properties of participant judgements across multiple - forecasting questions, not just the target claim being aggregation. - Secondly, for aggregation methods that calculate variance in estimates, - variance cannot be calculated on a single data point. While 2 is the mathematical minimum, the user should give consideration to what minimum number of claims should be used to reliably calculate measures of variance. + forecasting questions, not just the target claim being aggregated. + Secondly, for aggregation methods that calculate variance in + estimates, variance cannot be calculated on a single data point. + While two is the mathematical minimum, the user should give + consideration to what minimum number of claims should be used to + reliably calculate measures of variance. - Supplementary data requirements: several aggregation methods require supplementary data collected either in addition to or as part of the - repliCATS IDEA protocol, some of which will need additional qualitative - coding before being parsed to the aggregation function. + repliCATS IDEA protocol, some of which will need additional + qualitative coding before being parsed to the aggregation function. The data and structured elicitation protocol requirements are described -in Table \ref{tbl-method-summary-table}. All aggregation methods requiring a -single round of estimates can therefore be applied to expert judgments -derived from any structured elicitation protocol that generates, lower, -upper, and best estimates from each individual (i.e. not just the IDEA -protocol), and does not enforce behavioural consensus. +in Table \ref{tbl-method-summary-table}. All aggregation methods +requiring a single round of estimates can therefore be applied to expert +judgments derived from any structured elicitation protocol that +generates, lower, upper, and best estimates from each individual (i.e. +not just the IDEA protocol) and does not enforce behavioural consensus. #### Notation and Problem Formulation @@ -301,14 +307,21 @@ each individual aggregation function, please consult [@Hanea2021] or the The total number of research claims, $claim$, or unique forecasts being assessed, $C$ , is indexed by $c = 1, ..., C$. The total number of individuals / experts / participants is denoted by $N$, and is indexed -by $i = 1, ..., N$. Each claim *outcome* (i.e. the outcome of a replication study) assumes binary values, where the value is 0 if the claim is false, and 1 if the claim is true. '`TRUE`' claims are claims where the replication study found a significant result in the same direction as the original research claim, and '`FALSE`' claims are those where the replication study *did not* find a significant result in the same direction as the original study. 
For each claim $c$, an individual $i$ assesses the probability of a claim replicating by +by $i = 1, ..., N$. Each claim *outcome* (i.e. the outcome of a +replication study) assumes binary values, where the value is 0 if the +claim is false, and 1 if the claim is true. '`TRUE`' claims are claims +where the replication study found a statistically significant result in +the same direction as the original research claim, and '`FALSE`' claims +are those where the replication study *did not* find a significant +result in the same direction as the original study. For each claim $c$, +an individual $i$ assesses the probability of a claim replicating by providing three probabilities: a lower bound ${L}_{i,c}$, an upper bound ${U}_{i,c}$, and a best estimate $B_{i,c}$, satisfying the inequalities: $0 \le Li,c \le Bi,c \le Ui,c \le 1$. Every claim is assessed by multiple individuals, and their probabilities -are aggregated using one of the aggregation methods to obtain a group -or aggregate probability, denoted by $\hat{p}_c$. The aggregated +are aggregated using one of the aggregation methods to obtain a group or +aggregate probability, denoted by $\hat{p}_c$. The aggregated probability calculated using a specific method, is given by $\hat{p}_{c}\left(Method \space ID \right)$. Each aggregation is assigned a unique $Method \space ID$ which is the abbreviation of the @@ -339,8 +352,11 @@ of estimates, and by prior statistical knowledge as measured in a quiz. Below, we define standardised notation for describing weighted linear combinations of individual judgements where un-normalised weights are -denoted by $w\_method$ and normalised weights by $\tilde{w} \_ method$ (@eq-eqn1). Given that for all aggregation methods weights are normalised, and that the normalisation process is the same for each aggregation method, the equations for the aggregation methods are presented for un-normalised weights. - +denoted by $w\_method$ and normalised weights by $\tilde{w} \_ method$ +(@eq-eqn1). Given that for all aggregation methods weights are +normalised, and that the normalisation process is the same for each +aggregation method, the equations for the aggregation methods are +presented for un-normalised weights. $$ \hat{p}_c\left(Method \space ID \right) = \frac{1}{N}\sum_{i=1}^N \tilde{w}\_{method}_{i,c} B_{i,c} @@ -387,14 +403,14 @@ form measures of reasoning breadth and engagement [@Fraser:2021]. meeting of the Society for the Improvement of Psychological Science (SIPS), [\](https://osf.io/ndzpt/){.uri}. -`data_ratings` is a *tidy* [data.frame]{.class} wherein each *observation* (or row) -corresponds to a single value in the set of `value`s constituting a -participant's complete assessment of a research claim. Each research -claim is assigned a unique `paper_id`, and each participant has a unique -(and anonymous) `user_name`. The variable `round` denotes the round in -which each `value` was elicited (`round_1` or `round_2`). `question` -denotes the type of question the `value` pertains to; -`direct_replication` for probabilistic judgements about the +`data_ratings` is a *tidy* [data.frame]{.class} wherein each +*observation* (or row) corresponds to a single value in the set of +`value`s constituting a participant's complete assessment of a research +claim. Each research claim is assigned a unique `paper_id`, and each +participant has a unique (and anonymous) `user_name`. The variable +`round` denotes the round in which each `value` was elicited (`round_1` +or `round_2`). 
`question` denotes the type of question the `value` +pertains to; `direct_replication` for probabilistic judgements about the replicability of the claim, `belief_binary` for participants' belief in the plausibility of the claim, `comprehension` for participants' comprehensibility ratings, and `involved_binary` for involvement in the @@ -412,9 +428,12 @@ ranging from (0,100). The `binary_question`s corresponding to comprehensibility and involvement consist of binary values (`1` for the affirmative, and `-1` for the negative). Finally, values corresponding to participants' comprehension ratings are on a `likert_binary` scale -from `1` through `7`. Note that additional columns with participant attributes can be included in the ratings dataset if required by the user, we include the `group` column in `data-ratings`, which describes the group number the participant was a part of. Below we show some example data for a single user -for a single claim to illustrate this structure of the core -`data_ratings` dataset. +from `1` through `7`. Note that additional columns with participant +attributes can be included in the ratings dataset if required by the +user; we include the `group` column in `data-ratings`, which describes +the group number the participant was a part of. Below we show some +example data for a single user for a single claim to illustrate this +structure of the core `data_ratings` dataset. ```{r} #| label: data_ratings-sample @@ -432,25 +451,25 @@ aggreCAT::data_ratings %>% Not all data necessary for constructing weights on performance is contained in `data_ratings`. Additional data collected as part of the repliCATS IDEA protocol are contained within separate datasets to -`data_ratings`. Participants provided justifications for giving particular -judgemetns, and these are contained in `data_justifications`. On the repliCATS -platform users were given the option to comment on others' justifications -(`data_comments`), to vote on others' comments (`data_comment_ratings`) and on -others' justifications (`data_justification_ratings`). Finally, [aggreCAT]{.pkg} -contains three 'supplementary' datasets containing data -collected externally to the repliCATS IDEA protocol: `data_supp_quiz`, -`data_supp_priors`, and `data_supp_reasons`. +`data_ratings`. Participants provided justifications for giving +particular judgemetns, and these are contained in `data_justifications`. +On the repliCATS platform users were given the option to comment on +others' justifications (`data_comments`), to vote on others' comments +(`data_comment_ratings`) and on others' justifications +(`data_justification_ratings`). Finally, [aggreCAT]{.pkg} contains three +'supplementary' datasets containing data collected externally to the +repliCATS IDEA protocol: `data_supp_quiz`, `data_supp_priors`, and +`data_supp_reasons`. #### Quiz Score Data {#sec-quiz-supplementary-data} -Prior to the workshop, participants also completed an optional quiz on -statistical concepts and meta-research that we expect participants to be -aware of in order to reliably evaluate the replicability of research -claims. Quiz responses are contained in `data_supp_quiz` and are used to -construct performance weights for the aggregation method `QuizWAgg` -where each participant receives a `quiz_score` if -they completed the quiz, and `NA` if they did not attempt the quiz [see @Hanea2021 for further details]. 
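To make the weighting idea concrete, the sketch below computes
quiz-score weights generically with [dplyr]{.pkg}. This is an
illustration only, not the packaged `QuizWAgg` implementation: it
assumes `data_supp_quiz` shares a participant identifier with
`data_ratings` (taken here to be `user_name`), and it simply drops
participants without a quiz score:

```{r}
#| label: quiz-weight-sketch
#| eval: false
library(dplyr)

data_ratings %>%
  filter(round == "round_2",
         question == "direct_replication",
         element == "three_point_best") %>%
  left_join(data_supp_quiz, by = "user_name") %>% # assumed join key
  filter(!is.na(quiz_score)) %>%                  # drop non-attempters
  group_by(paper_id) %>%
  summarise(cs = sum(quiz_score / sum(quiz_score) * value),
            n_experts = n())
```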
- +Prior to the workshop, participants were asked to complete an optional +quiz on statistical concepts and meta-research which we thought would +aid in reliably evaluating the replicability of research claims. Quiz +responses are contained in `data_supp_quiz` and are used to construct +performance weights for the aggregation method `QuizWAgg` where each +participant receives a `quiz_score` if they completed the quiz, and `NA` +if they did not attempt the quiz [see @Hanea2021 for further details]. #### Reasoning Data {#sec-reasonwagg-supplementary-data} @@ -463,12 +482,12 @@ into one of 25 unique reasoning categories by the repliCATS Reasoning team [@Wintle:2021]. Reasoning categories include plausibility of the claim, effect size, sample size, presence of a power analysis, transparency of reporting, and journal reporting [@Hanea2021]. Within -`data_supp_reasons`, each of the reasoning categories that passed -our inter-coder reliability threshold are -distributed as columns in the dataset whose names are prefixed with `RW`, -and for each claim `paper_id`, each participant `user_id` is assigned a logical -`1` or `0` if they included that reasoning category in support of their Best estimate for -that claim. See @sec-ReasoningWAgg for details on the +`data_supp_reasons`, each of the reasoning categories that passed our +inter-coder reliability threshold are distributed as columns in the +dataset whose names are prefixed with `RW`, and for each claim +`paper_id`, each participant `user_id` is assigned a logical `1` or `0` +if they included that reasoning category in support of their Best +estimate for that claim. See @sec-ReasoningWAgg for details on the `ReasonWAgg` aggregation method. #### Bayesian Prior Data {#sec-bayesian-supplementary-data} @@ -479,13 +498,13 @@ probability of a claim replicating estimated from a predictive model participants assessing a given claim $c$ [@Hanea2021]. The prior data is contained in `data_supp_priors` with each claim in column `paper_id` being assigned a prior probability (on the logit scale) of the claim -replicating in column `prior_means`. +replicating in column `prior_means`. #### Aggregation Wrapper Functions -Although there are 27 aggregation methods in total, we grouped -methods based on their mathematical properties into eight 'wrapper' -functions, denoted by the suffix `WAgg`, the abbreviation of *weighted +Although there are 27 aggregation methods in total, we grouped methods +based on their mathematical properties into eight 'wrapper' functions, +denoted by the suffix `WAgg`, the abbreviation of *weighted aggregation*: `LinearWAgg()`, `AverageWAgg()`, `BayesianWAgg()`, `IntervalWAgg()`, `ShiftingWAgg()`, `ReasoningWAgg()`, `DistributionWAgg()`, and `ExtremisationWAgg()`. The specific @@ -495,10 +514,10 @@ options are described in each aggregation wrapper functions' help page. ### 'Tidy' Aggregation and Prescribed Inputs The design philosophy of [aggreCAT]{.pkg} is principled on 'tidy' data -[@Wickham:2014vp]. Each aggregation method expects a [data.frame]{.class} or -[tibble]{.class} of judgements (`data_ratings`) as its input, and -returns a [tibble]{.class} consisting of the variables `method`, -`paper_id`, `cs` and `n_experts` (see @sec-AverageWAgg for +[@Wickham:2014vp]. 
Each aggregation method expects a
[data.frame]{.class} or [tibble]{.class} of judgements (`data_ratings`)
as its input, and returns a [tibble]{.class} containing the variables
`method`, `paper_id`, `cs` and `n_experts` (see @sec-AverageWAgg for an
illustration of outputs), where `method` is a character vector
corresponding to the aggregation method name specified in the `type`
argument. Each aggregation is applied as a summary function within each
claim, across experts, and the number of expert judgements summarised
in the aggregated Confidence Score is returned in the column
`n_experts`. Because of the tidy nature of the aggregation outputs,
multiple aggregations can be applied to the same data, with the results
of all aggregation methods row-bound together in a single `tibble` (see
the example repliCATS workflow in @sec-workflow).

The tibble of judgements to be aggregated (`data_ratings`) requires the
columns `round`, `paper_id`, `user_name`, `question`, `element`, `value`
and `group`. Each observation in the judgement data corresponds to a
single `value` for a single `question` elicited from a single
`user_name` about a given `paper_id` in a single `round`. Elicited
`value`s correspond to one of four types of `question`. Estimates of
the event probability for a given `paper_id` correspond to
`"direct_replication"` in the `question` variable. The type of estimate
the `value` belongs to is recorded in the `element` variable, and may be
one of `"three_point_lower"`, `"three_point_best"`, or
`"three_point_upper"`.

Every aggregation function requires at least one `value` derived from
three-point elicitation (`question == "direct_replication"`) in the
data frame supplied to the `expert_judgements` argument; however, some
methods require only the best estimates
(`element == "three_point_best"`) for mathematical aggregation.
Similarly, some aggregation methods require multiple `round`s of
judgements, while others require only a single round. Only the
aggregation method *CompWAgg* requires `value`s for the `comprehension`
question. For a summary of each aggregation method, its calling
function, and its data requirements and sources, see
@tbl-method-summary-table.

## Focal Claim Aggregation {#sec-focal-claims}

We now demonstrate how judgements elicited from a diverse group of
individuals may be mathematically aggregated for a single forecasting
problem, using the datasets provided by [aggreCAT]{.pkg}. 
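Before turning to the focal claims, the sketch below illustrates the
minimal input structure the aggregation functions expect: a
hypothetical judgement table with the prescribed columns, for two
experts assessing a single claim (all identifiers and values here are
invented for illustration):

```{r}
#| label: minimal-input-sketch
#| eval: false
my_judgements <- tibble::tribble(
  ~round,    ~paper_id, ~user_name, ~question,            ~element,            ~value, ~group,
  "round_2", "demo_1",  "expert_1", "direct_replication", "three_point_lower",     40,      1,
  "round_2", "demo_1",  "expert_1", "direct_replication", "three_point_best",      50,      1,
  "round_2", "demo_1",  "expert_1", "direct_replication", "three_point_upper",     60,      1,
  "round_2", "demo_1",  "expert_2", "direct_replication", "three_point_lower",     20,      1,
  "round_2", "demo_1",  "expert_2", "direct_replication", "three_point_best",      35,      1,
  "round_2", "demo_1",  "expert_2", "direct_replication", "three_point_upper",     70,      1
)

AverageWAgg(expert_judgements = my_judgements, type = "ArMean")
```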
We illustrate the internal mechanics of the weighting methods and the different data -requirements of each of the different types of aggregators -- namely; +requirements of each of the different types of aggregators, namely; methods with non-weighted linear combinations of judgements, weighted linear combinations of judgements, re-scaled weighted linear combinations of judgements, methods that require supplementary data, and methods that require data elicited from the full IDEA protocol. Each -group of methods differs in the type of judgements elicited (single- or three-point estimates), the number of elicitation rounds (one or two rounds), whether multiple forecasts / elicited judgements are -used during confidence score computation for a target forecast / claim, -and finally whether supplementary data is required for aggregation. +group of methods differs in the type of judgements elicited (single- or +three-point estimates), the number of elicitation rounds (one or two +rounds), whether multiple forecasts / elicited judgements are used +during confidence score computation for a target forecast / claim, and +finally whether supplementary data is required for aggregation. Here we demonstrate the application of aggregation methods for each group of methods using a set of 'focal claims' selected from the pilot study dataset supplied with the [aggreCAT]{.pkg} package. Below we subset the dataset `data_ratings` to include a sample of four claims with judgements from five randomly-sampled participants. From these -focal claims, we select a target claim for which we will apply -an exemplar aggregation method from each mathematical aggregator +focal claims, we select a target claim for which we will apply an +exemplar aggregation method from each mathematical aggregator (@tbl-focal-claim ). ```{r} @@ -602,14 +635,14 @@ $$ \hat{p}_c\left(ArMean \right ) = \frac{1}{N}\sum_{i=1}^N B_{i,c} $$ {#eq-ArMean} -Below we demonstrate the application of `ArMean` on a single claim -`108` for a subset of participants who assessed this claim. We also -illustrate this aggregation visually in [Figure 2](#fig-ArMean). -`ArMean` is applied using the aggregation method [AverageWAgg]{.fct}, -which is a wrapper function for several aggregation methods that -calculate different types of averaged best-estimates (see -`?AverageWAgg`). The function returns the Confidence Score for the claim -in the form of a [tibble]{.class}: +Below we demonstrate the application of `ArMean` on a single claim `108` +for a subset of participants who assessed this claim. We also illustrate +this aggregation visually in [Figure 2](#fig-ArMean). `ArMean` is +applied using the aggregation method [*AverageWAgg*]{.fct}, which is a +wrapper function for several aggregation methods that calculate +different types of averaged best-estimates (see `?AverageWAgg`). The +function returns the Confidence Score for the claim in the form of a +[tibble]{.class}: ```{r} #| label: focal-claim-ArMean @@ -629,9 +662,9 @@ focal_claims %>% knitr::include_graphics(path = "images/ArMean.png") ``` -::: {.callout} -:::{#aggWorkflow} -### Box 1: Aggregation Workflow Blueprint{.unnumbered} +::: callout +::: {#aggWorkflow} +### Box 1: Aggregation Workflow Blueprint {.unnumbered} #### Argument Structure and Expected Form @@ -644,14 +677,18 @@ args(AverageWAgg) ``` The aggregation *method* to be applied by the aggregation *function*, is -specified by the `type` argument, defaulting to `ArMean` in the above example. 
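For example, the same pipeline can apply the log-odds transformed
arithmetic mean instead, with the method label recorded in the output
overridden via `name` (a sketch; both arguments are described in the
following paragraph and in `?AverageWAgg`):

```{r}
#| label: focal-claim-LOArMean-sketch
#| eval: false
focal_claims %>%
  dplyr::filter(paper_id == 108) %>%
  AverageWAgg(type = "LOArMean",
              name = "LOArMean_demo")
```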
The resultant -`tibble` of Confidence Scores includes the `name` of the aggregation -method applied, defaulting to the `type` argument, but this can be -overridden by the user if they supply a non-`NULL` value to `name`. -\ - -Percentage values, counts, or other non probabilistic quantities are the default expected value type for ratings supplied to the `expert_judgements` argument of aggregation functions. By overriding the default value for the argument `percent_toggle` with `TRUE`, percentage values are converted to probabilities by dividing judgements over 100 within the aggregation functions. - +specified by the `type` argument, defaulting to `ArMean` in the above +example. The resultant `tibble` of Confidence Scores includes the `name` +of the aggregation method applied, defaulting to the `type` argument, +but this can be overridden by the user if they supply a non-`NULL` value +to `name`. \ + +Percentage values, counts, or other non probabilistic quantities are the +default expected value type for ratings supplied to the +`expert_judgements` argument of aggregation functions. By overriding the +default value for the argument `percent_toggle` with `TRUE`, percentage +values are converted to probabilities by dividing judgements over 100 +within the aggregation functions. When working with regularly updated data and developing a reproducible pipeline [@Yenni2019] , it can be useful to put aggregation methods into @@ -659,17 +696,17 @@ pipeline [@Yenni2019] , it can be useful to put aggregation methods into aggregation function instead of computing a Confidence Score using the aggregation method. By setting `placeholder` to `TRUE`, the user can supply a placeholder Confidence Score, which defaults to $65\%$, the -approximate average replication rate of SBS research claims [@Camerer2018]. -Should the user wish to set an alternative value, they can create a modified -version of `method_placeholder()` for themselves and store this within -the global environment. This function will then be called by the -aggregation method when the `placeholder` argument is set to `TRUE`. +approximate average replication rate of SBS research claims +[@Camerer2018]. Should the user wish to set an alternative value, they +can create a modified version of `method_placeholder()` for themselves +and store this within the global environment. This function will then be +called by the aggregation method when the `placeholder` argument is set +to `TRUE`. Some functions expect additional arguments, especially those that rely on additional or supplementary data. See the *man* pages for details of additional arguments. - #### Mathematical Aggregation Computational Workflow Blueprint Each aggregation function follows a general computational workflow @@ -691,10 +728,13 @@ elements must be returned (`three_point_filter`), including the and the plausibility ratings under `binary_question` in column `element`. `three_point_filter` defaults to `TRUE` to provide only direct replication questions and associated values. Nearly all -aggregation functions use only the round 2 judgements, so the -`round_2_filter` defaults to `TRUE` (See Table \ref{tbl-method-summary-table} for required inputs of all aggregation methods). `preprocess_judgements()` further pre-processes the data to remove missing data, and returnq the data -into an appropriate structure for calculating weights and applying the -aggregation function with `dplyr::summarise()`. 
+aggregation functions use only the round two judgements, so the +`round_2_filter` defaults to `TRUE` (See Table +\ref{tbl-method-summary-table} for required inputs of all aggregation +methods). `preprocess_judgements()` further pre-processes the data to +remove missing data, and returns the data into an appropriate structure +for calculating weights and applying the aggregation function with +`dplyr::summarise()`. ```{r} #| label: demo-preprocess-judgements @@ -715,9 +755,10 @@ functions / methods not supplied in \code{aggreCAT}. For some aggregation methods, weights are necessary, and thus are computed prior to aggregation. Some aggregation methods compute weights -using separate weighting functions (See Table \ref{tbl-method-summary-table}), however, -for aggregation methods with simpler weight computations, these are -defined in-function, rather than being modularised. +using separate weighting functions (See Table +\ref{tbl-method-summary-table}), however, for aggregation methods with +simpler weight computations, these are defined in-function, rather than +being modularised. After application of `preprocess_judgements()`, weights are constructed, and the aggregation method is applied, the function @@ -725,8 +766,9 @@ and the aggregation method is applied, the function data frame that is returned by each aggregation function. The post processing function returns a [tibble]{.class} consisting of observations equal to the number of unique claims that were parsed to -`postprocess_judgements()`, the `method`, `paper_id` , the Confidence Score `value`, -as well as the total number of participants `n_experts` whose assessments were used in the aggregation. +`postprocess_judgements()`, the `method`, `paper_id` , the Confidence +Score `value`, as well as the total number of participants `n_experts` +whose assessments were used in the aggregation. ::: ::: @@ -740,12 +782,12 @@ best estimate $B_{i,c}$ by the width of their uncertainty intervals, i.e. the difference between an individual's upper ${U}_{i,c}$ and lower bounds ${L}_{i,c}$. For a given claim $c$, a vector of weights for all individuals is calculated from their upper and lower estimates using the -weighting function, [weight\_interval]{.fct}, which calculates the +weighting function, [weight_interval]{.fct}, which calculates the interval width for each individual's estimate for the target claim. The weights are then normalised across the claim (by dividing each weight by the sum of all weights per claim). Normalised weights are then -multiplied by the corresponding individual's best estimates $B_{i,c}$ -andsummed together into a single Confidence Score +multiplied by the corresponding individual's best estimates, $B_{i,c}$, +and summed together into a single Confidence Score (@fig-IntWAgg-IndIntWAgg). ### Re-scaled weighted linear combinations of judgements @@ -757,14 +799,15 @@ interval width weights for individual $i$ for claim $c$ relative to the widest interval provided by that individual across all claims $C$, (@eq-IntWAgg). For the target claim, each individual's interval width is divided by the maximum interval width that same individual gave across -all claims they have provided judgements for, using the weighting -function [weight\_nIndivInterval]{.fct} (@eq-weightnIndivInterval). -The process of re-scaling is illustrated in @fig-IntWAgg-IndIntWAgg. 
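The weighting mechanics of these interval-width methods can also be
reproduced by hand. The sketch below mirrors the simpler `IntWAgg` case
described above; it is an illustration only, as the packaged
implementation also runs the pre- and post-processing steps and guards
against edge cases such as zero-width intervals:

```{r}
#| label: intwagg-by-hand-sketch
#| eval: false
library(dplyr)
library(tidyr)

focal_claims %>%
  filter(round == "round_2",
         question == "direct_replication") %>%
  pivot_wider(id_cols = c(paper_id, user_name),
              names_from = element,
              values_from = value) %>%
  # interval-width weight for each individual's estimate
  mutate(w = 1 / (three_point_upper - three_point_lower)) %>%
  # normalise weights within each claim, then take the weighted mean
  group_by(paper_id) %>%
  summarise(cs = sum(w / sum(w) * three_point_best))
```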
-Other aggregation methods that re-scale weights by using data from -multiple claims other than the target claim under aggregation are +all claims they have provided judgements for using the weighting +function [`weight_nIndivInterval`]{.fct} (@eq-weightnIndivInterval). The +process of re-scaling is illustrated in @fig-IntWAgg-IndIntWAgg. Other +aggregation methods that re-scale weights by using data from multiple +claims other than the target claim under aggregation are `VarIndIntWAgg`, `IndIntAsymWAgg`, `KitchSinkWAgg` (applied with the wrapper function [IntervalWAgg]{.fct}) and `GranWAgg` (applied with the -wrapper function [LinearWAgg]{.fct}), see Table \ref{tbl-method-summary-table}. +wrapper function [*LinearWAgg*]{.fct}), see Table +\ref{tbl-method-summary-table}. $$ w\_Interval_{i,c}= \frac{1}{U_{i,c} - L_{i,c}} @@ -774,7 +817,11 @@ $$ \hat{p}_c\left( IntWAgg \right) = \sum_{i=1}^N \tilde{w}\_Interval_{i,c}B_{i,c} $$ {#eq-IntWAgg} -As for [AverageWAgg]{.fct}, when using the wrapper function [IntervalWAgg]{.fct} we supply the aggregation method names as a character vector to the `type` argument and the focal claim data frame to the argument `expert_judgements`, using [dplyr::bind_rows]{.fct} to bind the resultant Confidence Scores together: +As for [*AverageWAgg*]{.fct}, when using the wrapper function +[*IntervalWAgg*]{.fct} we supply the aggregation method names as a +character vector to the `type` argument and the focal claim data frame +to the argument `expert_judgements`, using [`dplyr::bind_rows`]{.fct} to +bind the resultant Confidence Scores together: ```{r} #| label: focal-claim-IntWAgg @@ -790,9 +837,10 @@ dplyr::bind_rows( ) ``` +```{=tex} \newpage \blandscape - +``` ```{r} #| label: fig-IntWAgg-IndIntWAgg #| fig-cap: "Example applications of mathematical aggregation methods a\\) `IntWAgg` and b\\) `IndIntWAgg` using the wrapper function a1\\) `IntWAgg` uses participants' upper and lower bounds to construct performance weights. b2\\) This weighting computation is modified in `IndIntWAgg` whereby the weights for each individual are re-scaled by the largest interval width across all claims for a given individual. We exemplify this rescaling process by illustrating the calculation of participant 1's maximum interval width across all claims they assessed in the demonstration dataset `focal_claims`. This is repeated for every individual who has assessed the target claim under aggregation. " @@ -801,18 +849,19 @@ dplyr::bind_rows( knitr::include_graphics("images/IntervalWAgg.png") ``` +```{=tex} \newpage \elandscape - -### Aggregation Methods Requiring Supplementary Data{#sec-ReasoningWAgg} +``` +### Aggregation Methods Requiring Supplementary Data {#sec-ReasoningWAgg} In addition to the three-point elicitation dataset `data_ratings`, some aggregation methods require supplementary data inputs collected externally to the repliCATS IDEA protocol. Each aggregation wrapper function that requires supplementary data expects this data to be provided as a [data.frame]{.class} or [tibble]{.class} in addition to -the main judgements that are provided to the `expert_judements` -argument \ref{tbl-method-summary-table}. +the main judgements that are provided to the `expert_judements` argument +\ref{tbl-method-summary-table}. 
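Because these wrappers join a second table to the judgements, it is
worth confirming that the supplementary data covers every participant
in `expert_judgements` before aggregating. A small sketch, assuming (as
in `data_supp_reasons`) that participants are identified by a `user_id`
column matching `user_name` in the ratings data:

```{r}
#| label: supp-coverage-sketch
#| eval: false
data_ratings %>%
  dplyr::distinct(user_name) %>%
  dplyr::anti_join(data_supp_reasons,
                   by = c("user_name" = "user_id"))
# zero rows returned means every participant has supplementary records
```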
We illustrate the usage and internal mechanics of this type of aggregation with the method `ReasonWAgg`, which weights participants' @@ -849,14 +898,14 @@ best estimates are then summed to generate the Confidence Score knitr::include_graphics("images/ReasonWAgg.png") ``` -The focal claim selected for aggregation using `ReasonWAgg` is `24`, +The focal claim selected for aggregation using `ReasonWAgg` is `24`, and the round two three-point estimates from the five focal participants for this claim are shown in @tbl-reason-wagg-focal-claim. We first prepare the supplementary data for aggregation `data_supp_reasons`, subsetting only the participants contained in our `focal_claims` dataset. We also illustrate a subset of the supplementary data for our five focal -participants for the focal claim `24` (see `?data_supp_reasons` for -a description of variables): +participants for the focal claim `24` (see `?data_supp_reasons` for a +description of variables): ```{r} #| label: prepare-supp-data @@ -898,9 +947,10 @@ focal_claims %>% gt::fmt_integer(columns = round) ``` -Confidence Scores estimating the replicability for claim `24` (@tbl-reason-wagg-focal-claim) using the `ReasonWAgg` method are computed -using [ReasoningWAgg]{.fct} and by providing the supplementary data to -the `reasons` argument: +Confidence Scores estimating the replicability for claim `24` +(@tbl-reason-wagg-focal-claim) using the `ReasonWAgg` method are +computed using [ReasoningWAgg]{.fct} and by providing the supplementary +data to the `reasons` argument: ```{r} #| label: focal-claim-ReasonWAgg @@ -918,8 +968,8 @@ all participants are missing a Reasoning Score, the log-odds transformed best estimate is returned instead (See `?AverageWAgg`, `type="LOArMean"`). The user can choose to flag this behaviour explicitly by setting the argument `flag_loarmean` to `TRUE`, which will -generate new columns in the aggregation output [data.frame]{.class} named -`method_applied` (with values `LOArMean` or `ReasonWAgg`), and +generate new columns in the aggregation output [data.frame]{.class} +named `method_applied` (with values `LOArMean` or `ReasonWAgg`), and `no_reason_score`, a logical variable describing whether or not there were no reasoning scores for that claim. @@ -929,11 +979,12 @@ Both Bayesian methods `BayTriVar` and `BayPRIORsAgg` use the full three-point elicitation data, i.e., they use information contained in the uncertainty bound provided by individuals (upper ${U}_{i,c}$ and lower bounds ${L}_{i,c}$), in addition to Best Estimates, $B_{i,c}$. -Like `IndIntWAgg` and other methods (Table \ref{tbl-method-summary-table}), the Bayesian -aggregation methods also construct weights from information encoded in -participant assessments of claims other than the target claim under -aggregation. In fact, the Bayesian methods require more than a single -claim's worth of data to work properly execute due mathematical +Like `IndIntWAgg` and other methods (Table +\ref{tbl-method-summary-table}), the Bayesian aggregation methods also +construct weights from information encoded in participant assessments of +claims other than the target claim under aggregation. In fact, the +Bayesian methods require data from more than a single claim in order for +the function to properly execute, due to the underlying mathematical specification of the models (See `?BayesianWAgg` and below for details). 
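Note that both Bayesian methods fit their models via JAGS, which must
be installed on the system before [BayesianWAgg]{.fct} can be used (see
the README notes earlier in this patch series). A minimal guard for the
[R]{.proglang} side, with the source install that some Mac setups
require:

```{r}
#| label: jags-guard-sketch
#| eval: false
# JAGS itself is installed outside of R:
# https://sourceforge.net/projects/mcmc-jags/files/
if (!requireNamespace("rjags", quietly = TRUE)) {
  install.packages("rjags", type = "source")
}
```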
The two Bayesian methods use the elicited probabilities as data to @@ -952,15 +1003,17 @@ using default priors, priors are generated from a predictive model that estimates the probability of a claim replicating based on characteristics of the claim and publication [@Gould2021a]. Priors are parsed as supplementary data to the wrapper function -[BayesianWAgg]{.fct} using the argument `priors` (@sec-bayesian-supplementary-data) with each claim having its own unique prior. +[BayesianWAgg]{.fct} using the argument `priors` +(@sec-bayesian-supplementary-data) with each claim having its own unique +prior. We illustrate aggregation of participant judgements using the method `BayTriVar` to generate a Confidence Score for the claim `108`. Note that [BayesianWAgg]{.fct} expects best estimates in the form of probabilities, so to convert elicited values in the form of percentages -within the data parsed to `expert_judgements` to probabilities, -the logical value `TRUE` is supplied to the argument -`percent_toggle` ([Box 1](#aggWorkflow)): +within the data parsed to `expert_judgements` to probabilities, the +logical value `TRUE` is supplied to the argument `percent_toggle` ([Box +1](#aggWorkflow)): ```{r} #| label: focal-claim-BayTriVar @@ -974,22 +1027,23 @@ focal_claims %>% ``` The Confidence Score calculated for a given claim depends on data for -other claims and participants included in the -`expert_judgements` argument other than the target claim, -because, by definition, [BayesianWAgg]{.fct} calculates the Confidence -Score for a target claim using data from participants' assessments of -other claims, and from all other claims in the [data.frame]{.class} parsed to the -`expert_judgements` argument. Because information about other -claims than the target claim is used to calculate the Confidence Score -for the target claim, what is included in the data supplied to the -argument `expert_judgements` in [BayesianWAgg]{.fct} will alter -the Confidence Score. Above, we calculated the Confidence Score for -claim `108` but including information from -`r xfun::numbers_to_words(length(unique(focal_claims$paper_id)) - 1)` additional claims included -in the `focal_claims` [data.frame]{.class}: -`r focal_claims %>% distinct(paper_id) %>% flatten_chr %>% glue::backtick()`. However, if we -were to supply assessments for only two claims to [BayesianWAgg]{.fct}, -then we would observe a different result for focal claim `108`: +other claims and participants included in the `expert_judgements` +argument other than the target claim, because, by definition, +[BayesianWAgg]{.fct} calculates the Confidence Score for a target claim +using data from participants' assessments of other claims, and from all +other claims in the [data.frame]{.class} parsed to the +`expert_judgements` argument. Because data from claims than the target +claim are used to calculate the Confidence Score for the target claim, +any data from claims other than the target claim within the data parsed +to the `expert_judgements` argument in *BayesianWAgg* will alter the +Confidence Score. Above, we calculated the Confidence Score for claim +`108` but including information from +`r xfun::numbers_to_words(length(unique(focal_claims$paper_id)) - 1)` +additional claims included in the `focal_claims` [data.frame]{.class}: +`r focal_claims %>% distinct(paper_id) %>% flatten_chr %>% glue::backtick()`. 
However, if we were to supply assessments for only two claims to
[*BayesianWAgg*]{.fct}, then we would observe a different result for
focal claim `108`:

```{r}
#| label: focal-claim-subset-BayTriVar
focal_claims %>%
  dplyr::filter(paper_id %in% c(108, 138)) %>%
  BayesianWAgg(type = "BayTriVar",
               percent_toggle = TRUE)
```

The Confidence Score shifts from `r focal_claims %>% BayesianWAgg(type = "BayTriVar", percent_toggle = TRUE) %>% dplyr::filter(paper_id == 108) %>% pluck("cs") %>% round(2)` to `r focal_claims %>% dplyr::filter(paper_id %in% c(108, 138)) %>% BayesianWAgg(type = "BayTriVar", percent_toggle = TRUE) %>% dplyr::filter(paper_id == 108) %>% pluck("cs") %>% round(2)`.

Note that [*BayesianWAgg*]{.fct} cannot calculate confidence scores when
judgements for only a single claim are provided to `expert_judgements`,
because by definition the underlying Bayesian model calculates variance
across multiple claims and multiple participants:

```{r}
#| label: focal-claim-single-BayTriVar
#| error: true
focal_claims %>%
  dplyr::filter(paper_id == 108) %>%
  BayesianWAgg(type = "BayTriVar",
               percent_toggle = TRUE)
```

Although we have set $n=2$ as the minimum number of claims for which
variance is computed, it is up to the user to determine their own
justifiable minimum for reliable variance calculations.

Finally, all of the previous methods illustrated in this section have
been used with data generated from the IDEA elicitation protocol, which
is not strictly necessary for the application of these aggregation
methods. Methods that *do* require the full IDEA protocol for their
correct mathematical implementation are listed in Table
\ref{tbl-method-summary-table}, e.g. *ShiftingWAgg*, which uses two
rounds of three-point judgements wherein second-round judgements are
revised after discussion.

```{=tex}
\newpage
\blandscape
```
```{r}
#| label: fig-BayesianWAgg
#| fig-cap: "Illustration of BayTriVar applied with `BayesianWAgg()` for a single claim, `paper_id = 108` from the `focal_claims` data object. Note that the claims `138`, `186` and `24` contained in `focal_claims` are used in the calculation of participant-level SD and claim-level SD, thus the Confidence Score returned by BayTriVar is sensitive to the other claims provided to argument `expert_judgements`."
#| echo: false
#| out-width: "900px"
knitr::include_graphics("images/BayesianWAgg.png")
```

```{=tex}
\newpage
\elandscape
```
## An illustrative workflow for use in real study contexts{#sec-workflow}

Throughout the SCORE program, 752 participants assessed more than
4000 unique claims using the repliCATS IDEA protocol, between 7th July
2019 and 25 November 2021. This required batch aggregation over multiple
claims, and to generate Confidence Scores for multiple claims.
We also applied multiple aggregation methods to the same claim so that we could compare and evaluate the different aggregation methods. We expect that these are not uncommon -use-cases,consequently in this section we demonstrate a general workflow -for using the [aggreCAT]{.pkg} package to aggregate expert judgements -using pilot data from DARPA SCORE program generated by the repliCATS -project. +``` +## An illustrative workflow for use in real study contexts {#sec-workflow} + +Throughout the SCORE program, 752 participants assessed more than 4000 +unique claims using the repliCATS IDEA protocol, between 7th July 2019 +and 25 November 2021. This required batch aggregation over multiple +claims, and to generate Confidence Scores for multiple claims. We also +applied multiple aggregation methods to the same claim so that we could +compare and evaluate the different aggregation methods. We expect that +these are not uncommon use-cases,consequently in this section we +demonstrate a general workflow for using the [aggreCAT]{.pkg} package to +aggregate expert judgements using pilot data from DARPA SCORE program +generated by the repliCATS project. ### Generating multiple forecasts @@ -1062,17 +1126,17 @@ During expert-elicitation the analyst or researcher may be tasked with generating multiple forecasts for different problems or questions, and therefore it is useful to batch the aggregation. Since the [aggreCAT]{.pkg} package is designed using the principles of *tidy* data -analysis [@tidyverse2019], each aggregation function accepts a [data.frame]{.class} -of raw three-point forecasts for one or more claims, $C$, parsed to the -argument `expert_judgements`. The data pre-processing and aggregation -methods are applied using a combination of calls to [tidyverse]{.pkg} -functions, including `summarise` and `mutate`. From the user's -perspective, this means that data processing and application of the -aggergation methods is handled internally by the [aggreCAT]{.pkg} -package, rather than by the user. The user is therefore free to focus -their attention on the interpretation and analysis of the forecasts. -Here we demonstrate the application of the `ArMean` aggregation method -to four focal claims simultaneously: +analysis [@tidyverse2019], each aggregation function accepts a +[data.frame]{.class} of raw three-point forecasts for one or more +claims, $C$, parsed to the argument `expert_judgements`. The data +pre-processing and aggregation methods are applied using a combination +of calls to [tidyverse]{.pkg} functions, including `summarise` and +`mutate`. From the user's perspective, this means that data processing +and application of the aggergation methods is handled internally by the +[aggreCAT]{.pkg} package, rather than by the user. The user is therefore +free to focus their attention on the interpretation and analysis of the +forecasts. Here we demonstrate the application of the `ArMean` +aggregation method to four focal claims simultaneously: ```{r} #| label: generating-multiple-forecasts @@ -1089,16 +1153,18 @@ evaluated and compared. Since different methods offer different mathematical properties, and therefore might be more or less appropriate depending on the purpose of the aggregation and forecasting, a researcher or analyst might want to check how the different assumptions -embedded in different aggregation methods influence the final -Confidence Scores for a forecast -- i.e. how robust are the results to -different methods and therefore to different assumptions? 
+embedded in different aggregation methods influence the final Confidence +Scores for a forecast -- i.e. how robust are the results to different +methods and therefore to different assumptions? From a computational perspective, multiple aggregation methods must first be applied to the forecast prior to comparison and evaluation. -This can be achieved by applying each different aggregation method to `focal_claims`, -and binding the results together with [dplyr]{.pkg}'s [row\_bind]{.fct}. -However, more elegant and succinct solutions can be implemented using -[purrr]{.pkg}'s [map\_dfr]{.fct} function [@purrr2020, see @lst-multi-method-workflow-non-supp and @lst-multi-method-workflow-both]. +This can be achieved by applying each different aggregation method to +`focal_claims`, and binding the results together with [dplyr]{.pkg}'s +[row_bind]{.fct}. However, more elegant and succinct solutions can be +implemented using [purrr]{.pkg}'s [map_dfr]{.fct} function [@purrr2020, +see @lst-multi-method-workflow-non-supp and +@lst-multi-method-workflow-both]. ```{r} #| label: multi-method-workflow-non-supp @@ -1144,10 +1210,11 @@ aggreCAT::data_outcomes %>% head() ``` -The function [confidence\_score\_evaluation]{.fct} evaluates a set -of aggregated forecasts or Confidence Scores against a set of known or +The function [confidence_score_evaluation]{.fct} evaluates a set of +aggregated forecasts or Confidence Scores against a set of known or observed outcomes, returning the Area Under the ROC Curve (AUC), the -Brier score, and classification accuracy of each method (@tbl-multi-method-workflow-eval): +Brier score, and classification accuracy of each method +(@tbl-multi-method-workflow-eval): ```{r} #| label: multi-method-workflow-eval @@ -1186,10 +1253,9 @@ aggreCAT::confidence_score_evaluation( We include two functions for visualising comparison and evaluation of Confidence Scores across multiple aggregation methods for a suite of forecasts from multiple participants, -[confidence\_scores\_ridgeplot]{.fct} and -[confidencescore\_heatmap]{.fct}. -[confidence\_scores\_ridgeplot]{.fct} generates ridgeline plots -using [ggridges]{.pkg} [@ggridges2021], and displays the distribution of +[confidence_scores_ridgeplot]{.fct} and [confidencescore_heatmap]{.fct}. +[confidence_scores_ridgeplot]{.fct} generates ridgeline plots using +[ggridges]{.pkg} [@ggridges2021], and displays the distribution of predicted outcomes across a suite of forecasts for each aggregation method, grouped into separate 'mountain ranges' according to the mathematical properties of the aggregation method (@fig-ridgeplot). @@ -1256,7 +1322,6 @@ p <- plot_judgements %>% p ``` - ```{r} #| label: fig-heatmap #| echo: false @@ -1290,17 +1355,17 @@ worst_forecasts <- dplyr::inner_join(confidenceSCOREs, data_outcomes) %>% dplyr::slice_max(difference, n = 3) ``` -While [confidencescore\_heatmap]{.fct} is useful for comparison of -aggregation methods, [confidencescore\_heatmap]{.fct} is useful for +While [confidencescore_heatmap]{.fct} is useful for comparison of +aggregation methods, [confidencescore_heatmap]{.fct} is useful for visual comparative *evaluation* of aggregation methods. 
-[confidencescore\_heatmap]{.fct} generates heatmaps of forecasted +[confidencescore_heatmap]{.fct} generates heatmaps of forecasted Confidence Scores for each aggregation method included in the dataset -provided to the argument `confidence_scores` organised with -unique aggregation methods on the y-axis, and separate forecasts or -`paper_id`s along the y-axis (@fig-heatmap). The heatmap is blocked -vertically according to the mathematical characteristics of each -aggregation method, and horizontally into two groups, according to the -binary outcomes in `data_outcomes`. +provided to the argument `confidence_scores` organised with unique +aggregation methods on the y-axis, and separate forecasts or `paper_id`s +along the y-axis (@fig-heatmap). The heatmap is blocked vertically +according to the mathematical characteristics of each aggregation +method, and horizontally into two groups, according to the binary +outcomes in `data_outcomes`. Horizontal grouping facilitates quick and simple evaluation of the aggregation methods. Perfectly accurate aggregation methods show dark @@ -1310,28 +1375,30 @@ actual outcomes were `0` or `FALSE`. Deviation from this expectation indicates which aggregation methods for which claim/forecast, for which outcome type were inaccurate, and to what degree. -For example, in @fig-heatmap, for the dataset -`confidenceSCOREs` the successful replication of most claims was -accurately forecasted by most methods, except for several claims. Some -methods performed better than others for some claims (e.g. `ReasonWAgg` -for claims `109` and `138`. +For example, in @fig-heatmap, for the dataset `confidenceSCOREs` the +successful replication of most claims was accurately forecasted by most +methods, except for several claims. Some methods performed better than +others for some claims (e.g. `ReasonWAgg` for claims `109` and `138`. Finally, creating bespoke user-defined plots is relatively easy -- -because [aggreCAT]{.pkg} functions return tidy [data.frame]{.class}s or [tibble]{.class}s, we can easily -manipulate the raw judgements, aggregated Confidence Scores and outcome -data to plot them with [ggplot2]{.pkg} [@ggplot2016] or other -visualisation package. Below we plot the aggregated Confidence Scores -along with the three-point judgements (subset using -[preprocess\_judgements]{.fct} on `focal_claims`, -transforming judgements in percentages to probabilities by setting -`percent_toggle` to `TRUE`, @fig-aggregation, @lst-confidencescores). +because [aggreCAT]{.pkg} functions return tidy [data.frame]{.class}s or +[tibble]{.class}s, we can easily manipulate the raw judgements, +aggregated Confidence Scores and outcome data to plot them with +[ggplot2]{.pkg} [@ggplot2016] or other visualisation package. Below we +plot the aggregated Confidence Scores along with the three-point +judgements (subset using [preprocess_judgements]{.fct} on +`focal_claims`, transforming judgements in percentages to probabilities +by setting `percent_toggle` to `TRUE`, @fig-aggregation, +@lst-confidencescores). ### Extending aggreCAT to other datasets The aggregation methods supplied by the [aggreCAT]{.pkg} package can easily be applied to other forecasting problems. The only requirements -are that the data inputs adhere to the required format (Box [1](#aggWorkflow)), and that the expert judgements are elicited using the -appropriate method, as required by each aggregation method (see Table \ref{tbl-method-summary-table}). 
+are that the data inputs adhere to the required format (Box +[1](#aggWorkflow)), and that the expert judgements are elicited using +the appropriate method, as required by each aggregation method (see +Table \ref{tbl-method-summary-table}). Judgement data provided to the `expert_judgements`, `data_justifications` or any supplementary data inputs argument must @@ -1343,28 +1410,34 @@ different forecasting problem `paper_id`, a unique `user_name` for each individual, and the `element` of the three point elicitation that the recorded response or `value` in that row corresponds to. The data is stored in long or tidy format such that each row or observation in the -[data.frame]{.class} references only a single `element` of a participants' set of -three point elicitation values. When applying aggregation methods -requiring supplementary data to the elicitation data, the analyst should -also adhere to the requirements stipulated for the relevant -supplementary dataset described in the documentation. - -Although several aggregation methods *require* judgements that -are elicited using the IDEA protocol (See Table \ref{tbl-method-summary-table} for exceptions), most aggregation methods require -only a single round of elicitation that generates a set of three points; -a best estimate, and upper and lower bounds about those estimates, even if the IDEA protocol was used -to elicit judgements. +[data.frame]{.class} references only a single `element` of a +participants' set of three point elicitation values. When applying +aggregation methods requiring supplementary data to the elicitation +data, the analyst should also adhere to the requirements stipulated for +the relevant supplementary dataset described in the documentation. + +Although several aggregation methods *require* judgements that are +elicited using the IDEA protocol (See Table +\ref{tbl-method-summary-table} for exceptions), most aggregation methods +require only a single round of elicitation that generates a set of three +points; a best estimate, and upper and lower bounds about those +estimates, even if the IDEA protocol was used to elicit judgements. Hence, the aggregation functions contained in the [aggreCAT]{.pkg} package are unsuitable for use with judgements elicited with methods -that aggregate behaviourally (e.g. using consensus) and therefore result in a single forecast value. Where the analyst elicits judgements for only a single round, the analyst should record the round in the judgements data as the character string `"round_1"`, and set the `round_2_filter` argument to `FALSE` in the aggregation wrapper function call. +that aggregate behaviourally (e.g. using consensus) and therefore result +in a single forecast value. Where the analyst elicits judgements for +only a single round, the analyst should record the round in the +judgements data as the character string `"round_1"`, and set the +`round_2_filter` argument to `FALSE` in the aggregation wrapper function +call. Should the analyst wish to create their own aggregation functions, pre- and post-processing functions may be leveraged inside the functions -([preprocess\_judgements]{.fct} and [postprocess\_judgements]{.fct}, +([preprocess_judgements]{.fct} and [postprocess_judgements]{.fct}, respectively), as we have illustrated in data preparation for @fig-aggregation (@lst-confidencescores). 
These processing functions modularise key components of the aggregation's computational -implementation -- namely the data wrangling that occurs before and after +implementation, namely, the data wrangling that occurs before and after the actual mathematical aggregation. #### Preparing your own Elicitation Data @@ -1405,12 +1478,11 @@ green_turtles <- We can then apply multiple aggregation methods, using the same approach implemented for aggregation of the `focal_claims` dataset -(@lst-BYO-data-aggregate), with aggregated Confidence Scores -shown in @tbl-BYO-data-aggregate. Note that because the judgements are -absolute values rather than probabilities, we set the -`percent_toggle` argument for each aggregation wrapper function -to [FALSE]{.val} (@lst-BYO-data-aggregate). - +(@lst-BYO-data-aggregate), with aggregated Confidence Scores shown in +@tbl-BYO-data-aggregate. Note that because the judgements are absolute +values rather than probabilities, we set the `percent_toggle` argument +for each aggregation wrapper function to [FALSE]{.val} +(@lst-BYO-data-aggregate). ```{r} #| label: tbl-BYO-data-aggregate @@ -1460,11 +1532,13 @@ so many aggregation methods, and methods that use proxies of forecasting accuracy using weights. The [aggreCAT]{.pkg} package is production-ready for application to data -elicited during either a single workshop, or for contexts where data collection may be ongoing and continuous analysis is used for automating aggregation. Unlike -other aggregation packages, the [aggreCAT]{.pkg} package is designed to -work within the *tidyverse*. The package is premised on the principles -of *tidy* data analysis whereby the user supplies [data.frame]{.class}s of elicited -judgements, and the aggregation methods return [data.frame]{.class}s of aggregated +elicited during either a single workshop, or for contexts where data +collection may be ongoing and continuous analysis is used for automating +aggregation. Unlike other aggregation packages, the [aggreCAT]{.pkg} +package is designed to work within the *tidyverse*. The package is +premised on the principles of *tidy* data analysis whereby the user +supplies [data.frame]{.class}s of elicited judgements, and the +aggregation methods return [data.frame]{.class}s of aggregated forecasts. The benefits of this approach are three-fold. Firstly, the work of data-wrangling and application of the aggregation methods is handled internally by the aggregation methods, so that the researcher @@ -1479,11 +1553,12 @@ in @sec-workflow. Thirdly, application of the [aggreCAT]{.pkg} package aggregation methods and performance evaluation tools is scalable, which is evidenced by the application of the [aggreCAT]{.pkg} package to forecast the -replicability of over 4000 research claims by the repliCATS project. -The scalability and placeholder functionality allow the [aggreCAT]{.pkg} package to be built into production-ready pipelines for more complicated analyses where -there are multiple forecasts being elicited and aggregated, where there -are numerous participants, and where multiple aggregation methods are -applied. +replicability of over 4000 research claims by the repliCATS project. The +scalability and placeholder functionality allow the [aggreCAT]{.pkg} +package to be built into production-ready pipelines for more complicated +analyses where there are multiple forecasts being elicited and +aggregated, where there are numerous participants, and where multiple +aggregation methods are applied. 
Finally, through the provision of built-in performance metrics, the analyst is able to 'ground-truth' and evaluate the forecasts against @@ -1493,7 +1568,7 @@ The [aggreCAT]{.pkg} package is easily extensible and production-ready. Each aggregation function follows a consistent modular blueprint, wherein data-wrangling of the inputs and outputs of aggregation is largely handled by pre- and post-processing functions -([preprocess\_judgements]{.fct} and [postprocess\_judgements]{.fct}, +([preprocess_judgements]{.fct} and [postprocess_judgements]{.fct}, respectively). This design expedites debugging by making it easier to pinpoint the exact source of errors, while also permitting the user to easily create their own custom aggregation methods. @@ -1510,7 +1585,10 @@ to data from domains beyond forecasting the replicability of research claims through our minimal example using forecasts generated using the IDEA protocol for a fisheries and conservation problem. -The package will be actively maintained into the future, beyond the life of the DARPA SCORE program. Bug reports and feature-requests can easily be lodged on the [aggreCAT]{.pkg} GitHub repository using reproducible examples created with [reprex]{.pkg} [@reprexpkg2020] on the repliCATS +The package will be actively maintained into the future, beyond the life +of the DARPA SCORE program. Bug reports and feature-requests can easily +be lodged on the [aggreCAT]{.pkg} GitHub repository using reproducible +examples created with [reprex]{.pkg} [@reprexpkg2020] on the repliCATS pilot study datasets shipped with the [aggreCAT]{.pkg} package. We have described the computational implementation of the aggregation @@ -1529,12 +1607,14 @@ rapidly and easily analysing the results of IDEA protocol and other structured elicitation procedures where mathematical aggregation of human forecasts is required. +```{=tex} \newpage \blandscape +``` ```{r} #| results: asis #| echo: false -#| label: method-summary-table +#| label: tbl-method-summary-table aggreCAT:::method_summary_table %>% ungroup %>% # filter(str_detect(aggregator_fun_desc, "[?]",negate = TRUE)) %>% #drop Eng/CompWAgg @@ -1576,7 +1656,7 @@ aggreCAT:::method_summary_table %>% ## Listings {.unnumbered} -```{#lst-multi-method-workflow-non-supp .r lst-cap="Multiple aggregation methods can be applied by binding rows rather than using the purrr package, if preferred."} +``` {#lst-multi-method-workflow-non-supp .r lst-cap="Multiple aggregation methods can be applied by binding rows rather than using the purrr package, if preferred."} purrr::map2_dfr(.x = list(AverageWAgg, IntervalWAgg, IntervalWAgg, @@ -1593,7 +1673,7 @@ purrr::map2_dfr(.x = list(AverageWAgg, ) ``` -```{#lst-multi-method-workflow-both .r lst-cap="If we wish to batch aggregate claims using a combination of aggregation methods that do and do not require supplementary data, we must aggregate them separately, since the methods that require supplementary data have an additional argument for the supplementary data that must be parsed to the wrapper function call. We can chain the aggregation of the methods that do not require supplementary data, and the methods that do require supplementary data together very neatly using [dplyr]{.pkg}'s [bind\_rows]{.fct} function [@dplyr2021] and the [magrittr]{.fct} pipe `\%\>\%` [@magrittr2020]. 
Below we implement this approach while applying the aggregation methods `ArMean`, `IntWAgg`, `IndIntWAgg`, `ShiftWAgg` and `BayTriVar` to the repliCATS pilot program dataset `data\_ratings`."} +``` {#lst-multi-method-workflow-both .r lst-cap="If we wish to batch aggregate claims using a combination of aggregation methods that do and do not require supplementary data, we must aggregate them separately, since the methods that require supplementary data have an additional argument for the supplementary data that must be parsed to the wrapper function call. We can chain the aggregation of the methods that do not require supplementary data, and the methods that do require supplementary data together very neatly using [dplyr]{.pkg}'s [bind_rows]{.fct} function [@dplyr2021] and the [magrittr]{.fct} pipe `%>%` [@magrittr2020]. Below we implement this approach while applying the aggregation methods `ArMean`, `IntWAgg`, `IndIntWAgg`, `ShiftWAgg` and `BayTriVar` to the repliCATS pilot program dataset `data_ratings`."} confidenceSCOREs <- list( AverageWAgg, @@ -1617,7 +1697,7 @@ confidenceSCOREs <- ) ``` -```{#lst-BYO-data-aggregate .r lst-cap="Bring your own data: non-probablistic values"} +``` {#lst-BYO-data-aggregate .r lst-cap="Bring your own data: non-probablistic values"} turtle_CS <- list( AverageWAgg, @@ -1634,7 +1714,7 @@ turtle_CS <- ) ``` -```{#lst-confidencescores .r lst-cap="Visualising Confidence Scores"} +``` {#lst-confidencescores .r lst-cap="Visualising Confidence Scores"} plot_cs <- confidenceSCOREs %>% dplyr::left_join(aggreCAT::data_outcomes) %>% @@ -1673,28 +1753,29 @@ p <- plot_judgements %>% ggplot2::scale_colour_brewer(palette = "Set1") ``` +```{=tex} \elandscape \newpage - +``` ## Computational details {.unnumbered} -The analyses and results in this paper were obtained using the following computing environment, versions of `R` and `R` packages: +The analyses and results in this paper were obtained using the following +computing environment, versions of `R` and `R` packages: -:::{.callout} +::: callout ```{R} #| label: session-info #| prompt: true devtools::session_info() ``` - ::: ## Acknowledgments {.unnumbered} -:::{.callout} +::: callout This project is sponsored by the Defense Advanced Research Projects -Agency (DARPA) under cooperative agreement No.HR001118S0047. The -content of the information does not necessarily reflect the position -or the policy of the Government, and no official endorsement should -be inferred. +Agency (DARPA) under cooperative agreement No.HR001118S0047. The content +of the information does not necessarily reflect the position or the +policy of the Government, and no official endorsement should be +inferred. ::: From c78581abad358d1f8c412874ef3ab9a7caa47dbb Mon Sep 17 00:00:00 2001 From: egouldo Date: Thu, 1 Feb 2024 13:49:17 +1100 Subject: [PATCH 05/15] #54 tighten up intro para for illustrative workflow --- inst/ms/aggreCAT.qmd | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/inst/ms/aggreCAT.qmd b/inst/ms/aggreCAT.qmd index 90d6135..0962b36 100644 --- a/inst/ms/aggreCAT.qmd +++ b/inst/ms/aggreCAT.qmd @@ -1111,14 +1111,18 @@ knitr::include_graphics("images/BayesianWAgg.png") Throughout the SCORE program, 752 participants assessed more than 4000 unique claims using the repliCATS IDEA protocol, between 7th July 2019 -and 25 November 2021. This required batch aggregation over multiple -claims, and to generate Confidence Scores for multiple claims. 
We also -applied multiple aggregation methods to the same claim so that we could -compare and evaluate the different aggregation methods. We expect that -these are not uncommon use-cases,consequently in this section we -demonstrate a general workflow for using the [aggreCAT]{.pkg} package to -aggregate expert judgements using pilot data from DARPA SCORE program -generated by the repliCATS project. +and 25 November 2021. In order to efficiently generate Confidence Scores +for multiple claims, we needed to apply the same aggregation method over +batches of assessments containing multiple claims. We were also tasked +with comparing and evaluating the different aggregation methods, which +required us to generate Confidence Scores for each claim using multiple +aggregation methods. We expect that needing to aggregate over multiple +assessments, and needing to apply multiple types of aggregation methods +will be common use-cases, consequently in this section we demonstrate a +workflow for using the [aggreCAT]{.pkg} package to aggregate expert +judgements using multiple aggregation methods on multiple claims using +pilot data generated by the repliCATS project for the DARPA SCORE +program. ### Generating multiple forecasts From 8d6bf23377c062d8f1eb88ce0196294a78e597ce Mon Sep 17 00:00:00 2001 From: egouldo Date: Thu, 1 Feb 2024 13:57:31 +1100 Subject: [PATCH 06/15] #54 fix typo in blocked heatmap plot fun --- R/confidence_score_plots.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/confidence_score_plots.R b/R/confidence_score_plots.R index e0ea3c8..aa27d9c 100644 --- a/R/confidence_score_plots.R +++ b/R/confidence_score_plots.R @@ -251,7 +251,7 @@ method_types <- function(conf_scores){ BAYES = c("BayTriVar", "BayPRIORsAgg") conf_scores <- conf_scores %>% - dplyr::mutate(type = dplyr::case_when(method %in% NWL ~ "Non-weighted Linear Combintation", + dplyr::mutate(type = dplyr::case_when(method %in% NWL ~ "Non-weighted Linear Combination", method %in% WLCI ~ "Weighted Linear Combinations", #method %in% WLCE ~ "Weighted Linear Combinations (Supplementary Data)", method %in% BAYES ~ "Bayesian Methods")) @@ -259,7 +259,7 @@ method_types <- function(conf_scores){ # Levels for the plot output conf_scores$type <- factor(conf_scores$type, - levels = c("Non-weighted Linear Combintation", + levels = c("Non-weighted Linear Combination", "Weighted Linear Combinations", #"Weighted Linear Combinations (Supplementary Data)", "Bayesian Methods")) From a57f7b3639f714e02bbdae6828d131e5062c60b0 Mon Sep 17 00:00:00 2001 From: egouldo Date: Thu, 1 Feb 2024 14:02:10 +1100 Subject: [PATCH 07/15] #54 make edits and fixes to repliCATS example section --- inst/ms/aggreCAT.qmd | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/inst/ms/aggreCAT.qmd b/inst/ms/aggreCAT.qmd index 0962b36..77c40f7 100644 --- a/inst/ms/aggreCAT.qmd +++ b/inst/ms/aggreCAT.qmd @@ -1136,7 +1136,7 @@ claims, $C$, parsed to the argument `expert_judgements`. The data pre-processing and aggregation methods are applied using a combination of calls to [tidyverse]{.pkg} functions, including `summarise` and `mutate`. From the user's perspective, this means that data processing -and application of the aggergation methods is handled internally by the +and application of the aggregation methods is handled internally by the [aggreCAT]{.pkg} package, rather than by the user. The user is therefore free to focus their attention on the interpretation and analysis of the forecasts. 
Here we demonstrate the application of the `ArMean` @@ -1151,8 +1151,8 @@ AverageWAgg(focal_claims, type = "ArMean") ### Comparing and Evaluating Aggregation Methods In real study contexts, such as that of the repliCATS project in the -DARPA SCORE program, it is of interest to compute Confidence Scores -using multiple aggregation methods so that their performance might be +DARPA SCORE program, it may be of interest to compute Confidence Scores +using multiple aggregation methods so that their performance can be evaluated and compared. Since different methods offer different mathematical properties, and therefore might be more or less appropriate depending on the purpose of the aggregation and forecasting, a @@ -1280,7 +1280,7 @@ confidence_score_ridgeplot(confidence_scores = confidenceSCOREs) ```{r} #| label: fig-aggregation -#| fig-cap: "Confidence Scores for the aggregation methods `ArMean`, `BayTriVar`, `IntWAgg`, `IndIntWAgg`, `ReasonWAgg` and `ShiftWAgg` for four claims. Participants' three-point best estimates are displayed as black points, and their upper and lowr bounds displayed as black error bars. Confidence Scores are displayed as points within the upper row of plots. Lines are displayed vertically at the 0.5 probability mark, and their colour denotes the observed outcome under previous large-scale replication projects." +#| fig-cap: "Confidence Scores for the aggregation methods `ArMean`, `BayTriVar`, `IntWAgg`, `IndIntWAgg`, `ReasonWAgg` and `ShiftWAgg` for four claims. Participants' three-point best estimates are displayed as black points, and their upper and lower bounds displayed as black error bars. Confidence Scores are displayed as points within the upper row of plots. Lines are displayed vertically at the 0.5 probability mark, and their colour denotes the observed outcome under previous large-scale replication projects." #| message: false #| fig-align: center #| warning: false @@ -1337,7 +1337,7 @@ p #| warning: false #| eval: true #| fig-pos: H -#| fig-cap: "Blocked heatmap visualisation of confidence scores is useful for visually comparing aggregation methods and evaluating them against a set of known outcomes. In this example, Confidence Scores generated by six aggregation methods for the repliCATS pilot study are visualised for 25 claims. Claims where known outcomes succesfully replicated `outcome == TRUE` are presented in heatmap blocks on the left, and claims that failed to replicate are presented in heatmap blocks on the right. Confidence Scores generated by different aggregation methods are positioned along the y-axis, with vertical groupings according to the methods' mathematical properties. Colour and intensity of cells indicates the direction and degree of deviation respectively of the Confidence Scores from the known outcomes." +#| fig-cap: "Blocked heatmap of confidence scores is useful for visually comparing aggregation methods and evaluating them against a set of known outcomes. In this example, Confidence Scores generated by six aggregation methods for the repliCATS pilot study are presented for 25 claims. Claims where known outcomes succesfully replicated `outcome == TRUE` are presented in heatmap on the left, and claims that failed to replicate are presented in heatmap on the right. Confidence Scores generated by different aggregation methods are positioned along the y-axis, with vertical groupings according to the methods' mathematical properties. 
Colour and intensity of cells indicates the direction and degree of deviation respectively of the Confidence Scores from the known outcomes." suppressPackageStartupMessages(library(ggforce)) @@ -1424,8 +1424,8 @@ Although several aggregation methods *require* judgements that are elicited using the IDEA protocol (See Table \ref{tbl-method-summary-table} for exceptions), most aggregation methods require only a single round of elicitation that generates a set of three -points; a best estimate, and upper and lower bounds about those -estimates, even if the IDEA protocol was used to elicit judgements. +points – a best estimate, and upper and lower bounds about those +estimates – even if the IDEA protocol was used to elicit judgements. Hence, the aggregation functions contained in the [aggreCAT]{.pkg} package are unsuitable for use with judgements elicited with methods that aggregate behaviourally (e.g. using consensus) and therefore result From 3a32117ea2d7884d1e0659941a1fd0d805d7bba3 Mon Sep 17 00:00:00 2001 From: egouldo Date: Thu, 1 Feb 2024 14:04:46 +1100 Subject: [PATCH 08/15] #54 incorporate all comments/feedback RO --- inst/ms/aggreCAT.qmd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inst/ms/aggreCAT.qmd b/inst/ms/aggreCAT.qmd index 77c40f7..d20a192 100644 --- a/inst/ms/aggreCAT.qmd +++ b/inst/ms/aggreCAT.qmd @@ -1543,8 +1543,8 @@ package is designed to work within the *tidyverse*. The package is premised on the principles of *tidy* data analysis whereby the user supplies [data.frame]{.class}s of elicited judgements, and the aggregation methods return [data.frame]{.class}s of aggregated -forecasts. The benefits of this approach are three-fold. Firstly, the -work of data-wrangling and application of the aggregation methods is +forecasts. There are four key benefits of the aggreCAT package. Firstly, +the work of data-wrangling and application of the aggregation methods is handled internally by the aggregation methods, so that the researcher can focus on analysis and interpretation of the aggregation outputs. This is critical in data-deficient contexts where rapid assessments are From ab63d2b57a59ec5739e5b1faad6a05a77fb10871 Mon Sep 17 00:00:00 2001 From: egouldo Date: Thu, 1 Feb 2024 14:14:20 +1100 Subject: [PATCH 09/15] #54 fix typos and extra spaces after cross-refs --- inst/ms/aggreCAT.qmd | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/inst/ms/aggreCAT.qmd b/inst/ms/aggreCAT.qmd index d20a192..2691cb6 100644 --- a/inst/ms/aggreCAT.qmd +++ b/inst/ms/aggreCAT.qmd @@ -576,7 +576,7 @@ subset the dataset `data_ratings` to include a sample of four claims with judgements from five randomly-sampled participants. From these focal claims, we select a target claim for which we will apply an exemplar aggregation method from each mathematical aggregator -(@tbl-focal-claim ). +(@tbl-focal-claim). ```{r} #| label: focal-claim-selection @@ -628,7 +628,7 @@ implementation using the [aggreCAT]{.pkg} package with the simplest, unweighted aggregation method, `ArMean`. All other aggregation methods take this underlying computational blueprint, and expand on it according to the aggregation methods' requirements (See [Box 1](#aggWorkflow) for -details). `ArMean` ( @eq-ArMean ) takes the unweighted linear average +details). `ArMean` (@eq-ArMean) takes the unweighted linear average (i.e. arithmetic mean) of the best estimates, $B_{i,c}$. 
$$ @@ -1202,10 +1202,10 @@ confidenceSCOREs After generating Confidence Scores using various aggregation methods, we then evaluate the forecasts. We evaluated the repliCATS pilot study forecasts against the outcomes of previous, high-powered replication -studies [@Hanea2021], which are contained in the `data\_outcomes` -dataset published with [aggreCAT]{.pkg}. In this dataset, each claim -`paper_id` is assigned an `outcome` of `0` if the claim did not -replicate and `1` if the claim was successfully replicated: +studies [@Hanea2021], which are contained in the `data_outcomes` dataset +published with [aggreCAT]{.pkg}. In this dataset, each claim `paper_id` +is assigned an `outcome` of `0` if the claim did not replicate and `1` +if the claim was successfully replicated: ```{r} #| label: replication-outcomes From d298d8c4f0ec334d644b351bfdc0659283f1b789 Mon Sep 17 00:00:00 2001 From: egouldo Date: Thu, 1 Feb 2024 14:19:08 +1100 Subject: [PATCH 10/15] #27 add link to GitHub repository --- inst/ms/aggreCAT.qmd | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/inst/ms/aggreCAT.qmd b/inst/ms/aggreCAT.qmd index 2691cb6..589988c 100644 --- a/inst/ms/aggreCAT.qmd +++ b/inst/ms/aggreCAT.qmd @@ -1591,9 +1591,11 @@ IDEA protocol for a fisheries and conservation problem. The package will be actively maintained into the future, beyond the life of the DARPA SCORE program. Bug reports and feature-requests can easily -be lodged on the [aggreCAT]{.pkg} GitHub repository using reproducible -examples created with [reprex]{.pkg} [@reprexpkg2020] on the repliCATS -pilot study datasets shipped with the [aggreCAT]{.pkg} package. +be lodged on the [aggreCAT]{.pkg} [GitHub +repository](https://github.com/metamelb-repliCATS/aggreCAT) using +reproducible examples created with [reprex]{.pkg} [@reprexpkg2020] on +the repliCATS pilot study datasets shipped with the [aggreCAT]{.pkg} +package. 
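The kind of minimal, self-contained report that this invites can be
sketched in a few lines (illustrative only; it assumes nothing beyond
the shipped `data_ratings` dataset):

```{r}
#| label: reprex-sketch
#| eval: false
# Sketch: render a minimal reproducible example against a shipped
# dataset with reprex before filing an issue on GitHub.
reprex::reprex({
  library(aggreCAT)
  AverageWAgg(data_ratings, type = "ArMean")
})
```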
We have described the computational implementation of the aggregation methods and supporting tools within the [aggreCAT]{.pkg} package, From 9e2898b8ed3a973eaeb1208900c4821fd653133a Mon Sep 17 00:00:00 2001 From: egouldo Date: Thu, 1 Feb 2024 16:31:08 +1100 Subject: [PATCH 11/15] #29 experiment replace gt:: with tinytable:: --- inst/ms/TEST_TABLE.qmd | 1680 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1680 insertions(+) create mode 100644 inst/ms/TEST_TABLE.qmd diff --git a/inst/ms/TEST_TABLE.qmd b/inst/ms/TEST_TABLE.qmd new file mode 100644 index 0000000..24529c4 --- /dev/null +++ b/inst/ms/TEST_TABLE.qmd @@ -0,0 +1,1680 @@ +--- +title: "aggreCAT: an R Package for Mathematically Aggregating Expert Judgements" +format: + jss-pdf: + keep-tex: true + header-includes: + - \usepackage[utf8]{inputenc} + - \usepackage{amsmath} + - \usepackage{amsfonts} + - \usepackage{caption} + - \usepackage{booktabs} + - \usepackage{longtable} + - \usepackage{array} + - \usepackage{multirow} + - \usepackage{wrapfig} + - \usepackage{float} + #- \usepackage{colortbl} + - \usepackage{pdflscape} + - \usepackage{tabu} + - \usepackage{threeparttable} + - \usepackage{threeparttablex} + - \usepackage[normalem]{ulem} + - \usepackage{makecell} + - \usepackage{xcolor} + - \newcommand{\blandscape}{\begin{landscape}} + - \newcommand{\elandscape}{\end{landscape}} + - \usepackage{underscore} + # - \usepackage[authoryear,round]{natbib} + x11names: true + journal: + cite-shortnames: true + type: article + jss-html: default +author: + # use this syntax to add text on several lines + # To add another line, use \AND at the end of the previous one as above + - name: 'Elliot Gould^[School of Ecosystem and Forest Sciences, University of Melbourne]' + email: 'elliot.gould (at) unimelb.edu.au' + orcid: '0000-0002-6585-538X' + affiliations: 'University of Melbourne' + address: | + | School of Forest and Ecosystem Sciences + | University of Melbourne, Parkville, Victoria 3010 + - name: "Charles T. Gray" + affiliations: 'Newcastle University' + - name: "Aaron Willcox" + affiliations: 'University of Melbourne' + orcid: '0000-0003-2536-2596' + - name: "Rose O'Dea" + affiliations: 'University of Melbourne' + orcid: '0000-0001-8177-5075' + - name: "Rebecca Groenewegen" + affiliations: 'University of Melbourne' + orcid: '0000-0001-9177-8536' + - name: 'David P. Wilkinson' + orcid: '0000-0002-9560-6499' + affiliations: 'University of Melbourne' +abstract: | + Structured elicitation protocols, such as the IDEA protocol, may be used to elicit expert judgements in the form of subjective probabilities from multiple experts. Judgements from individual experts about a particular phenomena must therefore be mathematically aggregated into a single prediction. Aggregation becomes more complicated when judgements are elicited with uncertainty bounds and when there are multiple rounds of elicitation. This paper presents the new [R]{.proglang} package [aggreCAT]{.pkg}, which provides 27 unique aggregation methods for combining individual judgements into a single, probabilistic measure. The aggregation methods were developed as a part of the Defense Advanced Research Projects Agency (DARPA) 'Systematizing Confidence in Open Research and Evidence' (SCORE) programme, which aims to generate confidence scores or estimates of 'claim credibility' for over 4000 research claims from the social and behavioural sciences. We provide several worked examples illustrating the underlying mechanics of the aggregation methods. 
We also describe a general workflow for using the software in practice to facilitate uptake of this software for appropriate use-cases. +keywords: [mathematical aggregation, expert judgement, DARPA SCORE, replicability, R] +keywords-formatted: [mathematical aggregation, expert judgement, DARPA SCORE, replicability, "[R]{.proglang}"] +bibliography: bibliography.bib +editor: + markdown: + wrap: 72 +callout-appearance: simple +callout-icon: false +appendix-style: none +# pdf-engine: citeproc #until quarto repsects classoption:shortnames +--- + +## TEST TABLE + +```{r} +library(tidyverse) +library(aggreCAT) +library(aggreCAT) +library(tinytex) +options(tinytex.verbose = TRUE) +library(knitr) +options(kableExtra.latex.load_packages = FALSE) +library(kableExtra) +library(tinytable) +``` + + +## Introduction {#sec-introduction} + +Expert judgement is frequently used to inform forecasting about +uncertain future events across a range of disciplines, including +ecology, conservation science, human geography, political science, and +management [@Sutherland2018]. It is best-practice to elicit judgements +from diverse groups to capitalise on the 'wisdom of crowds' +[@hemming2017], because groups of experts tend to perform better than a +single expert [@Goossens2008]. However judgements or forecasts must then +be distilled into a single forecast, ideally accompanied by estimates of +uncertainty around those estimates [@Hanea2021]. Judgements from +multiple experts may be combined into a single forecast using either +behavioural approaches that force experts into forming consensus, or +mathematical approaches [@Goossens2008]. + +Although there are a variety of methods for mathematically aggregating +expert judgements into single point-predictions, there are few +open-source software implementations available to analysts or +researchers. The [R]{.proglang} [@R] package [expert]{.pkg} provides +three models of expert opinion to combine judgements elicited from +groups of experts (CITE) , and [SHELF]{.pkg} implements only a single +method (weighted linear pool) for aggregating expert judgements (CITE). +Other [R]{.proglang} packages providing methods to mathematically +aggregate expert judgements do so for non-point predictions, for +example, [opera]{.pkg}, which generates time-series predictions (CITE). +In this paper we present the [aggreCAT]{.pkg} package, which provides 27 +different methods for mathematically aggregating judgements within +groups of experts into a single forecast. + +### DARPA SCORE program and the repliCATS project {#sec-repliCATS} + +The [aggreCAT]{.pkg} package, and the mathematical aggregators therein, +were developed by [the repliCATS (Collaborative Assessment for +Trustworthy Science) +project](https://replicats.research.unimelb.edu.au/) as a part of the +[SCORE +program](https://www.darpa.mil/program/systematizing-confidence-in-open-research-and-evidence) +(Systematizing Confidence in Open Research and Evidence), funded by +DARPA (Defense Advanced Research Projects Agency) [@alipourfard2021]. +The SCORE program is the largest replication project in science to date, +and aims to build automated tools that can rapidly and reliably assign +"Confidence Scores" to research claims from empirical studies in the +Social and Behavioural Sciences (SBS). 
Confidence Scores are +quantitative measures of the likely reproducibility or replicability of +a research claim or result, and may be used by consumers of scientific +research as a proxy measure for their credibility in the absence of +replication effort [@alipourfard2021]. + +Replications are time-consuming and costly [@Isager2020], and studies +have shown that replication outcomes can be reliably elicited from +researchers [@Gordon2020]. Consequently, the DARPA SCORE program +generated Confidence Scores for $> 4000$ SBS claims using expert +elicitation based on two very different strategies -- prediction markets +[@Gordon2020] and the IDEA protocol [@hemming2017], the latter of which +is used by the repliCATS project [@Fraser:2021]. A proportion of these +research claims were randomly selected for direct replication, against +which the elicited and aggregated Confidence Scores are 'ground-truthed' +or verified. The aim of the DARPA SCORE project is to aid the +development of artificial intelligence tools that can automatically +assign Confidence Scores. + +#### The repliCATS IDEA protocol {#sec-IDEAprotocol} + +The repliCATS project adapted and deployed the IDEA protocol to elicit +crowd-sourced judgements from diverse groups about the likely +replicability of SBS research claims [@Fraser:2021]. The IDEA +('Investigate', 'Discuss', 'Estimate' and 'Aggregate') protocol is a +four-step structured elicitation protocol that draws on the 'wisdom of +crowds' to elicit subjective judgements about the likelihood of +uncertain events [@hemming2017, figure 1]. To collect expert judgements +about the replicability of SBS claims, we asked participants to estimate +the "probability that direct replications of a study would find a +statistically significant effect in the same direction as the original +claim", eliciting estimates of uncertainty in the form of upper and +lower bounds on those point-estimates. Judgements were elicited using +the repliCATS platform [@Pearson2021], a multi-user cloud-based software +platform that implements the IDEA protocol, between July 7th 2019 and +November 30th 2020. + +During the *Investigate* phase, individuals review the claim and draw on +background knowledge to provide estimates of the probability of the +claim replicating. These estimates are numeric (0 - 100%) and include a +best estimate, and upper and lower bounds on that estimate. Individuals +also provide a value on the likert binary scale up to 7, rating the +individuals' degree of comprehension of the claim. Individuals were also +asked to provide short comments justifying their estimates. + +In the *Discuss* phase, three-point estimates from each group member are +anonymously presented to the group, who then collectively discuss +differences in opinion and provide potential evidence for these +differences. Next, in the *Estimate* phase, individuals are asked to +provide a second set of three-point probablistic judgements. Thus, for a +single assessment, 2 sets of judgements are elicited from each expert, +*pre-* and *post-*group discussion, referred to here on as *Round 1* and +*Round 2* judgements, respectively. + +During the fourth phase, *Aggregate*, judgements are mathematically +aggregated into a single Confidence Score, or in this case, forecast of +replicability. The repliCATS project developed 27 different methods for +mathematically aggregating judgements elicited from groups of experts +into Confidence Scores [@Hanea2021]. 
We developed the [aggreCAT]{.pkg} +package to implement these aggregation methods and deliver Confidence +Scores for over 4000 SBS research claims as a part of the DARPA SCORE +project. + +![The IDEA protocol as deployed by the repliCATS project (reproduced +with permission from Wintle et al. +2021).](images/img_IDEA_repliCATS.png "The IDEA protocol as deployed by the repliCATS project (reproduced with permission from Wintle et al. 2021"){#fig1} + +## Introducing the aggreCAT package + +In this paper we aim to provide a detailed overview of the +[aggreCAT]{.pkg} package so that researchers may apply the aggregation +functions described in [@Hanea2021] to their own expert elicitation +datasets where mathematical aggregation is required. Note that +judgements that have already been subjected to behavioural or consensus +aggregation may not be subsequently mathematically aggregated, however +individual elicited judgements may be aggregated mathematically as an +alternative or complement to behavioural or consensus-based aggregation. + +We begin by formulating the problem of mathematically aggregating expert +judgements. Each method, and its data requirements is summarised in +Table \ref{tbl-method-summary-table}. Before outlining key aggregation +methods, we briefly summarise package datasets, which were collected by +the repliCATS project. By first describing the datasets before +describing the aggregation methods in detail, we aim to provide a +grounded understanding of the different outputs of expert elicitation +using the repliCATS IDEA protocol, and the inputs available to the +aggregation functions. + +Next, we describe and illustrate the main types of aggregators, which +may be categorised according to their data requirements, mathematical +properties and computational implementation (@sec-focal-claims). By +selecting representative functions of each key aggregator type and +applying them to a subset of focal claims, we demonstrate the internal +mechanics of how these methods differently operationalise the data to +generate forecasts or Confidence Scores. We do not give advice on the +circumstances in which each method should be used, instead, choice of +aggregation method should be informed by the mathematical properties of +the method, the desired properties of an aggregation, and the purpose +for which the aggregation is being used. For a detailed description of +each method as well as a discussion of their relative merits, see +[@Hanea2021]. + +Finally, we provide a detailed workflow for aggregating expert judgments +for multiple forecasts, using multiple aggregation functions, as +implemented by the repliCATS project in the course of delivering $>$ +4000 Confidence Scores for the DARPA SCORE program. The [aggreCAT]{.pkg} +package provides a set of supporting functions for evaluating or +ground-truthing aggregated forecasts or Confidence Scores against a set +of known-outcomes, as well as functions for visualising comparisons of +different aggregation methods and the outcomes of performance +evaluation. We describe and demonstrate this functionality in the +presentation of the repliCATS workflow. 
The workflow is representative +of the probable challenges faced by the researcher in the course of +mathematically aggregating groups of forecasts, and should equip the +reader to use [aggreCAT]{.pkg} for their own datasets; it exemplifies +how to extend the [aggreCAT]{.pkg} package to any expert judgement +dataset from any domain in which there are multiple judgements from +multiple individuals that must be combined into a single forecast. + +## Mathematically Aggregating Expert Judgements + +Mathematically, the aggregation methods can be divided into three main +types: + +- Un-weighted linear combination of best estimates, transformed best + estimates or distributions, + +- Weighted linear combinations of best estimates, transformed best + estimates and of distributions, where weights are proxies of + forecasting performance constructed from characteristics of + participants and/or their judgements, and + +- Bayesian methods that use participant judgements as data with which + to update both uninformative and informative priors. + +However, the [aggreCAT]{.pkg} package user might wish to categorise the +aggregation methods according to aspects of their computational +implementation and data requirements, because these inform the function +arguments as well as the type and form of the data that is parsed to the +aggregation functions. These aspects include: + +- Elicitation requirement, number of elicitation rounds: the majority + of aggregation methods require data from only a single round of + judgements, i.e. the final post-discussion estimates. However, some + aggregation methods require data from both rounds of judgements, + which may be elicited using the IDEA protocol or another similarly + structured elicitation protocol in which there are two rounds of + judgements. +- Elicitation requirement, single point or three point elicitation: + several aggregation methods use only a single data point elicited + from individuals (their best estimate), however, most aggregation + methods require a best estimate, and estimates of uncertainty in the + form of upper and lower bounds. +- Number of claims / forecasts assessed by the individual: some + weighted aggregation methods consist of weights that are calculated + from properties of participant judgements across multiple + forecasting questions, not just the target claim being aggregated. + Secondly, for aggregation methods that calculate variance in + estimates, variance cannot be calculated on a single data point. + While two is the mathematical minimum, the user should give + consideration to what minimum number of claims should be used to + reliably calculate measures of variance. +- Supplementary data requirements: several aggregation methods require + supplementary data collected either in addition to or as part of the + repliCATS IDEA protocol, some of which will need additional + qualitative coding before being parsed to the aggregation function. + +The data and structured elicitation protocol requirements are described +in Table \ref{tbl-method-summary-table}. All aggregation methods +requiring a single round of estimates can therefore be applied to expert +judgments derived from any structured elicitation protocol that +generates, lower, upper, and best estimates from each individual (i.e. +not just the IDEA protocol) and does not enforce behavioural consensus. + +#### Notation and Problem Formulation + +Here we describe some preliminary mathematical notation used to +represent each aggregation method. 
For the mathematical specification of
each individual aggregation function, please consult [@Hanea2021] or the
[aggreCAT]{.pkg} package function documentation.

The total number of research claims, $claim$, or unique forecasts being
assessed, $C$, is indexed by $c = 1, ..., C$. The total number of
individuals / experts / participants is denoted by $N$, and is indexed
by $i = 1, ..., N$. Each claim *outcome* (i.e. the outcome of a
replication study) assumes binary values, where the value is 0 if the
claim is false, and 1 if the claim is true. '`TRUE`' claims are claims
where the replication study found a statistically significant result in
the same direction as the original research claim, and '`FALSE`' claims
are those where the replication study *did not* find a significant
result in the same direction as the original study. For each claim $c$,
an individual $i$ assesses the probability of a claim replicating by
providing three probabilities: a lower bound ${L}_{i,c}$, an upper bound
${U}_{i,c}$, and a best estimate $B_{i,c}$, satisfying the inequalities:
$0 \le L_{i,c} \le B_{i,c} \le U_{i,c} \le 1$.

Every claim is assessed by multiple individuals, and their probabilities
are aggregated using one of the aggregation methods to obtain a group or
aggregate probability, denoted by $\hat{p}_c$. The aggregated
probability calculated using a specific method is given by
$\hat{p}_{c}\left(Method \space ID \right)$. Each aggregation is
assigned a unique $Method \space ID$, an abbreviation of the
mathematical operation used in calculating the weights. Note that all
Best, Lower and Upper estimates are taken to be `round 2` judgements
from the repliCATS IDEA protocol ([Figure 1](#fig1)), unless appended
with a "1", in which case they are `round 1` judgements, e.g. $B1_{i,c}$
denotes the `round 1` Best estimate from individual $i$ for claim $c$.

##### Weighting Expert Forecasting Performance

Equal weighting of judgements is less calibrated, accurate and
informative than weighted aggregation methods in which judgements from
experts who performed well in similar judgement tasks are more heavily
weighted [@Hanea2021]. Proxies for forecasting performance, such as
breadth and variability of qualitative reasons used by experts to
justify their judgements, can be used to form weights in the absence of
measures of experts' prior performance [@Hanea2021].

The aggregation methods other than the [AverageWAgg]{.fct} and Bayesian
approaches in [aggreCAT]{.pkg} each employ weighting schemes informed by
proxies for good forecasting performance, whereby experts' estimates are
weighted differently by measures of reasoning, engagement, openness to
changing their mind in light of new facts, evidence or opinions
presented in the discussion round, extremity of estimates,
informativeness of estimates, asymmetry of estimate bounds, granularity
of estimates, and by prior statistical knowledge as measured in a quiz.

Below, we define standardised notation for describing weighted linear
combinations of individual judgements, where un-normalised weights are
denoted by $w\_method$ and normalised weights by $\tilde{w}\_method$
(@eq-eqn1). Given that for all aggregation methods weights are
normalised, and that the normalisation process is the same for each
aggregation method, the equations for the aggregation methods are
presented for un-normalised weights.

$$
\hat{p}_c\left(Method \space ID \right) = \sum_{i=1}^N \tilde{w}\_method_{i,c} B_{i,c}
$$ {#eq-eqn1}

By default, weights are calculated on a per-individual, per-claim basis,
such that judgements from the same individual are weighted differently
across the claims they have provided judgements for. There are some
exceptions to this default: `GranWAgg`, `QuizWAgg`, `IndIntWAgg`,
`IndIntAsymWAgg`, `VarIndIntWAgg` and `KitchSinkWAgg`. Note that
`IndIntWAgg`, and methods that include its weighting function
`weight_nIndivInterval()` as a component, re-scale weights by a fixed
measure across all claims. For aggregation methods that use information
from claims other than the target claim for which the Confidence Score
is being computed, claims are indexed by $d = 1, ..., C$. Where the
default weighting is used, this is coded into each function. However,
where more complex and method-specific weighting methods are used,
modularised functions have been created for ease of debugging. These
function names are prefixed with `weight_`.

### Package datasets

The [aggreCAT]{.pkg} package includes the core dataset `data_ratings`,
consisting of judgements elicited during a pilot experiment exploring
the performance of IDEA groups in assessing the replicability of a set
of claims with "known outcomes". "Known-outcome" claims are SBS research
claims that have been subject to replication studies in previous
large-scale replication projects[^1]. Data were collected using the
repliCATS IDEA protocol at a two-day workshop[^2] in the Netherlands in
July 2019, at which 25 participants assessed the replicability of 25
unique SBS claims. In addition to the probabilistic estimates provided
for each research claim assessed, participants were also asked to rate
the claim's plausibility and comprehensibility, to answer whether they
were involved in any aspect of the original study, and to provide their
reasoning in support of their quantitative estimates, which was used to
form measures of reasoning breadth and engagement [@Fraser:2021].

[^1]: Many Labs 1, 2 and 3 [@Klein2014; @Klein2018ManyL2;
    @Ebersole2016], the Social Sciences Replication Project
    [@Camerer2018] and the Reproducibility Project: Psychology
    [@aac4716].

[^2]: See @Hanea2021 for details. The workshop was held at the annual
    meeting of the Society for the Improvement of Psychological Science
    (SIPS), [https://osf.io/ndzpt/](https://osf.io/ndzpt/){.uri}.

`data_ratings` is a *tidy* [data.frame]{.class} wherein each
*observation* (or row) corresponds to a single value in the set of
`value`s constituting a participant's complete assessment of a research
claim. Each research claim is assigned a unique `paper_id`, and each
participant has a unique (and anonymous) `user_name`. The variable
`round` denotes the round in which each `value` was elicited (`round_1`
or `round_2`). `question` denotes the type of question the `value`
pertains to; `direct_replication` for probabilistic judgements about the
replicability of the claim, `belief_binary` for participants' belief in
the plausibility of the claim, `comprehension` for participants'
comprehensibility ratings, and `involved_binary` for involvement in the
original study.
An additional column `element` maintains the tidy
structure of the data, while capturing the multiple `value`s that
comprise a full assessment of the replicability (`direct_replication`)
of a claim; `three_point_best`, `three_point_lower` and
`three_point_upper` denote the best estimate and the lower and upper
bounds respectively. `binary_question` describes the `element` for both
the plausibility rating (`belief_binary`) and involvement
(`involved_binary`) questions, whereas `likert_binary` is the `element`
describing a participant's `comprehension` rating. Judgements are
recorded in the column `value` in the form of percentage probabilities
ranging from 0 to 100. The `binary_question`s corresponding to
plausibility and involvement consist of binary values (`1` for the
affirmative, and `-1` for the negative). Finally, values corresponding
to participants' comprehension ratings are on a `likert_binary` scale
from `1` through `7`. Note that additional columns with participant
attributes can be included in the ratings dataset if required by the
user; we include the `group` column in `data_ratings`, which describes
the group number the participant was a part of. Below we show some
example data for a single user on a single claim to illustrate the
structure of the core `data_ratings` dataset.

```{r}
#| label: data_ratings-sample
#| message: false
#| prompt: true
#| results: hold
library(tidyverse, quietly = TRUE)
library(aggreCAT)
aggreCAT::data_ratings %>%
  dplyr::filter(paper_id == dplyr::first(paper_id),
                user_name == dplyr::first(user_name)) %>%
  print(., n = nrow(.))
```

Not all of the data necessary for constructing performance weights are
contained in `data_ratings`. Additional data collected as part of the
repliCATS IDEA protocol are contained in datasets separate from
`data_ratings`. Participants provided justifications for giving
particular judgements, and these are contained in `data_justifications`.
On the repliCATS platform users were given the option to comment on
others' justifications (`data_comments`), to vote on others' comments
(`data_comment_ratings`) and on others' justifications
(`data_justification_ratings`). Finally, [aggreCAT]{.pkg} contains three
'supplementary' datasets containing data collected externally to the
repliCATS IDEA protocol: `data_supp_quiz`, `data_supp_priors`, and
`data_supp_reasons`.

#### Quiz Score Data {#sec-quiz-supplementary-data}

Prior to the workshop, participants were asked to complete an optional
quiz on statistical concepts and meta-research that we expected would
aid in reliably evaluating the replicability of research claims. Quiz
responses are contained in `data_supp_quiz` and are used to construct
performance weights for the aggregation method `QuizWAgg`, where each
participant receives a `quiz_score` if they completed the quiz, and `NA`
if they did not attempt the quiz [see @Hanea2021 for further details].

#### Reasoning Data {#sec-reasonwagg-supplementary-data}

`ReasonWAgg` uses the number of unique reasons given by participants to
support a Best Estimate for a given claim, $B_{i,c}$, to construct
performance weights; these reasoning data are contained in
`data_supp_reasons`. Qualitative statements made by individuals during
claim evaluation were recorded on the repliCATS platform [@Pearson2021]
and coded as falling into one of 25 unique reasoning categories by the
repliCATS Reasoning team [@Wintle:2021].
Reasoning categories include plausibility of the
claim, effect size, sample size, presence of a power analysis,
transparency of reporting, and journal reporting [@Hanea2021]. Within
`data_supp_reasons`, each of the reasoning categories that passed our
inter-coder reliability threshold are distributed as columns in the
dataset whose names are prefixed with `RW`, and for each claim
`paper_id`, each participant `user_name` is assigned `1` if they
included that reasoning category in support of their Best Estimate for
that claim, and `0` otherwise. See @sec-ReasoningWAgg for details on the
`ReasonWAgg` aggregation method.

#### Bayesian Prior Data {#sec-bayesian-supplementary-data}

The method `BayPRIORsAgg` uses Bayesian updating to revise a prior
probability of a claim replicating, estimated from a predictive model
[@Gould2021a], using an aggregate of the best estimates of all
participants assessing a given claim $c$ [@Hanea2021]. The prior data
are contained in `data_supp_priors`, with each claim in column
`paper_id` being assigned a prior probability (on the logit scale) of
the claim replicating in column `prior_means`.

#### Aggregation Wrapper Functions

Although there are 27 aggregation methods in total, we grouped methods
based on their mathematical properties into eight 'wrapper' functions,
denoted by the suffix `WAgg`, the abbreviation of *weighted
aggregation*: `LinearWAgg()`, `AverageWAgg()`, `BayesianWAgg()`,
`IntervalWAgg()`, `ShiftingWAgg()`, `ReasoningWAgg()`,
`DistributionWAgg()`, and `ExtremisationWAgg()`. The specific
aggregation *method* is applied according to the `type` argument, whose
options are described in each aggregation wrapper function's help page.

### 'Tidy' Aggregation and Prescribed Inputs

The design philosophy of [aggreCAT]{.pkg} is founded on the principles
of 'tidy' data [@Wickham:2014vp]. Each aggregation method expects a
[data.frame]{.class} or [tibble]{.class} of judgements (`data_ratings`)
as its input, and returns a [tibble]{.class} containing the variables
`method`, `paper_id`, `cs` and `n_experts` (see @sec-AverageWAgg for an
illustration of outputs), where `method` is a character vector
corresponding to the aggregation method name specified in the `type`
argument. Each aggregation is applied as a summary function
[@Wickham2017R], and therefore returns a single row or observation with
a single Confidence Score `cs` for each claim or `paper_id`. The number
of expert judgements summarised in the aggregated Confidence Score is
returned in the column `n_experts`. Because of the tidy nature of the
aggregation outputs, multiple aggregations can be applied to the same
data, with the results of all aggregation methods row-bound together in
a single [tibble]{.class} (see the example repliCATS workflow in
@sec-workflow).

The tibble of judgements to be aggregated (`data_ratings`) requires the
columns `round`, `paper_id`, `user_name`, `question`, `element`, `value`
and `group`. Each observation in the judgement data corresponds to a
single `value` for a single `question` elicited from a single
`user_name` about a given `paper_id` in a single `round`. Elicited
`value`s correspond to one of four types of `question`. Estimates about
the event probability for a given `paper_id` correspond to
`"direct_replication"` in the `question` variable. The type of estimate
the `value` belongs to is recorded in the `element` variable, and may be
one of `"three_point_lower"`, `"three_point_best"`, or
`"three_point_upper"`.
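
To make the prescribed input format concrete, below is a minimal sketch
of a hand-built judgements [data.frame]{.class}; the claim, participants
and values are invented purely for illustration, and are not drawn from
the repliCATS datasets.

```{r}
#| label: toy-judgements-sketch
#| eval: false
library(aggreCAT)
library(tibble)

# Two hypothetical experts assessing one hypothetical claim in round 2;
# values are percentages, as expected by default.
toy_judgements <- tribble(
  ~round,    ~paper_id, ~user_name, ~question,            ~element,            ~value, ~group,
  "round_2", "demo_1",  "expert_A", "direct_replication", "three_point_lower",     40,      1,
  "round_2", "demo_1",  "expert_A", "direct_replication", "three_point_best",      50,      1,
  "round_2", "demo_1",  "expert_A", "direct_replication", "three_point_upper",     65,      1,
  "round_2", "demo_1",  "expert_B", "direct_replication", "three_point_lower",     60,      1,
  "round_2", "demo_1",  "expert_B", "direct_replication", "three_point_best",      70,      1,
  "round_2", "demo_1",  "expert_B", "direct_replication", "three_point_upper",     80,      1
)

# Arithmetic mean of the two best estimates: (50 + 70) / 2 = 60
AverageWAgg(toy_judgements, type = "ArMean")
```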

Every aggregation function requires at least one `value` derived from
three-point elicitation (`question == "direct_replication"`) in the
[data.frame]{.class} supplied to the `expert_judgements` argument;
however, some methods require only the best estimates
(`element == "three_point_best"`) for mathematical aggregation.
Similarly, some aggregation methods require multiple `round`s of
judgements, while others require only a single round. Only the
aggregation method *CompWAgg* requires `value`s for the `comprehension`
question. For a summary of each aggregation method, its calling
function, and its data requirements and sources, see
@tbl-method-summary-table.

## Focal Claim Aggregation {#sec-focal-claims}

We now demonstrate how judgements elicited from a diverse group of
individuals may be mathematically aggregated for a single forecasting
problem, using the datasets provided by [aggreCAT]{.pkg}. We illustrate
the internal mechanics of the weighting methods and the different data
requirements of each of the different types of aggregators, namely:
methods with non-weighted linear combinations of judgements, weighted
linear combinations of judgements, re-scaled weighted linear
combinations of judgements, methods that require supplementary data, and
methods that require data elicited from the full IDEA protocol. Each
group of methods differs in the type of judgements elicited (single- or
three-point estimates), the number of elicitation rounds (one or two
rounds), whether multiple forecasts / elicited judgements are used
during Confidence Score computation for a target forecast / claim, and
finally whether supplementary data are required for aggregation.

Here we demonstrate the application of aggregation methods for each
group of methods using a set of 'focal claims' selected from the pilot
study dataset supplied with the [aggreCAT]{.pkg} package. Below we
subset the dataset `data_ratings` to include a sample of four claims
with judgements from five randomly sampled participants. From these
focal claims, we select a target claim to which we will apply an
exemplar aggregation method from each mathematical aggregator
(@tbl-focal-claim).

```{r}
#| label: focal-claim-selection
#| prompt: true
set.seed(1234)
focal_claims <- data_ratings %>%
  dplyr::filter(paper_id %in% c("24", "138", "186", "108"))
# select 5 users to highlight in focal claim demonstration
focal_users <- focal_claims %>%
  dplyr::distinct(user_name) %>%
  dplyr::slice_sample(n = 5)
# filter out non-focal users from focal claims
focal_claims <- focal_claims %>%
  dplyr::right_join(focal_users, by = "user_name")
focal_claims
```

```{r}
#| label: tbl-focal-claim
#| tbl-cap: "Focal Claim Data: Round 2 expert judgements for claim 108 derived from a subset of 4 claims and 5 participants from `data\_ratings`. Judgements are displayed as percentages."
#| eval: true
#| echo: false
#| results: asis
focal_claims %>%
  filter(question == "direct_replication",
         paper_id == "108",
         round == "round_2") %>%
  pivot_wider(names_from = element,
              values_from = value) %>%
  arrange(user_name) %>%
  select(paper_id,
         user_name,
         three_point_lower,
         three_point_best,
         three_point_upper) %>%
  rename(`Claim ID` = paper_id,
         `User Name` = user_name,
         `Lower Bound` = three_point_lower,
         `Best Estimate` = three_point_best,
         `Upper Bound` = three_point_upper) %>%
  tt()
```

### Non-weighted linear combination of judgements {#sec-AverageWAgg .section}

We first demonstrate the mechanics of mathematical aggregation and its
implementation using the [aggreCAT]{.pkg} package with the simplest,
unweighted aggregation method, `ArMean`. All other aggregation methods
take this underlying computational blueprint and expand on it according
to each method's requirements (see [Box 1](#aggWorkflow) for details).
`ArMean` (@eq-ArMean) takes the unweighted linear average (i.e. the
arithmetic mean) of the best estimates, $B_{i,c}$.

$$
\hat{p}_c\left(ArMean \right ) = \frac{1}{N}\sum_{i=1}^N B_{i,c}
$$ {#eq-ArMean}

Below we demonstrate the application of `ArMean` to a single claim,
`108`, for a subset of participants who assessed this claim. We also
illustrate this aggregation visually in [Figure 2](#fig-ArMean).
`ArMean` is applied using the wrapper function [*AverageWAgg*]{.fct},
which implements several aggregation methods that calculate different
types of averaged best estimates (see `?AverageWAgg`). The function
returns the Confidence Score for the claim in the form of a
[tibble]{.class}:

```{r}
#| label: focal-claim-ArMean
#| prompt: true
#| message: false
focal_claims %>%
  dplyr::filter(paper_id == "108") %>%
  AverageWAgg(type = "ArMean")
```

```{r}
#| label: fig-ArMean
#| fig-cap: "ArMean with `AverageWAgg()` uses the Estimates (shown in colour) from each participant to compute the mean. We illustrate this using a single claim `108` for a subset of 5 out of 25 participants from the `data_ratings` dataset. Note that the data representations in this figure are for explanatory purposes only; the data in the actual aggregation are tidy and in long format."
#| results: hold
#| echo: false
#| out-width: "600px"
knitr::include_graphics(path = "images/ArMean.png")
```

::: callout
::: {#aggWorkflow}
### Box 1: Aggregation Workflow Blueprint {.unnumbered}

#### Argument Structure and Expected Form

Each aggregation *wrapper* function takes the following arguments:
`expert_judgements`, `type`, `name`, `placeholder` and `percent_toggle`:

```{r}
#| prompt: true
args(AverageWAgg)
```

The aggregation *method* to be applied by the aggregation *function* is
specified by the `type` argument, defaulting to `ArMean` in the above
example. The resultant [tibble]{.class} of Confidence Scores includes
the `name` of the aggregation method applied, defaulting to the `type`
argument, but this can be overridden by the user if they supply a
non-`NULL` value to `name`.

Percentage values, counts, or other non-probabilistic quantities are the
default expected value type for ratings supplied to the
`expert_judgements` argument of aggregation functions. Setting the
argument `percent_toggle` to `TRUE` converts percentage values to
probabilities by dividing judgements by 100 within the aggregation
functions.
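
A minimal sketch of the toggle in action, reusing the `focal_claims`
subset defined above:

```{r}
#| label: percent-toggle-sketch
#| eval: false
# Judgements in data_ratings are percentages (0-100); toggling the
# conversion returns the Confidence Score on the probability scale.
AverageWAgg(focal_claims, type = "ArMean", percent_toggle = TRUE)
# Without the toggle, the same Confidence Score is returned on the
# percentage scale.
AverageWAgg(focal_claims, type = "ArMean")
```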

When working with regularly updated data and developing a reproducible
pipeline [@Yenni2019], it can be useful to put aggregation methods into
'placeholder' mode, whereby a placeholder value is returned by the
aggregation function instead of computing a Confidence Score using the
aggregation method. By setting `placeholder` to `TRUE`, the user can
supply a placeholder Confidence Score, which defaults to $65\%$, the
approximate average replication rate of SBS research claims
[@Camerer2018]. Should the user wish to set an alternative value, they
can create a modified version of `method_placeholder()` for themselves
and store this within the global environment. This function will then be
called by the aggregation method when the `placeholder` argument is set
to `TRUE`.

Some functions expect further arguments, especially those that rely on
supplementary data; see the *man* pages for details of these additional
arguments.

#### Mathematical Aggregation Computational Workflow Blueprint

Each aggregation function follows a general computational workflow
'blueprint' whereby the primary dataset `data_ratings`, passed to the
`expert_judgements` argument, is first pre-processed by
`preprocess_judgements()`, weights are computed if applicable,
subsequently the aggregation method is applied using
`dplyr::summarise()`, and then finally the aggregated data is passed to
`postprocess_judgements()`.

The `preprocess_judgements()` function takes the primary dataset
`data_ratings` through the argument `expert_judgements` and filters the
required quantitative inputs for the aggregation method at hand. It uses
two filtering arguments to control which round of judgements is used as
input (`round_2_filter`), and whether the full set of three-point
elicitation judgements should be used, or whether other additional
elements must be returned (`three_point_filter`), including the
`likert_binary` elements for participants' comprehensibility ratings,
and the plausibility ratings under `binary_question` in the column
`element`. `three_point_filter` defaults to `TRUE` to provide only
direct replication questions and associated values. Nearly all
aggregation functions use only the round two judgements, so
`round_2_filter` defaults to `TRUE` (see Table
\ref{tbl-method-summary-table} for the required inputs of all
aggregation methods). `preprocess_judgements()` further removes missing
data, and returns the judgements in an appropriate structure for
calculating weights and applying the aggregation function with
`dplyr::summarise()`.

```{r}
#| label: demo-preprocess-judgements
#| prompt: true
data_ratings %>%
  dplyr::group_by(paper_id) %>%
  tidyr::nest() %>%
  dplyr::ungroup() %>%
  dplyr::slice_sample(n = 1) %>%
  tidyr::unnest(cols = c(data)) %>%
  preprocess_judgements()
```

The `preprocess_judgements()` function is exposed to the user to allow
for data formatting in preparation for plotting, e.g. with
[ggplot2]{.pkg} [@ggplot2016], or for developing bespoke aggregation
functions / methods not supplied in [aggreCAT]{.pkg}.

For some aggregation methods, weights are necessary, and thus are
computed prior to aggregation. Some aggregation methods compute weights
using separate weighting functions (see Table
\ref{tbl-method-summary-table}); for aggregation methods with simpler
weight computations, the weights are defined in-function rather than
being modularised.
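
To make the blueprint concrete, below is a minimal sketch of a bespoke
aggregation function assembled from these components. `MedianAgg` is a
hypothetical method, not part of [aggreCAT]{.pkg}, and we assume (as in
the plotting example in @sec-workflow) that the pre-processed data
retain the `paper_id`, `user_name`, `element` and `value` columns; here
we summarise manually rather than calling a post-processing function.

```{r}
#| label: bespoke-median-sketch
#| eval: false
# A hypothetical unweighted method: the median of round-2 best estimates.
MedianAgg <- function(expert_judgements) {
  expert_judgements %>%
    preprocess_judgements() %>%
    dplyr::filter(element == "three_point_best") %>%
    dplyr::group_by(paper_id) %>%
    dplyr::summarise(method = "MedianAgg",
                     cs = median(value),
                     n_experts = dplyr::n_distinct(user_name),
                     .groups = "drop") %>%
    dplyr::select(method, paper_id, cs, n_experts)
}

MedianAgg(data_ratings)
```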

After `preprocess_judgements()` has been applied, weights have been
constructed, and the aggregation method has been applied, the function
`postprocess_judgements()` processes the results into the final data
frame that is returned by each aggregation function. The post-processing
function returns a [tibble]{.class} with one observation for each unique
claim passed to `postprocess_judgements()`, containing the `method`,
`paper_id`, the Confidence Score `cs`, as well as the total number of
participants `n_experts` whose assessments were used in the aggregation.
:::
:::

### Weighted linear combinations of judgements {#sec-IntWAgg .section}

We now demonstrate the construction of weights for forecasting
performance, as well as the use of uncertainty bounds in addition to the
Best Estimates (i.e. three-point estimates) in the aggregation
computation. The aggregation method `IntWAgg` weights each participant's
best estimate $B_{i,c}$ by the inverse width of their uncertainty
interval, i.e. the reciprocal of the difference between an individual's
upper ${U}_{i,c}$ and lower ${L}_{i,c}$ bounds
(@eq-weightnIndivInterval). For a given claim $c$, a vector of weights
for all individuals is calculated from their upper and lower estimates
using the weighting function [weight_interval]{.fct}, which computes a
weight from the interval width of each individual's estimate for the
target claim. The weights are then normalised across the claim (by
dividing each weight by the sum of all weights per claim). Normalised
weights are then multiplied by the corresponding individual's best
estimate, $B_{i,c}$, and summed together into a single Confidence Score
(@eq-IntWAgg; @fig-IntWAgg-IndIntWAgg).

### Re-scaled weighted linear combinations of judgements

Individuals vary in the interval widths they give across different
claims. `IndIntWAgg` is a variation on `IntWAgg` that accounts for
cross-claim variation within individuals' assessments by re-scaling the
interval width weights for individual $i$ for claim $c$ relative to the
widest interval provided by that individual across all claims $C$. For
the target claim, each individual's interval width is divided by the
maximum interval width that same individual gave across all claims they
have provided judgements for, using the weighting function
[`weight_nIndivInterval`]{.fct}. The process of re-scaling is
illustrated in @fig-IntWAgg-IndIntWAgg. Other aggregation methods that
re-scale weights by using data from claims other than the target claim
under aggregation are `VarIndIntWAgg`, `IndIntAsymWAgg` and
`KitchSinkWAgg` (applied with the wrapper function [IntervalWAgg]{.fct})
and `GranWAgg` (applied with the wrapper function [*LinearWAgg*]{.fct});
see Table \ref{tbl-method-summary-table}.

$$
w\_Interval_{i,c}= \frac{1}{U_{i,c} - L_{i,c}}
$$ {#eq-weightnIndivInterval}

$$
\hat{p}_c\left( IntWAgg \right) = \sum_{i=1}^N \tilde{w}\_Interval_{i,c}B_{i,c}
$$ {#eq-IntWAgg}

As for [*AverageWAgg*]{.fct}, when using the wrapper function
[*IntervalWAgg*]{.fct} we supply the aggregation method name as a
character string to the `type` argument and the focal claim data frame
to the argument `expert_judgements`, using [`dplyr::bind_rows`]{.fct} to
bind the resultant Confidence Scores together:

```{r}
#| label: focal-claim-IntWAgg
#| prompt: true
#| message: false
dplyr::bind_rows(
  aggreCAT::IntervalWAgg(expert_judgements = focal_claims %>%
                           dplyr::filter(paper_id == "108"),
                         type = "IndIntWAgg"),
  aggreCAT::IntervalWAgg(expert_judgements = focal_claims %>%
                           dplyr::filter(paper_id == "108"),
                         type = "IntWAgg")
  )
```

```{=tex}
\newpage
\blandscape
```
```{r}
#| label: fig-IntWAgg-IndIntWAgg
#| fig-cap: "Example applications of the mathematical aggregation methods a\\) `IntWAgg` and b\\) `IndIntWAgg` using the wrapper function `IntervalWAgg()`. a\\) `IntWAgg` uses participants' upper and lower bounds to construct performance weights. b\\) This weighting computation is modified in `IndIntWAgg`, whereby the weights for each individual are re-scaled by the largest interval width across all claims for a given individual. We exemplify this re-scaling process by illustrating the calculation of participant 1's maximum interval width across all claims they assessed in the demonstration dataset `focal_claims`. This is repeated for every individual who has assessed the target claim under aggregation."
#| echo: false
#| results: hold
knitr::include_graphics("images/IntervalWAgg.png")
```

```{=tex}
\newpage
\elandscape
```
### Aggregation Methods Requiring Supplementary Data {#sec-ReasoningWAgg}

In addition to the three-point elicitation dataset `data_ratings`, some
aggregation methods require supplementary data inputs collected
externally to the repliCATS IDEA protocol. Each aggregation wrapper
function that requires supplementary data expects this data to be
provided as a [data.frame]{.class} or [tibble]{.class} in addition to
the main judgements that are provided to the `expert_judgements`
argument (Table \ref{tbl-method-summary-table}).

We illustrate the usage and internal mechanics of this type of
aggregation with the method `ReasonWAgg`, which weights participants'
best estimates $B_{i,c}$ by the breadth of reasoning provided to support
the individual's estimate (@eq-ReasonWAgg). This method is premised on
the expectation that multiple (unique) reasons justifying an
individual's judgement may indicate their breadth of thinking,
understanding and knowledge about both the claim and its context
[@Hanea2021], while also reflecting their level of engagement and
general conscientiousness. These qualities are correlated with improved
forecasting [@Wintle:2021]. Thus, greater weighting of best estimates
that are accompanied by a greater number of supporting reasons may yield
more reliable Confidence Scores.

$$
\hat{p}_c\left( ReasonWAgg \right) = \sum_{i=1}^N \tilde{w}\_reason_{i,c}B_{i,c}
$$ {#eq-ReasonWAgg}

`ReasonWAgg` is applied with the wrapper function [ReasoningWAgg]{.fct},
which uses the coded reasoning data `data_supp_reasons`
(@sec-reasonwagg-supplementary-data) to compute a vector of weights,
$w\_reason_{i,c}$, each being the number of unique reasons provided by
individual $i$ in support of their estimate for claim $c$ ([Figure
4](#fig-ReasonWAgg)). Weights are then normalised across individuals and
multiplied by the Best Estimates for that claim, $B_{i,c}$, and the
weighted best estimates are then summed to generate the Confidence Score
(@eq-ReasonWAgg).

```{r}
#| label: fig-ReasonWAgg
#| echo: false
#| fig-cap: "Illustration of the `ReasonWAgg` aggregation method for a subset of five participants who assessed claim `24`. `ReasonWAgg` is applied using the wrapper function `ReasoningWAgg()` and exemplifies aggregation methods that use supplementary data (`data\_supp\_reasons`) collected externally to the IDEA protocol in the construction of weights and subsequent calculation of Confidence Scores. Weights are constructed by taking the sum of the number of unique reasons made in support of quantitative estimates for each participant, for the target claim."
#| results: asis
knitr::include_graphics("images/ReasonWAgg.png")
```

The focal claim selected for aggregation using `ReasonWAgg` is `24`, and
the three-point estimates from the five focal participants for this
claim are shown in @tbl-reason-wagg-focal-claim. We first prepare the
supplementary data `data_supp_reasons` for aggregation, subsetting to
only the participants contained in our `focal_claims` dataset. We also
illustrate a subset of the supplementary data for our five focal
participants for the focal claim `24` (see `?data_supp_reasons` for a
description of variables):

```{r}
#| label: prepare-supp-data
#| prompt: true
data_supp_reasons_focal <- aggreCAT::data_supp_reasons %>%
  dplyr::right_join(focal_users, by = "user_name")

data_supp_reasons_focal %>%
  dplyr::filter(paper_id == 24) %>%
  tidyr::pivot_longer(cols = c(-paper_id, -user_name)) %>%
  dplyr::arrange(name) %>%
  tidyr::separate(name,
                  into = c("reason_num", "reason"),
                  sep = "\\s", extra = "merge") %>%
  dplyr::select(-reason) %>%
  dplyr::group_by(paper_id, user_name) %>%
  tidyr::pivot_wider(names_from = reason_num) %>%
  dplyr::arrange(user_name)
```

```{r}
#| label: tbl-reason-wagg-focal-claim
#| tbl-cap: "Focal Claim $24$ judgements comprising best estimates, and upper and lower bounds, elicited over two rounds from five participants. Judgements are displayed as percentages."
#| results: asis
#| echo: false
focal_claims %>%
  dplyr::filter(paper_id == "24") %>%
  tidyr::pivot_wider(names_from = element, values_from = value) %>%
  dplyr::arrange(user_name) %>%
  dplyr::select(paper_id, user_name, round, three_point_lower, three_point_best, three_point_upper) %>%
  tidyr::drop_na() %>%
  dplyr::mutate(round = str_remove(round, "round_") %>% as.integer) %>%
  rename(`Claim ID` = paper_id,
         `User Name` = user_name,
         `Round` = round,
         `Lower Bound` = three_point_lower,
         `Best Estimate` = three_point_best,
         `Upper Bound` = three_point_upper) %>%
  tt()
```

Confidence Scores estimating the replicability of claim `24`
(@tbl-reason-wagg-focal-claim) using the `ReasonWAgg` method are
computed with [ReasoningWAgg]{.fct}, providing the supplementary data to
the `reasons` argument:

```{r}
#| label: focal-claim-ReasonWAgg
#| eval: false
#| prompt: true
#| message: false
focal_claims %>%
  dplyr::filter(paper_id == "24") %>%
  aggreCAT::ReasoningWAgg(reasons = data_supp_reasons_focal,
                          type = "ReasonWAgg")
```

Note that if there are zero participants with a Reasoning Score $>0$, or
all participants are missing a Reasoning Score, the arithmetic mean of
the log-odds transformed best estimates is returned instead (see
`?AverageWAgg`, `type = "LOArMean"`). The user can choose to flag this
behaviour explicitly by setting the argument `flag_loarmean` to `TRUE`,
which will generate new columns in the aggregation output
[data.frame]{.class} named `method_applied` (with values `LOArMean` or
`ReasonWAgg`), and `no_reason_score`, a logical variable indicating
whether reasoning scores were absent for that claim.

### Bayesian Aggregation Methods

Both Bayesian methods `BayTriVar` and `BayPRIORsAgg` use the full
three-point elicitation data; i.e., they use the information contained
in the uncertainty bounds provided by individuals (upper, ${U}_{i,c}$,
and lower, ${L}_{i,c}$), in addition to the Best Estimates, $B_{i,c}$.
Like `IndIntWAgg` and other methods (Table
\ref{tbl-method-summary-table}), the Bayesian aggregation methods also
construct weights from information encoded in participant assessments of
claims other than the target claim under aggregation. In fact, the
Bayesian methods require data from more than a single claim in order for
the function to execute properly, owing to the underlying mathematical
specification of the models (see `?BayesianWAgg` and below for details).

The two Bayesian methods use the elicited probabilities as data to
update prior probabilities. `BayTriVar` incorporates three sources of
uncertainty in best estimates: variability in best estimates across all
claims, variability in estimates across all individuals, and
claim-participant variability (which is derived from an individual's
upper and lower bounds). This Bayesian model, implemented using
[R2JAGS]{.pkg} [@R2JAGS], takes the log-odds transformed individual best
estimates, and uses a normal likelihood function to derive a posterior
distribution for the probability of replication. The estimated
Confidence Score is the mean of this posterior distribution.

`BayPRIORsAgg` is a modified version of `BayTriVar` where, instead of
using default priors, priors are generated from a predictive model that
estimates the probability of a claim replicating based on
characteristics of the claim and publication [@Gould2021a].
Priors are
passed as supplementary data to the wrapper function
[BayesianWAgg]{.fct} using the argument `priors`
(@sec-bayesian-supplementary-data), with each claim having its own
unique prior.

We illustrate aggregation of participant judgements using the method
`BayTriVar` to generate a Confidence Score for the claim `108`. Note
that [BayesianWAgg]{.fct} expects best estimates in the form of
probabilities, so to convert elicited values in the form of percentages
within the data passed to `expert_judgements` to probabilities, the
logical value `TRUE` is supplied to the argument `percent_toggle` ([Box
1](#aggWorkflow)):

```{r}
#| label: focal-claim-BayTriVar
#| eval: true
#| message: false
#| prompt: true
focal_claims %>%
  BayesianWAgg(type = "BayTriVar",
               percent_toggle = TRUE) %>%
  dplyr::filter(paper_id == "108")
```

The Confidence Score calculated for a given claim depends on the other
claims and participants included in the `expert_judgements` argument:
by definition, [BayesianWAgg]{.fct} calculates the Confidence Score for
a target claim using participants' assessments of all claims in the
[data.frame]{.class} passed to `expert_judgements`. Consequently,
changing which other claims are included in the data passed to
`expert_judgements` will alter the Confidence Score for the target
claim. Above, we calculated the Confidence Score for claim `108` while
including information from
`r xfun::numbers_to_words(length(unique(focal_claims$paper_id)) - 1)`
additional claims included in the `focal_claims` [data.frame]{.class}:
`r focal_claims %>% dplyr::filter(paper_id != "108") %>% distinct(paper_id) %>% flatten_chr %>% glue::backtick()`.
However, if we were to supply assessments for only two claims to
[*BayesianWAgg*]{.fct}, then we would observe a different result for
focal claim `108`:

```{r}
#| label: focal-claim-subset-BayTriVar
#| eval: true
#| message: false
#| prompt: true
focal_claims %>%
  dplyr::filter(paper_id %in% c("108", "138")) %>%
  aggreCAT::BayesianWAgg(type = "BayTriVar", percent_toggle = TRUE) %>%
  dplyr::filter(paper_id == "108")
```

The Confidence Score shifts from
`r focal_claims %>% BayesianWAgg(type = "BayTriVar", percent_toggle = TRUE) %>% dplyr::filter(paper_id == 108) %>% pluck("cs") %>% round(2)`
to
`r focal_claims %>% dplyr::filter(paper_id %in% c(108, 138)) %>% BayesianWAgg(type = "BayTriVar", percent_toggle = TRUE) %>% dplyr::filter(paper_id == 108) %>% pluck("cs") %>% round(2)`.
Note that [*BayesianWAgg*]{.fct} cannot calculate Confidence Scores when
judgements for only a single claim are provided to the
`expert_judgements` argument, because by definition the underlying
Bayesian model calculates variance across multiple claims and multiple
participants:

```{r}
#| label: focal-claim-BayTriVar-fail
#| error: true
#| message: false
#| prompt: true
focal_claims %>%
  dplyr::filter(paper_id == "108") %>%
  aggreCAT::BayesianWAgg(type = "BayTriVar",
                         percent_toggle = TRUE)
```

Although we have set $n=2$ as the minimum number of claims for which
variance is computed, it is up to the user to determine their own
justifiable minimum for reliable variance calculations.
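
Should a stricter threshold be desired, one hedged approach is to wrap
[BayesianWAgg]{.fct} in a small guard that refuses to aggregate when
fewer claims are supplied than a user-chosen minimum. `safe_BayTriVar`
and its default of five claims below are purely illustrative, not part
of [aggreCAT]{.pkg}:

```{r}
#| label: min-claims-guard-sketch
#| eval: false
# A hypothetical user-side guard enforcing a minimum number of claims
# before calling BayesianWAgg(); the threshold is the user's to justify.
safe_BayTriVar <- function(expert_judgements, min_claims = 5) {
  n_claims <- dplyr::n_distinct(expert_judgements$paper_id)
  if (n_claims < min_claims) {
    stop("Only ", n_claims, " claim(s) supplied; at least ", min_claims,
         " are required for a reliable variance estimate.")
  }
  aggreCAT::BayesianWAgg(expert_judgements,
                         type = "BayTriVar",
                         percent_toggle = TRUE)
}
```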

Finally, all of the previous methods illustrated in this section have
been used with data generated from the IDEA elicitation protocol, which
is not strictly necessary for the application of these aggregation
methods. Methods that *do* require the full IDEA protocol for their
correct mathematical implementation are listed in Table
\ref{tbl-method-summary-table}, e.g. *ShiftingWAgg*, which uses two
rounds of three-point judgements wherein second round judgements are
revised after discussion.

```{=tex}
\newpage
\blandscape
```
```{r}
#| label: fig-BayesianWAgg
#| fig-cap: "Illustration of BayTriVar applied with `BayesianWAgg()` for a single claim, `paper_id = 108`, from the `focal_claims` data object. Note that the claims `138`, `186` and `24` contained in `focal_claims` are used in the calculation of participant-level SD and claim-level SD, thus the Confidence Score returned by BayTriVar is sensitive to the other claims provided to the argument `expert_judgements`."
#| results: hold
#| echo: false
#| out-width: "900px"
knitr::include_graphics("images/BayesianWAgg.png")
```

```{=tex}
\newpage
\elandscape
```
## An illustrative workflow for use in real study contexts {#sec-workflow}

Throughout the SCORE program, 752 participants assessed more than 4000
unique claims using the repliCATS IDEA protocol between 7 July 2019 and
25 November 2021. In order to efficiently generate Confidence Scores
for multiple claims, we needed to apply the same aggregation method over
batches of assessments containing multiple claims. We were also tasked
with comparing and evaluating the different aggregation methods, which
required us to generate Confidence Scores for each claim using multiple
aggregation methods. We expect that needing to aggregate over multiple
assessments, and needing to apply multiple types of aggregation methods,
will be common use-cases; consequently, in this section we demonstrate a
workflow for using the [aggreCAT]{.pkg} package to aggregate expert
judgements using multiple aggregation methods on multiple claims, using
pilot data generated by the repliCATS project for the DARPA SCORE
program.

### Generating multiple forecasts

During expert elicitation, the analyst or researcher may be tasked with
generating multiple forecasts for different problems or questions, and
therefore it is useful to batch the aggregation. Since the
[aggreCAT]{.pkg} package is designed using the principles of *tidy* data
analysis [@tidyverse2019], each aggregation function accepts a
[data.frame]{.class} of raw three-point forecasts for one or more
claims, $C$, passed to the argument `expert_judgements`. The data
pre-processing and aggregation methods are applied using a combination
of calls to [tidyverse]{.pkg} functions, including `summarise` and
`mutate`. From the user's perspective, this means that data processing
and application of the aggregation methods is handled internally by the
[aggreCAT]{.pkg} package, rather than by the user. The user is therefore
free to focus their attention on the interpretation and analysis of the
forecasts.
Here we demonstrate the application of the `ArMean`
aggregation method to four focal claims simultaneously:

```{r}
#| label: generating-multiple-forecasts
#| message: false
AverageWAgg(focal_claims, type = "ArMean")
```

### Comparing and Evaluating Aggregation Methods

In real study contexts, such as that of the repliCATS project in the
DARPA SCORE program, it may be of interest to compute Confidence Scores
using multiple aggregation methods so that their performance can be
evaluated and compared. Since different methods offer different
mathematical properties, and therefore might be more or less appropriate
depending on the purpose of the aggregation and forecasting, a
researcher or analyst might want to check how the different assumptions
embedded in different aggregation methods influence the final Confidence
Scores for a forecast -- i.e. how robust are the results to different
methods and therefore to different assumptions?

From a computational perspective, multiple aggregation methods must
first be applied to the forecast prior to comparison and evaluation.
This can be achieved by applying each different aggregation method to
`focal_claims`, and binding the results together with [dplyr]{.pkg}'s
[bind_rows]{.fct}. However, more elegant and succinct solutions can be
implemented using [purrr]{.pkg}'s [map_dfr]{.fct} function [@purrr2020,
see @lst-multi-method-workflow-non-supp and
@lst-multi-method-workflow-both].

```{r}
#| label: multi-method-workflow-non-supp
#| message: false
#| prompt: true
confidenceSCOREs <-
  dplyr::bind_rows(
    AverageWAgg(focal_claims,
                "ArMean",
                percent_toggle = TRUE),
    IntervalWAgg(focal_claims,
                 "IndIntWAgg",
                 percent_toggle = TRUE),
    IntervalWAgg(focal_claims,
                 "IntWAgg",
                 percent_toggle = TRUE),
    ShiftingWAgg(focal_claims,
                 "ShiftWAgg",
                 percent_toggle = TRUE),
    BayesianWAgg(focal_claims,
                 "BayTriVar",
                 percent_toggle = TRUE),
    ReasoningWAgg(focal_claims,
                  reasons = aggreCAT::data_supp_reasons,
                  percent_toggle = TRUE)
  )

confidenceSCOREs
```

After generating Confidence Scores using various aggregation methods, we
evaluate the forecasts. We evaluated the repliCATS pilot study forecasts
against the outcomes of previous, high-powered replication studies
[@Hanea2021], which are contained in the `data_outcomes` dataset
published with [aggreCAT]{.pkg}. In this dataset, each claim `paper_id`
is assigned an `outcome` of `0` if the claim did not replicate and `1`
if the claim was successfully replicated:

```{r}
#| label: replication-outcomes
#| prompt: true
aggreCAT::data_outcomes %>%
  head()
```

The function [confidence_score_evaluation]{.fct} evaluates a set of
aggregated forecasts or Confidence Scores against a set of known or
observed outcomes, returning the Area Under the ROC Curve (AUC), the
Brier score, and the classification accuracy of each method
(@tbl-multi-method-workflow-eval):

```{r}
#| label: multi-method-workflow-eval
#| message: false
#| results: false
#| echo: false
#| prompt: true
aggreCAT::confidence_score_evaluation(
  confidenceSCOREs,
  aggreCAT::data_outcomes
  )
```

```{r}
#| label: tbl-multi-method-workflow-eval
#| tbl-cap: "AUC and Classification Accuracy for forecasts from the aggregation methods 'ShiftWAgg', 'ArMean', 'IntWAgg', 'IndIntWAgg', 'ReasonWAgg' and 'BayTriVar' for a subset of the repliCATS pilot study claims (`focal_claims`) and known outcomes."
#| message: false
#| results: asis
#| echo: false
#| prompt: true
aggreCAT::confidence_score_evaluation(
  confidenceSCOREs,
  aggreCAT::data_outcomes
  ) %>%
  rename(Method = method,
         `Brier Score` = Brier_Score,
         `Classification Accuracy (%)` = Classification_Accuracy) %>%
  tt() %>%
  format_tt(j = c(2,3), digits = 2)
```

### Visualising Judgements, Confidence Scores and Forecast Performance

We include two functions for visualising comparison and evaluation of
Confidence Scores across multiple aggregation methods for a suite of
forecasts from multiple participants,
[confidence_score_ridgeplot]{.fct} and [confidence_score_heatmap]{.fct}.
[confidence_score_ridgeplot]{.fct} generates ridgeline plots using
[ggridges]{.pkg} [@ggridges2021], and displays the distribution of
predicted outcomes across a suite of forecasts for each aggregation
method, grouped into separate 'mountain ranges' according to the
mathematical properties of the aggregation method (@fig-ridgeplot).

```{r}
#| label: fig-ridgeplot
#| fig-height: 6
#| fig-width: 12
#| fig-cap: "Ridgeline plots illustrating the distribution of aggregated Confidence Scores for the tibble `confidenceSCOREs`, grouped according to the mathematical properties of each method."
#| echo: false
#| message: false
#| warning: false
#| fig-pos: H
suppressPackageStartupMessages(library(ggridges))
confidence_score_ridgeplot(confidence_scores = confidenceSCOREs)
```

```{r}
#| label: fig-aggregation
#| fig-cap: "Confidence Scores for the aggregation methods `ArMean`, `BayTriVar`, `IntWAgg`, `IndIntWAgg`, `ReasonWAgg` and `ShiftWAgg` for four claims. Participants' best estimates are displayed as black points, and their upper and lower bounds as black error bars. Confidence Scores are displayed as points within the upper row of plots. Lines are displayed vertically at the 0.5 probability mark, and their colour denotes the observed outcome under previous large-scale replication projects."
#| message: false
#| fig-align: center
#| warning: false
#| echo: false
#| fig-width: 10
#| fig-pos: H
plot_cs <-
  confidenceSCOREs %>%
  dplyr::left_join(aggreCAT::data_outcomes) %>%
  dplyr::mutate(data_type = "Confidence Scores") %>%
  dplyr::rename(x_vals = cs,
                y_vals = method) %>%
  dplyr::select(y_vals, paper_id, data_type, outcome, x_vals)

plot_judgements <-
  aggreCAT::preprocess_judgements(focal_claims,
                                  percent_toggle = TRUE) %>%
  tidyr::pivot_wider(names_from = element,
                     values_from = value) %>%
  dplyr::left_join(aggreCAT::data_outcomes) %>%
  dplyr::rename(x_vals = three_point_best,
                y_vals = user_name) %>%
  dplyr::select(paper_id,
                y_vals,
                x_vals,
                tidyr::contains("three_point"),
                outcome) %>%
  dplyr::mutate(data_type = "Elicited Probabilities")

p <- plot_judgements %>%
  dplyr::bind_rows(., {dplyr::semi_join(plot_cs, plot_judgements,
                                        by = "paper_id")}) %>%
  ggplot2::ggplot(ggplot2::aes(x = x_vals, y = y_vals)) +
  ggplot2::geom_pointrange(ggplot2::aes(xmin = three_point_lower,
                                        xmax = three_point_upper)) +
  ggplot2::facet_grid(data_type ~ paper_id, scales = "free_y") +
  ggplot2::theme_classic() +
  ggplot2::theme(legend.position = "none") +
  ggplot2::geom_vline(ggplot2::aes(xintercept = 0.5,
                                   colour = as.logical(outcome))) +
  ggplot2::xlab("Probability of Replication") +
  ggplot2::ylab(ggplot2::element_blank()) +
  ggplot2::scale_colour_brewer(palette = "Set1")
p
```

```{r}
#| label: fig-heatmap
#| echo: false
#| fig-width: 10
#| fig-height: 6
#| dpi: 300
#| fig-align: center
#| message: false
#| warning: false
#| eval: true
#| fig-pos: H
#| fig-cap: "Blocked heatmap of Confidence Scores, useful for visually comparing aggregation methods and evaluating them against a set of known outcomes. In this example, Confidence Scores generated by six aggregation methods for the repliCATS pilot study are presented for 25 claims. Claims whose known outcomes successfully replicated (`outcome == TRUE`) are presented in the heatmap on the left, and claims that failed to replicate are presented in the heatmap on the right. Confidence Scores generated by different aggregation methods are positioned along the y-axis, with vertical groupings according to the methods' mathematical properties. Colour and intensity of cells indicate the direction and degree of deviation, respectively, of the Confidence Scores from the known outcomes."


suppressPackageStartupMessages(library(ggforce))
suppressPackageStartupMessages(library(ggpubr))
confidence_score_heatmap(confidence_scores = confidenceSCOREs,
                         data_outcomes = aggreCAT::data_outcomes)
```

```{r}
#| label: intext-outcomes
#| include: false
best_forecasts <- dplyr::inner_join(confidenceSCOREs, data_outcomes) %>%
  dplyr::mutate(difference = abs(outcome-cs)) %>%
  dplyr::group_by(outcome) %>%
  dplyr::slice_min(difference, n = 3)
worst_forecasts <- dplyr::inner_join(confidenceSCOREs, data_outcomes) %>%
  dplyr::mutate(difference = abs(outcome-cs)) %>%
  dplyr::group_by(outcome) %>%
  dplyr::slice_max(difference, n = 3)
```

While [confidence_score_ridgeplot]{.fct} is useful for *comparison* of
aggregation methods, [confidence_score_heatmap]{.fct} is useful for
visual comparative *evaluation* of aggregation methods.

[confidence_score_heatmap]{.fct} generates heatmaps of forecasted
Confidence Scores for each aggregation method included in the dataset
provided to the argument `confidence_scores`, organised with unique
aggregation methods on the y-axis, and separate forecasts or `paper_id`s
along the x-axis (@fig-heatmap). The heatmap is blocked vertically
according to the mathematical characteristics of each aggregation
method, and horizontally into two groups, according to the binary
outcomes in `data_outcomes`.

Horizontal grouping facilitates quick and simple evaluation of the
aggregation methods. Perfectly accurate aggregation methods show dark
blue squares in the left heatmap blocks, where the outcomes were `1` or
`TRUE`, and dark red squares in the right heatmap blocks, where the
actual outcomes were `0` or `FALSE`. Deviation from this expectation
indicates which aggregation methods were inaccurate, for which
claims/forecasts and outcome types, and to what degree.

For example, in @fig-heatmap, for the dataset `confidenceSCOREs`, the
successful replication of most claims was accurately forecasted by most
methods, with several exceptions. Some methods performed better than
others for some claims (e.g. `ReasonWAgg` for claims `109` and `138`).

Finally, creating bespoke user-defined plots is relatively easy --
because [aggreCAT]{.pkg} functions return tidy [data.frame]{.class}s or
[tibble]{.class}s, we can easily manipulate the raw judgements,
aggregated Confidence Scores and outcome data to plot them with
[ggplot2]{.pkg} [@ggplot2016] or another visualisation package. Below we
plot the aggregated Confidence Scores along with the three-point
judgements (subset using [preprocess_judgements]{.fct} on
`focal_claims`, transforming judgements in percentages to probabilities
by setting `percent_toggle` to `TRUE`; @fig-aggregation,
@lst-confidencescores).

### Extending aggreCAT to other datasets

The aggregation methods supplied by the [aggreCAT]{.pkg} package can
easily be applied to other forecasting problems. The only requirements
are that the data inputs adhere to the required format (Box
[1](#aggWorkflow)), and that the expert judgements are elicited using
the appropriate method, as required by each aggregation method (see
Table \ref{tbl-method-summary-table}).

Judgement data provided to the `expert_judgements` argument, to
`data_justifications`, or to any supplementary data input must contain
the requisite column names, and be of the correct data type, as
described in each method's documentation (see `?data_ratings`, for
example). At minimum the user must supply to `expert_judgements`: the
`round` under which each judgement is elicited, a unique ID for each
different forecasting problem `paper_id`, a unique `user_name` for each
individual, and the `element` of the three-point elicitation that the
recorded response or `value` in that row corresponds to. The data are
stored in long or tidy format, such that each row or observation in the
[data.frame]{.class} references only a single `element` of a
participant's set of three-point elicitation values. When applying
aggregation methods requiring supplementary data to the elicitation
data, the analyst should also adhere to the requirements stipulated for
the relevant supplementary dataset described in the documentation.
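
A hedged pre-flight check along these lines is sketched below;
`check_judgements` is an illustrative helper, not an [aggreCAT]{.pkg}
function, and the column names are those documented above:

```{r}
#| label: check-judgements-sketch
#| eval: false
# Verify that a user-supplied data frame has the minimum columns
# expected by the aggregation functions before attempting aggregation.
check_judgements <- function(expert_judgements) {
  required_cols <- c("round", "paper_id", "user_name",
                     "question", "element", "value")
  missing_cols <- setdiff(required_cols, names(expert_judgements))
  if (length(missing_cols) > 0) {
    stop("Missing required columns: ",
         paste(missing_cols, collapse = ", "))
  }
  invisible(expert_judgements)
}

check_judgements(aggreCAT::data_ratings)
```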

Although several aggregation methods *require* judgements that are
elicited using the IDEA protocol (see Table
\ref{tbl-method-summary-table} for exceptions), most aggregation methods
require only a single round of elicitation that generates a set of three
points – a best estimate, and upper and lower bounds about those
estimates – even if the IDEA protocol was used to elicit judgements.
Note, however, that the aggregation functions contained in the
[aggreCAT]{.pkg} package are unsuitable for use with judgements elicited
with methods that aggregate behaviourally (e.g. using consensus) and
therefore result in a single forecast value. Where the analyst elicits
judgements for only a single round, the analyst should record the round
in the judgements data as the character string `"round_1"`, and set the
`round_2_filter` argument to `FALSE` in the aggregation wrapper function
call.

Should the analyst wish to create their own aggregation functions, the
pre- and post-processing functions may be leveraged inside those
functions ([preprocess_judgements]{.fct} and
[postprocess_judgements]{.fct}, respectively), as we have illustrated in
the data preparation for @fig-aggregation (@lst-confidencescores). These
processing functions modularise key components of the aggregation's
computational implementation, namely, the data wrangling that occurs
before and after the actual mathematical aggregation.

#### Preparing your own Elicitation Data

We demonstrate how to prepare data for applying the [aggreCAT]{.pkg}
aggregation methods with data collected using the IDEA protocol for an
environmental conservation problem [@Arlidge2020]. Participants were
asked "How many green turtles in winter per month would be saved using a
total gillnet ban, with gear switching to lobster potting or hand line
fishing required?". We take the required data for the
`expert_judgements` argument from Table S51 of Arlidge et al.
[-@Arlidge2020], make the data long instead of wide, and then add the
required additional columns `paper_id` and `question`:

```{r}
#| label: lst-BYO-data-wrangle
#| prompt: true
green_turtles <-
  dplyr::tribble(~user_name, ~round, ~three_point_lower,
                 ~three_point_upper, ~three_point_best,
                 "L01", 1, 10.00, 16.43, 10.00,
                 "L01", 2, 10.00, 16.43, 10.00,
                 "L02", 1, 500.00, 522.50, 500.00,
                 "L02", 2, 293.75, 406.25, 350.00,
                 "L03", 1, 400.00, 512.50, 400.00,
                 "L03", 2, 300.00, 356.25, 300.00,
                 "L04", 1, 32.29, 65.10, 41.67,
                 "L04", 2, 32.29, 65.10, 41.67,
                 "L05", 1, 6.67, 7.74, 6.67,
                 "L05", 2, 6.67, 7.74, 6.67) %>%
  dplyr::group_by(user_name) %>%
  # pivot to long format, one element per row
  tidyr::pivot_longer(cols = tidyr::contains("three_point"),
                      names_to = "element", values_to = "value") %>%
  dplyr::mutate(paper_id = 1,
                round = ifelse(round == 1, "round_1", "round_2"),
                question = "direct_replication")
```

We can then apply multiple aggregation methods, using the same approach
implemented for aggregation of the `focal_claims` dataset
(@lst-BYO-data-aggregate), with aggregated Confidence Scores shown in
@tbl-BYO-data-aggregate. Note that because the judgements are absolute
values rather than probabilities, we set the `percent_toggle` argument
for each aggregation wrapper function to [FALSE]{.val}
(@lst-BYO-data-aggregate).

```{r}
#| label: tbl-BYO-data-aggregate
#| tbl-cap: "Example aggregation of non-percentage / non-probabilistic estimates with several aggregation methods, using the green turtle dataset (Arlidge *et al*. 2020)."
#| eval: true
#| echo: false
#| message: false
#| warning: false
turtle_CS <-
  list(
    AverageWAgg,
    IntervalWAgg,
    IntervalWAgg,
    ShiftingWAgg
) %>%
  purrr::map2_dfr(.y = list("ArMean",
                            "IndIntWAgg",
                            "IntWAgg",
                            "ShiftWAgg"),
                  .f = ~ .x(green_turtles, type = .y,
                            percent_toggle = FALSE)
  )
turtle_CS %>%
  rename(Method = method,
         `Question ID` = paper_id,
         `Confidence Score` = cs,
         `N (experts)` = n_experts) %>%
  tt() %>%
  format_tt(j = 3, num_fmt = 'decimal', digits = 2)
```

## Summary and Discussion {#sec-summary}

The [aggreCAT]{.pkg} package provides a diverse suite of methods for
mathematically aggregating judgements elicited from groups of experts
using structured elicitation procedures, such as the IDEA protocol. The
[aggreCAT]{.pkg} package was developed by the repliCATS project as a
part of the DARPA SCORE program to implement the 27 aggregation methods
described in Hanea et al. [-@Hanea2021].

There are very few open-source tools available to the researcher wishing
to mathematically aggregate judgements. The [aggreCAT]{.pkg} package is
therefore unique in both the diversity of aggregation methods it
contains, as well as in its computational approach to implementing the
aggregation methods. No other R package, or other software, offers as
many aggregation methods, nor methods that weight judgements by proxies
of forecasting accuracy.

The [aggreCAT]{.pkg} package is production-ready for application to data
elicited during a single workshop, or in contexts where data collection
is ongoing and continuous analysis is used to automate aggregation.
Unlike other aggregation packages, the [aggreCAT]{.pkg} package is
designed to work within the *tidyverse*. The package is premised on the
principles of *tidy* data analysis, whereby the user supplies
[data.frame]{.class}s of elicited judgements, and the aggregation
methods return [data.frame]{.class}s of aggregated forecasts. There are
four key benefits of the [aggreCAT]{.pkg} package. Firstly, the work of
data-wrangling and application of the aggregation methods is handled
internally by the aggregation methods, so that the researcher can focus
on analysis and interpretation of the aggregation outputs. This is
critical in data-deficient contexts where rapid assessments are needed,
which is a common use-case for expert-derived forecasts. Secondly, the
[aggreCAT]{.pkg} package is easily paired with other tidyverse tools,
such as [purrr]{.pkg}, [dplyr]{.pkg}, and [ggplot2]{.pkg}, as
exemplified through the repliCATS workflow described in @sec-workflow.

Thirdly, application of the [aggreCAT]{.pkg} package aggregation methods
and performance evaluation tools is scalable, which is evidenced by the
application of the [aggreCAT]{.pkg} package to forecast the
replicability of over 4000 research claims by the repliCATS project. The
scalability and placeholder functionality allow the [aggreCAT]{.pkg}
package to be built into production-ready pipelines for more complicated
analyses where there are multiple forecasts being elicited and
aggregated, where there are numerous participants, and where multiple
aggregation methods are applied.

Finally, through the provision of built-in performance metrics, the
analyst is able to 'ground-truth' and evaluate the forecasts against
known outcomes, or against alternative forecasting methods [e.g.
@Arlidge2020].

The [aggreCAT]{.pkg} package is easily extensible and production-ready.

#### Preparing your own Elicitation Data

We demonstrate how to prepare data for applying the [aggreCAT]{.pkg}
aggregation methods with data collected using the IDEA protocol for an
environmental conservation problem [@Arlidge2020]. Participants were
asked "How many green turtles in winter per month would be saved using a
total gillnet ban, with gear switching to lobster potting or hand line
fishing required?". We take the required data for the
`expert_judgements` argument from Table S51 of Arlidge et al.
[-@Arlidge2020], reshape the data from wide to long, and then add the
required additional columns `paper_id` and `question`:

```{r}
#| label: lst-BYO-data-wrangle
#| prompt: true
green_turtles <-
  dplyr::tribble(~user_name, ~round, ~three_point_lower,
                 ~three_point_upper, ~three_point_best,
                 "L01", 1, 10.00, 16.43, 10.00,
                 "L01", 2, 10.00, 16.43, 10.00,
                 "L02", 1, 500.00, 522.50, 500.00,
                 "L02", 2, 293.75, 406.25, 350.00,
                 "L03", 1, 400.00, 512.50, 400.00,
                 "L03", 2, 300.00, 356.25, 300.00,
                 "L04", 1, 32.29, 65.10, 41.67,
                 "L04", 2, 32.29, 65.10, 41.67,
                 "L05", 1, 6.67, 7.74, 6.67,
                 "L05", 2, 6.67, 7.74, 6.67) %>%
  dplyr::group_by(user_name) %>%
  # reshape the three-point estimates from wide to long
  tidyr::pivot_longer(cols = tidyr::contains("three_point"),
                      names_to = "element", values_to = "value") %>%
  dplyr::mutate(paper_id = 1,
                round = ifelse(round == 1, "round_1", "round_2"),
                question = "direct_replication")
```

We can then apply multiple aggregation methods, using the same approach
implemented for aggregation of the `focal_claims` dataset
(@lst-BYO-data-aggregate), with the aggregated Confidence Scores shown
in @tbl-BYO-data-aggregate. Note that because the judgements are
absolute values rather than probabilities, we set the `percent_toggle`
argument of each aggregation wrapper function to [FALSE]{.val}
(@lst-BYO-data-aggregate).

```{r}
#| label: tbl-BYO-data-aggregate
#| tbl-cap: "Example aggregation of non-percentage / non-probabilistic estimates with several aggregation methods, using the green turtle dataset of Arlidge *et al*. (2020)."
#| eval: true
#| echo: false
#| message: false
#| warning: false
turtle_CS <-
  list(
    AverageWAgg,
    IntervalWAgg,
    IntervalWAgg,
    ShiftingWAgg
  ) %>%
  purrr::map2_dfr(.y = list("ArMean",
                            "IndIntWAgg",
                            "IntWAgg",
                            "ShiftWAgg"),
                  .f = ~ .x(green_turtles, type = .y,
                            percent_toggle = FALSE)
  )
turtle_CS %>%
  rename(Method = method,
         `Question ID` = paper_id,
         `Confidence Score` = cs,
         `N (experts)` = n_experts) %>%
  tt() %>%
  format_tt(j = 3, num_fmt = 'decimal', digits = 2)
```

## Summary and Discussion {#sec-summary}

The [aggreCAT]{.pkg} package provides a diverse suite of methods for
mathematically aggregating judgements elicited from groups of experts
using structured elicitation procedures, such as the IDEA protocol. The
[aggreCAT]{.pkg} package was developed by the repliCATS project as part
of the DARPA SCORE program to implement the 27 aggregation methods
described in Hanea et al. [-@Hanea2021].

There are very few open-source tools available to the researcher wishing
to mathematically aggregate judgements. The [aggreCAT]{.pkg} package is
therefore unique in both the diversity of aggregation methods it
contains and in its computational approach to implementing them. To our
knowledge, no other R package, or software package in any other
language, offers as many aggregation methods, including methods that
weight judgements by proxies of forecasting accuracy.

The [aggreCAT]{.pkg} package is production-ready for application to data
elicited during a single workshop, or in contexts where data collection
is ongoing and continuous analysis is used to automate aggregation.
Unlike other aggregation packages, the [aggreCAT]{.pkg} package is
designed to work within the *tidyverse*. The package is premised on the
principles of *tidy* data analysis, whereby the user supplies
[data.frame]{.class}s of elicited judgements, and the aggregation
methods return [data.frame]{.class}s of aggregated forecasts. There are
four key benefits of the [aggreCAT]{.pkg} package. Firstly, the data
wrangling required to apply the aggregation methods is handled
internally by each aggregation function, so that the researcher can
focus on analysing and interpreting the aggregation outputs. This is
critical in data-deficient contexts where rapid assessments are needed,
a common use-case for expert-derived forecasts. Secondly, the
[aggreCAT]{.pkg} package is easily paired with other tidyverse tools,
such as [purrr]{.pkg}, [dplyr]{.pkg}, and [ggplot2]{.pkg}, as
exemplified through the repliCATS workflow described in @sec-workflow.

Thirdly, application of the [aggreCAT]{.pkg} aggregation methods and
performance evaluation tools is scalable, as evidenced by the repliCATS
project's use of the package to forecast the replicability of over 4000
research claims. This scalability, together with the placeholder
functionality, allows the [aggreCAT]{.pkg} package to be built into
production-ready pipelines for more complicated analyses in which
multiple forecasts are elicited and aggregated, numerous participants
are involved, and multiple aggregation methods are applied.

Finally, through the provision of built-in performance metrics, the
analyst is able to 'ground-truth' and evaluate the forecasts against
known outcomes, or against alternative forecasting methods
[e.g. @Arlidge2020].
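
For example, assuming the `confidenceSCOREs` object created in
@lst-multi-method-workflow-both is in the workspace, the evaluation
underlying @tbl-multi-method-workflow-eval reduces to a single call:

```{r}
#| eval: false
# Score each method's Confidence Scores against the known claim
# outcomes shipped with the package, returning performance metrics
# (e.g. Brier scores and classification accuracy) per method
aggreCAT::confidence_score_evaluation(confidenceSCOREs,
                                      aggreCAT::data_outcomes)
```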

The [aggreCAT]{.pkg} package is easily extensible and production-ready.
Each aggregation function follows a consistent modular blueprint,
wherein data wrangling of the inputs and outputs of aggregation is
largely handled by pre- and post-processing functions
([preprocess_judgements]{.fct} and [postprocess_judgements]{.fct},
respectively). This design expedites debugging by making it easier to
pinpoint the exact source of errors, while also permitting the user to
easily create their own custom aggregation methods.

Although the package currently requires data inputs to conform to
nomenclature specific to the repliCATS project, future releases of the
[aggreCAT]{.pkg} package will relax the data-input requirements so they
are more domain-agnostic. We believe this to be a minimal barrier to
adoption and application of the [aggreCAT]{.pkg} package. Ecologists
should be no strangers to such naming conventions for data requirements,
with packages like [vegan]{.pkg} also imposing strict nomenclature
[@veganpkg2020]. We have illustrated how to extend and apply the package
to data from domains beyond forecasting the replicability of research
claims through our minimal example with forecasts generated via the
IDEA protocol for a fisheries and conservation problem.

The package will be actively maintained into the future, beyond the life
of the DARPA SCORE program. Bug reports and feature requests can easily
be lodged on the [aggreCAT]{.pkg} [GitHub
repository](https://github.com/metamelb-repliCATS/aggreCAT) using
reproducible examples created with [reprex]{.pkg} [@reprexpkg2020] on
the repliCATS pilot study datasets shipped with the [aggreCAT]{.pkg}
package.

We have described the computational implementation of the aggregation
methods and supporting tools within the [aggreCAT]{.pkg} package,
providing usage examples and workflows for both simple and more complex
research contexts. Consequently, this paper should fully equip the
analyst to apply the aggregation functions contained within the
[aggreCAT]{.pkg} package to their own data. Where the analyst is
uncertain as to *which* aggregation method is best suited to their
particular research goals, the reader should consult Hanea et al.
[-@Hanea2021] for a discussion of the mathematical principles and
hypotheses underlying the design of the aggregation methods, as well as
a comparative performance evaluation of each of the methods. In
conclusion, the [aggreCAT]{.pkg} package will aid researchers and
decision analysts in rapidly and easily analysing the results of the
IDEA protocol and other structured elicitation procedures where
mathematical aggregation of human forecasts is required.

\newpage
\blandscape


```{r}
#| label: tbl-method-summary-table
#| tbl-cap: "Summary of aggregation methods and functions, including data requirements and sources."
#| echo: FALSE
#| include: FALSE
aggreCAT:::method_summary_table %>%
  ungroup %>%
  # filter(str_detect(aggregator_fun_desc, "[?]", negate = TRUE)) %>% #drop Eng/CompWAgg
  mutate(aggregator_function = glue::glue("**{aggregator_function}**")) %>%
  tidyr::unite(agg_name_description,
               aggregator_function,
               aggregator_fun_desc, sep = " ") %>%
  select(-agg_name_description) %>%
  mutate(supp_data_requirements = tidyr::replace_na(supp_data_requirements, " ")) %>%
  distinct(judgement_data_sources_eqns) %>%
  tt()
```

```{r, include = TRUE}
aggreCAT:::method_summary_table %>%
  ungroup %>%
  # filter(str_detect(aggregator_fun_desc, "[?]", negate = TRUE)) %>% #drop Eng/CompWAgg
  mutate(aggregator_function = glue::glue("**{aggregator_function}**")) %>%
  tidyr::unite(agg_name_description,
               aggregator_function,
               aggregator_fun_desc, sep = " ") %>%
  select(-agg_name_description) %>%
  mutate(supp_data_requirements = tidyr::replace_na(supp_data_requirements, " ")) %>%
  kableExtra::kbl(col.names = c(
    "Method",
    "Description",
    "Data Requirements",
    "Weighting Function",
    "Elicitation Rounds",
    "Elicitation Method",
    "Data Sources"),
    escape = FALSE,
    booktabs = TRUE,
    longtable = TRUE,
    caption = "\\label{tbl-method-summary-table} Summary of aggregation methods and functions, including data requirements and sources.",
    format = "latex") %>%
  kableExtra::column_spec(column = c(1,3,4,6,7), width = "10em") %>%
  kableExtra::column_spec(column = c(5), width = "5em") %>%
  kableExtra::column_spec(column = c(2), width = "20em") %>%
  kableExtra::kable_styling(latex_options =
                              c("HOLD_position", "repeat_header"),
                            font_size = 6,
                            position = "left") %>%
  kableExtra::pack_rows("AverageWAgg() Averaged best estimates", 1, 5) %>%
  kableExtra::pack_rows("LinearWAgg() Linearly-weighted best estimates", 6, 10) %>%
  kableExtra::pack_rows("IntervalWAgg() Linearly-weighted best estimates, with weights influenced by interval widths", 11, 16) %>%
  kableExtra::pack_rows("ShiftingWAgg() Weighted by judgements that shift most after discussion", 17, 21) %>%
  kableExtra::pack_rows("ReasoningWAgg() Linearly-weighted best estimates, with weights constructed from supplementary reasoning data", 22, 23) %>%
  kableExtra::pack_rows("ExtremisationWAgg() Takes the average of best estimates and transforms it using the cumulative distribution function of a beta distribution", 24, 25) %>%
  kableExtra::pack_rows("DistributionWAgg() Calculates the arithmetic mean of distributions created from expert judgements", 26, 27) %>%
  kableExtra::pack_rows("BayesianWAgg() Bayesian aggregation methods with either uninformative or informative prior distributions", 28, 29)
```

\elandscape
\newpage
\ No newline at end of file
From d8ad6e268e8bf8d2e59502e8cc495545ec8c5831 Mon Sep 17 00:00:00 2001
From: egouldo
Date: Thu, 1 Feb 2024 16:53:45 +1100
Subject: [PATCH 12/15] #29 replace gt with tinytable in aggreCAT.qmd

---
 inst/ms/TEST_TABLE.qmd | 128 ++++++++++++++++++++++++++++++++++++++++-
 inst/ms/aggreCAT.qmd   |  56 +++++++++---------
 2 files changed, 152 insertions(+), 32 deletions(-)

diff --git a/inst/ms/TEST_TABLE.qmd b/inst/ms/TEST_TABLE.qmd
index 24529c4..37a63ca 100644
--- a/inst/ms/TEST_TABLE.qmd
+++ b/inst/ms/TEST_TABLE.qmd
@@ -1233,6 +1233,7 @@ aggreCAT::confidence_score_evaluation(
 )
 ```
 
+
 ```{r}
 #| label: tbl-multi-method-workflow-eval
 #| tbl-cap: "AUC and Classification Accuracy for forecasts from the aggregation methods 'ShiftWAgg', 'ArMean', 'IntWAgg', 'IndIntWAgg', 'ReasonWAgg' and 'BayTriVar' for a subset of the repliCATS pilot study claims (`focal_claims`) and known outcomes."
@@ -1246,7 +1247,7 @@ aggreCAT::confidence_score_evaluation(
 ) %>%
   rename(Method = method,
          `Brier Score` = Brier_Score,
-         `Classification Accuracy (%)` = Classification_Accuracy) %>%
+         `Classification Accuracy ($\\%$)` = Classification_Accuracy) %>%
   tt() %>%
   format_tt(j = c(2,3), digits = 2)
 ```
@@ -1676,5 +1677,128 @@ aggreCAT:::method_summary_table %>%
   kableExtra::pack_rows("BayesianWAgg() Bayesian aggregation methods with either uninformative or informative prior distributions", 28,29)
 ```

## Listings {.unnumbered}

``` {#lst-multi-method-workflow-non-supp .r lst-cap="Multiple aggregation methods that do not require supplementary data can be applied in a single call by iterating over paired aggregation wrapper functions and method types with [purrr]{.pkg}'s [map2_dfr]{.fct} function."}
purrr::map2_dfr(.x = list(AverageWAgg,
                          IntervalWAgg,
                          IntervalWAgg,
                          ShiftingWAgg,
                          BayesianWAgg),
                .y = list("ArMean",
                          "IndIntWAgg",
                          "IntWAgg",
                          "ShiftWAgg",
                          "BayTriVar"),
                .f = ~ .x(focal_claims,
                          type = .y,
                          percent_toggle = TRUE)
)
```

``` {#lst-multi-method-workflow-both .r lst-cap="If we wish to batch aggregate claims using a combination of aggregation methods that do and do not require supplementary data, we must aggregate them separately, since the methods that require supplementary data have an additional argument for the supplementary data that must be passed to the wrapper function call. We can chain the aggregation of the methods that do not require supplementary data and the methods that do require supplementary data together very neatly using [dplyr]{.pkg}'s [bind_rows]{.fct} function [@dplyr2021] and the [magrittr]{.pkg} pipe `%>%` [@magrittr2020]. Below we implement this approach while applying the aggregation methods `ArMean`, `IntWAgg`, `IndIntWAgg`, `ShiftWAgg`, `BayTriVar` and `ReasonWAgg` to the repliCATS pilot program dataset `data_ratings`."}
confidenceSCOREs <-
  list(
    AverageWAgg,
    IntervalWAgg,
    IntervalWAgg,
    ShiftingWAgg,
    BayesianWAgg
  ) %>%
  purrr::map2_dfr(
    .y = list("ArMean",
              "IndIntWAgg",
              "IntWAgg",
              "ShiftWAgg",
              "BayTriVar"),
    .f = ~ .x(aggreCAT::data_ratings, type = .y, percent_toggle = TRUE)
  ) %>%
  dplyr::bind_rows(
    ReasoningWAgg(aggreCAT::data_ratings,
                  reasons = aggreCAT::data_supp_reasons,
                  percent_toggle = TRUE)
  )
```

``` {#lst-BYO-data-aggregate .r lst-cap="Bring your own data: aggregating non-probabilistic values"}
turtle_CS <-
  list(
    AverageWAgg,
    IntervalWAgg,
    IntervalWAgg,
    ShiftingWAgg
  ) %>%
  purrr::map2_dfr(.y = list("ArMean",
                            "IndIntWAgg",
                            "IntWAgg",
                            "ShiftWAgg"),
                  .f = ~ .x(green_turtles, type = .y,
                            percent_toggle = FALSE)
  )
```

``` {#lst-confidencescores .r lst-cap="Visualising Confidence Scores"}
plot_cs <-
  confidenceSCOREs %>%
  dplyr::left_join(aggreCAT::data_outcomes) %>%
  dplyr::mutate(data_type = "Confidence Scores") %>%
  dplyr::rename(x_vals = cs,
                y_vals = method) %>%
  dplyr::select(y_vals, paper_id, data_type, outcome, x_vals)

plot_judgements <-
  aggreCAT::preprocess_judgements(focal_claims,
                                  percent_toggle = TRUE) %>%
  tidyr::pivot_wider(names_from = element,
                     values_from = value) %>%
  dplyr::left_join(aggreCAT::data_outcomes) %>%
  dplyr::rename(x_vals = three_point_best,
                y_vals = user_name) %>%
  dplyr::select(paper_id,
                y_vals,
                x_vals,
                tidyr::contains("three_point"),
                outcome) %>%
  dplyr::mutate(data_type = "Elicited Probabilities")

p <- plot_judgements %>%
  dplyr::bind_rows(., {dplyr::semi_join(plot_cs, plot_judgements,
                                        by = "paper_id")}) %>%
  ggplot2::ggplot(ggplot2::aes(x = x_vals, y = y_vals)) +
  ggplot2::geom_pointrange(ggplot2::aes(xmin = three_point_lower,
                                        xmax = three_point_upper)) +
  ggplot2::facet_grid(data_type ~ paper_id, scales = "free_y") +
  ggplot2::theme_classic() +
  ggplot2::theme(legend.position = "none") +
  ggplot2::geom_vline(ggplot2::aes(xintercept = 0.5,
                                   colour = as.logical(outcome))) +
  ggplot2::xlab("Probability of Replication") +
  ggplot2::ylab(ggplot2::element_blank()) +
  ggplot2::scale_colour_brewer(palette = "Set1")
```

```{=tex}
\elandscape
\newpage
```
## Computational details {.unnumbered}

The analyses and results in this paper were obtained using the following
computing environment and versions of `R` and `R` packages:

::: callout
```{R}
#| label: session-info
#| prompt: true
devtools::session_info()
```
:::

## Acknowledgments {.unnumbered}

::: callout
This project is sponsored by the Defense Advanced Research Projects
Agency (DARPA) under cooperative agreement No. HR001118S0047. The
content of the information does not necessarily reflect the position or
the policy of the Government, and no official endorsement should be
inferred.
::: \ No newline at end of file
diff --git a/inst/ms/aggreCAT.qmd b/inst/ms/aggreCAT.qmd
index 589988c..012b00e 100644
--- a/inst/ms/aggreCAT.qmd
+++ b/inst/ms/aggreCAT.qmd
@@ -79,6 +79,7 @@ library(tinytex)
 library(knitr)
 options(kableExtra.latex.load_packages = FALSE)
 library(kableExtra)
+library(tinytable)
 ```
 
 ## Introduction {#sec-introduction}
@@ -612,13 +613,12 @@ focal_claims %>%
                 three_point_lower,
                 three_point_best,
                 three_point_upper) %>%
-  gt::gt() %>%
-  gt::cols_label(paper_id = "Claim ID",
-                 user_name = "User Name",
-                 three_point_lower = "Lower Bound",
-                 three_point_best = "Best Estimate",
-                 three_point_upper = "Upper Bound") %>%
-  gt::as_latex()
+  rename(`Claim ID` = paper_id,
+         `User Name` = user_name,
+         `Lower Bound` = three_point_lower,
+         `Best Estimate` = three_point_best,
+         `Upper Bound` = three_point_upper) %>%
+  tt()
 ```
 
 ### Non-weighted linear combination of judgements {#sec-AverageWAgg .section}
@@ -938,13 +938,12 @@ focal_claims %>%
   dplyr::select(paper_id, user_name, round,
 three_point_lower, three_point_best, three_point_upper) %>%
   tidyr::drop_na() %>%
   dplyr::mutate(round = str_remove(round, "round_") %>% as.integer) %>%
-  gt::gt() %>%
-  gt::cols_label(paper_id = "Claim ID",
-                 user_name = "User Name",
-                 three_point_lower = "Lower Bound",
-                 three_point_best = "Best Estimate",
-                 three_point_upper = "Upper Bound") %>%
-  gt::fmt_integer(columns = round)
+  rename(`Claim ID` = paper_id,
+         `User Name` = user_name,
+         `Lower Bound` = three_point_lower,
+         `Best Estimate` = three_point_best,
+         `Upper Bound` = three_point_upper) %>%
+  tt()
 ```
 
 Confidence Scores estimating the replicability for claim `24`
@@ -1243,13 +1242,11 @@ aggreCAT::confidence_score_evaluation(
   confidenceSCOREs,
   aggreCAT::data_outcomes
 ) %>%
-  gt::gt() %>%
-  gt::cols_label(method = "Method",
-                 Brier_Score = "Brier Score",
-                 Classification_Accuracy = "Classification Accuracy") %>%
-  gt::fmt_number(columns = -c(method, Classification_Accuracy)) %>%
-  gt::fmt_percent(Classification_Accuracy, scale_values = FALSE,
-                  decimals = 0)
+  rename(Method = method,
+         `Brier Score` = Brier_Score,
+         `Classification Accuracy ($\\%$)` = Classification_Accuracy) %>%
+  tt() %>%
+  format_tt(j = c(2,3), digits = 2)
 ```
 
 ### Visualising Judgements, Confidence Scores and Forecast Performance
@@ -1510,12 +1507,12 @@ turtle_CS <-
                             percent_toggle = FALSE)
   )
 turtle_CS %>%
-  gt::gt() %>% 
- gt::cols_label(method = "Method", - paper_id = "Question ID", - cs = "Confidence Score", - n_experts = "N (experts)") %>% - gt::fmt_number(columns = cs) + rename(Method = method, + `Question ID` = paper_id, + `Confidence Score` = cs, + `N (experts)` = n_experts) %>% + tt() %>% + format_tt(j = 3, num_fmt = 'decimal', digits = 2) ``` ## Summary and Discussion {#sec-summary} @@ -1618,9 +1615,8 @@ human forecasts is required. \blandscape ``` ```{r} -#| results: asis -#| echo: false -#| label: tbl-method-summary-table +#| include: TRUE +#| echo: FALSE aggreCAT:::method_summary_table %>% ungroup %>% # filter(str_detect(aggregator_fun_desc, "[?]",negate = TRUE)) %>% #drop Eng/CompWAgg From b0462bb3c3fd3548b43d517a7a54d0873c3973bf Mon Sep 17 00:00:00 2001 From: egouldo Date: Fri, 2 Feb 2024 11:25:54 +1100 Subject: [PATCH 13/15] #29 test setup conditional execution of methods summary table html/pdf --- inst/ms/TEST_TABLE.qmd | 72 ++++++++++++++++++++++++++++-------------- 1 file changed, 49 insertions(+), 23 deletions(-) diff --git a/inst/ms/TEST_TABLE.qmd b/inst/ms/TEST_TABLE.qmd index 37a63ca..ee3ced4 100644 --- a/inst/ms/TEST_TABLE.qmd +++ b/inst/ms/TEST_TABLE.qmd @@ -83,7 +83,6 @@ library(kableExtra) library(tinytable) ``` - ## Introduction {#sec-introduction} Expert judgement is frequently used to inform forecasting about @@ -1233,7 +1232,6 @@ aggreCAT::confidence_score_evaluation( ) ``` - ```{r} #| label: tbl-multi-method-workflow-eval #| tbl-cap: "AUC and Classification Accuracy for forecasts from the aggregation methods 'ShiftWAgg', 'ArMean', 'IntWAgg', 'IndIntWAgg', 'ReasonWAgg' and 'BayTriVar' for a subset of the repliCATS pilot study claims (`focal_claims`) and known outcomes." @@ -1613,31 +1611,54 @@ rapidly and easily analysing the results of IDEA protocol and other structured elicitation procedures where mathematical aggregation of human forecasts is required. -\newpage -\blandscape - - -```{r} +::: {.content-hidden unless-format="html"} +```{r, include = TRUE, echo = FALSE} #| label: tbl-method-summary-table #| tbl-cap: "Summary of aggregation methods and functions, including data requirements and sources." 
-#| echo: FALSE -#| include: FALSE aggreCAT:::method_summary_table %>% - ungroup %>% - # filter(str_detect(aggregator_fun_desc, "[?]",negate = TRUE)) %>% #drop Eng/CompWAgg - mutate(aggregator_function = glue::glue("**{aggregator_function}**")) %>% - tidyr::unite(agg_name_description, - aggregator_function, - aggregator_fun_desc, sep = " ") %>% - select(-agg_name_description) %>% - mutate(supp_data_requirements = tidyr::replace_na(supp_data_requirements, " ")) %>% - distinct(judgement_data_sources_eqns) %>% - tt() + ungroup %>% + # filter(str_detect(aggregator_fun_desc, "[?]",negate = TRUE)) %>% #drop Eng/CompWAgg + mutate(aggregator_function = glue::glue("**{aggregator_function}**")) %>% + tidyr::unite(agg_name_description, + aggregator_function, + aggregator_fun_desc, sep = " ") %>% + select(-agg_name_description) %>% + mutate(supp_data_requirements = tidyr::replace_na(supp_data_requirements, " ")) %>% + rename("Method" = type, + "Description" = "type_desc", + "Data Requirements" = "supp_data_requirements", + "Weighting Function" = "weighting_fn", + "Elicitation Rounds" = "number_rounds", + "Elicitation Method" = "elicitation_method", + "Data Sources" = "judgement_data_sources_eqns") %>% + tt() %>% + group_tt( + i = list( + "AverageWAgg(): Averaged best estimates" = 1, + "LinearWAgg() Linearly-weighted best estimates" = 6, + "IntervalWAgg() Linearly-weighted best estimates, with weights influenced by interval widths" = 11, + "ShiftingWAgg() Weighted by judgemetns that shift most after discussion" = 17, + "ReasoningWAgg() Linearly-weighted best estimates, with weights constructed from supplementary reasoning data" = 22, + "ExtremisationWAgg() Takes the average of best-estimates and transforms it using the cumulative distribution function of a beta distribution" = 24, + "DistributionWAgg() Calculates the arithmetic mean of distributions created from expert judgements." = 26, + "BayesianWAgg() Bayesian aggregation methods with either uninformative or informative prior distributions" = 28 + ) + ) + ``` +::: +::: {.content-hidden unless-format="pdf"} +```{=tex} +\newpage +\blandscape +``` -```{r, include = TRUE} +```{r} +#| echo: false +#| include: true +#| results: asis aggreCAT:::method_summary_table %>% ungroup %>% # filter(str_detect(aggregator_fun_desc, "[?]",negate = TRUE)) %>% #drop Eng/CompWAgg @@ -1658,8 +1679,7 @@ aggreCAT:::method_summary_table %>% escape = FALSE, booktabs = TRUE, longtable = TRUE, - caption = "\\label{tbl-method-summary-table} Summary of aggregation methods and functions, including data requirements and sources.", - format = "latex") %>% + caption = "Summary of aggregation methods and functions, including data requirements and sources. 
  kableExtra::column_spec(column = c(1,3,4,6,7), width = "10em") %>%
  kableExtra::column_spec(column = c(5), width = "5em") %>%
  kableExtra::column_spec(column = c(2), width = "20em") %>%
  kableExtra::kable_styling(latex_options =
                              c("HOLD_position", "repeat_header"),
                            font_size = 6,
                            position = "left") %>%
  kableExtra::pack_rows("AverageWAgg() Averaged best estimates", 1, 5) %>%
  kableExtra::pack_rows("LinearWAgg() Linearly-weighted best estimates", 6, 10) %>%
  kableExtra::pack_rows("IntervalWAgg() Linearly-weighted best estimates, with weights influenced by interval widths", 11, 16) %>%
  kableExtra::pack_rows("ShiftingWAgg() Weighted by judgements that shift most after discussion", 17, 21) %>%
  kableExtra::pack_rows("ReasoningWAgg() Linearly-weighted best estimates, with weights constructed from supplementary reasoning data", 22, 23) %>%
  kableExtra::pack_rows("ExtremisationWAgg() Takes the average of best estimates and transforms it using the cumulative distribution function of a beta distribution", 24, 25) %>%
  kableExtra::pack_rows("DistributionWAgg() Calculates the arithmetic mean of distributions created from expert judgements", 26, 27) %>%
  kableExtra::pack_rows("BayesianWAgg() Bayesian aggregation methods with either uninformative or informative prior distributions", 28, 29)
```
:::

:::{.content-hidden unless-format="pdf"}

```{=tex}
\elandscape
\newpage
```

:::

## Computational details {.unnumbered}

The analyses and results in this paper were obtained using the following
computing environment and versions of `R` and `R` packages:

::: callout
```{R}
#| label: session-info
#| prompt: true
devtools::session_info()
```
:::

## Acknowledgments {.unnumbered}

::: callout
This project is sponsored by the Defense Advanced Research Projects
Agency (DARPA) under cooperative agreement No. HR001118S0047. The
content of the information does not necessarily reflect the position or
the policy of the Government, and no official endorsement should be
inferred.
:::
From 44cc9a26a009893f8d0c5475487bedfbdd3b116c Mon Sep 17 00:00:00 2001
From: egouldo
Date: Fri, 2 Feb 2024 11:44:45 +1100
Subject: [PATCH 14/15] #29 conditional table formatting, update html format

---
 inst/ms/aggreCAT.qmd | 59 +++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 56 insertions(+), 3 deletions(-)

diff --git a/inst/ms/aggreCAT.qmd b/inst/ms/aggreCAT.qmd
index 012b00e..6538e6d 100644
--- a/inst/ms/aggreCAT.qmd
+++ b/inst/ms/aggreCAT.qmd
@@ -25,7 +25,7 @@ format:
       - \newcommand{\blandscape}{\begin{landscape}}
       - \newcommand{\elandscape}{\end{landscape}}
       - \usepackage{underscore}
-      # - \usepackage[authoryear,round]{natbib}
+      # - \usepackage[authoryear,round]{natbib} 
       x11names: true
       journal:
         cite-shortnames: true
@@ -1610,13 +1610,59 @@ rapidly and easily analysing the results of IDEA protocol and other
 structured elicitation procedures where mathematical aggregation of
 human forecasts is required.
 
+::: {.content-hidden unless-format="html"}
+```{r, include = TRUE, echo = FALSE}
+#| column: page
+#| label: tbl-method-summary-table
+#| tbl-cap: "Summary of aggregation methods and functions, including data requirements and sources."
aggreCAT:::method_summary_table %>%
  ungroup %>%
  # filter(str_detect(aggregator_fun_desc, "[?]", negate = TRUE)) %>% #drop Eng/CompWAgg
  mutate(aggregator_function = glue::glue("**{aggregator_function}**")) %>%
  tidyr::unite(agg_name_description,
               aggregator_function,
               aggregator_fun_desc, sep = " ") %>%
  select(-agg_name_description) %>%
  mutate(supp_data_requirements = tidyr::replace_na(supp_data_requirements, " ")) %>%
  rename("Method" = type,
         "Description" = "type_desc",
         "Data Requirements" = "supp_data_requirements",
         "Weighting Function" = "weighting_fn",
         "Elicitation Rounds" = "number_rounds",
         "Elicitation Method" = "elicitation_method",
         "Data Sources" = "judgement_data_sources_eqns") %>%
  tt() %>%
  group_tt(
    i = list(
      "AverageWAgg() Averaged best estimates" = 1,
      "LinearWAgg() Linearly-weighted best estimates" = 6,
      "IntervalWAgg() Linearly-weighted best estimates, with weights influenced by interval widths" = 11,
      "ShiftingWAgg() Weighted by judgements that shift most after discussion" = 17,
      "ReasoningWAgg() Linearly-weighted best estimates, with weights constructed from supplementary reasoning data" = 22,
      "ExtremisationWAgg() Takes the average of best estimates and transforms it using the cumulative distribution function of a beta distribution" = 24,
      "DistributionWAgg() Calculates the arithmetic mean of distributions created from expert judgements" = 26,
      "BayesianWAgg() Bayesian aggregation methods with either uninformative or informative prior distributions" = 28
    )
  ) %>%
  format_tt(j = c(3,4), markdown = TRUE)
#TODO needs tidying up for html presentation

```
:::

::: {.content-hidden unless-format="pdf"}

```{=tex}
\newpage
\blandscape
```

```{r}
#| echo: false
#| include: true
#| results: asis

#TODO Table label and caption are missing from first page of table... possibly something
# to do with https://github.com/quarto-dev/quarto-cli/issues/1486
aggreCAT:::method_summary_table %>%
  ungroup %>%
  # filter(str_detect(aggregator_fun_desc, "[?]", negate = TRUE)) %>% #drop Eng/CompWAgg
  mutate(aggregator_function = glue::glue("**{aggregator_function}**")) %>%
  tidyr::unite(agg_name_description,
               aggregator_function,
               aggregator_fun_desc, sep = " ") %>%
  select(-agg_name_description) %>%

 ggplot2::scale_colour_brewer(palette = "Set1")
 ```
 
+::: {.content-hidden unless-format="pdf"}
+
 ```{=tex}
 \elandscape
 \newpage
 ```
+
+:::
+
 ## Computational details {.unnumbered}
From 5b795bf788d8b62ba2d764199ddbb861a3cb1814 Mon Sep 17 00:00:00 2001
From: egouldo
Date: Fri, 2 Feb 2024 11:58:08 +1100
Subject: [PATCH 15/15] #29 improve html methods table formatting

---
 inst/ms/aggreCAT.qmd | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/inst/ms/aggreCAT.qmd b/inst/ms/aggreCAT.qmd
index 6538e6d..d3d9803 100644
--- a/inst/ms/aggreCAT.qmd
+++ b/inst/ms/aggreCAT.qmd
@@ -1611,10 +1611,13 @@ structured elicitation procedures where mathematical aggregation of
 human forecasts is required. 
::: {.content-hidden unless-format="html"}
```{r}
#| column: page
#| include: true
#| echo: false
#| label: tbl-method-summary-table
#| tbl-cap: "Summary of aggregation methods and functions, including data requirements and sources."
#| results: asis
aggreCAT:::method_summary_table %>%
  ungroup %>%
  # filter(str_detect(aggregator_fun_desc, "[?]", negate = TRUE)) %>% #drop Eng/CompWAgg
  mutate(aggregator_function = glue::glue("**{aggregator_function}**")) %>%
  tidyr::unite(agg_name_description,
               aggregator_function,
               aggregator_fun_desc, sep = " ") %>%
  select(-agg_name_description) %>%
  mutate(supp_data_requirements = tidyr::replace_na(supp_data_requirements, " ")) %>%
  rename("Method" = type,
         "Description" = "type_desc",
         "Data Requirements" = "supp_data_requirements",
         "Weighting Function" = "weighting_fn",
         "Elicitation Rounds" = "number_rounds",
         "Elicitation Method" = "elicitation_method",
         "Data Sources" = "judgement_data_sources_eqns") %>%
  tt() %>%
  group_tt(
    i = list(
      "`AverageWAgg()` *Averaged best estimates*" = 1,
      "`LinearWAgg()` *Linearly-weighted best estimates*" = 6,
      "`IntervalWAgg()` *Linearly-weighted best estimates, with weights influenced by interval widths*" = 11,
      "`ShiftingWAgg()` *Weighted by judgements that shift most after discussion*" = 17,
      "`ReasoningWAgg()` *Linearly-weighted best estimates, with weights constructed from supplementary reasoning data*" = 22,
      "`ExtremisationWAgg()` *Takes the average of best estimates and transforms it using the cumulative distribution function of a beta distribution*" = 24,
      "`DistributionWAgg()` *Calculates the arithmetic mean of distributions created from expert judgements*" = 26,
      "`BayesianWAgg()` *Bayesian aggregation methods with either uninformative or informative prior distributions*" = 28
    )
  ) %>%
  # format_tt(j = c(3,4), markdown = TRUE) %>%
  print("markdown")
  # style_tt(i = c(1,6,11,17,22,26,28), markdown = TRUE)

#TODO needs tidying up for html presentation

```