From c1b55192e3ad829d63ef64beaaa4329fa6cd9994 Mon Sep 17 00:00:00 2001
From: Laure Thompson <602628+laurejt@users.noreply.github.com>
Date: Tue, 22 Oct 2024 10:41:49 -0400
Subject: [PATCH 1/9] Added basic recipe logging

---
 src/corppa/poetry_detection/annotation/recipe.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/corppa/poetry_detection/annotation/recipe.py b/src/corppa/poetry_detection/annotation/recipe.py
index abd35a9..e6f2ddd 100644
--- a/src/corppa/poetry_detection/annotation/recipe.py
+++ b/src/corppa/poetry_detection/annotation/recipe.py
@@ -18,6 +18,7 @@
 import spacy
 from prodigy.components.loaders import JSONL
 from prodigy.core import Arg, recipe
+from prodigy.util import log
 
 #: reference to current directory, for use as Prodigy CSS directory
 CURRENT_DIR = Path(__file__).parent.absolute()
@@ -86,7 +87,7 @@ def annotate_text_and_image(
     """Annotate text and image side by side: allows adding manual spans
     to both image and text. Intended for page-level annotation.
     """
-
+    log("RECIPE: Starting recipe annotate_text_and_image", locals())
     stream = JSONL(source)  # load jsonlines into stream
     # tokenize for span annotation and add image prefix
     tokenized_stream = tokenize_stream(stream, image_prefix)
@@ -136,7 +137,7 @@ def annotate_page_text(
     with text for reference only (image cannot be annotated).
     Intended for page-level annotation.
     """
-
+    log("RECIPE: Starting recipe annotate_page_text", locals())
     stream = JSONL(source)  # load jsonlines into stream
     # tokenize for span annotation and add image prefix
     tokenized_stream = tokenize_stream(stream, image_prefix)

From 1c12cd93d9051d62f44d26ee0da8c2684ae62bdb Mon Sep 17 00:00:00 2001
From: Laure Thompson <602628+laurejt@users.noreply.github.com>
Date: Tue, 22 Oct 2024 12:25:29 -0400
Subject: [PATCH 2/9] Modify recipes to use Prodigy's get_labels utility

---
 .../poetry_detection/annotation/recipe.py      | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/src/corppa/poetry_detection/annotation/recipe.py b/src/corppa/poetry_detection/annotation/recipe.py
index e6f2ddd..635eacd 100644
--- a/src/corppa/poetry_detection/annotation/recipe.py
+++ b/src/corppa/poetry_detection/annotation/recipe.py
@@ -18,7 +18,7 @@
 import spacy
 from prodigy.components.loaders import JSONL
 from prodigy.core import Arg, recipe
-from prodigy.util import log
+from prodigy.util import get_labels, log
 
 #: reference to current directory, for use as Prodigy CSS directory
 CURRENT_DIR = Path(__file__).parent.absolute()
@@ -78,7 +78,11 @@ def tokenize_stream(stream, image_prefix=None):
 @recipe(
     "annotate_text_and_image",
     dataset=Arg(help="path to input dataset"),
-    labels=Arg("--label", "-l", help="Comma-separated label(s)"),
+    label=Arg(
+        "--label",
+        "-l",
+        help="Comma-separated label(s) to annotate or text file with one label per line",
+    ),
     image_prefix=Arg("--image-prefix", "-i", help="Base URL for images"),
 )
 def annotate_text_and_image(
@@ -93,7 +97,7 @@ def annotate_text_and_image(
     tokenized_stream = tokenize_stream(stream, image_prefix)
 
     # split labels by commas and strip any whitespace
-    label_list = [label.strip() for label in labels.split(",")]
+    label_list = get_labels(label)
 
     blocks = [
         {
@@ -127,7 +131,11 @@ def annotate_text_and_image(
 @recipe(
     "annotate_page_text",
     dataset=Arg(help="path to input dataset"),
-    labels=Arg("--label", "-l", help="Comma-separated label(s)"),
+    label=Arg(
+        "--label",
+        "-l",
+        help="Comma-separated label(s) to annotate or text file with one label per line",
+    ),
     image_prefix=Arg("--image-prefix", "-i", help="Base URL for images"),
 )
 def annotate_page_text(
@@ -143,7 +151,7 @@ def annotate_page_text(
     tokenized_stream = tokenize_stream(stream, image_prefix)
 
     # split labels by commas and strip any whitespace
-    label_list = [label.strip() for label in labels.split(",")]
+    label_list = get_label(label)
 
     blocks = [
         {

From 6d19b2fb01b4cd064343cdb593b1f304a5d84dcb Mon Sep 17 00:00:00 2001
From: Laure Thompson <602628+laurejt@users.noreply.github.com>
Date: Tue, 22 Oct 2024 12:28:13 -0400
Subject: [PATCH 3/9] Updated top-level documentation

---
 src/corppa/poetry_detection/annotation/recipe.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/corppa/poetry_detection/annotation/recipe.py b/src/corppa/poetry_detection/annotation/recipe.py
index 635eacd..b1f1c71 100644
--- a/src/corppa/poetry_detection/annotation/recipe.py
+++ b/src/corppa/poetry_detection/annotation/recipe.py
@@ -1,8 +1,12 @@
 """
-This module provides custom recipes for Prodigy annotation. They were
-created with page-level text annotation in mind, and support annotating
-text with a reference image displayed beside the text (`annotate_page_text`),
-or annotating both text and image side by side (`annotate_text_and_image`).
+This module provides custom recipes for Prodigy annotation. These were
+created with page-level annotation in mind, and assume a page is associated
+with both an image and text.
+
+Recipes:
+    * `annotate_page_text`: Annotate a page's text with the page's image
+      displayed side-by-side for reference.
+    * `annotate_text_and_image`: Annotate both a page's text and image side-by-side.
 
 Referenced images must be served out independently for display; the image url
 prefix for images should be specified when initializing the recipe.
@@ -10,7 +14,7 @@
 Example use:
 ```
 prodigy annotate_page_text poetry_spans poetry_pages.jsonl --label POETRY,PROSODY -F ../corppa/poetry_detection/annotation/recipe.py --image-prefix http://localhost:8000/
-prodigy annotate_text_and_image poetry_text_image poetry_pages.jsonl --label POETRY -F ../corppa/poetry_detection/annotation/recipe.py --image-prefix http://localhost:8000/
+prodigy annotate_text_and_image poetry_text_image poetry_pages.jsonl -l POETRY -F ../corppa/poetry_detection/annotation/recipe.py --image-prefix http://localhost:8000/
 """
 
 from pathlib import Path

From a961c1e31949a7b4e8837ff548c240db058374f5 Mon Sep 17 00:00:00 2001
From: Laure Thompson <602628+laurejt@users.noreply.github.com>
Date: Tue, 22 Oct 2024 12:33:03 -0400
Subject: [PATCH 4/9] Fixed label(s) typos

---
 src/corppa/poetry_detection/annotation/recipe.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/corppa/poetry_detection/annotation/recipe.py b/src/corppa/poetry_detection/annotation/recipe.py
index b1f1c71..da2c76a 100644
--- a/src/corppa/poetry_detection/annotation/recipe.py
+++ b/src/corppa/poetry_detection/annotation/recipe.py
@@ -101,7 +101,7 @@ def annotate_text_and_image(
     tokenized_stream = tokenize_stream(stream, image_prefix)
 
     # split labels by commas and strip any whitespace
-    label_list = get_labels(label)
+    label_list = get_labels(labels)
 
     blocks = [
         {
@@ -155,7 +155,7 @@ def annotate_page_text(
     tokenized_stream = tokenize_stream(stream, image_prefix)
 
     # split labels by commas and strip any whitespace
-    label_list = get_label(label)
+    label_list = get_labels(labels)
 
     blocks = [
         {

From 6c5d9388663a7f0e27f0af90ebf57132af27ce48 Mon Sep 17 00:00:00 2001
From: Laure Thompson <602628+laurejt@users.noreply.github.com>
Date: Tue, 22 Oct 2024 21:32:35 -0400
Subject: [PATCH 5/9] Update to use Stream component's get_stream

---
 src/corppa/poetry_detection/annotation/recipe.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/corppa/poetry_detection/annotation/recipe.py b/src/corppa/poetry_detection/annotation/recipe.py
index da2c76a..7766422 100644
--- a/src/corppa/poetry_detection/annotation/recipe.py
+++ b/src/corppa/poetry_detection/annotation/recipe.py
@@ -20,9 +20,10 @@
 from pathlib import Path
 
 import spacy
-from prodigy.components.loaders import JSONL
+from prodigy import log
+from prodigy.components.stream import get_stream
 from prodigy.core import Arg, recipe
-from prodigy.util import get_labels, log
+from prodigy.util import get_labels
 
 #: reference to current directory, for use as Prodigy CSS directory
 CURRENT_DIR = Path(__file__).parent.absolute()
@@ -82,7 +83,7 @@ def tokenize_stream(stream, image_prefix=None):
 @recipe(
     "annotate_text_and_image",
     dataset=Arg(help="path to input dataset"),
-    label=Arg(
+    labels=Arg(
         "--label",
         "-l",
         help="Comma-separated label(s) to annotate or text file with one label per line",
@@ -96,7 +97,7 @@ def annotate_text_and_image(
     to both image and text. Intended for page-level annotation.
     """
     log("RECIPE: Starting recipe annotate_text_and_image", locals())
-    stream = JSONL(source)  # load jsonlines into stream
+    stream = get_stream(source)
     # tokenize for span annotation and add image prefix
     tokenized_stream = tokenize_stream(stream, image_prefix)
 
@@ -135,7 +136,7 @@ def annotate_text_and_image(
 @recipe(
     "annotate_page_text",
     dataset=Arg(help="path to input dataset"),
-    label=Arg(
+    labels=Arg(
         "--label",
         "-l",
         help="Comma-separated label(s) to annotate or text file with one label per line",
@@ -150,7 +151,7 @@ def annotate_page_text(
     Intended for page-level annotation.
     """
     log("RECIPE: Starting recipe annotate_page_text", locals())
-    stream = JSONL(source)  # load jsonlines into stream
+    stream = get_stream(source)
     # tokenize for span annotation and add image prefix
     tokenized_stream = tokenize_stream(stream, image_prefix)
 

From 27a569db27c3c8156fca98be852d9c05e9d05681 Mon Sep 17 00:00:00 2001
From: Laure Thompson <602628+laurejt@users.noreply.github.com>
Date: Tue, 22 Oct 2024 22:39:07 -0400
Subject: [PATCH 6/9] Update recipes to use stream.apply & fetch_media

---
 .../poetry_detection/annotation/recipe.py     | 104 +++++++++++-------
 1 file changed, 66 insertions(+), 38 deletions(-)

diff --git a/src/corppa/poetry_detection/annotation/recipe.py b/src/corppa/poetry_detection/annotation/recipe.py
index 7766422..4b61209 100644
--- a/src/corppa/poetry_detection/annotation/recipe.py
+++ b/src/corppa/poetry_detection/annotation/recipe.py
@@ -21,6 +21,8 @@
 
 import spacy
 from prodigy import log
+from prodigy.components.preprocess import add_tokens
+from prodigy.components.preprocess import fetch_media as fetch_media_preprocessor
 from prodigy.components.stream import get_stream
 from prodigy.core import Arg, recipe
 from prodigy.util import get_labels
@@ -48,36 +50,28 @@
 }
 
 
-def tokenize_stream(stream, image_prefix=None):
-    """Takes a stream of Prodigy tasks and tokenizes text for span annotation,
-    and optionally adds an image prefix URL to any image paths present.
-    Stream is expected to contain `text` and may contain image_path` and a `meta`
-    dictionary. Returns a generator of the stream.
-    """
+def add_image(task, image_prefix=None):
+    if image_prefix is None:
+        task["image"] = task["image_path"]
+    else:
+        path_pfx = image_prefix.rstrip("/")
+        task["image"] = f"{path_pfx}/{task['image_path']}"
+    return task
 
-    nlp = spacy.blank("en")  # use blank spaCy model for tokenization
 
-    # ensure image prefix URL does not have a trailing slash
-    if image_prefix is None:
-        image_prefix = ""
-    image_prefix = image_prefix.rstrip("/")
-
-    for task in stream:
-        if task.get("text"):
-            doc = nlp(task["text"])
-            task["tokens"] = [
-                {
-                    "text": token.text,
-                    "start": token.idx,
-                    "end": token.idx + len(token.text),
-                    "id": i,
-                }
-                for i, token in enumerate(doc)
-            ]
-        # add image prefix URL for serving out images
-        if "image_path" in task:
-            task["image"] = f"{image_prefix}/{task['image_path']}"
-        yield task
+def add_images(examples, image_prefix=None):
+    for task in examples:
+        yield add_image(task, image_prefix=image_prefix)
+
+
+def remove_images(examples, image_prefix=None):
+    for task in examples:
+        # If "image" is a base64 string and "image_path" is present in the task,
+        # remove the image data
+        if task["image"].startswith("data:") and "image_path" in task:
+            # Replace image with full image path
+            add_image(task, image_prefix=image_prefix)
+    return examples
 
 
 @recipe(
@@ -89,17 +83,29 @@ def tokenize_stream(stream, image_prefix=None):
         help="Comma-separated label(s) to annotate or text file with one label per line",
     ),
     image_prefix=Arg("--image-prefix", "-i", help="Base URL for images"),
+    fetch_media=Arg(
+        "--fetch-media", "-FM", help="Load images from local paths or URLs"
+    ),
 )
 def annotate_text_and_image(
-    dataset: str, source: str, labels: str, image_prefix: str = None
+    dataset: str,
+    source: str,
+    labels: str,
+    image_prefix: str = None,
+    fetch_media: bool = False,
 ):
     """Annotate text and image side by side: allows adding manual spans
     to both image and text. Intended for page-level annotation.
     """
     log("RECIPE: Starting recipe annotate_text_and_image", locals())
     stream = get_stream(source)
-    # tokenize for span annotation and add image prefix
-    tokenized_stream = tokenize_stream(stream, image_prefix)
+    # tokenize text for span annotation
+    stream.apply(add_tokens, nlp=spacy.blank("en"), stream=stream)
+    # add image prefix
+    stream.apply(add_images, image_prefix=image_prefix)
+    # optionally fetch media
+    if fetch_media:
+        stream.apply(fetch_media_preprocessor, ["image"], skip=True)
 
     # split labels by commas and strip any whitespace
     label_list = get_labels(labels)
@@ -125,13 +131,18 @@ def annotate_text_and_image(
         }
     )
 
-    return {
+    components = {
         "dataset": dataset,
-        "stream": tokenized_stream,
+        "stream": stream,
         "view_id": "blocks",
         "config": config,
     }
 
+    if fetch_media:
+        components["before_db"] = lambda x: remove_images(x, image_prefix=image_prefix)
+
+    return components
+
 
 @recipe(
     "annotate_page_text",
@@ -142,9 +153,16 @@ def annotate_text_and_image(
         help="Comma-separated label(s) to annotate or text file with one label per line",
     ),
     image_prefix=Arg("--image-prefix", "-i", help="Base URL for images"),
+    fetch_media=Arg(
+        "--fetch-media", "-FM", help="Load images from local paths or URLs"
+    ),
 )
 def annotate_page_text(
-    dataset: str, source: str, labels: str, image_prefix: str = None
+    dataset: str,
+    source: str,
+    labels: str,
+    image_prefix: str = None,
+    fetch_media: bool = False,
 ):
     """Annotate text with manual spans; displays an image side by side
     with text for reference only (image cannot be annotated).
@@ -152,8 +170,13 @@ def annotate_page_text(
     """
     log("RECIPE: Starting recipe annotate_page_text", locals())
     stream = get_stream(source)
-    # tokenize for span annotation and add image prefix
-    tokenized_stream = tokenize_stream(stream, image_prefix)
+    # tokenize text for span annotation
+    stream.apply(add_tokens, nlp=spacy.blank("en"), stream=stream)
+    # add image prefix
+    stream.apply(add_images, stream, image_prefix=image_prefix)
+    # optionally fetch media
+    if fetch_media:
+        stream.apply(fetch_media_preprocessor, ["image"], skip=True)
 
     # split labels by commas and strip any whitespace
     label_list = get_labels(labels)
@@ -175,9 +198,14 @@ def annotate_page_text(
         }
     )
 
-    return {
+    components = {
         "dataset": dataset,
-        "stream": tokenized_stream,
+        "stream": stream,
         "view_id": "blocks",
         "config": config,
     }
+
+    if fetch_media:
+        components["before_db"] = lambda x: remove_images(x, image_prefix=image_prefix)
+
+    return components

From eafd313fe16b9aa262e7de0fc45a1d9b8a9e0c78 Mon Sep 17 00:00:00 2001
From: Laure Thompson <602628+laurejt@users.noreply.github.com>
Date: Wed, 23 Oct 2024 11:44:06 -0400
Subject: [PATCH 7/9] Added review recipe

---
 .../poetry_detection/annotation/recipe.py     | 286 +++++++++++++++---
 1 file changed, 252 insertions(+), 34 deletions(-)

diff --git a/src/corppa/poetry_detection/annotation/recipe.py b/src/corppa/poetry_detection/annotation/recipe.py
index 4b61209..cc6db10 100644
--- a/src/corppa/poetry_detection/annotation/recipe.py
+++ b/src/corppa/poetry_detection/annotation/recipe.py
@@ -1,12 +1,14 @@
 """
 This module provides custom recipes for Prodigy annotation. These were
-created with page-level annotation in mind, and assume a page is associated
-with both an image and text.
+created with page-level annotation in mind and assume a page is associated
+with both text and an image. Each recipe displays a page's image and text
+side-by-side.
 
 Recipes:
-    * `annotate_page_text`: Annotate a page's text with the page's image
-      displayed side-by-side for reference.
+    * `annotate_page_text`: Annotate a page's text.
     * `annotate_text_and_image`: Annotate both a page's text and image side-by-side.
+    * `review_page_spans`: Review existing page-level text annotations to produce
+      a final, adjudicated set of annotations.
 
 Referenced images must be served out independently for display; the image url
 prefix for images should be specified when initializing the recipe.
@@ -14,18 +16,24 @@
 Example use:
 ```
 prodigy annotate_page_text poetry_spans poetry_pages.jsonl --label POETRY,PROSODY -F ../corppa/poetry_detection/annotation/recipe.py --image-prefix http://localhost:8000/
-prodigy annotate_text_and_image poetry_text_image poetry_pages.jsonl -l POETRY -F ../corppa/poetry_detection/annotation/recipe.py --image-prefix http://localhost:8000/
+prodigy annotate_text_and_image poetry_text_image poetry_pages.jsonl -l POETRY -F ../corppa/poetry_detection/annotation/recipe.py --image-prefix ../ppa-web-images -FM
+prodigy review_page_spans adjudicate poetry_spans -l POETRY -F ../corppa/poetry_detection/annotation/recipe.py --image-prefix ../ppa-web-images -FM --sessions alice,bob
 """
 
+from collections import defaultdict
+from copy import deepcopy
 from pathlib import Path
+from typing import Dict, Iterable, List, Optional
 
 import spacy
-from prodigy import log
+from prodigy import log, set_hashes
+from prodigy.components.db import connect
 from prodigy.components.preprocess import add_tokens
 from prodigy.components.preprocess import fetch_media as fetch_media_preprocessor
 from prodigy.components.stream import get_stream
 from prodigy.core import Arg, recipe
-from prodigy.util import get_labels
+from prodigy.types import LabelsType, RecipeSettingsType, StreamType, TaskType
+from prodigy.util import INPUT_HASH_ATTR, SESSION_ID_ATTR, get_labels
 
 #: reference to current directory, for use as Prodigy CSS directory
 CURRENT_DIR = Path(__file__).parent.absolute()
@@ -49,22 +57,51 @@
     "global_css_dir": CURRENT_DIR,
 }
 
+#: color palette for predefined session names
+PALETTE = [
+    "#c5bdf4",
+    "#ffd882",
+    "#d9fbad",
+    "#c2f2f6",
+    "#ffdaf9",
+    "#b5c6c9",
+    "#96e8ce",
+    "#ffd1b2",
+]
 
-def add_image(task, image_prefix=None):
+
+def add_image(example: TaskType, image_prefix: Optional[str] = None):
+    """
+    Set an example's image field to its existing image_path with an optional prefix
+
+    Note: Assumes filepaths use forward slash
+    """
     if image_prefix is None:
-        task["image"] = task["image_path"]
+        example["image"] = example["image_path"]
     else:
         path_pfx = image_prefix.rstrip("/")
-        task["image"] = f"{path_pfx}/{task['image_path']}"
-    return task
+        example["image"] = f"{path_pfx}/{example['image_path']}"
+    return example
 
 
-def add_images(examples, image_prefix=None):
-    for task in examples:
-        yield add_image(task, image_prefix=image_prefix)
+def add_images(examples: StreamType, image_prefix: Optional[str] = None) -> StreamType:
+    """
+    Set the image field for each example in the stream
+
+    Calls: `add_image`
+    """
+    for example in examples:
+        yield add_image(example, image_prefix=image_prefix)
 
 
-def remove_images(examples, image_prefix=None):
+def remove_images(
+    examples: Iterable[TaskType], image_prefix: Optional[str] = None
+) -> List[TaskType]:
+    """
+    For each example, replace base64 data URIs with image filepath or URL
+
+    Calls: `add_image`
+    """
     for task in examples:
         # If "image" is a base64 string and "image_path" is present in the task,
         # remove the image data
@@ -90,10 +127,10 @@ def remove_images(examples, image_prefix=None):
 def annotate_text_and_image(
     dataset: str,
     source: str,
-    labels: str,
+    labels: LabelsType = [],
     image_prefix: str = None,
     fetch_media: bool = False,
-):
+) -> RecipeSettingsType:
     """Annotate text and image side by side: allows adding manual spans
     to both image and text. Intended for page-level annotation.
     """
@@ -105,25 +142,21 @@ def annotate_text_and_image(
     stream.apply(add_images, image_prefix=image_prefix)
     # optionally fetch media
     if fetch_media:
-        stream.apply(fetch_media_preprocessor, ["image"], skip=True)
-
-    # split labels by commas and strip any whitespace
-    label_list = get_labels(labels)
+        stream.apply(fetch_media_preprocessor, ["image"])
 
     blocks = [
         {
             "view_id": "image_manual",
-            "labels": label_list,
+            "labels": labels,
         },
-        {"view_id": "spans_manual", "labels": label_list},
+        {"view_id": "spans_manual", "labels": labels},
     ]
 
     # copy the common config options and add blocks and labels
-    config = PRODIGY_COMMON_CONFIG.copy()
+    config = deepcopy(PRODIGY_COMMON_CONFIG)
     config.update(
         {
             "blocks": blocks,
-            "labels": label_list,
             "ner_manual_highlight_chars": True,
             "image_manual_spans_key": "image_spans",
             # limit image selection to rectangle only, no polygon or freehand
@@ -160,10 +193,10 @@ def annotate_text_and_image(
 def annotate_page_text(
     dataset: str,
     source: str,
-    labels: str,
+    labels: LabelsType = [],
     image_prefix: str = None,
     fetch_media: bool = False,
-):
+) -> RecipeSettingsType:
     """Annotate text with manual spans; displays an image side by side
     with text for reference only (image cannot be annotated).
     Intended for page-level annotation.
@@ -176,24 +209,20 @@ def annotate_page_text(
     stream.apply(add_images, stream, image_prefix=image_prefix)
     # optionally fetch media
     if fetch_media:
-        stream.apply(fetch_media_preprocessor, ["image"], skip=True)
-
-    # split labels by commas and strip any whitespace
-    label_list = get_labels(labels)
+        stream.apply(fetch_media_preprocessor, ["image"])
 
     blocks = [
         {
             "view_id": "html",
             "html_template": "<img src='{{ image }}' width='500'>",
         },
-        {"view_id": "spans_manual", "labels": label_list},
+        {"view_id": "spans_manual", "labels": labels},
     ]
     # copy the common config options and add blocks and labels
-    config = PRODIGY_COMMON_CONFIG.copy()
+    config = deepcopy(PRODIGY_COMMON_CONFIG)
     config.update(
         {
             "blocks": blocks,
-            "labels": label_list,
             "ner_manual_highlight_chars": True,
         }
     )
@@ -209,3 +238,192 @@ def annotate_page_text(
         components["before_db"] = lambda x: remove_images(x, image_prefix=image_prefix)
 
     return components
+
+
+class ReviewStream:
+    """
+    Stream of review examples. This mostly exists to expose a __len__ to show
+    total progress in the web interface.
+    """
+
+    def __init__(
+        self,
+        data: Dict[int, List[TaskType]],
+        image_prefix: Optional[str] = None,
+        fetch_media: bool = False,
+    ) -> None:
+        """
+        Initialize a review stream.
+
+        data: Merged data, with examples grouped by input hash.
+        image_prefix: Image prefix for creating image (full) paths
+        fetch_media: Whether to fetch task images.
+        """
+        self.n_examples = len(data)
+        self.data = self.get_data(data, image_prefix, fetch_media)
+
+    def __len__(self) -> int:
+        return self.n_examples
+
+    def __iter__(self) -> StreamType:
+        for example in self.data:
+            yield example
+
+    def create_review_example(self, versions: List[TaskType]) -> TaskType:
+        """
+        Create review example from several annotated versions.
+        """
+        # TODO: Make sure that no unmerged version content is preserved.
+        review_example = deepcopy(versions[-1])
+        # Merge spans
+        merged_spans = []
+        session_counts = {}
+        sessions = []
+
+        for version in versions:
+            session_id = version[SESSION_ID_ATTR]
+            # Assume: session name does not contain -
+            session_name = session_id.rsplit("-", maxsplit=1)[1]
+            if session_id not in session_counts:
+                session_counts[session_id] = 1
+            else:
+                session_name += f"-{session_counts[session_id]}"
+                session_counts[session_id] += 1
+            sessions.append(session_name)
+            if "spans" not in version:
+                # Not sure when an annotated example would be missing a spans field
+                continue
+            for span in version["spans"]:
+                new_span = span.copy()
+                span_label = span["label"]
+                new_span["label"] = f"{session_name}: {span_label}"
+                merged_spans.append(new_span)
+        review_example["spans"] = merged_spans
+        review_example["sessions"] = sessions
+        return review_example
+
+    def get_data(
+        self,
+        data: Dict[int, List[TaskType]],
+        image_prefix: Optional[str],
+        fetch_media: bool,
+    ) -> List[TaskType]:
+        """
+        Build review examples from data. Add images to each example.
+        """
+        examples = []
+        for _, versions in data.items():
+            review_example = self.create_review_example(versions)
+            review_example = add_image(review_example, image_prefix)
+            examples.append(review_example)
+        if fetch_media:
+            return fetch_media_preprocessor(examples, ["image"])
+        return examples
+
+
+def get_review_stream(
+    examples: Iterable[TaskType],
+    image_prefix: Optional[str] = None,
+    fetch_media: bool = False,
+) -> StreamType:
+    # Group examples by input (page_id, text)
+    grouped_examples = defaultdict(list)
+    for example in examples:
+        # Reset hashes
+        example = set_hashes(
+            example, overwrite=True, input_keys=["id", "text"], task_keys=["spans"]
+        )
+        input_hash = example[INPUT_HASH_ATTR]
+        grouped_examples[input_hash].append(example)
+    return ReviewStream(
+        grouped_examples, image_prefix=image_prefix, fetch_media=fetch_media
+    )
+
+
+@recipe(
+    "review_page_spans",
+    dataset=Arg(help="Dataset to save annotations to"),
+    input_dataset=Arg(help="Name of dataset to review"),
+    labels=Arg(
+        "--label",
+        "-l",
+        help="Comma-separated label(s) to annotate or text file with one label per line",
+    ),
+    image_prefix=Arg("--image-prefix", "-i", help="Base URL for images"),
+    fetch_media=Arg(
+        "--fetch-media", "-FM", help="Load images from local paths or URLs"
+    ),
+    sessions=Arg("--sessions", help="Comma-separated session names for coloring"),
+)
+def review_page_spans(
+    dataset: str,
+    input_dataset: str,
+    labels: LabelsType = [],
+    image_prefix: str = None,
+    fetch_media: bool = False,
+    sessions: List[str] = [],
+) -> RecipeSettingsType:
+    """
+    Review input text span annotations and annotate with manual spans to create
+    final, adjudicated annotations. Loads and displays input text span
+    annotations.
+    """
+    # Load annotations
+    DB = connect()
+    if input_dataset not in DB:
+        raise RecipeError(f"Can't find input dataset '{input_dataset}' in database")
+    annotations = DB.get_dataset_examples(input_dataset)
+
+    blocks = [
+        {
+            "view_id": "html",
+            "html_template": "<img src='{{ image }}' width='500'>",
+        },
+        {"view_id": "spans_manual", "labels": labels},
+    ]
+
+    def before_db(examples):
+        """
+        Modifies annotated examples before saving to the database:
+            * Remove image spans & tokens (unneeded fields)
+            * Reset image to (full) image path if image fetched
+        """
+        for example in examples:
+            # remove image spans
+            del example["image_spans"]
+            # remove tokens
+            del example["tokens"]
+            if fetch_media:
+                # reset image to path
+                example = add_image(example, image_prefix=image_prefix)
+        return examples
+
+    # Set label colors
+    label_colors = PRODIGY_COMMON_CONFIG["custom_theme"]["labels"].copy()
+    if sessions:
+        # Add session-label colors
+        for i, session in enumerate(sessions):
+            session_color = PALETTE[i % len(PALETTE)]
+            for label in labels:
+                label_colors[f"{session}: {label}"] = session_color
+
+    # copy the common config options and add blocks and labels
+    config = deepcopy(PRODIGY_COMMON_CONFIG)
+    config.update(
+        {
+            "blocks": blocks,
+            "ner_manual_highlight_chars": True,
+            "global_css_dir": CURRENT_DIR,
+            "custom_theme": {"labels": label_colors},
+        }
+    )
+
+    return {
+        "dataset": dataset,
+        "view_id": "blocks",
+        "stream": get_review_stream(
+            annotations, image_prefix=image_prefix, fetch_media=fetch_media
+        ),
+        "before_db": before_db,
+        "config": config,
+    }

From 93db63f79eb394108517355d415015500735c52e Mon Sep 17 00:00:00 2001
From: Laure Thompson <602628+laurejt@users.noreply.github.com>
Date: Wed, 23 Oct 2024 16:23:38 -0400
Subject: [PATCH 8/9] Update src/corppa/poetry_detection/annotation/recipe.py

Co-authored-by: Rebecca Sutton Koeser <rlskoeser@users.noreply.github.com>
---
 src/corppa/poetry_detection/annotation/recipe.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/corppa/poetry_detection/annotation/recipe.py b/src/corppa/poetry_detection/annotation/recipe.py
index cc6db10..f606a9a 100644
--- a/src/corppa/poetry_detection/annotation/recipe.py
+++ b/src/corppa/poetry_detection/annotation/recipe.py
@@ -283,6 +283,7 @@ def create_review_example(self, versions: List[TaskType]) -> TaskType:
         for version in versions:
             session_id = version[SESSION_ID_ATTR]
             # Assume: session name does not contain -
+            # full session name includes the dataset id; split to get the session name without dataset id
             session_name = session_id.rsplit("-", maxsplit=1)[1]
             if session_id not in session_counts:
                 session_counts[session_id] = 1

From a0780c0528e970d2f56364902225a3a09f66a256 Mon Sep 17 00:00:00 2001
From: Laure Thompson <602628+laurejt@users.noreply.github.com>
Date: Wed, 23 Oct 2024 16:33:44 -0400
Subject: [PATCH 9/9] Rename remove_images to remove_image_data

---
 src/corppa/poetry_detection/annotation/recipe.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/corppa/poetry_detection/annotation/recipe.py b/src/corppa/poetry_detection/annotation/recipe.py
index f606a9a..b23beb7 100644
--- a/src/corppa/poetry_detection/annotation/recipe.py
+++ b/src/corppa/poetry_detection/annotation/recipe.py
@@ -94,7 +94,7 @@ def add_images(examples: StreamType, image_prefix: Optional[str] = None) -> Stre
         yield add_image(example, image_prefix=image_prefix)
 
 
-def remove_images(
+def remove_image_data(
     examples: Iterable[TaskType], image_prefix: Optional[str] = None
 ) -> List[TaskType]:
     """
@@ -172,7 +172,9 @@ def annotate_text_and_image(
     }
 
     if fetch_media:
-        components["before_db"] = lambda x: remove_images(x, image_prefix=image_prefix)
+        components["before_db"] = lambda x: remove_image_data(
+            x, image_prefix=image_prefix
+        )
 
     return components
 
@@ -235,7 +237,9 @@ def annotate_page_text(
     }
 
     if fetch_media:
-        components["before_db"] = lambda x: remove_images(x, image_prefix=image_prefix)
+        components["before_db"] = lambda x: remove_image_data(
+            x, image_prefix=image_prefix
+        )
 
     return components