From c1b55192e3ad829d63ef64beaaa4329fa6cd9994 Mon Sep 17 00:00:00 2001 From: Laure Thompson <602628+laurejt@users.noreply.github.com> Date: Tue, 22 Oct 2024 10:41:49 -0400 Subject: [PATCH 1/9] Added basic recipe logging --- src/corppa/poetry_detection/annotation/recipe.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/corppa/poetry_detection/annotation/recipe.py b/src/corppa/poetry_detection/annotation/recipe.py index abd35a9..e6f2ddd 100644 --- a/src/corppa/poetry_detection/annotation/recipe.py +++ b/src/corppa/poetry_detection/annotation/recipe.py @@ -18,6 +18,7 @@ import spacy from prodigy.components.loaders import JSONL from prodigy.core import Arg, recipe +from prodigy.util import log #: reference to current directory, for use as Prodigy CSS directory CURRENT_DIR = Path(__file__).parent.absolute() @@ -86,7 +87,7 @@ def annotate_text_and_image( """Annotate text and image side by side: allows adding manual spans to both image and text. Intended for page-level annotation. """ - + log("RECIPE: Starting recipe annotate_text_and_image", locals()) stream = JSONL(source) # load jsonlines into stream # tokenize for span annotation and add image prefix tokenized_stream = tokenize_stream(stream, image_prefix) @@ -136,7 +137,7 @@ def annotate_page_text( with text for reference only (image cannot be annotated). Intended for page-level annotation. """ - + log("RECIPE: Starting recipe annotate_page_text", locals()) stream = JSONL(source) # load jsonlines into stream # tokenize for span annotation and add image prefix tokenized_stream = tokenize_stream(stream, image_prefix) From 1c12cd93d9051d62f44d26ee0da8c2684ae62bdb Mon Sep 17 00:00:00 2001 From: Laure Thompson <602628+laurejt@users.noreply.github.com> Date: Tue, 22 Oct 2024 12:25:29 -0400 Subject: [PATCH 2/9] Modify recipes to use Prodigy's get_label utility --- .../poetry_detection/annotation/recipe.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/corppa/poetry_detection/annotation/recipe.py b/src/corppa/poetry_detection/annotation/recipe.py index e6f2ddd..635eacd 100644 --- a/src/corppa/poetry_detection/annotation/recipe.py +++ b/src/corppa/poetry_detection/annotation/recipe.py @@ -18,7 +18,7 @@ import spacy from prodigy.components.loaders import JSONL from prodigy.core import Arg, recipe -from prodigy.util import log +from prodigy.util import get_labels, log #: reference to current directory, for use as Prodigy CSS directory CURRENT_DIR = Path(__file__).parent.absolute() @@ -78,7 +78,11 @@ def tokenize_stream(stream, image_prefix=None): @recipe( "annotate_text_and_image", dataset=Arg(help="path to input dataset"), - labels=Arg("--label", "-l", help="Comma-separated label(s)"), + label=Arg( + "--label", + "-l", + help="Comma-separated label(s) to annotate or text file with one label per line", + ), image_prefix=Arg("--image-prefix", "-i", help="Base URL for images"), ) def annotate_text_and_image( @@ -93,7 +97,7 @@ def annotate_text_and_image( tokenized_stream = tokenize_stream(stream, image_prefix) # split labels by commas and strip any whitespace - label_list = [label.strip() for label in labels.split(",")] + label_list = get_labels(label) blocks = [ { @@ -127,7 +131,11 @@ def annotate_text_and_image( @recipe( "annotate_page_text", dataset=Arg(help="path to input dataset"), - labels=Arg("--label", "-l", help="Comma-separated label(s)"), + label=Arg( + "--label", + "-l", + help="Comma-separated label(s) to annotate or text file with one label per line", + ), image_prefix=Arg("--image-prefix", "-i", help="Base URL for images"), ) def annotate_page_text( @@ -143,7 +151,7 @@ def annotate_page_text( tokenized_stream = tokenize_stream(stream, image_prefix) # split labels by commas and strip any whitespace - label_list = [label.strip() for label in labels.split(",")] + label_list = get_label(label) blocks = [ { From 6d19b2fb01b4cd064343cdb593b1f304a5d84dcb Mon Sep 17 00:00:00 2001 From: Laure Thompson <602628+laurejt@users.noreply.github.com> Date: Tue, 22 Oct 2024 12:28:13 -0400 Subject: [PATCH 3/9] Updated top-level documentation --- src/corppa/poetry_detection/annotation/recipe.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/corppa/poetry_detection/annotation/recipe.py b/src/corppa/poetry_detection/annotation/recipe.py index 635eacd..b1f1c71 100644 --- a/src/corppa/poetry_detection/annotation/recipe.py +++ b/src/corppa/poetry_detection/annotation/recipe.py @@ -1,8 +1,12 @@ """ -This module provides custom recipes for Prodigy annotation. They were -created with page-level text annotation in mind, and support annotating -text with a reference image displayed beside the text (`annotate_page_text`), -or annotating both text and image side by side (`annotate_text_and_image`). +This module provides custom recipes for Prodigy annotation. These were +created with page-level annotation in mind, and assume a page is associated +with both an image and text. + +Recipes: + * `annotate_page_text`: Annotate a page's text with the page's image + displayed side-by-side for reference. + * `annotate_text_and_image`: Annotate both a page's text and image side-by-side. Referenced images must be served out independently for display; the image url prefix for images should be specified when initializing the recipe. @@ -10,7 +14,7 @@ Example use: ``` prodigy annotate_page_text poetry_spans poetry_pages.jsonl --label POETRY,PROSODY -F ../corppa/poetry_detection/annotation/recipe.py --image-prefix http://localhost:8000/ -prodigy annotate_text_and_image poetry_text_image poetry_pages.jsonl --label POETRY -F ../corppa/poetry_detection/annotation/recipe.py --image-prefix http://localhost:8000/ +prodigy annotate_text_and_image poetry_text_image poetry_pages.jsonl -l POETRY -F ../corppa/poetry_detection/annotation/recipe.py --image-prefix http://localhost:8000/ """ from pathlib import Path From a961c1e31949a7b4e8837ff548c240db058374f5 Mon Sep 17 00:00:00 2001 From: Laure Thompson <602628+laurejt@users.noreply.github.com> Date: Tue, 22 Oct 2024 12:33:03 -0400 Subject: [PATCH 4/9] Fixed label(s) typos --- src/corppa/poetry_detection/annotation/recipe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/corppa/poetry_detection/annotation/recipe.py b/src/corppa/poetry_detection/annotation/recipe.py index b1f1c71..da2c76a 100644 --- a/src/corppa/poetry_detection/annotation/recipe.py +++ b/src/corppa/poetry_detection/annotation/recipe.py @@ -101,7 +101,7 @@ def annotate_text_and_image( tokenized_stream = tokenize_stream(stream, image_prefix) # split labels by commas and strip any whitespace - label_list = get_labels(label) + label_list = get_labels(labels) blocks = [ { @@ -155,7 +155,7 @@ def annotate_page_text( tokenized_stream = tokenize_stream(stream, image_prefix) # split labels by commas and strip any whitespace - label_list = get_label(label) + label_list = get_labels(labels) blocks = [ { From 6c5d9388663a7f0e27f0af90ebf57132af27ce48 Mon Sep 17 00:00:00 2001 From: Laure Thompson <602628+laurejt@users.noreply.github.com> Date: Tue, 22 Oct 2024 21:32:35 -0400 Subject: [PATCH 5/9] Update to use Stream component's get_stream --- src/corppa/poetry_detection/annotation/recipe.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/corppa/poetry_detection/annotation/recipe.py b/src/corppa/poetry_detection/annotation/recipe.py index da2c76a..7766422 100644 --- a/src/corppa/poetry_detection/annotation/recipe.py +++ b/src/corppa/poetry_detection/annotation/recipe.py @@ -20,9 +20,10 @@ from pathlib import Path import spacy -from prodigy.components.loaders import JSONL +from prodigy import log +from prodigy.components.stream import get_stream from prodigy.core import Arg, recipe -from prodigy.util import get_labels, log +from prodigy.util import get_labels #: reference to current directory, for use as Prodigy CSS directory CURRENT_DIR = Path(__file__).parent.absolute() @@ -82,7 +83,7 @@ def tokenize_stream(stream, image_prefix=None): @recipe( "annotate_text_and_image", dataset=Arg(help="path to input dataset"), - label=Arg( + labels=Arg( "--label", "-l", help="Comma-separated label(s) to annotate or text file with one label per line", @@ -96,7 +97,7 @@ def annotate_text_and_image( to both image and text. Intended for page-level annotation. """ log("RECIPE: Starting recipe annotate_text_and_image", locals()) - stream = JSONL(source) # load jsonlines into stream + stream = get_stream(source) # tokenize for span annotation and add image prefix tokenized_stream = tokenize_stream(stream, image_prefix) @@ -135,7 +136,7 @@ def annotate_text_and_image( @recipe( "annotate_page_text", dataset=Arg(help="path to input dataset"), - label=Arg( + labels=Arg( "--label", "-l", help="Comma-separated label(s) to annotate or text file with one label per line", @@ -150,7 +151,7 @@ def annotate_page_text( Intended for page-level annotation. """ log("RECIPE: Starting recipe annotate_page_text", locals()) - stream = JSONL(source) # load jsonlines into stream + stream = get_stream(source) # tokenize for span annotation and add image prefix tokenized_stream = tokenize_stream(stream, image_prefix) From 27a569db27c3c8156fca98be852d9c05e9d05681 Mon Sep 17 00:00:00 2001 From: Laure Thompson <602628+laurejt@users.noreply.github.com> Date: Tue, 22 Oct 2024 22:39:07 -0400 Subject: [PATCH 6/9] Update recipes to use stream.apply & fetch_media --- .../poetry_detection/annotation/recipe.py | 104 +++++++++++------- 1 file changed, 66 insertions(+), 38 deletions(-) diff --git a/src/corppa/poetry_detection/annotation/recipe.py b/src/corppa/poetry_detection/annotation/recipe.py index 7766422..4b61209 100644 --- a/src/corppa/poetry_detection/annotation/recipe.py +++ b/src/corppa/poetry_detection/annotation/recipe.py @@ -21,6 +21,8 @@ import spacy from prodigy import log +from prodigy.components.preprocess import add_tokens +from prodigy.components.preprocess import fetch_media as fetch_media_preprocessor from prodigy.components.stream import get_stream from prodigy.core import Arg, recipe from prodigy.util import get_labels @@ -48,36 +50,28 @@ } -def tokenize_stream(stream, image_prefix=None): - """Takes a stream of Prodigy tasks and tokenizes text for span annotation, - and optionally adds an image prefix URL to any image paths present. - Stream is expected to contain `text` and may contain image_path` and a `meta` - dictionary. Returns a generator of the stream. - """ +def add_image(task, image_prefix=None): + if image_prefix is None: + task["image"] = task["image_path"] + else: + path_pfx = image_prefix.rstrip("/") + task["image"] = f"{path_pfx}/{task['image_path']}" + return task - nlp = spacy.blank("en") # use blank spaCy model for tokenization - # ensure image prefix URL does not have a trailing slash - if image_prefix is None: - image_prefix = "" - image_prefix = image_prefix.rstrip("/") - - for task in stream: - if task.get("text"): - doc = nlp(task["text"]) - task["tokens"] = [ - { - "text": token.text, - "start": token.idx, - "end": token.idx + len(token.text), - "id": i, - } - for i, token in enumerate(doc) - ] - # add image prefix URL for serving out images - if "image_path" in task: - task["image"] = f"{image_prefix}/{task['image_path']}" - yield task +def add_images(examples, image_prefix=None): + for task in examples: + yield add_image(task, image_prefix=image_prefix) + + +def remove_images(examples, image_prefix=None): + for task in examples: + # If "image" is a base64 string and "image_path" is present in the task, + # remove the image data + if task["image"].startswith("data:") and "image_path" in task: + # Replace image with full image path + add_image(task, image_prefix=image_prefix) + return examples @recipe( @@ -89,17 +83,29 @@ def tokenize_stream(stream, image_prefix=None): help="Comma-separated label(s) to annotate or text file with one label per line", ), image_prefix=Arg("--image-prefix", "-i", help="Base URL for images"), + fetch_media=Arg( + "--fetch-media", "-FM", help="Load images from local paths or URLs" + ), ) def annotate_text_and_image( - dataset: str, source: str, labels: str, image_prefix: str = None + dataset: str, + source: str, + labels: str, + image_prefix: str = None, + fetch_media: bool = False, ): """Annotate text and image side by side: allows adding manual spans to both image and text. Intended for page-level annotation. """ log("RECIPE: Starting recipe annotate_text_and_image", locals()) stream = get_stream(source) - # tokenize for span annotation and add image prefix - tokenized_stream = tokenize_stream(stream, image_prefix) + # add tokens tokenize + stream.apply(add_tokens, nlp=spacy.blank("en"), stream=stream) + # add image prefix + stream.apply(add_images, image_prefix=image_prefix) + # optionally fetch media + if fetch_media: + stream.apply(fetch_media_preprocessor, ["image"], skip=True) # split labels by commas and strip any whitespace label_list = get_labels(labels) @@ -125,13 +131,18 @@ def annotate_text_and_image( } ) - return { + components = { "dataset": dataset, - "stream": tokenized_stream, + "stream": stream, "view_id": "blocks", "config": config, } + if fetch_media: + components["before_db"] = lambda x: remove_images(x, image_prefix=image_prefix) + + return components + @recipe( "annotate_page_text", @@ -142,9 +153,16 @@ def annotate_text_and_image( help="Comma-separated label(s) to annotate or text file with one label per line", ), image_prefix=Arg("--image-prefix", "-i", help="Base URL for images"), + fetch_media=Arg( + "--fetch-media", "-FM", help="Load images from local paths or URLs" + ), ) def annotate_page_text( - dataset: str, source: str, labels: str, image_prefix: str = None + dataset: str, + source: str, + labels: str, + image_prefix: str = None, + fetch_media: bool = False, ): """Annotate text with manual spans; displays an image side by side with text for reference only (image cannot be annotated). @@ -152,8 +170,13 @@ def annotate_page_text( """ log("RECIPE: Starting recipe annotate_page_text", locals()) stream = get_stream(source) - # tokenize for span annotation and add image prefix - tokenized_stream = tokenize_stream(stream, image_prefix) + # add tokens tokenize + stream.apply(add_tokens, nlp=spacy.blank("en"), stream=stream) + # add image prefix + stream.apply(add_images, stream, image_prefix=image_prefix) + # optionally fetch media + if fetch_media: + stream.apply(fetch_media_preprocessor, ["image"], skip=True) # split labels by commas and strip any whitespace label_list = get_labels(labels) @@ -175,9 +198,14 @@ def annotate_page_text( } ) - return { + components = { "dataset": dataset, - "stream": tokenized_stream, + "stream": stream, "view_id": "blocks", "config": config, } + + if fetch_media: + components["before_db"] = lambda x: remove_images(x, image_prefix=image_prefix) + + return components From eafd313fe16b9aa262e7de0fc45a1d9b8a9e0c78 Mon Sep 17 00:00:00 2001 From: Laure Thompson <602628+laurejt@users.noreply.github.com> Date: Wed, 23 Oct 2024 11:44:06 -0400 Subject: [PATCH 7/9] Added review recipe --- .../poetry_detection/annotation/recipe.py | 286 +++++++++++++++--- 1 file changed, 252 insertions(+), 34 deletions(-) diff --git a/src/corppa/poetry_detection/annotation/recipe.py b/src/corppa/poetry_detection/annotation/recipe.py index 4b61209..cc6db10 100644 --- a/src/corppa/poetry_detection/annotation/recipe.py +++ b/src/corppa/poetry_detection/annotation/recipe.py @@ -1,12 +1,14 @@ """ This module provides custom recipes for Prodigy annotation. These were -created with page-level annotation in mind, and assume a page is associated -with both an image and text. +created with page-level annotation in mind and assume a page is associated +with both text and an image. Each recipe displays a page's image and text +side-by-side. Recipes: - * `annotate_page_text`: Annotate a page's text with the page's image - displayed side-by-side for reference. + * `annotate_page_text`: Annotate a page's text. * `annotate_text_and_image`: Annotate both a page's text and image side-by-side. + * `review_page_spans`: Review existing page-level text annotations to produce + a final, adjudicated set of annotations. Referenced images must be served out independently for display; the image url prefix for images should be specified when initializing the recipe. @@ -14,18 +16,24 @@ Example use: ``` prodigy annotate_page_text poetry_spans poetry_pages.jsonl --label POETRY,PROSODY -F ../corppa/poetry_detection/annotation/recipe.py --image-prefix http://localhost:8000/ -prodigy annotate_text_and_image poetry_text_image poetry_pages.jsonl -l POETRY -F ../corppa/poetry_detection/annotation/recipe.py --image-prefix http://localhost:8000/ +prodigy annotate_text_and_image poetry_text_image poetry_pages.jsonl -l POETRY -F ../corppa/poetry_detection/annotation/recipe.py --image-prefix ../ppa-web-images -FM +prodigy review_page_spans adjudicate poetry_spans -l POETRY -F ../corppa/poetry_detection/annotation/recipe.py --image-prefix ../ppa-web-images -FM --sessions alice,bob """ +from collections import defaultdict +from copy import deepcopy from pathlib import Path +from typing import Dict, Iterable, List, Optional import spacy -from prodigy import log +from prodigy import log, set_hashes +from prodigy.components.db import connect from prodigy.components.preprocess import add_tokens from prodigy.components.preprocess import fetch_media as fetch_media_preprocessor from prodigy.components.stream import get_stream from prodigy.core import Arg, recipe -from prodigy.util import get_labels +from prodigy.types import LabelsType, RecipeSettingsType, StreamType, TaskType +from prodigy.util import INPUT_HASH_ATTR, SESSION_ID_ATTR, get_labels #: reference to current directory, for use as Prodigy CSS directory CURRENT_DIR = Path(__file__).parent.absolute() @@ -49,22 +57,51 @@ "global_css_dir": CURRENT_DIR, } +#: color palette for predefined session names +PALETTE = [ + "#c5bdf4", + "#ffd882", + "#d9fbad", + "#c2f2f6", + "#ffdaf9", + "#b5c6c9", + "#96e8ce", + "#ffd1b2", +] -def add_image(task, image_prefix=None): + +def add_image(example: TaskType, image_prefix: Optional[str] = None): + """ + Set an example's image field to its existing image_path with an optional prefix + + Note: Assumes filepaths use forward slash + """ if image_prefix is None: - task["image"] = task["image_path"] + example["image"] = example["image_path"] else: path_pfx = image_prefix.rstrip("/") - task["image"] = f"{path_pfx}/{task['image_path']}" - return task + example["image"] = f"{path_pfx}/{example['image_path']}" + return example -def add_images(examples, image_prefix=None): - for task in examples: - yield add_image(task, image_prefix=image_prefix) +def add_images(examples: StreamType, image_prefix: Optional[str] = None) -> StreamType: + """ + Set the image field for each example in the stream + + Calls: `add_image` + """ + for example in examples: + yield add_image(example, image_prefix=image_prefix) -def remove_images(examples, image_prefix=None): +def remove_images( + examples: Iterable[TaskType], image_prefix: Optional[str] = None +) -> List[TaskType]: + """ + For each example, replace base64 data URIs with image filepath or URL + + Calls: `add_image` + """ for task in examples: # If "image" is a base64 string and "image_path" is present in the task, # remove the image data @@ -90,10 +127,10 @@ def remove_images(examples, image_prefix=None): def annotate_text_and_image( dataset: str, source: str, - labels: str, + labels: LabelsType = [], image_prefix: str = None, fetch_media: bool = False, -): +) -> RecipeSettingsType: """Annotate text and image side by side: allows adding manual spans to both image and text. Intended for page-level annotation. """ @@ -105,25 +142,21 @@ def annotate_text_and_image( stream.apply(add_images, image_prefix=image_prefix) # optionally fetch media if fetch_media: - stream.apply(fetch_media_preprocessor, ["image"], skip=True) - - # split labels by commas and strip any whitespace - label_list = get_labels(labels) + stream.apply(fetch_media_preprocessor, ["image"]) blocks = [ { "view_id": "image_manual", - "labels": label_list, + "labels": labels, }, - {"view_id": "spans_manual", "labels": label_list}, + {"view_id": "spans_manual", "labels": labels}, ] # copy the common config options and add blocks and labels - config = PRODIGY_COMMON_CONFIG.copy() + config = deepcopy(PRODIGY_COMMON_CONFIG) config.update( { "blocks": blocks, - "labels": label_list, "ner_manual_highlight_chars": True, "image_manual_spans_key": "image_spans", # limit image selection to rectangle only, no polygon or freehand @@ -160,10 +193,10 @@ def annotate_text_and_image( def annotate_page_text( dataset: str, source: str, - labels: str, + labels: LabelsType = [], image_prefix: str = None, fetch_media: bool = False, -): +) -> RecipeSettingsType: """Annotate text with manual spans; displays an image side by side with text for reference only (image cannot be annotated). Intended for page-level annotation. @@ -176,24 +209,20 @@ def annotate_page_text( stream.apply(add_images, stream, image_prefix=image_prefix) # optionally fetch media if fetch_media: - stream.apply(fetch_media_preprocessor, ["image"], skip=True) - - # split labels by commas and strip any whitespace - label_list = get_labels(labels) + stream.apply(fetch_media_preprocessor, ["image"]) blocks = [ { "view_id": "html", "html_template": "", }, - {"view_id": "spans_manual", "labels": label_list}, + {"view_id": "spans_manual", "labels": labels}, ] # copy the common config options and add blocks and labels - config = PRODIGY_COMMON_CONFIG.copy() + config = deepcopy(PRODIGY_COMMON_CONFIG) config.update( { "blocks": blocks, - "labels": label_list, "ner_manual_highlight_chars": True, } ) @@ -209,3 +238,192 @@ def annotate_page_text( components["before_db"] = lambda x: remove_images(x, image_prefix=image_prefix) return components + + +class ReviewStream: + """ + Stream of review examples. This mostly exists to expose a __len__ to show + total progress in the web interface. + """ + + def __init__( + self, + data: Dict[int, List[TaskType]], + image_prefix: Optional[str] = None, + fetch_media: bool = False, + ) -> None: + """ + Initialize a review stream. + + data: Merged data, with examples grouped by input hash. + image_prefix: Image prefix for creating image (full) paths + fetch_media: Whether to fetch task images. + """ + self.n_examples = len(data) + self.data = self.get_data(data, image_prefix, fetch_media) + + def __len__(self) -> int: + return self.n_examples + + def __iter__(self) -> StreamType: + for example in self.data: + yield example + + def create_review_example(self, versions: List[TaskType]) -> TaskType: + """ + Create review example from several annotated versions. + """ + # TODO: Make sure that no unmerged version content is preserved. + review_example = deepcopy(versions[-1]) + # Merge spans + merged_spans = [] + session_counts = {} + sessions = [] + + for version in versions: + session_id = version[SESSION_ID_ATTR] + # Assume: session name does not contain - + session_name = session_id.rsplit("-", maxsplit=1)[1] + if session_id not in session_counts: + session_counts[session_id] = 1 + else: + session_name += f"-{session_counts[session_id]}" + session_counts[session_id] += 1 + sessions.append(session_name) + if "spans" not in version: + # Not sure when an annotated example would be missing a spans field + continue + for span in version["spans"]: + new_span = span.copy() + span_label = span["label"] + new_span["label"] = f"{session_name}: {span_label}" + merged_spans.append(new_span) + review_example["spans"] = merged_spans + review_example["sessions"] = sessions + return review_example + + def get_data( + self, + data: Dict[int, List[TaskType]], + image_prefix: Optional[str], + fetch_media: bool, + ) -> List[TaskType]: + """ + Build review examples from data. Add images to each example. + """ + examples = [] + for _, versions in data.items(): + review_example = self.create_review_example(versions) + review_example = add_image(review_example, image_prefix) + examples.append(review_example) + if fetch_media: + return fetch_media_preprocessor(examples, ["image"]) + return examples + + +def get_review_stream( + examples: Iterable[TaskType], + image_prefix: Optional[str] = None, + fetch_media: bool = False, +) -> StreamType: + # Group examples by input (page_id, text) + grouped_examples = defaultdict(list) + for example in examples: + # Reset hashes + example = set_hashes( + example, overwrite=True, input_keys=["id", "text"], task_keys=["spans"] + ) + input_hash = example[INPUT_HASH_ATTR] + grouped_examples[input_hash].append(example) + return ReviewStream( + grouped_examples, image_prefix=image_prefix, fetch_media=fetch_media + ) + + +@recipe( + "review_page_spans", + dataset=Arg(help="Dataset to save annotations to"), + input_dataset=Arg(help="Name of dataset to review"), + labels=Arg( + "--label", + "-l", + help="Comma-separated label(s) to annotate or text file with one label per line", + ), + image_prefix=Arg("--image-prefix", "-i", help="Base URL for images"), + fetch_media=Arg( + "--fetch-media", "-FM", help="Load images from local paths or URLs" + ), + sessions=Arg("--sessions", help="Comma-separated session names for coloring"), +) +def review_page_spans( + dataset: str, + input_dataset: str, + labels: LabelsType = [], + image_prefix: str = None, + fetch_media: bool = False, + sessions: List[str] = [], +) -> RecipeSettingsType: + """ + Review input text span annotations and annotate with manual spans to create + final, adjudicated annotations. Loads and displays input text span + annotations. + """ + # Load annotations + DB = connect() + if input_dataset not in DB: + raise RecipeError(f"Can't find input dataset '{input_dataset}' in database") + annotations = DB.get_dataset_examples(input_dataset) + + blocks = [ + { + "view_id": "html", + "html_template": "", + }, + {"view_id": "spans_manual", "labels": labels}, + ] + + def before_db(examples): + """ + Modifies annotated examples before saving to the database: + * Remove image spans & tokens (unneeded fields) + * Reset image to (full) image path if image fetched + """ + for example in examples: + # remove image spans + del example["image_spans"] + # remove tokens + del example["tokens"] + if fetch_media: + # reset image to path + example = add_image(example, image_prefix=image_prefix) + return examples + + # Set label colors + label_colors = PRODIGY_COMMON_CONFIG["custom_theme"]["labels"].copy() + if sessions: + # Add session-label colors + for i, session in enumerate(sessions): + session_color = PALETTE[i % len(PALETTE)] + for label in labels: + label_colors[f"{session}: {label}"] = session_color + + # copy the common config options and add blocks and labels + config = deepcopy(PRODIGY_COMMON_CONFIG) + config.update( + { + "blocks": blocks, + "ner_manual_highlight_chars": True, + "global_css_dir": CURRENT_DIR, + "custom_theme": {"labels": label_colors}, + } + ) + + return { + "dataset": dataset, + "view_id": "blocks", + "stream": get_review_stream( + annotations, image_prefix=image_prefix, fetch_media=fetch_media + ), + "before_db": before_db, + "config": config, + } From 93db63f79eb394108517355d415015500735c52e Mon Sep 17 00:00:00 2001 From: Laure Thompson <602628+laurejt@users.noreply.github.com> Date: Wed, 23 Oct 2024 16:23:38 -0400 Subject: [PATCH 8/9] Update src/corppa/poetry_detection/annotation/recipe.py Co-authored-by: Rebecca Sutton Koeser --- src/corppa/poetry_detection/annotation/recipe.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/corppa/poetry_detection/annotation/recipe.py b/src/corppa/poetry_detection/annotation/recipe.py index cc6db10..f606a9a 100644 --- a/src/corppa/poetry_detection/annotation/recipe.py +++ b/src/corppa/poetry_detection/annotation/recipe.py @@ -283,6 +283,7 @@ def create_review_example(self, versions: List[TaskType]) -> TaskType: for version in versions: session_id = version[SESSION_ID_ATTR] # Assume: session name does not contain - + # full session name includes the dataset id; split to get the session name without dataset id session_name = session_id.rsplit("-", maxsplit=1)[1] if session_id not in session_counts: session_counts[session_id] = 1 From a0780c0528e970d2f56364902225a3a09f66a256 Mon Sep 17 00:00:00 2001 From: Laure Thompson <602628+laurejt@users.noreply.github.com> Date: Wed, 23 Oct 2024 16:33:44 -0400 Subject: [PATCH 9/9] Rename remove_images to remove_image_data --- src/corppa/poetry_detection/annotation/recipe.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/corppa/poetry_detection/annotation/recipe.py b/src/corppa/poetry_detection/annotation/recipe.py index f606a9a..b23beb7 100644 --- a/src/corppa/poetry_detection/annotation/recipe.py +++ b/src/corppa/poetry_detection/annotation/recipe.py @@ -94,7 +94,7 @@ def add_images(examples: StreamType, image_prefix: Optional[str] = None) -> Stre yield add_image(example, image_prefix=image_prefix) -def remove_images( +def remove_image_data( examples: Iterable[TaskType], image_prefix: Optional[str] = None ) -> List[TaskType]: """ @@ -172,7 +172,9 @@ def annotate_text_and_image( } if fetch_media: - components["before_db"] = lambda x: remove_images(x, image_prefix=image_prefix) + components["before_db"] = lambda x: remove_image_data( + x, image_prefix=image_prefix + ) return components @@ -235,7 +237,9 @@ def annotate_page_text( } if fetch_media: - components["before_db"] = lambda x: remove_images(x, image_prefix=image_prefix) + components["before_db"] = lambda x: remove_image_data( + x, image_prefix=image_prefix + ) return components