diff --git a/.gitignore b/.gitignore index b7bbec8..de9a71e 100644 --- a/.gitignore +++ b/.gitignore @@ -139,6 +139,6 @@ data/**/ !data/README.md notebooks -!docs/documentation/notebooks +!docs/how-to-guides/notebooks scripts \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index bfc9be0..a9a1e66 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,30 +1,30 @@ -anonipy-0.0.8 (2024-06-17) +### anonipy-0.0.8 (2024-06-17) - Add automatic date format detection support to DateGenerator -anonipy-0.0.7 (2024-06-06) +### anonipy-0.0.7 (2024-06-06) - Upgrade gliner-spacy to have cleaner code - Add function to help manual post-anonymization replacement fixing -anonipy-0.0.6 (2024-05-31) +### anonipy-0.0.6 (2024-05-31) - Add GPU support and entity scores to EntityExtractor - Standardize the function naming in strategies -anonipy-0.0.5 (2024-05-29) +### anonipy-0.0.5 (2024-05-29) - Re-implement file reading methods + add unit tests - Expland the test environment on all OS -anonipy-0.0.4 (2024-05-27) +### anonipy-0.0.4 (2024-05-27) - Add unit tests - Fix the LANGUAGES constant - Refine the Entity implementation - Update documentation -anonipy-0.0.3 (2024-05-22) +### anonipy-0.0.3 (2024-05-22) - Add read_json function - Add write_json function @@ -33,11 +33,11 @@ anonipy-0.0.3 (2024-05-22) - Reduce the number of viable suggestions used to create a substitute in MaskLabelGenerator - Add the entity label to the replacements in strategies -anonipy-0.0.2 (2024-05-22) +### anonipy-0.0.2 (2024-05-22) - Add write_file function - Add blog to the documentation -anonipy-0.0.1 (2024-05-21) +### anonipy-0.0.1 (2024-05-21) - Initial release \ No newline at end of file diff --git a/README.md b/README.md index df50ca1..483efae 100644 --- a/README.md +++ b/README.md @@ -30,26 +30,24 @@ The anonipy package is a python package for data anonymization. It is designed to be simple to use and highly customizable, supporting different anonymization strategies. Powered by LLMs. 
-## ✅ Requirements +## Requirements Before starting the project make sure these requirements are available: - [python]. The python programming language (v3.8, v3.9, v3.10, v3.11). -## 💾 Install +## Install ```bash pip install anonipy ``` -## ⬆️ Upgrade +## Upgrade ```bash pip install anonipy --upgrade ``` -## 🔎 Example - -The details of the example can be found in the [Overview](https://eriknovak.github.io/anonipy/documentation/notebooks/00-overview.ipynb). +## Example ```python original_text = """\ @@ -77,14 +75,14 @@ Use the language detector to detect the language of the text: ```python from anonipy.utils.language_detector import LanguageDetector -lang_detector = LanguageDetector() -language = lang_detector(original_text) +language_detector = LanguageDetector() +language = language_detector(original_text) ``` Prepare the entity extractor and extract the personal infomation from the original text: ```python -from anonipy.anonymize.extractors import EntityExtractor +from anonipy.anonymize.extractors import NERExtractor # define the labels to be extracted and anonymized labels = [ @@ -94,14 +92,14 @@ labels = [ {"label": "date", "type": "date"}, ] -# language taken from the language detector -entity_extractor = EntityExtractor(labels, lang=language, score_th=0.5) +# initialize the NER extractor for the language and labels +extractor = NERExtractor(labels, lang=language, score_th=0.5) # extract the entities from the original text -doc, entities = entity_extractor(original_text) +doc, entities = extractor(original_text) # display the entities in the original text -entity_extractor.display(doc) +extractor.display(doc) ``` Use generators to create substitutes for the entities: @@ -123,9 +121,9 @@ def anonymization_mapping(text, entity): if entity.type == "string": return llm_generator.generate(entity, temperature=0.7) if entity.label == "date": - return date_generator.generate(entity, output_gen="middle_of_the_month") + return date_generator.generate(entity, 
output_gen="MIDDLE_OF_THE_MONTH") if entity.label == "date of birth": - return date_generator.generate(entity, output_gen="middle_of_the_year") + return date_generator.generate(entity, output_gen="MIDDLE_OF_THE_YEAR") if entity.label == "social security number": return number_generator.generate(entity) return "[REDACTED]" @@ -143,7 +141,7 @@ pseudo_strategy = PseudonymizationStrategy(mapping=anonymization_mapping) anonymized_text, replacements = pseudo_strategy.anonymize(original_text, entities) ``` -## 📖 Acknowledgements +## Acknowledgements [Anonipy](https://eriknovak.github.io/anonipy/) is developed by the [Department for Artificial Intelligence](http://ailab.ijs.si/) at the diff --git a/anonipy/__init__.py b/anonipy/__init__.py index ff20e30..84c6e67 100644 --- a/anonipy/__init__.py +++ b/anonipy/__init__.py @@ -1,25 +1,15 @@ -""" -anonipy - -The anonipy package provides utilities for data anonymization. - -Submodules ----------- -anonymize : - The package containing anonymization classes and functions. -utils : - The package containing utility classes and functions. -definitions : - The object definitions used within the package. -constants : - The constant values used to help with data anonymization. +"""`Anonipy` is a text anonymization package. +The `anonipy` package provides utilities for data anonymization. It provides +a set of modules and utilities for (1) identifying relevant information +that needs to be anonymized, (2) generating substitutes for the identified +information, and (3) strategies for anonymizing the identified information. -How to use the documentation ----------------------------- -Documentation is available in two forms: docstrings provided -with the code and a loose standing reference guide, available -from `the anonipy homepage `. +Modules: + anonymize: The module containing the anonymization submodules and utility. + utils: The module containing utility classes and functions. 
+ definitions: The module containing predefined types used across the package. + constants: The module containing the predefined constants used across the package. """ diff --git a/anonipy/anonymize/__init__.py b/anonipy/anonymize/__init__.py index 598d0c6..3bdb532 100644 --- a/anonipy/anonymize/__init__.py +++ b/anonipy/anonymize/__init__.py @@ -1,29 +1,23 @@ -""" -anonymize +"""Module containing the anonymization modules and utility. -The module provides a set of anonymization utilities. +The `anonymize` module provides a set of anonymization modules and utility, +including `extractors`, `generators`, and `strategies`. In addition, it provides +methods for anonymizing text based on a list of replacements. -Submodules ----------- -extractors : - The module containing the extractor classes -generators : - The module containing the generator classes -strategies : - The module containing the strategy classes -regex : - The module containing the regex patterns +Modules: + extractors: The module containing the extractor classes. + generators: The module containing the generator classes. + strategies: The module containing the strategy classes. -Methods -------- -anonymize() +Methods: + anonymize(text, replacements): + Anonymize the text based on the replacements. """ from . import extractors from . import generators from . import strategies -from . import regex from .helpers import anonymize -__all__ = ["extractors", "generators", "strategies", "regex", "anonymize"] +__all__ = ["extractors", "generators", "strategies", "anonymize"] diff --git a/anonipy/anonymize/extractors/__init__.py b/anonipy/anonymize/extractors/__init__.py index ec53b8a..58e054c 100644 --- a/anonipy/anonymize/extractors/__init__.py +++ b/anonipy/anonymize/extractors/__init__.py @@ -1,18 +1,19 @@ -""" -extractors +"""Module containing the `extractors`. -The module provides a set of extractors used in the library. 
+The `extractors` module provides a set of extractors used to identify relevant +information within a document. -Classes -------- -ExtractorInterface : - The class representing the extractor interface -EntityExtractor : - The class representing the entity extractor +Classes: + NERExtractor: The class representing the named entity recognition (NER) extractor. + PatternExtractor: The class representing the pattern extractor. + MultiExtractor: The class representing the multi extractor. """ from .interface import ExtractorInterface -from .entity_extractor import EntityExtractor +from .multi_extractor import MultiExtractor +from .ner_extractor import NERExtractor +from .pattern_extractor import PatternExtractor + -__all__ = ["ExtractorInterface", "EntityExtractor"] +__all__ = ["ExtractorInterface", "MultiExtractor", "NERExtractor", "PatternExtractor"] diff --git a/anonipy/anonymize/extractors/entity_extractor.py b/anonipy/anonymize/extractors/entity_extractor.py deleted file mode 100644 index dbca1b4..0000000 --- a/anonipy/anonymize/extractors/entity_extractor.py +++ /dev/null @@ -1,205 +0,0 @@ -import re -import importlib -from typing import List, Tuple -import warnings - -import torch -from spacy import displacy -from spacy.tokens import Doc - -from ..helpers import convert_spacy_to_entity -from ..regex import regex_map -from ...constants import LANGUAGES -from ...definitions import Entity - -from .interface import ExtractorInterface - - -class EntityExtractor(ExtractorInterface): - """The class representing the entity extractor - - Attributes - ---------- - labels : List[dict] - The list of labels to extract - lang : str - The language of the text to extract - score_th : float - The score threshold - use_gpu : bool - Whether to use GPU - pipeline : spacy pipeline - The spacy pipeline - - - Methods - ------- - __call__(self, text: str) - Extract the entities from the text - display(self, doc: Doc) - Display the entities in the text - - """ - - def __init__( - self, - 
labels: List[dict], - lang: LANGUAGES = LANGUAGES.ENGLISH, - score_th=0.5, - use_gpu=False, - *args, - **kwargs, - ): - """ - Parameters - ---------- - labels : List[dict] - The list of labels to extract - lang : str - The language of the text to extract - score_th : float - The score threshold. Entities with a score below this threshold will be ignored. Default: 0.5 - use_gpu : bool - Whether to use GPU. Default: False - - """ - - super().__init__(labels, *args, **kwargs) - self.lang = lang - self.score_th = score_th - self.use_gpu = use_gpu - self.labels = self._prepare_labels(labels) - self.pipeline = self._prepare_pipeline() - - def __call__(self, text: str, *args, **kwargs) -> Tuple[Doc, List[Entity]]: - """Extract the entities from the text - - Parameters - ---------- - text : str - The text to extract entities from - - Returns - ------- - Tuple[Doc, List[Entity]] - The spacy doc and the list of entities extracted - - """ - - doc = self.pipeline(text) - entities, doc.ents = self._prepare_entities(doc) - return doc, entities - - def display(self, doc: Doc): - """Display the entities in the text - - Parameters - ---------- - doc : Doc - The spacy doc to display - - """ - - options = {"colors": {l["label"]: "#5C7AEA" for l in self.labels}} - displacy.render(doc, style="ent", options=options) - - # =========================================== - # Private methods - # =========================================== - - def _prepare_labels(self, labels: List[dict]) -> List[dict]: - """Prepare the labels for the extractor - - Parameters - ---------- - labels : List[dict] - The list of labels to prepare - - Returns - ------- - List[dict] - The prepared labels - - """ - for l in labels: - if "regex" in l: - continue - regex = regex_map(l["type"]) - if regex is not None: - l["regex"] = regex - return labels - - def _create_gliner_config(self): - """Create the config for the GLINER model - - Returns - ------- - dict - The config for the GLINER model - - """ - - map_location = 
"cpu" - if self.use_gpu and not torch.cuda.is_available(): - return warnings.warn( - "The user requested GPU use, but not available GPU was found. Reverting back to CPU use." - ) - if self.use_gpu and torch.cuda.is_available(): - map_location = "cuda" - - return { - # the model is specialized for extracting PII data - "gliner_model": "urchade/gliner_multi_pii-v1", - "labels": [l["label"] for l in self.labels], - "threshold": self.score_th, - "chunk_size": 384, - "style": "ent", - "map_location": map_location, - } - - def _prepare_pipeline(self): - """Prepare the spacy pipeline - - Returns - ------- - spacy pipeline - The spacy pipeline - - """ - - # load the appropriate parser for the language - module_lang, class_lang = self.lang[0].lower(), self.lang[1].lower().title() - language_module = importlib.import_module(f"spacy.lang.{module_lang}") - language_class = getattr(language_module, class_lang) - # initialize the language parser - nlp = language_class() - nlp.add_pipe("sentencizer") - gliner_config = self._create_gliner_config() - nlp.add_pipe("gliner_spacy", config=gliner_config) - return nlp - - def _prepare_entities(self, doc: Doc): - """Prepares the anonipy and spacy entities - - Parameters - ---------- - doc : Doc - The spacy doc to prepare - - Returns - ------- - Tuple[List[Entity], List[Entity]] - The anonipy entities and the spacy entities - - - """ - - # TODO: make this part more generic - anoni_entities = [] - spacy_entities = [] - for e in doc.ents: - label = list(filter(lambda x: x["label"] == e.label_, self.labels))[0] - if re.match(label["regex"], e.text): - anoni_entities.append(convert_spacy_to_entity(e, **label)) - spacy_entities.append(e) - return anoni_entities, spacy_entities diff --git a/anonipy/anonymize/extractors/interface.py b/anonipy/anonymize/extractors/interface.py index 092f5fe..0becfe5 100644 --- a/anonipy/anonymize/extractors/interface.py +++ b/anonipy/anonymize/extractors/interface.py @@ -5,7 +5,7 @@ class ExtractorInterface: - 
"""The class representing the extractor interface""" + """The class representing the extractor interface.""" def __init__(self, labels: List[dict], *args, **kwargs): pass diff --git a/anonipy/anonymize/extractors/multi_extractor.py b/anonipy/anonymize/extractors/multi_extractor.py new file mode 100644 index 0000000..b7985c0 --- /dev/null +++ b/anonipy/anonymize/extractors/multi_extractor.py @@ -0,0 +1,166 @@ +from typing import List, Set, Tuple, Iterable + +import itertools + +from spacy import displacy +from spacy.tokens import Doc + +from ...definitions import Entity +from ...utils.colors import get_label_color + +from .interface import ExtractorInterface + + +# =============================================== +# Extractor class +# =============================================== + + +class MultiExtractor: + """The class representing the multi extractor. + + Examples: + >>> from anonipy.constants import LANGUAGES + >>> from anonipy.anonymize.extractors import NERExtractor, PatternExtractor, MultiExtractor + >>> extractors = [ + >>> NERExtractor(ner_labels, lang=LANGUAGES.ENGLISH), + >>> PatternExtractor(pattern_labels, lang=LANGUAGES.ENGLISH), + >>> ] + >>> extractor = MultiExtractor(extractors) + >>> extractor("John Doe is a 19 year old software engineer.") + [(Doc, [Entity]), (Doc, [Entity])], [Entity] + + Attributes: + extractors (List[ExtractorInterface]): + The list of extractors to use. + + Methods: + __call__(self, text): + Extract the entities fron the text using the provided extractors. + display(self, doc): + Display the entities extracted from the text document. + + """ + + def __init__(self, extractors: List[ExtractorInterface]): + """Initialize the multi extractor. 
+ + Examples: + >>> from anonipy.constants import LANGUAGES + >>> from anonipy.anonymize.extractors import NERExtractor, PatternExtractor, MultiExtractor + >>> extractors = [ + >>> NERExtractor(ner_labels, lang=LANGUAGES.ENGLISH), + >>> PatternExtractor(pattern_labels, lang=LANGUAGES.ENGLISH), + >>> ] + >>> extractor = MultiExtractor(extractors) + MultiExtractor() + + Args: + extractors: The list of extractors to use. + + """ + + self.extractors = extractors + + def __call__( + self, text: str + ) -> Tuple[List[Tuple[Doc, List[Entity]]], List[Entity]]: + """Extract the entities from the text using the provided extractors. + + Examples: + >>> extractor("John Doe is a 19 year old software engineer.") + [(Doc, [Entity]), (Doc, [Entity])], [Entity] + + Args: + text: The text to extract entities from. + + Returns: + The list of extractor outputs containing the tuple (spacy document, extracted entities). + The list of joint entities. + """ + + extractor_outputs = [e(text) for e in self.extractors] + joint_entities = self._merge_entities(extractor_outputs) + return extractor_outputs, joint_entities + + def display(self, doc: Doc, page: bool = False, jupyter: bool = None) -> str: + """Display the entities in the text. + + Examples: + >>> extractor_outputs, entities = extractor("John Doe is a 19 year old software engineer.") + >>> extractor.display(extractor_outputs[0][0]) + HTML + + Args: + doc: The spacy doc to display. + page: Whether to display the doc in a web browser. + jupyter: Whether to display the doc in a jupyter notebook. + + Returns: + The HTML representation of the document and the extracted entities. 
+ + """ + + labels = list( + itertools.chain.from_iterable([e.labels for e in self.extractors]) + ) + options = {"colors": {l["label"]: get_label_color(l["label"]) for l in labels}} + return displacy.render( + doc, style="ent", options=options, page=page, jupyter=jupyter + ) + + def _merge_entities( + self, extractor_outputs: List[Tuple[Doc, List[Entity]]] + ) -> List[Entity]: + """Merges the entities returned by the extractors. + + Args: + extractor_outputs: The list of (spacy document, entities) tuples, one per extractor. + + Returns: + The merged entities list; a single extractor's entities are returned unfiltered. + + """ + + if len(extractor_outputs) == 0: + return [] + if len(extractor_outputs) == 1: + return extractor_outputs[0][1] + + joint_entities = self._filter_entities( + list( + itertools.chain.from_iterable( + [entity[1] for entity in extractor_outputs] + ) + ) + ) + return joint_entities + + def _filter_entities(self, entities: Iterable[Entity]) -> List[Entity]: + """Filters the entities based on their start and end indices. + + Args: + entities: The entities to filter. + + Returns: + The filtered entities. 
+ + """ + + get_sort_key = lambda entity: ( + entity.end_index - entity.start_index, + -entity.start_index, + ) + sorted_entities = sorted(entities, key=get_sort_key, reverse=True) + result = [] + seen_tokens: Set[int] = set() + for entities in sorted_entities: + # Check for end - 1 here because boundaries are inclusive + if ( + entities.start_index not in seen_tokens + and entities.end_index - 1 not in seen_tokens + ): + result.append(entities) + seen_tokens.update(range(entities.start_index, entities.end_index)) + result = sorted(result, key=lambda entity: entity.start_index) + return result diff --git a/anonipy/anonymize/extractors/ner_extractor.py b/anonipy/anonymize/extractors/ner_extractor.py new file mode 100644 index 0000000..22927cc --- /dev/null +++ b/anonipy/anonymize/extractors/ner_extractor.py @@ -0,0 +1,267 @@ +import re +import importlib +from typing import List, Tuple +import warnings + +import torch +from spacy import displacy +from spacy.tokens import Doc, Span +from spacy.language import Language + +from ..helpers import convert_spacy_to_entity +from ...utils.regex import regex_mapping +from ...constants import LANGUAGES +from ...definitions import Entity +from ...utils.colors import get_label_color + +from .interface import ExtractorInterface + +# =============================================== +# Extractor class +# =============================================== + + +class NERExtractor(ExtractorInterface): + """The class representing the named entity recognition (NER) extractor. + + Examples: + >>> from anonipy.constants import LANGUAGES + >>> from anonipy.anonymize.extractors import NERExtractor + >>> labels = [{"label": "PERSON", "type": "string"}] + >>> extractor = NERExtractor(labels, lang=LANGUAGES.ENGLISH) + >>> extractor("John Doe is a 19 year old software engineer.") + Doc, [Entity] + + Attributes: + labels (List[dict]): The list of labels to extract. + lang (str): The language of the text to extract. 
+ score_th (float): The score threshold. + use_gpu (bool): Whether to use GPU. + gliner_model (str): The gliner model to use. + pipeline (Language): The spacy pipeline for extracting entities. + spacy_style (str): The style the entities should be stored in the spacy doc. + + Methods: + __call__(self, text): + Extract the entities from the text. + display(self, doc): + Display the entities in the text. + + """ + + def __init__( + self, + labels: List[dict], + lang: LANGUAGES = LANGUAGES.ENGLISH, + score_th: float = 0.5, + use_gpu: bool = False, + gliner_model: str = "urchade/gliner_multi_pii-v1", + spacy_style: str = "ent", + *args, + **kwargs, + ): + """Initialize the named entity recognition (NER) extractor. + + Examples: + >>> from anonipy.constants import LANGUAGES + >>> from anonipy.anonymize.extractors import NERExtractor + >>> labels = [{"label": "PERSON", "type": "string"}] + >>> extractor = NERExtractor(labels, lang=LANGUAGES.ENGLISH) + NERExtractor() + + Args: + labels: The list of labels to extract. + lang: The language of the text to extract. + score_th: The score threshold. Entities with a score below this threshold will be ignored. + use_gpu: Whether to use GPU. + gliner_model: The gliner model to use to identify the entities. + spacy_style: The style the entities should be stored in the spacy doc. Options: `ent` or `span`. + + """ + + super().__init__(labels, *args, **kwargs) + self.lang = lang + self.score_th = score_th + self.use_gpu = use_gpu + self.gliner_model = gliner_model + self.spacy_style = spacy_style + self.labels = self._prepare_labels(labels) + self.pipeline = self._prepare_pipeline() + + def __call__(self, text: str, *args, **kwargs) -> Tuple[Doc, List[Entity]]: + """Extract the entities from the text. + + Examples: + >>> extractor("John Doe is a 19 year old software engineer.") + Doc, [Entity] + + Args: + text: The text to extract entities from. + + Returns: + The spacy document. + The list of extracted entities. 
+ + """ + + doc = self.pipeline(text) + anoni_entities, spacy_entities = self._prepare_entities(doc) + self._set_spacy_fields(doc, spacy_entities) + return doc, anoni_entities + + def display(self, doc: Doc, page: bool = False, jupyter: bool = None) -> str: + """Display the entities in the text. + + Examples: + >>> doc, entities = extractor("John Doe is a 19 year old software engineer.") + >>> extractor.display(doc) + HTML + + Args: + doc: The spacy doc to display. + page: Whether to display the doc in a web browser. + jupyter: Whether to display the doc in a jupyter notebook. + + Returns: + The HTML representation of the document and the extracted entities. + + """ + + options = { + "colors": {l["label"]: get_label_color(l["label"]) for l in self.labels} + } + return displacy.render( + doc, style=self.spacy_style, options=options, page=page, jupyter=jupyter + ) + + # =========================================== + # Private methods + # =========================================== + + def _prepare_labels(self, labels: List[dict]) -> List[dict]: + """Prepare the labels for the extractor. + + The provided labels are enriched with the corresponding regex + definitions, if the `regex` key was not provided. + + Args: + labels: The list of labels to prepare. + + Returns: + The enriched labels. + + """ + for l in labels: + if "regex" in l: + continue + regex = regex_mapping[l["type"]] + if regex is not None: + l["regex"] = regex + return labels + + def _create_gliner_config(self) -> dict: + """Create the config for the GLINER model. + + Returns: + The configuration dictionary for the GLINER model; targets the CPU (after a warning) when a GPU is requested but unavailable. + + """ + + map_location = "cpu" + if self.use_gpu and not torch.cuda.is_available(): + warnings.warn( + "The user requested GPU use, but not available GPU was found. Reverting back to CPU use." 
+ ) + if self.use_gpu and torch.cuda.is_available(): + map_location = "cuda" + + return { + # the model is specialized for extracting PII data + "gliner_model": self.gliner_model, + "labels": [l["label"] for l in self.labels], + "threshold": self.score_th, + "chunk_size": 384, + "style": self.spacy_style, + "map_location": map_location, + } + + def _prepare_pipeline(self) -> Language: + """Prepare the spacy pipeline. + + Prepares the pipeline for processing the text in the corresponding + provided language. + + Returns: + The spacy text processing and extraction pipeline. + + """ + + # load the appropriate parser for the language + module_lang, class_lang = self.lang[0].lower(), self.lang[1].lower().title() + language_module = importlib.import_module(f"spacy.lang.{module_lang}") + language_class = getattr(language_module, class_lang) + # initialize the language parser + nlp = language_class() + nlp.add_pipe("sentencizer") + gliner_config = self._create_gliner_config() + nlp.add_pipe("gliner_spacy", config=gliner_config) + return nlp + + def _prepare_entities(self, doc: Doc) -> Tuple[List[Entity], List[Span]]: + """Prepares the anonipy and spacy entities. + + Args: + doc: The spacy doc to prepare. + + Returns: + The list of anonipy entities. + The list of spacy entities. + + """ + + # TODO: make this part more generic + anoni_entities = [] + spacy_entities = [] + for s in self._get_spacy_fields(doc): + label = list(filter(lambda x: x["label"] == s.label_, self.labels))[0] + if re.match(label["regex"], s.text): + anoni_entities.append(convert_spacy_to_entity(s, **label)) + spacy_entities.append(s) + return anoni_entities, spacy_entities + + def _get_spacy_fields(self, doc: Doc) -> List[Span]: + """Get the spacy doc entity spans. + + Args: + doc: The spacy doc to get the entity spans from. + + Returns: + The list of Spans from the spacy doc. 
+ + """ + + if self.spacy_style == "ent": + return doc.ents + elif self.spacy_style == "span": + return doc.spans["sc"] + else: + raise ValueError(f"Invalid spacy style: {self.spacy_style}") + + def _set_spacy_fields(self, doc: Doc, entities: List[Span]) -> None: + """Set the spacy doc entity spans. + + Args: + doc: The spacy doc to set the entity spans. + entities: The entity spans to set. + + Returns: + None + + """ + + if self.spacy_style == "ent": + doc.ents = entities + elif self.spacy_style == "span": + doc.spans["sc"] = entities + else: + raise ValueError(f"Invalid spacy style: {self.spacy_style}") diff --git a/anonipy/anonymize/extractors/pattern_extractor.py b/anonipy/anonymize/extractors/pattern_extractor.py new file mode 100644 index 0000000..469d143 --- /dev/null +++ b/anonipy/anonymize/extractors/pattern_extractor.py @@ -0,0 +1,291 @@ +import re + +import importlib +from typing import List, Tuple, Optional, Callable + +from spacy import displacy, util +from spacy.tokens import Doc, Span +from spacy.language import Language +from spacy.matcher import Matcher + +from ..helpers import convert_spacy_to_entity +from ...constants import LANGUAGES +from ...definitions import Entity +from ...utils.colors import get_label_color + +from .interface import ExtractorInterface + + +# =============================================== +# Extractor class +# =============================================== + + +class PatternExtractor(ExtractorInterface): + """The class representing the pattern extractor. + + Examples: + >>> from anonipy.constants import LANGUAGES + >>> from anonipy.anonymize.extractors import PatternExtractor + >>> labels = [{"label": "PERSON", "regex": "([A-Z][a-z]+ [A-Z][a-z]+)"}] + >>> extractor = PatternExtractor(labels, lang=LANGUAGES.ENGLISH) + >>> extractor("John Doe is a 19 year old software engineer.") + Doc, [Entity] + + Attributes: + labels (List[dict]): The list of labels and patterns to extract. 
+ lang (str): The language of the text to extract. + pipeline (Language): The spacy pipeline for extracting entities. + token_matchers (Matcher): The spacy token pattern matcher. + global_matchers (function): The global pattern matcher. + + Methods: + __call__(self, text): + Extract the entities from the text. + display(self, doc): + Display the entities in the text. + + """ + + def __init__( + self, + labels: List[dict], + lang: LANGUAGES = LANGUAGES.ENGLISH, + spacy_style: str = "ent", + *args, + **kwargs, + ): + """Initialize the pattern extractor. + + Examples: + >>> from anonipy.constants import LANGUAGES + >>> from anonipy.anonymize.extractors import PatternExtractor + >>> labels = [{"label": "PERSON", "regex": "([A-Z][a-z]+ [A-Z][a-z]+)"}] + >>> extractor = PatternExtractor(labels, lang=LANGUAGES.ENGLISH) + PatternExtractor() + + Args: + labels: The list of labels and patterns to extract. + lang: The language of the text to extract. + spacy_style: The style the entities should be stored in the spacy doc. Options: `ent` or `span`. + + """ + + super().__init__(labels, *args, **kwargs) + self.lang = lang + self.labels = labels + self.spacy_style = spacy_style + self.pipeline = self._prepare_pipeline() + self.token_matchers = self._prepare_token_matchers() + self.global_matchers = self._prepare_global_matchers() + + def __call__(self, text: str, *args, **kwargs) -> Tuple[Doc, List[Entity]]: + """Extract the entities from the text. + + Examples: + >>> extractor("John Doe is a 19 year old software engineer.") + Doc, [Entity] + + Args: + text: The text to extract entities from. + + Returns: + The spacy document. + The list of extracted entities. 
+ + """ + + doc = self.pipeline(text) + self.token_matchers(doc) if self.token_matchers else None + self.global_matchers(doc) if self.global_matchers else None + anoni_entities, spacy_entities = self._prepare_entities(doc) + self._set_doc_entity_spans(doc, spacy_entities) + return doc, anoni_entities + + def display(self, doc: Doc, page: bool = False, jupyter: bool = None) -> str: + """Display the entities in the text. + + Examples: + >>> doc, entities = extractor("John Doe is a 19 year old software engineer.") + >>> extractor.display(doc) + HTML + + Args: + doc: The spacy doc to display. + page: Whether to display the doc in a web browser. + jupyter: Whether to display the doc in a jupyter notebook. + + Returns: + The HTML representation of the document and the extracted entities. + + """ + + options = { + "colors": {l["label"]: get_label_color(l["label"]) for l in self.labels} + } + return displacy.render( + doc, style=self.spacy_style, options=options, page=page, jupyter=jupyter + ) + + # =========================================== + # Private methods + # =========================================== + + def _prepare_pipeline(self) -> Language: + """Prepare the spacy pipeline. + + Prepares the pipeline for processing the text in the corresponding + provided language. + + Returns: + The spacy text processing and extraction pipeline. + + """ + + # load the appropriate parser for the language + module_lang, class_lang = self.lang[0].lower(), self.lang[1].lower().title() + language_module = importlib.import_module(f"spacy.lang.{module_lang}") + language_class = getattr(language_module, class_lang) + # initialize the language parser + nlp = language_class() + nlp.add_pipe("sentencizer") + return nlp + + def _prepare_token_matchers(self) -> Optional[Matcher]: + """Prepare the token pattern matchers. + + Prepares the token pattern matchers for the provided labels. + + Returns: + The spacy matcher object or None if no relevant labels are provided. 
+ + """ + + relevant_labels = list(filter(lambda l: "pattern" in l, self.labels)) + if len(relevant_labels) == 0: + return None + + matcher = Matcher(self.pipeline.vocab) + for label in relevant_labels: + if isinstance(label["pattern"], list): + on_match = self._create_add_event_ent(label["label"]) + matcher.add(label["label"], label["pattern"], on_match=on_match) + return matcher + + def _prepare_global_matchers(self) -> Optional[Callable]: + """Prepares the global pattern matchers. + + Prepares the global pattern matchers for the provided labels. + + Returns: + The function used to match the patterns or None if no relevant labels are provided. + + """ + + relevant_labels = list(filter(lambda l: "regex" in l, self.labels)) + if len(relevant_labels) == 0: + return None + + def global_matchers(doc: Doc) -> None: + for label in relevant_labels: + for match in re.finditer(label["regex"], doc.text): + # define the entity span + start, end = match.span(1) + entity = doc.char_span(start, end, label=label["label"]) + entity._.score = 1.0 + # add the entity to the previous entity list + prev_entities = self._get_doc_entity_spans(doc) + if self.spacy_style == "ent": + prev_entities = util.filter_spans(prev_entities + (entity,)) + elif self.spacy_style == "span": + prev_entities.append(entity) + else: + raise ValueError(f"Invalid spacy style: {self.spacy_style}") + self._set_doc_entity_spans(doc, prev_entities) + + return global_matchers + + def _prepare_entities(self, doc: Doc) -> Tuple[List[Entity], List[Span]]: + """Prepares the anonipy and spacy entities. + + Args: + doc: The spacy doc to prepare. + + Returns: + The list of anonipy entities. + The list of spacy entities. 
+ + """ + + # TODO: make this part more generic + anoni_entities = [] + spacy_entities = [] + for e in self._get_doc_entity_spans(doc): + label = list(filter(lambda x: x["label"] == e.label_, self.labels))[0] + anoni_entities.append(convert_spacy_to_entity(e, **label)) + spacy_entities.append(e) + return anoni_entities, spacy_entities + + def _create_add_event_ent(self, label: str) -> Callable: + """Create the add event entity function + + Args: + label: The identified label entity. + + Returns: + The function used to add the entity to the spacy doc. + + """ + + def add_event_ent(matcher, doc, i, matches): + # define the entity span + _, start, end = matches[i] + entity = Span(doc, start, end, label=label) + entity._.score = 1.0 + # add the entity to the previous entity list + prev_entities = self._get_doc_entity_spans(doc) + if self.spacy_style == "ent": + prev_entities = util.filter_spans(prev_entities + (entity,)) + elif self.spacy_style == "span": + prev_entities.append(entity) + else: + raise ValueError(f"Invalid spacy style: {self.spacy_style}") + self._set_doc_entity_spans(doc, prev_entities) + + return add_event_ent + + def _get_doc_entity_spans(self, doc: Doc) -> List[Span]: + """Get the spacy doc entity spans. + + Args: + doc: The spacy doc to get the entity spans from. + + Returns: + The list of entity spans. + + """ + + if self.spacy_style == "ent": + return doc.ents + elif self.spacy_style == "span": + if "sc" not in doc.spans: + doc.spans["sc"] = [] + return doc.spans["sc"] + else: + raise ValueError(f"Invalid spacy style: {self.spacy_style}") + + def _set_doc_entity_spans(self, doc: Doc, entities: List[Span]) -> None: + """Set the spacy doc entity spans. + + Args: + doc: The spacy doc to set the entity spans. + entities: The entity spans to assign the doc. 
+ + """ + + if self.spacy_style == "ent": + doc.ents = entities + elif self.spacy_style == "span": + doc.spans["sc"] = entities + else: + raise ValueError(f"Invalid spacy style: {self.spacy_style}") diff --git a/anonipy/anonymize/generators/__init__.py b/anonipy/anonymize/generators/__init__.py index b0b8b74..8c93bd7 100644 --- a/anonipy/anonymize/generators/__init__.py +++ b/anonipy/anonymize/generators/__init__.py @@ -1,20 +1,13 @@ -""" -generators +"""Module containing the `generators`. -The module provides a set of generators used in the library. +The `generators` module provides a set of generators used to generate data +substitutes. -Classes -------- -GeneratorInterface : - The class representing the generator interface -LLMLabelGenerator : - The class representing the LLM label generator -MaskLabelGenerator : - The class representing the mask label generator -NumberGenerator : - The class representing the number generator -DateGenerator : - The class representing the date generator +Classes: + LLMLabelGenerator: The class representing the label generator utilizing LLMs. + MaskLabelGenerator: The class representing the label generator utilizing token masking. + NumberGenerator: The class representing the number generator. + DateGenerator: The class representing the date generator. 
""" @@ -25,9 +18,9 @@ from .date_generator import DateGenerator __all__ = [ - "GeneratorInterface", "LLMLabelGenerator", "MaskLabelGenerator", "NumberGenerator", "DateGenerator", + "GeneratorInterface", ] diff --git a/anonipy/anonymize/generators/date_generator.py b/anonipy/anonymize/generators/date_generator.py index 7b36b69..bc88822 100644 --- a/anonipy/anonymize/generators/date_generator.py +++ b/anonipy/anonymize/generators/date_generator.py @@ -5,144 +5,128 @@ from ...utils.datetime import detect_datetime_format from .interface import GeneratorInterface from ...definitions import Entity +from ...constants import DATE_TRANSFORM_VARIANTS # ===================================== # Operation functions # ===================================== -def first_day_of_month(day: datetime.datetime, *args, **kwargs): - """Returns the first day of the month of the given date +def first_day_of_month(day: datetime.datetime, *args, **kwargs) -> datetime.datetime: + """Returns the first day of the month of the given date. - Parameters - ---------- - day : datetime.datetime - The date to get the first day of the month from + Args: + day: The date to get the first day of the month from. - Returns - ------- - datetime.datetime - The first day of the month of the given date + Returns: + The first day of the month of the given date. """ return day.replace(day=1) -def last_day_of_month(day: datetime.datetime, *args, **kwargs): - """Returns the last day of the month of the given date +def last_day_of_month(day: datetime.datetime, *args, **kwargs) -> datetime.datetime: + """Returns the last day of the month of the given date. - Parameters - ---------- - day : datetime.datetime - The date to get the last day of the month from + Args: + day: The date to get the last day of the month from. - Returns - ------- - datetime.datetime - The last day of the month of the given date + Returns: + The last day of the month of the given date. 
""" next_month = day.replace(day=28) + datetime.timedelta(days=4) return next_month - datetime.timedelta(days=next_month.day) -def middle_of_the_month(day: datetime.datetime, *args, **kwargs): - """Returns the middle day of the month of the given date +def middle_of_the_month(day: datetime.datetime, *args, **kwargs) -> datetime.datetime: + """Returns the middle day of the month of the given date. - Parameters - ---------- - day : datetime.datetime - The date to get the middle day of the month from + Args: + day: The date to get the middle day of the month from. - Returns - ------- - datetime.datetime - The middle day of the month of the given date + Returns: + The middle day of the month of the given date. """ return day.replace(day=15) -def middle_of_the_year(day: datetime.datetime, *args, **kwargs): - """Returns the middle day of the year of the given date +def middle_of_the_year(day: datetime.datetime, *args, **kwargs) -> datetime.datetime: + """Returns the middle day of the year of the given date. - Parameters - ---------- - day : datetime.datetime - The date to get the middle day of the year from + Args: + day: The date to get the middle day of the year from. - Returns - ------- - datetime.datetime - The middle day of the year of the given date + Returns: + The middle day of the year of the given date. """ return day.replace(month=7, day=1) -def random_date(day: datetime.datetime, sigma: int = 30, *args, **kwargs): - """Returns a random date within the given date range +def random_date( + day: datetime.datetime, sigma: int = 30, *args, **kwargs +) -> datetime.datetime: + """Returns a random date within the given date range. The function returns a date within the range [day - sigma, day + sigma]. - Parameters - ---------- - day : datetime.datetime - The date to get the random date from - sigma : int - The range of the random date in days. Default: 30 + Args: + day: The date to get the random date from. + sigma: The range of the random date in days. 
- Returns - ------- - datetime.datetime - The random date within the given date range + Returns: + The random date within the given date range. """ delta = random.randint(-sigma, sigma) return day + datetime.timedelta(days=delta) -operations = { - "first_day_of_the_month": first_day_of_month, - "last_day_of_the_month": last_day_of_month, - "middle_of_the_month": middle_of_the_month, - "middle_of_the_year": middle_of_the_year, - "random": random_date, +DATE_VARIANTS_MAPPING = { + DATE_TRANSFORM_VARIANTS.FIRST_DAY_OF_THE_MONTH: first_day_of_month, + DATE_TRANSFORM_VARIANTS.LAST_DAY_OF_THE_MONTH: last_day_of_month, + DATE_TRANSFORM_VARIANTS.MIDDLE_OF_THE_MONTH: middle_of_the_month, + DATE_TRANSFORM_VARIANTS.MIDDLE_OF_THE_YEAR: middle_of_the_year, + DATE_TRANSFORM_VARIANTS.RANDOM: random_date, } - # ===================================== # Main class # ===================================== class DateGenerator(GeneratorInterface): - """The class representing the date generator + """The class representing the date generator. + + Examples: + >>> from anonipy.anonymize.generators import DateGenerator + >>> generator = DateGenerator() + >>> generator.generate(entity) - Attributes - ---------- - date_format : str - The date format to use - day_sigma : int - The range of the random date in days + Attributes: + date_format (str): The date format in which the date should be generated. + day_sigma (int): The range of the random date in days. - Methods - ------- - generate(entity: Entity, output_gen: str = "random") - Generate the date based on the entity and output_gen + Methods: + generate(entity, output_gen): + Generate the date substitute based on the input parameters. """ - def __init__(self, date_format="auto", day_sigma: int = 30, *args, **kwargs): - """ - Parameters - ---------- - date_format : str, optional - The date format to use. Default: "auto" - day_sigma : int, optional - The range of the random date in days. 
Default: 30 + def __init__(self, date_format: str = "auto", day_sigma: int = 30, *args, **kwargs): + """Initializes the date generator. + + Examples: + >>> from anonipy.anonymize.generators import DateGenerator + >>> generator = DateGenerator() + + Args: + date_format: The date format in which the date should be generated. More on date formats [see here](https://www.contensis.com/help-and-docs/guides/querying-your-content/zenql-search/date-formats). + day_sigma: The range of the random date in days. """ @@ -150,25 +134,24 @@ def __init__(self, date_format="auto", day_sigma: int = 30, *args, **kwargs): self.date_format = date_format self.day_sigma = day_sigma - def generate(self, entity: Entity, output_gen: str = "random", *args, **kwargs): - """Generate the date based on the entity and output_gen + def generate( + self, + entity: Entity, + sub_variant: DATE_TRANSFORM_VARIANTS = DATE_TRANSFORM_VARIANTS.RANDOM, + *args, + **kwargs, + ) -> str: + """Generate the entity substitute based on the input parameters. - Parameters - ---------- - entity : Entity - The entity to generate the date from - output_gen : str, optional - The output generator to use. Default: "random" + Args: + entity: The entity to generate the date substitute from. + sub_variant: The substitute function variant to use. - Returns - ------- - str - The generated date + Returns: + The generated date substitute. - Raises - ------ - ValueError - If the entity type is not `date` or `custom` + Raises: + ValueError: If the entity type is not `date` or `custom`. """ @@ -179,19 +162,24 @@ def generate(self, entity: Entity, output_gen: str = "random", *args, **kwargs): elif entity.type not in ["date"]: raise ValueError("The entity type must be `date` to generate dates.") - if output_gen not in operations.keys(): + if not DATE_TRANSFORM_VARIANTS.is_valid(sub_variant): raise ValueError( - f"The output_gen must be one of {', '.join(list(operations.keys()))} to generate dates."
+ f"The output_gen must be one of {', '.join(DATE_TRANSFORM_VARIANTS.values())} to generate dates." ) + # detect the date format if self.date_format == "auto": entity_date, date_format = detect_datetime_format(entity.text) else: entity_date = datetime.datetime.strptime(entity.text, self.date_format) date_format = self.date_format + + # validate the input values if entity_date is None: raise ValueError(f"Entity `{entity.text}` is not a valid date.") if date_format is None or date_format == ValueError("Unknown Format"): raise ValueError(f"Entity `{entity.text}` is not a valid date.") - generate_date = operations[output_gen](entity_date, self.day_sigma) + + # generate the date substitute + generate_date = DATE_VARIANTS_MAPPING[sub_variant](entity_date, self.day_sigma) return generate_date.strftime(date_format) diff --git a/anonipy/anonymize/generators/interface.py b/anonipy/anonymize/generators/interface.py index 3dec839..a8e1278 100644 --- a/anonipy/anonymize/generators/interface.py +++ b/anonipy/anonymize/generators/interface.py @@ -7,7 +7,7 @@ class GeneratorInterface: - """The class representing the generator interface""" + """The class representing the generator interface.""" def __init__(self, *args, **kwargs): pass diff --git a/anonipy/anonymize/generators/llm_label_generator.py b/anonipy/anonymize/generators/llm_label_generator.py index 2775883..2b34a03 100644 --- a/anonipy/anonymize/generators/llm_label_generator.py +++ b/anonipy/anonymize/generators/llm_label_generator.py @@ -1,3 +1,5 @@ +from typing import Tuple + import torch from tokenizers import pre_tokenizers from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig @@ -13,14 +15,18 @@ # ===================================== -def prepare_llama3_byte_decoder(): - """Prepares the byte decoder +def prepare_llama3_byte_decoder() -> dict: + """Prepares the byte decoder. 
This is an implementation of a workaround, such that the guidance module can be used with the LLaMa-3 model from Hugging Face. Once the issue is resolved, we will remove this function. Link to the guidance issue: https://github.com/guidance-ai/guidance/issues/782 + + Returns: + The byte decoder. + """ byte_decoder = {} # alphabet = pre_tokenizers.ByteLevel(False, False).alphabet() @@ -75,28 +81,35 @@ def prepare_llama3_byte_decoder(): class LLMLabelGenerator(GeneratorInterface): - """The class representing the LLM label generator + """The class representing the LLM label generator. + + !!! info "GPU Requirements" + The `LLMLabelGenerator` utilizes the open source LLMs, specifically the [Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) model. Because the model is quite large, we utilize quantization using the `bitsandbytes` package to reduce its size. Therefore, the `LLMLabelGenerator` requires at least 8GB GPU and CUDA drivers to be available. If these resources are not available on your machine, you can use the `MaskLabelGenerator` instead. - Attributes - ---------- - model : models.Transformers - The model used to generate the label + Examples: + >>> from anonipy.anonymize.generators import LLMLabelGenerator + >>> generator = LLMLabelGenerator() + >>> generator.generate(entity) - Methods - ------- - generate(entity: Entity, entity_prefix: str = "", temperature: float = 0.0) - Generate the label based on the entity + Attributes: + model (models.Transformers): The model used to generate the label substitutes. - validate(entity: Entity) - Validate the entity + Methods: + generate(entity, entity_prefix, temperature): + Generate the label based on the entity. + + validate(entity): + [EXPERIMENTAL] Validate if the entity text corresponds to the entity label. """ def __init__(self, *args, **kwargs): - """ - Parameters - ---------- - None + """Initializes the LLM label generator. 
+ + Examples: + >>> from anonipy.anonymize.generators import LLMLabelGenerator + >>> generator = LLMLabelGenerator() + LLMLabelGenerator() """ @@ -115,30 +128,30 @@ def __init__(self, *args, **kwargs): def generate( self, entity: Entity, - entity_prefix: str = "", + add_entity_attrs: str = "", temperature: float = 0.0, *args, **kwargs, - ): - """Generate the label based on the entity - - Parameters - ---------- - entity : Entity - The entity to generate the label from - entity_prefix : str - The prefix to use for the entity - temperature : float - The temperature to use for the generation. Default: 0.0 - - Returns - ------- - str - The generated label + ) -> str: + """Generate the substitute for the entity based on its attributes. + + Examples: + >>> from anonipy.anonymize.generators import LLMLabelGenerator + >>> generator = LLMLabelGenerator() + >>> generator.generate(entity) + label + + Args: + entity: The entity to generate the label from. + add_entity_attrs: Additional entity attribute description to add to the generation. + temperature: The temperature to use for the generation. + + Returns: + The generated entity label substitute. """ - user_prompt = f"What is a random {entity_prefix} {entity.label} replacement for {entity.text}? Respond only with the replacement." + user_prompt = f"What is a random {add_entity_attrs} {entity.label} replacement for {entity.text}? Respond only with the replacement." assistant_prompt = gen( name="replacement", stop="<|eot_id|>", @@ -154,18 +167,20 @@ def generate( ) return lm["replacement"] - def validate(self, entity: Entity): - """Validate the entity + def validate(self, entity: Entity) -> bool: + """[EXPERIMENTAL] Validate the appropriateness of the entity.
+ + Examples: + >>> from anonipy.anonymize.generators import LLMLabelGenerator + >>> generator = LLMLabelGenerator() + >>> generator.validate(entity) + True - Parameters - ---------- - entity : Entity - The entity to validate + Args: + entity: The entity to be validated. - Returns - ------- - bool - The validation result + Returns: + The validation result. """ @@ -184,20 +199,20 @@ def validate(self, entity: Entity): # Private methods # ================================= - def _prepare_model_and_tokenizer(self, model_name: str): - """Prepares the model and tokenizer + def _prepare_model_and_tokenizer( + self, model_name: str + ) -> Tuple[AutoModelForCausalLM, AutoTokenizer]: + """Prepares the model and tokenizer. - Parameters - ---------- - model_name : str - The name of the model to use + Args: + model_name: The name of the model to use. - Returns - ------- - model, tokenizer - The model and the tokenizer + Returns: + The huggingface model. + The huggingface tokenizer. """ + # prepare the model bnb_config = BitsAndBytesConfig( load_in_4bit=True, @@ -216,38 +231,32 @@ def _prepare_model_and_tokenizer(self, model_name: str): tokenizer.byte_decoder = prepare_llama3_byte_decoder() return model, tokenizer - def _system_prompt(self): - """Returns the system prompt""" + def _system_prompt(self) -> str: + """Returns the system prompt.""" return "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful AI assistant for generating replacements for text entities.<|eot_id|>" - def _user_prompt(self, prompt): - """Returns the user prompt + def _user_prompt(self, prompt: str) -> str: + """Returns the user prompt. - Parameters - ---------- - prompt : str - The prompt to use + Args: + prompt: The prompt to use. - Returns - ------- - str - The user prompt + Returns: + The user part of the prompt. 
""" + return f"<|start_header_id|>user<|end_header_id|>\n\n{prompt}<|eot_id|>" - def _assistant_prompt(self, prompt): - """Returns the assistant prompt + def _assistant_prompt(self, prompt: str) -> str: + """Returns the assistant prompt. - Parameters - ---------- - prompt : str - The prompt to use + Args: + prompt: The prompt to use. - Returns - ------- - str - The assistant prompt + Returns: + The assistant part of the prompt. """ + return f"<|start_header_id|>assistant<|end_header_id|>\n\n{prompt}" diff --git a/anonipy/anonymize/generators/mask_label_generator.py b/anonipy/anonymize/generators/mask_label_generator.py index aab448b..7444046 100644 --- a/anonipy/anonymize/generators/mask_label_generator.py +++ b/anonipy/anonymize/generators/mask_label_generator.py @@ -2,7 +2,8 @@ import random import warnings import itertools -from typing import List + +from typing import List, Tuple import torch from transformers import AutoModelForMaskedLM, AutoTokenizer, pipeline @@ -19,43 +20,46 @@ class MaskLabelGenerator(GeneratorInterface): - """The class representing the mask label generator - - Attributes - ---------- - context_window : int - The context window size - pipeline : transformers pipeline - The transformers pipeline - mask_token : str - The mask token - - Methods - ------- - generate(self, entity: Entity, text: str) - Anonymize the text based on the entities + """The class representing the mask label generator. + + Examples: + >>> from anonipy.anonymize.generators import MaskLabelGenerator + >>> generator = MaskLabelGenerator(model_name, context_window=100, use_gpu=False) + >>> generator.generate(entity) + + + Attributes: + pipeline (Pipeline): The transformers pipeline used to generate the label substitutes. + context_window (int): The context window size to use to generate the label substitutes. + mask_token (str): The mask token to use to replace the masked words. 
+ + Methods: + generate(entity, text): + Generate the substitute for the entity based on its location in the text. """ def __init__( self, - model_name="FacebookAI/xlm-roberta-large", + model_name: str = "FacebookAI/xlm-roberta-large", use_gpu: bool = False, context_window: int = 100, *args, **kwargs, ): - """ - Parameters - ---------- - model_name : str, optional - The name of the model to use. Default: "FacebookAI/xlm-roberta-large" - use_gpu : bool, optional - Whether to use GPU/CUDA. Default: False - context_window : int, optional - The context window size. Default: 100 + """Initializes the mask label generator. + + Examples: + >>> from anonipy.anonymize.generators import MaskLabelGenerator + >>> generator = MaskLabelGenerator(context_window=120, use_gpu=True) + + Args: + model_name: The name of the masking model to use. + context_window: The context window size. + use_gpu: Whether to use GPU/CUDA, if available. """ + super().__init__(*args, **kwargs) self.context_window = context_window if use_gpu and not torch.cuda.is_available(): @@ -65,26 +69,29 @@ def __init__( use_gpu = False # prepare the fill-mask pipeline and store the mask token - model, tokenizer = self._prepare_model_and_tokenizer(model_name, use_gpu) + model, tokenizer, device = self._prepare_model_and_tokenizer( + model_name, use_gpu + ) self.mask_token = tokenizer.mask_token self.pipeline = pipeline( - "fill-mask", model=model, tokenizer=tokenizer, top_k=40 + "fill-mask", model=model, tokenizer=tokenizer, top_k=40, device=device ) - def generate(self, entity: Entity, text: str, *args, **kwargs): - """Generate the substituted text based on the entity + def generate(self, entity: Entity, text: str, *args, **kwargs) -> str: + """Generate the substitute for the entity using the masking model.
- Parameters - ---------- - entity : Entity - The entity to generate the label from - text : str - The text to generate the label from + Examples: + >>> from anonipy.anonymize.generators import MaskLabelGenerator + >>> generator = MaskLabelGenerator(context_window=120, use_gpu=True) + >>> generator.generate(entity, text) + label - Returns - ------- - str - The generated text + Args: + entity: The entity used to generate the substitute. + text: The original text in which the entity is located; used to get the entity's context. + + Returns: + The generated substitute text. """ @@ -97,22 +104,22 @@ def generate(self, entity: Entity, text: str, *args, **kwargs): # Private methods # ================================= - def _prepare_model_and_tokenizer(self, model_name: str, use_gpu: bool): - """Prepares the model and tokenizer + def _prepare_model_and_tokenizer( + self, model_name: str, use_gpu: bool + ) -> Tuple[AutoModelForMaskedLM, AutoTokenizer]: + """Prepares the model and tokenizer. - Parameters - ---------- - model_name : str - The name of the model to use - use_gpu : bool - Whether to use GPU/CUDA + Args: + model_name: The name of the model to use. + use_gpu: Whether to use GPU/CUDA, if available. - Returns - ------- - model, tokenizer - The model and the tokenizer + Returns: + The huggingface model. + The huggingface tokenizer. + The device to use. """ + # prepare the model device = torch.device( "cuda" if use_gpu and torch.cuda.is_available() else "cpu" @@ -120,22 +127,19 @@ def _prepare_model_and_tokenizer(self, model_name: str, use_gpu: bool): model = AutoModelForMaskedLM.from_pretrained(model_name).to(device) # prepare the tokenizer tokenizer = AutoTokenizer.from_pretrained(model_name) - return model, tokenizer + return model, tokenizer, device - def _create_masks(self, entity: Entity): - """Creates the masks for the entity + def _create_masks(self, entity: Entity) -> List[dict]: + """Creates the masks for the provided entity. 
- Parameters - ---------- - entity : Entity - The entity to create the masks for + Args: + entity: The entity to create the masks for. - Returns - ------- - list - The list of masks + Returns: + The list of masks attributes, including the true text, mask text, start index, and end index within the original text. """ + masks = [] chunks = re.split(r"\s+", entity.text) for idx in range(len(chunks)): @@ -152,41 +156,31 @@ def _create_masks(self, entity: Entity): return masks def _get_context_text(self, text: str, start_index: int, end_index: int) -> str: - """Get the context text - - Parameters - ---------- - text : str - The text to get the context text from - start_index : int - The start index - end_index : int - The end index - - Returns - ------- - str - The context text + """Get the context text. + + Args: + text: The text to get the context from. + start_index: The start index of the context window. + end_index: The end index of the context window. + + Returns: + The context window text. """ + min_index = max(0, start_index - self.context_window) max_index = min(end_index + self.context_window, len(text)) return text[min_index:max_index] def _prepare_generate_inputs(self, masks: List[dict], text: str) -> List[str]: - """Prepares the generate inputs + """Prepares the generate inputs. - Parameters - ---------- - masks : List[dict] - The list of masks - text : str - The text to prepare the generate inputs for + Args: + masks: The list of masks attributes. + text: The text to prepare the generate inputs for. - Returns - ------- - list - The list of generate inputs + Returns: + The list of generate inputs. 
""" return [ @@ -201,23 +195,18 @@ def _prepare_generate_inputs(self, masks: List[dict], text: str) -> List[str]: def _create_substitute( self, entity: Entity, masks: List[dict], suggestions: List[dict] ) -> str: - """Create a substitute for the entity - - Parameters - ---------- - entity : Entity - The entity to create the substitute for - masks : List[dict] - The list of masks - suggestions : List[dict] - The list of suggestions - - Returns - ------- - str - The created substitute + """Create a substitute for the entity. + + Args: + entity: The entity to create the substitute for. + masks: The list of masks attributes. + suggestions: The list of substitute suggestions. + + Returns: + The created and selected substitute text. """ + substitute_chunks = [] for mask, suggestion in zip(masks, suggestions): suggestion = suggestion if type(suggestion) == list else [suggestion] diff --git a/anonipy/anonymize/generators/number_generator.py b/anonipy/anonymize/generators/number_generator.py index 462a147..c7644c2 100644 --- a/anonipy/anonymize/generators/number_generator.py +++ b/anonipy/anonymize/generators/number_generator.py @@ -9,44 +9,50 @@ class NumberGenerator(GeneratorInterface): - """The class representing the number generator + """The class representing the number generator. - Methods - ------- - generate(self, entity: Entity) - Generates a number replacement + Examples: + >>> from anonipy.anonymize.generators import NumberGenerator + >>> generator = NumberGenerator() + >>> generator.generate(entity) + + Methods: + generate(self, entity): + Generates a substitute for the numeric entity. """ def __init__(self, *args, **kwargs): - """ - Parameters - ---------- - None + """Initializes the number generator. 
+ + Examples: + >>> from anonipy.anonymize.generators import NumberGenerator + >>> generator = NumberGenerator() """ + super().__init__(*args, **kwargs) - pass - def generate(self, entity: Entity, *args, **kwargs): - """Generates a number replacement + def generate(self, entity: Entity, *args, **kwargs) -> str: + """Generates the substitute for the numeric entity. + + Examples: + >>> from anonipy.anonymize.generators import NumberGenerator + >>> generator = NumberGenerator() + >>> generator.generate(entity) + "1234567890" - Parameters - ---------- - entity : Entity - The entity to generate the number from + Args: + entity: The numeric entity to generate the numeric substitute. - Returns - ------- - str - The generated number + Returns: + The generated numeric substitute. - Raises - ------ - ValueError - If the entity type is not `integer`, `float`, `phone_number` or `custom` + Raises: + ValueError: If the entity type is not `integer`, `float`, `phone_number` or `custom`. """ + if entity.type in ["custom"]: warnings.warn( "The entity type is `custom`. Make sure the generator is returning appropriate values." diff --git a/anonipy/anonymize/helpers.py b/anonipy/anonymize/helpers.py index 844a811..077be9e 100644 --- a/anonipy/anonymize/helpers.py +++ b/anonipy/anonymize/helpers.py @@ -1,27 +1,32 @@ -from typing import List +import re +from typing import List, Union + +from spacy.tokens import Span + from ..definitions import Entity, Replacement +from ..constants import ENTITY_TYPES # ===================================== # Entity converters # ===================================== -def convert_spacy_to_entity(entity, type=None, regex=".*", *args, **kwargs): - """Convert a SpaCy entity to an Entity object +def convert_spacy_to_entity( + entity: Span, + type: ENTITY_TYPES = None, + regex: Union[str, re.Pattern] = ".*", + *args, + **kwargs, +) -> Entity: + """Convert a spacy entity to an anonipy entity object. 
- Parameters - ---------- - entity : SpaCy Span - The SpaCy entity to convert - type : ENTITY_TYPES, optional - The type of the entity. Default: None - regex : Union[str, re.Pattern], optional - The regular expression the entity must match. Default: ".*" + Args: + entity: The spacy Span representing the entity to convert. + type: The type of the entity. + regex: The regular expression the entity must match. - Returns - ------- - Entity - The converted Entity object + Returns: + The converted anonipy entity object. """ @@ -36,22 +41,27 @@ def convert_spacy_to_entity(entity, type=None, regex=".*", *args, **kwargs): ) +# ===================================== +# Anonymization function +# ===================================== + + def anonymize(text: str, replacements: List[Replacement]) -> str: - """Anonymize a text based on a list of replacements + """Anonymize a text based on a list of replacements. - Parameters - ---------- - text : str - The text to anonymize - replacements : List[Replacement] - The list of replacements to apply + Examples: + >>> from anonipy.anonymize import anonymize + >>> anonymize(text, replacements) - Returns - ------- - Tuple[str, List[Replacement]] - The anonymized text and the list of replacements applied + Args: + text: The text to anonymize. + replacements: The list of replacements to apply. + + Returns: + The anonymized text. """ + s_replacements = sorted(replacements, key=lambda x: x["start_index"], reverse=True) anonymized_text = text diff --git a/anonipy/anonymize/strategies/__init__.py b/anonipy/anonymize/strategies/__init__.py index e0f25a7..8b9f7c0 100644 --- a/anonipy/anonymize/strategies/__init__.py +++ b/anonipy/anonymize/strategies/__init__.py @@ -1,18 +1,12 @@ -""" -strategies +"""Module containing the `strategies`. -The module provides a set of strategies used in the library. +The `strategies` module provides a set of strategies used to anonymize the +identified vulnerable data. 
-Classes -------- -StrategyInterface : - The class representing the strategy interface -MaskingStrategy : - The class representing the masking strategy -RedactionStrategy : - The class representing the redaction strategy -PseudonymizationStrategy : - The class representing the pseudonymization strategy +Classes: + RedactionStrategy: The class representing the redaction strategy. + MaskingStrategy: The class representing the masking strategy. + PseudonymizationStrategy: The class representing the pseudonymization strategy. """ diff --git a/anonipy/anonymize/strategies/interface.py b/anonipy/anonymize/strategies/interface.py index 74abaf8..8ddfc0e 100644 --- a/anonipy/anonymize/strategies/interface.py +++ b/anonipy/anonymize/strategies/interface.py @@ -7,12 +7,11 @@ class StrategyInterface: - """The class representing the strategy interface + """The class representing the strategy interface. - Methods - ------- - anonymize(text: str, entities: List[Entity], *args, **kwargs) - Anonymize the text based on the entities + Methods: + anonymize(text, entities): + Anonymize the text based on the entities. """ diff --git a/anonipy/anonymize/strategies/masking.py b/anonipy/anonymize/strategies/masking.py index 3d76586..1c652b6 100644 --- a/anonipy/anonymize/strategies/masking.py +++ b/anonipy/anonymize/strategies/masking.py @@ -11,26 +11,31 @@ class MaskingStrategy(StrategyInterface): - """The class representing the masking strategy + """The class representing the masking strategy. - Attributes - ---------- - substitute_label : str - The label to substitute in the anonymized text + Examples: + >>> from anonipy.anonymize.strategies import MaskingStrategy + >>> strategy = MaskingStrategy() + >>> strategy.anonymize(text, entities) - Methods - ------- - anonymize(text: str, entities: List[Entity]) - Anonymize the text based on the entities + Attributes: + substitute_label (str): The label to substitute in the anonymized text. 
+ + Methods: + anonymize(text, entities): + Anonymize the text based on the entities. """ def __init__(self, substitute_label: str = "*", *args, **kwargs): - """ - Parameters - ---------- - substitute_label : str, optional - The label to substitute in the anonymized text. Default: "*" + """Initializes the masking strategy. + + Examples: + >>> from anonipy.anonymize.strategies import MaskingStrategy + >>> strategy = MaskingStrategy() + + Args: + substitute_label: The label to substitute in the anonymized text. """ @@ -40,19 +45,20 @@ def __init__(self, substitute_label: str = "*", *args, **kwargs): def anonymize( self, text: str, entities: List[Entity], *args, **kwargs ) -> Tuple[str, List[Replacement]]: - """Anonymize the text based on the entities + """Anonymize the text using the masking strategy. - Parameters - ---------- - text : str - The text to anonymize - entities : List[Entity] - The list of entities to anonymize + Examples: + >>> from anonipy.anonymize.strategies import MaskingStrategy + >>> strategy = MaskingStrategy() + >>> strategy.anonymize(text, entities) - Returns - ------- - Tuple[str, List[Replacement]] - The anonymized text and the list of replacements applied + Args: + text: The text to anonymize. + entities: The list of entities to anonymize. + + Returns: + The anonymized text. + The list of applied replacements. """ @@ -60,20 +66,21 @@ def anonymize( anonymized_text, replacements = anonymize(text, replacements) return anonymized_text, replacements + # =========================================== + # Private methods + # =========================================== + def _create_replacement(self, entity: Entity) -> Replacement: - """Creates a replacement for the entity + """Creates a replacement for the entity. - Parameters - ---------- - entity : Entity - The entity to create the replacement for + Args: + entity: The entity to create the replacement for. 
- Returns - ------- - Replacement - The created replacement + Returns: + The created replacement. """ + mask = self._create_mask(entity) return { "original_text": entity.text, @@ -84,19 +91,17 @@ def _create_replacement(self, entity: Entity) -> Replacement: } def _create_mask(self, entity: Entity) -> str: - """Creates a mask for the entity + """Creates a mask for the entity. - Parameters - ---------- - entity : Entity - The entity to create the mask for + Args: + entity: The entity to create the mask for. - Returns - ------- - str - The created mask + Returns: + The created mask. """ + + # TODO: add random length substitution return " ".join( [ self.substitute_label * len(chunk) diff --git a/anonipy/anonymize/strategies/pseudonymization.py b/anonipy/anonymize/strategies/pseudonymization.py index 1816928..d71e8f8 100644 --- a/anonipy/anonymize/strategies/pseudonymization.py +++ b/anonipy/anonymize/strategies/pseudonymization.py @@ -1,4 +1,4 @@ -from typing import List, Tuple +from typing import List, Tuple, Callable from .interface import StrategyInterface from ...definitions import Entity, Replacement @@ -10,26 +10,31 @@ class PseudonymizationStrategy(StrategyInterface): - """The class representing the pseudonymization strategy + """The class representing the pseudonymization strategy. - Attributes - ---------- - mapping : dict - The mapping of entities to pseudonyms + Examples: + >>> from anonipy.anonymize.strategies import PseudonymizationStrategy + >>> strategy = PseudonymizationStrategy(mapping) + >>> strategy.anonymize(text, entities) - Methods - ------- - anonymize(text: str, entities: List[Entity]) - Anonymize the text based on the entities + Attributes: + mapping: The mapping of entities to pseudonyms. + + Methods: + anonymize(text, entities): + Anonymize the text based on the entities. 
""" - def __init__(self, mapping, *args, **kwargs): - """ - Parameters - ---------- - mapping : func - The mapping of entities to pseudonyms + def __init__(self, mapping: Callable, *args, **kwargs): + """Initializes the pseudonymization strategy. + + Examples: + >>> from anonipy.anonymize.strategies import PseudonymizationStrategy + >>> strategy = PseudonymizationStrategy(mapping) + + Args: + mapping: The mapping function on how to handle each entity type. """ @@ -39,19 +44,20 @@ def __init__(self, mapping, *args, **kwargs): def anonymize( self, text: str, entities: List[Entity], *args, **kwargs ) -> Tuple[str, List[Replacement]]: - """Anonymize the text based on the entities + """Anonymize the text using the pseudonymization strategy. + + Examples: + >>> from anonipy.anonymize.strategies import PseudonymizationStrategy + >>> strategy = PseudonymizationStrategy(mapping) + >>> strategy.anonymize(text, entities) - Parameters - ---------- - text : str - The text to anonymize - entities : List[Entity] - The list of entities to anonymize + Args: + text: The text to anonymize. + entities: The list of entities to anonymize. - Returns - ------- - Tuple[str, List[Replacement]] - The anonymized text and the list of replacements applied + Returns: + The anonymized text. + The list of applied replacements. 
""" @@ -62,26 +68,25 @@ def anonymize( anonymized_text, replacements = anonymize(text, replacements) return anonymized_text, replacements + # =========================================== + # Private methods + # =========================================== + def _create_replacement( self, entity: Entity, text: str, replacements: List[dict] ) -> Replacement: - """Creates a replacement for the entity - - Parameters - ---------- - entity : Entity - The entity to create the replacement for - text : str - The text to anonymize - replacements : List[dict] - The list of replacements - - Returns - ------- - Replacement - The created replacement + """Creates a replacement for the entity. + + Args: + entity: The entity to create the replacement for. + text: The text to anonymize. + replacements: The list of existing replacements. + + Returns: + The created replacement. """ + # check if the replacement already exists anonymized_text = self._check_replacement(entity, replacements) # create a new replacement if it doesn't exist @@ -96,20 +101,17 @@ def _create_replacement( "anonymized_text": anonymized_text, } - def _check_replacement(self, entity: Entity, replacements: List[dict]) -> str: - """Checks if a suitable replacement already exists + def _check_replacement( + self, entity: Entity, replacements: List[Replacement] + ) -> str: + """Checks if a suitable replacement already exists. - Parameters - ---------- - entity : Entity - The entity to check - replacements : List[dict] - The list of replacements + Args: + entity: The entity to check. + replacements: The list of replacements. - Returns - ------- - str - The anonymized text if the replacement already exists, None otherwise + Returns: + The anonymized text if the replacement already exists, None otherwise. 
""" existing_replacement = list( diff --git a/anonipy/anonymize/strategies/redaction.py b/anonipy/anonymize/strategies/redaction.py index 0ae9c3d..5e075a9 100644 --- a/anonipy/anonymize/strategies/redaction.py +++ b/anonipy/anonymize/strategies/redaction.py @@ -10,26 +10,31 @@ class RedactionStrategy(StrategyInterface): - """The class representing the redaction strategy + """The class representing the redaction strategy. - Attributes - ---------- - substitute_label : str - The label to substitute in the anonymized text + Examples: + >>> from anonipy.anonymize.strategies import RedactionStrategy + >>> strategy = RedactionStrategy() + >>> strategy.anonymize(text, entities) - Methods - ------- - anonymize(text: str, entities: List[Entity]) - Anonymize the text based on the entities + Attributes: + substitute_label (str): The label to substitute in the anonymized text. + + Methods: + anonymize(text, entities): + Anonymize the text based on the entities. """ def __init__(self, substitute_label: str = "[REDACTED]", *args, **kwargs) -> None: - """ - Parameters - ---------- - substitute_label : str, optional - The label to substitute in the anonymized text. Default: "[REDACTED]" + """Initializes the redaction strategy. + + Examples: + >>> from anonipy.anonymize.strategies import RedactionStrategy + >>> strategy = RedactionStrategy() + + Args: + substitute_label: The label to substitute in the anonymized text. """ @@ -39,19 +44,20 @@ def __init__(self, substitute_label: str = "[REDACTED]", *args, **kwargs) -> Non def anonymize( self, text: str, entities: List[Entity], *args, **kwargs ) -> Tuple[str, List[Replacement]]: - """Anonymize the text based on the entities + """Anonymize the text using the redaction strategy. 
+ + Examples: + >>> from anonipy.anonymize.strategies import RedactionStrategy + >>> strategy = RedactionStrategy() + >>> strategy.anonymize(text, entities) - Parameters - ---------- - text : str - The text to anonymize - entities : List[Entity] - The list of entities to anonymize + Args: + text: The text to anonymize. + entities: The list of entities to anonymize. - Returns - ------- - Tuple[str, List[Replacement]] - The anonymized text and the list of replacements applied + Returns: + The anonymized text. + The list of applied replacements. """ @@ -59,18 +65,18 @@ def anonymize( anonymized_text, replacements = anonymize(text, replacements) return anonymized_text, replacements + # =========================================== + # Private methods + # =========================================== + def _create_replacement(self, entity: Entity) -> Replacement: - """Creates a replacement for the entity + """Creates a replacement for the entity. - Parameters - ---------- - entity : Entity - The entity to create the replacement for + Args: + entity: The entity to create the replacement for. - Returns - ------- - Replacement - The replacement for the entity + Returns: + The created replacement. """ diff --git a/anonipy/constants.py b/anonipy/constants.py index 71be42c..66b8cf6 100644 --- a/anonipy/constants.py +++ b/anonipy/constants.py @@ -1,20 +1,34 @@ -""" -constants +"""Module containing the `constants`. -The module provides a set of constants used in the library. +The `constants` module provides a set of predefined constants used in the package. +These include supported languages, types of entities, and date transformation +variants. -Classes -------- -LANGUAGES : - Predefined supported languages -ENTITY_TYPES : - Predefined types of entities +Classes: + LANGUAGES: Predefined supported languages. + ENTITY_TYPES: Predefined types of entities. + DATE_TRANSFORM_VARIANTS: Predefined types of the date transformation variants. 
""" +from typing import List + class LANGUAGES: - """Supported languages""" + """The main anonipy supported languages. + + Attributes: + DUTCH (Tuple[Literal["nl"], Literal["Dutch"]]): The Dutch language. + ENGLISH (Tuple[Literal["en"], Literal["English"]]): The English language. + FRENCH (Tuple[Literal["fr"], Literal["French"]]): The French language. + GERMAN (Tuple[Literal["de"], Literal["German"]]): The German language. + GREEK (Tuple[Literal["el"], Literal["Greek"]]): The Greek language. + ITALIAN (Tuple[Literal["it"], Literal["Italian"]]): The Italian language. + SLOVENE (Tuple[Literal["sl"], Literal["Slovene"]]): The Slovene language. + SPANISH (Tuple[Literal["es"], Literal["Spanish"]]): The Spanish language. + UKRAINIAN (Tuple[Literal["uk"], Literal["Ukrainian"]]): The Ukrainian language. + + """ DUTCH = ("nl", "Dutch") ENGLISH = ("en", "English") @@ -28,7 +42,19 @@ class LANGUAGES: class ENTITY_TYPES: - """Types of entities""" + """The anonipy supported entity types. + + Attributes: + CUSTOM (Literal["custom"]): The custom entity type. + STRING (Literal["string"]): The string entity type. + INTEGER (Literal["integer"]): The integer entity type. + FLOAT (Literal["float"]): The float entity type. + DATE (Literal["date"]): The date entity type. + EMAIL (Literal["email"]): The email entity type. + WEBSITE_URL (Literal["website_url"]): The website url entity type. + PHONE_NUMBER (Literal["phone_number"]): The phone number entity type. + + """ CUSTOM = "custom" STRING = "string" @@ -38,3 +64,57 @@ class ENTITY_TYPES: EMAIL = "email" WEBSITE_URL = "website_url" PHONE_NUMBER = "phone_number" + + +class DATE_TRANSFORM_VARIANTS: + """The supported date transform variants. + + Attributes: + FIRST_DAY_OF_THE_MONTH (Literal["FIRST_DAY_OF_THE_MONTH"]): The first day of the month. + LAST_DAY_OF_THE_MONTH (Literal["LAST_DAY_OF_THE_MONTH"]): The last day of the month. + MIDDLE_OF_THE_MONTH (Literal["MIDDLE_OF_THE_MONTH"]): The middle of the month. 
+ MIDDLE_OF_THE_YEAR (Literal["MIDDLE_OF_THE_YEAR"]): The middle of the year. + RANDOM (Literal["RANDOM"]): A random date. + + Methods: + values(): + Return a list of all possible date transform variants. + is_valid(value): + Check if the value is a valid date variant. + + """ + + FIRST_DAY_OF_THE_MONTH = "FIRST_DAY_OF_THE_MONTH" + LAST_DAY_OF_THE_MONTH = "LAST_DAY_OF_THE_MONTH" + MIDDLE_OF_THE_MONTH = "MIDDLE_OF_THE_MONTH" + MIDDLE_OF_THE_YEAR = "MIDDLE_OF_THE_YEAR" + RANDOM = "RANDOM" + + @classmethod + def values(self) -> List[str]: + """Return a list of all possible date transform variants. + + Returns: + The list of all possible variants. + + """ + return [ + self.FIRST_DAY_OF_THE_MONTH, + self.LAST_DAY_OF_THE_MONTH, + self.MIDDLE_OF_THE_MONTH, + self.MIDDLE_OF_THE_YEAR, + self.RANDOM, + ] + + @classmethod + def is_valid(self, value: str) -> bool: + """Check if the value is a valid date variant. + + Args: + value: The value to check. + + Returns: + `True` if the value is a valid date variant, `False` otherwise. + + """ + return value in self.values() diff --git a/anonipy/definitions.py b/anonipy/definitions.py index ff26607..5faeae0 100644 --- a/anonipy/definitions.py +++ b/anonipy/definitions.py @@ -1,14 +1,10 @@ -""" -definitions +"""Module containing the `definitions`. -The module provides a set of object definitions used in the library. +The `definitions` module provides a set of predefined types used in the package. -Classes -------- -Entity : - The class representing the entity -Replacement : - The class representing the replacement +Classes: + Entity: The class representing the anonipy entity object. + Replacement: The class representing the anonipy replacement object. 
""" @@ -17,6 +13,7 @@ from typing_extensions import NotRequired from dataclasses import dataclass +from .utils.regex import regex_mapping from .constants import ENTITY_TYPES # ================================================ @@ -26,24 +23,16 @@ @dataclass class Entity: - """The class representing the entity - - Attributes - ---------- - text : str - The text of the entity - label : str - The label of the entity - start_index : int - The start index of the entity in the text - end_index : int - The end index of the entity in the text - score : float - The prediction score of the entity. The score is returned by the extractor models. Default: 1.0 - type : ENTITY_TYPES - The type of the entity. Default: None - regex : Union[str, re.Pattern] - The regular expression the entity must match. Default: ".*" + """The class representing the anonipy Entity object. + + Attributes: + text (str): The text of the entity. + label (str): The label of the entity. + start_index (int): The start index of the entity in the text. + end_index (int): The end index of the entity in the text. + score (float): The prediction score of the entity. The score is returned by the extractor models. + type (ENTITY_TYPES): The type of the entity. + regex (Union[str, re.Pattern]): The regular expression the entity must match. 
""" @@ -53,24 +42,24 @@ class Entity: end_index: int score: float = 1.0 type: ENTITY_TYPES = None - regex: Union[str, re.Pattern] = ".*" + regex: Union[str, re.Pattern] = None + + def __post_init__(self): + if self.regex is None: + if self.type == "custom": + raise ValueError("Custom entities require a regex.") + self.regex = regex_mapping[self.type] class Replacement(TypedDict): - """The class representing the replacement - - Attributes - ---------- - original_text : str, optional - The original text of the entity - label : str, optional - The label of the entity - start_index : int - The start index of the entity in the text - end_index : int - The end index of the entity in the text - anonymized_text : str - The anonymized text replacing the original + """The class representing the anonipy Replacement object. + + Attributes: + original_text (str): The original text of the entity. + label (str): The label of the entity. + start_index (int): The start index of the entity in the text. + end_index (int): The end index of the entity in the text. + anonymized_text (str): The anonymized text replacing the original. """ diff --git a/anonipy/utils/__init__.py b/anonipy/utils/__init__.py index 58aaf21..060fcac 100644 --- a/anonipy/utils/__init__.py +++ b/anonipy/utils/__init__.py @@ -1,19 +1,16 @@ -""" -utils +"""The module containing the `utils`. -The module provides a set of utilities used in the library. +The `utils` module provides a set of utilities used in the package. -Submodules ----------- -language_detector : - The module containing the language detector -file_system : - The module containing the file system utilities +Modules: + regex: The module containing the regex utilities and functions. + file_system: The module containing the file system utilities and functions. + language_detector: The module containing the language detector. """ -from . import language_detector +from ..utils import regex from . import file_system +from . 
import language_detector - -__all__ = ["language_detector", "file_system"] +__all__ = ["regex", "file_system", "language_detector"] diff --git a/anonipy/utils/colors.py b/anonipy/utils/colors.py new file mode 100644 index 0000000..8946f74 --- /dev/null +++ b/anonipy/utils/colors.py @@ -0,0 +1,26 @@ +import random + +random.seed(42) + +# =============================================== +# GLOBAL COLOR MAPPING +# =============================================== + +GLOBAL_COLOR_MAPPING = {} + +# =============================================== +# Color generators and functions +# =============================================== + + +def generate_random_color(): + return "#" + "".join([random.choice("0123456789ABCDEF") for j in range(6)]) + + +def get_label_color(label): + if label in GLOBAL_COLOR_MAPPING: + return GLOBAL_COLOR_MAPPING[label] + else: + color = generate_random_color() + GLOBAL_COLOR_MAPPING[label] = color + return color diff --git a/anonipy/utils/datetime.py b/anonipy/utils/datetime.py index 461f782..42b549c 100644 --- a/anonipy/utils/datetime.py +++ b/anonipy/utils/datetime.py @@ -1,5 +1,8 @@ +import datetime import dateutil.parser as parser +from typing import Tuple + # ===================================== # Constants # ===================================== @@ -80,18 +83,14 @@ # ===================================== -def detect_datetime_format(datetime): - """Detects the datetime format +def detect_datetime_format(datetime: str) -> Tuple[datetime.datetime, str]: + """Detects the datetime format. - Parameters - ---------- - datetime: str - The datetime string + Args: + datetime: The datetime string to detect the format of. - Returns - ------- - Tuple[datetime.datetime, str] - The detected datetime and the format + Returns: + The detected datetime and its format. 
""" diff --git a/anonipy/utils/file_system.py b/anonipy/utils/file_system.py index 2f7cd34..92cd4f1 100644 --- a/anonipy/utils/file_system.py +++ b/anonipy/utils/file_system.py @@ -1,5 +1,17 @@ -""" -The file system utilities +"""The module containing the `file_system` utilities. + +The `file_system` module provides a set of utilities for reading and writing files. + +Methods: + open_file(file_path): + Opens a file and returns its content as a string. + write_file(text, file_path, encode): + Writes the text to a file. + open_json(file_path): + Opens a JSON file and returns its content as a dictionary. + write_json(data, file_path): + Writes the data to a JSON file. + """ import os @@ -19,20 +31,17 @@ # ===================================== -def remove_extra_spaces(text: str) -> str: - """Remove extra spaces from text +def _remove_extra_spaces(text: str) -> str: + """Remove extra spaces from text. - Parameters - ---------- - text : str - The text to remove extra spaces from + Args: + text: The text to remove extra spaces from. - Returns - ------- - str - The text with extra spaces removed + Returns: + The text with extra spaces removed. """ + text = text.strip() # remove extra spaces text = re.sub(" +", " ", text) @@ -40,18 +49,14 @@ def remove_extra_spaces(text: str) -> str: return text -def remove_page_numbers(text: str) -> str: - """Removes page numbers from text +def _remove_page_numbers(text: str) -> str: + """Removes page numbers from text. - Parameters - ---------- - text : str - The text to remove page numbers from + Args: + text: The text to remove page numbers from. - Returns - ------- - str - The text with page numbers removed + Returns: + The text with page numbers removed. """ @@ -69,18 +74,14 @@ def remove_page_numbers(text: str) -> str: # ===================================== -def extract_text_from_pdf(pdf_path: str) -> str: - """Extracts text from a PDF file +def _extract_text_from_pdf(pdf_path: str) -> str: + """Extracts text from a PDF file. 
- Parameters - ---------- - pdf_path : str - The path to the PDF file + Args: + pdf_path: The path to the PDF file. - Returns - ------- - str - The text from the PDF file + Returns: + The text from the PDF file. """ @@ -89,8 +90,8 @@ def extract_text_from_pdf(pdf_path: str) -> str: pages_text = [] for page in pdf_reader.pages: text = page.extract_text(extraction_mode="layout") - text = remove_page_numbers(text) - text = remove_extra_spaces(text) + text = _remove_page_numbers(text) + text = _remove_extra_spaces(text) pages_text.append(text) document_text = "\n".join(pages_text) @@ -103,17 +104,13 @@ def extract_text_from_pdf(pdf_path: str) -> str: def _word_process_paragraph(p) -> str: - """Get the text from a paragraph + """Get the text from a paragraph. - Parameters - ---------- - p : etree._Element - The paragraph element + Args: + p (etree._Element): The paragraph element. - Returns - ------- - str - The text from the paragraph + Returns: + The text from the paragraph. """ @@ -121,17 +118,13 @@ def _word_process_paragraph(p) -> str: def _word_process_table(t) -> str: - """Get the text from a table + """Get the text from a table. - Parameters - ---------- - t : etree._Element - The table element + Args: + t (etree._Element): The table element. - Returns - ------- - str - The text from the table + Returns: + The text from the table. """ @@ -147,18 +140,14 @@ def _word_process_table(t) -> str: return "\n".join(table_text) -def extract_text_from_word(doc_path: str) -> str: - """Extracts text from a Word file +def _extract_text_from_word(doc_path: str) -> str: + """Extracts text from a Word file. - Parameters - ---------- - doc_path : str - The path to the Word file + Args: + doc_path: The path to the Word file. - Returns - ------- - str - The text from the Word file + Returns: + The text from the Word file. 
""" @@ -183,18 +172,18 @@ def extract_text_from_word(doc_path: str) -> str: def open_file(file_path: str) -> str: - """ - Opens a file and returns its content as a string + """Opens a file and returns its content as a string. + + Examples: + >>> from anonipy.utils import file_system + >>> file_system.open_file("path/to/file.txt") + "Hello, World!" - Parameters - ---------- - file_path : str - The path to the file + Args: + file_path: The path to the file. - Returns - ------- - str - The content of the file as a string + Returns: + The content of the file as a string. """ @@ -203,9 +192,9 @@ def open_file(file_path: str) -> str: _, file_extension = os.path.splitext(file_path) if file_extension.lower() == ".pdf": - return extract_text_from_pdf(file_path) + return _extract_text_from_pdf(file_path) elif file_extension.lower() in [".doc", ".docx"]: - return extract_text_from_word(file_path) + return _extract_text_from_word(file_path) elif file_extension.lower() == ".txt": with open(file_path, "r", encoding="utf-8") as f: return f.read() @@ -213,47 +202,21 @@ def open_file(file_path: str) -> str: raise ValueError(f"The file extension is not supported: {file_extension}") -def open_json(file_path: str) -> dict: - """ - Opens a JSON file and returns its content as a dictionary - - Parameters - ---------- - file_path : str - The path to the JSON file - - Returns - ------- - dict - The content of the JSON file as a dictionary - - """ - - if not os.path.isfile(file_path): - raise FileNotFoundError(f"The file does not exist: {file_path}") +def write_file(text: str, file_path: str, encode: Union[str, bool] = True) -> None: + """Writes the text to a file. - with open(file_path, "r", encoding="utf-8") as f: - return json.load(f) + Examples: + >>> from anonipy.utils import file_system + >>> file_system.write_file("Hello, World!", "path/to/file.txt") + Args: + text: The text to write to the file. + file_path: The path to the file. + encode: The encoding to use. 
-def write_file(text: str, file_path: str, encode: Union[str, bool] = True) -> None: - """Writes text to a file - - Parameters - ---------- - text : str - The text to write to the file - file_path : str - The path to the file - encode : Union[str, bool], optional - The encoding to use. Default: True - - Raises - ------ - TypeError - If text, file_path is not a string; encode is not a string or a boolean - FileNotFoundError - If the directory does not exist + Raises: + TypeError: If `text` or `file_path` is not a string, or `encode` is not a string or a boolean. + FileNotFoundError: If the directory does not exist. """ @@ -281,15 +244,39 @@ def write_file(text: str, file_path: str, encode: Union[str, bool] = True) -> No f.write(text) +def open_json(file_path: str) -> dict: + """Opens a JSON file and returns its content as a dictionary. + + Examples: + >>> from anonipy.utils import file_system + >>> file_system.open_json("path/to/file.json") + {"hello": "world"} + + Args: + file_path: The path to the JSON file. + + Returns: + The content of the JSON file as a dictionary. + + """ + + if not os.path.isfile(file_path): + raise FileNotFoundError(f"The file does not exist: {file_path}") + + with open(file_path, "r", encoding="utf-8") as f: + return json.load(f) + + def write_json(data: dict, file_path: str) -> None: - """Writes data to a JSON file - - Parameters - ---------- - data : dict - The data to write to the JSON file - file_path : str - The path to the JSON file + """Writes data to a JSON file. + + Examples: + >>> from anonipy.utils import file_system + >>> file_system.write_json({"hello": "world"}, "path/to/file.json") + + Args: + data: The data to write to the JSON file. + file_path: The path to the JSON file. 
""" diff --git a/anonipy/utils/language_detector.py b/anonipy/utils/language_detector.py index 4b403e4..ccff4ca 100644 --- a/anonipy/utils/language_detector.py +++ b/anonipy/utils/language_detector.py @@ -1,30 +1,51 @@ +"""The module containing the `language_detector` utilities. + +The `language_detector` module contains the `LanguageDetector` class, which is +used to detect the language of a text. + +Classes: + LanguageDetector: The class representing the language detector. + +""" + +from typing import Tuple + from lingua import LanguageDetectorBuilder +# ===================================== +# Main class +# ===================================== + class LanguageDetector: - """The class for detecting the language of a text + """The class representing the language detector. + + Examples: + >>> from anonipy.utils.language_detector import LanguageDetector + >>> detector = LanguageDetector() + >>> detector.detect(text) - Attributes - ---------- - detector : LanguageDetector - The language detector + Attributes: + detector (lingua.LanguageDetector): The language detector. - Methods - ------- - __call__(self, text: str, output_standard: str = "iso_code_639_1") - Detect the language of a text. Calls the `detect` method. + Methods: + __call__(text, output_standard): + Detect the language of a text. Calls the `detect` method. - detect(text: str, output_standard: str = "iso_code_639_1") - Detect the language of a text + detect(text, output_standard): + Detect the language of a text. """ def __init__(self, low_accuracy: bool = False): - """ - Parameters - ---------- - low_accuracy : bool, optional - Whether to use the low accuracy mode. Default: False + """Initializes the language detector. + + Examples: + >>> from anonipy.utils.language_detector import LanguageDetector + >>> detector = LanguageDetector() + + Args: + low_accuracy: Whether to use the low accuracy mode. 
""" @@ -37,24 +58,45 @@ def __init__(self, low_accuracy: bool = False): ) self.detector = builder.build() - def __call__(self, text: str, output_standard: str = "iso_code_639_1") -> str: - return self.detect(text, output_standard) + def __call__( + self, text: str, output_standard: str = "iso_code_639_1" + ) -> Tuple[str, str]: + """Detects the language of a text by calling the `detect` method. + + Examples: + >>> from anonipy.utils.language_detector import LanguageDetector + >>> detector = LanguageDetector() + >>> detector(text) + + Args: + text: The text to detect the language of. + output_standard: The output standard. + + Returns: + The language code. + The full name of the language. - def detect(self, text: str, output_standard: str = "iso_code_639_1") -> str: """ - Detect the language of a text - - Parameters - ---------- - text : str - The text to detect the language of - output_standard : str, optional - The output standard. Default: "iso_code_639_1" - - Returns - ------- - Tuple[str, str] - The language code and the full name of the language + + return self.detect(text, output_standard) + + def detect( + self, text: str, output_standard: str = "iso_code_639_1" + ) -> Tuple[str, str]: + """Detects the language of a text. + + Examples: + >>> from anonipy.utils.language_detector import LanguageDetector + >>> detector = LanguageDetector() + >>> detector.detect(text) + + Args: + text: The text to detect the language of. + output_standard: The output standard. + + Returns: + The language code. + The full name of the language. """ diff --git a/anonipy/anonymize/regex.py b/anonipy/utils/regex.py similarity index 57% rename from anonipy/anonymize/regex.py rename to anonipy/utils/regex.py index 67149b8..812c939 100644 --- a/anonipy/anonymize/regex.py +++ b/anonipy/utils/regex.py @@ -1,3 +1,21 @@ +"""Module containing the `regex` related utilities. + +The `regex` module provides the regex definitions and functions used within the package. 
+ +Classes: + RegexMapping: The class representing the mapping for data type to the corresponding regex. + +Attributes: + REGEX_STRING (str): The regex definition for string. + REGEX_INTEGER (str): The regex definition for integer. + REGEX_FLOAT (str): The regex definition for float. + REGEX_DATE (str): The regex definition for date. + REGEX_EMAIL_ADDRESS (str): The regex definition for email address. + REGEX_PHONE_NUMBER (str): The regex definition for phone number. + REGEX_WEBSITE_URL (str): The regex definition for website URL. + +""" + from collections import defaultdict from ..constants import ENTITY_TYPES @@ -8,8 +26,14 @@ # ===================================== REGEX_STRING = ".*" +"""The regex definition for string.""" + REGEX_INTEGER = "\d+" +"""The regex definition for integer.""" + REGEX_FLOAT = "[\d\.,]+" +"""The regex definition for float.""" + REGEX_DATE = ( r"(" r"(\d{4}[-/.\s]\d{2}[-/.\s]\d{2}[ T]\d{2}:\d{2}:\d{2})|" @@ -32,13 +56,27 @@ r"([A-Za-z]+,[ ](January|February|March|April|May|June|July|August|September|October|November|December)[ ]\d{1,2},[ ]\d{4}[ ]?\d{2}:\d{2}[ ]?[APap][mM])" r")" ) +"""The regex definition for dates. + +The regex definition for dates includes string representations, which are currently in +the English language. + +TODO: + - Add regex definitions for other languages. +""" + REGEX_EMAIL_ADDRESS = ( "[a-zA-Z0-9.!#$%&’*+/=?^_`{|}~-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*" ) +"""The regex definition for email addresses.""" + REGEX_PHONE_NUMBER = ( "[(]?[\+]?[(]?[0-9]{1,3}[)]?[-\s\.]?([0-9]{2,}[-\s\.]?){2,}([0-9]{3,})" ) +"""The regex definition for phone numbers.""" + REGEX_WEBSITE_URL = "((https?|ftp|smtp):\/\/)?(www.)?([a-zA-Z0-9]+\.)+[a-z]{2,}(\/[a-zA-Z0-9#\?\_\.\=\-\&]+|\/?)*" +"""The regex definition for website URLs.""" # ===================================== @@ -46,23 +84,30 @@ # ===================================== -class RegexMap: - """RegexMap +class RegexMapping: + """The class representing the regex mapping. 
+ + Examples: + >>> from anonipy.utils.regex import regex_mapping + >>> regex_mapping["string"] + ".*" - The class representing the regex map + Attributes: + regex_mapping (defaultdict): + The mapping between the data type and the corresponding regex. - Attributes - ---------- - regex_mapping : defaultdict - The regex mapping + Methods: + __getitem__(type): + Gets the regex for the given type. """ def __init__(self): - """ - Parameters - ---------- - None + """Initialize the regex mapping. + + Examples: + >>> from anonipy.utils.regex import RegexMapping + >>> regex_mapping = RegexMapping() """ @@ -76,22 +121,24 @@ def __init__(self): self.regex_mapping[ENTITY_TYPES.PHONE_NUMBER] = REGEX_PHONE_NUMBER self.regex_mapping[ENTITY_TYPES.WEBSITE_URL] = REGEX_WEBSITE_URL - def __call__(self, type: str) -> str: - """Gets the regex for the given type + def __getitem__(self, type: str) -> str: + """Gets the regex for the given type. - Parameters - ---------- - type : str - The type of the entity + Examples: + >>> from anonipy.utils.regex import RegexMapping + >>> regex_mapping = RegexMapping() + >>> regex_mapping["string"] + ".*" - Returns - ------- - str - The regex for the given type + Args: + type: The type of the entity. - """ + Returns: + The regex for the given type. + """ return self.regex_mapping[type] -regex_map = RegexMap() +regex_mapping = RegexMapping() +"""The shorthand to the `RegexMapping` instance.""" diff --git a/docs/blog/index.md b/docs/blog/index.md deleted file mode 100644 index c58f16c..0000000 --- a/docs/blog/index.md +++ /dev/null @@ -1,2 +0,0 @@ -# Blog - diff --git a/docs/blog/posts/anonymizing-documents.md b/docs/blog/posts/anonymizing-documents.md deleted file mode 100644 index 69eb414..0000000 --- a/docs/blog/posts/anonymizing-documents.md +++ /dev/null @@ -1,184 +0,0 @@ ---- -date: 2024-05-22 -authors: [eriknovak] -description: > - Our package can be used to anonymize a document such as PDF and word documents.
-categories: - - Tutorial ---- - -# Anonymizing documents - -The `anonipy` package was designed for anonymizing text. However, a lot of text -data can be found in document form, such as PDFs, word documents, and other. Copying -the text from the documents to be anonymized can be cumbersome. The `anonipy` package -provides utility functions that extracts the text from the documents. - - -In this blog post, we explain how `anonipy` can be used to anonymize texts in -document form. - - - -!!! info "Prerequisites" - To use the `anonipy` package, we must have Python version 3.8 or higher - installed on the machine. - -## Installation - -Before we start, we must first install the `anonipy` package. To do that, run the -following command in the terminal: - -```bash -pip install anonipy -``` - -This will install the `anonipy` package, which contains all of the required modules. - -If you already installed it and would like to update it, run the following command: - -```bash -pip install anonipy --upgrade -``` - -## Document anonymization - -### Extracting the text from the document - -Next, we will use the `anonipy` package to anonymize the text in the document. -First, we must extract the text. This can be done using the package's utility -function `open_file`. It uses the [textract](https://textract.readthedocs.io/en/stable/) -package to extract the text from different types of documents. - - -To extract the text, using the following code: - -```python -from anonipy.utils.file_system import open_file - -file_text = open_file(file_path) -``` - -where `file_path` is the path to the document we want to anonymize. The `open_file` -will open the document, extract the content, and return it as a string. - -Once this is done, we can start anonymizing the text, in a regular way. - -### Extracting personal information from the text - -Now we can identify and extract personal information from the text. 
We do this -by using `EntityExtractor`, an extractor that leverages the -[GLiNER](https://github.com/urchade/GLiNER) span-based NER models. - -It returns the text and the extracted entities. - -```python -from anonipy.constants import LANGUAGES -from anonipy.anonymize.extractors import EntityExtractor - -# define the labels to be extracted and their types -labels = [ - {"label": "name", "type": "string"}, - {"label": "social security number", "type": "custom"}, - {"label": "date of birth", "type": "date"}, - {"label": "date", "type": "date"}, -] - -# initialize the entity extractor -entity_extractor = EntityExtractor( - labels, lang=LANGUAGES.ENGLISH, score_th=args.score_th -) -# extract the entities from the original text -doc, entities = entity_extractor(file_text) -``` - -To display the entities in the original text, we can use the `display` method: - -```python -entity_extractor.display(doc) -``` - - -### Preparing the anonymization mapping - -Next, we prepare the anonymization mapping. We do this by using the generators -module part of the `anonipy` package. The generators are used to generate -substitutes for the entities. - -For example, we can use `MaskLabelGenerator` to generate substitutes using the -language models to solve a `mask-filling` problem, i.e. finding the words that -would be probabilistically suitable to replace the entity in the text. - -The full list of available generators can be found [here][generators]. - -Furthermore, we use the `PseudonymizationStrategy` to anonymize the text. More -on anonymization strategies can be found [here][strategies]. 
- - -```python -from anonipy.anonymize.generators import ( - MaskLabelGenerator, - DateGenerator, - NumberGenerator, -) -from anonipy.anonymize.strategies import PseudonymizationStrategy - -# initialize the generators -mask_generator = MaskLabelGenerator() -date_generator = DateGenerator() -number_generator = NumberGenerator() - -# prepare the anonymization mapping -def anonymization_mapping(text, entity): - if entity.type == "string": - return mask_generator.generate(entity, text) - if entity.label == "date": - return date_generator.generate(entity, output_gen="middle_of_the_month") - if entity.label == "date of birth": - return date_generator.generate(entity, output_gen="middle_of_the_year") - if entity.label == "social security number": - return number_generator.generate(entity) - return "[REDACTED]" - -# initialize the pseudonymization strategy -pseudo_strategy = PseudonymizationStrategy(mapping=anonymization_mapping) -``` - -### Anonymizing the text - -Once we prepare the anonymization strategy, we can use it to anonymize the text. - -```python -# anonymize the original text -anonymized_text, replacements = pseudo_strategy.anonymize(file_text, entities) -``` - -### Saving the anonymized text - -Finally, we can save the anonymized text to a file. This can be done using the -`write_file` function from the `anonipy.utils.file_system` module. - -```python -from anonipy.utils.file_system import write_file - -write_file(anonymized_text, output_file, encode="utf-8") -``` - -Where `output_file` is the path to the file where the anonymized text will be saved. - - -## Conclusion - -In this blog post, we show how one can anonymize a document using the `anonipy` package. -We first used the `open_file` utility function to extract the content of the document -and store it as a string. 
We then used the `EntityExtractor` to identify and extract -personal information form the text, and the `PseudonymizationStrategy` in combination -with various generators to anonymize the text. Finally, we used the `write_file` -to save the anonymized text to a file. - -This process is very straightforward and can be applied to almost any document type. -Furthermore, it can be expanded to process multiple documents written in the same -language at once. Stay tuned to see how this can be done in the future! - -[generators]: ../../documentation/notebooks/02-generators.ipynb -[strategies]: ../../documentation/notebooks/03-strategies.ipynb \ No newline at end of file diff --git a/docs/changelog.md b/docs/changelog.md new file mode 100644 index 0000000..ece2e41 --- /dev/null +++ b/docs/changelog.md @@ -0,0 +1,5 @@ +--- +title: Changelog +--- + +--8<-- "CHANGELOG.md" \ No newline at end of file diff --git a/docs/css/extra.css b/docs/css/extra.css new file mode 100644 index 0000000..615c911 --- /dev/null +++ b/docs/css/extra.css @@ -0,0 +1,4 @@ +/* Hide the package name in the header */ +header.md-header .md-header__ellipsis .md-header__topic:first-of-type { + display: none; +} \ No newline at end of file diff --git a/docs/development.md b/docs/development.md index a08c15d..34587a7 100644 --- a/docs/development.md +++ b/docs/development.md @@ -6,7 +6,7 @@ title: Development This section is for developers only. It describes the requirements, the setup process, how to run tests, and how to deploy. -## ✅ Requirements +## Requirements Before starting the project make sure these requirements are available: - [python][python]. The python programming language (v3.8, v3.9, v3.10, v3.11). @@ -14,7 +14,7 @@ Before starting the project make sure these requirements are available: - [git][git]. For versioning your code. 
-## 🛠️ Setup +## Setup ### Create the python environment @@ -49,7 +49,7 @@ pip install -e .[all] githooks ``` -## 🧪 Tests +## Tests To run existing tests, simply run: @@ -57,7 +57,7 @@ To run existing tests, simply run: python -m unittest discover test ``` -## 📝 Documentation +## Documentation To start live-reloading the documentation, run: diff --git a/docs/documentation/notebooks/00-overview.ipynb b/docs/documentation/notebooks/00-overview.ipynb deleted file mode 100644 index 527ece0..0000000 --- a/docs/documentation/notebooks/00-overview.ipynb +++ /dev/null @@ -1,607 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Overview\n", - "\n", - "This notebook provides an overview of the package and its functionality." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# used to hide warnings\n", - "import warnings\n", - "\n", - "warnings.filterwarnings(\"ignore\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let us first define the text, from which we will showcase the package's functionality." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "original_text = \"\"\"\\\n", - "Medical Record\n", - "\n", - "Patient Name: John Doe\n", - "Date of Birth: 15-01-1985\n", - "Date of Examination: 20-05-2024\n", - "Social Security Number: 123-45-6789\n", - "\n", - "Examination Procedure:\n", - "John Doe underwent a routine physical examination. The procedure included measuring vital signs (blood pressure, heart rate, temperature), a comprehensive blood panel, and a cardiovascular stress test. 
The patient also reported occasional headaches and dizziness, prompting a neurological assessment and an MRI scan to rule out any underlying issues.\n", - "\n", - "Medication Prescribed:\n", - "\n", - "Ibuprofen 200 mg: Take one tablet every 6-8 hours as needed for headache and pain relief.\n", - "Lisinopril 10 mg: Take one tablet daily to manage high blood pressure.\n", - "Next Examination Date:\n", - "15-11-2024\n", - "\"\"\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Extract personal information from text" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The `anonipy` has implemented entity extraction components, that can be used to extract personal information from text.\n", - "\n", - "More can be found in the chapter [Extractors](/documentation/notebooks/01-extractors)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Language detector" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "from anonipy.utils.language_detector import LanguageDetector\n", - "\n", - "lang_detector = LanguageDetector()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "('en', 'English')" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# identify the language of the original text\n", - "language = lang_detector(original_text)\n", - "language" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Extract personal information" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "from anonipy.anonymize.extractors import EntityExtractor" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "# define the labels to be extracted and anonymized\n", - "labels = 
[\n", - " {\"label\": \"name\", \"type\": \"string\"},\n", - " {\n", - " \"label\": \"social security number\",\n", - " \"type\": \"custom\",\n", - " \"regex\": \"[0-9]{3}-[0-9]{2}-[0-9]{4}\",\n", - " },\n", - " {\"label\": \"date of birth\", \"type\": \"date\"},\n", - " {\"label\": \"date\", \"type\": \"date\"},\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "# language taken from the language detector\n", - "entity_extractor = EntityExtractor(labels, lang=language, score_th=0.5)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "# extract the entities from the original text\n", - "doc, entities = entity_extractor(original_text)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
Medical Record

Patient Name: \n", - "\n", - " John Doe\n", - " name\n", - "\n", - "
Date of Birth: \n", - "\n", - " 15-01-1985\n", - " date of birth\n", - "\n", - "
Date of Examination: \n", - "\n", - " 20-05-2024\n", - " date\n", - "\n", - "
Social Security Number: \n", - "\n", - " 123-45-6789\n", - " social security number\n", - "\n", - "

Examination Procedure:
\n", - "\n", - " John Doe\n", - " name\n", - "\n", - " underwent a routine physical examination. The procedure included measuring vital signs (blood pressure, heart rate, temperature), a comprehensive blood panel, and a cardiovascular stress test. The patient also reported occasional headaches and dizziness, prompting a neurological assessment and an MRI scan to rule out any underlying issues.

Medication Prescribed:

Ibuprofen 200 mg: Take one tablet every 6-8 hours as needed for headache and pain relief.
Lisinopril 10 mg: Take one tablet daily to manage high blood pressure.
Next Examination Date:
\n", - "\n", - " 15-11-2024\n", - " date\n", - "\n", - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# display the entities in the original text\n", - "entity_extractor.display(doc)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The extracted entities metadata is available in the `entities` variable, which are:" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[Entity(text='John Doe', label='name', start_index=30, end_index=38, score=0.9961156845092773, type='string', regex='.*'),\n", - " Entity(text='15-01-1985', label='date of birth', start_index=54, end_index=64, score=0.9937193393707275, type='date', regex='(\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{2,4})|(\\\\d{2,4}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2})'),\n", - " Entity(text='20-05-2024', label='date', start_index=86, end_index=96, score=0.9867385625839233, type='date', regex='(\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{2,4})|(\\\\d{2,4}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2})'),\n", - " Entity(text='123-45-6789', label='social security number', start_index=121, end_index=132, score=0.9993416666984558, type='custom', regex='[0-9]{3}-[0-9]{2}-[0-9]{4}'),\n", - " Entity(text='John Doe', label='name', start_index=157, end_index=165, score=0.994924783706665, type='string', regex='.*'),\n", - " Entity(text='15-11-2024', label='date', start_index=717, end_index=727, score=0.8285622596740723, type='date', regex='(\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{2,4})|(\\\\d{2,4}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2})')]" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "entities" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Anonymize the original text" - ] - }, - { - "cell_type": "markdown", - "metadata": 
{}, - "source": [ - "The `anonipy` has implemented generators for different types of information, that can be used \n", - "to generate replacements for the original text.\n", - "\n", - "More on generators can be found in the chapter [Generators](/documentation/notebooks/02-generators),\n", - "while chapter [Strategies](/documentation/notebooks/03-strategies) provides strategies for anonymizing\n", - "the original text." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Prepare generators for generating replacements" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "from anonipy.anonymize.generators import (\n", - " LLMLabelGenerator,\n", - " DateGenerator,\n", - " NumberGenerator,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Loading checkpoint shards: 100%|██████████| 4/4 [00:21<00:00, 5.44s/it]\n" - ] - } - ], - "source": [ - "# initialize the generators\n", - "llm_generator = LLMLabelGenerator()\n", - "date_generator = DateGenerator()\n", - "number_generator = NumberGenerator()" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "# prepare the anonymization mapping\n", - "def anonymization_mapping(text, entity):\n", - " if entity.type == \"string\":\n", - " return llm_generator.generate(entity, temperature=0.7)\n", - " if entity.label == \"date\":\n", - " return date_generator.generate(entity, output_gen=\"middle_of_the_month\")\n", - " if entity.label == \"date of birth\":\n", - " return date_generator.generate(entity, output_gen=\"middle_of_the_year\")\n", - " if entity.label == \"social security number\":\n", - " return number_generator.generate(entity)\n", - " return \"[REDACTED]\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Anonymize the original 
text" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "from anonipy.anonymize.strategies import PseudonymizationStrategy" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "# initialize the pseudonymization strategy\n", - "pseudo_strategy = PseudonymizationStrategy(mapping=anonymization_mapping)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "# anonymize the original text\n", - "anonymized_text, replacements = pseudo_strategy.anonymize(original_text, entities)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The anonymized text is:" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Medical Record\n", - "\n", - "Patient Name: Ethan Lane\n", - "Date of Birth: 01-07-1985\n", - "Date of Examination: 15-05-2024\n", - "Social Security Number: 588-85-9388\n", - "\n", - "Examination Procedure:\n", - "Ethan Lane underwent a routine physical examination. The procedure included measuring vital signs (blood pressure, heart rate, temperature), a comprehensive blood panel, and a cardiovascular stress test. 
The patient also reported occasional headaches and dizziness, prompting a neurological assessment and an MRI scan to rule out any underlying issues.\n", - "\n", - "Medication Prescribed:\n", - "\n", - "Ibuprofen 200 mg: Take one tablet every 6-8 hours as needed for headache and pain relief.\n", - "Lisinopril 10 mg: Take one tablet daily to manage high blood pressure.\n", - "Next Examination Date:\n", - "15-11-2024\n", - "\n" - ] - } - ], - "source": [ - "print(anonymized_text)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And the associated replacements are:" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'original_text': '15-11-2024',\n", - " 'label': 'date',\n", - " 'start_index': 717,\n", - " 'end_index': 727,\n", - " 'anonymized_text': '15-11-2024'},\n", - " {'original_text': 'John Doe',\n", - " 'label': 'name',\n", - " 'start_index': 157,\n", - " 'end_index': 165,\n", - " 'anonymized_text': 'Ethan Lane'},\n", - " {'original_text': '123-45-6789',\n", - " 'label': 'social security number',\n", - " 'start_index': 121,\n", - " 'end_index': 132,\n", - " 'anonymized_text': '588-85-9388'},\n", - " {'original_text': '20-05-2024',\n", - " 'label': 'date',\n", - " 'start_index': 86,\n", - " 'end_index': 96,\n", - " 'anonymized_text': '15-05-2024'},\n", - " {'original_text': '15-01-1985',\n", - " 'label': 'date of birth',\n", - " 'start_index': 54,\n", - " 'end_index': 64,\n", - " 'anonymized_text': '01-07-1985'},\n", - " {'original_text': 'John Doe',\n", - " 'label': 'name',\n", - " 'start_index': 30,\n", - " 'end_index': 38,\n", - " 'anonymized_text': 'Ethan Lane'}]" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "replacements" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Fixing the anonymized text" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - 
"source": [ - "In case the anonymized text is not suitable, we can fix it by using the `anonymize` function found in the `anonipy.anonymize` module.\n", - "To do this, let us define a new set of replacements. \n", - "\n", - "We can edit existing replacements by changing the `anonymized_text` value, remove the ones that are not suitable,\n", - "and add missing ones.\n", - "\n", - "Note that the new set does not require the `original_text` and `label` values." - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [], - "source": [ - "new_replacements = [\n", - " {\n", - " \"start_index\": 30,\n", - " \"end_index\": 38,\n", - " \"anonymized_text\": \"Mark Strong\",\n", - " },\n", - " {\n", - " \"original_text\": \"20-05-2024\",\n", - " \"label\": \"date\",\n", - " \"start_index\": 86,\n", - " \"end_index\": 96,\n", - " \"anonymized_text\": \"18-05-2024\",\n", - " },\n", - " {\n", - " \"original_text\": \"123-45-6789\",\n", - " \"label\": \"social security number\",\n", - " \"start_index\": 121,\n", - " \"end_index\": 132,\n", - " \"anonymized_text\": \"119-88-7014\",\n", - " },\n", - " {\n", - " \"original_text\": \"John Doe\",\n", - " \"label\": \"name\",\n", - " \"start_index\": 157,\n", - " \"end_index\": 165,\n", - " \"anonymized_text\": \"Mark Strong\",\n", - " },\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, let us anonymize the original text using the new replacements." 
- ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [], - "source": [ - "from anonipy.anonymize import anonymize" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [], - "source": [ - "# anonymize the original text using the new replacements\n", - "anonymized_text, replacements = anonymize(original_text, new_replacements)" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Medical Record\n", - "\n", - "Patient Name: Mark Strong\n", - "Date of Birth: 15-01-1985\n", - "Date of Examination: 18-05-2024\n", - "Social Security Number: 119-88-7014\n", - "\n", - "Examination Procedure:\n", - "Mark Strong underwent a routine physical examination. The procedure included measuring vital signs (blood pressure, heart rate, temperature), a comprehensive blood panel, and a cardiovascular stress test. The patient also reported occasional headaches and dizziness, prompting a neurological assessment and an MRI scan to rule out any underlying issues.\n", - "\n", - "Medication Prescribed:\n", - "\n", - "Ibuprofen 200 mg: Take one tablet every 6-8 hours as needed for headache and pain relief.\n", - "Lisinopril 10 mg: Take one tablet daily to manage high blood pressure.\n", - "Next Examination Date:\n", - "15-11-2024\n", - "\n" - ] - } - ], - "source": [ - "print(anonymized_text)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - } - }, - "nbformat": 4, - "nbformat_minor": 
2 -} diff --git a/docs/documentation/notebooks/01-extractors.ipynb b/docs/documentation/notebooks/01-extractors.ipynb deleted file mode 100644 index e47f579..0000000 --- a/docs/documentation/notebooks/01-extractors.ipynb +++ /dev/null @@ -1,708 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Extractors\n", - "\n", - "This chapter showcases how to use the label extractors in the package.\n", - "\n", - "The label extractors are used to extract relevant `named entities` from text. These \n", - "entities can be people names, organizations, addresses, social security numbers, etc.\n", - "The entities are then used to anonymize the text." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# used to hide warnings\n", - "import warnings\n", - "\n", - "warnings.filterwarnings(\"ignore\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let us first define the text, from which we want to extract the entities." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "original_text = \"\"\"\\\n", - "Medical Record\n", - "\n", - "Patient Name: John Doe\n", - "Date of Birth: 15-01-1985\n", - "Date of Examination: 20-05-2024\n", - "Social Security Number: 123-45-6789\n", - "\n", - "Examination Procedure:\n", - "John Doe underwent a routine physical examination. The procedure included measuring vital signs (blood pressure, heart rate, temperature), a comprehensive blood panel, and a cardiovascular stress test. 
The patient also reported occasional headaches and dizziness, prompting a neurological assessment and an MRI scan to rule out any underlying issues.\n", - "\n", - "Medication Prescribed:\n", - "\n", - "Ibuprofen 200 mg: Take one tablet every 6-8 hours as needed for headache and pain relief.\n", - "Lisinopril 10 mg: Take one tablet daily to manage high blood pressure.\n", - "Next Examination Date:\n", - "15-11-2024\n", - "\"\"\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Language configuration" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First, we must specify the language that the text is written in. We can do this manually or by using a language detector." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Manual selection" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "One option, when all of the texts are in the same language, is to use manually specifying the text language. \n", - "In the `anonipy` package, we provide a constant called `LANGUAGES` in the `constants` submodule, which \n", - "contains all the supported languages. 
Please find the format of the language code in the `constants` module.\n", - "\n", - "Since the `original_text` is in English, we will use the `LANGUAGES.ENGLISH` predefined constant.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "from anonipy.constants import LANGUAGES" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "('en', 'English')" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "LANGUAGES.ENGLISH" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Using language detector" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "An alternative is to use a language detector available in the `anonipy` package. \n", - "The language detector is created using the [lingua](https://github.com/pemistahl/lingua-py) python package, \n", - "and allows automatic detection of the text language." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "from anonipy.utils.language_detector import LanguageDetector" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Initialize the language detector and use it to automatically detect the language of the text." 
- ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "('en', 'English')" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "lang_detector = LanguageDetector()\n", - "lang_detector(original_text)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Using extractors" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Initialization" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can now initialize the label extractors. This is done using the `EntityExtractor` class found in `anonipy.anonymize.extractors` submodule.\n", - "\n", - "
\n", - "

Info

\n", - "

\n", - " The EntityExtractor class is created using the GLiNER models, specifically the one that is finetuned for recognizing Personally Identifiable Information (PII) within text. The model has been finetuned on six languages (English, French, German, Spanish, Italian, and Portuguese), but can be applied also to other languages.\n", - "

\n", - "
" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "from anonipy.anonymize.extractors import EntityExtractor" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The `EntityExtractor` class takes the following arguments:\n", - "\n", - "- `labels`: A list of dictionaries containing the labels to be extracted. \n", - "- `lang`: The language of the text to be anonymized. Defaults to `LANGUAGES.ENGLISH`.\n", - "- `score_th`: The score threshold used to filter the labels, i.e. the entity has to have a score greater than `score_th` to be considered. Defaults to 0.5.\n", - "- `use_gpu`: Whether to use the GPU. Defaults to `False`.\n", - "\n", - "We must now define the labels to be extracted. In this example, we will extract the people name, the dates, and the social security number from the text." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "labels = [\n", - " {\"label\": \"name\", \"type\": \"string\"},\n", - " {\n", - " \"label\": \"social security number\",\n", - " \"type\": \"custom\",\n", - " \"regex\": \"[0-9]{3}-[0-9]{2}-[0-9]{4}\",\n", - " },\n", - " {\"label\": \"date of birth\", \"type\": \"date\"},\n", - " {\"label\": \"date\", \"type\": \"date\"},\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let us now initialize the entity extractor.\n", - "\n", - "
\n", - "

Info

\n", - "

\n", - " The initialization of EntityExtractor will throw some warnings. Ignore them.\n", - " These are expected due to the use of package dependencies.\n", - "

\n", - "
" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "entity_extractor = EntityExtractor(labels, lang=LANGUAGES.ENGLISH, score_th=0.5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Entity extraction" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The `EntityExtractor` receives the text to be anonymized and returns the enriched text document and the extracted entities." - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "doc, entities = entity_extractor(original_text)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The entities extracted within the input text are:" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
Medical Record

Patient Name: \n", - "\n", - " John Doe\n", - " name\n", - "\n", - "
Date of Birth: \n", - "\n", - " 15-01-1985\n", - " date of birth\n", - "\n", - "
Date of Examination: \n", - "\n", - " 20-05-2024\n", - " date\n", - "\n", - "
Social Security Number: \n", - "\n", - " 123-45-6789\n", - " social security number\n", - "\n", - "

Examination Procedure:
\n", - "\n", - " John Doe\n", - " name\n", - "\n", - " underwent a routine physical examination. The procedure included measuring vital signs (blood pressure, heart rate, temperature), a comprehensive blood panel, and a cardiovascular stress test. The patient also reported occasional headaches and dizziness, prompting a neurological assessment and an MRI scan to rule out any underlying issues.

Medication Prescribed:

Ibuprofen 200 mg: Take one tablet every 6-8 hours as needed for headache and pain relief.
Lisinopril 10 mg: Take one tablet daily to manage high blood pressure.
Next Examination Date:
\n", - "\n", - " 15-11-2024\n", - " date\n", - "\n", - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "entity_extractor.display(doc)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The extracted entities are stored in the `entities` variable. Each entity contains the following information:\n", - "\n", - "- `text`: The text of the entity.\n", - "- `label`: The label of the entity.\n", - "- `start_index`: The start index of the entity in the text.\n", - "- `end_index`: The end index of the entity in the text.\n", - "- `score`: The score of the entity. It shows how certain the model is that the entity is relevant.\n", - "- `type`: The type of the entity (taken from the defined `labels` variable list).\n", - "- `regex`: The regular expression the entity must match." - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[Entity(text='John Doe', label='name', start_index=30, end_index=38, score=0.9961156845092773, type='string', regex='.*'),\n", - " Entity(text='15-01-1985', label='date of birth', start_index=54, end_index=64, score=0.9937193393707275, type='date', regex='(\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{2,4})|(\\\\d{2,4}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2})'),\n", - " Entity(text='20-05-2024', label='date', start_index=86, end_index=96, score=0.9867385625839233, type='date', regex='(\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{2,4})|(\\\\d{2,4}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2})'),\n", - " Entity(text='123-45-6789', label='social security number', start_index=121, end_index=132, score=0.9993416666984558, type='custom', regex='[0-9]{3}-[0-9]{2}-[0-9]{4}'),\n", - " Entity(text='John Doe', label='name', start_index=157, end_index=165, score=0.994924783706665, type='string', regex='.*'),\n", - " Entity(text='15-11-2024', label='date', start_index=717, end_index=727, 
score=0.8285622596740723, type='date', regex='(\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{2,4})|(\\\\d{2,4}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2})')]" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "entities" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Advices and suggestions" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Use specific label names.**\n", - "In the above example, we used specific label names to extract the entities. If \n", - "we use a less specific name, the entity extractor might not find any relevant entity.\n", - "\n", - "For instance, when using `social security number` as the label name, the entity extractor\n", - "is able to extract the social security number from the text. However, if we use `ssn` or \n", - "just `number` as the label name, the entity extractor might not find any relevant entity.\n", - "\n", - "
\n", - "

Tip

\n", - "

\n", - " Using more specific label names is better.\n", - "

\n", - "
" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Use custom regex patterns.**\n", - "In the `anonipy` package, we provide some predefined entity types, which are:\n", - "\n", - "- `string`. Extracts a string from the text.\n", - "- `integer`. Extracts an integer from the text.\n", - "- `float`. Extracts a float from the text.\n", - "- `date`. Extracts a date from the text.\n", - "- `email`. Extracts an email address from the text.\n", - "- `phone_number`. Extracts a phone number from the text.\n", - "- `website_url`. Extracts an URL from the text.\n", - "\n", - "These entity types also have a corresponding regex pattern, as defined in the `anonipy.anonymize.regex` submodule." - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "string : .*\n", - "integer : \\d+\n", - "float : [\\d\\.,]+\n", - "date : (\\d{1,2}[\\/\\-\\.]\\d{1,2}[\\/\\-\\.]\\d{2,4})|(\\d{2,4}[\\/\\-\\.]\\d{1,2}[\\/\\-\\.]\\d{1,2})\n", - "email : [a-zA-Z0-9.!#$%&’*+/=?^_`{|}~-]+@[a-zA-Z0-9-]+(?:\\.[a-zA-Z0-9-]+)*\n", - "phone_number : [(]?[\\+]?[(]?[0-9]{1,3}[)]?[-\\s\\.]?([0-9]{2,}[-\\s\\.]?){2,}([0-9]{3,})\n", - "website_url : ((https?|ftp|smtp):\\/\\/)?(www.)?([a-zA-Z0-9]+\\.)+[a-z]{2,}(\\/[a-zA-Z0-9#\\?\\_\\.\\=\\-\\&]+|\\/?)*\n" - ] - } - ], - "source": [ - "from anonipy.anonymize.regex import regex_map\n", - "\n", - "for type in [\n", - " \"string\",\n", - " \"integer\",\n", - " \"float\",\n", - " \"date\",\n", - " \"email\",\n", - " \"phone_number\",\n", - " \"website_url\",\n", - "]:\n", - " print(f\"{type:<13}: {regex_map(type)}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If the user wants to use a custom regex pattern, they can define it in the `labels` \n", - "variable list. 
Using a custom regex pattern allows the user to specify a more strict \n", - "pattern that the entity must match.\n", - "\n", - "The custom regex can be specified in the following way:" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "labels = [\n", - " {\"label\": \"name\", \"type\": \"string\"},\n", - " # using the custom regex pattern: type must be 'custom' and specify the regex pattern in the 'regex' key\n", - " {\n", - " \"label\": \"social security number\",\n", - " \"type\": \"custom\",\n", - " \"regex\": \"[0-9]{3}-[0-9]{2}-[0-9]{4}\",\n", - " },\n", - " {\"label\": \"date of birth\", \"type\": \"date\"},\n", - " {\"label\": \"date\", \"type\": \"date\"},\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Lets rerun the above example:" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "# ignore the warnings: these are expected due to the use of package dependencies\n", - "entity_extractor = EntityExtractor(labels, lang=LANGUAGES.ENGLISH, score_th=0.5)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "doc, entities = entity_extractor(original_text)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The extracted entities are the same as before. The difference is that the social \n", - "security number now also had to match the custom regex pattern." - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
Medical Record

Patient Name: \n", - "\n", - " John Doe\n", - " name\n", - "\n", - "
Date of Birth: \n", - "\n", - " 15-01-1985\n", - " date of birth\n", - "\n", - "
Date of Examination: \n", - "\n", - " 20-05-2024\n", - " date\n", - "\n", - "
Social Security Number: \n", - "\n", - " 123-45-6789\n", - " social security number\n", - "\n", - "

Examination Procedure:
\n", - "\n", - " John Doe\n", - " name\n", - "\n", - " underwent a routine physical examination. The procedure included measuring vital signs (blood pressure, heart rate, temperature), a comprehensive blood panel, and a cardiovascular stress test. The patient also reported occasional headaches and dizziness, prompting a neurological assessment and an MRI scan to rule out any underlying issues.

Medication Prescribed:

Ibuprofen 200 mg: Take one tablet every 6-8 hours as needed for headache and pain relief.
Lisinopril 10 mg: Take one tablet daily to manage high blood pressure.
Next Examination Date:
\n", - "\n", - " 15-11-2024\n", - " date\n", - "\n", - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "entity_extractor.display(doc)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Creating custom extractors" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The user can develop their own custom extractor. To do this, the custom extractor\n", - "must inherit from the `ExtractorInterface` class. \n", - "\n", - "The extractor must have two methods defined: `__init__` and `__call__`.\n", - "\n", - "An example of a custom extractor that extracts only a specific regex pattern from \n", - "the text is shown below:" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [], - "source": [ - "import re\n", - "from anonipy.anonymize.extractors import ExtractorInterface\n", - "from anonipy.definitions import Entity\n", - "\n", - "\n", - "class CustomExtractor(ExtractorInterface):\n", - "\n", - " def __init__(self):\n", - " # the custom extractor will retrieve entities that follow the regex pattern\n", - " self.regex_pattern = re.compile(r\"\\d{1,2}-\\d{1,2}-\\d{2,4}\")\n", - "\n", - " def __call__(self, text: str) -> tuple[str, list[Entity]]:\n", - " entities = []\n", - " for match in re.finditer(self.regex_pattern, text):\n", - " entities.append(\n", - " Entity(\n", - " text=match.group(),\n", - " label=\"date\",\n", - " start_index=match.start(),\n", - " end_index=match.end(),\n", - " score=1.0,\n", - " type=\"date\",\n", - " regex=self.regex_pattern,\n", - " )\n", - " )\n", - " return text, entities" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [], - "source": [ - "custom_extractor = CustomExtractor()\n", - "_, entities = custom_extractor(original_text)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let us output the extracted entities. 
Note that the third entity corresponds to a part of the social security number." - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[Entity(text='15-01-1985', label='date', start_index=54, end_index=64, score=1.0, type='date', regex=re.compile('\\\\d{1,2}-\\\\d{1,2}-\\\\d{2,4}')),\n", - " Entity(text='20-05-2024', label='date', start_index=86, end_index=96, score=1.0, type='date', regex=re.compile('\\\\d{1,2}-\\\\d{1,2}-\\\\d{2,4}')),\n", - " Entity(text='23-45-6789', label='date', start_index=122, end_index=132, score=1.0, type='date', regex=re.compile('\\\\d{1,2}-\\\\d{1,2}-\\\\d{2,4}')),\n", - " Entity(text='15-11-2024', label='date', start_index=717, end_index=727, score=1.0, type='date', regex=re.compile('\\\\d{1,2}-\\\\d{1,2}-\\\\d{2,4}'))]" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "entities" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/docs/documentation/notebooks/02-generators.ipynb b/docs/documentation/notebooks/02-generators.ipynb deleted file mode 100644 index 4064f7c..0000000 --- a/docs/documentation/notebooks/02-generators.ipynb +++ /dev/null @@ -1,917 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Generators" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This chapter showcases the generators in the `anonipy` package.\n", - "\n", - "The 
main motivation behind generators is to generate replacements for entities. \n", - "In order to do this, `anonipy` has implemented a number of generators for generating:\n", - "\n", - "- strings\n", - "- numbers\n", - "- dates\n", - "\n", - "All of the generators are implemented in the `anonipy.anonymize.generators` module. \n", - "In the following section, we will present each generator in detail." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# used to hide warnings\n", - "import warnings\n", - "\n", - "warnings.filterwarnings(\"ignore\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let us first define the text and the associated entities, as seen in the\n", - "previous chapter (see [Extractors](/documentation/notebooks/01-extractors/))." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "original_text = \"\"\"\\\n", - "Medical Record\n", - "\n", - "Patient Name: John Doe\n", - "Date of Birth: 15-01-1985\n", - "Date of Examination: 20-05-2024\n", - "Social Security Number: 123-45-6789\n", - "\n", - "Examination Procedure:\n", - "John Doe underwent a routine physical examination. The procedure included measuring vital signs (blood pressure, heart rate, temperature), a comprehensive blood panel, and a cardiovascular stress test. The patient also reported occasional headaches and dizziness, prompting a neurological assessment and an MRI scan to rule out any underlying issues.\n", - "\n", - "Medication Prescribed:\n", - "\n", - "Ibuprofen 200 mg: Take one tablet every 6-8 hours as needed for headache and pain relief.\n", - "Lisinopril 10 mg: Take one tablet daily to manage high blood pressure.\n", - "Next Examination Date:\n", - "15-11-2024\n", - "\"\"\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Normally, the entities are extracted using the the `EntityExtractor`. 
For this section,\n", - "we manually define the entities:" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "from anonipy.definitions import Entity\n", - "\n", - "entities = [\n", - " Entity(\n", - " text=\"John Doe\",\n", - " label=\"name\",\n", - " start_index=30,\n", - " end_index=38,\n", - " score=1.0,\n", - " type=\"string\",\n", - " regex=\".*\",\n", - " ),\n", - " Entity(\n", - " text=\"15-01-1985\",\n", - " label=\"date of birth\",\n", - " start_index=54,\n", - " end_index=64,\n", - " score=1.0,\n", - " type=\"date\",\n", - " regex=\"(\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{2,4})|(\\\\d{2,4}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2})\",\n", - " ),\n", - " Entity(\n", - " text=\"20-05-2024\",\n", - " label=\"date\",\n", - " start_index=86,\n", - " end_index=96,\n", - " score=1.0,\n", - " type=\"date\",\n", - " regex=\"(\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{2,4})|(\\\\d{2,4}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2})\",\n", - " ),\n", - " Entity(\n", - " text=\"123-45-6789\",\n", - " label=\"social security number\",\n", - " start_index=121,\n", - " end_index=132,\n", - " score=1.0,\n", - " type=\"custom\",\n", - " regex=\"[0-9]{3}-[0-9]{2}-[0-9]{4}\",\n", - " ),\n", - " Entity(\n", - " text=\"John Doe\",\n", - " label=\"name\",\n", - " start_index=157,\n", - " end_index=165,\n", - " score=1.0,\n", - " type=\"string\",\n", - " regex=\".*\",\n", - " ),\n", - " Entity(\n", - " text=\"15-11-2024\",\n", - " label=\"date\",\n", - " start_index=717,\n", - " end_index=727,\n", - " score=1.0,\n", - " type=\"date\",\n", - " regex=\"(\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{2,4})|(\\\\d{2,4}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2})\",\n", - " ),\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## LLMLabelGenerator" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - 
"source": [ - "
\n", - "

Warning

\n", - "

\n", - " The LLMLabelGenerator utilizes the open source LLMs, \n", - " specifically the Llama 3 model.\n", - " Because the model is quite large, we utilize quantization using the bitsandbytes package to reduce its size.\n", - " Therefore, the LLMLabelGenerator requires at least 8GB GPU and CUDA drivers to be available.\n", - " If these resources are not available on your machine, you can use the MaskLabelGenerator instead.\n", - "

\n", - "
\n", - "\n", - "The `LLMLabelGenerator` is a one-stop-shop generator that utilizes LLMs to generate replacements for entities. It is implemented to support any entity type.\n", - "\n", - "For more details, please check the `LLMLabelGenerator` class implementation." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let us first import the generator and initialize it.\n", - "\n", - "
\n", - "

Info

\n", - "

\n", - " The initialization of LLMLabelGenerator will throw some warnings. Ignore them.\n", - " These are expected due to the use of package dependencies.\n", - "

\n", - "
" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "9a6dfc4a4fd74bcc8351b6b01755f18a", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Loading checkpoint shards: 0%| | 0/4 [00:00XLM-RoBERTa, to generate replacements for entities. It is implemented to support any entity type, but we suggest using it \n", - "with string entities. For other entity types, please use other generators.\n", - "\n", - "For more details, please check the `MaskLabelGenerator` class implementation." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let us first import the generator and initialize it. The generator at initialization \n", - "can receive the following parameters:\n", - "\n", - "- `model_name`: The model to use for the generation (Default: \"FacebookAI/xlm-roberta-large\").\n", - "- `use_gpu`: Whether to use the GPU for the generation (Default: False).\n", - "- `context_window`: The size of the context window to both sides of the entity to use for the generation.\n", - " If the context window is set to 100, the context will be the 100 characters before and after the entity (Default: 100).\n", - "\n", - "
\n", - "

Info

\n", - "

\n", - " The initialization of MaskLabelGenerator will throw some warnings. Ignore them.\n", - " These are expected due to the use of package dependencies.\n", - "

\n", - "
" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Some weights of the model checkpoint at FacebookAI/xlm-roberta-large were not used when initializing XLMRobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']\n", - "- This IS expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", - "- This IS NOT expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n" - ] - } - ], - "source": [ - "from anonipy.anonymize.generators import MaskLabelGenerator\n", - "\n", - "# initialization using default parameters\n", - "mask_generator = MaskLabelGenerator()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To use the generator, we can call the `generate` method. The `generate` method receives the following parameters:\n", - "\n", - "- `entity`: The entity to generate a replacement for.\n", - "- `original_text`: The original text from which the generator will retrieve the context of the entity text." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This generator will create a list of suggestions from which it will select one at random. Therefore, the generator will return different suggestions every time it is called." 
- ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'James Smith'" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mask_generator.generate(entities[0], text=original_text)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'Michael Smith'" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mask_generator.generate(entities[0], text=original_text)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'David Smith'" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mask_generator.generate(entities[0], text=original_text)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "John Doe | name | Thomas David\n", - "15-01-1985 | date of birth | None\n", - "20-05-2024 | date | None\n", - "123-45-6789 | social security number | None\n", - "John Doe | name | Officer first\n", - "15-11-2024 | date | None\n" - ] - } - ], - "source": [ - "for entity in entities:\n", - " print(\n", - " f\"{entity.text:<12} | {entity.label:<22} | {mask_generator.generate(entity, text=original_text)}\"\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Advices and suggestions" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Using only for string entities.**\n", - "As seen from the above examples, the `MaskLabelGenerator` is best used with string entities.\n", - "For number and date entities, it is best to use other generators, such as `NumberGenerator` \n", - "and `DateGenerator`." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## NumberGenerator" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The `NumberGenerator` is a generator for generating random numbers. It is implemented to support integers, floats, and \n", - "phone numbers, but it can be used to generate values for custom types which include numbers.\n", - "\n", - "For more details, please check the `NumberGenerator` class implementation." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let us first import the generator and initialize it. The generator at initialization \n", - "does not need any parameters." - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "from anonipy.anonymize.generators import NumberGenerator\n", - "\n", - "number_generator = NumberGenerator()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To use the generator, we can call the `generate` method. The `generate` method receives the following parameters:\n", - "\n", - "- `entity`: The number entity to generate a replacement for." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This generator will create a suggestion by replacing numeric values in the entity text at random. Therefore, the generator will return different suggestions every time it is called." - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'143-46-4915'" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "number_generator.generate(entities[3])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Furthermore, it will throw an error if the entity type is not `integer`, `float`, `phone_number` or `custom`." 
- ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The entity type must be `integer`, `float`, `phone_number` or `custom` to generate numbers.\n" - ] - } - ], - "source": [ - "try:\n", - " number_generator.generate(entities[0])\n", - "except Exception as e:\n", - " print(e)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## DateGenerator" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The `DateGenerator` is a generator for generating dates. It is implemented to support date entities.\n", - "\n", - "For more details, please check the `DateGenerator` class implementation." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let us first import the generator and initialize it. The generator at initialization \n", - "can receive the following parameters:\n", - "\n", - "- `date_format`: The format in which the dates will be provided and generated (Default: \"%d-%m-%Y\").\n", - "- `day_sigma`: The number of days to add or subtract from the date when using the `random` generator method (see below) (Default: 30)." - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "from anonipy.anonymize.generators import DateGenerator\n", - "\n", - "date_generator = DateGenerator()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To use the generator, we can call the `generate` method. The `generate` method receives the following parameters:\n", - "\n", - "- `entity`: The number entity to generate a replacement for.\n", - "- `output_gen`: the method used to generate the date (Default: \"random\"). 
It can be one of:\n", - " - `random`: generates a random date that is between `entity` and `entity` $\\pm$ `day_sigma` days.\n", - " - `first_day_of_the_month`: returns the first day of the month of `entity`.\n", - " - `last_day_of_the_month`: returns the last day of the month of `entity`.\n", - " - `middle_of_the_month`: returns the middle day of the month of `entity`.\n", - " - `middle_of_the_year`: returns the middle day of the year of `entity`." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Using the above parameters, this generator will create the appropriate date suggestions:" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'20-05-2024'" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "entities[2].text" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'26-05-2024'" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "date_generator.generate(entities[2], output_gen=\"random\")" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'01-05-2024'" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "date_generator.generate(entities[2], output_gen=\"first_day_of_the_month\")" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'31-05-2024'" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "date_generator.generate(entities[2], output_gen=\"last_day_of_the_month\")" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "data": { - 
"text/plain": [ - "'15-05-2024'" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "date_generator.generate(entities[2], output_gen=\"middle_of_the_month\")" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'01-07-2024'" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "date_generator.generate(entities[2], output_gen=\"middle_of_the_year\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Furthermore, it will throw an error if the entity type is not `date`." - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The entity type must be `date` to generate dates.\n" - ] - } - ], - "source": [ - "try:\n", - " date_generator.generate(entities[0])\n", - "except Exception as e:\n", - " print(e)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Creating custom generator" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The user can develop their own custom generators. To do this, the custom generator\n", - "must inherit from the `GeneratorInterface` class. 
\n", - "\n", - "The generator must have two methods defined: `__init__` and `generate`,\n", - "where the `generate` method must accept at least the entity.\n", - "\n", - "An example of a custom generator that will generate only emojis is shown below:" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [], - "source": [ - "import random\n", - "from anonipy.anonymize.generators import GeneratorInterface\n", - "from anonipy.definitions import Entity\n", - "\n", - "\n", - "class CustomGenerator(GeneratorInterface):\n", - "\n", - " def __init__(self):\n", - " self.emojis = [\"😄\", \"🤗\", \"😢\"]\n", - "\n", - " def generate(self, entity: Entity) -> tuple[str, list[Entity]]:\n", - " return random.choice(self.emojis)" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'😄'" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "custom_generator = CustomGenerator()\n", - "custom_generator.generate(entities[0])" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/docs/documentation/notebooks/03-strategies.ipynb b/docs/documentation/notebooks/03-strategies.ipynb deleted file mode 100644 index a412b4b..0000000 --- a/docs/documentation/notebooks/03-strategies.ipynb +++ /dev/null @@ -1,694 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Strategies" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This chapter showcases how to use the anonymization strategies 
in the package.\n", - "\n", - "The main motivation behind the anonymization strategies is to streamline the process of data anonymization. The `anonipy` package implements strategies, which can be found in the `anonipy.anonymize.strategies` module.\n", - "\n", - "Furthermore, each strategy has an associated `anonymize` method, which returns the anonymized text and the list of anonymized entities showing which part of the text was anonymized and with which replacement." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# used to hide warnings\n", - "import warnings\n", - "\n", - "warnings.filterwarnings(\"ignore\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let us first define the text and the associated entities, as seen in the\n", - "previous chapter (see [Extractors](/documentation/notebooks/01-extractors/))." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "original_text = \"\"\"\\\n", - "Medical Record\n", - "\n", - "Patient Name: John Doe\n", - "Date of Birth: 15-01-1985\n", - "Date of Examination: 20-05-2024\n", - "Social Security Number: 123-45-6789\n", - "\n", - "Examination Procedure:\n", - "John Doe underwent a routine physical examination. The procedure included measuring vital signs (blood pressure, heart rate, temperature), a comprehensive blood panel, and a cardiovascular stress test. 
The patient also reported occasional headaches and dizziness, prompting a neurological assessment and an MRI scan to rule out any underlying issues.\n", - "\n", - "Medication Prescribed:\n", - "\n", - "Ibuprofen 200 mg: Take one tablet every 6-8 hours as needed for headache and pain relief.\n", - "Lisinopril 10 mg: Take one tablet daily to manage high blood pressure.\n", - "Next Examination Date:\n", - "15-11-2024\n", - "\"\"\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Normally, the entities are extracted using the the `EntityExtractor`. For this section,\n", - "we manually define the entities:" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "from anonipy.definitions import Entity\n", - "\n", - "entities = [\n", - " Entity(\n", - " text=\"John Doe\",\n", - " label=\"name\",\n", - " start_index=30,\n", - " end_index=38,\n", - " score=1.0,\n", - " type=\"string\",\n", - " regex=\".*\",\n", - " ),\n", - " Entity(\n", - " text=\"15-01-1985\",\n", - " label=\"date of birth\",\n", - " start_index=54,\n", - " end_index=64,\n", - " score=1.0,\n", - " type=\"date\",\n", - " regex=\"(\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{2,4})|(\\\\d{2,4}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2})\",\n", - " ),\n", - " Entity(\n", - " text=\"20-05-2024\",\n", - " label=\"date\",\n", - " start_index=86,\n", - " end_index=96,\n", - " score=1.0,\n", - " type=\"date\",\n", - " regex=\"(\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{2,4})|(\\\\d{2,4}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2})\",\n", - " ),\n", - " Entity(\n", - " text=\"123-45-6789\",\n", - " label=\"social security number\",\n", - " start_index=121,\n", - " end_index=132,\n", - " score=1.0,\n", - " type=\"custom\",\n", - " regex=\"[0-9]{3}-[0-9]{2}-[0-9]{4}\",\n", - " ),\n", - " Entity(\n", - " text=\"John Doe\",\n", - " label=\"name\",\n", - " start_index=157,\n", - " 
end_index=165,\n", - " score=1.0,\n", - " type=\"string\",\n", - " regex=\".*\",\n", - " ),\n", - " Entity(\n", - " text=\"15-11-2024\",\n", - " label=\"date\",\n", - " start_index=717,\n", - " end_index=727,\n", - " score=1.0,\n", - " type=\"date\",\n", - " regex=\"(\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{2,4})|(\\\\d{2,4}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2})\",\n", - " ),\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## RedactionStrategy" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Data redaction is the process of obscuring information that’s personally identifiable, confidential, classified or sensitive.\n", - "\n", - "The `RedactionStrategy` anonymizes the original text by replacing the entities in the text with a predefined substitute label, which defaults to `[REDACTED]`.\n", - "\n", - "
\n", - "

Info

\n", - "

\n", - " The redaction strategy hides sensitive information by replacing the original entities with a string that does not\n", - " reveal any information about the original. While this is useful for obscuring information, it does change the\n", - " text's distribution, which can effect the training of machine learning models.\n", - "

\n", - "
" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "from anonipy.anonymize.strategies import RedactionStrategy" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "redaction_strategy = RedactionStrategy(substitute_label=\"[REDACTED]\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Using the strategy, we can anonymize the text:" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "anonymized_text, replacements = redaction_strategy.anonymize(original_text, entities)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The anonymized text is:" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Medical Record\n", - "\n", - "Patient Name: [REDACTED]\n", - "Date of Birth: [REDACTED]\n", - "Date of Examination: [REDACTED]\n", - "Social Security Number: [REDACTED]\n", - "\n", - "Examination Procedure:\n", - "[REDACTED] underwent a routine physical examination. The procedure included measuring vital signs (blood pressure, heart rate, temperature), a comprehensive blood panel, and a cardiovascular stress test. 
The patient also reported occasional headaches and dizziness, prompting a neurological assessment and an MRI scan to rule out any underlying issues.\n", - "\n", - "Medication Prescribed:\n", - "\n", - "Ibuprofen 200 mg: Take one tablet every 6-8 hours as needed for headache and pain relief.\n", - "Lisinopril 10 mg: Take one tablet daily to manage high blood pressure.\n", - "Next Examination Date:\n", - "[REDACTED]\n", - "\n" - ] - } - ], - "source": [ - "print(anonymized_text)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And the associated replacements are:" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'original_text': 'John Doe',\n", - " 'label': 'name',\n", - " 'start_index': 30,\n", - " 'end_index': 38,\n", - " 'anonymized_text': '[REDACTED]'},\n", - " {'original_text': '15-01-1985',\n", - " 'label': 'date of birth',\n", - " 'start_index': 54,\n", - " 'end_index': 64,\n", - " 'anonymized_text': '[REDACTED]'},\n", - " {'original_text': '20-05-2024',\n", - " 'label': 'date',\n", - " 'start_index': 86,\n", - " 'end_index': 96,\n", - " 'anonymized_text': '[REDACTED]'},\n", - " {'original_text': '123-45-6789',\n", - " 'label': 'social security number',\n", - " 'start_index': 121,\n", - " 'end_index': 132,\n", - " 'anonymized_text': '[REDACTED]'},\n", - " {'original_text': 'John Doe',\n", - " 'label': 'name',\n", - " 'start_index': 157,\n", - " 'end_index': 165,\n", - " 'anonymized_text': '[REDACTED]'},\n", - " {'original_text': '15-11-2024',\n", - " 'label': 'date',\n", - " 'start_index': 717,\n", - " 'end_index': 727,\n", - " 'anonymized_text': '[REDACTED]'}]" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "replacements" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## MaskingStrategy" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - 
"Data masking refers to the disclosure of data with modified values. Data anonymization is done by creating a mirror image of a database and implementing alteration strategies, such as character shuffling, encryption, term, or character substitution. For example, a value character may be replaced by a symbol such as “*” or “x.” It makes identification or reverse engineering difficult.\n", - "\n", - "The `MaskingStrategy` anonymizes the original text by replacing the entities with masks, which are created using the subsitute label, which defaults to `*`.\n", - "\n", - "
\n", - "

Info

\n", - "

\n", - " The masking strategy is useful as it hides the original sensitive values and retains the original text's length.\n", - " However, it also changes the original text's meaning and distribution, as the replacement values are not the \n", - " same as the original values.\n", - "

\n", - "
" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "from anonipy.anonymize.strategies import MaskingStrategy" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "masking_strategy = MaskingStrategy(substitute_label=\"*\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Using the strategy, we can anonymize the text:" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "anonymized_text, replacements = masking_strategy.anonymize(original_text, entities)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The anonymized text is:" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Medical Record\n", - "\n", - "Patient Name: **** ***\n", - "Date of Birth: **********\n", - "Date of Examination: **********\n", - "Social Security Number: ***********\n", - "\n", - "Examination Procedure:\n", - "**** *** underwent a routine physical examination. The procedure included measuring vital signs (blood pressure, heart rate, temperature), a comprehensive blood panel, and a cardiovascular stress test. 
The patient also reported occasional headaches and dizziness, prompting a neurological assessment and an MRI scan to rule out any underlying issues.\n", - "\n", - "Medication Prescribed:\n", - "\n", - "Ibuprofen 200 mg: Take one tablet every 6-8 hours as needed for headache and pain relief.\n", - "Lisinopril 10 mg: Take one tablet daily to manage high blood pressure.\n", - "Next Examination Date:\n", - "**********\n", - "\n" - ] - } - ], - "source": [ - "print(anonymized_text)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And the associated replacements are:" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'original_text': 'John Doe',\n", - " 'label': 'name',\n", - " 'start_index': 30,\n", - " 'end_index': 38,\n", - " 'anonymized_text': '**** ***'},\n", - " {'original_text': '15-01-1985',\n", - " 'label': 'date of birth',\n", - " 'start_index': 54,\n", - " 'end_index': 64,\n", - " 'anonymized_text': '**********'},\n", - " {'original_text': '20-05-2024',\n", - " 'label': 'date',\n", - " 'start_index': 86,\n", - " 'end_index': 96,\n", - " 'anonymized_text': '**********'},\n", - " {'original_text': '123-45-6789',\n", - " 'label': 'social security number',\n", - " 'start_index': 121,\n", - " 'end_index': 132,\n", - " 'anonymized_text': '***********'},\n", - " {'original_text': 'John Doe',\n", - " 'label': 'name',\n", - " 'start_index': 157,\n", - " 'end_index': 165,\n", - " 'anonymized_text': '**** ***'},\n", - " {'original_text': '15-11-2024',\n", - " 'label': 'date',\n", - " 'start_index': 717,\n", - " 'end_index': 727,\n", - " 'anonymized_text': '**********'}]" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "replacements" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## PseudonymizationStrategy" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - 
"source": [ - "Pseudonymization is a data de-identification tool that substitutes private identifiers with false identifiers or pseudonyms, such as swapping the “John Smith” identifier with the “Mark Spencer” identifier. It maintains statistical precision and data confidentiality, allowing changed data to be used for creation, training, testing, and analysis, while at the same time maintaining data privacy.\n", - "\n", - "The `PseudonymizationStrategy` anonymizes the original text by replacing the entities with fake ones, which are created using the generators (see [Generators](/documentation/notebooks/02-generators/)).\n", - "\n", - "
\n", - "

Info

\n", - "

\n", - " The pseudonymization strategy is the most useful in terms of retaining the statistical distributions of the text. However, it is also most technical, as the user must define a function for mapping true entities to fake ones. Furthermore, if an entity appears multiple times the pseudonymization strategy will retain the same mapping between the true and fake entities.\n", - "

\n", - "
" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The `PseudonymizationStrategy` requires a function for mapping entities. In our example, we will define a function using the generators.\n", - "To make the example accessible as possible, we will use the [MaskLabelGenerator](/documentation/notebooks/02-generators/#masklabelgenerator)\n", - "instead of the [LLMLabelGenerator](/documentation/notebooks/02-generators/#llmlabelgenerator) for generating string entities." - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "from anonipy.anonymize.generators import (\n", - " MaskLabelGenerator,\n", - " DateGenerator,\n", - " NumberGenerator,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Some weights of the model checkpoint at FacebookAI/xlm-roberta-large were not used when initializing XLMRobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']\n", - "- This IS expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. 
initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", - "- This IS NOT expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n" - ] - } - ], - "source": [ - "mask_generator = MaskLabelGenerator()\n", - "date_generator = DateGenerator()\n", - "number_generator = NumberGenerator()" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "def anonymization_mapping(text, entity):\n", - " if entity.type == \"string\":\n", - " return mask_generator.generate(entity, text)\n", - " if entity.label == \"date\":\n", - " return date_generator.generate(entity, output_gen=\"middle_of_the_month\")\n", - " if entity.label == \"date of birth\":\n", - " return date_generator.generate(entity, output_gen=\"middle_of_the_year\")\n", - " if entity.label == \"social security number\":\n", - " return number_generator.generate(entity)\n", - " return \"[REDACTED]\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let us initialize the strategy:" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "from anonipy.anonymize.strategies import PseudonymizationStrategy" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], - "source": [ - "pseudo_strategy = PseudonymizationStrategy(mapping=anonymization_mapping)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Using the strategy, we can anonymize the text:" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [], - "source": [ - "anonymized_text, replacements = pseudo_strategy.anonymize(original_text, entities)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The anonymized 
text is:" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Medical Record\n", - "\n", - "Patient Name: first Professor\n", - "Date of Birth: 01-07-1985\n", - "Date of Examination: 15-05-2024\n", - "Social Security Number: 724-78-8182\n", - "\n", - "Examination Procedure:\n", - "first Professor underwent a routine physical examination. The procedure included measuring vital signs (blood pressure, heart rate, temperature), a comprehensive blood panel, and a cardiovascular stress test. The patient also reported occasional headaches and dizziness, prompting a neurological assessment and an MRI scan to rule out any underlying issues.\n", - "\n", - "Medication Prescribed:\n", - "\n", - "Ibuprofen 200 mg: Take one tablet every 6-8 hours as needed for headache and pain relief.\n", - "Lisinopril 10 mg: Take one tablet daily to manage high blood pressure.\n", - "Next Examination Date:\n", - "15-11-2024\n", - "\n" - ] - } - ], - "source": [ - "print(anonymized_text)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And the associated replacements are:" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'original_text': 'John Doe',\n", - " 'label': 'name',\n", - " 'start_index': 30,\n", - " 'end_index': 38,\n", - " 'anonymized_text': 'first Professor'},\n", - " {'original_text': '15-01-1985',\n", - " 'label': 'date of birth',\n", - " 'start_index': 54,\n", - " 'end_index': 64,\n", - " 'anonymized_text': '01-07-1985'},\n", - " {'original_text': '20-05-2024',\n", - " 'label': 'date',\n", - " 'start_index': 86,\n", - " 'end_index': 96,\n", - " 'anonymized_text': '15-05-2024'},\n", - " {'original_text': '123-45-6789',\n", - " 'label': 'social security number',\n", - " 'start_index': 121,\n", - " 'end_index': 132,\n", - " 'anonymized_text': '724-78-8182'},\n", - " 
{'original_text': 'John Doe',\n", - " 'label': 'name',\n", - " 'start_index': 157,\n", - " 'end_index': 165,\n", - " 'anonymized_text': 'first Professor'},\n", - " {'original_text': '15-11-2024',\n", - " 'label': 'date',\n", - " 'start_index': 717,\n", - " 'end_index': 727,\n", - " 'anonymized_text': '15-11-2024'}]" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "replacements" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/docs/documentation/notebooks/04-utility.ipynb b/docs/documentation/notebooks/04-utility.ipynb deleted file mode 100644 index 49600bf..0000000 --- a/docs/documentation/notebooks/04-utility.ipynb +++ /dev/null @@ -1,32 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Utility" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This chapter showcases how to use the utility functions of the package.\n", - "\n", - "
\n", - "

Info

\n", - "

\n", - " This chapter is still under development. Stay tuned! \n", - "

\n", - "
" - ] - } - ], - "metadata": { - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/docs/blog/.authors.yml b/docs/how-to-guides/.authors.yml similarity index 100% rename from docs/blog/.authors.yml rename to docs/how-to-guides/.authors.yml diff --git a/docs/how-to-guides/index.md b/docs/how-to-guides/index.md new file mode 100644 index 0000000..9f0b797 --- /dev/null +++ b/docs/how-to-guides/index.md @@ -0,0 +1 @@ +# How-To Guides \ No newline at end of file diff --git a/docs/blog/posts/anonymizing-collections-of-documents.md b/docs/how-to-guides/posts/anonymizing-collections-of-documents.md similarity index 60% rename from docs/blog/posts/anonymizing-collections-of-documents.md rename to docs/how-to-guides/posts/anonymizing-collections-of-documents.md index b91b3f2..b16c245 100644 --- a/docs/blog/posts/anonymizing-collections-of-documents.md +++ b/docs/how-to-guides/posts/anonymizing-collections-of-documents.md @@ -1,8 +1,8 @@ --- -date: 2024-05-23 +date: 2024-07-16 authors: [eriknovak] description: > - Our package can be used to anonymize collections of documents. + Anonipy can be used to anonymize collections of documents. categories: - Tutorial --- @@ -47,7 +47,7 @@ consists of preparing the entity extractor, the anonymization strategy, and the generators for the anonymization process. 
```python -from anonipy.anonymize.extractors import EntityExtractor +from anonipy.anonymize.extractors import NERExtractor from anonipy.anonymize.generators import ( MaskLabelGenerator, DateGenerator, @@ -69,7 +69,7 @@ labels = [ ] # initialize the entity extractor -entity_extractor = EntityExtractor( +entity_extractor = NERExtractor( labels, lang=LANGUAGES.ENGLISH, score_th=0.5 ) @@ -87,9 +87,9 @@ def anonymization_mapping(text, entity): if entity.type == "string": return mask_generator.generate(entity, text) if entity.label == "date": - return date_generator.generate(entity, output_gen="middle_of_the_month") + return date_generator.generate(entity, sub_variant="MIDDLE_OF_THE_MONTH") if entity.label == "date of birth": - return date_generator.generate(entity, output_gen="middle_of_the_year") + return date_generator.generate(entity, sub_variant="MIDDLE_OF_THE_YEAR") if entity.label == "social security number": return number_generator.generate(entity) return "[REDACTED]" @@ -126,15 +126,21 @@ file_names = [ # iterate through each file for file_name in file_names: + # extract the text from the document file_text = open_file(join(input_folder, file_name)) + # extract the entities from the text doc, entities = entity_extractor(file_text) + # anonymize the text anonymized_text, replacements = pseudo_strategy.anonymize(file_text, entities) + # write the anonymized text into the output folder output_file_name = ".".join(file_name.split(".")[:-1]) + "_anonymized" write_file(anonymized_text, join(output_folder, output_file_name) + ".txt") + + # write the replacements into the output folder write_json(replacements, join(output_folder, output_file_name) + ".json") ``` @@ -151,3 +157,90 @@ process. We then find all of the files we want to anonymize and anonymize them. Each anonymized file is finally stored in a separate folder which contains the anonymized text. 
+## Full code + + +```python +import os +from os.path import isfile, join + +from anonipy.anonymize.extractors import NERExtractor +from anonipy.anonymize.generators import ( + MaskLabelGenerator, + DateGenerator, + NumberGenerator, +) +from anonipy.anonymize.strategies import PseudonymizationStrategy +from anonipy.utils.file_system import open_file, write_file, write_json +from anonipy.constants import LANGUAGES + +# =============================================== +# Preparing the anonymization components +# =============================================== + +# define the labels to be extracted and their types +labels = [ + {"label": "name", "type": "string"}, + {"label": "social security number", "type": "custom"}, + {"label": "date of birth", "type": "date"}, + {"label": "date", "type": "date"}, +] + +# initialize the entity extractor +entity_extractor = NERExtractor( + labels, lang=LANGUAGES.ENGLISH, score_th=0.5 +) + +# initialize the generators +mask_generator = MaskLabelGenerator() +date_generator = DateGenerator() +number_generator = NumberGenerator() + +# prepare the anonymization mapping +def anonymization_mapping(text, entity): + if entity.type == "string": + return mask_generator.generate(entity, text) + if entity.label == "date": + return date_generator.generate(entity, sub_variant="MIDDLE_OF_THE_MONTH") + if entity.label == "date of birth": + return date_generator.generate(entity, sub_variant="MIDDLE_OF_THE_YEAR") + if entity.label == "social security number": + return number_generator.generate(entity) + return "[REDACTED]" + +# initialize the pseudonymization strategy +pseudo_strategy = PseudonymizationStrategy(mapping=anonymization_mapping) + +# =============================================== +# Anonymize the collection of documents +# =============================================== + +# prepare the input and output folder paths +input_folder = "path/to/input/folder" +output_folder = "path/to/output/folder" + +# prepare a list of file paths in the 
input folder +file_names = [ + f for f in os.listdir(input_folder) if isfile(join(input_folder, f)) +] + +# iterate through each file +for file_name in file_names: + + # extract the text from the document + file_text = open_file(join(input_folder, file_name)) + + # extract the entities from the text + doc, entities = entity_extractor(file_text) + + # anonymize the text + anonymized_text, replacements = pseudo_strategy.anonymize(file_text, entities) + + # write the anonymized text into the output folder + output_file_name = ".".join(file_name.split(".")[:-1]) + "_anonymized" + write_file(anonymized_text, join(output_folder, output_file_name) + ".txt") + + # write the replacements into the output folder + write_json(replacements, join(output_folder, output_file_name) + ".json") + +``` \ No newline at end of file diff --git a/docs/how-to-guides/posts/anonymizing-documents.md b/docs/how-to-guides/posts/anonymizing-documents.md new file mode 100644 index 0000000..70339f2 --- /dev/null +++ b/docs/how-to-guides/posts/anonymizing-documents.md @@ -0,0 +1,238 @@ +--- +date: 2024-07-16 +authors: [eriknovak] +description: > + Anonipy can be used to anonymize a document such as PDF and word documents. +categories: + - Tutorial +--- + +# Anonymizing documents + +The [anonipy][anonipy] package was designed for anonymizing text. However, a lot of text data can be found in document form, such as PDFs, word documents, and others. Copying the text from the documents to be anonymized can be cumbersome. The `anonipy` package provides utility functions that extract the text from the documents. + + +In this blog post, we explain how `anonipy` can be used to anonymize texts in document form. + + + +!!! info "Prerequisites" + To use the `anonipy` package, we must have Python version 3.8 or higher + installed on the machine. + +## Installation + +Before we start, we must first install the `anonipy` package. 
To do that, run the +following command in the terminal: + +```bash +pip install anonipy +``` + +This will install the `anonipy` package, which contains all of the required modules. + +If you already installed it and would like to update it, run the following command: + +```bash +pip install anonipy --upgrade +``` + +## Document anonymization + +### Extracting the text from the document + +Next, we will use the `anonipy` package to anonymize the text in the document. First, we must extract the text. This can be done using the package's utility function [open_file][anonipy.utils.file_system.open_file]. The function supports extraction of text from `doc`, `docx`, `pdf` and `txt` files. + +To extract the text, use the following code: + +```python +from anonipy.utils.file_system import open_file + +file_path = "path/to/file.txt" +file_text = open_file(file_path) +``` + +where `file_path` is the path to the document we want to anonymize. The [open_file][anonipy.utils.file_system.open_file] will open the document, extract the content, and return it as a string. + +Once this is done, we can start anonymizing the text in the regular way. + +### Extracting personal information from the text + +Now we can identify and extract personal information from the text. We do this by using [NERExtractor][anonipy.anonymize.extractors.NERExtractor], an extractor that leverages the [GLiNER](https://github.com/urchade/GLiNER) span-based NER models. + +It returns the text and the extracted entities. 
+ +```python +from anonipy.constants import LANGUAGES +from anonipy.anonymize.extractors import NERExtractor + +# define the labels to be extracted and their types +labels = [ + {"label": "name", "type": "string"}, + {"label": "social security number", "type": "custom"}, + {"label": "date of birth", "type": "date"}, + {"label": "date", "type": "date"}, +] + +# initialize the entity extractor +extractor = NERExtractor( + labels, lang=LANGUAGES.ENGLISH, score_th=0.5 +) +# extract the entities from the original text +doc, entities = extractor(file_text) +``` + +To display the entities in the original text, we can use the [display][anonipy.anonymize.extractors.NERExtractor.display] method: + +```python +extractor.display(doc) +``` + + +### Preparing the anonymization mapping + +Next, we prepare the anonymization mapping. We do this by using the generators module part of the `anonipy` package. The generators are used to generate substitutes for the entities. + +For example, we can use [MaskLabelGenerator][anonipy.anonymize.generators.MaskLabelGenerator] to generate substitutes using the language models to solve a `mask-filling` problem, i.e. finding the words that would be probabilistically suitable to replace the entity in the text. + +The full list of available generators can be found in the [generators][anonipy.anonymize.generators] submodule. + +Furthermore, we use the [PseudonymizationStrategy][anonipy.anonymize.strategies.PseudonymizationStrategy] to anonymize the text. More on anonymization strategies can be found in the [strategies][anonipy.anonymize.strategies] submodule. 
+ + +```python +from anonipy.anonymize.generators import ( + MaskLabelGenerator, + DateGenerator, + NumberGenerator, +) +from anonipy.anonymize.strategies import PseudonymizationStrategy + +# initialize the generators +mask_generator = MaskLabelGenerator() +date_generator = DateGenerator() +number_generator = NumberGenerator() + +# prepare the anonymization mapping +def anonymization_mapping(text, entity): + if entity.type == "string": + return mask_generator.generate(entity, text) + if entity.label == "date": + return date_generator.generate(entity, sub_variant="MIDDLE_OF_THE_MONTH") + if entity.label == "date of birth": + return date_generator.generate(entity, sub_variant="MIDDLE_OF_THE_YEAR") + if entity.label == "social security number": + return number_generator.generate(entity) + return "[REDACTED]" + +# initialize the pseudonymization strategy +pseudo_strategy = PseudonymizationStrategy(mapping=anonymization_mapping) +``` + +### Anonymizing the text + +Once we prepare the anonymization strategy, we can use it to anonymize the text. + +```python +# anonymize the original text +anonymized_text, replacements = pseudo_strategy.anonymize(file_text, entities) +``` + +### Saving the anonymized text + +Finally, we can save the anonymized text to a file. This can be done using the [write_file][anonipy.utils.file_system.write_file] function from the [file_system][anonipy.utils.file_system] submodule. + +```python +from anonipy.utils.file_system import write_file + +output_file = "path/to/output_file.txt" +write_file(anonymized_text, output_file, encode="utf-8") +``` + +Where `output_file` is the path to the file where the anonymized text will be saved. + + +## Conclusion + +In this blog post, we show how one can anonymize a document using the `anonipy` package. We first used the [open_file][anonipy.utils.file_system.open_file] utility function to extract the content of the document and store it as a string. 
We then used the [NERExtractor][anonipy.anonymize.extractors.NERExtractor] to identify and extract personal information from the text, and the [PseudonymizationStrategy][anonipy.anonymize.strategies.PseudonymizationStrategy] in combination with various [generators][anonipy.anonymize.generators] to anonymize the text. Finally, we used the [write_file][anonipy.utils.file_system.write_file] utility function to save the anonymized text to a file. + +This process is very straightforward and can be applied to almost any document type. Furthermore, it can be expanded to process multiple documents written in the same language at once. Stay tuned to see how this can be done in the future! + +## Full code + +```python +from anonipy.anonymize.extractors import NERExtractor +from anonipy.anonymize.generators import ( +    MaskLabelGenerator, +    DateGenerator, +    NumberGenerator, +) +from anonipy.anonymize.strategies import PseudonymizationStrategy + +from anonipy.utils.file_system import open_file, write_file +from anonipy.constants import LANGUAGES + +# ===================================== +# Read the file content +# ===================================== + +# load the file content +file_path = "path/to/file.txt" +file_text = open_file(file_path) + +# ===================================== +# Extract the entities +# ===================================== + +# define the labels to be extracted and their types +labels = [ +    {"label": "name", "type": "string"}, +    {"label": "social security number", "type": "custom"}, +    {"label": "date of birth", "type": "date"}, +    {"label": "date", "type": "date"}, +] + +# initialize the entity extractor +extractor = NERExtractor( +    labels, lang=LANGUAGES.ENGLISH, score_th=0.5 +) + +# extract the entities from the original text +doc, entities = extractor(file_text) + +# ===================================== +# Prepare the anonymization strategy +# and anonymize the text +# ===================================== + +# initialize the generators 
+mask_generator = MaskLabelGenerator() +date_generator = DateGenerator() +number_generator = NumberGenerator() + +# prepare the anonymization mapping +def anonymization_mapping(text, entity): + if entity.type == "string": + return mask_generator.generate(entity, text) + if entity.label == "date": + return date_generator.generate(entity, sub_variant="MIDDLE_OF_THE_MONTH") + if entity.label == "date of birth": + return date_generator.generate(entity, sub_variant="MIDDLE_OF_THE_YEAR") + if entity.label == "social security number": + return number_generator.generate(entity) + return "[REDACTED]" + +# initialize the pseudonymization strategy +pseudo_strategy = PseudonymizationStrategy(mapping=anonymization_mapping) + +# anonymize the original text +anonymized_text, replacements = pseudo_strategy.anonymize(file_text, entities) + +# ===================================== +# Save the anonymized text +# ===================================== + +# save the anonymized text to a file +output_file = "path/to/output_file.txt" +write_file(anonymized_text, output_file, encode="utf-8") +``` diff --git a/docs/how-to-guides/posts/extractors-overview.md b/docs/how-to-guides/posts/extractors-overview.md new file mode 100644 index 0000000..4fbc1e7 --- /dev/null +++ b/docs/how-to-guides/posts/extractors-overview.md @@ -0,0 +1,395 @@ +--- +date: 2024-07-15 +authors: [eriknovak] +description: > + The overview of the implemented extractors. +categories: + - Overview +--- + +# Extractors overview + +In this post, we will show an overview of the implemented extractors. The extractors are used to extract relevant `named entities` from text. These entities can be people names, organizations, addresses, social security numbers, etc. The entities are then used to anonymize the text. + +All extractors and their API references are available in the [extractors][anonipy.anonymize.extractors] module. What follows is the presentation of the different extractors `anonipy` provides. 
+ + + + + + +## Pre-requisites + +Let us first define the text, from which we want to extract the entities. + +```python +original_text = """\ +Medical Record + +Patient Name: John Doe +Date of Birth: 15-01-1985 +Date of Examination: 20-05-2024 +Social Security Number: 123-45-6789 + +Examination Procedure: +John Doe underwent a routine physical examination. The procedure included measuring vital signs (blood pressure, heart rate, temperature), a comprehensive blood panel, and a cardiovascular stress test. The patient also reported occasional headaches and dizziness, prompting a neurological assessment and an MRI scan to rule out any underlying issues. + +Medication Prescribed: + +Ibuprofen 200 mg: Take one tablet every 6-8 hours as needed for headache and pain relief. +Lisinopril 10 mg: Take one tablet daily to manage high blood pressure. +Next Examination Date: +15-11-2024 +""" +``` + + + + +## Language configuration + +Each extractor requires a language to be configured. The language is used to determine how to process the text. If the language is not specified, the extractor will use the default language. The default language is `ENGLISH`. + +To make it easier to switch languages, we can use the [LANGUAGES][anonipy.constants.LANGUAGES] constant. + +```python +from anonipy.constants import LANGUAGES + +LANGUAGES.ENGLISH# (1)! +``` + +1. The `LANGUAGES.ENGLISH` returns the `("en", "English")` literal tuple, which is the format required by the extractors. + + + + +### Using the language detector + +An alternative is to use a language detector available in the [language_detector][anonipy.utils.language_detector] module. The detector utilizes the [lingua](https://github.com/pemistahl/lingua-py) python package, and allows automatic detection of the language of the text. 
+ +```python +from anonipy.utils.language_detector import LanguageDetector + +# initialize the language detector and detect the language +language_detector = LanguageDetector() +language_detector(original_text)# (1)! +``` + +1. The `language_detector` returns the literal tuple `("en", "English")`, similar to the `LANGUAGES.ENGLISH`, making it compatible with the extractors. + + + + +## Named Entity + +Each extractor will extract the `named entities` from the text. The entities can be people names, organizations, addresses, social security numbers, etc. The entities are represented using the [Entity][anonipy.definitions.Entity] dataclass, which consists of the following parameters: + +::: anonipy.definitions.Entity +    options: +        show_root_heading: False +        show_docstring_description: False +        show_source: False + + + + +## Extractors + +All following extractors are available in the [extractors][anonipy.anonymize.extractors] module. + + + + +### Named entity recognition (NER) extractor + +The [NERExtractor][anonipy.anonymize.extractors.NERExtractor] extractor uses a span-based NER model to identify the relevant entities in the text. Furthermore, it uses the [GLiNER](https://github.com/urchade/GLiNER) span-based NER model, specifically the model finetuned for recognizing Personal Identifiable Information (PII) within text. The model has been finetuned on six languages (English, French, German, Spanish, Italian, and Portuguese), but can be applied also to other languages. + +```python +from anonipy.anonymize.extractors import NERExtractor +``` + +The `NERExtractor` takes the following input parameters: + +::: anonipy.anonymize.extractors.NERExtractor.__init__ +    options: +        show_root_heading: False +        show_docstring_description: False +        show_docstring_examples: False +        show_source: False + +We must define the labels to be extracted and their types. 
In this example, we will extract the following entities: + +```python +labels = [ + {"label": "name", "type": "string"}, + {"label": "social security number", "type": "custom", "regex": "[0-9]{3}-[0-9]{2}-[0-9]{4}"}, + {"label": "date of birth", "type": "date"}, + {"label": "date", "type": "date"}, +] +``` + +Let us now initialize the entity extractor. + +```python +ner_extractor = NERExtractor(labels, lang=LANGUAGES.ENGLISH, score_th=0.5) +``` + +!!! info "Initialization warnings" + The initialization of `NERExtractor` will throw some warnings. Ignore them. These are expected due to the use of package dependencies. + + +The `NERExtractor` receives the text to be anonymized and returns the enriched text document and the extracted entities. + +```python +doc, entities = ner_extractor(original_text) +``` + +The entities extracted within the input text are: + +```python +ner_extractor.display(doc) +``` + +
Medical Record

Patient Name: + + John Doe + name + +
Date of Birth: + + 15-01-1985 + date + of birth + +
Date of Examination: + + 20-05-2024 + date + +
Social Security Number: + + 123-45-6789 + social + security number + +

Examination Procedure:
+ + John Doe + name + + underwent a routine physical examination. The procedure included measuring vital signs (blood pressure, heart + rate, temperature), a comprehensive blood panel, and a cardiovascular stress test. The patient also reported + occasional headaches and dizziness, prompting a neurological assessment and an MRI scan to rule out any + underlying issues.

Medication Prescribed:

Ibuprofen 200 mg: Take one tablet every 6-8 hours as + needed for headache and pain relief.
Lisinopril 10 mg: Take one tablet daily to manage high blood + pressure.
Next Examination Date:
+ + 15-11-2024 + date + +
+ +**Advices and suggestions** + +**Use specific label names.** +In the above example, we used specific label names to extract the entities. If we use a less specific name, the entity extractor might not find any relevant entity. + +For instance, when using `social security number` as the label name, the entity extractor is able to extract the social security number from the text. However, if we use `ssn` or just `number` as the label name, the entity extractor might not find any relevant entity. + +!!! tip + Using more specific label names is better. + + + +**Use custom regex patterns.** +In the `anonipy` package, we provide some predefined [ENTITY_TYPES][anonipy.constants.ENTITY_TYPES], which are: + +::: anonipy.constants.ENTITY_TYPES + options: + show_root_heading: False + show_docstring_description: False + show_source: False + +These entity types also have a corresponding regex pattern, as defined in the [regex][anonipy.utils.regex] submodule. + + +If the user wants to use a custom regex pattern, they can define it in the `labels` variable list. Using a custom regex pattern allows the user to specify a more strict pattern that the entity must match. + + + + +### Pattern extractor + +The [PatternExtractor][anonipy.anonymize.extractors.PatternExtractor] is an extractor that uses a custom spacy and regex pattern to extract entities. When documents have a consistent format and structure, the +pattern extractor can be useful, as it can extract entities in a consistent way. + +```python +from anonipy.anonymize.extractors import PatternExtractor +``` + +The `PatternExtractor` takes the following parameters: + +::: anonipy.anonymize.extractors.PatternExtractor.__init__ + options: + show_root_heading: False + show_docstring_description: False + show_docstring_examples: False + show_source: False + +We must define the labels and their patterns used to extract the +relevant entities. 
The patterns are defined using [spacy patterns](https://spacy.io/usage/rule-based-matching/) or [regex patterns](https://docs.python.org/3/library/re.html). + +In this example, we will use the following labels and patterns: + +```python +labels = [ +    # the pattern is defined using regex patterns, where the parentheses are used to indicate core entity values +    {"label": "symptoms", "regex": r"\((.*)\)"}, +    # the pattern is defined using spacy patterns +    { +        "label": "medicine", +        "pattern": [[{"IS_ALPHA": True}, {"LIKE_NUM": True}, {"LOWER": "mg"}]], +    }, +    # the pattern is defined using spacy patterns +    { +        "label": "date", +        "pattern": [ +            [ +                {"SHAPE": "dd"}, +                {"TEXT": "-"}, +                {"SHAPE": "dd"}, +                {"TEXT": "-"}, +                {"SHAPE": "dddd"}, +            ] +        ], +    }, +] +``` + +Let us now initialize the pattern extractor. + +```python +pattern_extractor = PatternExtractor(labels, lang=LANGUAGES.ENGLISH) +``` + +The `PatternExtractor` receives the original text and returns the enriched text document and the extracted entities. + +```python +doc, entities = pattern_extractor(original_text) +``` + +The entities extracted within the input text are: + +```python +pattern_extractor.display(doc) +``` + +
Medical Record

Patient Name: John Doe
Date of + Birth: + + 15-01-1985 + date + +
Date of Examination: + + 20-05-2024 + date + +
Social Security Number: 123-45-6789

Examination Procedure:
John Doe underwent a routine physical + examination. The procedure included measuring vital signs ( + + blood pressure, heart rate, temperature + symptoms + + ), a comprehensive blood panel, and a cardiovascular stress test. The patient also reported occasional headaches and + dizziness, prompting a neurological assessment and an MRI scan to rule out any underlying issues.

Medication + Prescribed:

+ + Ibuprofen 200 mg + medicine + + : Take one tablet every 6-8 hours as needed for headache and pain relief.
+ + Lisinopril 10 mg + medicine + + : Take one tablet daily to manage high blood pressure.
Next Examination Date:
+ + 15-11-2024 + date + +
+
+ + + + +### Multi extractor + +The [MultiExtractor][anonipy.anonymize.extractors.MultiExtractor] is an extractor that can be used to extract entities using multiple extractors. + +The motivation behind the multi extractor is the following: depending on the document format, personal information can be located in different locations; some of them can be found at similar places, while others can be found in different places and formats. Because of this, we would need to use the [NERExtractor][anonipy.anonymize.extractors.NERExtractor] to automatically identify the entities at different locations and the [PatternExtractor][anonipy.anonymize.extractors.PatternExtractor] to extract the entities that appear at the same location. + +`MultiExtractor` enables the use of both extractors at the same time. Furthermore, if both +extractors identify entities at similar locations, then the `MultiExtractor` will also +provide a list of joint entities. + +```python +from anonipy.anonymize.extractors import MultiExtractor +``` + +The `MultiExtractor` takes the following parameters: + +::: anonipy.anonymize.extractors.MultiExtractor.__init__ +    options: +        show_root_heading: False +        show_docstring_description: False +        show_docstring_examples: False +        show_source: False + +In this example, we will use the previously initialized NER and pattern extractors. + +```python +multi_extractor = MultiExtractor( +    extractors=[ner_extractor, pattern_extractor], +) +``` + +As before, the `MultiExtractor` receives the original text, but returns the outputs of all the extractors, as well as the joint entities from all the extractors. + +```python +extractor_outputs, joint_entities = multi_extractor(original_text) +``` + +In this case, `extractor_outputs[0]` will contain the `(doc, entities)` from the NER extractor, and `extractor_outputs[1]` will contain the `(doc, entities)` from the pattern extractor. +The `joint_entities` will contain the joint entities from all the extractors. 
+ + + +## Conclusion + +The extractors are used to extract entities from the text. The `anonipy` package supports both machine learning-based and pattern-based entity extraction, enabling information identification and extraction from different textual formats. \ No newline at end of file diff --git a/docs/how-to-guides/posts/generators-overview.md b/docs/how-to-guides/posts/generators-overview.md new file mode 100644 index 0000000..ea3efae --- /dev/null +++ b/docs/how-to-guides/posts/generators-overview.md @@ -0,0 +1,340 @@ +--- +date: 2024-07-15 +authors: [eriknovak] +description: > + The overview of the implemented generators. +categories: + - Overview +--- + +# Generators overview + +In this post, we will show an overview of the implemented generators. The generators are used to create new texts that would serve as substitutes to the extracted `named entities`. The substitutes can be then used to replace and anonymize the text. + +All generators and their API references are available in the [generators][anonipy.anonymize.generators] module. What follows is the presentation of the different generators `anonipy` provides. + + + + + + +## Pre-requisites + +Let us first define the text, from which we want to extract the entities. + +```python +original_text = """\ +Medical Record + +Patient Name: John Doe +Date of Birth: 15-01-1985 +Date of Examination: 20-05-2024 +Social Security Number: 123-45-6789 + +Examination Procedure: +John Doe underwent a routine physical examination. The procedure included measuring vital signs (blood pressure, heart rate, temperature), a comprehensive blood panel, and a cardiovascular stress test. The patient also reported occasional headaches and dizziness, prompting a neurological assessment and an MRI scan to rule out any underlying issues. + +Medication Prescribed: + +Ibuprofen 200 mg: Take one tablet every 6-8 hours as needed for headache and pain relief. +Lisinopril 10 mg: Take one tablet daily to manage high blood pressure. 
+Next Examination Date: +15-11-2024 +""" +``` + +Normally, the entities would be extracted using an [extractor][anonipy.anonymize.extractors]. For this example, we manually define the entities. + +```python +from anonipy.definitions import Entity + +entities = [ +    Entity( +        text="John Doe", +        label="name", +        start_index=30, +        end_index=38, +        type="string", +    ), +    Entity( +        text="20-05-2024", +        label="date", +        start_index=86, +        end_index=96, +        type="date", +    ), +    Entity( +        text="123-45-6789", +        label="social security number", +        start_index=121, +        end_index=132, +        type="custom", +        regex=r"\d{3}-\d{2}-\d{4}", +    ), +] +``` + + + +## Generators + +All following generators are available in the [generators][anonipy.anonymize.generators] module. + + + + +### LLMLabelGenerator + +The [LLMLabelGenerator][anonipy.anonymize.generators.LLMLabelGenerator] is a one-stop-shop generator that utilizes LLMs to generate replacements for entities. It is implemented to support any entity type. + +!!! info "GPU Requirements" +    The `LLMLabelGenerator` utilizes the open source LLMs, specifically the [Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) model. Because the model is quite large, we utilize quantization using the `bitsandbytes` package to reduce its size. Therefore, the `LLMLabelGenerator` requires at least 8GB GPU and CUDA drivers to be available. If these resources are not available on your machine, consider using the `MaskLabelGenerator` instead. + +```python +from anonipy.anonymize.generators import LLMLabelGenerator +``` + +The `LLMLabelGenerator` currently does not require any input parameters at initialization. + +Let us now initialize the LLM label generator. + +```python +llm_generator = LLMLabelGenerator() +``` + +!!! info "Initialization warnings" +    The initialization of `LLMLabelGenerator` will throw some warnings. Ignore them. These are expected due to the use of package dependencies. 
+ +To use the generator, we can call the `generate` method. The `generate` method receives the following parameters: + +::: anonipy.anonymize.generators.LLMLabelGenerator.generate + options: + show_root_heading: False + show_docstring_description: False + show_docstring_examples: False + show_docstring_returns: False + show_source: False + +Let us generate the replacement for the first entity from `entities` using the default parameters. + +```python +llm_generator.generate(entities[0])# (1)! +``` + +1. The generator receives the `John Doe` name entity and might return the replacement: `Ethan Thomson` + + +Let us now change the label prefix and generate the replacement using a higher temperature. + +```python +llm_generator.generate( + entities[0], + entity_prefix="Spanish", + temperature=0.7 +)# (1)! +``` + +1. The generator receives the `John Doe` name entity and under the different generation parameters might return the replacement: `Juan Rodrigez` + +Going through the whole `entities` list, the `LLMLabelGenerator`, using the default parameters, might generate the following replacements: + +| Entity | Type | Label | Replacement | +| ------------- | -------- | ------------------------ | --------------- | +| `John Doe` | `string` | `name` | `Ethan Thomson` | +| `20-05-2024` | `date` | `date` | `23-07-2027` | +| `123-45-6789` | `custom` | `social security number` | `987-65-4321` | + + +**Advices and suggestions** + +**Using LLMLabelGenerator only for string and custom types.** +While the `LLMLabelGenerator` is able to generate alternatives for different entity types, we suggest using it only for string and custom entity types. The reason is that the LLMs can be quite slow for generating replacements. + +In addition, `anonipy` has other generators that can be used for other entity types, such as dates, numbers, etc. + +**Restricting with regex.** +Using LLMs to generate text is best when the generation is restricted to a specific pattern. 
The [Entity][anonipy.definitions.Entity] object already contains a `regex` field that can be used to restrict the generation +to a specific pattern. However, it is recommended to make the regex expressions as specific and restrictive as possible. + +This will help the `LLMLabelGenerator` to generate more accurate replacements. + + + + +### MaskLabelGenerator + +The [MaskLabelGenerator][anonipy.anonymize.generators.MaskLabelGenerator] is a generator that uses smaller language models, such as [XLM-RoBERTa](https://huggingface.co/FacebookAI/xlm-roberta-large), to generate replacements for entities. It is implemented to support any entity type, but we suggest using it only with string entities. For other entity types, please use other available [generators][anonipy.anonymize.generators]. + +```python +from anonipy.anonymize.generators import MaskLabelGenerator +``` + +The `MaskLabelGenerator` requires the following input parameters at initialization: + +::: anonipy.anonymize.generators.MaskLabelGenerator.__init__ +    options: +        show_root_heading: False +        show_docstring_description: False +        show_docstring_examples: False +        show_docstring_returns: False +        show_source: False + +Let us now initialize the mask label generator. + +```python +mask_generator = MaskLabelGenerator() +``` + +!!! info "Initialization warnings" +    The initialization of `MaskLabelGenerator` will throw some warnings. Ignore them. These are expected due to the use of package dependencies. + +To use the generator, we can call the `generate` method. The `generate` method receives the following parameters: + +::: anonipy.anonymize.generators.MaskLabelGenerator.generate +    options: +        show_root_heading: False +        show_docstring_description: False +        show_docstring_examples: False +        show_docstring_returns: False +        show_source: False + +This generator will create a list of suggestions from which it will select one at random. Therefore, the generator will return different suggestions every time it is called. 
+ +```python +mask_generator.generate(entities[0], text=original_text)# (1)! +mask_generator.generate(entities[0], text=original_text)# (2)! +mask_generator.generate(entities[0], text=original_text)# (3)! +``` + +1. The first generation for the `John Doe` name entity might return the replacement: `James Smith` +2. The second generation might return the replacement: `Michael Smith` +3. The third generation might return the replacement: `David Blane` + +**Advices and suggestions** + +**Using only for string entities.** +As seen from the above examples, the `MaskLabelGenerator` is best used with string entities. For number and date entities, it is best to use other generators, such as `NumberGenerator` and `DateGenerator`. + + + + +### NumberGenerator + +The [NumberGenerator][anonipy.anonymize.generators.NumberGenerator] is a generator for generating random numbers. It is implemented to support integers, floats, and phone numbers, but it can be used to generate values for custom types which include numbers. + +```python +from anonipy.anonymize.generators import NumberGenerator +``` + +The `NumberGenerator` currently does not require any input parameters at initialization. + +```python +number_generator = NumberGenerator() +``` + +To use the generator, we can call the `generate` method. The `generate` method receives the following parameters: + +::: anonipy.anonymize.generators.NumberGenerator.generate + options: + show_root_heading: False + show_docstring_description: False + show_docstring_examples: False + show_docstring_returns: False + show_source: False + +This generator will create a suggestion by replacing numeric values in the entity text at random. Therefore, the generator will return different suggestions every time it is called. + +```python +number_generator.generate(entities[2])# (1)! +``` + +1. For the `social security number` entity, the generator will return a replacement, such as: `143-46-4915`. 
+ +Furthermore, it will throw an error if the entity type is not `integer`, `float`, `phone_number` or `custom`. + +```python +try: + number_generator.generate(entities[0])# (1)! +except Exception as e: + print(e)# (2)! +``` + +1. The provided entity is a `string`, therefore it will raise an error. +2. The exception will state `The entity type must be 'integer', 'float', 'phone_number' or 'custom' to generate numbers.` + + + + +### DateGenerator + +The [DateGenerator][anonipy.anonymize.generators.DateGenerator] is a generator for generating dates. It is implemented to support date entities. + +```python +from anonipy.anonymize.generators import DateGenerator +``` + +The `DateGenerator` requires the following input parameters at initialization: + +::: anonipy.anonymize.generators.DateGenerator.__init__ + options: + show_root_heading: False + show_docstring_description: False + show_docstring_examples: False + show_docstring_returns: False + show_source: False + +Let us now initialize the date generator. + +```python +date_generator = DateGenerator() +``` + +To use the generator, we can call the `generate` method. The `generate` method receives the following parameters: + +::: anonipy.anonymize.generators.DateGenerator.generate + options: + show_root_heading: False + show_docstring_description: False + show_docstring_examples: False + show_docstring_returns: False + show_docstring_raises: False + show_source: False + +Using the above parameters, this generator will create the appropriate date suggestions: + +```python + +entities[1]# (1)! +date_generator.generate(entities[1], sub_variant="RANDOM")# (2)! +date_generator.generate(entities[1], sub_variant="FIRST_DAY_OF_THE_MONTH")# (3)! +date_generator.generate(entities[1], sub_variant="LAST_DAY_OF_THE_MONTH")# (4)! +date_generator.generate(entities[1], sub_variant="MIDDLE_OF_THE_MONTH")# (5)! +date_generator.generate(entities[1], sub_variant="MIDDLE_OF_THE_YEAR")# (6)! +``` + +1. 
The entity is a `date` entity with the text `20-05-2024`. +2. The `RANDOM` sub variant will return a random date within the given date range. A possible generation can be: `26-05-2024` +3. The `FIRST_DAY_OF_THE_MONTH` sub variant will return the first day of the month: `01-05-2024` +4. The `LAST_DAY_OF_THE_MONTH` sub variant will return the last day of the month: `31-05-2024` +5. The `MIDDLE_OF_THE_MONTH` sub variant will return the middle day of the month: `15-05-2024` +6. The `MIDDLE_OF_THE_YEAR` sub variant will return the middle day of the year: `01-07-2024` + + +Furthermore, it will throw an error if the entity type is not `date`. + +```python +try: + date_generator.generate(entities[0])# (1)! +except Exception as e: + print(e)# (2)! +``` + +1. The provided entity is a `string`, therefore it will raise an error. +2. The exception will state `The entity type must be 'date' to generate dates.` + + + + +## Conclusion + +The generators are used to create new texts that would serve as substitutes to the extracted `named entities`. The substitutes can be then used to replace and anonymize the text. diff --git a/docs/how-to-guides/posts/strategies-overview.md b/docs/how-to-guides/posts/strategies-overview.md new file mode 100644 index 0000000..a3765a8 --- /dev/null +++ b/docs/how-to-guides/posts/strategies-overview.md @@ -0,0 +1,528 @@ +--- +date: 2024-07-15 +authors: [eriknovak] +description: > + The overview of the implemented strategies. +categories: + - Overview +--- + +# Strategies overview + +In this post, we will show an overview of the implemented strategies. The strategies delegate how the original text will be anonymized given the extracted `named entities`. They output the anonymized text and the list of replacements that were made to the original text. + +All strategies and their API references are available in the [strategies][anonipy.anonymize.strategies] module. What follows is the presentation of the different strategies `anonipy` provides. 
+ + + + + + +## Pre-requisites + +Let us first define the text we want to anonymize. + +```python +original_text = """\ +Medical Record + +Patient Name: John Doe +Date of Birth: 15-01-1985 +Date of Examination: 20-05-2024 +Social Security Number: 123-45-6789 + +Examination Procedure: +John Doe underwent a routine physical examination. The procedure included measuring vital signs (blood pressure, heart rate, temperature), a comprehensive blood panel, and a cardiovascular stress test. The patient also reported occasional headaches and dizziness, prompting a neurological assessment and an MRI scan to rule out any underlying issues. + +Medication Prescribed: + +Ibuprofen 200 mg: Take one tablet every 6-8 hours as needed for headache and pain relief. +Lisinopril 10 mg: Take one tablet daily to manage high blood pressure. +Next Examination Date: +15-11-2024 +""" +``` + +Normally, the entities would be extracted using an [extractor][anonipy.anonymize.extractors]. For this example, we manually define the entities. + +```python +from anonipy.definitions import Entity + +entities = [ +    Entity( +        text="John Doe", +        label="name", +        start_index=30, +        end_index=38, +        type="string", +    ), +    Entity( +        text="15-01-1985", +        label="date of birth", +        start_index=54, +        end_index=64, +        type="date", +    ), +    Entity( +        text="20-05-2024", +        label="date", +        start_index=86, +        end_index=96, +        type="date", +    ), +    Entity( +        text="123-45-6789", +        label="social security number", +        start_index=121, +        end_index=132, +        type="custom", +        regex="[0-9]{3}-[0-9]{2}-[0-9]{4}", +    ), +    Entity( +        text="John Doe", +        label="name", +        start_index=157, +        end_index=165, +        type="string", +    ), +    Entity( +        text="15-11-2024", +        label="date", +        start_index=717, +        end_index=727, +        type="date", +    ), +] +``` + + +## Strategies + +All following strategies are available in the [strategies][anonipy.anonymize.strategies] module. 
+ + +### Redacted Strategy + +Data redaction is the process of obscuring information that’s personally identifiable, confidential, classified or sensitive. + +The [RedactionStrategy][anonipy.anonymize.strategies.RedactionStrategy] anonymizes the original text by replacing the entities in the text with a predefined substitute label, which defaults to `[REDACTED]`. + +!!! info "Anonymization details" +    The redaction strategy hides sensitive information by replacing the original entities with a string that does not reveal any information about the original. While this is useful for obscuring information, it does change the text's distribution, which can affect the training of machine learning models. + +```python +from anonipy.anonymize.strategies import RedactionStrategy +``` + + +The `RedactionStrategy` requires the following input parameters at initialization: + +::: anonipy.anonymize.strategies.RedactionStrategy.__init__ +    options: +        show_root_heading: False +        show_docstring_description: False +        show_docstring_examples: False +        show_docstring_returns: False +        show_source: False + +Let us now initialize the redaction strategy. + +```python +redaction_strategy = RedactionStrategy() +``` + +To use the strategy, we can call the `anonymize` method to anonymize the text given the `entities`. The `anonymize` method receives the following parameters: + +::: anonipy.anonymize.strategies.RedactionStrategy.anonymize +    options: +        show_root_heading: False +        show_docstring_description: False +        show_docstring_examples: False +        show_docstring_returns: False +        show_source: False + +Using the `RedactionStrategy`, we can now anonymize the text. + +```python +anonymized_text, replacements = redaction_strategy.anonymize( +    original_text, +    entities +) +``` + +Which returns the anonymized text and the list of replacements made. 
+ +```python +print(anonymized_text) +``` +```text +Medical Record + +Patient Name: [REDACTED] +Date of Birth: [REDACTED] +Date of Examination: [REDACTED] +Social Security Number: [REDACTED] + +Examination Procedure: +[REDACTED] underwent a routine physical examination. The procedure included measuring vital signs (blood pressure, heart rate, temperature), a comprehensive blood panel, and a cardiovascular stress test. The patient also reported occasional headaches and dizziness, prompting a neurological assessment and an MRI scan to rule out any underlying issues. + +Medication Prescribed: + +Ibuprofen 200 mg: Take one tablet every 6-8 hours as needed for headache and pain relief. +Lisinopril 10 mg: Take one tablet daily to manage high blood pressure. +Next Examination Date: +[REDACTED] +``` + +And the associated replacements are: + +```python +print(replacements) +``` +```json +[ + { + "original_text": "John Doe", + "label": "name", + "start_index": 30, + "end_index": 38, + "anonymized_text": "[REDACTED]" + }, + { + "original_text": "15-01-1985", + "label": "date of birth", + "start_index": 54, + "end_index": 64, + "anonymized_text": "[REDACTED]" + }, + { + "original_text": "20-05-2024", + "label": "date", + "start_index": 86, + "end_index": 96, + "anonymized_text": "[REDACTED]" + }, + { + "original_text": "123-45-6789", + "label": "social security number", + "start_index": 121, + "end_index": 132, + "anonymized_text": "[REDACTED]" + }, + { + "original_text": "John Doe", + "label": "name", + "start_index": 157, + "end_index": 165, + "anonymized_text": "[REDACTED]" + }, + { + "original_text": "15-11-2024", + "label": "date", + "start_index": 717, + "end_index": 727, + "anonymized_text": "[REDACTED]" + } +] +``` + + + + +### Masking Strategy + +Data masking refers to the disclosure of data with modified values. 
Data anonymization is done by creating a mirror image of a database and implementing alteration strategies, such as character shuffling, encryption, term, or character substitution. For example, a value character may be replaced by a symbol such as “*” or “x.” It makes identification or reverse engineering difficult. + +The [MaskingStrategy][anonipy.anonymize.strategies.MaskingStrategy] anonymizes the original text by replacing the entities with masks, which are created using the substitute label, which defaults to `*`. + +!!! info "Anonymization details" + The masking strategy is useful as it hides the original sensitive values and retains the original text's length. However, it also changes the original text's meaning and distribution, as the replacement values are not the same as the original values. + +```python +from anonipy.anonymize.strategies import MaskingStrategy +``` + + +The `MaskingStrategy` requires the following input parameters at initialization: + +::: anonipy.anonymize.strategies.MaskingStrategy.__init__ + options: + show_root_heading: False + show_docstring_description: False + show_docstring_examples: False + show_docstring_returns: False + show_source: False + +Let us now initialize the masking strategy. + +```python +masking_strategy = MaskingStrategy() +``` + +To use the strategy, we can call the `anonymize` method to anonymize the text given the `entities`. The `anonymize` method receives the following parameters: + +::: anonipy.anonymize.strategies.MaskingStrategy.anonymize + options: + show_root_heading: False + show_docstring_description: False + show_docstring_examples: False + show_docstring_returns: False + show_source: False + +Using the `MaskingStrategy`, we can now anonymize the text. + +```python +anonymized_text, replacements = masking_strategy.anonymize( + original_text, + entities +) +``` + +Which returns the anonymized text and the list of replacements made. 
+ +```python +print(anonymized_text) +``` +```text +Patient Name: **** *** +Date of Birth: ********** +Date of Examination: ********** +Social Security Number: *********** + +Examination Procedure: +**** *** underwent a routine physical examination. The procedure included measuring vital signs (blood pressure, heart rate, temperature), a comprehensive blood panel, and a cardiovascular stress test. The patient also reported occasional headaches and dizziness, prompting a neurological assessment and an MRI scan to rule out any underlying issues. + +Medication Prescribed: + +Ibuprofen 200 mg: Take one tablet every 6-8 hours as needed for headache and pain relief. +Lisinopril 10 mg: Take one tablet daily to manage high blood pressure. +Next Examination Date: +********** +``` + +And the associated replacements are: + +```python +print(replacements) +``` +```json +[ + { + "original_text": "John Doe", + "label": "name", + "start_index": 30, + "end_index": 38, + "anonymized_text": "**** ***" + }, + { + "original_text": "15-01-1985", + "label": "date of birth", + "start_index": 54, + "end_index": 64, + "anonymized_text": "**********" + }, + { + "original_text": "20-05-2024", + "label": "date", + "start_index": 86, + "end_index": 96, + "anonymized_text": "**********" + }, + { + "original_text": "123-45-6789", + "label": "social security number", + "start_index": 121, + "end_index": 132, + "anonymized_text": "***********" + }, + { + "original_text": "John Doe", + "label": "name", + "start_index": 157, + "end_index": 165, + "anonymized_text": "**** ***" + }, + { + "original_text": "15-11-2024", + "label": "date", + "start_index": 717, + "end_index": 727, + "anonymized_text": "**********" + } +] +``` + + + + +### Pseudonymization Strategy + +Pseudonymization is a data de-identification tool that substitutes private identifiers with false identifiers or pseudonyms, such as swapping the “John Smith” identifier with the “Mark Spencer” identifier. 
It maintains statistical precision and data confidentiality, allowing changed data to be used for creation, training, testing, and analysis, while at the same time maintaining data privacy. + +The [PseudonymizationStrategy][anonipy.anonymize.strategies.PseudonymizationStrategy] anonymizes the original text by replacing the entities with fake ones, which are created using the generators (see [generators][anonipy.anonymize.generators]). + +!!! info "Anonymization details" + The pseudonymization strategy is the most useful in terms of retaining the statistical distributions of the text. However, it is also the most technical, as the user must define a function for mapping true entities to fake ones. Furthermore, if an entity appears multiple times the pseudonymization strategy will retain the same mapping between the true and fake entities. + + +The `PseudonymizationStrategy` requires a function for mapping entities. In our example, we will define a function using the generators. To make the example as accessible as possible, we will use the [MaskLabelGenerator][anonipy.anonymize.generators.MaskLabelGenerator] instead of the [LLMLabelGenerator][anonipy.anonymize.generators.LLMLabelGenerator] for generating string entities. + +First, let us define the mapping function. We will import the required generators and initialize them. + +```python +from anonipy.anonymize.generators import ( + MaskLabelGenerator, + DateGenerator, + NumberGenerator, +) + +# initialize the generators +mask_generator = MaskLabelGenerator() +date_generator = DateGenerator() +number_generator = NumberGenerator() +``` + +Next, we will define the anonymization mapping function. This function receives two inputs: `text` and the `entity`. The `text` is the original text, and the `entity` is the current entity. The anonymization mapping will create a replacement for the given `entity` based on its information and context within the `text`. 
+ +```python +def anonymization_mapping(text, entity): + if entity.type == "string": + return mask_generator.generate(entity, text) + if entity.label == "date": + return date_generator.generate(entity, sub_variant="MIDDLE_OF_THE_MONTH") + if entity.label == "date of birth": + return date_generator.generate(entity, sub_variant="MIDDLE_OF_THE_YEAR") + if entity.label == "social security number": + return number_generator.generate(entity) + return "[REDACTED]" +``` + + +Let us now import the pseudonymization strategy. + +```python +from anonipy.anonymize.strategies import PseudonymizationStrategy +``` + + +The `PseudonymizationStrategy` requires the following input parameters at initialization: + +::: anonipy.anonymize.strategies.PseudonymizationStrategy.__init__ + options: + show_root_heading: False + show_docstring_description: False + show_docstring_examples: False + show_docstring_returns: False + show_source: False + +Let us now initialize the pseudonymization strategy. + +```python +pseudo_strategy = PseudonymizationStrategy( + mapping=anonymization_mapping +) +``` + +To use the strategy, we can call the `anonymize` method to anonymize the text given the `entities`. The `anonymize` method receives the following parameters: + +::: anonipy.anonymize.strategies.PseudonymizationStrategy.anonymize + options: + show_root_heading: False + show_docstring_description: False + show_docstring_examples: False + show_docstring_returns: False + show_source: False + +Using the `PseudonymizationStrategy`, we can now anonymize the text. + +```python +anonymized_text, replacements = pseudo_strategy.anonymize( + original_text, + entities +) +``` + +Which returns the anonymized text and the list of replacements made. + +!!! note "Generator performance" + While `MaskLabelGenerator` is faster and less resource intensive than `LLMLabelGenerator`, it sometimes does not provide a meaningful replacement. 
In the example below, the patient name `John Doe` is replaced with `first Professor`, which is not meaningful. Therefore, when possible, we advise using the `LLMLabelGenerator` instead. + +```python +print(anonymized_text) +``` +```text +Medical Record + +Patient Name: first Professor +Date of Birth: 01-07-1985 +Date of Examination: 15-05-2024 +Social Security Number: 724-78-8182 + +Examination Procedure: +first Professor underwent a routine physical examination. The procedure included measuring vital signs (blood pressure, heart rate, temperature), a comprehensive blood panel, and a cardiovascular stress test. The patient also reported occasional headaches and dizziness, prompting a neurological assessment and an MRI scan to rule out any underlying issues. + +Medication Prescribed: + +Ibuprofen 200 mg: Take one tablet every 6-8 hours as needed for headache and pain relief. +Lisinopril 10 mg: Take one tablet daily to manage high blood pressure. +Next Examination Date: +15-11-2024 +``` + +And the associated replacements are: + +```python +print(replacements) +``` +```json +[ + { + "original_text": "John Doe", + "label": "name", + "start_index": 30, + "end_index": 38, + "anonymized_text": "first Professor" + }, + { + "original_text": "15-01-1985", + "label": "date of birth", + "start_index": 54, + "end_index": 64, + "anonymized_text": "01-07-1985" + }, + { + "original_text": "20-05-2024", + "label": "date", + "start_index": 86, + "end_index": 96, + "anonymized_text": "15-05-2024" + }, + { + "original_text": "123-45-6789", + "label": "social security number", + "start_index": 121, + "end_index": 132, + "anonymized_text": "724-78-8182" + }, + { + "original_text": "John Doe", + "label": "name", + "start_index": 157, + "end_index": 165, + "anonymized_text": "first Professor" + }, + { + "original_text": "15-11-2024", + "label": "date", + "start_index": 717, + "end_index": 727, + "anonymized_text": "15-11-2024" + } +] +``` + + + +## Conclusion + +The strategies are used to 
anonymize the text in combination with the extracted `named entities`. The strategies are used to replace and anonymize the text as well as provide the list of replacements that were made to the original text. diff --git a/docs/index.md b/docs/index.md index f13ba84..e4c05c5 100644 --- a/docs/index.md +++ b/docs/index.md @@ -2,141 +2,11 @@ title: Home --- -

- logo -

- -

- Data anonymization package, supporting different anonymization strategies -

- -

- - Test - - - Package version - - - Supported Python versions - -

- - ---- - -**Documentation:** [https://eriknovak.github.io/anonipy](https://eriknovak.github.io/anonipy) - -**Source code:** [https://github.com/eriknovak/anonipy](https://github.com/eriknovak/anonipy) - ---- - -The anonipy package is a python package for data anonymization. It is designed to be simple to use and highly customizable, supporting different anonymization strategies. Powered by LLMs. - -## Install - -```bash -pip install anonipy -``` - -## Upgrade - -```bash -pip install anonipy --upgrade -``` - -## Example of usage - -The details of the example can be found in the [Overview](documentation/notebooks/00-overview.ipynb). - -```python -original_text = """\ -Medical Record - -Patient Name: John Doe -Date of Birth: 15-01-1985 -Date of Examination: 20-05-2024 -Social Security Number: 123-45-6789 - -Examination Procedure: -John Doe underwent a routine physical examination. The procedure included measuring vital signs (blood pressure, heart rate, temperature), a comprehensive blood panel, and a cardiovascular stress test. The patient also reported occasional headaches and dizziness, prompting a neurological assessment and an MRI scan to rule out any underlying issues. - -Medication Prescribed: - -Ibuprofen 200 mg: Take one tablet every 6-8 hours as needed for headache and pain relief. -Lisinopril 10 mg: Take one tablet daily to manage high blood pressure. 
-Next Examination Date: -15-11-2024 -""" -``` - -Use the language detector to detect the language of the text: - -```python -from anonipy.utils.language_detector import LanguageDetector - -lang_detector = LanguageDetector() -language = lang_detector(original_text) -``` - -Prepare the entity extractor and extract the personal infomation from the original text: - -```python -from anonipy.anonymize.extractors import EntityExtractor - -# define the labels to be extracted and anonymized -labels = [ - {"label": "name", "type": "string"}, - {"label": "social security number", "type": "custom"}, - {"label": "date of birth", "type": "date"}, - {"label": "date", "type": "date"}, -] - -# language taken from the language detector -entity_extractor = EntityExtractor(labels, lang=language, score_th=0.5) - -# extract the entities from the original text -doc, entities = entity_extractor(original_text) - -# display the entities in the original text -entity_extractor.display(doc) -``` - -Use generators to create substitutes for the entities: - -```python -from anonipy.anonymize.generators import ( - LLMLabelGenerator, - DateGenerator, - NumberGenerator, -) - -# initialize the generators -llm_generator = LLMLabelGenerator() -date_generator = DateGenerator() -number_generator = NumberGenerator() - -# prepare the anonymization mapping -def anonymization_mapping(text, entity): - if entity.type == "string": - return llm_generator.generate(entity, temperature=0.7) - if entity.label == "date": - return date_generator.generate(entity, output_gen="middle_of_the_month") - if entity.label == "date of birth": - return date_generator.generate(entity, output_gen="middle_of_the_year") - if entity.label == "social security number": - return number_generator.generate(entity) - return "[REDACTED]" -``` - -Anonymize the text using the anonymization mapping: - -```python -from anonipy.anonymize.strategies import PseudonymizationStrategy - -# initialize the pseudonymization strategy -pseudo_strategy = 
PseudonymizationStrategy(mapping=anonymization_mapping) - -# anonymize the original text -anonymized_text, replacements = pseudo_strategy.anonymize(original_text, entities) -``` \ No newline at end of file + + +--8<-- "README.md" \ No newline at end of file diff --git a/docs/references/anonymize/extractors.md b/docs/references/anonymize/extractors.md new file mode 100644 index 0000000..043d524 --- /dev/null +++ b/docs/references/anonymize/extractors.md @@ -0,0 +1,15 @@ +--- +title: Extractors Module +--- + +# Extractors Module + +::: anonipy.anonymize.extractors + options: + heading_level: 2 + +::: anonipy.anonymize.extractors.NERExtractor + +::: anonipy.anonymize.extractors.PatternExtractor + +::: anonipy.anonymize.extractors.MultiExtractor \ No newline at end of file diff --git a/docs/references/anonymize/generators.md b/docs/references/anonymize/generators.md new file mode 100644 index 0000000..aa705fb --- /dev/null +++ b/docs/references/anonymize/generators.md @@ -0,0 +1,17 @@ +--- +title: Generators Module +--- + +# Generators Module + +::: anonipy.anonymize.generators + options: + heading_level: 2 + +::: anonipy.anonymize.generators.LLMLabelGenerator + +::: anonipy.anonymize.generators.MaskLabelGenerator + +::: anonipy.anonymize.generators.NumberGenerator + +::: anonipy.anonymize.generators.DateGenerator diff --git a/docs/references/anonymize/index.md b/docs/references/anonymize/index.md new file mode 100644 index 0000000..49e732b --- /dev/null +++ b/docs/references/anonymize/index.md @@ -0,0 +1,13 @@ +--- +title: Anonymize Module +--- + +# Anonymize Module + +::: anonipy.anonymize + options: + heading_level: 2 + +## Functions + +::: anonipy.anonymize.anonymize \ No newline at end of file diff --git a/docs/references/anonymize/strategies.md b/docs/references/anonymize/strategies.md new file mode 100644 index 0000000..e243450 --- /dev/null +++ b/docs/references/anonymize/strategies.md @@ -0,0 +1,15 @@ +--- +title: Strategies Module +--- + +# Strategies Module 
+ +::: anonipy.anonymize.strategies + options: + heading_level: 2 + +::: anonipy.anonymize.strategies.RedactionStrategy + +::: anonipy.anonymize.strategies.MaskingStrategy + +::: anonipy.anonymize.strategies.PseudonymizationStrategy \ No newline at end of file diff --git a/docs/references/constants.md b/docs/references/constants.md new file mode 100644 index 0000000..eef2ec6 --- /dev/null +++ b/docs/references/constants.md @@ -0,0 +1,9 @@ +--- +title: Constants Module +--- + +# Constants Module + +::: anonipy.constants + options: + heading_level: 2 \ No newline at end of file diff --git a/docs/references/definitions.md b/docs/references/definitions.md new file mode 100644 index 0000000..fca9985 --- /dev/null +++ b/docs/references/definitions.md @@ -0,0 +1,9 @@ +--- +title: Definitions Module +--- + +# Definitions Module + +::: anonipy.definitions + options: + heading_level: 2 diff --git a/docs/references/index.md b/docs/references/index.md new file mode 100644 index 0000000..6966412 --- /dev/null +++ b/docs/references/index.md @@ -0,0 +1,9 @@ +--- +title: Anonipy Module +--- + +# Anonipy Module + +::: anonipy + options: + heading_level: 2 \ No newline at end of file diff --git a/docs/references/utils/file_system.md b/docs/references/utils/file_system.md new file mode 100644 index 0000000..0d30819 --- /dev/null +++ b/docs/references/utils/file_system.md @@ -0,0 +1,9 @@ +--- +title: File System Module +--- + +# File System Module + +::: anonipy.utils.file_system + options: + heading_level: 2 diff --git a/docs/references/utils/index.md b/docs/references/utils/index.md new file mode 100644 index 0000000..0ca2ee9 --- /dev/null +++ b/docs/references/utils/index.md @@ -0,0 +1,11 @@ +--- +title: Utils Module +hide: + - toc +--- + +# Utils Module + +::: anonipy.utils + options: + heading_level: 2 diff --git a/docs/references/utils/language_detector.md b/docs/references/utils/language_detector.md new file mode 100644 index 0000000..4f5d4f4 --- /dev/null +++ 
b/docs/references/utils/language_detector.md @@ -0,0 +1,12 @@ +--- +title: Language Detector Module +--- + +# Language Detector Module + +::: anonipy.utils.language_detector + options: + members: False + heading_level: 2 + +::: anonipy.utils.language_detector.LanguageDetector \ No newline at end of file diff --git a/docs/references/utils/regex.md b/docs/references/utils/regex.md new file mode 100644 index 0000000..e4dfc95 --- /dev/null +++ b/docs/references/utils/regex.md @@ -0,0 +1,9 @@ +--- +title: Regex Module +--- + +# Regex Module + +::: anonipy.utils.regex + options: + heading_level: 2 diff --git a/mkdocs.yml b/mkdocs.yml index de99c6f..f9c538d 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -1,5 +1,5 @@ # Project information -site_name: anonipy +site_name: Anonipy site_author: Erik Novak site_description: >- Data anonymization package, supporting different anonymization strategies @@ -45,13 +45,26 @@ theme: edit_uri: "" +extra_css: + - css/extra.css + # Plugins plugins: - - blog + - blog: + blog_dir: how-to-guides + - autorefs - search: separator: '[\s\u200b\-_,:!=\[\]()"`/]+|\.(?!\d)|&[lg]t;|(?!\b)(?=[A-Z][a-z])' - mkdocs-jupyter: include: ["*.ipynb"] + - mkdocstrings: + handlers: + python: + options: + show_root_heading: True + show_root_toc_entry: False + members_order: "source" + heading_level: 3 # Markdown Extensions markdown_extensions: @@ -76,13 +89,23 @@ markdown_extensions: nav: - Home: index.md - - Documentation: - - Overview: documentation/notebooks/00-overview.ipynb - - Extractors: documentation/notebooks/01-extractors.ipynb - - Generators: documentation/notebooks/02-generators.ipynb - - Strategies: documentation/notebooks/03-strategies.ipynb - - Utility: documentation/notebooks/04-utility.ipynb - - Blog: - - blog/index.md + - How-To Guides: + - how-to-guides/index.md + + - API Reference: + - references/index.md + - anonymize: + - references/anonymize/index.md + - extractors: references/anonymize/extractors.md + - generators: 
references/anonymize/generators.md + - strategies: references/anonymize/strategies.md + - utils: + - references/utils/index.md + - regex: references/utils/regex.md + - file_system: references/utils/file_system.md + - language_detector: references/utils/language_detector.md + - definitions: references/definitions.md + - constants: references/constants.md + - Changelog: changelog.md - Development: development.md \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index dd517c3..934fa92 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,6 +33,7 @@ dev = [ "python-githooks", "mkdocs-material", "mkdocs-jupyter", + "mkdocstrings[python]", ] test = [ "coverage", diff --git a/requirements.txt b/requirements.txt index 5e112ff..79e7531 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # NLP and LLMs spacy gliner==0.2.2 -gliner-spacy>=0.0.8 +gliner-spacy==0.0.8 transformers bitsandbytes lingua-language-detector diff --git a/test/test_extractors.py b/test/test_extractors.py index 5dc5865..153c520 100644 --- a/test/test_extractors.py +++ b/test/test_extractors.py @@ -4,8 +4,8 @@ import torch from anonipy.definitions import Entity -from anonipy.anonymize.extractors import EntityExtractor -from anonipy.anonymize.regex import regex_map +from anonipy.anonymize.extractors import NERExtractor, PatternExtractor, MultiExtractor +from anonipy.utils.regex import regex_mapping from anonipy.constants import LANGUAGES @@ -32,14 +32,13 @@ 15-11-2024 """ -original_entities = [ +ner_entities = [ Entity( text="John Doe", label="name", start_index=30, end_index=38, type="string", - regex=regex_map("string"), ), Entity( text="15-01-1985", @@ -47,7 +46,6 @@ start_index=54, end_index=64, type="date", - regex=regex_map("date"), ), Entity( text="20-05-2024", @@ -55,7 +53,6 @@ start_index=86, end_index=96, type="date", - regex=regex_map("date"), ), Entity( text="123-45-6789", @@ -71,7 +68,6 @@ start_index=157, end_index=165, type="string", - 
regex=regex_map("string"), ), Entity( text="15-11-2024", @@ -79,68 +75,285 @@ start_index=717, end_index=727, type="date", - regex=regex_map("date"), ), ] -# define the labels to be extracted and anonymized -labels = [ - {"label": "name", "type": "string"}, - { - "label": "social security number", - "type": "custom", - "regex": "[0-9]{3}-[0-9]{2}-[0-9]{4}", - }, - {"label": "date of birth", "type": "date"}, - {"label": "date", "type": "date"}, +pattern_entities = [ + Entity( + text="15-01-1985", + label="date", + start_index=54, + end_index=64, + type=None, + ), + Entity( + text="20-05-2024", + label="date", + start_index=86, + end_index=96, + type=None, + ), + Entity( + text="blood pressure, heart rate, temperature", + label="symptoms", + start_index=254, + end_index=293, + type=None, + regex="\\((.*)\\)", + ), + Entity( + text="Ibuprofen 200 mg", + label="medicine", + start_index=533, + end_index=549, + type=None, + ), + Entity( + text="Lisinopril 10 mg", + label="medicine", + start_index=623, + end_index=639, + type=None, + ), + Entity( + text="15-11-2024", + label="date", + start_index=717, + end_index=727, + type=None, + ), ] # ===================================== -# Test Entity Extractor +# Test NER Extractor # ===================================== -class TestEntityExtractor(unittest.TestCase): +class TestNERExtractor(unittest.TestCase): def setUp(self): warnings.filterwarnings("ignore", category=ImportWarning) warnings.filterwarnings("ignore", category=UserWarning) warnings.filterwarnings("ignore", category=FutureWarning) + # define the labels to be extracted and anonymized + self.labels = [ + {"label": "name", "type": "string"}, + { + "label": "social security number", + "type": "custom", + "regex": "[0-9]{3}-[0-9]{2}-[0-9]{4}", + }, + {"label": "date of birth", "type": "date"}, + {"label": "date", "type": "date"}, + ] def test_init(self): try: - EntityExtractor() + NERExtractor() except Exception as e: self.assertRaises(TypeError, e) def 
test_init_inputs(self): - extractor = EntityExtractor(labels=labels, lang=LANGUAGES.ENGLISH, score_th=0.5) - self.assertEqual(extractor.__class__, EntityExtractor) + extractor = NERExtractor( + labels=self.labels, lang=LANGUAGES.ENGLISH, score_th=0.5 + ) + self.assertEqual(extractor.__class__, NERExtractor) def test_init_gpu(self): if torch.cuda.is_available(): - extractor = EntityExtractor( - labels=labels, lang=LANGUAGES.ENGLISH, score_th=0.5, use_gpu=True + extractor = NERExtractor( + labels=self.labels, lang=LANGUAGES.ENGLISH, score_th=0.5, use_gpu=True ) - self.assertEqual(extractor.__class__, EntityExtractor) + self.assertEqual(extractor.__class__, NERExtractor) + + def test_methods(self): + extractor = NERExtractor( + labels=self.labels, lang=LANGUAGES.ENGLISH, score_th=0.5 + ) + self.assertEqual(hasattr(extractor, "__call__"), True) + self.assertEqual(hasattr(extractor, "display"), True) + + def test_extract_default(self): + extractor = NERExtractor( + labels=self.labels, lang=LANGUAGES.ENGLISH, score_th=0.5 + ) + doc, entities = extractor(original_text) + for p_entity, t_entity in zip(entities, ner_entities): + self.assertEqual(p_entity.text, t_entity.text) + self.assertEqual(p_entity.label, t_entity.label) + self.assertEqual(p_entity.start_index, t_entity.start_index) + self.assertEqual(p_entity.end_index, t_entity.end_index) + self.assertEqual(p_entity.type, t_entity.type) + self.assertEqual(p_entity.regex, t_entity.regex) + self.assertEqual(p_entity.score >= 0.5, True) + + +# ===================================== +# Test Pattern Extractor +# ===================================== + + +class TestPatternExtractor(unittest.TestCase): + + def setUp(self): + warnings.filterwarnings("ignore", category=ImportWarning) + warnings.filterwarnings("ignore", category=UserWarning) + warnings.filterwarnings("ignore", category=FutureWarning) + # define the labels to be extracted and anonymized + self.labels = [ + { + "label": "symptoms", + "regex": r"\((.*)\)", # 
symptoms are enclosed in parentheses + }, + { + "label": "medicine", + "pattern": [[{"IS_ALPHA": True}, {"LIKE_NUM": True}, {"LOWER": "mg"}]], + }, + { + "label": "date", + "pattern": [ # represent the date as a sequence of digits using spacy + [ + {"SHAPE": "dd"}, + {"TEXT": "-"}, + {"SHAPE": "dd"}, + {"TEXT": "-"}, + {"SHAPE": "dddd"}, + ] + ], + }, + ] + + def test_init(self): + try: + PatternExtractor() + except Exception as e: + self.assertRaises(TypeError, e) + + def test_init_inputs(self): + extractor = PatternExtractor(labels=self.labels, lang=LANGUAGES.ENGLISH) + self.assertEqual(extractor.__class__, PatternExtractor) def test_methods(self): - extractor = EntityExtractor(labels=labels, lang=LANGUAGES.ENGLISH, score_th=0.5) + extractor = PatternExtractor(labels=self.labels, lang=LANGUAGES.ENGLISH) self.assertEqual(hasattr(extractor, "__call__"), True) self.assertEqual(hasattr(extractor, "display"), True) def test_extract_default(self): - extractor = EntityExtractor(labels=labels, lang=LANGUAGES.ENGLISH, score_th=0.5) + extractor = PatternExtractor(labels=self.labels, lang=LANGUAGES.ENGLISH) doc, entities = extractor(original_text) - for pred_entity, orig_entity in zip(entities, original_entities): - self.assertEqual(pred_entity.text, orig_entity.text) - self.assertEqual(pred_entity.label, orig_entity.label) - self.assertEqual(pred_entity.start_index, orig_entity.start_index) - self.assertEqual(pred_entity.end_index, orig_entity.end_index) - self.assertEqual(pred_entity.type, orig_entity.type) - self.assertEqual(pred_entity.regex, orig_entity.regex) - self.assertEqual(pred_entity.score >= 0.5, True) + for p_entity, t_entity in zip(entities, pattern_entities): + self.assertEqual(p_entity.text, t_entity.text) + self.assertEqual(p_entity.label, t_entity.label) + self.assertEqual(p_entity.start_index, t_entity.start_index) + self.assertEqual(p_entity.end_index, t_entity.end_index) + self.assertEqual(p_entity.type, t_entity.type) + 
self.assertEqual(p_entity.regex, t_entity.regex) + self.assertEqual(p_entity.score == 1.0, True) + + +class TestMultiExtractor(unittest.TestCase): + + def setUp(self): + warnings.filterwarnings("ignore", category=ImportWarning) + warnings.filterwarnings("ignore", category=UserWarning) + warnings.filterwarnings("ignore", category=FutureWarning) + # define the labels to be extracted and anonymized + self.ner_labels = [ + {"label": "name", "type": "string"}, + { + "label": "social security number", + "type": "custom", + "regex": "[0-9]{3}-[0-9]{2}-[0-9]{4}", + }, + {"label": "date of birth", "type": "date"}, + {"label": "date", "type": "date"}, + ] + self.pattern_labels = [ + { + "label": "symptoms", + "regex": r"\((.*)\)", # symptoms are enclosed in parentheses + }, + { + "label": "medicine", + "pattern": [[{"IS_ALPHA": True}, {"LIKE_NUM": True}, {"LOWER": "mg"}]], + }, + { + "label": "date", + "pattern": [ # represent the date as a sequence of digits using spacy + [ + {"SHAPE": "dd"}, + {"TEXT": "-"}, + {"SHAPE": "dd"}, + {"TEXT": "-"}, + {"SHAPE": "dddd"}, + ] + ], + }, + ] + + def test_init(self): + try: + MultiExtractor() + except Exception as e: + self.assertRaises(TypeError, e) + + def test_init_inputs(self): + extractors = [ + NERExtractor(labels=self.ner_labels, lang=LANGUAGES.ENGLISH), + PatternExtractor(labels=self.pattern_labels, lang=LANGUAGES.ENGLISH), + ] + extractor = MultiExtractor(extractors) + self.assertEqual(extractor.__class__, MultiExtractor) + + def test_methods(self): + extractors = [ + NERExtractor(labels=self.ner_labels, lang=LANGUAGES.ENGLISH), + PatternExtractor(labels=self.pattern_labels, lang=LANGUAGES.ENGLISH), + ] + extractor = MultiExtractor(extractors) + self.assertEqual(hasattr(extractor, "__call__"), True) + self.assertEqual(hasattr(extractor, "display"), True) + + def test_extract_default(self): + extractors = [ + NERExtractor(labels=self.ner_labels, lang=LANGUAGES.ENGLISH), + PatternExtractor(labels=self.pattern_labels, 
lang=LANGUAGES.ENGLISH), + ] + extractor = MultiExtractor(extractors) + extractor_outputs, joint_entities = extractor(original_text) + + # check the performance of the first extractor + for p_entity, t_entity in zip(extractor_outputs[0][1], ner_entities): + self.assertEqual(p_entity.text, t_entity.text) + self.assertEqual(p_entity.label, t_entity.label) + self.assertEqual(p_entity.start_index, t_entity.start_index) + self.assertEqual(p_entity.end_index, t_entity.end_index) + self.assertEqual(p_entity.type, t_entity.type) + self.assertEqual(p_entity.regex, t_entity.regex) + self.assertEqual(p_entity.score >= 0.5, True) + + # check the performance of the second extractor + for p_entity, t_entity in zip(extractor_outputs[1][1], pattern_entities): + self.assertEqual(p_entity.text, t_entity.text) + self.assertEqual(p_entity.label, t_entity.label) + self.assertEqual(p_entity.start_index, t_entity.start_index) + self.assertEqual(p_entity.end_index, t_entity.end_index) + self.assertEqual(p_entity.type, t_entity.type) + self.assertEqual(p_entity.regex, t_entity.regex) + self.assertEqual(p_entity.score == 1.0, True) + + # check the performance of the joint entities generation + for p_entity, t_entity in zip( + joint_entities, extractor._filter_entities(ner_entities + pattern_entities) + ): + self.assertEqual(p_entity.text, t_entity.text) + self.assertEqual(p_entity.label, t_entity.label) + self.assertEqual(p_entity.start_index, t_entity.start_index) + self.assertEqual(p_entity.end_index, t_entity.end_index) + self.assertEqual(p_entity.type, t_entity.type) + self.assertEqual(p_entity.regex, t_entity.regex) + self.assertEqual(p_entity.score >= 0.5, True) if __name__ == "__main__": diff --git a/test/test_generators.py b/test/test_generators.py index 1937aa1..0aa54ea 100644 --- a/test/test_generators.py +++ b/test/test_generators.py @@ -11,7 +11,7 @@ DateGenerator, NumberGenerator, ) -from anonipy.anonymize.regex import regex_map +from anonipy.utils.regex import regex_mapping # 
===================================== # Test Cases @@ -118,7 +118,6 @@ end_index=38, score=1.0, type="string", - regex=regex_map("string"), ), "date": [ Entity( @@ -128,7 +127,6 @@ end_index=96, score=1.0, type="date", - regex=regex_map("date"), ) ] + [ @@ -139,7 +137,6 @@ end_index=86 + len(str), score=1.0, type="date", - regex=regex_map("date"), ) for str in DATETIME_STRS ], @@ -150,7 +147,6 @@ end_index=132, score=1.0, type="integer", - regex=regex_map("integer"), ), "float": Entity( text="123,456,789.000", @@ -159,7 +155,6 @@ end_index=132, score=1.0, type="float", - regex=regex_map("float"), ), "custom": Entity( text="123-45-6789", @@ -199,7 +194,7 @@ def test_generate_default(self): def test_generate_custom(self): entity = test_entities["name"] generated_text = self.generator.generate( - entity, entity_prefix="Spanish", temperature=0.5 + entity, add_entity_attrs="Spanish", temperature=0.5 ) match = re.match(entity.regex, generated_text) self.assertNotEqual(match, None) @@ -273,34 +268,34 @@ def text_generate_uncorrect_date_format(self): def test_generate_first_day_of_the_month(self): entity = test_entities["date"][0] generated_text = self.generator.generate( - entity, output_gen="first_day_of_the_month" + entity, sub_variant="FIRST_DAY_OF_THE_MONTH" ) self.assertEqual(generated_text, "01-05-2024") def test_generate_last_day_of_the_month(self): entity = test_entities["date"][0] generated_text = self.generator.generate( - entity, output_gen="last_day_of_the_month" + entity, sub_variant="LAST_DAY_OF_THE_MONTH" ) self.assertEqual(generated_text, "31-05-2024") def test_generate_middle_of_the_month(self): entity = test_entities["date"][0] generated_text = self.generator.generate( - entity, output_gen="middle_of_the_month" + entity, sub_variant="MIDDLE_OF_THE_MONTH" ) self.assertEqual(generated_text, "15-05-2024") def test_generate_middle_of_the_year(self): entity = test_entities["date"][0] generated_text = self.generator.generate( - entity, 
output_gen="middle_of_the_year" + entity, sub_variant="MIDDLE_OF_THE_YEAR" ) self.assertEqual(generated_text, "01-07-2024") def test_generate_random(self): entity = test_entities["date"][0] - generated_text = self.generator.generate(entity, output_gen="random") + generated_text = self.generator.generate(entity, sub_variant="RANDOM") match = re.match(entity.regex, generated_text) self.assertNotEqual(match, None) self.assertEqual(match.group(0), generated_text) @@ -315,7 +310,7 @@ def test_generate_uncorrect_type(self): def test_process_different_formats(self): for entity in test_entities["date"]: try: - self.generator.generate(entity, output_gen="random") + self.generator.generate(entity, sub_variant="RANDOM") except ValueError: self.fail( f"self.generator.generate() raised ValueError unexpectedly for date: {entity.text}" diff --git a/test/test_regex.py b/test/test_regex.py index 7730d4b..8a8d382 100644 --- a/test/test_regex.py +++ b/test/test_regex.py @@ -1,8 +1,8 @@ import unittest -from anonipy.anonymize.regex import ( - regex_map, - RegexMap, +from anonipy.utils.regex import ( + regex_mapping, + RegexMapping, REGEX_STRING, REGEX_INTEGER, REGEX_FLOAT, @@ -55,15 +55,15 @@ class TestRegex(unittest.TestCase): def test_init(self): - self.assertEqual(regex_map.__class__, RegexMap) - self.assertEqual(hasattr(regex_map, "regex_mapping"), True) + self.assertEqual(regex_mapping.__class__, RegexMapping) + self.assertEqual(hasattr(regex_mapping, "regex_mapping"), True) def test_regex_mapping(self): for test_case in TEST_CASES: - self.assertEqual(regex_map(test_case["entity"]), test_case["regex"]) + self.assertEqual(regex_mapping[test_case["entity"]], test_case["regex"]) self.assertEqual( - regex_map(test_case["value"]), regex_map(test_case["entity"]) + regex_mapping[test_case["value"]], regex_mapping[test_case["entity"]] ) diff --git a/test/test_strategies.py b/test/test_strategies.py index d614864..7ab6dac 100644 --- a/test/test_strategies.py +++ b/test/test_strategies.py 
@@ -13,9 +13,9 @@ test_text = "Test this string, and this test too!" test_entities = [ - Entity(text="Test", label="test", start_index=0, end_index=4, score=1.0), - Entity(text="string", label="type", start_index=10, end_index=16, score=1.0), - Entity(text="test", label="test", start_index=27, end_index=31, score=1.0), + Entity(text="Test", label="test", start_index=0, end_index=4), + Entity(text="string", label="type", start_index=10, end_index=16), + Entity(text="test", label="test", start_index=27, end_index=31), ]