From 707cec1ec22cd256b192a419bd52f9353bbc4bef Mon Sep 17 00:00:00 2001 From: eriknovak Date: Tue, 18 Jun 2024 16:25:22 +0200 Subject: [PATCH 1/2] Add code documentation --- anonipy/__init__.py | 18 ++- anonipy/anonymize/__init__.py | 22 +++ anonipy/anonymize/extractors/__init__.py | 14 ++ .../anonymize/extractors/entity_extractor.py | 66 ++++++++- anonipy/anonymize/extractors/interface.py | 1 + anonipy/anonymize/generators/__init__.py | 20 +++ .../anonymize/generators/date_generator.py | 120 ++++++++++++++++ anonipy/anonymize/generators/interface.py | 1 + .../generators/llm_label_generator.py | 57 +++++++- .../generators/mask_label_generator.py | 46 ++++++ .../anonymize/generators/number_generator.py | 34 +++++ anonipy/anonymize/helpers.py | 33 +++++ anonipy/anonymize/regex.py | 14 +- anonipy/anonymize/strategies/__init__.py | 18 +++ anonipy/anonymize/strategies/interface.py | 8 ++ anonipy/anonymize/strategies/masking.py | 38 +++++ .../anonymize/strategies/pseudonymization.py | 38 +++++ anonipy/anonymize/strategies/redaction.py | 38 +++++ anonipy/constants.py | 16 ++- anonipy/definitions.py | 50 ++++++- anonipy/utils/__init__.py | 14 ++ anonipy/utils/datetime.py | 14 ++ anonipy/utils/file_system.py | 136 ++++++++++++++++++ anonipy/utils/language_detector.py | 45 +++++- 24 files changed, 840 insertions(+), 21 deletions(-) diff --git a/anonipy/__init__.py b/anonipy/__init__.py index e62ee03..ff20e30 100644 --- a/anonipy/__init__.py +++ b/anonipy/__init__.py @@ -1,11 +1,19 @@ """ anonipy -========= -Provides - 1. Label extractors - 2. Label generators - 3. Anonymization strategies +The anonipy package provides utilities for data anonymization. + +Submodules +---------- +anonymize : + The package containing anonymization classes and functions. +utils : + The package containing utility classes and functions. +definitions : + The object definitions used within the package. +constants : + The constant values used to help with data anonymization. + How to use the documentation ---------------------------- diff --git a/anonipy/anonymize/__init__.py b/anonipy/anonymize/__init__.py index 879f10f..598d0c6 100644 --- a/anonipy/anonymize/__init__.py +++ b/anonipy/anonymize/__init__.py @@ -1,3 +1,25 @@ +""" +anonymize + +The module provides a set of anonymization utilities. + +Submodules +---------- +extractors : + The module containing the extractor classes +generators : + The module containing the generator classes +strategies : + The module containing the strategy classes +regex : + The module containing the regex patterns + +Methods +------- +anonymize() + +""" + from . import extractors from . import generators from . import strategies diff --git a/anonipy/anonymize/extractors/__init__.py b/anonipy/anonymize/extractors/__init__.py index 58dac1f..ec53b8a 100644 --- a/anonipy/anonymize/extractors/__init__.py +++ b/anonipy/anonymize/extractors/__init__.py @@ -1,3 +1,17 @@ +""" +extractors + +The module provides a set of extractors used in the library. + +Classes +------- +ExtractorInterface : + The class representing the extractor interface +EntityExtractor : + The class representing the entity extractor + +""" + from .interface import ExtractorInterface from .entity_extractor import EntityExtractor diff --git a/anonipy/anonymize/extractors/entity_extractor.py b/anonipy/anonymize/extractors/entity_extractor.py index e1aa510..259893d 100644 --- a/anonipy/anonymize/extractors/entity_extractor.py +++ b/anonipy/anonymize/extractors/entity_extractor.py @@ -16,6 +16,30 @@ class EntityExtractor(ExtractorInterface): + """The class representing the entity extractor + + Attributes + ---------- + labels : List[dict] + The list of labels to extract + lang : str + The language of the text to extract + score_th : float + The score threshold + use_gpu : bool + Whether to use GPU + pipeline : spacy pipeline + The spacy pipeline + + + Methods + ------- + __call__(self, text: str) + Extract the entities from the text + display(self, doc: Doc) + Display the entities in the text + + """ def __init__( self, @@ -23,19 +47,59 @@ def __init__( lang: LANGUAGES = LANGUAGES.ENGLISH, score_th=0.5, use_gpu=False, + *args, + **kwargs, ): + """ + Parameters + ---------- + labels : List[dict] + The list of labels to extract + lang : str + The language of the text to extract + score_th : float + The score threshold. Entities with a score below this threshold will be ignored. Default: 0.5 + use_gpu : bool + Whether to use GPU. Default: False + + """ + + super().__init__(labels, *args, **kwargs) self.lang = lang self.score_th = score_th self.use_gpu = use_gpu self.labels = self._prepare_labels(labels) self.pipeline = self._prepare_pipeline() - def __call__(self, text: str) -> Tuple[Doc, List[Entity]]: + def __call__(self, text: str, *args, **kwargs) -> Tuple[Doc, List[Entity]]: + """Extract the entities from the text + + Parameters + ---------- + text : str + The text to extract entities from + + Returns + ------- + Tuple[Doc, List[Entity]] + The spacy doc and the list of entities extracted + + """ + doc = self.pipeline(text) entities, doc.ents = self._prepare_entities(doc) return doc, entities def display(self, doc: Doc): + """Display the entities in the text + + Parameters + ---------- + doc : Doc + The spacy doc to display + + """ + options = {"colors": {l["label"]: "#5C7AEA" for l in self.labels}} displacy.render(doc, style="ent", options=options) diff --git a/anonipy/anonymize/extractors/interface.py b/anonipy/anonymize/extractors/interface.py index 096e77c..092f5fe 100644 --- a/anonipy/anonymize/extractors/interface.py +++ b/anonipy/anonymize/extractors/interface.py @@ -5,6 +5,7 @@ class ExtractorInterface: + """The class representing the extractor interface""" def __init__(self, labels: List[dict], *args, **kwargs): pass diff --git a/anonipy/anonymize/generators/__init__.py b/anonipy/anonymize/generators/__init__.py index 56a17bb..b0b8b74 100644 --- a/anonipy/anonymize/generators/__init__.py +++ b/anonipy/anonymize/generators/__init__.py @@ -1,3 +1,23 @@ +""" +generators + +The module provides a set of generators used in the library. + +Classes +------- +GeneratorInterface : + The class representing the generator interface +LLMLabelGenerator : + The class representing the LLM label generator +MaskLabelGenerator : + The class representing the mask label generator +NumberGenerator : + The class representing the number generator +DateGenerator : + The class representing the date generator + +""" + from .interface import GeneratorInterface from .llm_label_generator import LLMLabelGenerator from .mask_label_generator import MaskLabelGenerator diff --git a/anonipy/anonymize/generators/date_generator.py b/anonipy/anonymize/generators/date_generator.py index 08ce5a0..8da1ca3 100644 --- a/anonipy/anonymize/generators/date_generator.py +++ b/anonipy/anonymize/generators/date_generator.py @@ -12,23 +12,94 @@ def first_day_of_month(day: datetime.datetime, *args, **kwargs): + """Returns the first day of the month of the given date + + Parameters + ---------- + day : datetime.datetime + The date to get the first day of the month from + + Returns + ------- + datetime.datetime + The first day of the month of the given date + + """ return day.replace(day=1) def last_day_of_month(day: datetime.datetime, *args, **kwargs): + """Returns the last day of the month of the given date + + Parameters + ---------- + day : datetime.datetime + The date to get the last day of the month from + + Returns + ------- + datetime.datetime + The last day of the month of the given date + + """ next_month = day.replace(day=28) + datetime.timedelta(days=4) return next_month - datetime.timedelta(days=next_month.day) def middle_of_the_month(day: datetime.datetime, *args, **kwargs): + """Returns the middle day of the month of the given date + + Parameters + ---------- + day : datetime.datetime + The date to get the middle day of the month from + + Returns + ------- + datetime.datetime + The middle day of the month of the given date + + """ + return day.replace(day=15) def middle_of_the_year(day: datetime.datetime, *args, **kwargs): + """Returns the middle day of the year of the given date + + Parameters + ---------- + day : datetime.datetime + The date to get the middle day of the year from + + Returns + ------- + datetime.datetime + The middle day of the year of the given date + + """ + return day.replace(month=7, day=1) def random_date(day: datetime.datetime, sigma: int = 30, *args, **kwargs): + """Returns a random date within the given date range + + The function returns a date within the range [day - sigma, day + sigma]. + + Parameters + ---------- + day : datetime.datetime + The date to get the random date from + sigma : int + The range of the random date in days. Default: 30 + + Returns + ------- + datetime.datetime + The random date within the given date range + + """ delta = random.randint(-sigma, sigma) return day + datetime.timedelta(days=delta) @@ -48,12 +119,61 @@ def random_date(day: datetime.datetime, sigma: int = 30, *args, **kwargs): class DateGenerator(GeneratorInterface): + """ + The class representing the date generator + + Attributes + ---------- + date_format : str + The date format to use + day_sigma : int + The range of the random date in days + + Methods + ------- + generate(entity: Entity, output_gen: str = "random") + Generate the date based on the entity and output_gen + + """ def __init__(self, date_format="auto", day_sigma: int = 30, *args, **kwargs): + """ + Parameters + ---------- + date_format : str, optional + The date format to use. Default: "auto" + day_sigma : int, optional + The range of the random date in days. Default: 30 + + """ + + super().__init__(*args, **kwargs) self.date_format = date_format self.day_sigma = day_sigma def generate(self, entity: Entity, output_gen: str = "random", *args, **kwargs): + """ + Generate the date based on the entity and output_gen + + Parameters + ---------- + entity : Entity + The entity to generate the date from + output_gen : str, optional + The output generator to use. Default: "random" + + Returns + ------- + str + The generated date + + Raises + ------ + ValueError + If the entity type is not `date` or `custom` + + """ + if entity.type in ["custom"]: warnings.warn( "The entity type is `custom`. Make sure the generator is returning appropriate values." diff --git a/anonipy/anonymize/generators/interface.py b/anonipy/anonymize/generators/interface.py index 5303a27..3dec839 100644 --- a/anonipy/anonymize/generators/interface.py +++ b/anonipy/anonymize/generators/interface.py @@ -7,6 +7,7 @@ class GeneratorInterface: + """The class representing the generator interface""" def __init__(self, *args, **kwargs): pass diff --git a/anonipy/anonymize/generators/llm_label_generator.py b/anonipy/anonymize/generators/llm_label_generator.py index 1001c69..b73a2e8 100644 --- a/anonipy/anonymize/generators/llm_label_generator.py +++ b/anonipy/anonymize/generators/llm_label_generator.py @@ -75,8 +75,32 @@ def prepare_llama3_byte_decoder(): class LLMLabelGenerator(GeneratorInterface): + """The class representing the LLM label generator + + Attributes + ---------- + model : models.Transformers + The model used to generate the label + + Methods + ------- + generate(entity: Entity, entity_prefix: str = "", temperature: float = 0.0) + Generate the label based on the entity + + validate(entity: Entity) + Validate the entity + + """ def __init__(self, *args, **kwargs): + """ + Parameters + ---------- + None + + """ + + super().__init__(*args, **kwargs) # TODO: make this configurable model_name = "meta-llama/Meta-Llama-3-8B-Instruct" @@ -96,6 +120,24 @@ def generate( *args, **kwargs, ): + """Generate the label based on the entity + + Parameters + ---------- + entity : Entity + The entity to generate the label from + entity_prefix : str + The prefix to use for the entity + temperature : float + The temperature to use for the generation. Default: 0.0 + + Returns + ------- + str + The generated label + + """ + user_prompt = f"What is a random {entity_prefix} {entity.label} replacement for {entity.text}? Respond only with the replacement." assistant_prompt = gen( name="replacement", @@ -113,7 +155,20 @@ def generate( return lm["replacement"] def validate(self, entity: Entity): - regex = regex if regex else ".*" + """Validate the entity + + Parameters + ---------- + entity : Entity + The entity to validate + + Returns + ------- + bool + The validation result + + """ + user_prompt = f"Is {entity.text} a {entity.label}?" assistant_prompt = select(["True", "False"], name="validation") # validate the entity with the validation prompt diff --git a/anonipy/anonymize/generators/mask_label_generator.py b/anonipy/anonymize/generators/mask_label_generator.py index cc8390b..2ab6aab 100644 --- a/anonipy/anonymize/generators/mask_label_generator.py +++ b/anonipy/anonymize/generators/mask_label_generator.py @@ -18,6 +18,23 @@ class MaskLabelGenerator(GeneratorInterface): + """The class representing the mask label generator + + Attributes + ---------- + context_window : int + The context window size + pipeline : transformers pipeline + The transformers pipeline + mask_token : str + The mask token + + Methods + ------- + generate(self, entity: Entity, text: str) + Anonymize the text based on the entities + + """ def __init__( self, @@ -27,6 +44,18 @@ def __init__( *args, **kwargs, ): + """ + Parameters + ---------- + model_name : str, optional + The name of the model to use. Default: "FacebookAI/xlm-roberta-large" + use_gpu : bool, optional + Whether to use GPU/CUDA. Default: False + context_window : int, optional + The context window size. Default: 100 + + """ + super().__init__(*args, **kwargs) self.context_window = context_window if use_gpu and not torch.cuda.is_available(): warnings.warn( @@ -42,6 +71,23 @@ def __init__( ) def generate(self, entity: Entity, text: str, *args, **kwargs): + """ + Generate the substituted text based on the entity + + Parameters + ---------- + entity : Entity + The entity to generate the label from + text : str + The text to generate the label from + + Returns + ------- + str + The generated text + + """ + masks = self._create_masks(entity) input_texts = self._prepare_generate_inputs(masks, text) suggestions = self.pipeline(input_texts) diff --git a/anonipy/anonymize/generators/number_generator.py b/anonipy/anonymize/generators/number_generator.py index 54014f4..9540aaa 100644 --- a/anonipy/anonymize/generators/number_generator.py +++ b/anonipy/anonymize/generators/number_generator.py @@ -9,11 +9,45 @@ class NumberGenerator(GeneratorInterface): + """The class representing the number generator + + Methods + ------- + generate(self, entity: Entity) + Generates a number replacement + + """ def __init__(self, *args, **kwargs): + """ + Parameters + ---------- + None + + """ + super().__init__(*args, **kwargs) pass def generate(self, entity: Entity, *args, **kwargs): + """ + Generates a number replacement + + Parameters + ---------- + entity : Entity + The entity to generate the number from + + Returns + ------- + str + The generated number + + Raises + ------ + ValueError + If the entity type is not `integer`, `float`, `phone_number` or `custom` + + """ if entity.type in ["custom"]: warnings.warn( "The entity type is `custom`. Make sure the generator is returning appropriate values." diff --git a/anonipy/anonymize/helpers.py b/anonipy/anonymize/helpers.py index 76e81ee..844a811 100644 --- a/anonipy/anonymize/helpers.py +++ b/anonipy/anonymize/helpers.py @@ -7,6 +7,24 @@ def convert_spacy_to_entity(entity, type=None, regex=".*", *args, **kwargs): + """Convert a SpaCy entity to an Entity object + + Parameters + ---------- + entity : SpaCy Span + The SpaCy entity to convert + type : ENTITY_TYPES, optional + The type of the entity. Default: None + regex : Union[str, re.Pattern], optional + The regular expression the entity must match. Default: ".*" + + Returns + ------- + Entity + The converted Entity object + + """ + return Entity( entity.text, entity.label_, @@ -19,6 +37,21 @@ def convert_spacy_to_entity(entity, type=None, regex=".*", *args, **kwargs): def anonymize(text: str, replacements: List[Replacement]) -> str: + """Anonymize a text based on a list of replacements + + Parameters + ---------- + text : str + The text to anonymize + replacements : List[Replacement] + The list of replacements to apply + + Returns + ------- + Tuple[str, List[Replacement]] + The anonymized text and the list of replacements applied + + """ s_replacements = sorted(replacements, key=lambda x: x["start_index"], reverse=True) anonymized_text = text diff --git a/anonipy/anonymize/regex.py b/anonipy/anonymize/regex.py index 15bcc82..957db0f 100644 --- a/anonipy/anonymize/regex.py +++ b/anonipy/anonymize/regex.py @@ -1,7 +1,3 @@ -""" -The regex definitions for various use cases -""" - from collections import defaultdict from ..constants import ENTITY_TYPES @@ -51,6 +47,16 @@ class RegexMap: + """RegexMap + + The class representing the regex map + + Attributes + ---------- + regex_mapping : defaultdict + The regex mapping + + """ def __init__(self): self.regex_mapping = defaultdict(lambda: ".*") diff --git a/anonipy/anonymize/strategies/__init__.py b/anonipy/anonymize/strategies/__init__.py index c0cc673..e0f25a7 100644 --- a/anonipy/anonymize/strategies/__init__.py +++ b/anonipy/anonymize/strategies/__init__.py @@ -1,3 +1,21 @@ +""" +strategies + +The module provides a set of strategies used in the library. + +Classes +------- +StrategyInterface : + The class representing the strategy interface +MaskingStrategy : + The class representing the masking strategy +RedactionStrategy : + The class representing the redaction strategy +PseudonymizationStrategy : + The class representing the pseudonymization strategy + +""" + from .interface import StrategyInterface from .masking import MaskingStrategy from .redaction import RedactionStrategy diff --git a/anonipy/anonymize/strategies/interface.py b/anonipy/anonymize/strategies/interface.py index bef7c86..448b0d7 100644 --- a/anonipy/anonymize/strategies/interface.py +++ b/anonipy/anonymize/strategies/interface.py @@ -11,6 +11,14 @@ class StrategyInterface: + """The class representing the strategy interface + + Methods + ------- + anonymize(text: str, entities: List[Entity], *args, **kwargs) + Anonymize the text based on the entities + + """ def __init__(self, *args, **kwargs): pass diff --git a/anonipy/anonymize/strategies/masking.py b/anonipy/anonymize/strategies/masking.py index 039d124..090c271 100644 --- a/anonipy/anonymize/strategies/masking.py +++ b/anonipy/anonymize/strategies/masking.py @@ -15,13 +15,51 @@ class MaskingStrategy(StrategyInterface): + """The class representing the masking strategy + + Attributes + ---------- + substitute_label : str + The label to substitute in the anonymized text + + Methods + ------- + anonymize(text: str, entities: List[Entity]) + Anonymize the text based on the entities + + """ def __init__(self, substitute_label: str = "*", *args, **kwargs): + """ + Parameters + ---------- + substitute_label : str, optional + The label to substitute in the anonymized text. Default: "*" + + """ + + super().__init__(*args, **kwargs) self.substitute_label = substitute_label def anonymize( self, text: str, entities: List[Entity], *args, **kwargs ) -> Tuple[str, List[Replacement]]: + """Anonymize the text based on the entities + + Parameters + ---------- + text : str + The text to anonymize + entities : List[Entity] + The list of entities to anonymize + + Returns + ------- + Tuple[str, List[Replacement]] + The anonymized text and the list of replacements applied + + """ + replacements = [self._create_replacement(ent) for ent in entities] anonymized_text, replacements = anonymize(text, replacements) return anonymized_text, replacements diff --git a/anonipy/anonymize/strategies/pseudonymization.py b/anonipy/anonymize/strategies/pseudonymization.py index 6dbe7b7..1760ac5 100644 --- a/anonipy/anonymize/strategies/pseudonymization.py +++ b/anonipy/anonymize/strategies/pseudonymization.py @@ -14,13 +14,51 @@ class PseudonymizationStrategy(StrategyInterface): + """The class representing the pseudonymization strategy + + Attributes + ---------- + mapping : dict + The mapping of entities to pseudonyms + + Methods + ------- + anonymize(text: str, entities: List[Entity]) + Anonymize the text based on the entities + + """ def __init__(self, mapping, *args, **kwargs): + """ + Parameters + ---------- + mapping : func + The mapping of entities to pseudonyms + + """ + + super().__init__(*args, **kwargs) self.mapping = mapping def anonymize( self, text: str, entities: List[Entity], *args, **kwargs ) -> Tuple[str, List[Replacement]]: + """Anonymize the text based on the entities + + Parameters + ---------- + text : str + The text to anonymize + entities : List[Entity] + The list of entities to anonymize + + Returns + ------- + Tuple[str, List[Replacement]] + The anonymized text and the list of replacements applied + + """ + replacements = [] for ent in entities: replacement = self._create_replacement(ent, text, replacements) diff --git a/anonipy/anonymize/strategies/redaction.py b/anonipy/anonymize/strategies/redaction.py index e39604f..bf4e160 100644 --- a/anonipy/anonymize/strategies/redaction.py +++ b/anonipy/anonymize/strategies/redaction.py @@ -14,13 +14,51 @@ class RedactionStrategy(StrategyInterface): + """The class representing the redaction strategy + + Attributes + ---------- + substitute_label : str + The label to substitute in the anonymized text + + Methods + ------- + anonymize(text: str, entities: List[Entity]) + Anonymize the text based on the entities + + """ def __init__(self, substitute_label: str = "[REDACTED]", *args, **kwargs) -> None: + """ + Parameters + ---------- + substitute_label : str, optional + The label to substitute in the anonymized text. Default: "[REDACTED]" + + """ + + super().__init__(*args, **kwargs) self.substitute_label = substitute_label def anonymize( self, text: str, entities: List[Entity], *args, **kwargs ) -> Tuple[str, List[Replacement]]: + """Anonymize the text based on the entities + + Parameters + ---------- + text : str + The text to anonymize + entities : List[Entity] + The list of entities to anonymize + + Returns + ------- + Tuple[str, List[Replacement]] + The anonymized text and the list of replacements applied + + """ + replacements = [self._create_replacement(ent) for ent in entities] anonymized_text, replacements = anonymize(text, replacements) return anonymized_text, replacements diff --git a/anonipy/constants.py b/anonipy/constants.py index a11ff65..71be42c 100644 --- a/anonipy/constants.py +++ b/anonipy/constants.py @@ -1,10 +1,16 @@ """ -The constants used to make it easier to use the library -""" +constants + +The module provides a set of constants used in the library. -# ================================================ -# Constants -# ================================================ +Classes +------- +LANGUAGES : + Predefined supported languages +ENTITY_TYPES : + Predefined types of entities + +""" class LANGUAGES: diff --git a/anonipy/definitions.py b/anonipy/definitions.py index 43e3106..ff26607 100644 --- a/anonipy/definitions.py +++ b/anonipy/definitions.py @@ -1,5 +1,15 @@ """ -The definitions used within the package +definitions + +The module provides a set of object definitions used in the library. + +Classes +------- +Entity : + The class representing the entity +Replacement : + The class representing the replacement + """ import re @@ -16,6 +26,27 @@ @dataclass class Entity: + """The class representing the entity + + Attributes + ---------- + text : str + The text of the entity + label : str + The label of the entity + start_index : int + The start index of the entity in the text + end_index : int + The end index of the entity in the text + score : float + The prediction score of the entity. The score is returned by the extractor models. Default: 1.0 + type : ENTITY_TYPES + The type of the entity. Default: None + regex : Union[str, re.Pattern] + The regular expression the entity must match. Default: ".*" + + """ + text: str label: str start_index: int @@ -26,6 +57,23 @@ class Entity: class Replacement(TypedDict): + """The class representing the replacement + + Attributes + ---------- + original_text : str, optional + The original text of the entity + label : str, optional + The label of the entity + start_index : int + The start index of the entity in the text + end_index : int + The end index of the entity in the text + anonymized_text : str + The anonymized text replacing the original + + """ + original_text: NotRequired[str] label: NotRequired[str] start_index: int diff --git a/anonipy/utils/__init__.py b/anonipy/utils/__init__.py index f1007c5..58aaf21 100644 --- a/anonipy/utils/__init__.py +++ b/anonipy/utils/__init__.py @@ -1,3 +1,17 @@ +""" +utils + +The module provides a set of utilities used in the library. + +Submodules +---------- +language_detector : + The module containing the language detector +file_system : + The module containing the file system utilities + +""" + from . import language_detector from . import file_system diff --git a/anonipy/utils/datetime.py b/anonipy/utils/datetime.py index 9ee2a3d..461f782 100644 --- a/anonipy/utils/datetime.py +++ b/anonipy/utils/datetime.py @@ -81,6 +81,20 @@ def detect_datetime_format(datetime): + """Detects the datetime format + + Parameters + ---------- + datetime: str + The datetime string + + Returns + ------- + Tuple[datetime.datetime, str] + The detected datetime and the format + + """ + try: parsed_datetime = parser.parse(datetime, fuzzy=True) diff --git a/anonipy/utils/file_system.py b/anonipy/utils/file_system.py index 876e11b..629c304 100644 --- a/anonipy/utils/file_system.py +++ b/anonipy/utils/file_system.py @@ -20,6 +20,19 @@ def remove_extra_spaces(text: str) -> str: + """Remove extra spaces from text + + Parameters + ---------- + text : str + The text to remove extra spaces from + + Returns + ------- + str + The text with extra spaces removed + + """ text = text.strip() # remove extra spaces text = re.sub(" +", " ", text) @@ -28,6 +41,20 @@ def remove_extra_spaces(text: str) -> str: def remove_page_numbers(text: str) -> str: + """Removes page numbers from text + + Parameters + ---------- + text : str + The text to remove page numbers from + + Returns + ------- + str + The text with page numbers removed + + """ + page_number_pattern = re.compile(r"^\s*\d+\s*$|\s*\d+\s*$") filtered_lines = [ line.strip() @@ -43,6 +70,20 @@ def remove_page_numbers(text: str) -> str: def extract_text_from_pdf(pdf_path: str) -> str: + """Extracts text from a PDF file + + Parameters + ---------- + pdf_path : str + The path to the PDF file + + Returns + ------- + str + The text from the PDF file + + """ + pdf_reader = PdfReader(pdf_path) pages_text = [] @@ -79,6 +120,20 @@ def _word_process_table(t) -> str: def extract_text_from_word(doc_path: str) -> str: + """Extracts text from a Word file + + Parameters + ---------- + doc_path : str + The path to the Word file + + Returns + ------- + str + The text from the Word file + + """ + doc = Document(doc_path) content = [] for element in doc.element.body: @@ -100,6 +155,24 @@ def extract_text_from_word(doc_path: str) -> str: def open_file(file_path: str) -> str: + """ + Opens a file and returns its content as a string + + Parameters + ---------- + file_path : str + The path to the file + + Returns + ------- + str + The content of the file as a string + + """ + + if not os.path.isfile(file_path): + raise FileNotFoundError(f"The file does not exist: {file_path}") + _, file_extension = os.path.splitext(file_path) if file_extension.lower() == ".pdf": return extract_text_from_pdf(file_path) @@ -113,11 +186,60 @@ def open_file(file_path: str) -> str: def open_json(file_path: str) -> dict: + """ + Opens a JSON file and returns its content as a dictionary + + Parameters + ---------- + file_path : str + The path to the JSON file + + Returns + ------- + dict + The content of the JSON file as a dictionary + + """ + + if not os.path.isfile(file_path): + raise FileNotFoundError(f"The file does not exist: {file_path}") + with open(file_path, "r", encoding="utf-8") as f: return json.load(f) def write_file(text: str, file_path: str, encode: Union[str, bool] = True) -> None: + """Writes text to a file + + Parameters + ---------- + text : str + The text to write to the file + file_path : str + The path to the file + encode : Union[str, bool], optional + The encoding to use. Default: True + + Raises + ------ + TypeError + If text, file_path is not a string; encode is not a string or a boolean + FileNotFoundError + If the directory does not exist + + """ + + if not isinstance(text, str): + raise TypeError("text must be a string") + + if not isinstance(file_path, str): + raise TypeError("file_path must be a string") + + if not os.path.exists(os.path.dirname(file_path)): + raise FileNotFoundError( + f"The directory does not exist: {os.path.dirname(file_path)}" + ) + if not isinstance(encode, str) and not isinstance(encode, bool): raise TypeError("encode must be a string or a boolean") @@ -132,5 +254,19 @@ def write_file(text: str, file_path: str, encode: Union[str, bool] = True) -> No def write_json(data: dict, file_path: str) -> None: + """Writes data to a JSON file + + Parameters + ---------- + data : dict + The data to write to the JSON file + file_path : str + The path to the JSON file + + """ + + if not os.path.exists(os.path.dirname(file_path)): + os.makedirs(os.path.dirname(file_path), exist_ok=True) + with open(file_path, "w", encoding="utf-8") as f: json.dump(data, f, ensure_ascii=False, indent=4) diff --git a/anonipy/utils/language_detector.py b/anonipy/utils/language_detector.py index 108f6d8..4b403e4 100644 --- a/anonipy/utils/language_detector.py +++ b/anonipy/utils/language_detector.py @@ -1,13 +1,33 @@ -""" -The language detector class -""" - from lingua import LanguageDetectorBuilder class LanguageDetector: + """The class for detecting the language of a text + + Attributes + ---------- + detector : LanguageDetector + The language detector + + Methods + ------- + __call__(self, text: str, output_standard: str = "iso_code_639_1") + Detect the language of a text. Calls the `detect` method. + + detect(text: str, output_standard: str = "iso_code_639_1") + Detect the language of a text + + """ def __init__(self, low_accuracy: bool = False): + """ + Parameters + ---------- + low_accuracy : bool, optional + Whether to use the low accuracy mode. Default: False + + """ + # Prepare the language detector for all languages builder = LanguageDetectorBuilder.from_all_languages() builder = ( @@ -21,6 +41,23 @@ def __call__(self, text: str, output_standard: str = "iso_code_639_1") -> str: return self.detect(text, output_standard) def detect(self, text: str, output_standard: str = "iso_code_639_1") -> str: + """ + Detect the language of a text + + Parameters + ---------- + text : str + The text to detect the language of + output_standard : str, optional + The output standard. Default: "iso_code_639_1" + + Returns + ------- + Tuple[str, str] + The language code and the full name of the language + + """ + language = self.detector.detect_language_of(text) iso_code = getattr(language, output_standard).name.lower() full_name = language.name.lower().title() From bb480688cf91ea71956ff90f7f5600a93f4678ac Mon Sep 17 00:00:00 2001 From: eriknovak Date: Tue, 18 Jun 2024 18:14:37 +0200 Subject: [PATCH 2/2] Add missing documentation --- .../anonymize/extractors/entity_extractor.py | 50 ++++++++++- .../anonymize/generators/date_generator.py | 6 +- .../generators/llm_label_generator.py | 40 +++++++++ .../generators/mask_label_generator.py | 89 +++++++++++++++++-- .../anonymize/generators/number_generator.py | 3 +- anonipy/anonymize/regex.py | 21 +++++ anonipy/anonymize/strategies/interface.py | 4 - anonipy/anonymize/strategies/masking.py | 30 ++++++- .../anonymize/strategies/pseudonymization.py | 36 +++++++- anonipy/anonymize/strategies/redaction.py | 18 +++- anonipy/utils/file_system.py | 28 ++++++ 11 files changed, 295 insertions(+), 30 deletions(-) diff --git a/anonipy/anonymize/extractors/entity_extractor.py b/anonipy/anonymize/extractors/entity_extractor.py index 259893d..dbca1b4 100644 --- a/anonipy/anonymize/extractors/entity_extractor.py +++ b/anonipy/anonymize/extractors/entity_extractor.py @@ -107,7 +107,20 @@ def display(self, doc: Doc): # Private methods # =========================================== - def _prepare_labels(self, labels): + def _prepare_labels(self, labels: List[dict]) -> List[dict]: + """Prepare the labels for the extractor + + Parameters + ---------- + labels : List[dict] + The list of labels to prepare + + Returns + ------- + List[dict] + The prepared labels + + """ for l in labels: if "regex" in l: continue @@ -117,6 +130,15 @@ def _prepare_labels(self, labels): return labels def _create_gliner_config(self): + """Create the config for the GLINER model + + Returns + ------- + dict + The config for the GLINER model + + """ + map_location = "cpu" if self.use_gpu and not torch.cuda.is_available(): return warnings.warn( @@ -136,6 +158,15 @@ def _create_gliner_config(self): } def _prepare_pipeline(self): + """Prepare the spacy pipeline + + Returns + ------- + spacy pipeline + The spacy pipeline + + """ + # load the appropriate parser for the language module_lang, class_lang = self.lang[0].lower(), self.lang[1].lower().title() language_module = importlib.import_module(f"spacy.lang.{module_lang}") @@ -147,8 +178,21 @@ def _prepare_pipeline(self): nlp.add_pipe("gliner_spacy", config=gliner_config) return nlp - def _prepare_entities(self, doc): - # prepares the anonymized and spacy entities + def _prepare_entities(self, doc: Doc): + """Prepares the anonipy and spacy entities + + Parameters + ---------- + doc : Doc + The spacy doc to prepare + + Returns + ------- + Tuple[List[Entity], List[Entity]] + The anonipy entities and the spacy entities + + + """ # TODO: make this part more generic anoni_entities = [] diff --git a/anonipy/anonymize/generators/date_generator.py b/anonipy/anonymize/generators/date_generator.py index 8da1ca3..7b36b69 100644 --- a/anonipy/anonymize/generators/date_generator.py +++ b/anonipy/anonymize/generators/date_generator.py @@ -119,8 +119,7 @@ def random_date(day: datetime.datetime, sigma: int = 30, *args, **kwargs): class DateGenerator(GeneratorInterface): - """ - The class representing the date generator + """The class representing the date generator Attributes ---------- @@ -152,8 +151,7 @@ def __init__(self, date_format="auto", day_sigma: int = 30, *args, **kwargs): self.day_sigma = day_sigma def generate(self, entity: Entity, output_gen: str = "random", *args, **kwargs): - """ - Generate the date based on the entity and output_gen + """Generate the date based on the entity and output_gen Parameters ---------- diff --git a/anonipy/anonymize/generators/llm_label_generator.py b/anonipy/anonymize/generators/llm_label_generator.py index b73a2e8..2775883 100644 --- a/anonipy/anonymize/generators/llm_label_generator.py +++ b/anonipy/anonymize/generators/llm_label_generator.py @@ -185,6 +185,19 @@ def validate(self, entity: Entity): # ================================= def _prepare_model_and_tokenizer(self, model_name: str): + """Prepares the model and tokenizer + + Parameters + ---------- + model_name : str + The name of the model to use + + Returns + ------- + model, tokenizer + The model and the tokenizer + + """ # prepare the model bnb_config = BitsAndBytesConfig( load_in_4bit=True, @@ -204,10 +217,37 @@ def _prepare_model_and_tokenizer(self, model_name: str): return model, tokenizer def _system_prompt(self): + """Returns the system prompt""" return "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful AI assistant for generating replacements for text entities.<|eot_id|>" def _user_prompt(self, prompt): + """Returns the user prompt + + Parameters + ---------- + prompt : str + The prompt to use + + Returns + ------- + str + The user prompt + + """ return f"<|start_header_id|>user<|end_header_id|>\n\n{prompt}<|eot_id|>" def _assistant_prompt(self, prompt): + """Returns the assistant prompt + + Parameters + ---------- + prompt : str + The prompt to use + + Returns + ------- + str + The assistant prompt + + """ return f"<|start_header_id|>assistant<|end_header_id|>\n\n{prompt}" diff --git a/anonipy/anonymize/generators/mask_label_generator.py b/anonipy/anonymize/generators/mask_label_generator.py index 2ab6aab..aab448b 100644 --- a/anonipy/anonymize/generators/mask_label_generator.py +++ b/anonipy/anonymize/generators/mask_label_generator.py @@ -2,6 +2,7 @@ import random import warnings import itertools +from typing import List import torch from transformers import AutoModelForMaskedLM, AutoTokenizer, pipeline @@ -71,8 +72,7 @@ def __init__( ) def generate(self, entity: Entity, text: str, *args, **kwargs): - """ - Generate the substituted text based on the entity + """Generate the substituted text based on the entity Parameters ---------- @@ -98,6 +98,21 @@ def generate(self, entity: Entity, text: str, *args, **kwargs): # ================================= def _prepare_model_and_tokenizer(self, model_name: str, use_gpu: bool): + """Prepares the model and tokenizer + + Parameters + ---------- + model_name : str + The name of the model to use + use_gpu : bool + Whether to use GPU/CUDA + + Returns + ------- + model, tokenizer + The model and the tokenizer + + """ # prepare the model device = torch.device( "cuda" if use_gpu and torch.cuda.is_available() else "cpu" @@ -108,6 +123,19 @@ def _prepare_model_and_tokenizer(self, model_name: str, use_gpu: bool): return model, tokenizer def _create_masks(self, entity: Entity): + """Creates the masks for the entity + + Parameters + ---------- + entity : Entity + The entity to create the masks for + + Returns + ------- + list + The list of masks + + """ masks = [] chunks = re.split(r"\s+", entity.text) for idx in range(len(chunks)): @@ -123,12 +151,44 @@ def _create_masks(self, entity: Entity): ) return masks - def _get_context_text(self, text, start_index, end_index): + def _get_context_text(self, text: str, start_index: int, end_index: int) -> str: + """Get the context text + + Parameters + ---------- + text : str + The text to get the context text from + start_index : int + The start index + end_index : int + The end index + + Returns + ------- + str + The context text + + """ min_index = max(0, start_index - self.context_window) max_index = min(end_index + self.context_window, len(text)) return text[min_index:max_index] - def _prepare_generate_inputs(self, masks, text): + def _prepare_generate_inputs(self, masks: List[dict], text: str) -> List[str]: + """Prepares the generate inputs + + Parameters + ---------- + masks : List[dict] + The list of masks + text : str + The text to prepare the generate inputs for + + Returns + ------- + list + The list of generate inputs + + """ return [ self._get_context_text( text[: m["start_index"]] + m["mask_text"] + text[m["end_index"] :], @@ -138,7 +198,26 @@ def _prepare_generate_inputs(self, masks, text): for m in masks ] - def _create_substitute(self, entity: Entity, masks, suggestions): + def _create_substitute( + self, entity: Entity, masks: List[dict], suggestions: List[dict] + ) -> str: + """Create a substitute for the entity + + Parameters + ---------- + entity : Entity + The entity to create the substitute for + masks : List[dict] + The list of masks + suggestions : List[dict] + The list of suggestions + + Returns + ------- + str + The created substitute + + """ substitute_chunks = [] for mask, suggestion in zip(masks, suggestions): suggestion = suggestion if type(suggestion) == list else [suggestion] diff --git a/anonipy/anonymize/generators/number_generator.py b/anonipy/anonymize/generators/number_generator.py index 9540aaa..462a147 100644 --- a/anonipy/anonymize/generators/number_generator.py +++ b/anonipy/anonymize/generators/number_generator.py @@ -29,8 +29,7 @@ def __init__(self, *args, **kwargs): pass def generate(self, entity: Entity, *args, **kwargs): - """ - Generates a number replacement + """Generates a number replacement Parameters ---------- diff --git a/anonipy/anonymize/regex.py b/anonipy/anonymize/regex.py index 957db0f..67149b8 100644 --- a/anonipy/anonymize/regex.py +++ b/anonipy/anonymize/regex.py @@ -59,6 +59,13 @@ class RegexMap: """ def __init__(self): + """ + Parameters + ---------- + None + + """ + self.regex_mapping = defaultdict(lambda: ".*") # Define the regex mappings self.regex_mapping[ENTITY_TYPES.STRING] = REGEX_STRING @@ -70,6 +77,20 @@ def __init__(self): self.regex_mapping[ENTITY_TYPES.WEBSITE_URL] = REGEX_WEBSITE_URL def __call__(self, type: str) -> str: + """Gets the regex for the given type + + Parameters + ---------- + type : str + The type of the entity + + Returns + ------- + str + The regex for the given type + + """ + return self.regex_mapping[type] diff --git a/anonipy/anonymize/strategies/interface.py b/anonipy/anonymize/strategies/interface.py index 448b0d7..74abaf8 100644 --- a/anonipy/anonymize/strategies/interface.py +++ b/anonipy/anonymize/strategies/interface.py @@ -1,7 +1,3 @@ -""" -Contains the interface for the strategy -""" - from typing import List from ...definitions import Entity diff --git a/anonipy/anonymize/strategies/masking.py b/anonipy/anonymize/strategies/masking.py index 090c271..3d76586 100644 --- a/anonipy/anonymize/strategies/masking.py +++ b/anonipy/anonymize/strategies/masking.py @@ -1,7 +1,3 @@ -""" -Contains the masking strategy -""" - import re from typing import List, Tuple @@ -65,6 +61,19 @@ def anonymize( return anonymized_text, replacements def _create_replacement(self, entity: Entity) -> Replacement: + """Creates a replacement for the entity + + Parameters + ---------- + entity : Entity + The entity to create the replacement for + + Returns + ------- + Replacement + The created replacement + + """ mask = self._create_mask(entity) return { "original_text": entity.text, @@ -75,6 +84,19 @@ def _create_replacement(self, entity: Entity) -> Replacement: } def _create_mask(self, entity: Entity) -> str: + """Creates a mask for the entity + + Parameters + ---------- + entity : Entity + The entity to create the mask for + + Returns + ------- + str + The created mask + + """ return " ".join( [ self.substitute_label * len(chunk) diff --git a/anonipy/anonymize/strategies/pseudonymization.py b/anonipy/anonymize/strategies/pseudonymization.py index 1760ac5..1816928 100644 --- a/anonipy/anonymize/strategies/pseudonymization.py +++ b/anonipy/anonymize/strategies/pseudonymization.py @@ -1,7 +1,3 @@ -""" -Contains the pseudonymization strategy -""" - from typing import List, Tuple from .interface import StrategyInterface @@ -69,6 +65,23 @@ def anonymize( def _create_replacement( self, entity: Entity, text: str, replacements: List[dict] ) -> Replacement: + """Creates a replacement for the entity + + Parameters + ---------- + entity : Entity + The entity to create the replacement for + text : str + The text to anonymize + replacements : List[dict] + The list of replacements + + Returns + ------- + Replacement + The created replacement + + """ # check if the replacement already exists anonymized_text = self._check_replacement(entity, replacements) # create a new replacement if it doesn't exist @@ -84,6 +97,21 @@ def _create_replacement( } def _check_replacement(self, entity: Entity, replacements: List[dict]) -> str: + """Checks if a suitable replacement already exists + + Parameters + ---------- + entity : Entity + The entity to check + replacements : List[dict] + The list of replacements + + Returns + ------- + str + The anonymized text if the replacement already exists, None otherwise + + """ existing_replacement = list( filter(lambda x: x["original_text"] == entity.text, replacements) ) diff --git a/anonipy/anonymize/strategies/redaction.py b/anonipy/anonymize/strategies/redaction.py index bf4e160..0ae9c3d 100644 --- a/anonipy/anonymize/strategies/redaction.py +++ b/anonipy/anonymize/strategies/redaction.py @@ -1,7 +1,3 @@ -""" -Contains the redaction strategy -""" - from typing import List, Tuple from .interface import StrategyInterface @@ -64,6 +60,20 @@ def anonymize( return anonymized_text, replacements def _create_replacement(self, entity: Entity) -> Replacement: + """Creates a replacement for the entity + + Parameters + ---------- + entity : Entity + The entity to create the replacement for + + Returns + ------- + Replacement + The replacement for the entity + + """ + return { "original_text": entity.text, "label": entity.label, diff --git a/anonipy/utils/file_system.py b/anonipy/utils/file_system.py index 629c304..2f7cd34 100644 --- a/anonipy/utils/file_system.py +++ b/anonipy/utils/file_system.py @@ -103,10 +103,38 @@ def extract_text_from_pdf(pdf_path: str) -> str: def _word_process_paragraph(p) -> str: + """Get the text from a paragraph + + Parameters + ---------- + p : etree._Element + The paragraph element + + Returns + ------- + str + The text from the paragraph + + """ + return p.text def _word_process_table(t) -> str: + """Get the text from a table + + Parameters + ---------- + t : etree._Element + The table element + + Returns + ------- + str + The text from the table + + """ + table_text = [] for row in t.findall(".//w:tr", WORD_NAMESPACES): row_text = []