From 707cec1ec22cd256b192a419bd52f9353bbc4bef Mon Sep 17 00:00:00 2001
From: eriknovak <erik.novak@ijs.si>
Date: Tue, 18 Jun 2024 16:25:22 +0200
Subject: [PATCH 1/2] Add code documentation

---
 anonipy/__init__.py                           |  18 ++-
 anonipy/anonymize/__init__.py                 |  22 +++
 anonipy/anonymize/extractors/__init__.py      |  14 ++
 .../anonymize/extractors/entity_extractor.py  |  66 ++++++++-
 anonipy/anonymize/extractors/interface.py     |   1 +
 anonipy/anonymize/generators/__init__.py      |  20 +++
 .../anonymize/generators/date_generator.py    | 120 ++++++++++++++++
 anonipy/anonymize/generators/interface.py     |   1 +
 .../generators/llm_label_generator.py         |  57 +++++++-
 .../generators/mask_label_generator.py        |  46 ++++++
 .../anonymize/generators/number_generator.py  |  34 +++++
 anonipy/anonymize/helpers.py                  |  33 +++++
 anonipy/anonymize/regex.py                    |  14 +-
 anonipy/anonymize/strategies/__init__.py      |  18 +++
 anonipy/anonymize/strategies/interface.py     |   8 ++
 anonipy/anonymize/strategies/masking.py       |  38 +++++
 .../anonymize/strategies/pseudonymization.py  |  38 +++++
 anonipy/anonymize/strategies/redaction.py     |  38 +++++
 anonipy/constants.py                          |  16 ++-
 anonipy/definitions.py                        |  50 ++++++-
 anonipy/utils/__init__.py                     |  14 ++
 anonipy/utils/datetime.py                     |  14 ++
 anonipy/utils/file_system.py                  | 136 ++++++++++++++++++
 anonipy/utils/language_detector.py            |  45 +++++-
 24 files changed, 840 insertions(+), 21 deletions(-)

diff --git a/anonipy/__init__.py b/anonipy/__init__.py
index e62ee03..ff20e30 100644
--- a/anonipy/__init__.py
+++ b/anonipy/__init__.py
@@ -1,11 +1,19 @@
 """
 anonipy
-=========
 
-Provides
-  1. Label extractors
-  2. Label generators
-  3. Anonymization strategies
+The anonipy package provides utilities for data anonymization.
+
+Submodules
+----------
+anonymize :
+    The package containing anonymization classes and functions.
+utils :
+    The package containing utility classes and functions.
+definitions :
+    The object definitions used within the package.
+constants :
+    The constant values used to help with data anonymization.
+
 
 How to use the documentation
 ----------------------------
diff --git a/anonipy/anonymize/__init__.py b/anonipy/anonymize/__init__.py
index 879f10f..598d0c6 100644
--- a/anonipy/anonymize/__init__.py
+++ b/anonipy/anonymize/__init__.py
@@ -1,3 +1,25 @@
+"""
+anonymize
+
+The module provides a set of anonymization utilities.
+
+Submodules
+----------
+extractors :
+    The module containing the extractor classes
+generators :
+    The module containing the generator classes
+strategies :
+    The module containing the strategy classes
+regex :
+    The module containing the regex patterns
+
+Methods
+-------
+anonymize()
+
+"""
+
 from . import extractors
 from . import generators
 from . import strategies
diff --git a/anonipy/anonymize/extractors/__init__.py b/anonipy/anonymize/extractors/__init__.py
index 58dac1f..ec53b8a 100644
--- a/anonipy/anonymize/extractors/__init__.py
+++ b/anonipy/anonymize/extractors/__init__.py
@@ -1,3 +1,17 @@
+"""
+extractors
+
+The module provides a set of extractors used in the library.
+
+Classes
+-------
+ExtractorInterface :
+    The class representing the extractor interface
+EntityExtractor :
+    The class representing the entity extractor
+
+"""
+
 from .interface import ExtractorInterface
 from .entity_extractor import EntityExtractor
 
diff --git a/anonipy/anonymize/extractors/entity_extractor.py b/anonipy/anonymize/extractors/entity_extractor.py
index e1aa510..259893d 100644
--- a/anonipy/anonymize/extractors/entity_extractor.py
+++ b/anonipy/anonymize/extractors/entity_extractor.py
@@ -16,6 +16,30 @@
 
 
 class EntityExtractor(ExtractorInterface):
+    """The class representing the entity extractor
+
+    Attributes
+    ----------
+    labels : List[dict]
+        The list of labels to extract
+    lang : str
+        The language of the text to extract entities from
+    score_th : float
+        The score threshold
+    use_gpu : bool
+        Whether to use GPU
+    pipeline : spacy pipeline
+        The spacy pipeline
+
+
+    Methods
+    -------
+    __call__(self, text: str)
+        Extract the entities from the text
+    display(self, doc: Doc)
+        Display the entities in the text
+
+    """
 
     def __init__(
         self,
@@ -23,19 +47,59 @@ def __init__(
         lang: LANGUAGES = LANGUAGES.ENGLISH,
         score_th=0.5,
         use_gpu=False,
+        *args,
+        **kwargs,
     ):
+        """
+        Parameters
+        ----------
+        labels : List[dict]
+            The list of labels to extract
+        lang : str
+            The language of the text to extract entities from
+        score_th : float
+            The score threshold. Entities with a score below this threshold will be ignored. Default: 0.5
+        use_gpu : bool
+            Whether to use GPU. Default: False
+
+        """
+
+        super().__init__(labels, *args, **kwargs)
         self.lang = lang
         self.score_th = score_th
         self.use_gpu = use_gpu
         self.labels = self._prepare_labels(labels)
         self.pipeline = self._prepare_pipeline()
 
-    def __call__(self, text: str) -> Tuple[Doc, List[Entity]]:
+    def __call__(self, text: str, *args, **kwargs) -> Tuple[Doc, List[Entity]]:
+        """Extract the entities from the text
+
+        Parameters
+        ----------
+        text : str
+            The text to extract entities from
+
+        Returns
+        -------
+        Tuple[Doc, List[Entity]]
+            The spacy doc and the list of entities extracted
+
+        """
+
         doc = self.pipeline(text)
         entities, doc.ents = self._prepare_entities(doc)
         return doc, entities
 
     def display(self, doc: Doc):
+        """Display the entities in the text
+
+        Parameters
+        ----------
+        doc : Doc
+            The spacy doc to display
+
+        """
+
         options = {"colors": {l["label"]: "#5C7AEA" for l in self.labels}}
         displacy.render(doc, style="ent", options=options)
 
diff --git a/anonipy/anonymize/extractors/interface.py b/anonipy/anonymize/extractors/interface.py
index 096e77c..092f5fe 100644
--- a/anonipy/anonymize/extractors/interface.py
+++ b/anonipy/anonymize/extractors/interface.py
@@ -5,6 +5,7 @@
 
 
 class ExtractorInterface:
+    """The class representing the extractor interface"""
 
     def __init__(self, labels: List[dict], *args, **kwargs):
         pass
diff --git a/anonipy/anonymize/generators/__init__.py b/anonipy/anonymize/generators/__init__.py
index 56a17bb..b0b8b74 100644
--- a/anonipy/anonymize/generators/__init__.py
+++ b/anonipy/anonymize/generators/__init__.py
@@ -1,3 +1,23 @@
+"""
+generators
+
+The module provides a set of generators used in the library.
+
+Classes
+-------
+GeneratorInterface :
+    The class representing the generator interface
+LLMLabelGenerator :
+    The class representing the LLM label generator
+MaskLabelGenerator :
+    The class representing the mask label generator
+NumberGenerator :
+    The class representing the number generator
+DateGenerator :
+    The class representing the date generator
+
+"""
+
 from .interface import GeneratorInterface
 from .llm_label_generator import LLMLabelGenerator
 from .mask_label_generator import MaskLabelGenerator
diff --git a/anonipy/anonymize/generators/date_generator.py b/anonipy/anonymize/generators/date_generator.py
index 08ce5a0..8da1ca3 100644
--- a/anonipy/anonymize/generators/date_generator.py
+++ b/anonipy/anonymize/generators/date_generator.py
@@ -12,23 +12,94 @@
 
 
 def first_day_of_month(day: datetime.datetime, *args, **kwargs):
+    """Returns the first day of the month of the given date
+
+    Parameters
+    ----------
+    day : datetime.datetime
+        The date to get the first day of the month from
+
+    Returns
+    -------
+    datetime.datetime
+        The first day of the month of the given date
+
+    """
     return day.replace(day=1)
 
 
 def last_day_of_month(day: datetime.datetime, *args, **kwargs):
+    """Returns the last day of the month of the given date
+
+    Parameters
+    ----------
+    day : datetime.datetime
+        The date to get the last day of the month from
+
+    Returns
+    -------
+    datetime.datetime
+        The last day of the month of the given date
+
+    """
     next_month = day.replace(day=28) + datetime.timedelta(days=4)
     return next_month - datetime.timedelta(days=next_month.day)
 
 
 def middle_of_the_month(day: datetime.datetime, *args, **kwargs):
+    """Returns the middle day of the month of the given date
+
+    Parameters
+    ----------
+    day : datetime.datetime
+        The date to get the middle day of the month from
+
+    Returns
+    -------
+    datetime.datetime
+        The middle day of the month of the given date
+
+    """
+
     return day.replace(day=15)
 
 
 def middle_of_the_year(day: datetime.datetime, *args, **kwargs):
+    """Returns the middle day of the year of the given date
+
+    Parameters
+    ----------
+    day : datetime.datetime
+        The date to get the middle day of the year from
+
+    Returns
+    -------
+    datetime.datetime
+        The middle day of the year of the given date
+
+    """
+
     return day.replace(month=7, day=1)
 
 
 def random_date(day: datetime.datetime, sigma: int = 30, *args, **kwargs):
+    """Returns a random date within the given date range
+
+    The function returns a date within the range [day - sigma, day + sigma].
+
+    Parameters
+    ----------
+    day : datetime.datetime
+        The date to get the random date from
+    sigma : int
+        The range of the random date in days. Default: 30
+
+    Returns
+    -------
+    datetime.datetime
+        The random date within the given date range
+
+    """
     delta = random.randint(-sigma, sigma)
     return day + datetime.timedelta(days=delta)
 
@@ -48,12 +119,61 @@ def random_date(day: datetime.datetime, sigma: int = 30, *args, **kwargs):
 
 
 class DateGenerator(GeneratorInterface):
+    """
+    The class representing the date generator
+
+    Attributes
+    ----------
+    date_format : str
+        The date format to use
+    day_sigma : int
+        The range of the random date in days
+
+    Methods
+    -------
+    generate(entity: Entity, output_gen: str = "random")
+        Generate the date based on the entity and output_gen
+
+    """
 
     def __init__(self, date_format="auto", day_sigma: int = 30, *args, **kwargs):
+        """
+        Parameters
+        ----------
+        date_format : str, optional
+            The date format to use. Default: "auto"
+        day_sigma : int, optional
+            The range of the random date in days. Default: 30
+
+        """
+
+        super().__init__(*args, **kwargs)
         self.date_format = date_format
         self.day_sigma = day_sigma
 
     def generate(self, entity: Entity, output_gen: str = "random", *args, **kwargs):
+        """
+        Generate the date based on the entity and output_gen
+
+        Parameters
+        ----------
+        entity : Entity
+            The entity to generate the date from
+        output_gen : str, optional
+            The output generator to use. Default: "random"
+
+        Returns
+        -------
+        str
+            The generated date
+
+        Raises
+        ------
+        ValueError
+            If the entity type is not `date` or `custom`
+
+        """
+
         if entity.type in ["custom"]:
             warnings.warn(
                 "The entity type is `custom`. Make sure the generator is returning appropriate values."
diff --git a/anonipy/anonymize/generators/interface.py b/anonipy/anonymize/generators/interface.py
index 5303a27..3dec839 100644
--- a/anonipy/anonymize/generators/interface.py
+++ b/anonipy/anonymize/generators/interface.py
@@ -7,6 +7,7 @@
 
 
 class GeneratorInterface:
+    """The class representing the generator interface"""
 
     def __init__(self, *args, **kwargs):
         pass
diff --git a/anonipy/anonymize/generators/llm_label_generator.py b/anonipy/anonymize/generators/llm_label_generator.py
index 1001c69..b73a2e8 100644
--- a/anonipy/anonymize/generators/llm_label_generator.py
+++ b/anonipy/anonymize/generators/llm_label_generator.py
@@ -75,8 +75,32 @@ def prepare_llama3_byte_decoder():
 
 
 class LLMLabelGenerator(GeneratorInterface):
+    """The class representing the LLM label generator
+
+    Attributes
+    ----------
+    model : models.Transformers
+        The model used to generate the label
+
+    Methods
+    -------
+    generate(entity: Entity, entity_prefix: str = "", temperature: float = 0.0)
+        Generate the label based on the entity
+
+    validate(entity: Entity)
+        Validate the entity
+
+    """
 
     def __init__(self, *args, **kwargs):
+        """
+        Parameters
+        ----------
+        None
+
+        """
+
+        super().__init__(*args, **kwargs)
         # TODO: make this configurable
         model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
 
@@ -96,6 +120,24 @@ def generate(
         *args,
         **kwargs,
     ):
+        """Generate the label based on the entity
+
+        Parameters
+        ----------
+        entity : Entity
+            The entity to generate the label from
+        entity_prefix : str
+            The prefix to use for the entity
+        temperature : float
+            The temperature to use for the generation. Default: 0.0
+
+        Returns
+        -------
+        str
+            The generated label
+
+        """
+
         user_prompt = f"What is a random {entity_prefix} {entity.label} replacement for {entity.text}? Respond only with the replacement."
         assistant_prompt = gen(
             name="replacement",
@@ -113,7 +155,20 @@ def generate(
         return lm["replacement"]
 
     def validate(self, entity: Entity):
-        regex = regex if regex else ".*"
+        """Validate the entity
+
+        Parameters
+        ----------
+        entity : Entity
+            The entity to validate
+
+        Returns
+        -------
+        bool
+            The validation result
+
+        """
+
         user_prompt = f"Is {entity.text} a {entity.label}?"
         assistant_prompt = select(["True", "False"], name="validation")
         # validate the entity with the validation prompt
diff --git a/anonipy/anonymize/generators/mask_label_generator.py b/anonipy/anonymize/generators/mask_label_generator.py
index cc8390b..2ab6aab 100644
--- a/anonipy/anonymize/generators/mask_label_generator.py
+++ b/anonipy/anonymize/generators/mask_label_generator.py
@@ -18,6 +18,23 @@
 
 
 class MaskLabelGenerator(GeneratorInterface):
+    """The class representing the mask label generator
+
+    Attributes
+    ----------
+    context_window : int
+        The context window size
+    pipeline : transformers pipeline
+        The transformers pipeline
+    mask_token : str
+        The mask token
+
+    Methods
+    -------
+    generate(self, entity: Entity, text: str)
+        Generate the substituted text based on the entity
+
+    """
 
     def __init__(
         self,
@@ -27,6 +44,18 @@ def __init__(
         *args,
         **kwargs,
     ):
+        """
+        Parameters
+        ----------
+        model_name : str, optional
+            The name of the model to use. Default: "FacebookAI/xlm-roberta-large"
+        use_gpu : bool, optional
+            Whether to use GPU/CUDA. Default: False
+        context_window : int, optional
+            The context window size. Default: 100
+
+        """
+        super().__init__(*args, **kwargs)
         self.context_window = context_window
         if use_gpu and not torch.cuda.is_available():
             warnings.warn(
@@ -42,6 +71,23 @@ def __init__(
         )
 
     def generate(self, entity: Entity, text: str, *args, **kwargs):
+        """
+        Generate the substituted text based on the entity
+
+        Parameters
+        ----------
+        entity : Entity
+            The entity to generate the label from
+        text : str
+            The text to generate the label from
+
+        Returns
+        -------
+        str
+            The generated text
+
+        """
+
         masks = self._create_masks(entity)
         input_texts = self._prepare_generate_inputs(masks, text)
         suggestions = self.pipeline(input_texts)
diff --git a/anonipy/anonymize/generators/number_generator.py b/anonipy/anonymize/generators/number_generator.py
index 54014f4..9540aaa 100644
--- a/anonipy/anonymize/generators/number_generator.py
+++ b/anonipy/anonymize/generators/number_generator.py
@@ -9,11 +9,45 @@
 
 
 class NumberGenerator(GeneratorInterface):
+    """The class representing the number generator
+
+    Methods
+    -------
+    generate(self, entity: Entity)
+        Generates a number replacement
+
+    """
 
     def __init__(self, *args, **kwargs):
+        """
+        Parameters
+        ----------
+        None
+
+        """
+        super().__init__(*args, **kwargs)
         pass
 
     def generate(self, entity: Entity, *args, **kwargs):
+        """
+        Generates a number replacement
+
+        Parameters
+        ----------
+        entity : Entity
+            The entity to generate the number from
+
+        Returns
+        -------
+        str
+            The generated number
+
+        Raises
+        ------
+        ValueError
+            If the entity type is not `integer`, `float`, `phone_number` or `custom`
+
+        """
         if entity.type in ["custom"]:
             warnings.warn(
                 "The entity type is `custom`. Make sure the generator is returning appropriate values."
diff --git a/anonipy/anonymize/helpers.py b/anonipy/anonymize/helpers.py
index 76e81ee..844a811 100644
--- a/anonipy/anonymize/helpers.py
+++ b/anonipy/anonymize/helpers.py
@@ -7,6 +7,24 @@
 
 
 def convert_spacy_to_entity(entity, type=None, regex=".*", *args, **kwargs):
+    """Convert a SpaCy entity to an Entity object
+
+    Parameters
+    ----------
+    entity : SpaCy Span
+        The SpaCy entity to convert
+    type : ENTITY_TYPES, optional
+        The type of the entity. Default: None
+    regex : Union[str, re.Pattern], optional
+        The regular expression the entity must match. Default: ".*"
+
+    Returns
+    -------
+    Entity
+        The converted Entity object
+
+    """
+
     return Entity(
         entity.text,
         entity.label_,
@@ -19,6 +37,21 @@ def convert_spacy_to_entity(entity, type=None, regex=".*", *args, **kwargs):
 
 
 def anonymize(text: str, replacements: List[Replacement]) -> str:
+    """Anonymize a text based on a list of replacements
+
+    Parameters
+    ----------
+    text : str
+        The text to anonymize
+    replacements : List[Replacement]
+        The list of replacements to apply
+
+    Returns
+    -------
+    Tuple[str, List[Replacement]]
+        The anonymized text and the list of replacements applied
+
+    """
     s_replacements = sorted(replacements, key=lambda x: x["start_index"], reverse=True)
 
     anonymized_text = text
diff --git a/anonipy/anonymize/regex.py b/anonipy/anonymize/regex.py
index 15bcc82..957db0f 100644
--- a/anonipy/anonymize/regex.py
+++ b/anonipy/anonymize/regex.py
@@ -1,7 +1,3 @@
-"""
-The regex definitions for various use cases
-"""
-
 from collections import defaultdict
 
 from ..constants import ENTITY_TYPES
@@ -51,6 +47,16 @@
 
 
 class RegexMap:
+    """RegexMap
+
+    The class representing the regex map
+
+    Attributes
+    ----------
+    regex_mapping : defaultdict
+        The regex mapping
+
+    """
 
     def __init__(self):
         self.regex_mapping = defaultdict(lambda: ".*")
diff --git a/anonipy/anonymize/strategies/__init__.py b/anonipy/anonymize/strategies/__init__.py
index c0cc673..e0f25a7 100644
--- a/anonipy/anonymize/strategies/__init__.py
+++ b/anonipy/anonymize/strategies/__init__.py
@@ -1,3 +1,21 @@
+"""
+strategies
+
+The module provides a set of strategies used in the library.
+
+Classes
+-------
+StrategyInterface :
+    The class representing the strategy interface
+MaskingStrategy :
+    The class representing the masking strategy
+RedactionStrategy :
+    The class representing the redaction strategy
+PseudonymizationStrategy :
+    The class representing the pseudonymization strategy
+
+"""
+
 from .interface import StrategyInterface
 from .masking import MaskingStrategy
 from .redaction import RedactionStrategy
diff --git a/anonipy/anonymize/strategies/interface.py b/anonipy/anonymize/strategies/interface.py
index bef7c86..448b0d7 100644
--- a/anonipy/anonymize/strategies/interface.py
+++ b/anonipy/anonymize/strategies/interface.py
@@ -11,6 +11,14 @@
 
 
 class StrategyInterface:
+    """The class representing the strategy interface
+
+    Methods
+    -------
+    anonymize(text: str, entities: List[Entity], *args, **kwargs)
+        Anonymize the text based on the entities
+
+    """
 
     def __init__(self, *args, **kwargs):
         pass
diff --git a/anonipy/anonymize/strategies/masking.py b/anonipy/anonymize/strategies/masking.py
index 039d124..090c271 100644
--- a/anonipy/anonymize/strategies/masking.py
+++ b/anonipy/anonymize/strategies/masking.py
@@ -15,13 +15,51 @@
 
 
 class MaskingStrategy(StrategyInterface):
+    """The class representing the masking strategy
+
+    Attributes
+    ----------
+    substitute_label : str
+        The label to substitute in the anonymized text
+
+    Methods
+    -------
+    anonymize(text: str, entities: List[Entity])
+        Anonymize the text based on the entities
+
+    """
 
     def __init__(self, substitute_label: str = "*", *args, **kwargs):
+        """
+        Parameters
+        ----------
+        substitute_label : str, optional
+            The label to substitute in the anonymized text. Default: "*"
+
+        """
+
+        super().__init__(*args, **kwargs)
         self.substitute_label = substitute_label
 
     def anonymize(
         self, text: str, entities: List[Entity], *args, **kwargs
     ) -> Tuple[str, List[Replacement]]:
+        """Anonymize the text based on the entities
+
+        Parameters
+        ----------
+        text : str
+            The text to anonymize
+        entities : List[Entity]
+            The list of entities to anonymize
+
+        Returns
+        -------
+        Tuple[str, List[Replacement]]
+            The anonymized text and the list of replacements applied
+
+        """
+
         replacements = [self._create_replacement(ent) for ent in entities]
         anonymized_text, replacements = anonymize(text, replacements)
         return anonymized_text, replacements
diff --git a/anonipy/anonymize/strategies/pseudonymization.py b/anonipy/anonymize/strategies/pseudonymization.py
index 6dbe7b7..1760ac5 100644
--- a/anonipy/anonymize/strategies/pseudonymization.py
+++ b/anonipy/anonymize/strategies/pseudonymization.py
@@ -14,13 +14,51 @@
 
 
 class PseudonymizationStrategy(StrategyInterface):
+    """The class representing the pseudonymization strategy
+
+    Attributes
+    ----------
+    mapping : func
+        The mapping of entities to pseudonyms
+
+    Methods
+    -------
+    anonymize(text: str, entities: List[Entity])
+        Anonymize the text based on the entities
+
+    """
 
     def __init__(self, mapping, *args, **kwargs):
+        """
+        Parameters
+        ----------
+        mapping : func
+            The mapping of entities to pseudonyms
+
+        """
+
+        super().__init__(*args, **kwargs)
         self.mapping = mapping
 
     def anonymize(
         self, text: str, entities: List[Entity], *args, **kwargs
     ) -> Tuple[str, List[Replacement]]:
+        """Anonymize the text based on the entities
+
+        Parameters
+        ----------
+        text : str
+            The text to anonymize
+        entities : List[Entity]
+            The list of entities to anonymize
+
+        Returns
+        -------
+        Tuple[str, List[Replacement]]
+            The anonymized text and the list of replacements applied
+
+        """
+
         replacements = []
         for ent in entities:
             replacement = self._create_replacement(ent, text, replacements)
diff --git a/anonipy/anonymize/strategies/redaction.py b/anonipy/anonymize/strategies/redaction.py
index e39604f..bf4e160 100644
--- a/anonipy/anonymize/strategies/redaction.py
+++ b/anonipy/anonymize/strategies/redaction.py
@@ -14,13 +14,51 @@
 
 
 class RedactionStrategy(StrategyInterface):
+    """The class representing the redaction strategy
+
+    Attributes
+    ----------
+    substitute_label : str
+        The label to substitute in the anonymized text
+
+    Methods
+    -------
+    anonymize(text: str, entities: List[Entity])
+        Anonymize the text based on the entities
+
+    """
 
     def __init__(self, substitute_label: str = "[REDACTED]", *args, **kwargs) -> None:
+        """
+        Parameters
+        ----------
+        substitute_label : str, optional
+            The label to substitute in the anonymized text. Default: "[REDACTED]"
+
+        """
+
+        super().__init__(*args, **kwargs)
         self.substitute_label = substitute_label
 
     def anonymize(
         self, text: str, entities: List[Entity], *args, **kwargs
     ) -> Tuple[str, List[Replacement]]:
+        """Anonymize the text based on the entities
+
+        Parameters
+        ----------
+        text : str
+            The text to anonymize
+        entities : List[Entity]
+            The list of entities to anonymize
+
+        Returns
+        -------
+        Tuple[str, List[Replacement]]
+            The anonymized text and the list of replacements applied
+
+        """
+
         replacements = [self._create_replacement(ent) for ent in entities]
         anonymized_text, replacements = anonymize(text, replacements)
         return anonymized_text, replacements
diff --git a/anonipy/constants.py b/anonipy/constants.py
index a11ff65..71be42c 100644
--- a/anonipy/constants.py
+++ b/anonipy/constants.py
@@ -1,10 +1,16 @@
 """
-The constants used to make it easier to use the library
-"""
+constants
+
+The module provides a set of constants used in the library.
 
-# ================================================
-# Constants
-# ================================================
+Classes
+-------
+LANGUAGES :
+    Predefined supported languages
+ENTITY_TYPES :
+    Predefined types of entities
+
+"""
 
 
 class LANGUAGES:
diff --git a/anonipy/definitions.py b/anonipy/definitions.py
index 43e3106..ff26607 100644
--- a/anonipy/definitions.py
+++ b/anonipy/definitions.py
@@ -1,5 +1,15 @@
 """
-The definitions used within the package
+definitions
+
+The module provides a set of object definitions used in the library.
+
+Classes
+-------
+Entity :
+    The class representing the entity
+Replacement :
+    The class representing the replacement
+
 """
 
 import re
@@ -16,6 +26,27 @@
 
 @dataclass
 class Entity:
+    """The class representing the entity
+
+    Attributes
+    ----------
+    text : str
+        The text of the entity
+    label : str
+        The label of the entity
+    start_index : int
+        The start index of the entity in the text
+    end_index : int
+        The end index of the entity in the text
+    score : float
+        The prediction score of the entity. The score is returned by the extractor models. Default: 1.0
+    type : ENTITY_TYPES
+        The type of the entity. Default: None
+    regex : Union[str, re.Pattern]
+        The regular expression the entity must match. Default: ".*"
+
+    """
+
     text: str
     label: str
     start_index: int
@@ -26,6 +57,23 @@ class Entity:
 
 
 class Replacement(TypedDict):
+    """The class representing the replacement
+
+    Attributes
+    ----------
+    original_text : str, optional
+        The original text of the entity
+    label : str, optional
+        The label of the entity
+    start_index : int
+        The start index of the entity in the text
+    end_index : int
+        The end index of the entity in the text
+    anonymized_text : str
+        The anonymized text replacing the original
+
+    """
+
     original_text: NotRequired[str]
     label: NotRequired[str]
     start_index: int
diff --git a/anonipy/utils/__init__.py b/anonipy/utils/__init__.py
index f1007c5..58aaf21 100644
--- a/anonipy/utils/__init__.py
+++ b/anonipy/utils/__init__.py
@@ -1,3 +1,17 @@
+"""
+utils
+
+The module provides a set of utilities used in the library.
+
+Submodules
+----------
+language_detector :
+    The module containing the language detector
+file_system :
+    The module containing the file system utilities
+
+"""
+
 from . import language_detector
 from . import file_system
 
diff --git a/anonipy/utils/datetime.py b/anonipy/utils/datetime.py
index 9ee2a3d..461f782 100644
--- a/anonipy/utils/datetime.py
+++ b/anonipy/utils/datetime.py
@@ -81,6 +81,20 @@
 
 
 def detect_datetime_format(datetime):
+    """Detects the datetime format
+
+    Parameters
+    ----------
+    datetime : str
+        The datetime string
+
+    Returns
+    -------
+    Tuple[datetime.datetime, str]
+        The detected datetime and the format
+
+    """
+
     try:
         parsed_datetime = parser.parse(datetime, fuzzy=True)
 
diff --git a/anonipy/utils/file_system.py b/anonipy/utils/file_system.py
index 876e11b..629c304 100644
--- a/anonipy/utils/file_system.py
+++ b/anonipy/utils/file_system.py
@@ -20,6 +20,19 @@
 
 
 def remove_extra_spaces(text: str) -> str:
+    """Remove extra spaces from text
+
+    Parameters
+    ----------
+    text : str
+        The text to remove extra spaces from
+
+    Returns
+    -------
+    str
+        The text with extra spaces removed
+
+    """
     text = text.strip()
     # remove extra spaces
     text = re.sub(" +", " ", text)
@@ -28,6 +41,20 @@ def remove_extra_spaces(text: str) -> str:
 
 
 def remove_page_numbers(text: str) -> str:
+    """Removes page numbers from text
+
+    Parameters
+    ----------
+    text : str
+        The text to remove page numbers from
+
+    Returns
+    -------
+    str
+        The text with page numbers removed
+
+    """
+
     page_number_pattern = re.compile(r"^\s*\d+\s*$|\s*\d+\s*$")
     filtered_lines = [
         line.strip()
@@ -43,6 +70,20 @@ def remove_page_numbers(text: str) -> str:
 
 
 def extract_text_from_pdf(pdf_path: str) -> str:
+    """Extracts text from a PDF file
+
+    Parameters
+    ----------
+    pdf_path : str
+        The path to the PDF file
+
+    Returns
+    -------
+    str
+        The text from the PDF file
+
+    """
+
     pdf_reader = PdfReader(pdf_path)
 
     pages_text = []
@@ -79,6 +120,20 @@ def _word_process_table(t) -> str:
 
 
 def extract_text_from_word(doc_path: str) -> str:
+    """Extracts text from a Word file
+
+    Parameters
+    ----------
+    doc_path : str
+        The path to the Word file
+
+    Returns
+    -------
+    str
+        The text from the Word file
+
+    """
+
     doc = Document(doc_path)
     content = []
     for element in doc.element.body:
@@ -100,6 +155,24 @@ def extract_text_from_word(doc_path: str) -> str:
 
 
 def open_file(file_path: str) -> str:
+    """
+    Opens a file and returns its content as a string
+
+    Parameters
+    ----------
+    file_path : str
+        The path to the file
+
+    Returns
+    -------
+    str
+        The content of the file as a string
+
+    """
+
+    if not os.path.isfile(file_path):
+        raise FileNotFoundError(f"The file does not exist: {file_path}")
+
     _, file_extension = os.path.splitext(file_path)
     if file_extension.lower() == ".pdf":
         return extract_text_from_pdf(file_path)
@@ -113,11 +186,60 @@ def open_file(file_path: str) -> str:
 
 
 def open_json(file_path: str) -> dict:
+    """
+    Opens a JSON file and returns its content as a dictionary
+
+    Parameters
+    ----------
+    file_path : str
+        The path to the JSON file
+
+    Returns
+    -------
+    dict
+        The content of the JSON file as a dictionary
+
+    """
+
+    if not os.path.isfile(file_path):
+        raise FileNotFoundError(f"The file does not exist: {file_path}")
+
     with open(file_path, "r", encoding="utf-8") as f:
         return json.load(f)
 
 
 def write_file(text: str, file_path: str, encode: Union[str, bool] = True) -> None:
+    """Writes text to a file
+
+    Parameters
+    ----------
+    text : str
+        The text to write to the file
+    file_path : str
+        The path to the file
+    encode : Union[str, bool], optional
+        The encoding to use. Default: True
+
+    Raises
+    ------
+    TypeError
+        If text or file_path is not a string, or encode is not a string or a boolean
+    FileNotFoundError
+        If the directory part of file_path does not exist
+
+    """
+
+    if not isinstance(text, str):
+        raise TypeError("text must be a string")
+
+    if not isinstance(file_path, str):
+        raise TypeError("file_path must be a string")
+
+    if not os.path.exists(os.path.dirname(file_path)):
+        raise FileNotFoundError(
+            f"The directory does not exist: {os.path.dirname(file_path)}"
+        )
+
     if not isinstance(encode, str) and not isinstance(encode, bool):
         raise TypeError("encode must be a string or a boolean")
 
@@ -132,5 +254,19 @@ def write_file(text: str, file_path: str, encode: Union[str, bool] = True) -> No
 
 
 def write_json(data: dict, file_path: str) -> None:
+    """Writes data to a JSON file, creating missing parent directories
+
+    Parameters
+    ----------
+    data : dict
+        The data to write to the JSON file
+    file_path : str
+        The path to the JSON file
+
+    """
+
+    if not os.path.exists(os.path.dirname(file_path) or "."):  # bare file name -> cwd
+        os.makedirs(os.path.dirname(file_path), exist_ok=True)
+
     with open(file_path, "w", encoding="utf-8") as f:
         json.dump(data, f, ensure_ascii=False, indent=4)
diff --git a/anonipy/utils/language_detector.py b/anonipy/utils/language_detector.py
index 108f6d8..4b403e4 100644
--- a/anonipy/utils/language_detector.py
+++ b/anonipy/utils/language_detector.py
@@ -1,13 +1,33 @@
-"""
-The language detector class
-"""
-
 from lingua import LanguageDetectorBuilder
 
 
 class LanguageDetector:
+    """The class for detecting the language of a text
+
+    Attributes
+    ----------
+    detector : lingua.LanguageDetector
+        The underlying lingua language detector
+
+    Methods
+    -------
+    __call__(text: str, output_standard: str = "iso_code_639_1")
+        Detect the language of a text. Calls the `detect` method.
+
+    detect(text: str, output_standard: str = "iso_code_639_1")
+        Detect the language of a text
+
+    """
 
     def __init__(self, low_accuracy: bool = False):
+        """
+        Parameters
+        ----------
+        low_accuracy : bool, optional
+            Whether to use the low accuracy mode. Default: False
+
+        """
+
         # Prepare the language detector for all languages
         builder = LanguageDetectorBuilder.from_all_languages()
         builder = (
@@ -21,6 +41,23 @@ def __call__(self, text: str, output_standard: str = "iso_code_639_1") -> str:
         return self.detect(text, output_standard)
 
     def detect(self, text: str, output_standard: str = "iso_code_639_1") -> str:
+        """
+        Detect the language of a text
+
+        Parameters
+        ----------
+        text : str
+            The text to detect the language of
+        output_standard : str, optional
+            The output standard. Default: "iso_code_639_1"
+
+        Returns
+        -------
+        Tuple[str, str]
+            The language code and the full name of the language
+
+        """
+
         language = self.detector.detect_language_of(text)
         iso_code = getattr(language, output_standard).name.lower()
         full_name = language.name.lower().title()

From bb480688cf91ea71956ff90f7f5600a93f4678ac Mon Sep 17 00:00:00 2001
From: eriknovak <erik.novak@ijs.si>
Date: Tue, 18 Jun 2024 18:14:37 +0200
Subject: [PATCH 2/2] Add missing documentation

---
 .../anonymize/extractors/entity_extractor.py  | 50 ++++++++++-
 .../anonymize/generators/date_generator.py    |  6 +-
 .../generators/llm_label_generator.py         | 40 +++++++++
 .../generators/mask_label_generator.py        | 89 +++++++++++++++++--
 .../anonymize/generators/number_generator.py  |  3 +-
 anonipy/anonymize/regex.py                    | 21 +++++
 anonipy/anonymize/strategies/interface.py     |  4 -
 anonipy/anonymize/strategies/masking.py       | 30 ++++++-
 .../anonymize/strategies/pseudonymization.py  | 36 +++++++-
 anonipy/anonymize/strategies/redaction.py     | 18 +++-
 anonipy/utils/file_system.py                  | 28 ++++++
 11 files changed, 295 insertions(+), 30 deletions(-)

diff --git a/anonipy/anonymize/extractors/entity_extractor.py b/anonipy/anonymize/extractors/entity_extractor.py
index 259893d..dbca1b4 100644
--- a/anonipy/anonymize/extractors/entity_extractor.py
+++ b/anonipy/anonymize/extractors/entity_extractor.py
@@ -107,7 +107,20 @@ def display(self, doc: Doc):
     # Private methods
     # ===========================================
 
-    def _prepare_labels(self, labels):
+    def _prepare_labels(self, labels: List[dict]) -> List[dict]:
+        """Prepare the labels for the extractor
+
+        Parameters
+        ----------
+        labels : List[dict]
+            The list of labels to prepare
+
+        Returns
+        -------
+        List[dict]
+            The prepared labels
+
+        """
         for l in labels:
             if "regex" in l:
                 continue
@@ -117,6 +130,15 @@ def _prepare_labels(self, labels):
         return labels
 
     def _create_gliner_config(self):
+        """Create the config for the GLINER model
+
+        Returns
+        -------
+        dict or None
+            The GLINER model config; None when use_gpu is set but CUDA is unavailable
+
+        """
+
         map_location = "cpu"
         if self.use_gpu and not torch.cuda.is_available():
             return warnings.warn(
@@ -136,6 +158,15 @@ def _create_gliner_config(self):
         }
 
     def _prepare_pipeline(self):
+        """Prepare the spacy pipeline
+
+        Returns
+        -------
+        spacy pipeline
+            The spacy pipeline
+
+        """
+
         # load the appropriate parser for the language
         module_lang, class_lang = self.lang[0].lower(), self.lang[1].lower().title()
         language_module = importlib.import_module(f"spacy.lang.{module_lang}")
@@ -147,8 +178,21 @@ def _prepare_pipeline(self):
         nlp.add_pipe("gliner_spacy", config=gliner_config)
         return nlp
 
-    def _prepare_entities(self, doc):
-        # prepares the anonymized and spacy entities
+    def _prepare_entities(self, doc: Doc):
+        """Prepares the anonipy and spacy entities
+
+        Parameters
+        ----------
+        doc : Doc
+            The spacy doc to prepare
+
+        Returns
+        -------
+        Tuple[List[Entity], List[Entity]]
+            The anonipy entities and the spacy entities
+
+
+        """
 
         # TODO: make this part more generic
         anoni_entities = []
diff --git a/anonipy/anonymize/generators/date_generator.py b/anonipy/anonymize/generators/date_generator.py
index 8da1ca3..7b36b69 100644
--- a/anonipy/anonymize/generators/date_generator.py
+++ b/anonipy/anonymize/generators/date_generator.py
@@ -119,8 +119,7 @@ def random_date(day: datetime.datetime, sigma: int = 30, *args, **kwargs):
 
 
 class DateGenerator(GeneratorInterface):
-    """
-    The class representing the date generator
+    """The class representing the date generator
 
     Attributes
     ----------
@@ -152,8 +151,7 @@ def __init__(self, date_format="auto", day_sigma: int = 30, *args, **kwargs):
         self.day_sigma = day_sigma
 
     def generate(self, entity: Entity, output_gen: str = "random", *args, **kwargs):
-        """
-        Generate the date based on the entity and output_gen
+        """Generate the date based on the entity and output_gen
 
         Parameters
         ----------
diff --git a/anonipy/anonymize/generators/llm_label_generator.py b/anonipy/anonymize/generators/llm_label_generator.py
index b73a2e8..2775883 100644
--- a/anonipy/anonymize/generators/llm_label_generator.py
+++ b/anonipy/anonymize/generators/llm_label_generator.py
@@ -185,6 +185,19 @@ def validate(self, entity: Entity):
     # =================================
 
     def _prepare_model_and_tokenizer(self, model_name: str):
+        """Prepares the model and tokenizer
+
+        Parameters
+        ----------
+        model_name : str
+            The name of the model to use
+
+        Returns
+        -------
+        model, tokenizer
+            The model and the tokenizer
+
+        """
         # prepare the model
         bnb_config = BitsAndBytesConfig(
             load_in_4bit=True,
@@ -204,10 +217,37 @@ def _prepare_model_and_tokenizer(self, model_name: str):
         return model, tokenizer
 
     def _system_prompt(self):
+        """Returns the system prompt"""
         return "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful AI assistant for generating replacements for text entities.<|eot_id|>"
 
     def _user_prompt(self, prompt):
+        """Returns the user prompt
+
+        Parameters
+        ----------
+        prompt : str
+            The prompt to use
+
+        Returns
+        -------
+        str
+            The user prompt
+
+        """
         return f"<|start_header_id|>user<|end_header_id|>\n\n{prompt}<|eot_id|>"
 
     def _assistant_prompt(self, prompt):
+        """Returns the assistant prompt
+
+        Parameters
+        ----------
+        prompt : str
+            The prompt to use
+
+        Returns
+        -------
+        str
+            The assistant prompt
+
+        """
         return f"<|start_header_id|>assistant<|end_header_id|>\n\n{prompt}"
diff --git a/anonipy/anonymize/generators/mask_label_generator.py b/anonipy/anonymize/generators/mask_label_generator.py
index 2ab6aab..aab448b 100644
--- a/anonipy/anonymize/generators/mask_label_generator.py
+++ b/anonipy/anonymize/generators/mask_label_generator.py
@@ -2,6 +2,7 @@
 import random
 import warnings
 import itertools
+from typing import List
 
 import torch
 from transformers import AutoModelForMaskedLM, AutoTokenizer, pipeline
@@ -71,8 +72,7 @@ def __init__(
         )
 
     def generate(self, entity: Entity, text: str, *args, **kwargs):
-        """
-        Generate the substituted text based on the entity
+        """Generate the substituted text based on the entity
 
         Parameters
         ----------
@@ -98,6 +98,21 @@ def generate(self, entity: Entity, text: str, *args, **kwargs):
     # =================================
 
     def _prepare_model_and_tokenizer(self, model_name: str, use_gpu: bool):
+        """Prepares the model and tokenizer
+
+        Parameters
+        ----------
+        model_name : str
+            The name of the model to use
+        use_gpu : bool
+            Whether to use GPU/CUDA
+
+        Returns
+        -------
+        model, tokenizer
+            The model and the tokenizer
+
+        """
         # prepare the model
         device = torch.device(
             "cuda" if use_gpu and torch.cuda.is_available() else "cpu"
@@ -108,6 +123,19 @@ def _prepare_model_and_tokenizer(self, model_name: str, use_gpu: bool):
         return model, tokenizer
 
     def _create_masks(self, entity: Entity):
+        """Creates the masks for the entity
+
+        Parameters
+        ----------
+        entity : Entity
+            The entity to create the masks for
+
+        Returns
+        -------
+        List[dict]
+            The list of masks
+
+        """
         masks = []
         chunks = re.split(r"\s+", entity.text)
         for idx in range(len(chunks)):
@@ -123,12 +151,44 @@ def _create_masks(self, entity: Entity):
             )
         return masks
 
-    def _get_context_text(self, text, start_index, end_index):
+    def _get_context_text(self, text: str, start_index: int, end_index: int) -> str:
+        """Get the context text
+
+        Parameters
+        ----------
+        text : str
+            The text to get the context text from
+        start_index : int
+            The start index
+        end_index : int
+            The end index
+
+        Returns
+        -------
+        str
+            The context text
+
+        """
         min_index = max(0, start_index - self.context_window)
         max_index = min(end_index + self.context_window, len(text))
         return text[min_index:max_index]
 
-    def _prepare_generate_inputs(self, masks, text):
+    def _prepare_generate_inputs(self, masks: List[dict], text: str) -> List[str]:
+        """Prepares the generate inputs
+
+        Parameters
+        ----------
+        masks : List[dict]
+            The list of masks
+        text : str
+            The text to prepare the generate inputs for
+
+        Returns
+        -------
+        List[str]
+            The list of generate inputs
+
+        """
         return [
             self._get_context_text(
                 text[: m["start_index"]] + m["mask_text"] + text[m["end_index"] :],
@@ -138,7 +198,26 @@ def _prepare_generate_inputs(self, masks, text):
             for m in masks
         ]
 
-    def _create_substitute(self, entity: Entity, masks, suggestions):
+    def _create_substitute(
+        self, entity: Entity, masks: List[dict], suggestions: List[dict]
+    ) -> str:
+        """Create a substitute for the entity
+
+        Parameters
+        ----------
+        entity : Entity
+            The entity to create the substitute for
+        masks : List[dict]
+            The list of masks
+        suggestions : List[dict]
+            The list of suggestions
+
+        Returns
+        -------
+        str
+            The created substitute
+
+        """
         substitute_chunks = []
         for mask, suggestion in zip(masks, suggestions):
             suggestion = suggestion if type(suggestion) == list else [suggestion]
diff --git a/anonipy/anonymize/generators/number_generator.py b/anonipy/anonymize/generators/number_generator.py
index 9540aaa..462a147 100644
--- a/anonipy/anonymize/generators/number_generator.py
+++ b/anonipy/anonymize/generators/number_generator.py
@@ -29,8 +29,7 @@ def __init__(self, *args, **kwargs):
         pass
 
     def generate(self, entity: Entity, *args, **kwargs):
-        """
-        Generates a number replacement
+        """Generates a number replacement
 
         Parameters
         ----------
diff --git a/anonipy/anonymize/regex.py b/anonipy/anonymize/regex.py
index 957db0f..67149b8 100644
--- a/anonipy/anonymize/regex.py
+++ b/anonipy/anonymize/regex.py
@@ -59,6 +59,13 @@ class RegexMap:
     """
 
     def __init__(self):
+        """
+        Parameters
+        ----------
+        None
+
+        """
+
         self.regex_mapping = defaultdict(lambda: ".*")
         # Define the regex mappings
         self.regex_mapping[ENTITY_TYPES.STRING] = REGEX_STRING
@@ -70,6 +77,20 @@ def __init__(self):
         self.regex_mapping[ENTITY_TYPES.WEBSITE_URL] = REGEX_WEBSITE_URL
 
     def __call__(self, type: str) -> str:
+        """Gets the regex for the given type
+
+        Parameters
+        ----------
+        type : str
+            The type of the entity
+
+        Returns
+        -------
+        str
+            The regex for the given type
+
+        """
+
         return self.regex_mapping[type]
 
 
diff --git a/anonipy/anonymize/strategies/interface.py b/anonipy/anonymize/strategies/interface.py
index 448b0d7..74abaf8 100644
--- a/anonipy/anonymize/strategies/interface.py
+++ b/anonipy/anonymize/strategies/interface.py
@@ -1,7 +1,3 @@
-"""
-Contains the interface for the strategy
-"""
-
 from typing import List
 from ...definitions import Entity
 
diff --git a/anonipy/anonymize/strategies/masking.py b/anonipy/anonymize/strategies/masking.py
index 090c271..3d76586 100644
--- a/anonipy/anonymize/strategies/masking.py
+++ b/anonipy/anonymize/strategies/masking.py
@@ -1,7 +1,3 @@
-"""
-Contains the masking strategy
-"""
-
 import re
 from typing import List, Tuple
 
@@ -65,6 +61,19 @@ def anonymize(
         return anonymized_text, replacements
 
     def _create_replacement(self, entity: Entity) -> Replacement:
+        """Creates a replacement for the entity
+
+        Parameters
+        ----------
+        entity : Entity
+            The entity to create the replacement for
+
+        Returns
+        -------
+        Replacement
+            The created replacement
+
+        """
         mask = self._create_mask(entity)
         return {
             "original_text": entity.text,
@@ -75,6 +84,19 @@ def _create_replacement(self, entity: Entity) -> Replacement:
         }
 
     def _create_mask(self, entity: Entity) -> str:
+        """Creates a mask for the entity
+
+        Parameters
+        ----------
+        entity : Entity
+            The entity to create the mask for
+
+        Returns
+        -------
+        str
+            The created mask
+
+        """
         return " ".join(
             [
                 self.substitute_label * len(chunk)
diff --git a/anonipy/anonymize/strategies/pseudonymization.py b/anonipy/anonymize/strategies/pseudonymization.py
index 1760ac5..1816928 100644
--- a/anonipy/anonymize/strategies/pseudonymization.py
+++ b/anonipy/anonymize/strategies/pseudonymization.py
@@ -1,7 +1,3 @@
-"""
-Contains the pseudonymization strategy
-"""
-
 from typing import List, Tuple
 
 from .interface import StrategyInterface
@@ -69,6 +65,23 @@ def anonymize(
     def _create_replacement(
         self, entity: Entity, text: str, replacements: List[dict]
     ) -> Replacement:
+        """Creates a replacement for the entity
+
+        Parameters
+        ----------
+        entity : Entity
+            The entity to create the replacement for
+        text : str
+            The text to anonymize
+        replacements : List[dict]
+            The list of replacements
+
+        Returns
+        -------
+        Replacement
+            The created replacement
+
+        """
         # check if the replacement already exists
         anonymized_text = self._check_replacement(entity, replacements)
         # create a new replacement if it doesn't exist
@@ -84,6 +97,21 @@ def _create_replacement(
         }
 
     def _check_replacement(self, entity: Entity, replacements: List[dict]) -> str:
+        """Checks if a suitable replacement already exists
+
+        Parameters
+        ----------
+        entity : Entity
+            The entity to check
+        replacements : List[dict]
+            The list of replacements
+
+        Returns
+        -------
+        str
+            The anonymized text if the replacement already exists, None otherwise
+
+        """
         existing_replacement = list(
             filter(lambda x: x["original_text"] == entity.text, replacements)
         )
diff --git a/anonipy/anonymize/strategies/redaction.py b/anonipy/anonymize/strategies/redaction.py
index bf4e160..0ae9c3d 100644
--- a/anonipy/anonymize/strategies/redaction.py
+++ b/anonipy/anonymize/strategies/redaction.py
@@ -1,7 +1,3 @@
-"""
-Contains the redaction strategy
-"""
-
 from typing import List, Tuple
 
 from .interface import StrategyInterface
@@ -64,6 +60,20 @@ def anonymize(
         return anonymized_text, replacements
 
     def _create_replacement(self, entity: Entity) -> Replacement:
+        """Creates a replacement for the entity
+
+        Parameters
+        ----------
+        entity : Entity
+            The entity to create the replacement for
+
+        Returns
+        -------
+        Replacement
+            The replacement for the entity
+
+        """
+
         return {
             "original_text": entity.text,
             "label": entity.label,
diff --git a/anonipy/utils/file_system.py b/anonipy/utils/file_system.py
index 629c304..2f7cd34 100644
--- a/anonipy/utils/file_system.py
+++ b/anonipy/utils/file_system.py
@@ -103,10 +103,38 @@ def extract_text_from_pdf(pdf_path: str) -> str:
 
 
 def _word_process_paragraph(p) -> str:
+    """Get the text from a paragraph
+
+    Parameters
+    ----------
+    p : etree._Element
+        The paragraph element
+
+    Returns
+    -------
+    str
+        The text from the paragraph
+
+    """
+
     return p.text
 
 
 def _word_process_table(t) -> str:
+    """Get the text from a table
+
+    Parameters
+    ----------
+    t : etree._Element
+        The table element
+
+    Returns
+    -------
+    str
+        The text from the table
+
+    """
+
     table_text = []
     for row in t.findall(".//w:tr", WORD_NAMESPACES):
         row_text = []