From 769bcbecbadc15c9f6a2d2d7b7b1b77bd4ec48bc Mon Sep 17 00:00:00 2001 From: Elias Date: Mon, 19 Feb 2024 16:37:19 +0100 Subject: [PATCH] Use pdfminer.six instead of PyMuPDF; preparations to use options for patterns; remove German manual --- .gitignore | 1 + README.md | 2 - README_DE.md | 74 ------------------- configs/config.yml | 1 + src/rosinenpicker/processors.py | 24 +++--- src/rosinenpicker/pydantic_models.py | 106 +++------------------------ src/rosinenpicker/start.py | 4 +- src/rosinenpicker/utils.py | 76 +++++++++++++++++++ 8 files changed, 98 insertions(+), 190 deletions(-) delete mode 100644 README_DE.md diff --git a/.gitignore b/.gitignore index 13f5327..9c25047 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,4 @@ helper dumpster/ *.bat .vscode/ +demo diff --git a/README.md b/README.md index 336d0f4..694a97a 100644 --- a/README.md +++ b/README.md @@ -2,8 +2,6 @@ ![Python Packaging](https://github.com/joheli/rosinenpicker/workflows/Packaging/badge.svg) ![PyPI](https://img.shields.io/pypi/v/rosinenpicker?label=PyPI) ![PyPI - Downloads](https://img.shields.io/pypi/dm/rosinenpicker) -[Deutsch](README_DE.md) - # Manual Welcome to `rosinenpicker`! This tool is like a magical sieve that helps you find golden nuggets (or "Rosinen") of information within a mountain of documents. It's designed for anyone who needs to extract specific pieces of information without diving deep into the technicalities. diff --git a/README_DE.md b/README_DE.md deleted file mode 100644 index 155481d..0000000 --- a/README_DE.md +++ /dev/null @@ -1,74 +0,0 @@ -# rosinenpicker - -![Python Packaging](https://github.com/joheli/rosinenpicker/workflows/Packaging/badge.svg) ![PyPI](https://img.shields.io/pypi/v/rosinenpicker?label=PyPI) ![PyPI - Downloads](https://img.shields.io/pypi/dm/rosinenpicker) - -[English](README.md) - -# Handbuch - -Willkommen bei `rosinenpicker`! 
Dieses Werkzeug ist wie ein magisches Sieb, das Ihnen hilft, goldene Informationsnuggets (oder "Rosinen") in einem Berg von Dokumenten zu finden. Es ist für jeden gedacht, der spezifische Informationen extrahieren muss, ohne sich in technische Details zu vertiefen. - -## Schlüsselbegriffe verstehen - -- **Kommandozeile**: Eine textbasierte Schnittstelle, um Ihren Computer zu bedienen. Stellen Sie sich vor, Ihrem Computer genau zu sagen, was er tun soll, indem Sie Befehle eingeben. -- **YAML**: Ein einfaches Konfigurationsdateiformat, das von `rosinenpicker` verwendet wird, um Ihre Anweisungen zu verstehen. Es ist leicht zu lesen und zu schreiben. -- **Argumente**: Spezielle Anweisungen, die Sie `rosinenpicker` beim Start geben, um ihm zu sagen, wo es seine Anweisungen (YAML-Datei) finden und wo es seine Funde speichern soll. - -## Erste Schritte - -0. **Python 3.11 ist Voraussetzung**: Stellen Sie sicher, dass Python 3.11 oder höher installiert ist. Es gibt viele Wege um Python zu installieren, aber ich empfehle[Miniconda](https://docs.anaconda.com/free/miniconda/index.html). - -1. **Installation**: Zuerst bringen wir `rosinenpicker` auf Ihren Computer. Öffnen Sie Ihre Kommandozeile und tippen Sie: - - ``` - pip install rosinenpicker - ``` - -2. **Das Programm ausführen**: Um `rosinenpicker` zu starten, geben Sie folgendes ein: - - ``` - rosinenpicker -c pfad/zu/ihrem_config.yml -d pfad/zu/ihrer_datenbank.db - ``` - - Ersetzen Sie `pfad/zu/ihrem_config.yml` mit dem tatsächlichen Pfad zu Ihrer Konfigurationsdatei und `pfad/zu/ihrer_datenbank.db` mit dem Ort, an dem Sie die Funde speichern möchten. (Wenn nicht anders angegeben, wird davon ausgegangen, dass die Konfigurations- und Datenbankdateien `config.yml` und `matches.db` in Ihrem aktuellen Verzeichnis sind; außerdem wird die Datenbank automatisch erstellt, wenn sie nicht auf Ihrem System vorhanden ist.) 
- -## Ihre YAML-Konfiguration erstellen - -Hier ist eine Beispielkonfiguration, die `rosinenpicker` leitet: - -```yaml -title: 'Meine Dokumentsuche' -strategies: - strategy1: - processed_directory: '/pfad/zu/dokumenten' - file_name_pattern: '.*\.pdf' - file_format: 'pdf' - terms: - term1: 'Apfelkuchen' - export_format: 'csv' - export_path: '/pfad/zu/export.csv' -``` - -Dies sagt `rosinenpicker`, in `/pfad/zu/dokumenten` nach PDF-Dateien zu suchen, die "Apfelkuchen" enthalten, und die Ergebnisse in einer CSV-Datei unter `/pfad/zu/export.csv` zu speichern. Weitere Informationen finden Sie in der [Beispielkonfigurationsdatei](configs/config.yml) in diesem Repository - die Datei enthält zusätzliche Kommentare, die Sie nützlich finden könnten. - -### Weitere Möglichkeiten - -Nun ist es natürlich nicht sehr nützlich, nur den Begriff "Apfelkuchen" aus Dokumenten zu extrahieren. Aber Sie können viel mehr tun. Anstelle von "Apfelkuchen" können Sie einen regulären Ausdruck eingeben, z. B. "\d{8}", um Zahlen zu extrahieren, die aus genau acht Ziffern bestehen. Aber es gibt noch mehr: Wenn Sie einen Ausdruck zusammen mit "@@@" (was für "variable Zeichenfolge" steht) eingeben, wird nur eine Übereinstimmung mit "@@@" zurückgegeben. Z.B. "Name: @@@" wird alles zurückgeben, was auf "Name:" folgt! - -## `rosinenpicker` verwenden - -Mit Ihrer fertigen `config.yml` kehren Sie zur Kommandozeile zurück und führen `rosinenpicker` mit den Argumenten `-c` und `-d` wie oben gezeigt aus. - -## Hilfe und Optionen - -Für eine Liste der Befehle und Optionen tippen Sie: - -``` -rosinenpicker -h -``` - -Dieser Befehl zeigt alles an, was Sie wissen müssen, um `rosinenpicker` zu navigieren. - -## Schlussfolgerung - -Sie sind jetzt bereit, mit `rosinenpicker` wertvolle Informationen zu erkunden und zu extrahieren. Viel Erfolg bei der Informationssuche! 
diff --git a/configs/config.yml b/configs/config.yml index 5fe9276..dfeeefb 100644 --- a/configs/config.yml +++ b/configs/config.yml @@ -43,3 +43,4 @@ strategies: # # optional: file_content_pattern - a regex pattern that has to return a match in the file contents # # this can be used to further restrict the selection of processed files # file_content_pattern: '.*' + # diff --git a/src/rosinenpicker/processors.py b/src/rosinenpicker/processors.py index 28c688c..6e59190 100644 --- a/src/rosinenpicker/processors.py +++ b/src/rosinenpicker/processors.py @@ -1,14 +1,12 @@ -import fitz +from pdfminer.high_level import extract_text import re class DocumentProcessor: text: str - matchall_maxlength: int result: dict[str, str] - def __init__(self, file_path, matchall_maxlength): + def __init__(self, file_path): self.extract_text(file_path = file_path) - self.matchall_maxlength = matchall_maxlength def extract_text(self, file_path): raise NotImplementedError("This method should be implemented by subclasses.") @@ -28,12 +26,12 @@ def terms_patterns(self, tap: dict[str, tuple[re.Pattern, int, int]]): matchall_index = pattern_tpl[1] number_of_groups = pattern_tpl[2] # also p.groups content = mo.group(matchall_index) - # in case only two groups present: limit length of matched text - if len(content) > self.matchall_maxlength and number_of_groups == 2: - if matchall_index == 1: - content = content[-self.matchall_maxlength:] - else: - content = content[:self.matchall_maxlength] + # # in case only two groups present: limit length of matched text + # if len(content) > self.matchall_maxlength and number_of_groups == 2: + # if matchall_index == 1: + # content = content[-self.matchall_maxlength:] + # else: + # content = content[:self.matchall_maxlength] # no groups else: # mos: indices of matched text @@ -61,11 +59,7 @@ def contains(self, patternstring: str) -> bool: class PDFProcessor(DocumentProcessor): def extract_text(self, file_path): - text = "" - with fitz.open(file_path) as doc: 
- for page in doc: - text += page.get_text() - self.text = text + self.text = extract_text(file_path) class TXTProcessor(DocumentProcessor): def extract_text(self, file_path): diff --git a/src/rosinenpicker/pydantic_models.py b/src/rosinenpicker/pydantic_models.py index 29318d9..7a0e790 100644 --- a/src/rosinenpicker/pydantic_models.py +++ b/src/rosinenpicker/pydantic_models.py @@ -1,4 +1,5 @@ -from pydantic import BaseModel, DirectoryPath, field_validator, model_validator, NewPath +from pydantic import BaseModel, DirectoryPath, field_validator, model_validator, NewPath, StrictBool +from .utils import process_terms, check_regex from typing import Optional import re @@ -20,43 +21,20 @@ class ConfigStrategy(BaseModel): export_csv_divider: Optional[str] = ';' # terms_patterns_group is created from 'terms', see @model_validator terms_patterns_group: dict[str, tuple[re.Pattern, int, int]] = None - matchall_maxlength: int = 100 @field_validator('file_name_pattern', 'export_format') @classmethod def non_empty_string(cls, v: str): assert v != '', 'Must be a non-empty string' return v.strip() - - # @field_validator('terms') - # @classmethod - # def check_terms(cls, t: dict[str, str]): - # checks = [cls.is_regex(p) for _, p in t.items()] - # if not all(checks): - # raise ConfigError(f"Concerning {t!r}: No regex groups are allowed.") - - @classmethod - def compile_regex(cls, p: str) -> re.Pattern: - try: - rgx = re.compile(p) - return rgx - except: - raise ConfigError(f"Concerning pattern '{p}': this string cannot be used as a regex pattern!") @field_validator('file_name_pattern', 'file_content_pattern') @classmethod def selection_must_be_regex(cls, v: str): v = v.strip() - if not cls.is_regex(v): + if not check_regex(v): raise ConfigError(f"Pattern '{v}' cannot be used as a regex pattern; also, regex groups are not allowed!") return v - - @model_validator(mode='after') - def check_terms_and_patterns(self): - # process terms_and_patterns - processed_tp = 
{term:self.process_terms(pattern) for term, pattern in self.terms.items()} - self.terms_patterns_group = processed_tp - return self @field_validator('export_format') @classmethod @@ -74,78 +52,12 @@ def validate_file_format(cls, ff: str): raise ConfigError(msg=f"Concerning '{ff}': File format must conform to one of these options: {valid_formats}!") return ff - @classmethod - def is_regex(cls, patternstring: str) -> bool: - #breakpoint() - try: - rgx = re.compile(patternstring) - # Also, do not allow regex groups - if rgx.groups > 0: - return False - except: - return False - return True - - # process_terms - # This function has the following jobs: - # - check if patternstrings can be converted to regex patterns (type re.Pattern) - # - check if patternstrings already contain a "matchall pattern" (.*), as these are not allowed - # - create capture groups if divider is present; if present: - # - check if divider occurs more than once, as this is not allowed - # - replace the divider by a capture group matching all ("matchall pattern") - # - return the index of the (one and only) capture group representing the matchall pattern - # - return the total number of capture groups - # Return value: - # The function returns a tuple of (re.Pattern, int, int) containing the compiled pattern, - # the index of the group containing the (one and only) matchall pattern, and - # the number of capture groups present. - # In case no capture groups have been formed, the second and third integers are set to -1. 
- def process_terms(cls, patternstring: str, divider: str = "@@@") -> tuple[re.Pattern, int, int]: - # if patternstrings contains groups, reject - if not cls.is_regex(patternstring): - raise ConfigError(f"Concerning '{patternstring}': cannot be used as regex pattern; also, regex groups are not allowed!") - # helper to check if pattern only consists of a matchall pattern - def matchall_only(s) -> bool: - return re.search("\.\*", s) and len(s) == 2 - # check if matchall pattern is present (as this is not allowed) - if matchall_only(patternstring): - raise ConfigError(msg=f"The string '{patternstring}' only contains the matchall-pattern '.*' and can therefore not be processed.") - # divider_hits counts the number of divider in the string; only one is allowed (see below) - divider_hits = len(re.findall(divider, patternstring)) - # check the number of occurrences of divider - if divider_hits > 1: - # as this is not implemented, throw an error - raise ConfigError(msg=f"Each term must correspond to either *one regex pattern* or *two regex patterns divided by '{divider}'*!") - if divider_hits == 0: - # return without capture groups - return (cls.compile_regex(patternstring), -1, -1) - # process the patternstrings divided by divider - multiple_patternstrings = re.split(divider, patternstring) - - # check if patternstring and multiple_patternstrings are valid regex patterns without groups - all_strings = multiple_patternstrings.copy() - all_strings.append(patternstring) - all_check = [cls.is_regex(s) for s in all_strings] - if not all(all_check): - raise ConfigError(f"Concerning one of '{all_strings!r}': cannot be used as regex pattern; also regex groups are not allowed!") - - #breakpoint() - - # do any of the patternstrings only contain a matchall pattern? 
- if any([matchall_only(p) for p in multiple_patternstrings]): - raise ConfigError(msg=f"At least one of '{multiple_patternstrings!r}' only consists of a matchall-pattern '.*' and can therefore not be processed.") - # is any of the patternstrings of length 0? - lenx = [len(i) for i in multiple_patternstrings] - lenx0 = [l == 0 for l in lenx] - # if yes - if any(lenx0): - # the first? - if lenx0[0]: - return (cls.compile_regex(f"(.*)({multiple_patternstrings[1]})"), 1, 2) - # the second? - return (cls.compile_regex(f"({multiple_patternstrings[0]})(.*)"), 2, 2) - # none of the patternstrings empty? return three groups - return (cls.compile_regex(f"({multiple_patternstrings[0]})(.*)({multiple_patternstrings[1]})"), 2, 3) + @model_validator(mode='after') + def check_terms_and_patterns(self): + # process terms_and_patterns + processed_tp = {term:process_terms(patternstring=pattern) for term, pattern in self.terms.items()} + self.terms_patterns_group = processed_tp + return self class Config(BaseModel): title: str diff --git a/src/rosinenpicker/start.py b/src/rosinenpicker/start.py index bfff73c..967076b 100644 --- a/src/rosinenpicker/start.py +++ b/src/rosinenpicker/start.py @@ -1,4 +1,4 @@ -__version__ = '0.1.5' +__version__ = '0.1.6' import yaml import re import os @@ -53,7 +53,7 @@ def process_strategy(strategy_name: str, cs: ConfigStrategy, db: Session, run_id # loop thru documents for doc in documents: - pr = processor(doc, cs.matchall_maxlength) + pr = processor(doc) # if file_content_pattern is given and if that pattern is not found in the document, skip the document if cs.file_content_pattern: if not pr.contains(cs.file_content_pattern): diff --git a/src/rosinenpicker/utils.py b/src/rosinenpicker/utils.py index 5f5ec35..33c8e93 100644 --- a/src/rosinenpicker/utils.py +++ b/src/rosinenpicker/utils.py @@ -1,7 +1,83 @@ import hashlib +import re + +class PatternStringError(Exception): + def __init__(self, msg): + message_primer = "There appears to be a problem with 
the patterstring:\n" + self.message = message_primer + msg + super().__init__(self.message) def file_sha256(file_name: str) -> str: with open(file_name, "rb") as f: bytes = f.read() hex_hash = hashlib.sha256(bytes).hexdigest() return hex_hash + +def check_regex(patternstring: str) -> bool: + try: + rgx = re.compile(patternstring) + # Also, do not allow regex groups + if rgx.groups > 0: + return False + except: + return False + return True + +# process_terms +# This function has the following jobs: +# - check if patternstrings can be converted to regex patterns (type re.Pattern) +# - check if patternstrings already contain a "matchall pattern" (.*), as these are not allowed +# - create capture groups if divider is present; if present: +# - check if divider occurs more than once, as this is not allowed +# - replace the divider by a capture group matching all ("matchall pattern") +# - return the index of the (one and only) capture group representing the matchall pattern +# - return the total number of capture groups +# Return value: +# The function returns a tuple of (re.Pattern, int, int) containing the compiled pattern, +# the index of the group containing the (one and only) matchall pattern, and +# the number of capture groups present. +# In case no capture groups have been formed, the second and third integers are set to -1. 
def process_terms(patternstring: str, divider: str = "@@@", rflag: re.RegexFlag = re.NOFLAG) -> tuple[re.Pattern, int, int]:
    """Compile a term's pattern string into a regex, expanding one optional divider.

    A pattern string is either a single regex, or two regexes separated by
    exactly one ``divider``. In the second form the divider is replaced by a
    match-all capture group ``(.*)`` and each non-empty side is wrapped in its
    own capture group.

    Args:
        patternstring: The raw pattern. It must compile as a regex and, as
            enforced by ``check_regex``, must not contain capture groups.
        divider: Marker standing for "variable text". It is searched for
            literally (it is not interpreted as a regex).
        rflag: Flags handed to ``re.compile`` for the returned pattern. The
            validation done through ``check_regex`` does not use them.

    Returns:
        A tuple ``(pattern, matchall_index, group_count)``:

        * no divider: ``(compiled patternstring, -1, -1)``
        * divider at the start: ``(.*)(right)`` with ``(…, 1, 2)``
        * divider at the end: ``(left)(.*)`` with ``(…, 2, 2)``
        * divider in the middle: ``(left)(.*)(right)`` with ``(…, 2, 3)``

    Raises:
        PatternStringError: If the pattern string (or one of its parts) is not
            a usable regex, contains capture groups, consists only of ``.*``,
            or contains the divider more than once.
    """
    def matchall_only(s: str) -> bool:
        # A plain comparison gives the same verdict as searching for a literal
        # '.*' in a two-character string, without needing a regex (the former
        # non-raw "\.\*" literal triggered invalid-escape warnings) and with a
        # real bool as result.
        return s == ".*"

    # Reject anything that does not compile or that already contains groups.
    if not check_regex(patternstring):
        raise PatternStringError(f"Concerning '{patternstring}': cannot be used as regex pattern; also, regex groups are not allowed!")
    # A bare match-all pattern carries no information to anchor on.
    if matchall_only(patternstring):
        raise PatternStringError(msg=f"The string '{patternstring}' only contains the matchall-pattern '.*' and can therefore not be processed.")

    # The divider is a literal marker, so count and split it with str methods;
    # feeding it to re.findall/re.split would interpret metacharacters of a
    # custom divider (e.g. '|' or '$') as regex syntax.
    divider_hits = patternstring.count(divider)
    if divider_hits > 1:
        # More than one divider is not implemented.
        raise PatternStringError(msg=f"Each term must correspond to either *one regex pattern* or *two regex patterns divided by '{divider}'*!")
    if divider_hits == 0:
        # Plain regex: no capture groups are formed.
        return (re.compile(patternstring, rflag), -1, -1)

    # Exactly one divider: two parts, either of which may be empty.
    multiple_patternstrings = patternstring.split(divider)
    before, after = multiple_patternstrings

    # Each part must itself be a valid, group-free regex (the whole string can
    # be valid while a part is not, e.g. when the divider sits inside '[...]').
    all_strings = [*multiple_patternstrings, patternstring]
    if not all(check_regex(s) for s in all_strings):
        raise PatternStringError(f"Concerning one of '{all_strings!r}': cannot be used as regex pattern; also regex groups are not allowed!")

    # Neither side may be a bare match-all pattern.
    if any(matchall_only(p) for p in multiple_patternstrings):
        raise PatternStringError(msg=f"At least one of '{multiple_patternstrings!r}' only consists of a matchall-pattern '.*' and can therefore not be processed.")

    # Divider at the very start: the match-all group comes first.
    if not before:
        return (re.compile(f"(.*)({after})", rflag), 1, 2)
    # Divider at the very end: the match-all group comes last.
    if not after:
        return (re.compile(f"({before})(.*)", rflag), 2, 2)
    # Divider in the middle: three groups, match-all is the second.
    return (re.compile(f"({before})(.*)({after})", rflag), 2, 3)