Skip to content

Commit

Permalink
Use pdfminer.six instead of PyMuPDF; preparations to use options for …
Browse files Browse the repository at this point in the history
…patterns; remove German manual
  • Loading branch information
Elias authored and Elias committed Feb 19, 2024
1 parent a794f5c commit 769bcbe
Show file tree
Hide file tree
Showing 8 changed files with 98 additions and 190 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ helper
dumpster/
*.bat
.vscode/
demo
2 changes: 0 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@

![Python Packaging](https://github.com/joheli/rosinenpicker/workflows/Packaging/badge.svg) ![PyPI](https://img.shields.io/pypi/v/rosinenpicker?label=PyPI) ![PyPI - Downloads](https://img.shields.io/pypi/dm/rosinenpicker)

[Deutsch](README_DE.md)

# Manual

Welcome to `rosinenpicker`! This tool is like a magical sieve that helps you find golden nuggets (or "Rosinen") of information within a mountain of documents. It's designed for anyone who needs to extract specific pieces of information without diving deep into the technicalities.
Expand Down
74 changes: 0 additions & 74 deletions README_DE.md

This file was deleted.

1 change: 1 addition & 0 deletions configs/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,4 @@ strategies:
# # optional: file_content_pattern - a regex pattern that has to return a match in the file contents
# # this can be used to further restrict the selection of processed files
# file_content_pattern: '.*'
#
24 changes: 9 additions & 15 deletions src/rosinenpicker/processors.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,12 @@
import fitz
from pdfminer.high_level import extract_text
import re

class DocumentProcessor:
text: str
matchall_maxlength: int
result: dict[str, str]

def __init__(self, file_path, matchall_maxlength):
def __init__(self, file_path):
self.extract_text(file_path = file_path)
self.matchall_maxlength = matchall_maxlength

def extract_text(self, file_path):
raise NotImplementedError("This method should be implemented by subclasses.")
Expand All @@ -28,12 +26,12 @@ def terms_patterns(self, tap: dict[str, tuple[re.Pattern, int, int]]):
matchall_index = pattern_tpl[1]
number_of_groups = pattern_tpl[2] # also p.groups
content = mo.group(matchall_index)
# in case only two groups present: limit length of matched text
if len(content) > self.matchall_maxlength and number_of_groups == 2:
if matchall_index == 1:
content = content[-self.matchall_maxlength:]
else:
content = content[:self.matchall_maxlength]
# # in case only two groups present: limit length of matched text
# if len(content) > self.matchall_maxlength and number_of_groups == 2:
# if matchall_index == 1:
# content = content[-self.matchall_maxlength:]
# else:
# content = content[:self.matchall_maxlength]
# no groups
else:
# mos: indices of matched text
Expand Down Expand Up @@ -61,11 +59,7 @@ def contains(self, patternstring: str) -> bool:

class PDFProcessor(DocumentProcessor):
def extract_text(self, file_path):
text = ""
with fitz.open(file_path) as doc:
for page in doc:
text += page.get_text()
self.text = text
self.text = extract_text(file_path)

class TXTProcessor(DocumentProcessor):
def extract_text(self, file_path):
Expand Down
106 changes: 9 additions & 97 deletions src/rosinenpicker/pydantic_models.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from pydantic import BaseModel, DirectoryPath, field_validator, model_validator, NewPath
from pydantic import BaseModel, DirectoryPath, field_validator, model_validator, NewPath, StrictBool
from .utils import process_terms, check_regex
from typing import Optional
import re

Expand All @@ -20,43 +21,20 @@ class ConfigStrategy(BaseModel):
export_csv_divider: Optional[str] = ';'
# terms_patterns_group is created from 'terms', see @model_validator
terms_patterns_group: dict[str, tuple[re.Pattern, int, int]] = None
matchall_maxlength: int = 100

@field_validator('file_name_pattern', 'export_format')
@classmethod
def non_empty_string(cls, v: str):
assert v != '', 'Must be a non-empty string'
return v.strip()

# @field_validator('terms')
# @classmethod
# def check_terms(cls, t: dict[str, str]):
# checks = [cls.is_regex(p) for _, p in t.items()]
# if not all(checks):
# raise ConfigError(f"Concerning {t!r}: No regex groups are allowed.")

@classmethod
def compile_regex(cls, p: str) -> re.Pattern:
try:
rgx = re.compile(p)
return rgx
except:
raise ConfigError(f"Concerning pattern '{p}': this string cannot be used as a regex pattern!")

@field_validator('file_name_pattern', 'file_content_pattern')
@classmethod
def selection_must_be_regex(cls, v: str):
v = v.strip()
if not cls.is_regex(v):
if not check_regex(v):
raise ConfigError(f"Pattern '{v}' cannot be used as a regex pattern; also, regex groups are not allowed!")
return v

@model_validator(mode='after')
def check_terms_and_patterns(self):
# process terms_and_patterns
processed_tp = {term:self.process_terms(pattern) for term, pattern in self.terms.items()}
self.terms_patterns_group = processed_tp
return self

@field_validator('export_format')
@classmethod
Expand All @@ -74,78 +52,12 @@ def validate_file_format(cls, ff: str):
raise ConfigError(msg=f"Concerning '{ff}': File format must conform to one of these options: {valid_formats}!")
return ff

@classmethod
def is_regex(cls, patternstring: str) -> bool:
#breakpoint()
try:
rgx = re.compile(patternstring)
# Also, do not allow regex groups
if rgx.groups > 0:
return False
except:
return False
return True

# process_terms
# This function has the following jobs:
# - check if patternstrings can be converted to regex patterns (type re.Pattern)
# - check if patternstrings already contain a "matchall pattern" (.*), as these are not allowed
# - create capture groups if divider is present; if present:
# - check if divider occurs more than once, as this is not allowed
# - replace the divider by a capture group matching all ("matchall pattern")
# - return the index of the (one and only) capture group representing the matchall pattern
# - return the total number of capture groups
# Return value:
# The function returns a tuple of (re.Pattern, int, int) containing the compiled pattern,
# the index of the group containing the (one and only) matchall pattern, and
# the number of capture groups present.
# In case no capture groups have been formed, the second and third integers are set to -1.
def process_terms(cls, patternstring: str, divider: str = "@@@") -> tuple[re.Pattern, int, int]:
# if patternstrings contains groups, reject
if not cls.is_regex(patternstring):
raise ConfigError(f"Concerning '{patternstring}': cannot be used as regex pattern; also, regex groups are not allowed!")
# helper to check if pattern only consists of a matchall pattern
def matchall_only(s) -> bool:
return re.search("\.\*", s) and len(s) == 2
# check if matchall pattern is present (as this is not allowed)
if matchall_only(patternstring):
raise ConfigError(msg=f"The string '{patternstring}' only contains the matchall-pattern '.*' and can therefore not be processed.")
# divider_hits counts the number of divider in the string; only one is allowed (see below)
divider_hits = len(re.findall(divider, patternstring))
# check the number of occurrences of divider
if divider_hits > 1:
# as this is not implemented, throw an error
raise ConfigError(msg=f"Each term must correspond to either *one regex pattern* or *two regex patterns divided by '{divider}'*!")
if divider_hits == 0:
# return without capture groups
return (cls.compile_regex(patternstring), -1, -1)
# process the patternstrings divided by divider
multiple_patternstrings = re.split(divider, patternstring)

# check if patternstring and multiple_patternstrings are valid regex patterns without groups
all_strings = multiple_patternstrings.copy()
all_strings.append(patternstring)
all_check = [cls.is_regex(s) for s in all_strings]
if not all(all_check):
raise ConfigError(f"Concerning one of '{all_strings!r}': cannot be used as regex pattern; also regex groups are not allowed!")

#breakpoint()

# do any of the patternstrings only contain a matchall pattern?
if any([matchall_only(p) for p in multiple_patternstrings]):
raise ConfigError(msg=f"At least one of '{multiple_patternstrings!r}' only consists of a matchall-pattern '.*' and can therefore not be processed.")
# is any of the patternstrings of length 0?
lenx = [len(i) for i in multiple_patternstrings]
lenx0 = [l == 0 for l in lenx]
# if yes
if any(lenx0):
# the first?
if lenx0[0]:
return (cls.compile_regex(f"(.*)({multiple_patternstrings[1]})"), 1, 2)
# the second?
return (cls.compile_regex(f"({multiple_patternstrings[0]})(.*)"), 2, 2)
# none of the patternstrings empty? return three groups
return (cls.compile_regex(f"({multiple_patternstrings[0]})(.*)({multiple_patternstrings[1]})"), 2, 3)
@model_validator(mode='after')
def check_terms_and_patterns(self):
# process terms_and_patterns
processed_tp = {term:process_terms(patternstring=pattern) for term, pattern in self.terms.items()}
self.terms_patterns_group = processed_tp
return self

class Config(BaseModel):
title: str
Expand Down
4 changes: 2 additions & 2 deletions src/rosinenpicker/start.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = '0.1.5'
__version__ = '0.1.6'
import yaml
import re
import os
Expand Down Expand Up @@ -53,7 +53,7 @@ def process_strategy(strategy_name: str, cs: ConfigStrategy, db: Session, run_id

# loop thru documents
for doc in documents:
pr = processor(doc, cs.matchall_maxlength)
pr = processor(doc)
# if file_content_pattern is given and if that pattern is not found in the document, skip the document
if cs.file_content_pattern:
if not pr.contains(cs.file_content_pattern):
Expand Down
76 changes: 76 additions & 0 deletions src/rosinenpicker/utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,83 @@
import hashlib
import re

class PatternStringError(Exception):
    """Raised when a pattern string cannot be processed into a regex.

    The caller-supplied ``msg`` is appended to a fixed introductory line.
    The combined text is stored in ``self.message`` and is also the
    exception's string representation.
    """

    def __init__(self, msg):
        # Introductory line shown before every specific problem description
        # (spelling corrected: "patterstring" -> "patternstring").
        message_primer = "There appears to be a problem with the patternstring:\n"
        self.message = message_primer + msg
        super().__init__(self.message)

def file_sha256(file_name: str) -> str:
    """Return the SHA-256 hex digest of the contents of *file_name*.

    The file is opened in binary mode and fed to the hash in fixed-size
    chunks, so the whole file never has to be held in memory at once.

    Args:
        file_name: path of the file to hash.

    Returns:
        The digest as a lowercase hexadecimal string.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    digest = hashlib.sha256()
    with open(file_name, "rb") as f:
        # read() returns b"" at end of file, which terminates the loop
        while chunk := f.read(1024 * 1024):
            digest.update(chunk)
    return digest.hexdigest()

def check_regex(patternstring: str) -> bool:
    """Tell whether *patternstring* is an acceptable regex pattern.

    A pattern is acceptable when it compiles with ``re.compile`` and
    defines no capture groups. Non-capturing groups such as ``(?:...)``
    do not count, because they do not increase ``Pattern.groups``.

    Args:
        patternstring: the candidate pattern.

    Returns:
        True if the pattern compiles and has zero capture groups,
        otherwise False.
    """
    try:
        rgx = re.compile(patternstring)
    except (re.error, TypeError, ValueError, OverflowError, RecursionError):
        # Only failures that mean "this cannot be compiled as a pattern" are
        # turned into False; KeyboardInterrupt/SystemExit still propagate.
        return False
    # Also, do not allow regex groups
    return rgx.groups == 0

def process_terms(patternstring: str, divider: str = "@@@", rflag: re.RegexFlag = re.NOFLAG) -> tuple[re.Pattern, int, int]:
    """Compile a term's pattern string, expanding the divider into a match-all group.

    This function has the following jobs:

    - check that the pattern string can be compiled to a regex and does not
      already contain capture groups (see ``check_regex``);
    - reject a pattern string (or a part of it) that consists solely of the
      match-all pattern ``.*``;
    - if ``divider`` is present:
        - reject more than one occurrence, as this is not implemented;
        - replace the divider by a capture group matching everything
          (``(.*)``) and wrap each non-empty neighbouring part in its own
          capture group.

    Args:
        patternstring: one regex pattern, or two regex patterns separated by
            ``divider``; either side of the divider may be empty.
        divider: marker separating the two patterns. It is matched literally.
        rflag: regex flags passed to ``re.compile`` for the returned pattern.

    Returns:
        A tuple ``(pattern, matchall_index, number_of_groups)`` holding the
        compiled pattern, the index of the (one and only) capture group that
        represents the match-all pattern, and the total number of capture
        groups. When no divider is present no capture groups are formed and
        both integers are -1.

    Raises:
        PatternStringError: if the pattern string or one of its parts cannot
            be used as a regex, contains capture groups, consists only of
            ``.*``, or if the divider occurs more than once.
    """
    # if patternstrings contains groups, reject
    if not check_regex(patternstring):
        raise PatternStringError(f"Concerning '{patternstring}': cannot be used as regex pattern; also, regex groups are not allowed!")

    # helper to check if pattern only consists of a matchall pattern
    def matchall_only(s: str) -> bool:
        return s == ".*"

    # check if matchall pattern is present (as this is not allowed)
    if matchall_only(patternstring):
        raise PatternStringError(msg=f"The string '{patternstring}' only contains the matchall-pattern '.*' and can therefore not be processed.")

    # The divider is a literal marker, not a regex: escape it so that a custom
    # divider containing metacharacters (e.g. '|||') is counted and split
    # correctly.
    literal_divider = re.escape(divider)

    # divider_hits counts the number of divider in the string; only one is allowed (see below)
    divider_hits = len(re.findall(literal_divider, patternstring))
    if divider_hits > 1:
        # as this is not implemented, throw an error
        raise PatternStringError(msg=f"Each term must correspond to either *one regex pattern* or *two regex patterns divided by '{divider}'*!")
    if divider_hits == 0:
        # return without capture groups
        return (re.compile(patternstring, rflag), -1, -1)

    # process the patternstrings divided by divider
    multiple_patternstrings = re.split(literal_divider, patternstring)

    # check if patternstring and multiple_patternstrings are valid regex patterns without groups;
    # a part can be invalid even though the whole string compiles (e.g. '[a@@@b]')
    all_strings = multiple_patternstrings.copy()
    all_strings.append(patternstring)
    if not all(check_regex(s) for s in all_strings):
        raise PatternStringError(f"Concerning one of '{all_strings!r}': cannot be used as regex pattern; also regex groups are not allowed!")

    # do any of the patternstrings only contain a matchall pattern?
    if any(matchall_only(p) for p in multiple_patternstrings):
        raise PatternStringError(msg=f"At least one of '{multiple_patternstrings!r}' only consists of a matchall-pattern '.*' and can therefore not be processed.")

    # exactly one divider hit means exactly two parts
    before, after = multiple_patternstrings
    # nothing in front of the divider: match-all is the first of two groups
    if not before:
        return (re.compile(f"(.*)({after})", rflag), 1, 2)
    # nothing behind the divider: match-all is the second of two groups
    if not after:
        return (re.compile(f"({before})(.*)", rflag), 2, 2)
    # none of the patternstrings empty? return three groups
    return (re.compile(f"({before})(.*)({after})", rflag), 2, 3)

0 comments on commit 769bcbe

Please sign in to comment.