Use pdfminer.six instead of PyMuPDF; preparations to use options for patterns; remove German manual
joheli · Feb 19, 2024 · 769bcbe · 769bcbe
1 parent a794f5c
commit 769bcbe
Show file tree

Hide file tree

Showing 8 changed files with 98 additions and 190 deletions.
diff --git a/.gitignore b/.gitignore
@@ -9,3 +9,4 @@ helper
 dumpster/
 *.bat
 .vscode/
+demo
diff --git a/README.md b/README.md
@@ -2,8 +2,6 @@
 
 ![Python Packaging](https://github.com/joheli/rosinenpicker/workflows/Packaging/badge.svg) ![PyPI](https://img.shields.io/pypi/v/rosinenpicker?label=PyPI) ![PyPI - Downloads](https://img.shields.io/pypi/dm/rosinenpicker)
 
-[Deutsch](README_DE.md)
-
 # Manual
 
 Welcome to `rosinenpicker`! This tool is like a magical sieve that helps you find golden nuggets (or "Rosinen") of information within a mountain of documents. It's designed for anyone who needs to extract specific pieces of information without diving deep into the technicalities.

diff --git a/README_DE.md b/README_DE.md
diff --git a/configs/config.yml b/configs/config.yml
@@ -43,3 +43,4 @@ strategies:
  # # optional: file_content_pattern - a regex pattern that has to return a match in the file contents
  # # this can be used to further restrict the selection of processed files 
  # file_content_pattern: '.*'
+ #
diff --git a/src/rosinenpicker/processors.py b/src/rosinenpicker/processors.py
@@ -1,14 +1,12 @@
-import fitz
+from pdfminer.high_level import extract_text
 import re
 
 class DocumentProcessor:
  text: str
- matchall_maxlength: int
  result: dict[str, str]
 
- def __init__(self, file_path, matchall_maxlength):
+ def __init__(self, file_path):
  self.extract_text(file_path = file_path)
- self.matchall_maxlength = matchall_maxlength
 
  def extract_text(self, file_path):
  raise NotImplementedError("This method should be implemented by subclasses.")
@@ -28,12 +26,12 @@ def terms_patterns(self, tap: dict[str, tuple[re.Pattern, int, int]]):
  matchall_index = pattern_tpl[1]
  number_of_groups = pattern_tpl[2] # also p.groups
  content = mo.group(matchall_index)
- # in case only two groups present: limit length of matched text
- if len(content) > self.matchall_maxlength and number_of_groups == 2:
- if matchall_index == 1:
- content = content[-self.matchall_maxlength:]
- else:
- content = content[:self.matchall_maxlength]
+ # # in case only two groups present: limit length of matched text
+ # if len(content) > self.matchall_maxlength and number_of_groups == 2:
+ #  if matchall_index == 1:
+ #  content = content[-self.matchall_maxlength:]
+ #  else:
+ #  content = content[:self.matchall_maxlength]
  # no groups
  else:
  # mos: indices of matched text
@@ -61,11 +59,7 @@ def contains(self, patternstring: str) -> bool:
 
 class PDFProcessor(DocumentProcessor):
  def extract_text(self, file_path):
- text = ""
- with fitz.open(file_path) as doc:
- for page in doc:
- text += page.get_text()
- self.text = text
+ self.text = extract_text(file_path)
 
 class TXTProcessor(DocumentProcessor):
  def extract_text(self, file_path):

diff --git a/src/rosinenpicker/pydantic_models.py b/src/rosinenpicker/pydantic_models.py
@@ -1,4 +1,5 @@
-from pydantic import BaseModel, DirectoryPath, field_validator, model_validator, NewPath
+from pydantic import BaseModel, DirectoryPath, field_validator, model_validator, NewPath, StrictBool
+from .utils import process_terms, check_regex
 from typing import Optional
 import re
 
@@ -20,43 +21,20 @@ class ConfigStrategy(BaseModel):
  export_csv_divider: Optional[str] = ';'
  # terms_patterns_group is created from 'terms', see @model_validator
  terms_patterns_group: dict[str, tuple[re.Pattern, int, int]] = None
- matchall_maxlength: int = 100
 
  @field_validator('file_name_pattern', 'export_format')
  @classmethod
  def non_empty_string(cls, v: str):
  assert v != '', 'Must be a non-empty string'
  return v.strip()
-
- # @field_validator('terms')
- # @classmethod
- # def check_terms(cls, t: dict[str, str]):
- # checks = [cls.is_regex(p) for _, p in t.items()]
- # if not all(checks):
- # raise ConfigError(f"Concerning {t!r}: No regex groups are allowed.")
-
- @classmethod
- def compile_regex(cls, p: str) -> re.Pattern:
- try:
- rgx = re.compile(p)
- return rgx
- except:
- raise ConfigError(f"Concerning pattern '{p}': this string cannot be used as a regex pattern!")
 
  @field_validator('file_name_pattern', 'file_content_pattern')
  @classmethod
  def selection_must_be_regex(cls, v: str):
  v = v.strip()
- if not cls.is_regex(v): 
+ if not check_regex(v): 
  raise ConfigError(f"Pattern '{v}' cannot be used as a regex pattern; also, regex groups are not allowed!")
  return v
-
- @model_validator(mode='after')
- def check_terms_and_patterns(self):
- # process terms_and_patterns 
- processed_tp = {term:self.process_terms(pattern) for term, pattern in self.terms.items()}
- self.terms_patterns_group = processed_tp
- return self
 
  @field_validator('export_format')
  @classmethod
@@ -74,78 +52,12 @@ def validate_file_format(cls, ff: str):
  raise ConfigError(msg=f"Concerning '{ff}': File format must conform to one of these options: {valid_formats}!")
  return ff
 
- @classmethod
- def is_regex(cls, patternstring: str) -> bool:
- #breakpoint()
- try: 
- rgx = re.compile(patternstring)
- # Also, do not allow regex groups
- if rgx.groups > 0:
- return False
- except:
- return False
- return True
-
- # process_terms
- # This function has the following jobs:
- # - check if patternstrings can be converted to regex patterns (type re.Pattern)
- # - check if patternstrings already contain a "matchall pattern" (.*), as these are not allowed 
- # - create capture groups if divider is present; if present:
- # - check if divider occurs more than once, as this is not allowed
- # - replace the divider by a capture group matching all ("matchall pattern")
- # - return the index of the (one and only) capture group representing the matchall pattern
- # - return the total number of capture groups
- # Return value:
- # The function returns a tuple of (re.Pattern, int, int) containing the compiled pattern,
- # the index of the group containing the (one and only) matchall pattern, and
- # the number of capture groups present.
- # In case no capture groups have been formed, the second and third integers are set to -1.
- def process_terms(cls, patternstring: str, divider: str = "@@@") -> tuple[re.Pattern, int, int]:
- # if patternstrings contains groups, reject
- if not cls.is_regex(patternstring):
- raise ConfigError(f"Concerning '{patternstring}': cannot be used as regex pattern; also, regex groups are not allowed!")
- # helper to check if pattern only consists of a matchall pattern
- def matchall_only(s) -> bool:
- return re.search("\.\*", s) and len(s) == 2
- # check if matchall pattern is present (as this is not allowed)
- if matchall_only(patternstring):
- raise ConfigError(msg=f"The string '{patternstring}' only contains the matchall-pattern '.*' and can therefore not be processed.")
- # divider_hits counts the number of divider in the string; only one is allowed (see below)
- divider_hits = len(re.findall(divider, patternstring))
- # check the number of occurrences of divider
- if divider_hits > 1:
- # as this is not implemented, throw an error
- raise ConfigError(msg=f"Each term must correspond to either *one regex pattern* or *two regex patterns divided by '{divider}'*!")
- if divider_hits == 0:
- # return without capture groups
- return (cls.compile_regex(patternstring), -1, -1)
- # process the patternstrings divided by divider
- multiple_patternstrings = re.split(divider, patternstring)
-
- # check if patternstring and multiple_patternstrings are valid regex patterns without groups
- all_strings = multiple_patternstrings.copy()
- all_strings.append(patternstring)
- all_check = [cls.is_regex(s) for s in all_strings]
- if not all(all_check):
- raise ConfigError(f"Concerning one of '{all_strings!r}': cannot be used as regex pattern; also regex groups are not allowed!")
-
- #breakpoint()
-
- # do any of the patternstrings only contain a matchall pattern?
- if any([matchall_only(p) for p in multiple_patternstrings]):
- raise ConfigError(msg=f"At least one of '{multiple_patternstrings!r}' only consists of a matchall-pattern '.*' and can therefore not be processed.")
- # is any of the patternstrings of length 0?
- lenx = [len(i) for i in multiple_patternstrings]
- lenx0 = [l == 0 for l in lenx]
- # if yes
- if any(lenx0):
- # the first?
- if lenx0[0]:
- return (cls.compile_regex(f"(.*)({multiple_patternstrings[1]})"), 1, 2)
- # the second?
- return (cls.compile_regex(f"({multiple_patternstrings[0]})(.*)"), 2, 2)
- # none of the patternstrings empty? return three groups
- return (cls.compile_regex(f"({multiple_patternstrings[0]})(.*)({multiple_patternstrings[1]})"), 2, 3)
+ @model_validator(mode='after')
+ def check_terms_and_patterns(self):
+ # process terms_and_patterns 
+ processed_tp = {term:process_terms(patternstring=pattern) for term, pattern in self.terms.items()}
+ self.terms_patterns_group = processed_tp
+ return self
 
 class Config(BaseModel):
  title: str

diff --git a/src/rosinenpicker/start.py b/src/rosinenpicker/start.py
@@ -1,4 +1,4 @@
-__version__ = '0.1.5'
+__version__ = '0.1.6'
 import yaml
 import re
 import os
@@ -53,7 +53,7 @@ def process_strategy(strategy_name: str, cs: ConfigStrategy, db: Session, run_id
 
  # loop thru documents
  for doc in documents:
- pr = processor(doc, cs.matchall_maxlength)
+ pr = processor(doc)
  # if file_content_pattern is given and if that pattern is not found in the document, skip the document
  if cs.file_content_pattern:
  if not pr.contains(cs.file_content_pattern):

diff --git a/src/rosinenpicker/utils.py b/src/rosinenpicker/utils.py
@@ -1,7 +1,83 @@
 import hashlib
+import re
+
class PatternStringError(Exception):
    """Raised when a pattern string cannot be turned into a usable regex.

    The text passed as *msg* is appended to a fixed introductory line; the
    combined text is stored in ``self.message`` and is also what ``str()``
    returns for the exception.
    """

    def __init__(self, msg):
        # Fixed spelling of "patternstring" in the user-facing primer.
        message_primer = "There appears to be a problem with the patternstring:\n"
        self.message = message_primer + msg
        super().__init__(self.message)
 
 def file_sha256(file_name: str) -> str:
  with open(file_name, "rb") as f:
  bytes = f.read()
  hex_hash = hashlib.sha256(bytes).hexdigest()
  return hex_hash
+
def check_regex(patternstring: str) -> bool:
    """Tell whether *patternstring* is an acceptable regex pattern.

    A pattern is acceptable when ``re.compile`` succeeds on it and the
    compiled pattern contains no capture groups (non-capturing groups such
    as ``(?:...)`` do not count as groups).

    Returns:
        True if the pattern compiles and has zero capture groups,
        False otherwise. This function never raises for a bad pattern.
    """
    try:
        compiled = re.compile(patternstring)
    except (re.error, TypeError, OverflowError, RecursionError):
        # Only failures of the compile itself mean "not a regex"; things
        # like KeyboardInterrupt must not be swallowed here.
        return False
    # Also, do not allow regex groups
    return compiled.groups == 0
+
def process_terms(patternstring: str, divider: str = "@@@", rflag: re.RegexFlag = re.RegexFlag(0)) -> tuple[re.Pattern, int, int]:
    """Compile a term's pattern string, expanding the divider into a match-all group.

    This function has the following jobs:
    - check that the pattern string (and, if a divider is present, each part
      around it) can be compiled to a regex and contains no capture groups
    - reject strings that consist only of the match-all pattern ``.*``
    - if the divider is present:
        - reject more than one occurrence of the divider
        - replace the divider by a capture group matching all (``(.*)``) and
          wrap the non-empty parts around it in capture groups of their own

    Args:
        patternstring: regex pattern, optionally containing *divider* once.
        divider: literal marker separating the two sub-patterns. It is
            matched as plain text, never interpreted as a regex.
        rflag: flags handed to ``re.compile`` for the returned pattern. The
            default ``re.RegexFlag(0)`` is the "no flags" value (identical
            to ``re.NOFLAG`` where that name exists).

    Returns:
        A tuple ``(pattern, matchall_index, number_of_groups)`` containing
        the compiled pattern, the index of the (one and only) capture group
        holding the match-all part, and the number of capture groups.
        If no divider is present no groups are formed and both integers
        are -1.

    Raises:
        PatternStringError: if any of the checks above fails, if the divider
            is empty, or if the string consists of the divider alone.
    """
    if not divider:
        raise PatternStringError(msg="The divider must be a non-empty string.")
    # if patternstrings contains groups, reject
    if not check_regex(patternstring):
        raise PatternStringError(f"Concerning '{patternstring}': cannot be used as regex pattern; also, regex groups are not allowed!")

    # helper to check if pattern only consists of a matchall pattern
    def matchall_only(s: str) -> bool:
        return s == ".*"

    # check if matchall pattern is present (as this is not allowed)
    if matchall_only(patternstring):
        raise PatternStringError(msg=f"The string '{patternstring}' only contains the matchall-pattern '.*' and can therefore not be processed.")
    # The divider is a literal marker: count and split it as plain text so
    # that dividers containing regex metacharacters behave as written.
    divider_hits = patternstring.count(divider)
    if divider_hits > 1:
        # as this is not implemented, throw an error
        raise PatternStringError(msg=f"Each term must correspond to either *one regex pattern* or *two regex patterns divided by '{divider}'*!")
    if divider_hits == 0:
        # return without capture groups
        return (re.compile(patternstring, rflag), -1, -1)

    # exactly one divider: two parts, either of which may be empty
    multiple_patternstrings = patternstring.split(divider)
    before, after = multiple_patternstrings

    # each part must be a valid, group-free regex on its own, because it is
    # wrapped in its own capture group below
    all_strings = multiple_patternstrings + [patternstring]
    if not all(check_regex(s) for s in all_strings):
        raise PatternStringError(f"Concerning one of '{all_strings!r}': cannot be used as regex pattern; also regex groups are not allowed!")

    # do any of the patternstrings only contain a matchall pattern?
    if any(matchall_only(p) for p in multiple_patternstrings):
        raise PatternStringError(msg=f"At least one of '{multiple_patternstrings!r}' only consists of a matchall-pattern '.*' and can therefore not be processed.")
    # A bare divider would expand to '(.*)()', i.e. a match-all pattern in
    # disguise; reject it like the explicit '.*' case above.
    if not before and not after:
        raise PatternStringError(msg=f"The string '{patternstring}' only consists of the divider '{divider}' and can therefore not be processed.")
    # nothing before the divider: match-all group comes first
    if not before:
        return (re.compile(f"(.*)({after})", rflag), 1, 2)
    # nothing after the divider: match-all group comes second
    if not after:
        return (re.compile(f"({before})(.*)", rflag), 2, 2)
    # none of the patternstrings empty? return three groups
    return (re.compile(f"({before})(.*)({after})", rflag), 2, 3)