clean up utils

joheli · Feb 28, 2024 · d48a85c · d48a85c
1 parent 7244b78
commit d48a85c
Show file tree

Hide file tree

Showing 3 changed files with 6 additions and 81 deletions.
diff --git a/README.md b/README.md
@@ -55,12 +55,12 @@ Now of course it is not very useful to just extract the term "apple pie" out of
 
 #### Even further fine-grained control
 You *can* (i.e. you don't have to) even add more fine-grained control by appending characters after the string '===' (three equal signs):
-  - `m` (**m**ultiline) will allow multiline pattern matching (default: off)
-  - `l` (**l**inebreak to space) will replace linebreaks with space (only applies for multiline matching, default: off)
-  - `c(x)` (**c**rop length to x) will crop the length of the returned string to x (default: off)
-  - `?` (optional term) will mark the term as not optional (default: off, i.e. without the question mark the term is assumed to be required); optional key `move_to_directory` (see [sample configuration file](configs/config.yml)) will ignore these terms.
+  - `m` (**m**ultiline) will allow multiline pattern matching (default: `off`)
+  - `l` (**l**inebreak to space) will replace linebreaks with space (only applies for multiline matching, default: `off`)
+  - `c(x)` (**c**rop length to x) will crop the length of the returned string to x (default: `off`)
+  - `?` will mark the term as *optional* (default: `off`, i.e. without the question mark the term is assumed to be *required*); if set, optional key `move_to_directory` (see [sample configuration file](configs/config.yml)) will ignore this term.
 
-You can use one of above options or several together; the order doesn't count, the main thing is that the option is represented by above flags. So e.g. the term `start@@@finish===mc(100)l?` would search for text between pattern "start" and "finish" over multiple lines, replace line breaks with space, crop the returned text to 100 characters, and mark the term as optional (i.e. not required); it could also have been written as `start@@@finish===lc(100)?m` (i.e. flag order is up to you)!
+You can use one of the above options in isolation or several of them in tandem; the order doesn't matter, the main thing is that each option is represented by one of the above flags. So e.g. the term `start@@@finish===mc(100)l?` would search for text between pattern "start" and "finish" over multiple lines, replace line breaks with space, crop the returned text to 100 characters, and mark the term as optional (i.e. not required); it could equally have been written as `start@@@finish===lc(100)?m` (i.e. flag order is up to you)!
 
 ## Using `rosinenpicker`
 

diff --git a/src/rosinenpicker/start.py b/src/rosinenpicker/start.py
@@ -1,4 +1,4 @@
-__version__ = '0.1.15'
+__version__ = '0.1.16'
 # see content of __init__.py
 import os
 import sys

diff --git a/src/rosinenpicker/utils/utils.py b/src/rosinenpicker/utils/utils.py
@@ -1,83 +1,8 @@
 import hashlib
 import re
 
-class PatternStringError(Exception):
-    def __init__(self, msg):
-        message_primer = "There appears to be a problem with the patterstring:\n"
-        self.message = message_primer + msg
-        super().__init__(self.message)
-
 def file_sha256(file_name: str) -> str:
     """Return the hexadecimal SHA-256 digest of a file's contents.

     The file is opened in binary mode and fed to the hash in fixed-size
     chunks, so memory use stays bounded regardless of file size.

     Args:
         file_name: Path of the file to hash.

     Returns:
         The SHA-256 digest as a lowercase hexadecimal string.

     Raises:
         OSError: If the file cannot be opened or read.
     """
     chunk_size = 65536
     digest = hashlib.sha256()
     with open(file_name, "rb") as f:
         while True:
             chunk = f.read(chunk_size)
             # An empty read signals end of file.
             if not chunk:
                 break
             digest.update(chunk)
     return digest.hexdigest()
-
-def check_regex(patternstring: str) -> bool:
-    try: 
-        rgx = re.compile(patternstring)
-        # Also, do not allow regex groups
-        if rgx.groups > 0:
-            return False
-    except:
-        return False
-    return True
-
-# process_terms
-# This function has the following jobs:
-#   - check if patternstrings can be converted to regex patterns (type re.Pattern)
-#   - check if patternstrings already contain a "matchall pattern" (.*), as these are not allowed 
-#   - create capture groups if divider is present; if present:
-#       - check if divider occurs more than once, as this is not allowed
-#       - replace the divider by a capture group matching all ("matchall pattern")
-#       - return the index of the (one and only) capture group representing the matchall pattern
-#       - return the total number of capture groups
-# Return value:
-#   The function returns a tuple of (re.Pattern, int, int) containing the compiled pattern,
-#   the index of the group containing the (one and only) matchall pattern, and
-#   the number of capture groups present.
-#   In case no capture groups have been formed, the second and third integers are set to -1.
-def process_terms(patternstring: str, divider: str = "@@@", rflag: re.RegexFlag = re.NOFLAG) -> tuple[re.Pattern, int, int]:
-    # if patternstrings contains groups, reject
-    if not check_regex(patternstring):
-        raise PatternStringError(f"Concerning '{patternstring}': cannot be used as regex pattern; also, regex groups are not allowed!")
-    # helper to check if pattern only consists of a matchall pattern
-    def matchall_only(s) -> bool:
-        return re.search("\.\*", s) and len(s) == 2
-    # check if matchall pattern is present (as this is not allowed)
-    if matchall_only(patternstring):
-        raise PatternStringError(msg=f"The string '{patternstring}' only contains the matchall-pattern '.*' and can therefore not be processed.")
-    # divider_hits counts the number of divider in the string; only one is allowed (see below)
-    divider_hits = len(re.findall(divider, patternstring))
-    # check the number of occurrences of divider
-    if divider_hits > 1:
-        # as this is not implemented, throw an error
-        raise PatternStringError(msg=f"Each term must correspond to either *one regex pattern* or *two regex patterns divided by '{divider}'*!")
-    if divider_hits == 0:
-        # return without capture groups
-        return (re.compile(patternstring, rflag), -1, -1)
-    # process the patternstrings divided by divider
-    multiple_patternstrings = re.split(divider, patternstring)
-
-    # check if patternstring and multiple_patternstrings are valid regex patterns without groups
-    all_strings = multiple_patternstrings.copy()
-    all_strings.append(patternstring)
-    all_check = [check_regex(s) for s in all_strings]
-    if not all(all_check):
-        raise PatternStringError(f"Concerning one of '{all_strings!r}': cannot be used as regex pattern; also regex groups are not allowed!")
-
-    # do any of the patternstrings only contain a matchall pattern?
-    if any([matchall_only(p) for p in multiple_patternstrings]):
-        raise PatternStringError(msg=f"At least one of '{multiple_patternstrings!r}' only consists of a matchall-pattern '.*' and can therefore not be processed.")
-    # is any of the patternstrings of length 0?
-    lenx = [len(i) for i in multiple_patternstrings]
-    lenx0 = [l == 0 for l in lenx]
-    # if yes
-    if any(lenx0):
-        # the first?
-        if lenx0[0]:
-            return (re.compile(f"(.*)({multiple_patternstrings[1]})", rflag), 1, 2)
-        # the second?
-        return (re.compile(f"({multiple_patternstrings[0]})(.*)", rflag), 2, 2)
-    # none of the patternstrings empty? return three groups
-    return (re.compile(f"({multiple_patternstrings[0]})(.*)({multiple_patternstrings[1]})", rflag), 2, 3)