feat: init

telostat · Feb 16, 2021 · 21d663b · 21d663b
commit 21d663b
Show file tree

Hide file tree

Showing 3 changed files with 139 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1 @@
+/tmp
diff --git a/README.md b/README.md
@@ -0,0 +1,23 @@
+# REMAP DEMAIL File Classifier
+
+`remap-demail-classifer` is a command line application written in
+Python3. It classifies fetched DEMAIL attachment files using a regex
+specification.
+
+The application has no requirements other than Python 3.6 or newer;
+it uses only the standard library.
+
+CLI arguments are as follows:
+
+```
+./remap-demail-classifer <SPEC-FILE> <PREFIX-REGEX> <IGNORE-FILE> <DEMAIL-DIR>
+```
+
+Example:
+
+```
+./remap-demail-classifer spec.csv \
+	'^(?P<recdate>[0-9]{4}\-[0-9]{2}\-[0-9]{2})T[0-9]{2}:[0-9]{2}:[0-9]{2}Z_[A-Z0-9]{32}_[A-Z0-9]{32}_' \
+	tmp/ignore.dat \
+	/data/remap/tenants/deployment/demail/downloaded
+```
diff --git a/remap-demail-classifer b/remap-demail-classifer
@@ -0,0 +1,115 @@
+#!/usr/bin/env python3
+
+import csv
+import re
+import sys
+from pathlib import Path
+from typing import Iterable, List, NamedTuple, Optional, Pattern
+
+
+#: Application version.
+__version__ = "0.0.1.dev0"
+
+
+class Spec(NamedTuple):
+    """
+    A single classification rule.
+
+    Bundles the pattern that recognises an original filename with the
+    template and the sub directory used to build the target path.
+    """
+
+    #: Compiled regular expression for the original filename.
+    pattern: Pattern
+
+    #: Template of the target filename.
+    template: str
+
+    #: Sub directory in which the target file is placed.
+    subdir: Path
+
+
+def classify(parent: Path, specs: List[Spec], path: Path) -> Optional[Path]:
+    """
+    Attempts to classify the given path as per given specifications.
+
+    The trimmed filename of ``path`` is matched against the pattern of
+    each spec in the given order. The first spec that matches wins: its
+    template is filled in with the named groups of the match and the
+    resulting filename is placed under ``parent / spec.subdir``.
+
+    :param parent: Directory under which target paths are built.
+    :param specs: Specifications to try, in order of precedence.
+    :param path: Path of the file to classify; only its name is used.
+    :return: Target path as per the first matching spec, ``None`` if no
+        spec matches.
+    :raises ValueError: If the template of the matching spec can not be
+        filled in from the named groups of its pattern.
+    """
+    ## Get and trim filename:
+    filename = path.name.strip()
+
+    ## Iterate over specs:
+    for spec in specs:
+        ## Attempt to match, skip the spec if it does not match:
+        match = spec.pattern.match(filename)
+        if match is None:
+            continue
+
+        ## Attempt to compile the target filename. A template that refers
+        ## to a field which the pattern does not capture is an error in
+        ## the specification: Fail with a message that says so instead of
+        ## carrying on without a target filename.
+        try:
+            newfilename = spec.template.format(**match.groupdict())
+        except (KeyError, IndexError) as err:
+            raise ValueError(
+                f"Template {spec.template!r} can not be filled in from the named "
+                f"groups of pattern {spec.pattern.pattern!r} ({err!r}). Path: {path}"
+            ) from err
+
+        ## Build and return the target path:
+        return parent / spec.subdir / newfilename
+
+    ## We do not have any match, return None:
+    return None
+
+
+def compile_specs(path: Path, prefix: str) -> List[Spec]:
+    """
+    Reads given CSV file and compiles specs.
+
+    The first row of the file is the header. It must provide the
+    ``pattern``, ``template`` and ``directory`` columns. The pattern of
+    each row is prepended with ``prefix`` before it is compiled.
+
+    :param path: Path to the CSV specification file.
+    :param prefix: Regular expression prepended to every pattern.
+    :return: Specs, in the order of the rows of the file.
+    :raises KeyError: If the file lacks one of the required columns.
+    :raises re.error: If a prefixed pattern is not a valid regular
+        expression.
+    """
+    ## Open the file. The csv module wants its files opened with
+    ## newline="" so that it can deal with line endings itself:
+    with path.open(newline="") as cfile:
+        ## Read the CSV, compile patterns and build specs:
+        specs = [
+            Spec(
+                re.compile(prefix + row["pattern"]),
+                row["template"],
+                Path(row["directory"]),
+            )
+            for row in csv.DictReader(cfile)
+        ]
+
+    ## Return specs:
+    return specs
+
+
+def main(
+    specfile: Path, prefix: str, ignorepath: Path, parent: Path, paths: Iterable[Path]
+) -> None:
+    """
+    Runs the classification and prints the outcome for each path.
+
+    Paths whose name is listed in the ignore file are reported as
+    ignored. Every other path must be classified by one of the specs,
+    otherwise an exception is raised and the run stops.
+    """
+    ## Compile the specifications first:
+    specs = compile_specs(specfile, prefix)
+
+    ## Collect names of ignored files, one per non-blank line:
+    ignores = set()
+    for line in ignorepath.read_text().split("\n"):
+        name = line.strip()
+        if name:
+            ignores.add(name)
+
+    ## Go over the paths one by one:
+    for path in paths:
+        if path.name not in ignores:
+            ## Not ignored, it must be classified:
+            target = classify(parent, specs, path)
+            if target is None:
+                raise Exception(f"Path could not be classified. Filename: {path.name}")
+
+            ## Report the mapping:
+            print(f"{path} -> {target}")
+        else:
+            ## Ignored, report and move on:
+            print(f"IGNORE: {path}")
+
+
+if __name__ == "__main__":
+    ## All four positional arguments are required. Exit with the usage
+    ## (on stderr, non-zero status) instead of an IndexError traceback:
+    if len(sys.argv) < 5:
+        sys.exit(
+            f"Usage: {sys.argv[0]} <SPEC-FILE> <PREFIX-REGEX> <IGNORE-FILE> <DEMAIL-DIR>"
+        )
+
+    main(
+        Path(sys.argv[1]),
+        sys.argv[2],
+        Path(sys.argv[3]),
+        ## Target paths are built under this fixed directory:
+        Path("/files/"),
+        ## Only regular files directly under the DEMAIL directory:
+        (i for i in Path(sys.argv[4]).iterdir() if i.is_file()),
+    )