#!/usr/bin/env python3
## Origin: commit 21d663b2a1c93bdd6cebe596564c59509c1e11e6 ("feat: init") by
## Vehbi Sinan Tunalioglu, Tue, 16 Feb 2021 14:36:49 +0800. The same commit
## added README.md (reproduced in the module docstring below) and a
## .gitignore containing the single entry `/tmp`.
r"""
REMAP DEMAIL File Classifier.

``remap-demail-classifer`` is a command line application written in
Python3. It classifies fetched DEMAIL attachment files using a regex
specification.

There are no specific requirements for the application to run other
than ``>= Python3.6``.

CLI arguments are as follows::

    ./remap-demail-classifer <spec-csv> <prefix-regex> <ignore-file> <input-dir>

``<spec-csv>``
    CSV file with a header row and the columns ``pattern``, ``template``
    and ``directory``.
``<prefix-regex>``
    Regular expression prepended to every ``pattern`` before compilation.
``<ignore-file>``
    Text file listing one filename per line; files with these names are
    reported as ignored and not classified.
``<input-dir>``
    Directory whose regular files (non-recursive) are classified.

The program only prints the proposed ``source -> target`` mapping; it
does not move or copy any file.

Example (the group name ``date`` is illustrative; named groups must match
the ``{placeholders}`` used in the templates of the spec file)::

    ./remap-demail-classifer spec.csv \
      '^(?P<date>[0-9]{4}\-[0-9]{2}\-[0-9]{2})T[0-9]{2}:[0-9]{2}:[0-9]{2}Z_[A-Z0-9]{32}_[A-Z0-9]{32}_' \
      tmp/ignore.dat \
      /data/remap/tenants/deployment/demail/downloaded
"""

import csv
import re
import sys
from pathlib import Path
from typing import Iterable, List, NamedTuple, Optional, Pattern


#: Application version.
__version__ = "0.0.1.dev0"

#: Parent directory under which the CLI builds target paths.
DEFAULT_PARENT = Path("/files/")


class ClassificationError(Exception):
    """
    Raised when a file matches none of the specifications.
    """


class Spec(NamedTuple):
    """
    Pattern-Template specification.
    """

    #: Original filename pattern.
    pattern: Pattern

    #: Target filename template.
    template: str

    #: Sub directory for the target file.
    subdir: Path


def classify(parent: Path, specs: List[Spec], path: Path) -> Optional[Path]:
    """
    Attempts to classify the given path as per given patterns.

    Specs are tried in order and the first one whose pattern matches the
    (whitespace-stripped) filename wins. Named groups of the match are
    substituted into the template of the spec to build the new filename.

    :param parent: Directory under which the target path is built.
    :param specs: Specifications to try, in order of precedence.
    :param path: Path of the file to classify; only its name is inspected.
    :returns: ``parent / spec.subdir / <formatted template>`` for the first
        matching spec, or ``None`` if no spec matches.
    :raises ValueError: If the template of the matching spec refers to a
        field which the pattern does not provide.
    """
    ## Get and trim filename:
    filename = path.name.strip()

    ## Iterate over specs:
    for spec in specs:
        ## Attempt to match, skip to the next spec otherwise:
        match = spec.pattern.match(filename)
        if not match:
            continue

        ## Attempt to compile the target filename. A template referring to
        ## an unknown named field raises KeyError, a positional field
        ## (`{}` or `{0}`) raises IndexError as we only pass keywords. Both
        ## are errors in the specification, so we fail loudly instead of
        ## carrying on without a filename:
        try:
            newfilename = spec.template.format(**match.groupdict())
        except (KeyError, IndexError) as err:
            raise ValueError(
                f"Cannot build target filename from template {spec.template!r} "
                f"(pattern {spec.pattern.pattern!r}): {err!r}. Path: {path}"
            ) from err

        ## Build and return the target path:
        return parent / spec.subdir / newfilename

    ## We do not have any match, return None:
    return None


def compile_specs(path: Path, prefix: str) -> List[Spec]:
    """
    Reads given file and compiles specs.

    :param path: CSV file with a header row providing the columns
        ``pattern``, ``template`` and ``directory``.
    :param prefix: Regular expression prepended to each ``pattern``.
    :returns: One spec per CSV row, in file order.
    """
    ## Open the file (`newline=""` lets the csv module handle line endings
    ## itself, as its documentation requires):
    with path.open(newline="") as cfile:
        ## Read the CSV, compile patterns and build specs:
        return [
            Spec(
                re.compile(prefix + row["pattern"]),
                row["template"],
                Path(row["directory"]),
            )
            for row in csv.DictReader(cfile)
        ]


def main(
    specfile: Path, prefix: str, ignorepath: Path, parent: Path, paths: Iterable[Path]
) -> None:
    """
    Entrypoint.

    Prints ``IGNORE: <path>`` for ignored files and ``<path> -> <target>``
    for classified ones. No file is moved or copied.

    :param specfile: CSV specification file (see :func:`compile_specs`).
    :param prefix: Regular expression prepended to each spec pattern.
    :param ignorepath: Text file with one ignored filename per line; blank
        lines and surrounding whitespace are discarded.
    :param parent: Directory under which target paths are built.
    :param paths: Paths to classify.
    :raises ClassificationError: On the first path that is neither ignored
        nor matched by any spec.
    """
    ## Read in the specifications:
    specs = compile_specs(specfile, prefix)

    ## Read in ignored files, dropping blank lines:
    ignores = {
        line.strip() for line in ignorepath.read_text().splitlines() if line.strip()
    }

    ## Attempt to classify each path:
    for path in paths:
        ## Check if the path is ignored:
        if path.name in ignores:
            print(f"IGNORE: {path}")
            continue

        ## Attempt:
        attempt = classify(parent, specs, path)

        ## Success?
        if attempt is None:
            raise ClassificationError(
                f"Path could not be classified. Filename: {path.name}"
            )

        ## Print:
        print(f"{path} -> {attempt}")


if __name__ == "__main__":
    ## Fail with a usage message rather than a bare IndexError:
    if len(sys.argv) < 5:
        sys.exit(
            f"Usage: {sys.argv[0]} <spec-csv> <prefix-regex> <ignore-file> <input-dir>"
        )

    main(
        Path(sys.argv[1]),
        sys.argv[2],
        Path(sys.argv[3]),
        DEFAULT_PARENT,
        (i for i in Path(sys.argv[4]).iterdir() if i.is_file()),
    )