Skip to content

Commit

Permalink
feat: init
Browse files Browse the repository at this point in the history
  • Loading branch information
vst committed Feb 16, 2021
0 parents commit 21d663b
Show file tree
Hide file tree
Showing 3 changed files with 139 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
/tmp
23 changes: 23 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# REMAP DEMAIL File Classifier

`remap-demail-classifer` is a command-line application written in
Python 3. It classifies fetched DEMAIL attachment files using a
regex-based specification.

The application has no requirements other than Python 3.6 or later.

CLI arguments are as follows:

```
./remap-demail-classifer <SPEC-FILE> <PREFIX-REGEX> <IGNORE-FILE> <DEMAIL-DIR>
```

Example:

```
./remap-demail-classifer spec.csv \
'^(?P<recdate>[0-9]{4}\-[0-9]{2}\-[0-9]{2})T[0-9]{2}:[0-9]{2}:[0-9]{2}Z_[A-Z0-9]{32}_[A-Z0-9]{32}_' \
tmp/ignore.dat \
/data/remap/tenants/deployment/demail/downloaded
```
115 changes: 115 additions & 0 deletions remap-demail-classifer
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
#!/usr/bin/env python3

import csv
import re
import sys
from pathlib import Path
from typing import Iterable, List, NamedTuple, Optional, Pattern


#: Application version string (``.dev0`` suffix: development release).
__version__ = "0.0.1.dev0"


class Spec(NamedTuple):
    """
    Pattern-Template specification.

    Bundles everything needed to rename one kind of file: a compiled
    regular expression for the original filename, a ``str.format``
    template for the new filename, and the sub directory the new file
    is placed in.
    """

    #: Original filename pattern (compiled from, and matched against,
    #: ``str`` values).
    pattern: Pattern[str]

    #: Target filename template, in ``str.format`` syntax.
    template: str

    #: Sub directory for the target file.
    subdir: Path


def classify(parent: Path, specs: List[Spec], path: Path) -> Optional[Path]:
    """
    Attempts to classify the given path as per given patterns.

    Specs are tried in the given order; the first one whose pattern
    matches the whitespace-trimmed filename decides the result.

    :param parent: Directory under which the target path is built.
    :param specs: Specifications to try, in order of precedence.
    :param path: Path of the file to classify; only its name is used.
    :return: ``parent / subdir / new filename`` for the first matching
        spec, or ``None`` if no spec matches.
    :raises ValueError: If the template of the matching spec refers to a
        field that the match does not provide.
    """
    ## Get and trim filename:
    filename = path.name.strip()

    ## Iterate over specs:
    for spec in specs:
        ## Attempt to match, move on to the next spec otherwise:
        match = spec.pattern.match(filename)
        if match is None:
            continue

        ## Attempt to compile the target filename from the named groups:
        try:
            newfilename = spec.template.format(**match.groupdict())
        except (KeyError, IndexError) as err:
            ## The template asks for a field the pattern did not capture.
            ## There is no filename to build the target path from, so
            ## report the broken spec instead of carrying on:
            raise ValueError(
                f"Cannot render template {spec.template!r} for {path}: {err!r}"
            ) from err

        ## Build and return the target path:
        return parent / spec.subdir / newfilename

    ## We do not have any match, return None:
    return None


def compile_specs(path: Path, prefix: str) -> List[Spec]:
    """
    Reads given file and compiles specs.

    The file is read as CSV with a header row. For each data row the
    ``pattern``, ``template`` and ``directory`` columns are turned into
    one :class:`Spec`.

    :param path: Path to the CSV specification file.
    :param prefix: Regular expression text prepended to every
        ``pattern`` value before it is compiled.
    :return: Specs in the order of the rows in the file.
    :raises KeyError: If one of the columns is missing from the header.
    :raises re.error: If ``prefix`` plus a row's pattern is not a valid
        regular expression.
    """
    ## Open the file. The csv module asks for ``newline=""`` so that it
    ## can interpret line endings (also inside quoted fields) itself:
    with path.open(newline="") as cfile:
        ## Read the CSV, compile patterns, build and return specs:
        return [
            Spec(
                re.compile(prefix + row["pattern"]),
                row["template"],
                Path(row["directory"]),
            )
            for row in csv.DictReader(cfile)
        ]


def main(
    specfile: Path, prefix: str, ignorepath: Path, parent: Path, paths: Iterable[Path]
) -> None:
    """
    Entrypoint.

    Compiles the specs, loads the names of ignored files and then
    reports for each given path either that it is ignored or which
    target path it is classified to.

    :param specfile: Path to the CSV specification file.
    :param prefix: Regular expression prefix handed to the spec compiler.
    :param ignorepath: Text file with one ignored filename per line.
    :param parent: Directory under which target paths are built.
    :param paths: Paths to classify.
    :raises Exception: If a path is neither ignored nor classifiable.
    """
    ## Compile the specifications first:
    rules = compile_specs(specfile, prefix)

    ## Collect ignored filenames, dropping blank lines:
    skipped = set()
    for line in ignorepath.read_text().split("\n"):
        name = line.strip()
        if name:
            skipped.add(name)

    ## Walk over the paths and report on each one:
    for path in paths:
        if path.name not in skipped:
            ## Not ignored, so it must be classifiable:
            attempt = classify(parent, rules, path)
            if attempt is None:
                raise Exception(f"Path could not be classified. Filename: {path.name}")
            print(f"{path} -> {attempt}")
        else:
            ## Ignored, report and move on:
            print(f"IGNORE: {path}")


if __name__ == "__main__":
    ## Make sure that all four positional arguments are given. Without
    ## this check a missing argument surfaces as a bare IndexError:
    if len(sys.argv) < 5:
        print(
            f"Usage: {sys.argv[0]} <SPEC-FILE> <PREFIX-REGEX> <IGNORE-FILE> <DEMAIL-DIR>",
            file=sys.stderr,
        )
        sys.exit(2)

    main(
        ## Specification file:
        Path(sys.argv[1]),
        ## Prefix regex:
        sys.argv[2],
        ## File listing ignored filenames:
        Path(sys.argv[3]),
        ## Parent directory of target paths (fixed):
        Path("/files/"),
        ## Regular files directly inside the DEMAIL directory:
        (i for i in Path(sys.argv[4]).iterdir() if i.is_file()),
    )

0 comments on commit 21d663b

Please sign in to comment.