forked from chesvectain/PackingData
-
Notifications
You must be signed in to change notification settings - Fork 6
Generate labels JSON from the dataset's folders
Alex edited this page Apr 11, 2022
·
1 revision
The following Tinyscript-based Python script achieves this:
#!/usr/bin/env python
from tinyscript import *
# Folder names whose label cannot be derived from the folder name itself,
# mapped to the label to record for the files they contain.
PATTERNS = {'not-packed': None, 'BeRoEXEPacker': "bero"}


if __name__ == '__main__':
    # NOTE(review): parser, initialize, args, logger, ts, re, hashlib and json
    # are not defined in this script; presumably they are all provided by the
    # star import from tinyscript -- confirm against the Tinyscript docs.
    parser.add_argument("dataset", help="dataset whose labels are to be computed")
    parser.add_argument("-e", "--exclude", nargs="*", default=[".git"], action="extend", help="excluded folders")
    parser.add_argument("-o", "--output", default="labels.json", help="output labels JSON file")
    initialize()
    # labels: file hash -> label; files: file hash -> path of the first file
    # seen with that hash (only used to report duplicates).
    labels, files = {}, {}
    for f in ts.Path(args.dataset).walk():
        # Skip anything located under an excluded folder, anything that is not
        # a regular file, and .ini/.md files.
        if any(x in args.exclude for x in f.parts) or not ts.is_file(f) or f.extension in [".ini", ".md"]:
            continue
        logger.info("Processing %s" % f)
        # The label comes from the name of the file's parent folder: either an
        # explicit entry in PATTERNS or the lowercased folder name with
        # whitespace and dashes replaced by underscores.
        folder = f.parts[-2]
        label = PATTERNS.get(folder, re.sub(r"[\s-]", "_", folder.lower()))
        # sha256_file is not part of the standard hashlib module; presumably a
        # Tinyscript extension returning the digest of the file's content.
        h = hashlib.sha256_file(f)
        if h in labels:
            # Same content already recorded under another path: report it and
            # call remove() on the duplicate (presumably deleting it from disk).
            logger.warning("Duplicate: %s (of %s)" % (f, files[h]))
            f.remove()
        else:
            labels[h], files[h] = label, str(f)
    # Written once, after the whole dataset has been walked; a distinct handle
    # name avoids rebinding the loop variable f.
    with open(args.output, 'w') as out:
        json.dump(labels, out, indent=4)
Usage (from the repository's root): `python3 labels-generator.py .`