From 70721a0bef1050d5d72a8c2bb038b086e6419b06 Mon Sep 17 00:00:00 2001
From: Maksim Andrianov
Date: Fri, 4 Sep 2020 13:11:36 +0300
Subject: [PATCH] [generator][python] Added diff_complex.

---
Review notes (ignored by git am, not part of the commit message):
 * __main__.py: diff() no longer loops forever with --from_root, empty
   popularity rows are skipped, csv files are opened with newline="",
   main() is guarded by __name__, typo in the description is fixed.
 * trees_builder.py: unparsable rows are skipped instead of crashing on
   None, old-format entries get depth/country set to None, HierarchyEntry
   is hashable again.
 * The blob ids on the "index" lines were not recomputed after these edits.

 tools/python/diff_complex/__init__.py      |   0
 tools/python/diff_complex/__main__.py      | 150 ++++++++++++++++++++
 tools/python/diff_complex/requirements.txt |   3 +
 tools/python/diff_complex/trees_builder.py | 151 +++++++++++++++++++++
 4 files changed, 304 insertions(+)
 create mode 100644 tools/python/diff_complex/__init__.py
 create mode 100644 tools/python/diff_complex/__main__.py
 create mode 100644 tools/python/diff_complex/requirements.txt
 create mode 100644 tools/python/diff_complex/trees_builder.py

diff --git a/tools/python/diff_complex/__init__.py b/tools/python/diff_complex/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/tools/python/diff_complex/__main__.py b/tools/python/diff_complex/__main__.py
new file mode 100644
index 00000000000..d6b3f4c1995
--- /dev/null
+++ b/tools/python/diff_complex/__main__.py
@@ -0,0 +1,150 @@
+import argparse
+import csv
+import logging
+import sys
+from itertools import islice
+from typing import Dict
+
+import zss
+
+from diff_complex.trees_builder import Node
+from diff_complex.trees_builder import read_complexes_from_csv
+
+logger = logging.getLogger("diff_complex")
+
+csv.field_size_limit(sys.maxsize)
+
+
+def parse_args():
+    """Parse and return the command line arguments."""
+    parser = argparse.ArgumentParser(description="Compare complex files.")
+    parser.add_argument(
+        "--old", metavar="PATH", type=str, help="Path to old file", required=True
+    )
+    parser.add_argument(
+        "--new", metavar="PATH", type=str, help="Path to new file", required=True
+    )
+
+    parser.add_argument(
+        "--popularity", metavar="PATH", type=str, help="Path to popularity file"
+    )
+    parser.add_argument(
+        "--num",
+        type=int,
+        help="Number of objects from popularity file that to be compared",
+        default=50,
+    )
+
+    parser.add_argument(
+        "--threshold", type=int, help="Threshold of tree distance", default=1,
+    )
+
+    parser.add_argument(
+        "--from_root",
+        default=False,
+        action="store_true",
+        help="Compare trees from roots.",
+    )
+
+    return parser.parse_args()
+
+
+def label_dist(a, b):
+    """Return 0 when two node labels are equal and 1 otherwise."""
+    return 0 if a == b else 1
+
+
+def _get_root(node: Node) -> Node:
+    """Return the topmost ancestor of node (node itself if it has no parent)."""
+    while node.parent is not None:
+        node = node.parent
+    return node
+
+
+def diff(
+    old_complexes_map: Dict[str, Node],
+    new_complexes_map: Dict[str, Node],
+    id_: str,
+    threshold: int = 1,
+    from_root: bool = False,
+):
+    """Log the differences between the old and the new tree of id_.
+
+    Nothing is compared if id_ is missing from either map. When from_root is
+    set, the trees are compared starting from their topmost ancestors. The
+    differences are logged only if the tree distance reaches threshold.
+    """
+    old_tree = old_complexes_map.get(id_)
+    if old_tree is None:
+        logger.warning(f"{id_} is not found in old complexes.")
+        return
+
+    new_tree = new_complexes_map.get(id_)
+    if new_tree is None:
+        logger.warning(f"{id_} is not found in new complexes.")
+        return
+
+    if from_root:
+        # Climb to the roots. Advancing the node itself guarantees that the
+        # walk terminates and never ends up with None instead of a tree.
+        old_tree = _get_root(old_tree)
+        new_tree = _get_root(new_tree)
+
+    operations = zss.simple_distance(
+        old_tree, new_tree, label_dist=label_dist, return_operations=True
+    )
+
+    if operations[0] >= threshold:
+        op_o = {
+            o.arg1: o
+            for o in operations[1]
+            if o.type == zss.Operation.remove or o.type == zss.Operation.update
+        }
+        op_n = {
+            o.arg2: o
+            for o in operations[1]
+            if o.type == zss.Operation.insert or o.type == zss.Operation.update
+        }
+
+        logger.warning(
+            f"Differences found for id[{id_}]: distance is {operations[0]}\n"
+            f"Old:\n"
+            f"{old_tree.to_string_with_operations(op_o)}\n"
+            f"New:\n"
+            f"{new_tree.to_string_with_operations(op_n)}"
+        )
+
+
+def main():
+    """Read both complex files and diff the selected trees."""
+    logging.basicConfig(
+        level=logging.INFO, format="[%(asctime)s] %(levelname)s %(module)s %(message)s"
+    )
+
+    args = parse_args()
+    old_complexes_map = read_complexes_from_csv(args.old)
+    logger.info(f"{len(old_complexes_map)} old complexes was read from {args.old}")
+
+    new_complexes_map = read_complexes_from_csv(args.new)
+    logger.info(f"{len(new_complexes_map)} new complexes was read from {args.new}")
+
+    if args.popularity:
+        with open(args.popularity, newline="") as csvfile:
+            rows = csv.reader(csvfile, delimiter=",")
+            # Blank lines are parsed as empty rows and carry no id.
+            ids = [row[0] for row in islice(rows, args.num) if row]
+    else:
+        old_complexes_map = {
+            k: v for k, v in old_complexes_map.items() if v.parent is None
+        }
+        new_complexes_map = {
+            k: v for k, v in new_complexes_map.items() if v.parent is None
+        }
+        ids = list(old_complexes_map.keys())
+
+    for id_ in ids:
+        diff(old_complexes_map, new_complexes_map, id_, args.threshold, args.from_root)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/python/diff_complex/requirements.txt b/tools/python/diff_complex/requirements.txt
new file mode 100644
index 00000000000..9191b84ddc8
--- /dev/null
+++ b/tools/python/diff_complex/requirements.txt
@@ -0,0 +1,3 @@
+-r ../mwm/requirements.txt
+zss==1.2.0
+numpy>=1.7
\ No newline at end of file
diff --git a/tools/python/diff_complex/trees_builder.py b/tools/python/diff_complex/trees_builder.py
new file mode 100644
index 00000000000..9e87afe7f52
--- /dev/null
+++ b/tools/python/diff_complex/trees_builder.py
@@ -0,0 +1,151 @@
+import csv
+import logging
+from typing import Dict
+from typing import List
+from typing import Optional
+
+import zss
+
+from mwm.decode_id import decode_id
+
+logger = logging.getLogger("diff_complex")
+
+
+class Node(zss.Node):
+    """A zss tree node that also keeps a reference to its parent."""
+
+    def __init__(self, label, children=None):
+        self.parent = None
+        super().__init__(label, children)
+
+    def addpar(self, node):
+        """Set node as the parent of this node."""
+        self.parent = node
+
+    def __str__(self):
+        return self.to_string_with_operations(operations={})
+
+    def to_string_with_operations(self, operations):
+        """Render the subtree, marking the nodes found in operations."""
+        lines = []
+        _print_tree(self, lines, is_root=True, operations=operations)
+        return "".join(lines)
+
+    def __hash__(self):
+        # Nodes serve as keys of the operations maps and are hashed by identity.
+        return id(self)
+
+
+def _print_tree(node, lines, prefix="", is_tail=True, is_root=False, operations=None):
+    """Append the text representation of the subtree of node to lines."""
+    lines.append(prefix)
+    if not is_root:
+        if is_tail:
+            lines.append("└───")
+            prefix += "    "
+        else:
+            lines.append("├───")
+            prefix += "│   "
+
+    label = str(node.label).replace("\n", "")
+    if operations is not None and node in operations:
+        t = operations[node].type
+        if t == zss.Operation.remove:
+            lines.append("(-)")
+        elif t == zss.Operation.insert:
+            lines.append("(+)")
+        elif t == zss.Operation.update:
+            lines.append("(-+)")
+    lines.append(label)
+    lines.append("\n")
+
+    last = len(node.children) - 1
+    for i, c in enumerate(node.children):
+        _print_tree(c, lines, prefix, i == last, False, operations)
+
+
+def link_nodes(node: Node, parent: Node):
+    """Make node a child of parent, updating both sides of the link."""
+    node.addpar(parent)
+    parent.addkid(node)
+
+
+class HierarchyEntry:
+    """An entry parsed from one csv row of a complexes file."""
+
+    __slots__ = ("id", "parent_id", "depth", "lat", "lon", "type", "name", "country")
+
+    @staticmethod
+    def make_from_csv_row(csv_row: List[str]) -> Optional["HierarchyEntry"]:
+        """Build an entry from a csv row or return None if it has a wrong size."""
+        e = HierarchyEntry()
+        if len(csv_row) == 8:
+            e.id = csv_row[0]
+            e.parent_id = csv_row[1]
+            e.depth = int(csv_row[2])
+            e.lat = float(csv_row[3])
+            e.lon = float(csv_row[4])
+            e.type = csv_row[5]
+            e.name = csv_row[6]
+            e.country = csv_row[7]
+            return e
+        # For old format:
+        elif len(csv_row) == 6:
+            e.id = csv_row[0]
+            e.parent_id = csv_row[1]
+            # The old format has no depth and country columns. The slots are
+            # still filled so that reading them does not raise AttributeError.
+            e.depth = None
+            e.lat = float(csv_row[2])
+            e.lon = float(csv_row[3])
+            e.type = csv_row[4]
+            e.name = csv_row[5]
+            e.country = None
+            return e
+        logger.error(f"Row [{csv_row}] - {len(csv_row)} cannot be parsed.")
+        return None
+
+    def __eq__(self, other):
+        if isinstance(other, (HierarchyEntry, str)):
+            self_id = self.id
+            other_id = other if isinstance(other, str) else other.id
+            return self_id == other_id
+
+        raise TypeError(f"{other}:{type(other)} is not supported.")
+
+    def __hash__(self):
+        # Keeps entries hashable: defining __eq__ alone sets __hash__ to None.
+        return hash(self.id)
+
+    def __str__(self):
+        return (
+            f"{self.id}[{self.type}]:{self.name} "
+            f"({decode_id(self.id.split()[0] if ' ' in self.id else self.id)})"
+        )
+
+
+def read_complexes_from_csv(path: str) -> Dict[str, Node]:
+    """Read a complexes file and return its trees as a map from id to node.
+
+    Rows that cannot be parsed are skipped. Nodes whose parent id is absent
+    from the file are reported and left without a parent.
+    """
+    m = {}
+    with open(path, newline="") as csvfile:
+        rows = csv.reader(csvfile, delimiter=";")
+
+        for row in rows:
+            e = HierarchyEntry.make_from_csv_row(row)
+            if e is None:
+                # The error has been logged by make_from_csv_row already.
+                continue
+            m[e.id] = Node(e)
+
+    for node in m.values():
+        if node.label.parent_id:
+            try:
+                link_nodes(node, m[node.label.parent_id])
+            except KeyError:
+                logger.error(f"Id {node.label.parent_id} was not found in dict.")
+
+    return m