Initial implementation of exploiting datatypes for more efficient tab…

…ulating
JuDFTteam · Apr 7, 2022 · 0e6da25 · 0e6da25
1 parent 7f779a8
commit 0e6da25
Show file tree

Hide file tree

Showing 2 changed files with 42 additions and 24 deletions.
diff --git a/masci_tools/io/parsers/tabulator/recipes.py b/masci_tools/io/parsers/tabulator/recipes.py
@@ -207,13 +207,12 @@ def _to_keypaths_recursive(sub_dict: dict[str, Any], path: list[str]) -> list[tu
         # or at least of type _typing.List[_typing.Tuple[list, _typing.Any]].
         # check that. if not, something is wrong.
         # otherwise, just return the paths.
-        if all(tup[1] is None for tup in keypaths):
-            keypaths = [tup[0] for tup in keypaths]  #type:ignore
-            datatypes = {path: dtype for path, dtype in keypaths if dtype is not None}
+        datatypes = {tuple(path): dtype for path, dtype in keypaths if dtype is not None}
+        keypaths = [tuple(path) for path, dtype in keypaths]  #type:ignore
 
         # postcondition: keypaths format
         is_list = isinstance(keypaths, list)
-        is_all_lists = is_list and all(isinstance(path, list) for path in keypaths)
+        is_all_lists = is_list and all(isinstance(path, tuple) for path in keypaths)
         if not is_all_lists:
             raise TypeError(f'Could not generate keypaths of required type list of lists '
                             f'from {name} list. Either specified list in wrong format '

diff --git a/masci_tools/io/parsers/tabulator/tabulator.py b/masci_tools/io/parsers/tabulator/tabulator.py
@@ -17,9 +17,10 @@
 
 import abc
 from collections import defaultdict
-from typing import Any, Iterable, TypeVar
+from typing import Any, FrozenSet, Iterable, TypeVar
 
 import pandas as pd
+import numpy as np
 
 from .recipes import Recipe, KeyPaths
 
@@ -56,7 +57,7 @@ class Tabulator(abc.ABC):
       to easily reuse the dtypes information from the recipe.
     """
 
-    def __init__(self, recipe: Recipe | None = None) -> None:
+    def __init__(self, recipe: Recipe | None = None, separator: str = '.', buffer_size: int = 1024) -> None:
         """Initialize a tabulator object.
 
         The attribute :py:attr:`~.recipe` defines *what* to extract from a set of objects and put them in a table (
@@ -75,8 +76,12 @@ def __init__(self, recipe: Recipe | None = None) -> None:
         if not recipe:
             recipe = Recipe()
         self.recipe = recipe
+        self.has_transformer = recipe.transformer is not None
         self._table: dict[str, Any] = {}
 
+        self.separator = separator
+        self.buffer_size = buffer_size
+
         self._column_policies = ['flat', 'flat_full_path', 'multiindex']
 
     @abc.abstractmethod
@@ -110,8 +115,8 @@ def table(self) -> pd.DataFrame | None:
         """The result table. None if :py:meth:`~tabulate` not yet called."""
         return pd.DataFrame.from_dict(self._table) if self._table else None
 
-    def process_item(self, item: Any, table: dict[str, Any], keypaths: list[tuple[tuple[str, ...], str]],
-                     pass_item_to_transformer: bool, **kwargs: Any) -> None:
+    def process_item(self, item: Any, index: int, table: dict[str, Any], keypaths: list[tuple[tuple[str, ...], str]],
+                     dtypes: frozenset[str], pass_item_to_transformer: bool, **kwargs: Any) -> None:
         """
         Process a single item of the collection of items to be tabulated
 
@@ -125,34 +130,38 @@ def process_item(self, item: Any, table: dict[str, Any], keypaths: list[tuple[tu
         failed_paths = defaultdict(list)
         failed_transforms = defaultdict(list)
 
-        row: dict[str, Any] = {}
-
         for keypath, column in keypaths:
-            row[column] = None
-
             value = self.get_value(item, keypath)
             if value is None:
                 failed_paths[keypath].append(self.item_uuid(item))
                 continue
 
-            if not self.recipe.transformer:
-                row[column] = value
-            else:
+            if self.has_transformer:
                 try:
-                    transformed_value = self.recipe.transformer.transform(
-                        keypath=keypath, value=value, obj=item if pass_item_to_transformer else None, **kwargs)
+                    transformed_value = self.recipe.transformer.transform(  #type:ignore
+                        keypath=keypath,
+                        value=value,
+                        obj=item if pass_item_to_transformer else None,
+                        **kwargs)
                 except (ValueError, KeyError, TypeError):
                     failed_transforms[keypath].append(self.item_uuid(item))
                     continue
 
                 if transformed_value.is_transformed and isinstance(transformed_value.value, dict):
+                    value = {}
                     for t_column, t_value in transformed_value.value.items():
-                        row[t_column] = t_value
+                        value[t_column] = t_value
                 else:
-                    row[column] = transformed_value.value
+                    value = transformed_value.value
 
-        for column, value in row.items():
-            table.setdefault(column, []).append(value)
+            if column in dtypes:
+                try:
+                    table[column][index] = value
+                except IndexError:
+                    table[column] = np.append(table[column], np.zeros(len(table[column]), dtype=table[column].dtype))
+                    table[column][index] = value
+            else:
+                table.setdefault(column, []).append(value)
 
     def item_uuid(self, item: Any) -> str:
         """
@@ -185,8 +194,8 @@ def _remove_collisions(self,
                 raise ValueError(f'Cannot disambiguate paths {paths}')
 
             #Go up levels until they can be distinguished
-            unique_paths = self._remove_collisions([(path[:index], f'{path[index]}.{name}') for path in paths],
-                                                   index=index - 1)
+            unique_paths = self._remove_collisions(
+                [(path[:index], f'{path[index]}{self.separator}{name}') for path in paths], index=index - 1)
 
             for path, unique_path in zip(paths, unique_paths):
                 keypaths[keypaths.index((path, name))] = path, unique_path[1]
@@ -228,14 +237,15 @@ def tabulate(self,
 
         keypaths: KeyPaths = []
 
-        for item in collection:
+        for index, item in enumerate(collection):
 
             # get inc/ex lists. assume that they are in valid keypaths format already
             # (via property setter auto-conversion)
             if not keypaths:
                 if not self.recipe.include_list:
                     self.autolist(item=item, overwrite=True, pretty_print=False)
                 keypaths = self.recipe.include_list.copy()
+                dtypes = self.recipe.dtypes
                 exclude_keypaths = self.recipe.exclude_list
                 for keypath in exclude_keypaths:
                     keypaths.remove(keypath)
@@ -245,9 +255,18 @@ def tabulate(self,
 
                 self._remove_collisions(named_keypaths)
 
+                for path, dtype in dtypes.items():
+                    #find corresponding column name
+                    column = [column for p, column in named_keypaths if p == path][0]
+                    table[column] = np.zeros(self.buffer_size, dtype=dtype)
+                dtypes_set = frozenset(table.keys())
+                self.has_transformer = self.recipe.transformer is not None
+
             self.process_item(item,
+                              index=index,
                               table=table,
                               keypaths=named_keypaths,
+                              dtypes=dtypes_set,
                               pass_item_to_transformer=pass_item_to_transformer,
                               **kwargs)