diff --git a/flattentool/__init__.py b/flattentool/__init__.py index b700353a..55e526a7 100644 --- a/flattentool/__init__.py +++ b/flattentool/__init__.py @@ -112,6 +112,7 @@ def flatten( else: schema_parser = None + # context manager to clean up ZODB database when it exits with JSONParser( json_filename=input_name, root_list_path=None if root_is_list else root_list_path, diff --git a/flattentool/json_input.py b/flattentool/json_input.py index 79567c0e..e1cc6a16 100644 --- a/flattentool/json_input.py +++ b/flattentool/json_input.py @@ -121,9 +121,11 @@ def __init__( persist=False, ): if persist: + # Use temp directories in OS agnostic way self.zodb_db_location = ( tempfile.gettempdir() + "/flattentool-" + str(uuid.uuid4()) ) + # zlibstorage lowers disk usage by a lot at very small performance cost zodb_storage = zc.zlibstorage.ZlibStorage( ZODB.FileStorage.FileStorage(self.zodb_db_location) ) @@ -133,7 +135,10 @@ def __init__( self.db = ZODB.DB(None) self.connection = self.db.open() + + # ZODB root, only objects attached here will be persisted root = self.connection.root + # OOBTree means a btree whose keys and values are objects (including strings) root.sheet_store = BTrees.OOBTree.BTree() self.sub_sheets = {} @@ -151,6 +156,8 @@ def __init__( self.persist = persist if schema_parser: + # schema parser does not make sheets that are persistent, + # so use from_sheet which deep copies everything in it. self.main_sheet = PersistentSheet.from_sheet( schema_parser.main_sheet, self.connection ) @@ -293,9 +300,13 @@ def parse(self): # fall over on empty activity, e.g. continue self.parse_json_dict(json_dict, sheet=self.main_sheet) + # only persist every 2000 objects. persisting more often slows down storing. + # 2000 top level objects normally not too much to store in memory. if num % 2000 == 0 and num != 0: transaction.commit() + # This commit could be removed which would mean that up to 2000 objects + # could be stored in memory without anything being persisted. 
transaction.commit() if self.remove_empty_schema_columns: diff --git a/flattentool/output.py b/flattentool/output.py index 947ceac6..9ad275e6 100644 --- a/flattentool/output.py +++ b/flattentool/output.py @@ -50,6 +50,7 @@ def close(self): class XLSXOutput(SpreadsheetOutput): def open(self): + # write only means that the output will be streamed self.workbook = openpyxl.Workbook(write_only=True) def write_sheet(self, sheet_name, sheet): diff --git a/flattentool/sheet.py b/flattentool/sheet.py index df6b99be..48d7a981 100644 --- a/flattentool/sheet.py +++ b/flattentool/sheet.py @@ -51,11 +51,16 @@ def __init__(self, columns=None, root_id="", name=None, connection=None): super().__init__(columns=columns, root_id=root_id, name=name) self.connection = connection self.index = 0 + # Integer key and object value btree. Store sequential index in order to preserve input order. connection.root.sheet_store[self.name] = BTrees.IOBTree.BTree() @property def lines(self): + # btrees iterate in key order. for key, value in self.connection.root.sheet_store[self.name].items(): + # 5000 chosen by trial and error. The written row + # data is removed from memory as it is no longer needed. + # All new sheets clear out previous sheets' data from memory. if key % 5000 == 0: self.connection.cacheMinimize() yield value