Flattening: Add comments per review

#316
OpenDataServices · Mar 9, 2021 · 31b9399 · 31b9399
1 parent 4824df2
commit 31b9399
Show file tree

Hide file tree

Showing 4 changed files with 18 additions and 0 deletions.
diff --git a/flattentool/__init__.py b/flattentool/__init__.py
@@ -112,6 +112,7 @@ def flatten(
     else:
         schema_parser = None
 
+    # context manager to clean up ZODB database when it exits
     with JSONParser(
         json_filename=input_name,
         root_list_path=None if root_is_list else root_list_path,

diff --git a/flattentool/json_input.py b/flattentool/json_input.py
@@ -121,9 +121,11 @@ def __init__(
         persist=False,
     ):
         if persist:
+            # Use temp directories in OS agnostic way
             self.zodb_db_location = (
                 tempfile.gettempdir() + "/flattentool-" + str(uuid.uuid4())
             )
+            # zlibstorage lowers disk usage by a lot at very small performance cost
             zodb_storage = zc.zlibstorage.ZlibStorage(
                 ZODB.FileStorage.FileStorage(self.zodb_db_location)
             )
@@ -133,7 +135,10 @@ def __init__(
             self.db = ZODB.DB(None)
 
         self.connection = self.db.open()
+
+        # ZODB root, only objects attached here will be persisted
         root = self.connection.root
+        # OOBTree is a BTree whose keys and values are both objects (including strings)
         root.sheet_store = BTrees.OOBTree.BTree()
 
         self.sub_sheets = {}
@@ -151,6 +156,8 @@ def __init__(
         self.persist = persist
 
         if schema_parser:
+            # The schema parser does not make sheets that are persistent,
+            # so use from_sheet, which deep copies everything in it.
             self.main_sheet = PersistentSheet.from_sheet(
                 schema_parser.main_sheet, self.connection
             )
@@ -293,9 +300,13 @@ def parse(self):
                 # fall over on empty activity, e.g. <iati-activity/>
                 continue
             self.parse_json_dict(json_dict, sheet=self.main_sheet)
+            # Only persist every 2000 objects; persisting more often slows down storing.
+            # 2000 top-level objects are normally not too much to store in memory.
             if num % 2000 == 0 and num != 0:
                 transaction.commit()
 
+        # This commit could be removed, which would mean that up to 2000 objects
+        # could be stored in memory without anything being persisted.
         transaction.commit()
 
         if self.remove_empty_schema_columns:

diff --git a/flattentool/output.py b/flattentool/output.py
@@ -50,6 +50,7 @@ def close(self):
 
 class XLSXOutput(SpreadsheetOutput):
     def open(self):
+        # write_only mode means that the output will be streamed
         self.workbook = openpyxl.Workbook(write_only=True)
 
     def write_sheet(self, sheet_name, sheet):

diff --git a/flattentool/sheet.py b/flattentool/sheet.py
@@ -51,11 +51,16 @@ def __init__(self, columns=None, root_id="", name=None, connection=None):
         super().__init__(columns=columns, root_id=root_id, name=name)
         self.connection = connection
         self.index = 0
+        # Integer key and object value btree.  Store sequential index in order to preserve input order.
         connection.root.sheet_store[self.name] = BTrees.IOBTree.BTree()
 
     @property
     def lines(self):
+        # btrees iterate in key order.
         for key, value in self.connection.root.sheet_store[self.name].items():
+            # 5000 was chosen by trial and error.  The written row
+            # data is removed from memory as it is no longer needed.
+            # All new sheets clear out previous sheets' data from memory.
             if key % 5000 == 0:
                 self.connection.cacheMinimize()
             yield value