Skip to content

Commit

Permalink
Flattening: Add comments per review
Browse files Browse the repository at this point in the history
  • Loading branch information
kindly committed Mar 9, 2021
1 parent 4824df2 commit 31b9399
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 0 deletions.
1 change: 1 addition & 0 deletions flattentool/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ def flatten(
else:
schema_parser = None

# context manager to clean up ZODB database when it exits
with JSONParser(
json_filename=input_name,
root_list_path=None if root_is_list else root_list_path,
Expand Down
11 changes: 11 additions & 0 deletions flattentool/json_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,9 +121,11 @@ def __init__(
persist=False,
):
if persist:
# Use temp directories in an OS-agnostic way
self.zodb_db_location = (
tempfile.gettempdir() + "/flattentool-" + str(uuid.uuid4())
)
# zlibstorage lowers disk usage by a lot at very small performance cost
zodb_storage = zc.zlibstorage.ZlibStorage(
ZODB.FileStorage.FileStorage(self.zodb_db_location)
)
Expand All @@ -133,7 +135,10 @@ def __init__(
self.db = ZODB.DB(None)

self.connection = self.db.open()

# ZODB root, only objects attached here will be persisted
root = self.connection.root
# OOBTree means a btree whose keys and values are both objects (including strings)
root.sheet_store = BTrees.OOBTree.BTree()

self.sub_sheets = {}
Expand All @@ -151,6 +156,8 @@ def __init__(
self.persist = persist

if schema_parser:
# The schema parser does not make sheets that are persistent,
# so use from_sheet, which deep copies everything in it.
self.main_sheet = PersistentSheet.from_sheet(
schema_parser.main_sheet, self.connection
)
Expand Down Expand Up @@ -293,9 +300,13 @@ def parse(self):
# fall over on empty activity, e.g. <iati-activity/>
continue
self.parse_json_dict(json_dict, sheet=self.main_sheet)
# Only persist every 2000 objects. Persisting more often slows down storing.
# 2000 top-level objects are normally not too many to hold in memory.
if num % 2000 == 0 and num != 0:
transaction.commit()

# This commit could be removed, which would mean that up to 2000 objects
# could be stored in memory without anything being persisted.
transaction.commit()

if self.remove_empty_schema_columns:
Expand Down
1 change: 1 addition & 0 deletions flattentool/output.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ def close(self):

class XLSXOutput(SpreadsheetOutput):
def open(self):
# write_only=True means that the output will be streamed
self.workbook = openpyxl.Workbook(write_only=True)

def write_sheet(self, sheet_name, sheet):
Expand Down
5 changes: 5 additions & 0 deletions flattentool/sheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,11 +51,16 @@ def __init__(self, columns=None, root_id="", name=None, connection=None):
super().__init__(columns=columns, root_id=root_id, name=name)
self.connection = connection
self.index = 0
# Integer key and object value btree. Store sequential index in order to preserve input order.
connection.root.sheet_store[self.name] = BTrees.IOBTree.BTree()

@property
def lines(self):
# btrees iterate in key order.
for key, value in self.connection.root.sheet_store[self.name].items():
# 5000 chosen by trial and error. The written row
# data is removed from memory as it is no longer needed.
# All new sheets clear out previous sheets' data from memory.
if key % 5000 == 0:
self.connection.cacheMinimize()
yield value
Expand Down

0 comments on commit 31b9399

Please sign in to comment.