Skip to content

Commit

Permalink
ppt_record_parser: Optimize data loading
Browse files Browse the repository at this point in the history
Do not keep potentially huge blobs in memory; they are only needed for
debugging.
  • Loading branch information
christian-intra2net committed Oct 10, 2022
1 parent f598fdc commit 7fe4055
Showing 1 changed file with 14 additions and 5 deletions.
19 changes: 14 additions & 5 deletions oletools/ppt_record_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,9 @@
from oletools import record_base


# flag to remember some more data for debug-printing
debug_print = False

# types of relevant records (there are many more than listed here, cf. [MS-PPT] 2.13.24)
# and https://learn.microsoft.com/en-us/openspecs/office_file_formats/ms-ppt
# these names are parsed in `record_class_for_type`: only if a record ends in "Container" will we find the
Expand Down Expand Up @@ -143,8 +146,8 @@
(0x040b, 'DrawingGroupContainer'),
(0x040c, 'DrawingContainer'),
(0x0423, 'RoundTripOArtTextStyles12Atom'), # to extract data from these, could create class ...
(0x0428, 'RoundTripCustomTableStyles12Atom'), # ... like PptRecordExOleVbaActiveXAtom
(0x040e, 'RoundTripThemeAtom'),
(0x0428, 'RoundTripCustomTableStyles12Atom'), # ... like PptRecordExOleVbaActiveXAtom ...
(0x040e, 'RoundTripThemeAtom'), # ... to parse zip/ooxml data
(0x040f, 'RoundTripColorMappingAtom'),
(0x041c, 'RoundTripOriginalMainMasterId12Atom'),
(0x041e, 'RoundTripContentMasterInfo12Atom'),
Expand Down Expand Up @@ -318,15 +321,18 @@ def record_class_for_type(cls, rec_type):
read_all_data = False
try:
record_name = RECORD_TYPES[rec_type]
if record_name.endswith('Container'):
if record_name.startswith('RoundTrip'):
is_container = False
read_all_data = debug_print
elif record_name.endswith('Container'):
is_container = True
read_all_data = True
elif record_name.endswith('Atom'):
is_container = False
read_all_data = False
elif record_name.endswith('Blob'):
is_container = False
read_all_data = True
read_all_data = debug_print
elif record_name == 'OfficeArtClientData':
is_container = True
read_all_data = True
Expand Down Expand Up @@ -395,7 +401,9 @@ def _type_str(self):
def __str__(self):
"""Create string representation. Use super class except for Blobs."""
try:
if RECORD_TYPES[self.type].endswith('Blob'):
if debug_print and \
(RECORD_TYPES[self.type].endswith('Blob') or
RECORD_TYPES[self.type].startswith('RoundTrip')):
contents = ''.join(chr(ch) if ch in STR_PRINTABLE_CHARS else '.' for ch in self.data)
if len(contents) > STR_MAX_CONTENT_LEN:
data_text = contents[:STR_MAX_CONTENT_LEN - 5] + '[...]'
Expand Down Expand Up @@ -868,6 +876,7 @@ def print_records(record, print_fn, indent, do_print_record):


if __name__ == '__main__':
debug_print = True
def do_per_record(record):
print_records(record, logging.info, 2, False)
sys.exit(record_base.test(sys.argv[1:], PptFile,
Expand Down

0 comments on commit 7fe4055

Please sign in to comment.