diff --git a/README.md b/README.md
index a5a4a97..5dfe38f 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,13 @@
+This fork is based on the original repo in https://github.com/bgilbert/anonymize-slide
+
+Several users (markemus, r3m0chop and grenkoca) have updated the code to make it runnable with Python 3. Since non of their versions worked with my mrxs
+files I decided to fork the version from markemus in which I could fix the issues with the mrxs files. Please note that all these files are currently in the
+old mrxs format (with some of them being converted from the new into the old format). So far I did not test the new format.
+
+Please note that the original readme below does not respect the current state of the script.
+
+
+
anonymize-slide
===============
diff --git a/anonymize-slide.py b/anonymize_slide.py
similarity index 65%
rename from anonymize-slide.py
rename to anonymize_slide.py
index 22f462c..369ad03 100755
--- a/anonymize-slide.py
+++ b/anonymize_slide.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/python3
#
# anonymize-slide.py - Delete the label from a whole-slide image.
#
@@ -21,15 +21,23 @@
# Foundation, Inc., 51 Franklin Street, Fifth Floor,
# Boston, MA 02110-1301 USA.
#
+# Modified by Chaim Reach:
+# - updated to Python3
+# - added support for Ventana tiffs
+# - added option for importing as a python module.
-from __future__ import division
-from ConfigParser import RawConfigParser
-from cStringIO import StringIO
+
+from __future__ import division, print_function
+# from ConfigParser import RawConfigParser
+from configparser import RawConfigParser
+# from cStringIO import StringIO
+from io import StringIO
from glob import glob
from optparse import OptionParser
import os
-import string
+# import string
import struct
+import subprocess
import sys
PROG_DESCRIPTION = '''
@@ -39,6 +47,7 @@
DEBUG = False
# TIFF types
+BYTE = 1
ASCII = 2
SHORT = 3
LONG = 4
@@ -52,11 +61,13 @@
STRIP_BYTE_COUNTS = 279
NDPI_MAGIC = 65420
NDPI_SOURCELENS = 65421
+XMLPACKET = 700
# Format headers
-LZW_CLEARCODE = '\x80'
-JPEG_SOI = '\xff\xd8'
-UTF8_BOM = '\xef\xbb\xbf'
+LZW_CLEARCODE = b'\x80'
+JPEG_SOI = b'\xff\xd8'
+# UTF8_BOM = '\xef\xbb\xbf'
+UTF8_BOM = '\ufeff'
# MRXS
MRXS_HIERARCHICAL = 'HIERARCHICAL'
@@ -67,15 +78,16 @@ class UnrecognizedFile(Exception):
pass
-class TiffFile(file):
+class TiffFile():
def __init__(self, path):
- file.__init__(self, path, 'r+b')
+ # file.__init__(self, path, 'r+b')
+ self.file = open(path, 'r+b')
# Check header, decide endianness
endian = self.read(2)
- if endian == 'II':
+ if endian == b'II':
self._fmt_prefix = '<'
- elif endian == 'MM':
+ elif endian == b'MM':
self._fmt_prefix = '>'
else:
raise UnrecognizedFile
@@ -110,12 +122,27 @@ def __init__(self, path):
# the first directory is beyond 4 GB.
if NDPI_MAGIC in directory.entries:
if DEBUG:
- print 'Enabling NDPI mode.'
+ print('Enabling NDPI mode.')
self._ndpi = True
self.directories.append(directory)
if not self.directories:
raise IOError('No directories')
+ # TiffFile uses composition to pretend to be a file object for backwards compatibility.
+ def read(self, n=-1):
+ return self.file.read(n)
+ def write(self, s):
+ return self.file.write(s)
+ def seek(self, offset, whence=0):
+ return self.file.seek(offset, whence)
+ def tell(self):
+ return self.file.tell()
+ def __enter__(self):
+ return self
+ def __exit__(self, exc_type, exc_val, exc_tb):
+ self.file.close()
+ # End of file code
+
def _convert_format(self, fmt):
# Format strings can have special characters:
# y: 16-bit signed on little TIFF, 64-bit signed on BigTIFF
@@ -124,11 +151,11 @@ def _convert_format(self, fmt):
# Z: 32-bit unsigned on little TIFF, 64-bit unsigned on BigTIFF
# D: 32-bit unsigned on little TIFF, 64-bit unsigned on BigTIFF/NDPI
if self._bigtiff:
- fmt = fmt.translate(string.maketrans('yYzZD', 'qQqQQ'))
+ fmt = fmt.translate(str.maketrans('yYzZD', 'qQqQQ'))
elif self._ndpi:
- fmt = fmt.translate(string.maketrans('yYzZD', 'hHiIQ'))
+ fmt = fmt.translate(str.maketrans('yYzZD', 'hHiIQ'))
else:
- fmt = fmt.translate(string.maketrans('yYzZD', 'hHiII'))
+ fmt = fmt.translate(str.maketrans('yYzZD', 'hHiII'))
return self._fmt_prefix + fmt
def fmt_size(self, fmt):
@@ -146,6 +173,7 @@ def near_pointer(self, base, offset):
def read_fmt(self, fmt, force_list=False):
fmt = self._convert_format(fmt)
vals = struct.unpack(fmt, self.read(struct.calcsize(fmt)))
+ # vals = tuple(self.read(struct.calcsize(fmt)*4))
if len(vals) == 1 and not force_list:
return vals[0]
else:
@@ -180,18 +208,18 @@ def delete(self, expected_prefix=None):
for offset, length in zip(offsets, lengths):
offset = self._fh.near_pointer(self._out_pointer_offset, offset)
if DEBUG:
- print 'Zeroing', offset, 'for', length
+ print('Zeroing', offset, 'for', length)
self._fh.seek(offset)
if expected_prefix:
buf = self._fh.read(len(expected_prefix))
if buf != expected_prefix:
raise IOError('Unexpected data in image strip')
self._fh.seek(offset)
- self._fh.write('\0' * length)
+ self._fh.write(b'\0' * length)
# Remove directory
if DEBUG:
- print 'Deleting directory', self._number
+ print('Deleting directory', self._number)
self._fh.seek(self._out_pointer_offset)
out_pointer = self._fh.read_fmt('D')
self._fh.seek(self._in_pointer_offset)
@@ -205,9 +233,12 @@ def __init__(self, fh):
fh.read_fmt('HHZZ')
self._fh = fh
- def value(self):
- if self.type == ASCII:
+ def format_type(self):
+ if self.type == BYTE:
+ item_fmt = 'b'
+ elif self.type == ASCII:
item_fmt = 'c'
+ # item_fmt = 's'
elif self.type == SHORT:
item_fmt = 'H'
elif self.type == LONG:
@@ -220,8 +251,30 @@ def value(self):
item_fmt = 'd'
else:
raise ValueError('Unsupported type')
+ return item_fmt
+
+ def value(self):
+ # if self.type == BYTE:
+ # item_fmt = 'b'
+ # elif self.type == ASCII:
+ # item_fmt = 'c'
+ # elif self.type == SHORT:
+ # item_fmt = 'H'
+ # elif self.type == LONG:
+ # item_fmt = 'I'
+ # elif self.type == LONG8:
+ # item_fmt = 'Q'
+ # elif self.type == FLOAT:
+ # item_fmt = 'f'
+ # elif self.type == DOUBLE:
+ # item_fmt = 'd'
+ # else:
+ # raise ValueError('Unsupported type')
+
+ item_fmt = self.format_type()
fmt = '%d%s' % (self.count, item_fmt)
+
len = self._fh.fmt_size(fmt)
if len <= self._fh.fmt_size('Z'):
# Inline value
@@ -231,12 +284,42 @@ def value(self):
self._fh.seek(self._fh.near_pointer(self.start, self.value_offset))
items = self._fh.read_fmt(fmt, force_list=True)
if self.type == ASCII:
- if items[-1] != '\0':
+ if items[-1] != b'\0':
raise ValueError('String not null-terminated')
- return ''.join(items[:-1])
+ return b''.join(items[:-1])
else:
return items
+ def overwrite_entry(self, byte_string):
+ """Overwrites self.value with data and destroys the ENTIRE previous entry.
+ Extra space will be overwritten.
+
+ WARNING: Currently only supports BYTE and ASCII types."""
+ # TYPE = tif.directories[directory].entries[entry].type
+ # fmt = f"{tif.directories[directory].entries[entry].count}{self.format_type()}"
+ fmt = '%d%s' % (self.count, self.format_type())
+ # Hacky fix for strings
+ fmt = fmt.replace("c", "s")
+
+ # WARNING: Assumes entry contains a string. May fail if this function is extended to non-string types.
+ entry_ordinals = self.value()
+ old_value = "".join([chr(x) for x in entry_ordinals])
+ # The extra length that we'll need to overwrite with junk data.
+ null_pad = len(old_value) - len(byte_string)
+
+ # Write new value
+ self._fh.seek(self.value_offset)
+
+ if self.type == ASCII:
+ # ASCII uses nulls to divide substrings, so we pad with spaces instead.
+ new_value = byte_string + b" " * null_pad
+ self._fh.write_fmt(fmt, new_value)
+ elif self.type == BYTE:
+ new_value = byte_string + b"\0" * null_pad
+ self._fh.write_fmt(fmt, *new_value)
+ else:
+ raise ValueError('Unsupported type')
+
class MrxsFile(object):
def __init__(self, filename):
@@ -250,11 +333,11 @@ def __init__(self, filename):
self._dat = RawConfigParser()
self._dat.optionxform = str
try:
- with open(self._slidedatfile, 'rb') as fh:
+ with open(self._slidedatfile, 'r', encoding="utf-8-sig") as fh:
self._have_bom = (fh.read(len(UTF8_BOM)) == UTF8_BOM)
if not self._have_bom:
fh.seek(0)
- self._dat.readfp(fh)
+ self._dat.read_file(fh)
except IOError:
raise UnrecognizedFile
@@ -326,11 +409,13 @@ def _zero_record(self, record):
do_truncate = (fh.tell() == offset + length)
if DEBUG:
if do_truncate:
- print 'Truncating', path, 'to', offset
+ print('Truncating', path, 'to', offset)
else:
- print 'Zeroing', path, 'at', offset, 'for', length
+ print('Zeroing', path, 'at', offset, 'for', length)
fh.seek(offset)
buf = fh.read(len(JPEG_SOI))
+ # print(buf)
+ # exit()
if buf != JPEG_SOI:
raise IOError('Unexpected data in nonhier image')
if do_truncate:
@@ -341,7 +426,7 @@ def _zero_record(self, record):
def _delete_index_record(self, record):
if DEBUG:
- print 'Deleting record', record
+ print('Deleting record', record)
with open(self._indexfile, 'r+b') as fh:
entries_to_move = len(self._level_list) - record - 1
if entries_to_move == 0:
@@ -368,35 +453,35 @@ def _hier_keys_for_level(self, level):
def _rename_section(self, old, new):
if self._dat.has_section(old):
if DEBUG:
- print '[%s] -> [%s]' % (old, new)
+ print('[%s] -> [%s]' % (old, new))
self._dat.add_section(new)
for k, v in self._dat.items(old):
self._dat.set(new, k, v)
self._dat.remove_section(old)
elif DEBUG:
- print '[%s] does not exist' % old
+ print('[%s] does not exist' % old)
def _delete_section(self, section):
if DEBUG:
- print 'Deleting [%s]' % section
+ print('Deleting [%s]' % section)
self._dat.remove_section(section)
def _set_key(self, section, key, value):
if DEBUG:
prev = self._dat.get(section, key)
- print '[%s] %s: %s -> %s' % (section, key, prev, value)
+ print('[%s] %s: %s -> %s' % (section, key, prev, value))
self._dat.set(section, key, value)
def _rename_key(self, section, old, new):
if DEBUG:
- print '[%s] %s -> %s' % (section, old, new)
+ print('[%s] %s -> %s' % (section, old, new))
v = self._dat.get(section, old)
self._dat.remove_option(section, old)
self._dat.set(section, new, v)
def _delete_key(self, section, key):
if DEBUG:
- print 'Deleting [%s] %s' % (section, key)
+ print('Deleting [%s] %s' % (section, key))
self._dat.remove_option(section, key)
def _write(self):
@@ -404,8 +489,8 @@ def _write(self):
self._dat.write(buf)
with open(self._slidedatfile, 'wb') as fh:
if self._have_bom:
- fh.write(UTF8_BOM)
- fh.write(buf.getvalue().replace('\n', '\r\n'))
+ fh.write(UTF8_BOM.encode())
+ fh.write(buf.getvalue().replace('\n', '\r\n').encode())
def delete_level(self, layer_name, level_name):
level = self._levels[(layer_name, level_name)]
@@ -465,28 +550,64 @@ def __init__(self, dat, layer_id, level_id, record):
def accept(filename, format):
if DEBUG:
- print filename + ':', format
+ print(filename + ':', format)
+# TODO remove Filename from ImageDescription tags.
def do_aperio_svs(filename):
+ def cleanse_filename(filename_block):
+ key, val = filename_block.split(" = ")
+ val = "X"
+ anon_block = " = ".join([key, val])
+ return anon_block
+
+ # Check file
with TiffFile(filename) as fh:
# Check for SVS file
try:
desc0 = fh.directories[0].entries[IMAGE_DESCRIPTION].value()
- if not desc0.startswith('Aperio'):
+ if not desc0.startswith(b'Aperio'):
raise UnrecognizedFile
except KeyError:
raise UnrecognizedFile
accept(filename, 'SVS')
+ # Strip label
+ with TiffFile(filename) as fh:
+ # Find and delete label
+ for directory in fh.directories:
+ lines = directory.entries[IMAGE_DESCRIPTION].value().splitlines()
+ if len(lines) >= 2 and lines[1].startswith(b'label '):
+ # directory.delete(expected_prefix=LZW_CLEARCODE)
+ directory.delete()
+ print("Deleted label.")
+ break
+ else:
+ raise IOError("No label detected in SVS file")
+
+ # Strip macro
+ with TiffFile(filename) as fh:
# Find and delete label
for directory in fh.directories:
lines = directory.entries[IMAGE_DESCRIPTION].value().splitlines()
- if len(lines) >= 2 and lines[1].startswith('label '):
- directory.delete(expected_prefix=LZW_CLEARCODE)
+ if len(lines) >= 2 and lines[1].startswith(b'macro '):
+ directory.delete()
+ print("Deleted macro.")
break
else:
- raise IOError("No label in SVS file")
+ raise IOError("No macro detected in SVS file")
+
+ # Remove filename from ImageDescription(s). Why is this even a thing.
+ with TiffFile(filename) as fh:
+ for directory in fh.directories:
+ img_desc = directory.entries[IMAGE_DESCRIPTION].value().decode()
+ if "Filename" in img_desc:
+ print("\n", img_desc)
+ desc_bits = img_desc.split("|")
+ purified_bits = [bit if "Filename" not in bit else cleanse_filename(bit) for bit in desc_bits]
+ clean_desc = "|".join(purified_bits).encode()
+ directory.entries[IMAGE_DESCRIPTION].overwrite_entry(clean_desc)
+ print("Stored filename overwritten")
def do_hamamatsu_ndpi(filename):
@@ -512,13 +633,59 @@ def do_3dhistech_mrxs(filename):
except KeyError:
raise IOError('No label in MRXS file')
+# TODO-DONE incorporate the fix into the anonymization process
+def do_ventana_tif(filename):
+ with TiffFile(filename) as fh:
+ # Check for Ventana TIF file
+ try:
+ xml0 = subprocess.check_output(["tiffinfo", "-w", "-0", filename]).decode('utf-8')
+ # xml0 = fh.directories[0].entries[XMLPACKET].value()
+ if not "iScan" in xml0:
+ raise UnrecognizedFile
+
+ except subprocess.CalledProcessError:
+ raise UnrecognizedFile
+ accept(filename, 'SVS')
+
+ # Find and delete label
+ for directory in fh.directories:
+ lines = directory.entries[IMAGE_DESCRIPTION].value().splitlines()
+ if lines[0].startswith(b'Label_Image'):
+ # directory.delete(expected_prefix=LZW_CLEARCODE)
+ directory.delete()
+ break
+ else:
+ raise IOError("No label in TIF file")
+
+ # TODO-DONE handle ASCII type- s or c?
+ # Fix file
+ # We need to fully overwrite the old XMP tag data and add some new data as well.
+ # The writer also requires that the new value be of exactly the same size as the "value" allocated space in the image.
+ our_xmp = b""
+ our_image_desc = b""
+ fh.directories[1].entries[XMLPACKET].overwrite_entry(our_xmp)
+ fh.directories[1].entries[IMAGE_DESCRIPTION].overwrite_entry(our_image_desc)
+
format_handlers = [
+ do_ventana_tif,
do_aperio_svs,
do_hamamatsu_ndpi,
do_3dhistech_mrxs,
]
+def anonymize_slide(filename):
+ """Anonymize a single slide."""
+ for handler in format_handlers:
+ try:
+ print(handler)
+ handler(filename)
+ break
+ except UnrecognizedFile:
+ pass
+ else:
+ raise IOError('Unrecognized file type')
+
def _main():
global DEBUG
@@ -542,19 +709,25 @@ def _main():
exit_code = 0
for filename in filenames:
+ print(filename)
try:
- for handler in format_handlers:
- try:
- handler(filename)
- break
- except UnrecognizedFile:
- pass
- else:
- raise IOError('Unrecognized file type')
- except Exception, e:
+ # for handler in format_handlers:
+ # try:
+ # print(handler)
+ # handler(filename)
+ # break
+ # except UnrecognizedFile:
+ # pass
+ # else:
+ # raise IOError('Unrecognized file type')
+ anonymize_slide(filename)
+ except Exception as e:
if DEBUG:
raise
- print >>sys.stderr, '%s: %s' % (filename, str(e))
+ # print >>sys.stderr, '%s: %s' % (filename, str(e))
+ print('%s: %s' % (filename, str(e)), file=sys.stderr)
+ # print(f"{filename}: {str(e)}", file=sys.stderr)
+ # print("test", file=sys.stderr)
exit_code = 1
sys.exit(exit_code)