-
Notifications
You must be signed in to change notification settings - Fork 0
/
create_hashtable.py
199 lines (174 loc) · 8.53 KB
/
create_hashtable.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import os
import os.path
import sys
import hashlib
import pickle
import time
import argparse
from collections import defaultdict
# -----------------------------------------------------------------------------------------------------------
# This is the module part that is used also by compare_hashtable
# -----------------------------------------------------------------------------------------------------------
BLOCK_SIZE = 2 ** 20  # files are hashed in 1 MiB chunks


def compute_hash(file_path):
    """Compute MD5 hash of a file specified as argument.

    The file is read in BLOCK_SIZE chunks so arbitrarily large files are
    hashed with constant memory.

    :type file_path: unicode
    :param file_path: path of the file to hash
    :return: MD5 digest as a byte string. If an IOError occurs part-way
        through reading, an error message is printed and the digest of the
        bytes read so far is returned (best effort, as in the original).
    """
    md = hashlib.md5()
    # `with` guarantees the handle is closed on every path (replaces the
    # manual try/finally + close of the original).
    with open(file_path, 'rb') as df:
        try:
            # iter(callable, sentinel) yields chunks until read() returns b''
            for chunk in iter(lambda: df.read(BLOCK_SIZE), b''):
                md.update(chunk)
        except IOError as ioe:
            print('ERROR: Cannot compute hash for {0}: {1}'.format(file_path, ioe.strerror))
    return md.digest()
def create_tuple_dd():
    """Factory for the inner level of the hash table: a defaultdict(tuple).

    Defined as a module-level function (instead of a lambda) so that the
    outer defaultdict(create_tuple_dd) structure remains picklable.
    See http://stackoverflow.com/questions/16439301/cant-pickle-defaultdict
    """
    inner_level = defaultdict(tuple)
    return inner_level
def load_hash_table(pkl_name):
    """Load file digest database stored in a file given as argument.

    :type pkl_name: unicode
    :param pkl_name: File MD5 hash database stored as a Python pickle file
    :return: File MD5 hash database represented as
        defaultdict(defaultdict(tuple)), or None when the pickle file is
        empty/truncated (EOF hit while unpickling).
    """
    # `with` guarantees the handle is closed on every path (replaces the
    # manual try/finally + close of the original).
    with open(pkl_name, 'rb') as df:
        try:
            hash_data = pickle.load(df)
        except EOFError:
            # Truncated/empty database file: report and signal via None so
            # the caller can start with a fresh table.
            print('ERROR: Cannot read hash table: EOF hit.')
            hash_data = None
    return hash_data
def save_hash_table(pkl_name, hash_data):
    """Save the file digest database to a file.

    Progress/completion messages are the caller's responsibility (matching
    load_hash_table); the stray internal "done." print was removed because
    the callers already print "done." themselves, which produced the message
    twice.

    :type pkl_name: unicode
    :param pkl_name: File where the MD5 hash database will be stored as a Python pickle fle
    :param hash_data: MD5 hash database represented as defaultdict(defaultdict(tuple))
    """
    # `with` guarantees the handle is closed even if pickling raises.
    with open(pkl_name, 'wb') as df:
        pickle.dump(hash_data, df, pickle.HIGHEST_PROTOCOL)
# -----------------------------------------------------------------------------------------------------------
# Here starts the code executed when called from command line ...
# -----------------------------------------------------------------------------------------------------------
if __name__ == '__main__':
    # NOTE(review): this section is Python-2-only (print statements with
    # trailing commas, unicode()) - confirm the target interpreter.
    # Parse command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('database', help='database file')
    parser.add_argument('root', help='root directory')
    parser.add_argument('-a', '--append', help='append to existing history data', action='store_true')
    parser.add_argument('-d', '--debug', help='switch on debugging output', action='store_true')
    parser.add_argument('-p', '--save-period', help='how ofter shall database snapshot be saved',
                        type=int, default=60, metavar='seconds')
    args = parser.parse_args()
    # Assign arguments to local variables
    pkl_hash_name = args.database
    root_dir = args.root
    # Check if the root directory exists; bail out early with an error otherwise
    if not os.path.exists(root_dir):
        sys.exit('ERROR: Root directory {0} does not exist.'.format(root_dir))
    # print 'Stdout encoding is', sys.stdout.encoding
    # Check if we shall update the pickle file (i.e. a database already exists)
    hash_table = None
    if os.path.exists(pkl_hash_name):
        print 'Loading hash table data from {0} ...'.format(pkl_hash_name),
        hash_table = load_hash_table(pkl_hash_name)
        print 'done.'
    # Do not access the hash table in case that the read failed
    if hash_table:
        # A default would be to verify that the file information stored in the database is up-to-date.
        # However this could take a while and sometimes (after a crash, computer shutdown etc.) we
        # are reasonably sure that the data is up-to-date and we want to skip the initial verification
        # phase.
        if not args.append:
            print 'Pruning non-existing files from the hash table ...'
            # Iterate over snapshots of the key lists because entries are
            # deleted from the dictionaries while iterating.
            for filesize in list(hash_table.keys()):
                for full_path in list(hash_table[filesize].keys()):
                    # Paths are stored relative to the root; rebuild the absolute path to stat it
                    abs_path = os.path.join(root_dir, full_path)
                    if not os.path.exists(abs_path):
                        del hash_table[filesize][full_path]
                        # Report the pruned entry, re-encoded for the terminal encoding
                        out_str = u'- ({0}){1}{2}'.format(root_dir, os.path.sep, full_path)
                        print out_str.encode(sys.stdout.encoding, errors='replace')
            print '... done.'
        else:
            print 'Skipping hash table verification as requested'
    # If there was no hash table to read, create an empty one
    if not hash_table:
        print 'Creating a new hash table ...'
        # No lambda can be used here to construct the second level of the
        # defaultdict: we need the database to be pickable.
        # See `create_tuple_dd()` function above.
        hash_table = defaultdict(create_tuple_dd)
    # Save the data at least once
    do_save_data = True
    # Initialise the time trigger for saving the data
    save_data_time = time.time() + args.save_period
    # Walk the directory tree that starts at `root_dir`. The directory and file names shall be in
    # Unicode, otherwise we will get into all kind of trouble when trying to stat() the files.
    # Unfortunately the Unicode strings have to be reencoded for terminal output and care has to be
    # taken to filter out characters that cannot be displayed using the terminal character encoding.
    for basedir, dirs, files in os.walk(unicode(root_dir)):
        # We are interested only in files
        for file_name in files:
            # In order to access the file, an absolute path is needed
            full_path = os.path.join(basedir, file_name)
            # Only relative path is stored in the database though (so that the whole tree can be
            # relocated to different place without the need to collect the file information again).
            rel_path = os.path.relpath(full_path, root_dir)
            # Get the file information
            stat_info = os.stat(full_path)
            # Default is to compute the hash of the file
            do_hash = True
            # Check for previously stored information, if any.
            # NOTE: indexing the defaultdict inserts an empty tuple for a
            # previously unseen file as a side effect.
            file_info = hash_table[stat_info.st_size][rel_path]
            if file_info:
                # The file hash has been already computed.
                # Check the file modification time - if it differs from the stored
                # one, we will recompute the hash
                do_hash = (file_info[0] != stat_info.st_mtime)
            # If we compute new hash, we have to update the dictionary as well
            if do_hash:
                file_hash = compute_hash(full_path)
                file_info = (stat_info.st_mtime, file_hash)
                hash_table[stat_info.st_size][rel_path] = file_info
                # Mark the table dirty so a snapshot gets written
                do_save_data = True
                print '+',  # '+' marks a (re)hashed file in the listing
            else:
                print ' ',  # unchanged file - blank keeps the listing aligned
            # Convert st_mtime into a meaningful time information
            if stat_info.st_mtime < 0:
                # TODO: Consider some trickery with datetime.datetime()
                # Currently a timestamp -255601696.0 which represents Windows date 2097-12-23 23:00:00 results
                # in the following:
                # >>> epo = datetime.datetime(1970, 1, 1)
                # >>> delta = datetime.timedelta(seconds=(-255601696))
                # >>> epo+delta
                # datetime.datetime(1961, 11, 25, 15, 31, 44)
                mtime_str = '(value out of range)'
            else:
                mtime_str = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(stat_info.st_mtime))
            # Convert the file_path string into something that can be printed on terminal
            full_path = full_path.encode(sys.stdout.encoding, errors='replace')
            print full_path, stat_info.st_size, mtime_str
            # Optionally save the current hash table (periodic snapshot, only when dirty)
            if time.time() > save_data_time and do_save_data:
                print 'Saving pickle data to {0} ...'.format(pkl_hash_name),
                save_hash_table(pkl_hash_name, hash_table)
                save_data_time = time.time() + args.save_period
                do_save_data = False
                print 'done.'
    # Final snapshot for any changes accumulated since the last periodic save
    if do_save_data:
        print 'Saving final pickle data to {0} ...'.format(pkl_hash_name),
        save_hash_table(pkl_hash_name, hash_table)
        print 'done.'