#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
use a database to identify and organize files.
"""
import os
import sys
import shutil
import hashlib
import argparse
import zipfile
from collections import defaultdict
from collections import Counter
__author__ = "aquaman"
__date__ = "2021/07/25"
__version__ = "$Revision: 3.7"


# *********************************************************************#
#                                                                      #
#                              Functions                               #
#                                                                      #
# *********************************************************************#

if __name__ == '__main__':
    """
    Parse arguments from command line.
    """
    parser = argparse.ArgumentParser(
        description="use a database to identify and organize files.")

    # Add support for boolean arguments. Allows us to accept 1-argument forms
    # of boolean flags whose values are any of "yes", "true", "t" or "1".
    parser.register('type', 'bool', (lambda x: x.lower() in
                                     ("yes", "true", "t", "1")))

    parser.add_argument("-i", "--input_folder",
                        dest="source_folder",
                        required=True,
                        help="set source folder")

    parser.add_argument("-d", "--database",
                        dest="target_database",
                        required=True,
                        help="set target database")

    parser.add_argument("-o", "--output_folder",
                        dest="output_folder",
                        required=True,
                        help="set output folder")

    parser.add_argument("-m", "--missing",
                        dest="missing_files",
                        default=None,
                        help="list missing files")

    parser.add_argument("--file_strategy",
                        choices=["copy", "hardlink", "smart"],
                        dest="file_strategy",
                        default="copy",
                        help=("Strategy for how to get files into the output "
                              "folder. Smart uses copy for the first instance "
                              "of a file and hardlinks successive files to "
                              "that first copy."))

    # Valid uses of this flag include: -s, -s true, -s yes, --skip_existing=1
    parser.add_argument("-s", "--skip_existing",
                        dest="skip_existing",
                        default=False,
                        # nargs and const below allow us to accept the
                        # zero-argument form of --skip_existing
                        nargs="?",
                        const=True,
                        type='bool',
                        help=("Skip files which already exist at the "
                              "destination without overwriting them."))

    # Valid uses of this flag include: -l, -l true, -l yes, --new_line=1
    parser.add_argument("-l", "--new_line",
                        dest="new_line",
                        default=False,
                        # nargs and const below allow us to accept the
                        # zero-argument form of --new_line
                        nargs="?",
                        const=True,
                        type='bool',
                        help=("Changes the way stdout is printed, and "
                              "allows for UI subprocess monitoring."))

    # Valid uses of this flag include: -x, -x true, -x yes,
    # --drop_initial_directory=1
    parser.add_argument("-x", "--drop_initial_directory",
                        dest="drop_initial_directory",
                        default=False,
                        # nargs and const below allow us to accept the
                        # zero-argument form of --drop_initial_directory
                        nargs="?",
                        const=True,
                        type='bool',
                        help=("Drops the first directory of each path in the "
                              "SMDB file so you can customize the top-level "
                              "name."))

    ARGS = parser.parse_args()
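
# A hypothetical invocation (folder and file names below are illustrative,
# not taken from the project itself):
#
#   python3 build_pack.py -i ./source_dump -d ./pack.smdb -o ./output_pack \
#       -m missing.txt --file_strategy smart -l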


def write_empty_file(dest):
    """
    Creates an empty file at the destination path.
    Arguments:
        dest - The destination where the empty file will be located
    """
    # When destination file exists...
    # Do nothing if skip_existing is set, otherwise remove the file (to
    # avoid FileExistsError when writing the new file).
    if os.path.exists(dest):
        if ARGS.skip_existing:
            return
        else:
            os.remove(dest)

    # Create directories if needed
    base_dir = os.path.dirname(os.path.abspath(dest))
    if not os.path.exists(base_dir):
        try:
            os.makedirs(base_dir, exist_ok=True)
        except (FileNotFoundError, OSError):
            # Windows' default API is limited to paths of 260 characters;
            # the \\?\ prefix opts into long-path support
            fixed_base_dir = u'\\\\?\\' + base_dir
            os.makedirs(fixed_base_dir, exist_ok=True)

    # Create empty file
    try:
        open(dest, 'a').close()
    except (FileNotFoundError, OSError):
        fixed_dest = u'\\\\?\\' + os.path.abspath(dest)
        open(fixed_dest, 'a').close()


def copy_file(source, dest, original):
    """
    Copy a file from source to destination using a configurable file copy
    strategy controlled by the --file_strategy option.
    Arguments:
        source - The file to copy/hardlink
        dest - The destination where the new file will be located
        original - The first file associated with a specific hash value
    """
    if ARGS.file_strategy == "copy":
        copy_fn = shutil.copyfile
    elif ARGS.file_strategy == "hardlink":
        copy_fn = os.link
    elif ARGS.file_strategy == "smart":
        if original == dest:
            # first occurrence of this hash: make a real copy
            copy_fn = shutil.copyfile
        else:
            # later occurrence: hardlink back to the first copy
            copy_fn = os.link
            source = original
    else:
        raise Exception("Unknown copy strategy {}".format(ARGS.file_strategy))

    # When destination file exists...
    # Do nothing if skip_existing is set, otherwise remove the file (to
    # avoid FileExistsError when writing the new file).
    if os.path.exists(dest):
        if ARGS.skip_existing:
            return
        else:
            os.remove(dest)

    try:
        # copy the file to the new directory
        copy_fn(source, dest)
    except FileNotFoundError:
        # Windows' default API is limited to paths of 260 characters
        fixed_dest = u'\\\\?\\' + os.path.abspath(dest)
        copy_fn(source, fixed_dest)
    except OSError:
        # hardlinking can fail (e.g. across filesystems); fall back to a copy
        try:
            shutil.copyfile(source, dest)
        except FileNotFoundError:
            # Windows' default API is limited to paths of 260 characters
            fixed_dest = u'\\\\?\\' + os.path.abspath(dest)
            shutil.copyfile(source, fixed_dest)
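
# Design note on the "smart" strategy (inferred from the code above, not
# documented in the original): the first output path for a given hash gets a
# real copy, and every later path for the same hash is hardlinked to that
# first copy, so N identical files cost roughly the disk space of one.
# Hardlinks only work when the output paths live on a single filesystem that
# supports them, which is why the OSError handler above falls back to copying.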


def extract_file(filename, entry, method, dest):
    """
    extract a single entry from an archive to the given destination path.
    """
    if method == 'zip':
        # Stolen shamelessly from https://stackoverflow.com/a/4917469
        # Eliminates the random directories that appear when a file is
        # extracted from a zip file
        with zipfile.ZipFile(filename) as zip_file:
            for member in zip_file.namelist():
                # skip other files in the zip
                if member != entry:
                    continue
                basename = os.path.basename(member)
                # skip directories
                if not basename:
                    continue
                # copy file (taken from zipfile's extract)
                source = zip_file.open(member)
                target = open(dest, "wb")
                with source, target:
                    shutil.copyfileobj(source, target)


def parse_database(target_database, drop_initial_directory):
    """
    store hash values and filenames from the database file in a dictionary.
    """
    db = defaultdict(list)  # missing key's default value is an empty list
    number_of_entries = 0
    with open(target_database, "r") as target_database:
        for line in target_database:
            hash_sha256, filename, _, _, hash_crc = line.strip().split("\t")[0:5]
            number_of_entries += 1
            if drop_initial_directory:
                first_level, filename = filename.split("/", 1)
            filename = os.path.normpath(filename)
            # index each file under both its SHA-256 and its CRC32
            db[hash_sha256].append(filename)
            db[hash_crc].append(filename)
    return db, number_of_entries
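
# The database is assumed to be an SMDB-style tab-separated file whose first
# five columns are SHA-256, relative path, two further hash columns, and
# CRC32, matching the unpacking in parse_database() above. A sketch of one
# line (hash values and path are illustrative):
#
#   <sha256-hex>\tPack Name/Folder/game.bin\t<hash>\t<hash>\t00000000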


def print_progress(current, total, end):
    print_function("processing file: {:>9} / {}".format(current, total),
                   end=end)


def print_function(text, end, file=sys.stdout, flush=True):
    print(text, end=end, file=file, flush=flush)


def parse_folder(source_folder, db, output_folder):
    """
    read each file, produce a hash value and place it in the directory tree.
    """
    i = 0
    total = len([os.path.join(dp, f) for dp, dn, fn in
                 os.walk(os.path.expanduser(source_folder)) for f in fn])
    for dirpath, dirnames, filenames in os.walk(source_folder):
        if filenames:
            for f in filenames:
                filename = os.path.join(os.path.normpath(dirpath),
                                        os.path.normpath(f))
                absolute_filename = u'\\\\?\\' + os.path.abspath(filename)
                try:
                    hashes = get_hashes(filename)
                except FileNotFoundError:
                    # retry with the Windows long-path prefix
                    hashes = get_hashes(absolute_filename)
                for h, info in hashes.items():
                    if h in db:
                        # we have a hit
                        loop = 0
                        for entry in db[h]:
                            loop += 1
                            new_path = os.path.join(output_folder,
                                                    os.path.dirname(entry))
                            # create directory structure if need be
                            if not os.path.exists(new_path):
                                os.makedirs(new_path, exist_ok=True)
                            new_file = os.path.join(output_folder, entry)
                            if loop == 1:
                                original = new_file
                            if (not ARGS.skip_existing or
                                    not os.path.exists(new_file)):
                                if info['archive']:
                                    # extract file from archive to directory
                                    extract_file(info['filename'],
                                                 info['archive']['entry'],
                                                 info['archive']['type'],
                                                 new_file)
                                else:
                                    # copy the file to the new directory
                                    copy_file(info['filename'],
                                              new_file,
                                              original)
                        # remove the hit from the database
                        del db[h]
                i += 1
                print_progress(i, total, END_LINE)
    else:
        # for/else: runs once the walk completes; finish the
        # carriage-return progress line with a real newline
        if not ARGS.new_line:
            print_progress(i, total, "\n")


def get_hashes(filename):
    """
    return dictionary of hashes containing:
    - sha256 hash of the file itself
    - additional hashes if the file is a compressed archive
    """
    hashes = {}
    h = hashlib.sha256()
    # hash the file itself
    with open(filename, "rb", buffering=0) as f:
        # read in small chunks while computing the hash to
        # avoid loading the whole file into memory
        for b in iter(lambda: f.read(128 * 1024), b''):
            h.update(b)
    # add file hash to dict
    hashes[h.hexdigest()] = {
        'filename': filename,
        'archive': None
    }
    # if this is a zipfile, extract CRCs from the header
    if zipfile.is_zipfile(filename):
        try:
            with zipfile.ZipFile(filename, 'r') as z:
                for info in z.infolist():
                    # add archive entry hash to dict
                    crc_formatted_hex = '{0:08x}'.format(info.CRC & 0xffffffff)
                    hashes[crc_formatted_hex] = {
                        'filename': filename,
                        'archive': {
                            'entry': info.filename,
                            'type': 'zip'
                        }
                    }
        except (OSError, UnicodeDecodeError, zipfile.BadZipFile):
            # Possibly a normal file containing a zip magic number?
            print('**** ERROR ****')
            print('**** Attempted to parse {} as a zip archive.'.format(
                filename))
            print('**** If this file is not a zip archive, you may safely'
                  ' ignore this error.')
            print('***************')
    return hashes
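
# For illustration (file names and hash values hypothetical): calling
# get_hashes("game.zip") on a zip archive yields one SHA-256 key for the
# archive itself plus one zero-padded lowercase CRC32 key per archive
# member, e.g.:
#
#   {'9f86d081884c7d65...': {'filename': 'game.zip', 'archive': None},
#    '1c291ca3': {'filename': 'game.zip',
#                 'archive': {'entry': 'game.bin', 'type': 'zip'}}}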


# *********************************************************************#
#                                                                      #
#                                 Body                                 #
#                                                                      #
# *********************************************************************#

if __name__ == '__main__':
    SOURCE_FOLDER = ARGS.source_folder
    TARGET_DATABASE = ARGS.target_database
    OUTPUT_FOLDER = ARGS.output_folder
    MISSING_FILES = ARGS.missing_files
    END_LINE = "\n" if ARGS.new_line else "\r"
    DROP_INITIAL_DIRECTORY = ARGS.drop_initial_directory
    DATABASE, NUMBER_OF_ENTRIES = parse_database(TARGET_DATABASE,
                                                 DROP_INITIAL_DIRECTORY)
    parse_folder(SOURCE_FOLDER, DATABASE, OUTPUT_FOLDER)

    # Observed files will have either their SHA256 or their CRC32 entry
    # deleted (or both) from the database. For missing files, both entries
    # will still be present. If the hash for an empty file exists as both
    # SHA256 and CRC32, then it is missing and will be created below.
    #
    # For reference, an empty file will always have the following hashes:
    # SHA256: e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
    # SHA1: da39a3ee5e6b4b0d3255bfef95601890afd80709
    # MD5SUM: d41d8cd98f00b204e9800998ecf8427e
    # CRC32: 00000000
    if (('e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855'
         in DATABASE) and ('00000000' in DATABASE)):
        for file in DATABASE['00000000']:
            empty_file = os.path.join(OUTPUT_FOLDER, file)
            write_empty_file(empty_file)

    # Since missing files will be in the database twice as explained above,
    # only keep the SHA256 entry (64-character hash) when listing out the
    # missing files. This prevents missing files from being counted more
    # than once.
    file_counts = Counter([str(i) for i in DATABASE.values()])
    duplicate_files = set([str(i) for i in file_counts if file_counts[i] == 2])
    missing_file_list = [(os.path.basename(DATABASE[entry][0]), entry)
                         for entry in DATABASE
                         if (str(DATABASE[entry]) in duplicate_files
                             and len(entry) == 64)]
    missing_entry_count = sum([len(DATABASE[missing_file[1]])
                               for missing_file in missing_file_list])
    FOUND_ENTRIES = NUMBER_OF_ENTRIES - missing_entry_count
    if missing_file_list:
        missing_file_list.sort()
        if MISSING_FILES:
            with open(MISSING_FILES, "w") as missing_files:
                for missing_file, entry in missing_file_list:
                    print(missing_file, entry, sep="\t", file=missing_files)
    else:
        print("no missing file")
    COVERAGE = round(100.0 * FOUND_ENTRIES / NUMBER_OF_ENTRIES, 2)
    print('coverage: {}/{} ({}%)'.format(FOUND_ENTRIES,
                                         NUMBER_OF_ENTRIES,
                                         COVERAGE),
          file=sys.stdout)
    sys.exit(0)
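
# Example of the final summary line produced above (numbers illustrative):
#
#   coverage: 4021/4096 (98.17%)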