-
Notifications
You must be signed in to change notification settings - Fork 92
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Repozo incremental recover #403
base: master
Are you sure you want to change the base?
Changes from 5 commits
6543901
f62057c
fb51978
154c47c
d441b83
67ab5a6
c485afb
240f6bb
7291213
27d6296
acadc7a
d0adb00
315c76b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -73,6 +73,12 @@ | |||||||||||
Note: for the stdout case, the index file will **not** be restored | ||||||||||||
automatically. | ||||||||||||
|
||||||||||||
-F / --full | ||||||||||||
Force a full recover. By default, an incremental recover is made | ||||||||||||
if possible, by only copying the latest backup delta to the recovered | ||||||||||||
ZODB file. A full recover will always be done if a pack has occurred
since the last incremental backup. | ||||||||||||
|
||||||||||||
-w | ||||||||||||
--with-verify | ||||||||||||
Verify on the fly the backup files on recovering. This option runs | ||||||||||||
|
@@ -185,7 +191,7 @@ class Options: | |||||||||||
mode = None # BACKUP, RECOVER or VERIFY | ||||||||||||
file = None # name of input Data.fs file | ||||||||||||
repository = None # name of directory holding backups | ||||||||||||
full = False # True forces full backup | ||||||||||||
full = False # True forces full backup or full recovery | ||||||||||||
date = None # -D argument, if any | ||||||||||||
output = None # where to write recovered data; None = stdout | ||||||||||||
quick = False # -Q flag state | ||||||||||||
|
@@ -396,6 +402,41 @@ def func(data): | |||||||||||
return bytesread, sum.hexdigest() | ||||||||||||
|
||||||||||||
|
||||||||||||
def recover_repofiles(options, repofiles, datfile, outfp):
    """Concatenate the backup chunks in *repofiles* into *outfp*.

    When ``options.withverify`` is set, each chunk's size and md5 digest
    are checked against the records in *datfile* while it is copied, and
    a VerificationFail is raised on the first mismatch.  Otherwise the
    chunks are simply concatenated in one pass.
    """
    if not options.withverify:
        # Fast path: no per-chunk verification requested.
        nbytes, digest = concat(repofiles, outfp)
        log('Recovered %s bytes, md5: %s', nbytes, digest)
        return

    # Build the expected size/checksum table from the .dat metadata file.
    expected = {}
    with open(datfile) as dat:
        for record in dat:
            name, start, end, digest = record.split()
            path = os.path.join(options.repository,
                                os.path.basename(name))
            expected[path] = {
                'size': int(end) - int(start),
                'sum': digest,
            }

    grand_total = 0
    for chunk in repofiles:
        # Copy one chunk at a time so we can verify it immediately.
        nbytes, digest = concat([chunk], outfp)
        want = expected[chunk]
        if nbytes != want['size']:
            raise VerificationFail(
                "%s is %d bytes, should be %d bytes" % (
                    chunk, nbytes, want['size']))
        if digest != want['sum']:
            raise VerificationFail(
                "{} has checksum {} instead of {}".format(
                    chunk, digest, want['sum']))
        grand_total += nbytes
        log("Recovered chunk %s : %s bytes, md5: %s",
            chunk, nbytes, digest)
    log("Recovered a total of %s bytes", grand_total)
|
||||||||||||
|
||||||||||||
def gen_filedate(options):
    """Return the (Y, m, d, H, M, S) tuple used to stamp backup files.

    Tests may pin the clock by setting ``options.test_now``; otherwise
    the current UTC time is used.
    """
    default = time.gmtime()[:6]
    return getattr(options, 'test_now', default)
|
||||||||||||
|
@@ -673,15 +714,7 @@ def do_backup(options): | |||||||||||
do_full_backup(options) | ||||||||||||
|
||||||||||||
|
||||||||||||
def do_recover(options): | ||||||||||||
# Find the first full backup at or before the specified date | ||||||||||||
repofiles = find_files(options) | ||||||||||||
if not repofiles: | ||||||||||||
if options.date: | ||||||||||||
raise NoFiles(f'No files in repository before {options.date}') | ||||||||||||
else: | ||||||||||||
raise NoFiles('No files in repository') | ||||||||||||
|
||||||||||||
def do_full_recover(options, repofiles): | ||||||||||||
files_to_close = () | ||||||||||||
if options.output is None: | ||||||||||||
log('Recovering file to stdout') | ||||||||||||
|
@@ -698,50 +731,8 @@ def do_recover(options): | |||||||||||
files_to_close += (outfp,) | ||||||||||||
|
||||||||||||
try: | ||||||||||||
if options.withverify: | ||||||||||||
datfile = os.path.splitext(repofiles[0])[0] + '.dat' | ||||||||||||
with open(datfile) as fp: | ||||||||||||
truth_dict = {} | ||||||||||||
for line in fp: | ||||||||||||
fn, startpos, endpos, sum = line.split() | ||||||||||||
startpos = int(startpos) | ||||||||||||
endpos = int(endpos) | ||||||||||||
filename = os.path.join(options.repository, | ||||||||||||
os.path.basename(fn)) | ||||||||||||
truth_dict[filename] = { | ||||||||||||
'size': endpos - startpos, | ||||||||||||
'sum': sum, | ||||||||||||
} | ||||||||||||
totalsz = 0 | ||||||||||||
for repofile in repofiles: | ||||||||||||
reposz, reposum = concat([repofile], outfp) | ||||||||||||
expected_truth = truth_dict[repofile] | ||||||||||||
if reposz != expected_truth['size']: | ||||||||||||
raise VerificationFail( | ||||||||||||
"%s is %d bytes, should be %d bytes" % ( | ||||||||||||
repofile, reposz, expected_truth['size'])) | ||||||||||||
if reposum != expected_truth['sum']: | ||||||||||||
raise VerificationFail( | ||||||||||||
"{} has checksum {} instead of {}".format( | ||||||||||||
repofile, reposum, expected_truth['sum'])) | ||||||||||||
totalsz += reposz | ||||||||||||
log("Recovered chunk %s : %s bytes, md5: %s", | ||||||||||||
repofile, reposz, reposum) | ||||||||||||
log("Recovered a total of %s bytes", totalsz) | ||||||||||||
else: | ||||||||||||
reposz, reposum = concat(repofiles, outfp) | ||||||||||||
log('Recovered %s bytes, md5: %s', reposz, reposum) | ||||||||||||
|
||||||||||||
if options.output is not None: | ||||||||||||
last_base = os.path.splitext(repofiles[-1])[0] | ||||||||||||
source_index = '%s.index' % last_base | ||||||||||||
target_index = '%s.index' % options.output | ||||||||||||
if os.path.exists(source_index): | ||||||||||||
log('Restoring index file %s to %s', | ||||||||||||
source_index, target_index) | ||||||||||||
shutil.copyfile(source_index, target_index) | ||||||||||||
else: | ||||||||||||
log('No index file to restore: %s', source_index) | ||||||||||||
datfile = os.path.splitext(repofiles[0])[0] + '.dat' | ||||||||||||
recover_repofiles(options, repofiles, datfile, outfp) | ||||||||||||
finally: | ||||||||||||
for f in files_to_close: | ||||||||||||
f.close() | ||||||||||||
|
@@ -755,6 +746,92 @@ def do_recover(options): | |||||||||||
raise | ||||||||||||
|
||||||||||||
|
||||||||||||
def do_incremental_recover(options, repofiles):
    """Extend an existing recovered file with only the missing deltas.

    Compares the current size of ``options.output`` against the chunk
    boundaries recorded in the repository's .dat file, verifies the md5
    of the last chunk both sides share, then appends only the newer
    chunks.  Falls back to do_full_recover() whenever the target cannot
    be proven to be a prefix of the backup.
    """
    datfile = os.path.splitext(repofiles[0])[0] + '.dat'
    log('Recovering (incrementally) file to %s', options.output)
    # Opening 'r+b' also proves the target is writable before we start.
    with open(options.output, 'r+b') as outfp:
        outfp.seek(0, 2)
        initial_length = outfp.tell()

    # Walk the .dat records to find the last whole chunk that fits
    # entirely inside the already-recovered file.
    with open(datfile) as datfp:
        prev = None
        for entry in datfp:
            fn, startpos, endpos, _ = current = entry.split()
            startpos = int(startpos)
            endpos = int(endpos)
            if endpos > initial_length:
                break
            prev = current

    if prev == current:
        # Every recorded chunk already fits in the target file.
        if endpos == initial_length:
            log('Target file is same size as latest backup, '
                'doing nothing.')
            return
        log('Target file is larger than latest backup, '
            'falling back to a full recover.')
        return do_full_recover(options, repofiles)
    if prev is None:
        log('Target file smaller than full backup, '
            'falling back to a full recover.')
        return do_full_recover(options, repofiles)

    # Verify the last common chunk's checksum before trusting the prefix.
    vstart = int(prev[1])
    vend = int(prev[2])
    with open(options.output, 'r+b') as outfp:
        outfp.seek(vstart)
        actual_sum = checksum(outfp, vend - vstart)
        assert outfp.tell() == startpos, (outfp.tell(), startpos)
    if prev[3] != actual_sum:
        log('Last whole common chunk checksum did not match with backup, '
            'falling back to a full recover.')
        return do_full_recover(options, repofiles)

    if startpos < initial_length:
        # The target ends inside a chunk; that tail will be rewritten.
        log('Truncating target file %i bytes before its end',
            initial_length - startpos)
    filename = os.path.join(options.repository,
                            os.path.basename(fn))
    first_file_to_restore = repofiles.index(filename)
    assert first_file_to_restore > 0, (
        first_file_to_restore, options.repository, fn, filename, repofiles)

    # Work on a renamed copy so an interrupted recover is detectable.
    temporary_output_file = options.output + '.part'
    os.rename(options.output, temporary_output_file)
    with open(temporary_output_file, 'r+b') as outfp:
        outfp.seek(startpos)
        recover_repofiles(options,
                          repofiles[first_file_to_restore:],
                          datfile,
                          outfp)
    os.rename(temporary_output_file, options.output)
|
||||||||||||
|
||||||||||||
def do_recover(options):
    """Recover a ZODB file from the backups in the repository.

    Picks between a full recover and an incremental one, then restores
    the matching .index file next to the output (when not writing to
    stdout).  Raises NoFiles when the repository has nothing usable.
    """
    # Find the first full backup at or before the specified date
    repofiles = find_files(options)
    if not repofiles:
        if options.date:
            raise NoFiles(f'No files in repository before {options.date}')
        else:
            raise NoFiles('No files in repository')

    # An incremental recover appends to an existing output file, so it is
    # impossible when writing to stdout (options.output is None) and
    # pointless when the output file does not exist yet.  The explicit
    # None check matters: os.path.exists(None) raises TypeError.
    if (options.full or options.output is None
            or not os.path.exists(options.output)):
        do_full_recover(options, repofiles)
    else:
        do_incremental_recover(options, repofiles)

    if options.output is not None:
        # Restore the index saved alongside the newest backup file, if any.
        last_base = os.path.splitext(repofiles[-1])[0]
        source_index = '%s.index' % last_base
        target_index = '%s.index' % options.output
        if os.path.exists(source_index):
            log('Restoring index file %s to %s',
                source_index, target_index)
            shutil.copyfile(source_index, target_index)
        else:
            log('No index file to restore: %s', source_index)
|
||||||||||||
|
||||||||||||
def do_verify(options): | ||||||||||||
# Verify the sizes and checksums of all files mentioned in the .dat file | ||||||||||||
repofiles = find_files(options) | ||||||||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What do you think of something a bit longer, similar to this ?