zopefoundation · Sebatyne · Oct 18, 2024 · Oct 17, 2024 · Oct 22, 2024 · Oct 22, 2024
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -5,6 +5,9 @@
 6.0.1 (unreleased)
 ==================
 
+- Support incremental recovery in repozo.
+  It makes it much faster in a day-to-day scenario.
- Support incremental recovery in repozo.
-  It makes it much faster in a day-to-day scenario.
+- repozo: Change restoration to be incremental by default, unless ``--full`` is
+  provided.
+  Repozo now tries to append the new incremental deltafs on previously restored
+  filestorage, if the file sizes and the checksum of the last restored increment
+  match; otherwise it will fall back to a full recover.
+  For details see `#403 <https://github.com/zopefoundation/ZODB/pull/403>`_.
- Support incremental recovery in repozo.
-  It makes it much faster in a day-to-day scenario.
+- repozo: Change restoration to be incremental by default, unless ``--full`` is
+  provided.
+  Repozo now tries to append the new incremental deltafs on previously restored
+  filestorage, if the file sizes and the checksum of the last restored increment
+  match; otherwise it will fall back to a full recover.
+  For details see `#403 <https://github.com/zopefoundation/ZODB/pull/403>`_.
+
 
 6.0 (2024-03-20)
 ================

diff --git a/src/ZODB/scripts/repozo.py b/src/ZODB/scripts/repozo.py
@@ -73,6 +73,12 @@
         Note:  for the stdout case, the index file will **not** be restored
         automatically.
 
+    -F / --full
+        Force a full recover. By default, an incremental recover is made
+        if possible, by only copying the latest backup delta to the recovered
+        ZODB file. A full recover will always be done if a pack has occurred
+        since the last incremental backup.
+
     -w
     --with-verify
         Verify on the fly the backup files on recovering. This option runs
@@ -185,7 +191,7 @@ class Options:
         mode = None         # BACKUP, RECOVER or VERIFY
         file = None         # name of input Data.fs file
         repository = None   # name of directory holding backups
-        full = False        # True forces full backup
+        full = False        # True forces full backup or full recovery
         date = None         # -D argument, if any
         output = None       # where to write recovered data; None = stdout
         quick = False       # -Q flag state
@@ -396,6 +402,41 @@ def func(data):
     return bytesread, sum.hexdigest()
 
 
+def recover_repofiles(options, repofiles, datfile, outfp):
+    """Write the content of ``repofiles`` to ``outfp``, optionally verified.
+
+    options -- parsed command line options; ``withverify`` and
+        ``repository`` are read here.
+    repofiles -- ordered list of backup file paths to concatenate.
+    datfile -- path of the ``.dat`` file; each line holds a file name, a
+        start offset, an end offset and a checksum.
+    outfp -- writable binary file object, already positioned where the
+        data must be written.
+
+    With ``options.withverify`` set, each file is copied on its own and its
+    size and checksum are compared with the ``.dat`` entry; a mismatch
+    raises VerificationFail.  Otherwise all files are copied in one go.
+    """
+    if options.withverify:
+        truth_dict = {}
+        with open(datfile) as fp:
+            for line in fp:
+                fn, startpos, endpos, expected_sum = line.split()
+                startpos = int(startpos)
+                endpos = int(endpos)
+                # The .dat file may record another directory; only the
+                # base name is trusted and re-anchored in the repository.
+                filename = os.path.join(options.repository,
+                                        os.path.basename(fn))
+                truth_dict[filename] = {
+                    'size': endpos - startpos,
+                    'sum': expected_sum,
+                }
+        # Initialised once, before the copy loop, so that it is always
+        # bound and accumulates over every recovered chunk.
+        totalsz = 0
+        for repofile in repofiles:
+            reposz, reposum = concat([repofile], outfp)
+            expected_truth = truth_dict[repofile]
+            if reposz != expected_truth['size']:
+                raise VerificationFail(
+                    "%s is %d bytes, should be %d bytes" % (
+                        repofile, reposz, expected_truth['size']))
+            if reposum != expected_truth['sum']:
+                raise VerificationFail(
+                    "{} has checksum {} instead of {}".format(
+                        repofile, reposum, expected_truth['sum']))
+            totalsz += reposz
+            log("Recovered chunk %s : %s bytes, md5: %s",
+                repofile, reposz, reposum)
+        # Logged once, after all chunks have been verified and written.
+        log("Recovered a total of %s bytes", totalsz)
+    else:
+        reposz, reposum = concat(repofiles, outfp)
+        log('Recovered %s bytes, md5: %s', reposz, reposum)
+
+
 def gen_filedate(options):
     return getattr(options, 'test_now', time.gmtime()[:6])
 
@@ -673,15 +714,7 @@ def do_backup(options):
     do_full_backup(options)
 
 
-def do_recover(options):
-    # Find the first full backup at or before the specified date
-    repofiles = find_files(options)
-    if not repofiles:
-        if options.date:
-            raise NoFiles(f'No files in repository before {options.date}')
-        else:
-            raise NoFiles('No files in repository')
-
+def do_full_recover(options, repofiles):
     files_to_close = ()
     if options.output is None:
         log('Recovering file to stdout')
@@ -698,50 +731,8 @@ def do_recover(options):
         files_to_close += (outfp,)
 
     try:
-        if options.withverify:
-            datfile = os.path.splitext(repofiles[0])[0] + '.dat'
-            with open(datfile) as fp:
-                truth_dict = {}
-                for line in fp:
-                    fn, startpos, endpos, sum = line.split()
-                    startpos = int(startpos)
-                    endpos = int(endpos)
-                    filename = os.path.join(options.repository,
-                                            os.path.basename(fn))
-                    truth_dict[filename] = {
-                        'size': endpos - startpos,
-                        'sum': sum,
-                    }
-            totalsz = 0
-            for repofile in repofiles:
-                reposz, reposum = concat([repofile], outfp)
-                expected_truth = truth_dict[repofile]
-                if reposz != expected_truth['size']:
-                    raise VerificationFail(
-                        "%s is %d bytes, should be %d bytes" % (
-                            repofile, reposz, expected_truth['size']))
-                if reposum != expected_truth['sum']:
-                    raise VerificationFail(
-                        "{} has checksum {} instead of {}".format(
-                            repofile, reposum, expected_truth['sum']))
-                totalsz += reposz
-                log("Recovered chunk %s : %s bytes, md5: %s",
-                    repofile, reposz, reposum)
-            log("Recovered a total of %s bytes", totalsz)
-        else:
-            reposz, reposum = concat(repofiles, outfp)
-            log('Recovered %s bytes, md5: %s', reposz, reposum)
-
-        if options.output is not None:
-            last_base = os.path.splitext(repofiles[-1])[0]
-            source_index = '%s.index' % last_base
-            target_index = '%s.index' % options.output
-            if os.path.exists(source_index):
-                log('Restoring index file %s to %s',
-                    source_index, target_index)
-                shutil.copyfile(source_index, target_index)
-            else:
-                log('No index file to restore: %s', source_index)
+        datfile = os.path.splitext(repofiles[0])[0] + '.dat'
+        recover_repofiles(options, repofiles, datfile, outfp)
     finally:
         for f in files_to_close:
             f.close()
@@ -755,6 +746,92 @@ def do_recover(options):
             raise
 
 
+def do_incremental_recover(options, repofiles):
+    """Update an already recovered file with only the missing backup chunks.
+
+    options -- parsed command line options; ``output`` must name an
+        existing file and ``repository`` the backup directory.
+    repofiles -- ordered backup files, the full backup first.
+
+    The ``.dat`` file next to the full backup is scanned to find the last
+    chunk that ends at or before the current size of the target file.  If
+    that chunk's recorded checksum matches the target file content, the
+    remaining chunks are written from that point on.  In every case where
+    this cannot be established, a full recover is done instead.
+    """
+    datfile = os.path.splitext(repofiles[0])[0] + '.dat'
+    log('Recovering (incrementally) file to %s', options.output)
+    with open(options.output, 'r+b') as outfp:
+        # Note that we do not open the FileStorage to use getSize here,
+        # we really want the actual file size, even if there is invalid
+        # transaction data at the end.
+        outfp.seek(0, os.SEEK_END)
+        initial_length = outfp.tell()
+
+    # ``chunk`` is the first .dat entry ending past the target file (or the
+    # last entry if none does); ``previous_chunk`` is the entry before it.
+    previous_chunk = chunk = None
+    with open(datfile) as fp:
+        for line in fp:
+            fn, startpos, endpos, _ = chunk = line.split()
+            startpos = int(startpos)
+            endpos = int(endpos)
+            if endpos > initial_length:
+                break
+            previous_chunk = chunk
+
+    if previous_chunk is None:
+        # No chunk fits in the target file.  This guard is checked first
+        # so that an empty .dat file also ends here instead of failing on
+        # unbound loop variables.
+        log('Target file smaller than full backup, '
+            'falling back to a full recover.')
+        return do_full_recover(options, repofiles)
+    if previous_chunk == chunk:
+        # The loop ran to the end: every chunk fits in the target file.
+        if endpos == initial_length:
+            log('Target file is same size as latest backup, '
+                'doing nothing.')
+            return
+        log('Target file is larger than latest backup, '
+            'falling back to a full recover.')
+        return do_full_recover(options, repofiles)
+
+    check_startpos = int(previous_chunk[1])
+    check_endpos = int(previous_chunk[2])
+    with open(options.output, 'r+b') as outfp:
+        outfp.seek(check_startpos)
+        check_sum = checksum(outfp, check_endpos - check_startpos)
+        # Consecutive chunks are expected to be contiguous in the file.
+        assert outfp.tell() == startpos, (outfp.tell(), startpos)
+    if previous_chunk[3] != check_sum:
+        log('Last whole common chunk checksum did not match with backup, '
+            'falling back to a full recover.')
+        return do_full_recover(options, repofiles)
+
+    if startpos < initial_length:
+        # No explicit truncate is done here: the data is rewritten from
+        # ``startpos`` and the first rewritten chunk is recorded as ending
+        # past the old end of file.
+        log('Truncating target file %i bytes before its end',
+            initial_length - startpos)
+    filename = os.path.join(options.repository,
+                            os.path.basename(fn))
+    first_file_to_restore = repofiles.index(filename)
+    assert first_file_to_restore > 0, (
+        first_file_to_restore, options.repository, fn, filename, repofiles)
+
+    # Work on a renamed copy so that the target name never designates a
+    # partially updated file.
+    temporary_output_file = options.output + '.part'
+    os.rename(options.output, temporary_output_file)
+    with open(temporary_output_file, 'r+b') as outfp:
+        outfp.seek(startpos)
+        recover_repofiles(options,
+                          repofiles[first_file_to_restore:],
+                          datfile,
+                          outfp)
+    os.rename(temporary_output_file, options.output)
+
+
+def do_recover(options):
+    """Recover a file from the repository, incrementally when possible.
+
+    options -- parsed command line options; ``date``, ``full`` and
+        ``output`` are read here.
+
+    Raises NoFiles when the repository holds no usable backup.  A full
+    recover is done when ``--full`` was given, when recovering to stdout
+    (``options.output`` is None) or when the output file does not exist
+    yet; otherwise an incremental recover is attempted.  When recovering
+    to a file, the index file of the last backup is copied alongside it if
+    one exists.
+    """
+    # Find the first full backup at or before the specified date
+    repofiles = find_files(options)
+    if not repofiles:
+        if options.date:
+            raise NoFiles(f'No files in repository before {options.date}')
+        else:
+            raise NoFiles('No files in repository')
+
+    # ``options.output is None`` means stdout: there is no existing file to
+    # append to, and os.path.exists(None) would raise TypeError.
+    if (options.full or options.output is None
+            or not os.path.exists(options.output)):
+        do_full_recover(options, repofiles)
+    else:
+        do_incremental_recover(options, repofiles)
+
+    if options.output is not None:
+        last_base = os.path.splitext(repofiles[-1])[0]
+        source_index = '%s.index' % last_base
+        target_index = '%s.index' % options.output
+        if os.path.exists(source_index):
+            log('Restoring index file %s to %s',
+                source_index, target_index)
+            shutil.copyfile(source_index, target_index)
+        else:
+            log('No index file to restore: %s', source_index)
+
+
 def do_verify(options):
     # Verify the sizes and checksums of all files mentioned in the .dat file
     repofiles = find_files(options)