Add backup config reuse_stat_unchanged_file for those speed lovers
Fallen-Breath committed Dec 1, 2024
1 parent 67fefea commit 9d7d139
Showing 5 changed files with 132 additions and 21 deletions.
23 changes: 23 additions & 0 deletions docs/config.md
@@ -216,6 +216,7 @@ Configs on how the backup is made
"**/session.lock"
],
"follow_target_symlink": false,
"reuse_stat_unchanged_file": false,
"hash_method": "xxh128",
"compress_method": "zstd",
"compress_threshold": 64
@@ -305,6 +306,28 @@ Prime Backup will save not only the `world` symbolic link, but also the `foo` sy
- Type: `bool`
- Default: `false`

#### reuse_stat_unchanged_file

When enabled, during backup creation, Prime Backup will try to directly reuse file information from the previous backup
for files whose stat (size, mtime, mode, etc.) has not changed. No file hash check will be performed on these stat-unchanged files.

If you want the maximum possible backup creation speed, you can try enabling this option.
However, it also introduces a potential risk of incomplete backups.

!!! warning

    Please only enable this option after ensuring that the server's operating system and file system are functioning properly and stably.
    Otherwise, if issues such as system time rollback or abnormal file system metadata occur, some files might
    have their content changed while keeping their stat unchanged, and Prime Backup will then create an incomplete backup.

!!! tip

    Unless you really need this backup speed boost, or the system's disk read performance is too low, enabling this option is not recommended.
    Prime Backup is already fast enough.

- Type: `bool`
- Default: `false`
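
To make the mechanism concrete, below is a minimal illustrative sketch of the stat check, assuming a hypothetical `previous_stats` snapshot (this is not Prime Backup's actual code; the compared fields mirror the ones it uses: size, mode, uid, gid, and mtime in nanoseconds):

```python
import os
from typing import Dict, Tuple

StatKey = Tuple[int, int, int, int, int]  # (size, mode, uid, gid, mtime_ns)

# Hypothetical stat snapshot taken from the previous backup
previous_stats: Dict[str, StatKey] = {
    'world/level.dat': (1024, 0o100644, 1000, 1000, 1733011200_000000000),
}

def stat_key(path: str) -> StatKey:
    st = os.lstat(path)
    return (st.st_size, st.st_mode, st.st_uid, st.st_gid, st.st_mtime_ns)

def can_reuse(path: str) -> bool:
    # A hit means the stat is unchanged: the previous hash and blob info get
    # copied over, and the file content is never read or hashed again.
    return previous_stats.get(path) == stat_key(path)
```

Since the check trusts metadata alone, anything that changes a file's content while preserving these fields (for example a clock rollback followed by an in-place write) will slip past it, which is what the warning above is about.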

#### hash_method

The algorithm to hash the files. Available options: `"xxh128"`, `"sha256"`, `"blake3"`
23 changes: 23 additions & 0 deletions docs/config.zh.md
@@ -216,6 +216,7 @@ The sequence of operations when Prime Backup creates a backup is as follows:
"**/session.lock"
],
"follow_target_symlink": false,
"reuse_stat_unchanged_file": false,
"hash_method": "xxh128",
"compress_method": "zstd",
"compress_threshold": 64
@@ -305,6 +306,28 @@ Besides saving the `world` symbolic link, Prime Backup will also save the `foo`
- Type: `bool`
- Default: `false`

#### reuse_stat_unchanged_file

When enabled, during backup creation, Prime Backup will try to directly reuse file information from the previous backup
for files whose stat (e.g. size, mtime, permissions) is unchanged. No file hash check will be performed on these stat-unchanged files.

If you want the fastest possible backup creation speed, you can try enabling this option, but it introduces a potential risk of incomplete backups.
Unless you really need this backup speed boost, or the system's disk read performance is too low, enabling this option is not recommended.

!!! warning

    Please enable this option only after ensuring that the server's operating system and file system are running properly and stably.
    Otherwise, if issues such as system time rollback or abnormal file system metadata occur, some files might
    have their content changed while their stat stays unchanged, causing Prime Backup to create an incomplete backup.

!!! tip

    Unless you really need this backup speed boost, or the system's disk read performance is too low, enabling this option is not recommended.
    Prime Backup is already fast enough.

- Type: `bool`
- Default: `false`

#### hash_method

The algorithm used for hashing files. Available options: `"xxh128"`, `"sha256"`, `"blake3"`
101 changes: 80 additions & 21 deletions prime_backup/action/create_backup_action.py
@@ -198,6 +198,7 @@ class _ScanResult:
class _PreCalculationResult:
stats: Dict[Path, os.stat_result] = dataclasses.field(default_factory=dict)
hashes: Dict[Path, str] = dataclasses.field(default_factory=dict)
reused_files: Dict[Path, schema.File] = dataclasses.field(default_factory=dict)


class CreateBackupAction(CreateBackupActionBase):
@@ -221,6 +222,9 @@ def __init__(self, creator: Operator, comment: str, *, tags: Optional[BackupTags

self.__source_path: Path = source_path or self.config.source_path

def __file_path_to_db_path(self, path: Path) -> str:
return path.relative_to(self.__source_path).as_posix()

def __scan_files(self) -> _ScanResult:
ignore_patterns = pathspec.GitIgnoreSpec.from_lines(self.config.backup.ignore_patterns)
result = _ScanResult()
@@ -284,32 +288,70 @@ def __pre_calculate_stats(self, scan_result: _ScanResult):
for file_entry in scan_result.all_files:
stats[file_entry.path] = file_entry.stat

def __reuse_unchanged_files(self, session: DbSession, scan_result: _ScanResult):
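		# Scan the latest backup and index its regular files by (path, size, mode, uid, gid, mtime_ns).
		# Scanned files that hit this index are recorded in reused_files, so __create_file can copy
		# the previous records and __pre_calculate_hash can skip hashing their content.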
backup = session.get_last_backup()
if backup is None:
return

@dataclasses.dataclass(frozen=True)
class StatKey:
path: str
size: Optional[int] # it shouldn't be None, but just in case
mode: int
uid: int
gid: int
mtime: int

stat_to_files: Dict[StatKey, schema.File] = {}
for file in session.get_backup_files(backup.id):
if stat.S_ISREG(file.mode):
key = StatKey(
path=file.path,
size=file.blob_raw_size,
mode=file.mode,
uid=file.uid,
gid=file.gid,
mtime=file.mtime_ns,
)
stat_to_files[key] = file

for file_entry in scan_result.all_files:
if file_entry.is_file():
key = StatKey(
path=self.__file_path_to_db_path(file_entry.path),
size=file_entry.stat.st_size,
mode=file_entry.stat.st_mode,
uid=file_entry.stat.st_uid,
gid=file_entry.stat.st_gid,
mtime=file_entry.stat.st_mtime_ns
)
if (file := stat_to_files.get(key)) is not None:
self.__pre_calc_result.reused_files[file_entry.path] = file

def __pre_calculate_hash(self, session: DbSession, scan_result: _ScanResult):
hashes = self.__pre_calc_result.hashes
hashes.clear()

-		sizes: Set[int] = set()
-		for file_entry in scan_result.all_files:
-			if file_entry.is_file():
-				sizes.add(file_entry.stat.st_size)
+		file_entries_to_hash: List[_ScanResultEntry] = [
+			file_entry
+			for file_entry in scan_result.all_files
+			if file_entry.is_file() and file_entry.path not in self.__pre_calc_result.reused_files
+		]

-		hash_dict_lock = threading.Lock()
-		existence = session.has_blob_with_size_batched(list(sizes))
-		self.__blob_by_size_cache.update(existence)
+		all_sizes: Set[int] = {file_entry.stat.st_size for file_entry in file_entries_to_hash}
+		existed_sizes = session.has_blob_with_size_batched(list(all_sizes))
+		self.__blob_by_size_cache.update(existed_sizes)

def hash_worker(pth: Path):
-			h = hash_utils.calc_file_hash(pth)
-			with hash_dict_lock:
-				hashes[pth] = h
+			hashes[pth] = hash_utils.calc_file_hash(pth)

with FailFastBlockingThreadPool(name='hasher') as pool:
-			for file_entry in scan_result.all_files:
-				if file_entry.is_file():
-					if existence[file_entry.stat.st_size]:
-						# we need to hash the file, sooner or later
-						pool.submit(hash_worker, file_entry.path)
-					else:
-						pass  # will use hash_once policy
+			for file_entry in file_entries_to_hash:
+				if existed_sizes[file_entry.stat.st_size]:
+					# we need to hash the file, sooner or later
+					pool.submit(hash_worker, file_entry.path)
+				else:
+					pass  # will use hash_once policy

@functools.cached_property
def __temp_path(self) -> Path:
@@ -510,7 +552,21 @@ def bp_rba(h: str) -> Path:
raise VolatileBlobFile('blob file {} keeps changing'.format(src_path_str))

def __create_file(self, session: DbSession, path: Path) -> Generator[Any, Any, schema.File]:
-		related_path = path.relative_to(self.__source_path)
+		if (reused_file := self.__pre_calc_result.reused_files.get(path)) is not None:
+			# make a copy
+			return session.create_file(
+				path=reused_file.path,
+				role=FileRole.unknown.value,
+				mode=reused_file.mode,
+				content=reused_file.content,
+				blob_hash=reused_file.blob_hash,
+				blob_compress=reused_file.blob_compress,
+				blob_raw_size=reused_file.blob_raw_size,
+				blob_stored_size=reused_file.blob_stored_size,
+				uid=reused_file.uid,
+				gid=reused_file.gid,
+				mtime_ns=reused_file.mtime_ns,
+			)

if (st := self.__pre_calc_result.stats.pop(path, None)) is None:
st = path.lstat()
@@ -530,16 +586,16 @@ def __create_file(self, session: DbSession, path: Path) -> Generator[Any, Any, schema.File]:
elif stat.S_ISDIR(st.st_mode):
pass
elif stat.S_ISLNK(st.st_mode):
-			content = path.readlink().as_posix().encode('utf8')
+			content = os.readlink(path).encode('utf8')
else:
raise UnsupportedFileFormat(st.st_mode)

return session.create_file(
-			path=related_path.as_posix(),
-			content=content,
+			path=self.__file_path_to_db_path(path),
			role=FileRole.unknown.value,

			mode=st.st_mode,
+			content=content,
uid=st.st_uid,
gid=st.st_gid,
mtime_ns=st.st_mtime_ns,
@@ -573,6 +629,9 @@ def run(self) -> BackupInfo:
))

self.__pre_calculate_stats(scan_result)
if self.config.backup.reuse_stat_unchanged_file:
self.__reuse_unchanged_files(session, scan_result)
self.logger.info('Reused {} / {} unchanged files'.format(len(self.__pre_calc_result.reused_files), len(scan_result.all_files)))
if self.config.get_effective_concurrency() > 1:
self.__pre_calculate_hash(session, scan_result)
self.logger.info('Pre-calculate all file hash done')
1 change: 1 addition & 0 deletions prime_backup/config/backup_config.py
@@ -17,6 +17,7 @@ class BackupConfig(Serializable):
'**/session.lock',
]
follow_target_symlink: bool = False
reuse_stat_unchanged_file: bool = False
hash_method: HashMethod = HashMethod.xxh128
compress_method: CompressMethod = CompressMethod.zstd
compress_threshold: int = 64
5 changes: 5 additions & 0 deletions prime_backup/db/session.py
@@ -589,6 +589,11 @@ def get_backup_ids_by_blob_hashes(self, hashes: List[str]) -> List[int]:
fileset_ids = self.get_fileset_ids_by_blob_hashes(hashes)
return self.get_backup_ids_by_fileset_ids(fileset_ids)

def get_last_backup(self) -> Optional[schema.Backup]:
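		# Backup ids grow monotonically, so the row with the largest id is the most recently created backup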
s = select(schema.Backup).order_by(desc(schema.Backup.id)).limit(1)
backups = _list_it(self.session.execute(s).scalars().all())
return backups[0] if backups else None

def list_backup(self, backup_filter: Optional[BackupFilter] = None, limit: Optional[int] = None, offset: Optional[int] = None) -> List[schema.Backup]:
s = select(schema.Backup)
if backup_filter is not None:
