Add backup config reuse_stat_unchanged_file for those speed lovers
Fallen-Breath committed Dec 1, 2024
1 parent 67fefea commit 9d7d139
Showing 5 changed files with 132 additions and 21 deletions.
23 changes: 23 additions & 0 deletions docs/config.md
@@ -216,6 +216,7 @@ Configs on how the backup is made
"**/session.lock"
],
"follow_target_symlink": false,
"reuse_stat_unchanged_file": false,
"hash_method": "xxh128",
"compress_method": "zstd",
"compress_threshold": 64
@@ -305,6 +306,28 @@ Prime Backup will save not only the `world` symbolic link, but also the `foo` sy
- Type: `bool`
- Default: `false`

#### reuse_stat_unchanged_file

When enabled, during backup creation, Prime Backup will try to directly reuse file information from the previous backup
for files whose stat (size, mtime, mode, etc.) has not changed. No file hash check will be performed on these stat-unchanged files.

If you want the maximum possible backup creation speed, you can try enabling this option.
However, it also introduces a potential risk of incomplete backups.

!!! warning

    Please only enable this option after ensuring that the server's operating system and file system are functioning properly and stably.
    Otherwise, if issues such as system time rollback or abnormal file system metadata occur, some files might
    have their content changed while keeping their stat unchanged, and Prime Backup will then create an incomplete backup.

!!! tip

    Unless you really need this backup speed boost, or the system's disk read performance is too low, enabling this option is not recommended.
    Prime Backup is already fast enough.

- Type: `bool`
- Default: `false`
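
To make the mechanism concrete, below is a minimal illustrative sketch of the stat check, assuming a hypothetical `previous_stats` snapshot (this is not Prime Backup's actual code; the compared fields mirror the ones it uses: size, mode, uid, gid, and mtime in nanoseconds):

```python
import os
from typing import Dict, Tuple

StatKey = Tuple[int, int, int, int, int]  # (size, mode, uid, gid, mtime_ns)

# Hypothetical stat snapshot taken from the previous backup
previous_stats: Dict[str, StatKey] = {
    'world/level.dat': (1024, 0o100644, 1000, 1000, 1733011200_000000000),
}

def stat_key(path: str) -> StatKey:
    st = os.lstat(path)
    return (st.st_size, st.st_mode, st.st_uid, st.st_gid, st.st_mtime_ns)

def can_reuse(path: str) -> bool:
    # A hit means the stat is unchanged: the previous hash and blob info get
    # copied over, and the file content is never read or hashed again.
    return previous_stats.get(path) == stat_key(path)
```

Since the check trusts metadata alone, anything that changes a file's content while preserving these fields (for example a clock rollback followed by an in-place write) will slip past it, which is what the warning above is about.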

#### hash_method

The algorithm to hash the files. Available options: `"xxh128"`, `"sha256"`, `"blake3"`
23 changes: 23 additions & 0 deletions docs/config.zh.md
@@ -216,6 +216,7 @@ The sequence of operations when Prime Backup creates a backup is as follows:
"**/session.lock"
],
"follow_target_symlink": false,
"reuse_stat_unchanged_file": false,
"hash_method": "xxh128",
"compress_method": "zstd",
"compress_threshold": 64
@@ -305,6 +306,28 @@ Besides saving the `world` symbolic link, Prime Backup will also save the `foo`
- Type: `bool`
- Default: `false`

#### reuse_stat_unchanged_file

When enabled, during backup creation, Prime Backup will try to directly reuse file information from the previous backup
for files whose stat (e.g. size, mtime, permissions) is unchanged. No file hash check will be performed on these stat-unchanged files.

If you want the fastest possible backup creation speed, you can try enabling this option, but it introduces a potential risk of incomplete backups.
Unless you really need this backup speed boost, or the system's disk read performance is too low, enabling this option is not recommended.

!!! warning

    Please enable this option only after ensuring that the server's operating system and file system are running properly and stably.
    Otherwise, if issues such as system time rollback or abnormal file system metadata occur, some files might
    have their content changed while their stat stays unchanged, causing Prime Backup to create an incomplete backup.

!!! tip

    Unless you really need this backup speed boost, or the system's disk read performance is too low, enabling this option is not recommended.
    Prime Backup is already fast enough.

- Type: `bool`
- Default: `false`

#### hash_method

The algorithm used for hashing files. Available options: `"xxh128"`, `"sha256"`, `"blake3"`
101 changes: 80 additions & 21 deletions prime_backup/action/create_backup_action.py
@@ -198,6 +198,7 @@ class _ScanResult:
class _PreCalculationResult:
stats: Dict[Path, os.stat_result] = dataclasses.field(default_factory=dict)
hashes: Dict[Path, str] = dataclasses.field(default_factory=dict)
reused_files: Dict[Path, schema.File] = dataclasses.field(default_factory=dict)


class CreateBackupAction(CreateBackupActionBase):
@@ -221,6 +222,9 @@ def __init__(self, creator: Operator, comment: str, *, tags: Optional[BackupTags

self.__source_path: Path = source_path or self.config.source_path

def __file_path_to_db_path(self, path: Path) -> str:
return path.relative_to(self.__source_path).as_posix()

def __scan_files(self) -> _ScanResult:
ignore_patterns = pathspec.GitIgnoreSpec.from_lines(self.config.backup.ignore_patterns)
result = _ScanResult()
@@ -284,32 +288,70 @@ def __pre_calculate_stats(self, scan_result: _ScanResult):
for file_entry in scan_result.all_files:
stats[file_entry.path] = file_entry.stat

def __reuse_unchanged_files(self, session: DbSession, scan_result: _ScanResult):
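		# Scan the latest backup and index its regular files by (path, size, mode, uid, gid, mtime_ns).
		# Scanned files that hit this index are recorded in reused_files, so __create_file can copy
		# the previous records and __pre_calculate_hash can skip hashing their content.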
backup = session.get_last_backup()
if backup is None:
return

@dataclasses.dataclass(frozen=True)
class StatKey:
path: str
size: Optional[int] # it shouldn't be None, but just in case
mode: int
uid: int
gid: int
mtime: int

stat_to_files: Dict[StatKey, schema.File] = {}
for file in session.get_backup_files(backup.id):
if stat.S_ISREG(file.mode):
key = StatKey(
path=file.path,
size=file.blob_raw_size,
mode=file.mode,
uid=file.uid,
gid=file.gid,
mtime=file.mtime_ns,
)
stat_to_files[key] = file

for file_entry in scan_result.all_files:
if file_entry.is_file():
key = StatKey(
path=self.__file_path_to_db_path(file_entry.path),
size=file_entry.stat.st_size,
mode=file_entry.stat.st_mode,
uid=file_entry.stat.st_uid,
gid=file_entry.stat.st_gid,
mtime=file_entry.stat.st_mtime_ns
)
if (file := stat_to_files.get(key)) is not None:
self.__pre_calc_result.reused_files[file_entry.path] = file

def __pre_calculate_hash(self, session: DbSession, scan_result: _ScanResult):
hashes = self.__pre_calc_result.hashes
hashes.clear()

-		sizes: Set[int] = set()
-		for file_entry in scan_result.all_files:
-			if file_entry.is_file():
-				sizes.add(file_entry.stat.st_size)
+		file_entries_to_hash: List[_ScanResultEntry] = [
+			file_entry
+			for file_entry in scan_result.all_files
+			if file_entry.is_file() and file_entry.path not in self.__pre_calc_result.reused_files
+		]

-		hash_dict_lock = threading.Lock()
-		existence = session.has_blob_with_size_batched(list(sizes))
-		self.__blob_by_size_cache.update(existence)
+		all_sizes: Set[int] = {file_entry.stat.st_size for file_entry in file_entries_to_hash}
+		existed_sizes = session.has_blob_with_size_batched(list(all_sizes))
+		self.__blob_by_size_cache.update(existed_sizes)

def hash_worker(pth: Path):
-			h = hash_utils.calc_file_hash(pth)
-			with hash_dict_lock:
-				hashes[pth] = h
+			hashes[pth] = hash_utils.calc_file_hash(pth)

with FailFastBlockingThreadPool(name='hasher') as pool:
-			for file_entry in scan_result.all_files:
-				if file_entry.is_file():
-					if existence[file_entry.stat.st_size]:
-						# we need to hash the file, sooner or later
-						pool.submit(hash_worker, file_entry.path)
-					else:
-						pass  # will use hash_once policy
+			for file_entry in file_entries_to_hash:
+				if existed_sizes[file_entry.stat.st_size]:
+					# we need to hash the file, sooner or later
+					pool.submit(hash_worker, file_entry.path)
+				else:
+					pass  # will use hash_once policy

@functools.cached_property
def __temp_path(self) -> Path:
@@ -510,7 +552,21 @@ def bp_rba(h: str) -> Path:
raise VolatileBlobFile('blob file {} keeps changing'.format(src_path_str))

def __create_file(self, session: DbSession, path: Path) -> Generator[Any, Any, schema.File]:
-		related_path = path.relative_to(self.__source_path)
+		if (reused_file := self.__pre_calc_result.reused_files.get(path)) is not None:
+			# make a copy
+			return session.create_file(
+				path=reused_file.path,
+				role=FileRole.unknown.value,
+				mode=reused_file.mode,
+				content=reused_file.content,
+				blob_hash=reused_file.blob_hash,
+				blob_compress=reused_file.blob_compress,
+				blob_raw_size=reused_file.blob_raw_size,
+				blob_stored_size=reused_file.blob_stored_size,
+				uid=reused_file.uid,
+				gid=reused_file.gid,
+				mtime_ns=reused_file.mtime_ns,
+			)

if (st := self.__pre_calc_result.stats.pop(path, None)) is None:
st = path.lstat()
@@ -530,16 +586,16 @@ def __create_file(self, session: DbSession, path: Path) -> Generator[Any, Any, schema.File]:
elif stat.S_ISDIR(st.st_mode):
pass
elif stat.S_ISLNK(st.st_mode):
-			content = path.readlink().as_posix().encode('utf8')
+			content = os.readlink(path).encode('utf8')
else:
raise UnsupportedFileFormat(st.st_mode)

return session.create_file(
-			path=related_path.as_posix(),
-			content=content,
+			path=self.__file_path_to_db_path(path),
			role=FileRole.unknown.value,

			mode=st.st_mode,
+			content=content,
uid=st.st_uid,
gid=st.st_gid,
mtime_ns=st.st_mtime_ns,
@@ -573,6 +629,9 @@ def run(self) -> BackupInfo:
))

self.__pre_calculate_stats(scan_result)
if self.config.backup.reuse_stat_unchanged_file:
self.__reuse_unchanged_files(session, scan_result)
self.logger.info('Reused {} / {} unchanged files'.format(len(self.__pre_calc_result.reused_files), len(scan_result.all_files)))
if self.config.get_effective_concurrency() > 1:
self.__pre_calculate_hash(session, scan_result)
self.logger.info('Pre-calculate all file hash done')
1 change: 1 addition & 0 deletions prime_backup/config/backup_config.py
@@ -17,6 +17,7 @@ class BackupConfig(Serializable):
'**/session.lock',
]
follow_target_symlink: bool = False
reuse_stat_unchanged_file: bool = False
hash_method: HashMethod = HashMethod.xxh128
compress_method: CompressMethod = CompressMethod.zstd
compress_threshold: int = 64
5 changes: 5 additions & 0 deletions prime_backup/db/session.py
@@ -589,6 +589,11 @@ def get_backup_ids_by_blob_hashes(self, hashes: List[str]) -> List[int]:
fileset_ids = self.get_fileset_ids_by_blob_hashes(hashes)
return self.get_backup_ids_by_fileset_ids(fileset_ids)

def get_last_backup(self) -> Optional[schema.Backup]:
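		# Backup ids grow monotonically, so the row with the largest id is the most recently created backup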
s = select(schema.Backup).order_by(desc(schema.Backup.id)).limit(1)
backups = _list_it(self.session.execute(s).scalars().all())
return backups[0] if backups else None

def list_backup(self, backup_filter: Optional[BackupFilter] = None, limit: Optional[int] = None, offset: Optional[int] = None) -> List[schema.Backup]:
s = select(schema.Backup)
if backup_filter is not None:
