diff --git a/Cargo.lock b/Cargo.lock
index 034447f9..23343bc9 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1111,6 +1111,7 @@ dependencies = [
  "camino",
  "clap",
  "compact_str",
+ "dashmap 6.0.1",
  "directories",
  "either",
  "itertools 0.13.0",
@@ -1716,6 +1717,8 @@ dependencies = [
  "compact_str",
  "dashmap 6.0.1",
  "paketkoll_types",
+ "rayon",
+ "serde",
  "tracing",
 ]
@@ -1778,6 +1781,7 @@ dependencies = [
  "lasso",
  "nix",
  "serde",
+ "serde_bytes",
  "smallvec",
  "strum",
  "thiserror",
@@ -2502,6 +2506,15 @@ dependencies = [
  "serde",
 ]
 
+[[package]]
+name = "serde_bytes"
+version = "0.11.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "387cc504cb06bb40a96c8e04e951fe01854cf6bc921053c954e4a606d9675c6a"
+dependencies = [
+ "serde",
+]
+
 [[package]]
 name = "serde_derive"
 version = "1.0.204"
diff --git a/Cargo.toml b/Cargo.toml
index f7f78c0d..4f095146 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -57,6 +57,7 @@ rune-modules = "0.13.4"
 rust-ini = "0.21.0"
 scopeguard = "1.2.0"
 serde = "1.0.204"
+serde_bytes = "0.11.15"
 serde_json = "1.0.120"
 smallvec = { version = "1.13.2", features = [
     "const_generics",
diff --git a/crates/konfigkoll/Cargo.toml b/crates/konfigkoll/Cargo.toml
index 47fa659f..d80c9970 100644
--- a/crates/konfigkoll/Cargo.toml
+++ b/crates/konfigkoll/Cargo.toml
@@ -63,6 +63,7 @@ tokio = { workspace = true, features = [
 tracing-log.workspace = true
 tracing-subscriber = { workspace = true, features = ["env-filter", "parking_lot"] }
 tracing.workspace = true
+dashmap.workspace = true
 
 [target.'cfg(target_env = "musl")'.dependencies]
 # The allocator on musl is attrociously slow, so we use a custom one.
diff --git a/crates/konfigkoll/src/fs_scan.rs b/crates/konfigkoll/src/fs_scan.rs
index bc7c44aa..715cb87a 100644
--- a/crates/konfigkoll/src/fs_scan.rs
+++ b/crates/konfigkoll/src/fs_scan.rs
@@ -4,7 +4,10 @@ use std::sync::Arc;
 
 use anyhow::Context;
 use compact_str::CompactString;
+use dashmap::DashMap;
+use itertools::Itertools;
 use ouroboros::self_referencing;
+use rayon::prelude::*;
 
 use konfigkoll_types::FsInstruction;
 use paketkoll_core::config::{
@@ -13,7 +16,7 @@ use paketkoll_core::config::{
 use paketkoll_core::file_ops::{
     canonicalize_file_entries, create_path_map, mismatching_and_unexpected_files,
 };
-use paketkoll_types::backend::Files;
+use paketkoll_types::backend::{Files, PackageMap};
 use paketkoll_types::files::FileEntry;
 use paketkoll_types::files::PathMap;
 use paketkoll_types::intern::Interner;
@@ -30,17 +33,31 @@ pub(crate) struct ScanResult {
 pub(crate) fn scan_fs(
     interner: &Arc<Interner>,
     backend: &Arc<dyn Files>,
+    package_map: &PackageMap,
     ignores: &[CompactString],
     trust_mtime: bool,
 ) -> anyhow::Result<(ScanResult, Vec<FsInstruction>)> {
     tracing::debug!("Scanning filesystem");
     let mut fs_instructions_sys = vec![];
-    let mut files = backend.files(interner).with_context(|| {
-        format!(
-            "Failed to collect information from backend {}",
-            backend.name()
-        )
-    })?;
+    let mut files = if backend.prefer_files_from_archive() {
+        let all = package_map.keys().cloned().collect::<Vec<_>>();
+        let files = backend.files_from_archives(&all, package_map, interner)?;
+        let file_map = DashMap::new();
+        files
+            .into_par_iter()
+            .flat_map_iter(|(_pkg, files)| files)
+            .for_each(|entry| {
+                file_map.insert(entry.path.clone(), entry);
+            });
+        file_map.into_iter().map(|(_, v)| v).collect_vec()
+    } else {
+        backend.files(interner).with_context(|| {
+            format!(
+                "Failed to collect information from backend {}",
+                backend.name()
+            )
+        })?
+    };
     if backend.may_need_canonicalization() {
         tracing::debug!("Canonicalizing file entries");
         canonicalize_file_entries(&mut files);
diff --git a/crates/konfigkoll/src/main.rs b/crates/konfigkoll/src/main.rs
index 97d0238e..978514d3 100644
--- a/crates/konfigkoll/src/main.rs
+++ b/crates/konfigkoll/src/main.rs
@@ -21,6 +21,7 @@ use konfigkoll_core::state::DiffGoal;
 use konfigkoll_script::Phase;
 #[cfg(target_env = "musl")]
 use mimalloc::MiMalloc;
+use paketkoll_cache::FromArchiveCache;
 use paketkoll_cache::OriginalFilesCache;
 use paketkoll_core::backend::ConcreteBackend;
 use paketkoll_core::paketkoll_types::intern::Interner;
@@ -118,8 +119,19 @@
         let backend = b
             .create_files(&backend_cfg, &interner)
             .with_context(|| format!("Failed to create backend {b}"))?;
+        let backend = if backend.prefer_files_from_archive() {
+            tracing::info!("Using archive cache for backend {}", backend.name());
+            // This is slow so we need to cache it
+            Box::new(
+                FromArchiveCache::from_path(backend, proj_dirs.cache_dir())
+                    .context("Failed to create archive disk cache")?,
+            )
+        } else {
+            // This is fast so we don't need to cache it
+            backend
+        };
         let backend = OriginalFilesCache::from_path(backend, proj_dirs.cache_dir())
-            .context("Failed to create disk cache")?;
+            .context("Failed to create original files disk cache")?;
         Arc::new(backend)
     };
 
@@ -133,6 +145,10 @@
     // Script: Get FS ignores
     script_engine.run_phase(Phase::Ignores).await?;
 
+    tracing::info!("Waiting for package loading results...");
+    let (pkgs_sys, package_maps) = package_loader.await??;
+    tracing::info!("Got package loading results");
+
     // Do FS scan
     tracing::info!("Starting filesystem scan background job");
     let fs_instructions_sys = {
@@ -146,18 +162,24 @@
         let trust_mtime = cli.trust_mtime;
         let interner = interner.clone();
         let backends_files = backend_files.clone();
+        let package_map = package_maps
+            .get(&backend_files.as_backend_enum())
+            .expect("No matching package backend?")
+            .clone();
         tokio::task::spawn_blocking(move || {
-            fs_scan::scan_fs(&interner, &backends_files, &ignores, trust_mtime)
+            fs_scan::scan_fs(
+                &interner,
+                &backends_files,
+                &package_map,
+                &ignores,
+                trust_mtime,
+            )
         })
     };
 
     // Script: Do early package phase
     script_engine.run_phase(Phase::ScriptDependencies).await?;
 
-    tracing::info!("Waiting for package loading results...");
-    let (pkgs_sys, package_maps) = package_loader.await??;
-    tracing::info!("Got package loading results");
-
     // Create the set of package managers for use by the script
     script_engine.state_mut().setup_package_managers(
         &backends_pkg,
diff --git a/crates/konfigkoll/src/pkgs.rs b/crates/konfigkoll/src/pkgs.rs
index fd348ae6..5d21e327 100644
--- a/crates/konfigkoll/src/pkgs.rs
+++ b/crates/konfigkoll/src/pkgs.rs
@@ -10,6 +10,7 @@ use konfigkoll_types::PkgInstructions;
 use paketkoll_types::{
     backend::{Backend, PackageBackendMap, PackageMap, PackageMapMap},
     intern::Interner,
+    package::PackageInstallStatus,
 };
 
 #[tracing::instrument(skip_all)]
@@ -31,9 +32,11 @@ pub(crate) fn load_packages(
                     backend.name()
                 )
             })
-            .map(|backend_pkgs| {
+            .map(|mut backend_pkgs| {
+                // Because we can have partially installed packages on Debian...
+                backend_pkgs.retain(|pkg| pkg.status == PackageInstallStatus::Installed);
                 let pkg_map = Arc::new(paketkoll_types::backend::packages_to_package_map(
-                    backend_pkgs.clone(),
+                    backend_pkgs.iter(),
                 ));
                 let pkg_instructions =
                     konfigkoll_core::conversion::convert_packages_to_pkg_instructions(
diff --git a/crates/paketkoll/src/cli.rs b/crates/paketkoll/src/cli.rs
index ffaacc17..c903e35a 100644
--- a/crates/paketkoll/src/cli.rs
+++ b/crates/paketkoll/src/cli.rs
@@ -59,6 +59,11 @@ pub enum Commands {
         /// Path to query
         path: String,
     },
+    #[clap(hide = true)]
+    DebugPackageFileData {
+        /// Package to query
+        package: String,
+    },
 }
 
 /// Output format to use
diff --git a/crates/paketkoll/src/conversion.rs b/crates/paketkoll/src/conversion.rs
index 4f47cef0..ac600b4f 100644
--- a/crates/paketkoll/src/conversion.rs
+++ b/crates/paketkoll/src/conversion.rs
@@ -60,6 +60,7 @@ impl TryFrom<&Cli> for paketkoll_core::backend::BackendConfiguration {
             Commands::InstalledPackages => {}
             Commands::OriginalFile { .. } => {}
             Commands::Owns { .. } => {}
+            Commands::DebugPackageFileData { .. } => {}
         }
         Ok(builder.build()?)
     }
diff --git a/crates/paketkoll/src/main.rs b/crates/paketkoll/src/main.rs
index 1d44a71f..89ebba91 100644
--- a/crates/paketkoll/src/main.rs
+++ b/crates/paketkoll/src/main.rs
@@ -118,6 +118,31 @@ fn main() -> anyhow::Result<Exit> {
             }
             Ok(Exit::new(Code::SUCCESS))
         }
+        Commands::DebugPackageFileData { ref package } => {
+            let interner = Interner::new();
+            let backend: paketkoll_core::backend::ConcreteBackend = cli.backend.try_into()?;
+            let backend_impl = backend
+                .create_full(&(&cli).try_into()?, &interner)
+                .context("Failed to create backend")?;
+
+            let package_map = backend_impl
+                .package_map_complete(&interner)
+                .with_context(|| format!("Failed to collect information from backend {backend}"))?;
+
+            let pkg_ref = PackageRef::get_or_intern(&interner, package);
+
+            let files = backend_impl
+                .files_from_archives(&[pkg_ref], &package_map, &interner)
+                .with_context(|| {
+                    format!(
+                        "Failed to collect file information for package {package} from backend {backend}"
+                    )
+                })?;
+
+            println!("{:?}", files);
+
+            Ok(Exit::new(Code::SUCCESS))
+        }
     }
 }
diff --git a/crates/paketkoll_cache/Cargo.toml b/crates/paketkoll_cache/Cargo.toml
index d1f1dd96..cc2f0006 100644
--- a/crates/paketkoll_cache/Cargo.toml
+++ b/crates/paketkoll_cache/Cargo.toml
@@ -18,7 +18,11 @@ cached = { workspace = true, features = [
 ], default-features = false }
 compact_str.workspace = true
 dashmap.workspace = true
-paketkoll_types = { version = "0.1.0", path = "../paketkoll_types" }
+paketkoll_types = { version = "0.1.0", path = "../paketkoll_types", features = [
+    "serde",
+] }
+rayon.workspace = true
+serde = { workspace = true, features = ["derive"] }
 tracing.workspace = true
 
 [lints]
diff --git a/crates/paketkoll_cache/src/from_archives.rs b/crates/paketkoll_cache/src/from_archives.rs
new file mode 100644
index 00000000..9143a703
--- /dev/null
+++ b/crates/paketkoll_cache/src/from_archives.rs
@@ -0,0 +1,199 @@
+//! Wrapping backend that caches queries for file data from package archives on disk
+
+use std::fmt::Debug;
+use std::fmt::Display;
+use std::path::Path;
+use std::path::PathBuf;
+
+use ahash::AHashMap;
+use anyhow::Context;
+use cached::stores::DiskCacheBuilder;
+use cached::DiskCache;
+use cached::IOCached;
+use compact_str::CompactString;
+
+use paketkoll_types::backend::PackageManagerError;
+use paketkoll_types::files::FileEntry;
+use paketkoll_types::files::FileFlags;
+use paketkoll_types::files::Properties;
+use paketkoll_types::{
+    backend::{Backend, Files, Name, OriginalFileQuery, PackageMap},
+    intern::{Interner, PackageRef},
+};
+
+use crate::utils::format_package;
+
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+struct CacheKey {
+    backend: &'static str,
+    package: CompactString,
+}
+
+impl CacheKey {
+    pub fn new(backend: &'static str, package: CompactString) -> Self {
+        Self { backend, package }
+    }
+}
+
+impl Display for CacheKey {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}:{}", self.backend, self.package)
+    }
+}
+
+/// A cacheable subset of a file entry (the package and source are restored on load)
+#[derive(Debug, serde::Serialize, serde::Deserialize)]
+pub struct FileEntryCache {
+    /// Path of this file
+    pub path: PathBuf,
+    pub properties: Properties,
+    pub flags: FileFlags,
+}
+
+impl FileEntryCache {
+    pub fn into_full_entry(self, package: PackageRef, source: &'static str) -> FileEntry {
+        FileEntry {
+            package: Some(package),
+            path: self.path,
+            properties: self.properties,
+            flags: self.flags,
+            source,
+            seen: Default::default(),
+        }
+    }
+}
+
+impl From<&FileEntry> for FileEntryCache {
+    fn from(entry: &FileEntry) -> Self {
+        Self {
+            path: entry.path.clone(),
+            properties: entry.properties.clone(),
+            flags: entry.flags,
+        }
+    }
+}
+
+pub struct FromArchiveCache {
+    inner: Box<dyn Files>,
+    cache: DiskCache<CacheKey, Vec<FileEntryCache>>,
+}
+
+impl FromArchiveCache {
+    pub fn from_path(inner: Box<dyn Files>, path: &Path) -> anyhow::Result<Self> {
+        let cache = DiskCacheBuilder::new("from_archives")
+            .set_refresh(true)
+            .set_lifespan(60 * 60 * 24 * 15) // Half a month
+            .set_disk_directory(path)
+            .build()?;
+        Ok(Self { inner, cache })
+    }
+}
+
+impl Debug for FromArchiveCache {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("FromArchiveCache")
+            .field("inner", &self.inner)
+            .field("cache", &"DiskCache<CacheKey, Vec<FileEntryCache>>")
+            .finish()
+    }
+}
+
+impl Name for FromArchiveCache {
+    fn name(&self) -> &'static str {
+        self.inner.name()
+    }
+
+    fn as_backend_enum(&self) -> Backend {
+        self.inner.as_backend_enum()
+    }
+}
+
+impl Files for FromArchiveCache {
+    fn files(&self, interner: &Interner) -> anyhow::Result<Vec<FileEntry>> {
+        self.inner.files(interner)
+    }
+
+    fn may_need_canonicalization(&self) -> bool {
+        self.inner.may_need_canonicalization()
+    }
+
+    fn owning_packages(
+        &self,
+        paths: &ahash::AHashSet<&Path>,
+        interner: &Interner,
+    ) -> anyhow::Result<dashmap::DashMap<PathBuf, Option<PackageRef>, ahash::RandomState>>
+    {
+        self.inner.owning_packages(paths, interner)
+    }
+
+    fn original_files(
+        &self,
+        queries: &[OriginalFileQuery],
+        packages: &PackageMap,
+        interner: &Interner,
+    ) -> anyhow::Result<AHashMap<OriginalFileQuery, Vec<u8>>> {
+        self.inner.original_files(queries, packages, interner)
+    }
+
+    fn files_from_archives(
+        &self,
+        filter: &[PackageRef],
+        package_map: &PackageMap,
+        interner: &Interner,
+    ) -> Result<Vec<(PackageRef, Vec<FileEntry>)>, PackageManagerError> {
+        let inner_name = self.name();
+        let mut results = Vec::new();
+        let mut uncached_queries = Vec::new();
+        let mut cache_keys = AHashMap::new();
+
+        for pkg_ref in filter {
+            let pkg = package_map.get(pkg_ref).context("Package not found")?;
found")?; + let cache_key = format_package(pkg, interner); + let cache_key = CacheKey::new(inner_name, cache_key); + match self + .cache + .cache_get(&cache_key) + .context("Cache query failed")? + { + Some(v) => { + results.push(( + *pkg_ref, + v.into_iter() + .map(|e| e.into_full_entry(*pkg_ref, inner_name)) + .collect(), + )); + } + None => { + uncached_queries.push(*pkg_ref); + cache_keys.insert(pkg_ref, cache_key); + } + } + } + // Fetch uncached queries + if !uncached_queries.is_empty() { + let uncached_results = + self.inner + .files_from_archives(&uncached_queries, package_map, interner)?; + // Insert the uncached results into the cache and update the results + for (query, result) in uncached_results.into_iter() { + let cache_key = cache_keys.remove(&query).context("Cache key not found")?; + self.cache + .cache_set(cache_key.clone(), result.iter().map(Into::into).collect()) + .with_context(|| { + format!( + "Cache set failed: pkg={} cache_key={}", + query.to_str(interner), + cache_key + ) + })?; + results.push((query, result)); + } + } + + Ok(results) + } + + fn prefer_files_from_archive(&self) -> bool { + self.inner.prefer_files_from_archive() + } +} diff --git a/crates/paketkoll_cache/src/lib.rs b/crates/paketkoll_cache/src/lib.rs index b5d50be8..98356e75 100644 --- a/crates/paketkoll_cache/src/lib.rs +++ b/crates/paketkoll_cache/src/lib.rs @@ -1,5 +1,8 @@ //! Wrapping backend that performs disk cache +pub use from_archives::FromArchiveCache; pub use original_files::OriginalFilesCache; +mod from_archives; mod original_files; +mod utils; diff --git a/crates/paketkoll_cache/src/original_files.rs b/crates/paketkoll_cache/src/original_files.rs index 222575d3..cd2d5374 100644 --- a/crates/paketkoll_cache/src/original_files.rs +++ b/crates/paketkoll_cache/src/original_files.rs @@ -9,14 +9,17 @@ use anyhow::Context; use cached::stores::DiskCacheBuilder; use cached::DiskCache; use cached::IOCached; -use compact_str::format_compact; use compact_str::CompactString; +use paketkoll_types::backend::PackageManagerError; +use paketkoll_types::files::FileEntry; use paketkoll_types::{ backend::{Backend, Files, Name, OriginalFileQuery, PackageMap}, intern::{Interner, PackageRef}, }; +use crate::utils::format_package; + #[derive(Debug, Clone, PartialEq, Eq, Hash)] struct CacheKey { backend: &'static str, @@ -47,7 +50,7 @@ pub struct OriginalFilesCache { impl OriginalFilesCache { pub fn from_path(inner: Box, path: &Path) -> anyhow::Result { - let cache = DiskCacheBuilder::new(inner.name()) + let cache = DiskCacheBuilder::new("original_files") .set_refresh(true) .set_lifespan(60 * 60 * 24 * 30) // A month .set_disk_directory(path) @@ -76,7 +79,7 @@ impl Name for OriginalFilesCache { } impl Files for OriginalFilesCache { - fn files(&self, interner: &Interner) -> anyhow::Result> { + fn files(&self, interner: &Interner) -> anyhow::Result> { self.inner.files(interner) } @@ -108,19 +111,7 @@ impl Files for OriginalFilesCache { // Resolve exact version and ID of packages from the package map let cache_key = match packages.get(&PackageRef::get_or_intern(interner, &query.package)) { - Some(p) => { - let ids = p.ids.iter().map(|v| v.to_str(interner)); - let ids = ids.collect::>().join("#"); - format_compact!( - "{}:{}:{}:{}", - query.package, - p.architecture - .map(|v| v.to_str(interner)) - .unwrap_or_default(), - p.version, - ids - ) - } + Some(p) => format_package(p, interner), None => { tracing::warn!("Package not found: {}", query.package); uncached_queries.push(query.clone()); @@ -145,11 +136,25 @@ impl 
         // Insert the uncached results into the cache and update the results
         for (query, result) in uncached_results.into_iter() {
-            let cache_key = cache_keys.get(&query).context("Cache key not found")?;
-            self.cache.cache_set(cache_key.clone(), result.clone())?;
+            let cache_key = cache_keys.remove(&query).context("Cache key not found")?;
+            self.cache.cache_set(cache_key, result.clone())?;
             results.insert(query, result);
         }
 
         Ok(results)
     }
+
+    fn files_from_archives(
+        &self,
+        filter: &[PackageRef],
+        package_map: &PackageMap,
+        interner: &Interner,
+    ) -> Result<Vec<(PackageRef, Vec<FileEntry>)>, PackageManagerError> {
+        self.inner
+            .files_from_archives(filter, package_map, interner)
+    }
+
+    fn prefer_files_from_archive(&self) -> bool {
+        self.inner.prefer_files_from_archive()
+    }
 }
diff --git a/crates/paketkoll_cache/src/utils.rs b/crates/paketkoll_cache/src/utils.rs
new file mode 100644
index 00000000..f45794e1
--- /dev/null
+++ b/crates/paketkoll_cache/src/utils.rs
@@ -0,0 +1,21 @@
+//! Utility functions
+
+use compact_str::{format_compact, CompactString};
+use paketkoll_types::{intern::Interner, package::PackageInterned};
+
+/// Format a package for use in cache keys
+pub(crate) fn format_package(pkg: &PackageInterned, interner: &Interner) -> CompactString {
+    format_compact!(
+        "{}:{}:{}:{}",
+        pkg.name.to_str(interner),
+        pkg.architecture
+            .map(|v| v.to_str(interner))
+            .unwrap_or_default(),
+        pkg.version,
+        pkg.ids
+            .iter()
+            .map(|v| v.to_str(interner))
+            .collect::<Vec<_>>()
+            .join("#")
+    )
+}
diff --git a/crates/paketkoll_core/src/backend/arch.rs b/crates/paketkoll_core/src/backend/arch.rs
index 3f128fbd..8ba86550 100644
--- a/crates/paketkoll_core/src/backend/arch.rs
+++ b/crates/paketkoll_core/src/backend/arch.rs
@@ -1,6 +1,7 @@
 //! The Arch Linux (and derivatives) backend
 
 use std::{
+    borrow::Cow,
     collections::BTreeSet,
     io::BufReader,
     iter::once,
@@ -9,6 +10,7 @@ use std::{
 
 use ahash::AHashSet;
 use anyhow::Context;
+use bstr::{ByteSlice, ByteVec};
 use compact_str::format_compact;
 use dashmap::{DashMap, DashSet};
 use either::Either;
@@ -22,7 +24,8 @@ use paketkoll_types::{files::FileEntry, intern::PackageRef};
 use paketkoll_types::{intern::Interner, package::PackageInterned};
 
 use crate::utils::{
-    extract_files, group_queries_by_pkg, locate_package_file, package_manager_transaction,
+    convert_archive_entries, extract_files, group_queries_by_pkg, locate_package_file,
+    package_manager_transaction,
 };
 
 use super::{FullBackend, PackageFilter};
@@ -195,27 +198,113 @@ impl Files for ArchLinux {
             // Now, lets extract the requested files from the package
             extract_files(archive, &queries, &mut results, pkg, |path| {
-                format_compact!("/{path}")
+                Some(format_compact!("/{path}"))
             })?;
         }
 
         Ok(results)
     }
+
+    fn files_from_archives(
+        &self,
+        filter: &[PackageRef],
+        package_map: &PackageMap,
+        interner: &Interner,
+    ) -> Result<Vec<(PackageRef, Vec<FileEntry>)>, PackageManagerError> {
+        log::info!(
+            "Finding archives for {} packages (may take a while)",
+            filter.len()
+        );
+        let archives =
+            iterate_pkg_archives(filter, package_map, interner, &self.pacman_config.cache_dir);
+
+        log::info!(
+            "Loading files from {} archives (may take a while)",
+            filter.len()
+        );
+        let results: anyhow::Result<Vec<_>> = archives
+            .par_bridge()
+            .map(|value| {
+                value.and_then(|(pkg_ref, path)| Ok((pkg_ref, archive_to_entries(pkg_ref, &path)?)))
+            })
+            .collect();
+
+        let results = results?;
+        Ok(results)
+    }
+}
+
+/// Find all pkg archives for the given packages
+fn iterate_pkg_archives<'inputs>(
+    filter: &'inputs [PackageRef],
+    packages: &'inputs PackageMap,
+    interner: &'inputs Interner,
+    cache_dir: &'inputs str,
+) -> impl Iterator<Item = anyhow::Result<(PackageRef, PathBuf)>> + 'inputs {
+    let package_paths = filter.iter().map(|pkg_ref| {
+        let pkg = packages
+            .get(pkg_ref)
+            .context("Failed to find package in package map")?;
+        let name = pkg.name.to_str(interner);
+        // Get the full file name
+        let filename = format_pkg_filename(interner, pkg);
+
+        let package_path = locate_package_file(&[cache_dir], &filename, name, download_arch_pkg)?;
+        // Error if we couldn't find the package
+        let package_path = package_path
+            .ok_or_else(|| anyhow::anyhow!("Failed to find or download package file for {name}"))?;
+        Ok((*pkg_ref, package_path))
+    });
+
+    package_paths
+}
+
+/// Convert Arch Linux package archives to file entries
+fn archive_to_entries(pkg_ref: PackageRef, pkg_file: &Path) -> anyhow::Result<Vec<FileEntry>> {
+    // The package is a .tar.zst
+    let package_file = std::fs::File::open(pkg_file)?;
+    let decompressed = zstd::Decoder::new(package_file)?;
+    let archive = tar::Archive::new(decompressed);
+
+    // Now, let's convert the archive entries into file entries
+    convert_archive_entries(archive, pkg_ref, NAME, |path| {
+        let path = path.as_os_str().as_encoded_bytes();
+        if SPECIAL_ARCHIVE_FILES.contains(path) {
+            None
+        } else {
+            let path = path.trim_end_with(|ch| ch == '/');
+            let path = bstr::concat([b"/", path]);
+            Some(Cow::Owned(path.into_path_buf().expect("Invalid path")))
+        }
+    })
+}
+
+/// Files to ignore when reading archives
+const SPECIAL_ARCHIVE_FILES: phf::Set<&'static [u8]> = phf::phf_set! {
+    b".BUILDINFO",
+    b".CHANGELOG",
+    b".PKGINFO",
+    b".INSTALL",
+    b".MTREE",
+};
+
+fn format_pkg_filename(interner: &Interner, package: &PackageInterned) -> String {
+    format!(
+        "{}-{}-{}.pkg.tar.zst",
+        package.name.to_str(interner),
+        package.version,
+        package
+            .architecture
+            .map(|e| e.to_str(interner))
+            .unwrap_or("*")
+    )
+}
 
 fn guess_pkg_file_name(interner: &Interner, pkg: &str, packages: &PackageMap) -> String {
     let package_match = if let Some(pkgref) = interner.get(pkg) {
         // Yay, it is probably installed, we know what to look for
         if let Some(package) = packages.get(&PackageRef::new(pkgref)) {
-            format!(
-                "{}-{}-{}.pkg.tar.zst",
-                pkg,
-                package.version,
-                package
-                    .architecture
-                    .map(|e| e.to_str(interner))
-                    .unwrap_or("*")
-            )
+            format_pkg_filename(interner, package)
         } else {
             format!("{}-*-*.pkg.tar.zst", pkg)
         }
diff --git a/crates/paketkoll_core/src/backend/deb.rs b/crates/paketkoll_core/src/backend/deb.rs
index fc783c1b..c13cdd10 100644
--- a/crates/paketkoll_core/src/backend/deb.rs
+++ b/crates/paketkoll_core/src/backend/deb.rs
@@ -1,4 +1,5 @@
 //! Backend for Debian and derivatives
+use std::borrow::Cow;
 use std::fs::{DirEntry, File};
 use std::io::BufReader;
 use std::path::{Path, PathBuf};
@@ -20,8 +21,8 @@ use paketkoll_types::package::PackageInterned;
 
 use crate::backend::PackageFilter;
 use crate::utils::{
-    extract_files, group_queries_by_pkg, locate_package_file, package_manager_transaction,
-    CompressionFormat,
+    convert_archive_entries, extract_files, group_queries_by_pkg, locate_package_file,
+    missing_packages, package_manager_transaction, CompressionFormat, PackageQuery,
 };
 
 use super::FullBackend;
@@ -159,7 +160,7 @@ impl Files for Debian {
         let re = RegexSet::new(paths)?;
 
         std::fs::read_dir(db_root)
-            .context("Failed to read pacman database directory")?
+            .context("Failed to read dpkg database directory")?
             .par_bridge()
             .for_each(|entry| {
                 if let Ok(entry) = entry {
@@ -218,7 +219,7 @@
                     let archive = tar::Archive::new(&mut decompressed);
                     // Now, lets extract the requested files from the package
                     extract_files(archive, &queries, &mut results, pkg, |path| {
-                        path.trim_start_matches('.').into()
+                        Some(path.trim_start_matches('.').into())
                     })?;
                     break;
                 }
@@ -227,6 +228,157 @@
 
         Ok(results)
     }
+
+    fn files_from_archives(
+        &self,
+        filter: &[PackageRef],
+        package_map: &PackageMap,
+        interner: &Interner,
+    ) -> Result<Vec<(PackageRef, Vec<FileEntry>)>, PackageManagerError> {
+        // Handle diversions: (parse output of dpkg-divert --list)
+        log::debug!("Loading diversions");
+        let diversions =
+            divert::get_diversions(interner).context("Failed to get dpkg diversions")?;
+
+        log::info!(
+            "Loading file data from dpkg cache archives for {} packages",
+            filter.len()
+        );
+        let archives = iterate_deb_archives(filter, package_map, interner)?;
+        log::info!("Got list of {} archives, starting to extract information (this may take a while, especially on the first run before the disk cache can help)", filter.len());
+        let results: anyhow::Result<Vec<_>> = archives
+            .par_bridge()
+            .map(|value| {
+                value.and_then(|(pkg_ref, path)| {
+                    Ok((pkg_ref, archive_to_entries(pkg_ref, &path, &diversions)?))
+                })
+            })
+            .collect();
+        log::info!("Extracted information from archives");
+
+        let results = results?;
+        Ok(results)
+    }
+
+    // Debian doesn't have enough info for konfigkoll in files(), use files_from_archives() instead
+    // (and add a cache layer on top, since that is slow)
+    fn prefer_files_from_archive(&self) -> bool {
+        true
+    }
+}
+
+/// Find all deb archives for the given packages
+fn iterate_deb_archives<'inputs>(
+    filter: &'inputs [PackageRef],
+    packages: &'inputs PackageMap,
+    interner: &'inputs Interner,
+) -> anyhow::Result<impl Iterator<Item = anyhow::Result<(PackageRef, PathBuf)>> + 'inputs> {
+    let intermediate: Vec<_> = filter
+        .iter()
+        .map(|pkg_ref| {
+            let pkg = packages
+                .get(pkg_ref)
+                .expect("Failed to find package in package map");
+            // For deb, ids[0] always exists and may contain the architecture if it is not the primary one
+            let name = pkg.ids[0].to_str(interner);
+            // Get the full deb file name
+            let deb_filename = format_deb_filename(interner, pkg);
+
+            (pkg_ref, name, deb_filename)
+        })
+        .collect();
+
+    // Attempt to download all missing packages:
+    let missing = missing_packages(
+        &[CACHE_PATH],
+        intermediate.iter().map(|(_, name, deb)| PackageQuery {
+            package_match: deb,
+            package: name,
+        }),
+    )?;
+
+    if !missing.is_empty() {
+        log::info!("Downloading missing packages (installed but not in local cache)");
+        download_debs(&missing)?;
+    }
+
+    let package_paths = intermediate
+        .into_iter()
+        .map(|(pkg_ref, name, deb_filename)| {
+            let package_path =
+                locate_package_file(&[CACHE_PATH], &deb_filename, name, download_deb)?;
+            // Error if we couldn't find the package
+            let package_path = package_path.ok_or_else(|| {
+                anyhow::anyhow!("Failed to find or download package file for {name}")
+            })?;
+            Ok((*pkg_ref, package_path))
+        });
+
+    Ok(package_paths)
+}
+
+/// Convert deb archives to file entries
+fn archive_to_entries(
+    pkg_ref: PackageRef,
+    deb_file: &Path,
+    diversions: &divert::Diversions,
+) -> anyhow::Result<Vec<FileEntry>> {
+    log::debug!("Processing {}", deb_file.display());
+    // The package is a .deb, which is actually an ar archive
+    let package_file = File::open(deb_file)?;
+    let mut archive = ar::Archive::new(package_file);
+
+    // We want the data.tar.xz file (or other compression scheme)
+    while let Some(entry) = archive.next_entry() {
+        let mut entry = entry?;
+        if entry.header().identifier().starts_with(b"data.tar") {
+            let extension: CompactString = std::str::from_utf8(entry.header().identifier())?
+                .split('.')
+                .last()
+                .ok_or_else(|| anyhow::anyhow!("No file extension found"))?
+                .into();
+            let mut decompressed = CompressionFormat::from_extension(&extension, &mut entry)?;
+            let archive = tar::Archive::new(&mut decompressed);
+            // Now, let's convert the archive entries into file entries
+            let mut entries = convert_archive_entries(archive, pkg_ref, NAME, |path| {
+                let p = path
+                    .as_os_str()
+                    .as_encoded_bytes()
+                    .trim_start_with(|ch| ch == '.');
+                let p = if p != b"/" {
+                    p.trim_end_with(|ch| ch == '/')
+                } else {
+                    p
+                };
+                Some(Cow::Borrowed(p.to_path().expect("Invalid path")))
+            })?;
+
+            for entry in entries.iter_mut() {
+                // Apply diversions
+                if let Some(diversion) = diversions.get(&entry.path) {
+                    if Some(diversion.by_package) != entry.package {
+                        // This file is diverted
+                        entry.path.clone_from(&diversion.new_path);
+                    }
+                }
+            }
+            return Ok(entries);
+        }
+    }
+    Err(anyhow::anyhow!("Failed to find data.tar in {deb_file:?}"))
+}
+
+/// Format the expected deb file name for an installed package
+fn format_deb_filename(interner: &Interner, package: &PackageInterned) -> String {
+    format!(
+        "{}_{}_{}.deb",
+        package.name.to_str(interner),
+        package.version.replace(':', "%3a"),
+        package
+            .architecture
+            .map(|e| e.to_str(interner))
+            .unwrap_or("*")
+    )
+}
 
 /// Given a package name, try to figure out the full deb file name
@@ -234,15 +386,7 @@ fn guess_deb_file_name(interner: &Interner, pkg: &str, packages: &PackageMap) ->
     if let Some(pkgref) = interner.get(pkg) {
         // Yay, it is probably installed, we know what to look for
         if let Some(package) = packages.get(&PackageRef::new(pkgref)) {
-            format!(
-                "{}_{}_{}.deb",
-                pkg,
-                package.version.replace(':', "%3a"),
-                package
-                    .architecture
-                    .map(|e| e.to_str(interner))
-                    .unwrap_or("*")
-            )
+            format_deb_filename(interner, package)
         } else {
             format!("{}_*_*.deb", pkg)
         }
@@ -446,16 +590,40 @@ impl Packages for Debian {
     }
 }
 
-// To get the original package file itno the cache: apt install --reinstall -d pkgname
+// To get the original package file into the cache: apt install --reinstall -d pkgname
 // /var/cache/apt/archives/pkgname_version_arch.deb
 // arch: all, amd64, arm64, ...
 // Epoch separator (normally :) is now %3a (URL encoded)
 
 impl FullBackend for Debian {}
 
+fn download_debs(pkgs: &[&str]) -> Result<(), anyhow::Error> {
+    let status = std::process::Command::new("apt-get")
+        .args([
+            "install",
+            "--reinstall",
+            "-y",
+            "--no-install-recommends",
+            "-d",
+        ])
+        .args(pkgs)
+        .status()?;
+    if !status.success() {
+        log::warn!("Failed to download package for {pkgs:?}");
+    };
+    Ok(())
+}
+
 fn download_deb(pkg: &str) -> Result<(), anyhow::Error> {
     let status = std::process::Command::new("apt-get")
-        .args(["install", "--reinstall", "-d", pkg])
+        .args([
+            "install",
+            "--reinstall",
+            "-y",
+            "--no-install-recommends",
+            "-d",
+            pkg,
+        ])
         .status()?;
     if !status.success() {
         log::warn!("Failed to download package for {pkg}");
diff --git a/crates/paketkoll_core/src/backend/systemd_tmpfiles.rs b/crates/paketkoll_core/src/backend/systemd_tmpfiles.rs
index f3eae821..05180650 100644
--- a/crates/paketkoll_core/src/backend/systemd_tmpfiles.rs
+++ b/crates/paketkoll_core/src/backend/systemd_tmpfiles.rs
@@ -12,12 +12,15 @@ use ahash::AHashMap;
 use anyhow::Context;
 use compact_str::CompactString;
 
-use paketkoll_types::backend::PackageMap;
 use paketkoll_types::backend::{Files, Name, OriginalFileQuery};
 use paketkoll_types::files::{
     Checksum, DeviceNode, DeviceType, Directory, Fifo, FileEntry, FileFlags, Gid, Mode,
     Permissions, Properties, RegularFile, RegularFileBasic, RegularFileSystemd, Symlink, Uid,
 };
+use paketkoll_types::{
+    backend::{PackageManagerError, PackageMap},
+    intern::{Interner, PackageRef},
+};
 use paketkoll_utils::checksum::{sha256_buffer, sha256_readable};
 use paketkoll_utils::MODE_MASK;
 use systemd_tmpfiles::specifier::Resolve;
@@ -94,6 +97,17 @@ impl Files for SystemdTmpfiles {
     ) -> anyhow::Result<AHashMap<OriginalFileQuery, Vec<u8>>> {
         anyhow::bail!("Original file queries are not supported for systemd-tmpfiles")
     }
+
+    fn files_from_archives(
+        &self,
+        _filter: &[PackageRef],
+        _package_map: &PackageMap,
+        _interner: &Interner,
+    ) -> Result<Vec<(PackageRef, Vec<FileEntry>)>, PackageManagerError> {
+        Err(PackageManagerError::UnsupportedOperation(
+            "Operation not supported for systemd-tmpfiles",
+        ))
+    }
 }
 
 /// Parse the systemd-tmpfiles output into [`FileEntry`]s that are usable by the shared later stages.
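
Note on the trait contract above: a backend that cannot derive per-file data from archives (as with systemd-tmpfiles here) reports PackageManagerError::UnsupportedOperation rather than returning an empty list, so callers can tell "unsupported" apart from "package owns no files" and fall back to files(). A minimal caller-side sketch of that fallback (the load_file_entries helper is hypothetical, not part of this patch):

use paketkoll_types::backend::{Files, PackageManagerError, PackageMap};
use paketkoll_types::files::FileEntry;
use paketkoll_types::intern::{Interner, PackageRef};

// Hypothetical helper: prefer archive-derived entries, but fall back to the
// regular package database listing when the backend does not support
// archive queries.
fn load_file_entries(
    backend: &dyn Files,
    filter: &[PackageRef],
    package_map: &PackageMap,
    interner: &Interner,
) -> anyhow::Result<Vec<FileEntry>> {
    match backend.files_from_archives(filter, package_map, interner) {
        // Flatten the per-package results into one list of file entries
        Ok(per_package) => Ok(per_package
            .into_iter()
            .flat_map(|(_pkg, files)| files)
            .collect()),
        // Unsupported is expected for some backends; use the database instead
        Err(PackageManagerError::UnsupportedOperation(_)) => backend.files(interner),
        Err(other) => Err(other.into()),
    }
}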
diff --git a/crates/paketkoll_core/src/file_ops.rs b/crates/paketkoll_core/src/file_ops.rs
index f526bd40..390e5c5c 100644
--- a/crates/paketkoll_core/src/file_ops.rs
+++ b/crates/paketkoll_core/src/file_ops.rs
@@ -299,7 +299,11 @@ pub fn canonicalize_file_entries(results: &mut Vec<FileEntry>) {
                 }
             }
         }
-        (None, _) => log::error!("Failed to resolve parent of path: {:?}", file_entry.path),
+        (None, _) => log::error!(
+            "Failed to resolve parent of path: {:?}: {:?}",
+            file_entry.path,
+            file_entry
+        ),
         (_, None) => {
             log::error!("Failed to resolve filenameI of path: {:?}", file_entry.path);
         }
diff --git a/crates/paketkoll_core/src/utils.rs b/crates/paketkoll_core/src/utils.rs
index 141ad808..e435410d 100644
--- a/crates/paketkoll_core/src/utils.rs
+++ b/crates/paketkoll_core/src/utils.rs
@@ -155,6 +155,58 @@ pub(crate) fn locate_package_file(
     Ok(None)
 }
 
+pub(crate) struct PackageQuery<'a> {
+    pub(crate) package_match: &'a str,
+    pub(crate) package: &'a str,
+}
+
+/// Search the directory based caches for the given packages, returning those that could not be found
+#[cfg(feature = "__extraction")]
+pub(crate) fn missing_packages<'strings>(
+    dir_candidates: &[&str],
+    package_matches: impl Iterator<Item = PackageQuery<'strings>>,
+) -> Result<Vec<&'strings str>, anyhow::Error> {
+    let mut missing = vec![];
+    // Try to locate package
+    for PackageQuery {
+        package_match,
+        package,
+    } in package_matches
+    {
+        for dir in dir_candidates.iter() {
+            let path = format!("{}/{}", dir, package_match);
+            let entries = glob::glob_with(
+                &path,
+                glob::MatchOptions {
+                    case_sensitive: true,
+                    require_literal_separator: true,
+                    require_literal_leading_dot: true,
+                },
+            );
+            match entries {
+                Ok(paths) => {
+                    let mut paths: SmallVec<[_; 5]> = paths.collect::<Result<_, _>>()?;
+                    paths.sort();
+                    if paths.len() > 1 {
+                        log::warn!(
+                            "Found multiple matches for {package}, taking latest in sort order: {}",
+                            paths
+                                .last()
+                                .expect("We know there is at least one")
+                                .display()
+                        );
+                    }
+                    if paths.is_empty() {
+                        missing.push(package);
+                    }
+                }
+                Err(_) => continue,
+            }
+        }
+    }
+    Ok(missing)
+}
+
 /// Extract files from a generic tar archive
 #[cfg(feature = "__extraction")]
 pub(crate) fn extract_files(
@@ -162,7 +214,7 @@ pub(crate) fn extract_files(
     queries: &AHashSet<&str>,
     results: &mut AHashMap<OriginalFileQuery, Vec<u8>>,
     pkg: &str,
-    name_manger: impl Fn(&str) -> CompactString,
+    name_map_filter: impl Fn(&str) -> Option<CompactString>,
 ) -> Result<(), anyhow::Error> {
     let mut seen = AHashSet::new();
 
@@ -175,7 +227,10 @@ pub(crate) fn extract_files(
         let path = path
             .to_str()
             .ok_or_else(|| anyhow::anyhow!("Failed to convert path to string"))?;
-        let path = name_manger(path);
+        let path = match name_map_filter(path) {
+            Some(v) => v,
+            None => continue,
+        };
         if let Some(pkg_idx) = queries.get(path.as_str()) {
             seen.insert(*pkg_idx);
             let mut contents = Vec::new();
@@ -204,3 +259,125 @@ pub(crate) fn extract_files(
     };
     Ok(())
 }
+
+/// Convert a stream of tar entries to a list of file entries
+#[cfg(feature = "__extraction")]
+pub(crate) fn convert_archive_entries(
+    mut archive: tar::Archive<impl std::io::Read>,
+    pkg_ref: paketkoll_types::intern::PackageRef,
+    source: &'static str,
+    name_map_filter: impl Fn(&std::path::Path) -> Option<std::borrow::Cow<'_, std::path::Path>>,
+) -> Result<Vec<paketkoll_types::files::FileEntry>, anyhow::Error> {
+    use std::time::SystemTime;
+
+    use paketkoll_types::files::{
+        Directory, FileEntry, FileFlags, Gid, Mode, Properties, RegularFile, Symlink, Uid,
+    };
+    use paketkoll_utils::checksum::sha256_readable;
+
+    let mut results = AHashMap::new();
+    for entry in archive
+        .entries()
+        .context("Failed to read package archive")?
+    {
+        let mut entry = entry?;
+        let path = entry.path()?;
+        let path = path.as_ref();
+        let path = match name_map_filter(path) {
+            Some(v) => v.into_owned(),
+            None => continue,
+        };
+        let mode = Mode::new(entry.header().mode()?);
+        let owner = Uid::new(entry.header().uid()?.try_into()?);
+        let group = Gid::new(entry.header().gid()?.try_into()?);
+        match entry.header().entry_type() {
+            tar::EntryType::Regular | tar::EntryType::Continuous => {
+                let size = entry.size();
+                assert_eq!(size, entry.header().size()?);
+                let mtime = entry.header().mtime()?;
+                results.insert(
+                    path.clone(),
+                    FileEntry {
+                        package: Some(pkg_ref),
+                        path,
+                        properties: Properties::RegularFile(RegularFile {
+                            mode,
+                            owner,
+                            group,
+                            size,
+                            mtime: SystemTime::UNIX_EPOCH + std::time::Duration::from_secs(mtime),
+                            checksum: sha256_readable(&mut entry)?,
+                        }),
+                        flags: FileFlags::empty(),
+                        source,
+                        seen: Default::default(),
+                    },
+                );
+            }
+            tar::EntryType::Link | tar::EntryType::GNULongLink => {
+                let link = entry.link_name()?.expect("No link name");
+                let link = name_map_filter(link.as_ref())
+                    .expect("Filtered link name")
+                    .into_owned();
+                let existing = results
+                    .get(&link)
+                    .expect("Links must refer to already archived files");
+                let mut new = existing.clone();
+                new.path = path.clone();
+                results.insert(path.clone(), new);
+            }
+            tar::EntryType::Symlink => {
+                let link = entry.link_name()?;
+                results.insert(
+                    path.clone(),
+                    FileEntry {
+                        package: Some(pkg_ref),
+                        path,
+                        properties: Properties::Symlink(Symlink {
+                            owner,
+                            group,
+                            target: link
+                                .ok_or(anyhow::anyhow!("Failed to get link target"))?
+                                .into(),
+                        }),
+                        flags: FileFlags::empty(),
+                        source,
+                        seen: Default::default(),
+                    },
+                );
+            }
+            tar::EntryType::Char | tar::EntryType::Block | tar::EntryType::Fifo => {
+                results.insert(
+                    path.clone(),
+                    FileEntry {
+                        package: Some(pkg_ref),
+                        path,
+                        properties: Properties::Special,
+                        flags: FileFlags::empty(),
+                        source,
+                        seen: Default::default(),
+                    },
+                );
+            }
+            tar::EntryType::Directory => {
+                results.insert(
+                    path.clone(),
+                    FileEntry {
+                        package: Some(pkg_ref),
+                        path,
+                        properties: Properties::Directory(Directory { mode, owner, group }),
+                        flags: FileFlags::empty(),
+                        source,
+                        seen: Default::default(),
+                    },
+                );
+            }
+            tar::EntryType::GNUSparse
+            | tar::EntryType::GNULongName
+            | tar::EntryType::XGlobalHeader
+            | tar::EntryType::XHeader => todo!(),
+            _ => todo!(),
+        }
+    }
+    Ok(results.into_values().collect())
+}
diff --git a/crates/paketkoll_types/Cargo.toml b/crates/paketkoll_types/Cargo.toml
index 08ec736b..271bde8e 100644
--- a/crates/paketkoll_types/Cargo.toml
+++ b/crates/paketkoll_types/Cargo.toml
@@ -11,7 +11,7 @@ version = "0.1.0"
 
 [features]
 # Include support for serde on public datatypes
-serde = ["dep:serde", "smallvec/serde", "bitflags/serde", "compact_str/serde"]
+serde = ["dep:serde", "dep:serde_bytes", "smallvec/serde", "bitflags/serde", "compact_str/serde"]
 
 [dependencies]
 ahash.workspace = true
@@ -28,6 +28,7 @@ lasso = { workspace = true, features = [
 ] }
 nix = { workspace = true, features = ["fs", "user"] }
 serde = { workspace = true, optional = true, features = ["derive"] }
+serde_bytes = { workspace = true, optional = true }
 smallvec.workspace = true
 strum.workspace = true
 thiserror.workspace = true
diff --git a/crates/paketkoll_types/src/backend.rs b/crates/paketkoll_types/src/backend.rs
index 98417478..a67d5250 100644
--- a/crates/paketkoll_types/src/backend.rs
+++ b/crates/paketkoll_types/src/backend.rs
@@ -60,12 +60,28 @@ pub trait Files: Name {
     /// any available metadata such as checksums or timestamps about those files
     fn files(&self, interner: &Interner) -> anyhow::Result<Vec<FileEntry>>;
 
+    /// Attempt to get file information from archives in the package cache (if supported)
+    ///
+    /// Additional archives may be downloaded if needed.
+    fn files_from_archives(
+        &self,
+        filter: &[PackageRef],
+        package_map: &PackageMap,
+        interner: &Interner,
+    ) -> Result<Vec<(PackageRef, Vec<FileEntry>)>, PackageManagerError>;
+
     /// True if this backend may benefit from path canonicalization for certain scans
     /// (i.e. paths may be inaccurate)
     fn may_need_canonicalization(&self) -> bool {
         false
     }
 
+    /// True if this backend prefers getting file information from package archives
+    /// rather than from its regular file listing (i.e. the package database lacks
+    /// metadata that consumers need)
+    fn prefer_files_from_archive(&self) -> bool {
+        false
+    }
+
     /// Find the owners of the specified files
     fn owning_packages(
         &self,
@@ -99,7 +115,7 @@ pub trait Packages: Name {
         let packages = self
             .packages(interner)
             .with_context(|| anyhow!("Failed to load package list"))?;
-        Ok(packages_to_package_map(packages))
+        Ok(packages_to_package_map(packages.iter()))
     }
 
     /// Perform installation and uninstallation of a bunch of packages
@@ -121,7 +137,7 @@
     fn remove_unused(&self, ask_confirmation: bool) -> Result<(), PackageManagerError>;
 }
 
-/// Errors that package manager transactions can produce
+/// Errors that backends can produce
 #[derive(Debug, thiserror::Error)]
 pub enum PackageManagerError {
     /// This operation isn't supported by this backend
@@ -133,12 +149,14 @@ pub enum PackageManagerError {
 }
 
 /// Convert a package vector to a package map
-pub fn packages_to_package_map(packages: Vec<PackageInterned>) -> PackageMap {
+pub fn packages_to_package_map<'a>(
+    packages: impl Iterator<Item = &'a PackageInterned>,
+) -> PackageMap {
     let mut package_map =
-        AHashMap::with_capacity_and_hasher(packages.len(), ahash::RandomState::new());
-    for package in packages.into_iter() {
+        AHashMap::with_capacity_and_hasher(packages.size_hint().0, ahash::RandomState::new());
+    for package in packages {
         if package.ids.is_empty() {
-            package_map.insert(package.name, package);
+            package_map.insert(package.name, package.clone());
         } else {
             for id in &package.ids {
                 package_map.insert(*id, package.clone());
diff --git a/crates/paketkoll_types/src/files.rs b/crates/paketkoll_types/src/files.rs
index 757b1e02..5a16aa71 100644
--- a/crates/paketkoll_types/src/files.rs
+++ b/crates/paketkoll_types/src/files.rs
@@ -163,15 +163,9 @@ impl std::fmt::Display for Gid {
 #[cfg_attr(feature = "serde", serde(rename_all = "snake_case"))]
 #[non_exhaustive]
 pub enum Checksum {
-    #[cfg_attr(
-        feature = "serde",
-        serde(serialize_with = "crate::utils::buffer_to_hex")
-    )]
+    #[cfg_attr(feature = "serde", serde(with = "serde_bytes"))]
     Md5([u8; 16]),
-    #[cfg_attr(
-        feature = "serde",
-        serde(serialize_with = "crate::utils::buffer_to_hex")
-    )]
+    #[cfg_attr(feature = "serde", serde(with = "serde_bytes"))]
     Sha256([u8; 32]),
 }
diff --git a/crates/paketkoll_types/src/lib.rs b/crates/paketkoll_types/src/lib.rs
index dd42f3ae..f8ee6b25 100644
--- a/crates/paketkoll_types/src/lib.rs
+++ b/crates/paketkoll_types/src/lib.rs
@@ -5,4 +5,3 @@ pub mod files;
 pub mod intern;
 pub mod issue;
 pub mod package;
-mod utils;
diff --git a/crates/paketkoll_types/src/utils.rs b/crates/paketkoll_types/src/utils.rs
deleted file mode 100644
index 9ac8ec7c..00000000
--- a/crates/paketkoll_types/src/utils.rs
+++ /dev/null
@@ -1,16 +0,0 @@
-//! Internal helpers
-
-/// Serializes `buffer` to a lowercase hex string.
-#[cfg(feature = "serde")] -pub(crate) fn buffer_to_hex(buffer: &T, serializer: S) -> Result -where - T: AsRef<[u8]>, - S: serde::Serializer, -{ - let buffer = buffer.as_ref(); - // We only use this for checksum, so small buffers. On the stack it goes: - let mut buf = [0u8; 128]; - let s = faster_hex::hex_encode(buffer, &mut buf) - .expect("This shouldn't fail on the data we use it for"); - serializer.serialize_str(s[0..buffer.len() * 2].as_ref()) -}