nydusify: introduce chunkdict generate subcommand #1401

Open · wants to merge 12 commits into base: master
295 changes: 295 additions & 0 deletions builder/src/chunkdict_generator.rs
@@ -0,0 +1,295 @@
// Copyright (C) 2023 Nydus Developers. All rights reserved.
//
// SPDX-License-Identifier: Apache-2.0

//! Generate Chunkdict RAFS bootstrap.
//! -------------------------------------------------------------------------------------------------
//! Bug 1: Inconsistent chunk size leading to a blob smaller than 4K (v6_block_size).
//! Description: Chunk sizes are not uniform, so a blob composed of a group of such
//! chunks may end up smaller than 4K (v6_block_size) and therefore fail the size check.
//! -------------------------------------------------------------------------------------------------
//! Bug 2: Incorrect chunk count due to premature check logic.
//! Description: The chunk count is currently derived from size / chunk_size. This is
//! flawed because it runs before the check that accounts for chunk statistics, and
//! therefore yields an inaccurate chunk count.
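//! Example for Bug 1 (illustrative numbers only): three 1 KiB chunks from the same
//! blob total 3 KiB, which is below the 4 KiB block size, so validate_and_remove_chunks
//! drops all three chunks rather than emitting an undersized blob.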

use super::core::node::{ChunkSource, NodeInfo};
use super::{BlobManager, Bootstrap, BootstrapManager, BuildContext, BuildOutput, Tree};
use crate::core::node::Node;
use crate::NodeChunk;
use anyhow::Result;
use nydus_rafs::metadata::chunk::ChunkWrapper;
use nydus_rafs::metadata::inode::InodeWrapper;
use nydus_rafs::metadata::layout::RafsXAttrs;
use nydus_storage::meta::BlobChunkInfoV1Ondisk;
use nydus_utils::compress::Algorithm;
use nydus_utils::digest::RafsDigest;
use std::ffi::OsString;
use std::mem::size_of;
use std::path::PathBuf;
use std::sync::Arc;

#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct ChunkdictChunkInfo {
pub image_reference: String,
pub version: String,
pub chunk_blob_id: String,
pub chunk_digest: String,
pub chunk_compressed_size: u32,
pub chunk_uncompressed_size: u32,
pub chunk_compressed_offset: u64,
pub chunk_uncompressed_offset: u64,
}

pub struct ChunkdictBlobInfo {
pub blob_id: String,
pub blob_compressed_size: u64,
pub blob_uncompressed_size: u64,
pub blob_compressor: String,
pub blob_meta_ci_compressed_size: u64,
pub blob_meta_ci_uncompressed_size: u64,
pub blob_meta_ci_offset: u64,
}

/// Struct to generate chunkdict RAFS bootstrap.
pub struct Generator {}

impl Generator {
/// Generate a chunkdict RAFS bootstrap.
pub fn generate(
ctx: &mut BuildContext,
bootstrap_mgr: &mut BootstrapManager,
blob_mgr: &mut BlobManager,
chunkdict_chunks_origin: Vec<ChunkdictChunkInfo>,
chunkdict_blobs: Vec<ChunkdictBlobInfo>,
) -> Result<BuildOutput> {
// Validate and remove chunks whose owning blob's total uncompressed size is smaller than a block.
let mut chunkdict_chunks = chunkdict_chunks_origin;
Self::validate_and_remove_chunks(ctx, &mut chunkdict_chunks);
// build root tree
let mut tree = Self::build_root_tree(ctx)?;

// build child tree
let child = Self::build_child_tree(ctx, blob_mgr, &chunkdict_chunks, &chunkdict_blobs)?;
let result = vec![child];
tree.children = result;

Self::validate_tree(&tree)?;

// build bootstrap
let mut bootstrap_ctx = bootstrap_mgr.create_ctx()?;
let mut bootstrap = Bootstrap::new(tree)?;
bootstrap.build(ctx, &mut bootstrap_ctx)?;

let blob_table = blob_mgr.to_blob_table(ctx)?;
let storage = &mut bootstrap_mgr.bootstrap_storage;
bootstrap.dump(ctx, storage, &mut bootstrap_ctx, &blob_table)?;

BuildOutput::new(blob_mgr, &bootstrap_mgr.bootstrap_storage)
}

/// Validate the generated tree by walking it depth-first and logging each node and its chunks.
fn validate_tree(tree: &Tree) -> Result<()> {
let pre = &mut |t: &Tree| -> Result<()> {
let node = t.lock_node();
debug!("chunkdict tree: ");
debug!("inode: {}", node);
for chunk in &node.chunks {
debug!("\t chunk: {}", chunk);
}
Ok(())
};
tree.walk_dfs_pre(pre)?;
debug!("chunkdict tree is valid.");
Ok(())
}

/// Check that each referenced blob's total uncompressed size is at least one block, and drop chunks belonging to blobs that are smaller.
fn validate_and_remove_chunks(ctx: &mut BuildContext, chunkdict: &mut Vec<ChunkdictChunkInfo>) {
let mut chunk_sizes = std::collections::HashMap::new();

// Accumulate the uncompressed size for each chunk_blob_id
for chunk in chunkdict.iter() {
*chunk_sizes.entry(chunk.chunk_blob_id.clone()).or_insert(0) +=
chunk.chunk_uncompressed_size as u64;
}
// Find all chunk_blob_ids whose total uncompressed size is smaller than v6_block_size
let small_chunks: Vec<String> = chunk_sizes
.into_iter()
.filter(|&(_, size)| size < ctx.v6_block_size())
.inspect(|(id, _)| {
eprintln!(
"Warning: Blob with id '{}' is smaller than {} bytes.",
id,
ctx.v6_block_size()
)
})
.map(|(id, _)| id)
.collect();

// Retain only chunks whose blob's total uncompressed size is at least v6_block_size
chunkdict.retain(|chunk| !small_chunks.contains(&chunk.chunk_blob_id));
}

/// Build root tree
pub fn build_root_tree(ctx: &mut BuildContext) -> Result<Tree> {
// inode
let mut inode = InodeWrapper::new(ctx.fs_version);
inode.set_ino(1);
inode.set_uid(1000);
inode.set_gid(1000);
inode.set_projid(0);
inode.set_mode(0o660 | libc::S_IFDIR as u32);
inode.set_nlink(3);
inode.set_name_size("/".len());
inode.set_rdev(0);
inode.set_blocks(256);
let node_info = NodeInfo {
explicit_uidgid: true,
src_dev: 0,
src_ino: 0,
rdev: 0,
source: PathBuf::from("/"),
path: PathBuf::from("/"),
target: PathBuf::from("/"),
target_vec: vec![OsString::from("/")],
symlink: None,
xattrs: RafsXAttrs::default(),
v6_force_extended_inode: true,
};
let root_node = Node::new(inode, node_info, 0);
let tree = Tree::new(root_node);
Ok(tree)
}

/// Build child tree
fn build_child_tree(
ctx: &mut BuildContext,
blob_mgr: &mut BlobManager,
chunkdict_chunks: &[ChunkdictChunkInfo],
chunkdict_blobs: &[ChunkdictBlobInfo],
) -> Result<Tree> {
// node
let mut inode = InodeWrapper::new(ctx.fs_version);
inode.set_ino(2);
inode.set_uid(0);
inode.set_gid(0);
inode.set_projid(0);
inode.set_mode(0o660 | libc::S_IFREG as u32);
inode.set_nlink(1);
inode.set_name_size("chunkdict".len());
inode.set_rdev(0);
inode.set_blocks(256);
let node_info = NodeInfo {
explicit_uidgid: true,
src_dev: 0,
src_ino: 1,
rdev: 0,
source: PathBuf::from("/"),
path: PathBuf::from("/chunkdict"),
target: PathBuf::from("/chunkdict"),
target_vec: vec![OsString::from("/"), OsString::from("/chunkdict")],
symlink: None,
xattrs: RafsXAttrs::new(),
v6_force_extended_inode: true,
};
let mut node = Node::new(inode, node_info, 0);

// insert chunks
Self::insert_chunks(ctx, blob_mgr, &mut node, chunkdict_chunks, chunkdict_blobs)?;

let node_size: u64 = node
.chunks
.iter()
.map(|chunk| chunk.inner.uncompressed_size() as u64)
.sum();
node.inode.set_size(node_size);

// update child count
node.inode.set_child_count(node.chunks.len() as u32);

let child = Tree::new(node);
child
.lock_node()
.v5_set_dir_size(ctx.fs_version, &child.children);
Ok(child)
}

/// Insert chunks
fn insert_chunks(
ctx: &mut BuildContext,
blob_mgr: &mut BlobManager,
node: &mut Node,
chunkdict_chunks: &[ChunkdictChunkInfo],
chunkdict_blobs: &[ChunkdictBlobInfo],
) -> Result<()> {
for (index, chunk_info) in chunkdict_chunks.iter().enumerate() {
let chunk_size: u32 = chunk_info.chunk_compressed_size;
let file_offset = index as u64 * chunk_size as u64;
let mut chunk = ChunkWrapper::new(ctx.fs_version);

// update blob context
let (blob_index, blob_ctx) =
blob_mgr.get_or_cerate_blob_for_chunkdict(ctx, &chunk_info.chunk_blob_id)?;
if blob_ctx.blob_id.is_empty() {
Collaborator: A chunk must belong to a blob, when will the blob_id be empty?
blob_ctx.blob_id = chunk_info.chunk_blob_id.clone();
}

// Update the blob context's uncompressed size and offset.
let chunk_uncompressed_size = chunk_info.chunk_uncompressed_size;
let pre_d_offset = blob_ctx.current_uncompressed_offset;
blob_ctx.uncompressed_blob_size = pre_d_offset + chunk_uncompressed_size as u64;
blob_ctx.current_uncompressed_offset += chunk_uncompressed_size as u64;

blob_ctx.blob_meta_header.set_ci_uncompressed_size(
blob_ctx.blob_meta_header.ci_uncompressed_size()
+ size_of::<BlobChunkInfoV1Ondisk>() as u64,
);
blob_ctx.blob_meta_header.set_ci_compressed_size(
blob_ctx.blob_meta_header.ci_compressed_size()
+ size_of::<BlobChunkInfoV1Ondisk>() as u64,
);
let chunkdict_blob_info = chunkdict_blobs
.iter()
.find(|blob| blob.blob_id == chunk_info.chunk_blob_id)
.unwrap();
blob_ctx.blob_compressor = match chunkdict_blob_info.blob_compressor.as_str() {
"None" => Algorithm::None,
"Lz4Block" => Algorithm::Lz4Block,
"GZip" => Algorithm::GZip,
"Zstd" => Algorithm::Zstd,
_ => Algorithm::None,
};
blob_ctx
.blob_meta_header
.set_ci_uncompressed_size(chunkdict_blob_info.blob_meta_ci_uncompressed_size);
blob_ctx
.blob_meta_header
.set_ci_compressed_size(chunkdict_blob_info.blob_meta_ci_compressed_size);
blob_ctx
.blob_meta_header
.set_ci_compressed_offset(chunkdict_blob_info.blob_meta_ci_offset);
blob_ctx.blob_meta_header.set_ci_compressor(Algorithm::Zstd);

// update chunk
let chunk_index = blob_ctx.alloc_chunk_index()?;
chunk.set_blob_index(blob_index);
chunk.set_index(chunk_index);
chunk.set_file_offset(file_offset);
chunk.set_compressed_size(chunk_info.chunk_compressed_size);
chunk.set_compressed_offset(chunk_info.chunk_compressed_offset);
chunk.set_uncompressed_size(chunk_info.chunk_uncompressed_size);
chunk.set_uncompressed_offset(chunk_info.chunk_uncompressed_offset);
chunk.set_id(RafsDigest::from_string(&chunk_info.chunk_digest));

debug!("chunk id: {}", chunk.id());

node.chunks.push(NodeChunk {
source: ChunkSource::Build,
inner: Arc::new(chunk.clone()),
});
}
Ok(())
}
}
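For context, a minimal usage sketch of the new API follows. This is not part of the diff: the surrounding ctx, bootstrap_mgr, and blob_mgr values and every field value are illustrative assumptions; only Generator::generate, set_is_chunkdict, and the two info structs come from this PR.

// Hypothetical usage sketch (illustrative values, not part of this PR).
fn generate_chunkdict_example(
    ctx: &mut BuildContext,
    bootstrap_mgr: &mut BootstrapManager,
    blob_mgr: &mut BlobManager,
) -> anyhow::Result<BuildOutput> {
    // Mark the build so dumped blob tables carry the chunkdict flag.
    ctx.set_is_chunkdict(true);

    // One 4 KiB chunk keeps its blob at v6_block_size, so it survives
    // validate_and_remove_chunks.
    let chunks = vec![ChunkdictChunkInfo {
        image_reference: "registry.example.com/app:v1".to_string(),
        version: "v1".to_string(),
        chunk_blob_id: "blob-a".to_string(),
        chunk_digest: "a3f5...".to_string(), // hex digest string, truncated here
        chunk_compressed_size: 2048,
        chunk_uncompressed_size: 4096,
        chunk_compressed_offset: 0,
        chunk_uncompressed_offset: 0,
    }];
    let blobs = vec![ChunkdictBlobInfo {
        blob_id: "blob-a".to_string(),
        blob_compressed_size: 2048,
        blob_uncompressed_size: 4096,
        blob_compressor: "Zstd".to_string(),
        blob_meta_ci_compressed_size: 0,
        blob_meta_ci_uncompressed_size: 0,
        blob_meta_ci_offset: 0,
    }];

    Generator::generate(ctx, bootstrap_mgr, blob_mgr, chunks, blobs)
}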
37 changes: 37 additions & 0 deletions builder/src/core/context.rs
@@ -597,6 +597,9 @@ impl BlobContext {
blob_ctx
.blob_meta_header
.set_encrypted(features.contains(BlobFeatures::ENCRYPTED));
blob_ctx
.blob_meta_header
.set_is_chunkdict_generated(features.contains(BlobFeatures::IS_CHUNKDICT_GENERATED));

blob_ctx
}
@@ -955,6 +958,29 @@ impl BlobManager {
}
}

/// Get or create a blob for the chunkdict; used for chunk deduplication.
pub fn get_or_cerate_blob_for_chunkdict(
&mut self,
ctx: &BuildContext,
id: &str,
) -> Result<(u32, &mut BlobContext)> {
if self.get_blob_idx_by_id(id).is_none() {
let blob_ctx = Self::new_blob_ctx(ctx)?;
self.current_blob_index = Some(self.alloc_index()?);
self.add_blob(blob_ctx);
} else {
self.current_blob_index = self.get_blob_idx_by_id(id);
}

// Safe to unwrap because the blob context has been added.
Ok(self.get_current_blob().unwrap())
}

/// Determine if the given blob has been created.
pub fn has_blob(&self, blob_id: &str) -> bool {
self.get_blob_idx_by_id(blob_id).is_some()
}

/// Set the global chunk dictionary for chunk deduplication.
pub fn set_chunk_dict(&mut self, dict: Arc<dyn ChunkDict>) {
self.global_chunk_dict = dict
@@ -1097,6 +1123,7 @@ impl BlobManager {
compressed_blob_size,
blob_features,
flags,
build_ctx.is_chunkdict_generated,
);
}
RafsBlobTable::V6(table) => {
@@ -1116,6 +1143,7 @@
ctx.blob_toc_digest,
ctx.blob_meta_size,
ctx.blob_toc_size,
build_ctx.is_chunkdict_generated,
ctx.blob_meta_header,
ctx.cipher_object.clone(),
ctx.cipher_ctx.clone(),
@@ -1293,6 +1321,9 @@ pub struct BuildContext {
pub configuration: Arc<ConfigV2>,
/// Generate the blob cache and blob meta
pub blob_cache_generator: Option<BlobCacheGenerator>,

/// Whether the blobs are generated from a chunk dictionary.
pub is_chunkdict_generated: bool,
}

impl BuildContext {
@@ -1361,6 +1392,7 @@ impl BuildContext {
features,
configuration: Arc::new(ConfigV2::default()),
blob_cache_generator: None,
is_chunkdict_generated: false,
}
}

@@ -1379,6 +1411,10 @@ impl BuildContext {
pub fn set_configuration(&mut self, config: Arc<ConfigV2>) {
self.configuration = config;
}

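/// Mark whether this build is generating a chunkdict bootstrap.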
pub fn set_is_chunkdict(&mut self, is_chunkdict: bool) {
self.is_chunkdict_generated = is_chunkdict;
}
}

impl Default for BuildContext {
@@ -1411,6 +1447,7 @@ impl Default for BuildContext {
features: Features::new(),
configuration: Arc::new(ConfigV2::default()),
blob_cache_generator: None,
is_chunkdict_generated: false,
}
}
}
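To illustrate the get-or-create path added to BlobManager above, here is a short hypothetical sketch mirroring how Generator::insert_chunks consumes it (ctx and blob_mgr are assumed to be already constructed):

// First lookup for an unseen id allocates a fresh BlobContext whose
// blob_id is still empty; the caller then adopts the chunk's blob id,
// exactly as Generator::insert_chunks does in this PR.
let (index, blob_ctx) = blob_mgr.get_or_cerate_blob_for_chunkdict(&ctx, "blob-a")?;
if blob_ctx.blob_id.is_empty() {
    blob_ctx.blob_id = "blob-a".to_string();
}

// With the id recorded, the same call now finds and reuses the context
// instead of allocating a new one.
let (index_again, _) = blob_mgr.get_or_cerate_blob_for_chunkdict(&ctx, "blob-a")?;
assert_eq!(index, index_again);
assert!(blob_mgr.has_blob("blob-a"));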