From d2b5dc48f5fdd7da181b3b3c77a64af551a226ad Mon Sep 17 00:00:00 2001 From: Alin Dima Date: Mon, 12 Aug 2024 16:56:00 +0300 Subject: [PATCH] fix av-distribution Jaeger spans mem leak (#5321) Fixes https://github.com/paritytech/polkadot-sdk/issues/5258 --- .../network/availability-distribution/src/lib.rs | 12 ++++++------ .../availability-distribution/src/requester/mod.rs | 8 +++++--- .../availability-distribution/src/requester/tests.rs | 4 ++-- prdoc/pr_5321.prdoc | 11 +++++++++++ 4 files changed, 24 insertions(+), 11 deletions(-) create mode 100644 prdoc/pr_5321.prdoc diff --git a/polkadot/node/network/availability-distribution/src/lib.rs b/polkadot/node/network/availability-distribution/src/lib.rs index ec2c01f99b01..d3185e0af809 100644 --- a/polkadot/node/network/availability-distribution/src/lib.rs +++ b/polkadot/node/network/availability-distribution/src/lib.rs @@ -25,7 +25,7 @@ use polkadot_node_subsystem::{ jaeger, messages::AvailabilityDistributionMessage, overseer, FromOrchestra, OverseerSignal, SpawnedSubsystem, SubsystemError, }; -use polkadot_primitives::Hash; +use polkadot_primitives::{BlockNumber, Hash}; use std::collections::HashMap; /// Error and [`Result`] type for this subsystem. @@ -104,7 +104,7 @@ impl AvailabilityDistributionSubsystem { /// Start processing work as passed on from the Overseer. async fn run(self, mut ctx: Context) -> std::result::Result<(), FatalError> { let Self { mut runtime, recvs, metrics, req_protocol_names } = self; - let mut spans: HashMap = HashMap::new(); + let mut spans: HashMap = HashMap::new(); let IncomingRequestReceivers { pov_req_receiver, @@ -162,7 +162,7 @@ impl AvailabilityDistributionSubsystem { }; let span = jaeger::PerLeafSpan::new(cloned_leaf.span, "availability-distribution"); - spans.insert(cloned_leaf.hash, span); + spans.insert(cloned_leaf.hash, (cloned_leaf.number, span)); log_error( requester .get_mut() @@ -172,8 +172,8 @@ impl AvailabilityDistributionSubsystem { &mut warn_freq, )?; }, - FromOrchestra::Signal(OverseerSignal::BlockFinalized(hash, _)) => { - spans.remove(&hash); + FromOrchestra::Signal(OverseerSignal::BlockFinalized(_hash, finalized_number)) => { + spans.retain(|_hash, (block_number, _span)| *block_number > finalized_number); }, FromOrchestra::Signal(OverseerSignal::Conclude) => return Ok(()), FromOrchestra::Communication { @@ -189,7 +189,7 @@ impl AvailabilityDistributionSubsystem { } => { let span = spans .get(&relay_parent) - .map(|span| span.child("fetch-pov")) + .map(|(_, span)| span.child("fetch-pov")) .unwrap_or_else(|| jaeger::Span::new(&relay_parent, "fetch-pov")) .with_trace_id(candidate_hash) .with_candidate(candidate_hash) diff --git a/polkadot/node/network/availability-distribution/src/requester/mod.rs b/polkadot/node/network/availability-distribution/src/requester/mod.rs index efbdceb43bdd..0175161af70d 100644 --- a/polkadot/node/network/availability-distribution/src/requester/mod.rs +++ b/polkadot/node/network/availability-distribution/src/requester/mod.rs @@ -39,7 +39,9 @@ use polkadot_node_subsystem_util::{ availability_chunks::availability_chunk_index, runtime::{get_occupied_cores, RuntimeInfo}, }; -use polkadot_primitives::{CandidateHash, CoreIndex, Hash, OccupiedCore, SessionIndex}; +use polkadot_primitives::{ + BlockNumber, CandidateHash, CoreIndex, Hash, OccupiedCore, SessionIndex, +}; use super::{FatalError, Metrics, Result, LOG_TARGET}; @@ -112,14 +114,14 @@ impl Requester { ctx: &mut Context, runtime: &mut RuntimeInfo, update: ActiveLeavesUpdate, - spans: &HashMap, + spans: &HashMap, ) -> Result<()> { gum::trace!(target: LOG_TARGET, ?update, "Update fetching heads"); let ActiveLeavesUpdate { activated, deactivated } = update; if let Some(leaf) = activated { let span = spans .get(&leaf.hash) - .map(|span| span.child("update-fetching-heads")) + .map(|(_, span)| span.child("update-fetching-heads")) .unwrap_or_else(|| jaeger::Span::new(&leaf.hash, "update-fetching-heads")) .with_string_tag("leaf", format!("{:?}", leaf.hash)) .with_stage(jaeger::Stage::AvailabilityDistribution); diff --git a/polkadot/node/network/availability-distribution/src/requester/tests.rs b/polkadot/node/network/availability-distribution/src/requester/tests.rs index 09567a8f87d3..decb3156004e 100644 --- a/polkadot/node/network/availability-distribution/src/requester/tests.rs +++ b/polkadot/node/network/availability-distribution/src/requester/tests.rs @@ -208,7 +208,7 @@ fn check_ancestry_lookup_in_same_session() { test_harness(test_state.clone(), |mut ctx| async move { let chain = &test_state.relay_chain; - let spans: HashMap = HashMap::new(); + let spans: HashMap = HashMap::new(); let block_number = 1; let update = ActiveLeavesUpdate { activated: Some(new_leaf(chain[block_number], block_number as u32)), @@ -281,7 +281,7 @@ fn check_ancestry_lookup_in_different_sessions() { test_harness(test_state.clone(), |mut ctx| async move { let chain = &test_state.relay_chain; - let spans: HashMap = HashMap::new(); + let spans: HashMap = HashMap::new(); let block_number = 3; let update = ActiveLeavesUpdate { activated: Some(new_leaf(chain[block_number], block_number as u32)), diff --git a/prdoc/pr_5321.prdoc b/prdoc/pr_5321.prdoc new file mode 100644 index 000000000000..97f75d28dd52 --- /dev/null +++ b/prdoc/pr_5321.prdoc @@ -0,0 +1,11 @@ +title: fix availability-distribution Jaeger spans memory leak + +doc: + - audience: Node Dev + description: | + Fixes a memory leak which caused the Jaeger span storage in availability-distribution to never be pruned and therefore increasing indefinitely. + This was caused by improper handling of finalized heads. More info in https://github.com/paritytech/polkadot-sdk/issues/5258 + +crates: + - name: polkadot-availability-distribution + bump: patch