Skip to content

Commit

Permalink
Use a Qorb pool to choose pantry clients (#6822)
Browse files Browse the repository at this point in the history
The meat of this PR is the change in implementation of
`get_pantry_address`: instead of asking our internal DNS resolver to
look up a crucible pantry (which does not randomize, so in practice we
always get whichever pantry the DNS server listed first), we ask a Qorb
connection pool for the address of a healthy client.
`get_pantry_address` itself does not use the client directly and only
cares about its address, but the pool does keep a client around so that
it can call `pantry_status()` as a health check. (It doesn't look at the
contents of the result; only whether or not the request succeeded -
@jmpesp if that should be more refined, please say so.)

This partially addresses #3763; once this lands, if a pantry is down or
unhealthy but still present in DNS (i.e., not expunged), Qorb + the
status health checks should mean we'll pick a different pantry for new
operations, instead of the current behavior of always sticking to the
first pantry in DNS.

---------

Co-authored-by: Sean Klein <[email protected]>
  • Loading branch information
jgallagher and smklein authored Oct 11, 2024
1 parent 4e8200a commit 1f3de04
Show file tree
Hide file tree
Showing 12 changed files with 195 additions and 42 deletions.
8 changes: 6 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions internal-dns/resolver/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ hickory-resolver.workspace = true
internal-dns-types.workspace = true
omicron-common.workspace = true
omicron-workspace-hack.workspace = true
qorb.workspace = true
reqwest = { workspace = true, features = ["rustls-tls", "stream"] }
slog.workspace = true
thiserror.workspace = true
Expand Down
40 changes: 40 additions & 0 deletions internal-dns/resolver/src/resolver.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,46 @@ pub enum ResolveError {
NotFoundByString(String),
}

/// A wrapper around a set of bootstrap DNS addresses, providing a convenient
/// way to construct a [`qorb::resolvers::dns::DnsResolver`] for specific
/// services.
#[derive(Debug, Clone)]
pub struct QorbResolver {
bootstrap_dns_ips: Vec<SocketAddr>,
}

impl QorbResolver {
pub fn new(bootstrap_dns_ips: Vec<SocketAddr>) -> Self {
Self { bootstrap_dns_ips }
}

pub fn bootstrap_dns_ips(&self) -> &[SocketAddr] {
&self.bootstrap_dns_ips
}

pub fn for_service(
&self,
service: ServiceName,
) -> qorb::resolver::BoxedResolver {
let config = qorb::resolvers::dns::DnsResolverConfig {
// Ignore the TTL returned by our servers, primarily to avoid
// thrashing if they return a TTL of 0 (which they currently do:
// https://github.com/oxidecomputer/omicron/issues/6790).
hardcoded_ttl: Some(std::time::Duration::MAX),
// We don't currently run additional internal DNS servers that
// themselves need to be found via a set of bootstrap DNS IPs, but
// if we did, we'd populate `resolver_service` here to tell qorb how
// to find them.
..Default::default()
};
Box::new(qorb::resolvers::dns::DnsResolver::new(
qorb::service::Name(service.srv_name()),
self.bootstrap_dns_ips.clone(),
config,
))
}
}

/// A wrapper around a DNS resolver, providing a way to conveniently
/// look up IP addresses of services based on their SRV keys.
#[derive(Clone)]
Expand Down
1 change: 1 addition & 0 deletions nexus/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ paste.workspace = true
pq-sys = "*"
progenitor-client.workspace = true
propolis-client.workspace = true
qorb.workspace = true
rand.workspace = true
ref-cast.workspace = true
reqwest = { workspace = true, features = ["json"] }
Expand Down
1 change: 1 addition & 0 deletions nexus/db-queries/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ diesel.workspace = true
diesel-dtrace.workspace = true
dropshot.workspace = true
futures.workspace = true
internal-dns-resolver.workspace = true
internal-dns-types.workspace = true
ipnetwork.workspace = true
macaddr.workspace = true
Expand Down
21 changes: 3 additions & 18 deletions nexus/db-queries/src/db/pool.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,13 @@
use super::Config as DbConfig;
use crate::db::pool_connection::{DieselPgConnector, DieselPgConnectorArgs};

use internal_dns_resolver::QorbResolver;
use internal_dns_types::names::ServiceName;
use qorb::backend;
use qorb::policy::Policy;
use qorb::resolver::{AllBackends, Resolver};
use qorb::resolvers::dns::{DnsResolver, DnsResolverConfig};
use qorb::service;
use slog::Logger;
use std::collections::BTreeMap;
use std::net::SocketAddr;
use std::sync::Arc;
use tokio::sync::watch;

Expand Down Expand Up @@ -55,19 +53,6 @@ impl Resolver for SingleHostResolver {
}
}

fn make_dns_resolver(
bootstrap_dns: Vec<SocketAddr>,
) -> qorb::resolver::BoxedResolver {
Box::new(DnsResolver::new(
service::Name(ServiceName::Cockroach.srv_name()),
bootstrap_dns,
DnsResolverConfig {
hardcoded_ttl: Some(tokio::time::Duration::MAX),
..Default::default()
},
))
}

fn make_single_host_resolver(
config: &DbConfig,
) -> qorb::resolver::BoxedResolver {
Expand Down Expand Up @@ -96,11 +81,11 @@ impl Pool {
///
/// Creating this pool does not necessarily wait for connections to become
/// available, as backends may shift over time.
pub fn new(log: &Logger, bootstrap_dns: Vec<SocketAddr>) -> Self {
pub fn new(log: &Logger, resolver: &QorbResolver) -> Self {
// Make sure diesel-dtrace's USDT probes are enabled.
usdt::register_probes().expect("Failed to register USDT DTrace probes");

let resolver = make_dns_resolver(bootstrap_dns);
let resolver = resolver.for_service(ServiceName::Cockroach);
let connector = make_postgres_connector(log);

let policy = Policy::default();
Expand Down
14 changes: 14 additions & 0 deletions nexus/src/app/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ use omicron_common::api::external::Error;
use omicron_common::api::internal::shared::SwitchLocation;
use omicron_uuid_kinds::OmicronZoneUuid;
use oximeter_producer::Server as ProducerServer;
use sagas::common_storage::make_pantry_connection_pool;
use sagas::common_storage::PooledPantryClient;
use slog::Logger;
use std::collections::HashMap;
use std::net::SocketAddrV6;
Expand Down Expand Up @@ -186,6 +188,9 @@ pub struct Nexus {
// Nexus to not all fail.
samael_max_issue_delay: std::sync::Mutex<Option<chrono::Duration>>,

/// Conection pool for Crucible pantries
pantry_connection_pool: qorb::pool::Pool<PooledPantryClient>,

/// DNS resolver for internal services
internal_resolver: internal_dns_resolver::Resolver,

Expand Down Expand Up @@ -214,10 +219,12 @@ pub struct Nexus {
impl Nexus {
/// Create a new Nexus instance for the given rack id `rack_id`
// TODO-polish revisit rack metadata
#[allow(clippy::too_many_arguments)]
pub(crate) async fn new_with_id(
rack_id: Uuid,
log: Logger,
resolver: internal_dns_resolver::Resolver,
qorb_resolver: internal_dns_resolver::QorbResolver,
pool: db::Pool,
producer_registry: &ProducerRegistry,
config: &NexusConfig,
Expand Down Expand Up @@ -473,6 +480,7 @@ impl Nexus {
as Arc<dyn nexus_auth::storage::Storage>,
),
samael_max_issue_delay: std::sync::Mutex::new(None),
pantry_connection_pool: make_pantry_connection_pool(&qorb_resolver),
internal_resolver: resolver.clone(),
external_resolver,
external_dns_servers: config
Expand Down Expand Up @@ -936,6 +944,12 @@ impl Nexus {
&self.internal_resolver
}

pub(crate) fn pantry_connection_pool(
&self,
) -> &qorb::pool::Pool<PooledPantryClient> {
&self.pantry_connection_pool
}

pub(crate) async fn dpd_clients(
&self,
) -> Result<HashMap<SwitchLocation, dpd_client::Client>, String> {
Expand Down
20 changes: 13 additions & 7 deletions nexus/src/app/sagas/common_storage.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,27 +8,33 @@ use super::*;

use crate::Nexus;
use crucible_pantry_client::types::VolumeConstructionRequest;
use internal_dns_types::names::ServiceName;
use nexus_db_queries::authz;
use nexus_db_queries::context::OpContext;
use nexus_db_queries::db;
use nexus_db_queries::db::lookup::LookupPath;
use omicron_common::api::external::Error;
use omicron_common::retry_until_known_result;
use slog::Logger;
use slog_error_chain::InlineErrorChain;
use std::net::SocketAddrV6;

mod pantry_pool;

pub(crate) use pantry_pool::make_pantry_connection_pool;
pub(crate) use pantry_pool::PooledPantryClient;

// Common Pantry operations

pub(crate) async fn get_pantry_address(
nexus: &Arc<Nexus>,
) -> Result<SocketAddrV6, ActionError> {
nexus
.resolver()
.lookup_socket_v6(ServiceName::CruciblePantry)
.await
.map_err(|e| e.to_string())
.map_err(ActionError::action_failed)
let client = nexus.pantry_connection_pool().claim().await.map_err(|e| {
ActionError::action_failed(format!(
"failed to claim pantry client from pool: {}",
InlineErrorChain::new(&e)
))
})?;
Ok(client.address())
}

pub(crate) async fn call_pantry_attach_for_disk(
Expand Down
92 changes: 92 additions & 0 deletions nexus/src/app/sagas/common_storage/pantry_pool.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! `qorb` support for Crucible pantry connection pooling.

use anyhow::anyhow;
use anyhow::Context;
use internal_dns_resolver::QorbResolver;
use internal_dns_types::names::ServiceName;
use qorb::backend;
use qorb::pool;
use std::net::SocketAddr;
use std::net::SocketAddrV6;
use std::sync::Arc;

/// Wrapper around a Crucible pantry client that also remembers its address.
///
/// In most cases when Nexus wants to pick a pantry, it doesn't actually want a
/// client right then, but instead wants to write down its address for subsequent
/// use (and reuse) later. This type carries around a `client` only to perform
/// health checks as supported by `qorb`; the rest of Nexus only accesses its
/// `address`.
#[derive(Debug)]
pub(crate) struct PooledPantryClient {
client: crucible_pantry_client::Client,
address: SocketAddrV6,
}

impl PooledPantryClient {
pub(crate) fn address(&self) -> SocketAddrV6 {
self.address
}
}

/// A [`backend::Connector`] for [`PooledPantryClient`]s.
#[derive(Debug)]
struct PantryConnector;

#[async_trait::async_trait]
impl backend::Connector for PantryConnector {
type Connection = PooledPantryClient;

async fn connect(
&self,
backend: &backend::Backend,
) -> Result<Self::Connection, backend::Error> {
let address = match backend.address {
SocketAddr::V6(addr) => addr,
SocketAddr::V4(addr) => {
return Err(backend::Error::Other(anyhow!(
"unexpected IPv4 address for Crucible pantry: {addr}"
)));
}
};
let client =
crucible_pantry_client::Client::new(&format!("http://{address}"));
Ok(PooledPantryClient { client, address })
}

async fn is_valid(
&self,
conn: &mut Self::Connection,
) -> Result<(), backend::Error> {
conn.client
.pantry_status()
.await
.with_context(|| {
format!("failed to fetch pantry status from {}", conn.address())
})
.map_err(backend::Error::Other)?;

Ok(())
}

async fn on_acquire(
&self,
conn: &mut Self::Connection,
) -> Result<(), backend::Error> {
self.is_valid(conn).await
}
}

pub(crate) fn make_pantry_connection_pool(
qorb_resolver: &QorbResolver,
) -> pool::Pool<PooledPantryClient> {
pool::Pool::new(
qorb_resolver.for_service(ServiceName::CruciblePantry),
Arc::new(PantryConnector),
qorb::policy::Policy::default(),
)
}
5 changes: 0 additions & 5 deletions nexus/src/app/sagas/region_replacement_drive.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1035,11 +1035,6 @@ async fn srrd_drive_region_replacement_prepare(
"disk id" => ?disk.id(),
);

// XXX: internal-dns does not randomize the order of addresses
// in its responses: if the first Pantry in the list of
// addresses returned by DNS isn't responding, the drive saga
// will still continually try to use it.

let pantry_address = get_pantry_address(nexus).await?;

DriveAction::Pantry {
Expand Down
Loading

0 comments on commit 1f3de04

Please sign in to comment.