Skip to content

Commit

Permalink
DAOS-14018 pool: dup operation detection metadata layout (daos-stack#…
Browse files Browse the repository at this point in the history
…13078)

This is the first of multiple patches that will implement
a duplicate (retry) RPC/operation detection feature for pool/container
service metadata RPCs.

With this change, a new pool/container service root-level KVS (svc_ops)
is added so that (eventually, after subsequent patches) a recent
history of handled client ops can be kept, and so that duplicate ops
can be detected and handled appropriately. A boolean (svc_ops_enabled)
is also added to the pool rdb.

For old layout pools, or those with the prior rdb default size 128 MiB,
svc_ops_enabled will be false, since the amount of history to be kept
in this design is too large for that rdb size. Related to this, the
default value of DAOS_MD_CAP is increased to 1024 MiB so that new
pools created will have sufficient rdb capacity. A debug log message
is shown during step up to reveal (especially for pool start/create)
if the feature is enabled.

Because of the layout change, pool and container upgrade logic
is also changed to enable migration from older layouts to this latest
version (e.g., DAOS v2.4 pools at global version 2 -> 3).

Also, some comments and minor logic changes in the container
create/destroy handling execution flows are added, mostly as a TODO,
and to show how duplicate ops scenarios can/will be handled.

Finally, an independent enhancement to pool service distributed start
is included in this patch. When pool service membership is changed
to add new replicas, the existing rdb size is provided as the size
argument, rather than relying on the DAOS_MD_CAP environment variable
value, which of course could have changed between server/engine
starts with an administrator's edits of the daos_server.yml file.

Signed-off-by: Kenneth Cain <[email protected]>
Co-authored-by: Li Wei <[email protected]>
  • Loading branch information
kccain and liw authored Oct 6, 2023
1 parent aa7ecb7 commit 443ff85
Show file tree
Hide file tree
Showing 14 changed files with 294 additions and 65 deletions.
121 changes: 86 additions & 35 deletions src/container/srv_container.c
Original file line number Diff line number Diff line change
Expand Up @@ -102,37 +102,47 @@ cont_svc_init(struct cont_svc *svc, const uuid_t pool_uuid, uint64_t id,
/* cs_root */
rc = rdb_path_init(&svc->cs_root);
if (rc != 0)
D_GOTO(err_lock, rc);
goto err_lock;
rc = rdb_path_push(&svc->cs_root, &rdb_path_root_key);
if (rc != 0)
D_GOTO(err_root, rc);
goto err_root;

/* cs_uuids */
rc = rdb_path_clone(&svc->cs_root, &svc->cs_uuids);
if (rc != 0)
D_GOTO(err_root, rc);
goto err_root;
rc = rdb_path_push(&svc->cs_uuids, &ds_cont_prop_cuuids);
if (rc != 0)
D_GOTO(err_uuids, rc);
goto err_uuids;

/* cs_conts */
rc = rdb_path_clone(&svc->cs_root, &svc->cs_conts);
if (rc != 0)
D_GOTO(err_uuids, rc);
goto err_uuids;
rc = rdb_path_push(&svc->cs_conts, &ds_cont_prop_conts);
if (rc != 0)
D_GOTO(err_conts, rc);
goto err_conts;

/* cs_hdls */
rc = rdb_path_clone(&svc->cs_root, &svc->cs_hdls);
if (rc != 0)
D_GOTO(err_conts, rc);
goto err_conts;
rc = rdb_path_push(&svc->cs_hdls, &ds_cont_prop_cont_handles);
if (rc != 0)
D_GOTO(err_hdls, rc);
goto err_hdls;

/* cs_ops */
rc = rdb_path_clone(&svc->cs_root, &svc->cs_ops);
if (rc != 0)
goto err_hdls;
rc = rdb_path_push(&svc->cs_ops, &ds_cont_prop_svc_ops);
if (rc != 0)
goto err_svcops;

return 0;

err_svcops:
rdb_path_fini(&svc->cs_ops);
err_hdls:
rdb_path_fini(&svc->cs_hdls);
err_conts:
Expand All @@ -150,6 +160,7 @@ cont_svc_init(struct cont_svc *svc, const uuid_t pool_uuid, uint64_t id,
static void
cont_svc_fini(struct cont_svc *svc)
{
rdb_path_fini(&svc->cs_ops);
rdb_path_fini(&svc->cs_hdls);
rdb_path_fini(&svc->cs_conts);
rdb_path_fini(&svc->cs_uuids);
Expand Down Expand Up @@ -544,8 +555,8 @@ get_nhandles(struct rdb_tx *tx, struct d_hash_table *nhc, struct cont *cont, enu

/* check if container exists by UUID and (if applicable) non-default label */
static int
cont_existence_check(struct rdb_tx *tx, struct cont_svc *svc,
uuid_t puuid, uuid_t cuuid, char *clabel)
cont_create_existence_check(struct rdb_tx *tx, struct cont_svc *svc, uuid_t puuid, uuid_t cuuid,
char *clabel, bool dup_op)
{
d_iov_t key;
d_iov_t val;
Expand All @@ -568,6 +579,11 @@ cont_existence_check(struct rdb_tx *tx, struct cont_svc *svc,
D_DEBUG(DB_MD, DF_CONT": no label, lookup by UUID "DF_UUIDF
" "DF_RC"\n", DP_CONT(puuid, cuuid), DP_UUID(cuuid),
DP_RC(rc));

/* UUID found is an "already exists" error if this is a new (not a retry) RPC */
if (may_exist && !dup_op)
return -DER_EXIST;

return rc;
}

Expand Down Expand Up @@ -597,6 +613,11 @@ cont_existence_check(struct rdb_tx *tx, struct cont_svc *svc,
DP_UUID(match_cuuid));
return -DER_INVAL;
}

/* UUID and label found and match. Error if this is a new (not a retry) RPC */
if (!dup_op)
return -DER_EXIST;

return 0;
}

Expand Down Expand Up @@ -998,8 +1019,8 @@ cont_prop_write(struct rdb_tx *tx, const rdb_path_t *kvs, daos_prop_t *prop,
}

static int
cont_create(struct rdb_tx *tx, struct ds_pool_hdl *pool_hdl,
struct cont_svc *svc, crt_rpc_t *rpc)
cont_create(struct rdb_tx *tx, struct ds_pool_hdl *pool_hdl, struct cont_svc *svc, bool dup_op,
crt_rpc_t *rpc)
{
struct cont_create_in *in = crt_req_get(rpc);
daos_prop_t *prop_dup = NULL;
Expand Down Expand Up @@ -1064,8 +1085,8 @@ cont_create(struct rdb_tx *tx, struct ds_pool_hdl *pool_hdl,
}

/* Check if a container with this UUID and label already exists */
rc = cont_existence_check(tx, svc, pool_hdl->sph_pool->sp_uuid,
in->cci_op.ci_uuid, lbl);
rc = cont_create_existence_check(tx, svc, pool_hdl->sph_pool->sp_uuid, in->cci_op.ci_uuid,
lbl, dup_op);
if (rc != -DER_NONEXIST) {
if (rc == 0)
D_DEBUG(DB_MD, DF_CONT": container already exists\n",
Expand Down Expand Up @@ -5136,20 +5157,23 @@ static int
cont_op_with_svc(struct ds_pool_hdl *pool_hdl, struct cont_svc *svc,
crt_rpc_t *rpc, int cont_proto_ver)
{
struct cont_op_in *in = crt_req_get(rpc);
struct cont_open_bylabel_in *olbl_in = NULL;
struct cont_open_bylabel_out *olbl_out = NULL;
struct cont_destroy_bylabel_in *dlbl_in = NULL;
struct rdb_tx tx;
crt_opcode_t opc = opc_get(rpc->cr_opc);
struct cont *cont = NULL;
struct cont_pool_metrics *metrics;
bool update_mtime = false;
int rc;
struct cont_op_in *in = crt_req_get(rpc);
struct cont_open_bylabel_in *olbl_in = NULL;
struct cont_open_bylabel_out *olbl_out = NULL;
struct cont_destroy_bylabel_in *dlbl_in = NULL;
struct rdb_tx tx;
crt_opcode_t opc = opc_get(rpc->cr_opc);
struct cont *cont = NULL;
struct cont_pool_metrics *metrics;
bool update_mtime = false;
bool dup_op = false;
const char *clbl = NULL;
char cuuid[37];
int rc;

rc = rdb_tx_begin(svc->cs_rsvc->s_db, svc->cs_rsvc->s_term, &tx);
if (rc != 0)
D_GOTO(out, rc);
goto out;

/* TODO: Implement per-container locking. */
if (opc == CONT_QUERY || opc == CONT_ATTR_GET ||
Expand All @@ -5158,30 +5182,55 @@ cont_op_with_svc(struct ds_pool_hdl *pool_hdl, struct cont_svc *svc,
else
ABT_rwlock_wrlock(svc->cs_lock);

/* TODO: add client-provided metadata RPC key, lookup in cs_ops KVS, assign dup_op */

switch (opc) {
case CONT_CREATE:
rc = cont_create(&tx, pool_hdl, svc, rpc);
rc = cont_create(&tx, pool_hdl, svc, dup_op, rpc);
if (likely(rc == 0)) {
metrics = pool_hdl->sph_pool->sp_metrics[DAOS_CONT_MODULE];
d_tm_inc_counter(metrics->create_total, 1);
}
if (dup_op)
goto out_lock;

break;
case CONT_OPEN_BYLABEL:
olbl_in = crt_req_get(rpc);
olbl_out = crt_reply_get(rpc);
rc = cont_lookup_bylabel(&tx, svc, olbl_in->coli_label, &cont);
/* TODO: idempotent rc=0 return for dup_op case. */
if (rc != 0)
goto out_lock;
/* NB: call common cont_op_with_cont() same as CONT_OPEN case */
rc = cont_op_with_cont(&tx, pool_hdl, cont, rpc, &update_mtime, cont_proto_ver);
uuid_copy(olbl_out->colo_uuid, cont->c_uuid);
break;
case CONT_DESTROY_BYLABEL:
dlbl_in = crt_req_get(rpc);
rc = cont_lookup_bylabel(&tx, svc, dlbl_in->cdli_label, &cont);
if (rc != 0)
case CONT_DESTROY:
if (opc == CONT_DESTROY_BYLABEL) {
dlbl_in = crt_req_get(rpc);
clbl = dlbl_in->cdli_label;
rc = cont_lookup_bylabel(&tx, svc, dlbl_in->cdli_label, &cont);
} else {
uuid_unparse(in->ci_uuid, cuuid);
rc = cont_lookup(&tx, svc, in->ci_uuid, &cont);
}
if (rc == -DER_NONEXIST && dup_op) {
D_DEBUG(DB_MD, DF_UUID ":%s: do not destroy already-destroyed container\n",
DP_UUID(pool_hdl->sph_pool->sp_uuid), clbl ? clbl : cuuid);
rc = 0;
goto out_lock;
} else if (rc == 0 && dup_op) {
/* original rpc destroyed container. But another one was created! */
D_DEBUG(DB_MD,
DF_UUID ":%s: do not destroy already-destroyed "
"(and since recreated!) container\n",
DP_UUID(pool_hdl->sph_pool->sp_uuid), clbl ? clbl : cuuid);
goto out_contref;
} else if (rc != 0) {
goto out_lock;
/* NB: call common cont_op_with_cont() same as CONT_DESTROY */
}
rc = cont_op_with_cont(&tx, pool_hdl, cont, rpc, &update_mtime, cont_proto_ver);
break;
default:
Expand All @@ -5192,6 +5241,7 @@ cont_op_with_svc(struct ds_pool_hdl *pool_hdl, struct cont_svc *svc,
}
if (rc != 0)
goto out_contref;
/* TODO: assign cs_ops value rc=0 */

/* Update container metadata modified times as applicable
* NB: this is a NOOP if the pool has not been upgraded to the layout containing mdtimes.
Expand All @@ -5200,12 +5250,13 @@ cont_op_with_svc(struct ds_pool_hdl *pool_hdl, struct cont_svc *svc,
if (rc != 0)
goto out_contref;

/* TODO: insert client RPC key (UUID + timestamp) and value (rc) in cs_ops */

rc = rdb_tx_commit(&tx);
if (rc != 0)
D_ERROR(DF_CONT": rpc=%p opc=%u hdl="DF_UUID" rdb_tx_commit "
"failed: "DF_RC"\n",
DP_CONT(pool_hdl->sph_pool->sp_uuid, in->ci_uuid),
rpc, opc, DP_UUID(in->ci_hdl), DP_RC(rc));
D_ERROR(DF_CONT ": rpc=%p opc=%u hdl=" DF_UUID " rdb_tx_commit failed: " DF_RC "\n",
DP_CONT(pool_hdl->sph_pool->sp_uuid, in->ci_uuid), rpc, opc,
DP_UUID(in->ci_hdl), DP_RC(rc));

out_contref:
if (cont)
Expand All @@ -5215,7 +5266,7 @@ cont_op_with_svc(struct ds_pool_hdl *pool_hdl, struct cont_svc *svc,
rdb_tx_end(&tx);
out:
/* Propagate new snapshot list by IV */
if (rc == 0) {
if (!dup_op && (rc == 0)) {
if (opc == CONT_SNAP_CREATE || opc == CONT_SNAP_DESTROY)
ds_cont_update_snap_iv(svc, in->ci_uuid);
else if (opc == CONT_PROP_SET)
Expand Down
1 change: 1 addition & 0 deletions src/container/srv_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ struct cont_svc {
rdb_path_t cs_uuids; /* container UUIDs KVS */
rdb_path_t cs_conts; /* container KVS */
rdb_path_t cs_hdls; /* container handle KVS */
rdb_path_t cs_ops; /* metadata ops KVS */
struct ds_pool *cs_pool;

/* Manage the EC aggregation epoch */
Expand Down
2 changes: 2 additions & 0 deletions src/container/srv_layout.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ RDB_STRING_KEY(ds_cont_prop_, cuuids);
RDB_STRING_KEY(ds_cont_prop_, conts);
RDB_STRING_KEY(ds_cont_prop_, cont_handles);
RDB_STRING_KEY(ds_cont_prop_, oit_oids);
RDB_STRING_KEY(ds_cont_prop_, svc_ops);
RDB_STRING_KEY(ds_cont_prop_, svc_ops_enabled);

/* Container properties KVS */
RDB_STRING_KEY(ds_cont_prop_, ghce);
Expand Down
17 changes: 14 additions & 3 deletions src/container/srv_layout.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* (C) Copyright 2016-2022 Intel Corporation.
* (C) Copyright 2016-2023 Intel Corporation.
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand All @@ -15,13 +15,15 @@
* for ds_cont:
*
* Root KVS (GENERIC):
* Container UUIDs KVS (GENERIC):
* Container KVS (GENERIC):
* Container property KVS (GENERIC):
* Snapshot KVS (INTEGER)
* User attribute KVS (GENERIC)
* Handle index KVS (GENERIC)
* ... (more container property KVSs)
* Container handle KVS (GENERIC)
* Service ops KVS (GENERIC) - NB used by both pool and container modules
*
* The version of the whole layout is defined by ds_pool_prop_global_version.
*/
Expand All @@ -40,8 +42,8 @@
*
* extern d_iov_t ds_cont_prop_new_key; comment_on_value_type
*
* Note 1. The "new_key" name in ds_cont_prop_new_key must not appear in the
* root KVS in src/pool/srv_layout.h, that is, there must not be a
* Note 1. The "new_key" name in ds_cont_prop_new_key must not appear (with very few exceptions)
* in the root KVS in src/pool/srv_layout.h, that is, there must usually not be a
* ds_pool_prop_new_key, because the two root KVSs are the same RDB KVS.
*
* Note 2. The comment_on_value_type shall focus on the value type only;
Expand All @@ -51,6 +53,8 @@
extern d_iov_t ds_cont_prop_cuuids; /* container UUIDs KVS */
extern d_iov_t ds_cont_prop_conts; /* container KVS */
extern d_iov_t ds_cont_prop_cont_handles; /* container handle KVS */
extern d_iov_t ds_cont_prop_svc_ops; /* service ops KVS - common to pool, container */
extern d_iov_t ds_cont_prop_svc_ops_enabled; /* uint32_t - common to pool, container */
/* Please read the IMPORTANT notes above before adding new keys. */

/*
Expand Down Expand Up @@ -156,6 +160,13 @@ struct container_hdl {
uint64_t ch_sec_capas;
};

/*
* Service ops KVS (RDB_KVS_GENERIC)
*
* Each key is a client UUID and HLC timestamp, defined in struct svc_op_key.
* Each value represents the result of handling that RPC, defined in struct svc_op_val.
*/

extern daos_prop_t cont_prop_default;
extern daos_prop_t cont_prop_default_v0;

Expand Down
2 changes: 1 addition & 1 deletion src/include/daos/pool.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@
/*
* Version 1 corresponds to 2.2 (aggregation optimizations)
* Version 2 corresponds to 2.4 (dynamic evtree, checksum scrubbing)
* Version 3 corresponds to 2.6 (root embedded values)
* Version 3 corresponds to 2.6 (root embedded values, pool service operations tracking KVS)
*/
#define DAOS_POOL_GLOBAL_VERSION 3

Expand Down
4 changes: 2 additions & 2 deletions src/include/daos_srv/container.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@
void ds_cont_wrlock_metadata(struct cont_svc *svc);
void ds_cont_rdlock_metadata(struct cont_svc *svc);
void ds_cont_unlock_metadata(struct cont_svc *svc);
int ds_cont_init_metadata(struct rdb_tx *tx, const rdb_path_t *kvs,
const uuid_t pool_uuid);
int
ds_cont_init_metadata(struct rdb_tx *tx, const rdb_path_t *kvs, const uuid_t pool_uuid);
int ds_cont_svc_init(struct cont_svc **svcp, const uuid_t pool_uuid,
uint64_t id, struct ds_rsvc *rsvc);
void ds_cont_svc_fini(struct cont_svc **svcp);
Expand Down
6 changes: 3 additions & 3 deletions src/include/daos_srv/control.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,10 @@ dpdk_cli_override_opts;
#define NVME_DEV_FL_INUSE (1 << 1) /* Used by DAOS (present in SMD) */
#define NVME_DEV_FL_FAULTY (1 << 2) /* Faulty state has been assigned */

/** Env defining the size of a metadata pmem pool/file in MiBs */
/** Env defining the size of a metadata pmem pool/file allocated during pool create, in MiBs */
#define DAOS_MD_CAP_ENV "DAOS_MD_CAP"
/** Default size of a metadata pmem pool/file (128 MiB) */
#define DEFAULT_DAOS_MD_CAP_SIZE (1ul << 27)
/** Default size of a metadata pmem pool/file (1024 MiB) */
#define DEFAULT_DAOS_MD_CAP_SIZE (1ul << 30)

/** Utility macros */
#define CHK_FLAG(x, m) ((x & m) == m)
Expand Down
11 changes: 11 additions & 0 deletions src/include/daos_srv/pool.h
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,17 @@ struct ds_pool_child {
void *spc_metrics[DAOS_NR_MODULE];
};

/*
 * Key type for entries that record a handled service (metadata) operation.
 * A key pairs a client identifier with a client-side time value, so that a
 * retried operation from the same client can be matched to the original.
 *
 * NOTE(review): the container layout notes describe the service ops KVS key as
 * "a client UUID and HLC timestamp"; nothing in this declaration itself fixes
 * the time base of mdk_client_time -- confirm against the code that fills it.
 */
struct svc_op_key {
uint64_t mdk_client_time; /* client-provided time of the operation (presumably an HLC timestamp) */
uuid_t mdk_client_id; /* UUID identifying the client that issued the operation */
/* TODO: add a (cart) opcode to the key? */
};

/*
 * Value type paired with struct svc_op_key: the recorded result of handling
 * the operation identified by the key.
 *
 * NOTE(review): with a 4-byte int, mdv_rc plus mdv_resvd[62] occupies 66 bytes
 * before any trailing padding the compiler adds; confirm that this is the
 * intended record size (a 60-byte reserved area would give a 64-byte value).
 */
struct svc_op_val {
int mdv_rc; /* return code produced when the operation was handled */
char mdv_resvd[62]; /* presumably reserved space for future fields -- confirm */
};

struct ds_pool_child *ds_pool_child_lookup(const uuid_t uuid);
struct ds_pool_child *ds_pool_child_get(struct ds_pool_child *child);
void ds_pool_child_put(struct ds_pool_child *child);
Expand Down
4 changes: 3 additions & 1 deletion src/include/daos_srv/rdb.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* (C) Copyright 2017-2022 Intel Corporation.
* (C) Copyright 2017-2023 Intel Corporation.
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -164,6 +164,8 @@ int rdb_campaign(struct rdb *db);
bool rdb_is_leader(struct rdb *db, uint64_t *term);
int rdb_get_leader(struct rdb *db, uint64_t *term, d_rank_t *rank);
int rdb_get_ranks(struct rdb *db, d_rank_list_t **ranksp);
int
rdb_get_size(struct rdb *db, size_t *sizep);
int rdb_add_replicas(struct rdb *db, d_rank_list_t *replicas);
int rdb_remove_replicas(struct rdb *db, d_rank_list_t *replicas);
int rdb_ping(struct rdb *db, uint64_t caller_term);
Expand Down
Loading

0 comments on commit 443ff85

Please sign in to comment.