From 267ec27d7a82c05a5ab4aa48352ec81b4e9590f9 Mon Sep 17 00:00:00 2001 From: Shane Snyder Date: Tue, 17 Sep 2024 20:58:52 +0000 Subject: [PATCH] proper handling of Lustre data with mmap logging * requires storing the total allocated _components_ and _stripes_ with the Lustre record so that this data can be read back in its entirety, even if the app terminates abruptly * at shutdown time, the Lustre module can truncate the component and OST list sizes to something smaller than originally requested, if some memory ends up not being used * darshan-util also has code to detect unused portions of Lustre records that can be truncated (which should only occur for apps that terminate abruptly and use Darshan mmap logging) --- darshan-runtime/lib/darshan-lustre.c | 75 ++++++++++++++--------- darshan-util/darshan-lustre-logutils.c | 84 +++++++++++++++----------- darshan-util/doc/darshan-util.txt | 1 + include/darshan-lustre-log-format.h | 8 ++- 4 files changed, 102 insertions(+), 66 deletions(-) diff --git a/darshan-runtime/lib/darshan-lustre.c b/darshan-runtime/lib/darshan-lustre.c index e7b620138..8d00388a8 100644 --- a/darshan-runtime/lib/darshan-lustre.c +++ b/darshan-runtime/lib/darshan-lustre.c @@ -42,9 +42,6 @@ static void lustre_cleanup( struct lustre_record_ref { struct darshan_lustre_record *record; - int max_comps; - int max_osts; - size_t record_size; }; struct lustre_runtime @@ -62,16 +59,16 @@ static int my_rank = -1; #define LUSTRE_UNLOCK() pthread_mutex_unlock(&lustre_runtime_mutex) static void darshan_get_lustre_layout_size(struct llapi_layout *lustre_layout, - int *num_comps, int *num_osts) + int *num_comps, int *num_stripes) { bool is_composite; int ret; uint64_t stripe_pattern, stripe_count; int tmp_comps = 0; - int tmp_osts = 0; + int tmp_stripes = 0; *num_comps = 0; - *num_osts = 0; + *num_stripes = 0; is_composite = llapi_layout_is_composite(lustre_layout); if (is_composite) @@ -92,7 +89,7 @@ static void darshan_get_lustre_layout_size(struct 
llapi_layout *lustre_layout, ret = llapi_layout_stripe_count_get(lustre_layout, &stripe_count); if (ret != 0) return; - tmp_osts += stripe_count; + tmp_stripes += stripe_count; } tmp_comps++; @@ -105,7 +102,7 @@ static void darshan_get_lustre_layout_size(struct llapi_layout *lustre_layout, } while(ret == 0); *num_comps = tmp_comps; - *num_osts = tmp_osts; + *num_stripes = tmp_stripes; return; } @@ -124,10 +121,7 @@ static void darshan_get_lustre_layout_components(struct llapi_layout *lustre_lay int comps_idx = 0, osts_idx = 0; struct darshan_lustre_component *comps = (struct darshan_lustre_component *)&(rec_ref->record->comps); - OST_ID *osts = (OST_ID *)(comps + rec_ref->max_comps); - - rec_ref->record_size = 0; - rec_ref->record->num_comps = 0; + OST_ID *osts = (OST_ID *)(comps + rec_ref->record->num_comps); is_composite = llapi_layout_is_composite(lustre_layout); if (is_composite) @@ -156,7 +150,7 @@ static void darshan_get_lustre_layout_components(struct llapi_layout *lustre_lay */ if ((!is_composite || (flags & LCME_FL_INIT)) && (ret == 0) && - (osts_idx + stripe_count <= rec_ref->max_osts)) + (osts_idx + stripe_count <= rec_ref->record->num_stripes)) { comps[comps_idx].counters[LUSTRE_COMP_STRIPE_SIZE] = (int64_t)stripe_size; comps[comps_idx].counters[LUSTRE_COMP_STRIPE_COUNT] = (int64_t)stripe_count; @@ -177,7 +171,6 @@ static void darshan_get_lustre_layout_components(struct llapi_layout *lustre_lay else osts[osts_idx] = (OST_ID)tmp_ost; } - rec_ref->record->num_comps++; comps_idx++; } @@ -187,13 +180,14 @@ static void darshan_get_lustre_layout_components(struct llapi_layout *lustre_lay ret = llapi_layout_comp_use(lustre_layout, LLAPI_LAYOUT_COMP_USE_NEXT); } else break; - } while(ret == 0 && rec_ref->record->num_comps < rec_ref->max_comps); + } while(ret == 0 && comps_idx < rec_ref->record->num_comps); - if (rec_ref->record->num_comps < rec_ref->max_comps) - memmove(comps + rec_ref->record->num_comps, osts, osts_idx * sizeof(*osts)); - - /* update record 
size to reflect final number of components/osts */ - rec_ref->record_size = LUSTRE_RECORD_SIZE(rec_ref->record->num_comps, osts_idx); + /* no more components to gather info on, set the rest as invalid */ + /* NOTE: we will attempt to truncate unused components at shutdown time */ + for (; comps_idx < rec_ref->record->num_comps; comps_idx++) + { + comps[comps_idx].counters[LUSTRE_COMP_STRIPE_SIZE] = -1; + } return; } @@ -203,7 +197,7 @@ void darshan_instrument_lustre_file(darshan_record_id rec_id, int fd) void *lustre_xattr_val; size_t lustre_xattr_size = XATTR_SIZE_MAX; struct llapi_layout *lustre_layout; - int num_comps, num_osts; + int num_comps, num_stripes; size_t rec_size; struct darshan_lustre_record *rec; struct lustre_record_ref *rec_ref; @@ -254,14 +248,14 @@ void darshan_instrument_lustre_file(darshan_record_id rec_id, int fd) { /* iterate file layout components to determine total record size */ - darshan_get_lustre_layout_size(lustre_layout, &num_comps, &num_osts); + darshan_get_lustre_layout_size(lustre_layout, &num_comps, &num_stripes); if(num_comps == 0) { llapi_layout_free(lustre_layout); LUSTRE_UNLOCK(); return; } - rec_size = LUSTRE_RECORD_SIZE(num_comps, num_osts); + rec_size = LUSTRE_RECORD_SIZE(num_comps, num_stripes); /* allocate and add a new record reference */ rec_ref = malloc(sizeof(*rec_ref)); @@ -304,8 +298,8 @@ void darshan_instrument_lustre_file(darshan_record_id rec_id, int fd) rec->base_rec.id = rec_id; rec->base_rec.rank = my_rank; rec_ref->record = rec; - rec_ref->max_comps = num_comps; - rec_ref->max_osts = num_osts; + rec_ref->record->num_comps = num_comps; + rec_ref->record->num_stripes = num_stripes; } /* fill in record buffer with component info and OST list */ @@ -413,18 +407,45 @@ static void lustre_serialize_records(void *rec_ref_p, void *user_ptr) struct lustre_record_ref *rec_ref = (struct lustre_record_ref *)rec_ref_p; struct lustre_buf_state *buf_state = (struct lustre_buf_state *)user_ptr; void *output_buf = 
buf_state->buf + buf_state->buf_size; + int i; + int num_stripes = 0; + size_t record_size; + struct darshan_lustre_component *comps = + (struct darshan_lustre_component *)&(rec_ref->record->comps); + OST_ID *osts = (OST_ID *)(comps + rec_ref->record->num_comps); /* skip shared records on non-zero ranks */ if (my_rank > 0 && rec_ref->record->base_rec.rank == -1) return; + /* update record size to reflect final number of components/stripes */ + for (i = 0; i < rec_ref->record->num_comps; i++) + { + /* inactive components have stripe size set to -1 when instrumenting */ + if (comps[i].counters[LUSTRE_COMP_STRIPE_SIZE] == -1) + { + /* truncate components and set final component and stripe count */ + rec_ref->record->num_comps = i; + /* move OST list up in record buffer to overwrite unused components */ + memmove(comps + i, osts, num_stripes * sizeof(*osts)); + break; + } + num_stripes += comps[i].counters[LUSTRE_COMP_STRIPE_COUNT]; + } + rec_ref->record->num_stripes = num_stripes; + + record_size = LUSTRE_RECORD_SIZE(rec_ref->record->num_comps, rec_ref->record->num_stripes); + /* determine whether this record needs to be shifted back in the final record buffer */ + /* NOTE: this happens when preceding records in the output buffer have been shifted + * down in size + */ if (rec_ref->record != output_buf) { - memmove(output_buf, rec_ref->record, rec_ref->record_size); + memmove(output_buf, rec_ref->record, record_size); rec_ref->record = output_buf; } - buf_state->buf_size += rec_ref->record_size; + buf_state->buf_size += record_size; } static void lustre_output( diff --git a/darshan-util/darshan-lustre-logutils.c b/darshan-util/darshan-lustre-logutils.c index 5a97b86ac..3076bb034 100644 --- a/darshan-util/darshan-lustre-logutils.c +++ b/darshan-util/darshan-lustre-logutils.c @@ -52,8 +52,8 @@ static int darshan_log_get_lustre_record(darshan_fd fd, void** lustre_buf_p) { struct darshan_lustre_record *rec = *((struct darshan_lustre_record **)lustre_buf_p); struct 
darshan_lustre_record tmp_rec; - int num_osts = 0; int fixed_size, comps_size, osts_size; + int new_comps_size, new_osts_size; int i, j; int ret; @@ -73,7 +73,7 @@ static int darshan_log_get_lustre_record(darshan_fd fd, void** lustre_buf_p) return darshan_log_get_lustre_record_v1(fd, lustre_buf_p); /* retrieve the fixed-size portion of the record */ - fixed_size = sizeof(struct darshan_base_record) + sizeof(int64_t); + fixed_size = sizeof(struct darshan_base_record) + (2*sizeof(int64_t)); ret = darshan_log_get_mod(fd, DARSHAN_LUSTRE_MOD, &tmp_rec, fixed_size); if(ret < 0) return(-1); @@ -86,12 +86,14 @@ static int darshan_log_get_lustre_record(darshan_fd fd, void** lustre_buf_p) DARSHAN_BSWAP64(&tmp_rec.base_rec.id); DARSHAN_BSWAP64(&tmp_rec.base_rec.rank); DARSHAN_BSWAP64(&tmp_rec.num_comps); + DARSHAN_BSWAP64(&tmp_rec.num_stripes); } comps_size = tmp_rec.num_comps * sizeof(*tmp_rec.comps); + osts_size = tmp_rec.num_stripes * sizeof(*tmp_rec.ost_ids); if(*lustre_buf_p == NULL) { - rec = malloc(sizeof(struct darshan_lustre_record) + comps_size); + rec = malloc(sizeof(struct darshan_lustre_record) + comps_size + osts_size); if(!rec) return(-1); } @@ -105,55 +107,61 @@ static int darshan_log_get_lustre_record(darshan_fd fd, void** lustre_buf_p) { rec->comps = (struct darshan_lustre_component *) ((void *)rec + sizeof(struct darshan_lustre_record)); + rec->ost_ids = (OST_ID *)((void *)rec->comps + comps_size); - /* now read all record components */ + /* now read all record components and OST IDs */ ret = darshan_log_get_mod( fd, DARSHAN_LUSTRE_MOD, (void*)(rec->comps), - comps_size + comps_size + osts_size ); - if(ret < comps_size) + if(ret < comps_size + osts_size) ret = -1; else { ret = 1; + /* swap bytes if necessary */ if (fd->swap_flag) + { for (i = 0; i < rec->num_comps; i++) for(j=0; j<LUSTRE_COMP_NUM_INDICES; j++) DARSHAN_BSWAP64(&rec->comps[i].counters[j]); - } - - for (i = 0; i < rec->num_comps; i++) - num_osts += rec->comps[i].counters[LUSTRE_COMP_STRIPE_COUNT]; - osts_size = num_osts * 
sizeof(*tmp_rec.ost_ids); - if(*lustre_buf_p == NULL) - { - rec = realloc(rec, sizeof(struct darshan_lustre_record) + comps_size + osts_size); - if(!rec) - return(-1); - } - rec->comps = (struct darshan_lustre_component *) - ((void *)rec + sizeof(struct darshan_lustre_record)); - rec->ost_ids = (OST_ID *)((void *)rec->comps + comps_size); - - /* now read the OST list */ - ret = darshan_log_get_mod( - fd, - DARSHAN_LUSTRE_MOD, - (void*)(rec->ost_ids), - osts_size - ); - if(ret < osts_size) - ret = -1; - else - { - ret = 1; - /* swap bytes if necessary */ - if (fd->swap_flag) - for (i = 0; i < num_osts; i++) + for (i = 0; i < rec->num_stripes; i++) DARSHAN_BSWAP64(&rec->ost_ids[i]); + } + + /* truncate any unused components/stripes leftover from runtime */ + rec->num_stripes = 0; + for (i = 0; i < rec->num_comps; i++) + { + /* NOTE: at runtime, unused stripe components are marked with size of -1 */ + if (rec->comps[i].counters[LUSTRE_COMP_STRIPE_SIZE] == -1) + { + rec->num_comps = i; + break; + } + rec->num_stripes += rec->comps[i].counters[LUSTRE_COMP_STRIPE_COUNT]; + } + new_comps_size = rec->num_comps * sizeof(*rec->comps); + new_osts_size = rec->num_stripes * sizeof(*rec->ost_ids); + if (new_comps_size != comps_size) + { + memmove(((void*)rec->comps + new_comps_size), + ((void*)rec->comps + comps_size), + new_osts_size); + } + if ((new_comps_size != comps_size) || (new_osts_size != osts_size)) + { + if(*lustre_buf_p == NULL) + { + /* record buffer size changes, so we should realloc to match it */ + rec = realloc(rec, sizeof(struct darshan_lustre_record) + new_comps_size + new_osts_size); + if(!rec) + return(-1); + } + } } } @@ -295,6 +303,9 @@ static void darshan_log_print_lustre_record(void *rec, char *file_name, DARSHAN_D_COUNTER_PRINT(darshan_module_names[DARSHAN_LUSTRE_MOD], lustre_rec->base_rec.rank, lustre_rec->base_rec.id, "LUSTRE_NUM_COMPONENTS", lustre_rec->num_comps, file_name, mnt_pt, fs_type); + 
DARSHAN_D_COUNTER_PRINT(darshan_module_names[DARSHAN_LUSTRE_MOD], + lustre_rec->base_rec.rank, lustre_rec->base_rec.id, "LUSTRE_NUM_STRIPES", + lustre_rec->num_stripes, file_name, mnt_pt, fs_type); for(i=0; i<lustre_rec->num_comps; i++) { for(j=0; j < LUSTRE_COMP_NUM_INDICES; j++) @@ -386,6 +397,7 @@ static void darshan_log_print_lustre_description(int ver) { printf("\n# description of LUSTRE counters:\n"); printf("# LUSTRE_NUM_COMPONENTS: number of instrumented components in the Lustre layout.\n"); + printf("# LUSTRE_NUM_STRIPES: number of active stripes in the Lustre layout components.\n"); printf("# LUSTRE_COMP*_STRIPE_SIZE: stripe size for this file layout component in bytes.\n"); printf("# LUSTRE_COMP*_STRIPE_COUNT: number of OSTs over which the file layout component is striped.\n"); printf("# LUSTRE_COMP*_STRIPE_PATTERN: pattern (e.g., raid0, mdt, overstriped) for this file layout component.\n"); diff --git a/darshan-util/doc/darshan-util.txt b/darshan-util/doc/darshan-util.txt index c233c68c2..9bfcc8af9 100644 --- a/darshan-util/doc/darshan-util.txt +++ b/darshan-util/doc/darshan-util.txt @@ -510,6 +510,7 @@ execution time. 
|==== | counter name | description | LUSTRE_NUM_COMPONENTS | number of instrumented components in the Lustre layout +| LUSTRE_NUM_STRIPES | number of active stripes in the Lustre layout components | LUSTRE_COMP*_STRIPE_SIZE | stripe size for this file layout component in bytes | LUSTRE_COMP*_STRIPE_COUNT | number of OSTs over which the file layout component is striped | LUSTRE_COMP*_STRIPE_PATTERN | pattern (e.g., raid0, mdt, overstriped) for this file layout component diff --git a/include/darshan-lustre-log-format.h b/include/darshan-lustre-log-format.h index dd5ad0a18..f45f5cede 100644 --- a/include/darshan-lustre-log-format.h +++ b/include/darshan-lustre-log-format.h @@ -53,6 +53,7 @@ struct darshan_lustre_component * - a corresponding record identifier (created by hashing the file path) * - the rank of the process which opened the file (-1 for shared files) * - total number of file layout components instrumented + * - total number of file stripes instrumented * - detailed counters describing each file layout component (e.g., stripe width, count, etc.) * - list of OST IDs corresponding to instrumented file layout components */ @@ -60,6 +61,7 @@ struct darshan_lustre_record { struct darshan_base_record base_rec; int64_t num_comps; + int64_t num_stripes; struct darshan_lustre_component *comps; OST_ID *ost_ids; }; @@ -68,9 +70,9 @@ struct darshan_lustre_record * helper macro to calculate the serialized size of a Lustre record * NOTE: this must be kept in sync with the definitions above */ -#define LUSTRE_RECORD_SIZE(comps, osts) \ - (sizeof(struct darshan_base_record) + sizeof(int64_t) + \ +#define LUSTRE_RECORD_SIZE(comps, stripes) \ + (sizeof(struct darshan_base_record) + (2*sizeof(int64_t)) + \ (sizeof(struct darshan_lustre_component) * (comps)) + \ - (sizeof(OST_ID) * (osts))) + (sizeof(OST_ID) * (stripes))) #endif /* __DARSHAN_LUSTRE_LOG_FORMAT_H */