Skip to content

Commit

Permalink
proper handling of Lustre data with mmap logging
Browse files Browse the repository at this point in the history
* requires storing the total allocated _components_ and _stripes_
  with the Lustre record so that this data can be read back in
  its entirety, even if the app terminates abruptly
* at shutdown time, the Lustre module can truncate the component
  and OST list sizes to something smaller than originally
  requested, if some memory ends up not being used
* darshan-util also has code to detect unused portions of Lustre
  records that can be truncated (which should only occur for apps
  that terminate abruptly and use Darshan mmap logging)
  • Loading branch information
shanedsnyder committed Sep 17, 2024
1 parent d4f0c9e commit 267ec27
Show file tree
Hide file tree
Showing 4 changed files with 102 additions and 66 deletions.
75 changes: 48 additions & 27 deletions darshan-runtime/lib/darshan-lustre.c
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,6 @@ static void lustre_cleanup(
struct lustre_record_ref
{
struct darshan_lustre_record *record;
int max_comps;
int max_osts;
size_t record_size;
};

struct lustre_runtime
Expand All @@ -62,16 +59,16 @@ static int my_rank = -1;
#define LUSTRE_UNLOCK() pthread_mutex_unlock(&lustre_runtime_mutex)

static void darshan_get_lustre_layout_size(struct llapi_layout *lustre_layout,
int *num_comps, int *num_osts)
int *num_comps, int *num_stripes)
{
bool is_composite;
int ret;
uint64_t stripe_pattern, stripe_count;
int tmp_comps = 0;
int tmp_osts = 0;
int tmp_stripes = 0;

*num_comps = 0;
*num_osts = 0;
*num_stripes = 0;

is_composite = llapi_layout_is_composite(lustre_layout);
if (is_composite)
Expand All @@ -92,7 +89,7 @@ static void darshan_get_lustre_layout_size(struct llapi_layout *lustre_layout,
ret = llapi_layout_stripe_count_get(lustre_layout, &stripe_count);
if (ret != 0)
return;
tmp_osts += stripe_count;
tmp_stripes += stripe_count;
}
tmp_comps++;

Expand All @@ -105,7 +102,7 @@ static void darshan_get_lustre_layout_size(struct llapi_layout *lustre_layout,
} while(ret == 0);

*num_comps = tmp_comps;
*num_osts = tmp_osts;
*num_stripes = tmp_stripes;
return;
}

Expand All @@ -124,10 +121,7 @@ static void darshan_get_lustre_layout_components(struct llapi_layout *lustre_lay
int comps_idx = 0, osts_idx = 0;
struct darshan_lustre_component *comps =
(struct darshan_lustre_component *)&(rec_ref->record->comps);
OST_ID *osts = (OST_ID *)(comps + rec_ref->max_comps);

rec_ref->record_size = 0;
rec_ref->record->num_comps = 0;
OST_ID *osts = (OST_ID *)(comps + rec_ref->record->num_comps);

is_composite = llapi_layout_is_composite(lustre_layout);
if (is_composite)
Expand Down Expand Up @@ -156,7 +150,7 @@ static void darshan_get_lustre_layout_components(struct llapi_layout *lustre_lay
*/
if ((!is_composite || (flags & LCME_FL_INIT)) &&
(ret == 0) &&
(osts_idx + stripe_count <= rec_ref->max_osts))
(osts_idx + stripe_count <= rec_ref->record->num_stripes))
{
comps[comps_idx].counters[LUSTRE_COMP_STRIPE_SIZE] = (int64_t)stripe_size;
comps[comps_idx].counters[LUSTRE_COMP_STRIPE_COUNT] = (int64_t)stripe_count;
Expand All @@ -177,7 +171,6 @@ static void darshan_get_lustre_layout_components(struct llapi_layout *lustre_lay
else
osts[osts_idx] = (OST_ID)tmp_ost;
}
rec_ref->record->num_comps++;
comps_idx++;
}

Expand All @@ -187,13 +180,14 @@ static void darshan_get_lustre_layout_components(struct llapi_layout *lustre_lay
ret = llapi_layout_comp_use(lustre_layout, LLAPI_LAYOUT_COMP_USE_NEXT);
}
else break;
} while(ret == 0 && rec_ref->record->num_comps < rec_ref->max_comps);
} while(ret == 0 && comps_idx < rec_ref->record->num_comps);

if (rec_ref->record->num_comps < rec_ref->max_comps)
memmove(comps + rec_ref->record->num_comps, osts, osts_idx * sizeof(*osts));

/* update record size to reflect final number of components/osts */
rec_ref->record_size = LUSTRE_RECORD_SIZE(rec_ref->record->num_comps, osts_idx);
/* no more components to gather info on, set the rest as invalid */
/* NOTE: we will attempt to truncate unused components at shutdown time */
for (; comps_idx < rec_ref->record->num_comps; comps_idx++)
{
comps[comps_idx].counters[LUSTRE_COMP_STRIPE_SIZE] = -1;
}

return;
}
Expand All @@ -203,7 +197,7 @@ void darshan_instrument_lustre_file(darshan_record_id rec_id, int fd)
void *lustre_xattr_val;
size_t lustre_xattr_size = XATTR_SIZE_MAX;
struct llapi_layout *lustre_layout;
int num_comps, num_osts;
int num_comps, num_stripes;
size_t rec_size;
struct darshan_lustre_record *rec;
struct lustre_record_ref *rec_ref;
Expand Down Expand Up @@ -254,14 +248,14 @@ void darshan_instrument_lustre_file(darshan_record_id rec_id, int fd)
{

/* iterate file layout components to determine total record size */
darshan_get_lustre_layout_size(lustre_layout, &num_comps, &num_osts);
darshan_get_lustre_layout_size(lustre_layout, &num_comps, &num_stripes);
if(num_comps == 0)
{
llapi_layout_free(lustre_layout);
LUSTRE_UNLOCK();
return;
}
rec_size = LUSTRE_RECORD_SIZE(num_comps, num_osts);
rec_size = LUSTRE_RECORD_SIZE(num_comps, num_stripes);

/* allocate and add a new record reference */
rec_ref = malloc(sizeof(*rec_ref));
Expand Down Expand Up @@ -304,8 +298,8 @@ void darshan_instrument_lustre_file(darshan_record_id rec_id, int fd)
rec->base_rec.id = rec_id;
rec->base_rec.rank = my_rank;
rec_ref->record = rec;
rec_ref->max_comps = num_comps;
rec_ref->max_osts = num_osts;
rec_ref->record->num_comps = num_comps;
rec_ref->record->num_stripes = num_stripes;
}

/* fill in record buffer with component info and OST list */
Expand Down Expand Up @@ -413,18 +407,45 @@ static void lustre_serialize_records(void *rec_ref_p, void *user_ptr)
struct lustre_record_ref *rec_ref = (struct lustre_record_ref *)rec_ref_p;
struct lustre_buf_state *buf_state = (struct lustre_buf_state *)user_ptr;
void *output_buf = buf_state->buf + buf_state->buf_size;
int i;
int num_stripes = 0;
size_t record_size;
struct darshan_lustre_component *comps =
(struct darshan_lustre_component *)&(rec_ref->record->comps);
OST_ID *osts = (OST_ID *)(comps + rec_ref->record->num_comps);

/* skip shared records on non-zero ranks */
if (my_rank > 0 && rec_ref->record->base_rec.rank == -1)
return;

/* update record size to reflect final number of components/stripes */
for (i = 0; i < rec_ref->record->num_comps; i++)
{
/* inactive components have stripe size set to -1 when instrumenting */
if (comps[i].counters[LUSTRE_COMP_STRIPE_SIZE] == -1)
{
/* truncate components and set final component and stripe count */
rec_ref->record->num_comps = i;
/* move OST list up in record buffer to overwrite unused components */
memmove(comps + i, osts, num_stripes * sizeof(*osts));
break;
}
num_stripes += comps[i].counters[LUSTRE_COMP_STRIPE_COUNT];
}
rec_ref->record->num_stripes = num_stripes;

record_size = LUSTRE_RECORD_SIZE(rec_ref->record->num_comps, rec_ref->record->num_stripes);

/* determine whether this record needs to be shifted back in the final record buffer */
/* NOTE: this happens when preceding records in the output buffer have been
* truncated to a smaller size
*/
if (rec_ref->record != output_buf)
{
memmove(output_buf, rec_ref->record, rec_ref->record_size);
memmove(output_buf, rec_ref->record, record_size);
rec_ref->record = output_buf;
}
buf_state->buf_size += rec_ref->record_size;
buf_state->buf_size += record_size;
}

static void lustre_output(
Expand Down
84 changes: 48 additions & 36 deletions darshan-util/darshan-lustre-logutils.c
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,8 @@ static int darshan_log_get_lustre_record(darshan_fd fd, void** lustre_buf_p)
{
struct darshan_lustre_record *rec = *((struct darshan_lustre_record **)lustre_buf_p);
struct darshan_lustre_record tmp_rec;
int num_osts = 0;
int fixed_size, comps_size, osts_size;
int new_comps_size, new_osts_size;
int i, j;
int ret;

Expand All @@ -73,7 +73,7 @@ static int darshan_log_get_lustre_record(darshan_fd fd, void** lustre_buf_p)
return darshan_log_get_lustre_record_v1(fd, lustre_buf_p);

/* retrieve the fixed-size portion of the record */
fixed_size = sizeof(struct darshan_base_record) + sizeof(int64_t);
fixed_size = sizeof(struct darshan_base_record) + (2*sizeof(int64_t));
ret = darshan_log_get_mod(fd, DARSHAN_LUSTRE_MOD, &tmp_rec, fixed_size);
if(ret < 0)
return(-1);
Expand All @@ -86,12 +86,14 @@ static int darshan_log_get_lustre_record(darshan_fd fd, void** lustre_buf_p)
DARSHAN_BSWAP64(&tmp_rec.base_rec.id);
DARSHAN_BSWAP64(&tmp_rec.base_rec.rank);
DARSHAN_BSWAP64(&tmp_rec.num_comps);
DARSHAN_BSWAP64(&tmp_rec.num_stripes);
}

comps_size = tmp_rec.num_comps * sizeof(*tmp_rec.comps);
osts_size = tmp_rec.num_stripes * sizeof(*tmp_rec.ost_ids);
if(*lustre_buf_p == NULL)
{
rec = malloc(sizeof(struct darshan_lustre_record) + comps_size);
rec = malloc(sizeof(struct darshan_lustre_record) + comps_size + osts_size);
if(!rec)
return(-1);
}
Expand All @@ -105,55 +107,61 @@ static int darshan_log_get_lustre_record(darshan_fd fd, void** lustre_buf_p)
{
rec->comps = (struct darshan_lustre_component *)
((void *)rec + sizeof(struct darshan_lustre_record));
rec->ost_ids = (OST_ID *)((void *)rec->comps + comps_size);

/* now read all record components */
/* now read all record components and OST IDs */
ret = darshan_log_get_mod(
fd,
DARSHAN_LUSTRE_MOD,
(void*)(rec->comps),
comps_size
comps_size + osts_size
);
if(ret < comps_size)
if(ret < comps_size + osts_size)
ret = -1;
else
{
ret = 1;

/* swap bytes if necessary */
if (fd->swap_flag)
{
for (i = 0; i < rec->num_comps; i++)
for(j=0; j<LUSTRE_COMP_NUM_INDICES; j++)
DARSHAN_BSWAP64(&rec->comps[i].counters[j]);
}

for (i = 0; i < rec->num_comps; i++)
num_osts += rec->comps[i].counters[LUSTRE_COMP_STRIPE_COUNT];
osts_size = num_osts * sizeof(*tmp_rec.ost_ids);
if(*lustre_buf_p == NULL)
{
rec = realloc(rec, sizeof(struct darshan_lustre_record) + comps_size + osts_size);
if(!rec)
return(-1);
}
rec->comps = (struct darshan_lustre_component *)
((void *)rec + sizeof(struct darshan_lustre_record));
rec->ost_ids = (OST_ID *)((void *)rec->comps + comps_size);

/* now read the OST list */
ret = darshan_log_get_mod(
fd,
DARSHAN_LUSTRE_MOD,
(void*)(rec->ost_ids),
osts_size
);
if(ret < osts_size)
ret = -1;
else
{
ret = 1;
/* swap bytes if necessary */
if (fd->swap_flag)
for (i = 0; i < num_osts; i++)
for (i = 0; i < rec->num_stripes; i++)
DARSHAN_BSWAP64(&rec->ost_ids[i]);
}

/* truncate any unused components/stripes leftover from runtime */
rec->num_stripes = 0;
for (i = 0; i < rec->num_comps; i++)
{
/* NOTE: at runtime, unused components are marked with a stripe size of -1 */
if (rec->comps[i].counters[LUSTRE_COMP_STRIPE_SIZE] == -1)
{
rec->num_comps = i;
break;
}
rec->num_stripes += rec->comps[i].counters[LUSTRE_COMP_STRIPE_COUNT];
}
new_comps_size = rec->num_comps * sizeof(*rec->comps);
new_osts_size = rec->num_stripes * sizeof(*rec->ost_ids);
if (new_comps_size != comps_size)
{
memmove(((void*)rec->comps + new_comps_size),
((void*)rec->comps + comps_size),
new_osts_size);
}
if ((new_comps_size != comps_size) || (new_osts_size != osts_size))
{
if(*lustre_buf_p == NULL)
{
/* record buffer size changes, so we should realloc to match it */
rec = realloc(rec, sizeof(struct darshan_lustre_record) + new_comps_size + new_osts_size);
if(!rec)
return(-1);
}
}
}
}

Expand Down Expand Up @@ -295,6 +303,9 @@ static void darshan_log_print_lustre_record(void *rec, char *file_name,
DARSHAN_D_COUNTER_PRINT(darshan_module_names[DARSHAN_LUSTRE_MOD],
lustre_rec->base_rec.rank, lustre_rec->base_rec.id, "LUSTRE_NUM_COMPONENTS",
lustre_rec->num_comps, file_name, mnt_pt, fs_type);
DARSHAN_D_COUNTER_PRINT(darshan_module_names[DARSHAN_LUSTRE_MOD],
lustre_rec->base_rec.rank, lustre_rec->base_rec.id, "LUSTRE_NUM_STRIPES",
lustre_rec->num_stripes, file_name, mnt_pt, fs_type);
for(i=0; i<lustre_rec->num_comps; i++)
{
for(j=0; j < LUSTRE_COMP_NUM_INDICES; j++)
Expand Down Expand Up @@ -386,6 +397,7 @@ static void darshan_log_print_lustre_description(int ver)
{
printf("\n# description of LUSTRE counters:\n");
printf("# LUSTRE_NUM_COMPONENTS: number of instrumented components in the Lustre layout.\n");
printf("# LUSTRE_NUM_STRIPES: number of active stripes in the Lustre layout components.\n");
printf("# LUSTRE_COMP*_STRIPE_SIZE: stripe size for this file layout component in bytes.\n");
printf("# LUSTRE_COMP*_STRIPE_COUNT: number of OSTs over which the file layout component is striped.\n");
printf("# LUSTRE_COMP*_STRIPE_PATTERN: pattern (e.g., raid0, mdt, overstriped) for this file layout component.\n");
Expand Down
1 change: 1 addition & 0 deletions darshan-util/doc/darshan-util.txt
Original file line number Diff line number Diff line change
Expand Up @@ -510,6 +510,7 @@ execution time.
|====
| counter name | description
| LUSTRE_NUM_COMPONENTS | number of instrumented components in the Lustre layout
| LUSTRE_NUM_STRIPES | number of active stripes in the Lustre layout components
| LUSTRE_COMP*_STRIPE_SIZE | stripe size for this file layout component in bytes
| LUSTRE_COMP*_STRIPE_COUNT | number of OSTs over which the file layout component is striped
| LUSTRE_COMP*_STRIPE_PATTERN | pattern (e.g., raid0, mdt, overstriped) for this file layout component
Expand Down
8 changes: 5 additions & 3 deletions include/darshan-lustre-log-format.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,13 +53,15 @@ struct darshan_lustre_component
* - a corresponding record identifier (created by hashing the file path)
* - the rank of the process which opened the file (-1 for shared files)
* - total number of file layout components instrumented
* - total number of file stripes instrumented
* - detailed counters describing each file layout component (e.g., stripe width, count, etc.)
* - list of OST IDs corresponding to instrumented file layout components
*/
/* Lustre module record: a fixed-size header (base record plus two 64-bit
 * counts) followed by a variable number of layout components and OST IDs.
 * LUSTRE_RECORD_SIZE() computes the serialized size from these two counts and
 * must be kept in sync with this definition.
 */
struct darshan_lustre_record
{
/* record identifier and rank of the opening process (-1 for shared files) */
struct darshan_base_record base_rec;
/* total number of file layout components instrumented */
int64_t num_comps;
/* total number of file stripes instrumented; sizes the OST ID list */
int64_t num_stripes;
/* per-component counters (e.g., stripe size, count, pattern);
 * NOTE(review): the runtime treats the address of this field as the start
 * of the serialized component array, while darshan-util sets it to point
 * just past the struct -- confirm before changing the layout
 */
struct darshan_lustre_component *comps;
/* OST IDs for the instrumented components; in darshan-util this points
 * directly after the component array
 */
OST_ID *ost_ids;
};
Expand All @@ -68,9 +70,9 @@ struct darshan_lustre_record
* helper macro to calculate the serialized size of a Lustre record
* NOTE: this must be kept in sync with the definitions above
*/
#define LUSTRE_RECORD_SIZE(comps, osts) \
(sizeof(struct darshan_base_record) + sizeof(int64_t) + \
#define LUSTRE_RECORD_SIZE(comps, stripes) \
(sizeof(struct darshan_base_record) + (2*sizeof(int64_t)) + \
(sizeof(struct darshan_lustre_component) * (comps)) + \
(sizeof(OST_ID) * (osts)))
(sizeof(OST_ID) * (stripes)))

#endif /* __DARSHAN_LUSTRE_LOG_FORMAT_H */

0 comments on commit 267ec27

Please sign in to comment.