From 84208ff01f8b4e866526836937bb662348ec9c8e Mon Sep 17 00:00:00 2001
From: Tatiana Korchuganova
Date: Mon, 25 Nov 2024 13:01:54 +0100
Subject: [PATCH] errorSummary | fix n bins for big time ranges, do wn summary if site specified

---
 core/libs/exlib.py                 | 77 +++++++++++++++++-------------
 core/libs/job.py                   |  4 ++
 core/pandajob/summary_error.py     | 29 +++++++----
 core/static/css/page-sections.css  |  7 +++
 core/static/js/draw-plots-chart.js | 11 +++--
 core/templates/errorSummary.html   | 14 +++---
 core/views.py                      | 16 +++++--
 7 files changed, 101 insertions(+), 57 deletions(-)

diff --git a/core/libs/exlib.py b/core/libs/exlib.py
index 62274ad1..6103917b 100644
--- a/core/libs/exlib.py
+++ b/core/libs/exlib.py
@@ -372,6 +372,48 @@ def calc_nbins(length, n_bins_max=50):
     return n_bins
 
 
+def calc_freq_time_series(timestamp_list, n_bins_max=60):
+    """
+    Calculate N bins for time series data
+    :param timestamp_list:
+    :param n_bins_max:
+    :return: freq: str - for data frame grouping
+    """
+    full_timerange_seconds = (max(timestamp_list) - min(timestamp_list)).total_seconds()
+
+    step = 30
+    label = 'S'
+    while full_timerange_seconds/step > n_bins_max:
+        if step <= 600:
+            step += 30
+        elif step <= 3600:
+            step += 600
+            label = 'T'
+        elif step <= 3600 * 24:
+            step += 3600
+            label = 'H'
+        elif step <= 3600 * 24 * 7:
+            step += 3600 * 24
+            label = 'D'
+        elif step <= 3600 * 24 * 30:
+            step += 3600 * 24 * 7
+            label = 'W'
+        else:
+            step += 3600 * 24 * 30
+            label = 'M'
+
+    labels = {
+        'S': 1,
+        'T': 60,
+        'H': 3600,
+        'D': 3600*24,
+        'W': 3600 * 24 * 7,
+        'M': 3600 * 24 * 30,
+    }
+    freq = '{}{}'.format(math.floor(step/labels[label]), label)
+    return freq
+
+
 def build_stack_histogram(data_raw, **kwargs):
     """
     Prepare stack histogram data and calculate mean and std metrics
@@ -430,7 +472,6 @@ def build_time_histogram(data):
     :param data: list. if 1xN - counting occurances, if 2xN - sum for each occurance
     :return:
     """
-    N_BINS_MAX = 60
     agg = 'count'
     if len(data) > 0 and isinstance(data[0], list) and len(data[0]) == 2:
         agg = 'sum'
@@ -440,39 +481,7 @@ def build_time_histogram(data):
         timestamp_list = data
     else:
         timestamp_list = [item[0] for item in data]
-
-    full_timerange_seconds = (max(timestamp_list) - min(timestamp_list)).total_seconds()
-
-    step = 30
-    label = 'S'
-    while full_timerange_seconds/step > N_BINS_MAX:
-        if step <= 600:
-            step += 30
-        elif step <= 3600:
-            step += 600
-            label = 'T'
-        elif step <= 3600 * 24:
-            step += 3600
-            label = 'H'
-        elif step <= 3600 * 24 * 7:
-            step += 3600 * 24
-            label = 'D'
-        elif step <= 3600 * 24 * 30:
-            step += 3600 * 24 * 7
-            label = 'W'
-        else:
-            step += 3600 * 24 * 30
-            label = 'M'
-
-    labels = {
-        'S': 1,
-        'T': 60,
-        'H': 3600,
-        'D': 3600*24,
-        'W': 3600 * 24 * 7,
-        'M': 3600 * 24 * 30,
-    }
-    freq = '{}{}'.format(math.floor(step/labels[label]), label)
+    freq = calc_freq_time_series(timestamp_list, n_bins_max=60)
 
     # prepare binned data
     if agg == 'count':
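For reviewers, a quick usage sketch of the new helper (not part of the patch). It assumes the BigPanDA `core` package and its Django settings are importable; `calc_freq_time_series` only needs a list of datetime objects and returns a pandas-style offset alias such as '30S', '10T' or '2H'.

    from datetime import datetime, timedelta

    from core.libs.exlib import calc_freq_time_series  # helper added by this patch

    # roughly 3 days of per-minute timestamps (synthetic data)
    start = datetime(2024, 11, 22, 12, 0, 0)
    timestamps = [start + timedelta(minutes=i) for i in range(3 * 24 * 60)]

    freq = calc_freq_time_series(timestamps, n_bins_max=60)
    print(freq)  # with the stepping above, a ~3 day span should land around '2H'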
diff --git a/core/libs/job.py b/core/libs/job.py
index 7e74a8e5..2309ef7c 100644
--- a/core/libs/job.py
+++ b/core/libs/job.py
@@ -511,6 +511,10 @@ def clean_job_list(request, jobl, do_add_metadata=False, do_add_errorinfo=False)
             job['processor_type'] = 'GPU'
         else:
             job['processor_type'] = 'CPU'
+        if 'modificationhost' in job and job['modificationhost']:
+            job['wn'] = job['modificationhost'].split('@')[1] if '@' in job['modificationhost'] else job['modificationhost']
+        else:
+            job['wn'] = 'Unknown'
 
         job['durationsec'] = get_job_walltime(job)
         job['durationsec'] = job['durationsec'] if job['durationsec'] is not None else 0
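The new `job['wn']` field keeps only the worker-node host name taken from `modificationhost`, whose values typically look like `<batch slot>@<host>`. A self-contained illustration of the same split, with hypothetical host names and a hypothetical helper name:

    # mirrors the job['wn'] assignment added above (helper name is illustrative only)
    def worker_node(modificationhost):
        # 'slot1@wn123.example.org' -> 'wn123.example.org'; plain host names pass through
        if modificationhost:
            return modificationhost.split('@')[1] if '@' in modificationhost else modificationhost
        return 'Unknown'

    assert worker_node('slot1@wn123.example.org') == 'wn123.example.org'
    assert worker_node('wn123.example.org') == 'wn123.example.org'
    assert worker_node('') == 'Unknown'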
diff --git a/core/pandajob/summary_error.py b/core/pandajob/summary_error.py
index 8fd96ebd..186ff3fb 100644
--- a/core/pandajob/summary_error.py
+++ b/core/pandajob/summary_error.py
@@ -7,6 +7,7 @@
 from core.libs.error import get_job_error_desc
 from core.libs.task import taskNameDict
 from core.libs.job import get_job_walltime
+from core.libs.exlib import calc_freq_time_series
 from django.conf import settings
 
 import core.constants as const
@@ -97,15 +98,16 @@ def get_job_error_categories(job):
     return error_category_list
 
 
-def prepare_binned_and_total_data(df, column):
+def prepare_binned_and_total_data(df, column, freq='10T'):
     """
     Prepare binned and total time-series data for plots
     :param df: data frame
     :param column: column in data frame which use to split values for stacking
+    :param freq: frequency for resampling
     :return:
     """
     # resample in 10-minute bins and count occurrences for each unique value in the specified column
-    resampled = df.groupby([pd.Grouper(freq='10T'), column]).size().unstack(fill_value=0)
+    resampled = df.groupby([pd.Grouper(freq=freq), column]).size().unstack(fill_value=0)
 
     # calculate total counts across all bins for pie chart
     total_counts = resampled.sum().to_dict()
@@ -145,7 +147,7 @@ def categorize_low_impact_by_percentage(df, column, threshold_percent):
     return df
 
 
-def build_error_histograms(jobs):
+def build_error_histograms(jobs, is_wn_instead_of_site=False):
     """
     Prepare histograms data by different categories
     :param jobs:
     :return:
     """
     threshold_percent = 2  # % threshold for low-impact values
+    timestamp_list = []
     data = []
     for job in jobs:
         data.append({
             'modificationtime': job['modificationtime'],
-            'site': job['computingsite'],
+            'site': job['computingsite'] if not is_wn_instead_of_site else job['wn'],
             'code': ','.join(sorted(get_job_error_categories(job))),
             'task': job['jeditaskid'],
             'user': job['produsername'],
         })
+        timestamp_list.append(job['modificationtime'])
+
+    freq = calc_freq_time_series(timestamp_list, n_bins_max=60)
 
     if len(data) > 0:
         df = pd.DataFrame(data)
@@ -175,9 +181,9 @@ def build_error_histograms(jobs):
         # Generate JSON-ready data for each column
         output_data = {}
         for column in ['site', 'code', 'task', 'user']:
-            output_data[column] = prepare_binned_and_total_data(df, column)
+            output_data[column] = prepare_binned_and_total_data(df, column, freq=freq)
 
-        total_jobs_per_bin = df.resample('10T').size().reset_index(name='total')
+        total_jobs_per_bin = df.resample(freq).size().reset_index(name='total')
         total_jobs_per_bin['modificationtime'] = total_jobs_per_bin['modificationtime'].dt.strftime(
             settings.DATETIME_FORMAT)
 
@@ -191,13 +197,14 @@ def build_error_histograms(jobs):
     return output_data
 
 
-def errorSummaryDict(jobs, is_test_jobs=False, sortby='count', is_user_req=False, **kwargs):
+def errorSummaryDict(jobs, is_test_jobs=False, sortby='count', is_user_req=False, is_site_req=False, **kwargs):
     """
     Takes a job list and produce error summaries from it
     :param jobs: list of dicts
     :param is_test_jobs: bool: for test jobs we do not limit to "failed" jobs only
     :param sortby: str: count or alpha
     :param is_user_req: bool: we do jeditaskid in attribute summary only if a user is specified
+    :param is_site_req: bool: we do summary per worker node if True
     :param kwargs: flist and outputs
     :return: errsByCountL, errsBySiteL, errsByUserL, errsByTaskL, suml, error_histograms
     """
@@ -236,7 +243,11 @@ def errorSummaryDict(jobs, is_test_jobs=False, sortby='count', is_user_req=False
     for job in jobs:
         if not is_test_jobs and job['jobstatus'] not in ['failed', 'holding']:
             continue
-        site = job['computingsite']
+        # if specific site, we do summary per worker node
+        if is_site_req:
+            site = job['wn']
+        else:
+            site = job['computingsite']
         user = job['produsername']
         taskname = ''
         if job['jeditaskid'] is not None and job['jeditaskid'] > 0:
@@ -429,7 +440,7 @@ def errorSummaryDict(jobs, is_test_jobs=False, sortby='count', is_user_req=False
     _logger.debug('Dict -> list & sorting are done: {}'.format(time.time() - start_time))
 
     if 'errsHist' in outputs:
-        error_histograms = build_error_histograms(jobs)
+        error_histograms = build_error_histograms(jobs, is_wn_instead_of_site=is_site_req)
         _logger.debug('Built errHist: {}'.format(time.time() - start_time))
 
     return errsByCountL, errsBySiteL, errsByUserL, errsByTaskL, suml, error_histograms
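The frequency string computed from the jobs' modification times goes straight into pandas. Below is a standalone sketch of the grouping that `prepare_binned_and_total_data` performs, on synthetic data with a hard-coded `freq`; note that recent pandas releases prefer lowercase offset aliases ('min', 'h'), so the uppercase codes used here may raise deprecation warnings on newer versions.

    import pandas as pd

    df = pd.DataFrame({
        'modificationtime': pd.to_datetime([
            '2024-11-25 10:01', '2024-11-25 10:04', '2024-11-25 10:17', '2024-11-25 10:31',
        ]),
        'site': ['SITE_A', 'SITE_B', 'SITE_A', 'SITE_A'],
    })
    df.set_index('modificationtime', inplace=True)

    freq = '10T'  # in the patched code this comes from calc_freq_time_series
    binned = df.groupby([pd.Grouper(freq=freq), 'site']).size().unstack(fill_value=0)  # stacked histogram bins
    totals = binned.sum().to_dict()  # feeds the pie chart, e.g. {'SITE_A': 3, 'SITE_B': 1}
    print(binned)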
diff --git a/core/static/css/page-sections.css b/core/static/css/page-sections.css
index 474c3e02..5a6b88c5 100644
--- a/core/static/css/page-sections.css
+++ b/core/static/css/page-sections.css
@@ -1295,6 +1295,13 @@ canvas.chartjs_block {
   margin: 0;
 }
 
+.c3-plots-row.fixed-height canvas.chartjs_block {
+  height: 500px;
+  max-height: 500px;
+  min-height: 500px;
+}
+
+
 .bp-container-wrapper { margin: 0 0 0.7rem 0;}
 
 .row.bp-container-wrapper .columns, .row.bp-container-wrapper .column {
diff --git a/core/static/js/draw-plots-chart.js b/core/static/js/draw-plots-chart.js
index ecdb8b46..17857d49 100644
--- a/core/static/js/draw-plots-chart.js
+++ b/core/static/js/draw-plots-chart.js
@@ -192,7 +192,8 @@ function prepare_stacked_timeseries_chart(rawdata, options) {
           parser: "YYYY-MM-DD HH:mm:ss",
           unit: 'hour',
           displayFormats: {
-            hour: 'HH:mm', // 24-hour format for hours and minutes
+            minute: 'YYYY-MM-DD HH:mm',
+            hour: 'YYYY-MM-DD HH:mm'
           },
         },
         stacked: true,
@@ -223,9 +224,9 @@ function prepare_stacked_timeseries_chart(rawdata, options) {
       layout: {
         padding: {
           left: 0,
-          right: 20,
+          right: 0,
           top: 0,
-          bottom: 20
+          bottom: 0
         }
       },
       events: ['click', 'mousemove'],
@@ -238,6 +239,7 @@ function prepare_stacked_timeseries_chart(rawdata, options) {
       },
       responsiveAnimationDuration: 0, // animation duration after a resize
       responsive: false,
+      maintainAspectRatio: false,
     }
   };
 
@@ -286,12 +288,13 @@ function prepare_pie_chart(raw_data, options) {
       },
       responsiveAnimationDuration: 0, // animation duration after a resize
       responsive: false,
+      maintainAspectRatio: false,
       layout: {
         padding: {
           left: 0,
           right: 0,
           top: 0,
-          bottom: 20
+          bottom: 0
         }
       },
     },
diff --git a/core/templates/errorSummary.html b/core/templates/errorSummary.html
index 00bf1f78..f6624b6a 100644
--- a/core/templates/errorSummary.html
+++ b/core/templates/errorSummary.html
@@ -60,36 +60,36 @@
 [chart-container markup is reworked in this hunk (36 lines on each side); the HTML tags were stripped from this copy of the patch and only the -/+ markers survived]
@@ -186,7 +186,7 @@
Overall error summary
-    Site error summary{% if requestParams.display_limit %}, limited to top-{{ requestParams.display_limit }}. To remove this limit, delete display_limit param from URL.{% endif %}
+    {% if 'computingsite' in requestParams %}WN{% else %}Site{% endif %} error summary{% if requestParams.display_limit %}, limited to top-{{ requestParams.display_limit }}.{% endif %}
     {% for site in errsBySite %}
       {{ site.name }}
       {{ site.toterrors }}
diff --git a/core/views.py b/core/views.py
index eba0b771..d39fd4b2 100644
--- a/core/views.py
+++ b/core/views.py
@@ -5651,7 +5651,7 @@ def errorSummary(request):
     jobs = []
     values = (
         'eventservice', 'produsername', 'produserid', 'pandaid', 'cloud', 'computingsite', 'cpuconsumptiontime',
-        'jobstatus', 'transformation', 'prodsourcelabel', 'specialhandling', 'vo', 'modificationtime',
+        'jobstatus', 'transformation', 'prodsourcelabel', 'specialhandling', 'vo', 'modificationtime', 'modificationhost',
         'atlasrelease', 'jobsetid', 'processingtype', 'workinggroup', 'jeditaskid', 'taskid', 'starttime', 'endtime',
         'brokerageerrorcode', 'brokerageerrordiag', 'ddmerrorcode', 'ddmerrordiag', 'exeerrorcode', 'exeerrordiag',
         'jobdispatchererrorcode', 'jobdispatchererrordiag', 'piloterrorcode', 'piloterrordiag',
@@ -5665,14 +5665,23 @@ def errorSummary(request):
         panda_job_models.extend([Jobsactive4, Jobsdefined4, Jobswaiting4])
 
     # add big archived table if timewindow is more than 2 days
+    is_archived = False
     if is_archived_jobs(query['modificationtime__castdate__range']):
-        panda_job_models.append(Jobsarchived)
+        is_archived = True
 
     for model in panda_job_models:
         jobs.extend(model.objects.filter(**query).extra(where=[wildCardExtension])[:limit].values(*values))
 
+    # use indexed statechangetime field instead of default modificationtime
+    query_arch = copy.deepcopy(query)
+    if is_archived:
+        if 'modificationtime__castdate__range' in query_arch:
+            query_arch['statechangetime__castdate__range'] = query['modificationtime__castdate__range']
+            del query_arch['modificationtime__castdate__range']
+        jobs.extend(Jobsarchived.objects.filter(**query_arch).extra(where=[wildCardExtension])[:limit].values(*values))
+
     if not is_json_request(request):
-        thread = Thread(target=totalCount, args=(panda_job_models, query, wildCardExtension, dkey))
+        thread = Thread(target=totalCount, args=(panda_job_models, query_arch, wildCardExtension, dkey))
         thread.start()
     else:
         thread = None
@@ -5691,6 +5700,7 @@ def errorSummary(request):
         is_test_jobs=testjobs,
         sortby=sortby,
         is_user_req=True if 'produsername' in request.session['requestParams'] else False,
+        is_site_req=True if 'computingsite' in request.session['requestParams'] else False,
         errHist=True,
     )
     _logger.info('Error summary built: {}'.format(time.time() - request.session['req_init_time']))
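Finally, a minimal sketch of the archived-table query rewrite in views.py, using plain dicts instead of the Django ORM; the filter keys and dates below are illustrative only:

    import copy

    query = {
        'modificationtime__castdate__range': ['2024-11-23T00:00:00', '2024-11-25T00:00:00'],
        'jobstatus__in': ['failed', 'holding'],
    }

    # same idea as query_arch in the patch: archived jobs are filtered on the
    # indexed statechangetime column instead of modificationtime
    query_arch = copy.deepcopy(query)
    if 'modificationtime__castdate__range' in query_arch:
        query_arch['statechangetime__castdate__range'] = query_arch.pop('modificationtime__castdate__range')

    print(query_arch)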