Skip to content

Commit

Permalink
Merge pull request #407 from PanDAWMS/tania_dev
Browse files Browse the repository at this point in the history
errorSummary | add a check that failed jobs exist before putting them into the data frame
  • Loading branch information
tkorchug authored Nov 18, 2024
2 parents e1fa677 + c914c1c commit ab220bd
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 31 deletions.
63 changes: 40 additions & 23 deletions core/pandajob/summary_error.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,6 @@ def get_error_message_summary(jobs):
return error_message_summary_list



def get_job_error_categories(job):
"""
Get shortened error category string by error field and error code
Expand All @@ -99,6 +98,12 @@ def get_job_error_categories(job):


def prepare_binned_and_total_data(df, column):
"""
Prepare binned and total time-series data for plots
:param df: data frame
:param column: column in data frame which use to split values for stacking
:return:
"""
# resample in 10-minute bins and count occurrences for each unique value in the specified column
resampled = df.groupby([pd.Grouper(freq='10T'), column]).size().unstack(fill_value=0)

Expand All @@ -107,7 +112,9 @@ def prepare_binned_and_total_data(df, column):

# convert binned data to Chart.js format
header = ["timestamp"] + list(resampled.columns)
binned_data = [header] + [[timestamp.strftime(settings.DATETIME_FORMAT)] + list(row) for timestamp, row in resampled.iterrows()]
binned_data = [header] + [
[timestamp.strftime(settings.DATETIME_FORMAT)] + list(row) for timestamp, row in resampled.iterrows()
]

return {
'binned': binned_data,
Expand All @@ -116,17 +123,24 @@ def prepare_binned_and_total_data(df, column):


def categorize_low_impact_by_percentage(df, column, threshold_percent):
    """
    Replace values in a column whose share of rows falls below a threshold with an "Other" category,
    so that plots stacked by this column stay readable.

    Note: the DataFrame is modified in place and also returned for convenience.

    :param df: pandas DataFrame
    :param column: name of the column whose low-impact values should be collapsed
    :param threshold_percent: minimum share (in percent of all rows) a value must reach to keep its own label
    :return: the same DataFrame with low-impact values in `column` replaced by "Other"
    """
    # count occurrences of each unique value across the entire dataset
    counts = df[column].value_counts()
    total_count = counts.sum()

    # express the percentage threshold as an absolute row count
    threshold_count = total_count * (threshold_percent / 100.0)

    # values occurring strictly fewer times than the threshold are low impact
    low_impact_values = counts[counts < threshold_count].index

    # vectorized replacement — faster than .apply with a Python-level lambda
    df[column] = df[column].where(~df[column].isin(low_impact_values), "Other")
    return df

Expand All @@ -149,27 +163,30 @@ def build_error_histograms(jobs):
'user': job['produsername'],
})

df = pd.DataFrame(data)
df['modificationtime'] = pd.to_datetime(df['modificationtime'])
df.set_index('modificationtime', inplace=True)
if len(data) > 0:
df = pd.DataFrame(data)
df['modificationtime'] = pd.to_datetime(df['modificationtime'])
df.set_index('modificationtime', inplace=True)

# Apply the function to each column where you want low-impact values grouped
for column in ['site', 'code', 'task', 'user']:
df = categorize_low_impact_by_percentage(df, column, threshold_percent)
# Apply the function to each column where you want low-impact values grouped
for column in ['site', 'code', 'task', 'user']:
df = categorize_low_impact_by_percentage(df, column, threshold_percent)

# Generate JSON-ready data for each column
output_data = {}
for column in ['site', 'code', 'task', 'user']:
output_data[column] = prepare_binned_and_total_data(df, column)
# Generate JSON-ready data for each column
output_data = {}
for column in ['site', 'code', 'task', 'user']:
output_data[column] = prepare_binned_and_total_data(df, column)

total_jobs_per_bin = df.resample('10T').size().reset_index(name='total')
total_jobs_per_bin['modificationtime'] = total_jobs_per_bin['modificationtime'].dt.strftime(
settings.DATETIME_FORMAT)
total_jobs_per_bin = df.resample('10T').size().reset_index(name='total')
total_jobs_per_bin['modificationtime'] = total_jobs_per_bin['modificationtime'].dt.strftime(
settings.DATETIME_FORMAT)

output_data['total'] = {
'binned': [['timestamp', 'total']] + total_jobs_per_bin.values.tolist(),
'total': {}
}
output_data['total'] = {
'binned': [['timestamp', 'total']] + total_jobs_per_bin.values.tolist(),
'total': {}
}
else:
output_data = {}

return output_data

Expand Down
6 changes: 0 additions & 6 deletions core/templates/errorSummary.html
Original file line number Diff line number Diff line change
Expand Up @@ -132,12 +132,6 @@ <h6 class="float-left"><b>Overall error summary</b></h6>
<div class="tabs-content" data-tabs-content="collapsing-tabs">
<div class="tabs-panel is-active" id="panel_groupbycode">
<div class="card-section">
<div class="callout success small" data-closable>
<p>If you need to explore all error description messages, please use the 'Grouped by error message' link on the right above. </p>
<button class="close-button small" aria-label="Dismiss alert" type="button" data-close>
<span aria-hidden="true">&times;</span>
</button>
</div>
<table id="errorsummary_groupbycode" class="data-table left-aligned">
<thead>
<tr>
Expand Down
7 changes: 5 additions & 2 deletions core/templates/errorSummaryHelp.html
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,13 @@
The limit depth of the query will now be applied entirely to the site you have zeroed in on, so you will get more failure information (the limit is necessary for performance reasons).
The timeline plot, attribute summary, and <code>category:code</code> breakdown will now be exclusive to the site of interest. Similarly for drilling down to users and tasks.
</p>
<p>
The top box with plots has separate tabs showing the errors split by error category and code combination, by site, by user, and by task.
Values with low impact (occurring in less than 1% of jobs) are grouped into an "Other" category to make the plots easier to digest.
</p>
<p>If you are interested in errors for unfinished files in a task, add <code>&extra=unfinishedfiles</code> to the URL.
This only works for one selected task, i.e. there should be <code>&jeditaskid={some_jedi_task_id}</code> specified.</p>
<p>
<b>Wildcards:</b> The computingsite parameter on the URL supports wildcards so that multiple sites can be included in the error summary,
e.g. <code>&computingsite=UKI-*</code>, <code>&computingsite=*MCORE</code>, <code>&computingsite=*-MAN-*</code>
</p>
</div>
Expand Down

0 comments on commit ab220bd

Please sign in to comment.