Skip to content

Commit

Permalink
Merge pull request #407 from PanDAWMS/tania_dev
Browse files Browse the repository at this point in the history
errorSummary | add a check that failed jobs exist before putting them into the data frame
  • Loading branch information
tkorchug authored Nov 18, 2024
2 parents e1fa677 + c914c1c commit ab220bd
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 31 deletions.
63 changes: 40 additions & 23 deletions core/pandajob/summary_error.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,6 @@ def get_error_message_summary(jobs):
return error_message_summary_list



def get_job_error_categories(job):
"""
Get shortened error category string by error field and error code
Expand All @@ -99,6 +98,12 @@ def get_job_error_categories(job):


def prepare_binned_and_total_data(df, column):
"""
Prepare binned and total time-series data for plots
:param df: data frame
:param column: column in data frame which use to split values for stacking
:return:
"""
# resample in 10-minute bins and count occurrences for each unique value in the specified column
resampled = df.groupby([pd.Grouper(freq='10T'), column]).size().unstack(fill_value=0)

Expand All @@ -107,7 +112,9 @@ def prepare_binned_and_total_data(df, column):

# convert binned data to Chart.js format
header = ["timestamp"] + list(resampled.columns)
binned_data = [header] + [[timestamp.strftime(settings.DATETIME_FORMAT)] + list(row) for timestamp, row in resampled.iterrows()]
binned_data = [header] + [
[timestamp.strftime(settings.DATETIME_FORMAT)] + list(row) for timestamp, row in resampled.iterrows()
]

return {
'binned': binned_data,
Expand All @@ -116,17 +123,24 @@ def prepare_binned_and_total_data(df, column):


def categorize_low_impact_by_percentage(df, column, threshold_percent):
    """
    Replace values in a column whose share of rows falls below a threshold with an "Other" category,
    so that plots stacked by this column stay readable.

    Note: the DataFrame is modified in place and also returned for convenience.

    :param df: pandas DataFrame
    :param column: name of the column whose low-impact values should be collapsed
    :param threshold_percent: minimum share (in percent of all rows) a value must reach to keep its own label
    :return: the same DataFrame with low-impact values in `column` replaced by "Other"
    """
    # count occurrences of each unique value across the entire dataset
    counts = df[column].value_counts()
    total_count = counts.sum()

    # express the percentage threshold as an absolute row count
    threshold_count = total_count * (threshold_percent / 100.0)

    # values occurring strictly fewer times than the threshold are low impact
    low_impact_values = counts[counts < threshold_count].index

    # vectorized replacement — faster than .apply with a Python-level lambda
    df[column] = df[column].where(~df[column].isin(low_impact_values), "Other")
    return df

Expand All @@ -149,27 +163,30 @@ def build_error_histograms(jobs):
'user': job['produsername'],
})

df = pd.DataFrame(data)
df['modificationtime'] = pd.to_datetime(df['modificationtime'])
df.set_index('modificationtime', inplace=True)
if len(data) > 0:
df = pd.DataFrame(data)
df['modificationtime'] = pd.to_datetime(df['modificationtime'])
df.set_index('modificationtime', inplace=True)

# Apply the function to each column where you want low-impact values grouped
for column in ['site', 'code', 'task', 'user']:
df = categorize_low_impact_by_percentage(df, column, threshold_percent)
# Apply the function to each column where you want low-impact values grouped
for column in ['site', 'code', 'task', 'user']:
df = categorize_low_impact_by_percentage(df, column, threshold_percent)

# Generate JSON-ready data for each column
output_data = {}
for column in ['site', 'code', 'task', 'user']:
output_data[column] = prepare_binned_and_total_data(df, column)
# Generate JSON-ready data for each column
output_data = {}
for column in ['site', 'code', 'task', 'user']:
output_data[column] = prepare_binned_and_total_data(df, column)

total_jobs_per_bin = df.resample('10T').size().reset_index(name='total')
total_jobs_per_bin['modificationtime'] = total_jobs_per_bin['modificationtime'].dt.strftime(
settings.DATETIME_FORMAT)
total_jobs_per_bin = df.resample('10T').size().reset_index(name='total')
total_jobs_per_bin['modificationtime'] = total_jobs_per_bin['modificationtime'].dt.strftime(
settings.DATETIME_FORMAT)

output_data['total'] = {
'binned': [['timestamp', 'total']] + total_jobs_per_bin.values.tolist(),
'total': {}
}
output_data['total'] = {
'binned': [['timestamp', 'total']] + total_jobs_per_bin.values.tolist(),
'total': {}
}
else:
output_data = {}

return output_data

Expand Down
6 changes: 0 additions & 6 deletions core/templates/errorSummary.html
Original file line number Diff line number Diff line change
Expand Up @@ -132,12 +132,6 @@ <h6 class="float-left"><b>Overall error summary</b></h6>
<div class="tabs-content" data-tabs-content="collapsing-tabs">
<div class="tabs-panel is-active" id="panel_groupbycode">
<div class="card-section">
<div class="callout success small" data-closable>
<p>If you need to explore all error description messages, please use the 'Grouped by error message' link on the right above. </p>
<button class="close-button small" aria-label="Dismiss alert" type="button" data-close>
<span aria-hidden="true">&times;</span>
</button>
</div>
<table id="errorsummary_groupbycode" class="data-table left-aligned">
<thead>
<tr>
Expand Down
7 changes: 5 additions & 2 deletions core/templates/errorSummaryHelp.html
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,13 @@
The limit depth of the query will now be applied entirely to the site you have zeroed in on, so you will get more failure information (the limit is necessary for performance reasons).
The timeline plot, attribute summary, and <code>category:code</code> breakdown will now be exclusive to the site of interest. Similarly for drilling down to users and tasks.
</p>
<p>
The top box with plots has separate tabs showing the errors split by error category and code combination, by site, by user, and by task.
Values with low impact (occurring in less than 1% of jobs) are grouped into an "Other" category to make the plots easier to digest.
</p>
<p>If you are interested in errors for unfinished files in a task, add <code>&extra=unfinishedfiles</code> to the URL.
This only works for one selected task, i.e. there should be <code>&jeditaskid={some_jedi_task_id}</code> specified.</p>
<p>
<b>Wildcards:</b> The computingsite parameter on the URL supports wildcards so that multiple sites can be included in the error summary,
e.g. <code>&computingsite=UKI-*</code>, <code>&computingsite=*MCORE</code>, <code>&computingsite=*-MAN-*</code>
</p>
</div>
Expand Down

0 comments on commit ab220bd

Please sign in to comment.