Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ashawini27 bspmm histogram #267

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 94 additions & 0 deletions bin/function_histogram.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
#!/usr/bin/env python3

import sys
import json
import matplotlib.pyplot as plt
import statistics # Import the statistics module for mean and other calculations

def generate_histogram(entry):
    """Sum event durations per PID and per function.

    *entry* is an iterable of Chrome-trace events (dicts with at least
    "pid", "name" and "dur").  Any template arguments after '<' are
    stripped from the event name, so all instantiations of the same
    function are accumulated together.

    Returns a dict mapping pid -> {function_name: total_duration}.
    """
    totals_by_pid = {}

    for event in entry:
        base_name = event["name"].split('<', 1)[0]
        per_function = totals_by_pid.setdefault(event["pid"], {})
        per_function[base_name] = per_function.get(base_name, 0) + event["dur"]

    return totals_by_pid

def generate_cumulative_histogram(entry):
    """Sum event durations per PID keyed by the unqualified function name.

    Like :func:`generate_histogram`, but the event name is additionally
    stripped of any leading namespace qualifiers (everything up to the
    last '::') before the template arguments after '<' are removed, so
    e.g. ``ns::foo<int>`` and ``foo`` aggregate under the same key.

    Returns a dict mapping pid -> {short_function_name: total_duration}.
    """
    totals_by_pid = {}

    for event in entry:
        short_name = event["name"].rsplit('::', 1)[-1].split('<', 1)[0]
        per_function = totals_by_pid.setdefault(event["pid"], {})
        per_function[short_name] = per_function.get(short_name, 0) + event["dur"]

    return totals_by_pid

def main():
    """Read a Chrome-trace JSON file and write function-duration histograms.

    Usage: function_histogram.py function_data.json

    Produces one PDF per PID (``histogram_pid_<pid>.pdf``) plus a single
    ``cumulative_histogram.pdf`` summing durations across all PIDs.
    """
    if len(sys.argv) != 2:
        print("Usage: python_script.py function_data.json")
        return

    json_file_path = sys.argv[1]

    with open(json_file_path, 'r') as json_file:
        json_data = json.load(json_file)

    # pid -> {function_name: summed duration}, template arguments stripped.
    functions_dur_counts = generate_histogram(json_data["traceEvents"])

    # Histogram for individual PIDs
    # NOTE(review): the /1000 conversions below assume event "dur" values are
    # in milliseconds -- confirm against the trace producer.
    for pid, function_dur_count in functions_dur_counts.items():
        total_duration = sum(duration for duration in function_dur_count.values()) / 1000 # Calculate total duration in seconds

        plt.bar(function_dur_count.keys(), [dur / 1000 for dur in function_dur_count.values()]) # Convert ms to sec
        plt.xlabel('Function Name')
        plt.ylabel('Total Duration (s)') # Adding unit for time here
        plt.title(f'Histogram of Total Function Durations for PID {pid}')
        plt.xticks(rotation=45, ha='right')

        # Add total duration label to the top of each bar with rotation
        for function, duration in function_dur_count.items():
            plt.text(function, duration / 1000, f'{duration / 1000:.2f}', rotation=45, ha='center', va='bottom', fontweight='bold') # Convert ms to sec

        # Add total duration to the top of the plot (axes coordinates, so the
        # label sits near the top regardless of the data range).
        plt.text(0.5, 0.95, f'Total Duration: {total_duration:.2f} s', transform=plt.gca().transAxes, ha='center', fontweight='bold')

        plt.tight_layout()

        # Save each histogram as a separate PDF file
        pdf_filename = f'histogram_pid_{pid}.pdf'
        plt.savefig(pdf_filename)
        plt.close()

    # Histogram for cumulative time taken by PIDs for each function.
    # NOTE(review): this recomputes cumulative totals inline instead of using
    # generate_cumulative_histogram (which is currently unused); the two also
    # differ in name handling -- the helper strips '::' qualifiers, this loop
    # does not.
    cumulative_function_dur_count = {}
    for function_dur_count in functions_dur_counts.values():
        for function, duration in function_dur_count.items():
            cumulative_function_dur_count[function] = cumulative_function_dur_count.get(function, 0) + duration

    plt.bar(cumulative_function_dur_count.keys(), [dur / 1000 for dur in cumulative_function_dur_count.values()])
    plt.xlabel('Function Name')
    plt.ylabel('Total Duration (s)')
    plt.title('Histogram of Cumulative Function Durations across PIDs')
    plt.xticks(rotation=45, ha='right')

    # Add total duration label to the top of each bar with rotation
    for function, duration in cumulative_function_dur_count.items():
        plt.text(function, duration / 1000, f'{duration / 1000:.2f}', rotation=45, ha='center', va='bottom', fontweight='bold') # Convert ms to sec

    plt.tight_layout()

    # Save the cumulative histogram as a PDF file
    pdf_filename = 'cumulative_histogram.pdf'
    plt.savefig(pdf_filename)
    plt.close()

if __name__ == "__main__":
    main()
55 changes: 41 additions & 14 deletions bin/pbt_to_ctf.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import time
import pandas
import sys
import statistics
except ModuleNotFoundError:
print("Did not find a system module, use pip to install it")

Expand All @@ -23,13 +24,12 @@ def read_pbt(pbt_files_list):
print('The columns of the DataFrame (or data labels) and their datatypes are:')
print(trace.events.dtypes)


print('the types are:\n', trace.event_types)
print('the streams are:\n', trace.streams)

print('There are ' + str(len(trace.events)) + ' events in this trace', end=' ')
for e in range(len(trace.events)):
print('id===', trace.events.id[e], ' node_id=', trace.events.node_id[e],' stream_id=',trace.events.stream_id[e], 'key=' ,trace.events.key[e],' type=',trace.events.type[e],' b=',trace.events.begin[e],' e=',trace.events.end[e])
print('id===', trace.events.id[e], ' node_id=', trace.events.node_id[e], ' stream_id=', trace.events.stream_id[e], 'key=' ,trace.events.key[e], ' type=', trace.events.type[e], ' b=', trace.events.begin[e], ' e=', trace.events.end[e])

import json
import re
Expand All @@ -40,42 +40,69 @@ def bool(str):
return str.lower() in ["true", "yes", "y", "1", "t"]

def pbt_to_ctf(pbt_files_list, ctf_filename, skip_parsec_events, skip_mpi_events):

ctf_data = {"traceEvents": []}
# Dictionary to store aggregated durations
aggregated_durations = {}

# Initialize lists to store duration values for each name
duration_values = []
ptt_filename = pbt2ptt.convert(pbt_files_list, multiprocess=False)
trace = ptt.from_hdf(ptt_filename)

for e in range(len(trace.events)):
# print('id=',trace.events.id[e],' node_id=',trace.events.node_id[e],' stream_id=',trace.events.stream_id[e],'key=',trace.events.key[e],' type=',trace.events.type[e],' b=',trace.events.begin[e],' e=',trace.events.end[e])
# print('id=', trace.events.id[e], ' node_id=', trace.events.node_id[e], ' stream_id=', trace.events.stream_id[e], 'key=', trace.events.key[e], ' type=', trace.events.type[e], ' b=', trace.events.begin[e], ' e=', trace.events.end[e])
# print('\n')

if(skip_parsec_events == True and trace.event_names[trace.events.type[e]].startswith("PARSEC")):
if skip_parsec_events == True and trace.event_names[trace.events.type[e]].startswith("PARSEC"):
continue
if(skip_mpi_events == True and trace.event_names[trace.events.type[e]].startswith("MPI")):
if skip_mpi_events == True and trace.event_names[trace.events.type[e]].startswith("MPI"):
continue

ctf_event = {}
ctf_event["ph"] = "X" # complete event type
ctf_event["ts"] = 0.001 * trace.events.begin[e] # when we started, in ms
ctf_event["dur"] = 0.001 * (trace.events.end[e] - trace.events.begin[e]) # when we started, in ms
ctf_event["ts"] = 0.001 * trace.events.begin[e] # when we started, in ms
ctf_event["dur"] = 0.001 * (trace.events.end[e] - trace.events.begin[e]) # when we started, in ms
ctf_event["name"] = trace.event_names[trace.events.type[e]]

if trace.events.key[e] != None:
if trace.events.key[e] is not None:
ctf_event["args"] = trace.events.key[e].decode('utf-8').rstrip('\x00')
ctf_event["name"] = trace.event_names[trace.events.type[e]]+"<"+ctf_event["args"]+">"
ctf_event["name"] = trace.event_names[trace.events.type[e]] + "<" + ctf_event["args"] + ">"

ctf_event["pid"] = trace.events.node_id[e]
tid = trace.streams.th_id[trace.events.stream_id[e]]
ctf_event["tid"] = 111111 if math.isnan(tid) else int(tid)

ctf_data["traceEvents"].append(ctf_event)

# Get the index of the first occurrence of '<'
index_of_open_bracket = ctf_event["name"].find('<')
# Extract the substring before '<' and assign it to the name variable
if index_of_open_bracket != -1:
name = ctf_event["name"][:index_of_open_bracket]
duration = ctf_event["dur"]
if name in aggregated_durations:
aggregated_durations[name]["duration"] += duration
aggregated_durations[name]["count"] += 1
else:
# If name doesn't exist, create a new entry
aggregated_durations[name] = {"duration": duration, "count": 1}
# Add duration value to the list
duration_values.append(duration)

# Calculate the mean, median, max, min, and standard deviation for each aggregated duration
for name, data in aggregated_durations.items():
mean_duration = data["duration"] / data["count"]
individual_durations = [ctf_event["dur"] for ctf_event in ctf_data["traceEvents"] if ctf_event["name"].startswith(name)]
median_duration = statistics.median(individual_durations) if len(individual_durations) > 1 else 0.0
max_duration = max(individual_durations)
min_duration = min(individual_durations)
std_deviation = statistics.stdev(individual_durations) if len(individual_durations) > 1 else 0.0
print(f"Name: {name}, Mean: {mean_duration:.2f} μs, Median: {median_duration:.2f} μs, Max: {max_duration:.2f} μs, Min: {min_duration:.2f} μs, Std Deviation: {std_deviation:.2f} μs")

with open(ctf_filename, "w") as chrome_trace:
json.dump(ctf_data, chrome_trace)

if __name__ == "__main__":

pbt_file_prefix = sys.argv[1]
ctf_file_name = sys.argv[2]
skip_parsec_events = True
Expand All @@ -88,11 +115,11 @@ def pbt_to_ctf(pbt_files_list, ctf_filename, skip_parsec_events, skip_mpi_events
skip_mpi_events = bool(sys.argv[4])

# iterate over all files within the directory that start with sys.argv[1]
pbt_files_list=[]
pbt_files_list = []
dirname = os.path.dirname(pbt_file_prefix)
for file in os.listdir(dirname):
file_fullname = os.path.join(dirname,file)
if file_fullname.startswith(pbt_file_prefix) and ".prof-" in file_fullname and file_fullname != ctf_file_name:
file_fullname = os.path.join(dirname, file)
if file_fullname.startswith(pbt_file_prefix) and ".prof" in file_fullname and file_fullname != ctf_file_name:
print("found file ", file_fullname)
pbt_files_list.append(file_fullname)

Expand Down
99 changes: 99 additions & 0 deletions bspmmTraces_Histogram_Analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
#!/usr/bin/env python3

import sys
import json
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import numpy as np

def extract_data_from_json(json_data):
    """Flatten Chrome-trace events into simple per-event records.

    *json_data* is a parsed Chrome-trace dict with a "traceEvents" list.
    Template arguments after '<' are stripped from each event name.

    Returns a list of dicts with keys "pid", "function_name" and
    "duration" (duration is passed through unchanged from "dur").
    """
    return [
        {
            "pid": event["pid"],
            "function_name": event["name"].split('<', 1)[0],
            "duration": event["dur"],
        }
        for event in json_data["traceEvents"]
    ]

def generate_histograms(entries, target_pid):
    """Write per-function duration histograms for one PID into a PDF.

    *entries* are records from extract_data_from_json; only those whose
    "pid" equals *target_pid* are used.  Durations are divided by 1000
    (assumes input is in ms -- TODO confirm against the trace producer)
    and binned into a 10-bin histogram per function, one page per
    function, saved to ``histograms_for_pid_<target_pid>.pdf``.  Mean and
    standard deviation per function are printed to stdout.
    """
    function_histograms = {}  # function name -> list of durations (s)

    for entry in entries:
        pid = entry["pid"]
        function_name = entry["function_name"]
        duration_ms = entry["duration"] / 1000  # Convert ms to sec

        if pid == target_pid:
            if function_name not in function_histograms:
                function_histograms[function_name] = []

            function_histograms[function_name].append(duration_ms)

    pdf_filename = f'histograms_for_pid_{target_pid}.pdf'
    pdf_pages = PdfPages(pdf_filename)

    for function_name, durations in function_histograms.items():
        plt.figure(figsize=(8, 6))  # Adjust figure size for better visibility
        n, bins, patches = plt.hist(durations, bins=10, range=(0, max(durations)))  # Create a histogram
        plt.xlabel('Duration (s)')
        plt.ylabel('Frequency')
        plt.title(f'Histogram of Durations for Function {function_name} (PID {target_pid})')

        # Annotate each non-empty bar with its sample count.  The previous
        # code indexed durations[i] by bin index, which raised IndexError
        # whenever a function had fewer samples than bins (always 10) and
        # otherwise labelled bars with unrelated raw sample values.
        for count, patch in zip(n, patches):
            if count == 0:
                continue  # skip empty bins to avoid clutter
            x = patch.get_x() + patch.get_width() / 2
            y = patch.get_height()
            plt.annotate(f'{int(count)}', (x, y), xytext=(0, 5), textcoords='offset points', ha='center')

        pdf_pages.savefig()
        plt.close()

        # Compute and print average and standard deviation
        avg_duration = np.mean(durations)
        std_duration = np.std(durations)
        print(f"Function: {function_name} (PID {target_pid})")
        print(f"Average Duration: {avg_duration:.2f} s")
        print(f"Standard Deviation: {std_duration:.2f} s")
        print("")

    pdf_pages.close()

def main():
    """Command-line entry point.

    Usage: bspmmTraces_Histogram_Analysis.py function_data.json pid_value

    Validates the arguments, loads the trace JSON, checks that the
    requested PID is present, and delegates to generate_histograms.
    """
    if len(sys.argv) != 3:
        print("Usage: python_script.py function_data.json pid_value")
        return

    trace_path = sys.argv[1]

    try:
        target_pid = int(sys.argv[2])
    except ValueError:
        print("Error: Invalid pid_value. Please enter a valid integer.")
        return

    with open(trace_path, 'r') as fh:
        trace = json.load(fh)

    records = extract_data_from_json(trace)

    # Bail out early if the requested PID never appears in the trace.
    if target_pid not in {record["pid"] for record in records}:
        print(f"Error: PID {target_pid} not found in the data.")
        return

    generate_histograms(records, target_pid)

if __name__ == "__main__":
    main()

Loading
Loading