From 671f3e2961d242feff2042891714b8f21011c195 Mon Sep 17 00:00:00 2001 From: Rok Roskar Date: Wed, 5 Apr 2017 16:48:26 +0200 Subject: [PATCH 1/6] improve handling of child processes; fix bug in plot_file for large number of children; refactor the profile file reading functions --- memory_profiler.py | 155 +++++++++++++++++++++++++++++++++++++++++---- mprof | 114 +++++++++++---------------------- 2 files changed, 179 insertions(+), 90 deletions(-) diff --git a/memory_profiler.py b/memory_profiler.py index f6ac274..0162881 100644 --- a/memory_profiler.py +++ b/memory_profiler.py @@ -16,7 +16,7 @@ import inspect import subprocess import logging - +from collections import defaultdict # TODO: provide alternative when multiprocessing is not available try: @@ -112,10 +112,10 @@ def _get_child_memory(process, meminfo_attr=None): # Loop over the child processes and yield their memory try: for child in getattr(process, children_attr)(recursive=True): - yield getattr(child, meminfo_attr)()[0] / _TWO_20 + yield child.pid, getattr(child, meminfo_attr)()[0] / _TWO_20 except psutil.NoSuchProcess: # https://github.com/fabianp/memory_profiler/issues/71 - yield 0.0 + yield (0,0.0) # need to yield a tuple def _get_memory(pid, backend, timestamps=False, include_children=False, filename=None): @@ -143,7 +143,7 @@ def ps_util_tool(): else 'get_memory_info' mem = getattr(process, meminfo_attr)()[0] / _TWO_20 if include_children: - mem += sum(_get_child_memory(process, meminfo_attr)) + mem += sum((mem for (pid,mem) in _get_child_memory(process, meminfo_attr))) if timestamps: return mem, time.time() else: @@ -355,14 +355,14 @@ def memory_usage(proc=-1, interval=.1, timeout=None, timestamps=False, # Write children to the stream file if multiprocess: - for idx, chldmem in enumerate(_get_child_memory(proc.pid)): - stream.write("CHLD {0} {1:.6f} {2:.4f}\n".format(idx, chldmem, time.time())) + for chldpid, chldmem in _get_child_memory(proc.pid): + stream.write("CHLD {0} {1:.6f} {2:.4f}\n".format(chldpid, chldmem, time.time())) else: # Create a nested list with the child memory if multiprocess: mem_usage = [mem_usage] - for chldmem in _get_child_memory(proc.pid): - mem_usage.append(chldmem) + for chldpid, chldmem in _get_child_memory(proc.pid): + mem_usage.append((chldpid,chldmem)) # Append the memory usage to the return value ret.append(mem_usage) @@ -399,14 +399,14 @@ def memory_usage(proc=-1, interval=.1, timeout=None, timestamps=False, # Write children to the stream file if multiprocess: - for idx, chldmem in enumerate(_get_child_memory(proc.pid)): - stream.write("CHLD {0} {1:.6f} {2:.4f}\n".format(idx, chldmem, time.time())) + for child_pid, chldmem in _get_child_memory(proc): + stream.write("CHLD {0} {1:.6f} {2:.4f}\n".format(child_pid, chldmem, time.time())) else: # Create a nested list with the child memory if multiprocess: mem_usage = [mem_usage] - for chldmem in _get_child_memory(proc.pid): - mem_usage.append(chldmem) + for chldpid, chldmem in _get_child_memory(proc): + mem_usage.append((chldpid,chldmem)) # Append the memory usage to the return value ret.append(mem_usage) @@ -1207,3 +1207,134 @@ def flush(self): prof.show_results(stream=out_file) else: show_results(prof, precision=options.precision, stream=out_file) + + +### I/O + +def read_mprofile_file(filename): + """Read an mprofile file and return its content. + + Returns + ======= + content: dict + Keys: + + - "mem_usage": (list) memory usage values, in MiB + - "timestamp": (list) time instant for each memory usage value, in + second + - "func_timestamp": (dict) for each function, timestamps and memory + usage upon entering and exiting. + - 'cmd_line': (str) command-line ran for this profile. + """ + func_ts = {} + mem_usage = [] + timestamp = [] + children = defaultdict(list) + cmd_line = None + f = open(filename, "r") + for l in f: + if l == '\n': + raise ValueError('Sampling time was too short') + field, value = l.split(' ', 1) + if field == "MEM": + # mem, timestamp + values = value.split(' ') + mem_usage.append(float(values[0])) + timestamp.append(float(values[1])) + + elif field == "FUNC": + values = value.split(' ') + f_name, mem_start, start, mem_end, end = values[:5] + ts = func_ts.get(f_name, []) + ts.append([float(start), float(end), + float(mem_start), float(mem_end)]) + func_ts[f_name] = ts + + elif field == "CHLD": + values = value.split(' ') + chldnum = values[0] + children[chldnum].append( + (float(values[1]), float(values[2])) + ) + + elif field == "CMDLINE": + cmd_line = value + else: + pass + f.close() + + return {"mem_usage": mem_usage, "timestamp": timestamp, + "func_timestamp": func_ts, 'filename': filename, + 'cmd_line': cmd_line, 'children': children} + + +def read_mprofile_file_multiprocess(filename): + """Read an mprofile file and return a mem_usage list + + Returns + ======= + content: list + + This is analogous to the list obtained when the `memory_usage` is used + """ + + mem_usage = [] + sample = [] + + f = open(filename,'r') + + for i,l in enumerate(f): + if l == '\n': + raise ValueError('Sampling time was too short') + field, value = l.split(' ', 1) + values = value.split(' ') + + if field=="MEM": + # append the existing sample and reset to zero + mem_usage.append(sample) + sample = [] + sample.append((float(values[0]), float(values[1]))) + elif field=="CHLD": + sample.append((int(values[0]), float(values[1]))) + + f.close() + return mem_usage[1:] + + +def convert_mem_usage_to_df(filename, is_pickle=False): + """Convert a `mem_usage` list to a `pandas.DataFrame` + + Returns + ======= + content: pandas.DataFrame + + Returns a `pandas.DataFrame` with child IDs as columns and the timestamp as an index + """ + + import pandas as pd + + if is_pickle: + from cPickle import load + with open(filename) as f: + mem_usage = load(f) + + else: + mem_usage = read_mprofile_file_multiprocess(filename) + mem_usage = filter(lambda m: len(m) > 1, mem_usage) + + times =[m[0][1] for m in mem_usage] + pids = np.sort(np.unique([m[0] for n in mem_usage for m in n[1:] if not isinstance(m,float)])) + + time_lookup = {time: i for i,time in enumerate(times)} + pid_lookup = {pid:i for i,pid in enumerate(pids)} + + data = np.zeros((len(times), len(pids))) + + for i,m in enumerate(mem_usage): + t = m[0][1] + try: + for pid,mem in m[1:]: + data[time_lookup[t]][pid_lookup[pid]] = mem + except TypeError: + print 'found a bad value in ', i + return pd.DataFrame(data, index=times, columns=pids) diff --git a/mprof b/mprof index 31811cb..405d445 100755 --- a/mprof +++ b/mprof @@ -199,11 +199,17 @@ def run_action(): parser.add_option("--multiprocess", "-M", dest="multiprocess", default=False, action="store_true", help="""Monitors forked processes creating individual plots for each child""") + parser.add_option("--pid", "-p", dest="pid", + default=False, action="store_true", + help="""Monitor an existing process given by PID""") + parser.add_option("--timeout", dest="timeout", + default=None, action="store", type=int, + help="""Timeout in seconds""") (options, args) = parser.parse_args() - if len(args) == 0: - print("A program to run must be provided. Use -h for help") + if (len(args) == 0) and not options.pid: + print("A program to run or a pid must be provided. Use -h for help") sys.exit(1) print("{1}: Sampling memory every {0.interval}s".format( @@ -218,30 +224,36 @@ def run_action(): mprofile_output = "mprofile_%s.dat" % suffix # .. TODO: more than one script as argument ? .. - if args[0].endswith('.py') and not options.nopython: - if not args[0].startswith("python"): - args.insert(0, "python") - if options.multiprocess: - # in multiprocessing mode you want to spawn a separate - # python process - options.python = False - if options.python: - print("running as a Python program...") - if not args[0].startswith("python"): - args.insert(0, "python") - cmd_line = get_cmd_line(args) - args[1:1] = ("-m", "memory_profiler", "--timestamp", - "-o", mprofile_output) - p = subprocess.Popen(args) + if not options.pid: + if args[0].endswith('.py') and not options.nopython: + if not args[0].startswith("python"): + args.insert(0, "python") + if options.multiprocess: + # in multiprocessing mode you want to spawn a separate + # python process + options.python = False + if options.python: + print("running as a Python program...") + if not args[0].startswith("python"): + args.insert(0, "python") + cmd_line = get_cmd_line(args) + args[1:1] = ("-m", "memory_profiler", "--timestamp", + "-o", mprofile_output) + p = subprocess.Popen(args) + else: + cmd_line = get_cmd_line(args) + p = subprocess.Popen(args) else: - cmd_line = get_cmd_line(args) - p = subprocess.Popen(args) + p = int(args[0]) with open(mprofile_output, "a") as f: - f.write("CMDLINE {0}\n".format(cmd_line)) + if not options.pid: + f.write("CMDLINE {0}\n".format(cmd_line)) + mp.memory_usage(proc=p, interval=options.interval, timestamps=True, include_children=options.include_children, - multiprocess=options.multiprocess, stream=f) + multiprocess=options.multiprocess, stream=f, + timeout=options.timeout) def add_brackets(xloc, yloc, xshift=0, color="r", label=None, options=None): @@ -291,61 +303,6 @@ def add_brackets(xloc, yloc, xshift=0, color="r", label=None, options=None): ## pl.plot(xloc[1], yloc[1], ">"+color, markersize=7) -def read_mprofile_file(filename): - """Read an mprofile file and return its content. - - Returns - ======= - content: dict - Keys: - - - "mem_usage": (list) memory usage values, in MiB - - "timestamp": (list) time instant for each memory usage value, in - second - - "func_timestamp": (dict) for each function, timestamps and memory - usage upon entering and exiting. - - 'cmd_line': (str) command-line ran for this profile. - """ - func_ts = {} - mem_usage = [] - timestamp = [] - children = defaultdict(list) - cmd_line = None - f = open(filename, "r") - for l in f: - if l == '\n': - raise ValueError('Sampling time was too short') - field, value = l.split(' ', 1) - if field == "MEM": - # mem, timestamp - values = value.split(' ') - mem_usage.append(float(values[0])) - timestamp.append(float(values[1])) - - elif field == "FUNC": - values = value.split(' ') - f_name, mem_start, start, mem_end, end = values[:5] - ts = func_ts.get(f_name, []) - ts.append([float(start), float(end), - float(mem_start), float(mem_end)]) - func_ts[f_name] = ts - - elif field == "CHLD": - values = value.split(' ') - chldnum = values[0] - children[chldnum].append( - (float(values[1]), float(values[2])) - ) - - elif field == "CMDLINE": - cmd_line = value - else: - pass - f.close() - - return {"mem_usage": mem_usage, "timestamp": timestamp, - "func_timestamp": func_ts, 'filename': filename, - 'cmd_line': cmd_line, 'children': children} def plot_file(filename, index=0, timestamps=True, children=True, options=None): @@ -355,7 +312,7 @@ def plot_file(filename, index=0, timestamps=True, children=True, options=None): print("matplotlib is needed for plotting.") sys.exit(1) import numpy as np # pylab requires numpy anyway - mprofile = read_mprofile_file(filename) + mprofile = mp.read_mprofile_file(filename) if len(mprofile['timestamp']) == 0: print('** No memory usage values have been found in the profile ' @@ -413,7 +370,8 @@ def plot_file(filename, index=0, timestamps=True, children=True, options=None): cmem = np.asarray([item[0] for item in data]) # Plot the line to the figure - pl.plot(cts, cmem, "+-" + mem_line_colors[idx+1 % len(mem_line_colors)], + print (idx+1) % len(mem_line_colors) + pl.plot(cts, cmem, "+-" + mem_line_colors[(idx+1) % len(mem_line_colors)], label="child {}".format(proc)) # Detect the maximal child memory point From ae2a5eb24bd5f92a9e29e46c61027bc4796e58ae Mon Sep 17 00:00:00 2001 From: Rok Roskar Date: Thu, 6 Apr 2017 15:37:03 +0200 Subject: [PATCH 2/6] update docstring --- memory_profiler.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/memory_profiler.py b/memory_profiler.py index 0162881..4f5ce37 100644 --- a/memory_profiler.py +++ b/memory_profiler.py @@ -1215,7 +1215,7 @@ def read_mprofile_file(filename): """Read an mprofile file and return its content. Returns - ======= + ------- content: dict Keys: @@ -1272,7 +1272,7 @@ def read_mprofile_file_multiprocess(filename): """Read an mprofile file and return a mem_usage list Returns - ======= + ------- content: list This is analogous to the list obtained when the `memory_usage` is used @@ -1304,8 +1304,17 @@ def read_mprofile_file_multiprocess(filename): def convert_mem_usage_to_df(filename, is_pickle=False): """Convert a `mem_usage` list to a `pandas.DataFrame` + Parameters + ---------- + filename: path to the memory profile data; can be either a file + created by mprof or a pickle of the result of `memory_usage` + + is_pickle: if True, assume the data is the pickled list + returned by `memory_usage` + + Returns - ======= + ------- content: pandas.DataFrame Returns a `pandas.DataFrame` with child IDs as columns and the timestamp as an index From 7f748c272fdd33f6e45b1eb78aadce95fb80afaa Mon Sep 17 00:00:00 2001 From: Rok Roskar Date: Thu, 6 Apr 2017 15:40:14 +0200 Subject: [PATCH 3/6] fix help string --- mprof | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mprof b/mprof index 405d445..0886118 100755 --- a/mprof +++ b/mprof @@ -201,14 +201,14 @@ def run_action(): help="""Monitors forked processes creating individual plots for each child""") parser.add_option("--pid", "-p", dest="pid", default=False, action="store_true", - help="""Monitor an existing process given by PID""") + help="""Specify that the argument is a running pid not an executable or script""") parser.add_option("--timeout", dest="timeout", default=None, action="store", type=int, help="""Timeout in seconds""") (options, args) = parser.parse_args() - if (len(args) == 0) and not options.pid: + if (len(args) == 0): print("A program to run or a pid must be provided. Use -h for help") sys.exit(1) From ff3b22dff01df0196acc89e3658a32bb005bbbac Mon Sep 17 00:00:00 2001 From: Rok Roskar Date: Thu, 6 Apr 2017 16:08:15 +0200 Subject: [PATCH 4/6] oops, removing debug print --- memory_profiler.py | 1 + mprof | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/memory_profiler.py b/memory_profiler.py index 4f5ce37..33d0eac 100644 --- a/memory_profiler.py +++ b/memory_profiler.py @@ -1346,4 +1346,5 @@ def convert_mem_usage_to_df(filename, is_pickle=False): data[time_lookup[t]][pid_lookup[pid]] = mem except TypeError: print 'found a bad value in ', i + return pd.DataFrame(data, index=times, columns=pids) diff --git a/mprof b/mprof index 0886118..26653be 100755 --- a/mprof +++ b/mprof @@ -370,7 +370,6 @@ def plot_file(filename, index=0, timestamps=True, children=True, options=None): cmem = np.asarray([item[0] for item in data]) # Plot the line to the figure - print (idx+1) % len(mem_line_colors) pl.plot(cts, cmem, "+-" + mem_line_colors[(idx+1) % len(mem_line_colors)], label="child {}".format(proc)) From c75081a1d35af2c72d28e57a300880d3c04bffe9 Mon Sep 17 00:00:00 2001 From: Rok Roskar Date: Thu, 6 Apr 2017 17:36:40 +0200 Subject: [PATCH 5/6] removed try/except, not needed --- memory_profiler.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/memory_profiler.py b/memory_profiler.py index 33d0eac..a534079 100644 --- a/memory_profiler.py +++ b/memory_profiler.py @@ -1341,10 +1341,7 @@ def convert_mem_usage_to_df(filename, is_pickle=False): for i,m in enumerate(mem_usage): t = m[0][1] - try: - for pid,mem in m[1:]: - data[time_lookup[t]][pid_lookup[pid]] = mem - except TypeError: - print 'found a bad value in ', i - + for pid,mem in m[1:]: + data[time_lookup[t]][pid_lookup[pid]] = mem + return pd.DataFrame(data, index=times, columns=pids) From b5c87e48752e92a3a06607718ecf4830c592656b Mon Sep 17 00:00:00 2001 From: Rok Roskar Date: Tue, 11 Apr 2017 14:54:50 +0200 Subject: [PATCH 6/6] added parent process to dataframe; add df plot function --- memory_profiler.py | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/memory_profiler.py b/memory_profiler.py index a534079..0ef4e82 100644 --- a/memory_profiler.py +++ b/memory_profiler.py @@ -1320,8 +1320,12 @@ def convert_mem_usage_to_df(filename, is_pickle=False): Returns a `pandas.DataFrame` with child IDs as columns and the timestamp as an index """ - import pandas as pd - + try: + import pandas as pd + import numpy as np + except ImportError: + raise ImportError('Pandas and numpy are required for conversion to DataFrame') + if is_pickle: from cPickle import load with open(filename) as f: @@ -1332,7 +1336,9 @@ def convert_mem_usage_to_df(filename, is_pickle=False): mem_usage = filter(lambda m: len(m) > 1, mem_usage) times =[m[0][1] for m in mem_usage] - pids = np.sort(np.unique([m[0] for n in mem_usage for m in n[1:] if not isinstance(m,float)])) + + # flatten list of lists, extract the pids and attach '0' (parent) at the end + pids = np.sort(np.unique([m[0] for n in mem_usage for m in n[1:] if not isinstance(m,float)] + [0,])) time_lookup = {time: i for i,time in enumerate(times)} pid_lookup = {pid:i for i,pid in enumerate(pids)} @@ -1341,7 +1347,28 @@ def convert_mem_usage_to_df(filename, is_pickle=False): for i,m in enumerate(mem_usage): t = m[0][1] + + # add the parent memory by hand + data[time_lookup[t]][pid_lookup[0]] = m[0][0] + for pid,mem in m[1:]: data[time_lookup[t]][pid_lookup[pid]] = mem return pd.DataFrame(data, index=times, columns=pids) + +def plot_mem_usage(filename, include_parent=True, plot_total=True, is_pickle=False): + import matplotlib.pylab as plt + + data_df = convert_mem_usage_to_df(filename, is_pickle) + + f = plt.figure(figsize=(10,6)) + + if not include_parent: + data_df = data_df[data_df.columns[1:]] + + data_df.plot(legend=False, figsize=(14,10), grid=True, fontsize=14) + + if plot_total: + data_df.sum(axis=1).plot(style='--', grid=True) + + plt.xlabel('timestamp'); plt.ylabel('memory usage in MB') \ No newline at end of file