From 671f3e2961d242feff2042891714b8f21011c195 Mon Sep 17 00:00:00 2001
From: Rok Roskar <roskar@physik.uzh.ch>
Date: Wed, 5 Apr 2017 16:48:26 +0200
Subject: [PATCH 1/6] improve handling of child processes; fix bug in plot_file
 for large number of children; refactor the profile file reading functions

---
 memory_profiler.py | 155 +++++++++++++++++++++++++++++++++++++++++----
 mprof              | 114 +++++++++++----------------------
 2 files changed, 179 insertions(+), 90 deletions(-)

diff --git a/memory_profiler.py b/memory_profiler.py
index f6ac274..0162881 100644
--- a/memory_profiler.py
+++ b/memory_profiler.py
@@ -16,7 +16,7 @@
 import inspect
 import subprocess
 import logging
-
+from collections import defaultdict
 
 # TODO: provide alternative when multiprocessing is not available
 try:
@@ -112,10 +112,10 @@ def _get_child_memory(process, meminfo_attr=None):
     # Loop over the child processes and yield their memory
     try:
         for child in getattr(process, children_attr)(recursive=True):
-            yield getattr(child, meminfo_attr)()[0] / _TWO_20
+            yield child.pid, getattr(child, meminfo_attr)()[0] / _TWO_20
     except psutil.NoSuchProcess:
         # https://github.com/fabianp/memory_profiler/issues/71
-        yield 0.0
+        yield (0,0.0) # need to yield a tuple
 
 
 def _get_memory(pid, backend, timestamps=False, include_children=False, filename=None):
@@ -143,7 +143,7 @@ def ps_util_tool():
                 else 'get_memory_info'
             mem = getattr(process, meminfo_attr)()[0] / _TWO_20
             if include_children:
-                mem +=  sum(_get_child_memory(process, meminfo_attr))
+                mem +=  sum((mem for (pid,mem) in _get_child_memory(process, meminfo_attr)))
             if timestamps:
                 return mem, time.time()
             else:
@@ -355,14 +355,14 @@ def memory_usage(proc=-1, interval=.1, timeout=None, timestamps=False,
 
                     # Write children to the stream file
                     if multiprocess:
-                        for idx, chldmem in enumerate(_get_child_memory(proc.pid)):
-                            stream.write("CHLD {0} {1:.6f} {2:.4f}\n".format(idx, chldmem, time.time()))
+                        for chldpid, chldmem in _get_child_memory(proc.pid):
+                            stream.write("CHLD {0} {1:.6f} {2:.4f}\n".format(chldpid, chldmem, time.time()))
                 else:
                     # Create a nested list with the child memory
                     if multiprocess:
                         mem_usage = [mem_usage]
-                        for chldmem in _get_child_memory(proc.pid):
-                            mem_usage.append(chldmem)
+                        for chldpid, chldmem in _get_child_memory(proc.pid):
+                            mem_usage.append((chldpid,chldmem))
 
                     # Append the memory usage to the return value
                     ret.append(mem_usage)
@@ -399,14 +399,14 @@ def memory_usage(proc=-1, interval=.1, timeout=None, timestamps=False,
 
                     # Write children to the stream file
                     if multiprocess:
-                        for idx, chldmem in enumerate(_get_child_memory(proc.pid)):
-                            stream.write("CHLD {0} {1:.6f} {2:.4f}\n".format(idx, chldmem, time.time()))
+                        for child_pid, chldmem in _get_child_memory(proc):
+                            stream.write("CHLD {0} {1:.6f} {2:.4f}\n".format(child_pid, chldmem, time.time()))
                 else:
                     # Create a nested list with the child memory
                     if multiprocess:
                         mem_usage = [mem_usage]
-                        for chldmem in _get_child_memory(proc.pid):
-                            mem_usage.append(chldmem)
+                        for chldpid, chldmem in _get_child_memory(proc):
+                            mem_usage.append((chldpid,chldmem))
 
                     # Append the memory usage to the return value
                     ret.append(mem_usage)
@@ -1207,3 +1207,134 @@ def flush(self):
             prof.show_results(stream=out_file)
         else:
             show_results(prof, precision=options.precision, stream=out_file)
+
+
+### I/O
+
+def read_mprofile_file(filename):
+    """Read an mprofile file and return its content.
+
+    Returns
+    =======
+    content: dict
+        Keys:
+
+        - "mem_usage": (list) memory usage values, in MiB
+        - "timestamp": (list) time instant for each memory usage value, in
+            second
+        - "func_timestamp": (dict) for each function, timestamps and memory
+            usage upon entering and exiting.
+        - 'cmd_line': (str) command-line ran for this profile.
+    """
+    func_ts = {}
+    mem_usage = []
+    timestamp = []
+    children  = defaultdict(list)
+    cmd_line = None
+    f = open(filename, "r")
+    for l in f:
+        if l == '\n':
+            raise ValueError('Sampling time was too short')
+        field, value = l.split(' ', 1)
+        if field == "MEM":
+            # mem, timestamp
+            values = value.split(' ')
+            mem_usage.append(float(values[0]))
+            timestamp.append(float(values[1]))
+
+        elif field == "FUNC":
+            values = value.split(' ')
+            f_name, mem_start, start, mem_end, end = values[:5]
+            ts = func_ts.get(f_name, [])
+            ts.append([float(start), float(end),
+                       float(mem_start), float(mem_end)])
+            func_ts[f_name] = ts
+
+        elif field == "CHLD":
+            values = value.split(' ')
+            chldnum = values[0]
+            children[chldnum].append(
+                (float(values[1]), float(values[2]))
+            )
+
+        elif field == "CMDLINE":
+            cmd_line = value
+        else:
+            pass
+    f.close()
+
+    return {"mem_usage": mem_usage, "timestamp": timestamp,
+            "func_timestamp": func_ts, 'filename': filename,
+            'cmd_line': cmd_line, 'children': children}
+
+
+def read_mprofile_file_multiprocess(filename):
+    """Read an mprofile file and return a mem_usage list
+
+    Returns
+    =======
+    content: list
+
+    This is analogous to the list obtained when the `memory_usage` is used
+    """
+
+    mem_usage = []
+    sample = []
+    
+    f = open(filename,'r')
+    
+    for i,l in enumerate(f):
+        if l == '\n':
+            raise ValueError('Sampling time was too short')
+        field, value = l.split(' ', 1)
+        values = value.split(' ')
+              
+        if field=="MEM":
+            # append the existing sample and reset to zero
+            mem_usage.append(sample)
+            sample = []
+            sample.append((float(values[0]), float(values[1])))
+        elif field=="CHLD": 
+            sample.append((int(values[0]), float(values[1])))
+          
+    f.close()
+    return mem_usage[1:]
+
+
+def convert_mem_usage_to_df(filename, is_pickle=False): 
+    """Convert a `mem_usage` list to a `pandas.DataFrame`
+
+    Returns
+    =======
+    content: pandas.DataFrame
+
+    Returns a `pandas.DataFrame` with child IDs as columns and the timestamp as an index
+    """
+
+    import pandas as pd
+    
+    if is_pickle:
+        from cPickle import load
+        with open(filename) as f: 
+            mem_usage = load(f)
+    
+    else:
+        mem_usage = read_mprofile_file_multiprocess(filename)
+        mem_usage = filter(lambda m: len(m) > 1, mem_usage)
+    
+    times =[m[0][1] for m in  mem_usage]
+    pids = np.sort(np.unique([m[0] for n in mem_usage for m in n[1:] if not isinstance(m,float)]))
+    
+    time_lookup = {time: i for i,time in enumerate(times)}
+    pid_lookup = {pid:i for i,pid in enumerate(pids)}
+    
+    data = np.zeros((len(times), len(pids)))
+    
+    for i,m in enumerate(mem_usage):
+        t = m[0][1]
+        try: 
+            for pid,mem in m[1:]:
+                data[time_lookup[t]][pid_lookup[pid]] = mem
+        except TypeError:
+            print 'found a bad value in ', i
+    return pd.DataFrame(data, index=times, columns=pids)
diff --git a/mprof b/mprof
index 31811cb..405d445 100755
--- a/mprof
+++ b/mprof
@@ -199,11 +199,17 @@ def run_action():
     parser.add_option("--multiprocess", "-M", dest="multiprocess",
                       default=False, action="store_true",
                       help="""Monitors forked processes creating individual plots for each child""")
+    parser.add_option("--pid", "-p", dest="pid",
+                      default=False, action="store_true", 
+                      help="""Monitor an existing process given by PID""")
+    parser.add_option("--timeout", dest="timeout", 
+                      default=None, action="store", type=int,
+                      help="""Timeout in seconds""")
 
     (options, args) = parser.parse_args()
 
-    if len(args) == 0:
-        print("A program to run must be provided. Use -h for help")
+    if (len(args) == 0) and not options.pid:
+        print("A program to run or a pid must be provided. Use -h for help")
         sys.exit(1)
 
     print("{1}: Sampling memory every {0.interval}s".format(
@@ -218,30 +224,36 @@ def run_action():
     mprofile_output = "mprofile_%s.dat" % suffix
 
     # .. TODO: more than one script as argument ? ..
-    if args[0].endswith('.py') and not options.nopython:
-        if not args[0].startswith("python"):
-            args.insert(0, "python")
-        if options.multiprocess:
-            # in multiprocessing mode you want to spawn a separate
-            # python process
-            options.python = False
-    if options.python:
-        print("running as a Python program...")
-        if not args[0].startswith("python"):
-            args.insert(0, "python")
-        cmd_line = get_cmd_line(args)
-        args[1:1] = ("-m", "memory_profiler", "--timestamp",
-                     "-o", mprofile_output)
-        p = subprocess.Popen(args)
+    if not options.pid:
+        if args[0].endswith('.py') and not options.nopython:
+            if not args[0].startswith("python"):
+                args.insert(0, "python")
+            if options.multiprocess:
+                # in multiprocessing mode you want to spawn a separate
+                # python process
+                options.python = False
+        if options.python:
+            print("running as a Python program...")
+            if not args[0].startswith("python"):
+                args.insert(0, "python")
+            cmd_line = get_cmd_line(args)
+            args[1:1] = ("-m", "memory_profiler", "--timestamp",
+                         "-o", mprofile_output)
+            p = subprocess.Popen(args)
+        else:
+            cmd_line = get_cmd_line(args)
+            p = subprocess.Popen(args)
     else:
-        cmd_line = get_cmd_line(args)
-        p = subprocess.Popen(args)
+        p = int(args[0])
 
     with open(mprofile_output, "a") as f:
-        f.write("CMDLINE {0}\n".format(cmd_line))
+        if not options.pid:
+            f.write("CMDLINE {0}\n".format(cmd_line))
+
         mp.memory_usage(proc=p, interval=options.interval, timestamps=True,
                         include_children=options.include_children,
-                        multiprocess=options.multiprocess, stream=f)
+                        multiprocess=options.multiprocess, stream=f,
+                        timeout=options.timeout)
 
 
 def add_brackets(xloc, yloc, xshift=0, color="r", label=None, options=None):
@@ -291,61 +303,6 @@ def add_brackets(xloc, yloc, xshift=0, color="r", label=None, options=None):
         ## pl.plot(xloc[1], yloc[1], ">"+color, markersize=7)
 
 
-def read_mprofile_file(filename):
-    """Read an mprofile file and return its content.
-
-    Returns
-    =======
-    content: dict
-        Keys:
-
-        - "mem_usage": (list) memory usage values, in MiB
-        - "timestamp": (list) time instant for each memory usage value, in
-            second
-        - "func_timestamp": (dict) for each function, timestamps and memory
-            usage upon entering and exiting.
-        - 'cmd_line': (str) command-line ran for this profile.
-    """
-    func_ts = {}
-    mem_usage = []
-    timestamp = []
-    children  = defaultdict(list)
-    cmd_line = None
-    f = open(filename, "r")
-    for l in f:
-        if l == '\n':
-            raise ValueError('Sampling time was too short')
-        field, value = l.split(' ', 1)
-        if field == "MEM":
-            # mem, timestamp
-            values = value.split(' ')
-            mem_usage.append(float(values[0]))
-            timestamp.append(float(values[1]))
-
-        elif field == "FUNC":
-            values = value.split(' ')
-            f_name, mem_start, start, mem_end, end = values[:5]
-            ts = func_ts.get(f_name, [])
-            ts.append([float(start), float(end),
-                       float(mem_start), float(mem_end)])
-            func_ts[f_name] = ts
-
-        elif field == "CHLD":
-            values = value.split(' ')
-            chldnum = values[0]
-            children[chldnum].append(
-                (float(values[1]), float(values[2]))
-            )
-
-        elif field == "CMDLINE":
-            cmd_line = value
-        else:
-            pass
-    f.close()
-
-    return {"mem_usage": mem_usage, "timestamp": timestamp,
-            "func_timestamp": func_ts, 'filename': filename,
-            'cmd_line': cmd_line, 'children': children}
 
 
 def plot_file(filename, index=0, timestamps=True, children=True, options=None):
@@ -355,7 +312,7 @@ def plot_file(filename, index=0, timestamps=True, children=True, options=None):
         print("matplotlib is needed for plotting.")
         sys.exit(1)
     import numpy as np  # pylab requires numpy anyway
-    mprofile = read_mprofile_file(filename)
+    mprofile = mp.read_mprofile_file(filename)
 
     if len(mprofile['timestamp']) == 0:
         print('** No memory usage values have been found in the profile '
@@ -413,7 +370,8 @@ def plot_file(filename, index=0, timestamps=True, children=True, options=None):
             cmem = np.asarray([item[0] for item in data])
 
             # Plot the line to the figure
-            pl.plot(cts, cmem, "+-"  + mem_line_colors[idx+1 % len(mem_line_colors)],
+            print (idx+1) % len(mem_line_colors)
+            pl.plot(cts, cmem, "+-"  + mem_line_colors[(idx+1) % len(mem_line_colors)],
                      label="child {}".format(proc))
 
             # Detect the maximal child memory point

From ae2a5eb24bd5f92a9e29e46c61027bc4796e58ae Mon Sep 17 00:00:00 2001
From: Rok Roskar <roskar@physik.uzh.ch>
Date: Thu, 6 Apr 2017 15:37:03 +0200
Subject: [PATCH 2/6] update docstring

---
 memory_profiler.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/memory_profiler.py b/memory_profiler.py
index 0162881..4f5ce37 100644
--- a/memory_profiler.py
+++ b/memory_profiler.py
@@ -1215,7 +1215,7 @@ def read_mprofile_file(filename):
     """Read an mprofile file and return its content.
 
     Returns
-    =======
+    -------
     content: dict
         Keys:
 
@@ -1272,7 +1272,7 @@ def read_mprofile_file_multiprocess(filename):
     """Read an mprofile file and return a mem_usage list
 
     Returns
-    =======
+    -------
     content: list
 
     This is analogous to the list obtained when the `memory_usage` is used
@@ -1304,8 +1304,17 @@ def read_mprofile_file_multiprocess(filename):
 def convert_mem_usage_to_df(filename, is_pickle=False): 
     """Convert a `mem_usage` list to a `pandas.DataFrame`
 
+    Parameters
+    ----------
+    filename: path to the memory profile data; can be either a file
+        created by mprof or a pickle of the result of `memory_usage`
+
+    is_pickle: if True, assume the data is the pickled list 
+        returned by `memory_usage`
+
+    
     Returns
-    =======
+    -------
     content: pandas.DataFrame
 
     Returns a `pandas.DataFrame` with child IDs as columns and the timestamp as an index

From 7f748c272fdd33f6e45b1eb78aadce95fb80afaa Mon Sep 17 00:00:00 2001
From: Rok Roskar <roskar@physik.uzh.ch>
Date: Thu, 6 Apr 2017 15:40:14 +0200
Subject: [PATCH 3/6] fix help string

---
 mprof | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mprof b/mprof
index 405d445..0886118 100755
--- a/mprof
+++ b/mprof
@@ -201,14 +201,14 @@ def run_action():
                       help="""Monitors forked processes creating individual plots for each child""")
     parser.add_option("--pid", "-p", dest="pid",
                       default=False, action="store_true", 
-                      help="""Monitor an existing process given by PID""")
+                      help="""Specify that the argument is a running pid not an executable or script""")
     parser.add_option("--timeout", dest="timeout", 
                       default=None, action="store", type=int,
                       help="""Timeout in seconds""")
 
     (options, args) = parser.parse_args()
 
-    if (len(args) == 0) and not options.pid:
+    if (len(args) == 0):
         print("A program to run or a pid must be provided. Use -h for help")
         sys.exit(1)
 

From ff3b22dff01df0196acc89e3658a32bb005bbbac Mon Sep 17 00:00:00 2001
From: Rok Roskar <roskar@physik.uzh.ch>
Date: Thu, 6 Apr 2017 16:08:15 +0200
Subject: [PATCH 4/6] oops, removing debug print

---
 memory_profiler.py | 1 +
 mprof              | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/memory_profiler.py b/memory_profiler.py
index 4f5ce37..33d0eac 100644
--- a/memory_profiler.py
+++ b/memory_profiler.py
@@ -1346,4 +1346,5 @@ def convert_mem_usage_to_df(filename, is_pickle=False):
                 data[time_lookup[t]][pid_lookup[pid]] = mem
         except TypeError:
             print 'found a bad value in ', i
+            
     return pd.DataFrame(data, index=times, columns=pids)
diff --git a/mprof b/mprof
index 0886118..26653be 100755
--- a/mprof
+++ b/mprof
@@ -370,7 +370,6 @@ def plot_file(filename, index=0, timestamps=True, children=True, options=None):
             cmem = np.asarray([item[0] for item in data])
 
             # Plot the line to the figure
-            print (idx+1) % len(mem_line_colors)
             pl.plot(cts, cmem, "+-"  + mem_line_colors[(idx+1) % len(mem_line_colors)],
                      label="child {}".format(proc))
 

From c75081a1d35af2c72d28e57a300880d3c04bffe9 Mon Sep 17 00:00:00 2001
From: Rok Roskar <roskar@physik.uzh.ch>
Date: Thu, 6 Apr 2017 17:36:40 +0200
Subject: [PATCH 5/6] removed try/except, not needed

---
 memory_profiler.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/memory_profiler.py b/memory_profiler.py
index 33d0eac..a534079 100644
--- a/memory_profiler.py
+++ b/memory_profiler.py
@@ -1341,10 +1341,7 @@ def convert_mem_usage_to_df(filename, is_pickle=False):
     
     for i,m in enumerate(mem_usage):
         t = m[0][1]
-        try: 
-            for pid,mem in m[1:]:
-                data[time_lookup[t]][pid_lookup[pid]] = mem
-        except TypeError:
-            print 'found a bad value in ', i
-            
+        for pid,mem in m[1:]:
+            data[time_lookup[t]][pid_lookup[pid]] = mem
+
     return pd.DataFrame(data, index=times, columns=pids)

From b5c87e48752e92a3a06607718ecf4830c592656b Mon Sep 17 00:00:00 2001
From: Rok Roskar <roskar@physik.uzh.ch>
Date: Tue, 11 Apr 2017 14:54:50 +0200
Subject: [PATCH 6/6] added parent process to dataframe; add df plot function

---
 memory_profiler.py | 33 ++++++++++++++++++++++++++++++---
 1 file changed, 30 insertions(+), 3 deletions(-)

diff --git a/memory_profiler.py b/memory_profiler.py
index a534079..0ef4e82 100644
--- a/memory_profiler.py
+++ b/memory_profiler.py
@@ -1320,8 +1320,12 @@ def convert_mem_usage_to_df(filename, is_pickle=False):
     Returns a `pandas.DataFrame` with child IDs as columns and the timestamp as an index
     """
 
-    import pandas as pd
-    
+    try: 
+        import pandas as pd
+        import numpy as np
+    except ImportError:
+        raise ImportError('Pandas and numpy are required for conversion to DataFrame')
+
     if is_pickle:
         from cPickle import load
         with open(filename) as f: 
@@ -1332,7 +1336,9 @@ def convert_mem_usage_to_df(filename, is_pickle=False):
         mem_usage = filter(lambda m: len(m) > 1, mem_usage)
     
     times =[m[0][1] for m in  mem_usage]
-    pids = np.sort(np.unique([m[0] for n in mem_usage for m in n[1:] if not isinstance(m,float)]))
+
+    # flatten list of lists, extract the pids and attach '0' (parent) at the end 
+    pids = np.sort(np.unique([m[0] for n in mem_usage for m in n[1:] if not isinstance(m,float)] + [0,]))
     
     time_lookup = {time: i for i,time in enumerate(times)}
     pid_lookup = {pid:i for i,pid in enumerate(pids)}
@@ -1341,7 +1347,28 @@ def convert_mem_usage_to_df(filename, is_pickle=False):
     
     for i,m in enumerate(mem_usage):
         t = m[0][1]
+
+        # add the parent memory by hand
+        data[time_lookup[t]][pid_lookup[0]] = m[0][0]
+        
         for pid,mem in m[1:]:
             data[time_lookup[t]][pid_lookup[pid]] = mem
 
     return pd.DataFrame(data, index=times, columns=pids)
+
+def plot_mem_usage(filename, include_parent=True, plot_total=True, is_pickle=False):
+    import matplotlib.pylab as plt
+
+    data_df = convert_mem_usage_to_df(filename, is_pickle)
+    
+    f = plt.figure(figsize=(10,6))
+    
+    if not include_parent:
+        data_df = data_df[data_df.columns[1:]]
+        
+    data_df.plot(legend=False, figsize=(14,10), grid=True, fontsize=14)
+    
+    if plot_total: 
+        data_df.sum(axis=1).plot(style='--', grid=True)
+
+    plt.xlabel('timestamp'); plt.ylabel('memory usage in MB')
\ No newline at end of file