From dfa7b679ee12cecd3b3150cf8b7a7e20b0d5ee97 Mon Sep 17 00:00:00 2001
From: Anthony Molinaro <anthonym@alumni.caltech.edu>
Date: Fri, 3 Feb 2017 23:35:07 +0000
Subject: [PATCH] Vmstats should only be sent for full intervals

It was possible to see spikes or dips on restart of a service running
mondemand: the initial sample was used as the baseline, so if the
trigger to send vmstats fired shortly after startup, the emitted values
covered maybe 5 seconds of startup stats rather than a full interval.
Instead, skip sending stats the first time and only start sending on
the second time.  This will leave a gap in graphs on restart, but it
means you don't end up with spikes which dwarf the surrounding samples.
---
 ChangeLog                 | 10 ++++++++--
 src/mondemand.app.src     |  2 +-
 src/mondemand.erl         |  2 ++
 src/mondemand_vmstats.erl | 17 ++++++++++++++---
 4 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index eaaf6d1..1f9552e 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,11 @@
-Verison 6.6.0 (molinaro)
-  * add scheduler utilization
+Version 6.6.1 (molinaro)
+  * make sure that mondemand vmstats metrics are not emitted without a full
+  interval having gone by.  This means a restart of a service can lead to
+  a gap in vmstats, but that's better than the spikes which can happen
+  without waiting (IMHO).
+
+Version 6.6.0 (molinaro)
+  * add scheduler utilization to vmstats
 
 Version 6.5.0 (molinaro)
   * add a call to get vmstats out from the vmstats sampler
diff --git a/src/mondemand.app.src b/src/mondemand.app.src
index 3bc7267..1e8dd0a 100644
--- a/src/mondemand.app.src
+++ b/src/mondemand.app.src
@@ -1,7 +1,7 @@
 { application, mondemand,
   [
    { description, "Erlang Mondemand Bindings." },
-   { vsn, "6.6.0" },
+   { vsn, "6.6.1" },
    { modules, [] },
    { registered, [mondemand,mondemand_sup]},
    { applications, [kernel,stdlib,syntax_tools,lwes,inets]},
diff --git a/src/mondemand.erl b/src/mondemand.erl
index 4bccbf1..01ab528 100644
--- a/src/mondemand.erl
+++ b/src/mondemand.erl
@@ -293,6 +293,8 @@ send_annotation (Id, Time, Description, Text, Tags, Context) ->
     ),
   send_event (Event).
 
+send_stats (_, _, []) ->
+  ok;
 send_stats (ProgId, Context, Stats) ->
   Event =
     mondemand_statsmsg:to_lwes (
diff --git a/src/mondemand_vmstats.erl b/src/mondemand_vmstats.erl
index 89d0c19..16cf928 100644
--- a/src/mondemand_vmstats.erl
+++ b/src/mondemand_vmstats.erl
@@ -35,7 +35,7 @@
 -record (state, {samples = queue:new(),
                  max_samples = 300,  % 5 minutes of sampled data
                  legacy = false,     % old otp workarounds
-                 previous_mondemand,
+                 previous_mondemand = undefined,
                  timer,
                  scheduler_former_flag,% keep track of previous scheduler
                                        % stats flag for shutdown
@@ -146,7 +146,6 @@ init([]) ->
   % keep the initial sample as both the previous mondemand value and put
   % it into the queue
   { ok, #state { samples = InitialQueue,
-                 previous_mondemand = InitialSample,
                  timer = TRef,
                  legacy = Legacy,
                  collect_scheduler_stats = CollectSchedulerStats,
@@ -167,7 +166,19 @@ handle_call (to_mondemand, _From,
                               previous_mondemand = Prev }) ->
   % queue should always have something in it
   {value, LastSample} = queue:peek_r (Queue),
-  Stats = to_mondemand (Prev, LastSample),
+  Stats =
+    case Prev =:= undefined of
+      true ->
+        % we skip the first send of data to mondemand, as we have no way
+        % to really ensure the normal duration between sends to mondemand
+        % has elapsed, if it hasn't elapsed we might be emitting to mondemand
+        % shortly after restart and would see some spikiness in any counters
+        % (as they are turned into gauges with the assumption calls to 
+        % to_mondemand/0 are happening on a regular interval).
+        [];
+      false ->
+        to_mondemand (Prev, LastSample)
+    end,
   {reply, Stats, State#state { previous_mondemand = LastSample } };
 handle_call (_Request, _From, State = #state { }) ->
   {reply, ok, State }.