Hc updates (#84)

* remove the HC checking of messages received
* create message monitor service to log any UAT traffic issues
* updates
* add needed s6 stuff
* fix typo/bug
* bug fix
* add restart_when_stale and other updates
* make things a bit more verbose

---------

Co-authored-by: kx1t <[email protected]>
sdr-enthusiasts · Nov 9, 2023 · 223d255 · 223d255
1 parent a04774f
commit 223d255
Show file tree

Hide file tree

Showing 13 changed files with 71 additions and 45 deletions.
diff --git a/README.md b/README.md
@@ -275,8 +275,10 @@ You should now be feeding ADSB-ES & UAT to the "new" aggregators, FlightAware, a
 | Variable | Description                                                                                                                                 | Default |
 | -------- | ------------------------------------------------------------------------------------------------------------------------------------------- | ------- |
 | `TZ`     | Local timezone in ["TZ database name" format](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones).                                | `UTC`   |
-| `LAT`    | Latitude of your receiver. Only required if you want range statistics for InfluxDB or Prometheus, or if you are using the autogain script.  | Unset   |
-| `LON`    | Longitude of your receiver. Only required if you want range statistics for InfluxDB or Prometheus, or if you are using the autogain script. | Unset   |
+| `LAT`    | Latitude of your receiver. Only required if you want range statistics for InfluxDB, Prometheus, or tar1090/ultrafeeder graphs. | Unset   |
+| `LON`    | Longitude of your receiver. Only required if you want range statistics for InfluxDB, Prometheus, or tar1090/ultrafeeder graphs. | Unset   |
+| `DUMP978_MSG_MONITOR_INTERVAL` | Interval between runs of the Message Monitor, which checks whether new messages are being received. The value can be anything accepted by the Linux `sleep` command (for example `30m`) | Unset (30 minutes) |
+| `DUMP978_MSG_MONITOR_RESTART_WHEN_STALE` | If set to `true`/`on`/`yes`/`1`, the receiver process is restarted when no messages are received during the monitoring interval | Unset (`false`) |
 
 ### `dump978-fa` General Options
 
@@ -350,6 +352,7 @@ These variables control the autogain system (explained further below). These sho
 | `DUMP978_AUTOGAIN_HIGH_PCT` | If the percentage of "strong signals" (>3dB) over a measuring period is more than this parameter, the gain will be decreased by 1 position | `6.0` (6.0%) |
 | `READSB_AUTOGAIN_MIN_SAMPLES` | Minimum number of received samples for autogain to be able to consider adjusting the gain | `1000` |
 | `READSB_AUTOGAIN_USE_RAW` |  If set to `true`/`on`/`yes`/`1`, the autogain function will use the "raw" message count rather than the "accepted" message count. | `true` |
+| `SUBSEQUENT_INTERVAL_MINIMUM_COMPLETION_PCT` | Minimum percentage of the `DUMP978_AUTOGAIN_SUBSEQUENT_INTERVAL` time that needs to be completed before autogain will use the collected data during the subsequent/long-term process. Must be a whole number without decimals | `50` |
 
 ## Autogain system
 

diff --git a/rootfs/etc/s6-overlay/s6-rc.d/message-monitor/dependencies.d/01-timezone b/rootfs/etc/s6-overlay/s6-rc.d/message-monitor/dependencies.d/01-timezone
diff --git a/rootfs/etc/s6-overlay/s6-rc.d/message-monitor/dependencies.d/02-sanity-check b/rootfs/etc/s6-overlay/s6-rc.d/message-monitor/dependencies.d/02-sanity-check
diff --git a/rootfs/etc/s6-overlay/s6-rc.d/message-monitor/dependencies.d/04-telegraf b/rootfs/etc/s6-overlay/s6-rc.d/message-monitor/dependencies.d/04-telegraf
diff --git a/rootfs/etc/s6-overlay/s6-rc.d/message-monitor/dependencies.d/05-rtlsdr-biastee b/rootfs/etc/s6-overlay/s6-rc.d/message-monitor/dependencies.d/05-rtlsdr-biastee
diff --git a/rootfs/etc/s6-overlay/s6-rc.d/message-monitor/dependencies.d/dump978 b/rootfs/etc/s6-overlay/s6-rc.d/message-monitor/dependencies.d/dump978
diff --git a/rootfs/etc/s6-overlay/s6-rc.d/message-monitor/dependencies.d/skyaware978 b/rootfs/etc/s6-overlay/s6-rc.d/message-monitor/dependencies.d/skyaware978
diff --git a/rootfs/etc/s6-overlay/s6-rc.d/message-monitor/run b/rootfs/etc/s6-overlay/s6-rc.d/message-monitor/run
@@ -0,0 +1,2 @@
+#!/bin/sh
+exec /etc/s6-overlay/scripts/message-monitor
diff --git a/rootfs/etc/s6-overlay/s6-rc.d/message-monitor/type b/rootfs/etc/s6-overlay/s6-rc.d/message-monitor/type
@@ -0,0 +1 @@
+longrun
diff --git a/rootfs/etc/s6-overlay/s6-rc.d/user/contents.d/message-monitor b/rootfs/etc/s6-overlay/s6-rc.d/user/contents.d/message-monitor
diff --git a/rootfs/etc/s6-overlay/scripts/autogain b/rootfs/etc/s6-overlay/scripts/autogain
@@ -25,7 +25,7 @@ READSB_AUTOGAIN_STRONGSIGNAL_LIMIT="${DUMP978_AUTOGAIN_STRONGSIGNAL_LIMIT:-${REA
 READSB_AUTOGAIN_USE_RAW="${DUMP978_AUTOGAIN_USE_RAW:-${READSB_AUTOGAIN_USE_RAW:-true}}"
 READSB_AUTOGAIN_MIN_SAMPLES="${DUMP978_AUTOGAIN_MIN_SAMPLES:-${READSB_AUTOGAIN_MIN_SAMPLES:-1000}}"
 
-SUBSEQUENT_INTERVAL_MINIMUM_COMPLETION_PCT=50   # must be a whole number without decimals)
+SUBSEQUENT_INTERVAL_MINIMUM_COMPLETION_PCT="${SUBSEQUENT_INTERVAL_MINIMUM_COMPLETION_PCT:-50}"   # must be a whole number without decimals)
 
 function collect_gain_values() {
     # reads RAW messages for $1 seconds and returns the percentage strong messages with 1 decimal precision
@@ -169,7 +169,7 @@ if [[ ! -f /var/globe_history/autogain/autogain_initialized ]]; then
         # write back the current run number:
         echo "$i" > /var/globe_history/autogain/init_runs_count
         # sleep a little bit so dump978 is again providing data 
-        sleep 15
+        sleep 15 & wait $!
 
     done
     touch /var/globe_history/autogain/autogain_initialized
@@ -196,6 +196,6 @@ do
         # data collection exited early. Wait a bit and restart
         s6wrap --quiet --prepend="$(basename "$0")" --timestamps --args echo "[ERROR] Data collection for autogain long-term run of $(( READSB_AUTOGAIN_SUBSEQUENT_INTERVAL / 60 )) minutes completed was terminated early (after $(( (endtime - starttime)/60 )) minutes)"
         s6wrap --quiet --prepend="$(basename "$0")" --timestamps --args echo "[ERROR] Since this is less than ${SUBSEQUENT_INTERVAL_MINIMUM_COMPLETION_PCT}% of the collection period, we discard it and start collecting again"
-        sleep 15 & wait !
+        sleep 15 & wait $!
     fi
 done
diff --git a/rootfs/etc/s6-overlay/scripts/message-monitor b/rootfs/etc/s6-overlay/scripts/message-monitor
@@ -0,0 +1,71 @@
+#!/command/with-contenv bash
+#shellcheck shell=bash disable=SC1091
+
+# Message Monitor: endless loop that compares the message counter found in
+# /run/skyaware978/aircraft.json with the value saved by the previous pass in
+# /run/stats/msgs_since_last_monitor_run, and logs whether new messages arrived.
+#
+# Environment:
+#   DUMP978_MSG_MONITOR_INTERVAL            pause between passes, handed to `sleep` (default: 30m)
+#   DUMP978_MSG_MONITOR_RESTART_WHEN_STALE  if enabled according to chk_enabled, dump978 is restarted when stale
+
+source /scripts/common
+mkdir -p /run/stats
+s6wrap=(s6wrap --quiet --prepend="$(basename "$0")" --timestamps --args)
+
+while :
+do
+    # Make sure we're receiving messages from the SDR
+    # get the number of messages received since process start:
+    if [[ -f /run/skyaware978/aircraft.json ]]; then
+        read -r new_msg_count <<< "$(jq .messages /run/skyaware978/aircraft.json 2>/dev/null)"
+    else
+        new_msg_count="STARTING"
+    fi
+
+    # get the number of messages previously read, or 0 if there's no history:
+    if [[ -f /run/stats/msgs_since_last_monitor_run ]]; then
+        read -r old_msg_count < /run/stats/msgs_since_last_monitor_run
+        secs_since_last_check="$(( $(date +%s) - $(stat -c '%Y' /run/stats/msgs_since_last_monitor_run) ))"
+    else
+        old_msg_count=0
+        secs_since_last_check="$(( $(date +%s) - $(stat -c '%Y' /run/service/skyaware978) ))"    # use skyaware978 modify time as the creation time of the container
+    fi
+    # an empty or damaged history file is treated as "no history"
+    if [[ ! "$old_msg_count" =~ ^[0-9]+$ ]]; then
+        old_msg_count=0
+    fi
+
+    if [[ "$new_msg_count" == "STARTING" ]]; then
+        "${s6wrap[@]}" echo "[STARTING] Receiver starting: No messages have been received as the container is still starting"
+        new_msg_count=0
+    elif [[ ! "$new_msg_count" =~ ^[0-9]+$ ]]; then
+        # jq printed nothing or a non-number (its stderr is discarded above). Inside (( )) such a
+        # value would silently evaluate to 0 and be reported as a stale receiver, possibly
+        # triggering a restart; skip this pass instead and keep the stored history untouched.
+        "${s6wrap[@]}" echo "[WARNING] Could not read a valid message count from /run/skyaware978/aircraft.json; skipping this check"
+    else
+        # if new_msg_count < old_msg_count, dump978 must have restarted since the previous run of this script
+        # in that case, assume that old_msg_count=0
+        if (( new_msg_count < old_msg_count )); then
+            old_msg_count=0
+        fi
+
+        if (( new_msg_count == old_msg_count )); then
+            "${s6wrap[@]}" echo "[WARNING] Receiver appears stale: No messages received since last run of the Messages Monitor ($secs_since_last_check secs ago)"
+            if chk_enabled "$DUMP978_MSG_MONITOR_RESTART_WHEN_STALE"; then
+                "${s6wrap[@]}" echo "[WARNING]                         Restarting the dump978 service..."
+                s6-svc -r /run/service/dump978 2>/dev/null || true
+            fi
+        else
+            "${s6wrap[@]}" echo "[INFO] Receiver is OK: $(( new_msg_count - old_msg_count )) messages received since last run of the Messages Monitor ($secs_since_last_check secs ago)"
+        fi
+    fi
+
+    # only persist counts that are real numbers, so the next pass compares against a usable value
+    if [[ "$new_msg_count" =~ ^[0-9]+$ ]]; then
+        echo "$new_msg_count" > /run/stats/msgs_since_last_monitor_run
+    fi
+
+    sleep "${DUMP978_MSG_MONITOR_INTERVAL:-30m}" & wait $!
+done
diff --git a/rootfs/scripts/healthcheck.sh b/rootfs/scripts/healthcheck.sh
@@ -46,51 +46,21 @@ else
     echo "[$(date)][HEALTHY] socat/uat2esnt listening on port 37981"
 fi
 
-# Make sure we're receiving messages from the SDR
-# get the number of messages received since process start:
-mkdir -p /run/stats
-if [[ -f /run/skyaware978/aircraft.json ]]; then
-    read -r new_msg_count <<< "$(jq .messages /run/skyaware978/aircraft.json 2>/dev/null)"
-else
-    new_msg_count="STARTING"
-fi
-# get the number of messages previously read, or 0 if there's no history:
-if [[ -f /run/stats/msgs_since_last_healthcheck ]]; then
-    read -r old_msg_count < /run/stats/msgs_since_last_healthcheck
-    secs_since_last_check="$(( $(date +%s) - $(stat -c '%Y' /run/stats/msgs_since_last_healthcheck) ))"
-else
-    old_msg_count=0
-    secs_since_last_check="$(( $(date +%s) - $(stat -c '%Y' /run/service/skyaware978) ))"    # use skyaware978 modify time as the creation time of the container
-fi
-# Take conclusitions
-if [[ "$new_msg_count" == "STARTING" ]]; then
-    echo "[$(date)][STARTING] No messages have been received as the container is still starting"
-    new_msg_count=0
-elif (( new_msg_count < old_msg_count )) || (( old_msg_count == 0 && new_msg_count > 0 )); then
-    echo "[$(date)][HEALTHY] $new_msg_count messages received since start of the SkyAware978 service ($secs_since_last_check secs ago)"
-elif (( new_msg_count > old_msg_count )); then
-    echo "[$(date)][HEALTHY] $(( new_msg_count - old_msg_count )) messages received since last HealthCheck ($secs_since_last_check secs ago)"
-elif (( new_msg_count == old_msg_count )); then
-    echo "[$(date)][UNHEALTHY] No messages received since last HealthCheck ($secs_since_last_check secs ago)"
-    EXITCODE=1
-else
-    echo "[$(date)][ERROR] This situation cannot occur; new_msg_count=$new_msg_count; old_msg_count=$old_msg_count"
-fi
-echo "$new_msg_count" > /run/stats/msgs_since_last_healthcheck
-
 ##### Service Death Counts #####
 # shellcheck disable=SC2046,SC2207
 services=($(basename -a $(find /run/service/ -maxdepth 1 -type l)))
 # For each service...
 for service in "${services[@]}"; do
-    abnormal_deaths="$(s6-svdt -s "/run/service/$service" | awk '/exitcode/ && !/exitcode 0/' | wc -l)"
-    if (( abnormal_deaths > 0 )); then
-        echo "[$(date)][UNHEALTHY] abnormal death count for service $service is $abnormal_deaths"
-        EXITCODE=1
-        # Reset service death counts
-        s6-svdt-clear "/run/service/$service"
-    else
-        echo "[$(date)][HEALTHY] no abnormal death count for service $service"
+    if [[ "${service:0:5}" != "s6rc-" ]]; then 
+        abnormal_deaths="$(s6-svdt -s "/run/service/$service" | awk '/exitcode/ && !/exitcode 0/' | wc -l)"
+        if (( abnormal_deaths > 0 )); then
+            echo "[$(date)][UNHEALTHY] abnormal death count for service $service is $abnormal_deaths"
+            EXITCODE=1
+            # Reset service death counts
+            s6-svdt-clear "/run/service/$service"
+        else
+            echo "[$(date)][HEALTHY] no abnormal death count for service $service"
+        fi
     fi
 done
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		#!/bin/sh
		exec /etc/s6-overlay/scripts/message-monitor