diff --git a/README.md b/README.md
index 6bcbd4d..8744eb8 100644
--- a/README.md
+++ b/README.md
@@ -275,8 +275,10 @@ You should now be feeding ADSB-ES & UAT to the "new" aggregators, FlightAware, a
 | Variable | Description | Default |
 | -------- | ------------------------------------------------------------------------------------------------------------------------------------------- | ------- |
 | `TZ` | Local timezone in ["TZ database name" format](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones). | `UTC` |
-| `LAT` | Latitude of your receiver. Only required if you want range statistics for InfluxDB or Prometheus, or if you are using the autogain script. | Unset |
-| `LON` | Longitude of your receiver. Only required if you want range statistics for InfluxDB or Prometheus, or if you are using the autogain script. | Unset |
+| `LAT` | Latitude of your receiver. Only required if you want range statistics for InfluxDB, Prometheus, or tar1090/ultrafeeder graphs. | Unset |
+| `LON` | Longitude of your receiver. Only required if you want range statistics for InfluxDB, Prometheus, or tar1090/ultrafeeder graphs. | Unset |
+| `DUMP978_MSG_MONITOR_INTERVAL` | Interval between runs of the Message Monitor that checks if new messages are received. Format of value is anything that is accepted by the Linux `sleep` command | Unset (30 minutes) |
+| `DUMP978_MSG_MONITOR_RESTART_WHEN_STALE` | If set to `true`/`on`/`yes`/`1`, the receiver process is restarted when no messages are received during the monitoring interval | Unset (`false`) |
 
 ### `dump978-fa` General Options
 
@@ -350,6 +352,7 @@ These variables control the autogain system (explained further below). These sho
 | `DUMP978_AUTOGAIN_HIGH_PCT` | If the percentage of "strong signals" (>3dB) over a measuring period is more than this parameter, the gain will be decreased by 1 position | `6.0` (6.0%) |
 | `READSB_AUTOGAIN_MIN_SAMPLES` | Minimum number of received samples for autogain to be able to consider adjusting the gain | `1000` |
 | `READSB_AUTOGAIN_USE_RAW` | If set to `true`/`on`/`yes`/`1`, the autogain function will use the "raw" message count rather than the "accepted" message count. | `true` |
+| `SUBSEQUENT_INTERVAL_MINIMUM_COMPLETION_PCT` | Minimum percentage of `DUMP978_AUTOGAIN_SUBSEQUENT_INTERVAL` time that needs to be completed before autogain will use the collected data during the subsequent/long-term process | `50` |
 
 ## Autogain system
 
diff --git a/rootfs/etc/s6-overlay/s6-rc.d/message-monitor/dependencies.d/01-timezone b/rootfs/etc/s6-overlay/s6-rc.d/message-monitor/dependencies.d/01-timezone
new file mode 100644
index 0000000..e69de29
diff --git a/rootfs/etc/s6-overlay/s6-rc.d/message-monitor/dependencies.d/02-sanity-check b/rootfs/etc/s6-overlay/s6-rc.d/message-monitor/dependencies.d/02-sanity-check
new file mode 100644
index 0000000..e69de29
diff --git a/rootfs/etc/s6-overlay/s6-rc.d/message-monitor/dependencies.d/04-telegraf b/rootfs/etc/s6-overlay/s6-rc.d/message-monitor/dependencies.d/04-telegraf
new file mode 100644
index 0000000..e69de29
diff --git a/rootfs/etc/s6-overlay/s6-rc.d/message-monitor/dependencies.d/05-rtlsdr-biastee b/rootfs/etc/s6-overlay/s6-rc.d/message-monitor/dependencies.d/05-rtlsdr-biastee
new file mode 100644
index 0000000..e69de29
diff --git a/rootfs/etc/s6-overlay/s6-rc.d/message-monitor/dependencies.d/dump978 b/rootfs/etc/s6-overlay/s6-rc.d/message-monitor/dependencies.d/dump978
new file mode 100644
index 0000000..e69de29
diff --git a/rootfs/etc/s6-overlay/s6-rc.d/message-monitor/dependencies.d/skyaware978 b/rootfs/etc/s6-overlay/s6-rc.d/message-monitor/dependencies.d/skyaware978
new file mode 100644
index 0000000..e69de29
diff --git a/rootfs/etc/s6-overlay/s6-rc.d/message-monitor/run b/rootfs/etc/s6-overlay/s6-rc.d/message-monitor/run
new file mode 100755
index 0000000..b205c3d
--- /dev/null
+++ b/rootfs/etc/s6-overlay/s6-rc.d/message-monitor/run
@@ -0,0 +1,2 @@
+#!/bin/sh
+exec /etc/s6-overlay/scripts/message-monitor
diff --git a/rootfs/etc/s6-overlay/s6-rc.d/message-monitor/type b/rootfs/etc/s6-overlay/s6-rc.d/message-monitor/type
new file mode 100644
index 0000000..5883cff
--- /dev/null
+++ b/rootfs/etc/s6-overlay/s6-rc.d/message-monitor/type
@@ -0,0 +1 @@
+longrun
diff --git a/rootfs/etc/s6-overlay/s6-rc.d/user/contents.d/message-monitor b/rootfs/etc/s6-overlay/s6-rc.d/user/contents.d/message-monitor
new file mode 100644
index 0000000..e69de29
diff --git a/rootfs/etc/s6-overlay/scripts/autogain b/rootfs/etc/s6-overlay/scripts/autogain
index 4876143..1df1b3e 100755
--- a/rootfs/etc/s6-overlay/scripts/autogain
+++ b/rootfs/etc/s6-overlay/scripts/autogain
@@ -25,7 +25,7 @@ READSB_AUTOGAIN_STRONGSIGNAL_LIMIT="${DUMP978_AUTOGAIN_STRONGSIGNAL_LIMIT:-${REA
 READSB_AUTOGAIN_USE_RAW="${DUMP978_AUTOGAIN_USE_RAW:-${READSB_AUTOGAIN_USE_RAW:-true}}"
 READSB_AUTOGAIN_MIN_SAMPLES="${DUMP978_AUTOGAIN_MIN_SAMPLES:-${READSB_AUTOGAIN_MIN_SAMPLES:-1000}}"
 
-SUBSEQUENT_INTERVAL_MINIMUM_COMPLETION_PCT=50 # must be a whole number without decimals)
+SUBSEQUENT_INTERVAL_MINIMUM_COMPLETION_PCT="${SUBSEQUENT_INTERVAL_MINIMUM_COMPLETION_PCT:-50}" # must be a whole number without decimals
 
 function collect_gain_values() {
     # reads RAW messages for $1 seconds and returns the percentage strong messages with 1 decimal precision
@@ -169,7 +169,7 @@ if [[ ! -f /var/globe_history/autogain/autogain_initialized ]]; then
         # write back the current run number:
         echo "$i" > /var/globe_history/autogain/init_runs_count
         # sleep a little bit so dump978 is again providing data
-        sleep 15
+        sleep 15 & wait $!
     done
     touch /var/globe_history/autogain/autogain_initialized
 
@@ -196,6 +196,6 @@ do
         # data collection exited early. Wait a bit and restart
         s6wrap --quiet --prepend="$(basename "$0")" --timestamps --args echo "[ERROR] Data collection for autogain long-term run of $(( READSB_AUTOGAIN_SUBSEQUENT_INTERVAL / 60 )) minutes completed was terminated early (after $(( (endtime - starttime)/60 )) minutes)"
         s6wrap --quiet --prepend="$(basename "$0")" --timestamps --args echo "[ERROR] Since this is less than ${SUBSEQUENT_INTERVAL_MINIMUM_COMPLETION_PCT}% of the collection period, we discard it and start collecting again"
-        sleep 15 & wait !
+        sleep 15 & wait $!
     fi
 done
diff --git a/rootfs/etc/s6-overlay/scripts/message-monitor b/rootfs/etc/s6-overlay/scripts/message-monitor
new file mode 100755
index 0000000..3992e17
--- /dev/null
+++ b/rootfs/etc/s6-overlay/scripts/message-monitor
@@ -0,0 +1,65 @@
+#!/command/with-contenv bash
+#shellcheck shell=bash disable=SC1091
+
+# Message Monitor: every DUMP978_MSG_MONITOR_INTERVAL (default 30m) compare the
+# ".messages" counter in /run/skyaware978/aircraft.json with the value saved by the
+# previous run, log the outcome, and optionally restart dump978 when the counter
+# did not move (DUMP978_MSG_MONITOR_RESTART_WHEN_STALE).
+
+source /scripts/common
+mkdir -p /run/stats
+s6wrap=(s6wrap --quiet --prepend="$(basename "$0")" --timestamps --args)
+
+while :
+do
+    # Make sure we're receiving messages from the SDR
+    # get the number of messages received since process start:
+
+    if [[ -f /run/skyaware978/aircraft.json ]]; then
+        read -r new_msg_count <<< "$(jq .messages /run/skyaware978/aircraft.json 2>/dev/null)"
+    else
+        new_msg_count="STARTING"
+    fi
+    # get the number of messages previously read, or 0 if there's no history:
+    if [[ -f /run/stats/msgs_since_last_monitor_run ]]; then
+        read -r old_msg_count < /run/stats/msgs_since_last_monitor_run
+        secs_since_last_check="$(( $(date +%s) - $(stat -c '%Y' /run/stats/msgs_since_last_monitor_run) ))"
+    else
+        old_msg_count=0
+        secs_since_last_check="$(( $(date +%s) - $(stat -c '%Y' /run/service/skyaware978) ))" # use skyaware978 modify time as the creation time of the container
+    fi
+    # an empty or damaged state file must not break the integer comparisons below
+    if [[ ! "$old_msg_count" =~ ^[0-9]+$ ]]; then
+        old_msg_count=0
+    fi
+
+    # if new_msg_count < old_msg_count, dump978 must have restarted since the previous run of this script
+    # in that case, assume that old_msg_count=0
+    # (only compare when new_msg_count really is a number: jq prints nothing for an unparsable
+    # file and "null" when .messages is missing)
+    if [[ "$new_msg_count" =~ ^[0-9]+$ ]] && (( new_msg_count < old_msg_count )); then
+        old_msg_count=0
+    fi
+
+    if [[ "$new_msg_count" == "STARTING" ]]; then
+        "${s6wrap[@]}" echo "[STARTING] Receiver starting: No messages have been received as the container is still starting"
+        new_msg_count=0
+    elif [[ ! "$new_msg_count" =~ ^[0-9]+$ ]]; then
+        "${s6wrap[@]}" echo "[WARNING] Could not read a message count from /run/skyaware978/aircraft.json (got '$new_msg_count'); skipping this check"
+        # keep the last known count so the next run still compares against real data
+        new_msg_count="$old_msg_count"
+    elif (( new_msg_count == old_msg_count )); then
+        "${s6wrap[@]}" echo "[WARNING] Receiver appears stale: No messages received since last run of the Messages Monitor ($secs_since_last_check secs ago)"
+        if chk_enabled "$DUMP978_MSG_MONITOR_RESTART_WHEN_STALE"; then
+            "${s6wrap[@]}" echo "[WARNING] Restarting the dump978 service..."
+            s6-svc -r /run/service/dump978 2>/dev/null || true
+        fi
+    elif (( new_msg_count > old_msg_count )); then
+        "${s6wrap[@]}" echo "[INFO] Receiver is OK: $(( new_msg_count - old_msg_count )) messages received since last run of the Messages Monitor ($secs_since_last_check secs ago)"
+    else
+        "${s6wrap[@]}" echo "[ERROR] This situation cannot occur, please notify the software maintainers. new_msg_count=$new_msg_count; old_msg_count=$old_msg_count"
+    fi
+    echo "$new_msg_count" > /run/stats/msgs_since_last_monitor_run
+
+    sleep "${DUMP978_MSG_MONITOR_INTERVAL:-30m}" & wait $!
+done
diff --git a/rootfs/scripts/healthcheck.sh b/rootfs/scripts/healthcheck.sh
index 0a73733..174c568 100755
--- a/rootfs/scripts/healthcheck.sh
+++ b/rootfs/scripts/healthcheck.sh
@@ -46,51 +46,21 @@ else
     echo "[$(date)][HEALTHY] socat/uat2esnt listening on port 37981"
 fi
 
-# Make sure we're receiving messages from the SDR
-# get the number of messages received since process start:
-mkdir -p /run/stats
-if [[ -f /run/skyaware978/aircraft.json ]]; then
-    read -r new_msg_count <<< "$(jq .messages /run/skyaware978/aircraft.json 2>/dev/null)"
-else
-    new_msg_count="STARTING"
-fi
-# get the number of messages previously read, or 0 if there's no history:
-if [[ -f /run/stats/msgs_since_last_healthcheck ]]; then
-    read -r old_msg_count < /run/stats/msgs_since_last_healthcheck
-    secs_since_last_check="$(( $(date +%s) - $(stat -c '%Y' /run/stats/msgs_since_last_healthcheck) ))"
-else
-    old_msg_count=0
-    secs_since_last_check="$(( $(date +%s) - $(stat -c '%Y' /run/service/skyaware978) ))" # use skyaware978 modify time as the creation time of the container
-fi
-# Take conclusitions
-if [[ "$new_msg_count" == "STARTING" ]]; then
-    echo "[$(date)][STARTING] No messages have been received as the container is still starting"
-    new_msg_count=0
-elif (( new_msg_count < old_msg_count )) || (( old_msg_count == 0 && new_msg_count > 0 )); then
-    echo "[$(date)][HEALTHY] $new_msg_count messages received since start of the SkyAware978 service ($secs_since_last_check secs ago)"
-elif (( new_msg_count > old_msg_count )); then
-    echo "[$(date)][HEALTHY] $(( new_msg_count - old_msg_count )) messages received since last HealthCheck ($secs_since_last_check secs ago)"
-elif (( new_msg_count == old_msg_count )); then
-    echo "[$(date)][UNHEALTHY] No messages received since last HealthCheck ($secs_since_last_check secs ago)"
-    EXITCODE=1
-else
-    echo "[$(date)][ERROR] This situation cannot occur; new_msg_count=$new_msg_count; old_msg_count=$old_msg_count"
-fi
-echo "$new_msg_count" > /run/stats/msgs_since_last_healthcheck
-
 ##### Service Death Counts #####
 # shellcheck disable=SC2046,SC2207
 services=($(basename -a $(find /run/service/ -maxdepth 1 -type l)))
 # For each service...
 for service in "${services[@]}"; do
-    abnormal_deaths="$(s6-svdt -s "/run/service/$service" | awk '/exitcode/ && !/exitcode 0/' | wc -l)"
-    if (( abnormal_deaths > 0 )); then
-        echo "[$(date)][UNHEALTHY] abnormal death count for service $service is $abnormal_deaths"
-        EXITCODE=1
-        # Reset service death counts
-        s6-svdt-clear "/run/service/$service"
-    else
-        echo "[$(date)][HEALTHY] no abnormal death count for service $service"
+    if [[ "${service:0:5}" != "s6rc-" ]]; then
+        abnormal_deaths="$(s6-svdt -s "/run/service/$service" | awk '/exitcode/ && !/exitcode 0/' | wc -l)"
+        if (( abnormal_deaths > 0 )); then
+            echo "[$(date)][UNHEALTHY] abnormal death count for service $service is $abnormal_deaths"
+            EXITCODE=1
+            # Reset service death counts
+            s6-svdt-clear "/run/service/$service"
+        else
+            echo "[$(date)][HEALTHY] no abnormal death count for service $service"
+        fi
     fi
 done
 