Skip to content

Commit

Permalink
Jenkins scripts to monitor Pantheon sites (#631)
Browse files Browse the repository at this point in the history
  • Loading branch information
AronNovak authored Mar 10, 2024
1 parent d64bfb2 commit 81d54c9
Show file tree
Hide file tree
Showing 2 changed files with 159 additions and 0 deletions.
75 changes: 75 additions & 0 deletions scripts/jenkins/backup_check.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#!/bin/bash

# Check if your Pantheon live sites are properly backed up each day.

# Add known problematic sites to EXCLUDED_SITES environment variable.
# @see https://plugins.jenkins.io/envinject/
# Example: EXCLUDED_SITES=("foo" "bar")

if [ -z "${EXCLUDED_SITES+x}" ]; then
  EXCLUDED_SITES=()
fi

# Fetch all sites. Terminus' stderr is discarded, so a failing call would
# otherwise go unnoticed; report it, because the site list may then be empty
# or incomplete.
if ! SITES_JSON=$(terminus site:list --format=json 2>/dev/null); then
  echo "WARNING: 'terminus site:list' failed; the site list may be empty." >&2
fi

# Filter non-frozen sites into a real array, one site name per element.
# A plain VAR=$(...) assignment would store a single multi-line string, and
# "${NON_FROZEN_SITES[@]}" would then expand to one word holding every site
# name instead of one word per site.
mapfile -t NON_FROZEN_SITES < <(echo "$SITES_JSON" | jq -r '.[] | select(.frozen == false) | .name')

# Initialize flag for missing backups
MISSING_BACKUP_FLAG=0

# Decide whether a site must be skipped by the backup check.
#
# A site is skipped when its name is listed in EXCLUDED_SITES, or when the
# `initialized` property of its live environment (as reported by
# `terminus env:list`) is "false".
#
# Arguments: $1 - site name
# Returns:   0 when the site should be skipped, 1 otherwise
is_excluded() {
  local candidate="$1"
  local live_state

  # The explicit list is consulted first, so listed sites cost no API call.
  for excluded_site in "${EXCLUDED_SITES[@]}"; do
    [[ "$candidate" != "$excluded_site" ]] || return 0
  done

  live_state="$(terminus env:list --format=json "${candidate}" | jq -r '.live.initialized')"
  [[ "$live_state" != "false" ]] || return 0

  return 1
}

# Iterate through each non-frozen site and make sure every backup component
# has a recent backup on the live environment.

# NON_FROZEN_SITES may be an array with one site per element or a single
# newline-separated string (raw jq output). Flatten either shape into one
# name per element so that each site is checked on its own; blank entries
# are dropped.
SITE_NAMES=()
while IFS= read -r site_name; do
  if [[ -n "$site_name" ]]; then
    SITE_NAMES+=("$site_name")
  fi
done < <(printf '%s\n' "${NON_FROZEN_SITES[@]}")

# Components to check
COMPONENTS=("files" "code" "database")

# Maximum accepted backup age: 2 days (172800 seconds).
MAX_BACKUP_AGE=172800

for site_name in "${SITE_NAMES[@]}"; do

  # Check if the site should be excluded.
  if is_excluded "$site_name"; then
    continue
  fi

  echo "Checking backups for site: $site_name"

  # Fetch backups for site
  BACKUPS_JSON=$(terminus backup:list "${site_name}.live" --format=json 2>/dev/null)

  # Get the current date (Unix timestamp); one reference point per site.
  CURRENT_DATE=$(date +%s)

  for component in "${COMPONENTS[@]}"; do
    # Get the latest backup date for the component: convert every matching
    # entry's date to a Unix timestamp and keep the largest one.
    LATEST_BACKUP_DATE=$(
      echo "$BACKUPS_JSON" \
        | jq -r --arg COMPONENT "${component}" \
          'to_entries[] | select(.key | contains($COMPONENT)) | .value.date' \
        | while read -r backup_date; do date -d "$backup_date" +%s; done \
        | sort -nr \
        | head -n1
    )

    # No usable timestamp means there is no backup of this component at all
    # (or the listing could not be fetched/parsed). That is exactly what this
    # check exists to catch, so report it instead of feeding an empty value
    # into the arithmetic below.
    if [[ ! "$LATEST_BACKUP_DATE" =~ ^[0-9]+$ ]]; then
      echo "WARNING: No $component backup in the past 2 days for site: $site_name"
      MISSING_BACKUP_FLAG=1
      continue
    fi

    # Calculate the time difference in seconds
    TIME_DIFF=$((CURRENT_DATE - LATEST_BACKUP_DATE))

    # Check if the newest backup is older than the accepted maximum age.
    if ((TIME_DIFF > MAX_BACKUP_AGE)); then
      echo "WARNING: No $component backup in the past 2 days for site: $site_name"
      MISSING_BACKUP_FLAG=1
    fi
  done
done

# Exit status based on missing backups.
exit "${MISSING_BACKUP_FLAG:-0}"
84 changes: 84 additions & 0 deletions scripts/jenkins/cache_ratio_check.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
#!/bin/bash

# lastpipe: the last command of a pipeline runs in the current shell, so a
# trailing `read` can set variables that remain visible afterwards.
shopt -s lastpipe

# Check whether your Pantheon live sites are cached properly.
# It looks at the CDN cache hit ratio to see whether anonymous visitors
# got fully cached responses, at least for some parts of the website.
# For example, if the max-age header is set to zero, it raises an alert.

# Add known problematic sites to EXCLUDED_SITES environment variable.
# @see https://plugins.jenkins.io/envinject/
# Example: EXCLUDED_SITES=("foo" "bar")

if [ -z "${EXCLUDED_SITES+x}" ]; then
  EXCLUDED_SITES=()
fi

# Fetch all sites. Terminus' stderr is discarded, so a failing call would
# otherwise go unnoticed; report it, because the site list may then be empty
# or incomplete.
if ! SITES_JSON=$(terminus site:list --format=json 2>/dev/null); then
  echo "WARNING: 'terminus site:list' failed; the site list may be empty." >&2
fi

# Filter non-frozen sites into a real array, one site name per element.
# A plain VAR=$(...) assignment would store a single multi-line string, and
# "${SITES_TO_CHECK[@]}" would then expand to one word holding every site
# name instead of one word per site.
mapfile -t SITES_TO_CHECK < <(echo "$SITES_JSON" | jq -r '.[] | select(.frozen == false) | .name')

# Initialize flag for cache issues
CACHE_ISSUE_FLAG=0

# Check the CDN cache hit ratio of a site's live environment.
#
# Globals:
#   CACHE_ISSUE_FLAG (written) - set to 1 when an alert is raised.
# Arguments:
#   $1 - site name; metrics are requested for "<site>.live".
# Outputs:
#   An ALERT line on stdout when the last three reported ratios are all zero,
#   and a WARNING line for every non-empty value that is not a plain number.
check_cache_hit_ratio() {
  local site_name=$1

  # Get metrics in CSV format
  local metrics_csv
  metrics_csv=$(terminus env:metrics "${site_name}.live" --format=csv 2>/dev/null)

  # Collect the sixth CSV column of the last three rows, with '%' removed,
  # one array element per row. `read -a` cannot be used for this: it consumes
  # a single line only, so the array would never hold three values and the
  # alert below could never fire.
  local -a cache_hit_ratios=()
  mapfile -t cache_hit_ratios < <(
    printf '%s\n' "$metrics_csv" | tail -n 3 | cut -d ',' -f6 | tr -d '%'
  )

  # Initialize counter for zero hit ratios
  local zero_hit_ratio_count=0

  # Loop through cache hit ratios
  local ratio
  for ratio in "${cache_hit_ratios[@]}"; do
    # Nothing was reported on this line (e.g. empty metrics output).
    if [[ -z "$ratio" ]]; then
      continue
    fi

    # Skip iteration if the ratio is not a valid number
    if ! [[ "$ratio" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
      echo "WARNING: Invalid cache hit ratio '$ratio' for ${site_name}, skipping..."
      continue
    fi

    # The value is a validated non-negative decimal, so it is zero exactly
    # when every digit is '0'; no external calculator is required.
    if [[ "$ratio" =~ ^0+(\.0+)?$ ]]; then
      zero_hit_ratio_count=$((zero_hit_ratio_count + 1))
    fi
  done

  # Check if all the last three values are 0% cache hit ratios
  if [ "$zero_hit_ratio_count" -eq 3 ]; then
    echo "ALERT: ${site_name} has had a 0% cache hit ratio for the last 3 days."
    CACHE_ISSUE_FLAG=1
  fi
}

# Tell whether a site is listed in EXCLUDED_SITES.
#
# Arguments: $1 - site name
# Returns:   0 when the name equals one of the entries, 1 otherwise
is_excluded() {
  local candidate="$1"

  for excluded_site in "${EXCLUDED_SITES[@]}"; do
    [[ "$candidate" != "$excluded_site" ]] || return 0
  done

  return 1
}

# Iterate over sites and check their cache hit ratios.

# SITES_TO_CHECK may be an array with one site per element or a single
# newline-separated string (raw jq output). Flatten either shape into one
# name per element so that each site is checked on its own; blank entries
# are dropped. The list is materialized first so that the commands run per
# site do not share stdin with the list being read.
SITE_NAMES=()
while IFS= read -r site; do
  if [[ -n "$site" ]]; then
    SITE_NAMES+=("$site")
  fi
done < <(printf '%s\n' "${SITES_TO_CHECK[@]}")

for site in "${SITE_NAMES[@]}"; do
  if is_excluded "$site"; then
    continue
  fi
  check_cache_hit_ratio "$site"
done

# Exit with 1 if any cache issues were found
exit "${CACHE_ISSUE_FLAG:-0}"

0 comments on commit 81d54c9

Please sign in to comment.