Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Jenkins scripts to monitor Pantheon sites #631

Merged
merged 4 commits into from
Mar 10, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 74 additions & 0 deletions scripts/jenkins/backup_check.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
#!/bin/bash

# Check if your Pantheon live sites are properly backed up each day.

# Add known problematic sites to EXCLUDED_SITES environment variable.
# @see https://plugins.jenkins.io/envinject/

# EXCLUDED_SITES is optional; fall back to an empty list when the environment
# does not provide it.
if [[ -z "${EXCLUDED_SITES+x}" ]]; then
  EXCLUDED_SITES=()
fi

# Fetch all sites. Terminus diagnostics are discarded on purpose; an empty
# result is reported below instead.
SITES_JSON=$(terminus site:list --format=json 2>/dev/null)

# Filter non-frozen sites into a real array, one site name per element.
# A plain string assignment here would make the later
# "${NON_FROZEN_SITES[@]}" loop run exactly once with every name glued together.
NON_FROZEN_SITES=()
if [[ -n "$SITES_JSON" ]]; then
  mapfile -t NON_FROZEN_SITES < <(
    jq -r '.[] | select(.frozen == false) | .name' <<< "$SITES_JSON"
  )
fi
if [[ ${#NON_FROZEN_SITES[@]} -eq 0 ]]; then
  echo "WARNING: No non-frozen sites were returned by terminus site:list." >&2
fi

# Initialize flag for missing backups
MISSING_BACKUP_FLAG=0

#######################################
# Decide whether a site must be skipped by the backup check.
# A site is skipped when it is listed in EXCLUDED_SITES or when its live
# environment reports "initialized": false.
# Globals:
#   EXCLUDED_SITES (read) - bash array, or a string of site names separated
#                           by commas and/or whitespace
# Arguments:
#   $1 - site name
# Returns:
#   0 if the site should be skipped, 1 otherwise
#######################################
is_excluded() {
  local site=$1
  local excluded_site
  local -a excluded_sites=()

  # Environment variables cannot carry bash arrays, so a value injected by
  # Jenkins arrives as one string. Flatten whatever is there and split it on
  # commas and whitespace so multi-site lists are matched name by name.
  local flattened="${EXCLUDED_SITES[*]}"
  flattened=${flattened//[$',\n\t']/ }
  read -r -a excluded_sites <<< "$flattened"

  for excluded_site in "${excluded_sites[@]}"; do
    if [[ "$site" == "$excluded_site" ]]; then
      return 0
    fi
  done

  # Sites whose live environment is not initialized are skipped as well.
  local live_initialized
  live_initialized=$(terminus env:list --format=json "${site}" | jq -r '.live.initialized')
  if [[ "$live_initialized" == "false" ]]; then
    return 0
  fi

  return 1
}

#######################################
# Print the epoch timestamp of the newest backup of one component.
# Arguments:
#   $1 - JSON produced by `terminus backup:list --format=json`
#   $2 - component name matched against the backup keys (files, code, database)
# Outputs:
#   The newest epoch on stdout, or nothing when no parsable date was found
#######################################
latest_backup_epoch() {
  local backups_json=$1
  local component=$2
  local backup_date epoch
  local latest=""

  while IFS= read -r backup_date; do
    # jq prints "null" for an entry that carries no date.
    if [[ -z "$backup_date" || "$backup_date" == "null" ]]; then
      continue
    fi
    epoch=$(date -d "$backup_date" +%s) || continue
    if [[ -z "$latest" ]] || (( epoch > latest )); then
      latest=$epoch
    fi
  done < <(
    jq -r --arg COMPONENT "$component" \
      'to_entries[] | select(.key | contains($COMPONENT)) | .value.date' \
      <<< "$backups_json"
  )

  printf '%s' "$latest"
}

#######################################
# Iterate through each non-frozen site and verify that every backup component
# has a backup that is at most 2 days old.
# Globals:
#   NON_FROZEN_SITES (read), MISSING_BACKUP_FLAG (written: set to 1 on warning)
# Outputs:
#   Progress lines and warnings on stdout
#######################################
check_all_site_backups() {
  # Components to check
  local -a components=("files" "code" "database")
  # 2 days, in seconds
  local -r max_age=172800
  local site_name component backups_json latest_backup current_date

  for site_name in "${NON_FROZEN_SITES[@]}"; do
    # Check if the site should be excluded.
    if is_excluded "$site_name"; then
      continue
    fi

    echo "Checking backups for site: $site_name"

    # Fetch backups for site
    backups_json=$(terminus backup:list "${site_name}.live" --format=json 2>/dev/null)
    current_date=$(date +%s)

    for component in "${components[@]}"; do
      latest_backup=$(latest_backup_epoch "$backups_json" "$component")

      # No backup at all is the worst case: report it explicitly instead of
      # feeding an empty value into the arithmetic below.
      if [[ -z "$latest_backup" ]]; then
        echo "WARNING: No $component backup in the past 2 days for site: $site_name"
        MISSING_BACKUP_FLAG=1
        continue
      fi

      if (( current_date - latest_backup > max_age )); then
        echo "WARNING: No $component backup in the past 2 days for site: $site_name"
        MISSING_BACKUP_FLAG=1
      fi
    done
  done
}

check_all_site_backups

# Exit status based on missing backups: MISSING_BACKUP_FLAG starts at 0 and is
# set to 1 whenever a backup warning is printed, so a non-zero exit status
# means at least one component backup was reported as missing.
exit $MISSING_BACKUP_FLAG
79 changes: 79 additions & 0 deletions scripts/jenkins/cache_ratio_check.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
#!/bin/bash

shopt -s lastpipe

# Check if your Pantheon live sites are cached at least minimally.
AronNovak marked this conversation as resolved.
Show resolved Hide resolved

# Add known problematic sites to EXCLUDED_SITES environment variable.
# @see https://plugins.jenkins.io/envinject/

# EXCLUDED_SITES is optional; fall back to an empty list when the environment
# does not provide it.
if [[ -z "${EXCLUDED_SITES+x}" ]]; then
  EXCLUDED_SITES=()
fi

# Fetch all sites. Terminus diagnostics are discarded on purpose; an empty
# result is reported below instead.
SITES_JSON=$(terminus site:list --format=json 2>/dev/null)

# Filter non-frozen sites into a real array, one site name per element.
# A plain string assignment here would make the later
# "${SITES_TO_CHECK[@]}" loop run exactly once with every name glued together.
SITES_TO_CHECK=()
if [[ -n "$SITES_JSON" ]]; then
  mapfile -t SITES_TO_CHECK < <(
    jq -r '.[] | select(.frozen == false) | .name' <<< "$SITES_JSON"
  )
fi
if [[ ${#SITES_TO_CHECK[@]} -eq 0 ]]; then
  echo "WARNING: No non-frozen sites were returned by terminus site:list." >&2
fi

# Initialize flag for cache issues
CACHE_ISSUE_FLAG=0

#######################################
# Raise an alert when the last three metric rows of a site's live environment
# all report a 0% cache hit ratio.
# Globals:
#   CACHE_ISSUE_FLAG (written: set to 1 when the alert fires)
# Arguments:
#   $1 - site name; metrics are requested for "<site>.live"
# Outputs:
#   Warnings and alerts on stdout
#######################################
check_cache_hit_ratio() {
  local site_name=$1

  # Get metrics in CSV format
  local metrics_csv
  metrics_csv=$(terminus env:metrics "${site_name}.live" --format=csv 2>/dev/null)
  if [[ -z "$metrics_csv" ]]; then
    echo "WARNING: No metrics available for ${site_name}, skipping..."
    return 0
  fi

  # Collect the 6th CSV column of the last three rows, one array element per
  # row. `read -a` would stop after the first line, so mapfile is used to keep
  # all three values. Percent signs, quotes and carriage returns are stripped
  # so that only the bare number remains.
  local -a cache_hit_ratios=()
  mapfile -t cache_hit_ratios < <(
    printf '%s\n' "$metrics_csv" | tail -n 3 | cut -d ',' -f6 | tr -d '%"\r'
  )

  # Counter for rows with a zero hit ratio
  local zero_hit_ratio_count=0
  local ratio

  for ratio in "${cache_hit_ratios[@]}"; do
    # Skip anything that is not a plain non-negative decimal (for example the
    # CSV header when fewer than three data rows exist).
    if ! [[ "$ratio" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
      echo "WARNING: Invalid cache hit ratio '$ratio' for ${site_name}, skipping..."
      continue
    fi

    # The value is already known to be non-negative, so "<= 0" reduces to
    # "consists of zeros only"; no external calculator is needed.
    if [[ "$ratio" =~ ^0+(\.0+)?$ ]]; then
      zero_hit_ratio_count=$((zero_hit_ratio_count + 1))
    fi
  done

  # Alert only when all of the last three values are 0%
  if [[ "$zero_hit_ratio_count" -eq 3 ]]; then
    echo "ALERT: ${site_name} has had a 0% cache hit ratio for the last 3 days."
    CACHE_ISSUE_FLAG=1
  fi
}

#######################################
# Check if a site is listed in EXCLUDED_SITES.
# Globals:
#   EXCLUDED_SITES (read) - bash array, or a string of site names separated
#                           by commas and/or whitespace
# Arguments:
#   $1 - site name
# Returns:
#   0 if the site is excluded, 1 otherwise
#######################################
is_excluded() {
  local site=$1
  local excluded_site
  local -a excluded_sites=()

  # Environment variables cannot carry bash arrays, so a value injected by
  # Jenkins arrives as one string. Flatten whatever is there and split it on
  # commas and whitespace so multi-site lists are matched name by name.
  local flattened="${EXCLUDED_SITES[*]}"
  flattened=${flattened//[$',\n\t']/ }
  read -r -a excluded_sites <<< "$flattened"

  for excluded_site in "${excluded_sites[@]}"; do
    if [[ "$site" == "$excluded_site" ]]; then
      return 0
    fi
  done
  return 1
}

# Run the cache hit ratio check for every site that is not on the exclusion list.
for site in "${SITES_TO_CHECK[@]}"; do
  if ! is_excluded "$site"; then
    check_cache_hit_ratio "$site"
  fi
done

# Exit with 1 if any cache issues were found: CACHE_ISSUE_FLAG starts at 0 and
# is set to 1 by check_cache_hit_ratio when it prints an ALERT.
exit $CACHE_ISSUE_FLAG