-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[RW-5226] Setup sumologic egress alerting in terraform (#7)
* sumologic hello world * egress detection module * try templating the query text separately * progress * try other provider stmnt * fix vars * nit * merge master * major refactor * major refactor * hex * fix * finalize * test * test * tmp * tmp * tmp * address comment * address comment * add more * address comment * address comment Co-authored-by: Jay Carlton <[email protected]>
- Loading branch information
1 parent
686dfa1
commit cf5869e
Showing
12 changed files
with
387 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
101 changes: 101 additions & 0 deletions
101
modules/workbench/modules/egress_detection/assets/content/egress_window_template.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
{ | ||
"type": "SavedSearchWithScheduleSyncDefinition", | ||
"name": "${aou_env} / ${tier_name}-tier / ${egress_threshold_mib}-Mib / ${egress_window_sec}-sec egress event", | ||
"search": { | ||
"queryText": ${query_text}, | ||
"defaultTimeRange": "-120m", | ||
"byReceiptTime": true, | ||
"viewName": "", | ||
"viewStartTime": "1970-01-01T00:00:00Z", | ||
"queryParameters": [ | ||
{ | ||
"name": "environment", | ||
"label": "environment", | ||
"description": "", | ||
"dataType": "QUERY_FRAGMENT", | ||
"value": "${aou_env}", | ||
"autoComplete": { | ||
"autoCompleteType": "SKIP_AUTOCOMPLETE", | ||
"autoCompleteKey": null, | ||
"autoCompleteValues": [], | ||
"lookupFileName": null, | ||
"lookupLabelColumn": null, | ||
"lookupValueColumn": null | ||
} | ||
}, | ||
{ | ||
"name": "window_in_seconds", | ||
"label": "window_in_seconds", | ||
"description": "", | ||
"dataType": "NUMBER", | ||
"value": "${egress_window_sec}", | ||
"autoComplete": { | ||
"autoCompleteType": "SKIP_AUTOCOMPLETE", | ||
"autoCompleteKey": null, | ||
"autoCompleteValues": [], | ||
"lookupFileName": null, | ||
"lookupLabelColumn": null, | ||
"lookupValueColumn": null | ||
} | ||
}, | ||
{ | ||
"name": "egress_threshold_in_mib", | ||
"label": "egress_threshold_in_mib", | ||
"description": "", | ||
"dataType": "NUMBER", | ||
"value": "${egress_threshold_mib}", | ||
"autoComplete": { | ||
"autoCompleteType": "SKIP_AUTOCOMPLETE", | ||
"autoCompleteKey": null, | ||
"autoCompleteValues": [], | ||
"lookupFileName": null, | ||
"lookupLabelColumn": null, | ||
"lookupValueColumn": null | ||
} | ||
} | ||
], | ||
"parsingMode": "Manual" | ||
}, | ||
"searchSchedule": { | ||
"cronExpression": "${cron_expression}", | ||
"displayableTimeRange": "${time_range}", | ||
"parseableTimeRange": { | ||
"type": "BeginBoundedTimeRange", | ||
"from": { | ||
"type": "RelativeTimeRangeBoundary", | ||
"relativeTime": "${time_range}" | ||
}, | ||
"to": null | ||
}, | ||
"timeZone": "America/New_York", | ||
"threshold": { | ||
"thresholdType": "group", | ||
"operator": "gt", | ||
"count": 0 | ||
}, | ||
"notification": { | ||
"taskType": "WebhookSearchNotificationSyncDefinition", | ||
"webhookId": "${webhook_id}", | ||
"payload": null, | ||
"itemizeAlerts": false, | ||
"maxItemizedAlerts": 1 | ||
}, | ||
"scheduleType": "${schedule_type}", | ||
"muteErrorEmails": false, | ||
"parameters": [ | ||
{ | ||
"name": "environment", | ||
"value": "${aou_env}" | ||
}, | ||
{ | ||
"name": "window_in_seconds", | ||
"value": "${egress_window_sec}" | ||
}, | ||
{ | ||
"name": "egress_threshold_in_mib", | ||
"value": "${egress_threshold_mib}" | ||
} | ||
] | ||
}, | ||
"description": "" | ||
} |
125 changes: 125 additions & 0 deletions
125
modules/workbench/modules/egress_detection/assets/content/query.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,125 @@ | ||
// This query analyses a set of Google VPC flow logs for high-egress events. | ||
// The incoming set of flow logs are bucketed into non-overlapping time slices | ||
// and grouped by project / VM name to identify VMs with high amounts of egress | ||
// in a given period of time. | ||
// | ||
// All SumoLogic saved searches are collected in the "AoU RW Egress Alerts" folder. That folder is created | ||
// manually in the SumoLogic UI, and its ID is then passed to this module as a variable. | ||
// There should be one saved search for each {environment, window width} tuple. | ||
// | ||
// Note: the "time range" parameter is set to be double the window_in_seconds duration, | ||
// causing the search to query across multiple windows' worth of log messages. This may | ||
// result in duplicate high-egress event notifications being sent, but it may also increase | ||
// resiliency to delays or outages in SumoLogic's execution of saved searches. | ||
|
||
// Each environment should be set up with the following searches. The test environment | ||
// is shown as an example. | ||
// | ||
// - 3 minutes, 100Mib | ||
// - Parameters: environment=test, window_in_seconds=180, egress_threshold_in_mib=100 | ||
// - Time range: -6m ("use receipt time" is checked) | ||
// - Search schedule: | ||
// - Run frequency: real time | ||
// - Time range for scheduled search: -6m | ||
// | ||
// - 10 minutes, 150 Mib | ||
// - Parameters: environment=test, window_in_seconds=600, egress_threshold_in_mib=150 | ||
// - Time range: -20m ("use receipt time" is checked) | ||
// - Search schedule: | ||
// - Run frequency: Every 15 minutes | ||
// - Time range for scheduled search: -20m | ||
// | ||
// - 60 minutes, 200 Mib | ||
// - Parameters: environment=test, window_in_seconds=3600, egress_threshold_in_mib=200 | ||
// - Time range: -120m ("use receipt time" is checked) | ||
// - Search schedule: | ||
// - Run frequency: Hourly | ||
// - Time range for scheduled search: -120m | ||
|
||
// Search scope: this environment's VPC flow log source category. The {{environment}} | ||
// placeholder is a saved-search parameter (see "queryParameters" in the content template). | ||
// NOTE(review): the bare terms logName / resource / timestamp look like keyword filters | ||
// that each message must contain -- confirm against SumoLogic search syntax. | ||
_sourceCategory = gcp/vpcflowlogs/aou/{{environment}} logName resource timestamp | ||
|
||
// Parse the common JSON objects we'll use below | ||
| json "message.data.jsonPayload" as payload | ||
| json "message.data.resource" as resource | ||
|
||
// | ||
// Filter down to the set of logs we are analyzing. | ||
// | ||
|
||
// Filter on VPC flow logs, in case some other log type ends up here unexpectedly. | ||
| parse regex "\"logName\":\"(?<log_name>[^\"]+)\"" | ||
| where log_name matches "projects/*/logs/compute.googleapis.com%2Fvpc_flows" | ||
|
||
// Show only logs related to GCE subnet activity. | ||
| json field=resource "type" as type | ||
| where type = "gce_subnetwork" | ||
|
||
// Show only logs in the egress direction | ||
| json field=payload "reporter" as reporter | ||
| where reporter matches "SRC" | ||
|
||
// Exclude traffic whose destination IP is within the static IP range for Private Google Access. | ||
// This ensures that most Google API traffic is excluded from being considered for high-egress | ||
// alerts. See ticket RW-4738 for more details and breadcrumbs. | ||
| json field = payload "connection.dest_ip" as dest_ip | ||
| where !(dest_ip in ( | ||
"199.36.153.4", | ||
"199.36.153.5", | ||
"199.36.153.6", | ||
"199.36.153.7")) | ||
|
||
// Extract some output fields from the log JSON. | ||
| json field=resource "labels.project_id" as project_name | ||
| json field=payload "bytes_sent", "start_time", "end_time" as bytes_sent, start_time, end_time | ||
// "nodrop" means it's OK if vm_name does not exist | ||
| json field=payload "src_instance.vm_name" as vm_name nodrop | ||
|
||
// There are 3 types of expected VM names: | ||
// 1. GCE VMs: all-of-us-<user_id> | ||
// 2. Dataproc master nodes: all-of-us-<user_id>-m | ||
// 3. Dataproc worker nodes: all-of-us-<user_id>-w-<index> | ||
// | ||
// All three of these should contribute towards a single user's egress. In the | ||
// event that the VM naming convention changes, egress will instead be accumulated | ||
// at the project level, which should only be noisier than this | ||
| parse regex field=vm_name "^(?<vm_prefix>all-of-us-\d+)(?:$|-[mw].*)" nodrop | ||
|
||
// Attribute each message's bytes to at most one of three per-VM-type counters. The three | ||
// patterns are mutually exclusive; a vm_name matching none of them adds 0 to all three | ||
// counters but still counts towards the overall bytes_sent total below. | ||
| if (vm_name matches /^all-of-us-\d+$/, bytes_sent, 0) as gce_bytes_sent | ||
| if (vm_name matches /^all-of-us-\d+-m$/, bytes_sent, 0) as dataproc_master_bytes_sent | ||
| if (vm_name matches /^all-of-us-\d+-w-\d+$/, bytes_sent, 0) as dataproc_worker_bytes_sent | ||
|
||
// Timeslice creates a _timeslice variable, which is the message's timestamp | ||
// rounded to the nearest timeslice window start. We'll use this to aggregate | ||
// and calculate per-window egress. | ||
| timeslice {{window_in_seconds}}s | ||
|
||
// Breakdown by window, project, and VM prefix | ||
| sum(bytes_sent) as bytes_sent, | ||
sum(gce_bytes_sent) as gce_bytes_sent, | ||
sum(dataproc_master_bytes_sent) as dataproc_master_bytes_sent, | ||
sum(dataproc_worker_bytes_sent) as dataproc_worker_bytes_sent | ||
by _timeslice, project_name, vm_prefix | ||
|
||
// Collect all fields for display | ||
// NOTE(review): "1Mi" is presumably the 2^20 byte multiplier (bytes -> MiB) -- confirm. | ||
| bytes_sent / 1Mi as egress_mib | ||
| gce_bytes_sent / 1Mi as gce_egress_mib | ||
| dataproc_master_bytes_sent / 1Mi as dataproc_master_egress_mib | ||
| dataproc_worker_bytes_sent / 1Mi as dataproc_worker_egress_mib | ||
| toLong(_timeslice) as time_window_start | ||
// Echo the saved-search parameters into every result row as quoted string fields. | ||
| "{{environment}}" as environment | ||
| "{{window_in_seconds}}" as time_window_duration | ||
| "{{egress_threshold_in_mib}}" as egress_mib_threshold | ||
| fields | ||
environment, | ||
time_window_duration, | ||
time_window_start, | ||
egress_mib, | ||
egress_mib_threshold, | ||
project_name, | ||
vm_prefix, | ||
gce_egress_mib, | ||
dataproc_master_egress_mib, | ||
dataproc_worker_egress_mib | ||
|
||
// Only export rows passing our desired threshold | ||
| where egress_mib > {{egress_threshold_in_mib}} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
# Module-local values: asset paths plus, for every entry in var.sumologic_egress_thresholds, | ||
# the rendered query text and the rendered SumoLogic content JSON. | ||
locals { | ||
# Directory containing the query text and the content JSON template. | ||
content_dir = pathexpand("${path.module}/assets/content") | ||
# Names of all *.json files in content_dir. Not referenced by any other expression in this file. | ||
search_configs = fileset(local.content_dir, "*.json") | ||
content_template_path = pathexpand("${local.content_dir}/egress_window_template.json") | ||
query_path = pathexpand("${local.content_dir}/query.txt") | ||
|
||
# Map of egress rule key => query text produced by templatefile() on query.txt. | ||
# A key missing from a threshold map falls back to the lookup default of 0 instead of failing. | ||
# NOTE(review): query.txt appears to use SumoLogic {{...}} placeholders rather than Terraform | ||
# ${...} interpolation, so these variables may not actually be substituted -- confirm. | ||
queries_rendered = { for egress_rule, threshold in var.sumologic_egress_thresholds : | ||
tostring(egress_rule) => templatefile(local.query_path, { | ||
aou_env = var.aou_env | ||
tier_name = lookup(threshold, "tier_name", 0) | ||
egress_threshold_mib = lookup(threshold, "egress_threshold_mib", 0) | ||
egress_window_sec = lookup(threshold, "egress_window_sec", 0) | ||
}) | ||
} | ||
|
||
# Same map with each query JSON-encoded (quoted and escaped), so the content template can | ||
# insert it as a JSON string value without adding its own surrounding quotes. | ||
queries_encoded = { for egress_rule, query_text in local.queries_rendered : | ||
egress_rule => jsonencode(query_text) | ||
} | ||
|
||
# Build a map of rendered Content templates for use in the sumologic_content resource and | ||
# module outputs | ||
# As above, any key missing from a threshold map is rendered as 0. | ||
egress_rule_to_config = { for egress_rule, threshold in var.sumologic_egress_thresholds : | ||
egress_rule => templatefile(local.content_template_path, { | ||
aou_env = var.aou_env | ||
webhook_id = var.sumologic_webhook_id_hexadecimal | ||
tier_name = lookup(threshold, "tier_name", 0) | ||
egress_threshold_mib = lookup(threshold, "egress_threshold_mib", 0) | ||
egress_window_sec = lookup(threshold, "egress_window_sec", 0) | ||
cron_expression = lookup(threshold, "cron_expression", 0) | ||
schedule_type = lookup(threshold, "schedule_type", 0) | ||
time_range = lookup(threshold, "time_range", 0) | ||
query_text = lookup(local.queries_encoded, egress_rule) | ||
}) | ||
} | ||
} |
|
||
# Creates one SumoLogic content item per entry in var.sumologic_egress_thresholds, inside the | ||
# folder identified by var.sumologic_parent_folder_id_hexadecimal. | ||
# Simply export a content file or folder and put the JSON file in ./assets/content. | ||
# Since the query is so long (and critical) and is json-encoded, it's easier | ||
# to configure it separately. | ||
resource "sumologic_content" "main" { | ||
for_each = var.sumologic_egress_thresholds | ||
parent_id = var.sumologic_parent_folder_id_hexadecimal | ||
# local.egress_rule_to_config is built from the same map as for_each, so every each.key | ||
# should be present and the "" fallback is not expected to be used. | ||
config = lookup(local.egress_rule_to_config, each.key, "") | ||
} |
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
# Rendered (human-readable, not JSON-encoded) query text for each egress rule, as a list | ||
# ordered by rule key (keys() returns the keys in lexicographical order). | ||
output "egress_alert_rendered_queries" { | ||
  description = "Queries for each egress rule, in human-readable form" | ||
  # Index the map by the rule key. Without the index, every list element is the entire | ||
  # local.queries_rendered map, repeated once per rule. | ||
  value = [for egress_rule in keys(var.sumologic_egress_thresholds) : | ||
    local.queries_rendered[egress_rule] | ||
  ] | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
# Name of the deployed Workbench environment; substituted into the alert name and parameters. | ||
variable "aou_env" { | ||
  type        = string | ||
  description = "Short name (all lowercase) of All of Us Workbench deployed environments, e.g. local, test, staging, prod." | ||
} |
|
||
# One entry per egress alert rule. The map key names the rule; the value is a map of settings. | ||
# Keys read by this module: tier_name, egress_threshold_mib, egress_window_sec, | ||
# cron_expression, schedule_type, time_range. A missing key is rendered as 0. | ||
variable "sumologic_egress_thresholds" { | ||
  description = "Configuration values for egress search content in SumoLogic. The name (key) describes simply the tier name and config; each value is a map with the keys tier_name, egress_threshold_mib, egress_window_sec, cron_expression, schedule_type and time_range." | ||
  type        = map(map(any)) | ||
  # No rules (and therefore no SumoLogic content) are created by default. | ||
  default     = {} | ||
} |
|
||
# ID of the pre-existing SumoLogic folder that receives the generated saved searches. | ||
variable "sumologic_parent_folder_id_hexadecimal" { | ||
  type        = string | ||
  description = "The folder to create alert within, in hexadecimal format. It is generated manually outside of Terraform" | ||
} |
|
||
# ID of the pre-existing SumoLogic webhook connection used for alert notifications. | ||
variable "sumologic_webhook_id_hexadecimal" { | ||
  type        = string | ||
  description = "The webhook ID to notify the alert to, in hexadecimal format. It is generated manually outside of Terraform" | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
# Provider and Terraform version requirements for the egress_detection module. | ||
terraform { | ||
required_providers { | ||
sumologic = { | ||
# NOTE(review): the SumoLogic provider is presumably now published under its own registry | ||
# namespace rather than "terraform-providers" -- confirm, and change it together with the | ||
# root module so both refer to the same provider address. | ||
source = "terraform-providers/sumologic" | ||
} | ||
} | ||
# Explicit provider source addresses in required_providers require Terraform 0.13 or newer. | ||
required_version = ">= 0.13" | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.